def test_build_texts(self):
    """Verify shared.build_texts for the 'lda' and 'min_hash' modes.

    Both modes must return the expected corpus as element 0 and the
    document list (``self.docs``) as element 1.
    """
    cases = (
        ('lda', (), self.texts),
        ('min_hash', (2,), self.s),
    )
    for mode, extra_args, expected_corpus in cases:
        result = shared.build_texts(mode, *extra_args)
        self.assertEqual(result[0], expected_corpus)
        self.assertEqual(result[1], self.docs)
def bm_25():
    """Run the BM25 ranking pipeline driven by command-line arguments.

    Usage: ``script [k] query`` — an optional result count ``k``
    (default 10) followed by a whitespace-separated query string.

    Returns:
        int: 1 on success, -1 on any failure (the failure is first
        reported through ``shared.error`` with a stage-specific code).
    """
    args = sys.argv  # argv[0] is the script name, so 2 entries == 1 user argument
    if len(args) == 2:
        k = 10  # default number of results to report
        q = args[1].split(" ")
    elif len(args) == 3:
        k = int(args[1])
        q = args[2].split(" ")
    else:
        shared.error("11", ["bm25", ""])
        return -1
    # except Exception (not a bare except) so SystemExit/KeyboardInterrupt
    # still propagate instead of being reported as pipeline failures.
    try:
        texts, documents = shared.build_texts("bm25")
    except Exception:
        shared.error("0", ["bm25", " ".join(q)])
        return -1
    try:
        scores = score(texts, q)
    except Exception:
        shared.error("1", ["bm25", " ".join(q)])
        return -1
    try:
        write_to_file(scores, documents, q, k)
    except Exception:
        shared.error("8", ["bm25", " ".join(q)])
        return -1
    try:
        shared.insert_to_db("bm25", "", "Finished")
    except Exception:
        shared.error("10", ["bm25", " ".join(q)])
        return -1
    return 1
def tfidf():
    """Run the TF-IDF indexing pipeline (takes no command-line arguments).

    Returns:
        int: 1 on success, -1 on any failure (the failure is first
        reported through ``shared.error`` with a stage-specific code).
    """
    if len(sys.argv) != 1:
        shared.error("11", ["tfidf", ""])
        return -1
    # except Exception (not a bare except) so SystemExit/KeyboardInterrupt
    # still propagate instead of being reported as pipeline failures.
    try:
        texts, documents = shared.build_texts("tfidf")
    except Exception:
        shared.error("0", ["tfidf", ""])
        return -1
    try:
        # renamed from `tfidf` so the local no longer shadows this function
        tfidf_model, raw_tf, dictionary = shared.get_tfidf(texts)
    except Exception:
        shared.error("1", ["tfidf", ""])
        return -1
    try:
        tokens, postings = write_to_files(tfidf_model, raw_tf, dictionary, documents)
    except Exception:
        shared.error("8", ["tfidf", ""])
        return -1
    try:
        shared.insert_to_db("tfidf", "", "Finished")
    except Exception:
        shared.error("10", ["tfidf", ""])
        return -1
    return 1
def lsi():
    """Run the LSI (latent semantic indexing) pipeline.

    Usage: ``script [k]`` — an optional number of dimensions ``k``
    (default 100).

    Returns:
        int: 1 on success, -1 on any failure (the failure is first
        reported through ``shared.error`` with a stage-specific code).
    """
    if len(sys.argv) == 1:
        k = 100  # Default dimensions is 100
    elif len(sys.argv) == 2:
        k = int(sys.argv[1])
    else:
        shared.error("11", ["lsi", ""])
        return -1
    # except Exception (not a bare except) so SystemExit/KeyboardInterrupt
    # still propagate instead of being reported as pipeline failures.
    try:
        texts, documents = shared.build_texts("lsi")
    except Exception:
        shared.error("0", ["lsi", k])
        return -1
    try:
        ck = get_lsi(texts, k)
    except Exception:
        shared.error("4", ["lsi", k])
        return -1
    try:
        shared.write_to_file(ck, documents, LSIFOLDER, "lsi.csv")
    except Exception:
        shared.error("8", ["lsi", k])
        return -1
    try:
        shared.insert_to_db("lsi", k, "Finished")
    except Exception:
        shared.error("10", ["lsi", k])
        return -1
    return 1
def k_means_clusterer():
    """Run k-means document clustering.

    Usage: ``script [k [seed1 seed2 ... seedk]]`` — optional cluster
    count ``k`` (default 3) and, optionally, exactly ``k`` seed document
    identifiers. Without explicit seeds, random seed documents are drawn
    via ``gen_seeds``.

    Returns:
        int: 1 on success, -1 on any failure (the failure is first
        reported through ``shared.error`` with a stage-specific code).
    """
    if len(sys.argv) in [1, 2]:
        k = 3  # Default number of clusters is 3
        if len(sys.argv) == 2:
            k = sys.argv[1]
        k = int(k)
    elif len(sys.argv) >= 3:
        k = sys.argv[1]
        k = int(k)
        seeds = sys.argv[2:]
        if len(seeds) != k:
            print("Number of clusters doesn't match number of seeds given")
            return -1
    else:
        # NOTE(review): unreachable — len(sys.argv) is always >= 1, so one of
        # the branches above always matches. Kept for defensive symmetry.
        shared.error("11", ["k_means_clusterer", ""])
        return -1
    # except Exception (not a bare except) so SystemExit/KeyboardInterrupt
    # still propagate instead of being reported as pipeline failures.
    try:
        texts, documents = shared.build_texts("k_means_clusterer")
    except Exception:
        shared.error("0", ["k_means_clusterer", ""])
        return -1
    if len(sys.argv) in [1, 2]:
        # No explicit seeds supplied: pick k seed documents at random.
        try:
            seeds = gen_seeds(k, documents)
        except Exception:
            shared.error("5", ["k_means_clusterer", k], "random seed documents")
            return -1
    try:
        tfidf, raw_tf, dictionary = shared.get_tfidf(texts)
    except Exception:
        shared.error("1", ["k_means_clusterer", ""])
        return -1
    try:
        inverted_index = build_inverted_index(tfidf, raw_tf, dictionary, documents)
    except Exception:
        shared.error("5", ["k_means_clusterer", k], "inverted_index")
        return -1
    try:
        centroids = get_seed_vector(seeds, inverted_index, dictionary, documents)
    except Exception:
        shared.error("5", ["k_means_clusterer", k], "centroids")
        return -1
    # Loop: recluster documents and recalculate centroids until the
    # clusters stop changing between consecutive iterations.
    while True:
        try:
            cluster1 = get_cluster(centroids, documents, inverted_index, dictionary)
            centroid1 = update_centroids(cluster1)
            cluster2 = get_cluster(centroid1, documents, inverted_index, dictionary)
        except Exception:
            shared.error("7", ["k_means_clusterer", k])
            return -1
        if cluster1 == cluster2:
            # Converged: persist the final clustering and exit the loop.
            try:
                write_to_file(cluster2)
                break
            except Exception:
                shared.error("8", ["k_means_clusterer", k])
                return -1
        else:
            centroids = update_centroids(cluster2)
    try:
        shared.insert_to_db("k_means_clusterer", k, "Finished")
    except Exception:
        shared.error("10", ["k_means_clusterer", k])
        # return -1 added for consistency: every sibling pipeline
        # (bm_25/tfidf/lsi) reports failure when the DB insert fails.
        return -1
    return 1