def min_hash_sim():
    # Parse arguments: the document name, plus an optional count of
    # near-duplicates to report (default 10).
    if len(sys.argv) == 2:
        doc = sys.argv[1]
        k = 10.0
    elif len(sys.argv) == 3:
        doc = sys.argv[1]
        k = float(sys.argv[2])
    else:
        shared.error("11", ["min_hash_sim", ""])
        return -1
    try:
        docid = get_docid(doc)
    except:
        shared.error("5", ["min_hash_sim", ""], "docname")
        return -1
    try:
        dup_docs = get_document_list(docid)
    except:
        shared.error("5", ["min_hash_sim", k], "closest duplicate documents")
        return -1
    try:
        write_to_file(dup_docs, docid, k)
    except:
        shared.error("8", ["min_hash_sim", k])
        return -1
    try:
        shared.insert_to_db("min_hash_sim", k, "Finished")
    except:
        shared.error("10", ["min_hash_sim", k])
        return -1
    return 1
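
# Illustrative sketch (not the repo's get_document_list, which is defined
# elsewhere): with precomputed MinHash signatures, the Jaccard similarity of
# two documents can be estimated as the fraction of hash positions on which
# their signatures agree. The helper name and arguments are assumptions.
def estimate_jaccard_sketch(sig_a, sig_b):
    """Estimate Jaccard similarity from two equal-length MinHash signatures."""
    matches = sum(1 for a, b in zip(sig_a, sig_b) if a == b)
    return matches / len(sig_a)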

def bm_25():
    args = sys.argv[0:]
    # Parse arguments: either just the query (k defaults to 10), or a result
    # count k followed by the query.
    if len(args) == 2:
        k = 10
        q = args[1].split(" ")
    elif len(args) == 3:
        k = int(args[1])
        q = args[2].split(" ")
    else:
        shared.error("11", ["bm25", ""])
        return -1
    try:
        texts, documents = shared.build_texts("bm25")
    except:
        shared.error("0", ["bm25", " ".join(q)])
        return -1
    try:
        scores = score(texts, q)
    except:
        shared.error("1", ["bm25", " ".join(q)])
        return -1
    try:
        write_to_file(scores, documents, q, k)
    except:
        shared.error("8", ["bm25", " ".join(q)])
        return -1
    try:
        shared.insert_to_db("bm25", "", "Finished")
    except:
        shared.error("10", ["bm25", " ".join(q)])
        return -1
    return 1
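
# Illustrative sketch of a BM25 scorer (the repo's score() is defined
# elsewhere). It applies the standard Okapi BM25 weighting with free
# parameters k1 and b; `texts` is assumed to be a list of token lists,
# one per document. All names here are assumptions for illustration.
import math

def bm25_score_sketch(texts, query, k1=1.5, b=0.75):
    n = len(texts)
    avgdl = sum(len(t) for t in texts) / n  # average document length
    # Document frequency of each query term.
    df = {term: sum(1 for t in texts if term in t) for term in query}
    scores = []
    for t in texts:
        s = 0.0
        for term in query:
            tf = t.count(term)
            idf = math.log((n - df[term] + 0.5) / (df[term] + 0.5) + 1)
            s += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(t) / avgdl))
        scores.append(s)
    return scores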

def tfidf():
    if len(sys.argv) != 1:
        shared.error("11", ["tfidf", ""])
        return -1
    try:
        texts, documents = shared.build_texts("tfidf")
    except:
        shared.error("0", ["tfidf", ""])
        return -1
    try:
        tfidf, raw_tf, dictionary = shared.get_tfidf(texts)
    except:
        shared.error("1", ["tfidf", ""])
        return -1
    try:
        tokens, postings = write_to_files(tfidf, raw_tf, dictionary, documents)
    except:
        shared.error("8", ["tfidf", ""])
        return -1
    try:
        shared.insert_to_db("tfidf", "", "Finished")
    except:
        shared.error("10", ["tfidf", ""])
        return -1
    return 1
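
# Illustrative sketch of what shared.get_tfidf could look like. The returned
# (tfidf, raw_tf, dictionary) triple suggests the gensim library, but that is
# an assumption, not the repo's confirmed implementation.
from gensim import corpora, models

def get_tfidf_sketch(texts):
    dictionary = corpora.Dictionary(texts)                  # token -> id map
    raw_tf = [dictionary.doc2bow(text) for text in texts]   # raw term counts
    tfidf = models.TfidfModel(raw_tf)[raw_tf]               # tf-idf weighted corpus
    return tfidf, raw_tf, dictionary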

def lsi():
    if len(sys.argv) == 1:
        k = 100  # Default number of dimensions is 100
    elif len(sys.argv) == 2:
        k = int(sys.argv[1])
    else:
        shared.error("11", ["lsi", ""])
        return -1
    try:
        texts, documents = shared.build_texts("lsi")
    except:
        shared.error("0", ["lsi", k])
        return -1
    try:
        ck = get_lsi(texts, k)
    except:
        shared.error("4", ["lsi", k])
        return -1
    try:
        shared.write_to_file(ck, documents, LSIFOLDER, "lsi.csv")
    except:
        shared.error("8", ["lsi", k])
        return -1
    try:
        shared.insert_to_db("lsi", k, "Finished")
    except:
        shared.error("10", ["lsi", k])
        return -1
    return 1
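
# Illustrative sketch of get_lsi under the same gensim assumption as above:
# project the tf-idf corpus into k latent dimensions via truncated SVD
# (gensim's LsiModel). Names are assumptions for illustration only.
from gensim import corpora, models

def get_lsi_sketch(texts, k):
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)[corpus]
    lsi = models.LsiModel(tfidf, id2word=dictionary, num_topics=k)
    return lsi[tfidf]  # each document as a k-dimensional vector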

def test_error(self):
    shared.error('0', ['test', ''])
    conn = sqlite3.connect('processed/hist.db')
    c = conn.cursor()
    c.execute("SELECT * "
              "FROM History "
              "WHERE tool='test' AND command='' "
              "AND output='Error code: 0'")
    self.assertIsNotNone(c.fetchone())
    c.execute("DELETE FROM History "
              "WHERE tool='test' AND command='' "
              "AND output='Error code: 0'")
    conn.commit()
    conn.close()
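
# The test above pins down shared.error's observable contract: it records a
# row in processed/hist.db with the tool name, the command, and the string
# "Error code: <code>". A minimal implementation consistent with that
# contract (an assumption; the real shared.error lives elsewhere) might be:
import sqlite3

def error_sketch(code, tool_and_command, detail=""):
    tool, command = tool_and_command
    conn = sqlite3.connect('processed/hist.db')
    c = conn.cursor()
    c.execute("INSERT INTO History (tool, command, output) VALUES (?, ?, ?)",
              (tool, command, "Error code: " + code))
    conn.commit()
    conn.close()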

def min_hash():
    # Parse arguments: optional shingle size k (default 3) and number of
    # hash functions (default 200).
    if len(sys.argv) == 1:
        args = ""
        k = 3
        hashes = 200
    elif len(sys.argv) == 3:
        args = " ".join(sys.argv[1:])
        k = int(sys.argv[1])
        hashes = int(sys.argv[2])
    else:
        shared.error("11", ["min_hash", ""])
        return -1
    # Generate and store shingles only if the Shingle table doesn't exist yet.
    conn = sqlite3.connect(SHINGLEDB)
    c = conn.cursor()
    c.execute("SELECT name FROM sqlite_master "
              "WHERE type='table' AND name='Shingle'")
    if c.fetchone() is None:
        try:
            shingles, docs = shared.gen_shingles(k)
        except:
            shared.error("6", ["min_hash", args], k)
            return -1
        try:
            insert_shingles(shingles, docs)
        except:
            shared.error("9", ["min_hash", args], SHINGLEDB)
            return -1
    try:
        if not os.path.exists(MINHASHFOLDER):
            os.makedirs(MINHASHFOLDER)
        _minHash.minHash(hashes)
    except:
        shared.error("2", ["min_hash", args])
        return -1
    try:
        shared.insert_to_db("min_hash", args, "Finished")
    except:
        shared.error("10", ["min_hash", args])
        return -1
    return 1
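
# Illustrative sketch of the signature computation that _minHash.minHash(hashes)
# presumably performs (the real implementation is elsewhere): for each of
# `num_hashes` random universal hash functions, a document's signature entry is
# the minimum hash value over its shingle-id set.
import random

def minhash_signature_sketch(shingle_ids, num_hashes, prime=2147483647):
    rng = random.Random(0)  # fixed seed so signatures are comparable across docs
    coeffs = [(rng.randrange(1, prime), rng.randrange(0, prime))
              for _ in range(num_hashes)]
    return [min((a * s + b) % prime for s in shingle_ids)
            for a, b in coeffs]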

def word_count():
    if len(sys.argv) != 1:
        shared.error("11", ["word_count", ""])
        return -1
    try:
        output = count_collection("source")
    except:
        shared.error("3", ["word_count", ""])
        return -1
    try:
        write_to_file(output)
    except:
        shared.error("8", ["word_count", ""])
        return -1
    try:
        shared.insert_to_db("word_count", "", "Finished")
    except:
        shared.error("10", ["word_count", ""])
        return -1
    return 1
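
# Illustrative sketch of count_collection (an assumption: it tallies token
# counts across every file in the given folder, which is assumed to be flat).
import os
from collections import Counter

def count_collection_sketch(folder):
    counts = Counter()
    for name in os.listdir(folder):
        with open(os.path.join(folder, name)) as f:
            counts.update(f.read().split())
    return counts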

def k_means_clusterer():
    # Parse arguments: optional cluster count k (default 3), optionally
    # followed by one seed document per cluster.
    if len(sys.argv) in [1, 2]:
        k = 3  # Default number of clusters is 3
        if len(sys.argv) == 2:
            k = int(sys.argv[1])
    elif len(sys.argv) >= 3:
        k = int(sys.argv[1])
        seeds = sys.argv[2:]
        if len(seeds) != k:
            print("Number of clusters doesn't match number of seeds given")
            return -1
    else:
        shared.error("11", ["k_means_clusterer", ""])
        return -1
    try:
        texts, documents = shared.build_texts("k_means_clusterer")
    except:
        shared.error("0", ["k_means_clusterer", ""])
        return -1
    # No seeds were supplied, so pick k random seed documents.
    if len(sys.argv) in [1, 2]:
        try:
            seeds = gen_seeds(k, documents)
        except:
            shared.error("5", ["k_means_clusterer", k], "random seed documents")
            return -1
    try:
        tfidf, raw_tf, dictionary = shared.get_tfidf(texts)
    except:
        shared.error("1", ["k_means_clusterer", ""])
        return -1
    try:
        inverted_index = build_inverted_index(tfidf, raw_tf, dictionary,
                                              documents)
    except:
        shared.error("5", ["k_means_clusterer", k], "inverted_index")
        return -1
    try:
        centroids = get_seed_vector(seeds, inverted_index, dictionary,
                                    documents)
    except:
        shared.error("5", ["k_means_clusterer", k], "centroids")
        return -1
    # Loop: recluster documents and recalculate centroids until the clusters
    # don't change anymore.
    while True:
        try:
            cluster1 = get_cluster(centroids, documents, inverted_index,
                                   dictionary)
            centroid1 = update_centroids(cluster1)
            cluster2 = get_cluster(centroid1, documents, inverted_index,
                                   dictionary)
        except:
            shared.error("7", ["k_means_clusterer", k])
            return -1
        if cluster1 == cluster2:
            try:
                write_to_file(cluster2)
                break
            except:
                shared.error("8", ["k_means_clusterer", k])
                return -1
        else:
            centroids = update_centroids(cluster2)
    try:
        shared.insert_to_db("k_means_clusterer", k, "Finished")
    except:
        shared.error("10", ["k_means_clusterer", k])
        return -1
    return 1
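
# Illustrative sketch of the assignment step inside the k-means loop (the
# repo's get_cluster is defined elsewhere): each document goes to the centroid
# with the highest cosine similarity of its tf-idf vector. The sparse
# dict-of-weights representation below is an assumption.
import math

def assign_clusters_sketch(doc_vectors, centroids):
    def cos(u, v):
        dot = sum(u[t] * v[t] for t in set(u) & set(v))
        nu = math.sqrt(sum(x * x for x in u.values()))
        nv = math.sqrt(sum(x * x for x in v.values()))
        return dot / (nu * nv) if nu and nv else 0.0

    clusters = {i: [] for i in range(len(centroids))}
    for doc, vec in doc_vectors.items():
        best = max(range(len(centroids)), key=lambda i: cos(vec, centroids[i]))
        clusters[best].append(doc)
    return clusters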