Exemple #1
0
def min_hash_sim():
    """Look up the closest duplicate documents of one document.

    Arguments are taken from ``sys.argv``:

    * ``sys.argv[1]`` -- document name, resolved to an id by ``get_docid``.
    * ``sys.argv[2]`` -- optional ``k``, parsed as a float (default ``10.0``)
      and forwarded to ``write_to_file``.

    Each step that fails is reported through ``shared.error``; a successful
    run is recorded with ``shared.insert_to_db``.

    Returns:
        1 on success, -1 as soon as any step fails.
    """
    argc = len(sys.argv)
    if argc not in (2, 3):
        shared.error("11", ["min_hash_sim", ""])
        return -1

    doc = sys.argv[1]
    k = 10.0
    if argc == 3:
        try:
            k = float(sys.argv[2])
        except ValueError:
            # A non-numeric k is reported like any other bad invocation
            # instead of escaping as an unhandled traceback.
            shared.error("11", ["min_hash_sim", ""])
            return -1

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    try:
        docid = get_docid(doc)
    except Exception:
        shared.error("5", ["min_hash_sim", ""], "docname")
        return -1

    try:
        dup_docs = get_document_list(docid)
    except Exception:
        shared.error("5", ["min_hash_sim", k], "closest duplicate documents")
        return -1

    try:
        write_to_file(dup_docs, docid, k)
    except Exception:
        shared.error("8", ["min_hash_sim", k])
        return -1

    try:
        shared.insert_to_db("min_hash_sim", k, "Finished")
    except Exception:
        shared.error("10", ["min_hash_sim", k])
        return -1

    return 1
Exemple #2
0
def bm_25():
    """Score the collection against a query and write the results.

    Arguments are taken from ``sys.argv`` in one of two shapes:

    * ``<query>`` -- ``k`` defaults to 10.
    * ``<k> <query>`` -- ``k`` is parsed as an integer.

    The query string is split on single spaces before being passed to
    ``score``. Each step that fails is reported through ``shared.error``;
    a successful run is recorded with ``shared.insert_to_db``.

    Returns:
        1 on success, -1 as soon as any step fails.
    """
    argc = len(sys.argv)
    if argc == 2:
        k = 10
        raw_query = sys.argv[1]
    elif argc == 3:
        try:
            k = int(sys.argv[1])
        except ValueError:
            # A non-integer k is reported like any other bad invocation
            # instead of escaping as an unhandled traceback.
            shared.error("11", ["bm25", ""])
            return -1
        raw_query = sys.argv[2]
    else:
        shared.error("11", ["bm25", ""])
        return -1

    q = raw_query.split(" ")
    # The query re-joined with spaces is what accompanies every error report.
    command = " ".join(q)

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    try:
        texts, documents = shared.build_texts("bm25")
    except Exception:
        shared.error("0", ["bm25", command])
        return -1

    try:
        scores = score(texts, q)
    except Exception:
        shared.error("1", ["bm25", command])
        return -1

    try:
        write_to_file(scores, documents, q, k)
    except Exception:
        shared.error("8", ["bm25", command])
        return -1

    try:
        shared.insert_to_db("bm25", "", "Finished")
    except Exception:
        shared.error("10", ["bm25", command])
        return -1

    return 1
Exemple #3
0
def tfidf():
    """Compute tf-idf data for the collection and write it to files.

    Takes no command-line arguments; any extra entry in ``sys.argv`` is
    reported as an error. The pipeline is ``shared.build_texts`` ->
    ``shared.get_tfidf`` -> ``write_to_files``, and a successful run is
    recorded with ``shared.insert_to_db``. Each step that fails is reported
    through ``shared.error``.

    Returns:
        1 on success, -1 as soon as any step fails.
    """
    if len(sys.argv) != 1:
        shared.error("11", ["tfidf", ""])
        return -1

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    try:
        texts, documents = shared.build_texts("tfidf")
    except Exception:
        shared.error("0", ["tfidf", ""])
        return -1

    try:
        # Named ``weights`` so the local does not shadow this function.
        weights, raw_tf, dictionary = shared.get_tfidf(texts)
    except Exception:
        shared.error("1", ["tfidf", ""])
        return -1

    try:
        # The return value is not used here, so it is not bound.
        write_to_files(weights, raw_tf, dictionary, documents)
    except Exception:
        shared.error("8", ["tfidf", ""])
        return -1

    try:
        shared.insert_to_db("tfidf", "", "Finished")
    except Exception:
        shared.error("10", ["tfidf", ""])
        return -1

    return 1
Exemple #4
0
def lsi():
    """Run LSI over the collection and write the result to ``lsi.csv``.

    ``sys.argv[1]`` optionally gives the number of dimensions ``k`` as an
    integer; it defaults to 100. The pipeline is ``shared.build_texts`` ->
    ``get_lsi`` -> ``shared.write_to_file`` (into ``LSIFOLDER``), and a
    successful run is recorded with ``shared.insert_to_db``. Each step that
    fails is reported through ``shared.error``.

    Returns:
        1 on success, -1 as soon as any step fails.
    """
    argc = len(sys.argv)
    if argc > 2:
        shared.error("11", ["lsi", ""])
        return -1

    k = 100  # Default number of dimensions
    if argc == 2:
        try:
            k = int(sys.argv[1])
        except ValueError:
            # A non-integer k is reported like any other bad invocation
            # instead of escaping as an unhandled traceback.
            shared.error("11", ["lsi", ""])
            return -1

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    try:
        texts, documents = shared.build_texts("lsi")
    except Exception:
        shared.error("0", ["lsi", k])
        return -1

    try:
        ck = get_lsi(texts, k)
    except Exception:
        shared.error("4", ["lsi", k])
        return -1

    try:
        shared.write_to_file(ck, documents, LSIFOLDER, "lsi.csv")
    except Exception:
        shared.error("8", ["lsi", k])
        return -1

    try:
        shared.insert_to_db("lsi", k, "Finished")
    except Exception:
        shared.error("10", ["lsi", k])
        return -1

    return 1
Exemple #5
0
 def test_insert_to_db(self):
     """Check that ``shared.insert_to_db`` adds a row to the History table.

     A marker row is inserted, looked up in ``processed/hist.db`` and then
     deleted again. The cleanup and the connection close run before the
     assertion, so a failing check leaves neither the marker row nor an
     open connection behind.
     """
     shared.insert_to_db('test', '', 'Error code: 0')
     conn = sqlite3.connect('processed/hist.db')
     try:
         c = conn.cursor()
         c.execute("SELECT * "
                   "FROM History "
                   "WHERE tool='test' AND command='' "
                   "AND output='Error code: 0'")
         row = c.fetchone()
         c.execute("DELETE FROM History "
                   "WHERE tool='test' AND command='' "
                   "AND output='Error code: 0'")
         conn.commit()
     finally:
         conn.close()
     self.assertIsNotNone(row)
Exemple #6
0
def word_count():
    """Count words over the ``"source"`` collection and write the result.

    Takes no command-line arguments; any extra entry in ``sys.argv`` is
    reported as an error. The counts come from ``count_collection`` and go
    to ``write_to_file``; a successful run is recorded with
    ``shared.insert_to_db``. Each step that fails is reported through
    ``shared.error``.

    Returns:
        1 on success, -1 as soon as any step fails.
    """
    if len(sys.argv) != 1:
        shared.error("11", ["word_count", ""])
        return -1

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    try:
        output = count_collection("source")
    except Exception:
        shared.error("3", ["word_count", ""])
        return -1

    try:
        write_to_file(output)
    except Exception:
        shared.error("8", ["word_count", ""])
        return -1

    try:
        shared.insert_to_db("word_count", "", "Finished")
    except Exception:
        shared.error("10", ["word_count", ""])
        return -1

    return 1
Exemple #7
0
def min_hash():
    """Generate shingles if needed, then run the min-hash step.

    Arguments are taken from ``sys.argv``: either none (``k = 3``,
    ``hashes = 200``) or exactly two integers ``<k> <hashes>``.

    If ``SHINGLEDB`` has no ``Shingle`` table yet, shingles are produced by
    ``shared.gen_shingles(k)`` and stored with ``insert_shingles``. After
    that ``MINHASHFOLDER`` is created when missing and
    ``_minHash.minHash(hashes)`` is called. A successful run is recorded
    with ``shared.insert_to_db``; each step that fails is reported through
    ``shared.error``.

    Returns:
        1 on success, -1 as soon as any step fails.
    """
    argc = len(sys.argv)
    if argc == 1:
        args = ""
        k = 3
        hashes = 200
    elif argc == 3:
        args = " ".join(sys.argv[1:])
        try:
            k = int(sys.argv[1])
            hashes = int(sys.argv[2])
        except ValueError:
            # Non-integer arguments are reported like any other bad
            # invocation instead of escaping as an unhandled traceback.
            shared.error("11", ["min_hash", ""])
            return -1
    else:
        shared.error("11", ["min_hash", ""])
        return -1

    # The connection is only needed for the table check, so it is closed
    # right away instead of staying open for the rest of the run.
    conn = sqlite3.connect(SHINGLEDB)
    try:
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name="
                  "'Shingle'")
        shingle_table_missing = c.fetchone() is None
    finally:
        conn.close()

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    if shingle_table_missing:
        try:
            shingles, docs = shared.gen_shingles(k)
        except Exception:
            shared.error("6", ["min_hash", args], k)
            return -1
        try:
            insert_shingles(shingles, docs)
        except Exception:
            shared.error("9", ["min_hash", args], SHINGLEDB)
            return -1

    try:
        if not os.path.exists(MINHASHFOLDER):
            os.makedirs(MINHASHFOLDER)
        _minHash.minHash(hashes)
    except Exception:
        shared.error("2", ["min_hash", args])
        return -1

    try:
        shared.insert_to_db("min_hash", args, "Finished")
    except Exception:
        shared.error("10", ["min_hash", args])
        return -1

    return 1
Exemple #8
0
def k_means_clusterer():
    """Cluster the collection with k-means and write the final clusters.

    Arguments are taken from ``sys.argv``:

    * no arguments -- ``k = 3`` clusters, seeds picked by ``gen_seeds``;
    * ``<k>`` -- ``k`` clusters, seeds picked by ``gen_seeds``;
    * ``<k> <seed> ...`` -- ``k`` clusters with exactly ``k`` explicit seeds.

    The documents are reclustered and the centroids recomputed until two
    consecutive clusterings are equal; that clustering is passed to
    ``write_to_file``. A successful run is recorded with
    ``shared.insert_to_db``; each step that fails is reported through
    ``shared.error``.

    Returns:
        1 on success, -1 as soon as any step fails (including the final
        history insert, as in the other tools).
    """
    argc = len(sys.argv)
    if argc == 0:
        # Only reachable when sys.argv is empty.
        shared.error("11", ["k_means_clusterer", ""])
        return -1

    k = 3  # Default number of clusters
    if argc >= 2:
        try:
            k = int(sys.argv[1])
        except ValueError:
            # A non-integer k is reported like any other bad invocation
            # instead of escaping as an unhandled traceback.
            shared.error("11", ["k_means_clusterer", ""])
            return -1

    seeds = None  # None means "no seeds given, generate them later"
    if argc >= 3:
        seeds = sys.argv[2:]
        if len(seeds) != k:
            print("Number of clusters doesn't match number of seeds given")
            return -1

    # ``except Exception`` (rather than a bare ``except``) keeps Ctrl-C and
    # SystemExit from being misreported as tool failures.
    try:
        texts, documents = shared.build_texts("k_means_clusterer")
    except Exception:
        shared.error("0", ["k_means_clusterer", ""])
        return -1

    if seeds is None:
        try:
            seeds = gen_seeds(k, documents)
        except Exception:
            shared.error("5", ["k_means_clusterer", k],
                         "random seed documents")
            return -1

    try:
        tfidf, raw_tf, dictionary = shared.get_tfidf(texts)
    except Exception:
        shared.error("1", ["k_means_clusterer", ""])
        return -1

    try:
        inverted_index = build_inverted_index(tfidf, raw_tf, dictionary,
                                              documents)
    except Exception:
        shared.error("5", ["k_means_clusterer", k], "inverted_index")
        return -1

    try:
        centroids = get_seed_vector(seeds, inverted_index, dictionary,
                                    documents)
    except Exception:
        shared.error("5", ["k_means_clusterer", k], "centroids")
        return -1

    # Loop through reclustering documents and recalculating centroids until
    # the clusters don't change anymore.
    while True:
        try:
            cluster1 = get_cluster(centroids, documents, inverted_index,
                                   dictionary)
            centroid1 = update_centroids(cluster1)
            cluster2 = get_cluster(centroid1, documents, inverted_index,
                                   dictionary)
        except Exception:
            shared.error("7", ["k_means_clusterer", k])
            return -1
        if cluster1 == cluster2:
            break
        try:
            # Same reclustering step as above, so a failure here gets the
            # same report instead of an unhandled traceback.
            centroids = update_centroids(cluster2)
        except Exception:
            shared.error("7", ["k_means_clusterer", k])
            return -1

    try:
        write_to_file(cluster2)
    except Exception:
        shared.error("8", ["k_means_clusterer", k])
        return -1

    try:
        shared.insert_to_db("k_means_clusterer", k, "Finished")
    except Exception:
        shared.error("10", ["k_means_clusterer", k])
        # Previously fell through and returned 1 after reporting the error.
        return -1

    return 1