Ejemplo n.º 1
0
 def test_build_texts(self):
     """Check the first two items returned by ``shared.build_texts``.

     Two calls are exercised: ``build_texts('lda')`` and
     ``build_texts('min_hash', 2)``. For each, item 0 is compared with the
     matching fixture attribute (``self.texts`` / ``self.s``) and item 1 is
     compared with ``self.docs``.
     """
     # (positional arguments, name of the fixture attribute expected at [0])
     scenarios = (
         (('lda',), 'texts'),
         (('min_hash', 2), 's'),
     )
     for call_args, first_item_attr in scenarios:
         built = shared.build_texts(*call_args)
         self.assertEqual(built[0], getattr(self, first_item_attr))
         self.assertEqual(built[1], self.docs)
Ejemplo n.º 2
0
def bm_25():
    """Run the bm25 pipeline for the query given on the command line.

    Accepted ``sys.argv`` layouts:

    * ``[prog, query]`` -- ``k`` defaults to 10.
    * ``[prog, k, query]`` -- ``k`` is parsed with ``int``.

    The query string is split on single spaces. Any other argument count is
    reported through ``shared.error`` with code "11".

    The stages run in order: build texts, score, write results, record
    completion. Each stage is guarded; when one raises, its code ("0", "1",
    "8" or "10" respectively) is passed to ``shared.error`` together with
    the algorithm name and the query, and the function stops.

    Returns:
        1 when every stage succeeds, -1 as soon as one fails.

    Raises:
        ValueError: if ``k`` is supplied but cannot be parsed by ``int``.
    """
    argc = len(sys.argv)
    if argc == 2:
        k = 10
        q = sys.argv[1].split(" ")
    elif argc == 3:
        k = int(sys.argv[1])
        q = sys.argv[2].split(" ")
    else:
        shared.error("11", ["bm25", ""])
        return -1

    # Same payload for every stage failure below.
    error_info = ["bm25", " ".join(q)]

    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit propagate instead of being reported as stage failures.
    try:
        texts, documents = shared.build_texts("bm25")
    except Exception:
        shared.error("0", error_info)
        return -1
    try:
        scores = score(texts, q)
    except Exception:
        shared.error("1", error_info)
        return -1
    try:
        write_to_file(scores, documents, q, k)
    except Exception:
        shared.error("8", error_info)
        return -1
    try:
        shared.insert_to_db("bm25", "", "Finished")
    except Exception:
        shared.error("10", error_info)
        return -1
    return 1
Ejemplo n.º 3
0
def tfidf():
    """Run the tfidf pipeline; takes no command-line arguments.

    If ``sys.argv`` holds anything besides the program name, the call is
    reported through ``shared.error`` with code "11".

    The stages run in order: build texts, compute tf-idf via
    ``shared.get_tfidf``, write output via ``write_to_files``, record
    completion. Each stage is guarded; when one raises, its code ("0", "1",
    "8" or "10" respectively) is passed to ``shared.error`` and the function
    stops.

    Returns:
        1 when every stage succeeds, -1 as soon as one fails.
    """
    if len(sys.argv) != 1:
        shared.error("11", ["tfidf", ""])
        return -1

    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit propagate instead of being reported as stage failures.
    try:
        texts, documents = shared.build_texts("tfidf")
    except Exception:
        shared.error("0", ["tfidf", ""])
        return -1
    try:
        # Named ``weights`` so the local does not shadow this function's name.
        weights, raw_tf, dictionary = shared.get_tfidf(texts)
    except Exception:
        shared.error("1", ["tfidf", ""])
        return -1
    try:
        # The results are unused here; the two-way unpack is kept so that an
        # unexpected return shape is still reported as a write failure.
        _tokens, _postings = write_to_files(weights, raw_tf, dictionary,
                                            documents)
    except Exception:
        shared.error("8", ["tfidf", ""])
        return -1
    try:
        shared.insert_to_db("tfidf", "", "Finished")
    except Exception:
        shared.error("10", ["tfidf", ""])
        return -1
    return 1
Ejemplo n.º 4
0
def lsi():
    """Run the lsi pipeline with an optional dimension count from argv.

    Accepted ``sys.argv`` layouts:

    * ``[prog]`` -- ``k`` defaults to 100 dimensions.
    * ``[prog, k]`` -- ``k`` is parsed with ``int``.

    Any other argument count is reported through ``shared.error`` with code
    "11".

    The stages run in order: build texts, compute ``get_lsi(texts, k)``,
    write the result to "lsi.csv" under ``LSIFOLDER`` via
    ``shared.write_to_file``, record completion. Each stage is guarded; when
    one raises, its code ("0", "4", "8" or "10" respectively) is passed to
    ``shared.error`` together with ``k`` and the function stops.

    Returns:
        1 when every stage succeeds, -1 as soon as one fails.

    Raises:
        ValueError: if ``k`` is supplied but cannot be parsed by ``int``.
    """
    argc = len(sys.argv)
    if argc == 1:
        k = 100  # Default dimensions is 100
    elif argc == 2:
        k = int(sys.argv[1])
    else:
        shared.error("11", ["lsi", ""])
        return -1

    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit propagate instead of being reported as stage failures.
    try:
        texts, documents = shared.build_texts("lsi")
    except Exception:
        shared.error("0", ["lsi", k])
        return -1
    try:
        ck = get_lsi(texts, k)
    except Exception:
        shared.error("4", ["lsi", k])
        return -1
    try:
        shared.write_to_file(ck, documents, LSIFOLDER, "lsi.csv")
    except Exception:
        shared.error("8", ["lsi", k])
        return -1
    try:
        shared.insert_to_db("lsi", k, "Finished")
    except Exception:
        shared.error("10", ["lsi", k])
        return -1
    return 1
Ejemplo n.º 5
0
def k_means_clusterer():
    """Cluster the documents with k-means, configured from ``sys.argv``.

    Accepted ``sys.argv`` layouts:

    * ``[prog]`` -- ``k`` defaults to 3; seeds come from ``gen_seeds``.
    * ``[prog, k]`` -- ``k`` is parsed with ``int``; seeds come from
      ``gen_seeds``.
    * ``[prog, k, seed, ...]`` -- the seeds are taken from the command line
      and their count must equal ``k``; otherwise a message is printed and
      -1 is returned.

    After building texts, tf-idf data, the inverted index and the initial
    centroids, the function repeatedly assigns documents to clusters and
    recomputes centroids until two consecutive assignments are equal, then
    writes the final clusters with ``write_to_file`` and records completion
    with ``shared.insert_to_db``. Every stage is guarded; when one raises,
    a code is passed to ``shared.error`` and the function stops.

    Returns:
        1 when every stage succeeds, -1 as soon as one fails.

    Raises:
        ValueError: if ``k`` is supplied but cannot be parsed by ``int``.
    """
    argc = len(sys.argv)
    seeds = None  # stays None when seeds must be generated later
    if argc in (1, 2):
        # Default number of clusters is 3
        k = int(sys.argv[1]) if argc == 2 else 3
    elif argc >= 3:
        k = int(sys.argv[1])
        seeds = sys.argv[2:]
        if len(seeds) != k:
            print("Number of clusters doesn't match number of seeds given")
            return -1
    else:
        shared.error("11", ["k_means_clusterer", ""])
        return -1

    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit propagate instead of being reported as stage failures.
    try:
        texts, documents = shared.build_texts("k_means_clusterer")
    except Exception:
        shared.error("0", ["k_means_clusterer", ""])
        return -1
    if seeds is None:
        try:
            seeds = gen_seeds(k, documents)
        except Exception:
            shared.error("5", ["k_means_clusterer", k],
                         "random seed documents")
            return -1
    try:
        tfidf, raw_tf, dictionary = shared.get_tfidf(texts)
    except Exception:
        shared.error("1", ["k_means_clusterer", ""])
        return -1
    try:
        inverted_index = build_inverted_index(tfidf, raw_tf, dictionary,
                                              documents)
    except Exception:
        shared.error("5", ["k_means_clusterer", k], "inverted_index")
        return -1
    try:
        centroids = get_seed_vector(seeds, inverted_index, dictionary,
                                    documents)
    except Exception:
        shared.error("5", ["k_means_clusterer", k], "centroids")
        return -1

    # Loop through reclustering documents and recalculating centroids until
    # the clusters don't change anymore.
    while True:
        try:
            cluster1 = get_cluster(centroids, documents, inverted_index,
                                   dictionary)
            centroid1 = update_centroids(cluster1)
            cluster2 = get_cluster(centroid1, documents, inverted_index,
                                   dictionary)
        except Exception:
            shared.error("7", ["k_means_clusterer", k])
            return -1
        if cluster1 == cluster2:
            break
        # Guarded like the centroid update above, so a failure here is
        # reported instead of escaping as an unhandled exception.
        try:
            centroids = update_centroids(cluster2)
        except Exception:
            shared.error("7", ["k_means_clusterer", k])
            return -1

    try:
        write_to_file(cluster2)
    except Exception:
        shared.error("8", ["k_means_clusterer", k])
        return -1
    try:
        shared.insert_to_db("k_means_clusterer", k, "Finished")
    except Exception:
        shared.error("10", ["k_means_clusterer", k])
        # Report failure like every other stage rather than falling through
        # to the success return.
        return -1
    return 1