# Example no. 1 (votes: 0)
def get_lda(out_dir, topic_name):
    """Train an LDA model on the global corpus and write the results to disk.

    Reads the module-level globals ``corp`` (list of token lists),
    ``n_topic`` (number of topics) and ``get_id`` (index -> file id mapping),
    plus ``sys.argv[1:7]`` for the output file names.

    Parameters
    ----------
    out_dir : str
        Path prefix the output file names are appended to.
    topic_name : str
        Label embedded in the output file names.

    Side effects: writes two UTF-8 text files under ``out_dir`` — the
    printed topics and a JSON map of file id -> per-document topic
    distribution.
    """
    global corp

    # Build the bag-of-words corpus from the tokenized documents.
    dictionary = corpora.Dictionary(corp)
    corpus = [dictionary.doc2bow(text) for text in corp]

    # Fixed random_state keeps the topic assignment reproducible.
    lda = models.LdaModel(corpus,
                          id2word=dictionary,
                          num_topics=n_topic,
                          random_state=1)
    #lda.save(out_dir + "{}-{}-{}-{}-{}-{}-{}-lda_model-single-nyt.model".format(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],topic_name) )

    ldaOut = lda.print_topics(n_topic)
    # Fix: the original never closed its output files; context managers
    # guarantee the data is flushed even if an exception occurs.
    with codecs.open(
            out_dir + "{}-{}-{}-{}-{}-{}-{}-lda_topics-rocchio-nyt.txt".format(
                sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
                sys.argv[6], topic_name), "w", "utf-8") as file_ldaOut:
        file_ldaOut.write(str(ldaOut))

    corpus_lda_doc = lda[corpus]

    # Map each document's file id to its topic distribution; enumerate
    # replaces the original hand-rolled index counter.
    dict_id_lda = {}
    for index_id, doc in enumerate(corpus_lda_doc):
        file_id = get_id[index_id]
        dict_id_lda[file_id] = doc

    with codecs.open(
            out_dir + "{}-{}-{}-{}-{}-{}-{}-fileid_lda-rocchio-nyt.txt".format(
                sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
                sys.argv[6], topic_name), "w", "utf-8") as file_id_lda:
        file_id_lda.write(json.dumps(dict_id_lda))
# Example no. 2 (votes: 0)
# Build one training document per news file: the title plus the body text
# between the "# content #" ... "# / content #" markers.
tfidf = TfidfVectorizer()
vector = []   # one whitespace-joined document string per file (for TF-IDF)
vectors = []  # one token list per file (for the gensim LDA corpus)
for i in range(1, 100):
    with open('data/%s.txt' % i) as f:
        content = []
        is_content = False
        for line in f:
            line = line.rstrip()
            if line.startswith("# title #") and line.endswith('# / title #'):
                line = line.replace("# title #", "").replace("# / title #", "")
                content.append(line)
            elif line.startswith("# content #") and is_content is False:
                content.append(line.replace("# content #", ""))
                is_content = True
            elif line.startswith("# / content #"):
                is_content = False
            elif is_content is True:
                content.append(line)
        document = ' '.join(content)
        # Fix: the original appended `line.split()` — the tokens of whichever
        # line happened to be read last — instead of the whole document's tokens.
        vectors.append(document.split())
        vector.append(document)
# TF-IDF + KMeans are fitted on all but the last three documents.
tfidf_vector = tfidf.fit_transform(vector[:-3])
cluster = KMeans(init='k-means++', n_clusters=3)
cluster_result = cluster.fit(tfidf_vector)
# Fix: use transform() (not fit_transform()) on the held-out documents so
# they are projected with the vocabulary/IDF weights learned from the
# training split instead of refitting the vectorizer.
tfidf_vector_test = tfidf.transform(vector[-3:])
dc = Dictionary(vectors)
corpus = [dc.doc2bow(vec) for vec in vectors]
lda = LdaModel(corpus=corpus, id2word=dc, num_topics=3)
print(lda.show_topics())
print(lda.print_topics(3)[0])
# Example no. 3 (votes: 0)
# Vectorize 99 news files, cluster them with KMeans, and fit a 3-topic LDA.
tfidf = TfidfVectorizer()
vector = []   # whitespace-joined document strings, one per file
vectors = []  # token lists fed to the gensim Dictionary
for doc_num in range(1, 100):
    with open('data/%s.txt' % doc_num) as f:
        content = []
        is_content = False
        for text in f:
            text = text.rstrip()
            is_title = (text.startswith("# title #")
                        and text.endswith('# / title #'))
            if is_title:
                text = text.replace("# title #", "").replace("# / title #", "")
                content.append(text)
            elif text.startswith("# content #") and is_content is False:
                content.append(text.replace("# content #", ""))
                is_content = True
            elif text.startswith("# / content #"):
                is_content = False
            elif is_content is True:
                content.append(text)
        # NOTE(review): this tokenizes only the last line read from the file,
        # not the assembled document — looks unintentional, but preserved here.
        vectors.append(text.split())
        vector.append(' '.join(content))
# Cluster all but the final three documents.
tfidf_vector = tfidf.fit_transform(vector[:-3])
cluster = KMeans(init='k-means++', n_clusters=3)
cluster_result = cluster.fit(tfidf_vector)
# NOTE(review): fit_transform on the held-out slice refits the vectorizer;
# transform() is the usual call — behavior kept as-is.
tfidf_vector_test = tfidf.fit_transform(vector[-3:])
dc = Dictionary(vectors)
corpus = [dc.doc2bow(vec) for vec in vectors]
lda = LdaModel(corpus=corpus, id2word=dc, num_topics=3)
print(lda.show_topics())
print(lda.print_topics(3)[0])