Example #1
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix,
                        num_topics=number_of_topics,
                        id2word=dictionary)
    return (lsamodel.print_topics(num_topics=number_of_topics,
                                  num_words=words))
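This and several later examples call a prepare_corpus helper that the snippet itself does not show. A minimal sketch consistent with how it is called here (and with the definition that appears in Example #22 below):

from gensim import corpora

def prepare_corpus(doc_clean):
    # build a term dictionary and a bag-of-words document-term matrix
    # from a list of already-tokenized documents
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix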
Example #2
def lsi(clean_docs, model_name, topics):

    from gensim import corpora
    # turn all the documents into a dictionary mapping of normalized words to their integer ids
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document into a bag-of-words representation: a list of
    # (token_id, token_count) tuples recording how often each word occurs in that document
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialize: save the dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LSI model
    from gensim.models import LsiModel
    num_topics = topics  # find this number of topics in the data

    lsimodel = LsiModel(corpus, num_topics=num_topics, id2word=dictionary)
    lsimodel.save('lsi_model_' + model_name + '.gensim')
    topics = lsimodel.print_topics(num_words=5)

    for topic in topics:
        print(topic)
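Because the function serializes its artifacts, they can be reloaded later without re-tokenizing the source documents. A sketch, with the hypothetical name 'reviews' standing in for whatever model_name was passed:

from gensim import corpora
from gensim.corpora import MmCorpus
from gensim.models import LsiModel

# reload the persisted dictionary, corpus and model ('reviews' is illustrative)
dictionary = corpora.Dictionary.load('dictionary_reviews.gensim')
corpus = MmCorpus('corpus_reviews.mm')
lsimodel = LsiModel.load('lsi_model_reviews.gensim')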
Example #3
def create_gensim_lsa_model(doc_clean, number_of_topics, words):

    prepare_corpus = project2.initialize_terms_and_postings()
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    return lsamodel
Example #4
def run_on_time_period(start, stop):
    # create lists to hold data
    start_date = start
    stop_date = stop
    date_list = []
    raw_docs = []

    # run the data getter
    while start_date <= stop_date:
        timestamp = start_date.replace(tzinfo=timezone.utc).timestamp()
        doc = get_data(timestamp)
        raw_docs.append(doc)
        real_date = start_date - timedelta(days=1)
        date_list.append(real_date.date())
        start_date += timedelta(days=1)

    # strip the nicknames out of each raw document
    for i in range(len(raw_docs)):
        for name in nicknames:
            if name in raw_docs[i]:
                raw_docs[i] = raw_docs[i].replace(name, '')

    final_docs = preprocess(raw_docs)
    dictionary, doc_term_matrix = create_corpus(final_docs)  # renamed from `dict` to avoid shadowing the built-in
    # lsi_models, coherence_values = get_coherence_values(dictionary, doc_term_matrix, final_docs, 10, 1, 2)
    lsi_model = LsiModel(doc_term_matrix, num_topics=10, id2word=dictionary)
    print(lsi_model.print_topics(num_topics=5, num_words=5))
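A hypothetical invocation of the function above, assuming get_data, nicknames, preprocess and create_corpus are defined elsewhere in the original project:

from datetime import datetime

# sweep one month of data, one document per day
run_on_time_period(datetime(2021, 1, 1), datetime(2021, 1, 31))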
Example #5
def create_lsi(num_topic, dictionary):
    corpus, dic = generate_corpus(dictionary)
    print("__________________________Create LSI_________________________")
    lsimodel = LsiModel(corpus=corpus, num_topics=num_topic, id2word=dic)  # use the num_topic argument rather than a hard-coded 10
    topics = lsimodel.print_topics(num_topic)  # show the top num_topic topics
    # see list of topics
    for topic in topics:
        print(topic)

    return lsimodel
Example #6
def lsi(all_tokens_lists):
    dictionary = corpora.Dictionary(all_tokens_lists)
    corpus = [dictionary.doc2bow(text) for text in all_tokens_lists]
    tfidf = models.TfidfModel(corpus, smartirs='ntc')
    corpus_tfidf = tfidf[corpus]  # apply the TF-IDF weighting to the corpus
    lsi_model = LsiModel(corpus=corpus_tfidf,
                         id2word=dictionary,
                         num_topics=7,
                         decay=0.5)
    pprint(lsi_model.print_topics(-1, 10))
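The same transformation can fold unseen documents into the trained LSI space. A small sketch that could sit at the end of this function; the tokens are illustrative:

    # project an unseen (tokenized) document into the trained LSI space
    new_bow = dictionary.doc2bow(['some', 'new', 'tokens'])
    print(lsi_model[tfidf[new_bow]])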
Example #7
    def create_gensim_lsa_model(self, doc_clean, number_of_topics, words):
        """
        Input  : clean document, number of topics and number of words associated with each topic
        Purpose: create LSA model using gensim
        Output : return LSA model
        """
        dictionary, doc_term_matrix = self.prepare_corpus(doc_clean)
        # generate LSA model
        lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model
        print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
        return lsamodel
Example #8
def lsi_model(dictionary, corpus, corpus_tfidf, cluster_keyword_lsi):  # use the LSI model to get the topic distribution
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)
    f_keyword = open(cluster_keyword_lsi, 'w+',encoding='utf-8')
    for topic in lsi.print_topics(20, 20):
        print(topic[0])
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    f_keyword.close()
    return lsi
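Parsing the strings from print_topics() with split('+')/split('*'), as above, is fragile; show_topic() returns (word, weight) pairs directly. An equivalent keyword dump, assuming a trained model `lsi`:

# same keyword dump without string parsing
for topic_id in range(lsi.num_topics):
    words = [word for word, _ in lsi.show_topic(topic_id, topn=20)]
    print(str(topic_id) + '\t' + ','.join(words))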
Example #9
    def run(self, input_file_path: str, output_file_path: str,
            num_topics: int) -> bool:
        """execute."""
        # input
        input_df = pd.read_csv(input_file_path, header=None,
                               dtype=np.float64).fillna(0.0)

        # get label column
        label_column_name = input_df.columns[0]
        label_df = pd.DataFrame(input_df.iloc[:, 0],
                                columns=[label_column_name],
                                dtype=np.int64)
        input_df = input_df.drop(label_column_name, axis=1)

        # convert to dictionary
        input_df_column_names = input_df.columns.T.tolist()
        dict_values = input_df.T.to_dict().values()

        # make corpus
        corpus = []
        for row in dict_values:
            tmp = []
            for word, score in row.items():
                tmp.append((input_df_column_names.index(word), score))

            corpus.append(tmp)

        # lsi
        lsi_model = LsiModel(corpus, num_topics=num_topics)
        all_topics = lsi_model.print_topics(num_topics)

        # convert to list
        corpus_lsi = lsi_model[corpus]
        data = []
        for doc in corpus_lsi:
            tmp = []
            for i in range(len(all_topics)):
                try:
                    tmp.append(doc[i][1])
                except IndexError:
                    tmp.append(0.0)

            data.append(tmp)
        # convert to dataframe
        lsi_df = pd.DataFrame(data, dtype=np.float64)

        # join svd and label column
        output_df = pd.concat([label_df, lsi_df], axis=1)

        # save
        output_df.to_csv(output_file_path, index=False, header=False)

        return True
Example #10
    def __create_lsa_model(self, clean_doc, num_topics, words):

        dictionary, corpus_tfidf = self.__prepare_corpus(clean_doc)

        # generate LSA model
        lsamodel = LsiModel(corpus_tfidf,
                            num_topics=num_topics,
                            id2word=dictionary)

        print("Topics: ",
              lsamodel.print_topics(num_topics=num_topics, num_words=words))

        return lsamodel
Example #11
def generate_lsa_model(documents, n_topics):

    dictionary, document_term_matrix = prepare_corpus(documents)

    print('Generating the LSA model...')

    lsaModel = LsiModel(document_term_matrix,
                        id2word=dictionary,
                        num_topics=n_topics)
    print(lsaModel.print_topics(num_topics=n_topics, num_words=10))
    print('Done!')

    return lsaModel
Example #12
def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf, cluster_keyword_lsi, target_lt, num_cluster=11):
    '''Use the LSI model to get the topic distribution'''
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_cluster)
    f_keyword = open(cluster_keyword_lsi, 'w+')
    for topic in lsi.print_topics(num_cluster, 50):
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    f_keyword.close()
    corpus_lsi = lsi[corpus_tfidf]
    write_results("./results_lsi.txt", corpus_lsi, target_lt)
    return lsi
Example #13
def lsi_model(sentence_dict, dictionary, corpus, corpus_tfidf,
              cluster_keyword_lsi):
    '''
    Obtain topic distribution by using LSI model
    :param sentence_dict: sentence dictionary
    :param dictionary: corpus dictionary
    :param corpus: corpus in document term matrix
    :param corpus_tfidf: TF-IDF model of corpus
    :param cluster_keyword_lsi: LSI method name
    :return: None
    '''
    lsi = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=10)
    f_keyword = open(cluster_keyword_lsi, 'w+')
    for topic in lsi.print_topics(num_topics=10, num_words=50):
        words = []
        for word in topic[1].split('+'):
            word = word.split('*')[1].replace(' ', '')
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    f_keyword.close()
Example #14
    def LSAmodel(words, num_topics=5, num_words=5):

        dictionary = corpora.Dictionary(words)
        # Term Document Frequency
        corpus = [dictionary.doc2bow(word) for word in words]
        # save it!
        pickle.dump(corpus, open('corpus.pkl', 'wb'))
        dictionary.save('dictionary.gensim')
        # Train model
        lsimodel = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        # print_topics(num_topics=20, num_words=10)
        topics = lsimodel.print_topics(num_topics=num_topics, num_words=num_words)
        # Validation
        # note: LsiModel has no log_perplexity() method (that belongs to LdaModel),
        # so no perplexity is reported here
        val_perplexity = None
        # coherence score (c_v): a measure of topic quality, higher is better
        coherence_lsimodel = CoherenceModel(model=lsimodel, texts=words, dictionary=dictionary, coherence='c_v')
        val_coherence = coherence_lsimodel.get_coherence()

        return topics, val_perplexity, val_coherence
Example #15
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix,
                        num_topics=number_of_topics,
                        id2word=dictionary)  # train model
    print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))

    # sanity check: report topic coherence on the training texts
    coherencemodel = CoherenceModel(model=lsamodel,
                                    texts=doc_clean,
                                    dictionary=dictionary,
                                    coherence='c_v')
    print(coherencemodel.get_coherence())

    return lsamodel
Example #16
    def getLSATopics(self,
                     doc,
                     number_of_topics,
                     chunk=2000,
                     gram=(1, 2),
                     option='c'):
        dictionary, doc_term_matrix = self.prepare_corpus(doc, gram, option)
        # generate LSA model
        lsa = LsiModel(doc_term_matrix,
                       num_topics=number_of_topics,
                       id2word=dictionary,
                       chunksize=chunk)  # train model
        display(lsa.print_topics())
        # Let's take a look at which topics each transcript contains
        corpus_transformed = lsa[doc_term_matrix]
        # transform the result into a numpy array to get the score for each title
        all_topics_csr = matutils.corpus2csc(corpus_transformed)
        all_topics_numpy = all_topics_csr.T.toarray()
        Lsa_Topic = pd.DataFrame(all_topics_numpy, doc)  # index the scores by document
        display(Lsa_Topic.head(5))
        print('shape ', Lsa_Topic.shape)
        return Lsa_Topic
Example #17
    num_topics = 2 + int(len(corpus) / 250)
    if num_topics >= 20:
        num_topics = 10
    num_words = (num_topics - 2) * 2 + 10
    print('This department has %d articles in total; splitting them into %d topics with %d keywords......' %
          (len(corpus), num_topics, num_words))
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
    # result = ldamodel.print_topics(num_topics=num_topics, num_words=num_words)
    # doc_lda = ldamodel[corpus]
    model = LsiModel(
        corpus,
        id2word=dictionary,
        num_topics=num_topics,
    )
    doc_lda = model[corpus]
    result = model.print_topics(num_topics=num_topics, num_words=num_words)
    time2 = time.time()
    print('Model training time:', time2 - time1)
    print('LSI model training complete. Inserting into the database......')

    for n in range(len(doc_lda)):
        Topic = doc_lda[n]
        if len(Topic) == 0:
            prams = (institution_paper_list[n][0], institution + "other",
                     json.dumps({}, ensure_ascii=False),
                     json.dumps({}, ensure_ascii=False))
            sql = 'insert into lda2 values(%s,%s,%s,%s)'
            result = dbs.exe_sql(sql, prams)  # renamed from `list` to avoid shadowing the built-in
            continue
        c1 = sorted(Topic, key=lambda x: x[1], reverse=True)
Example #18
def LSI(doc_term_matrix):
    # Run and train the LSI model on the document-term matrix
    # (`dictionary` is assumed to be defined in the enclosing scope).
    lsimodel = LsiModel(doc_term_matrix, num_topics=25, id2word=dictionary, decay=0.5)
    lsimodel.save("lsimodel")
    pprint(lsimodel.print_topics(-1))
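The saved model can be restored later; LsiModel.load is the counterpart to save:

# reload the persisted model in a later session
lsimodel = LsiModel.load("lsimodel")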
Example #19
#corpus_lsi = lsi[corpus_tfidf]
lsi = LsiModel(corpus, id2word=dictionary,
               num_topics=5)  # initialize an LSI transformation
corpus_lsi = lsi[corpus]

#affection_lsi = tfidf[corpus[0]]
#affection_lsi = lsi[tfidf[corpus[0]]]
numb_lsi = lsi[corpus[3]]

#print corpus_tfidf[0]
#print corpus_lsi[0]
#print affection_lsi

index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[numb_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

sims = sims[:5]
for doc_id, sim in sims:
    print(doc_id, document_song_mapping[doc_id], sim)

lsi.print_topics(5)
#print affection_lsi
#print lsi[corpus[15]]
#print sims
#
#print index
#sims = index[tfidf[corpus[0]]]
#print(list(enumerate(sims)))
#for word_id, weight in affection:
#    print inverted_dict[word_id], weight
Example #20
start, stop, step = 0, 12, 1
model_list = []
coherence_values = []

for num_topics in range(start, stop, step):
    # generate and train an LSA model for this topic count
    # (the original hard-coded num_topics=12 here, defeating the sweep)
    model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)

    model_list.append(model)
    coherencemodel = CoherenceModel(model=model,
                                    texts=texts,
                                    dictionary=dictionary,
                                    coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

x = range(start, stop, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.legend(["coherence_values"], loc='best')
plt.show()
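One way to use the sweep is to pick the topic count with the highest coherence; a simple heuristic reusing the lists computed above:

# choose the topic count that maximized coherence
best_index = coherence_values.index(max(coherence_values))
best_num_topics = list(range(start, stop, step))[best_index]
print("Best number of topics:", best_num_topics)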

number_of_topics = 3
words = 15

model = LsiModel(doc_term_matrix,
                 num_topics=number_of_topics,
                 id2word=dictionary)
topics = model.print_topics(num_topics=number_of_topics, num_words=words)
print(topics)
Example #22
def prepare_corpus(doc_clean):
    """
    Input  : cleaned document list
    Purpose: create a term dictionary for our corpus and convert the list of documents (corpus) into a Document-Term Matrix
    Output : term dictionary and Document-Term Matrix
    """
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix

# LSA - Topic Modelling
## Applying the model to the LONGIT corpus
number_of_topics = 1

words = 100

document_list,titles = load_data("",'./corpus_files/prod_all_txt/corpus_longit.csv')

clean_text = preprocess_data(document_list)

dictionary,doc_term_matrix = prepare_corpus(clean_text)

lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model

print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))

output_file = open('./corpus_files/tm_csv/topic_modelling.csv', mode='w', encoding='utf8')

output_file.write("Topic modelling of the LONGIT corpus: " + str(lsamodel.print_topics(num_topics=number_of_topics, num_words=words)))

output_file.close()
Example #23
data_lemmatized = make_bigrams(data_words)
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
lda_model = LdaModel.load('lda_model_full2')

for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])
    print("Word id, Topics      : ", c[1][:3])
    print("Phi Values (word id) : ", c[2][:2])
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])
    print("------------------------------------------------------\n")

lsi_model = LsiModel(corpus=corpus, id2word=id2word, num_topics=7, decay=0.5)
pprint(lsi_model.print_topics(-1))


def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]), ignore_index=True)
            else:
                break
    return sent_topics_df
Example #24
bow_corpus = [dictionary.doc2bow(text) for text in texts]

word_counts = [[(dictionary[id], count) for id, count in line]
               for line in bow_corpus]

tfidf = models.TfidfModel(bow_corpus)
doc = tfidf[bow_corpus[2]]

corpus_tfidf = tfidf[bow_corpus]

lsi_model = LsiModel(corpus_tfidf,
                     id2word=dictionary,
                     num_topics=250,
                     decay=0.5)  # initialize an LSI transformation
corpus_lsi = lsi_model[corpus_tfidf]
topics = lsi_model.print_topics(5)
for topic in topics:
    print(topic)


def compute_coherence_values(dictionary,
                             doc_term_matrix,
                             doc_clean,
                             stop,
                             start=2,
                             step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate an LSA model for this topic count
        model = LsiModel(doc_term_matrix,
                         num_topics=num_topics,
                         id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=doc_clean,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
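A hypothetical call to the helper above, reusing the dictionary, TF-IDF corpus and tokenized texts from earlier in this example:

# sweep topic counts from 2 up to (but not including) 12, in steps of 3
model_list, coherence_values = compute_coherence_values(
    dictionary, corpus_tfidf, texts, stop=12)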
Example #25
# Dict.txt - a file with the cleaned review text for later processing

mydict = corpora.Dictionary(
    simple_preprocess(line, deacc=True)
    for line in open('Dict.txt', encoding='utf-8'))
#print(mydict)
corpus = [
    mydict.doc2bow(simple_preprocess(line))
    for line in open('Dict.txt', encoding='utf-8')
]
#print (corpus)

lsamodel = LsiModel(corpus, num_topics=number_of_topics,
                    id2word=mydict)  # train model
# print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
tx = lsamodel.print_topics()
pprint(
    tx, open("Topic.txt", "w")
)  # Topic.txt - a file with the main themes (parameters) from reviews with a 5.0 rating

# ----------------------------------create vectors with word's index and tf-idf recycling------------------------------
with open("Dict.txt", "r") as file:
    documents = file.read().splitlines()
# print(documents)
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(documents)
feature_names = count_vectorizer.get_feature_names()
pprint(pd.DataFrame(bag_of_words.toarray(), columns=feature_names),
       open("matrix.txt", "w"))
Example #26
from utils import generate_timestamp

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)

print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
Example #27
def lsi2(corpus_tfidf, dictionary):
    lsi_model = LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=30, decay=0.5)
    for topic in lsi_model.print_topics(5, 10):
        print(topic)
Example #28
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)

# load the dictionary and bag-of-words corpus
dictionary_en = corpora.Dictionary.load(r"Data\Intermediate\dict_en.dict")
corpus_en = corpora.BleiCorpus(r"Data\Intermediate\corpus_en.blei")

# TF-IDF model
tfidf = TfidfModel(corpus_en)
tfidf.save(r"Data\Output\tfidf_model_en")

# compute the TF-IDF-weighted corpus with the model
tfidf_corpus = tfidf[corpus_en]
corpora.BleiCorpus.serialize(r"Data\Intermediate\tfidf_corpus_en.blei", tfidf_corpus)

# lsi
lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary_en, num_topics=20)
lsi_corpus = lsi[tfidf_corpus]
lsi.save(r"Data\Output\lsi_model_en")
corpora.BleiCorpus.serialize(r"Data\Intermediate\lsi_corpus_en.blei", lsi_corpus)
lsi.print_topics()

# lda
lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary_en, num_topics=20)
lda_corpus = lda[tfidf_corpus]
lda.save(r"Data\Output\lda_model_en")
corpora.BleiCorpus.serialize(r"Data\Intermediate\lda_corpus_en.blei", lda_corpus)
lda.print_topics()
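The serialized corpora and models can later be streamed back without retraining; a sketch using the paths above:

# reload the LSI corpus and model in a later session
lsi_corpus_loaded = corpora.BleiCorpus(r"Data\Intermediate\lsi_corpus_en.blei")
lsi_loaded = LsiModel.load(r"Data\Output\lsi_model_en")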
Example #29
    print("TOPIC (LSI) " + str(topic_id) + " : ", topic)

print('#' * 50)
print(lsi_model.num_topics)
for i in range(0, lsi_model.num_topics - 1):
    if lsi_model.print_topic(i):
        print(lsi_model.print_topic(i))

corpus_tfidf = tfidf_model[corpus]
corpus_lsi = lsi_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
corpus_lsi_2 = lsi_model_2[corpus]
print('Model creation complete')

print('*' * 10, lsi_model_2.print_topics(5))

topic_id = 0
for topic in lsi_model_2.show_topics():
    print("TOPIC (LSI2) ", str(topic_id), " : ", topic)
    group_topic = [doc for doc in corpus_lsi_2 if doc[topic_id][1] > 0.5]
    print(str(group_topic))
    topic_id += 1

print("文档加工 " + str(lsi_model_2.docs_processed))

for doc in corpus_lsi_2:  # both the bow->tfidf and tfidf->lsi transformations actually run here, lazily
    print("Doc " + str(doc))

# saving the model
#corpus.dictionary.save("dictionary.dump")
Example #30
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


document_num = 30
bow_doc_x = bow_corpus[document_num]
print(bow_corpus[10])
#
# for i in range(len(bow_doc_x)):
#     print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
#                                                      dictionary[bow_doc_x[i][0]],
#                                                      bow_doc_x[i][1]))
#
#
lsamodel = LsiModel(bow_corpus, num_topics=7, id2word=dictionary)  # train model
print(lsamodel.print_topics(num_topics=7, num_words=10))
for idx, topic in lsamodel.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")


from gensim.test.utils import datapath

# Save model to disk.
temp_file = datapath("lsa_model_optimized")
lsamodel.save(temp_file)

df_test_jokes = pd.read_csv("JokeText.csv")

# Load a potentially pretrained model from disk.
if False:
    lsamodel = gensim.models.LsiModel.load(temp_file)
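A sketch of scoring one joke from the loaded CSV in the trained LSA space. The column name "JokeText" and the plain .split() tokenization are assumptions standing in for whatever preprocessing produced processed_docs:

# project a single (crudely tokenized) joke into the topic space
new_doc = df_test_jokes["JokeText"].iloc[0].lower().split()
print(lsamodel[dictionary.doc2bow(new_doc)])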