import pickle
from typing import List, Tuple

import gensim
import pandas as pd


def predict(sentences: List[str]):
    # Load the persisted dictionary and trained LDA model (written by train()).
    with open('data/models/dictionary.pkl', 'rb') as input_file:
        dictionary = pickle.load(input_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(
        "data/models/lda_model.pkl")

    data_lemmatized = tokenize(sentences)
    corpus = term_document_matrix(data_lemmatized, dictionary)

    corpus_topics = []
    for bow in corpus:
        # With per_word_topics=True, lda_model[bow] yields
        # (topic_distribution, word_topics, phi_values); keep the distribution.
        bow_topics = lda_model[bow][0]
        topics = []
        for topic_id, prob in bow_topics:
            topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
            topics.append({
                'id': str(topic_id),
                'prob': str(prob),
                'words': topic_words
            })
        corpus_topics.append({'bow': bow, 'topics': topics})
    return corpus_topics
def lda_model(raw_file: pd.DataFrame) -> Tuple[gensim.models.LdaModel,
                                               gensim.models.CoherenceModel,
                                               float]:
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word = create_dictionary(lemma)
    corpus = [id2word.doc2bow(text) for text in lemma]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    coherence = coherence_model(lda_model, lemma, id2word)
    # Use a distinct name so the perplexity() helper is not shadowed by a
    # local variable (which would raise UnboundLocalError at the call).
    perplexity_score = perplexity(lda_model, corpus)
    return lda_model, coherence, perplexity_score
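# coherence_model() and perplexity() are not defined in this section. Below is
# a minimal sketch of what they might look like, assuming the signatures
# implied by the call sites above; it uses gensim's CoherenceModel and
# LdaModel.log_perplexity, but the helper names and arguments are assumptions.
def coherence_model(model, lemma, id2word) -> gensim.models.CoherenceModel:
    # c_v coherence computed against the lemmatized texts.
    return gensim.models.CoherenceModel(model=model,
                                        texts=lemma,
                                        dictionary=id2word,
                                        coherence='c_v')


def perplexity(model, corpus) -> float:
    # log_perplexity returns a per-word log-likelihood bound; gensim's usual
    # perplexity estimate is 2 ** (-bound).
    return model.log_perplexity(corpus)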
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model
def test(model, text: str):
    text = pd.DataFrame(data={'content': [text]}, columns=['content'])
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    prediction = model[corpus]
    # With per_word_topics=True, prediction[0] is
    # (topic_distribution, word_topics, phi_values); word_topics maps each
    # word id to the topic ids it was assigned to.
    topics = list()
    for _word_id, word_topic_ids in prediction[0][1]:
        for topic in word_topic_ids:
            topics.append(topic)
    # moda: the modal (most frequent) topic id across all words.
    moda = max(set(topics), key=topics.count)
    # print_topics() returns (topic_id, formatted words) pairs in id order
    # when it covers all topics, so indexing by the modal id is valid here.
    topics = model.print_topics()[moda]
    return topics
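# Hypothetical usage of test(): load a model previously pickled by one of the
# lda_model() variants below (the path is the one they write to) and classify
# a single document. The function name and sample sentence are illustrative.
def example_test():
    with open("models/model.pkl", "rb") as input_file:
        model = pickle.load(input_file)
    return test(model, "The economy grew faster than analysts expected.")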
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    modelo = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=20,
                                             random_state=100,
                                             update_every=1,
                                             chunksize=100,
                                             passes=10,
                                             alpha='auto',
                                             per_word_topics=True)
    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(modelo, output_file)
    # Return the trained model object, not the enclosing lda_model function.
    return modelo
def lda_model(text: pd.DataFrame) -> gensim.models.LdaModel:
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=5,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(lda_model, output_file)
    return lda_model
def train():
    df = read_sample()
    data = df_to_list(df)
    data_lemmatized = tokenize(data)

    # Persist the dictionary so predict() can rebuild bag-of-words vectors
    # for unseen sentences.
    dictionary = create_dictionary(data_lemmatized)
    with open('data/models/dictionary.pkl', 'wb') as output_file:
        pickle.dump(dictionary, output_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    lda_model.save("data/models/lda_model.pkl")
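# Hypothetical end-to-end run: train and persist the model, then score a few
# unseen sentences with predict(). The sample sentences are illustrative;
# train() reads its own corpus via read_sample().
if __name__ == '__main__':
    train()
    results = predict([
        "Stocks rallied after the central bank's announcement.",
        "The team won the championship in overtime.",
    ])
    for doc in results:
        for topic in doc['topics']:
            print(topic['id'], topic['prob'], topic['words'])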