import pickle
from typing import List

import gensim


def predict(sentences: List[str]):
    # Load the dictionary and trained LDA model persisted by train().
    with open('data/models/dictionary.pkl', 'rb') as input_file:
        dictionary = pickle.load(input_file)

    lda_model = gensim.models.ldamodel.LdaModel.load(
        "data/models/lda_model.pkl")

    # Preprocess the sentences and map them to bag-of-words vectors.
    data_lemmatized = tokenize(sentences)
    corpus = term_document_matrix(data_lemmatized, dictionary)

    corpus_topics = []
    for bow in corpus:
        # Because the model was trained with per_word_topics=True,
        # lda_model[bow] returns (topic_distribution, word_topics,
        # phi_values); index 0 is the document's topic distribution.
        bow_topics = lda_model[bow][0]
        topics = []
        for topic_id, prob in bow_topics:
            # Collect the top words of each topic assigned to this document.
            topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
            topics.append({
                'id': str(topic_id),
                'prob': str(prob),
                'words': topic_words
            })
        corpus_topics.append({'bow': bow, 'topics': topics})

    return corpus_topics
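These snippets rely on helpers such as tokenize and term_document_matrix that are not shown. A minimal sketch of what they might look like, assuming gensim's simple_preprocess for tokenization (the originals likely also lemmatize and remove stopwords, as the name data_lemmatized suggests):

# Hypothetical helpers matching the call sites above; illustration only.
from typing import List

import gensim
from gensim.corpora import Dictionary


def tokenize(sentences: List[str]) -> List[List[str]]:
    # Lowercase, strip punctuation/accents, and split into tokens.
    return [gensim.utils.simple_preprocess(s, deacc=True) for s in sentences]


def term_document_matrix(texts: List[List[str]],
                         dictionary: Dictionary) -> List[list]:
    # Map each tokenized document to a list of (token_id, count) pairs.
    return [dictionary.doc2bow(text) for text in texts]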
Example #2
from typing import Tuple

import gensim
import pandas as pd


def lda_model(raw_file: pd.DataFrame) -> Tuple[gensim.models.ldamodel.LdaModel,
                                               gensim.models.CoherenceModel, float]:
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word = create_dictionary(lemma)
    corpus = [id2word.doc2bow(text) for text in lemma]
    model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                            num_topics=20, random_state=100,
                                            update_every=1, chunksize=100,
                                            passes=10, alpha='auto',
                                            per_word_topics=True)
    coherence = coherence_model(model, lemma, id2word)
    perplexity_score = perplexity(model, corpus)  # don't shadow the perplexity() helper
    return model, coherence, perplexity_score
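Example #2 also calls coherence_model and perplexity helpers that are not shown. A plausible sketch based on the call sites, using gensim's CoherenceModel and LdaModel.log_perplexity (the names, signatures, and the c_v metric are assumptions):

# Hypothetical helpers inferred from the call sites above.
from typing import List

import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel


def coherence_model(model: gensim.models.ldamodel.LdaModel,
                    texts: List[List[str]],
                    id2word: Dictionary) -> CoherenceModel:
    # c_v coherence scores each topic's words against the tokenized texts.
    return CoherenceModel(model=model, texts=texts,
                          dictionary=id2word, coherence='c_v')


def perplexity(model: gensim.models.ldamodel.LdaModel, corpus: list) -> float:
    # Per-word likelihood bound; lower values indicate higher perplexity.
    return model.log_perplexity(corpus)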
Example #3
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model
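Here create_dictionary returns both the dictionary and the bag-of-words corpus (Examples #4 through #6 use the same signature), unlike Example #2, where it returns only the dictionary. A minimal sketch consistent with this call site:

# Hypothetical helper, inferred from `id2word, corpus = create_dictionary(lemma)`.
from typing import List, Tuple

from gensim.corpora import Dictionary


def create_dictionary(texts: List[List[str]]) -> Tuple[Dictionary, List[list]]:
    # Build the token -> id mapping, then vectorize every document against it.
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    return id2word, corpus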
Example #4
import pandas as pd


def test(model, text: str):
    # Wrap the raw string in a one-row DataFrame so the training-time
    # preprocessing pipeline can be reused unchanged.
    text = pd.DataFrame(data={'content': [text]}, columns=['content'])
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    # Vectorize against the model's own dictionary; a freshly built
    # dictionary would produce token ids the model has never seen.
    corpus = [model.id2word.doc2bow(t) for t in lemma]
    prediction = model[corpus]

    # With per_word_topics=True, prediction[0] is (topic_distribution,
    # word_topics, phi_values); word_topics pairs each word id with its
    # most likely topic ids.
    topics = list()
    for word_topics in prediction[0][1]:
        for topic in word_topics[1]:
            topics.append(topic)
    # The modal topic across all words stands in for the document's topic.
    moda = max(set(topics), key=topics.count)
    # print_topics() lists all topics in id order for models with at most
    # 20 topics, so indexing by topic id works here.
    return model.print_topics()[moda]
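A usage sketch combining Examples #3 and #4; the CSV path, column name, and sample sentence are invented for illustration:

# Illustrative only: path, column layout, and text are assumptions.
import pandas as pd

raw_file = pd.read_csv('data/articles.csv')  # hypothetical corpus with a 'content' column
model = lda_model(raw_file)                  # train as in Example #3
# Prints (topic_id, 'weight*"word" + ...') for the text's modal topic.
print(test(model, "The central bank raised interest rates again."))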
Example #5
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    modelo = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=20,
                                             random_state=100,
                                             update_every=1,
                                             chunksize=100,
                                             passes=10,
                                             alpha='auto',
                                             per_word_topics=True)

    # Persist the trained model so it can be reloaded without retraining.
    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(modelo, output_file)

    return modelo
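Examples #2 through #6 all start with clean_up_text on a DataFrame, and Example #4 shows the input carries a 'content' column. A hypothetical sketch of such a helper (the original may clean more aggressively):

# Hypothetical helper; input shape inferred from Example #4.
import re
from typing import List

import pandas as pd


def clean_up_text(df: pd.DataFrame) -> List[str]:
    docs = df['content'].astype(str).tolist()
    # Strip email addresses, collapse whitespace, and drop stray quotes.
    docs = [re.sub(r'\S*@\S*\s?', '', d) for d in docs]
    docs = [re.sub(r'\s+', ' ', d) for d in docs]
    docs = [re.sub(r"'", '', d) for d in docs]
    return docs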
Example #6
def lda_model(text: pd.DataFrame) -> gensim.models.ldamodel.LdaModel:
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=5,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(lda_model, output_file)

    return lda_model
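Examples #5 and #6 differ only in num_topics (20 vs. 5). One common way to pick this value, sketched here purely as an illustration, is to sweep candidates and compare c_v coherence scores:

# Illustrative sweep; the function name and candidate values are made up.
import gensim
from gensim.models import CoherenceModel


def best_num_topics(corpus, id2word, texts, candidates=(5, 10, 20)):
    scores = {}
    for k in candidates:
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,
                                                num_topics=k, random_state=100,
                                                passes=10, alpha='auto')
        cm = CoherenceModel(model=model, texts=texts,
                            dictionary=id2word, coherence='c_v')
        scores[k] = cm.get_coherence()
    # Higher c_v coherence is better.
    return max(scores, key=scores.get)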
Example #7
import pickle

import gensim


def train():
    # Load the training sample and preprocess it.
    df = read_sample()
    data = df_to_list(df)
    data_lemmatized = tokenize(data)

    # Persist the dictionary; predict() (Example #1) reloads it to
    # vectorize unseen sentences consistently.
    dictionary = create_dictionary(data_lemmatized)
    with open('data/models/dictionary.pkl', 'wb') as output_file:
        pickle.dump(dictionary, output_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Save with gensim's native serialization; predict() loads it back
    # with LdaModel.load().
    lda_model.save("data/models/lda_model.pkl")
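Finally, a round-trip sketch pairing train() with predict() from Example #1, assuming the data-loading helpers (read_sample, df_to_list), the helper sketches above, and an existing data/models/ directory:

# Illustrative round trip: train, persist, then score an unseen sentence.
train()
results = predict(["Stock markets rallied after the announcement."])
for doc in results:
    for topic in doc['topics']:
        print(topic['id'], topic['prob'], topic['words'][:5])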