import pickle
from typing import List, Tuple

import gensim
import pandas as pd


def predict(sentences: List[str]):
    # Load the persisted dictionary and trained LDA model (written by train()).
    with open('data/models/dictionary.pkl', 'rb') as input_file:
        dictionary = pickle.load(input_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(
        "data/models/lda_model.pkl")

    data_lemmatized = tokenize(sentences)
    corpus = term_document_matrix(data_lemmatized, dictionary)

    corpus_topics = []
    for bow in corpus:
        # With per_word_topics=True, lda_model[bow] yields
        # (topic_distribution, word_topics, phi_values); keep the distribution.
        bow_topics = lda_model[bow][0]
        topics = []
        for topic_id, prob in bow_topics:
            topic_words = [word for word, _ in lda_model.show_topic(topic_id)]
            topics.append({
                'id': str(topic_id),
                'prob': str(prob),
                'words': topic_words
            })
        corpus_topics.append({'bow': bow, 'topics': topics})
    return corpus_topics
def lda_model(raw_file: pd.DataFrame) -> Tuple[gensim.models.LdaModel,
                                               gensim.models.CoherenceModel,
                                               float]:
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word = create_dictionary(lemma)
    corpus = [id2word.doc2bow(text) for text in lemma]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    coherence = coherence_model(lda_model, lemma, id2word)
    # Use a distinct name so the perplexity() helper is not shadowed by a
    # local variable (which would raise UnboundLocalError at the call).
    perplexity_score = perplexity(lda_model, corpus)
    return lda_model, coherence, perplexity_score
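# coherence_model() and perplexity() are not defined in this section. Below is
# a minimal sketch of what they might look like, assuming the signatures
# implied by the call sites above; it uses gensim's CoherenceModel and
# LdaModel.log_perplexity, but the helper names and arguments are assumptions.
def coherence_model(model, lemma, id2word) -> gensim.models.CoherenceModel:
    # c_v coherence computed against the lemmatized texts.
    return gensim.models.CoherenceModel(model=model,
                                        texts=lemma,
                                        dictionary=id2word,
                                        coherence='c_v')


def perplexity(model, corpus) -> float:
    # log_perplexity returns a per-word log-likelihood bound; gensim's usual
    # perplexity estimate is 2 ** (-bound).
    return model.log_perplexity(corpus)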
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model
def test(model, text: str):
    text = pd.DataFrame(data={'content': [text]}, columns=['content'])
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    prediction = model[corpus]
    # With per_word_topics=True, prediction[0] is
    # (topic_distribution, word_topics, phi_values); word_topics maps each
    # word id to the topic ids it was assigned to.
    topics = list()
    for _word_id, word_topic_ids in prediction[0][1]:
        for topic in word_topic_ids:
            topics.append(topic)
    # moda: the modal (most frequent) topic id across all words.
    moda = max(set(topics), key=topics.count)
    # print_topics() returns (topic_id, formatted words) pairs in id order
    # when it covers all topics, so indexing by the modal id is valid here.
    topics = model.print_topics()[moda]
    return topics
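# Hypothetical usage of test(): load a model previously pickled by one of the
# lda_model() variants below (the path is the one they write to) and classify
# a single document. The function name and sample sentence are illustrative.
def example_test():
    with open("models/model.pkl", "rb") as input_file:
        model = pickle.load(input_file)
    return test(model, "The economy grew faster than analysts expected.")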
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    modelo = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=20,
                                             random_state=100,
                                             update_every=1,
                                             chunksize=100,
                                             passes=10,
                                             alpha='auto',
                                             per_word_topics=True)
    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(modelo, output_file)
    # Return the trained model object, not the enclosing lda_model function.
    return modelo
def lda_model(text: pd.DataFrame) -> gensim.models.LdaModel:
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=5,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(lda_model, output_file)
    return lda_model
def train():
    df = read_sample()
    data = df_to_list(df)
    data_lemmatized = tokenize(data)

    # Persist the dictionary so predict() can rebuild bag-of-words vectors
    # for unseen sentences.
    dictionary = create_dictionary(data_lemmatized)
    with open('data/models/dictionary.pkl', 'wb') as output_file:
        pickle.dump(dictionary, output_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    lda_model.save("data/models/lda_model.pkl")
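# Hypothetical end-to-end run: train and persist the model, then score a few
# unseen sentences with predict(). The sample sentences are illustrative;
# train() reads its own corpus via read_sample().
if __name__ == '__main__':
    train()
    results = predict([
        "Stocks rallied after the central bank's announcement.",
        "The team won the championship in overtime.",
    ])
    for doc in results:
        for topic in doc['topics']:
            print(topic['id'], topic['prob'], topic['words'])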