Example #1
def train():
    with open(r"../data/interim/positive_words.pkl", "rb") as input_file:
        positive_docs = pickle.load(input_file)

    with open(r"../data/interim/negative_words.pkl", "rb") as input_file:
        negative_docs = pickle.load(input_file)

    # Flatten the per-document token lists into one token list per class
    negative_words = [item for sublist in negative_docs for item in sublist]
    positive_words = [item for sublist in positive_docs for item in sublist]

    dictionary = create_dictionary([negative_words, positive_words])

    negative_split = split_data(negative_words)
    positive_split = split_data(positive_words)

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    # Smoothing denominator: tokens in the class plus distinct words seen
    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    # Per-word log-probabilities with add-one (Laplace) smoothing;
    # word_id avoids shadowing the built-in id()
    negative_word_probs = {}
    for word_id, count in negative_bow:
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }
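The denominator above is the add-one (Laplace) smoothing total: tokens in the training split plus the number of distinct words in the bag of words. A toy illustration with invented values (not from the source data):

import numpy as np

train_tokens = ["bad", "awful", "bad"]   # 3 tokens in the class
bow = [(0, 2), (1, 1)]                   # doc2bow-style pairs: 2 distinct words
total = len(train_tokens) + len(bow)     # 3 + 2 = 5

for word_id, count in bow:
    # "bad": log(3/5), "awful": log(2/5)
    print(word_id, np.log((count + 1) / total))

# A word unseen in this class would score np.log(1 / total),
# which is exactly the -1 fallback entry added in Example #2.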
Example #2
def train():
    df = read_sample()
    document_classes = create_classes(df)

    word_classes = tokenize_classes(document_classes, False)

    negative_words = [
        item for sublist in word_classes['NEG'] for item in sublist
    ]
    positive_words = [
        item for sublist in word_classes['POS'] for item in sublist
    ]

    dictionary = create_dictionary([negative_words, positive_words])

    negative_split = split_data(negative_words, (1, 0.0, 0.0))
    positive_split = split_data(positive_words, (1, 0.0, 0.0))

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    # Smoothing denominator: tokens in the class plus distinct words seen
    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    # Class priors: log of each class's share of training tokens
    negative_prob = np.log(
        len(negative_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))
    positive_prob = np.log(
        len(positive_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))

    negative_word_probs = {}
    for word_id, count in negative_bow:
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    # Fallback entry (-1) for words never seen in the negative class
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }

    # Fallback entry (-1) for words never seen in the positive class
    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')
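A minimal sketch of how the pickled artifact might be scored at inference time; classify and the sample tokens are hypothetical, but the dictionary keys and the -1 unseen-word fallback match the structure built above:

import pickle

def classify(model: dict, tokens: list) -> str:
    # Hypothetical scorer: class prior plus per-token conditional
    # log-probabilities, falling back to the -1 entry for unseen words
    pos = model['POS_PROB'] + sum(
        model['COND_POS_PROBS'].get(t, model['COND_POS_PROBS'][-1])['logprob']
        for t in tokens)
    neg = model['NEG_PROB'] + sum(
        model['COND_NEG_PROBS'].get(t, model['COND_NEG_PROBS'][-1])['logprob']
        for t in tokens)
    return 'POS' if pos > neg else 'NEG'

with open("models/model.pkl", "rb") as input_file:
    model = pickle.load(input_file)
print(classify(model, ['great', 'film']))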
Example #3
def lda_model(raw_file: pd.DataFrame) -> list:  # [LdaModel, coherence, perplexity]
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word = create_dictionary(lemma)
    corpus = [id2word.doc2bow(text) for text in lemma]
    model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)
    coherence = coherence_model(model, lemma, id2word)
    # Assign to a fresh name: rebinding "perplexity" would make it a local
    # variable and raise UnboundLocalError on the call itself
    model_perplexity = perplexity(model, corpus)
    return [model, coherence, model_perplexity]
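The coherence_model and perplexity helpers are project-specific and not shown in these examples. One plausible shape for them, using gensim's standard CoherenceModel and log_perplexity APIs (a sketch under that assumption, not the source's actual implementation):

import gensim

def coherence_model(model, lemma, id2word):
    # c_v coherence over the lemmatized texts (assumed metric choice)
    cm = gensim.models.CoherenceModel(model=model, texts=lemma,
                                      dictionary=id2word, coherence='c_v')
    return cm.get_coherence()

def perplexity(model, corpus):
    # gensim's variational bound on per-word log-likelihood
    return model.log_perplexity(corpus)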
Example #4
def lda_model(raw_file: pd.DataFrame):
    doc       = clean_up_text(raw_file)
    lemma     = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model
Example #5
def test(model, text: str):
    # Wrap the raw string in a one-row DataFrame so the same preprocessing
    # helpers used at training time can be reused
    text = pd.DataFrame(data={'content': [text]}, columns=['content'])
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    prediction = model[corpus]

    # Collect the topic id assigned to each word (per_word_topics output)
    topics = list()
    for prob in prediction[0][1]:
        for topic in prob[1]:
            topics.append(topic)
    # moda: the most frequent (modal) topic across the word assignments
    moda = max(set(topics), key=topics.count)
    topics = model.print_topics()[moda]
    return topics
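Assuming an LDA model pickled to models/model.pkl, as Examples #6 and #7 do, the function could be exercised like this (the sample text is illustrative):

import pickle

with open("models/model.pkl", "rb") as input_file:
    model = pickle.load(input_file)

# Returns the (topic_id, terms) entry for the text's modal topic
print(test(model, "stocks rallied after the central bank meeting"))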
Example #6
def lda_model(raw_file: pd.DataFrame):
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    modelo = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=20,
                                             random_state=100,
                                             update_every=1,
                                             chunksize=100,
                                             passes=10,
                                             alpha='auto',
                                             per_word_topics=True)

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(modelo, output_file)

    # Return the trained model, not the enclosing lda_model function
    return modelo
Example #7
def lda_model(text: pd.DataFrame):
    doc = clean_up_text(text)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=5,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(lda_model, output_file)

    return lda_model
Example #8
def train():
    df = read_sample()
    data = df_to_list(df)
    data_lemmatized = tokenize(data)

    dictionary = create_dictionary(data_lemmatized)
    with open('data/models/dictionary.pkl', 'wb') as output_file:
        pickle.dump(dictionary, output_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    lda_model.save("data/models/lda_model.pkl")
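A minimal inference sketch against the artifacts written above; the new-document tokens are invented, while LdaModel.load and get_document_topics are standard gensim calls:

import pickle
import gensim

lda_model = gensim.models.ldamodel.LdaModel.load("data/models/lda_model.pkl")
with open('data/models/dictionary.pkl', 'rb') as input_file:
    dictionary = pickle.load(input_file)

# Hypothetical new document, lemmatized the same way as the training data
bow = dictionary.doc2bow(["economy", "market", "growth"])
print(lda_model.get_document_topics(bow))  # [(topic_id, probability), ...]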
Example #9
def train():
    df = read_sample()
    logging.info('Source data file read successfully')

    document_classes = create_classes(df)
    logging.info('Documents split between different classes.')

    logging.info('Tokenization started. This may take some time.')
    word_classes = tokenize_classes(document_classes, False)
    logging.info('Tokenization completed for all documents.')

    negative_words = [
        item for sublist in word_classes['NEG'] for item in sublist
    ]
    positive_words = [
        item for sublist in word_classes['POS'] for item in sublist
    ]

    dictionary = create_dictionary([negative_words, positive_words])
    logging.info('Dictionary generated from all document words.')

    negative_split = split_data(negative_words)
    positive_split = split_data(positive_words)

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    logging.info('Bag-of-words counts computed for documents in all classes')

    # Smoothing denominator: tokens in the class plus distinct words seen
    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    # Class priors: log of each class's share of training tokens
    negative_prob = np.log(
        len(negative_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))
    positive_prob = np.log(
        len(positive_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))

    negative_word_probs = {}
    for word_id, count in negative_bow:
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    # Fallback entry (-1) for words never seen in the negative class
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }

    # Fallback entry (-1) for words never seen in the positive class
    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    logging.info('Log-probabilities for tokens in all classes computed')
    base_path = os.path.dirname(os.path.abspath(__file__))
    with open(base_path + "/../../models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')