Example 1
import logging
import pickle

import numpy as np

# read_sample, create_classes, tokenize_classes, create_dictionary and
# split_data are project helpers; Example 3 imports some of them from
# src.data.prepare_data and src.features.tokenize.

def train():
    df = read_sample()
    document_classes = create_classes(df)

    word_classes = tokenize_classes(document_classes, False)

    negative_words = [
        item for sublist in word_classes['NEG'] for item in sublist
    ]
    positive_words = [
        item for sublist in word_classes['POS'] for item in sublist
    ]

    dictionary = create_dictionary([negative_words, positive_words])

    # Split fractions, presumably (train, dev, test): keep everything in train.
    negative_split = split_data(negative_words, (1, 0.0, 0.0))
    positive_split = split_data(positive_words, (1, 0.0, 0.0))

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    # Add-one smoothing denominators: class word count plus the number of
    # distinct words observed in that class.
    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    # Class log-priors from the relative token counts.
    negative_prob = np.log(
        len(negative_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))
    positive_prob = np.log(
        len(positive_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))

    negative_word_probs = {}
    for word_id, count in negative_bow:  # avoid shadowing the builtin id()
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    # Smoothed fallback for words unseen in the negative training data.
    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }

    # Smoothed fallback for words unseen in the positive training data.
    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')
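A minimal sketch of how the saved artifact might be consumed at prediction time. Only the model dict layout is taken from the example above; the score helper and the sample tokens are hypothetical:

import pickle


def score(tokens, model):
    # Sum the class log-prior and the per-word conditional log-probabilities,
    # falling back to the smoothed -1 entry for out-of-vocabulary words.
    pos = model['POS_PROB'] + sum(
        model['COND_POS_PROBS'].get(t, model['COND_POS_PROBS'][-1])['logprob']
        for t in tokens)
    neg = model['NEG_PROB'] + sum(
        model['COND_NEG_PROBS'].get(t, model['COND_NEG_PROBS'][-1])['logprob']
        for t in tokens)
    return 'POS' if pos > neg else 'NEG'


with open("models/model.pkl", "rb") as input_file:
    model = pickle.load(input_file)
print(score(['great', 'movie'], model))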
Example 2
import pickle

import gensim

# read_sample, df_to_list, tokenize, create_dictionary and
# term_document_matrix are project helpers not shown here.

def train():
    df = read_sample()
    data = df_to_list(df)
    data_lemmatized = tokenize(data)

    dictionary = create_dictionary(data_lemmatized)
    with open('data/models/dictionary.pkl', 'wb') as output_file:
        pickle.dump(dictionary, output_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=20,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    lda_model.save("data/models/lda_model.pkl")
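To reuse the saved artifacts, the model and dictionary can be loaded back with gensim's and pickle's standard loaders. A sketch, assuming a new document has already been tokenized the same way as the training data:

import pickle

import gensim

lda_model = gensim.models.ldamodel.LdaModel.load("data/models/lda_model.pkl")
with open('data/models/dictionary.pkl', 'rb') as input_file:
    dictionary = pickle.load(input_file)

# Map the tokenized document into the training vocabulary and infer topics.
bow = dictionary.doc2bow(['topic', 'model', 'inference'])
print(lda_model.get_document_topics(bow))  # [(topic_id, probability), ...]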
Example 3
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords

# logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Project references
from src.features.utils import sent_to_words
from src.features.tokenize import tokenize_classes
from src.data.prepare_data import read_sample
# End of project references

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Load the data
data = read_sample()

data_words = list(sent_to_words(data))

# Tokenize and lemmatize
data_lemmatized = tokenize_classes(data_words)
print(data_lemmatized)
# Build the dictionary
id2word = corpora.Dictionary(data_lemmatized)
print(id2word)
#for key, value in id2word.items():
#   print(key, value)

# Create Corpus
texts = data_lemmatized
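The script breaks off after assigning texts. Given the unused CoherenceModel import, the likely continuation is to build the bag-of-words corpus, fit the LDA model, and score topic coherence; a sketch of those steps under that assumption (hyperparameters mirror Example 2):

import gensim

# Term-document matrix over the training vocabulary.
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=id2word, num_topics=20, random_state=100)

# c_v coherence needs the original tokenized texts, not just the corpus.
coherence_model = CoherenceModel(
    model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
print('Coherence:', coherence_model.get_coherence())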
Example 4
import pandas as pd
from src.data.prepare_data import read_sample  # path as in Example 3


def load_doc() -> pd.DataFrame:
    return read_sample()
Example 5
import logging
import os
import pickle

import numpy as np

# read_sample, create_classes, tokenize_classes, create_dictionary and
# split_data are project helpers, as in Example 1.

def train():
    df = read_sample()
    logging.info('Source data file read successfully')

    document_classes = create_classes(df)
    logging.info('Documents split between different classes.')

    logging.info('Tokenization started. This may take some time.')
    word_classes = tokenize_classes(document_classes, False)
    logging.info('Tokenization completed for all documents.')

    negative_words = [
        item for sublist in word_classes['NEG'] for item in sublist
    ]
    positive_words = [
        item for sublist in word_classes['POS'] for item in sublist
    ]

    dictionary = create_dictionary([negative_words, positive_words])
    logging.info('Dictionary generated from all document words.')

    negative_split = split_data(negative_words)
    positive_split = split_data(positive_words)

    negative_bow = dictionary.doc2bow(negative_split['train'])
    positive_bow = dictionary.doc2bow(positive_split['train'])

    logging.info('Counts for bag of words for documents in all classes done')

    total_negative = len(negative_split['train']) + len(negative_bow)
    total_positive = len(positive_split['train']) + len(positive_bow)

    negative_prob = np.log(
        len(negative_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))
    positive_prob = np.log(
        len(positive_split['train']) /
        (len(negative_split['train']) + len(positive_split['train'])))

    negative_word_probs = {}
    for word_id, count in negative_bow:  # avoid shadowing the builtin id()
        negative_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_negative),
        }

    negative_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_negative)}

    positive_word_probs = {}
    for word_id, count in positive_bow:
        positive_word_probs[dictionary[word_id]] = {
            'id': word_id,
            'logprob': np.log((count + 1) / total_positive),
        }

    positive_word_probs[-1] = {'id': -1, 'logprob': np.log(1 / total_positive)}

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    logging.info('Log probabilities for tokens in all classes computed')
    base_path = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.join(base_path, "..", "..", "models", "model.pkl")
    with open(model_path, "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')