def train():
    """Build per-word log-probability tables for both sentiment classes.

    Reads the pickled positive and negative documents from
    ``../data/interim``, flattens each collection into a single word list,
    and converts the bag-of-words counts of each class's ``'train'`` split
    into ``log((count + 1) / total)``, where ``total`` is the number of
    training tokens plus the number of distinct bag-of-words entries of
    that class.

    NOTE(review): both tables are built but neither returned nor stored,
    so the call currently has no visible result -- confirm this is intended.
    """
    with open(r"../data/interim/positive_words.pkl", "rb") as positive_file:
        positive_docs = pickle.load(positive_file)
    with open(r"../data/interim/negative_words.pkl", "rb") as negative_file:
        negative_docs = pickle.load(negative_file)

    # Flatten "list of documents" into "list of words", one class at a time.
    negative_words = []
    for document in negative_docs:
        negative_words.extend(document)
    positive_words = []
    for document in positive_docs:
        positive_words.extend(document)

    dictionary = create_dictionary([negative_words, positive_words])

    negative_train = split_data(negative_words)['train']
    positive_train = split_data(positive_words)['train']

    negative_bow = dictionary.doc2bow(negative_train)
    positive_bow = dictionary.doc2bow(positive_train)

    # Denominators: training tokens plus distinct entries in the class's bag.
    total_negative = len(negative_train) + len(negative_bow)
    total_positive = len(positive_train) + len(positive_bow)

    negative_word_probs = {
        dictionary[token_id]: {
            'id': token_id,
            'logprob': np.log((count + 1) / total_negative),
        }
        for token_id, count in negative_bow
    }
    positive_word_probs = {
        dictionary[token_id]: {
            'id': token_id,
            'logprob': np.log((count + 1) / total_positive),
        }
        for token_id, count in positive_bow
    }
def train():
    """Fit the two-class word-probability model and pickle it.

    The sample returned by ``read_sample`` is split into classes and
    tokenized; the ``'NEG'`` and ``'POS'`` word lists are flattened and
    counted with a shared dictionary. The resulting model holds the log
    prior of each class and, per class, a table mapping each word to its id
    and ``log((count + 1) / total)``. Key ``-1`` in each table carries the
    log probability ``log(1 / total)`` (presumably the fallback for unseen
    words -- confirm against the prediction code).

    The model dictionary is written to ``models/model.pkl``.
    """
    df = read_sample()
    document_classes = create_classes(df)
    word_classes = tokenize_classes(document_classes, False)

    # Flatten the per-document token lists of each class.
    negative_words = []
    for document in word_classes['NEG']:
        negative_words.extend(document)
    positive_words = []
    for document in word_classes['POS']:
        positive_words.extend(document)

    dictionary = create_dictionary([negative_words, positive_words])

    # The (1, 0.0, 0.0) argument is passed through unchanged to split_data.
    negative_train = split_data(negative_words, (1, 0.0, 0.0))['train']
    positive_train = split_data(positive_words, (1, 0.0, 0.0))['train']

    negative_bow = dictionary.doc2bow(negative_train)
    positive_bow = dictionary.doc2bow(positive_train)

    total_negative = len(negative_train) + len(negative_bow)
    total_positive = len(positive_train) + len(positive_bow)

    # Class priors: share of each class among all training tokens.
    train_size = len(negative_train) + len(positive_train)
    negative_prob = np.log(len(negative_train) / train_size)
    positive_prob = np.log(len(positive_train) / train_size)

    negative_word_probs = {
        dictionary[token_id]: {
            'id': token_id,
            'logprob': np.log((count + 1) / total_negative),
        }
        for token_id, count in negative_bow
    }
    negative_word_probs[-1] = {
        'id': -1,
        'logprob': np.log(1 / total_negative),
    }

    positive_word_probs = {
        dictionary[token_id]: {
            'id': token_id,
            'logprob': np.log((count + 1) / total_positive),
        }
        for token_id, count in positive_bow
    }
    positive_word_probs[-1] = {
        'id': -1,
        'logprob': np.log(1 / total_positive),
    }

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')
def lda_model(raw_file: pd.DataFrame) -> list:
    """Train a 20-topic LDA model and evaluate it.

    Args:
        raw_file: Data frame passed to ``clean_up_text`` (its expected
            columns are defined by that helper).

    Returns:
        A three-element list ``[model, coherence, perplexity]`` holding the
        trained ``LdaModel``, the result of ``coherence_model`` and the
        result of ``perplexity``, in that order.
    """
    # Use the parameter; the name ``text`` is not defined in this function.
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word = create_dictionary(lemma)
    corpus = [id2word.doc2bow(tokens) for tokens in lemma]

    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=20,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    coherence = coherence_model(model, lemma, id2word)
    # Keep the result under a different name: assigning to ``perplexity``
    # would make it a local and the call itself would raise
    # UnboundLocalError.
    perplexity_score = perplexity(model, corpus)

    return [model, coherence, perplexity_score]
def lda_model(raw_file: pd.DataFrame):
    """Train and return a 20-topic LDA model for ``raw_file``.

    The frame is cleaned with ``clean_up_text`` and tokenized with
    ``tokenize``; ``create_dictionary`` supplies both the id-to-word mapping
    and the corpus the model is fitted on.
    """
    id2word, corpus = create_dictionary(tokenize(clean_up_text(raw_file)))

    training_options = {
        'num_topics': 20,
        'random_state': 100,
        'update_every': 1,
        'chunksize': 100,
        'passes': 10,
        'alpha': 'auto',
        'per_word_topics': True,
    }
    trained_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        **training_options,
    )
    return trained_model
def test(model, text: str):
    """Return the description of the topic most often assigned to ``text``.

    The text is wrapped in a one-row data frame (column ``'content'``),
    cleaned and tokenized with the same helpers used for training, and
    passed through ``model``. Every word-level topic assignment counts as
    one vote; the topic with the most votes wins, ties going to the
    smallest topic id.

    Args:
        model: Trained topic model. It is indexed with a corpus and must
            provide ``print_topics``; the per-word topic list is read from
            position 1 of the per-document result (the layout gensim
            produces when ``per_word_topics=True``).
        text: Raw text to classify.

    Returns:
        The entry of ``model.print_topics(num_topics=-1)`` for the winning
        topic.

    Raises:
        ValueError: If no word of ``text`` received a topic assignment.
    """
    from collections import Counter

    frame = pd.DataFrame(data={'content': [text]}, columns=['content'])
    doc = clean_up_text(frame)
    lemma = tokenize(doc)

    # Encode the query with the vocabulary the model was trained on. A
    # dictionary built from the query alone assigns fresh ids that do not
    # line up with the model's word ids.
    vocabulary = getattr(model, 'id2word', None)
    if hasattr(vocabulary, 'doc2bow'):
        corpus = [vocabulary.doc2bow(tokens) for tokens in lemma]
    else:
        # Fallback for models that expose no usable vocabulary.
        _, corpus = create_dictionary(lemma)

    prediction = model[corpus]

    # prediction[0][1] holds (word_id, [topic ids]) entries for document 0.
    topic_votes = Counter(
        topic
        for entry in prediction[0][1]
        for topic in entry[1]
    )
    if not topic_votes:
        raise ValueError('no topic could be assigned to any word of the text')

    most_votes = max(topic_votes.values())
    moda = min(
        topic for topic, votes in topic_votes.items() if votes == most_votes
    )

    # num_topics=-1 lists every topic in id order, so the list index equals
    # the topic id even for models with more than 20 topics.
    return model.print_topics(num_topics=-1)[moda]
def lda_model(raw_file: pd.DataFrame):
    """Train a 20-topic LDA model, pickle it and return it.

    Args:
        raw_file: Data frame passed to ``clean_up_text``.

    Returns:
        The trained ``LdaModel``; the same object is also pickled to
        ``models/model.pkl``.
    """
    doc = clean_up_text(raw_file)
    lemma = tokenize(doc)
    id2word, corpus = create_dictionary(lemma)

    modelo = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=20,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    with open("models/model.pkl", "wb") as output_file:
        pickle.dump(modelo, output_file)

    # Return the trained model; ``lda_model`` here would be this function
    # object itself, not the model.
    return modelo
def lda_model(text: pd.DataFrame):
    """Train a 5-topic LDA model on ``text``, pickle it and return it.

    ``text`` is cleaned with ``clean_up_text`` and tokenized with
    ``tokenize``; ``create_dictionary`` provides the id-to-word mapping and
    the corpus. The fitted model is written to ``models/model.pkl`` before
    being returned.
    """
    cleaned = clean_up_text(text)
    tokens = tokenize(cleaned)
    id2word, corpus = create_dictionary(tokens)

    topic_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=5,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    with open("models/model.pkl", "wb") as model_file:
        pickle.dump(topic_model, model_file)

    return topic_model
def train():
    """Train a 20-topic LDA model on the sample data and persist it.

    Two artifacts are written: the dictionary is pickled to
    ``data/models/dictionary.pkl`` and the model is stored with its own
    ``save`` method at ``data/models/lda_model.pkl``.
    """
    documents = df_to_list(read_sample())
    data_lemmatized = tokenize(documents)

    dictionary = create_dictionary(data_lemmatized)
    # Persist the dictionary before building the corpus and the model.
    with open('data/models/dictionary.pkl', 'wb') as dictionary_file:
        pickle.dump(dictionary, dictionary_file)

    corpus = term_document_matrix(data_lemmatized, dictionary)

    training_options = {
        'num_topics': 20,
        'random_state': 100,
        'update_every': 1,
        'chunksize': 100,
        'passes': 10,
        'alpha': 'auto',
        'per_word_topics': True,
    }
    topic_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        **training_options,
    )
    topic_model.save("data/models/lda_model.pkl")
def train():
    """Fit the two-class word-probability model, logging each stage.

    The sample returned by ``read_sample`` is split into classes and
    tokenized; the ``'NEG'`` and ``'POS'`` word lists are flattened and
    counted with a shared dictionary. The model stores the log prior of
    each class and, per class, a table mapping each word to its id and
    ``log((count + 1) / total)``. Key ``-1`` in each table carries
    ``log(1 / total)`` (presumably the fallback for unseen words -- confirm
    against the prediction code).

    The model dictionary is pickled to ``../../models/model.pkl`` relative
    to the directory containing this file.
    """
    df = read_sample()
    logging.info('Source data file read succesfully')

    document_classes = create_classes(df)
    logging.info('Documents split between different classes.')

    logging.info('Tokenization started. This will for sure take some time.')
    word_classes = tokenize_classes(document_classes, False)
    logging.info('Tokenization completed for all documents.')

    # Flatten the per-document token lists of each class.
    negative_words = []
    for document in word_classes['NEG']:
        negative_words.extend(document)
    positive_words = []
    for document in word_classes['POS']:
        positive_words.extend(document)

    dictionary = create_dictionary([negative_words, positive_words])
    logging.info('Dictionary generated from all document words.')

    negative_train = split_data(negative_words)['train']
    positive_train = split_data(positive_words)['train']

    negative_bow = dictionary.doc2bow(negative_train)
    positive_bow = dictionary.doc2bow(positive_train)
    logging.info('Counts for bag of words for documents in all classes done')

    total_negative = len(negative_train) + len(negative_bow)
    total_positive = len(positive_train) + len(positive_bow)

    # Class priors: share of each class among all training tokens.
    train_size = len(negative_train) + len(positive_train)
    negative_prob = np.log(len(negative_train) / train_size)
    positive_prob = np.log(len(positive_train) / train_size)

    negative_word_probs = {
        dictionary[token_id]: {
            'id': token_id,
            'logprob': np.log((count + 1) / total_negative),
        }
        for token_id, count in negative_bow
    }
    negative_word_probs[-1] = {
        'id': -1,
        'logprob': np.log(1 / total_negative),
    }

    positive_word_probs = {
        dictionary[token_id]: {
            'id': token_id,
            'logprob': np.log((count + 1) / total_positive),
        }
        for token_id, count in positive_bow
    }
    positive_word_probs[-1] = {
        'id': -1,
        'logprob': np.log(1 / total_positive),
    }

    model = {
        'POS_PROB': positive_prob,
        'NEG_PROB': negative_prob,
        'COND_POS_PROBS': positive_word_probs,
        'COND_NEG_PROBS': negative_word_probs,
    }
    logging.info('Log probabilities for tokens in all classed computed')

    # Resolve the output location from this file's directory.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = module_dir + "/../../models/model.pkl"
    with open(model_path, "wb") as output_file:
        pickle.dump(model, output_file)
    logging.info('Model saved to artifact model.pkl')