def train_model():
    data = read_json_file("data/bible_kjv_wrangled.json")
    sentences = list(data.values())
    # Do we want everything in lowercase?
    sentences = [s.lower() for s in sentences]

    print("-----------Tokenize corpus-------------")
    tokenized_sentences = []
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        tokenized_sentences.append(tokens)

    for s in abc.sents():
        s = list(filter(lambda x: x.isalpha() and len(x) > 1, s))
        s = [x.lower() for x in s]  # Do we want everything in lowercase?
        tokenized_sentences.append(s)

    for s in brown.sents():
        s = list(filter(lambda x: x.isalpha() and len(x) > 1, s))
        s = [x.lower() for x in s]  # Do we want everything in lowercase?
        tokenized_sentences.append(s)

    print("------------TRAINING FASTTEXT-----------")
    model = FastText(tokenized_sentences, size=100, window=5, min_count=5,
                     workers=4, sg=1)
    print("----------------DONE-------------")
    return model
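# Hypothetical usage of train_model() (assumes gensim 3.x, where the `size=`
# keyword above is valid); the query word "lord" and the file name below are
# only illustrative.
model = train_model()
print(model.wv.most_similar("lord", topn=5))
model.save("fasttext_kjv_abc_brown.model")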
def init_data():
    all_sentence_data = abc.sents()
    all_words = []
    for sent in all_sentence_data:
        for word in sent:
            if word not in ["!", ",", "?", '"', "(", ")", ".", ":", ";"]:
                all_words.append(word.lower())
    return all_words
def ari(fileid):
    """Compute the Automated Readability Index (ARI) for an abc corpus file."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))
    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents
    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
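# A minimal usage sketch: compute the ARI score for every file in the abc corpus
# (assumes `from nltk.corpus import abc`, as ari() itself does).
for fileid in abc.fileids():
    print(fileid, round(ari(fileid), 2))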
def main():
    corpus = [sentence for sentence in abc_corpus.sents()]
    corpus = clean_corpus(corpus)
    corpus_size = len(corpus)
    w2v = word2vec(corpus)
    print("Corpus Size - {}".format(corpus_size))
    print("Vocab size - {}".format(w2v.vocab_size))
    if mode == "cbow":
        model = cbow(w2v)
        train(model, generate_cbow_train_data, corpus, w2v)
    elif mode == "skp":
        model = skip_gram(w2v)
        train(model, generate_skp_train_data, corpus, w2v)
def pre_process():
    """Remove stop words and punctuation marks from the corpus."""
    if 'cleaned_corpus.pkl' not in os.listdir(os.curdir) \
            or 'cleaned_sentences.pkl' not in os.listdir(os.curdir):
        print('Pre-processing...')
        words = [w for w in abc.words()]
        sentences = [s for s in abc.sents()]
        stop_words = stopwords.words('english')
        punctuation = list(string.punctuation)
        for i in range(len(sentences)):
            print(i)
            # Iterate over a copy so that removing items does not skip elements.
            for j in list(sentences[i]):
                prev = len(sentences[i])
                #print(i*j)
                if set(j) - set(punctuation) == set() or j.lower() in stop_words:
                    print(j)
                    print('removed')
                    if j in words:
                        words.remove(j)
                    sentences[i].remove(j)
                    assert prev == len(sentences[i]) + 1
        # Again, iterate over a copy while dropping sentences that became too short.
        for s in list(sentences):
            if len(s) <= 1:
                print(s)
                sentences.remove(s)
        pickle.dump(words, open('cleaned_corpus.pkl', 'wb'))
        pickle.dump(sentences, open('cleaned_sentences.pkl', 'wb'))
    else:
        print('Pre-processed data already present..')
        words = pickle.load(open('cleaned_corpus.pkl', 'rb'))
        sentences = pickle.load(open('cleaned_sentences.pkl', 'rb'))
    return words, sentences
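# A minimal sketch of the same cleaning step without in-place removal (assumes
# the same stopwords/string/abc imports that pre_process() above relies on).
def pre_process_simple():
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    cleaned_sentences = []
    for sent in abc.sents():
        kept = [w for w in sent
                if set(w) - punctuation and w.lower() not in stop_words]
        if len(kept) > 1:
            cleaned_sentences.append(kept)
    cleaned_words = [w for s in cleaned_sentences for w in s]
    return cleaned_words, cleaned_sentences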
if len(raw_sentence) > 0:
    book_sentences.append(sentence_to_wordlist(raw_sentence))
#print(raw_sentences[5])
#print(book_sentences[5])

conll2000_corp_sents = conll2000.sents()
print("conll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")
conll2007_corp_sents = conll2007.sents()
print("conll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents")
guttenberg_corp_sents = gutenberg.sents()
print("Gutenberg to sents")
import nltk
nltk.download('abc')
from nltk.corpus import abc, stopwords
nltk.download('stopwords')

import string
import itertools
import re

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F

"""**Get sentences out of the data**"""

nltk.download('punkt')
data = [" ".join(list_of_words) for list_of_words in abc.sents()]
#print(len(data))
new_data = data[0:50]
print(len(new_data))

"""**Pre-processing and tokenization**"""

def preprocess_tokenize_text(new_data):
    corpus = []
    for sentence in new_data:
        text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence)
        text = re.sub(' +', ' ', text)
        text = text.replace('\n', ' ')
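# The function above is cut off; a minimal, self-contained variant of what it
# appears to be doing (an illustrative assumption, not the original code):
def preprocess_tokenize_text_simple(sentences):
    corpus = []
    for sentence in sentences:
        text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence.lower())
        text = re.sub(' +', ' ', text).strip()
        tokens = [w for w in text.split(' ') if w]
        if tokens:
            corpus.append(tokens)
    return corpus

tokenized = preprocess_tokenize_text_simple(new_data)
print(tokenized[:3])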
###################### 3 #####################################################
############# Training Phase #################################################
###############################################################################

########################## Training a couple of models #######################
# Training a model with the vocabulary from Pro_Lyrics_list
RM_model = Word2Vec(Pro_Lyrics_list, min_count=2, size=150, workers=15, window=15)
print()

# Saving the model
RM_model.save("word2vec.RM_model")
RM_model.save("RM_model.bin")
print()

# Training a model with the imported vocabulary from abc.sents()
abc_model = gensim.models.Word2Vec(abc.sents(), min_count=2, size=150, workers=15, window=15)
print()

# Saving the model
abc_model.save("word2vec.abc_model")
abc_model.save("abc_model.bin")
print()

####################### Storing vectors generated by the models ##############
# Store the vectors for the train data in the following file
### Finish <---------------------------------------------------- Incomplete
#word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec.csv'
#RM_vectors_filename = r'C:\Users\hprob\Desktop\ErdosProjectMay2020\Sample_project\RM_vectors.csv'
#with open(RM_vectors_filename, 'w+') as word2vec_file:
#    for index, row in Lyrics_train.iterrows():
#        model_vector = (np.mean([RM_model[token] for token in row['lyrics']], axis=0)).tolist()
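# Hypothetical completion of the commented-out averaging step above; assumes a
# DataFrame `Lyrics_train` whose 'lyrics' column holds token lists (names taken
# from the original comments, not verified).
import numpy as np

def mean_vector(model, tokens):
    # Average the vectors of the tokens the model actually knows about.
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.wv.vector_size)

#RM_vectors = Lyrics_train['lyrics'].apply(lambda toks: mean_vector(RM_model, toks))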
from sklearn.manifold import TSNE

model = TSNE(n_components=3, random_state=0)
np.set_printoptions(suppress=True)

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('abc')
nltk.download('punkt')

stop_words = list(stop_words)
words = abc.words()
len(abc.sents())
sentences = abc.sents()[0:100]
# sentences = [['he', 'is', 'the', 'king'], ['the', 'king', 'is', 'royal'], ['she', 'is', 'the', 'royal', 'queen']]
sentences

words = []
sent = []
for i in sentences:
    temp = []
    for j in i:
        if j not in stop_words and j.isalpha():
            words.append(j)
            temp.append(j)
    sent.append(temp)
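# A sketch of a possible next step (an assumption; the snippet stops after building
# `sent`): train word vectors on the cleaned sentences and project them with the
# 3-component TSNE model created above. The gensim 3.x `size=` keyword is assumed.
from gensim.models import Word2Vec

w2v = Word2Vec(sent, size=100, min_count=1, workers=4)
vocab = list(w2v.wv.vocab)
X_3d = model.fit_transform(w2v.wv[vocab])
print(X_3d.shape)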
print "Adding gutenberg sentence structures ({0}) ...".format( len(gutenberg.sents())) for sentence in gutenberg.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding abc sentence structures ({0})...".format(len(abc.sents())) for sentence in abc.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding reuters sentence structures ({0})...".format(len( reuters.sents()))
# Example of comparison of reading difficulty score (ARI) for two NLTK corpora.
from nltk.corpus import abc

def avg(lst):
    lentotal = 0.0
    for word in lst:
        lentotal = lentotal + len(word)
    return lentotal / len(lst)

def ari(corpus_words, corpus_sents):
    avgchar = avg(corpus_words)
    avgsent = avg(corpus_sents)
    ari = 4.71 * avgchar + 0.5 * avgsent - 21.43
    return ari

print ari(abc.words('rural.txt'), abc.sents('rural.txt'))
print ari(abc.words('science.txt'), abc.sents('science.txt'))
# This is a single-class text classifier example using a Naive Bayes algorithm.
# It is an adaptation of the tutorial at http://textblob.readthedocs.io/
from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
from nltk.corpus import gutenberg
from nltk.corpus import abc as corpus

# length = len(corpus.raw())
# print(corpus.readme())
#print(gutenberg.fileids())
#print(corpus.fileids())
#head = corpus.raw('science.txt')

train = []
for sent in corpus.sents('science.txt')[50:150]:
    train.append((' '.join(sent), 'science'))
for sent in gutenberg.sents('austen-emma.txt')[50:150]:
    train.append((' '.join(sent), 'austen'))
for sent in gutenberg.sents('shakespeare-hamlet.txt')[5:150]:
    train.append((' '.join(sent), 'shakes'))
for sent in gutenberg.sents('melville-moby_dick.txt')[5:150]:
    train.append((' '.join(sent), 'melville'))
    #print("new_____", ' '.join(sent))
#print(train2)
#print(corpus.words('science.txt'))

test = []
for sent in corpus.sents('science.txt')[400:420]:
    test.append((' '.join(sent), 'science'))
for sent in gutenberg.sents('austen-emma.txt')[400:420]:
    test.append((' '.join(sent), 'austen'))
for sent in gutenberg.sents('shakespeare-hamlet.txt')[400:420]:
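# A sketch of the usual textblob continuation once the test list is complete
# (an assumption; the original snippet stops while still building `test`).
classifier = NaiveBayesClassifier(train)
print(classifier.classify("The spacecraft entered orbit around the red planet"))
print("accuracy:", classifier.accuracy(test))
classifier.show_informative_features(5)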
bible = genesis.sents('english-kjv.txt')
blake = gutenberg.sents('blake-poems.txt')
bryant = gutenberg.sents('bryant-stories.txt')
burgess = gutenberg.sents('burgess-busterbrown.txt')
carroll = gutenberg.sents('carroll-alice.txt')
ch_ball = gutenberg.sents('chesterton-ball.txt')
ch_brown = gutenberg.sents('chesterton-brown.txt')
ch_thurs = gutenberg.sents('chesterton-thursday.txt')
edge = gutenberg.sents('edgeworth-parents.txt')
mel = gutenberg.sents('melville-moby_dick.txt')
mil = gutenberg.sents('milton-paradise.txt')
caesar = gutenberg.sents('shakespeare-caesar.txt')
hamlet = gutenberg.sents('shakespeare-hamlet.txt')
macbeth = gutenberg.sents('shakespeare-macbeth.txt')
whit = gutenberg.sents('whitman-leaves.txt')
rural = abc.sents('rural.txt')
science = abc.sents('science.txt')
plots = subjectivity.sents('plot.tok.gt9.5000')
quotes = subjectivity.sents('quote.tok.gt9.5000')

austen = sense + emma + persuasion
shakespeare = caesar + hamlet + macbeth
facts = rural + science
opinions = plots + quotes
gute = bryant + burgess + carroll + edge + mel + mil + whit
chester = ch_ball + ch_brown + ch_thurs
total = austen + shakespeare + facts + opinions + gute + chester + b + sents
#print(plots)
#print(science)
#print(bible)

g = Word2Vec(total)
g.wv.save_word2vec_format('model.bin', binary=True)
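# A usage sketch (not part of the original file): load the saved vectors back
# and query them.
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('model.bin', binary=True)
print(wv.most_similar('science', topn=5))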
from DocumentFeatureSelection import interface
from DocumentFeatureSelection.models import PersistentDict
from sqlitedict import SqliteDict
from nltk.corpus import abc, genesis, webtext, gutenberg
import time
import os

"""This example shows you how to work on a huge dataset.
For the persisted-dict object you can choose PersistentDict or SqliteDict.
You're supposed to be ready to use the following corpora objects in nltk:

- abc
- genesis
- web
- gutenberg
"""
#----------------------------------------------------------
abc_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Case of PersistentDict
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
persistent_dict_obj['abc'] = list(abc_corpus)
persistent_dict_obj['genesis'] = list(genesis_corpus)
persistent_dict_obj['web'] = list(web_corpus)
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
# If you pass is_use_cache=True, it uses a cache object for keeping huge objects during computation.
# If you pass is_use_memmap=True, it uses memmap for keeping the matrix during computation.
scored_matrix_obj = interface.run_feature_selection(
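# The docstring above mentions SqliteDict as the alternative backend; a minimal
# sketch of that case (the file name 'demo.sqlite' is illustrative).
sqlite_dict_obj = SqliteDict('demo.sqlite', autocommit=True)
sqlite_dict_obj['abc'] = list(abc_corpus)
sqlite_dict_obj['genesis'] = list(genesis_corpus)
sqlite_dict_obj['web'] = list(web_corpus)
sqlite_dict_obj['gutenberg'] = list(gutenberg_corpus)
sqlite_dict_obj.close()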
start = time.time()
scored_matrix_obj = interface.run_feature_selection(
    input_dict=input_corpus,
    method='pmi',
    n_jobs=-1,
    use_cython=True
)
elapsed_time = time.time() - start
print("elapsed_time with cython:{} [sec]".format(elapsed_time))

from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
def saveContextWords(data):
    context = {}
    for i in range(len(data)):
        context[i] = data[i]
    dumpPickle("contextWords.pkl", context)


if __name__ == '__main__':
    obj = Word2Vec(2)
    # corpus = "natural language processing and machine learning is fun and exciting".split(" ")
    # sentences = [corpus]
    sentences = list(abc.sents())
    data = obj.preprocessing(sentences)
    print("preprocessing done")
    trainDataX, trainDataY = obj.targetAndContext(data)
    print("training data is generated")
    exit(0)
    # NOTE: exit(0) stops execution here, so the dumps below never run.
    dumpPickle("index2WordMap.pkl", obj.index2WordMap)
    dumpPickle("word2IndexMap.pkl", obj.word2IndexMap)
    saveContextWords(trainDataX)
    # network = NN(obj.uniqueCount, 50)
    # network.train(trainDataX, trainDataY, 100)
    # obj.nn = network
    # obj.findSimilarWords("natural", 3)
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import nltk
nltk.download('abc')
from nltk.corpus import abc
import itertools, re

corpus = []
for text_id in abc.fileids():
    raw_text = list(itertools.chain.from_iterable(abc.sents(text_id)))
    text = ' '.join(raw_text)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = re.sub('[^a-z ]+', '', text)
    corpus.append([w for w in text.split() if w != ''])

from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {
        word: word_counts[word] / float(sum_word_counts) for word in word_counts
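# The function above is truncated; a self-contained sketch of one common
# formulation of Mikolov-style subsampling (the threshold value is an assumption):
def subsample_frequent_words_simple(corpus, threshold=1e-4):
    counts = Counter(itertools.chain.from_iterable(corpus))
    total = float(sum(counts.values()))
    freqs = {w: c / total for w, c in counts.items()}
    filtered_corpus = []
    for sentence in corpus:
        # Keep each word with probability sqrt(threshold / frequency), capped at 1.
        kept = [w for w in sentence
                if random.random() < math.sqrt(threshold / freqs[w])]
        filtered_corpus.append(kept)
    return filtered_corpus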
print "Building clean words list..." words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")] words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]) words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]) words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]) words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]) words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]) print "Building clean sentences list" sentences = [] for s in brown.sents(): sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'"))) for s in treebank.sents(): sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'"))) for s in abc.sents(): sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'"))) for s in movie_reviews.sents(): sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'"))) for s in genesis.sents(): sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'"))) def singles(words): if len(words) < 1: return for w in words: if re.match("[a-zA-Z'-]+", w) and w.strip() != "''": yield w def doubles(sentences):
import nltk
import gensim
from nltk.corpus import abc

a = 1
print("Model is training")
model = gensim.models.Word2Vec(abc.sents())
print("1")
X = list(model.wv.vocab)
print("2")
data = model.wv.most_similar('science')
print("3")
print(data)
print("Training Completed")
def similar_words(word):
    model = gensim.models.Word2Vec(abc.sents())
    x = list(model.wv.vocab)
    data = model.wv.most_similar(word)
    print(data)
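# Hypothetical call; note that similar_words() retrains Word2Vec from scratch on
# every invocation, so in practice the model would likely be built once and reused.
similar_words('science')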
def pmi_with_cython(input_corpus):
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True)
    elapsed_time = time.time() - start
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))


from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
print "Current Structure total: {0}".format(len(sentences)) del blob print "Adding gutenberg sentence structures ({0}) ...".format(len(gutenberg.sents())) for sentence in gutenberg.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding abc sentence structures ({0})...".format(len(abc.sents())) for sentence in abc.sents(): processed_count += 1 try: blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger()) tags = tuple([tag[1] for tag in blob.tags]) sentences.add(tags) except: print "\r", print "Processed {0} sentences\r".format(processed_count), print "Current Structure total: {0}".format(len(sentences)) print "Adding reuters sentence structures ({0})...".format(len(reuters.sents())) for sentence in reuters.sents(): processed_count += 1 try:
from nltk.corpus import abc, stopwords
from string import punctuation
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt

sents = abc.sents()
#print(sents[:10])
puncs = list(punctuation)
stop = set(stopwords.words('english') + puncs + ["''", "``"])

processed_sents = []
for sent in sents:
    temp = []
    for word in sent:
        if word not in stop:
            temp.append(word.lower())
    processed_sents.append(temp)

print(processed_sents[:10])
#Output
#[['pm', 'denies', 'knowledge', 'awb', 'kickbacks', 'the', 'prime', 'minister', 'denied', 'knew', 'awb', 'paying', 'kickbacks', 'iraq', 'despite', 'writing', 'wheat', 'exporter', 'asking', 'kept', 'fully', 'informed', 'iraq', 'wheat', 'sales'], ['letters', 'john', 'howard', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'awb', 'released', 'cole', 'inquiry', 'oil', 'food', 'program'], ['in', 'one', 'letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'remain', 'close', 'contact', 'government', 'iraq', 'wheat', 'sales'], ['the', 'opposition', 'gavan', 'o', 'connor', 'says', 'letter', 'sent', '2002', 'time', 'awb', 'paying', 'kickbacks', 'iraq', 'though', 'jordanian', 'trucking', 'company'], ['he', 'says', 'government', 'longer', 'wipe', 'hands', 'illicit', 'payments', 'totalled', '290', 'million'], ['the', 'responsibility', 'must', 'lay', 'may', 'squarely', 'feet', 'coalition', 'ministers', 'trade', 'agriculture', 'prime', 'minister', ',"', 'said'], ['but', 'prime', 'minister#', 'says', 'letters', 'show', 'inquiring', 'future', 'wheat', 'sales', 'iraq', 'prove', 'government', 'knew', 'payments'], ['it', 'would', 'astonishing', '2002', 'prime', 'minister', 'i', 'done', 'anything', 'i', 'possibly', 'could', 'preserve', 'australia', 'valuable', 'wheat', 'market', ',"', 'said'], ['email', 'questions', 'today', 'inquiry', 'awb', 'trading', 'manager', 'peter', 'geary', 'questioned', 'email', 'received', 'may', '2000'], ['it', 'indicated', 'iraqi', 'grains', 'board', 'approached', 'awb', 'provide', 'sales', 'service', '".']]

embeddings = Word2Vec(sentences=processed_sents, size=300, min_count=20, workers=4, sg=0, iter=5, hs=0)
print(embeddings.wv.most_similar('government'))

vocab = list(embeddings.wv.vocab)
X = embeddings.wv[vocab]

tsne_model = TSNE(n_components=2)
X_tsne = tsne_model.fit_transform(X)
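# A sketch of the likely plotting step (an assumption; pandas and matplotlib are
# imported above but unused in the truncated snippet). The highlighted words are
# only illustrative.
df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(df['x'], df['y'], s=2)
for word in ['government', 'science', 'wheat']:
    if word in df.index:
        ax.annotate(word, (df.loc[word, 'x'], df.loc[word, 'y']))
plt.show()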