def bigram_model():
    X_train, y_train, X_test, y_test = Sentences(
        n_sentences=1000).train_test_split_bigram()
    model = LogisticRegression()
    model.fit(X_train.transpose(), y_train)
    print("Bigram model test accuracy: %f" %
          model.score(X_test.transpose(), y_test))

def build_doc_vector(dir_name, model, build_option, process_option=ProcessOption(),
                     save=True, save_file="doc_vector.bin",
                     to_normalize=False, to_scale=False,
                     cluster_factor=20, num_cpus=-2):
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    tknzr = TweetTokenizer(preserve_case=False)
    post_docs = util.process_sentences(docs, tknzr, process_option, stop_words)

    if build_option == 1:    # average
        doc_vector = build_average_dv(post_docs, doc_num, model, save, save_file)
    elif build_option == 2:  # average weighted by tf-idf
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model, save, save_file)
    else:                    # cluster
        doc_vector = build_cluster_dv(post_docs, doc_num, model, cluster_factor,
                                      num_cpus, save, save_file)

    # post-process based on the keyword arguments, not on the (always truthy)
    # sklearn normalize/scale functions themselves
    if to_normalize:
        doc_vector = normalize(doc_vector, copy=False)
    if to_scale:
        doc_vector = scale(doc_vector, copy=True)

    return doc_vector

# "/Users/Crazyconv/Conv/DEVELOPMENT/GitFolder/Word2Vec2NLP/dataset"

def build_doc_vector(dir_name, model, build_option, process_option=ProcessOption(),
                     cluster_factor=20, num_cpus=-2):
    sentences = Sentences(dir_name)
    docs = sentences.paragraph_iterator()
    doc_num = sentences.doc_num
    stop_words = set(stopwords.words("english"))
    post_docs = util.process_sentences(docs, process_option, stop_words)

    if build_option == 1:    # average
        doc_vector = build_average_dv(post_docs, doc_num, model)
    elif build_option == 2:  # average weighted by tf-idf
        doc_vector = build_av_tf_idf_dv(post_docs, doc_num, model)
    else:                    # cluster
        doc_vector = build_cluster_dv(post_docs, doc_num, model,
                                      cluster_factor, num_cpus)

    return doc_vector

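# A minimal usage sketch, not part of the original source: "dataset/train" and
# "model.bin" are hypothetical placeholders for a review directory and a
# previously trained gensim Word2Vec model.
model = Word2Vec.load("model.bin")
train_fv = build_doc_vector("dataset/train", model, build_option=2)
print(train_fv.shape)
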
def load_model(self):
    # ipdb.set_trace()  # leftover debugging hook
    files = [
        os.path.join(self.basedir, fn) for fn in os.listdir(self.basedir)
        if fn.endswith(self.model_name)
    ]
    if len(files) > 0:
        return Doc2Vec.load(self.basedir + self.model_name)

    files = [
        os.path.join(self.basedir, fn) for fn in os.listdir(self.basedir)
        if fn.endswith(self.lyrics_filename)
    ]
    for file in files:
        songs_lyrics_ids_df = pd.read_csv(file, delimiter=',', encoding="utf-8")
        songs_lyrics_ids_df.head()

        group = ['lyric', 'mood']
        lyrics_by_song = songs_lyrics_ids_df.sort_values(group) \
            .groupby(group).lyric \
            .apply(' '.join)
        # .reset_index(name='lyric')
        lyrics_by_song.head(1)

        filename = file
        nltk.download('wordnet')
        sentences = Sentences(filename=filename, column="lyric")
        df_train = pd.read_csv(filename, index_col=[0], usecols=[0, 3],
                               header=0, names=["mood", "lyric"])

        model = Doc2Vec(alpha=0.025, min_alpha=0.025, workers=15, min_count=2,
                        window=10, size=200, iter=20, sample=0.001, negative=5)
        model.build_vocab(sentences)

        epochs = 10
        for epoch in range(epochs):
            model.train(sentences, total_examples=model.corpus_count,
                        epochs=model.iter)
            model.alpha -= 0.002           # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay

        model.save("lyrics.doc2vec")

    return model

def train(data_path, stop_words_file_path):
    sentences = Sentences(data_path, stop_words_file_path)

    num_features = 256
    min_word_count = 10
    num_workers = 48
    context = 20
    epoch = 20
    sample = 1e-5

    model = Word2Vec(
        sentences,
        size=num_features,
        min_count=min_word_count,
        workers=num_workers,
        sample=sample,
        window=context,
        iter=epoch,
    )
    return model

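# A minimal usage sketch, not part of the original source: the corpus and
# stop-word file paths are hypothetical placeholders, and the query word
# "music" is only an example that assumes the word occurs in the corpus.
model = train("data/corpus.txt", "data/stop_words.txt")
model.save("word2vec.bin")
print(model.wv.most_similar("music", topn=5))
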
def tfidf_tsne():
    indx_sent, word2idx, idx2word = Sentences().limit_vocab()

    word_sent_counts = np.zeros((len(word2idx) + 1, len(indx_sent) + 1))
    j = 0
    for sentence in indx_sent:
        for idx in sentence:
            word_sent_counts[idx, j] += 1
        j += 1

    word_sent_tfidf = TFIDF().fit_transform(word_sent_counts).toarray()
    word_sent_tsne = TSNE().fit_transform(word_sent_tfidf)

    plt.scatter(word_sent_tsne[:, 0], word_sent_tsne[:, 1])
    for label in range(len(word2idx)):
        try:
            plt.annotate(s=idx2word[label].encode('utf8'),
                         xy=(word_sent_tsne[label, 0], word_sent_tsne[label, 1]))
        except UnicodeError:
            pass
        except KeyError:
            pass
    plt.show()

def load_sentences(self):
    return Sentences(task=self.c["task"],
                     language_code=self.c["language"],
                     id_by_word=self.embedding.vocabulary.word_id)

from wiki.xml_handler import WikiTitleTextHandler
from wiki.iterator import WikiPageIterator
from sentences import Sentences
import settings

if __name__ == '__main__':
    handler = WikiTitleTextHandler()
    it = WikiPageIterator(settings.DATA_PATH, handler)

    cntr = 0
    for sentence in Sentences(it):
        print(sentence)
        cntr += 1
        if cntr == 1:
            break

def main(train_dir, test_dir):
    # these may be function parameters
    w2v_option = Word2VecOption(num_features=300, min_word_count=40,
                                num_workers=4, context=10, downsampling=1e-3)
    csv_option = CsvOption(deli=",", title=["review", "sentiment"],
                           chunksize=100, review_name="review",
                           sentiment_name="sentiment")
    process_option = ProcessOption(rm_html=True, rm_punc=True, rm_num=True,
                                   lower_case=True, rm_stop_words=False)
    model_name = "model.bin"
    save_model = True
    save_fv = True
    train_fv_name = "train_fv.bin"
    test_fv_name = "test_fv.bin"
    build_option = 2
    save_classifier = True
    classifier_name = "classifier/classifier.bin"
    to_normalize = True
    to_scale = False

    # logger info
    build_method = "average word vector"
    if build_option == 2:
        build_method = "average word vector with tf-idf"
    elif build_option == 3:
        build_method = "cluster word vector"
    logger.debug("text process option: %s", str(process_option))
    logger.debug("use %s to build doc vector", build_method)

    train_sentences = Sentences(train_dir, csv_option, process_option)
    logger.info("number of docs: %d", train_sentences.doc_num)

    # train word2vec
    if os.path.isfile(model_name):
        model = Word2Vec.load(model_name)
        logger.debug("model %s already exists, stop training word vector", model_name)
    else:
        logger.info("start training word vector")
        start_time = timeit.default_timer()
        model = wordvector.build_word_vector(train_sentences, w2v_option,
                                             save=True, save_file=model_name)
        logger.info("model %s trained in %.4lfs", model_name,
                    timeit.default_timer() - start_time)

    # get doc vector
    logger.info("start building training set doc vector")
    start_time = timeit.default_timer()
    train_fv = wordvector.build_doc_vector(train_dir, model, build_option,
                                           process_option, save_fv, train_fv_name)
    print train_fv
    logger.info("training set doc vector built in %.4lfs",
                timeit.default_timer() - start_time)
    logger.info("training set doc vector saved to %s", train_fv_name)
    logger.debug("training size: %s", str(train_fv.shape))

    # train classifier
    logger.info("start training classifier")
    start_time = timeit.default_timer()
    forest = grid_search.GridSearchCV(RandomForestClassifier(),
                                      {'n_estimators': [100], 'n_jobs': [100]},
                                      cv=5, scoring='f1_weighted', n_jobs=100)
    best_model = forest.fit(train_fv, list(train_sentences.sentiment_iterator()))
    logger.info("finished training classifier in %.4lfs",
                timeit.default_timer() - start_time)
    if save_classifier:
        joblib.dump(best_model, classifier_name)

    # evaluate on test set
    logger.info("start building test set doc vector")
    start_time = timeit.default_timer()
    test_sentences = Sentences(test_dir, csv_option, process_option)
    test_fv = wordvector.build_doc_vector(test_dir, model, build_option,
                                          process_option, save_fv, test_fv_name)
    print test_fv
    logger.info("test set doc vector built in %.4lfs",
                timeit.default_timer() - start_time)
    logger.info("test set doc vector saved to %s", test_fv_name)
    logger.debug("test size: %s", str(test_fv.shape))

    logger.info("start predicting test set sentiment")
    start_time = timeit.default_timer()
    predicted_sentiment = best_model.predict(test_fv)
    logger.info("finished prediction in %.4lfs",
                timeit.default_timer() - start_time)

    accuracy = np.mean(predicted_sentiment == list(test_sentences.sentiment_iterator()))
    print "Test Set Accuracy = ", accuracy
    print metrics.classification_report(list(test_sentences.sentiment_iterator()),
                                        predicted_sentiment,
                                        target_names=['0', '1', '2', '3'])

self.encoder.train()
self.decoder.train()

# data preprocessing
# initialise the word embeddings
model_file = sys.argv[1]
embedding = CustomEmbedding(model_file)
flag_words_end = embedding.get_length()
print(flag_words_end)
speical_words = ['GO', 'EOS', 'UNK']
speical_words_index = [
    flag_words_end + 1, flag_words_end + 2, flag_words_end + 3
]

# read the English input
input_data = Sentences('./data/wmt14/train.en')
# input_data.readline(reversed=True)
# input_file = open('./data/wmt14/train.en', 'r', encoding='utf8')
# line = input_file.readline()
# sentences = []
# while line:
#     # vectorise
#     words = seg_sentence.segment(line)
#     vectors = []
#     for word in words:
#         input = embedding.get_index(word)
#         vectors.append(input)
#     # reverse the word order of the sentence
#     vectors.reverse()
#     vectors.append(speical_words_index[0])  # add 'GO'
#     sentences.append(vectors)

logger = logging.getLogger('sys.stdout')

# keywords = ['apple', '']
dataset = "/Users/Crazyconv/Conv/DEVELOPMENT/GitFolder/Word2Vec2NLP/dataset/all"
model_name = "model.bin"
pre_model_name = "/Users/Crazyconv/Conv/DEVELOPMENT/GitFolder/Word2Vec2NLP/GoogleNews-vectors-negative300.bin.gz"

w2v_option = Word2VecOption(num_features=500, min_word_count=10,
                            num_workers=4, context=10, downsampling=1e-7)
csv_option = CsvOption(deli=",", title=["review", "sentiment"],
                       chunksize=100, review_name="review",
                       sentiment_name="sentiment")
process_option = ProcessOption(rm_html=True, rm_punc=True, rm_num=True,
                               lower_case=True, rm_stop_words=False)

train_sentences = Sentences(dataset, csv_option, process_option)

# build model
if os.path.isfile(model_name):
    model = Word2Vec.load(model_name)
    logger.debug("model %s already exists, stop training word vector", model_name)
else:
    logger.info("start training word vector")
    start_time = timeit.default_timer()
    model = wordvector.build_word_vector(train_sentences, w2v_option,
                                         save=True, save_file=model_name)
    logger.info("model %s trained in %.4lfs", model_name,
                timeit.default_timer() - start_time)

bigram = Phraser(Phrases(documents))
trigram = Phraser(Phrases(bigram[documents]))

sentences = open(os.path.join(data_dir, "sentences-3.txt"), "w+")
print("saving sentences to file")
for s in trigram[bigram[documents]]:
    for token in s:
        sentences.write("{}\t".format(token))
    sentences.write("\n")
sentences.close()

# trigram = Phraser(Phrases(bigram[sentences]))
# trigram = Phraser.load(os.path.join(data_dir, "phrases"))

sentences = Sentences(os.path.join(data_dir, "sentences-3.txt"))

# print("training model")
# model = Word2Vec(sentences=sentences, window=5, min_count=3, sg=0, size=300)
# model.save(os.path.join(data_dir, "word2vec_model"))
# model = Word2Vec.load(os.path.join(data_dir, "word2vec_model"))
#
# print(model.wv.most_similar("kpk"))
# res = open("senteces.txt", "w+")
# for sentences in trigram[sentences]:
#     for sentence in sentences:
#         res.write(sentence)
#     res.write("\n")
# res.close()

from datetime import datetime

# seeds the random function
# this makes the random function spit out different random numbers
# random is just like a math equation: if you put in the same number, you get
# out the same number. If we give it the current time, which changes, it will
# give us different numbers.
random.seed(datetime.now().timestamp())

# this stores our dictionary of words in a variable called words so that we can
# use it more easily
words = Words()

# this stores our sentence structures in a variable called sentences so that we
# can print them more easily
sentences = Sentences()

# this is a dictionary of all the sentence structure options that we can print.
# We use it so that we can pick a random one to print when we write the poem
options = {
    0: sentences.print_sentence_with_simile,
    1: sentences.print_singular_sentence,
    2: sentences.print_plural_sentence,
    3: sentences.print_plural_action_on_singular_sentence,
    4: sentences.print_plural_action_on_plural_sentence,
    5: sentences.print_singular_action_on_plural_sentence,
    6: sentences.print_singular_action_on_singular_sentence,
    7: sentences.print_singular_action_with_adverb_sentence,
    8: sentences.print_plural_action_with_adverb_sentence
}

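# A minimal usage sketch, not part of the original source: pick one of the
# sentence-structure printers above at random and call it to print one line
# of the poem. The variable name option_index is only illustrative.
option_index = random.randrange(len(options))  # an index between 0 and 8
options[option_index]()
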
def skipgram_model():
    np.random.seed(0)
    act = tanh
    context = 3
    vocab_limit = 500

    indx_sent, word2idx, idx2word = Sentences(n_sentences=5000).limit_vocab(
        n_limit=vocab_limit)

    # collect the context words (skip-grams) for every token in the vocabulary
    skipgrams = [[] for i in range(vocab_limit)]
    for sentence in indx_sent:
        len_sent = len(sentence)
        for tidx, token in enumerate(sentence):
            bigram = []
            for step in range(1, context + 1):
                if tidx + step < len_sent:
                    bigram.append(sentence[tidx + step])
                if tidx - step >= 0:
                    bigram.append(sentence[tidx - step])
            skipgrams[token].append(bigram)

    # split each token's skip-grams into training and test sets
    train_percentage = 0.8
    train_grams = [[] for i in range(vocab_limit)]
    n_train = 0
    test_grams = [[] for i in range(vocab_limit)]
    n_test = 0
    for sidx, skipgram in enumerate(skipgrams):
        skip_len = len(skipgram)
        if skip_len < 3:
            raise ValueError('Insufficient data for training and testing!')
        train_len = floor(train_percentage * skip_len)
        n_train += train_len
        n_test += skip_len - train_len
        train_grams[sidx].extend(skipgram[:train_len])
        test_grams[sidx].extend(skipgram[train_len:])

    # number based on:
    # https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
    n_hidden = round(n_train / (5 * (vocab_limit + vocab_limit)))
    input_to_hidden = 2 * np.random.random((vocab_limit, n_hidden)) - 1.0
    hidden_to_output = 2 * np.random.random((n_hidden, vocab_limit)) - 1.0

    base_error = 0.0
    for tidx, token_skipgrams in enumerate(test_grams):
        guess = np.dot(input_to_hidden[tidx, :], hidden_to_output)
        for skipgram in token_skipgrams:
            iter_error = -guess
            iter_error[skipgram] += 1.0
            base_error += np.linalg.norm(tanh(iter_error))
    print('Base L2 error: %s' % str(base_error))

    n_epoch = 10
    alpha = 1.0
    # this implements one-at-a-time stochastic gradient descent
    for epoch in range(n_epoch):
        for tidx, token_skipgrams in enumerate(train_grams):
            for skipgram in token_skipgrams:
                # input is one-hot, so returns one column of input_to_hidden
                l1 = act(input_to_hidden[tidx, :])
                l2 = act(np.dot(l1, hidden_to_output))
                err_output_to_hidden = -l2
                # skipgram is a list
                err_output_to_hidden[skipgram] += 1.0
                d1 = err_output_to_hidden * act(l2, d=True)
                err_hidden_to_input = np.dot(d1, hidden_to_output.T)
                d0 = err_hidden_to_input * act(l1, d=True)
                hidden_to_output += alpha * np.outer(l1, d1)
                input_to_hidden[tidx, :] += alpha * d0
                ih_norm = np.linalg.norm(input_to_hidden[tidx, :])
                if isnan(ih_norm):
                    raise ValueError()

    test_error = 0.0
    for tidx, token_skipgrams in enumerate(test_grams):
        guess = np.dot(input_to_hidden[tidx, :], hidden_to_output)
        for skipgram in token_skipgrams:
            iter_error = -guess
            iter_error[skipgram] += 1.0
            test_error += np.linalg.norm(tanh(iter_error))
    print('Test L2 error: %s' % str(test_error))
