def get_word2vec_model(path="word2vec.wordvectors"):
    # NOTE: `path` was undefined in the original snippet; it is exposed here as
    # a parameter (with an assumed default name) so the saved vectors have an
    # explicit destination.
    with open("../sentence_level_corpus_all_information_normalized.csv") as f:
        data = f.readlines()
    sentences = []
    for i in data[1:]:
        sen = i.split("|")[1]
        words = nltk.word_tokenize(sen)
        sentences.append(words)
    sentences_1 = [i for i in Text8Corpus('/home2/hk/workshop/Data/text8/text8')]
    sentences = sentences + sentences_1
    model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=5, iter=10)
    model.wv.save(path)
    # model.save("word2vec.model")
    vector = model.wv['brother-in-law']  # numpy vector of a word
    print(vector)
def train_models(corpus_file, output_name):
    output_file = '{:s}_ft'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('Training fasttext on {:s} corpus..'.format(corpus_file))
        %time !{FT_HOME}fasttext skipgram -input {corpus_file} -output {MODELS_DIR+output_file} -lr {lr} -dim {dim} -ws {ws} -epoch {epoch} -minCount {minCount} -neg {neg} -loss {loss} -t {t}
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))

    output_file = '{:s}_ft_no_ng'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining fasttext on {:s} corpus (without char n-grams)..'.format(corpus_file))
        %time !{FT_HOME}fasttext skipgram -input {corpus_file} -output {MODELS_DIR+output_file} -lr {lr} -dim {dim} -ws {ws} -epoch {epoch} -minCount {minCount} -neg {neg} -loss {loss} -t {t} -maxn 0
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))

    output_file = '{:s}_gs'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining word2vec on {:s} corpus..'.format(corpus_file))
        # Text8Corpus class for reading a space-separated words file
        %time gs_model = Word2Vec(Text8Corpus(corpus_file), **params); gs_model
        # Direct local variable lookup doesn't work properly with magic statements (%time)
        locals()['gs_model'].save_word2vec_format(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file)))
        print('\nSaved gensim model as {:s}.vec'.format(output_file))
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))
def fit_transform(self, data, return_ids=False, overwrite_corpus=True):
    filtered_data = self.filter.filter_dataset(data)
    if overwrite_corpus:
        self.save_corpus(filtered_data)
    self.tf_idf = self.tf_idf.fit(filtered_data['description'].values.astype('U'))
    corpus = Text8Corpus(self.corpus_path)
    self.word2vec = Word2Vec(corpus, size=300, min_count=1)
    tf_idf_weightings = dict(zip(self.tf_idf.get_feature_names(), self.tf_idf.idf_))
    vectors = []
    ids = []
    counter_empty = 0
    for i, d in enumerate(filtered_data['description'].values.astype('U')):
        descriptor_count = 0
        weighted_terms = []
        terms = d.split(' ')
        for term in terms:
            if term in tf_idf_weightings.keys():
                tf_idf_weighting = tf_idf_weightings[term]
                word_vector = self.word2vec.wv.get_vector(term).reshape(1, 300)
                weighted_word_vector = tf_idf_weighting * word_vector
                weighted_terms.append(weighted_word_vector)
                descriptor_count += 1
        if len(weighted_terms) == 0:
            counter_empty += 1
        review_vector = [np.zeros(300)] if not len(weighted_terms) else sum(weighted_terms) / len(weighted_terms)
        vectors.append(review_vector)
        ids.append(filtered_data['id'][i])
    # print('Wines without a description:', counter_empty)
    vectors = np.concatenate(vectors)
    # Parenthesize so the conditional selects between the tuple and the bare
    # array; the original `return vectors, np.array(ids) if ... else vectors`
    # always returned a tuple regardless of return_ids.
    return (vectors, np.array(ids)) if return_ids else vectors
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove

    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    # Convert the GloVe text format to word2vec format so gensim can load it.
    glove2word2vec(file_name, file_name + '_modified')
    """
    Alternative: run the conversion via the CLI instead of the function call.
    command = 'python -m gensim.scripts.glove2word2vec -i ' + file_name + ' -o ' + file_name + '_modified'
    os.system(command)
    with open(file_name + '_modified', mode='rb') as file:  # b is important -> binary
        fileContent = file.read()
        print('Content', fileContent)
    """
    print('Finished')
    return glove
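# Once glove2word2vec has written the converted file, gensim's KeyedVectors can
# load it. A minimal sketch, assuming the placeholder file name above (the real
# name depends on TRAINING_SENTENCES, which the original does not show):
from gensim.models import KeyedVectors

# Hypothetical file name produced by train_glove above.
wv = KeyedVectors.load_word2vec_format('embeddings_models/model_glove_100_modified')
print(wv.most_similar('king', topn=5))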
def load_wv():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    sentences = Text8Corpus('text8')
    model = Word2Vec(sentences, size=200)
    model.save('features/text8_w2v_features/text8.model')
    model.wv.save_word2vec_format('features/text8_w2v_features/text.model.bin', binary=True)
def main():
    psr = argparse.ArgumentParser()
    psr.add_argument('-d', '--dim', default=200, type=int)
    psr.add_argument('-p', '--path', default='ubuntu_data/train.txt')
    args = psr.parse_args()
    sentences = Text8Corpus(args.path)
    print('training')
    model = Word2Vec(sentences, size=args.dim, window=5, min_count=5, workers=4)
    model.save('ubuntu_word2vec_' + str(args.dim) + '.model')
    print('saved.')
def preprocess_hebrew(reports_df, corpus_name):
    """
    Hebrew reports preprocessing.
    :param reports_df: dataframe of Hebrew reports
    :param corpus_name: name of the corpus
    :return: model_name, vocab list, raw_text_file to tokenize on
    """
    reports_df = reports_df[conf.col_report][reports_df[conf.col_source] == conf.HEBREW_REPORT_INDICATOR]
    # preprocess
    reports_df = reports_df.apply(sub_preporcess_hebrew)
    text_reports = reports_df.values
    text_reports = ' '.join(list(text_reports)).encode('utf-8')
    # Save raw text
    date = get_date_for_model_name()
    model_name = date + '_bert_' + conf.HEBREW_IDENTIFIER_NAME + '_' + corpus_name
    raw_text_file = conf.DATA_PATH + model_name + '.txt'
    if os.path.exists(raw_text_file):
        os.remove(raw_text_file)
    with open(raw_text_file, "wb") as text_file:
        text_file.write(text_reports)
    # Load training data.
    sentences = Text8Corpus(raw_text_file)
    # Train a toy bigram model.
    phrases = Phrases(sentences, min_count=7, threshold=100, max_vocab_size=len(text_reports))
    del text_reports
    # Export the trained model: uses less RAM and is faster, but the model can no longer be updated.
    bigram = Phraser(phrases)
    reports_df = reports_df.str.split(' ')
    reports_df = reports_df.apply(lambda x: bigram[x])
    reports_df = reports_df.apply(lambda x: " ".join(x))
    reports_df.to_csv(raw_text_file, header=None, index=None, sep=' ', mode='a')
    del reports_df
    counter = Counter()
    with open(raw_text_file, encoding="utf-8") as f:
        for line in f:
            counter.update(line.split())
    vocab = list(counter.keys())
    return model_name, vocab, raw_text_file
def collocation(filepath):
    '''Creates a corpus with collocations: frequently co-occurring bigrams
    are merged into single tokens (new york -> new_york).'''
    abs_path = os.getcwd() + "/"
    corpus = Text8Corpus(datapath(abs_path + filepath))
    phrases = Phrases(corpus)
    collocations = Phraser(phrases)
    text_list = [collocations[line] for line in corpus]
    flattened_list = [i for sub in text_list for i in sub]
    flattened_corpus = " ".join(flattened_list)
    # Write the merged corpus in one call (the original looped over the string
    # and wrote it character by character).
    with open(filepath + ".collocation", "w") as outfile:
        outfile.write(flattened_corpus)
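# For intuition on the Phrases/Phraser step used above, here is a minimal,
# self-contained sketch on a toy corpus (the sentences and the permissive
# thresholds are illustrative, not from the original):
from gensim.models.phrases import Phrases, Phraser

# Toy corpus: "new york" co-occurs often enough to be merged.
toy = [["i", "love", "new", "york"],
       ["new", "york", "is", "huge"],
       ["new", "york", "never", "sleeps"]]
toy_phrases = Phrases(toy, min_count=1, threshold=1)  # permissive for toy data
toy_bigram = Phraser(toy_phrases)
print(toy_bigram[["she", "moved", "to", "new", "york"]])
# -> ['she', 'moved', 'to', 'new_york']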
def main():
    docs = list(itertools.islice(Text8Corpus('text8'), None))

    ''' Build corpus '''
    corpus = Corpus()
    corpus.fit(docs, window=10)

    ''' Train model '''
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('man')
    pprint.pprint(glove.most_similar('man', number=10))
    print('flog')
    pprint.pprint(glove.most_similar('flog', number=10))
    return
def create_graph():
    # ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_cased')
    text8Corpus = Text8Corpus("/Users/divyakoyyalagunta/Projects/codenames/text8.txt",
                              max_sentence_length=10)
    # sentences = ["urged Filipinos to stop weeping for the man who had laughed all the way to the bank",
    #              "Soon after setting off we came to a forested valley along the bank of the Gwaun",
    #              "The condom balloon was denied official entry status this year",
    #              "The marine said, get down behind that grass bank, sir, and he immediately lobbed a mills grenade into the river"]
    for sentence in text8Corpus:
        joined_sentence = " ".join(sentence)
        add_sentence_emb(bert_embedding, joined_sentence)

    print("Number of embeddings", len(embedding_vector_averages))
    print("Number of words in word_to_idx_dict", len(word_to_idx_dict.keys()))

    tree_idx = 0
    mod = 50000
    emb_size = 768
    t = AnnoyIndex(emb_size, metric='angular')
    for x in range(len(embedding_vector_averages)):
        if tree_idx % mod == 0:
            print("ADDED", tree_idx, "EMBEDDINGS TO ANNOY TREE")
        embedding = embedding_vector_averages[x][0]
        if len(embedding) == 0:
            continue
        t.add_item(x, embedding)
        tree_idx += 1
    t.build(100)
    idx_to_word_dict = {v: k for k, v in word_to_idx_dict.items()}
    t.save('annoy_tree_bert_emb_768_test.ann')
    np.save('annoy_tree_index_to_word_bert_emb_768_test.npy', idx_to_word_dict)
def createModel(self, pathCorpus, min_count=5, size=300, workers=8, window=5,
                iter=5, sg=1, negative=10):
    sentences = Text8Corpus(datapath(pathCorpus))
    model = Word2Vec(
        sentences,
        min_count=min_count,  # ignore words that appear less often than this
        size=size,            # dimensionality of the word embeddings
        workers=workers,      # number of processors
        window=window,        # context window for words during training
        iter=iter,            # number of epochs over the corpus
        sg=sg,                # skip-gram true
        negative=negative)
    return model
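# A hedged usage sketch for createModel (and the companion train method shown
# further below). The `Embedder` class name is a stand-in: the original excerpt
# never names the class these methods belong to. 'testcorpus.txt' ships with
# gensim's test data, so datapath resolves it.
from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus
from gensim.test.utils import datapath

class Embedder:
    """Hypothetical owner of createModel, for illustration only."""
    def createModel(self, pathCorpus, min_count=5, size=100, workers=4,
                    window=5, iter=5, sg=1, negative=10):
        sentences = Text8Corpus(datapath(pathCorpus))
        self.model = Word2Vec(sentences, min_count=min_count, size=size,
                              workers=workers, window=window, iter=iter,
                              sg=sg, negative=negative)
        return self.model

model = Embedder().createModel('testcorpus.txt', min_count=1, size=50)
print(model.wv.most_similar(model.wv.index2word[0], topn=3))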
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import Text8Corpus

# Using params from Word2Vec_FastText_Comparison
params = {
    'alpha': 0.05,
    'vector_size': 100,
    'window': 5,
    'epochs': 5,
    'min_count': 5,
    'sample': 1e-4,
    'sg': 1,
    'hs': 0,
    'negative': 5,
}

model = Word2Vec(Text8Corpus(text8_path), **params)
wv = model.wv
print("Using trained model", wv)

###############################################################################
# 3. Construct AnnoyIndex with model & make a similarity query
# ------------------------------------------------------------
#
# An instance of ``AnnoyIndexer`` needs to be created in order to use Annoy in Gensim.
# The ``AnnoyIndexer`` class is located in ``gensim.similarities.annoy``.
#
# ``AnnoyIndexer()`` takes two parameters:
#
# * **model**: A ``Word2Vec`` or ``Doc2Vec`` model.
# * **num_trees**: A positive integer. ``num_trees`` affects the build
#   time and the index size. **A larger value will give more accurate results,
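###############################################################################
# Following the parameters described above, a minimal sketch of building the
# indexer and running a query against the model trained earlier in this
# excerpt (100 trees and the query word are illustrative choices):
from gensim.similarities.annoy import AnnoyIndexer

# Build the Annoy index from the trained model; more trees means a more
# accurate index at the cost of build time and size.
annoy_index = AnnoyIndexer(model, num_trees=100)

# Approximate nearest neighbours, routed through the Annoy index.
print(wv.most_similar("science", topn=5, indexer=annoy_index))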
def multibleutest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    if not os.path.exists("./multiBleu_log/"):
        os.makedirs("./multiBleu_log/")
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        model = GAN('G_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                    _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                    FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                    use_attn=FLAGS.use_attn, output_sample=True,
                    input_embed=True, batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        prompt = None
        cheatsheetMAP = {}
        with open(FLAGS.test_path, 'r') as source:
            for line in source.readlines():
                line = line.strip()
                if line == "":
                    prompt = None
                    continue
                elif prompt is None:
                    cheatsheetMAP[line] = []
                    prompt = line
                else:
                    cheatsheetMAP[prompt].append(line)

        answer = []
        with open("./multiBleu_log/" + FLAGS.file_head + "_ref.txt", 'w') as ffop:
            with open("./multiBleu_log/" + FLAGS.file_head + "_Q.txt", 'w') as fop:
                feature, output_file, output_list = [], [], []
                for i in range(FLAGS.feature_size):
                    output_file.append("./multiBleu_log/" + FLAGS.file_head + "_{}.txt".format(i))
                    feature.append([[3 if x == i else 0 for x in range(FLAGS.feature_size)]])
                    output_list.append([])
                for p, refs in cheatsheetMAP.items():
                    check = False
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(p), vocab, normalize_digits=False)
                    token_ids.append(data_utils.EOS_ID)
                    if len(token_ids) > _buckets[-1][0]:
                        continue
                    encoder_pad = [data_utils.PAD_ID] * (_buckets[-1][0] - len(token_ids))
                    encoder_lens = [len(token_ids)]
                    token_ids = list(token_ids) + encoder_pad
                    encoder_inputs = []
                    for idx in token_ids:
                        encoder_inputs.append([idx])
                    decoder_inputs = [[data_utils.GO_ID]]
                    outputs_list = []
                    for x in range(FLAGS.feature_size):
                        A, outputs, log_prob = model.dynamic_decode_G(
                            sess, encoder_inputs, encoder_lens,
                            decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                        outputs = [output_ids[0] for output_ids in outputs]
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        if data_utils.UNK_ID in outputs:
                            check = True
                            break
                        outputs_list.append(" ".join(
                            [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
                    if check:
                        continue
                    else:
                        fop.write(p)
                        fop.write('\n')
                        for x in refs:
                            ffop.write(x + '\n')
                        ffop.write('\n')
                        answer.append(refs)
                        for x in range(FLAGS.feature_size):
                            output_list[x].append(outputs_list[x])

        for x in range(FLAGS.feature_size):
            with open(output_file[x], 'w') as op:
                bleu = []
                for i, line in enumerate(output_list[x]):
                    op.write(line)
                    op.write("\n")
                    score = sentence_bleu(answer[i], line)
                    bleu.append(score)
                op.write("My BLEU: {}".format(sum(bleu) / len(bleu)))
                op.write('\n')
                                scorer=None)  # we will use our score_item function redefinition
        new_s = []
        for words, score in bigrams:
            if score is not None:
                words = delimiter.join(words)
            new_s.append(words)
        return [utils.to_unicode(w) for w in new_s]


if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logging.info("running %s", " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    infile = sys.argv[1]

    from gensim.models import Phrases  # noqa:F811 for pickle
    from gensim.models.word2vec import Text8Corpus

    sentences = Text8Corpus(infile)
    # test_doc = LineSentence('test/test_data/testcorpus.txt')
    bigram = Phrases(sentences, min_count=5, threshold=100)
    for s in bigram[sentences]:
        print(utils.to_utf8(u' '.join(s)))
import itertools
import re
from multiprocessing import Pool

import numpy as np
from scipy import spatial
from gensim.models.word2vec import Text8Corpus
import glove

sentences = list(itertools.islice(Text8Corpus('text8'), None))

# The original read "iv.txt" into a string and then called .fit(sentences,
# window=10) on it, which cannot work on a str. The call signature matches
# glove's Corpus.fit, so a Corpus object is assumed here.
iv = glove.Corpus()
iv.fit(sentences, window=10)
def word2vec():
    data = Text8Corpus('data/corpus_winestyle.txt')
    return Word2Vec(data, size=300)
def Similarity_Criteria(data1, criteria, MAX_CAT, Max_Iter):
    import pandas as pd
    import numpy as np

    text_to_be_used_later = data1['Text'].tolist()
    data1['Text'] = cleanText(data1)
    data1['Category'] = data1['Category'].astype(str)
    y = data1['Category'].tolist()
    cols_in_data_final = [w.replace('[', '_') for w in y]
    cols_in_data_final = [w.replace(']', '_') for w in cols_in_data_final]
    cols_in_data_final = [w.replace('<', '_') for w in cols_in_data_final]
    data1['Category'] = cols_in_data_final

    if criteria == 'Doc2Vec':
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument
        from nltk.tokenize import word_tokenize
        data = data1['Text'].astype(str).tolist()
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
                       for i, _d in enumerate(data)]
        vec_size = 100
        ALPHA = 0.01
        max_epochs = 100
        DM = 1
        MIN_COUNT = 5
        MIN_ALPHA = 0.0001
        model = Doc2Vec(size=vec_size, alpha=ALPHA, min_alpha=MIN_ALPHA,
                        min_count=MIN_COUNT, dm=DM, seed=1234, workers=1)
        model.build_vocab(tagged_data)
        for epoch in range(int(max_epochs)):
            # print('iteration {0}'.format(epoch))
            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']
        new_categories_to_fill = []
        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[int(model.docvecs.most_similar([i])[0][0])]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] == original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']
            j += 1
            print('Iteration....', j)

    if criteria == 'TfIdf':
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import linear_kernel
        tf = TfidfVectorizer(input=data1['Text'].tolist(), analyzer='word',
                             lowercase=False, ngram_range=(1, 10),
                             sublinear_tf=True, norm='l2')
        tfidf_matrix = tf.fit_transform(data1['Text'].tolist())
        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
        np.fill_diagonal(cosine_similarities, -1)
        distances = np.argmax(cosine_similarities, axis=1)
        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']
        new_categories_to_fill = []
        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[distances[i]]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] == original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']
            j += 1
            print('Iteration....', j)

    if criteria == 'Word2Vec-PretrainedGoogle':
        import gensim.models.keyedvectors as word2vec
        model = word2vec.KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin.gz', binary=True)
        index2word_set = set(model.wv.index2word)

        def avg_feature_vector(sentence, model, num_features, index2word_set):
            words = sentence.split()
            feature_vec = np.zeros((num_features,), dtype='float32')
            n_words = 0
            for word in words:
                if word in index2word_set:
                    n_words += 1
                    feature_vec = np.add(feature_vec, model[word])
            if n_words > 0:
                feature_vec = np.divide(feature_vec, n_words)
            return feature_vec

        foo = avg_feature_vector(data1['Text'][0], model=model,
                                 num_features=300, index2word_set=index2word_set)
        foo = pd.DataFrame(foo).T
        fooToFill = foo[0:0]
        for i in range(len(data1)):
            foo = avg_feature_vector(data1['Text'][i], model=model,
                                     num_features=300, index2word_set=index2word_set)
            foo = pd.DataFrame(foo).T
            fooToFill = pd.concat([fooToFill, foo], axis=0)
        from sklearn.metrics.pairwise import cosine_distances
        cosine_similarities = 1 - cosine_distances(fooToFill, fooToFill)
        np.fill_diagonal(cosine_similarities, -1)
        distances = np.argmax(cosine_similarities, axis=1)
        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']
        new_categories_to_fill = []
        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[distances[i]]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] == original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']
            j += 1
            print('Iteration....', j)

    if criteria == 'Word2Vec-Text8Corpus':
        from gensim.models.word2vec import Text8Corpus
        from gensim.models import Word2Vec
        w2v_model2 = Word2Vec(Text8Corpus('text8'), size=100, window=5,
                              min_count=5, workers=4)
        index2word_set = set(w2v_model2.wv.index2word)

        def avg_feature_vector_Text8(sentence, model, num_features, index2word_set):
            words = sentence.split()
            feature_vec = np.zeros((num_features,), dtype='float32')
            n_words = 0
            for word in words:
                if word in index2word_set:
                    n_words += 1
                    feature_vec = np.add(feature_vec, model[word])
            if n_words > 0:
                feature_vec = np.divide(feature_vec, n_words)
            return feature_vec

        foo = avg_feature_vector_Text8(data1['Text'][0], model=w2v_model2,
                                       num_features=100, index2word_set=index2word_set)
        foo = pd.DataFrame(foo).T
        fooToFill = foo[0:0]
        for i in range(len(data1)):
            foo = avg_feature_vector_Text8(data1['Text'][i], model=w2v_model2,
                                           num_features=100, index2word_set=index2word_set)
            foo = pd.DataFrame(foo).T
            fooToFill = pd.concat([fooToFill, foo], axis=0)
        from sklearn.metrics.pairwise import cosine_distances
        cosine_similarities = 1 - cosine_distances(fooToFill, fooToFill)
        np.fill_diagonal(cosine_similarities, -1)
        distances = np.argmax(cosine_similarities, axis=1)
        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']
        new_categories_to_fill = []
        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[distances[i]]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] == original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']
            j += 1
            print('Iteration....', j)

    data1['Text'] = text_to_be_used_later
    return data1
def seriestest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        # build the model
        model = StarGAN('G_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                        _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                        FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                        use_attn=FLAGS.use_attn, output_sample=True,
                        input_embed=True, batch_size=FLAGS.batch_size,
                        D_lambda=FLAGS.lambda_dis,
                        G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                        dtype=tf.float32)
        # sess.run(tf.variables_initializer(tf.global_variables()))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            if sentence.strip() == 'exit()':
                break
            # step
            number = 0
            feature = []
            for f in range(FLAGS.feature_size):
                feature.append([[3 if x == f else 0 for x in range(FLAGS.feature_size)]])
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
            print(token_ids)
            token_ids.append(data_utils.EOS_ID)
            encoder_pad = [data_utils.PAD_ID] * (_buckets[-1][0] - len(token_ids))
            encoder_lens = [len(token_ids)]
            # feature in my implementation
            token_ids = list(token_ids) + encoder_pad
            encoder_inputs = []
            for idx in token_ids:
                encoder_inputs.append([idx])
            print(encoder_inputs)
            decoder_inputs = [[data_utils.GO_ID]]
            for x in range(FLAGS.feature_size):
                A, outputs, log_prob = model.dynamic_decode_G(
                    sess, encoder_inputs, encoder_lens,
                    decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                # print(A)
                # outputs = [int(np.argmax(logit, axis=1)) for logit in outputs]
                outputs = [output_ids[0] for output_ids in outputs]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                print(feature[x], ':', " ".join(
                    [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
                print(log_prob)
            sys.stdout.write('> ')
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def filetest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    if not os.path.exists('./logout/'):
        os.makedirs('./logout/')
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        # build the model
        model = StarGAN('G_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                        _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                        FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                        use_attn=FLAGS.use_attn, output_sample=True,
                        input_embed=True, batch_size=FLAGS.batch_size,
                        D_lambda=FLAGS.lambda_dis,
                        G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                        dtype=tf.float32)
        # sess.run(tf.variables_initializer(tf.global_variables()))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        output_path = 'responses_six.txt'
        output_log = open(output_path, 'w')
        name_list = ['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel']
        with open(FLAGS.test_path, 'r') as sentences:
            # step
            number = 0
            feature = []
            output_file = []
            output_list = []
            output_file.append('./logout/1.txt')
            output_file.append('./logout/2.txt')
            output_file.append('./logout/3.txt')
            output_file.append('./logout/4.txt')
            output_file.append('./logout/5.txt')
            output_file.append('./logout/6.txt')
            for f in range(FLAGS.feature_size):
                feature.append([[3 if x == f else 0 for x in range(FLAGS.feature_size)]])
                output_list.append([])
            for id, sentence in enumerate(sentences.readlines()):
                if id % 2 == 1:
                    continue
                number = number + 1
                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
                token_ids.append(data_utils.EOS_ID)
                if len(token_ids) > _buckets[-1][0]:
                    continue
                output_line = 'input : ' + sentence
                output_log.write(output_line)
                encoder_pad = [data_utils.PAD_ID] * (_buckets[-1][0] - len(token_ids))
                encoder_lens = [len(token_ids)]
                # feature in my implementation
                token_ids = list(token_ids) + encoder_pad
                encoder_inputs = []
                for idx in token_ids:
                    encoder_inputs.append([idx])
                decoder_inputs = [[data_utils.GO_ID]]
                for x in range(FLAGS.feature_size):
                    A, outputs, log_prob = model.dynamic_decode_G(
                        sess, encoder_inputs, encoder_lens,
                        decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                    outputs = [output_ids[0] for output_ids in outputs]
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    output_list[x].append(" ".join(
                        [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
                    output_line = name_list[x] + ': ' + " ".join(
                        [tf.compat.as_str(rev_vocab[output]) for output in outputs]) + '\n'
                    output_log.write(output_line)
                output_log.write('\n')
                if number % 10 == 0:
                    print('parsing line ', number)
                if number == 1000:
                    output_log.close()
                    break
        for x in range(FLAGS.feature_size):
            with open(output_file[x], 'w') as op:
                for line in output_list[x]:
                    op.write(line)
                    op.write('\n')
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import Text8Corpus

params = {
    'alpha': 0.05,
    'size': 100,
    'window': 5,
    'iter': 5,
    'min_count': 5,
    'sample': 1e-4,
    'sg': 1,
    'hs': 0,
    'negative': 5,
}

model = Word2Vec(Text8Corpus('text8'), **params)
print(model)

from gensim.similarities.index import AnnoyIndexer

model.init_sims()
annoy_index = AnnoyIndexer(model, 100)


def GetWords(vector):
    return model.most_similar([vector], topn=5, indexer=annoy_index)


vector = model.wv.syn0norm[0]
GetWords(vector)
def __iter__(self):
    corpus = Text8Corpus(self.fn)
    for doc in corpus:
        yield doc
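# For context, this __iter__ presumably belongs to a small corpus-wrapper
# class. A minimal self-contained sketch of such a wrapper (the class name is
# an assumption; only the __iter__ body above comes from the original):
from gensim.models.word2vec import Text8Corpus, Word2Vec

class Text8Stream:
    """Hypothetical wrapper: re-creates Text8Corpus on each pass, so the
    corpus can be iterated multiple times, as Word2Vec training requires."""
    def __init__(self, fn):
        self.fn = fn

    def __iter__(self):
        corpus = Text8Corpus(self.fn)
        for doc in corpus:
            yield doc

# Word2Vec iterates the stream once to build the vocabulary and again per epoch.
model = Word2Vec(Text8Stream('text8'), size=100)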
def disctest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        model = GAN('D_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                    _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                    FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                    use_attn=FLAGS.use_attn, output_sample=True,
                    input_embed=True, batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            if sentence.strip() == 'exit()':
                break
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
            token_ids.append(data_utils.EOS_ID)
            encoder_pad = [data_utils.PAD_ID] * (_buckets[-1][0] - len(token_ids))
            encoder_lens = [len(token_ids)]
            token_ids = list(token_ids) + encoder_pad
            # Build one single-element list per token id; the original appended
            # a generator object instead, which the decoder cannot consume.
            encoder_inputs = [[idx] for idx in token_ids]
            rf, c = model.dynamic_decode_D(sess, encoder_inputs, encoder_lens, gloveA_emb)
            print('rf : ', rf)
            print('c : ', c)
            sys.stdout.write('> ')
            sys.stdout.flush()
            sentence = sys.stdin.readline()
import logging

from gensim.models.word2vec import Word2Vec, Text8Corpus

# Enable logging.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Read data.
sentences = Text8Corpus('data/ja.text8')

# Train the model using Skip-gram.
model = Word2Vec(sentences, size=100, window=5, sg=1)

# Save the model.
model.save('models/model.bin')

####################################################

# Load the model.
model = Word2Vec.load('models/model.bin')

# 1. Get the word embedding.
ret1 = model.wv['猫']
print(ret1)
# [ 0.02126932  0.15553345  0.10472752  0.82008636  0.323413    0.42847344
#  -0.05886601 -0.03228    -0.49861073 -0.13823172  0.15907998 -0.2955121
#   0.15381564  0.02959213  0.45588103  0.04573615  0.10711562 -0.7498988
#   0.34765413  0.5668533   0.0082125   0.40620092  0.1419684  -0.15594704
#   0.2681074  -0.00587511 -0.17240909 -0.04313468  0.01801641 -0.08588244
#  -0.26890314 -0.58127177 -0.29637957  0.09391042  0.26176983 -0.09310274
#  -0.05240794  0.4360441   0.25374362  0.2856923  -0.09161343 -0.34498295
#   0.4329259   0.14602754  0.03789869 -0.16791926 -0.4877344   0.17858095
#   0.4094406  -0.0850195  -0.11097047 -0.22874318  0.20079853 -0.22009209
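# The excerpt cuts off after the embedding lookup. A plausible next step in the
# same numbered style (the query is illustrative, not from the original):

# 2. Find words similar to a given word.
ret2 = model.wv.most_similar('猫', topn=5)
print(ret2)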
from util.utils import get_logger

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = get_logger(__name__)


class Word2VecModel:
    def __init__(self):
        self.default_model_path = 'text8.model'
        self.model = None

    def get_model(self):
        if self.model:
            return self.model
        logger.info('Start loading word2vec model...')
        word2Vec = Word2Vec()
        self.model = word2Vec.load_word2vec_format(
            os.getenv('WORD2VEC_MODEL_PATH', self.default_model_path), binary=True)
        # self.model = word2Vec.load(os.getenv('WORD2VEC_MODEL_PATH', self.default_model_path))
        self.model.init_sims(replace=True)
        logger.info('Finish loading word2vec model...')
        return self.model


if __name__ == '__main__':
    # test word2vec
    text8Corpus = Text8Corpus(fname='/home/diepdt/Documents/word2vec/text8')
    model = Word2Vec(text8Corpus, workers=4)
    model.save('text8.model')
    model.most_similar(['apple'])
def train_GAN():
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    set_session(tf.Session(config=config))

    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)
    if not os.path.exists(FLAGS.pretrain_dir):
        os.makedirs(FLAGS.pretrain_dir)
    if not os.path.exists(FLAGS.gan_dir):
        os.makedirs(FLAGS.gan_dir)

    def build_summaries():
        train_loss = tf.Variable(0.)
        tf.summary.scalar("train_loss", train_loss)
        summary_vars = [train_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    feature, data, train, data_voc, train_voc = \
        data_utils.prepare_data(FLAGS.feature_path, FLAGS.feature_size, FLAGS.data_dir,
                                FLAGS.data_path, FLAGS.train_path, FLAGS.vocab_size)
    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    data_utils.combine_corpus(data_voc, train_voc, vocab_path, glove_corpus_path, 28)
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    if not os.path.exists(modelA):
        gloveA = Glove(no_components=FLAGS.size, learning_rate=0.05)
        gloveA.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True)
        gloveA.add_dictionary(corpus.dictionary)
        gloveA.save(modelA)  # 512
    if not os.path.exists(modelB):
        gloveB = Glove(no_components=int(FLAGS.size * 3 / 4), learning_rate=0.05)
        gloveB.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True)
        gloveB.add_dictionary(corpus.dictionary)
        gloveB.save(modelB)  # 384
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    with tf.Session() as sess:
        model = GAN('GAN', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                    _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                    FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                    use_attn=FLAGS.use_attn, output_sample=True,
                    input_embed=True, batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)
        # build summary and initialize
        summary_ops, summary_vars = build_summaries()
        sess.run(tf.variables_initializer(tf.global_variables()))
        log_dir = os.path.join(FLAGS.model_dir, 'log')
        writer = tf.summary.FileWriter(log_dir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('read in model from {}'.format(ckpt.model_checkpoint_path))
            model.saver.restore(sess, ckpt.model_checkpoint_path)

        # load in train and dev(valid) data with buckets
        train_set = read_data_with_buckets(train, FLAGS.max_train_data_size)
        data_set = read_data_with_buckets(data, FLAGS.max_train_data_size)
        train_buckets_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_buckets_sizes))
        print('each buckets has: {d}'.format(d=train_buckets_sizes))
        train_buckets_scale = [sum(train_buckets_sizes[:i + 1]) / train_total_size
                               for i in range(len(train_buckets_sizes))]

        # main process
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        # glove embeddings
        gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
        gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

        ### ------------------------------------------------------------ ###
        ###                           Pretrain                            ###
        ### ------------------------------------------------------------ ###
        while True:
            # get batch from a randomly selected bucket
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in range(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])
            # get batch for the pretraining data
            feature_inputs_f, encoder_inputs_f, decoder_inputs_f, weights_f, seq_lens_f, _, \
                feature_inputs_b, encoder_inputs_b, decoder_inputs_b, weights_b, seq_lens_b, _ = \
                get_batch_with_buckets(FLAGS.feature_size, data_set, FLAGS.batch_size, bucket_id)
            # pretrain start!
            start_time = time.time()
            forloss, _, _, _ = model.train_previous(
                sess, encoder_inputs_f, feature_inputs_f,
                decoder_inputs_f, weights_f, encoder_inputs_b,
                feature_inputs_b, decoder_inputs_b, weights_b,
                bucket_id, gloveA_emb, gloveB_emb, seq_lens_f, seq_lens_b)
            step_loss = forloss
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
            # print('pretrain : ', step_loss)

            ### ------------------------------------------------------------ ###
            ###                          Train GAN                            ###
            ### ------------------------------------------------------------ ###
            for _ in range(FLAGS.Dstep):
                # get batch from a randomly selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([i for i in range(len(train_buckets_scale))
                                 if train_buckets_scale[i] > random_number_01])
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                    feature_inputs_b, decoder_inputs_b, weights_b, \
                    real_inputs, real_feature, real_seq_lens = \
                    get_gan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)
                # D_step start!
                start_time = time.time()
                _, D_loss = model.train_gan(
                    sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f,
                    decoder_inputs_b, weights_b, feature_inputs_b,
                    real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb,
                    disc=True, real_seq_len=real_seq_lens, forward_seq_len=seq_lens_f)
                step_loss = D_loss
                step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                # print('D_step : ', step_loss)

            for _ in range(FLAGS.Gstep):
                # get batch from a randomly selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([i for i in range(len(train_buckets_scale))
                                 if train_buckets_scale[i] > random_number_01])
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                    feature_inputs_b, decoder_inputs_b, weights_b, \
                    real_inputs, real_feature, real_seq_lens = \
                    get_gan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)
                # G_step (forward) start!
                start_time = time.time()
                _, for_reward = model.train_gan(
                    sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f,
                    decoder_inputs_b, weights_b, feature_inputs_b,
                    real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb,
                    forward=True, real_seq_len=real_seq_lens, forward_seq_len=seq_lens_f)
                step_loss = for_reward
                step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                # print('for_loss :', step_loss)

                # get batch from a randomly selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([i for i in range(len(train_buckets_scale))
                                 if train_buckets_scale[i] > random_number_01])
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                    feature_inputs_b, decoder_inputs_b, weights_b, \
                    real_inputs, real_feature, real_seq_lens = \
                    get_gan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)
                # G_step (backward) start!
                start_time = time.time()
                _, back_reward = model.train_gan(
                    sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f,
                    decoder_inputs_b, weights_b, feature_inputs_b,
                    real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb,
                    backward=True, real_seq_len=real_seq_lens, forward_seq_len=seq_lens_f)
                step_loss = back_reward
                step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                # print('back_loss :', step_loss)

            current_step += 1
            # log, save and eval
            if current_step % FLAGS.steps_per_checkpoint == 0:
                perplexity = math.exp(float(loss)) if loss < 300 else float('inf')
                print("Generator step %d; learning rate %.4f; learning_rate_star %.6f; "
                      "D_lr %6f; step-time %.2f; perplexity %.2f; loss %.2f"
                      % (model.global_F_step.eval(), model.learning_rate.eval(),
                         model.learning_rate_star.eval(), model.D_lr.eval(),
                         step_time, perplexity, loss))
                # Decrease learning rate if no improvement was seen over the last 3 checkpoints.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.op_lr_decay)
                    sess.run(model.op_D_lr_decay)
                    sess.run(model.learning_rate_star_decay)
                previous_losses.append(loss)
                # write summary
                feed_dict = {}
                feed_dict[summary_vars[0]] = loss
                summary_str = sess.run(summary_ops, feed_dict=feed_dict)
                writer.add_summary(summary_str, model.global_F_step.eval())
                writer.flush()
                # Save checkpoint and zero timer and loss.
                ckpt_path = os.path.join(FLAGS.model_dir, "ckpt")
                model.saver.save(sess, ckpt_path, global_step=model.global_F_step)
                gan_path = os.path.join(FLAGS.gan_dir, "ckpt_prev")
                model.saver.save(sess, gan_path, global_step=model.global_F_step)
                step_time, loss = 0.0, 0.0
                sys.stdout.flush()
def train(self, pathCorpus, epochs=60, compute_loss=True):
    sentences = Text8Corpus(datapath(pathCorpus))
    self.model.train(sentences, epochs=epochs,
                     total_examples=self.model.corpus_count,
                     compute_loss=compute_loss)
def filetest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]
    if not os.path.exists('./fileTest_log/'):
        os.makedirs('./fileTest_log/')

    with tf.Session() as sess:
        model = GAN('G_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                    _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                    FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                    use_attn=FLAGS.use_attn, output_sample=True,
                    input_embed=True, batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        with open(FLAGS.test_path, 'r') as sentences:
            # step
            number = 0
            feature = []
            output_file = []
            for idx in range(FLAGS.feature_size):
                output_file.append('./fileTest_log/%s.txt' % idx)
            output_list = []
            for f in range(FLAGS.feature_size):
                feature.append([[3 if x == f else 0 for x in range(FLAGS.feature_size)]])
                output_list.append([])
            for id, sentence in enumerate(sentences.readlines()):
                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
                token_ids.append(data_utils.EOS_ID)
                if len(token_ids) > _buckets[-1][0]:
                    continue
                encoder_pad = [data_utils.PAD_ID] * (_buckets[-1][0] - len(token_ids))
                encoder_lens = [len(token_ids)]
                token_ids = list(token_ids) + encoder_pad
                # Build one single-element list per token id; the original
                # appended a generator object, which the decoder cannot consume.
                encoder_inputs = [[idx] for idx in token_ids]
                decoder_inputs = [[data_utils.GO_ID]]
                for x in range(FLAGS.feature_size):
                    A, outputs, log_prob = model.dynamic_decode_G(
                        sess, encoder_inputs, encoder_lens,
                        decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                    outputs = [output_ids[0] for output_ids in outputs]
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    output_list[x].append(" ".join(
                        [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
                if number % 1000 == 0:
                    print('fileTest: generating line %d' % number)
                if number == 1000:
                    break
                number += 1
        for x in range(FLAGS.feature_size):
            with open(output_file[x], 'w') as op:
                for line in output_list[x]:
                    op.write(line)
                    op.write('\n')
# model.wv.save_word2vec_format('article_vec', 'article_vocab')

# sentences = LineSentence('new_data/word_seg')
# model = Word2Vec(sentences, min_count=5, size=200, window=5, negative=5, sg=1,
#                  hs=0, iter=1, workers=4)
# model.save("word_seg_word2vec.model")
# model.get_latest_training_loss()
# Save the vocabulary
# model.wv.save_word2vec_format('word_seg_vec', 'word_seg_vocab')

# Similarity search
# ---- using the text8 file as an example ----
# Load the data in the required format
sentences = Text8Corpus('new_data/text8')
# Create the model and train it
model = Word2Vec(sentences, size=200, window=5, min_count=5, workers=4,
                 sg=1, hs=0, negative=5, iter=1)
# Save the model
model.save("text8_word2vec.model")
# Inspect the vector of a single word
print(model.wv['computer'])
# Similarity search, to check the results
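# The snippet breaks off at the similarity-search step; a plausible
# continuation in the same vein (the query word is illustrative):
print(model.wv.most_similar('computer', topn=10))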
def maxbleutest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    if not os.path.exists("./maxBleu_log/"):
        os.makedirs("./maxBleu_log/")
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)
    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        model = GAN('G_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size,
                    _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr,
                    FLAGS.lr_decay, FLAGS.grad_norm, critic=None,
                    use_attn=FLAGS.use_attn, output_sample=True,
                    input_embed=True, batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        group_data, _ = data_utils.training_data_grouping(
            FLAGS.train_path, FLAGS.feature_path, FLAGS.feature_size)

        with open("./maxBleu_log/" + FLAGS.file_head + "_ref_msg.txt", 'w') as fop:
            with open(FLAGS.test_path, 'r') as sentences:
                number = 0
                feature, output_file, output_list = [], [], []
                for i in range(FLAGS.feature_size):
                    output_file.append("./maxBleu_log/" + FLAGS.file_head + "_{}.txt".format(i))
                    feature.append([[3 if x == i else 0 for x in range(FLAGS.feature_size)]])
                    output_list.append([])
                for sentence in sentences.readlines():
                    if number % 100 == 0:
                        print("maxBleuTesting: parsing line {}".format(number))
                    if number == 500:
                        break
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
                    token_ids.append(data_utils.EOS_ID)
                    if len(token_ids) > _buckets[-1][0]:
                        continue
                    encoder_pad = [data_utils.PAD_ID] * (_buckets[-1][0] - len(token_ids))
                    encoder_lens = [len(token_ids)]
                    token_ids = list(token_ids) + encoder_pad
                    # Build one single-element list per token id; the original
                    # appended a generator object, which the decoder cannot consume.
                    encoder_inputs = [[idx] for idx in token_ids]
                    decoder_inputs = [[data_utils.GO_ID]]
                    check = False
                    outputs_list = []
                    for x in range(FLAGS.feature_size):
                        A, outputs, log_prob = model.dynamic_decode_G(
                            sess, encoder_inputs, encoder_lens,
                            decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                        outputs = [output_ids[0] for output_ids in outputs]
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        if data_utils.UNK_ID in outputs:
                            check = True
                            break
                        outputs_list.append(" ".join(
                            [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
                    if check:
                        continue
                    else:
                        number += 1
                        fop.write(sentence)
                        for x in range(FLAGS.feature_size):
                            output_list[x].append(outputs_list[x])

        for x in range(FLAGS.feature_size):
            with open(output_file[x], 'w') as op:
                pos_lst = []
                for i, line in enumerate(output_list[x]):
                    op.write(line)
                    op.write("\n")
                    grps_bleu = []
                    for _, sentence_lst in group_data.items():
                        bleu = -1
                        for ref in sentence_lst:
                            score = sentence_bleu(ref, line)
                            if score > bleu:
                                bleu = score
                        grps_bleu.append(bleu)
                    max_bleu = max(grps_bleu)
                    # predicted persona: max_candidates
                    max_candidates = [i for i, x in enumerate(grps_bleu) if x == max_bleu]
                    if len(max_candidates) > 1:
                        pos_lst.append(-1)
                    else:
                        pos_lst.append(max_candidates[0])
                tranc_pos_lst = [pos_lst[i] for i in range(len(pos_lst)) if pos_lst[i] != -1]
                op.write("MaxBLEU Acc: {}".format(tranc_pos_lst.count(x) / len(tranc_pos_lst)))
from gensim.models.word2vec import Word2Vec, BrownCorpus, Text8Corpus
import os
import itertools

word_count = 0
sent_count = 0
for f in os.listdir("./rawdata/training-monolingual.tokenized.shuffled"):
    if not os.path.isfile("./rawdata/training-monolingual.tokenized.shuffled/" + f):
        continue
    with open("rawdata/training-monolingual.tokenized.shuffled/" + f, "r") as file:
        for line in file:
            sent_count += 1
            word_count += len(line.strip().split(" "))

brown = BrownCorpus("/home/david/nltk_data/corpora/brown/")
# Gotta provide total_examples
text8 = Text8Corpus("./rawdata/text8")
sent = itertools.chain(text8, brown)
for snt in sent:
    sent_count += 1
    word_count += len(snt)

print("SENTENCE COUNT " + str(sent_count))
print("WORD COUNT " + str(word_count))