def word_embedding(sentences, embedding_size, windows_len):
    """Build a co-occurrence corpus and an (untrained) GloVe model for the sentences."""
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=windows_len)

    # The model is initialised but not trained here (epochs=0); call fit again
    # with more epochs to actually learn the embeddings.
    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(corpus_model.matrix, epochs=0, no_threads=2)

    # Log of the co-occurrence matrix, the quantity GloVe tries to reproduce.
    log_cooc_mat = corpus_model.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    corpus_dict = corpus_model.dictionary
    corpus_inverse_dict = dict(map(reversed, corpus_dict.items()))

    return glove_model, corpus_dict, corpus_inverse_dict
def test_supplied_dict_missing():
    dictionary = {'a': 1, 'naïve': 0}

    corpus = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=dictionary)

    # 'fox' is missing from the supplied dictionary, so fitting without
    # ignore_missing=True is expected to raise a KeyError.
    with pytest.raises(KeyError):
        model.fit(corpus, max_map_size=0, window=10)
def pretrain(self, data_src):
    if not os.path.isfile("glove.model"):
        data_src = DataClean([
            ["[^a-z]", " "],  # only letters
            [" [ ]+", " "],   # remove extra spaces
        ], html_clean=True, split_words=True).fit(data_src).transform(data_src)

        corpus_model = Corpus()
        corpus_model.fit(data_src, window=self.window)

        glove = Glove(no_components=self.num_features, learning_rate=self.learning_rate)
        glove.fit(corpus_model.matrix, epochs=self.epochs, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save("glove.model")
def main():
    corpus_model = Corpus()
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
def train_glove(sentences):
    print('training glove model...')
    t0 = time()

    num_features = 300   # Word vector dimensionality
    context = 5          # Context window size
    learning_rate = 0.05

    corpus = Corpus()
    corpus.fit(sentences, window=context)

    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('took %0.5fs.' % (time() - t0))
    return glove
def test_supplied_dictionary():
    dictionary = {'a': 2, 'naïve': 1, 'fox': 0}

    corpus = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=dictionary)
    model.fit(corpus, max_map_size=0, window=10)

    assert model.dictionary == dictionary
    assert model.matrix.shape == (len(dictionary), len(dictionary))
    assert (model.matrix.tocsr()[2]).sum() == 0
def run_glove(self):
    """ run global vector """
    #sentences = [["hi","good","to"],["see","u"]]
    sentences = self.get_sentences()

    print('\n' + '-' * 80)
    print("Fitting words into corpus")
    corpus = Corpus()
    corpus.fit(sentences, window=10)

    print("Running Glove")
    glove = Glove(no_components=200, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print("Fitting words and vectors into unique_words and vectors200")
    unique_words = []
    vectors200 = []

    cnt1 = 0
    length1 = len(glove.inverse_dictionary)
    for word_id in glove.inverse_dictionary:
        cnt1 += 1
        unique_words.append(glove.inverse_dictionary[word_id])
        vectors200.append(glove.word_vectors[word_id])
        sys.stdout.write("\rStatus: %s / %s" % (cnt1, length1))
        sys.stdout.flush()

    print('\n' + "Processing vectors200")
    processed_vectors200 = []
    cnt2 = 0
    length2 = len(vectors200)
    for vector in vectors200:
        cnt2 += 1
        # Convert each vector to a plain list of floats; the list is rebuilt per
        # vector (the original accumulated all values into a single shared list).
        processed_vector = [float_num for float_num in vector]
        processed_vectors200.append(processed_vector)
        sys.stdout.write("\rStatus: %s / %s" % (cnt2, length2))
        sys.stdout.flush()

    return unique_words, processed_vectors200
def test_corpus_construction():
    corpus_words = ['a', 'naïve', 'fox']
    corpus = [corpus_words]

    model = Corpus()
    model.fit(corpus, max_map_size=0, window=10)

    for word in corpus_words:
        assert word in model.dictionary

    assert model.matrix.shape == (len(corpus_words), len(corpus_words))

    expected = [[0.0, 1.0, 0.5],
                [0.0, 0.0, 1.0],
                [0.0, 0.0, 0.0]]

    assert (model.matrix.todense().tolist() == expected)
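# A rough sketch (not part of the original test) of how the expected matrix above
# arises, assuming the usual GloVe-style 1/distance weighting of co-occurrences
# within the window; it assumes all tokens in the toy sentence are distinct.
import numpy as np

def cooccurrence_sketch(tokens, window):
    vocab = {w: i for i, w in enumerate(tokens)}
    mat = np.zeros((len(vocab), len(vocab)))
    for i, w in enumerate(tokens):
        for j in range(i + 1, min(i + window + 1, len(tokens))):
            mat[vocab[w], vocab[tokens[j]]] += 1.0 / (j - i)  # weight decays with distance
    return mat

print(cooccurrence_sketch(['a', 'naïve', 'fox'], window=10))
# [[0.  1.  0.5]
#  [0.  0.  1. ]
#  [0.  0.  0. ]]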
def build_model_glove(args):
    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def build_glove_embeddings(training, testing, args):
    '''
    Trains the model on the sentiment140 dataset

    @Arguments:
        data: the loaded sentiment140 dataset from module
        num_epochs: the number of epochs to train on
        num_threads: the number of threads to use
        num_components: the number of components the glove model should use
        learning_rate: the model's learning rate
        window_size: the size of the window to use when looking for word co-occurrence
        verbose: boolean for whether or not extensive output should be printed to screen

    @Return:
        A trained glove model
    '''

    # initialize model
    glove = Glove(no_components=args.vecsize, learning_rate=args.learningRate)

    # keep only the text of each (text, label) pair
    # (the original Python 2 itertools.imap / tuple-unpacking lambdas rewritten for Python 3)
    txtSource = chain((txt for txt, lbl in training),
                      (txt for txt, lbl in testing))

    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit(map(preprocess.tokenize, txtSource), window=args.window)

    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs=args.epochs,
              no_threads=args.parallelism, verbose=args.verbose)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    transformer = lambda words: glove.transform_paragraph(words, use_pca=args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)

    return fromTraining, fromTesting
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """

    num_sentences = 5000
    seed = 10

    corpus = Corpus()
    corpus.fit(generate_training_corpus(num_sentences,
                                        vocabulary_size=50,
                                        seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=0, no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=500, no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
def test_supplied_dict_missing_ignored():
    dictionary = {'a': 0, 'fox': 1}

    corpus = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=dictionary)
    model.fit(corpus, max_map_size=0, window=10, ignore_missing=True)

    assert model.dictionary == dictionary
    assert model.matrix.shape == (len(dictionary), len(dictionary))

    # Ensure that context windows and context window
    # weights are preserved.
    full_model = Corpus()
    full_model.fit(corpus, window=10)

    assert (full_model.matrix.todense()[0, 2]
            == model.matrix.todense()[0, 1]
            == 0.5)
                    default='', help='Get closest words to this word.')
args = parser.parse_args()

if args.create:
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    if args.wiki:
        print('Using wikipedia corpus')
        get_data = read_wikipedia_corpus
    else:
        get_data = read_corpus

    corpus_model = Corpus()
    corpus_model.fit(get_data(args.create), window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_model = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
def glove(docsents, n_dim, random_state=0, min_tf=1, docnames=None, keywords=None, **kwargs):
    '''
    Interface for the GloVe algorithm using the python-glove package.

    Output: Returns DocModel object containing the results of the word embedding.

    Inputs:
        docsents: Iterable of document token iterables (nested by document, will be flattened).
        n_dim: Number of dimensions to use in the embedding model.
        random_state: Integer for seeding random generator. Allows for making
            reproducible embedding models.
        min_tf: Minimum number of times a token must appear in the corpus to be
            added to the topic model.
        docnames: Document names that will appear in DocModel for convenience.
        keywords: List of lists of words that will form the basis of a hyper-rotated
            version of a new embedding model. The list [['hat','dog'],['cat',]] would
            orient the first (arbitrary) dimension of the vector space to hat + dog,
            and the second to the vector for cat minus the component in hat + dog,
            to preserve orthogonality.
        **kwargs: Other keyword arguments fed directly into the glove fit method.
    '''
    # count frequencies
    fdist = Counter([w for s in pretendsents(docsents) for w in s])
    sfdist = list(sorted(fdist.items(), key=lambda x: x[1], reverse=True))
    dictionary = {wf[0]: i for i, wf in enumerate(sfdist)}
    cutoff = calc_cutoffind([f for w, f in sfdist], min_tf)

    # calculate corpus matrix
    corpus = Corpus(dictionary=dictionary)
    corpus.fit(pretendsents(docsents), window=10)  # GloVe found that bigger windows helped
    corpus.matrix = corpus.matrix.tocsr()[:cutoff, :cutoff].tocoo()

    # train glove model
    glove = Glove(no_components=n_dim, learning_rate=0.05, random_state=random_state)
    glove.fit(corpus.matrix, **kwargs)

    # modify dictionary after cutoff applied
    cutoff_dictionary = {wf[0]: i for i, wf in enumerate(sfdist) if wf[1] > min_tf}
    glove.add_dictionary(cutoff_dictionary)
    vocab = [w for w, f in sfdist if f > min_tf]

    # if keywords provided, transform vector space to new basis based on keywords
    if keywords is not None:
        glove = supervised_vectors(glove, keywords)

    # transform documents to single vectors
    transpar = lambda doc: glove_transform_paragraph(glove, doc, ignore_missing=True)
    docvectors = [transpar(doc) for doc in pretenddocs(docsents)]

    # words associated with each dimension of the embedding space
    dimwords = np.zeros((n_dim, len(vocab)))
    for dim in range(n_dim):
        # create natural basis unit vector
        e = np.zeros(n_dim)
        e[dim] = 1
        for i, w in enumerate(vocab):
            dimwords[dim, i] = glove_projection(glove, w, e)

    return DocModel(np.vstack(docvectors), dimwords, vocab, docnames=docnames)
'''
from glove import Glove
from glove import Corpus
from gensim import corpora

import time

dic_file = r'/home/dannl/tmp/newstech/glove/news.dic'
corpus_file = '/home/dannl/tmp/newstech/news.txt'
cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'

def read_corpus(filename):
    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.split()[1:]

# get a cooccurrence matrix
oldtime = time.time()

dictionary = corpora.Dictionary.load(dic_file)

# corpus_cooc = Corpus()
# corpus_cooc.fit(read_corpus(corpus_file), window=10)

corpus_cooc = Corpus(dictionary=dictionary.token2id)
corpus_cooc.fit(read_corpus(corpus_file), window=10, ignore_missing=True)
corpus_cooc.save(cooc_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)
print('time cost:%.2f' % (time.time() - oldtime,))
#-*- coding:utf-8 -*-
'''
Created on 2016-3-12

@author: dannl
'''
from glove import Glove
from glove import Corpus

import time

cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'
model_file = '/home/dannl/tmp/newstech/glove/glove.model'

oldtime = time.time()

# get a cooccurrence matrix
corpus_cooc = Corpus.load(cooc_file)

# get a model
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_cooc.matrix, epochs=5, no_threads=4, verbose=True)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

# count = 0
# for word, wid in corpus_cooc.dictionary.items():
#     count += 1
#     if count > 100:
#         break
#     print(word, wid)

print('Dict size: %s' % len(corpus_cooc.dictionary))
from __future__ import print_function
import argparse
import pprint
# import gensim
from glove import Glove
from glove import Corpus

sentense = [['你', '是', '谁'], ['我', '是', '中国人']]

corpus_model = Corpus()
corpus_model.fit(sentense, window=10)
# corpus_model.save('corpus.model')
print('Dict size: %s' % len(corpus_model.dictionary))
print(corpus_model.dictionary)
print('Collocations: %s' % corpus_model.matrix.nnz)
print(corpus_model.matrix)

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

glove.save('glove.model')
glove = Glove.load('glove.model')

corpus_model.save('corpus.model')
corpus_model = Corpus.load('corpus.model')

print("most similar to 我")
print(glove.most_similar('我', number=10))

# the full word-vector matrix
print(glove.word_vectors)

# the vector of a single word
print("你")
print(glove.word_vectors[glove.dictionary['你']])
wordnet_lemmatizer = WordNetLemmatizer()
lines_with_lemmas = []  # stop_words contains the set of stop words

for line in lines:
    temp_line = []
    for word in line:  # fixed: iterate over the words of the current line (was `for word in lines`)
        temp_line.append(wordnet_lemmatizer.lemmatize(word))
    string = ' '
    lines_with_lemmas.append(string.join(temp_line))
lines = lines_with_lemmas

# creating a corpus object
corpus = Corpus()

# training the corpus to generate the co-occurrence matrix which is used in GloVe
# (note: Corpus.fit expects each line to be an iterable of tokens)
corpus.fit(lines, window=10)

# creating a Glove object which will use the matrix created in the above lines to create embeddings
# We can set the learning rate as it uses Gradient Descent and number of components
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
def main():
    corpus_model = Corpus()
    corpus_model.fit(itertexts(), window=10, max_map_size=1000000)
    corpus_model.save('bioc-corpus-AZ2.model')
def test_supplied_dict_checks():
    dictionary = {'a': 4, 'naïve': 1, 'fox': 0}

    with pytest.raises(Exception):
        Corpus(dictionary=dictionary)
data_all = pd.concat([train, test])
len_train = train.shape[0]

qs = []
ts = []
ds = []
sentences = []
for q, t in zip(data_all['question1'].values.tolist(),
                data_all['question2'].values.tolist()):
    sentences.append(q.split(' '))
    sentences.append(t.split(' '))
    qs.append(q.split(' '))
    ts.append(t.split(' '))

corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(path + 'corpus.mdl')
corpus_model = Corpus.load(path + 'corpus.mdl')

glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=7, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(path + 'glove.glv')
glove = Glove.load(path + 'glove.glv')
print(glove)

qt_sims_dists = []
qt_diff = []
mlp10 = mlp_model(10)
mlp10_accuracy = train_test(mlp10, x, y, folds)
mlp100 = mlp_model(100)
mlp100_accuracy = train_test(mlp100, x, y, folds)
mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

# 3. CNN
# GloVe vectors from reviews
c = [review.split() for review in data.data]

corpus = Corpus()
corpus.fit(c, window=10)

glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glv.add_dictionary(corpus.dictionary)

embeddings_index = glv.dictionary

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
    for article in tom_data:
        article_body = article['article_body']
        word_tokens = article_body.lower().translate(delchars).split(' ')
        yield [w for w in word_tokens if (w not in stop_words) and (len(w) > 4)]

if __name__ == '__main__':
    want_TSNE = False
    want_GRAPH = True

    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    filepattern = 'tom_articles_1*.json'

    corpus_model = Corpus()
    corpus_model.fit(read_corpus(filepattern), window=5)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')
    glove = Glove(no_components=50, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('glove.model')
def main(model_select): data = pd.read_excel("./data/doc_set_final4.xlsx") data.token = data.token.apply(lambda x: literal_eval(x)) data = data.sample(frac=1, random_state=1234) token_list = data.token.tolist() target = data[['new_class', 'new_small_class']] train_x_data, test_x_data, train_y, test_y = train_test_split( token_list, target, test_size=0.3, stratify=target, shuffle=True, random_state=1234) if model_select == 'w2v': w2v_name = 'base_token' print("모델 학습") word2vec_kargs = { 'num_features': 300, 'num_workers': 4, 'window': 8, 'seed': 1234, 'min_word_count': 5, 'min_alpha': 0.025, 'iter': 30 } model = word2vec_model(train_x_data, **word2vec_kargs) print("모델 저장") model_name = './model/word_embedding/Word2vec1({}).model'.format( w2v_name) model.save(model_name) elif model_select == 'd2v': TaggedDocument = namedtuple('TaggedDocument', 'words tags') tagged_train_docs = [ TaggedDocument(d, [c[1]['new_class'], c[1]['new_small_class']]) for d, c in zip(train_x_data, train_y.iterrows()) ] print("모델 학습") doc2vec_kargs = { 'size': 300, 'window': 8, 'min_count': 5, 'alpha': 0.025, 'min_alpha': 0.025, 'workers': 4, 'seed': 1234, 'iter': 50 } model = doc2vec_model(tagged_train_docs, **doc2vec_kargs) print("모델 저장") model.save('./model/word_embedding/Doc2vec_new_small2_4.model') elif model_select == 'fasttext': print("모델 학습") ft_kargs = { 'size': 300, 'window': 5, 'min_count': 3, 'workers': 4, 'seed': 1234 } model = fasttext_model(train_x_data, **ft_kargs) print("모델 저장") model.save('./model/word_embedding/FastText.model') elif model_select == 'glove': glove_kargs = { 'size': 300, 'lr': 0.005, 'random_state': 1234, 'no_threads': 4, 'epoch': 30 } corpus = Corpus() corpus.fit(train_x_data, window=8) glove = Glove(no_components=glove_kargs['size'], learning_rate=glove_kargs['lr']) glove.fit(corpus.matrix, epochs=glove_kargs['epoch'], no_threads=glove_kargs['no_threads'], verbose=True) glove.add_dictionary(corpus.dictionary) print("모델 저장") glove.save('./model/word_embedding/glove.model') else: print("3가지 방식 중에 고르시오")
    return noPunctutations

for l in jsonLines:
    temp = json.loads(l)
    # tokenize, lemmatize, stopword and punctuation removal
    contentList = preprocess(temp["content"])
    fulltext.append(contentList)
    # fulltext += temp["content"]

# print(fulltext)
print(len(fulltext))
print(len(fulltext[0]))
print(len(fulltext[1]))

# mainlist = []
# mainlist.append(fulltext)

corpus = Corpus()
corpus.fit(fulltext, window=10)

glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

print(glove.word_vectors[glove.dictionary['summary']])
print(glove.word_vectors[glove.dictionary['mapped']])
print(glove.word_vectors[glove.dictionary['low']])
print(glove.most_similar('low'))
def seriestest(): os.environ['CUDA_VISIBLE_DEVICES'] = '-1' glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None)) corpus = Corpus() corpus.fit(sentences, window=30) modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4) gloveA = Glove.load(modelA) gloveA.add_dictionary(corpus.dictionary) gloveB = Glove.load(modelB) gloveB.add_dictionary(corpus.dictionary) # glove embeddings gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :] gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :] with tf.Session() as sess: # build the model model = StarGAN('G_test', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size, _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr, FLAGS.lr_decay, FLAGS.grad_norm, critic=None, use_attn=FLAGS.use_attn, output_sample=True, input_embed=True, batch_size=FLAGS.batch_size, D_lambda=FLAGS.lambda_dis, G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two), dtype=tf.float32) #sess.run(tf.variables_initializer(tf.global_variables())) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) model.saver.restore(sess, ckpt.model_checkpoint_path) print('read in model from {}'.format(ckpt.model_checkpoint_path)) vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) sys.stdout.write('> ') sys.stdout.flush() sentence = sys.stdin.readline() while sentence: if sentence.strip() == 'exit()': break # step number = 0 feature = [] for f in range(FLAGS.feature_size): feature.append( [[3 if x == f else 0 for x in range(FLAGS.feature_size)]]) token_ids = data_utils.sentence_to_token_ids( tf.compat.as_bytes(sentence), vocab, normalize_digits=False) print(token_ids) token_ids.append(data_utils.EOS_ID) encoder_pad = [data_utils.PAD_ID ] * (_buckets[-1][0] - len(token_ids)) encoder_lens = [len(token_ids)] # feature in my implementation token_ids = list(token_ids) + encoder_pad encoder_inputs = [] for idx in token_ids: encoder_inputs.append([idx]) print(encoder_inputs) decoder_inputs = [[data_utils.GO_ID]] for x in range(FLAGS.feature_size): A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \ decoder_inputs, feature[x], gloveA_emb, gloveB_emb) #print(A) #outputs = [int(np.argmax(logit, axis=1)) for logit in outputs] outputs = [output_ids[0] for output_ids in outputs] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print( feature[x], ':', " ".join([ tf.compat.as_str(rev_vocab[output]) for output in outputs ])) print(log_prob) sys.stdout.write('> ') sys.stdout.flush() sentence = sys.stdin.readline()
def train_glove(inst, meta_data={}): start_total = datetime.now() meta_data["glove_params"] = settings.GLOVE_PARAMS glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS) for params in glove_paramgrid: start = datetime.now() # MAKE CORPUS # set corpus filepath corpus_fp = os.path.join(settings.WVEC_OPT_DIRP, '{}_window{}.glovecorpus'.format( settings.DATASET, params["window"])) # load if corpus exists if os.path.isfile(corpus_fp): logging.info("Loading existing corpus {}.".format(corpus_fp)) corpus_model = Corpus.load(corpus_fp) logging.info("Successfully loaded existing corpus {}.".format(corpus_fp)) # make a new coocurrence corpus if it does not exist else: logging.info("Creating new corpus at {}.".format(corpus_fp)) corpus_model = Corpus() corpus_model.fit(inst, window=params["window"]) os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True) corpus_model.save(corpus_fp) logging.info("Dict size: {}.".format(len(corpus_model.dictionary))) logging.info("Collocations: {}.".format(corpus_model.matrix.nnz)) # GLOVE VECTOR TRAINING glove = Glove(no_components=params["dims"], learning_rate=params["lr"]) logging.info("Start fitting GloVe with parameters: {}.".format(params)) glove.fit(corpus_model.matrix, epochs=params["epochs"], no_threads=params["njobs"], verbose=False) glove.add_dictionary(corpus_model.dictionary) os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True) model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(settings.DATASET, params["window"], params["lr"], params["epochs"], params["dims"]) glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name)) duration = (datetime.now() - start).total_seconds() meta_data["models"][model_name] = params meta_data["models"][model_name]["duration_training"] = duration logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format( model_name, duration, params)) # SIMILARITY TEST for test_word in settings.TESTSIM_WORDS: if test_word not in meta_data["most_similar"]: meta_data["most_similar"][test_word] = {} logging.info("Querying model {} for {} most similar to \'{}\':".format( model_name, settings.N_TESTSIM, test_word)) sim = glove.most_similar(test_word, number=settings.N_TESTSIM) meta_data["most_similar"][test_word][model_name] = sim logging.info(pprint.pformat(sim)) total_duration = (datetime.now() - start_total).total_seconds() meta_data["glove_duration_training"] = total_duration return meta_data
        for line in datafile:
            # print(line.lower().split(' '))
            yield line.lower().split(' ')

# Build the corpus dictionary and the cooccurrence matrix.
print('Pre-processing corpus')
print('You can use saved Corpus Model. In order to do this, Enter nothing for the file name')
print('To train GloVe based on new dataset, Enter dataset\'s name')
file_name = input("Enter file name: ")

if file_name:
    get_data = read_corpus(file_name)

    corpus_model = Corpus()
    corpus_model.fit(get_data, window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

if not file_name:
    corpus_model = Corpus.load('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

# Train the GloVe model and save it to disk.
print('Training the GloVe model')

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
# reads .txt files
def read_corpus(filename):
    delchars = [chr(c) for c in range(256)]
    delchars = [x for x in delchars if not x.isalnum()]
    delchars.remove(' ')
    delchars = ''.join(delchars)
    table = str.maketrans(dict.fromkeys(delchars))

    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.lower().translate(table).split(' ')

get_data = read_corpus('data/articles.txt')

corpus_model = Corpus()
corpus_model.fit(get_data, window=10)

epochs = 1000
no_threads = 8

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

print("Most similar to Male ==>" + str(glove.most_similar('male')))
print("---------------------------------------------------------------------------")
print("Most similar to Population ==>" + str(glove.most_similar('population')))
            previous_message[index] = -1
        except IndexError:
            previous_message[index] = -1

texts = []
classes = []
for row in csvsequence:
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages.
max_sentence_length = int(np.mean([len(x) for x in texts])
                          + (norm.ppf(0.95) * np.std([len(x) for x in texts])))
print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
""" Created on Fri Sep 14 12:45:30 2018 @author: charlie """ import itertools from gensim.models.word2vec import Text8Corpus from glove import Corpus, Glove import os cur_dir = os.getcwd() glove_fname = '/glove.model' corpus_fname = "/corpus.model" if os.path.exists(cur_dir + glove_fname): glove = Glove.load(cur_dir+glove_fname) # corpus = Corpus.load(cur_dir+corpus_fname) else: sentences = list(itertools.islice(Text8Corpus('text/text8'), None)) corpus = Corpus() corpus.fit(sentences, window = 10) glove = Glove(no_components=100, learning_rate = 0.05) glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True) glove.add_dictionary(corpus.dictionary) glove.save(cur_dir + glove_fname) corpus.save(cur_dir+corpus_fname) glove.most_similar('men') # Parameters are hashable string not list glove.word_vectors[glove.dictionary['perfect']]
from glove import Glove, Corpus
from gensim import utils, corpora, matutils, models
import os

corpus_file_name = ''

wiki = models.word2vec.LineSentence(corpus_file_name)
id2word = corpora.Dictionary(wiki)
id2word.filter_extremes(keep_n=30000)
word2id = dict((word, id) for id, word in id2word.iteritems())

# Filter all wiki documents to contain only those 30k words.
filter_text = lambda text: [word for word in text if word in word2id]
filtered_wiki = lambda: (filter_text(text) for text in wiki)  # generator

corpus = Corpus()
corpus.fit(filtered_wiki(), window=10)

HERE = os.path.dirname(os.path.dirname(__file__))
PS_FILE = os.path.join(HERE, "glove_default_30k.model")

glove = Glove.load(PS_FILE)
glove.add_dictionary(corpus.dictionary)
glove.save('glove_default_30k_with_dict.model')
    def __iter__(self):
        file_count = 0
        for file_path in file_path_list:
            file_count += 1
            print(f"Now file name:{file_path}, now file count:{file_count}")
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    if args.use_segment == 0:
                        yield list(line.strip())
                    else:
                        yield list(jieba.cut(line.strip()))

sentences = Text(file_path_list)

corpus_model = Corpus()
corpus_model.fit(sentences, window=args.window_size)

model = Glove(no_components=args.embedding_size)
model.fit(corpus_model.matrix, epochs=args.iters, no_threads=args.cpu_count)
model.add_dictionary(corpus_model.dictionary)

end_time = time.time()
use_time = round(end_time - start_time, 2)

model_name = args.output_path
if not os.path.isdir(model_name):
    model.save(model_name)
else:
    model_name = os.path.join(
        model_name, f"glove_{args.window_size}_{args.embedding_size}.model")
def matrix_glove_embedding(click_all, flag, mode, threshold=0, dim=100, epochs=30, learning_rate=0.5):
    """
    GloVe idea + matrix factorisation: weighted co-occurrence counts within a window.

    Four vectorisation modes:
        flag='item', mode='all':  sku1 sku2 sku3 sku4 sku5 user
        flag='user', mode='all':  user1 user2 user3 user4 user5 sku
        flag='item', mode='only': item1 item2 item3 item4 item5
        flag='user', mode='only': user1 user2 user3 user4 user5
    """
    import psutil
    from glove import Glove
    from glove import Corpus

    if flag == 'user':
        group_by_col, agg_col = 'item_id', 'user_id'
    if flag == 'item':
        group_by_col, agg_col = 'user_id', 'item_id'

    data_ = click_all.groupby([group_by_col])[agg_col].agg(
        lambda x: ','.join(list(x))).reset_index()

    if mode == 'only':
        list_data = list(data_[agg_col].map(lambda x: x.split(',')))
    if mode == 'all':
        data_['concat'] = data_[agg_col] + ',' + data_[group_by_col].map(lambda x: 'all_' + x)
        list_data = data_['concat'].map(lambda x: x.split(','))

    corpus_model = Corpus()
    corpus_model.fit(list_data, window=999999)

    glove = Glove(no_components=dim, learning_rate=learning_rate)
    glove.fit(corpus_model.matrix, epochs=epochs, no_threads=psutil.cpu_count(), verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    keys = glove.dictionary.keys()

    if mode == 'only':
        glove_embedding = {flag: {}}
    if mode == 'all':
        glove_embedding = {'user': {}, 'item': {}}

    for k in keys:
        if 'all' not in k:
            glove_embedding[flag][k] = glove.word_vectors[glove.dictionary[k]]
        if 'all' in k:
            flag_ = group_by_col.split('_')[0]
            k_ = k.split('_')[1]
            glove_embedding[flag_][k_] = glove.word_vectors[glove.dictionary[k]]

    return glove_embedding
import csv

year1 = 2007
year2 = 2008
filename = 'C:/Users/Marija/PyCharmProjects/scraping/arxivData '
data = []
type = 'title'

while year2 < 2018:
    with open(filename + str(year1) + '-' + str(year2) + '.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in reader:
            # Titles only
            sentence = row[0][2:len(row[0]) - 1]
            sentence = (sentence.replace('\\n', ' ').replace('.', '').replace(',', '')
                        .replace(':', '').replace(')', '').replace('(', '')
                        .lower().split(' '))
            data.append(sentence)
    year1 += 1
    year2 += 1

corpus = Corpus()
corpus.fit(data, window=10)

glove = Glove(no_components=100, learning_rate=0.025)
glove.fit(corpus.matrix, epochs=5, no_threads=4)
glove.add_dictionary(corpus.dictionary)
glove.save(type + 'Glove.txt')
def topk_recall_glove_embedding(click_all, dict_label, k=100, dim=88, epochs=30, learning_rate=0.5): import psutil from glove import Glove from glove import Corpus data_ = click_all.groupby( ['pred', 'user_id'])['item_id'].agg(lambda x: ','.join(list(x))).reset_index() list_data = list(data_['item_id'].map(lambda x: x.split(','))) corpus_model = Corpus() corpus_model.fit(list_data, window=999999) glove = Glove(no_components=dim, learning_rate=learning_rate) glove.fit(corpus_model.matrix, epochs=epochs, no_threads=psutil.cpu_count(), verbose=True) glove.add_dictionary(corpus_model.dictionary) list_user_id = [] list_item_similar = [] list_score_similar = [] print('------- glove 召回 ---------') for i, row in tqdm(data_.iterrows()): list_item_id = row['item_id'].split(',') dict_item_id_score = {} for i, item in enumerate(list_item_id[::-1]): most_topk = glove.most_similar(item, number=k) for item_similar, score_similar in most_topk: if item_similar not in list_item_id: if item_similar not in dict_item_id_score: dict_item_id_score[item_similar] = 0 sigma = 0.8 dict_item_id_score[item_similar] += 1.0 / ( 1 + sigma * i) * score_similar dict_item_id_score_topk = sorted(dict_item_id_score.items(), key=lambda kv: kv[1], reverse=True)[:k] assert len(dict_item_id_score_topk) == k dict_item_id_set = set([ item_similar for item_similar, score_similar in dict_item_id_score_topk ]) assert len(dict_item_id_set) == k for item_similar, score_similar in dict_item_id_score_topk: list_item_similar.append(item_similar) list_score_similar.append(score_similar) list_user_id.append(row['user_id']) topk_recall = pd.DataFrame({ 'user_id': list_user_id, 'item_similar': list_item_similar, 'score_similar': list_score_similar }) topk_recall['next_item_id'] = topk_recall['user_id'].map(dict_label) topk_recall['pred'] = topk_recall['user_id'].map( lambda x: 'train' if x in dict_label else 'test') return topk_recall
## FastText Embedding
## =============================================================================
#
# modelft = FastText(masterList1, size=150, window=3, min_count=5, workers=10)
#
# modelft['forecast']
# modelft.most_similar('forecast')[:5]
#
# modelft.save("fasttext.model")
#
## =============================================================================
## GloVe Embedding
## =============================================================================

# creating a corpus object
corpus = Corpus()

# training the corpus to generate the co-occurrence matrix which is used in GloVe
corpus.fit(masterList1, window=5)

# creating a Glove object which will use the matrix created in the above lines to create embeddings
# We can set the learning rate as it uses Gradient Descent and number of components
glove = Glove(no_components=150, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=10, verbose=True)
glove.add_dictionary(corpus.dictionary)

glove.word_vectors[glove.dictionary['forecast']]
t1 = glove.most_similar('forecast')[:5]

glove.save('glove.model')
from __future__ import print_function
from glove import Glove
from glove import Corpus

'''Load the dataset'''
# store the text lines in a list
i = 1
lines = []
for line in open("processed.txt", encoding='utf-8'):
    lines.append(line.split(' '))
    print("appending line " + str(i))
    i += 1

# prepare the corpus
corpus_model = Corpus()
corpus_model.fit(lines, window=10)
# corpus_model.save('corpus.model')
print('Dictionary size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

'''Train the model'''
gl = Glove(no_components=200, learning_rate=0.05)
gl.fit(corpus_model.matrix, epochs=5, no_threads=1, verbose=True)
gl.add_dictionary(corpus_model.dictionary)

'''Save the model'''
gl.save('glove.model')
from glove import Corpus, Glove

corpus = Corpus()

sentences = [['나는', '정말', '화난다'], ['너도', '정말', '화나지']]
# Build the co-occurrence matrix GloVe will train on from the training data.
corpus.fit(sentences, window=5)

glove = Glove(no_components=100, learning_rate=0.05)
# Train for 20 epochs using 4 threads.
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

model_result1 = glove.most_similar("나는")
print(model_result1)
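# Hedged follow-up sketch (not part of the original snippet): once the dictionary
# has been added, an individual embedding can be looked up the same way other
# examples in this collection do, via glove.word_vectors and glove.dictionary.
print(glove.word_vectors[glove.dictionary['나는']])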
cats = []
with open('yahoo_train.txt', 'r') as file:
    for line in file:
        d = json.loads(line)
        uris.append(d[0])
        questions.append(d[1])
        answers.append(d[2])
        cats.append(d[3])

def get_lines():
    for a in answers:
        yield a.split()

# Build the corpus dictionary and cooccurrence matrix
corpus_model = Corpus()
corpus_model.fit(get_lines(), window=8)

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Train GloVe model
# glove = Glove(no_components=no_comp, learning_rate=0.05)
glove = Glove.load_stanford('vectors.6B.100d.txt')
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

# Save
with open('model.glove', 'w+') as file:
    file.write('%i %i \n' % (len(glove.dictionary), no_comp))
    for (word, idx) in glove.dictionary.items():
def train_StarGAN(): os.environ['CUDA_VISIBLE_DEVICES'] = '0' from keras.backend.tensorflow_backend import set_session config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.5 set_session(tf.Session(config=config)) if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) if not os.path.exists(FLAGS.pretrain_dir): os.makedirs(FLAGS.pretrain_dir) if not os.path.exists(FLAGS.stargan_dir): os.makedirs(FLAGS.stargan_dir) def build_summaries(): train_loss = tf.Variable(0.) tf.summary.scalar("train_loss", train_loss) summary_vars = [train_loss] summary_ops = tf.summary.merge_all() return summary_ops, summary_vars feature, data, train, data_voc, train_voc = \ data_utils.prepare_data(FLAGS.feature_path, FLAGS.feature_size, FLAGS.data_dir, \ FLAGS.data_path, FLAGS.train_path, FLAGS.vocab_size) glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size) data_utils.combine_corpus(data_voc, train_voc, vocab_path, glove_corpus_path, 28) sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None)) corpus = Corpus() corpus.fit(sentences, window=30) modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4) if not os.path.exists(modelA): gloveA = Glove(no_components=FLAGS.size, learning_rate=0.05) gloveA.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True) gloveA.add_dictionary(corpus.dictionary) gloveA.save(modelA) # 512 if not os.path.exists(modelB): gloveB = Glove(no_components=int(FLAGS.size * 3 / 4), learning_rate=0.05) gloveB.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True) gloveB.add_dictionary(corpus.dictionary) gloveB.save(modelB) # 384 gloveA = Glove.load(modelA) gloveA.add_dictionary(corpus.dictionary) gloveB = Glove.load(modelB) gloveB.add_dictionary(corpus.dictionary) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) with tf.Session() as sess: # build the model model = StarGAN('StarGAN', FLAGS.size, FLAGS.num_layers, FLAGS.vocab_size, _buckets, FLAGS.feature_size, FLAGS.baseline, FLAGS.lr, FLAGS.lr_decay, FLAGS.grad_norm, critic=None, use_attn=FLAGS.use_attn, output_sample=True, input_embed=True, batch_size=FLAGS.batch_size, D_lambda=FLAGS.lambda_dis, G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two), dtype=tf.float32) # build summary and intialize summary_ops, summary_vars = build_summaries() sess.run(tf.variables_initializer(tf.global_variables())) log_dir = os.path.join(FLAGS.model_dir, 'log') writer = tf.summary.FileWriter(log_dir, sess.graph) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): print('read in model from {}'.format(ckpt.model_checkpoint_path)) model.saver.restore(sess, ckpt.model_checkpoint_path) # load in train and dev(valid) data with buckets train_set = read_data_with_buckets(train, FLAGS.max_train_data_size) data_set = read_data_with_buckets(data, FLAGS.max_train_data_size) train_buckets_sizes = [len(train_set[b]) for b in range(len(_buckets))] train_total_size = float(sum(train_buckets_sizes)) print('each buckets has: {d}'.format(d=train_buckets_sizes)) train_buckets_scale = [ sum(train_buckets_sizes[:i + 1]) / train_total_size for i in range(len(train_buckets_sizes)) ] # main process step_time, loss = 0.0, 0.0 current_step = 0 previous_losses = [] # glove embeddings gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :] gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :] ### 
------------------------------------------------------------ ### ### Pretrain ### ### ------------------------------------------------------------ ### while True: # get batch from a random selected bucket random_number_01 = np.random.random_sample() bucket_id = min([ i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01 ]) # random pick bucket # get batch for the pretraining data feature_inputs_f, encoder_inputs_f, decoder_inputs_f, weights_f, seq_lens_f, _, \ feature_inputs_b, encoder_inputs_b, decoder_inputs_b, weights_b, seq_lens_b, _, = \ get_batch_with_buckets(FLAGS.feature_size, data_set, FLAGS.batch_size, bucket_id) # pretrain start ! start_time = time.time() forloss, _ , _, _ = model.train_previous(sess, encoder_inputs_f, feature_inputs_f, \ decoder_inputs_f, weights_f, encoder_inputs_b, \ feature_inputs_b, decoder_inputs_b, weights_b, \ bucket_id, gloveA_emb, gloveB_emb, seq_lens_f, seq_lens_b) step_loss = forloss step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / ( FLAGS.Gstep * 2 + FLAGS.Dstep + 1) #print('pretrain : ',step_loss) ### ------------------------------------------------------------ ### ### Train StarGAN ### ### ------------------------------------------------------------ ### for _ in range(FLAGS.Dstep): # get batch from a random selected bucket random_number_01 = np.random.random_sample() bucket_id = min([ i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01 ]) # random pick bucket # get batch for the pretraining data feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \ feature_inputs_b, decoder_inputs_b, weights_b, \ real_inputs, real_feature , real_seq_lens= \ get_stargan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id) # D_step start ! start_time = time.time() _, D_loss = model.train_StarGAN(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \ decoder_inputs_b, weights_b, feature_inputs_b, \ real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \ disc = True,real_seq_len=real_seq_lens, forward_seq_len=seq_lens_f) step_loss = D_loss step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / ( FLAGS.Gstep * 2 + FLAGS.Dstep + 1) #print('D_step : ', step_loss) for _ in range(FLAGS.Gstep): # get batch from a random selected bucket random_number_01 = np.random.random_sample() bucket_id = min([ i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01 ]) # random pick bucket # get batch for the pretraining data feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \ feature_inputs_b, decoder_inputs_b, weights_b, \ real_inputs, real_feature, real_seq_lens = \ get_stargan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id) # G_step start ! 
start_time = time.time() _, for_reward = model.train_StarGAN(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \ decoder_inputs_b, weights_b, feature_inputs_b, \ real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \ forward = True,real_seq_len=real_seq_lens , forward_seq_len=seq_lens_f) step_loss = for_reward step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / ( FLAGS.Gstep * 2 + FLAGS.Dstep + 1) #print('for_loss :', step_loss) # get batch from a random selected bucket random_number_01 = np.random.random_sample() bucket_id = min([ i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01 ]) # random pick bucket # get batch for the pretraining data feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \ feature_inputs_b, decoder_inputs_b, weights_b, \ real_inputs, real_feature, real_seq_lens = \ get_stargan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id) # G_step start ! start_time = time.time() _, back_reward = model.train_StarGAN(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \ decoder_inputs_b, weights_b, feature_inputs_b, \ real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \ backward = True,real_seq_len=real_seq_lens , forward_seq_len=seq_lens_f) step_loss = back_reward step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / ( FLAGS.Gstep * 2 + FLAGS.Dstep + 1) #print('back_loss :', step_loss) current_step += 1 # log, save and eval if current_step % FLAGS.steps_per_checkpoint == 0: perplexity = math.exp( float(loss)) if loss < 300 else float('inf') print( "Generator step %d; learning rate %.4f; learning_rate_star %.6f; D_lr %6f; step-time %.2f; perplexity " "%.2f; loss %.2f" % (model.global_F_step.eval(), model.learning_rate.eval(), model.learning_rate_star.eval(), model.D_lr.eval(), step_time, perplexity, loss)) # Decrease learning rate if no improvement was seen over last 3 times. if len(previous_losses) > 2 and loss > max( previous_losses[-3:]): sess.run(model.op_lr_decay) sess.run(model.op_D_lr_decay) sess.run(model.learning_rate_star_decay) previous_losses.append(loss) # write summary feed_dict = {} feed_dict[summary_vars[0]] = loss summary_str = sess.run(summary_ops, feed_dict=feed_dict) writer.add_summary(summary_str, model.global_F_step.eval()) writer.flush() # Save checkpoint and zero timer and loss. ckpt_path = os.path.join(FLAGS.model_dir, "ckpt") model.saver.save(sess, ckpt_path, global_step=model.global_F_step) stargan_path = os.path.join(FLAGS.stargan_dir, "ckpt_prev") model.saver.save(sess, stargan_path, global_step=model.global_F_step) step_time, loss = 0.0, 0.0 sys.stdout.flush()
def fit_corpus(corpus):
    model = Corpus()
    model.fit(corpus)
    return model  # fixed: return the fitted Corpus, not the raw input
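# Hedged usage sketch (not part of the original snippet): it assumes glove.Glove is
# imported and that `sentences` is an iterable of token lists. The fitted Corpus
# exposes the co-occurrence matrix and dictionary that the other snippets in this
# collection pass to Glove.fit and add_dictionary.
corpus_model = fit_corpus(sentences)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)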
def read_data(filenames):
    """
    input - filenames
    output - a list of words in the question
    """
    for f in filenames:
        for i, r in f.iterrows():
            for q in question_cols:
                yield question_to_wordlist(r[q])

filenames = [train_df, test_df]
# filenames = [train_df]

print("Preprocessing corpus")
get_data = read_data

corpus_model = Corpus()
corpus_model.fit(get_data(filenames), window=10)
# corpus_model.save(COMPUTE_DATA_PATH + '/corpus.model')

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# corpus_model = Corpus.load(COMPUTE_DATA_PATH + '/corpus.model')

glove = Glove(no_components=300, learning_rate=0.05)

print("Starting training")
glove.fit(corpus_model.matrix, epochs=1000, no_threads=6, verbose=True)
# glove = Glove.load(COMPUTE_DATA_PATH + '/glove.model')
glove.add_dictionary(corpus_model.dictionary)
# save the file in pickled format
if not os.path.exists('pickledData/'):
    print('creating directory to save the pickled file')
    os.makedirs('pickledData/')

with open('pickledData/' + DICTIONARY_FILE, 'wb') as f:
    pickle.dump(pickle_files, f)

print("Finished files and variables in %s seconds", (time.time() - start_time))

# ===========================================
# ===========================================

print("Starting creation of the GloVe model")
start_time = time.time()

corpus = Corpus()

# train the corpus to generate the co-occurrence matrix used by GloVe,
# with the window size controlling how many context words are considered
corpus.fit(X_train, window=10)

# create the GloVe model with the dimensionality (no_components) and the learning rate;
# the constants are declared at the top of the file
glove = Glove(no_components=EMBEDDING_DIM, learning_rate=GLOVE_LEARNING_RATE)
glove.fit(corpus.matrix, epochs=GLOVE_NUM_EPOCHS_TRAINING, no_threads=4, verbose=True)
# glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

if not os.path.exists('models/'):
    print('creating directory to save the models')
    os.makedirs('models/')
        iter=args.epochs,
        size=args.size,
        sg=args.sg,
        window=args.window,
        min_count=args.min_count,
        workers=args.workers)
    model.save(args.save + 'fastText.model')
    model = FastText.load(args.save + 'fastText.model')
    print("Shape of the trained fastText embedding:", model.wv.vectors.shape)

elif (args.model == 'word2vec'):
    model = Word2Vec(sentences=tokenized_data,
                     size=args.size,
                     window=args.window,
                     min_count=args.min_count,
                     workers=args.workers)
    model.save(args.save + 'Word2Vec.model')
    model = Word2Vec.load(args.save + 'Word2Vec.model')
    print("Shape of the trained word2vec embedding:", model.wv.vectors.shape)

elif (args.model == 'glove'):
    corpus = Corpus()
    corpus.fit(tokenized_data, window=5)

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    glove.save(args.save + 'glove.model')  # fixed: save the trained Glove object (was `model.save`)
    model = FastText.load(args.save + 'glove.model')
    print("Shape of the trained glove embedding:", model.wv.vectors.shape)

print(model.wv.most_similar("핸드폰"))
print(model.wv.most_similar("도로"))
    delchars.remove(' ')
    delchars = ''.join(delchars)

    with open(filename, 'r') as datafile:
        for line in datafile:
            # list of tokenized words
            yield line.lower().translate(None, delchars).split(' ')

if __name__ == '__main__':
    # initialize glove object
    glove = Glove(no_components=100, learning_rate=0.05)

    # read in the data to train on; this file is shakespeare text
    corpus_model = Corpus()
    corpus_model.fit(read_corpus("data/input.txt"), window=10)

    # fit the model using the given parameters
    glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    # save glove object to file
    glove.save_obj('glove.model.obj')

    # give me the 5 words most similar to each word in the words list in this
    # corpus and show me how similar the words are in this corpus to each word
    # in the words list in general
    words = ['sky', 'queen', 'car']
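    # Hedged sketch of the continuation the comment above describes (the original
    # snippet is truncated here): query the 5 most similar words for each entry in
    # `words` using the dictionary added above.
    for word in words:
        print(word, glove.most_similar(word, number=5))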
def main_procesing_corpus(korpus: str, size: int): [people_vect_dict, my_corpus] = read_corpus(in_path + korpus, size) corpus = Corpus() corpus.fit(my_corpus, window=10) glove = Glove(no_components=100, learning_rate=0.05) glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True) glove.add_dictionary(corpus.dictionary) # glove.save('glove.model') person_result_dict = {} f = open(r"C:\Users\tymon.czarnota\Desktop\PADT1\output_{}_{}.tsv".format( korpus, size), 'w', encoding='utf-8') ff = open( r"C:\Users\tymon.czarnota\Desktop\PADT1\output_{}_{}_META.tsv".format( korpus, size), 'w', encoding='utf-8') for key in people_vect_dict: ppl = str(key) for prof in people_dict: for mm in people_dict[prof]: if str(mm) == str(key): ppl = ppl + "<--->" + prof + "\n" for l in people_vect_dict[key]: ff.write(ppl) a = [glove.word_vectors[glove.dictionary[w]] for w in l] a_mean = np.mean(a, axis=0, dtype=np.float64) if key not in person_result_dict: person_result_dict[key] = [] person_result_dict[key].append(a_mean) text = "" for val in person_result_dict[key]: for single in val: text = text + str(single) + "\t" text = text + "\n" f.write(text) f = open(out_path + r"output_{}_{}_WHOLE.tsv".format(korpus, size), 'w', encoding='utf-8') ff = open(out_path + r"output_{}_{}_WHOLE_META.tsv".format(korpus, size), 'w', encoding='utf-8') for key in person_result_dict: a = np.mean(person_result_dict[key], axis=0, dtype=np.float64) str_a = "" for el in a: str_a = str_a + str(el) + "\t" str_a = str_a + "\n" f.write(str_a) str_key = "" for prof in people_dict: for mm in people_dict[prof]: if str(mm) == str(key): str_key = str(key) + "<--->" + prof + "\n" ff.write(str_key)
        checkX.append((a1, a2))
        checkY.append(flag[check[i]["gold_label"]])

for i in range(len(test)):
    test[i] = json.loads(test[i])
    if flag.get(test[i]["gold_label"]) is None:
        continue
    a1 = test[i]["sentence1"].split()
    a2 = test[i]["sentence2"].split()
    sentense.append(a1)
    sentense.append(a2)
    maxlen1 = max(maxlen1, len(a1))
    maxlen2 = max(maxlen2, len(a2))
    testX.append((a1, a2))
    testY.append(flag[test[i]["gold_label"]])

corpus_model = Corpus()
corpus_model.fit(sentense, window=10)
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

embedding_dim = 300
batch_size = 32
type_size = 3
dict_size = len(corpus_model.dictionary)
maxlen = maxlen1 + maxlen2

glove = Glove(no_components=embedding_dim, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=1, no_threads=8, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
# glove.load('glove.model')

def get(X, Y, maxlen1, maxlen2):
    tmpX = np.zeros((len(X), maxlen1 + maxlen2 + 1), dtype=int)
    vocab = []
    vocab.extend(fulltext)
    vocab.extend(description)
    domain_vocab.append(vocab)
    # fulltext += temp["content"]

print("Total number of documents (description and content) ", len(vocab))  # 1446
# print(domain_vocab)

maxlen_content = max([len(item) for item in fulltext])
maxlen_description = max([len(item) for item in description])
print("total no of words in content ", maxlen_content)         # 59080
print("total no of words in description", maxlen_description)  # 1480

# GloVe Implementation
corpus = Corpus()
corpus.fit(vocab, window=10)
print(corpus)

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
# print(corpus.dictionary)

word2idx = corpus.dictionary          # unique content word as key and index as value
content_vector = glove.word_vectors   # content vectors with word embeddings
print("corpus.dictionary(word2idx)_lenth", len(word2idx))
print("EMBEDDING VECTOR LENGTH", len(content_vector))

"""
with open("/Users/prathibha/Documents/Project/Try1_embed.tsv","w+") as my_csv:
from glove import Glove
from glove import Corpus

vocab_count = 50000

# write vocab to file
if not os.path.exists('Embedding/main_cat/glove'):
    os.makedirs('Embedding/main_cat/glove')

# In[ ]:

if not os.path.exists("Embedding/main_cat/glove/glove.model"):
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=10)
    # corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('Embedding/main_cat/glove/glove.model')          # save the model
    corpus_model.save('Embedding/main_cat/glove/corpus.model')  # save the dictionary

glove = Glove.load('Embedding/main_cat/glove/glove.model')
import itertools
from gensim.models.word2vec import Text8Corpus
from glove import Corpus, Glove

# To install the text8 corpus you can use these commands:
#   wget http://mattmahoney.net/dc/text8.zip -P /tmp
#   unzip text8.zip

sentences = list(itertools.islice(Text8Corpus('/tmp/text8'), None))

corpus = Corpus()
corpus.fit(sentences, window=10)

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

print(glove.most_similar('frog', number=10))
print(glove.most_similar('girl', number=10))
print(glove.most_similar('car', number=10))
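# Hedged follow-up sketch (not in the original snippet): the trained model can be
# persisted and reloaded with the save/load calls used elsewhere in this collection.
# The file name 'text8_glove.model' is an assumed example path.
glove.save('text8_glove.model')
glove = Glove.load('text8_glove.model')
print(glove.most_similar('frog', number=10))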