def get_pretrained_embedding(vocabulary_processor):
    print("Load word2vec file {}\n".format(embedding_file))
    # Start from small random values, then overwrite rows for words covered by word2vec.
    initW = np.random.uniform(-0.25, 0.25, (n_words, EMBEDDING_SIZE))
    word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    for word in word_vectors.vocab:
        idx = vocabulary_processor.vocabulary_.get(word)
        if idx != 0:  # 0 is the index returned for unknown words
            initW[idx] = word_vectors[word]
    return initW
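A minimal sketch of how this helper might be wired up, assuming the demo and the function live in the same module so the globals it reads (embedding_file, n_words, EMBEDDING_SIZE) resolve, and assuming tf.contrib.learn's VocabularyProcessor as the vocabulary type (neither assumption is stated in the snippet above):

# Hypothetical wiring; paths and texts are made up for illustration.
import numpy as np
from gensim.models import KeyedVectors
from tensorflow.contrib import learn

EMBEDDING_SIZE = 300
embedding_file = './GoogleNews-vectors-negative300.bin'  # assumed path

texts = ['the quick brown fox', 'jumps over the lazy dog']
vocabulary_processor = learn.preprocessing.VocabularyProcessor(max_document_length=10)
ids = np.array(list(vocabulary_processor.fit_transform(texts)))
n_words = len(vocabulary_processor.vocabulary_)

init_embeddings = get_pretrained_embedding(vocabulary_processor)
print(init_embeddings.shape)  # (n_words, EMBEDDING_SIZE)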
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    files = kwargs.get('files', {})
    self.dataset_name = kwargs.get('dataset_name')

    # summary length restriction
    self.max_length = 100
    # Set this to 0 if every sentence length is allowed
    self.min_sentence_length = 0

    # initiate stopword lists & w2v models
    self.stopwords = {}
    self.w2v_models = {}
    model_path = 'models/word2vec/blendle/word2vec_blendle'
    self.w2v_models['blendle'] = W2VModel(
        'blendle',
        Word2Vec.load(files.get(model_path, model_path)).wv)
    model_path = 'models/word2vec/google/GoogleNews-vectors-negative300.bin'
    self.w2v_models['google'] = W2VModel(
        'google',
        KeyedVectors.load_word2vec_format(files.get(model_path, model_path), binary=True))
    stopword_path = 'models/stopwords.json'
    with open(files.get(stopword_path, stopword_path)) as fh:
        self.stopwords = json.load(fh)

    # Load model state for inference.
    # Only the normally trained models (no pre- or co-training) are included.
    self.USE_CUDA = False
    google_rnn_parameters = torch.load(
        'rnn/saved_models/google/rnn_normal',
        map_location=(lambda storage, loc: storage))
    blendle_rnn_parameters = torch.load(
        'rnn/saved_models/blendle/rnn_normal',
        map_location=(lambda storage, loc: storage))

    # Initiate model in eval mode with trained parameters
    self.rnn_model = {}
    self.rnn_model['blendle_normal'] = RNNModel(self.USE_CUDA, blendle_rnn_parameters)
    self.rnn_model['google_normal'] = RNNModel(self.USE_CUDA, google_rnn_parameters)

    # load other models
    self.svd_path = 'bin/summarization/bigram_svd_{}.model'.format(self.dataset_name)
    self.svd = sk_joblib.load(files.get(self.svd_path, self.svd_path))
    self.bigram_path = 'bin/summarization/bigrams_{}.pkl'.format(self.dataset_name)
    self.final_bigrams = pickle.load(open(self.bigram_path, 'rb'))
def embed():
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    if os.path.exists(vocab_path):
        vocab = pickle.load(open(vocab_path, "rb"))
    else:
        iw, vocab, _ = build_index()
    size = len(list(vocab.keys()))
    emb = np.zeros(shape=[size, emb_dim])
    for word, index in vocab.items():
        if index in [0, 1, 2]:
            continue
        emb[index] = model[word]
    np.save(open(emb_path, "wb"), emb)
    return vocab, emb
def get_embedding():
    emb_path = "datasets/temp/embedding.np"
    if os.path.exists(emb_path):
        return np.load(open(emb_path, 'rb'))
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        iw, vocab, _ = get_vocab()
        size = len(list(vocab.keys()))
        emb = np.zeros(shape=[size, emb_dim])
        for word, index in vocab.items():
            if index in [0, 1] or word not in model.vocab:
                continue
            emb[index] = model[word]
        np.save(open(emb_path, "wb"), emb)
        return emb
def embed():
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    vocab = model.vocab
    size = len(vocab) + 2  # two extra rows for the special tokens below
    embedding = np.zeros(shape=[size, emb_dim])
    w2idx = {'UNK': 0, "<END>": 1}
    index = 2
    for word in vocab.keys():
        if word in vocab:
            embedding[index] = model[word]
            w2idx[word] = index
            index += 1
    np.save(open(emb_path, "wb"), embedding)
    pickle.dump(w2idx, open(vocab_path, 'wb'))
    return w2idx, embedding
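A small follow-up sketch showing how the files written by embed() might be read back for lookups. emb_path and vocab_path are the same assumed module-level paths used above, and falling back to the UNK row is an assumption about how index 0 is meant to be used:

# Hypothetical reload of the artifacts written by embed().
import pickle
import numpy as np

embedding = np.load(open(emb_path, "rb"))
w2idx = pickle.load(open(vocab_path, "rb"))

def vector_for(word):
    # Use the UNK row (index 0) for out-of-vocabulary words.
    return embedding[w2idx.get(word, w2idx['UNK'])]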
def __init__(self,
             clambda=200,
             n_topics=10,
             batchsize=4096,
             power=0.75,
             words_pretrained=True,
             temperature=1,
             max_length=1000,
             min_count=0,
             word2vec_path=None):
    # 'Strength' of the Dirichlet prior; 200.0 seems to work well
    self.clambda = clambda
    # Number of topics to fit
    self.n_topics = n_topics  # int(os.getenv('n_topics', 10))
    self.batchsize = batchsize
    # Power for negative sampling
    self.power = power  # float(os.getenv('power', 0.75))
    # Initialize with pretrained word vectors
    self.words_pretrained = words_pretrained  # bool(int(os.getenv('pretrained', True)))
    self.temp = temperature
    self.max_length = max_length
    self.min_count = min_count
    self.word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
def train_step():
    config = Config()
    with tf.Graph().as_default(), tf.Session() as session:
        # initializer = tf.random_normal_initializer(0.0, 0.02, dtype=tf.float64)
        with tf.variable_scope("mymodel", reuse=None, dtype=tf.float64):
            model = LSTMRNN(config=config, is_training=True)
        with tf.variable_scope("mymodel", reuse=True, dtype=tf.float64):
            valid_model = LSTMRNN(config=config, is_training=False)
            test_model = LSTMRNN(config=config, is_training=False)

        # add checkpoint
        pre_checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints-pre"))
        pre_checkpoint_prefix = os.path.join(pre_checkpoint_dir, "model-pre")
        if not os.path.exists(pre_checkpoint_dir):
            os.makedirs(pre_checkpoint_dir)
        checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        saver = tf.train.Saver(tf.global_variables())
        saver.restore(session, FLAGS.checkpoint2train)
        # tf.global_variables_initializer().run()
        global_steps = 1984156
        index = np.random.randint(1, 4000)
        begin_time = int(time.time())

        print("loading the dataset...")
        pretrained_word_model = KeyedVectors.load_word2vec_format(
            './data/GoogleNews-vectors-negative300.bin.gz', binary=True)

        # Pre-training is disabled in this variant; the block is kept for reference.
        '''
        pre_train_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                                  datapath='./data/stsallrmf.mask2.p',
                                                  embed_dim=FLAGS.embedding_dim,
                                                  alpha=FLAGS.alpha_weight)
        print("index: ", index)
        print("begin pre-training")
        for i in range(FLAGS.pre_num_epoch):
            print("the %d epoch pre-training..." % (i + 1))
            lr = model.assign_new_lr(session, config.lr)
            print("current learning rate is %f" % lr)
            # 11000+ samples
            train_data, valid_data = cut_data(pre_train_data, 0.005)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data)
            if i % FLAGS.pre_check_point_every == 0:
                path = saver.save(session, pre_checkpoint_prefix, global_steps)
                pos = 'index: %d, position: %s' % (i, path)
                with open(FLAGS.pre_logs_dir, 'a') as f:
                    f.write(pos)
                    f.write('\n')
                print(pos)
        print("pre-train finish.")
        '''

        print("begin training")
        # data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
        #                                 datapath='./data/semtrain.mask2.p',
        #                                 embed_dim=FLAGS.embedding_dim, alpha=FLAGS.alpha_weight)
        test_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                             datapath='./data/semtest.mask2.p',
                                             embed_dim=FLAGS.embedding_dim,
                                             alpha=FLAGS.alpha_weight)
        # print("length of train set:", len(data[2]))
        # print("example of train set: ", data[3][index])
        print("length of test set:", len(test_data[2]))
        print("example of test set:", test_data[3][index])

        x1, x2, y, m1, m2 = test_data
        fetches = [test_model.sent1, test_model.sent2]
        feed_dict = {}
        feed_dict[test_model.input_data_s1] = x1
        feed_dict[test_model.input_data_s2] = x2
        feed_dict[test_model.target] = y
        feed_dict[test_model.mask_s1] = m1
        feed_dict[test_model.mask_s2] = m2
        test_model.assign_new_batch_size(session, len(x1))
        sentence1, sentence2 = session.run(fetches, feed_dict)
        print('type of sent1: ', type(sentence1))
        print('len of sent1: ', len(sentence1))
        print('type of sent2: ', type(sentence2))
        print('len of sent2: ', len(sentence2))

        # The training loop is also disabled in this variant.
        '''
        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            lr = model.assign_new_lr(session, config.lr)
            print("current learning rate is %f" % lr)
            train_data, valid_data = cut_data(data, 0.01)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data)
            if i % config.checkpoint_every == 0 and i != 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                test_cost, test_pearson_r, test_spearman_r = evaluate(test_model, session,
                                                                      test_data, istest=True)
                res = 'index: %d, cost: %f, pearson_r: %f, spearman_r: %f' % (
                    i, test_cost, test_pearson_r, test_spearman_r)
                print(res)
                with open(FLAGS.logs_dir, 'a') as f:
                    f.write(res)
                    f.write('\n')
                    f.write('model position: {}\n'.format(path))
                print("Saved results checkpoint to {}\n".format(path))
        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        '''
        print("program end!")
def load_model(path, save=None):
    wv_from_text = KeyedVectors.load_word2vec_format(path, binary=False)
    if save:
        wv_from_text.save(save)
    return wv_from_text
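Because KeyedVectors.save() writes gensim's native format, later runs can reload the vectors much faster than re-parsing the text file, optionally memory-mapped. A usage sketch with assumed file names and an assumed in-vocabulary query word:

# 'vectors.txt' and 'vectors.kv' are placeholder file names.
wv = load_model('vectors.txt', save='vectors.kv')

# Later runs skip the slow text parse; mmap='r' memory-maps the stored arrays.
from gensim.models import KeyedVectors
wv = KeyedVectors.load('vectors.kv', mmap='r')
print(wv.most_similar('computer', topn=3))  # 'computer' stands in for any in-vocabulary word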
def raiseError(error):
    return error


if __name__ == '__main__':
    global model

    # ----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "./model.bin.gz"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print("Usage: word2vec-api.py --model path/to/the/model [--host host --port 1234]")

    model = kv.load_word2vec_format(model_path, binary=binary)
    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(MostSimilar, path + '/most_similar')
    api.add_resource(Model, path + '/model')
    api.add_resource(ModelWordSet, '/word2vec/model_word_set')
    app.run(host=host, port=port)
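Once the service is running, the registered endpoints can be queried over HTTP. A hypothetical client call for the Similarity resource; the query-parameter names (w1, w2) are an assumption, since that resource class is defined elsewhere:

# Hypothetical client; adjust parameter names to match the Similarity resource.
import requests

resp = requests.get('http://localhost:5000/word2vec/similarity',
                    params={'w1': 'king', 'w2': 'queen'})
print(resp.status_code, resp.text)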
def loadW2V(fileName):
    """Loads the word2vec model."""
    return KeyedVectors.load_word2vec_format(fileName, binary=True)
import os

import annoy
import numpy as np
from nlpia.data.loaders import BIGDATA_PATH
from gensim.models.word2vec import KeyedVectors

path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')
wv = KeyedVectors.load_word2vec_format(path, binary=True)
len(wv.vocab)
# 3000000

# Build an Annoy index with one item per word vector.
index = annoy.AnnoyIndex(f=len(wv[wv.index2word[0]]))
for i, word in enumerate(wv.index2word):
    if not i % 100000:
        print('{}: {}'.format(i, word))
    index.add_item(i, wv[word])
# 0: </s>
# 100000: distinctiveness
# ...
# 2600000: cedar_juniper
# 2700000: Wendy_Liberatore
# 2800000: Management_GDCM
# 2900000: BOARDED_UP

num_vectors = len(wv.vocab)
num_trees = int(np.log(num_vectors).round(0))
index.build(num_trees)
index.save('Word2vec_index.ann')
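A short continuation showing how the built index might be queried: get_nns_by_vector is Annoy's standard approximate nearest-neighbor lookup, and 'dog' stands in for any in-vocabulary word.

# Approximate nearest neighbors for one word's vector, reusing wv and index from above.
query_vector = wv['dog']
neighbor_ids = index.get_nns_by_vector(query_vector, 11)  # the word itself plus ~10 neighbors
print([wv.index2word[i] for i in neighbor_ids])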
def train_step():
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0

    # gpu_config = tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session() as session:
        # This initializer works poorly; results are very bad.
        initializer = tf.random_normal_initializer(0.0, 0.2, dtype=tf.float32)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = LSTMRNN(config=config, sess=session, is_training=True)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            valid_model = LSTMRNN(config=eval_config, sess=session, is_training=False)
            test_model = LSTMRNN(config=eval_config, sess=session, is_training=False)

        # add summary
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)

        # add checkpoint
        checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        tf.global_variables_initializer().run()
        global_steps = 1
        begin_time = int(time.time())

        print("loading the dataset...")
        pretrained_word_model = KeyedVectors.load_word2vec_format(
            './GoogleNews-vectors-negative300.bin.gz', binary=True)
        pre_train_data = data_helper.load_data(FLAGS.max_len, pretrained_word_model,
                                               datapath='./data/stsallrmf.p',
                                               embed_dim=FLAGS.emdedding_dim)
        data = data_helper.load_data(FLAGS.max_len, pretrained_word_model,
                                     datapath='./data/semtrain.p',
                                     embed_dim=FLAGS.emdedding_dim)
        test_data = data_helper.load_data(FLAGS.max_len, pretrained_word_model,
                                          datapath='./data/semtest.p',
                                          embed_dim=FLAGS.emdedding_dim)
        print("length of pre-train set:", len(pre_train_data[0]))
        print("length of train set:", len(data[0]))
        print("length of test set:", len(test_data[0]))

        print("begin pre-training")
        for i in range(70):
            print("the %d epoch pre-training..." % (i + 1))
            lr = model.assign_new_lr(session, config.lr)
            print("current learning rate is %f" % lr)
            # 11000+ samples
            train_data, valid_data = cut_data(pre_train_data, 0.05)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     train_summary_writer, dev_summary_writer)
        path = saver.save(session, checkpoint_prefix, global_steps)
        print("pre-train finish.")
        print("Saved pre-train model checkpoint to {}\n".format(path))

        print("begin training")
        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            # lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            lr = model.assign_new_lr(session, config.lr)
            print('current learning rate is %f' % lr)
            train_data, valid_data = cut_data(data, 0.1)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     train_summary_writer, dev_summary_writer)
            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        test_cost, test_pearson_r = evaluate(test_model, session, test_data)
        print("the test data cost is %f" % test_cost)
        print("the test data pearson_r is %f" % test_pearson_r)
        print("program end!")
#### keras tensorflow
# ref to StyleTransfer.ipynb

#### Embedding
# ref to: MNIST/Embeddings.ipynb
As long as embeddings exist, Keras records them automatically and they show up directly in TensorBoard. The only manual step is generating the metadata file, which can either be bound in the program or loaded in TensorBoard; generating it ahead of time or from within the program both work (a minimal sketch of such a metadata file appears at the end of this note).

#### NLP, NLTK, ngrams
# ref to: Quora/Quora-neural-network.ipynb
# ref to: Quora/Quora-Feature-Enginnering.ipynb

#### word2vec
from gensim.models.word2vec import Word2Vec, KeyedVectors

word2vec = KeyedVectors.load_word2vec_format(
    '/input/Kaggle/Word2Vec/GoogleNews-vectors-negative300.bin.gz', binary=True)
'apple' in word2vec.vocab
# True

#### Tokenizer-Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer

# Custom analyzer with stemming added
count_analyzer = CountVectorizer().build_analyzer()
stemmer = EnglishStemmer()

def stem_count_analyzer(doc):
    return (stemmer.stem(w) for w in count_analyzer(doc))

cv = CountVectorizer(analyzer=stem_count_analyzer, preprocessor=None,
                     stop_words='english', max_features=128)
cv.fit(unique_questions)
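As mentioned in the Embedding note above, the TensorBoard projector only needs a plain metadata file with one label per line, in the same row order as the embedding matrix. A minimal sketch, where the words list is a made-up example of that vocabulary order:

# One label per row of the embedding matrix; a single unnamed column needs no header line.
words = ['the', 'quick', 'brown', 'fox']   # hypothetical vocabulary order
with open('metadata.tsv', 'w', encoding='utf-8') as fh:
    for word in words:
        fh.write(word + '\n')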
    # Remove the projection onto the principal component(s) in pc.
    XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


if isTest:
    words_dict = {'I': np.array([1., 1., 1., 1.], dtype=float),
                  'am': np.array([2., 2., 2., 2.], dtype=float),
                  'You': np.array([0.9, 0.9, 0.9, 0.9], dtype=float),
                  'Today': np.array([10.0, 9.0, 8.0, 7.0], dtype=float),
                  'boy': np.array([4.5, 3., 5., 6.], dtype=float),
                  'girl': np.array([4.4, 3.1, 5.2, 6.1], dtype=float)}
else:
    print('Loading word vectors...')
    # word2vec word-vector dictionary
    words_dict = KeyedVectors.load_word2vec_format(
        './others/GoogleNews-vectors-negative300.bin.gz', binary=True)
    print('Word vectors loaded!')

if isTest:
    embed_domins = 4
    p1 = ['I am a boy . ', 'You are a girl . ', 'I like playing basketball . ']
    p2 = ['Today is a nice day .', 'Something will happen today . ', 'Do you love me ? ']
    scores = [0.5, 0.4, 0.3]
    weights_dict = {'am': 0.5}
else:
    p1 = []
    p2 = []
    scores = []
    with open(single_data_path, 'r') as f:
        for line in f:
def train_step():
    config = Config()
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_normal_initializer(0.0, 0.02, dtype=tf.float64)
        with tf.variable_scope("mymodel", reuse=None, initializer=initializer, dtype=tf.float64):
            model = LSTMRNN(config=config, is_training=True)
        with tf.variable_scope("mymodel", reuse=True, initializer=initializer, dtype=tf.float64):
            valid_model = LSTMRNN(config=config, is_training=False)
            test_model = LSTMRNN(config=config, is_training=False)

        # add checkpoint
        pre_checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints-pre"))
        pre_checkpoint_prefix = os.path.join(pre_checkpoint_dir, "model-pre")
        if not os.path.exists(pre_checkpoint_dir):
            os.makedirs(pre_checkpoint_dir)
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        tf.global_variables_initializer().run()
        global_steps = 1
        begin_time = int(time.time())

        print("loading the dataset...")
        pretrained_word_model = KeyedVectors.load_word2vec_format(
            './data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        pre_train_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                                  datapath='./data/stsallrmf.mask2.p',
                                                  embed_dim=FLAGS.embedding_dim,
                                                  alpha=FLAGS.alpha_weight)
        print("length of pre-train set:", len(pre_train_data[2]))

        print("begin pre-training...")
        for i in range(FLAGS.pre_num_epoch):
            print("the %d epoch pre-training..." % (i + 1))
            lr = model.assign_new_lr(session, config.lr)
            print("current learning rate is %f" % lr)
            # 11000+ samples
            train_data, valid_data = cut_data(pre_train_data, 0.005)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data)
            if i % config.pre_checkpoint_every == 0 and i != 0:
                path = saver.save(session, pre_checkpoint_prefix, global_steps)
                pos = 'index: %d, position: %s' % (i, path)
                with open(FLAGS.pre_logs_dir, 'a') as f:
                    f.write(pos)
                    f.write('\n')
                print(pos)
        print("pre-train finish.")

        print("begin training")
        data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                        datapath='./data/semtrain.mask2.p',
                                        embed_dim=FLAGS.embedding_dim,
                                        alpha=FLAGS.alpha_weight)
        test_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                             datapath='./data/semtest.mask2.p',
                                             embed_dim=FLAGS.embedding_dim,
                                             alpha=FLAGS.alpha_weight)
        print("length of train set:", len(data[2]))
        print("length of test set:", len(test_data[2]))

        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            lr = model.assign_new_lr(session, config.lr)
            print("current learning rate is %f" % lr)
            train_data, valid_data = cut_data(data, 0.01)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data)
            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                test_cost, test_pearson_r, test_spearman_r = evaluate(test_model, session,
                                                                      test_data, istest=True)
                res = 'index: %d, cost: %f, pearson_r: %f, spearman_r: %f' % (
                    i, test_cost, test_pearson_r, test_spearman_r)
                print(res)
                with open(FLAGS.logs_dir, 'a') as f:
                    f.write(res)
                    f.write('\n')
                    f.write('model position: {}\n'.format(path))
                print("Saved results checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        print("program end!")
args = parser.parse_args()
np.random.seed(args.seed)
random.seed(args.seed)

meta = Meta()
if args.edev:
    edev = read(args.edev, lang='en')
if args.hdev:
    hdev = read(args.hdev, lang='hi')
if args.cdev:
    cdev = read(args.cdev, lang='dev')

if not args.load_model:
    train_e = read(args.etrain, 'en')
    train_h = read(args.htrain, 'hi')
    ewvm = KeyedVectors.load_word2vec_format(args.eembd, binary=args.bvec, limit=args.elimit)
    hwvm = KeyedVectors.load_word2vec_format(args.hembd, binary=args.bvec, limit=args.hlimit)
    meta.w_dim_eng = ewvm.syn0.shape[1]
    meta.n_words_eng = ewvm.syn0.shape[0] + meta.add_words
    meta.w_dim_hin = hwvm.syn0.shape[1]
    meta.n_words_hin = hwvm.syn0.shape[0] + meta.add_words
    get_char_map(train_e + train_h)
    meta.ew2i = {}
    for w in ewvm.vocab:
        meta.ew2i[w] = ewvm.vocab[w].index + meta.add_words
    meta.hw2i = {}
    for w in hwvm.vocab:
def load_model():
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    return model
def train_word2vec():
    '''Train the word vectors.'''
    if not os.path.exists(text_path):
        build_text()
    model = Word2Vec(sentences=LineSentence(text_path), size=emb_dim,
                     window=5, min_count=5, iter=10)
    model.wv.save_word2vec_format(model_path, binary=True)
    return model


def get_embedding():
    emb_path = "datasets/temp/embedding.np"
    if os.path.exists(emb_path):
        return np.load(open(emb_path, 'rb'))
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        iw, vocab, _ = get_vocab()
        size = len(list(vocab.keys()))
        emb = np.zeros(shape=[size, emb_dim])
        for word, index in vocab.items():
            if index in [0, 1] or word not in model.vocab:
                continue
            emb[index] = model[word]
        np.save(open(emb_path, "wb"), emb)
        return emb


if __name__ == "__main__":
    train_word2vec()
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    emb = get_embedding()
saver = tf.train.Saver(var_list=tf.trainable_variables(),
                       max_to_keep=FLAGS.num_checkpoints)

# Write vocabulary
# vocabulary_processor.save(os.path.join(FLAGS.checkpointDir, "vocab"))

initW = None
if FLAGS.embedding_type in ['static', 'none_static', 'multiple_channels']:
    # initial matrix with random uniform
    initW = np.random.uniform(
        -0.25, 0.25,
        (len(vocabulary_processor.vocabulary_), FLAGS.embedding_dim))
    # load any vectors from the word2vec model
    print("Load word2vec file {}\n".format(FLAGS.embedding_file))
    word_vectors = KeyedVectors.load_word2vec_format(
        FLAGS.embedding_file, binary=True)
    for word in word_vectors.vocab:
        idx = vocabulary_processor.vocabulary_.get(word)
        if idx != 0:
            initW[idx] = word_vectors[word]
    sess.run(cnn.W.assign(initW))
    if FLAGS.embedding_type == 'multiple_channels':
        sess.run(cnn.W_static.assign(initW))

# Initialize all variables
sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

def train_step(x_batch_train, y_batch_train):
    """
    A single training step