def build_graph(self):
  """Build the graph for the full model."""
  opts = self._options
  # The training data. A text file.
  (words, counts, words_per_epoch, self._epoch, self._words, examples,
   labels) = word2vec.skipgram(filename=opts.train_data,
                               batch_size=opts.batch_size,
                               window_size=opts.window_size,
                               min_count=opts.min_count,
                               subsample=opts.subsample)
  (opts.vocab_words, opts.vocab_counts,
   opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
  opts.vocab_size = len(opts.vocab_words)
  print("Data file: ", opts.train_data)
  print("Vocab size: ", opts.vocab_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  self._examples = examples
  self._labels = labels
  self._id2word = opts.vocab_words
  for i, w in enumerate(self._id2word):
    self._word2id[w] = i
  true_logits, sampled_logits = self.forward(examples, labels)
  loss = self.nce_loss(true_logits, sampled_logits)
  tf.scalar_summary("NCE loss", loss)
  self._loss = loss
  self.optimize(loss)
  # Properly initialize all variables.
  tf.initialize_all_variables().run()
  self.saver = tf.train.Saver()
def build_graph(self):
  # Get the training data.
  (words, counts, words_per_epoch, current_epoch, total_words_processed,
   examples, labels) = word2vec.skipgram(filename=word_config.train_data_path,
                                         batch_size=word_config.batch_size,
                                         window_size=word_config.window_size,
                                         min_count=word_config.min_count,
                                         subsample=word_config.subsample)
  # Reuse a previously saved vocabulary if one exists; otherwise build it
  # from the training data.
  if tf.gfile.Exists(os.path.join(word_config.output_dir, 'vocab.txt')):
    vocab_words, vocab_counts = self.load_vocab()
  else:
    vocab_words, vocab_counts = self._sess.run([words, counts])
  vocab_size = len(vocab_words)
  print("Data file: ", word_config.train_data_path)
  print("Vocab size: ", vocab_size - 1, " + UNK")
  self._id2word = vocab_words
  for i, word in enumerate(self._id2word):
    self._word2id[word] = i
  # Input and output embedding matrices: [vocab_size, embed_size].
  w_embed_in = tf.Variable(
      tf.random_uniform([vocab_size, word_config.embed_size],
                        -0.5 / word_config.embed_size,
                        0.5 / word_config.embed_size),
      name="w_embed_in")
  w_embed_out = tf.Variable(tf.zeros([vocab_size, word_config.embed_size]),
                            name="w_embed_out")
  self.param_summary(w_embed_in)
  self.param_summary(w_embed_out)
  # Global step: scalar, i.e., shape [].
  global_step = tf.Variable(0, trainable=False, name="global_step")
  # Linear learning rate decay.
  total_words = words_per_epoch * word_config.max_steps
  learning_rate = word_config.learning_rate * tf.maximum(
      0.0001,
      tf.cast(1 - total_words_processed / total_words, tf.float32))
  # Training node: one negative-sampling update per call.
  inc = global_step.assign_add(1)
  with tf.control_dependencies([inc]):
    train = word2vec.neg_train(w_embed_in, w_embed_out, examples, labels,
                               learning_rate, vocab_counts.tolist(),
                               word_config.nr_neg_samples)
  self._vocab_words = vocab_words
  self._vocab_counts = vocab_counts
  self._vocab_size = vocab_size
  self._w_embed_in = w_embed_in
  self._w_embed_out = w_embed_out
  self._train = train
  self._examples = examples
  self._labels = labels
  self._global_step = global_step
  self._current_epoch = current_epoch
  self._total_words_processed = total_words_processed
  self._learning_rate = learning_rate
  print("end of build graph")
def build_graph(self):
  """Build the model graph."""
  opts = self._options
  # The training data. A text file.
  (words, counts, words_per_epoch, current_epoch, total_words_processed,
   examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                         batch_size=opts.batch_size,
                                         window_size=opts.window_size,
                                         min_count=opts.min_count,
                                         subsample=opts.subsample)
  (opts.vocab_words, opts.vocab_counts,
   opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
  opts.vocab_size = len(opts.vocab_words)
  print("Data file: ", opts.train_data)
  print("Vocab size: ", opts.vocab_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  self._id2word = opts.vocab_words
  for i, w in enumerate(self._id2word):
    self._word2id[w] = i

  # Declare all variables we need.
  # Input words embedding: [vocab_size, emb_dim]
  w_in = tf.Variable(
      tf.random_uniform([opts.vocab_size, opts.emb_dim],
                        -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
      name="w_in")
  # Output words embedding: [vocab_size, emb_dim]
  w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")
  # Global step: scalar, i.e., shape [].
  global_step = tf.Variable(0, name="global_step")
  # Linear learning rate decay.
  words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
  lr = opts.learning_rate * tf.maximum(
      0.0001,
      1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)
  # Training nodes.
  inc = global_step.assign_add(1)
  with tf.control_dependencies([inc]):
    train = word2vec.neg_train(w_in, w_out, examples, labels, lr,
                               vocab_count=opts.vocab_counts.tolist(),
                               num_negative_samples=opts.num_samples)
  self._w_in = w_in
  self._examples = examples
  self._labels = labels
  self._lr = lr
  self._train = train
  self.step = global_step
  self._epoch = current_epoch
  self._words = total_words_processed
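# --- Illustrative sketch (not part of the original code) ---
# The linear learning-rate decay above multiplies the initial rate by
# max(0.0001, 1 - words_processed / words_to_train), so the rate falls
# linearly to a floor of 0.0001 * initial. The helper below and the
# numbers used (initial rate 0.2, ~17M words/epoch, 15 epochs) are
# assumptions for illustration only.
def decayed_lr(initial_lr, words_processed, words_to_train):
  return initial_lr * max(0.0001, 1.0 - float(words_processed) / words_to_train)

words_to_train = 17000000.0 * 15
for frac in (0.0, 0.5, 0.99, 1.0):
  print(frac, decayed_lr(0.2, frac * words_to_train, words_to_train))
# With an assumed initial rate of 0.2: 0.2 -> 0.1 -> 0.002 -> 0.00002 (the floor).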
def build_graph(self):
  """Build the graph for the full model."""
  opts = self._options
  # The training data. A text file.
  (words, counts, words_per_epoch, self._epoch, self._words, examples,
   labels) = word2vec.skipgram(filename=opts.train_data,
                               batch_size=opts.batch_size,
                               window_size=opts.window_size,
                               min_count=opts.min_count,
                               subsample=opts.subsample)
  (opts.vocab_words, opts.vocab_counts,
   opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
  opts.vocab_size = len(opts.vocab_words)
  print("Data file: ", opts.train_data)
  print("Vocab size: ", opts.vocab_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  self._examples = examples
  self._labels = labels
  self._id2word = opts.vocab_words
  for i, w in enumerate(self._id2word):
    self._word2id[w] = i
  loss = self.calculate_loss(examples, labels)
  self._loss = loss
  if opts.normclip:
    self._clip_ops = self.clip_ops_graph(self._examples, self._labels,
                                         self._neg_idxs)
  if opts.adagrad:
    print("Using Adagrad as an optimizer!")
    self.optimize_adagrad(loss)
  else:
    # Using standard SGD.
    self.optimize(loss)
  tf.scalar_summary('learning rate', self._lr)
  # Properly initialize all variables.
  self.check_op = tf.add_check_numerics_ops()
  tf.initialize_all_variables().run()
  try:
    print('Try using saver version v2')
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=opts.max_to_keep)
  except:
    print('Default to saver version v1')
    self.saver = tf.train.Saver(max_to_keep=opts.max_to_keep)
def build_graph(self):
  opts = self._options
  # The training data: hard-coded to the "text8" corpus, with no subsampling.
  (words, counts, words_per_epoch, self._epoch, self._words, examples,
   labels) = word2vec.skipgram(filename="text8",
                               batch_size=opts.batch_size,
                               window_size=opts.window_size,
                               min_count=opts.min_count,
                               subsample=0)
  (opts.vocab_words, opts.vocab_counts,
   opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
  opts.vocab_size = len(opts.vocab_words)
  print("Data file: ", opts.train_data)
  print("Vocab size: ", opts.vocab_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  self._examples = examples
  self._labels = labels
  self._id2word = opts.vocab_words
  for i, w in enumerate(self._id2word):
    self._word2id[w] = i
  true_logits, sampled_logits = self.forward(examples, labels)
  loss = self.nce_loss(true_logits, sampled_logits)
  tf.scalar_summary("NCE loss", loss)
  self._loss = loss
  self.optimize(loss)
def build_graph(self):
  """Build the model graph."""
  opts = self._options
  # The training data for the text skip-gram model. A text file.
  (words, w_counts, words_per_epoch, w_current_epoch, total_words_processed,
   w_examples, w_labels) = word2vec.skipgram(filename=opts.text_data,
                                             batch_size=opts.batch_size,
                                             window_size=opts.window_size,
                                             min_count=opts.min_count,
                                             subsample=opts.subsample)
  # The training data for the entity (knowledge graph) skip-gram model.
  (entities, e_counts, entities_per_epoch, e_current_epoch,
   total_entities_processed, e_examples, e_labels) = kg_skipgram(
       filename=opts.kg_data, batch_size=opts.batch_size,
       min_count=opts.min_count)
  (opts.vocab_words, vocab_word_counts, opts.words_per_epoch,
   opts.vocab_entities, vocab_entity_counts,
   opts.entities_per_epoch) = self._session.run(
       [words, w_counts, words_per_epoch,
        entities, e_counts, entities_per_epoch])
  # The training data for the anchor (word-entity alignment) model.
  (anchors_per_epoch, a_current_epoch, total_anchors_processed, a_examples,
   a_labels) = align_model(filename=opts.anchor_data,
                           batch_size=opts.batch_size,
                           window_size=opts.window_size,
                           subsample=opts.subsample,
                           vocab_word=opts.vocab_words.tolist(),
                           vocab_word_freq=vocab_word_counts.tolist(),
                           vocab_entity=opts.vocab_entities.tolist())
  opts.vocab_word_size = len(opts.vocab_words)
  opts.vocab_entity_size = len(opts.vocab_entities)
  opts.vocab_size = opts.vocab_word_size + opts.vocab_entity_size
  opts.anchors_per_epoch = self._session.run(anchors_per_epoch)
  # For negative sampling: [vocab_word_counts, 0...0] and
  # [0...0, vocab_entity_counts].
  opts.vocab_word_counts = tf.concat(
      0, [vocab_word_counts, tf.zeros([opts.vocab_entity_size])]).eval()
  opts.vocab_entity_counts = tf.concat(
      0, [tf.zeros([opts.vocab_word_size]), vocab_entity_counts]).eval()
  print("Text data file: ", opts.text_data)
  print("Word vocab size: ", opts.vocab_word_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  print("Entity data file: ", opts.kg_data)
  print("Entity vocab size: ", opts.vocab_entity_size)
  print("Entities per epoch: ", opts.entities_per_epoch)
  print("Anchor data file: ", opts.anchor_data)
  print("Anchors per epoch: ", opts.anchors_per_epoch)
  # Indices below opts.vocab_word_size are words; the rest are entities.
  self._id2item = np.concatenate((opts.vocab_words, opts.vocab_entities), 0)
  for i, w in enumerate(self._id2item):
    self._item2id[w] = i

  # Declare all variables we need.
  # Input embedding covering both words and entities: [vocab_size, emb_dim].
  # Shard the variable if it would exceed ~2 GB, because of the saver's
  # per-tensor size limit.
  self._is_shard = False
  vocab_size_limit = 511000000 / opts.emb_dim  # ~(2048 / 4 - 1) MB of floats
  if opts.vocab_size > vocab_size_limit:
    self._is_shard = True
    remain_size = opts.vocab_size
    for i in xrange(opts.vocab_size / vocab_size_limit):
      self._v_in.append(tf.Variable(
          tf.random_uniform([vocab_size_limit, opts.emb_dim],
                            -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
          name="v_in" + str(i)))
      self._v_out.append(tf.Variable(
          tf.zeros([vocab_size_limit, opts.emb_dim]),
          name="v_out" + str(i)))
      remain_size -= vocab_size_limit
    if remain_size != 0:
      tmp_cap = len(self._v_in)
      self._v_in.append(tf.Variable(
          tf.random_uniform([remain_size, opts.emb_dim],
                            -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
          name="v_in" + str(tmp_cap)))
      self._v_out.append(tf.Variable(
          tf.zeros([remain_size, opts.emb_dim]),
          name="v_out" + str(tmp_cap)))
  else:
    self._v_in.append(tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim],
                          -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="v_in"))
    self._v_out.append(tf.Variable(
        tf.zeros([opts.vocab_size, opts.emb_dim]), name="v_out"))
  # Global step: scalar, i.e., shape [].
  global_step = tf.Variable(0, name="global_step")
  # Linear learning rate decay.
  words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
  lr = opts.learning_rate * tf.maximum(
      0.0001,
      1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)
  # Concatenate the shards into single tensors when sharding is enabled.
  v_in = tf.concat(0, self._v_in) if self._is_shard else self._v_in[0]
  v_out = tf.concat(0, self._v_out) if self._is_shard else self._v_out[0]
  # Training nodes: one negative-sampling update per data source.
  inc = global_step.assign_add(1)
  with tf.control_dependencies([inc]):
    w_train = word2vec.neg_train(v_in, v_out, w_examples, w_labels, lr,
                                 vocab_count=opts.vocab_word_counts.tolist(),
                                 num_negative_samples=opts.num_samples)
    e_train = neg_train(v_in, v_out,
                        e_examples + opts.vocab_word_size,
                        e_labels + opts.vocab_word_size, lr,
                        vocab_count=opts.vocab_entity_counts.tolist(),
                        num_negative_samples=opts.num_samples)
    a_train = word2vec.neg_train(v_in, v_out,
                                 a_examples + opts.vocab_word_size,
                                 a_labels, lr,
                                 vocab_count=opts.vocab_word_counts.tolist(),
                                 num_negative_samples=opts.num_samples)
  self._lr = lr
  self._train_text = w_train
  self._train_kg = e_train
  self._train_align = a_train
  self.step = global_step
  self._epoch_text = w_current_epoch
  self._epoch_kg = e_current_epoch
  self._epoch_align = a_current_epoch
  self._words = total_words_processed
  self._entities = total_entities_processed
  self._anchors = total_anchors_processed
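# --- Illustrative back-of-the-envelope check (not part of the original code) ---
# The sharding threshold above keeps each embedding shard under the roughly
# 2 GiB per-tensor limit the original comment refers to: a shard holds
# vocab_size_limit rows of emb_dim float32 values, i.e. about
# 511e6 floats * 4 bytes ~= 2.04e9 bytes. emb_dim below is an assumed
# example value, not taken from opts.
emb_dim = 200
vocab_size_limit = 511000000 / emb_dim   # rows per shard, as computed above
shard_bytes = vocab_size_limit * emb_dim * 4
print(shard_bytes)                       # 2044000000 bytes, just under 2 GiB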
def build_graph(self):
  """Build the model graph."""
  opts = self._options
  # The training data. A text file.
  (words, counts, words_per_epoch, current_epoch, total_words_processed,
   examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                         batch_size=opts.batch_size,
                                         window_size=opts.window_size,
                                         min_count=opts.min_count,
                                         subsample=opts.subsample)
  (opts.vocab_words, opts.vocab_counts,
   opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
  opts.vocab_size = len(opts.vocab_words)
  print("Data file: ", opts.train_data)
  print("Vocab size: ", opts.vocab_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  self._id2word = opts.vocab_words
  for i, w in enumerate(self._id2word):
    self._word2id[w] = i

  # Look up the positive/negative seed words of the chosen polarity lexicon
  # in the vocabulary before building the rest of the graph.
  SOCIAL = False
  SOL = True
  num_words = 100
  LOVEHATE = False
  FIN = False
  BULLBEAR = False
  if SOCIAL:
    # Use one of the social polar lexicons.
    fileNames = [
        "./data/train_lexicons/10_social_employment_opportunities.txt",
        "./data/train_lexicons/10_social_freedom_from_discrimination.txt",
        "./data/train_lexicons/10_social_good_education.txt",
        "./data/train_lexicons/10_social_honest_and_responsive_government.txt",
        "./data/train_lexicons/10_social_political_freedom.txt"
    ]
    social_issue_to_use = 0
    polarity_dict = pd.read_csv(fileNames[social_issue_to_use], header=None,
                                names=["pos", "neg"])
    neg_terms = polarity_dict["neg"]
    pos_terms = polarity_dict["pos"]
  elif SOL:
    # Use the stock opinion lexicon.
    neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv", header=0)
    pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv", header=0)
    ordered = True
    if ordered:
      neg_terms = neg_terms.sort_values(
          ["v1"], axis=0, ascending=True)["w1"].iloc[:num_words]
      pos_terms = pos_terms.sort_values(
          ["v1"], axis=0, ascending=False)["w1"].iloc[:num_words]
    else:
      neg_terms = neg_terms.sample(n=num_words)["w1"]
      pos_terms = pos_terms.sample(n=num_words)["w1"]
  elif LOVEHATE:
    # Use the love-hate lexicon.
    justlovehate = True
    neg_terms = pd.read_csv("./data/train_lexicons/hate.txt", names=["neg"])
    pos_terms = pd.read_csv("./data/train_lexicons/love.txt", names=["pos"])
    if justlovehate:
      neg_terms = neg_terms["neg"].iloc[-1]
      pos_terms = pos_terms["pos"].iloc[-1]
    else:
      neg_terms = neg_terms["neg"].iloc[15:]
      pos_terms = pos_terms["pos"].iloc[15:]
  elif FIN:
    # Use the FIN lexicon.
    neg_terms = pd.read_csv("./data/train_lexicons/fin_negatives.csv",
                            header=0, index_col=0)["negs"]
    pos_terms = pd.read_csv("./data/train_lexicons/fin_positives.csv",
                            header=0, index_col=0)["poss"]
  elif BULLBEAR:
    neg_terms = ["bearish"]
    pos_terms = ["bullish"]

  self.neg_terms_in_vocab = []
  self.neg_ids = []
  self.pos_terms_in_vocab = []
  self.pos_ids = []
  for neg_term in neg_terms:
    neg_term = neg_term.encode()
    if neg_term in opts.vocab_words:
      self.neg_terms_in_vocab.append(neg_term)
      self.neg_ids.append(self._word2id.get(neg_term, 0))
  self.neg_ids = tf.constant(self.neg_ids)
  for pos_term in pos_terms:
    pos_term = pos_term.encode()
    if pos_term in opts.vocab_words:
      self.pos_terms_in_vocab.append(pos_term)
      self.pos_ids.append(self._word2id.get(pos_term, 0))
  self.pos_ids = tf.constant(self.pos_ids)

  if LOVEHATE:
    # Evaluation currently only works for the love-hate lexicon.
    self.eval_neg_id = [self._word2id.get("hate", 0)]
    self.eval_neg_id = tf.constant(self.eval_neg_id)
    self.eval_pos_id = [self._word2id.get("love", 0)]
    self.eval_pos_id = tf.constant(self.eval_pos_id)
    # Eval neg_ids and pos_ids (all training words).
    neg_terms = pd.read_csv("./data/train_lexicons/hate.txt", names=["neg"])
    pos_terms = pd.read_csv("./data/train_lexicons/love.txt", names=["pos"])
    neg_terms = neg_terms["neg"].iloc[15:]
    pos_terms = pos_terms["pos"].iloc[15:]
    self.eval_neg_ids = []
    self.eval_pos_ids = []
    for neg_term in neg_terms:
      neg_term = neg_term.encode()
      if neg_term in opts.vocab_words:
        self.eval_neg_ids.append(self._word2id.get(neg_term, 0))
    self.eval_neg_ids = tf.constant(self.eval_neg_ids)
    for pos_term in pos_terms:
      pos_term = pos_term.encode()
      if pos_term in opts.vocab_words:
        self.eval_pos_ids.append(self._word2id.get(pos_term, 0))
    self.eval_pos_ids = tf.constant(self.eval_pos_ids)
  else:
    neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv",
                            names=["neg"])
    pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv",
                            names=["pos"])
    neg_terms = neg_terms["neg"].iloc[num_words:]
    pos_terms = pos_terms["pos"].iloc[num_words:]
    self.eval_neg_ids = []
    self.eval_pos_ids = []
    for neg_term in neg_terms:
      neg_term = neg_term.encode()
      if neg_term in opts.vocab_words:
        self.eval_neg_ids.append(self._word2id.get(neg_term, 0))
    self.eval_neg_ids = tf.constant(self.eval_neg_ids)
    for pos_term in pos_terms:
      pos_term = pos_term.encode()
      if pos_term in opts.vocab_words:
        self.eval_pos_ids.append(self._word2id.get(pos_term, 0))
    self.eval_pos_ids = tf.constant(self.eval_pos_ids)

  # Continue where the standard graph construction left off.
  # Declare all variables we need.
  # Input words embedding: [vocab_size, emb_dim]
  w_in = tf.Variable(
      tf.random_uniform([opts.vocab_size, opts.emb_dim],
                        -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
      name="w_in")
  # Output words embedding: [vocab_size, emb_dim]
  w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")
  # Global step: scalar, i.e., shape [].
  global_step = tf.Variable(0, name="global_step")
  # Linear learning rate decay.
  words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
  lr = opts.learning_rate * tf.maximum(
      0.0001,
      1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)
  # Training nodes.
  inc = global_step.assign_add(1)
  with tf.control_dependencies([inc]):
    train = word2vec.neg_train(w_in, w_out, examples, labels, lr,
                               vocab_count=opts.vocab_counts.tolist(),
                               num_negative_samples=opts.num_samples)
  self._w_in = w_in
  self._examples = examples
  self._labels = labels
  self._lr = lr
  self._train = train
  self.step = global_step
  self._epoch = current_epoch
  self._words = total_words_processed
  # Training node with the antonym loss.
  loss2 = self.antonym_loss_and_optimize()
  self._loss2 = loss2
def build_graph(self):
  """Build the graph for the full model."""
  opts = self._options
  # The training data. A text file.
  (words, counts, words_per_epoch, self._epoch, self._words, examples,
   labels) = word2vec.skipgram(filename=opts.train_data,
                               batch_size=opts.batch_size,
                               window_size=opts.window_size,
                               min_count=opts.min_count,
                               subsample=opts.subsample)

  # NEW: read the sampling corpus (all .txt/.tok files in the same directory
  # as train_data, except the training data itself).
  full_path = os.path.realpath(opts.train_data)
  path, filename = os.path.split(full_path)
  sampling_files = []
  for fname in os.listdir(path):
    if (fname.endswith(".txt") or fname.endswith(".tok")) and fname != filename:
      sampling_files.append(path + "/" + fname)
  print("Files for sampling: ", ", ".join(sampling_files))
  # Write one file as the concatenation of all sampling files, and another
  # that also appends the training data.
  sample_data = opts.train_data + ".sample"
  sample_train_data = sample_data + ".train"
  o = codecs.open(sample_data, "w", "utf8")
  oo = codecs.open(sample_train_data, "w", "utf8")
  for sampling_file in sampling_files:
    f = open(sampling_file, "r")
    t = f.read()
    o.write(t.decode("utf8") + " ")  # concatenate all sampling files
    oo.write(t.decode("utf8") + " ")
    f.close()
  o.close()
  t = codecs.open(opts.train_data, "r", "utf8")
  oo.write(t.read())
  t.close()
  oo.close()

  # The sampling data. A text file.
  (words_samples, counts_samples, words_per_epoch_samples, b_epoch_samples,
   b_words_samples, examples_samples,
   labels_samples) = word2vec.skipgram(filename=sample_data,
                                       batch_size=opts.batch_size,
                                       window_size=opts.window_size,
                                       min_count=opts.min_count,
                                       subsample=opts.subsample)
  # Sampling plus training data, used to get the full vocabulary for the
  # embeddings.
  (words_samples_train, counts_samples_train, words_per_epoch_samples_train,
   b_epoch_samples_train, b_words_samples_train, examples_samples_train,
   labels_samples_train) = word2vec.skipgram(filename=sample_train_data,
                                             batch_size=opts.batch_size,
                                             window_size=opts.window_size,
                                             min_count=opts.min_count,
                                             subsample=opts.subsample)
  (opts.all_words, opts.all_counts, all_words_per_epoch) = self._session.run(
      [words_samples_train, counts_samples_train, words_per_epoch])
  (opts.sample_words, opts.sample_counts,
   sample_words_per_epoch) = self._session.run(
       [words_samples, counts_samples, words_per_epoch])
  # First add the sample words ...
  for s in opts.sample_words:
    last_index = len(self._word2id)
    self._word2id.setdefault(s, last_index)
  (opts.vocab_words, opts.vocab_counts,
   opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
  # ... then add the training words.
  for v in opts.vocab_words:
    last_index = len(self._word2id)
    self._word2id.setdefault(v, last_index)
  print("Word2id: ", self._word2id)
  opts.vocab_size = len(self._word2id)
  # NOTE: wc20(train) + wc(sample) != wc20(train+sample), therefore use
  # word2id (a proper union).
  print("Sample file: ", sample_data)
  print("Data file: ", opts.train_data)
  print("Vocab size: ", opts.vocab_size - 1, " + UNK")
  print("Words per epoch: ", opts.words_per_epoch)
  self._examples = examples_samples
  self._labels = labels_samples
  for (w, i) in self._word2id.iteritems():
    self._id2word[i] = w
  print("id2word: ", self._id2word)
  true_logits, sampled_logits = self.forward(examples_samples, labels_samples)
  loss = self.nce_loss(true_logits, sampled_logits)
  tf.scalar_summary("NCE loss", loss)
  self._loss = loss
  self.optimize(loss)
  # Properly initialize all variables.
  tf.initialize_all_variables().run()
  self.saver = tf.train.Saver()