def build_graph(self):
    """Build the skip-gram training graph and store its nodes on ``self``.

    Creates the ``word2vec.skipgram`` input pipeline, obtains the vocabulary
    (from ``self.load_vocab()`` when ``vocab.txt`` exists in
    ``word_config.output_dir``, otherwise by running the pipeline's
    vocabulary tensors), fills the word/id lookup tables, creates the input
    and output embedding variables, and wires a ``word2vec.neg_train`` step
    that also increments ``global_step``.
    """
    # Input pipeline: vocabulary tensors, progress counters and the
    # (example, label) training pairs.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(
         filename=word_config.train_data_path,
         batch_size=word_config.batch_size,
         window_size=word_config.window_size,
         min_count=word_config.min_count,
         subsample=word_config.subsample)

    # Prefer a vocabulary that was already written to the output directory;
    # otherwise materialise it from the pipeline.
    if tf.gfile.Exists(os.path.join(word_config.output_dir, 'vocab.txt')):
        vocab_words, vocab_counts = self.load_vocab()
    else:
        vocab_words, vocab_counts = self._sess.run([words, counts])
    vocab_size = len(vocab_words)
    print("Data file: ", word_config.train_data_path)
    print("Vocab size: ", vocab_size - 1, " + UNK")

    # Lookup tables in both directions. The loop variable is deliberately
    # not called ``id`` so the builtin is not shadowed.
    self._id2word = vocab_words
    for word_id, word in enumerate(self._id2word):
        self._word2id[word] = word_id

    # Input embeddings start uniformly in +/- 0.5 / embed_size; output
    # embeddings start at zero.
    w_embed_in = tf.Variable(
        tf.random_uniform([vocab_size, word_config.embed_size],
                          -0.5 / word_config.embed_size,
                          0.5 / word_config.embed_size),
        name="w_embed_in")
    w_embed_out = tf.Variable(
        tf.zeros([vocab_size, word_config.embed_size]), name="w_embed_out")
    self.param_summary(w_embed_in)
    self.param_summary(w_embed_out)

    global_step = tf.Variable(0, trainable=False, name="global_step")

    # Linear learning-rate decay with a floor of 0.0001 of the base rate.
    # ``words_per_epoch`` is still a tensor here (it is never fetched), so
    # ``total_words`` is a tensor too. Both operands are cast to float
    # *before* dividing: dividing the raw counters would floor-divide under
    # Python 2 division semantics and keep the rate pinned at its initial
    # value.
    total_words = words_per_epoch * word_config.max_steps
    progress = (tf.cast(total_words_processed, tf.float32) /
                tf.cast(total_words, tf.float32))
    learning_rate = word_config.learning_rate * tf.maximum(
        0.0001, 1.0 - progress)

    # Every training step first bumps the global step.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        # NOTE(review): relies on vocab_counts exposing .tolist() in both
        # vocabulary branches above -- confirm load_vocab returns an array.
        train = word2vec.neg_train(w_embed_in,
                                   w_embed_out,
                                   examples,
                                   labels,
                                   learning_rate,
                                   vocab_counts.tolist(),
                                   word_config.nr_neg_samples)

    self._vocab_words = vocab_words
    self._vocab_counts = vocab_counts
    self._vocab_size = vocab_size
    self._w_embed_in = w_embed_in
    self._w_embed_out = w_embed_out
    self._train = train
    self._examples = examples
    self._labels = labels
    self._global_step = global_step
    self._current_epoch = current_epoch
    self._total_words_processed = total_words_processed
    self._learning_rate = learning_rate
    print("end of build graph")
def build_graph(self):
    """Assemble the skip-gram training graph and keep its handles on the model."""
    options = self._options

    # Input pipeline: vocabulary tensors, progress counters and training
    # pairs produced from the text file named by options.train_data.
    pipeline = word2vec.skipgram(filename=options.train_data,
                                 batch_size=options.batch_size,
                                 window_size=options.window_size,
                                 min_count=options.min_count,
                                 subsample=options.subsample)
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = pipeline

    # Fetch the vocabulary statistics once so they are plain values below.
    fetched = self._session.run([words, counts, words_per_epoch])
    (options.vocab_words, options.vocab_counts,
     options.words_per_epoch) = fetched
    options.vocab_size = len(options.vocab_words)
    print("Data file: ", options.train_data)
    print("Vocab size: ", options.vocab_size - 1, " + UNK")
    print("Words per epoch: ", options.words_per_epoch)

    # id -> word is the fetched vocabulary itself; word -> id is filled here.
    self._id2word = options.vocab_words
    for index, token in enumerate(self._id2word):
        self._word2id[token] = index

    # Input word embeddings: [vocab_size, emb_dim], uniform in
    # [-0.5 / emb_dim, 0.5 / emb_dim).
    table_shape = [options.vocab_size, options.emb_dim]
    low = -0.5 / options.emb_dim
    high = 0.5 / options.emb_dim
    initial_in = tf.random_uniform(table_shape, low, high)
    w_in = tf.Variable(initial_in, name="w_in")

    # Output word embeddings: [vocab_size, emb_dim], all zeros.
    initial_out = tf.zeros([options.vocab_size, options.emb_dim])
    w_out = tf.Variable(initial_out, name="w_out")

    # Global step: scalar counter.
    global_step = tf.Variable(0, name="global_step")

    # Learning rate decays linearly with the fraction of words consumed and
    # never drops below 0.0001 of the base rate.
    words_to_train = float(options.words_per_epoch * options.epochs_to_train)
    fraction_done = (tf.cast(total_words_processed, tf.float32) /
                     words_to_train)
    decay = tf.maximum(0.0001, 1.0 - fraction_done)
    lr = options.learning_rate * decay

    # The training op is gated on the global-step increment.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        train = word2vec.neg_train(
            w_in,
            w_out,
            examples,
            labels,
            lr,
            vocab_count=options.vocab_counts.tolist(),
            num_negative_samples=options.num_samples)

    # Handles used by the rest of the model.
    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed
def build_graph(self):
    """Create the word2vec graph: data pipeline, embeddings and train op."""
    cfg = self._options

    # Skip-gram reader over the training text file.
    (vocab_t, counts_t, per_epoch_t, epoch_t, processed_t,
     example_ids, label_ids) = word2vec.skipgram(
         filename=cfg.train_data,
         batch_size=cfg.batch_size,
         window_size=cfg.window_size,
         min_count=cfg.min_count,
         subsample=cfg.subsample)

    # Pull the vocabulary, its counts and the epoch size out of the graph.
    vocab_values = self._session.run([vocab_t, counts_t, per_epoch_t])
    cfg.vocab_words, cfg.vocab_counts, cfg.words_per_epoch = vocab_values
    cfg.vocab_size = len(cfg.vocab_words)
    print("Data file: ", cfg.train_data)
    print("Vocab size: ", cfg.vocab_size - 1, " + UNK")
    print("Words per epoch: ", cfg.words_per_epoch)

    # Lookup tables in both directions.
    self._id2word = cfg.vocab_words
    for position, entry in enumerate(self._id2word):
        self._word2id[entry] = position

    # Input embeddings, [vocab_size, emb_dim], small uniform init.
    w_in = tf.Variable(
        tf.random_uniform([cfg.vocab_size, cfg.emb_dim],
                          -0.5 / cfg.emb_dim,
                          0.5 / cfg.emb_dim),
        name="w_in")

    # Output embeddings, [vocab_size, emb_dim], zero init.
    w_out = tf.Variable(
        tf.zeros([cfg.vocab_size, cfg.emb_dim]),
        name="w_out")

    # Scalar step counter.
    global_step = tf.Variable(0, name="global_step")

    # Linear decay of the learning rate, floored at 0.0001 of its base value.
    words_to_train = float(cfg.words_per_epoch * cfg.epochs_to_train)
    remaining = 1.0 - tf.cast(processed_t, tf.float32) / words_to_train
    lr = cfg.learning_rate * tf.maximum(0.0001, remaining)

    # Each training step increments the counter first.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        train = word2vec.neg_train(w_in, w_out, example_ids, label_ids, lr,
                                   vocab_count=cfg.vocab_counts.tolist(),
                                   num_negative_samples=cfg.num_samples)

    # Expose the nodes other methods need.
    self._w_in = w_in
    self._examples = example_ids
    self._labels = label_ids
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = epoch_t
    self._words = processed_t
def build_graph(self):
    """Build the joint word / entity / anchor training graph.

    Sets up three input pipelines (``word2vec.skipgram`` over text,
    ``kg_skipgram`` over the knowledge graph, ``align_model`` over anchors),
    one shared embedding table in which ids below ``opts.vocab_word_size``
    are words and the remaining ids are entities, and one negative-sampling
    training op per pipeline. Vocabulary sizes and per-epoch counts are
    written to ``self._options``; training nodes and progress counters are
    stored on ``self``.
    """
    opts = self._options

    def _add_shard(rows, suffix):
        """Append one [rows, emb_dim] slice of the input and output tables."""
        self._v_in.append(tf.Variable(
            tf.random_uniform([rows, opts.emb_dim],
                              -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
            name="v_in" + suffix))
        self._v_out.append(tf.Variable(
            tf.zeros([rows, opts.emb_dim]), name="v_out" + suffix))

    def _full_table(shards):
        """Return the whole table: the single variable, or the joined shards."""
        # tf.concat(0, values) is the (dim, values) argument order already
        # used by this file.
        return tf.concat(0, shards) if self._is_shard else shards[0]

    # The training data for text skipgram. A text file.
    (words, w_counts, words_per_epoch, w_current_epoch,
     total_words_processed, w_examples, w_labels) = word2vec.skipgram(
         filename=opts.text_data,
         batch_size=opts.batch_size,
         window_size=opts.window_size,
         min_count=opts.min_count,
         subsample=opts.subsample)

    # The training data for entity skipgram.
    (entities, e_counts, entities_per_epoch, e_current_epoch,
     total_entities_processed, e_examples, e_labels) = kg_skipgram(
         filename=opts.kg_data,
         batch_size=opts.batch_size,
         min_count=opts.min_count)

    (opts.vocab_words, vocab_word_counts, opts.words_per_epoch,
     opts.vocab_entities, vocab_entity_counts,
     opts.entities_per_epoch) = self._session.run(
         [words, w_counts, words_per_epoch,
          entities, e_counts, entities_per_epoch])

    # The training data for align anchor skipgram; it needs the fetched
    # vocabularies, so it is created after the session run above.
    (anchors_per_epoch, a_current_epoch, total_anchors_processed,
     a_examples, a_labels) = align_model(
         filename=opts.anchor_data,
         batch_size=opts.batch_size,
         window_size=opts.window_size,
         subsample=opts.subsample,
         vocab_word=opts.vocab_words.tolist(),
         vocab_word_freq=vocab_word_counts.tolist(),
         vocab_entity=opts.vocab_entities.tolist())

    opts.vocab_word_size = len(opts.vocab_words)
    opts.vocab_entity_size = len(opts.vocab_entities)
    opts.vocab_size = opts.vocab_word_size + opts.vocab_entity_size
    opts.anchors_per_epoch = self._session.run(anchors_per_epoch)

    # Count vectors over the joint id space for negative sampling:
    # [word counts, 0...0] and [0...0, entity counts]. The fetched counts are
    # already arrays, so the padding is done with numpy, keeping their dtype,
    # instead of adding graph ops that need a default session to evaluate.
    opts.vocab_word_counts = np.concatenate(
        (vocab_word_counts,
         np.zeros(opts.vocab_entity_size, dtype=vocab_word_counts.dtype)))
    opts.vocab_entity_counts = np.concatenate(
        (np.zeros(opts.vocab_word_size, dtype=vocab_entity_counts.dtype),
         vocab_entity_counts))

    print("Text data file: ", opts.text_data)
    print("Word vocab size: ", opts.vocab_word_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)
    print("Entity data file: ", opts.kg_data)
    print("Entity vocab size: ", opts.vocab_entity_size)
    print("Entities per epoch: ", opts.entities_per_epoch)
    print("Anchor data file: ", opts.anchor_data)
    print("Anchors per epoch: ", opts.anchors_per_epoch)

    # Ids below opts.vocab_word_size are words, the others are entities.
    self._id2item = np.concatenate((opts.vocab_words, opts.vocab_entities), 0)
    for i, w in enumerate(self._id2item):
        self._item2id[w] = i

    # Embedding tables covering both words and entities: [vocab_size, emb_dim].
    # Per the original author's note, a table is sharded when it would exceed
    # the saver's ~2 GB limit ("2048/4-1 mb" of values). Floor division keeps
    # the row limit and shard count integral regardless of division semantics.
    self._is_shard = False
    vocab_size_limit = 511000000 // opts.emb_dim
    if opts.vocab_size > vocab_size_limit:
        self._is_shard = True
        full_shards, remain_size = divmod(opts.vocab_size, vocab_size_limit)
        for i in range(full_shards):
            # The shard index must be converted to text: "v_in" + i raised
            # TypeError.
            _add_shard(vocab_size_limit, str(i))
        if remain_size != 0:
            _add_shard(remain_size, str(len(self._v_in)))
    else:
        _add_shard(opts.vocab_size, "")

    # Global step: []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay, driven by text words processed only and
    # floored at 0.0001 of the base rate. All three trainers share it.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes; each one is gated on the global-step increment.
    # NOTE(review): in the sharded case the trainers receive a concat result
    # rather than a variable -- confirm neg_train accepts that.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        # Text: word ids need no offset.
        w_train = word2vec.neg_train(
            _full_table(self._v_in),
            _full_table(self._v_out),
            w_examples,
            w_labels,
            lr,
            vocab_count=opts.vocab_word_counts.tolist(),
            num_negative_samples=opts.num_samples)
        # Knowledge graph: both sides are entities, shifted past the words.
        e_train = neg_train(
            _full_table(self._v_in),
            _full_table(self._v_out),
            e_examples + opts.vocab_word_size,
            e_labels + opts.vocab_word_size,
            lr,
            vocab_count=opts.vocab_entity_counts.tolist(),
            num_negative_samples=opts.num_samples)
        # Anchors: examples are shifted into the entity range, labels are
        # left as word ids and negatives are drawn from the word counts.
        a_train = word2vec.neg_train(
            _full_table(self._v_in),
            _full_table(self._v_out),
            a_examples + opts.vocab_word_size,
            a_labels,
            lr,
            vocab_count=opts.vocab_word_counts.tolist(),
            num_negative_samples=opts.num_samples)

    self._lr = lr
    self._train_text = w_train
    self._train_kg = e_train
    self._train_align = a_train
    self.step = global_step
    self._epoch_text = w_current_epoch
    self._epoch_kg = e_current_epoch
    self._epoch_align = a_current_epoch
    self._words = total_words_processed
    self._entities = total_entities_processed
    self._anchors = total_anchors_processed
def build_graph(self):
    """Build the skip-gram graph plus the polarity-lexicon id constants.

    Besides the usual ``word2vec.skipgram`` pipeline, embedding variables
    and ``word2vec.neg_train`` op, this method loads one positive/negative
    training lexicon (selected by the hard-coded flags below), keeps the
    terms that occur in the vocabulary, and stores their ids as constants
    (``self.pos_ids`` / ``self.neg_ids``). A second pair of held-out id
    constants (``self.eval_pos_ids`` / ``self.eval_neg_ids``) is built for
    evaluation. Finally ``self.antonym_loss_and_optimize()`` is called and
    its result stored in ``self._loss2``.

    Raises:
        ValueError: if none of the lexicon flags is enabled.
    """
    opts = self._options

    def _ids_in_vocab(terms):
        """Return (encoded terms present in the vocabulary, their ids).

        Terms are encoded before the lookup, as in every lookup in this
        method. Membership is tested against ``self._word2id``, which is
        filled from ``opts.vocab_words`` below, so a single dict lookup
        replaces a scan of the vocabulary array per term.
        """
        found_terms, found_ids = [], []
        for term in terms:
            term = term.encode()
            term_id = self._word2id.get(term)
            if term_id is not None:
                found_terms.append(term)
                found_ids.append(term_id)
        return found_terms, found_ids

    def _id_constant(ids):
        """Wrap a list of ids as an int32 constant (also when it is empty)."""
        # Without an explicit dtype an empty list would become a float32
        # constant; non-empty lists of Python ints are int32 either way.
        return tf.constant(ids, dtype=tf.int32)

    # The training data. A text file.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                           batch_size=opts.batch_size,
                                           window_size=opts.window_size,
                                           min_count=opts.min_count,
                                           subsample=opts.subsample)
    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run(
         [words, counts, words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)
    print("Data file: ", opts.train_data)
    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)
    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
        self._word2id[w] = i

    # ---- Training lexicon selection (first enabled flag wins) ----
    SOCIAL = False
    SOL = True
    num_words = 100
    LOVEHATE = False
    FIN = False
    BULLBEAR = False

    if SOCIAL:
        # Use one of the social polar lexicons.
        fileNames = [
            "./data/train_lexicons/10_social_employment_opportunities.txt",
            "./data/train_lexicons/10_social_freedom_from_discrimination.txt",
            "./data/train_lexicons/10_social_good_education.txt",
            "./data/train_lexicons/10_social_honest_and_responsive_government.txt",
            "./data/train_lexicons/10_social_political_freedom.txt"
        ]
        social_issue_to_use = 0
        polarity_dict = pd.read_csv(fileNames[social_issue_to_use],
                                    header=None, names=["pos", "neg"])
        neg_terms = polarity_dict["neg"]
        pos_terms = polarity_dict["pos"]
    elif SOL:
        # Use the stock opinion lexicon: words in "w1", scores in "v1".
        neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv",
                                header=0)
        pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv",
                                header=0)
        ordered = True
        if ordered:
            # Lowest-scored negatives / highest-scored positives first.
            neg_terms = neg_terms.sort_values(
                ["v1"], axis=0, ascending=True)["w1"].iloc[:num_words]
            pos_terms = pos_terms.sort_values(
                ["v1"], axis=0, ascending=False)["w1"].iloc[:num_words]
        else:
            neg_terms = neg_terms.sample(n=num_words)["w1"]
            pos_terms = pos_terms.sample(n=num_words)["w1"]
    elif LOVEHATE:
        # Use the love-hate lexicon.
        justlovehate = True
        neg_terms = pd.read_csv("./data/train_lexicons/hate.txt",
                                names=["neg"])
        pos_terms = pd.read_csv("./data/train_lexicons/love.txt",
                                names=["pos"])
        if justlovehate:
            # Slice (not index) the last row: ``.iloc[-1]`` yields a single
            # string, which the lookup loop would then walk character by
            # character.
            neg_terms = neg_terms["neg"].iloc[-1:]
            pos_terms = pos_terms["pos"].iloc[-1:]
        else:
            neg_terms = neg_terms["neg"].iloc[15:]
            pos_terms = pos_terms["pos"].iloc[15:]
    elif FIN:
        # Use the FIN lexicon.
        neg_terms = pd.read_csv("./data/train_lexicons/fin_negatives.csv",
                                header=0, index_col=0)["negs"]
        pos_terms = pd.read_csv("./data/train_lexicons/fin_positives.csv",
                                header=0, index_col=0)["poss"]
    elif BULLBEAR:
        neg_terms = ["bearish"]
        pos_terms = ["bullish"]
    else:
        raise ValueError("no training lexicon selected")

    # Training terms that actually occur in the vocabulary, and their ids.
    self.neg_terms_in_vocab, neg_ids = _ids_in_vocab(neg_terms)
    self.neg_ids = _id_constant(neg_ids)
    self.pos_terms_in_vocab, pos_ids = _ids_in_vocab(pos_terms)
    self.pos_ids = _id_constant(pos_ids)

    # ---- Evaluation ids ----
    if LOVEHATE:
        # Single-word evaluation pair; the keys are encoded like every other
        # term looked up in this method. Unknown words fall back to id 0.
        self.eval_neg_id = tf.constant([self._word2id.get("hate".encode(), 0)])
        self.eval_pos_id = tf.constant([self._word2id.get("love".encode(), 0)])
        # Eval neg_ids and pos_ids: rows 15 onward of the love/hate files.
        eval_neg_terms = pd.read_csv("./data/train_lexicons/hate.txt",
                                     names=["neg"])["neg"].iloc[15:]
        eval_pos_terms = pd.read_csv("./data/train_lexicons/love.txt",
                                     names=["pos"])["pos"].iloc[15:]
    else:
        # Every non-LOVEHATE lexicon evaluates on the stock opinion lexicon,
        # skipping the first num_words rows.
        # NOTE(review): these files are read with header=0 and columns
        # "w1"/"v1" above but with names=["neg"]/["pos"] and no sort here --
        # confirm the intended column and that the file order matches the
        # sorted training split.
        eval_neg_terms = pd.read_csv(
            "./data/train_lexicons/sol_train_neg.csv",
            names=["neg"])["neg"].iloc[num_words:]
        eval_pos_terms = pd.read_csv(
            "./data/train_lexicons/sol_train_pos.csv",
            names=["pos"])["pos"].iloc[num_words:]

    _, eval_neg_ids = _ids_in_vocab(eval_neg_terms)
    self.eval_neg_ids = _id_constant(eval_neg_ids)
    _, eval_pos_ids = _ids_in_vocab(eval_pos_terms)
    self.eval_pos_ids = _id_constant(eval_pos_ids)

    # ---- Model variables ----
    # Input words embedding: [vocab_size, emb_dim]
    w_in = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim],
                          -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="w_in")

    # Output words embedding: [vocab_size, emb_dim]
    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                        name="w_out")

    # Global step: []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay, floored at 0.0001 of the base rate.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes; the train op is gated on the global-step increment.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        train = word2vec.neg_train(w_in,
                                   w_out,
                                   examples,
                                   labels,
                                   lr,
                                   vocab_count=opts.vocab_counts.tolist(),
                                   num_negative_samples=opts.num_samples)

    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed

    # Train nodes with antonyms.
    loss2 = self.antonym_loss_and_optimize()
    self._loss2 = loss2
#global_step = tf.Variable(0, trainable=False) learning_rate_word2vec = tf.Variable(float(lr_rate), trainable=False) selected_sense_output_indices = tf.placeholder(tf.int32, None, name='selected_sense_output_indices') selected_sense_input_indices = tf.placeholder(tf.int32, None, name='selected_sense_input_indices') # [batch_size, sense_dim, sense_embedding_dim] embedded_sense_input = tf.nn.embedding_lookup(s_in, selected_sense_input_indices) embedded_sense_output = tf.nn.embedding_lookup(s_out, selected_sense_output_indices) reward_sense_prob = tf.sigmoid(tf.reduce_sum(tf.mul(embedded_sense_input, embedded_sense_output), 1)) print 'embedded_sense_input:shape=',embedded_sense_input inc = total_words_processed.assign_add(batch_size) with tf.control_dependencies([inc]): train = word2vec.neg_train(s_in,s_out,selected_sense_input_indices,selected_sense_output_indices,\ learning_rate_word2vec,vocab_count=sense_counts,num_negative_samples=samp_size) init_word2vec = tf.initialize_all_variables() with tf.variable_scope("RLWE"): w_out = tf.Variable(tf.zeros([sense_size, embedding_dim]),\ trainable=True, name="word_outputs") w_in = tf.Variable(tf.random_uniform([vocab_size, embedding_dim],-(3./embedding_dim)**0.5,(3./embedding_dim)**0.5),\ trainable=True, name="word_embeddings") global_step = tf.Variable(0, trainable=False) learning_rate = tf.Variable(float(lr_rate), trainable=False) context_indices = tf.placeholder(tf.int32, [context_window*2+batch_size, max_context_length]) sense_indices = tf.placeholder(tf.int32, [(context_window*2+batch_size) * sense_dim])