Example #1
  def build_graph(self):
    """Build the graph for the full model."""
    opts = self._options
    # The training data. A text file.
    (words, counts, words_per_epoch, self._epoch, self._words, examples,
     labels) = word2vec.skipgram(filename=opts.train_data,
                                 batch_size=opts.batch_size,
                                 window_size=opts.window_size,
                                 min_count=opts.min_count,
                                 subsample=opts.subsample)
    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)
    print("Data file: ", opts.train_data)
    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)
    self._examples = examples
    self._labels = labels
    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
      self._word2id[w] = i
    true_logits, sampled_logits = self.forward(examples, labels)
    loss = self.nce_loss(true_logits, sampled_logits)
    tf.scalar_summary("NCE loss", loss)
    self._loss = loss
    self.optimize(loss)

    # Properly initialize all variables.
    tf.initialize_all_variables().run()

    self.saver = tf.train.Saver()
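A minimal driver sketch for examples in this style, assuming a Word2Vec class whose constructor calls build_graph() and which exposes a train() method (the class name, option fields, and save path are assumptions, not taken from the example code):

import tensorflow as tf

def main(opts):
  # Hypothetical driver (TF 0.x style); Word2Vec and the opts fields are assumed.
  with tf.Graph().as_default(), tf.Session() as session:
    model = Word2Vec(opts, session)        # __init__ is assumed to call build_graph()
    for _ in range(opts.epochs_to_train):
      model.train()                        # one pass over the skipgram batches
    model.saver.save(session, opts.save_path)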
Example #2
    def build_graph(self):
        """Build the graph for the full model."""
        opts = self._options
        # The training data. A text file.
        (words, counts, words_per_epoch, self._epoch, self._words, examples,
         labels) = word2vec.skipgram(filename=opts.train_data,
                                     batch_size=opts.batch_size,
                                     window_size=opts.window_size,
                                     min_count=opts.min_count,
                                     subsample=opts.subsample)
        (opts.vocab_words, opts.vocab_counts,
         opts.words_per_epoch) = self._session.run(
             [words, counts, words_per_epoch])
        opts.vocab_size = len(opts.vocab_words)
        print("Data file: ", opts.train_data)
        print("Vocab size: ", opts.vocab_size - 1, " + UNK")
        print("Words per epoch: ", opts.words_per_epoch)
        self._examples = examples
        self._labels = labels
        self._id2word = opts.vocab_words
        for i, w in enumerate(self._id2word):
            self._word2id[w] = i
        true_logits, sampled_logits = self.forward(examples, labels)
        loss = self.nce_loss(true_logits, sampled_logits)
        tf.scalar_summary("NCE loss", loss)
        self._loss = loss
        self.optimize(loss)

        # Properly initialize all variables.
        tf.initialize_all_variables().run()

        self.saver = tf.train.Saver()
Example #3
  def build_graph(self):
    #get the training data
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=word_config.train_data_path,
                                           batch_size=word_config.batch_size,
                                           window_size=word_config.window_size,
                                           min_count=word_config.min_count,
                                           subsample=word_config.subsample)
    # vocab_words, vocab_counts, words_per_epoch = self._sess.run([words, counts, words_per_epoch])
    if tf.gfile.Exists(os.path.join(word_config.output_dir, 'vocab.txt')):
      vocab_words, vocab_counts = self.load_vocab()
    else:
      vocab_words, vocab_counts = self._sess.run([words, counts])

    vocab_size = len(vocab_words)
    print("Data file: ", word_config.train_data_path)
    print("Vocab size: ", vocab_size - 1, " + UNK")
    # print("Words per epoch: ", words_per_epoch)

    self._id2word = vocab_words
    for id, word in enumerate(self._id2word):
      self._word2id[word] = id

    w_embed_in = tf.Variable(tf.random_uniform([vocab_size, word_config.embed_size],
                                                -0.5 / word_config.embed_size, 0.5 / word_config.embed_size),
                             name="w_embed_in")
    w_embed_out = tf.Variable(tf.zeros([vocab_size, word_config.embed_size]), name="w_embed_out")

    self.param_summary(w_embed_in)
    self.param_summary(w_embed_out)

    # learning_rate = tf.Variable(word_config.learning_rate, trainable=False, name="learning_rate")

    global_step = tf.Variable(0, trainable=False, name="global_step")

    total_words = words_per_epoch * word_config.max_steps

    # Linear decay: cast to float before dividing to avoid integer division on the count tensors.
    learning_rate = word_config.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / tf.cast(total_words, tf.float32))

    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train(w_embed_in, w_embed_out, examples, labels, learning_rate, vocab_counts.tolist(),
                                 word_config.nr_neg_samples)

    self._vocab_words = vocab_words
    self._vocab_counts = vocab_counts
    self._vocab_size = vocab_size
    self._w_embed_in = w_embed_in
    self._w_embed_out = w_embed_out
    self._train = train
    self._examples = examples
    self._labels = labels
    self._global_step = global_step
    self._current_epoch = current_epoch
    self._total_words_processed = total_words_processed
    self._learning_rate = learning_rate
    print("end of build graph")
  def build_graph(self):
	"""Build the model graph."""
	opts = self._options

	# The training data. A text file.
	(words, counts, words_per_epoch, current_epoch, total_words_processed,
	 examples, labels) = word2vec.skipgram(filename=opts.train_data,
										   batch_size=opts.batch_size,
										   window_size=opts.window_size,
										   min_count=opts.min_count,
										   subsample=opts.subsample)
			
	(opts.vocab_words, opts.vocab_counts, opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
	opts.vocab_size = len(opts.vocab_words)
	print("Data file: ", opts.train_data)
	print("Vocab size: ", opts.vocab_size - 1, " + UNK")
	print("Words per epoch: ", opts.words_per_epoch)

	self._id2word = opts.vocab_words
	for i, w in enumerate(self._id2word):
	  self._word2id[w] = i

	# Declare all variables we need.
	# Input words embedding: [vocab_size, emb_dim]
	w_in = tf.Variable(
		tf.random_uniform([opts.vocab_size,opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim), name="w_in")

	# Output words embedding: [vocab_size, emb_dim]
	w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")

	# Global step: []
	global_step = tf.Variable(0, name="global_step")

	# Linear learning rate decay.
	words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
	lr = opts.learning_rate * tf.maximum(0.0001, 1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

	# Training nodes.
	inc = global_step.assign_add(1)
	with tf.control_dependencies([inc]):
	  train = word2vec.neg_train(w_in,
								 w_out,
								 examples,
								 labels,
								 lr,
								 vocab_count=opts.vocab_counts.tolist(),
								 num_negative_samples=opts.num_samples)

	self._w_in = w_in
	self._examples = examples
	self._labels = labels
	self._lr = lr
	self._train = train
	self.step = global_step
	self._epoch = current_epoch
	self._words = total_words_processed
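A sketch of how a graph like this could be driven, assuming the class keeps the session in self._session (the helper name and loop structure are assumptions, not from the example); it repeatedly runs the neg_train op and polls the epoch counter emitted by word2vec.skipgram:

  def train_one_epoch(self):
    # Hypothetical helper: run the neg_train op until the epoch counter produced
    # by word2vec.skipgram advances, i.e. one full pass over the training file.
    start_epoch = self._session.run(self._epoch)
    while True:
      _, epoch = self._session.run([self._train, self._epoch])
      if epoch != start_epoch:
        break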
Example #5
    def build_graph(self):
        """Build the graph for the full model."""
        opts = self._options
        # The training data. A text file.
        (words, counts, words_per_epoch, self._epoch, self._words, examples,
         labels) = word2vec.skipgram(filename=opts.train_data,
                                     batch_size=opts.batch_size,
                                     window_size=opts.window_size,
                                     min_count=opts.min_count,
                                     subsample=opts.subsample)
        (opts.vocab_words, opts.vocab_counts,
         opts.words_per_epoch) = self._session.run(
             [words, counts, words_per_epoch])
        opts.vocab_size = len(opts.vocab_words)
        print("Data file: ", opts.train_data)
        print("Vocab size: ", opts.vocab_size - 1, " + UNK")
        print("Words per epoch: ", opts.words_per_epoch)
        self._examples = examples
        self._labels = labels
        self._id2word = opts.vocab_words
        for i, w in enumerate(self._id2word):
            self._word2id[w] = i
        loss = self.calculate_loss(examples, labels)
        self._loss = loss

        if opts.normclip:
            self._clip_ops = self.clip_ops_graph(self._examples, self._labels,
                                                 self._neg_idxs)

        if opts.adagrad:
            print("Using Adagrad as an optimizer!")
            self.optimize_adagrad(loss)
        else:
            # Using Standard SGD
            self.optimize(loss)

        tf.scalar_summary('learning rate', self._lr)

        # Properly initialize all variables.
        self.check_op = tf.add_check_numerics_ops()

        tf.initialize_all_variables().run()

        try:
            print('Try using saver version v2')
            self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                        max_to_keep=opts.max_to_keep)
        except:
            print('Default to saver version v1')
            self.saver = tf.train.Saver(max_to_keep=opts.max_to_keep)
Example #6
 def build_graph(self):
     opts = self._options
     (words, counts, words_per_epoch, self._epoch, self._words, examples,
     labels) = word2vec.skipgram(filename="text8",
                                 batch_size=opts.batch_size,
                                 window_size=opts.window_size,
                                 min_count=opts.min_count,
                                 subsample=0)
     (opts.vocab_words, opts.vocab_counts,
      opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
     opts.vocab_size = len(opts.vocab_words)
     print("Data file: ", opts.train_data)
     print("Vocab size: ", opts.vocab_size - 1, " + UNK")
     print("Words per epoch: ", opts.words_per_epoch)
     self._examples = examples
     self._labels = labels
     self._id2word = opts.vocab_words
     for i, w in enumerate(self._id2word):
         self._word2id[w] = i
     true_logits, sampled_logits = self.forward(examples, labels)
     loss = self.nce_loss(true_logits, sampled_logits)
     tf.scalar_summary("NCE loss", loss)
     self._loss = loss
     self.optimize(loss)
Example #7
  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    # The training data. A text file.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                           batch_size=opts.batch_size,
                                           window_size=opts.window_size,
                                           min_count=opts.min_count,
                                           subsample=opts.subsample)
    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)
    print("Data file: ", opts.train_data)
    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)

    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
      self._word2id[w] = i

    # Declare all variables we need.
    # Input words embedding: [vocab_size, emb_dim]
    w_in = tf.Variable(
        tf.random_uniform(
            [opts.vocab_size,
             opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="w_in")

    # Output words embedding: [vocab_size, emb_dim]
    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]), name="w_out")

    # Global step: []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train(w_in,
                                 w_out,
                                 examples,
                                 labels,
                                 lr,
                                 vocab_count=opts.vocab_counts.tolist(),
                                 num_negative_samples=opts.num_samples)

    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed
Example #8
  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    # The training data for text skipgram. A text file.
    (words, w_counts, words_per_epoch, w_current_epoch, total_words_processed,
     w_examples, w_labels) = word2vec.skipgram(filename=opts.text_data,
                                               batch_size=opts.batch_size,
                                               window_size=opts.window_size,
                                               min_count=opts.min_count,
                                               subsample=opts.subsample)
    # the training data for entity skipgram
    (entities, e_counts, entities_per_epoch, e_current_epoch, total_entities_processed,
     e_examples, e_labels) = kg_skipgram(filename=opts.kg_data,
                                           batch_size=opts.batch_size,
                                           min_count=opts.min_count)

    (opts.vocab_words, vocab_word_counts, opts.words_per_epoch,
     opts.vocab_entities, vocab_entity_counts,
     opts.entities_per_epoch) = self._session.run(
         [words, w_counts, words_per_epoch, entities, e_counts, entities_per_epoch])

    # the training data for align anchor skipgram
    (anchors_per_epoch, a_current_epoch, total_anchors_processed,
     a_examples, a_labels) = align_model(filename=opts.anchor_data,
                                         batch_size=opts.batch_size,
                                         window_size=opts.window_size,
                                         subsample=opts.subsample,
                                         vocab_word=opts.vocab_words.tolist(),
                                         vocab_word_freq=vocab_word_counts.tolist(),
                                         vocab_entity=opts.vocab_entities.tolist())


    opts.vocab_word_size = len(opts.vocab_words)
    opts.vocab_entity_size = len(opts.vocab_entities)
    opts.vocab_size = opts.vocab_word_size+opts.vocab_entity_size
    opts.anchors_per_epoch = self._session.run(anchors_per_epoch)

    # for neg sample, [vocab_word_counts, 0...0] and [0...0, vocab_entity_counts]
    opts.vocab_word_counts = tf.concat(0,[vocab_word_counts,tf.zeros([opts.vocab_entity_size])]).eval()
    opts.vocab_entity_counts = tf.concat(0,[tf.zeros([opts.vocab_word_size]), vocab_entity_counts]).eval()

    print("Text data file: ", opts.text_data)
    print("Word vocab size: ", opts.vocab_word_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)

    print("Entity data file: ", opts.kg_data)
    print("Entity vocab size: ", opts.vocab_entity_size)
    print("Entities per epoch: ", opts.entities_per_epoch)
    print("Anchor data file: ", opts.anchor_data)
    print("Anchors per epoch: ", opts.anchors_per_epoch)

    # Indices i < opts.vocab_word_size are words; the remaining indices are entities.
    self._id2item = np.concatenate((opts.vocab_words, opts.vocab_entities), 0)
    for i, w in enumerate(self._id2item):
      self._item2id[w] = i

    # Declare all variables we need.
    # Input embedding including both words and entities: [vocab_size, emb_dim]
    # Shard the embedding variables if they would exceed ~2 GB, the Saver/protobuf limit.

    self._is_shard = False
    vocab_size_limit = 511000000 / opts.emb_dim
    # 511,000,000 float32 values * 4 bytes ≈ 2044 MB, i.e. (2048/4 - 1) million floats, just under 2 GB.
    if opts.vocab_size > vocab_size_limit:
      self._is_shard = True
      remain_size = opts.vocab_size
      for i in xrange(opts.vocab_size / vocab_size_limit):
        self._v_in.append(tf.Variable(
            tf.random_uniform([vocab_size_limit, opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
            name="v_in%d" % i))
        self._v_out.append(tf.Variable(tf.zeros([vocab_size_limit, opts.emb_dim]), name="v_out%d" % i))
        remain_size -= vocab_size_limit
      if remain_size != 0:
        tmp_cap = len(self._v_in)
        self._v_in.append(tf.Variable(
            tf.random_uniform([remain_size, opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
            name="v_in%d" % tmp_cap))
        self._v_out.append(tf.Variable(tf.zeros([remain_size, opts.emb_dim]), name="v_out%d" % tmp_cap))
    else:
      self._v_in.append(tf.Variable(tf.random_uniform([opts.vocab_size,opts.emb_dim], -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),name="v_in"))
      self._v_out.append(tf.Variable(tf.zeros([opts.vocab_size,opts.emb_dim]),name="v_out"))

    # Global step: []
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      w_train = word2vec.neg_train(tf.concat(0,self._v_in) if self._is_shard else self._v_in[0],
                                 tf.concat(0,self._v_out) if self._is_shard else self._v_out[0],
                                 w_examples,
                                 w_labels,
                                 lr,
                                 vocab_count=opts.vocab_word_counts.tolist(),
                                 num_negative_samples=opts.num_samples)
      e_train = neg_train(tf.concat(0,self._v_in) if self._is_shard else self._v_in[0],
                                 tf.concat(0,self._v_out) if self._is_shard else self._v_out[0],
                                 e_examples+opts.vocab_word_size,
                                 e_labels+opts.vocab_word_size,
                                 lr,
                                 vocab_count=opts.vocab_entity_counts.tolist(),
                                 num_negative_samples=opts.num_samples)
      a_train = word2vec.neg_train(
          tf.concat(0, self._v_in) if self._is_shard else self._v_in[0],
          tf.concat(0, self._v_out) if self._is_shard else self._v_out[0],
          a_examples + opts.vocab_word_size, a_labels, lr,
          vocab_count=opts.vocab_word_counts.tolist(),
          num_negative_samples=opts.num_samples)


    self._lr = lr
    self._train_text = w_train
    self._train_kg = e_train
    self._train_align = a_train
    self.step = global_step
    self._epoch_text = w_current_epoch
    self._epoch_kg = e_current_epoch
    self._epoch_align = a_current_epoch
    self._words = total_words_processed
    self._entities = total_entities_processed
    self._anchors = total_anchors_processed
Example #9
    def build_graph(self):
        """Build the model graph."""
        opts = self._options

        # The training data. A text file.
        (words, counts, words_per_epoch, current_epoch, total_words_processed,
         examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                               batch_size=opts.batch_size,
                                               window_size=opts.window_size,
                                               min_count=opts.min_count,
                                               subsample=opts.subsample)
        (opts.vocab_words, opts.vocab_counts,
         opts.words_per_epoch) = self._session.run(
             [words, counts, words_per_epoch])
        opts.vocab_size = len(opts.vocab_words)
        print("Data file: ", opts.train_data)
        print("Vocab size: ", opts.vocab_size - 1, " + UNK")
        print("Words per epoch: ", opts.words_per_epoch)

        self._id2word = opts.vocab_words
        for i, w in enumerate(self._id2word):
            self._word2id[w] = i

        # Interlude: load a polarity lexicon and collect its positive/negative terms that appear in the vocabulary.

        SOCIAL = False
        SOL = True
        num_words = 100
        LOVEHATE = False
        FIN = False
        BULLBEAR = False

        if (SOCIAL):  #use one of the social polar lexicons
            fileNames = [
                "./data/train_lexicons/10_social_employment_opportunities.txt",
                "./data/train_lexicons/10_social_freedom_from_discrimination.txt",
                "./data/train_lexicons/10_social_good_education.txt",
                "./data/train_lexicons/10_social_honest_and_responsive_government.txt",
                "./data/train_lexicons/10_social_political_freedom.txt"
            ]
            social_issue_to_use = 0
            polarity_dict = pd.read_csv(fileNames[social_issue_to_use],
                                        header=None,
                                        names=["pos", "neg"])
            neg_terms = polarity_dict["neg"]
            pos_terms = polarity_dict["pos"]

        elif (SOL):  #use the stock opinion lexicon
            neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv",
                                    header=0)
            pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv",
                                    header=0)
            ordered = True
            if (ordered):
                neg_terms = neg_terms.sort_values(
                    ["v1"], axis=0, ascending=True)["w1"].iloc[:num_words]
                pos_terms = pos_terms.sort_values(
                    ["v1"], axis=0, ascending=False)["w1"].iloc[:num_words]
            else:
                neg_terms = neg_terms.sample(n=num_words)["w1"]
                pos_terms = pos_terms.sample(n=num_words)["w1"]

        elif (LOVEHATE):  #use the love-hate lexicon
            justlovehate = True
            neg_terms = pd.read_csv("./data/train_lexicons/hate.txt",
                                    names=["neg"])
            pos_terms = pd.read_csv("./data/train_lexicons/love.txt",
                                    names=["pos"])
            if (justlovehate):
                neg_terms = neg_terms["neg"].iloc[-1]
                pos_terms = pos_terms["pos"].iloc[-1]
            else:
                neg_terms = neg_terms["neg"].iloc[15:]
                pos_terms = pos_terms["pos"].iloc[15:]

        elif (FIN):  #use FIN lexicon
            neg_terms = pd.read_csv("./data/train_lexicons/fin_negatives.csv",
                                    header=0,
                                    index_col=0)["negs"]
            pos_terms = pd.read_csv("./data/train_lexicons/fin_positives.csv",
                                    header=0,
                                    index_col=0)["poss"]

        elif (BULLBEAR):
            neg_terms = ["bearish"]
            pos_terms = ["bullish"]

        self.neg_terms_in_vocab = []
        self.neg_ids = []
        self.pos_terms_in_vocab = []
        self.pos_ids = []
        opts = self._options
        for neg_term in neg_terms:
            neg_term = neg_term.encode()
            if neg_term in opts.vocab_words:
                self.neg_terms_in_vocab.append(neg_term)
                self.neg_ids.append(self._word2id.get(neg_term, 0))
        self.neg_ids = tf.constant(self.neg_ids)
        for pos_term in pos_terms:
            pos_term = pos_term.encode()
            if pos_term in opts.vocab_words:
                self.pos_terms_in_vocab.append(pos_term)
                self.pos_ids.append(self._word2id.get(pos_term, 0))
        self.pos_ids = tf.constant(self.pos_ids)

        if (LOVEHATE):
            #evaluation only works for the love-hate lexicon ...
            self.eval_neg_id = [self._word2id.get("hate", 0)]
            self.eval_neg_id = tf.constant(self.eval_neg_id)
            self.eval_pos_id = [self._word2id.get("love", 0)]
            self.eval_pos_id = tf.constant(self.eval_pos_id)

            #eval neg_ids and pos_ids (all train words)
            neg_terms = pd.read_csv("./data/train_lexicons/hate.txt",
                                    names=["neg"])
            pos_terms = pd.read_csv("./data/train_lexicons/love.txt",
                                    names=["pos"])
            neg_terms = neg_terms["neg"].iloc[15:]
            pos_terms = pos_terms["pos"].iloc[15:]
            self.eval_neg_ids = []
            self.eval_pos_ids = []
            opts = self._options
            for neg_term in neg_terms:
                neg_term = neg_term.encode()
                if neg_term in opts.vocab_words:
                    self.eval_neg_ids.append(self._word2id.get(neg_term, 0))
            self.eval_neg_ids = tf.constant(self.eval_neg_ids)
            for pos_term in pos_terms:
                pos_term = pos_term.encode()
                if pos_term in opts.vocab_words:
                    self.eval_pos_ids.append(self._word2id.get(pos_term, 0))
            self.eval_pos_ids = tf.constant(self.eval_pos_ids)
        else:
            neg_terms = pd.read_csv("./data/train_lexicons/sol_train_neg.csv",
                                    names=["neg"])
            pos_terms = pd.read_csv("./data/train_lexicons/sol_train_pos.csv",
                                    names=["pos"])
            neg_terms = neg_terms["neg"].iloc[num_words:]
            pos_terms = pos_terms["pos"].iloc[num_words:]
            self.eval_neg_ids = []
            self.eval_pos_ids = []
            opts = self._options
            for neg_term in neg_terms:
                neg_term = neg_term.encode()
                if neg_term in opts.vocab_words:
                    self.eval_neg_ids.append(self._word2id.get(neg_term, 0))
            self.eval_neg_ids = tf.constant(self.eval_neg_ids)
            for pos_term in pos_terms:
                pos_term = pos_term.encode()
                if pos_term in opts.vocab_words:
                    self.eval_pos_ids.append(self._word2id.get(pos_term, 0))
            self.eval_pos_ids = tf.constant(self.eval_pos_ids)

        #continue where it left off
        # Declare all variables we need.
        # Input words embedding: [vocab_size, emb_dim]
        w_in = tf.Variable(tf.random_uniform([opts.vocab_size, opts.emb_dim],
                                             -0.5 / opts.emb_dim,
                                             0.5 / opts.emb_dim),
                           name="w_in")

        # Output words embedding: [vocab_size, emb_dim]
        w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                            name="w_out")

        # Global step: []
        global_step = tf.Variable(0, name="global_step")

        # Linear learning rate decay.
        words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
        lr = opts.learning_rate * tf.maximum(
            0.0001,
            1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

        # Training nodes.
        inc = global_step.assign_add(1)
        with tf.control_dependencies([inc]):
            train = word2vec.neg_train(w_in,
                                       w_out,
                                       examples,
                                       labels,
                                       lr,
                                       vocab_count=opts.vocab_counts.tolist(),
                                       num_negative_samples=opts.num_samples)

        self._w_in = w_in
        self._examples = examples
        self._labels = labels
        self._lr = lr
        self._train = train
        self.step = global_step
        self._epoch = current_epoch
        self._words = total_words_processed

        #Train nodes with antonyms
        loss2 = self.antonym_loss_and_optimize()
        self._loss2 = loss2
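The pos_ids/neg_ids constants collected above are presumably consumed by antonym_loss_and_optimize(); a minimal sketch of how such id lists map onto embedding rows (an illustration under that assumption, not the example's actual loss):

        # Hypothetical illustration only: gather the lexicon terms' input embeddings.
        pos_vecs = tf.nn.embedding_lookup(w_in, self.pos_ids)  # [num_pos_terms, emb_dim]
        neg_vecs = tf.nn.embedding_lookup(w_in, self.neg_ids)  # [num_neg_terms, emb_dim]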
Example #10
  def build_graph(self):
    """Build the graph for the full model."""
    opts = self._options
    # The training data. A text file.
    (words, counts, words_per_epoch, self._epoch, self._words, examples,
     labels) = word2vec.skipgram(filename=opts.train_data,
                                 batch_size=opts.batch_size,
                                 window_size=opts.window_size,
                                 min_count=opts.min_count,
                                 subsample=opts.subsample)

    ### NEW: read the sampling corpus (all .txt/.tok files in the same directory as train_data, excluding the training file itself)
    full_path = os.path.realpath(opts.train_data)
    path, filename = os.path.split(full_path)
    sampling_files = []
    for file in os.listdir(path):
        if file.endswith(".txt") or file.endswith(".tok") and file != filename:
            sampling_files.append(path+"/"+file)
    print("Files for sampling: ", ", ".join(sampling_files))

    #write new file as concat of all sampling files
    sample_data = opts.train_data+".sample"
    sample_train_data = sample_data+".train"
    o = codecs.open(sample_data, "w", "utf8")
    oo = codecs.open(sample_train_data, "w", "utf8")
    for sampling_file in sampling_files:
        f = open(sampling_file,"r")
        t = f.read()
        o.write(t.decode("utf8")+" ") #concat all files
        oo.write(t.decode("utf8")+" ")
        f.close()
    o.close()
    t = codecs.open(opts.train_data, "r", "utf8")
    oo.write(t.read())  # codecs.open already yields unicode; no extra decode needed
    t.close()
    oo.close()

    # The sampling data. A text file.
    (words_samples, counts_samples, words_per_epoch_samples, b_epoch_samples, b_words_samples, examples_samples,
     labels_samples) = word2vec.skipgram(filename=sample_data,
                                 batch_size=opts.batch_size,
                                 window_size=opts.window_size,
                                 min_count=opts.min_count,
                                 subsample=opts.subsample)

    #Sampling plus training data for getting full vocabulary for embeddings
    (words_samples_train, counts_samples_train, words_per_epoch_samples_train, b_epoch_samples_train, b_words_samples_train, examples_samples_train,
     labels_samples_train) = word2vec.skipgram(filename=sample_train_data,
                                 batch_size=opts.batch_size,
                                 window_size=opts.window_size,
                                 min_count=opts.min_count,
                                 subsample=opts.subsample)

    (opts.all_words, opts.all_counts,
     all_words_per_epoch) = self._session.run([words_samples_train, counts_samples_train, words_per_epoch])

    (opts.sample_words, opts.sample_counts,
     sample_words_per_epoch) = self._session.run([words_samples, counts_samples, words_per_epoch])

    #first add sample words
    for s in opts.sample_words:
        last_index = len(self._word2id)
        self._word2id.setdefault(s,last_index)

    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts, words_per_epoch])

    #then add training words
    for v in opts.vocab_words:
        last_index = len(self._word2id)
        self._word2id.setdefault(v,last_index)

    print("Word2id: ", self._word2id)

    opts.vocab_size = len(self._word2id) #NOTE: wc20(train)+wc(sample) != wc20(train+sample) -> therefore use word2id (proper union)
    print("Sample file: ", sample_data)
    print("Data file: ", opts.train_data)


    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)
    self._examples = examples_samples
    self._labels = labels_samples
    #self._id2word = opts.all_words
    #for i, w in enumerate(self._id2word):
    for (w,i) in self._word2id.iteritems():
      self._id2word[i] = w

    print("id2word: ", self._id2word)

    true_logits, sampled_logits = self.forward(examples_samples, labels_samples)
    loss = self.nce_loss(true_logits, sampled_logits)
    tf.scalar_summary("NCE loss", loss)
    self._loss = loss
    self.optimize(loss)

    # Properly initialize all variables.
    tf.initialize_all_variables().run()

    self.saver = tf.train.Saver()