Example #1
    def __init__(self,
                 sentences=None,
                 oov_word=False,
                 size=100,
                 alpha=0.035,
                 window=5,
                 min_count=5,
                 seed=1,
                 workers=1,
                 min_alpha=0.0001,
                 sg=1,
                 training_function=train_sentence_sg_original,
                 decay=True,
                 vocab_report_frequency=10000):
        """
        Construct a word2vec language model from a corpus `sentences`.
        Optionally specify whether an out-of-vocabulary word should be
        trained.

        Inputs (optional keyword arguments)
        -----------------------------------

        oov_word              bool : train an unknown-word vector?
        size                   int : embedding dimensions.
        alpha                float : learning rate.
        decay                 bool : anneal the learning rate over time?
        min_alpha            float : if the learning rate is annealed, minimum rate to use.
        sg                     int : use the skip-gram method, or average the context?
        training_function function : training function that takes as parameters:
            Word2VecExtended, list<int>, float, np.array (stores gradient)
        window                 int : size of the context used to train embeddings.
        min_count              int : minimum word occurrence to include in the model vocabulary.
        seed                   int : random seed used to initialize weights.
        sentences           object : corpus to train on (see `LineCorpus`,
            `BrownCorpus`, `BrownCorpusSimple`)

        """

        self.vocab = {}  # mapping from a word (string) to a Vocab object
        self.index2word = []  # map from a word's matrix index (int) to word (string)
        self.sg = int(sg)
        self.layer1_size = int(size)
        self.logistic_regression_size = self.layer1_size
        if size % 4 != 0:
            logger.warning(
                "consider setting layer size to a multiple of 4 for greater performance"
            )
        self.alpha = float(alpha)
        self.window = int(window)
        self.weight_decay = decay
        self.seed = seed
        self.hs = True
        self.negative = False
        self.training_function = training_function
        self.min_count = min_count
        self.workers = workers
        self.min_alpha = min_alpha

        if sentences is not None:
            self.build_vocab(sentences,
                             oov_word=oov_word,
                             report_frequency=vocab_report_frequency)
            self.train(sentences)  # maybe ?
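
A minimal usage sketch, assuming the constructor above belongs to a class named
`Word2VecExtended` and that `LineCorpus` yields tokenized sentences (both names are
taken from the docstring; the corpus path below is hypothetical):

corpus = LineCorpus("corpus.txt")  # hypothetical file with one tokenized sentence per line
model = Word2VecExtended(corpus, size=128, window=5, min_count=5, oov_word=True)
# because `sentences` was supplied, build_vocab() and train() run inside __init__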
import logging

from six.moves import xrange
from timeit import default_timer
from random import shuffle

logger = logging.getLogger(__name__)
try:
  from queue import Queue, Empty
except ImportError:
  from Queue import Queue, Empty

try:
  from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
  from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
  from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH
  logger.debug("Fast version of {0} is being used".format(__name__))
except ImportError:
  # failed... fall back to plain numpy (20-80x slower training than the above)
  logger.warning("Slow version of {0} is being used".format(__name__))
  FAST_VERSION = -1
  MAX_WORDS_IN_BATCH = 10000
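
Since the fallback path sets FAST_VERSION to -1, a caller can check at runtime which
implementation was loaded; a small sketch using the module logger defined above:

if FAST_VERSION < 0:
  logger.warning("compiled word2vec routines unavailable; training will be much slower")
else:
  logger.debug("compiled word2vec routines loaded (FAST_VERSION=%d)", FAST_VERSION)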


def train_batch_sg_constraints(model, constraints, alpha, work=None):
  """This function adds an additional constraint to the representation."""
  result = 0
  for constraint in constraints:
    word = model.vocab[constraint[0]]
    word2 = model.vocab[constraint[1]]

    # the representation of word2.index is used to predict model.index2word[word.index]
    train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
    result += 1
  return result
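
A sketch of how the constraint batch above might be invoked, assuming `model` is an
already-built skip-gram model whose vocabulary contains the words below (the word
pairs themselves are illustrative):

constraints = [("good", "great"), ("bad", "awful")]  # illustrative synonym pairs
# each pair nudges the second word's vector toward predicting the first word
pairs_trained = train_batch_sg_constraints(model, constraints, alpha=0.025)
assert pairs_trained == len(constraints)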
    def job_producer():
      """Fill jobs queue using the input `sentences` iterator."""
      job_batch, batch_size = [], 0
      pushed_words, pushed_examples = 0, 0
      next_alpha = self.alpha
      if next_alpha > self.min_alpha_yet_reached:
        logger.warning("Effective 'alpha' higher than previous training cycles")
      self.min_alpha_yet_reached = next_alpha
      job_no = 0

      for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
          # yes => add it to the current job
          job_batch.append(sentence)
          batch_size += sentence_length
        else:
          # no => submit the existing job
          pair_idx = list(
              numpy.random.choice(
                  range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
          pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
          logger.debug(
              "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
              job_no, batch_size, len(job_batch), len(pairwise_samples),
              next_alpha)
          job_no += 1
          job_queue.put((job_batch, pairwise_samples, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # words-based decay
              pushed_words += self._raw_word_count(job_batch)
              progress = 1.0 * pushed_words / total_words
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the sentence that didn't fit as the first item of a new job
          job_batch, batch_size = [sentence], sentence_length

      # add the last job too (may be significantly smaller than batch_words)
      if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints),
            next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)
Example #4
	def __init__(self, vocabulary=None, random_window=False, scale_updates=False,
		self_predict=0, batchsize=100, symmetric_window=True, oov_word=True,
		min_count=5, paragraph_size=400, concatenate=True, sentences=None,
		size=400, alpha=0.035, window=5, seed=1, workers=1, min_alpha=0.0001,
		decay=True, vocab_report_frequency=10000):
		"""

		PVDM model for training and learning context paragraph embeddings for
		sentiment and topic analysis, or information retrieval.

		This model uses hierarchical softmax, word2vec, and word windows to obtain an
		unsupervised representation of these paragraphs and their context [1].

		[1] Quoc Le and Tomas Mikolov, "Distributed Representations of Sentences and Documents," ICML 2014.

		TODO:

		- add synparagraph for updating paragraph positions
		- store paragraph size
		- build record of paragraph index (without code)
		- update training function accordingly.

		"""

		if batchsize > MAX_BATCHSIZE:
			raise AssertionError("Maximum batch size is %d." % (MAX_BATCHSIZE))

		self.batchsize = int(batchsize) if batchsize > 0 else 1
		self.symmetric_window = symmetric_window
		self.scale_updates = scale_updates

		self.vocab = {}  # mapping from a word (string) to a Vocab object
		self.paragraph_vocab = {}
		self.index2word = []  # map from a word's matrix index (int) to word (string)
		self.index2paragraph = [] # map from a paragraph's matrix index (int) to paragraph (string)

		self.layer1_size = int(size)
		self.paragraph_size = int(paragraph_size)

		self.concatenate = concatenate
		self.random_window = random_window

		if size % 4 != 0:
			logger.warning("consider setting layer size to a multiple of 4 for greater performance")

		self.alpha = float(alpha)
		self.window = int(window)
		self.weight_decay = decay
		self.seed = seed
		self.hs = True
		self.negative = False

		self.self_predict = self_predict

		self.min_count = min_count
		self.workers   = workers
		self.min_alpha = min_alpha

		if self.concatenate:
			# the logistic regression layer for hierarchical softmax deals
			# first with the paragraph dimensions, then with window * 2
			# words:
			if self.symmetric_window:
				self.logistic_regression_size = self.paragraph_size + self.window * 2 * self.layer1_size
			else:
				self.logistic_regression_size = self.paragraph_size + self.window * 1 * self.layer1_size
		else:
			# the logistic regression layer for hierarchical softmax deals first
			# with the paragraph dimensions, then with the average of the
			# 2 * window words:
			self.logistic_regression_size = self.layer1_size + self.paragraph_size

		if self_predict > 0:
			self.training_function = train_sentence_batch_pvdm_self_predict if self_predict == 1 else train_sentence_batch_pvdm_skipgram
			self.logistic_regression_size = self.layer1_size
			self.true_paragraph_size = self.paragraph_size
			self.paragraph_size = 0
		else:
			self.training_function = train_sentence_batch_pvdm

		if sentences is not None:
			self.build_vocab(sentences, oov_word = oov_word, report_frequency = vocab_report_frequency)
			self.train(sentences) # maybe ?
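
With the defaults above (paragraph_size=400, size=400, window=5), the width of the
hierarchical-softmax input works out as follows; a small sketch of the arithmetic for
the concatenating, symmetric-window case:

paragraph_size, layer1_size, window = 400, 400, 5
# concatenate=True, symmetric_window=True: the paragraph vector plus 2*window word vectors
logistic_regression_size = paragraph_size + window * 2 * layer1_size
print(logistic_regression_size)  # 4400
# with concatenate=False the window words are averaged instead:
# logistic_regression_size = layer1_size + paragraph_size  -> 800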