Example #1
		def worker_train():
			"""Train the model, lifting lists of sentences from the jobs queue."""
			paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
			error = zeros(1, dtype=REAL)
			if self.concatenate:
				# word work here is for each individual word, so it has length logistic regression - para size
				word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype=REAL)
				neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
			else:
				# here word work is aggregated:
				word_work = zeros(self.layer1_size, dtype=REAL)
				neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

			while True:
				job = jobs.get()
				if job is None:  # data finished, exit
					break
				# update the learning rate before every job
				alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
				# how many words did we train on? out-of-vocabulary (unknown) words do not count
				job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job))

				with lock:
					# here we can store the scores for later plotting and viewing...
					word_count[0] += job_words

					elapsed = time.time() - start
					total_error[0] += error[0]
					if elapsed >= next_report[0]:
						logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," %
							(100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
						next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
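The alpha update inside the worker's loop is a linear decay of the learning rate with training progress, clipped at min_alpha. A minimal standalone sketch of that schedule (the function and argument names below are illustrative, not taken from the example):

def decayed_alpha(initial_alpha, min_alpha, words_seen, total_words):
    """Linearly decay the learning rate with progress, never dropping below min_alpha."""
    progress = 1.0 * words_seen / total_words
    return max(min_alpha, initial_alpha * (1.0 - progress))

# halfway through a 1,000,000-word corpus, starting at 0.025 with a 0.0001 floor:
print(decayed_alpha(0.025, 0.0001, 500000, 1000000))  # 0.0125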
Example #2
        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha * (1 - 1.0 * word_count[0] / total_words)
                ) if self.weight_decay else self.alpha
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = 0

                for sentence in job:
                    job_words += self.training_function(
                        self, sentence, alpha, work)

                with lock:
                    # here we can store the scores for later plotting and viewing...
                    word_count[0] += job_words

                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.debug(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports
Example #3
 def worker_loop():
   """Train the model, lifting lists of sentences from the job_queue."""
   work = matutils.zeros_aligned(
       self.layer1_size, dtype=REAL)  # per-thread private work memory
   neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
   jobs_processed = 0
   while True:
     job = job_queue.get()
     if job is None:
       progress_queue.put(None)
       break  # no more jobs => quit this worker
     sentences, pairwise, alpha = job
     tally, raw_tally = self._do_train_job(sentences, pairwise, alpha,
                                           (work, neu1))
     progress_queue.put(
         (len(sentences), tally, raw_tally))  # report back progress
     jobs_processed += 1
   logger.debug("worker exiting, processed %i jobs", jobs_processed)
Example #4
    def accuracy(self, questions, restrict_vocab=30000):
        """
        Compute accuracy of the model (with **capitalizations**). `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word whose frequency
        is not in the top-N most frequent words (default top 30,000).

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = dict(
            sorted(self.vocab.items(),
                   key=lambda item: -item[1].count)[:restrict_vocab])
        ok_index = set(v.index for v in ok_vocab.values())

        def log_accuracy(section):
            correct, incorrect = section['correct'], section['incorrect']
            if correct + incorrect > 0:
                logger.info(
                    "%s: %.1f%% (%i/%i)" %
                    (section['section'], 100.0 * correct /
                     (correct + incorrect), correct, correct + incorrect))

        sections, section = [], None
        for line_no, line in enumerate(open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': 0,
                    'incorrect': 0
                }
            else:
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    a, b, c, expected = line.split()  # TODO assumes vocabulary preprocessing uses lowercase, too...
                except ValueError:
                    logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" %
                                 (line_no, line))
                    continue

                ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                    if index in ok_index and index not in ignore:
                        predicted = self.index2word[index]
                        if predicted != expected and predicted != expected.lower():
                            logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                        break
                section['correct' if predicted == expected else 'incorrect'] += 1
        if section:
            # store the last section, too
            sections.append(section)
            log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum(s['correct'] for s in sections),
            'incorrect': sum(s['incorrect'] for s in sections)
        }
        log_accuracy(total)
        sections.append(total)
        return sections
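The questions file that accuracy reads is the plain-text analogy format described in the docstring: four words per line, grouped under ": SECTION NAME" headers. A hedged usage sketch (model is assumed to be an already-trained instance exposing the accuracy method above; the file excerpt follows the standard questions-words.txt layout):

# questions-words.txt excerpt, one analogy per line (a b c expected):
# : capital-common-countries
# Athens Greece Baghdad Iraq
# Athens Greece Bangkok Thailand
# : family
# boy girl brother sister

sections = model.accuracy('questions-words.txt', restrict_vocab=30000)
total = sections[-1]  # the aggregate 'total' entry is appended last
print(total['correct'], total['incorrect'])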
    def job_producer():
      """Fill jobs queue using the input `sentences` iterator."""
      job_batch, batch_size = [], 0
      pushed_words, pushed_examples = 0, 0
      next_alpha = self.alpha
      if next_alpha > self.min_alpha_yet_reached:
        logger.warning("Effective 'alpha' higher than previous training cycles")
      self.min_alpha_yet_reached = next_alpha
      job_no = 0

      for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
          # yes => add it to the current job
          job_batch.append(sentence)
          batch_size += sentence_length
        else:
          # no => submit the existing job
          pair_idx = list(
              numpy.random.choice(
                  range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
          pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
          logger.debug(
              "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
              job_no, batch_size, len(job_batch), len(pairwise_samples),
              next_alpha)
          job_no += 1
          job_queue.put((job_batch, pairwise_samples, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # words-based decay
              pushed_words += self._raw_word_count(job_batch)
              progress = 1.0 * pushed_words / total_words
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the sentence that didn't fit as the first item of a new job
          job_batch, batch_size = [sentence], sentence_length

      # add the last job too (may be significantly smaller than batch_words)
      if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints),
            next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)
import numpy
from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL, double, uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis, ndarray, empty, sum as np_sum, prod, ones, ascontiguousarray
from six import iteritems, itervalues, string_types
from six.moves import xrange
from timeit import default_timer
from random import shuffle
try:
  from queue import Queue, Empty
except ImportError:
  from Queue import Queue, Empty

try:
  from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
  from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
  from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH
  logger.debug("Fast version of {0} is being used".format(__name__))
except ImportError:
  # failed... fall back to plain numpy (20-80x slower training than the above)
  logger.warning("Slow version of {0} is being used".format(__name__))
  FAST_VERSION = -1
  MAX_WORDS_IN_BATCH = 10000
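The try/except import above is the usual optional-extension pattern: prefer the compiled routines when they import cleanly, otherwise fall back to slower pure-Python/numpy code and record which path is active. A generic sketch of the same pattern with a hypothetical helper module (fast_ops and dot_impl are made-up names, not part of gensim):

try:
    from fast_ops import dot_fast as dot_impl  # hypothetical compiled helper
    FAST_PATH = True
except ImportError:
    FAST_PATH = False
    def dot_impl(a, b):
        # pure-Python fallback: always available, just slower
        return sum(x * y for x, y in zip(a, b))

print("fast path" if FAST_PATH else "slow fallback", dot_impl([1, 2], [3, 4]))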


def train_batch_sg_constraints(model, constraints, alpha, work=None):
  """This function adds an additional constraint to the representation."""
  result = 0
  for constraint in constraints:
    word = model.vocab[constraint[0]]
    word2 = model.vocab[constraint[1]]

    # the representation of word2.index is used to predict model.index2word[word.index]
Example #7
	def train(self, sentences, total_words=None, word_count=0, paragraphs_only=False, vocab=None, paragraphs=None):
		"""
		Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
		if paragraphs is None:
			paragraphs = self.synparagraph
		if vocab is None:
			vocab = self.paragraph_vocab

		if not self.vocab:
			raise RuntimeError("you must first build vocabulary before training the model")

		start, next_report = time.time(), [1.0]
		word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab))
		jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
		lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)
		total_error = [0.0]

		def worker_train():
			"""Train the model, lifting lists of sentences from the jobs queue."""
			paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
			error = zeros(1, dtype=REAL)
			if self.concatenate:
				# word work here is for each individual word, so it has length logistic regression - para size
				word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype=REAL)
				neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
			else:
				# here word work is aggregated:
				word_work = zeros(self.layer1_size, dtype=REAL)
				neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

			while True:
				job = jobs.get()
				if job is None:  # data finished, exit
					break
				# update the learning rate before every job
				alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
				# how many words did we train on? out-of-vocabulary (unknown) words do not count
				job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job))

				with lock:
					# here we can store the scores for later plotting and viewing...
					word_count[0] += job_words

					elapsed = time.time() - start
					total_error[0] += error[0]
					if elapsed >= next_report[0]:
						logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," %
							(100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
						next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

		workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
		for thread in workers:
			thread.daemon = True  # make interrupting the process with ctrl+c easier
			thread.start()

		# convert input strings to Vocab objects, and paragraph to paragraph (Vocab) object:
		no_oov = (self.create_job(sentence, vocab) for sentence in sentences)
		for job_no, job in enumerate(utils.grouper(no_oov, self.batchsize)):
			logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
			jobs.put(job)
		logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
		for _ in xrange(self.workers):
			jobs.put(None)  # give the workers heads up that they can finish -- no more work!

		for thread in workers:
			thread.join()

		elapsed = time.time() - start
		logger.info("training on %i sentences took %.1fs, %.0f sentences/s, %.6f" %
			(word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0, total_error[0]))

		return (word_count[0], total_error[0])
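Per the docstring, train() consumes an iterable of tokenized sentences and requires the vocabulary to exist beforehand (otherwise it raises RuntimeError). A hedged usage sketch (the model variable and the build_vocab step are assumptions in the style of the gensim API, not shown in the example above):

sentences = [
    ["human", "machine", "interface"],
    ["graph", "of", "trees"],
]

model.build_vocab(sentences)                       # assumed vocabulary-building step
word_count, total_error = model.train(sentences)   # returns (words trained, accumulated error)
print(word_count, total_error)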