Example #1
 def log_accuracy(section):
     correct, incorrect = section['correct'], section['incorrect']
     if correct + incorrect > 0:
         logger.info(
             "%s: %.1f%% (%i/%i)" %
             (section['section'], 100.0 * correct /
              (correct + incorrect), correct, correct + incorrect))
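A minimal standalone sketch of how this helper could be driven; the logger setup and the section dict below are assumptions for illustration, mirroring the shape built by the accuracy() example further down.

import logging

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

def log_accuracy(section):
    correct, incorrect = section['correct'], section['incorrect']
    if correct + incorrect > 0:
        logger.info("%s: %.1f%% (%i/%i)" %
                    (section['section'],
                     100.0 * correct / (correct + incorrect),
                     correct, correct + incorrect))

# hypothetical section dict, in the same shape the accuracy() example builds
log_accuracy({'section': 'capital-common-countries', 'correct': 7, 'incorrect': 3})
# logs: capital-common-countries: 70.0% (7/10)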
Example #2
 def reset_weights(self):
     """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
     logger.info("resetting layer weights")
     random.seed(self.seed)
     self.syn0 = empty((len(self.vocab), self.layer1_size), dtype=REAL)
     # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
     for i in range(len(self.vocab)):
         self.syn0[i] = (random.rand(self.layer1_size) -
                         0.5) / self.layer1_size
     self.syn1 = zeros((len(self.vocab), self.logistic_regression_size),
                       dtype=REAL)
     self.syn0norm = None
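The loop above fills each row with uniform values in roughly [-0.5/layer1_size, 0.5/layer1_size), one row at a time to avoid materializing a huge temporary matrix. A hedged standalone sketch of the same initialization with plain NumPy, using made-up sizes (REAL is typically numpy.float32 in this codebase):

import numpy as np

vocab_size, layer1_size, seed = 1000, 100, 1   # hypothetical sizes
rng = np.random.RandomState(seed)
syn0 = np.empty((vocab_size, layer1_size), dtype=np.float32)
for i in range(vocab_size):
    # each entry ends up in [-0.5 / layer1_size, 0.5 / layer1_size)
    syn0[i] = (rng.rand(layer1_size) - 0.5) / layer1_size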
Example #3
    def extend_vocab(self, sentences, oov_word=False, report_frequency=10000):
        """
		Extend vocabulary from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
        logger.info("collecting all words and their counts")

        prev_sentence_no = -1
        sentence_no, vocab = -1, {}
        total_words = 0
        assign_to_vocab = vocab.__setitem__  # slight performance gain
        # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
        get_from_vocab = vocab.__getitem__
        for sentence_no, sentence in enumerate(sentences):
            if prev_sentence_no == sentence_no:
                break
            if sentence_no % report_frequency == 0:
                logger.info(
                    "PROGRESS: at sentence #%i, processed %i words and %i word types"
                    % (sentence_no, total_words, len(vocab)))
            for word in sentence:
                if word in vocab:
                    get_from_vocab(word).count += 1
                else:
                    assign_to_vocab(word, Vocab(count=1))
            total_words += len(sentence)
            prev_sentence_no = sentence_no
        logger.info(
            "collected %i word types from a corpus of %i words and %i sentences"
            % (len(vocab), total_words, sentence_no + 1))

        # assign a unique index to each word
        append = self.index2word.append
        assign_to_vocab = self.vocab.__setitem__
        for word, v in vocab.items():
            if word not in self.vocab:
                if v.count >= self.min_count:
                    v.index = len(self.vocab)
                    append(word)
                    assign_to_vocab(word, v)
            else:
                self.vocab[word].count += v.count

        # add the special out of vocabulary word **UNKNOWN**:
        if oov_word:
            self.add_oov_word(count=len(vocab) - len(self.vocab))

        logger.info("total %i word types after removing those with count<%s" %
                    (len(self.vocab), self.min_count))

        # add info about each word's Huffman encoding
        self.create_binary_tree()
        self.extend_weights()
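The counting pass above is a plain frequency count; the `__setitem__`/`__getitem__` aliases are only a CPython micro-optimization. A hedged sketch of the same count-then-prune-by-min_count logic with an ordinary Counter (the toy corpus and threshold are made up):

from collections import Counter

sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]   # toy corpus
min_count = 2

counts = Counter(word for sentence in sentences for word in sentence)
index2word = [word for word, count in counts.items() if count >= min_count]
word2index = {word: i for i, word in enumerate(index2word)}
# counts: the=2, sat=2, cat=1, dog=1 -> only "the" and "sat" survive min_count=2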
Example #4
def word2vector(X_train):
    """训练词向量"""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    wv_model = Word2Vec(X_train,
                        size=wv_size,
                        window=6,
                        sg=1,
                        min_count=5,
                        workers=multiprocessing.cpu_count(),
                        iter=10)

    gensim_dict = Dictionary()  # create the word dictionary
    gensim_dict.doc2bow(wv_model.wv.vocab.keys(), allow_update=True)

    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word indices, numbered from 1
    w2vec = {word: wv_model[word] for word in w2indx.keys()}  # word vectors
    return w2indx, w2vec
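A hedged sketch of how the returned maps might be consumed downstream; `w2indx`, `w2vec`, and `wv_size` are the names from the example above, while the helper functions and the index-0 convention for out-of-vocabulary words are assumptions.

import numpy as np

def sentence_to_indices(sentence, w2indx):
    # words missing from the vocabulary fall back to index 0
    return [w2indx.get(word, 0) for word in sentence]

def build_embedding_matrix(w2indx, w2vec, wv_size):
    # row 0 stays all-zero for the out-of-vocabulary / padding slot
    weights = np.zeros((len(w2indx) + 1, wv_size), dtype=np.float32)
    for word, idx in w2indx.items():
        weights[idx] = w2vec[word]
    return weights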
Example #5
	def build_paragraph_vocab(self, sentences):
		paragraph_vocab = {}
		for sentence_no, sentence in enumerate(sentences):
			sentence = " ".join(sentence)
			if sentence not in paragraph_vocab:
				paragraph_vocab[sentence] = Paragraph()

		logger.info("collected %i sentence types from a corpus of %i sentences." %
				(len(paragraph_vocab), sentence_no+1))

		# assign a unique index to each sentence
		self.paragraph_vocab, self.index2paragraph = {}, []
		append = self.index2paragraph.append
		assign_to_vocab = self.paragraph_vocab.__setitem__

		for sentence, v in iteritems(paragraph_vocab):
			v.index = len(self.paragraph_vocab)
			assign_to_vocab(sentence, v)
			append(sentence)
Example #6
	def init_sims(self, replace=False):
		"""
		Precompute L2-normalized vectors.

		If `replace` is set, forget the original vectors and only keep the normalized
		ones = saves lots of memory!

		Note that you **cannot continue training** after doing a replace. The model becomes
		effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.

		"""
		super().init_sims(replace = replace)

		if getattr(self, 'synparagraphnorm', None) is None or replace:
			logger.info("precomputing L2-norms of word weight vectors")
			if replace:
				for i in range(self.synparagraph.shape[0]):
					self.synparagraph[i, :] /= sqrt((self.synparagraph[i, :] ** 2).sum(-1))
				self.synparagraphnorm = self.synparagraph
			else:
				self.synparagraphnorm = (self.synparagraph / sqrt((self.synparagraph ** 2).sum(-1))[..., newaxis]).astype(REAL)
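A standalone sketch of the two normalization paths above, run on a small random matrix; the array names are placeholders.

import numpy as np

vectors = np.random.rand(5, 3).astype(np.float32)

# replace=True path: overwrite each row with its unit-length version
for i in range(vectors.shape[0]):
    vectors[i, :] /= np.sqrt((vectors[i, :] ** 2).sum(-1))

# replace=False path: keep the originals and build a separate normalized copy
originals = np.random.rand(5, 3).astype(np.float32)
normalized = (originals / np.sqrt((originals ** 2).sum(-1))[..., np.newaxis]).astype(np.float32)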
Example #7
    def train(self, sentences, total_words=None, word_count=0, chunksize=100):
        """
		Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
        # oov_replacement = self.vocab.get(UnknownWord,None)

        if not self.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count, total_words = [
            word_count
        ], total_words or sum(v.count for v in self.vocab.values())
        jobs = Queue(
            maxsize=2 * self.workers
        )  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock(
        )  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(
                self.layer1_size,
                dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha * (1 - 1.0 * word_count[0] / total_words)
                ) if self.weight_decay else self.alpha
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = 0

                for sentence in job:
                    job_words += self.training_function(
                        self, sentence, alpha, work)

                with lock:
                    # here we can store the scores for later plotting and viewing...
                    word_count[0] += job_words

                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.debug(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[
                            0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [
            threading.Thread(target=worker_train) for _ in range(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
        no_oov = ([self.get_underlying_word_object(word) for word in sentence]
                  for sentence in sentences)
        for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
            # logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())
        for _ in range(self.workers):
            jobs.put(
                None
            )  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed,
                     word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
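The core of the method is a bounded job queue fed by the main thread and drained by worker threads that stop on a None sentinel. A stripped-down sketch of that pattern with a dummy "training" step standing in for training_function (chunk size and worker count are arbitrary):

import threading
from queue import Queue
from itertools import islice

def grouper(iterable, n):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk

def run(sentences, workers=2, chunksize=100):
    jobs = Queue(maxsize=2 * workers)      # bounded buffer: the producer blocks instead of queueing everything
    counts, lock = [0], threading.Lock()

    def worker():
        while True:
            job = jobs.get()
            if job is None:                  # sentinel: no more work, exit
                break
            done = sum(len(s) for s in job)  # stand-in for the real per-sentence training call
            with lock:
                counts[0] += done

    threads = [threading.Thread(target=worker, daemon=True) for _ in range(workers)]
    for t in threads:
        t.start()
    for job in grouper(sentences, chunksize):
        jobs.put(job)
    for _ in range(workers):
        jobs.put(None)                       # one sentinel per worker
    for t in threads:
        t.join()
    return counts[0]

print(run([["a", "b"], ["c"]], workers=2, chunksize=1))   # -> 3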
Example #8
    def build_vocab(self, sentences, oov_word=False, report_frequency=10000):
        """
		Build vocabulary from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
        print("build vocab")
        path = (re.sub("/", "_", sentences.fname) + ("(mc=%d)" %
                                                     (self.min_count)) +
                ".vocab") if hasattr(sentences, "fname") else None
        if path is not None and file_exists(path):
            logger.info("loading from saved vocab list at \"%s\"" % (path))
            file = gzip.open(path, 'r')
            saved_vocab = pickle.load(file)
            file.close()
            self.index2word = saved_vocab["index2word"]
            self.vocab = saved_vocab["vocab"]

            if oov_word:
                self.add_oov_word(count=10000)

            self.create_binary_tree()
            self.reset_weights()

        else:
            logger.info("collecting all words and their counts")

            prev_sentence_no = -1
            sentence_no, vocab = -1, {}
            total_words = 0
            assign_to_vocab = vocab.__setitem__  # slight performance gain
            # https://wiki.python.org/moin/PythonSpeed/PerformanceTips
            get_from_vocab = vocab.__getitem__
            for sentence_no, sentence in enumerate(sentences):
                if prev_sentence_no == sentence_no:
                    break
                if sentence_no % report_frequency == 0:
                    logger.info(
                        "PROGRESS: at sentence #%i, processed %i words and %i word types"
                        % (sentence_no, total_words, len(vocab)))
                for word in sentence:
                    if word in vocab:
                        get_from_vocab(word).count += 1
                    else:
                        assign_to_vocab(word, Vocab(count=1))
                total_words += len(sentence)
                prev_sentence_no = sentence_no
            logger.info(
                "collected %i word types from a corpus of %i words and %i sentences"
                % (len(vocab), total_words, sentence_no + 1))

            # assign a unique index to each word
            self.vocab, self.index2word = {}, []
            append = self.index2word.append
            assign_to_vocab = self.vocab.__setitem__
            for word, v in vocab.items():
                if v.count >= self.min_count:
                    v.index = len(self.vocab)
                    append(word)
                    assign_to_vocab(word, v)

            # add the special out of vocabulary word **UNKNOWN**:
            if oov_word:
                self.add_oov_word(count=len(vocab) - len(self.vocab))

            logger.info(
                "total %i word types after removing those with count<%s" %
                (len(self.vocab), self.min_count))

            # add info about each word's Huffman encoding
            self.create_binary_tree()
            self.reset_weights()
            if path is not None:
                logger.info("saving vocab list in \"%s\"" % (path))
                with gzip.open(path, 'wb') as file:
                    pickle.dump(
                        {
                            "vocab": self.vocab,
                            "index2word": self.index2word
                        }, file, 1)
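The vocab cache above is a gzip-compressed pickle keyed on the corpus filename. A minimal sketch of the save/load round trip; the path and the payload contents are dummy values, only the {"vocab": ..., "index2word": ...} shape follows the example.

import gzip, os, pickle

path = "corpus.txt(mc=5).vocab"   # hypothetical cache file
payload = {"vocab": {"the": 10, "cat": 3}, "index2word": ["the", "cat"]}

if os.path.exists(path):
    with gzip.open(path, "rb") as f:
        payload = pickle.load(f)
else:
    with gzip.open(path, "wb") as f:
        pickle.dump(payload, f, 1)   # protocol 1, as in the example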
Example #9
    def accuracy(self, questions, restrict_vocab=30000):
        """
		Compute accuracy of the model (with **capitalizations**). `questions` is a filename where lines are
		4-tuples of words, split into sections by ": SECTION NAME" lines.
		See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

		The accuracy is reported (=printed to log and returned as a list) for each
		section separately, plus there's one aggregate summary at the end.

		Use `restrict_vocab` to ignore all questions containing a word whose frequency
		is not in the top-N most frequent words (default top 30,000).

		This method corresponds to the `compute-accuracy` script of the original C word2vec.

		"""
        ok_vocab = dict(
            sorted(self.vocab.items(),
                   key=lambda item: -item[1].count)[:restrict_vocab])
        ok_index = set(v.index for v in ok_vocab.values())

        def log_accuracy(section):
            correct, incorrect = section['correct'], section['incorrect']
            if correct + incorrect > 0:
                logger.info(
                    "%s: %.1f%% (%i/%i)" %
                    (section['section'], 100.0 * correct /
                     (correct + incorrect), correct, correct + incorrect))

        sections, section = [], None
        for line_no, line in enumerate(open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': 0,
                    'incorrect': 0
                }
            else:
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    # TODO: assumes vocabulary preprocessing uses lowercase, too...
                    a, b, c, expected = line.split()
                except ValueError:
                    logger.info("skipping invalid line #%i in %s" %
                                (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" %
                                 (line_no, line))
                    continue

                ignore = set(self.vocab[v].index
                             for v in [a, b, c])  # indexes of words to ignore
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                for index in argsort(
                        self.most_similar(positive=[b, c],
                                          negative=[a],
                                          topn=False))[::-1]:
                    if index in ok_index and index not in ignore:
                        predicted = self.index2word[index]
                        if predicted != expected and predicted != expected.lower(
                        ):
                            logger.debug("%s: expected %s, predicted %s" %
                                         (line.strip(), expected, predicted))
                        break
                section['correct' if predicted ==
                        expected else 'incorrect'] += 1
        if section:
            # store the last section, too
            sections.append(section)
            log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum(s['correct'] for s in sections),
            'incorrect': sum(s['incorrect'] for s in sections)
        }
        log_accuracy(total)
        sections.append(total)
        return sections
Example #10
  def train(self,
            sentences,
            total_words=None,
            word_count=0,
            total_examples=None,
            queue_factor=2,
            report_delay=1.0):
    """ Update the model's neural weights from a sequence of sentences (can be a

        once-only generator stream).
        For Word2Vec, each sentence must be a list of unicode strings.
        (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha,
        either total_examples
        (count of sentences) or total_words (count of raw words in sentences)
        should be provided, unless the
        sentences are the same as those that were used to initially build the
        vocabulary.
    """
    logger.info("Starting training.")

    self.neg_labels = []
    if self.negative > 0:
      # precompute negative labels optimization for pure-python training
      self.neg_labels = zeros(self.negative + 1)
      self.neg_labels[0] = 1.

    if FAST_VERSION < 0:
      import warnings
      warnings.warn(
          "C extension not loaded for Word2Vec, training will be slow. "
          "Install a C compiler and reinstall gensim for fast training.")
      self.neg_labels = []
      if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.

    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using sg=%s hs=%s sample=%s negative=%s window=%s", self.workers,
        len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample,
        self.negative, self.window)

    if not self.vocab:
      raise RuntimeError(
          "you must first build vocabulary before training the model")
    if not hasattr(self, "syn0"):
      raise RuntimeError(
          "you must first finalize vocabulary before training the model")

    if total_words is None and total_examples is None:
      if self.corpus_count:
        total_examples = self.corpus_count
        logger.info(
            "expecting %i sentences, matching count from corpus used for vocabulary survey",
            total_examples)
      else:
        raise ValueError(
            "you must provide either total_words or total_examples, to enable alpha and progress calculations"
        )

    job_tally = 0

    if self.iter > 1:
      sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
      total_words = total_words and total_words * self.iter
      total_examples = total_examples and total_examples * self.iter

    def worker_loop():
      """Train the model, lifting lists of sentences from the job_queue."""
      work = matutils.zeros_aligned(
          self.layer1_size, dtype=REAL)  # per-thread private work memory
      neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
      jobs_processed = 0
      while True:
        job = job_queue.get()
        if job is None:
          progress_queue.put(None)
          break  # no more jobs => quit this worker
        sentences, pairwise, alpha = job
        tally, raw_tally = self._do_train_job(sentences, pairwise, alpha,
                                              (work, neu1))
        progress_queue.put(
            (len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
      logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
      """Fill jobs queue using the input `sentences` iterator."""
      job_batch, batch_size = [], 0
      pushed_words, pushed_examples = 0, 0
      next_alpha = self.alpha
      if next_alpha > self.min_alpha_yet_reached:
        logger.warn("Effective 'alpha' higher than previous training cycles")
      self.min_alpha_yet_reached = next_alpha
      job_no = 0

      for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
          # yes => add it to the current job
          job_batch.append(sentence)
          batch_size += sentence_length
        else:
          # no => submit the existing job
          pair_idx = list(
              numpy.random.choice(
                  range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
          pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
          logger.debug(
              "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
              job_no, batch_size, len(job_batch), len(pairwise_samples),
              next_alpha)
          job_no += 1
          job_queue.put((job_batch, pairwise_samples, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # words-based decay
              pushed_words += self._raw_word_count(job_batch)
              progress = 1.0 * pushed_words / total_words
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the sentence that didn't fit as the first item of a new job
          job_batch, batch_size = [sentence], sentence_length

      # add the last job too (may be significantly smaller than batch_words)
      if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints),
            next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [
        threading.Thread(target=worker_loop) for _ in xrange(self.workers)
    ]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
      thread.daemon = True  # make interrupting the process with ctrl+c easier
      thread.start()

    example_count, trained_word_count, raw_word_count = 0, 0, word_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
      report = progress_queue.get()  # blocks if workers too slow
      if report is None:  # a thread reporting that it finished
        unfinished_worker_count -= 1
        logger.info(
            "worker thread finished; awaiting finish of %i more threads",
            unfinished_worker_count)
        continue
      examples, trained_words, raw_words = report
      job_tally += 1

      # update progress stats
      example_count += examples
      trained_word_count += trained_words  # only words in vocab & sampled
      raw_word_count += raw_words

      # log progress once every report_delay seconds
      elapsed = default_timer() - start
      if elapsed >= next_report:
        if total_examples:
          # examples-based progress %
          logger.info(
              "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
              100.0 * example_count / total_examples,
              trained_word_count / elapsed, utils.qsize(job_queue),
              utils.qsize(progress_queue))
        else:
          # words-based progress %
          logger.info(
              "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
              100.0 * raw_word_count / total_words,
              trained_word_count / elapsed, utils.qsize(job_queue),
              utils.qsize(progress_queue))
        next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
        "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
        raw_word_count, trained_word_count, elapsed,
        trained_word_count / elapsed)
    if job_tally < 10 * self.workers:
      logger.warn(
          "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
      )

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
      logger.warn(
          "supplied example count (%i) did not equal expected count (%i)",
          example_count, total_examples)
    if total_words and total_words != raw_word_count:
      logger.warn(
          "supplied raw word count (%i) did not equal expected count (%i)",
          raw_word_count, total_words)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    self.clear_sims()
    return trained_word_count
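The job producer above decays the learning rate linearly with progress, measured either in examples or in raw words. A small numeric sketch of just that update (the numbers are arbitrary):

alpha, min_alpha = 0.025, 0.0001
total_examples, pushed_examples = 10000, 2500

progress = 1.0 * pushed_examples / total_examples       # 0.25
next_alpha = alpha - (alpha - min_alpha) * progress      # decay toward min_alpha
next_alpha = max(min_alpha, next_alpha)                  # never drop below the floor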
Example #11
	def build_vocab(self, sentences, oov_word = False, report_frequency = 10000):
		"""
		Build vocabulary from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
		path = (re.sub("/","_",sentences.fname)+".vocab") if hasattr(sentences, "fname") else None
		if path != None and file_exists(path):
			logger.info("loading from saved vocab list at \"%s\"" % (path))
			file = gzip.open(path, 'r')
			saved_vocab = pickle.load(file)
			file.close()
			self.index2word = saved_vocab["index2word"]
			self.vocab      = saved_vocab["vocab"]

			if oov_word:
				self.add_oov_word(count = 100000)
			
			if PaddingWord not in self.vocab:
				v = self.add_word_to_vocab(PaddingWord, count = 1000000)
				self.padding_word = v
			else:
				self.padding_word = self.vocab[PaddingWord]

			# add special padding word here.
			self.create_binary_tree()
			self.build_paragraph_vocab(sentences)
			self.reset_weights()

		else:
			logger.info("collecting all words and their counts")

			prev_sentence_no = -1
			sentence_no, vocab = -1, {}
			total_words = 0
			assign_to_vocab = vocab.__setitem__ # slight performance gain
			# https://wiki.python.org/moin/PythonSpeed/PerformanceTips
			get_from_vocab = vocab.__getitem__
			for sentence_no, sentence in enumerate(sentences):
				if prev_sentence_no == sentence_no:
					break
				if sentence_no % report_frequency == 0:
					logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
						(sentence_no, total_words, len(vocab)))
				for word in sentence:
					if word in vocab:
						get_from_vocab(word).count += 1
					else:
						assign_to_vocab(word, Vocab(count=1))
				total_words += len(sentence)
				prev_sentence_no = sentence_no
			logger.info("collected %i word types from a corpus of %i words and %i sentences" %
				(len(vocab), total_words, sentence_no + 1))

			# assign a unique index to each word
			self.vocab, self.index2word = {}, []
			append = self.index2word.append
			assign_to_vocab = self.vocab.__setitem__
			for word, v in iteritems(vocab):
				if v.count >= self.min_count:
					v.index = len(self.vocab)
					append(word)
					assign_to_vocab(word, v)

			# add the special out of vocabulary word **UNKNOWN**:
			if oov_word: self.add_oov_word(count = len(vocab) - len(self.vocab))

			if PaddingWord not in self.vocab:
				v = self.add_word_to_vocab(PaddingWord, count = 1000000)
				self.padding_word = v
			else:
				self.padding_word = self.vocab[PaddingWord]

			logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

			# add info about each word's Huffman encoding
			self.create_binary_tree()
			self.build_paragraph_vocab(sentences)
			self.reset_weights()
			if path != None:
				logger.info("saving vocab list in \"%s\"" % (path))
				with gzip.open(path, 'wb') as file:
					pickle.dump({"vocab": self.vocab, "index2word": self.index2word}, file, 1)
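Both the **UNKNOWN** and padding entries above are ordinary vocabulary slots registered with inflated counts so they survive min_count pruning and keep stable indices. A hedged sketch of that idea with a plain dict; the token strings and counts are placeholders, not the values used by this class.

special_tokens = [("**UNKNOWN**", 10**5), ("**PADDING**", 10**6)]   # placeholder counts
word_counts = [("the", 50), ("cat", 3)]                             # toy corpus counts

index2word, vocab = [], {}
for word, count in special_tokens + word_counts:
    vocab[word] = {"count": count, "index": len(index2word)}
    index2word.append(word)
# vocab["**PADDING**"]["index"] stays fixed regardless of later corpus words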
Example #12
	def train(self, sentences, total_words=None, word_count=0, paragraphs_only = False, vocab = None, paragraphs = None):
		"""
		Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
		if paragraphs is None:
			paragraphs = self.synparagraph
		if vocab is None:
			vocab = self.paragraph_vocab

		if not self.vocab:
			raise RuntimeError("you must first build vocabulary before training the model")

		start, next_report = time.time(), [1.0]
		word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab))
		jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
		lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)
		total_error = [0.0]

		def worker_train():
			"""Train the model, lifting lists of sentences from the jobs queue."""
			paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
			error = zeros(1, dtype = REAL)
			if self.concatenate:
				# word work here is for each individual word, so it has length logistic regression - para size
				word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype = REAL)
				neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
			else:
				# here word work is aggregated:
				word_work = zeros(self.layer1_size, dtype = REAL)
				neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

			while True:
				job = jobs.get()
				if job is None:  # data finished, exit
					break
				# update the learning rate before every job
				alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
				# how many words did we train on? out-of-vocabulary (unknown) words do not count
				job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job))

				with lock:
					# here we can store the scores for later plotting and viewing...
					word_count[0] += job_words

					elapsed = time.time() - start
					total_error[0] += error[0]
					if elapsed >= next_report[0]:
						logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," %
							(100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
						next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

		workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
		for thread in workers:
			thread.daemon = True  # make interrupting the process with ctrl+c easier
			thread.start()

		# convert input strings to Vocab objects, and paragraph to paragraph (Vocab) object:
		no_oov = (self.create_job(sentence,vocab) for sentence in sentences)
		for job_no, job in enumerate(utils.grouper(no_oov, self.batchsize)):
			logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
			jobs.put(job)
		logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
		for _ in xrange(self.workers):
			jobs.put(None)  # give the workers heads up that they can finish -- no more work!

		for thread in workers:
			thread.join()

		elapsed = time.time() - start
		logger.info("training on %i sentences took %.1fs, %.0f sentences/s, %.6f" %
			(word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0, total_error[0]))

		return (word_count[0], total_error[0])