Example #1
 def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
                   raw_word_count, total_words, trained_word_count, elapsed):
     if total_examples:
         # examples-based progress %
         logger.info(
             "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
             cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
             utils.qsize(job_queue), utils.qsize(progress_queue)
         )
     else:
         # words-based progress %
         logger.info(
             "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
             cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
             utils.qsize(job_queue), utils.qsize(progress_queue)
         )
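A quick way to sanity-check the examples-vs-words branching above is to isolate the percentage math as a pure function. The sketch below is illustrative only (the helper name and signature are not part of the original class):

def progress_pct(example_count, total_examples, raw_word_count, total_words):
    # prefer examples-based progress when a total example count is known,
    # mirroring the if/else in _log_progress above
    if total_examples:
        return 100.0 * example_count / total_examples
    return 100.0 * raw_word_count / total_words

# e.g. progress_pct(50, 200, 0, None)   == 25.0  (examples-based)
#      progress_pct(0, None, 300, 1200) == 25.0  (words-based fallback)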
Example #2
  def train(self, train_pairs, total_ptrees=None, ptree_count=0,
            total_examples=None, queue_factor=2, report_delay=1.0):
    """
    An example is a pair of a complete AST tree and an utterance.
    A ptree is a partial tree.
    """
    if FAST_VERSION < 0:
      import warnings
      warnings.warn("C extension not loaded; training will be slow.")
      # precompute negative labels optimization for pure-python training
      self.neg_labels = zeros(self.negative + 1 + NEG_SAMPLING_VOCAB_SIZE_THRESHOLD)
      self.neg_labels[0] = 1.

    logger.info(
        "training model with %i workers",
        self.workers)

    if not self.vocab_i or not self.vocab_k or not self.vocab_l or not self.vocab_r:
      raise RuntimeError("you must first build vocabulary before training the model")
    if not hasattr(self, 'syn0i') or not hasattr(self, 'syn0k') or not hasattr(self, 'syn0l'):
      raise RuntimeError("you must first finalize vocabulary before training the model")

    if total_ptrees is None and total_examples is None:
      if self.corpus_count:
        total_examples = self.corpus_count
        logger.info("expecting %i train pairs, matching count from corpus used for vocabulary survey", total_examples)
      else:
        raise ValueError("you must provide either total_ptrees or total_examples, to enable alpha and progress calculations")

    job_tally = 0

    if self.iter > 1:
      train_pairs = gsutils.RepeatCorpusNTimes(train_pairs, self.iter)
      total_ptrees = total_ptrees and total_ptrees * self.iter
      total_examples = total_examples and total_examples * self.iter

    def worker_loop():
      """Train the model, lifting lists of train_pairs from the job_queue."""

      # per-thread private work memory - useless in numpy implementation
      work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
      neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
      jobs_processed = 0
      while True:
        job = job_queue.get()
        if job is None:
          progress_queue.put(None)
          break  # no more jobs => quit this worker
        train_pairs, alpha = job
        tally, raw_tally = self._do_train_job(train_pairs, alpha, (work, neu1))
        progress_queue.put((len(train_pairs), tally, raw_tally))  # report back progress
        jobs_processed += 1
      logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
      """Fill jobs queue using the input `train_pairs` iterator."""
      job_batch, batch_size = [], 0
      pushed_ptrees, pushed_examples = 0, 0
      next_alpha = self.alpha
      job_no = 0

      for train_pair in train_pairs:
        train_pair_length = self._raw_ptree_count([train_pair])

        # can we fit this train_pair into the existing job batch?
        if batch_size + train_pair_length <= self.batch_ptrees:
          # yes => add it to the current job
          job_batch.append(train_pair)
          batch_size += train_pair_length
        else:
          # no => submit the existing job
          logger.debug(
            "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
            job_no, batch_size, len(job_batch), next_alpha)
          job_no += 1
          job_queue.put((job_batch, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # ptrees-based decay
              pushed_ptrees += self._raw_ptree_count(job_batch)
              progress = 1.0 * pushed_ptrees / total_ptrees
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the train_pair that didn't fit as the first item of a new job
          job_batch, batch_size = [train_pair], train_pair_length

      # add the last job too (may be significantly smaller than batch_ptrees)
      if job_batch:
        logger.debug(
          "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
          job_no, batch_size, len(job_batch), next_alpha)
        job_no += 1
        job_queue.put((job_batch, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
          "train() called with an empty iterator (if not intended, "
          "be sure to provide a corpus that offers restartable "
          "iteration = an iterable)."
        )

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't
    # simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
      thread.daemon = True  # make interrupting the process with ctrl+c easier
      thread.start()

    example_count, trained_ptree_count, raw_ptree_count = 0, 0, ptree_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
      report = progress_queue.get()  # blocks if workers too slow
      if report is None:  # a thread reporting that it finished
        unfinished_worker_count -= 1
        logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
        continue
      examples, trained_ptrees, raw_ptrees = report
      job_tally += 1

      # update progress stats
      example_count += examples
      trained_ptree_count += trained_ptrees  # only ptrees in vocab & sampled
      raw_ptree_count += raw_ptrees

      # log progress once every report_delay seconds
      elapsed = default_timer() - start
      if elapsed >= next_report:
        if total_examples:
          # examples-based progress %
          logger.info(
            "PROGRESS: at %.2f%% examples, %.0f ptrees/s, in_qsize %i, out_qsize %i",
            100.0 * example_count / total_examples, trained_ptree_count / elapsed,
            gsutils.qsize(job_queue), gsutils.qsize(progress_queue))
        else:
          # ptrees-based progress %
          logger.info(
            "PROGRESS: at %.2f%% ptrees, %.0f ptrees/s, in_qsize %i, out_qsize %i",
            100.0 * raw_ptree_count / total_ptrees, trained_ptree_count / elapsed,
            gsutils.qsize(job_queue), gsutils.qsize(progress_queue))
        next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
      "training on %i raw ptrees (%i effective ptrees) took %.1fs, %.0f effective ptrees/s",
      raw_ptree_count, trained_ptree_count, elapsed, trained_ptree_count / elapsed)
    if job_tally < 10 * self.workers:
      logger.warning("under 10 jobs per worker: consider setting a smaller `batch_ptrees' for smoother alpha decay")

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
      logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
    if total_ptrees and total_ptrees != raw_ptree_count:
      logger.warning("supplied raw ptree count (%i) did not equal expected count (%i)", raw_ptree_count, total_ptrees)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    return trained_ptree_count
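The learning-rate schedule inside job_producer is a linear interpolation from alpha down to min_alpha, clamped so it never drops below the floor. A minimal standalone sketch of that formula (hypothetical helper name):

def decayed_alpha(alpha, min_alpha, progress):
    # progress is the fraction of the corpus pushed so far (0.0 .. 1.0),
    # computed from examples or ptrees exactly as in job_producer above
    return max(min_alpha, alpha - (alpha - min_alpha) * progress)

# e.g. with alpha=0.025, min_alpha=0.0001:
#   decayed_alpha(0.025, 0.0001, 0.0) -> 0.025
#   decayed_alpha(0.025, 0.0001, 0.5) -> 0.01255
#   decayed_alpha(0.025, 0.0001, 1.0) -> 0.0001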
Example #3
  def train(self,
            sentences,
            total_words=None,
            word_count=0,
            total_examples=None,
            queue_factor=2,
            report_delay=1.0):
    """ Update the model's neural weights from a sequence of sentences (can be a

        once-only generator stream).
        For Word2Vec, each sentence must be a list of unicode strings.
        (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha,
        either total_examples
        (count of sentences) or total_words (count of raw words in sentences)
        should be provided, unless the
        sentences are the same as those that were used to initially build the
        vocabulary.
    """
    logger.info("Starting training.")

    self.neg_labels = []
    if self.negative > 0:
      # precompute negative labels optimization for pure-python training
      self.neg_labels = zeros(self.negative + 1)
      self.neg_labels[0] = 1.

    if FAST_VERSION < 0:
      import warnings
      warnings.warn(
          "C extension not loaded for Word2Vec, training will be slow. "
          "Install a C compiler and reinstall gensim for fast training.")
      # neg_labels was already precomputed above; nothing extra needed here

    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using sg=%s hs=%s sample=%s negative=%s window=%s", self.workers,
        len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample,
        self.negative, self.window)

    if not self.vocab:
      raise RuntimeError(
          "you must first build vocabulary before training the model")
    if not hasattr(self, "syn0"):
      raise RuntimeError(
          "you must first finalize vocabulary before training the model")

    if total_words is None and total_examples is None:
      if self.corpus_count:
        total_examples = self.corpus_count
        logger.info(
            "expecting %i sentences, matching count from corpus used for vocabulary survey",
            total_examples)
      else:
        raise ValueError(
            "you must provide either total_words or total_examples, to enable alpha and progress calculations"
        )

    job_tally = 0

    if self.iter > 1:
      sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
      total_words = total_words and total_words * self.iter
      total_examples = total_examples and total_examples * self.iter

    def worker_loop():
      """Train the model, lifting lists of sentences from the job_queue."""
      work = matutils.zeros_aligned(
          self.layer1_size, dtype=REAL)  # per-thread private work memory
      neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
      jobs_processed = 0
      while True:
        job = job_queue.get()
        if job is None:
          progress_queue.put(None)
          break  # no more jobs => quit this worker
        sentences, pairwise, alpha = job
        tally, raw_tally = self._do_train_job(sentences, pairwise, alpha,
                                              (work, neu1))
        progress_queue.put(
            (len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
      logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
      """Fill jobs queue using the input `sentences` iterator."""
      job_batch, batch_size = [], 0
      pushed_words, pushed_examples = 0, 0
      next_alpha = self.alpha
      if next_alpha > self.min_alpha_yet_reached:
        logger.warn("Effective 'alpha' higher than previous training cycles")
      self.min_alpha_yet_reached = next_alpha
      job_no = 0

      for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
          # yes => add it to the current job
          job_batch.append(sentence)
          batch_size += sentence_length
        else:
          # no => submit the existing job
          pair_idx = list(
              numpy.random.choice(
                  range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
          pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
          logger.debug(
              "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
              job_no, batch_size, len(job_batch), len(pairwise_samples),
              next_alpha)
          job_no += 1
          job_queue.put((job_batch, pairwise_samples, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # words-based decay
              pushed_words += self._raw_word_count(job_batch)
              progress = 1.0 * pushed_words / total_words
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the sentence that didn't fit as the first item of a new job
          job_batch, batch_size = [sentence], sentence_length

      # add the last job too (may be significantly smaller than batch_words)
      if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints),
            next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [
        threading.Thread(target=worker_loop) for _ in xrange(self.workers)
    ]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
      thread.daemon = True  # make interrupting the process with ctrl+c easier
      thread.start()

    example_count, trained_word_count, raw_word_count = 0, 0, word_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
      report = progress_queue.get()  # blocks if workers too slow
      if report is None:  # a thread reporting that it finished
        unfinished_worker_count -= 1
        logger.info(
            "worker thread finished; awaiting finish of %i more threads",
            unfinished_worker_count)
        continue
      examples, trained_words, raw_words = report
      job_tally += 1

      # update progress stats
      example_count += examples
      trained_word_count += trained_words  # only words in vocab & sampled
      raw_word_count += raw_words

      # log progress once every report_delay seconds
      elapsed = default_timer() - start
      if elapsed >= next_report:
        if total_examples:
          # examples-based progress %
          logger.info(
              "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
              100.0 * example_count / total_examples,
              trained_word_count / elapsed, utils.qsize(job_queue),
              utils.qsize(progress_queue))
        else:
          # words-based progress %
          logger.info(
              "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
              100.0 * raw_word_count / total_words,
              trained_word_count / elapsed, utils.qsize(job_queue),
              utils.qsize(progress_queue))
        next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
        "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
        raw_word_count, trained_word_count, elapsed,
        trained_word_count / elapsed)
    if job_tally < 10 * self.workers:
      logger.warning(
          "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
      )

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
      logger.warning(
          "supplied example count (%i) did not equal expected count (%i)",
          example_count, total_examples)
    if total_words and total_words != raw_word_count:
      logger.warning(
          "supplied raw word count (%i) did not equal expected count (%i)",
          raw_word_count, total_words)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    self.clear_sims()
    return trained_word_count
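The producer/worker handshake above relies on sentinels: the producer puts one None per worker on the job queue, and each worker echoes a None back on the progress queue as it exits, which is how the main loop knows all threads are done. A self-contained sketch of the same pattern, stripped of the model-specific details (all names are illustrative):

import threading
from queue import Queue  # Python 2: from Queue import Queue

def run(jobs, num_workers=4):
    job_queue = Queue(maxsize=2 * num_workers)  # bounded, like queue_factor * workers
    progress_queue = Queue()

    def worker_loop():
        while True:
            job = job_queue.get()
            if job is None:                 # sentinel: no more work
                progress_queue.put(None)
                break
            progress_queue.put(len(job))    # a stand-in "progress report"

    def job_producer():
        for job in jobs:
            job_queue.put(job)
        for _ in range(num_workers):        # one sentinel per worker
            job_queue.put(None)

    threads = [threading.Thread(target=worker_loop) for _ in range(num_workers)]
    threads.append(threading.Thread(target=job_producer))
    for t in threads:
        t.daemon = True
        t.start()

    done, total = 0, 0
    while done < num_workers:               # drain until every worker signs off
        report = progress_queue.get()
        if report is None:
            done += 1
        else:
            total += report
    return total

# run([[1, 2], [3], [4, 5, 6]]) -> 6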
Example #4
    def train(self,
              train_pairs,
              total_ptrees=None,
              ptree_count=0,
              total_examples=None,
              queue_factor=2,
              report_delay=1.0):
        """
    An example is a pair of a complete AST tree and an utterance.
    A ptree is a partial tree.
    """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension not loaded; training will be slow.")
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1 +
                                    NEG_SAMPLING_VOCAB_SIZE_THRESHOLD)
            self.neg_labels[0] = 1.

        logger.info("training model with %i workers", self.workers)

        if not self.vocab_i or not self.vocab_k or not self.vocab_l or not self.vocab_r:
            raise RuntimeError(
                "you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0i') or not hasattr(
                self, 'syn0k') or not hasattr(self, 'syn0l'):
            raise RuntimeError(
                "you must first finalize vocabulary before training the model")

        if total_ptrees is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info(
                    "expecting %i train pairs, matching count from corpus used for vocabulary survey",
                    total_examples)
            else:
                raise ValueError(
                    "you must provide either total_ptrees or total_examples, to enable alpha and progress calculations"
                )

        job_tally = 0

        if self.iter > 1:
            train_pairs = gsutils.RepeatCorpusNTimes(train_pairs, self.iter)
            total_ptrees = total_ptrees and total_ptrees * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_loop():
            """Train the model, lifting lists of train_pairs from the job_queue."""

            # per-thread private work memory - useless in numpy implementation
            work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                train_pairs, alpha = job
                tally, raw_tally = self._do_train_job(train_pairs, alpha,
                                                      (work, neu1))
                progress_queue.put((len(train_pairs), tally,
                                    raw_tally))  # report back progress
                jobs_processed += 1
            logger.debug("worker exiting, processed %i jobs", jobs_processed)

        def job_producer():
            """Fill jobs queue using the input `train_pairs` iterator."""
            job_batch, batch_size = [], 0
            pushed_ptrees, pushed_examples = 0, 0
            next_alpha = self.alpha
            job_no = 0

            for train_pair in train_pairs:
                train_pair_length = self._raw_ptree_count([train_pair])

                # can we fit this train_pair into the existing job batch?
                if batch_size + train_pair_length <= self.batch_ptrees:
                    # yes => add it to the current job
                    job_batch.append(train_pair)
                    batch_size += train_pair_length
                else:
                    # no => submit the existing job
                    logger.debug(
                        "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                        job_no, batch_size, len(job_batch), next_alpha)
                    job_no += 1
                    job_queue.put((job_batch, next_alpha))

                    # update the learning rate for the next job
                    if self.min_alpha < next_alpha:
                        if total_examples:
                            # examples-based decay
                            pushed_examples += len(job_batch)
                            progress = 1.0 * pushed_examples / total_examples
                        else:
                            # ptrees-based decay
                            pushed_ptrees += self._raw_ptree_count(job_batch)
                            progress = 1.0 * pushed_ptrees / total_ptrees
                        next_alpha = self.alpha - (self.alpha -
                                                   self.min_alpha) * progress
                        next_alpha = max(self.min_alpha, next_alpha)

                    # add the train_pair that didn't fit as the first item of a new job
                    job_batch, batch_size = [train_pair], train_pair_length

            # add the last job too (may be significantly smaller than batch_ptrees)
            if job_batch:
                logger.debug(
                    "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                    job_no, batch_size, len(job_batch), next_alpha)
                job_no += 1
                job_queue.put((job_batch, next_alpha))

            if job_no == 0 and self.train_count == 0:
                logger.warning(
                    "train() called with an empty iterator (if not intended, "
                    "be sure to provide a corpus that offers restartable "
                    "iteration = an iterable).")

            # give the workers heads up that they can finish -- no more work!
            for _ in xrange(self.workers):
                job_queue.put(None)
            logger.debug("job loop exiting, total %i jobs", job_no)

        # buffer ahead only a limited number of jobs.. this is the reason we can't
        # simply use ThreadPool :(
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [
            threading.Thread(target=worker_loop) for _ in xrange(self.workers)
        ]
        unfinished_worker_count = len(workers)
        workers.append(threading.Thread(target=job_producer))

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        example_count, trained_ptree_count, raw_ptree_count = 0, 0, ptree_count
        start, next_report = default_timer() - 0.00001, 1.0

        while unfinished_worker_count > 0:
            report = progress_queue.get()  # blocks if workers too slow
            if report is None:  # a thread reporting that it finished
                unfinished_worker_count -= 1
                logger.info(
                    "worker thread finished; awaiting finish of %i more threads",
                    unfinished_worker_count)
                continue
            examples, trained_ptrees, raw_ptrees = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_ptree_count += trained_ptrees  # only ptrees in vocab & sampled
            raw_ptree_count += raw_ptrees

            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                if total_examples:
                    # examples-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% examples, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                        100.0 * example_count / total_examples,
                        trained_ptree_count / elapsed,
                        gsutils.qsize(job_queue),
                        gsutils.qsize(progress_queue))
                else:
                    # ptrees-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% ptrees, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                        100.0 * raw_ptree_count / total_ptrees,
                        trained_ptree_count / elapsed,
                        gsutils.qsize(job_queue),
                        gsutils.qsize(progress_queue))
                next_report = elapsed + report_delay

        # all done; report the final stats
        elapsed = default_timer() - start
        logger.info(
            "training on %i raw ptrees (%i effective ptrees) took %.1fs, %.0f effective ptrees/s",
            raw_ptree_count, trained_ptree_count, elapsed,
            trained_ptree_count / elapsed)
        if job_tally < 10 * self.workers:
            logger.warning(
                "under 10 jobs per worker: consider setting a smaller `batch_ptrees' for smoother alpha decay"
            )

        # check that the input corpus hasn't changed during iteration
        if total_examples and total_examples != example_count:
            logger.warning(
                "supplied example count (%i) did not equal expected count (%i)",
                example_count, total_examples)
        if total_ptrees and total_ptrees != raw_ptree_count:
            logger.warning(
                "supplied raw ptree count (%i) did not equal expected count (%i)",
                raw_ptree_count, total_ptrees)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        return trained_ptree_count
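The batching policy in job_producer (examples #2 and #4) is a greedy fill: keep appending items while the batch stays under the cap, flush, then start the next batch with the item that overflowed, and flush whatever remains at the end. Isolated as a generator for clarity (hypothetical names; the real code measures size via self._raw_ptree_count and caps at self.batch_ptrees):

def greedy_batches(items, size_of, max_size):
    batch, batch_size = [], 0
    for item in items:
        n = size_of(item)
        if batch_size + n <= max_size:
            batch.append(item)
            batch_size += n
        else:
            yield batch                    # flush the full batch
            batch, batch_size = [item], n  # overflow item starts the next one
    if batch:
        yield batch                        # final batch, possibly much smaller

# e.g. list(greedy_batches([3, 4, 2, 5], size_of=lambda x: x, max_size=6))
#   -> [[3], [4, 2], [5]]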