Example #1
 def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
                   raw_word_count, total_words, trained_word_count, elapsed):
     if total_examples:
         # examples-based progress %
         logger.info(
             "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
             cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
             utils.qsize(job_queue), utils.qsize(progress_queue)
         )
     else:
         # words-based progress %
         logger.info(
             "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
             cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
             utils.qsize(job_queue), utils.qsize(progress_queue)
         )
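The in_qsize/out_qsize figures in these log lines come from a small helper, because Queue.qsize() raises NotImplementedError on some platforms (notably macOS). A minimal sketch of such a helper, in the spirit of gensim's utils.qsize, which degrades to a -1 sentinel:

def qsize(queue):
    """Return the (approximate) queue size; -1 on platforms without qsize()."""
    try:
        return queue.qsize()
    except NotImplementedError:
        # e.g. macOS, where the underlying sem_getvalue() is unimplemented
        return -1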
Example #2
  def train(self, train_pairs, total_ptrees=None, ptree_count=0,
            total_examples=None, queue_factor=2, report_delay=1.0):
    """
    An example is a pair of a complete AST tree and an utterance.
    A ptree is a partial tree.
    """
    if FAST_VERSION < 0:
      import warnings
      warnings.warn("C extension not loaded; training will be slow.")
      # precompute negative labels optimization for pure-python training
      self.neg_labels = zeros(self.negative + 1 + NEG_SAMPLING_VOCAB_SIZE_THRESHOLD)
      self.neg_labels[0] = 1.

    logger.info(
        "training model with %i workers",
        self.workers)

    if not self.vocab_i or not self.vocab_k or not self.vocab_l or not self.vocab_r:
      raise RuntimeError("you must first build vocabulary before training the model")
    if not hasattr(self, 'syn0i') or not hasattr(self, 'syn0k') or not hasattr(self, 'syn0l'):
      raise RuntimeError("you must first finalize vocabulary before training the model")

    if total_ptrees is None and total_examples is None:
      if self.corpus_count:
        total_examples = self.corpus_count
        logger.info("expecting %i train pairs, matching count from corpus used for vocabulary survey", total_examples)
      else:
        raise ValueError("you must provide either total_ptrees or total_examples, to enable alpha and progress calculations")

    job_tally = 0

    if self.iter > 1:
      train_pairs = gsutils.RepeatCorpusNTimes(train_pairs, self.iter)
      total_ptrees = total_ptrees and total_ptrees * self.iter
      total_examples = total_examples and total_examples * self.iter

    def worker_loop():
      """Train the model, lifting lists of train_pairs from the job_queue."""

      # per-thread private work memory - useless in numpy implementation
      work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
      neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
      jobs_processed = 0
      while True:
        job = job_queue.get()
        if job is None:
          progress_queue.put(None)
          break  # no more jobs => quit this worker
        train_pairs, alpha = job
        tally, raw_tally = self._do_train_job(train_pairs, alpha, (work, neu1))
        progress_queue.put((len(train_pairs), tally, raw_tally))  # report back progress
        jobs_processed += 1
      logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
      """Fill jobs queue using the input `train_pairs` iterator."""
      job_batch, batch_size = [], 0
      pushed_ptrees, pushed_examples = 0, 0
      next_alpha = self.alpha
      job_no = 0

      for train_pair in train_pairs:
        train_pair_length = self._raw_ptree_count([train_pair])

        # can we fit this train_pair into the existing job batch?
        if batch_size + train_pair_length <= self.batch_ptrees:
          # yes => add it to the current job
          job_batch.append(train_pair)
          batch_size += train_pair_length
        else:
          # no => submit the existing job
          logger.debug(
            "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
            job_no, batch_size, len(job_batch), next_alpha)
          job_no += 1
          job_queue.put((job_batch, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # ptrees-based decay
              pushed_ptrees += self._raw_ptree_count(job_batch)
              progress = 1.0 * pushed_ptrees / total_ptrees
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the train_pair that didn't fit as the first item of a new job
          job_batch, batch_size = [train_pair], train_pair_length

      # add the last job too (may be significantly smaller than batch_ptrees)
      if job_batch:
        logger.debug(
          "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
          job_no, batch_size, len(job_batch), next_alpha)
        job_no += 1
        job_queue.put((job_batch, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
          "train() called with an empty iterator (if not intended, "
          "be sure to provide a corpus that offers restartable "
          "iteration = an iterable)."
        )

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't
    # simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
      thread.daemon = True  # make interrupting the process with ctrl+c easier
      thread.start()

    example_count, trained_ptree_count, raw_ptree_count = 0, 0, ptree_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
      report = progress_queue.get()  # blocks if workers too slow
      if report is None:  # a thread reporting that it finished
        unfinished_worker_count -= 1
        logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
        continue
      examples, trained_ptrees, raw_ptrees = report
      job_tally += 1

      # update progress stats
      example_count += examples
      trained_ptree_count += trained_ptrees  # only ptrees in vocab & sampled
      raw_ptree_count += raw_ptrees

      # log progress once every report_delay seconds
      elapsed = default_timer() - start
      if elapsed >= next_report:
        if total_examples:
          # examples-based progress %
          logger.info(
            "PROGRESS: at %.2f%% examples, %.0f ptrees/s, in_qsize %i, out_qsize %i",
            100.0 * example_count / total_examples, trained_ptree_count / elapsed,
            gsutils.qsize(job_queue), gsutils.qsize(progress_queue))
        else:
          # ptrees-based progress %
          logger.info(
            "PROGRESS: at %.2f%% ptrees, %.0f ptrees/s, in_qsize %i, out_qsize %i",
            100.0 * raw_ptree_count / total_ptrees, trained_ptree_count / elapsed,
            gsutils.qsize(job_queue), gsutils.qsize(progress_queue))
        next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
      "training on %i raw ptrees (%i effective ptrees) took %.1fs, %.0f effective ptrees/s",
      raw_ptree_count, trained_ptree_count, elapsed, trained_ptree_count / elapsed)
    if job_tally < 10 * self.workers:
      logger.warn("under 10 jobs per worker: consider setting a smaller `batch_ptrees' for smoother alpha decay")

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
      logger.warn("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
    if total_ptrees and total_ptrees != raw_ptree_count:
      logger.warn("supplied raw word count (%i) did not equal expected count (%i)", raw_ptree_count, total_ptrees)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    return trained_ptree_count
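The learning-rate update buried in job_producer is a linear interpolation from the starting alpha down to min_alpha, driven by how much of the (possibly repeated) corpus has been pushed so far, and clamped so it never drops below min_alpha. The same arithmetic as a standalone sketch (the function and argument names here are illustrative, not part of the API):

def decayed_alpha(alpha, min_alpha, pushed, total):
    # linear decay from alpha to min_alpha, clamped -- mirrors job_producer above
    progress = 1.0 * pushed / total
    return max(min_alpha, alpha - (alpha - min_alpha) * progress)

# e.g. decayed_alpha(0.025, 0.0001, 50, 100) -> 0.01255, halfway between the two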
Example #3
  def train(self,
            sentences,
            total_words=None,
            word_count=0,
            total_examples=None,
            queue_factor=2,
            report_delay=1.0):
    """ Update the model's neural weights from a sequence of sentences (can be a

        once-only generator stream).
        For Word2Vec, each sentence must be a list of unicode strings.
        (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha,
        either total_examples
        (count of sentences) or total_words (count of raw words in sentences)
        should be provided, unless the
        sentences are the same as those that were used to initially build the
        vocabulary.
    """
    logger.info("Starting training.")

    self.neg_labels = []
    if self.negative > 0:
      # precompute negative labels optimization for pure-python training
      self.neg_labels = zeros(self.negative + 1)
      self.neg_labels[0] = 1.

    if FAST_VERSION < 0:
      import warnings
      warnings.warn(
          "C extension not loaded for Word2Vec, training will be slow. "
          "Install a C compiler and reinstall gensim for fast training.")
      # neg_labels were already precomputed above for the pure-python path

    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using sg=%s hs=%s sample=%s negative=%s window=%s", self.workers,
        len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample,
        self.negative, self.window)

    if not self.vocab:
      raise RuntimeError(
          "you must first build vocabulary before training the model")
    if not hasattr(self, "syn0"):
      raise RuntimeError(
          "you must first finalize vocabulary before training the model")

    if total_words is None and total_examples is None:
      if self.corpus_count:
        total_examples = self.corpus_count
        logger.info(
            "expecting %i sentences, matching count from corpus used for vocabulary survey",
            total_examples)
      else:
        raise ValueError(
            "you must provide either total_words or total_examples, to enable alpha and progress calculations"
        )

    job_tally = 0

    if self.iter > 1:
      sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
      total_words = total_words and total_words * self.iter
      total_examples = total_examples and total_examples * self.iter

    def worker_loop():
      """Train the model, lifting lists of sentences from the job_queue."""
      work = matutils.zeros_aligned(
          self.layer1_size, dtype=REAL)  # per-thread private work memory
      neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
      jobs_processed = 0
      while True:
        job = job_queue.get()
        if job is None:
          progress_queue.put(None)
          break  # no more jobs => quit this worker
        sentences, pairwise, alpha = job
        tally, raw_tally = self._do_train_job(sentences, pairwise, alpha,
                                              (work, neu1))
        progress_queue.put(
            (len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
      logger.debug("worker exiting, processed %i jobs", jobs_processed)

    def job_producer():
      """Fill jobs queue using the input `sentences` iterator."""
      job_batch, batch_size = [], 0
      pushed_words, pushed_examples = 0, 0
      next_alpha = self.alpha
      if next_alpha > self.min_alpha_yet_reached:
        logger.warn("Effective 'alpha' higher than previous training cycles")
      self.min_alpha_yet_reached = next_alpha
      job_no = 0

      for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
          # yes => add it to the current job
          job_batch.append(sentence)
          batch_size += sentence_length
        else:
          # no => submit the existing job
          pair_idx = list(
              numpy.random.choice(
                  range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
          pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
          logger.debug(
              "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
              job_no, batch_size, len(job_batch), len(pairwise_samples),
              next_alpha)
          job_no += 1
          job_queue.put((job_batch, pairwise_samples, next_alpha))

          # update the learning rate for the next job
          if self.min_alpha < next_alpha:
            if total_examples:
              # examples-based decay
              pushed_examples += len(job_batch)
              progress = 1.0 * pushed_examples / total_examples
            else:
              # words-based decay
              pushed_words += self._raw_word_count(job_batch)
              progress = 1.0 * pushed_words / total_words
            next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
            next_alpha = max(self.min_alpha, next_alpha)

          # add the sentence that didn't fit as the first item of a new job
          job_batch, batch_size = [sentence], sentence_length

      # add the last job too (may be significantly smaller than batch_words)
      if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints),
            next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

      if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

      # give the workers heads up that they can finish -- no more work!
      for _ in xrange(self.workers):
        job_queue.put(None)
      logger.debug("job loop exiting, total %i jobs", job_no)

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    job_queue = Queue(maxsize=queue_factor * self.workers)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [
        threading.Thread(target=worker_loop) for _ in xrange(self.workers)
    ]
    unfinished_worker_count = len(workers)
    workers.append(threading.Thread(target=job_producer))

    for thread in workers:
      thread.daemon = True  # make interrupting the process with ctrl+c easier
      thread.start()

    example_count, trained_word_count, raw_word_count = 0, 0, word_count
    start, next_report = default_timer() - 0.00001, 1.0

    while unfinished_worker_count > 0:
      report = progress_queue.get()  # blocks if workers too slow
      if report is None:  # a thread reporting that it finished
        unfinished_worker_count -= 1
        logger.info(
            "worker thread finished; awaiting finish of %i more threads",
            unfinished_worker_count)
        continue
      examples, trained_words, raw_words = report
      job_tally += 1

      # update progress stats
      example_count += examples
      trained_word_count += trained_words  # only words in vocab & sampled
      raw_word_count += raw_words

      # log progress once every report_delay seconds
      elapsed = default_timer() - start
      if elapsed >= next_report:
        if total_examples:
          # examples-based progress %
          logger.info(
              "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
              100.0 * example_count / total_examples,
              trained_word_count / elapsed, utils.qsize(job_queue),
              utils.qsize(progress_queue))
        else:
          # words-based progress %
          logger.info(
              "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
              100.0 * raw_word_count / total_words,
              trained_word_count / elapsed, utils.qsize(job_queue),
              utils.qsize(progress_queue))
        next_report = elapsed + report_delay

    # all done; report the final stats
    elapsed = default_timer() - start
    logger.info(
        "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
        raw_word_count, trained_word_count, elapsed,
        trained_word_count / elapsed)
    if job_tally < 10 * self.workers:
      logger.warn(
          "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
      )

    # check that the input corpus hasn't changed during iteration
    if total_examples and total_examples != example_count:
      logger.warn(
          "supplied example count (%i) did not equal expected count (%i)",
          example_count, total_examples)
    if total_words and total_words != raw_word_count:
      logger.warn(
          "supplied raw word count (%i) did not equal expected count (%i)",
          raw_word_count, total_words)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    self.clear_sims()
    return trained_word_count
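Note the empty-iterator warning: when self.iter > 1 the corpus is wrapped in RepeatCorpusNTimes, so the input must support repeated iteration; a one-shot generator is exhausted after the first pass. A minimal restartable corpus, sketched here as an assumption about the expected input shape (gensim ships comparable helpers such as LineSentence):

class RestartableCorpus(object):
    """Re-iterable corpus: every __iter__ call re-opens the source file."""

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path) as fin:
            for line in fin:
                yield line.split()  # one whitespace-tokenized sentence per line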
Example #4
    def train(self,
              train_pairs,
              total_ptrees=None,
              ptree_count=0,
              total_examples=None,
              queue_factor=2,
              report_delay=1.0):
        """
    An example is a pair of a complete AST tree and an utterance.
    A ptree is a partial tree.
    """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension not loaded; training will be slow.")
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1 +
                                    NEG_SAMPLING_VOCAB_SIZE_THRESHOLD)
            self.neg_labels[0] = 1.

        logger.info("training model with %i workers", self.workers)

        if not self.vocab_i or not self.vocab_k or not self.vocab_l or not self.vocab_r:
            raise RuntimeError(
                "you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0i') or not hasattr(
                self, 'syn0k') or not hasattr(self, 'syn0l'):
            raise RuntimeError(
                "you must first finalize vocabulary before training the model")

        if total_ptrees is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info(
                    "expecting %i train pairs, matching count from corpus used for vocabulary survey",
                    total_examples)
            else:
                raise ValueError(
                    "you must provide either total_ptrees or total_examples, to enable alpha and progress calculations"
                )

        job_tally = 0

        if self.iter > 1:
            train_pairs = gsutils.RepeatCorpusNTimes(train_pairs, self.iter)
            total_ptrees = total_ptrees and total_ptrees * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_loop():
            """Train the model, lifting lists of train_pairs from the job_queue."""

            # per-thread private work memory - useless in numpy implementation
            work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                train_pairs, alpha = job
                tally, raw_tally = self._do_train_job(train_pairs, alpha,
                                                      (work, neu1))
                progress_queue.put((len(train_pairs), tally,
                                    raw_tally))  # report back progress
                jobs_processed += 1
            logger.debug("worker exiting, processed %i jobs", jobs_processed)

        def job_producer():
            """Fill jobs queue using the input `train_pairs` iterator."""
            job_batch, batch_size = [], 0
            pushed_ptrees, pushed_examples = 0, 0
            next_alpha = self.alpha
            job_no = 0

            for train_pair in train_pairs:
                train_pair_length = self._raw_ptree_count([train_pair])

                # can we fit this train_pair into the existing job batch?
                if batch_size + train_pair_length <= self.batch_ptrees:
                    # yes => add it to the current job
                    job_batch.append(train_pair)
                    batch_size += train_pair_length
                else:
                    # no => submit the existing job
                    logger.debug(
                        "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                        job_no, batch_size, len(job_batch), next_alpha)
                    job_no += 1
                    job_queue.put((job_batch, next_alpha))

                    # update the learning rate for the next job
                    if self.min_alpha < next_alpha:
                        if total_examples:
                            # examples-based decay
                            pushed_examples += len(job_batch)
                            progress = 1.0 * pushed_examples / total_examples
                        else:
                            # ptrees-based decay
                            pushed_ptrees += self._raw_ptree_count(job_batch)
                            progress = 1.0 * pushed_ptrees / total_ptrees
                        next_alpha = self.alpha - (self.alpha -
                                                   self.min_alpha) * progress
                        next_alpha = max(self.min_alpha, next_alpha)

                    # add the train_pair that didn't fit as the first item of a new job
                    job_batch, batch_size = [train_pair], train_pair_length

            # add the last job too (may be significantly smaller than batch_ptrees)
            if job_batch:
                logger.debug(
                    "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                    job_no, batch_size, len(job_batch), next_alpha)
                job_no += 1
                job_queue.put((job_batch, next_alpha))

            if job_no == 0 and self.train_count == 0:
                logger.warning(
                    "train() called with an empty iterator (if not intended, "
                    "be sure to provide a corpus that offers restartable "
                    "iteration = an iterable).")

            # give the workers heads up that they can finish -- no more work!
            for _ in xrange(self.workers):
                job_queue.put(None)
            logger.debug("job loop exiting, total %i jobs", job_no)

        # buffer ahead only a limited number of jobs.. this is the reason we can't
        # simply use ThreadPool :(
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [
            threading.Thread(target=worker_loop) for _ in xrange(self.workers)
        ]
        unfinished_worker_count = len(workers)
        workers.append(threading.Thread(target=job_producer))

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        example_count, trained_ptree_count, raw_ptree_count = 0, 0, ptree_count
        start, next_report = default_timer() - 0.00001, 1.0

        while unfinished_worker_count > 0:
            report = progress_queue.get()  # blocks if workers too slow
            if report is None:  # a thread reporting that it finished
                unfinished_worker_count -= 1
                logger.info(
                    "worker thread finished; awaiting finish of %i more threads",
                    unfinished_worker_count)
                continue
            examples, trained_ptrees, raw_ptrees = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_ptree_count += trained_ptrees  # only ptrees in vocab & sampled
            raw_ptree_count += raw_ptrees

            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                if total_examples:
                    # examples-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% examples, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                        100.0 * example_count / total_examples,
                        trained_ptree_count / elapsed,
                        gsutils.qsize(job_queue),
                        gsutils.qsize(progress_queue))
                else:
                    # ptrees-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% ptrees, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                        100.0 * raw_ptree_count / total_ptrees,
                        trained_ptree_count / elapsed,
                        gsutils.qsize(job_queue),
                        gsutils.qsize(progress_queue))
                next_report = elapsed + report_delay

        # all done; report the final stats
        elapsed = default_timer() - start
        logger.info(
            "training on %i raw ptrees (%i effective ptrees) took %.1fs, %.0f effective ptrees/s",
            raw_ptree_count, trained_ptree_count, elapsed,
            trained_ptree_count / elapsed)
        if job_tally < 10 * self.workers:
            logger.warn(
                "under 10 jobs per worker: consider setting a smaller `batch_ptrees' for smoother alpha decay"
            )

        # check that the input corpus hasn't changed during iteration
        if total_examples and total_examples != example_count:
            logger.warn(
                "supplied example count (%i) did not equal expected count (%i)",
                example_count, total_examples)
        if total_ptrees and total_ptrees != raw_ptree_count:
            logger.warn(
                "supplied raw word count (%i) did not equal expected count (%i)",
                raw_ptree_count, total_ptrees)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        return trained_ptree_count
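All of the train() variants above share one producer/consumer skeleton: a bounded job queue (so the producer cannot buffer the whole corpus), exactly one None sentinel per worker to signal shutdown, workers echoing None back on the progress queue, and a main loop that drains reports until every worker has signalled. A distilled, self-contained sketch of that skeleton (Python 3 spellings; run, jobs, and the per-job tally are placeholders, not part of the snippets above):

import threading
from queue import Queue


def run(jobs, workers=4, queue_factor=2):
    job_queue = Queue(maxsize=queue_factor * workers)  # bounded: producer can't race ahead
    progress_queue = Queue()

    def worker_loop():
        while True:
            job = job_queue.get()
            if job is None:               # sentinel: no more work for this worker
                progress_queue.put(None)
                break
            progress_queue.put(len(job))  # stand-in for _do_train_job()

    threads = [threading.Thread(target=worker_loop, daemon=True) for _ in range(workers)]
    for t in threads:
        t.start()

    for job in jobs:                      # inline producer (the real code uses a thread)
        job_queue.put(job)
    for _ in range(workers):              # exactly one sentinel per worker
        job_queue.put(None)

    unfinished, tally = workers, 0
    while unfinished > 0:                 # drain until every worker has signalled
        report = progress_queue.get()
        if report is None:
            unfinished -= 1
        else:
            tally += report
    return tally


# e.g. run([[1, 2], [3], [4, 5, 6]], workers=2) -> 6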