def test_batch_generator(self):
        """Both batching helpers should split the sample collection into 10 batches."""
        postings = JobPostingCollectionSample()
        lazy_batches = batches_generator(postings, 10)
        # batches_generator() yields lazily, so its result is itself an iterator.
        assert isinstance(lazy_batches, collections.abc.Iterator)
        assert len(list(lazy_batches)) == 10

        postings = JobPostingCollectionSample()
        eager_batches = tuple(BatchGenerator(postings, 10))
        # BatchGenerator() batches can be materialized eagerly into a tuple.
        assert isinstance(eager_batches, tuple)
        assert len(list(eager_batches)) == 10
# Example #2
    def train(self, lookup=False, *args, **kwargs):
        """Train the embedding model, then attach lookup table and metadata.

        For word2vec the corpus is consumed in batches, growing the vocabulary
        incrementally; for doc2vec the whole corpus is trained in one pass.

        Args:
            lookup (bool): doc2vec only — copy the corpus generator's lookup
                table onto the model after training.
            args, kwargs: forwarded to the underlying gensim ``train()`` call.
        """
        if self.model_type == 'word2vec':
            # A model that already has a vocabulary must be updated, not rebuilt.
            if self._model.wv.vocab:
                logging.info("Model has been trained")
                self.update = True

            batches = batches_generator(self.corpus_generator, self.batch_size)
            for batch_num, raw_batch in enumerate(batches, start=1):
                batch = Reiterable(raw_batch)
                logging.info("Training batch #{} ".format(batch_num))
                # First build is from scratch (update=False); every later one extends.
                self._model.build_vocab(batch, update=self.update)
                self.update = True

                self._model.train(batch,
                                  total_examples=self._model.corpus_count,
                                  epochs=self._model.iter,
                                  *args,
                                  **kwargs)
                self.vocab_size_cumu.append(len(self._model.wv.vocab))
                logging.info('\n')

        elif self.model_type == 'doc2vec':
            corpus_gen = self.corpus_generator
            reusable_corpus = Reiterable(corpus_gen)
            self._model.build_vocab(reusable_corpus)
            self._model.train(reusable_corpus,
                              total_examples=self._model.corpus_count,
                              epochs=self._model.iter,
                              *args,
                              **kwargs)
            if lookup:
                self.lookup_dict = corpus_gen.lookup
                self._model.lookup_dict = self.lookup_dict

        self._model.metadata = self.metadata
        self._model.model_name = self.model_name
    def train(self, size=500, min_count=3, iter=4, window=6, workers=3, **kwargs):
        """Train an embedding model over the configured job postings, then upload it.

        NOTE(review): the ``iter`` parameter shadows the builtin; kept as-is
        because it is part of the public interface (and mirrors gensim's own
        keyword of the same name).

        Args:
            size, min_count, iter, window, workers: gensim model hyperparameters.
            kwargs: any further arguments the gensim model constructor accepts.
        """
        job_postings_generator = job_postings_chain(self.s3_conn, self.quarters, self.jp_s3_path, source=self.source)

        if self.model_type == 'word2vec':
            # Reuse a previously built model when present; otherwise start fresh.
            if self._model:
                logging.info("Model existed")
                model = self._model
                self.update = True
            else:
                model = Word2Vec(size=size, min_count=min_count, iter=iter, window=window, workers=workers, **kwargs)

            batches = batches_generator(Word2VecGensimCorpusCreator(job_postings_generator), self.batch_size)
            for batch_num, raw_batch in enumerate(batches, start=1):
                batch = Reiterable(raw_batch)
                logging.info("Training batch #{} ".format(batch_num))
                # First build is from scratch (update=False); every later one extends.
                model.build_vocab(batch, update=self.update)
                self.update = True

                model.train(batch, total_examples=model.corpus_count, epochs=model.iter)
                self.vocab_size_cumu.append(len(model.wv.vocab))
                logging.info('\n')

        elif self.model_type == 'doc2vec':
            model = Doc2Vec(size=size, min_count=min_count, iter=iter, window=window, workers=workers, **kwargs)
            corpus_gen = Doc2VecGensimCorpusCreator(job_postings_generator)
            reusable_corpus = Reiterable(corpus_gen)
            model.build_vocab(reusable_corpus)
            model.train(reusable_corpus, total_examples=model.corpus_count, epochs=model.iter)
            self._lookup = corpus_gen.lookup

        self._model = model
        self._upload()