def test_batch_generator(self):
    """Batching helpers: batches_generator returns an iterator of batches;
    BatchGenerator yields batches that can be collected into a tuple."""
    job_postings = JobPostingCollectionSample()
    batch_iter = batches_generator(job_postings, 10)
    # batches_generator() itself is lazy — it returns an iterator of batches
    assert isinstance(batch_iter, collections.abc.Iterator)
    batch_iter = list(batch_iter)
    # With batch size 10 the sample collection splits into 10 batches
    # (presumably the sample holds ~100 postings — verify against fixture)
    assert len(batch_iter) == 10
    job_postings = JobPostingCollectionSample()
    batch_tuple = tuple(BatchGenerator(job_postings, 10))
    # BatchGenerator is iterable, so its batches collect into a tuple
    assert isinstance(batch_tuple, tuple)
    assert len(list(batch_tuple)) == 10
def train(self, lookup=False, *args, **kwargs):
    """Train the embedding model held in ``self._model``.

    word2vec: the corpus is consumed in batches via ``batches_generator``
    and the vocabulary is grown incrementally. doc2vec: the whole corpus
    is used in one pass, optionally attaching the corpus generator's
    lookup table to the model. Finally the model is stamped with the
    instance's metadata and model name.

    Args:
        lookup (bool): doc2vec only — when True, copy
            ``corpus_gen.lookup`` onto ``self.lookup_dict`` and the model.
        *args, **kwargs: forwarded to the gensim model's ``train()``
            (e.g. arguments accepted by gensim.models.doc2vec.Doc2Vec).
    """
    if self.model_type == 'word2vec':
        if self._model.wv.vocab:
            # A non-empty vocab means this model was trained before, so
            # every build_vocab call below must be an update.
            logging.info("Model has been trained")
            self.update = True
        batch_iter = 1  # 1-based counter used only for log messages
        batch_gen = batches_generator(self.corpus_generator, self.batch_size)
        for batch in batch_gen:
            # gensim iterates the data more than once (vocab scan +
            # training epochs); wrap the one-shot batch so it re-iterates.
            batch = Reiterable(batch)
            logging.info("Training batch #{} ".format(batch_iter))
            if not self.update:
                # Very first batch of a fresh model: build vocab from scratch.
                self._model.build_vocab(batch, update=False)
                self.update = True
            else:
                # Later batches extend the existing vocabulary in place.
                self._model.build_vocab(batch, update=True)
            self._model.train(batch, total_examples=self._model.corpus_count, epochs=self._model.iter, *args, **kwargs)
            # Record cumulative vocabulary size after each batch.
            self.vocab_size_cumu.append(len(self._model.wv.vocab))
            batch_iter += 1
            logging.info('\n')
    elif self.model_type == 'doc2vec':
        corpus_gen = self.corpus_generator
        # Same re-iterability concern as above, but for the whole corpus.
        reiter_corpus_gen = Reiterable(corpus_gen)
        self._model.build_vocab(reiter_corpus_gen)
        self._model.train(reiter_corpus_gen, total_examples=self._model.corpus_count, epochs=self._model.iter, *args, **kwargs)
        if lookup:
            # corpus_gen.lookup is filled in while the corpus is iterated,
            # so it is only complete after train() has consumed the corpus.
            self.lookup_dict = corpus_gen.lookup
            self._model.lookup_dict = self.lookup_dict
    # NOTE(review): the collapsed source is ambiguous here; these stamps are
    # placed at method level so both branches get them — confirm intent.
    self._model.metadata = self.metadata
    self._model.model_name = self.model_name
def train(self, size=500, min_count=3, iter=4, window=6, workers=3, **kwargs):
    """Train an embedding model over chained job postings, then upload it.

    word2vec models are (re)trained batch-by-batch with an incrementally
    grown vocabulary; doc2vec models are trained in a single pass and the
    corpus creator's document lookup table is kept on ``self._lookup``.
    The trained model is stored on ``self._model`` and persisted via
    ``self._upload()``.

    Args:
        size, min_count, iter, window, workers: hyperparameters passed to
            the gensim model constructor.
        **kwargs: any further gensim constructor arguments
            (e.g. those of gensim.models.doc2vec.Doc2Vec).
    """
    postings = job_postings_chain(
        self.s3_conn, self.quarters, self.jp_s3_path, source=self.source)

    if self.model_type == 'word2vec':
        if self._model:
            # Resume training an existing model: vocab updates only.
            logging.info("Model existed")
            model = self._model
            self.update = True
        else:
            model = Word2Vec(size=size, min_count=min_count, iter=iter,
                             window=window, workers=workers, **kwargs)

        corpus = Word2VecGensimCorpusCreator(postings)
        for batch_num, raw_batch in enumerate(
                batches_generator(corpus, self.batch_size), start=1):
            # gensim scans the data once for vocab and again per epoch,
            # so each one-shot batch is wrapped to be re-iterable.
            batch = Reiterable(raw_batch)
            logging.info("Training batch #{} ".format(batch_num))
            first_pass = not self.update
            model.build_vocab(batch, update=not first_pass)
            if first_pass:
                self.update = True
            model.train(batch, total_examples=model.corpus_count, epochs=model.iter)
            # Track how the vocabulary grows across batches.
            self.vocab_size_cumu.append(len(model.wv.vocab))
            logging.info('\n')
    elif self.model_type == 'doc2vec':
        model = Doc2Vec(size=size, min_count=min_count, iter=iter,
                        window=window, workers=workers, **kwargs)
        corpus = Doc2VecGensimCorpusCreator(postings)
        reiterable_corpus = Reiterable(corpus)
        model.build_vocab(reiterable_corpus)
        model.train(reiterable_corpus, total_examples=model.corpus_count, epochs=model.iter)
        # The lookup table is populated while the corpus is iterated.
        self._lookup = corpus.lookup

    self._model = model
    self._upload()