def _train(self, corpus, params):
    """Create one PAV model for every configured source project.

    Args:
        corpus: Training documents. Must provide ``is_empty()``; it is
            handed unchanged to ``_create_pav_model`` for each source.
        params: Parameters for this training run. ``params['min-docs']``
            is read and converted to ``int``.

    Raises:
        NotSupportedException: If ``corpus`` is the marker string
            ``'cached'`` or if ``corpus.is_empty()`` is true.
    """
    # The string 'cached' is used elsewhere (see _fit_model) in place of a
    # corpus object. It has no is_empty(), so reject it explicitly here
    # rather than letting it surface as an AttributeError below.
    if corpus == 'cached':
        raise NotSupportedException(
            'training backend {} from cached data not supported'.format(
                self.backend_id))
    if corpus.is_empty():
        raise NotSupportedException(
            'training backend {} with no documents'.format(
                self.backend_id))
    self.info("creating PAV models")
    # NOTE(review): sources are read from self.params while min-docs is
    # read from the params argument -- confirm this asymmetry is intended.
    sources = annif.util.parse_sources(self.params['sources'])
    min_docs = int(params['min-docs'])
    # Each parsed source is a pair; only its first element (the source
    # project id) is needed here.
    for source_project_id, _ in sources:
        self._create_pav_model(source_project_id, min_docs, corpus)
def _fit_model(self, corpus, epochs, lmdb_map_size):
    """Fit ``self._model`` on LMDB-backed training data, then save it.

    Args:
        corpus: Either the marker string ``'cached'``, meaning the data
            already stored in LMDB is reused, or a corpus object providing
            ``is_empty()`` whose contents are written to LMDB first.
        epochs: Forwarded as ``epochs`` to ``self._model.fit``.
        lmdb_map_size: Forwarded to ``self._open_lmdb``.

    Raises:
        NotSupportedException: If a non-cached corpus is empty.
    """
    batch_size = 32
    reuse_cache = corpus == 'cached'
    lmdb_env = self._open_lmdb(reuse_cache, lmdb_map_size)

    if reuse_cache:
        self.info("Reusing cached training data from previous run.")
    else:
        if corpus.is_empty():
            raise NotSupportedException(
                'Cannot train nn_ensemble project with no documents')
        # Store the corpus in LMDB inside a write transaction.
        with lmdb_env.begin(write=True, buffers=True) as write_txn:
            self._corpus_to_vectors(
                corpus, LMDBSequence(write_txn, batch_size=batch_size))

    # Training reads the data back through a separate, non-write
    # transaction on the same environment.
    with lmdb_env.begin(buffers=True) as read_txn:
        training_seq = LMDBSequence(read_txn, batch_size=batch_size)
        self._model.fit(training_seq, verbose=True, epochs=epochs)

    annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)