Exemple #1
0
 def _train(self, corpus, params):
     """Create a PAV model for every configured source project.

     The list of sources is parsed from ``self.params['sources']`` (the
     backend's own parameters), whereas the minimum document count is
     read from the ``params`` argument under the key ``'min-docs'`` and
     converted to ``int``.

     Raises:
         NotSupportedException: if ``corpus.is_empty()`` is true.
     """
     # Guard clause: there is nothing to train on without documents.
     if corpus.is_empty():
         raise NotSupportedException(
             'training backend {} with no documents'.format(
                 self.backend_id))

     self.info("creating PAV models")
     parsed_sources = annif.util.parse_sources(self.params['sources'])
     minimum_docs = int(params['min-docs'])
     # Each parsed source is a pair; only its first element (the project
     # id) is needed here, the second one is ignored.
     for source in parsed_sources:
         project_id, _ = source
         self._create_pav_model(project_id, minimum_docs, corpus)
Exemple #2
0
    def _fit_model(self, corpus, epochs, lmdb_map_size):
        """Fit ``self._model`` on training data held in LMDB and save it.

        When ``corpus`` is the string ``'cached'`` the data already stored
        in the LMDB is reused; otherwise the corpus is first converted to
        vectors and written into the LMDB through a write transaction.
        The model is then fitted from a separate read-only transaction and
        finally saved with ``annif.util.atomic_save``.

        Args:
            corpus: either the string ``'cached'`` or a corpus object that
                provides ``is_empty()``.
            epochs: passed through to ``self._model.fit``.
            lmdb_map_size: passed through to ``self._open_lmdb``.

        Raises:
            NotSupportedException: if a non-cached corpus is empty.
        """
        env = self._open_lmdb(corpus == 'cached', lmdb_map_size)
        # NOTE(review): env is assumed to be an lmdb.Environment (it is
        # used via env.begin(write=..., buffers=...)) -- confirm against
        # _open_lmdb. It is closed explicitly so the handle is released on
        # every path, including the empty-corpus error and failures while
        # vectorising or fitting.
        try:
            if corpus != 'cached':
                if corpus.is_empty():
                    raise NotSupportedException(
                        'Cannot train nn_ensemble project with no documents')
                with env.begin(write=True, buffers=True) as txn:
                    seq = LMDBSequence(txn, batch_size=32)
                    self._corpus_to_vectors(corpus, seq)
            else:
                self.info("Reusing cached training data from previous run.")
            # fit the model using a read-only view of the LMDB
            with env.begin(buffers=True) as txn:
                seq = LMDBSequence(txn, batch_size=32)
                self._model.fit(seq, verbose=True, epochs=epochs)
        finally:
            env.close()

        # The model is only saved after a successful fit; the LMDB is no
        # longer needed at this point.
        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE)