def _train(self, corpus, params):
    """Train the tfidf backend from a document corpus.

    Logs progress, derives subject data from the corpus with
    ``_generate_subjects_from_documents``, vectorizes it with
    ``create_vectorizer`` and passes the result to ``_create_index``.

    ``params`` is not read by this method.

    Raises NotSupportedException when ``corpus`` is the ``'cached'``
    sentinel or when ``corpus.is_empty()`` is true.
    """
    # Work out whether training has to be refused before doing any work.
    refusal = None
    if corpus == 'cached':
        refusal = 'Training tfidf project from cached data not supported.'
    elif corpus.is_empty():
        refusal = 'Cannot train tfidf project with no documents'
    if refusal is not None:
        raise NotSupportedException(refusal)

    self.info('transforming subject corpus')
    subject_docs = self._generate_subjects_from_documents(corpus)
    vectorized = self.create_vectorizer(subject_docs)
    self._create_index(vectorized)
def _train(self, corpus, params):
    """Create one PAV model per configured source project.

    The source list is parsed from ``self.params['sources']`` and the
    minimum document count is read from ``params['min-docs']``; each
    source project id is then passed to ``_create_pav_model`` together
    with that count and the corpus.

    Raises NotSupportedException when ``corpus`` is the ``'cached'``
    sentinel or when ``corpus.is_empty()`` is true.
    """
    if corpus == 'cached':
        raise NotSupportedException(
            'Training pav project from cached data not supported.')
    if corpus.is_empty():
        message = 'training backend {} with no documents'.format(
            self.backend_id)
        raise NotSupportedException(message)

    self.info("creating PAV models")
    source_specs = annif.util.parse_sources(self.params['sources'])
    min_docs = int(params['min-docs'])
    # Only the project id of each (id, <second item>) pair is needed here.
    for project_id, _ in source_specs:
        self._create_pav_model(project_id, min_docs, corpus)
def _train(self, corpus, params):
    """Run the maui training sequence for a document corpus.

    After validating the corpus, the following steps run in this order:
    ``_initialize_tagger``, ``_upload_vocabulary``, ``_create_train_file``,
    ``_upload_train_file`` and ``_wait_for_train``. All of them receive
    ``params`` except ``_create_train_file``, which receives the corpus.

    Raises NotSupportedException when ``corpus`` is the ``'cached'``
    sentinel or when ``corpus.is_empty()`` is true.
    """
    refusal = None
    if corpus == 'cached':
        refusal = 'Training maui project from cached data not supported.'
    elif corpus.is_empty():
        refusal = 'training backend {} with no documents'.format(
            self.backend_id)
    if refusal is not None:
        raise NotSupportedException(refusal)

    # The order of these steps is significant; keep them sequential.
    self._initialize_tagger(params)
    self._upload_vocabulary(params)
    self._create_train_file(corpus)
    self._upload_train_file(params)
    self._wait_for_train(params)
def _load_data(self, corpus):
    """Collect the training inputs and targets from a document corpus.

    Returns a tuple ``(X, y)`` of two lists of equal length: ``X`` holds
    the ``text`` of every document in ``corpus.documents`` and ``y`` the
    matching ``uris``, in iteration order.

    Raises NotSupportedException when ``corpus`` is the ``'cached'``
    sentinel or when ``corpus.is_empty()`` is true.
    """
    if corpus == 'cached':
        raise NotSupportedException(
            'Training stwfsa project from cached data not supported.')
    if corpus.is_empty():
        raise NotSupportedException(
            'Cannot train stwfsa project with no documents.')

    self.debug("Transforming training data.")
    # Walk the documents exactly once (they may come from a one-shot
    # iterator), then split the collected pairs into the two lists.
    pairs = [(document.text, document.uris) for document in corpus.documents]
    texts = [text for text, _ in pairs]
    uri_sets = [uris for _, uris in pairs]
    return texts, uri_sets
def validate_backend_params(backend, beparam, project):
    """Check a backend parameter override against the project.

    ``backend`` is the backend name taken from the override, ``beparam``
    the override text as given on the command line and ``project`` an
    object whose ``config['backend']`` names the project's backend.

    Returns None when the override is acceptable.

    Raises NotSupportedException when ``beparam`` contains the text
    ``'algorithm'`` and ConfigurationException when ``backend`` differs
    from the project's configured backend.
    """
    if 'algorithm' in beparam:
        raise NotSupportedException('Algorithm overriding not supported.')

    project_backend = project.config['backend']
    if backend == project_backend:
        return

    template = ('The backend {} in CLI option "-b {}" not matching the project'
                ' backend {}.')
    raise ConfigurationException(
        template.format(backend, beparam, project_backend))
def _train(self, corpus, params, jobs=0):
    """Train the model, from a fresh corpus or from cached training data.

    With the ``'cached'`` sentinel the model and the training data are
    loaded through ``_load_model`` and ``_load_train_data``. Otherwise a
    new ``MLLMModel`` prepares training data from the corpus, and that
    data is saved to ``TRAIN_FILE`` in ``datadir`` using ``joblib.dump``.

    In both cases the model is then trained on the first two items of
    the training data and saved to ``MODEL_FILE``.

    Raises NotSupportedException when a non-cached corpus is empty.
    """
    self.info('starting train')

    if corpus == 'cached':
        self.info("reusing cached training data from previous run")
        self._model = self._load_model()
        train_data = self._load_train_data()
    else:
        if corpus.is_empty():
            message = 'training backend {} with no documents'.format(
                self.backend_id)
            raise NotSupportedException(message)
        self.info("preparing training data")
        self._model = MLLMModel()
        train_data = self._model.prepare_train(
            corpus,
            self.project.vocab,
            self.project.analyzer,
            params,
            jobs)
        # Persist the prepared data so a later run can pass 'cached'.
        annif.util.atomic_save(
            train_data, self.datadir, self.TRAIN_FILE, method=joblib.dump)

    self.info("training model")
    self._model.train(train_data[0], train_data[1], params)

    self.info('saving model')
    annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)
def _train(self, corpus, params):
    """Write the training file from the corpus and build the model.

    ``_create_train_file`` receives the corpus and ``_create_model``
    receives ``params``, in that order.

    Raises NotSupportedException when ``corpus.is_empty()`` is true.
    """
    if corpus.is_empty():
        message = 'training backend {} with no documents'.format(
            self.backend_id)
        raise NotSupportedException(message)

    self._create_train_file(corpus)
    self._create_model(params)
def _train(self, corpus, params):
    """Create the ensemble model and fit it on the corpus.

    The sources are parsed from ``self.params['sources']`` and handed to
    ``_create_model``; ``_fit_model`` is then called with the corpus and
    the integer value of ``params['epochs']``.

    Raises NotSupportedException when ``corpus`` is the ``'cached'``
    sentinel.
    """
    if corpus == 'cached':
        message = ('Training nn_ensemble project from cached data '
                   'not supported.')
        raise NotSupportedException(message)

    source_specs = annif.util.parse_sources(self.params['sources'])
    self._create_model(source_specs)
    self._fit_model(corpus, epochs=int(params['epochs']))
def train(self, corpus):
    """Train the tfidf backend from a document corpus.

    Derives subject data with ``_generate_subjects_from_documents``,
    vectorizes it with ``create_vectorizer`` and passes the result to
    ``_create_index``.

    Raises NotSupportedException when ``corpus.is_empty()`` is true.
    """
    if corpus.is_empty():
        message = 'Cannot train tfidf project with no documents'
        raise NotSupportedException(message)

    self.info('transforming subject corpus')
    subject_docs = self._generate_subjects_from_documents(corpus)
    vectorized = self.create_vectorizer(subject_docs)
    self._create_index(vectorized)
def train(self, corpus):
    """Run the maui training sequence for a document corpus.

    After validating the corpus, the following steps run in this order:
    ``_initialize_tagger``, ``_upload_vocabulary``, ``_create_train_file``
    (with the corpus), ``_upload_train_file`` and ``_wait_for_train``.

    Raises NotSupportedException when ``corpus.is_empty()`` is true.
    """
    if corpus.is_empty():
        message = 'training backend {} with no documents'.format(
            self.backend_id)
        raise NotSupportedException(message)

    # The order of these steps is significant; keep them sequential.
    self._initialize_tagger()
    self._upload_vocabulary()
    self._create_train_file(corpus)
    self._upload_train_file()
    self._wait_for_train()
def _train(self, corpus, params):
    """Build the model, writing a new training file unless cached.

    With the ``'cached'`` sentinel no training file is written and only a
    log message is emitted; otherwise ``_create_train_file`` is called
    with the corpus. ``_create_model(params)`` runs in both cases.

    Raises NotSupportedException when a non-cached corpus is empty.
    """
    if corpus == 'cached':
        self.info("Reusing cached training data from previous run.")
    else:
        if corpus.is_empty():
            message = 'training backend {} with no documents'.format(
                self.backend_id)
            raise NotSupportedException(message)
        self._create_train_file(corpus)

    self._create_model(params)
def learn(self, corpus):
    """Continue training the project with documents from ``corpus``.

    The project's subject index is attached to the corpus first; the
    corpus is then handed to the backend's ``learn`` method together with
    this project.

    Raises NotSupportedException when the backend is not an
    ``AnnifLearningBackend``.
    """
    # The subject index is set before the capability check.
    corpus.set_subject_index(self.subjects)

    learning_backend = annif.backend.backend.AnnifLearningBackend
    if not isinstance(self.backend, learning_backend):
        raise NotSupportedException("Learning not supported by backend",
                                    project_id=self.project_id)

    self.backend.learn(corpus, project=self)
def train(self, corpus):
    """Train the omikuji backend from a document corpus.

    Document texts are vectorized with ``create_vectorizer`` using the
    backend's ``min_df`` parameter and the project analyzer's
    ``tokenize_words``; the result goes to ``_create_train_file`` and
    ``_create_model`` is called afterwards.

    Raises NotSupportedException when ``corpus.is_empty()`` is true.
    """
    if corpus.is_empty():
        message = 'Cannot train omikuji project with no documents'
        raise NotSupportedException(message)

    # Keep this a lazy generator; the vectorizer pulls the texts.
    texts = (document.text for document in corpus.documents)
    vectorizer_params = {
        'min_df': int(self.params['min_df']),
        'tokenizer': self.project.analyzer.tokenize_words,
    }
    vectorized = self.create_vectorizer(texts, vectorizer_params)
    self._create_train_file(vectorized, corpus)
    self._create_model()
def hyperopt(self, corpus, trials, jobs, metric, results_file):
    """Search for good hyperparameters using a validation corpus.

    Obtains an optimizer from the backend for ``corpus`` and ``metric``
    and returns the result of its ``optimize(trials, jobs, results_file)``
    call.

    Raises NotSupportedException when the backend is not an
    ``AnnifHyperoptBackend``.
    """
    hyperopt_backend = annif.backend.hyperopt.AnnifHyperoptBackend
    if not isinstance(self.backend, hyperopt_backend):
        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id)

    optimizer = self.backend.get_hp_optimizer(corpus, metric)
    return optimizer.optimize(trials, jobs, results_file)
def learn(self, corpus, backend_params=None):
    """Continue training the project with documents from ``corpus``.

    ``backend_params`` is an optional mapping keyed by backend id; the
    entry for this project's backend (or an empty dict) is passed to the
    backend's ``learn`` method along with the corpus.

    Raises NotSupportedException when the backend is not an
    ``AnnifLearningBackend``.
    """
    # The subject index is set before the capability check.
    corpus.set_subject_index(self.subjects)

    overrides = {} if backend_params is None else backend_params
    beparams = overrides.get(self.backend.backend_id, {})

    learning_backend = annif.backend.backend.AnnifLearningBackend
    if not isinstance(self.backend, learning_backend):
        raise NotSupportedException("Learning not supported by backend",
                                    project_id=self.project_id)

    self.backend.learn(corpus, beparams)
def _train(self, corpus, params):
    """Build the omikuji model, preparing new training data unless cached.

    With the ``'cached'`` sentinel only a log message is emitted before
    the model is created. Otherwise document texts are vectorized with
    ``create_vectorizer`` (using ``params['min_df']`` and the project
    analyzer's ``tokenize_words``) and written by ``_create_train_file``.
    ``_create_model(params)`` runs in both cases.

    Raises NotSupportedException when a non-cached corpus is empty.
    """
    if corpus == 'cached':
        self.info("Reusing cached training data from previous run.")
    else:
        if corpus.is_empty():
            message = 'Cannot train omikuji project with no documents'
            raise NotSupportedException(message)
        # Keep this a lazy generator; the vectorizer pulls the texts.
        texts = (document.text for document in corpus.documents)
        vecparams = {
            'min_df': int(params['min_df']),
            'tokenizer': self.project.analyzer.tokenize_words,
        }
        vectorized = self.create_vectorizer(texts, vecparams)
        self._create_train_file(vectorized, corpus)

    self._create_model(params)
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    # NOTE(review): the docstring above is kept word for word; if this
    # function is registered as a click command it doubles as the help text.
    #
    # Parameters, as used below:
    #   project_id     id of the project to evaluate
    #   paths          passed to open_documents() to load the documents
    #   limit          forwarded to ProjectSuggestMap
    #   threshold      forwarded to ProjectSuggestMap
    #   results_file   optional writable file object for per-subject results
    #   jobs           passed to annif.parallel.get_pool()
    #   backend_param  raw override(s) parsed by parse_backend_params()
    #
    # Raises NotSupportedException when results_file cannot be written to.
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # Writing an empty string probes that the file is writable
            # before any expensive work is started.
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            # Chain explicitly so the underlying error is kept as the cause.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e

    docs = open_documents(paths)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize()
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # Results arrive in arbitrary order; each one is scored against the
        # gold-standard subjects of its own document.
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    template = "{0:<30}\t{1}"
    for metric, score in eval_batch.results(results_file=results_file).items():
        click.echo(template.format(metric + ":", score))
def results(self, metrics='all'):
    """Score the collected samples against their gold standard subjects.

    ``metrics`` selects the metric set and is forwarded unchanged to
    ``_evaluate_samples``; the original documentation names ``'all'`` and
    ``'simple'`` as the accepted values.

    Returns the mapping produced by ``_evaluate_samples`` with an extra
    ``'Documents evaluated'`` entry holding the number of samples.

    Raises NotSupportedException when no samples have been collected.
    """
    samples = self._samples
    if not samples:
        raise NotSupportedException("cannot evaluate empty corpus")

    # One row per sample: gold subjects as y_true, suggestion scores as
    # y_pred (the latter forced to float32).
    gold_vectors = [gold.as_vector(self._subject_index)
                    for _, gold in samples]
    hit_vectors = [hit.vector for hit, _ in samples]
    y_true = np.array(gold_vectors)
    y_pred = np.array(hit_vectors, dtype=np.float32)

    scores = self._evaluate_samples(y_true, y_pred, metrics)
    scores['Documents evaluated'] = y_true.shape[0]
    return scores
def _fit_model(self, corpus, epochs, lmdb_map_size):
    """Fit the model on vectors stored in LMDB and save it.

    The LMDB environment is obtained from ``_open_lmdb``, which is told
    whether the ``'cached'`` sentinel was given. For a real corpus the
    vectors are first written inside a write transaction via
    ``_corpus_to_vectors``; for ``'cached'`` that step is skipped. The
    model is then fitted for ``epochs`` epochs from a second transaction
    and saved to ``MODEL_FILE`` in ``datadir``.

    Raises NotSupportedException when a non-cached corpus is empty.
    """
    use_cached = corpus == 'cached'
    env = self._open_lmdb(use_cached, lmdb_map_size)

    if use_cached:
        self.info("Reusing cached training data from previous run.")
    else:
        if corpus.is_empty():
            raise NotSupportedException(
                'Cannot train nn_ensemble project with no documents')
        # The write transaction must be finished before fitting starts.
        with env.begin(write=True, buffers=True) as write_txn:
            writer = LMDBSequence(write_txn, batch_size=32)
            self._corpus_to_vectors(corpus, writer)

    # fit the model using a read-only view of the LMDB
    with env.begin(buffers=True) as read_txn:
        reader = LMDBSequence(read_txn, batch_size=32)
        self._model.fit(reader, verbose=True, epochs=epochs)

    annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)
def results(self, metrics='all', results_file=None, warnings=False):
    """Score the collected samples against their gold standard subjects.

    ``metrics`` is forwarded unchanged to ``_evaluate_samples``; the
    original documentation names ``'all'`` and ``'simple'`` as the
    accepted values. When ``results_file`` (a file object) is given,
    per-subject results are also written to it through
    ``output_result_per_subject``. ``warnings`` is passed on to the gold
    subjects' ``as_vector`` call.

    Returns the mapping produced by ``_evaluate_samples`` with an extra
    ``'Documents evaluated'`` entry holding the number of samples.

    Raises NotSupportedException when no samples have been collected.
    """
    if not self._samples:
        raise NotSupportedException("cannot evaluate empty corpus")

    n_rows = len(self._samples)
    n_cols = len(self._subject_index)
    y_true = np.zeros((n_rows, n_cols), dtype=bool)
    y_pred = np.zeros((n_rows, n_cols), dtype=np.float32)

    # Iterating a 2-D array yields row views, so as_vector() fills the
    # preallocated matrices in place.
    for true_row, pred_row, (hits, gold_subjects) in zip(
            y_true, y_pred, self._samples):
        gold_subjects.as_vector(self._subject_index,
                                destination=true_row,
                                warnings=warnings)
        hits.as_vector(self._subject_index, destination=pred_row)

    scores = self._evaluate_samples(y_true, y_pred, metrics)
    scores['Documents evaluated'] = y_true.shape[0]

    if results_file:
        self.output_result_per_subject(y_true, y_pred, results_file)
    return scores
def _train(self, corpus, params):
    """Refuse to train: this method always raises.

    Both arguments are ignored.

    Raises NotSupportedException unconditionally.
    """
    message = 'Training ensemble backend is not possible.'
    raise NotSupportedException(message)
def _train(self, corpus, params, jobs=0):
    """Refuse to train: this method always raises.

    All arguments are ignored.

    Raises NotSupportedException unconditionally.
    """
    message = 'Training yake backend is not possible.'
    raise NotSupportedException(message)