Ejemplo n.º 1
0
 def _train(self, corpus, params):
     if corpus == 'cached':
         raise NotSupportedException(
             'Training tfidf project from cached data not supported.')
     if corpus.is_empty():
         raise NotSupportedException(
             'Cannot train tfidf project with no documents')
     self.info('transforming subject corpus')
     subjects = self._generate_subjects_from_documents(corpus)
     veccorpus = self.create_vectorizer(subjects)
     self._create_index(veccorpus)
Ejemplo n.º 2
0
 def _train(self, corpus, params):
     """Create one PAV model per configured source project.

     The source list is parsed from ``self.params['sources']`` and the
     minimum document count is read from ``params['min-docs']``.

     Raises:
         NotSupportedException: if ``corpus`` is the string ``'cached'`` or
             if the corpus reports that it is empty.
     """
     if corpus == 'cached':
         message = 'Training pav project from cached data not supported.'
         raise NotSupportedException(message)
     if corpus.is_empty():
         message = 'training backend {} with no documents'.format(
             self.backend_id)
         raise NotSupportedException(message)
     self.info("creating PAV models")
     source_specs = annif.util.parse_sources(self.params['sources'])
     minimum_docs = int(params['min-docs'])
     # Each parsed source is a pair; only its first element (the project
     # id) is needed here.
     for project_id, _unused in source_specs:
         self._create_pav_model(project_id, minimum_docs, corpus)
Ejemplo n.º 3
0
 def _train(self, corpus, params):
     if corpus == 'cached':
         raise NotSupportedException(
             'Training maui project from cached data not supported.')
     if corpus.is_empty():
         raise NotSupportedException('training backend {} with no documents'
                                     .format(self.backend_id))
     self._initialize_tagger(params)
     self._upload_vocabulary(params)
     self._create_train_file(corpus)
     self._upload_train_file(params)
     self._wait_for_train(params)
Ejemplo n.º 4
0
 def _load_data(self, corpus):
     if corpus == 'cached':
         raise NotSupportedException(
             'Training stwfsa project from cached data not supported.')
     if corpus.is_empty():
         raise NotSupportedException(
             'Cannot train stwfsa project with no documents.')
     self.debug("Transforming training data.")
     X = []
     y = []
     for doc in corpus.documents:
         X.append(doc.text)
         y.append(doc.uris)
     return X, y
Ejemplo n.º 5
0
Archivo: cli.py Proyecto: mo-fu/Annif
def validate_backend_params(backend, beparam, project):
    """Check a backend parameter given on the command line against a project.

    Returns ``None`` when the parameter is acceptable.

    Raises:
        NotSupportedException: if ``'algorithm' in beparam`` holds.
        ConfigurationException: if ``backend`` differs from the backend
            named in ``project.config['backend']``.
    """
    if 'algorithm' in beparam:
        raise NotSupportedException('Algorithm overriding not supported.')
    project_backend = project.config['backend']
    if backend == project_backend:
        return
    raise ConfigurationException(
        'The backend {} in CLI option "-b {}" not matching the project'
        ' backend {}.'.format(backend, beparam, project_backend))
Ejemplo n.º 6
0
    def _train(self, corpus, params, jobs=0):
        """Train the model, either from a corpus or from cached train data.

        When ``corpus`` is the string ``'cached'`` the model and training
        data are loaded via ``_load_model`` / ``_load_train_data``.
        Otherwise a new ``MLLMModel`` prepares training data from the
        corpus, and that data is saved to ``TRAIN_FILE`` before training.
        The trained model is finally saved to ``MODEL_FILE``.

        Raises:
            NotSupportedException: if a non-cached corpus reports that it
                is empty.
        """
        self.info('starting train')
        if corpus == 'cached':
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()
        else:
            if corpus.is_empty():
                message = 'training backend {} with no documents'.format(
                    self.backend_id)
                raise NotSupportedException(message)
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(
                corpus,
                self.project.vocab,
                self.project.analyzer,
                params,
                jobs)
            # Save the prepared data so that a later run can reuse it.
            annif.util.atomic_save(
                train_data, self.datadir, self.TRAIN_FILE,
                method=joblib.dump)

        self.info("training model")
        # Only the first two elements of the train data are passed on.
        self._model.train(train_data[0], train_data[1], params)

        self.info('saving model')
        annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)
Ejemplo n.º 7
0
 def _train(self, corpus, params):
     if corpus.is_empty():
         raise NotSupportedException(
             'training backend {} with no documents'.format(
                 self.backend_id))
     self._create_train_file(corpus)
     self._create_model(params)
Ejemplo n.º 8
0
 def _train(self, corpus, params):
     """Create the ensemble model from its sources and fit it on ``corpus``.

     The sources are parsed from ``self.params['sources']``; the number of
     epochs is read from ``params['epochs']``.

     Raises:
         NotSupportedException: if ``corpus`` is the string ``'cached'``.
     """
     if corpus == 'cached':
         message = (
             'Training nn_ensemble project from cached data not supported.')
         raise NotSupportedException(message)
     self._create_model(annif.util.parse_sources(self.params['sources']))
     # Read the epoch count only after the model has been created.
     epoch_count = int(params['epochs'])
     self._fit_model(corpus, epochs=epoch_count)
Ejemplo n.º 9
0
 def train(self, corpus):
     """Train the tfidf backend from a document corpus.

     Subjects are generated from the corpus documents, handed to
     ``create_vectorizer`` and the result is passed to ``_create_index``.

     Raises:
         NotSupportedException: if the corpus reports that it is empty.
     """
     if corpus.is_empty():
         message = 'Cannot train tfidf project with no documents'
         raise NotSupportedException(message)
     self.info('transforming subject corpus')
     self._create_index(
         self.create_vectorizer(
             self._generate_subjects_from_documents(corpus)))
Ejemplo n.º 10
0
 def train(self, corpus):
     """Run the maui training steps for a document corpus.

     The steps run in this order: initialise the tagger, upload the
     vocabulary, create the train file from ``corpus``, upload the train
     file and wait for training.

     Raises:
         NotSupportedException: if the corpus reports that it is empty.
     """
     if corpus.is_empty():
         message = 'training backend {} with no documents'.format(
             self.backend_id)
         raise NotSupportedException(message)

     # Prepare the tagger and its vocabulary.
     self._initialize_tagger()
     self._upload_vocabulary()

     # Build and upload the training data, then wait for completion.
     self._create_train_file(corpus)
     self._upload_train_file()
     self._wait_for_train()
Ejemplo n.º 11
0
 def _train(self, corpus, params):
     if corpus != 'cached':
         if corpus.is_empty():
             raise NotSupportedException(
                 'training backend {} with no documents'.format(
                     self.backend_id))
         self._create_train_file(corpus)
     else:
         self.info("Reusing cached training data from previous run.")
     self._create_model(params)
Ejemplo n.º 12
0
    def learn(self, corpus):
        """Further train the project using documents from a metadata source.

        The project's subjects are set on the corpus before the backend is
        checked.

        Raises:
            NotSupportedException: if the backend is not an instance of
                ``AnnifLearningBackend``.
        """
        corpus.set_subject_index(self.subjects)
        learning_backend_cls = annif.backend.backend.AnnifLearningBackend
        if not isinstance(self.backend, learning_backend_cls):
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)
        self.backend.learn(corpus, project=self)
Ejemplo n.º 13
0
 def train(self, corpus):
     """Train the omikuji backend from the documents of a corpus.

     The document texts are passed to ``create_vectorizer`` together with
     the ``min_df`` setting from ``self.params`` and the project analyzer's
     ``tokenize_words``. The result is handed to ``_create_train_file``
     along with the corpus, and ``_create_model`` is called last.

     Raises:
         NotSupportedException: if the corpus reports that it is empty.
     """
     if corpus.is_empty():
         raise NotSupportedException(
             'Cannot train omikuji project with no documents')
     # Generator expression: each text is read only when the vectorizer
     # consumes it. Named `texts` so the builtin `input` is not shadowed.
     texts = (doc.text for doc in corpus.documents)
     params = {
         'min_df': int(self.params['min_df']),
         'tokenizer': self.project.analyzer.tokenize_words
     }
     veccorpus = self.create_vectorizer(texts, params)
     self._create_train_file(veccorpus, corpus)
     self._create_model()
Ejemplo n.º 14
0
    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """Optimize the project's hyperparameters on a validation corpus.

        An optimizer is obtained from the backend for ``corpus`` and
        ``metric``; the value of its ``optimize(trials, jobs,
        results_file)`` call is returned.

        Raises:
            NotSupportedException: if the backend is not an instance of
                ``AnnifHyperoptBackend``.
        """
        hyperopt_backend_cls = annif.backend.hyperopt.AnnifHyperoptBackend
        if not isinstance(self.backend, hyperopt_backend_cls):
            raise NotSupportedException(
                "Hyperparameter optimization not supported "
                "by backend",
                project_id=self.project_id)

        hp_optimizer = self.backend.get_hp_optimizer(corpus, metric)
        return hp_optimizer.optimize(trials, jobs, results_file)
Ejemplo n.º 15
0
 def learn(self, corpus, backend_params=None):
     """Further train the project using documents from a metadata source.

     ``backend_params`` maps backend ids to parameter dicts; only the entry
     for this project's backend id (or an empty dict) is passed on.

     Raises:
         NotSupportedException: if the backend is not an instance of
             ``AnnifLearningBackend``.
     """
     corpus.set_subject_index(self.subjects)
     all_params = {} if backend_params is None else backend_params
     # Looked up before the type check, as the backend id is needed either
     # way in the original flow.
     own_params = all_params.get(self.backend.backend_id, {})
     learning_backend_cls = annif.backend.backend.AnnifLearningBackend
     if not isinstance(self.backend, learning_backend_cls):
         raise NotSupportedException("Learning not supported by backend",
                                     project_id=self.project_id)
     self.backend.learn(corpus, own_params)
Ejemplo n.º 16
0
 def _train(self, corpus, params):
     if corpus != 'cached':
         if corpus.is_empty():
             raise NotSupportedException(
                 'Cannot train omikuji project with no documents')
         input = (doc.text for doc in corpus.documents)
         vecparams = {
             'min_df': int(params['min_df']),
             'tokenizer': self.project.analyzer.tokenize_words
         }
         veccorpus = self.create_vectorizer(input, vecparams)
         self._create_train_file(veccorpus, corpus)
     else:
         self.info("Reusing cached training data from previous run.")
     self._create_model(params)
Ejemplo n.º 17
0
def run_eval(
        project_id,
        paths,
        limit,
        threshold,
        results_file,
        jobs,
        backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    # NOTE(review): the docstring above is left untouched because it may be
    # surfaced as command help text - confirm before rewording it.

    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param, project)

    eval_batch = annif.eval.EvaluationBatch(project.subjects)

    if results_file:
        try:
            # Writing an empty string checks that the file is writable
            # before any evaluation work is done.
            print('', end='', file=results_file)
            click.echo('Writing per subject evaluation results to {!s}'.format(
                results_file.name))
        except Exception as e:
            # Chain the original error so it is kept as the explicit cause.
            raise NotSupportedException(
                "cannot open results-file for writing: " + str(e)) from e
    docs = open_documents(paths)

    jobs, pool_class = annif.parallel.get_pool(jobs)

    project.initialize()
    psmap = annif.parallel.ProjectSuggestMap(
        project.registry, [project_id], backend_params, limit, threshold)

    with pool_class(jobs) as pool:
        # Results arrive in completion order, not document order; each one
        # carries its own gold-standard uris and labels.
        for hits, uris, labels in pool.imap_unordered(
                psmap.suggest, docs.documents):
            eval_batch.evaluate(hits[project_id],
                                annif.corpus.SubjectSet((uris, labels)))

    # Metric name left-aligned in a 30-character column, then a tab.
    template = "{0:<30}\t{1}"
    for metric, score in eval_batch.results(results_file=results_file).items():
        click.echo(template.format(metric + ":", score))
Ejemplo n.º 18
0
    def results(self, metrics='all'):
        """Evaluate the collected samples against their gold standard.

        ``metrics`` selects the set of metrics and can be either 'all' or
        'simple'. The returned dict is the one produced by
        ``_evaluate_samples`` with a 'Documents evaluated' entry added.

        Raises:
            NotSupportedException: if no samples have been collected.
        """
        samples = self._samples
        if not samples:
            raise NotSupportedException("cannot evaluate empty corpus")

        # One row per sample: gold-standard vectors first, then predictions.
        gold_rows = [gold.as_vector(self._subject_index)
                     for _, gold in samples]
        y_true = np.array(gold_rows)
        predicted_rows = [hits.vector for hits, _ in samples]
        y_pred = np.array(predicted_rows, dtype=np.float32)

        scores = self._evaluate_samples(y_true, y_pred, metrics)
        scores['Documents evaluated'] = y_true.shape[0]
        return scores
Ejemplo n.º 19
0
    def _fit_model(self, corpus, epochs, lmdb_map_size):
        """Fit the model on vectors stored in LMDB and save it.

        Unless ``corpus`` is the string ``'cached'``, the corpus is first
        converted to vectors inside a write transaction. The model is then
        fitted from a separate transaction and saved to ``MODEL_FILE``.

        Raises:
            NotSupportedException: if a non-cached corpus reports that it
                is empty.
        """
        use_cached = corpus == 'cached'
        env = self._open_lmdb(use_cached, lmdb_map_size)
        if use_cached:
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                message = 'Cannot train nn_ensemble project with no documents'
                raise NotSupportedException(message)
            with env.begin(write=True, buffers=True) as write_txn:
                self._corpus_to_vectors(
                    corpus, LMDBSequence(write_txn, batch_size=32))

        # fit the model using a read-only view of the LMDB
        with env.begin(buffers=True) as read_txn:
            training_seq = LMDBSequence(read_txn, batch_size=32)
            self._model.fit(training_seq, verbose=True, epochs=epochs)

        annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)
Ejemplo n.º 20
0
Archivo: eval.py Proyecto: mo-fu/Annif
    def results(self, metrics='all', results_file=None, warnings=False):
        """Evaluate the collected samples against their gold standard.

        ``metrics`` selects the set of metrics and can be either 'all' or
        'simple'. If ``results_file`` (a file object) is given, per-subject
        results are written to it. ``warnings`` is forwarded to the gold
        subjects' ``as_vector`` call.

        Returns the dict produced by ``_evaluate_samples`` with a
        'Documents evaluated' entry added.

        Raises:
            NotSupportedException: if no samples have been collected.
        """
        if not self._samples:
            raise NotSupportedException("cannot evaluate empty corpus")

        n_samples = len(self._samples)
        n_subjects = len(self._subject_index)
        y_true = np.zeros((n_samples, n_subjects), dtype=bool)
        y_pred = np.zeros((n_samples, n_subjects), dtype=np.float32)

        # Iterating a 2-D array yields row views, so each as_vector call
        # fills its row of the matrix in place.
        rows = zip(y_true, y_pred, self._samples)
        for true_row, pred_row, (hits, gold_subjects) in rows:
            gold_subjects.as_vector(self._subject_index,
                                    destination=true_row,
                                    warnings=warnings)
            hits.as_vector(self._subject_index, destination=pred_row)

        scores = self._evaluate_samples(y_true, y_pred, metrics)
        scores['Documents evaluated'] = y_true.shape[0]

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file)
        return scores
Ejemplo n.º 21
0
 def _train(self, corpus, params):
     """Always refuse: this backend cannot be trained.

     Raises:
         NotSupportedException: unconditionally.
     """
     message = 'Training ensemble backend is not possible.'
     raise NotSupportedException(message)
Ejemplo n.º 22
0
 def _train(self, corpus, params, jobs=0):
     """Always refuse: this backend cannot be trained.

     Raises:
         NotSupportedException: unconditionally.
     """
     message = 'Training yake backend is not possible.'
     raise NotSupportedException(message)