Exemple #1
0
    def skos(self):
        """Return the SKOS subject vocabulary, loading and caching it on demand.

        Lookup order: the vocabulary already cached on the instance, then
        the ``subjects.dump.gz`` graph dump in the data directory, then the
        ``subjects.ttl`` Turtle file. After a successful Turtle parse,
        ``save_skos`` is called so the stored copy can be used next time.

        Raises:
            NotInitializedException: if no dump could be loaded and the
                Turtle file does not exist.
        """
        if self._skos_vocab is not None:
            return self._skos_vocab

        # First choice: the graph dump file.
        dump_file = os.path.join(self.datadir, 'subjects.dump.gz')
        if os.path.exists(dump_file):
            logger.debug(f'loading graph dump from {dump_file}')
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(
                    dump_file, self.language)
            except ModuleNotFoundError:
                # The dump was probably saved with a different rdflib
                # version; fall through to the Turtle file below.
                logger.debug('could not load graph dump, using turtle file')
            else:
                return self._skos_vocab

        # Fallback: parse the Turtle file.
        turtle_file = os.path.join(self.datadir, 'subjects.ttl')
        if not os.path.exists(turtle_file):
            raise NotInitializedException(
                f'graph file {turtle_file} not found')

        logger.debug(f'loading graph from {turtle_file}')
        self._skos_vocab = annif.corpus.SubjectFileSKOS(
            turtle_file, self.language)
        # Store the vocabulary so that it can be reused on the next load.
        self._skos_vocab.save_skos(turtle_file, self.language)
        return self._skos_vocab
Exemple #2
0
 def _load_train_data(self):
     """Load the stored training data from the data directory.

     Returns:
         The object produced by ``joblib.load`` for ``TRAIN_FILE``.

     Raises:
         NotInitializedException: if the train data file does not exist.
     """
     train_file = os.path.join(self.datadir, self.TRAIN_FILE)
     if not os.path.exists(train_file):
         raise NotInitializedException(
             'train data file {} not found'.format(train_file),
             backend_id=self.backend_id)
     return joblib.load(train_file)
Exemple #3
0
 def _load_model(self):
     """Load the model file from the data directory via ``MLLMModel.load``.

     Returns:
         The object returned by ``MLLMModel.load``.

     Raises:
         NotInitializedException: if the model file does not exist.
     """
     model_path = os.path.join(self.datadir, self.MODEL_FILE)
     # The load attempt is logged before the existence check.
     self.debug('loading model from {}'.format(model_path))
     if not os.path.exists(model_path):
         raise NotInitializedException(
             'model {} not found'.format(model_path),
             backend_id=self.backend_id)
     return MLLMModel.load(model_path)
Exemple #4
0
 def initialize(self):
     """Load the STWFSA model into ``self._model`` unless already loaded.

     Raises:
         NotInitializedException: if the model file does not exist.
     """
     if self._model is not None:
         return
     model_path = os.path.join(self.datadir, self.MODEL_FILE)
     self.debug(f'Loading STWFSA model from {model_path}.')
     if not os.path.exists(model_path):
         raise NotInitializedException(f'Model not found at {model_path}',
                                       backend_id=self.backend_id)
     self._model = StwfsapyPredictor.load(model_path)
     self.debug('Loaded model.')
Exemple #5
0
 def _initialize_model(self):
     if self._model is None:
         path = os.path.join(self.datadir, self.MODEL_FILE)
         self.debug('loading model from {}'.format(path))
         if os.path.exists(path):
             self._model = omikuji.Model.load(path)
         else:
             raise NotInitializedException(
                 'model {} not found'.format(path),
                 backend_id=self.backend_id)
Exemple #6
0
 def subjects(self):
     """Return the subject index, loading it from disk on first access.

     The loaded index is cached in ``self._subjects``.

     Raises:
         NotInitializedException: if the subject file does not exist.
     """
     if self._subjects is not None:
         return self._subjects
     subjects_path = os.path.join(self.datadir, 'subjects')
     if not os.path.exists(subjects_path):
         raise NotInitializedException(
             "subject file {} not found".format(subjects_path))
     logger.debug('loading subjects from %s', subjects_path)
     self._subjects = annif.corpus.SubjectIndex.load(subjects_path)
     return self._subjects
Exemple #7
0
 def initialize(self):
     """Load the Keras model into ``self._model`` unless already loaded.

     Raises:
         NotInitializedException: if the model file does not exist.
     """
     if self._model is None:
         keras_path = os.path.join(self.datadir, self.MODEL_FILE)
         if os.path.exists(keras_path):
             self.debug('loading Keras model from {}'.format(keras_path))
             self._model = load_model(keras_path)
         else:
             raise NotInitializedException(
                 'model file {} not found'.format(keras_path),
                 backend_id=self.backend_id)
Exemple #8
0
 def initialize_vectorizer(self):
     """Load the stored vectorizer into ``self.vectorizer`` if it is unset.

     Raises:
         NotInitializedException: if the vectorizer file does not exist.
     """
     if self.vectorizer is not None:
         return
     vectorizer_path = os.path.join(self.datadir, self.VECTORIZER_FILE)
     if not os.path.exists(vectorizer_path):
         raise NotInitializedException(
             "vectorizer file '{}' not found".format(vectorizer_path),
             backend_id=self.backend_id)
     self.debug('loading vectorizer from {}'.format(vectorizer_path))
     self.vectorizer = joblib.load(vectorizer_path)
Exemple #9
0
 def vectorizer(self):
     """Return the vectorizer, loading it from disk on first access.

     The loaded object is cached in ``self._vectorizer``.

     Raises:
         NotInitializedException: if the vectorizer file does not exist.
     """
     if self._vectorizer is not None:
         return self._vectorizer
     vectorizer_path = os.path.join(self.datadir, 'vectorizer')
     if not os.path.exists(vectorizer_path):
         raise NotInitializedException(
             "vectorizer file '{}' not found".format(vectorizer_path),
             project_id=self.project_id)
     logger.debug('loading vectorizer from %s', vectorizer_path)
     self._vectorizer = joblib.load(vectorizer_path)
     return self._vectorizer
Exemple #10
0
 def _initialize_index(self):
     if self._index is None:
         path = os.path.join(self.datadir, self.INDEX_FILE)
         self.debug('loading similarity index from {}'.format(path))
         if os.path.exists(path):
             self._index = gensim.similarities.SparseMatrixSimilarity.load(
                 path)
         else:
             raise NotInitializedException(
                 'similarity index {} not found'.format(path),
                 backend_id=self.backend_id)
Exemple #11
0
 def skos(self):
     """Return the SKOS subject vocabulary, parsing it on first access.

     The vocabulary is read from ``subjects.ttl`` in the data directory
     and cached in ``self._skos_vocab``.

     Raises:
         NotInitializedException: if the Turtle file does not exist.
     """
     if self._skos_vocab is not None:
         return self._skos_vocab
     turtle_file = os.path.join(self.datadir, 'subjects.ttl')
     if not os.path.exists(turtle_file):
         raise NotInitializedException(
             f'graph file {turtle_file} not found')
     logger.debug(f'loading graph from {turtle_file}')
     self._skos_vocab = annif.corpus.SubjectFileSKOS(
         turtle_file, self.language)
     return self._skos_vocab
Exemple #12
0
 def initialize(self):
     """Load the fastText model into ``self._model`` unless already loaded.

     After loading, the model and its dimension are written to the debug
     log.

     Raises:
         NotInitializedException: if the model file does not exist.
     """
     if self._model is not None:
         return
     model_path = os.path.join(self.datadir, self.MODEL_FILE)
     self.debug('loading fastText model from {}'.format(model_path))
     if not os.path.exists(model_path):
         raise NotInitializedException(
             'model {} not found'.format(model_path),
             backend_id=self.backend_id)
     self._model = self._load_model(model_path)
     self.debug('loaded model {}'.format(str(self._model)))
     self.debug('dim: {}'.format(self._model.get_dimension()))
Exemple #13
0
 def suggest(self, text, backend_params=None):
     """Suggest subjects for the given text by passing it to the backend.

     Returns a list of SubjectSuggestion objects ordered by decreasing
     score.

     If ``self.is_trained`` is ``None`` a warning is logged and the
     suggestion proceeds anyway.

     Raises:
         NotInitializedException: if ``self.is_trained`` is falsy but not
             ``None``.
     """
     if not self.is_trained:
         if self.is_trained is not None:
             raise NotInitializedException('Project is not trained.')
         # Train state unknown: warn, but still try to suggest.
         logger.warning('Could not get train state information.')
     preview = text[:20]
     logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                  preview, len(text))
     suggestions = self._suggest_with_backend(text, backend_params)
     logger.debug('%d hits from backend', len(suggestions))
     return suggestions
Exemple #14
0
 def _initialize_model(self):
     if self._model is None:
         path = os.path.join(self.datadir, self.MODEL_FILE)
         self.debug('loading model from {}'.format(path))
         if os.path.exists(path):
             try:
                 self._model = omikuji.Model.load(path)
             except RuntimeError:
                 raise OperationFailedException(
                     "Omikuji models trained on Annif versions older than "
                     "0.56 cannot be loaded. Please retrain your project.")
         else:
             raise NotInitializedException(
                 'model {} not found'.format(path),
                 backend_id=self.backend_id)
Exemple #15
0
 def initialize(self):
     """Load the VW model into ``self._model`` unless already loaded.

     The model is created with ``pyvw.vw`` from the parameters returned by
     ``self._create_params``, with any ``passes`` entry removed.

     Raises:
         NotInitializedException: if the model file does not exist.
     """
     if self._model is not None:
         return
     model_path = os.path.join(self.datadir, self.MODEL_FILE)
     if not os.path.exists(model_path):
         raise NotInitializedException(
             'model {} not found'.format(model_path),
             backend_id=self.backend_id)
     self.debug('loading VW model from {}'.format(model_path))
     vw_params = self._create_params({'i': model_path, 'quiet': True})
     # don't confuse the model with passes
     vw_params.pop('passes', None)
     self.debug("model parameters: {}".format(vw_params))
     self._model = pyvw.vw(**vw_params)
     self.debug('loaded model {}'.format(str(self._model)))
Exemple #16
0
 def initialize(self):
     """Load one PAV model per source project into ``self._models``.

     Does nothing when the models are already loaded. The models are
     collected in a local dict and assigned to ``self._models`` only after
     every file has loaded, so a failed attempt leaves the backend
     uninitialized and a later call retries instead of returning early
     with an empty or partial set of models.

     Raises:
         NotInitializedException: if the model file for any source project
             does not exist.
     """
     if self._models is not None:
         return  # already initialized
     loaded = {}
     sources = annif.util.parse_sources(self.params['sources'])
     for source_project_id, _ in sources:
         model_filename = self.MODEL_FILE_PREFIX + source_project_id
         path = os.path.join(self.datadir, model_filename)
         if not os.path.exists(path):
             raise NotInitializedException(
                 "PAV model file '{}' not found".format(path),
                 backend_id=self.backend_id)
         self.debug('loading PAV model from {}'.format(path))
         loaded[source_project_id] = joblib.load(path)
     # Publish only once every source model has been loaded.
     self._models = loaded
Exemple #17
0
 def initialize(self, parallel=False):
     """Initialize the parent class, then load the Keras model if needed.

     Args:
         parallel: forwarded to the parent ``initialize``; when true, the
             model is not loaded here.

     Raises:
         NotInitializedException: if the model file does not exist.
     """
     super().initialize(parallel)
     # Nothing to do when the model is already loaded. Also don't load the
     # TF model just before parallel execution, since it won't work after
     # forking worker processes.
     if self._model is not None or parallel:
         return
     keras_path = os.path.join(self.datadir, self.MODEL_FILE)
     if os.path.exists(keras_path):
         self.debug('loading Keras model from {}'.format(keras_path))
         self._model = load_model(keras_path,
                                  custom_objects={'MeanLayer': MeanLayer})
     else:
         raise NotInitializedException(
             'model file {} not found'.format(keras_path),
             backend_id=self.backend_id)
Exemple #18
0
 def _load_subject_freq(self):
     path = os.path.join(self.datadir, self.FREQ_FILE)
     if not os.path.exists(path):
         raise NotInitializedException(
             'frequency file {} not found'.format(path),
             backend_id=self.backend_id)
     self.debug('loading concept frequencies from {}'.format(path))
     with open(path) as freqf:
         # The Counter was serialized like a dictionary, need to
         # convert it back. Keys that became strings need to be turned
         # back into integers.
         self._subject_freq = collections.Counter()
         for cid, freq in json.load(freqf).items():
             self._subject_freq[int(cid)] = freq
     self.debug('loaded frequencies for {} concepts'.format(
         len(self._subject_freq)))