def skos(self):
    """Return the subject vocabulary, loading it from disk on first use.

    The cached vocabulary is returned when present. Otherwise the graph
    dump ``subjects.dump.gz`` in ``self.datadir`` is tried first; if it is
    missing or loading it raises ModuleNotFoundError, the Turtle file
    ``subjects.ttl`` is parsed instead and ``save_skos`` is called on the
    result.

    Raises:
        NotInitializedException: if ``subjects.ttl`` does not exist.
    """
    if self._skos_vocab is not None:
        return self._skos_vocab

    # First choice: the graph dump file.
    dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
    if os.path.exists(dumppath):
        logger.debug(f'loading graph dump from {dumppath}')
        try:
            vocab = annif.corpus.SubjectFileSKOS(dumppath, self.language)
        except ModuleNotFoundError:
            # Probably the dump was saved using a different rdflib version
            logger.debug('could not load graph dump, using turtle file')
        else:
            self._skos_vocab = vocab
            return vocab

    # No usable dump - fall back to parsing the ttl file.
    path = os.path.join(self.datadir, 'subjects.ttl')
    if not os.path.exists(path):
        raise NotInitializedException(f'graph file {path} not found')

    logger.debug(f'loading graph from {path}')
    vocab = annif.corpus.SubjectFileSKOS(path, self.language)
    self._skos_vocab = vocab
    # store the dump file so we can use it next time
    vocab.save_skos(path, self.language)
    return vocab
def _load_train_data(self):
    """Load and return the stored training data using joblib.

    The file is ``self.TRAIN_FILE`` inside ``self.datadir``.

    Raises:
        NotInitializedException: if the train data file does not exist.
    """
    path = os.path.join(self.datadir, self.TRAIN_FILE)
    if not os.path.exists(path):
        raise NotInitializedException(
            'train data file {} not found'.format(path),
            backend_id=self.backend_id)
    return joblib.load(path)
def _load_model(self):
    """Load and return the MLLM model stored in the data directory.

    The file is ``self.MODEL_FILE`` inside ``self.datadir``. The load
    attempt is logged before the existence check.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    path = os.path.join(self.datadir, self.MODEL_FILE)
    self.debug('loading model from {}'.format(path))
    if not os.path.exists(path):
        raise NotInitializedException('model {} not found'.format(path),
                                      backend_id=self.backend_id)
    return MLLMModel.load(path)
def initialize(self):
    """Load the STWFSA model from the data directory if not yet loaded.

    Does nothing when ``self._model`` is already set.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    if self._model is not None:
        return

    path = os.path.join(self.datadir, self.MODEL_FILE)
    self.debug(f'Loading STWFSA model from {path}.')
    if not os.path.exists(path):
        raise NotInitializedException(f'Model not found at {path}',
                                      backend_id=self.backend_id)
    self._model = StwfsapyPredictor.load(path)
    self.debug('Loaded model.')
def _initialize_model(self):
    """Load the omikuji model from the data directory if not yet loaded.

    Does nothing when ``self._model`` is already set.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    if self._model is not None:
        return

    path = os.path.join(self.datadir, self.MODEL_FILE)
    self.debug('loading model from {}'.format(path))
    if not os.path.exists(path):
        raise NotInitializedException(
            'model {} not found'.format(path),
            backend_id=self.backend_id)
    self._model = omikuji.Model.load(path)
def subjects(self):
    """Return the subject index, loading it from disk on first use.

    The index is read from the ``subjects`` file in ``self.datadir`` and
    cached in ``self._subjects``.

    Raises:
        NotInitializedException: if the subject file does not exist.
    """
    if self._subjects is not None:
        return self._subjects

    path = os.path.join(self.datadir, 'subjects')
    if not os.path.exists(path):
        raise NotInitializedException(
            "subject file {} not found".format(path))
    logger.debug('loading subjects from %s', path)
    self._subjects = annif.corpus.SubjectIndex.load(path)
    return self._subjects
def initialize(self):
    """Load the Keras model from the data directory if not yet loaded.

    Does nothing when ``self._model`` is already set.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    if self._model is None:
        model_filename = os.path.join(self.datadir, self.MODEL_FILE)
        if os.path.exists(model_filename):
            self.debug(
                'loading Keras model from {}'.format(model_filename))
            self._model = load_model(model_filename)
        else:
            raise NotInitializedException(
                'model file {} not found'.format(model_filename),
                backend_id=self.backend_id)
def initialize_vectorizer(self):
    """Load the vectorizer with joblib if it has not been loaded yet.

    Does nothing when ``self.vectorizer`` is already set. Otherwise the
    file ``self.VECTORIZER_FILE`` in ``self.datadir`` is loaded.

    Raises:
        NotInitializedException: if the vectorizer file does not exist.
    """
    if self.vectorizer is not None:
        return

    path = os.path.join(self.datadir, self.VECTORIZER_FILE)
    if not os.path.exists(path):
        raise NotInitializedException(
            "vectorizer file '{}' not found".format(path),
            backend_id=self.backend_id)
    self.debug('loading vectorizer from {}'.format(path))
    self.vectorizer = joblib.load(path)
def vectorizer(self):
    """Return the vectorizer, loading it with joblib on first use.

    The vectorizer is read from the ``vectorizer`` file in
    ``self.datadir`` and cached in ``self._vectorizer``.

    Raises:
        NotInitializedException: if the vectorizer file does not exist.
    """
    if self._vectorizer is not None:
        return self._vectorizer

    path = os.path.join(self.datadir, 'vectorizer')
    if not os.path.exists(path):
        raise NotInitializedException(
            "vectorizer file '{}' not found".format(path),
            project_id=self.project_id)
    logger.debug('loading vectorizer from %s', path)
    self._vectorizer = joblib.load(path)
    return self._vectorizer
def _initialize_index(self):
    """Load the gensim similarity index if it has not been loaded yet.

    Does nothing when ``self._index`` is already set. Otherwise the file
    ``self.INDEX_FILE`` in ``self.datadir`` is loaded.

    Raises:
        NotInitializedException: if the index file does not exist.
    """
    if self._index is not None:
        return

    path = os.path.join(self.datadir, self.INDEX_FILE)
    self.debug('loading similarity index from {}'.format(path))
    if not os.path.exists(path):
        raise NotInitializedException(
            'similarity index {} not found'.format(path),
            backend_id=self.backend_id)
    self._index = gensim.similarities.SparseMatrixSimilarity.load(path)
def skos(self):
    """Return the subject vocabulary parsed from the SKOS Turtle file.

    The vocabulary is read from ``subjects.ttl`` in ``self.datadir`` on
    first use and cached in ``self._skos_vocab``.

    Raises:
        NotInitializedException: if ``subjects.ttl`` does not exist.
    """
    if self._skos_vocab is not None:
        return self._skos_vocab

    path = os.path.join(self.datadir, 'subjects.ttl')
    if not os.path.exists(path):
        raise NotInitializedException(f'graph file {path} not found')
    logger.debug(f'loading graph from {path}')
    self._skos_vocab = annif.corpus.SubjectFileSKOS(path, self.language)
    return self._skos_vocab
def initialize(self):
    """Load the fastText model from the data directory if not yet loaded.

    Does nothing when ``self._model`` is already set. After loading, the
    model and its dimension are written to the debug log.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    if self._model is not None:
        return

    path = os.path.join(self.datadir, self.MODEL_FILE)
    self.debug('loading fastText model from {}'.format(path))
    if not os.path.exists(path):
        raise NotInitializedException(
            'model {} not found'.format(path),
            backend_id=self.backend_id)

    self._model = self._load_model(path)
    self.debug('loaded model {}'.format(str(self._model)))
    self.debug('dim: {}'.format(self._model.get_dimension()))
def suggest(self, text, backend_params=None):
    """Suggest subjects for the given text by passing it to the backend.

    Returns the hits from ``_suggest_with_backend``: a list of
    SubjectSuggestion objects ordered by decreasing score.

    Raises:
        NotInitializedException: if ``is_trained`` is falsy but not None.
            When it is None only a warning is logged and the suggestion
            proceeds.
    """
    if not self.is_trained:
        if self.is_trained is not None:
            raise NotInitializedException('Project is not trained.')
        # Train state unknown: warn, but still try to suggest.
        logger.warning('Could not get train state information.')

    logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                 text[:20], len(text))
    hits = self._suggest_with_backend(text, backend_params)
    logger.debug('%d hits from backend', len(hits))
    return hits
def _initialize_model(self):
    """Load the omikuji model from the data directory if not yet loaded.

    Does nothing when ``self._model`` is already set.

    Raises:
        NotInitializedException: if the model file does not exist.
        OperationFailedException: if loading the model raises
            RuntimeError.
    """
    if self._model is not None:
        return

    path = os.path.join(self.datadir, self.MODEL_FILE)
    self.debug('loading model from {}'.format(path))
    if not os.path.exists(path):
        raise NotInitializedException(
            'model {} not found'.format(path),
            backend_id=self.backend_id)

    try:
        self._model = omikuji.Model.load(path)
    except RuntimeError:
        # A RuntimeError from the loader is reported as an incompatible
        # model that needs retraining.
        raise OperationFailedException(
            "Omikuji models trained on Annif versions older than "
            "0.56 cannot be loaded. Please retrain your project.")
def initialize(self):
    """Load the VW model from the data directory if not yet loaded.

    Does nothing when ``self._model`` is already set. The model is created
    from the parameters returned by ``_create_params``, with any
    ``passes`` entry removed first.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    if self._model is not None:
        return

    path = os.path.join(self.datadir, self.MODEL_FILE)
    if not os.path.exists(path):
        raise NotInitializedException(
            'model {} not found'.format(path),
            backend_id=self.backend_id)

    self.debug('loading VW model from {}'.format(path))
    params = self._create_params({'i': path, 'quiet': True})
    if 'passes' in params:
        # don't confuse the model with passes
        del params['passes']
    self.debug("model parameters: {}".format(params))
    self._model = pyvw.vw(**params)
    self.debug('loaded model {}'.format(str(self._model)))
def initialize(self):
    """Load the PAV model of every configured source project, once.

    Does nothing when ``self._models`` is already set. Otherwise the
    sources listed in ``self.params['sources']`` are parsed, and for each
    source project the file ``MODEL_FILE_PREFIX + project_id`` in
    ``self.datadir`` is loaded with joblib.

    ``self._models`` is assigned only after every model has loaded. If any
    step fails it stays ``None``, so the next call retries instead of
    treating a partly filled dict as "already initialized".

    Raises:
        NotInitializedException: if a PAV model file does not exist.
    """
    if self._models is not None:
        return  # already initialized

    # Collect into a local dict and publish it at the very end, so that a
    # failure part-way through cannot leave a non-None, incomplete
    # self._models behind.
    models = {}
    sources = annif.util.parse_sources(self.params['sources'])
    for source_project_id, _ in sources:
        model_filename = self.MODEL_FILE_PREFIX + source_project_id
        path = os.path.join(self.datadir, model_filename)
        if not os.path.exists(path):
            raise NotInitializedException(
                "PAV model file '{}' not found".format(path),
                backend_id=self.backend_id)
        self.debug('loading PAV model from {}'.format(path))
        models[source_project_id] = joblib.load(path)

    self._models = models
def initialize(self, parallel=False):
    """Initialize the parent class, then load the Keras model if needed.

    ``super().initialize(parallel)`` always runs first. The model is not
    loaded when it is already set or when ``parallel`` is true.

    Raises:
        NotInitializedException: if the model file does not exist.
    """
    super().initialize(parallel)

    # Skip when already initialized. Also skip just before parallel
    # execution: the TF model won't work after forking worker processes.
    if self._model is not None or parallel:
        return

    model_filename = os.path.join(self.datadir, self.MODEL_FILE)
    if os.path.exists(model_filename):
        self.debug('loading Keras model from {}'.format(model_filename))
        self._model = load_model(model_filename,
                                 custom_objects={'MeanLayer': MeanLayer})
    else:
        raise NotInitializedException(
            'model file {} not found'.format(model_filename),
            backend_id=self.backend_id)
def _load_subject_freq(self):
    """Load concept frequencies from the JSON frequency file.

    Reads ``self.FREQ_FILE`` from ``self.datadir`` and stores the result
    in ``self._subject_freq`` as a ``collections.Counter`` with integer
    keys.

    Raises:
        NotInitializedException: if the frequency file does not exist.
    """
    path = os.path.join(self.datadir, self.FREQ_FILE)
    if os.path.exists(path):
        self.debug('loading concept frequencies from {}'.format(path))
    else:
        raise NotInitializedException(
            'frequency file {} not found'.format(path),
            backend_id=self.backend_id)

    with open(path) as freqf:
        # The Counter was serialized as a plain dictionary, so its keys
        # became strings; rebuild it with the keys converted back to int.
        counts = collections.Counter()
        self._subject_freq = counts
        stored = json.load(freqf)
        for key, count in stored.items():
            counts[int(key)] = count

    self.debug('loaded frequencies for {} concepts'.format(
        len(self._subject_freq)))