def _suggest(self, text, project, params):
    """Suggest subjects for a text by querying the similarity index.

    The text is vectorized with ``project.vectorizer``, the first resulting
    vector is looked up in ``self._index`` and the scores are wrapped in a
    VectorSuggestionResult together with ``project.subjects``.

    The result limit is taken from ``params['limit']`` when the caller
    supplies one; otherwise the backend-level ``self.params['limit']`` is
    used, exactly as before.
    """
    self.debug('Suggesting subjects for text "{}..." (len={})'.format(
        text[:20], len(text)))
    vectors = project.vectorizer.transform([text])
    docsim = self._index[vectors[0]]
    fullresult = VectorSuggestionResult(docsim, project.subjects)
    # Previously the ``params`` argument was silently ignored, so a limit
    # passed for this particular call could never take effect.
    if params and 'limit' in params:
        limit = params['limit']
    else:
        limit = self.params['limit']
    return fullresult.filter(limit=int(limit))
def _suggest(self, text, params):
    """Return the top suggestions for ``text`` from the similarity index.

    The text is tokenized with the project's analyzer, the tokens are joined
    with single spaces and vectorized, and the first vector is looked up in
    ``self._index``.  The result is truncated to ``params['limit']`` entries
    via ``filter``.
    """
    preview = text[:20]
    self.debug(
        'Suggesting subjects for text "{}..." (len={})'.format(
            preview, len(text)))

    words = self.project.analyzer.tokenize_words(text)
    normalized_text = " ".join(words)
    matrix = self.vectorizer.transform([normalized_text])

    similarities = self._index[matrix[0]]
    suggestions = VectorSuggestionResult(similarities, self.project.subjects)
    max_hits = int(params['limit'])
    return suggestions.filter(limit=max_hits)
def test_vector_suggestion_result_as_vector_destination(subject_index):
    """as_vector() writes into, and returns, the supplied destination."""
    size = len(subject_index)
    ones = np.ones(size, dtype=np.float32)
    result = VectorSuggestionResult(ones)

    target = np.zeros(size, dtype=np.float32)
    # Sanity check: the target buffer starts out as all zeros.
    assert not (target == ones).all()

    returned = result.as_vector(subject_index, destination=target)

    # The very same array object must come back ...
    assert returned is target
    # ... and it must now hold the original scores (all ones).
    assert (target == ones).all()
def _merge_hits_from_sources(self, hits_from_sources, params):
    """Combine per-source score vectors with the model.

    Each ``(hits, weight)`` pair contributes ``hits.vector * weight`` as one
    row of a float32 matrix.  The matrix is transposed, given a leading axis
    of length one and passed to ``self._model.predict``; the first element
    of the prediction is wrapped in a VectorSuggestionResult.
    """
    weighted_rows = []
    for hits, weight in hits_from_sources:
        weighted_rows.append(hits.vector * weight)
    score_matrix = np.array(weighted_rows, dtype=np.float32)

    # The model is fed a single-item batch, hence the extra leading axis.
    model_input = np.expand_dims(score_matrix.transpose(), 0)
    predictions = self._model.predict(model_input)
    return VectorSuggestionResult(predictions[0], self.project.subjects)
def test_hitfilter_vector_suggestion_results_with_deprecated_subjects(
        subject_index):
    """SuggestionFilter drops deprecated subjects from vector results."""
    # Register an extra subject that has neither label nor notation.
    subject_index.append('http://example.org/deprecated', None, None)

    all_ones = np.ones(len(subject_index))
    suggestions = VectorSuggestionResult(all_ones)
    apply_filter = SuggestionFilter(subject_index)
    filtered_suggestions = apply_filter(suggestions)

    # The filtered result is shorter by the number of deprecated subjects.
    deprecated_count = len(subject_index.deprecated_ids())
    assert len(suggestions) == len(filtered_suggestions) + deprecated_count

    deprecated = SubjectSuggestion(
        uri='http://example.org/deprecated',
        label=None,
        notation=None,
        score=1.0)
    assert deprecated in suggestions.as_list(subject_index)
    assert deprecated not in filtered_suggestions.as_list(subject_index)
def merge_hits(weighted_hits, subject_index):
    """Merge hits from multiple sources into one result.

    ``weighted_hits`` is a sequence of WeightedSuggestion objects, each
    providing a ``weight`` and a ``hits`` attribute.  The SubjectIndex is
    passed to ``hits.as_vector`` to obtain each source's score vector.

    Returns a VectorSuggestionResult holding the weighted average of the
    score vectors.
    """
    source_weights = []
    for entry in weighted_hits:
        source_weights.append(entry.weight)

    score_vectors = []
    for entry in weighted_hits:
        score_vectors.append(entry.hits.as_vector(subject_index))

    merged = np.average(score_vectors, weights=source_weights, axis=0)
    return VectorSuggestionResult(merged)
def _merge_hits_from_sources(self, hits_from_sources, params):
    """Combine per-source score vectors with the model.

    For every ``(hits, weight, subjects)`` triple the square root of
    ``hits.as_vector(subjects)`` is multiplied by the weight and by the
    number of sources.  The rows form a float32 matrix that is transposed,
    given a leading axis of length one and passed to ``self._model.predict``.
    The first element of the prediction becomes a VectorSuggestionResult.
    """
    rows = []
    for hits, weight, subjects in hits_from_sources:
        damped = np.sqrt(hits.as_vector(subjects))
        rows.append(damped * weight * len(hits_from_sources))
    score_matrix = np.array(rows, dtype=np.float32)

    # The model is fed a single-item batch, hence the extra leading axis.
    model_input = np.expand_dims(score_matrix.transpose(), 0)
    predictions = self._model.predict(model_input)
    return VectorSuggestionResult(predictions[0])
def _suggest_chunks(self, chunktexts, project):
    """Predict on each text chunk and average the per-chunk results.

    Chunks for which ``_inputs_to_exampletext`` yields nothing are skipped.
    When no chunk produces a result, an empty ListSuggestionResult is
    returned; otherwise the converted results are averaged element-wise into
    a VectorSuggestionResult.
    """
    chunk_vectors = []
    for chunk in chunktexts:
        prepared = self._inputs_to_exampletext(project, chunk)
        if prepared:
            # The example handed to the model is the text with one leading
            # space.
            prediction = self._model.predict(' {}'.format(prepared))
            chunk_vectors.append(self._convert_result(prediction, project))

    if chunk_vectors:
        averaged = np.array(chunk_vectors).mean(axis=0)
        return VectorSuggestionResult(averaged, project.subjects)
    # Nothing usable in any chunk: empty result.
    return ListSuggestionResult(hits=[], subject_index=project.subjects)
def _suggest_chunks(self, chunktexts, params):
    """Predict on each text chunk and average the per-chunk results.

    Chunks for which ``_inputs_to_exampletext`` yields nothing are skipped.
    When no chunk produces a result, an empty ListSuggestionResult is
    returned; otherwise the converted results are stacked as float32 and
    averaged element-wise into a VectorSuggestionResult.
    """
    chunk_vectors = []
    for chunk in chunktexts:
        prepared = self._inputs_to_exampletext(chunk)
        if prepared:
            # The example handed to the model is the text with one leading
            # space.
            prediction = self._model.predict(' {}'.format(prepared))
            chunk_vectors.append(self._convert_result(prediction))

    if chunk_vectors:
        stacked = np.array(chunk_vectors, dtype=np.float32)
        return VectorSuggestionResult(stacked.mean(axis=0))
    # Nothing usable in any chunk: empty result.
    return ListSuggestionResult([])
def _merge_hits_from_sources(self, hits_from_sources, project, params):
    """Blend raw and predicted scores for every subject.

    The ``hits.vector`` of each source forms one row of a score matrix.
    For each subject column whose scores sum to more than zero,
    ``_calculate_scores`` provides a raw and a predicted score.  They are
    mixed with a weight of ``1 / (discount_rate * subject_freq + 1)`` for the
    raw score and the complement for the predicted score.  Other subjects
    keep a score of zero.

    NOTE(review): ``params`` is not consulted; the discount rate comes from
    ``self.params`` (default ``DEFAULT_DISCOUNT_RATE``) -- confirm intended.
    """
    score_matrix = np.array([hits.vector for hits, _ in hits_from_sources])
    discount_rate = float(
        self.params.get('discount_rate', self.DEFAULT_DISCOUNT_RATE))

    subject_count = score_matrix.shape[1]
    merged = np.zeros(subject_count)
    for subj_id in range(subject_count):
        source_scores = score_matrix[:, subj_id]
        if not source_scores.sum() > 0.0:
            # No source produced a positive total for this subject.
            continue
        raw_score, pred_score = self._calculate_scores(
            subj_id, source_scores)
        raw_weight = 1.0 / \
            ((discount_rate * self._subject_freq[subj_id]) + 1)
        merged[subj_id] = (raw_weight * raw_score) + \
            (1.0 - raw_weight) * pred_score
    return VectorSuggestionResult(merged, project.subjects)
def test_vector_suggestions_enforce_score_range(subject_index):
    """Scores below 0.0 or above 1.0 come back as 0.0 and 1.0."""
    raw_scores = np.array([-0.1, 0.0, 0.5, 1.0, 1.5], dtype=np.float32)
    result = VectorSuggestionResult(raw_scores)

    actual = result.as_vector(subject_index)

    clamped = np.array([0.0, 0.0, 0.5, 1.0, 1.0], dtype=np.float32)
    assert (actual == clamped).all()
def test_vector_suggestion_result_as_vector(subject_index):
    """as_vector() returns the scores the result was created from."""
    ones = np.ones(len(subject_index), dtype=np.float32)
    result = VectorSuggestionResult(ones)

    round_tripped = result.as_vector(subject_index)

    assert (round_tripped == ones).all()
def _prediction_to_result(self, prediction, params):
    """Turn ``(score, subject_id)`` pairs into a limited suggestion result.

    A zero-filled float32 vector with one slot per project subject is
    created and each score is written at its subject id.  The vector is
    wrapped in a VectorSuggestionResult and filtered with the project's
    subjects and ``params['limit']``.
    """
    scores = np.zeros(len(self.project.subjects), dtype=np.float32)
    for value, subject_id in prediction:
        scores[subject_id] = value

    suggestions = VectorSuggestionResult(scores)
    return suggestions.filter(
        self.project.subjects, limit=int(params['limit']))