def test_list_suggestions_vector_enforce_score_range(subject_index):
    """Check the score range of the vector produced by ``as_vector``.

    The input contains one score above 1.0 and one below 0.0; the test
    expects them to show up as exactly 1.0 and 0.0 in the vector, and the
    whole vector to sum to 2.5.
    """
    raw_hits = [
        ('http://www.yso.fi/onto/yso/p7141', 'sinetit', 1.5),
        ('http://www.yso.fi/onto/yso/p6479', 'viikingit', 1.0),
        ('http://www.yso.fi/onto/yso/p14173', 'kaivaukset', 0.5),
        ('http://www.yso.fi/onto/yso/p14588', 'riimukivet', 0.0),
        ('http://www.yso.fi/onto/yso/p12738', 'viikinkiaika', -0.5),
    ]
    suggestions = ListSuggestionResult([
        SubjectSuggestion(uri=hit_uri, label=hit_label,
                          notation=None, score=hit_score)
        for hit_uri, hit_label, hit_score in raw_hits
    ])

    vector = suggestions.as_vector(subject_index)

    assert vector.sum() == 2.5
    for subject_id, score in enumerate(vector):
        # Element 1 of an index entry is compared against the labels above.
        subject_label = subject_index[subject_id][1]
        if subject_label == 'sinetit':
            # Input score was 1.5.
            assert score == 1.0
        elif subject_label == 'viikinkiaika':
            # Input score was -0.5.
            assert score == 0.0
        else:
            assert score in (1.0, 0.5, 0.0)
def test_list_suggestion_result_vector_destination(subject_index):
    """Check that ``as_vector`` returns the array passed as ``destination``."""
    hits = [
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p7141',
                          label='sinetit',
                          notation=None,
                          score=1.0),
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p6479',
                          label='viikingit',
                          notation=None,
                          score=0.5),
    ]
    suggestions = ListSuggestionResult(hits)

    # Pre-allocated buffer with one float32 slot per index entry.
    destination = np.zeros(len(subject_index), dtype=np.float32)
    vector = suggestions.as_vector(subject_index, destination=destination)

    # Identity, not equality: the very same object must come back.
    assert vector is destination
def _suggest(self, text, params):
    """Return a single suggestion built from this object's URI and label.

    ``text`` is not inspected. The score comes from ``params['score']``
    (default 1.0, converted with ``float``) and the notation from
    ``params['notation']`` (default ``None``).
    """
    fixed_score = float(params.get('score', 1.0))
    fixed_notation = params.get('notation', None)
    only_hit = SubjectSuggestion(uri=self.uri,
                                 label=self.label,
                                 notation=fixed_notation,
                                 score=fixed_score)
    return ListSuggestionResult([only_hit], self.project.subjects)
def test_hitfilter_zero_score(subject_index):
    """Check that a default ``SuggestionFilter`` drops a zero-score hit."""
    zero_hit = SubjectSuggestion(uri='uri', label='label', score=0.0)
    unfiltered = ListSuggestionResult([zero_hit], subject_index)

    hit_filter = SuggestionFilter()
    filtered = hit_filter(unfiltered)

    assert isinstance(filtered, SuggestionResult)
    assert len(filtered) == 0
def _suggest(self, text, project, params):
    """Fetch suggestions for ``text`` from a remote HTTP endpoint.

    The text (and ``params['project']`` when present) is POSTed to
    ``params['endpoint']`` through the ``data`` argument of
    ``requests.post``. The JSON reply may either be a mapping with a
    ``'results'`` entry or the sequence of hits itself; each hit must
    provide ``'uri'``, ``'label'`` and ``'score'``. Only hits with a
    score greater than 0.0 are kept.

    Any failure (HTTP error, undecodable JSON, or a payload that does
    not have the expected shape) is reported through ``self.warning``
    and yields an empty ``ListSuggestionResult`` instead of an exception.
    """
    data = {'text': text}
    if 'project' in params:
        data['project'] = params['project']

    try:
        req = requests.post(params['endpoint'], data=data)
        req.raise_for_status()
    except requests.exceptions.RequestException as err:
        self.warning("HTTP request failed: {}".format(err))
        return ListSuggestionResult([], project.subjects)

    try:
        response = req.json()
    except ValueError as err:
        self.warning("JSON decode failed: {}".format(err))
        return ListSuggestionResult([], project.subjects)

    try:
        # The membership test lives inside the guard because a payload
        # such as JSON ``null`` or a bare number raises TypeError on ``in``.
        if 'results' in response:
            results = response['results']
        else:
            results = response
        # KeyError covers hits that lack 'uri', 'label' or 'score'.
        return ListSuggestionResult(
            [SubjectSuggestion(uri=h['uri'],
                               label=h['label'],
                               score=h['score'])
             for h in results if h['score'] > 0.0],
            project.subjects)
    except (KeyError, TypeError, ValueError) as err:
        self.warning("Problem interpreting JSON data: {}".format(err))
        return ListSuggestionResult([], project.subjects)
def generate_suggestions(n, subject_index):
    """Build ``n`` synthetic suggestions with decreasing scores.

    Suggestion number ``i`` (counting from 0) gets the URI
    ``http://example.org/i``, the label ``hit i`` and the score
    ``1 / (i + 1)``. The list is wrapped in a ``ListSuggestionResult``
    together with ``subject_index``.
    """
    hits = [
        SubjectSuggestion(uri='http://example.org/{}'.format(rank),
                          label='hit {}'.format(rank),
                          score=1.0 / (rank + 1))
        for rank in range(n)
    ]
    return ListSuggestionResult(hits, subject_index)
def test_list_suggestion_result_vector_notfound(subject_index):
    """Check that the ``notfound`` URI yields an all-zero ``as_vector``.

    The URI is presumably absent from the ``subject_index`` fixture
    (suggested by the test name; the fixture is not visible here).
    """
    unknown_hit = SubjectSuggestion(uri='http://example.com/notfound',
                                    label='not found',
                                    notation=None,
                                    score=1.0)
    suggestions = ListSuggestionResult([unknown_hit])

    vector = suggestions.as_vector(subject_index)
    assert vector.sum() == 0
def test_list_suggestions_vector_notfound(document_corpus, subject_index):
    """Check that the ``notfound`` URI yields an all-zero ``vector``.

    The URI is presumably absent from the ``subject_index`` fixture
    (suggested by the test name; the fixture is not visible here).
    """
    unknown_hit = SubjectSuggestion(uri='http://example.com/notfound',
                                    label='not found',
                                    notation=None,
                                    score=1.0)
    suggestions = ListSuggestionResult([unknown_hit], subject_index)

    assert suggestions.vector.sum() == 0
def test_list_suggestions_vector(document_corpus, subject_index):
    """Check the ``vector`` view of a two-hit ``ListSuggestionResult``.

    The vector must be a NumPy array with one element per index entry,
    carry the two input scores at the entries labelled ``sinetit`` and
    ``viikingit``, and be zero everywhere else.
    """
    hits = [
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p7141',
                          label='sinetit',
                          score=1.0),
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p6479',
                          label='viikingit',
                          score=0.5),
    ]
    suggestions = ListSuggestionResult(hits, subject_index)

    assert isinstance(suggestions.vector, np.ndarray)
    assert len(suggestions.vector) == len(subject_index)
    assert suggestions.vector.sum() == 1.5

    for subject_id, score in enumerate(suggestions.vector):
        # Element 1 of an index entry is compared against the labels above.
        subject_label = subject_index[subject_id][1]
        if subject_label == 'sinetit':
            assert score == 1.0
        elif subject_label == 'viikingit':
            assert score == 0.5
        else:
            assert score == 0.0
def _response_to_result(self, response):
    """Convert a decoded response into a ``ListSuggestionResult``.

    ``response`` is expected to be a mapping with a ``'topics'`` entry
    whose items each provide ``'id'``, ``'label'`` and ``'probability'``.
    Only topics with a probability greater than 0.0 are kept.

    If the payload does not have that shape (wrong type, missing key,
    bad value), the problem is reported through ``self.warning`` and an
    empty result is returned instead of raising.
    """
    try:
        return ListSuggestionResult([
            SubjectSuggestion(
                uri=h['id'],
                label=h['label'],
                score=h['probability'])
            for h in response['topics']
            if h['probability'] > 0.0
        ], self.project.subjects)
    # KeyError covers a missing 'topics', 'id', 'label' or 'probability'
    # entry, which would otherwise escape this best-effort conversion.
    except (KeyError, TypeError, ValueError) as err:
        self.warning("Problem interpreting JSON data: {}".format(err))
        return ListSuggestionResult([], self.project.subjects)
def test_hitfilter_list_suggestion_results_with_deprecated_subjects(
        subject_index):
    """Check that filtering a list result drops the label-less subject.

    A subject with no label and no notation is appended to the index and
    suggested alongside two regular subjects; after filtering only the
    two regular suggestions must remain, in their original order.
    """
    deprecated_uri = 'http://example.org/deprecated'
    subject_index.append(deprecated_uri, None, None)

    hits = [
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p7141',
                          label='sinetit',
                          notation=None,
                          score=1.0),
        SubjectSuggestion(uri='http://www.yso.fi/onto/yso/p6479',
                          label='viikingit',
                          notation=None,
                          score=0.5),
        SubjectSuggestion(uri=deprecated_uri,
                          label=None,
                          notation=None,
                          score=0.5),
    ]
    suggestions = ListSuggestionResult(hits, subject_index)

    hit_filter = SuggestionFilter()
    filtered_suggestions = hit_filter(suggestions)

    assert isinstance(filtered_suggestions, SuggestionResult)
    assert len(filtered_suggestions) == 2
    assert filtered_suggestions[0] == suggestions[0]
    assert filtered_suggestions[1] == suggestions[1]
def _suggest(self, text, params):
    """Suggest subjects for ``text`` using ``self._model.suggest_proba``.

    The model is called with a one-element batch and its first result is
    iterated as ``(uri, score)`` pairs. For each URI the label is looked
    up in the project's subject index; URIs the index does not know get
    a ``None`` label. ``params`` is not used.

    Returns a ``ListSuggestionResult`` with one suggestion per pair.
    """
    self.debug(
        f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
    result = self._model.suggest_proba([text])[0]
    suggestions = []
    for uri, score in result:
        subject_id = self.project.subjects.by_uri(uri)
        # Compare with None explicitly: a plain truthiness test would
        # treat subject id 0 as "not found" and drop its label.
        # NOTE(review): assumes by_uri returns None for unknown URIs -
        # confirm against the subject index implementation.
        if subject_id is not None:
            label = self.project.subjects[subject_id][1]
        else:
            label = None
        suggestions.append(SubjectSuggestion(uri, label, None, score))
    return ListSuggestionResult(suggestions)
def _response_to_result(self, response):
    """Convert a decoded response into a suggestion result.

    ``response`` is expected to be a mapping with a ``'topics'`` entry
    whose items each provide ``'id'`` and ``'probability'``. Only topics
    with a probability greater than 0.0 are kept; they are created with
    no label or notation and handed to
    ``ListSuggestionResult.create_from_index`` together with the
    project's subject index.

    If the payload does not have that shape (wrong type, missing key,
    bad value), the problem is reported through ``self.warning`` and an
    empty ``ListSuggestionResult`` is returned instead of raising.
    """
    try:
        subject_suggestions = [
            SubjectSuggestion(
                uri=hit['id'],
                label=None,
                notation=None,
                score=hit['probability'])
            for hit in response['topics']
            if hit['probability'] > 0.0
        ]
    # KeyError covers a missing 'topics', 'id' or 'probability' entry,
    # which would otherwise escape this best-effort conversion.
    except (KeyError, TypeError, ValueError) as err:
        self.warning("Problem interpreting JSON data: {}".format(err))
        return ListSuggestionResult([], self.project.subjects)
    return ListSuggestionResult.create_from_index(subject_suggestions,
                                                  self.project.subjects)
def _suggest(self, text, params):
    """Suggest subjects for ``text`` using the vectorizer and model.

    The text is transformed with ``self.vectorizer``; the non-zero
    entries of the resulting matrix are passed to ``self._model.predict``
    as ``(column, value)`` pairs. Each predicted ``(subject id, score)``
    pair is resolved through the project's subject index.

    The number of predictions is taken from ``params['limit']`` when the
    caller supplies it, otherwise from ``self.params['limit']``.

    Returns a ``ListSuggestionResult`` bound to the project's subjects.
    """
    self.debug('Suggesting subjects for text "{}..." (len={})'.format(
        text[:20], len(text)))
    vector = self.vectorizer.transform([text])
    feature_values = [(col, vector[row, col])
                      for row, col in zip(*vector.nonzero())]
    # Honour the per-call parameters (previously ignored); fall back to
    # the instance-level setting so existing callers see no change.
    limit = int(params.get('limit', self.params['limit']))
    results = []
    for subj_id, score in self._model.predict(feature_values, top_k=limit):
        subject = self.project.subjects[subj_id]
        results.append(
            SubjectSuggestion(uri=subject[0], label=subject[1], score=score))
    return ListSuggestionResult(results, self.project.subjects)
def test_hitfilter_vector_suggestion_results_with_deprecated_subjects(
        subject_index):
    """Check that filtering a vector result removes the deprecated subject.

    A subject with no label and no notation is appended to the index and
    every index entry gets a score of 1. After filtering, the result
    must be shorter by the number of ids that the index reports as
    deprecated, and the appended subject must be gone from it.
    """
    deprecated_uri = 'http://example.org/deprecated'
    subject_index.append(deprecated_uri, None, None)

    all_ones = np.ones(len(subject_index))
    suggestions = VectorSuggestionResult(all_ones)

    hit_filter = SuggestionFilter(subject_index)
    filtered_suggestions = hit_filter(suggestions)

    assert len(suggestions) == (
        len(filtered_suggestions) + len(subject_index.deprecated_ids()))

    deprecated = SubjectSuggestion(uri=deprecated_uri,
                                   label=None,
                                   notation=None,
                                   score=1.0)
    assert deprecated in suggestions.as_list(subject_index)
    assert deprecated not in filtered_suggestions.as_list(subject_index)
def _suggest_chunks(self, chunktexts, params):
    """Combine per-chunk predictions into one ranked suggestion list.

    Scores returned by ``self._predict_chunks`` are summed per label
    across all chunks. The ``params['limit']`` labels with the highest
    totals are mapped to subjects with ``self._label_to_subject`` and
    scored with their total divided by the number of chunks.
    """
    limit = int(params['limit'])
    chunklabels, chunkscores = self._predict_chunks(chunktexts, limit)

    # Accumulate the score of every label over all chunks.
    totals = collections.defaultdict(float)
    for labels, scores in zip(chunklabels, chunkscores):
        for label, score in zip(labels, scores):
            totals[label] += score

    # Highest total first; equal totals are ordered by label, descending.
    ranked = sorted(totals.items(),
                    key=lambda item: (item[1], item[0]),
                    reverse=True)

    resolved = ((self._label_to_subject(label), total)
                for label, total in ranked[:limit])
    results = [
        SubjectSuggestion(uri=subject[0],
                          label=subject[1],
                          score=total / len(chunktexts))
        for subject, total in resolved
    ]
    return ListSuggestionResult(results, self.project.subjects)
def _suggest(self, text, params):
    """Suggest subjects for ``text`` from keyphrases extracted by YAKE.

    A new ``yake.KeywordExtractor`` is configured from ``params`` (and
    ``self.params['features']``) and stored on ``self._kw_extractor``.
    The extracted keyphrases are converted with
    ``self._keyphrases2suggestions``; of the first ``params['limit']``
    ``(uri, score)`` pairs, those with a score above 0.0 are handed to
    ``ListSuggestionResult.create_from_index``.
    """
    self.debug(
        f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
    limit = int(params['limit'])

    extractor_options = {
        'lan': params['language'],
        'n': int(params['max_ngram_size']),
        'dedupLim': float(params['deduplication_threshold']),
        'dedupFunc': params['deduplication_algo'],
        'windowsSize': int(params['window_size']),
        'top': int(params['num_keywords']),
        'features': self.params['features'],
    }
    self._kw_extractor = yake.KeywordExtractor(**extractor_options)

    keyphrases = self._kw_extractor.extract_keywords(text)
    ranked = self._keyphrases2suggestions(keyphrases)

    subject_suggestions = []
    for uri, score in ranked[:limit]:
        if score > 0.0:
            subject_suggestions.append(
                SubjectSuggestion(uri=uri,
                                  label=None,
                                  notation=None,
                                  score=score))
    return ListSuggestionResult.create_from_index(subject_suggestions,
                                                  self.project.subjects)
def _suggest(self, text, project, params):
    """Return a single suggestion built from this object's URI and label.

    ``text`` is not inspected. The score comes from ``params['score']``
    (default 1.0, converted with ``float``); the result is bound to
    ``project.subjects``.
    """
    fixed_score = float(params.get('score', 1.0))
    only_hit = SubjectSuggestion(uri=self.uri,
                                 label=self.label,
                                 score=fixed_score)
    return ListSuggestionResult([only_hit], project.subjects)