def get_matches(evaluation_references, sample_N):
    """Fuzzy-match a fixed random sample of references and tabulate results.

    A sample of ``2 * sample_N`` rows is drawn with a fixed seed. The first
    ``sample_N`` rows are the "positive" cases (left in the matcher's
    publication set); the remaining rows are the "negative" cases, which are
    removed from the publication set (by ``uber_id``) before matching.

    Args:
        evaluation_references: DataFrame with at least ``title`` and
            ``uber_id`` columns.
        sample_N: Number of positive and of negative cases to evaluate.

    Returns:
        DataFrame built from whatever ``FuzzyMatcher.match`` returns for
        each sampled row, with two columns added: ``"Title Length"`` (the
        length of ``"Extracted title"``) and ``"Match Type"``
        (``"Positive"`` / ``"Negative"``).
    """
    # Fixed random_state keeps the sample reproducible between runs.
    sampled = evaluation_references.sample(
        n=sample_N * 2, random_state=0
    ).reset_index()
    sampled['Title'] = sampled['title']

    positives = sampled.iloc[0:sample_N]
    negatives = sampled.iloc[sample_N:]

    # Hide the negative cases from the matcher's publication set.
    is_negative = evaluation_references['uber_id'].isin(negatives['uber_id'])
    searchable = evaluation_references.loc[~is_negative]

    # NOTE(review): a threshold of -1 presumably makes every reference
    # yield a match record - confirm against FuzzyMatcher.
    matcher = FuzzyMatcher(searchable, -1)

    candidates = pd.concat([positives, negatives])
    match_records = [matcher.match(row) for _, row in candidates.iterrows()]

    results = pd.DataFrame(match_records)
    results["Title Length"] = list(map(len, results["Extracted title"]))
    # Rows keep sample order: sample_N positives followed by sample_N
    # negatives.
    results["Match Type"] = ["Positive"] * sample_N + ["Negative"] * sample_N
    return results
def test_close_match(self):
    """A title that is only part of a publication title must not match.

    The single publication is 'Malaria is caused by mosquitoes' and the
    reference title is just 'Malaria'; at the configured similarity
    threshold the matcher is expected to return ``None``.
    """
    real_publications = pd.DataFrame({
        'title': ['Malaria is caused by mosquitoes'],
        'uber_id': [1],
    })
    threshold = settings.FUZZYMATCH_SIMILARITY_THRESHOLD
    fuzzy_matcher = FuzzyMatcher(real_publications, threshold)

    reference = {
        'Document id': 10,
        'Reference id': 11,
        'Title': 'Malaria',
    }
    matched_publication = fuzzy_matcher.match(reference)

    # Identity check: unlike assertEqual(..., None) this cannot be
    # satisfied by an object with a permissive __eq__, and it reports a
    # clearer failure message.
    self.assertIsNone(matched_publication)
def test_no_string_titles(self):
    """Building a matcher from integer titles raises AttributeError."""
    numeric_titles = pd.DataFrame({'title': [1, 2]})
    with self.assertRaises(AttributeError):
        FuzzyMatcher(numeric_titles, 75)
def test_empty_title(self):
    """Building a matcher from an empty 'title' column raises ValueError."""
    no_rows = pd.DataFrame({'title': []})
    with self.assertRaises(ValueError):
        FuzzyMatcher(no_rows, 75)
def evaluate_match_references(evaluation_references, match_threshold,
                              length_threshold, sample_N):
    """Score the fuzzy matcher on a fixed random sample of references.

    A sample of ``2 * sample_N`` rows is drawn with a fixed seed. The first
    ``sample_N`` rows are positive cases (still present in the matcher's
    publication set); the rest are negative cases, removed from that set by
    ``uber_id`` so the expected prediction for them is ``None``.

    Args:
        evaluation_references: DataFrame with at least ``title``,
            ``uber_id`` and ``Reference id`` columns.
        match_threshold: Passed to ``FuzzyMatcher`` as its second argument.
        length_threshold: Passed to ``FuzzyMatcher`` as its third argument.
        sample_N: Number of positive and of negative cases.

    Returns:
        Whatever ``evaluate_metric`` returns for the expected and the
        predicted values.
    """
    # Fixed random_state keeps the sample reproducible between runs.
    sampled = evaluation_references.sample(
        n=sample_N * 2, random_state=0
    ).reset_index()
    sampled['Title'] = sampled['title']

    positives = sampled.iloc[0:sample_N]
    negatives = sampled.iloc[sample_N:]

    # Hide the negative cases from the matcher's publication set.
    held_out = evaluation_references['uber_id'].isin(negatives['uber_id'])
    matcher = FuzzyMatcher(
        evaluation_references.loc[~held_out],
        match_threshold,
        length_threshold,
    )

    records = positives.to_dict('records') + negatives.to_dict('records')
    predictions = predict_match_data(match_data=records, matcher=matcher)

    # Positives should resolve to their own reference id, negatives to None.
    expected = positives['Reference id'].to_list() + [None] * sample_N
    return evaluate_metric(expected, predictions)
def init_fuzzy_matcher(self):
    """Return a FuzzyMatcher over a two-publication fixture.

    The fixture holds the titles 'Malaria' and 'Zika' (uber_ids 1 and 2)
    and uses the similarity threshold from ``settings``.
    """
    fixture = pd.DataFrame({
        'title': ['Malaria', 'Zika'],
        'uber_id': [1, 2],
    })
    return FuzzyMatcher(fixture, settings.FUZZYMATCH_SIMILARITY_THRESHOLD)
def test_init_variables(self):
    """The constructor stores its inputs and builds a non-empty TF-IDF matrix.

    Checks, in order, that ``publications`` equals the frame passed in,
    that ``similarity_threshold`` equals the threshold passed in, and that
    ``tfidf_matrix.size`` is non-zero.
    """
    publications = pd.DataFrame({'title': ['Malaria', 'Zika']})
    similarity_threshold = 0
    matcher = FuzzyMatcher(publications, similarity_threshold)

    assert_frame_equal(matcher.publications, publications)
    self.assertEqual(matcher.similarity_threshold, similarity_threshold)

    tfidf_size = matcher.tfidf_matrix.size
    self.assertTrue(tfidf_size != 0)
def test_no_title(self):
    """Building a matcher from a frame with no 'title' column raises KeyError."""
    no_columns = pd.DataFrame({})
    with self.assertRaises(KeyError):
        FuzzyMatcher(no_columns, 75)