def select_best_fitting_record(main_zbl_record, candidates, meaning_fields=('ti', 'au', 'jt', 'py')):
    """Return the candidate record most similar to main_zbl_record.

    Similarity is the sum, over the compared fields, of the edit distance
    (text_analysis.lev) between the lower-cased field values of a candidate
    and of main_zbl_record; the candidate with the smallest sum wins.

    Only fields that are present in main_zbl_record AND in every candidate
    AND listed in meaning_fields are compared. If no such field exists,
    every candidate scores 0 and the first one is returned. Ties are
    resolved in favour of the earliest candidate in the list.

    Parameters:
        main_zbl_record -- dict-like reference record (field name -> value).
        candidates      -- non-empty sequence of dict-like records.
        meaning_fields  -- iterable of field names allowed in the comparison.

    Returns:
        One of the elements of candidates (the same object, not a copy).
        A single candidate is returned as-is without any comparison.

    Raises:
        ValueError -- if candidates is empty.

    Values of the compared fields must support .lower() (i.e. be strings).

    >>> select_best_fitting_record({'ti': 'kot', 'jt': 'Koty', 'py': '1991'}, [{'ti': 'koty', 'jt': 'Koty'}, {'ti': 'kota', 'jt': 'Kozy', 'py': '1991'}]) == {'ti': 'koty', 'jt': 'Koty'}
    True
    """
    if len(candidates) == 1:
        return candidates[0]
    if len(candidates) == 0:
        # Previously this fell through to min([]) and failed with an opaque
        # "min() arg is an empty sequence"; keep the exception type, fix the message.
        raise ValueError("candidates must contain at least one record")

    # Restrict the comparison to relevant fields shared by all records, so
    # that every candidate is scored on exactly the same set of fields.
    fields = set(main_zbl_record.keys()).intersection(meaning_fields)
    for candidate in candidates:
        fields = fields.intersection(candidate.keys())

    def total_edit_distance(candidate):
        # Sum of per-field edit distances to the main record (case-insensitive).
        return sum(
            text_analysis.lev(candidate[field].lower(), main_zbl_record[field].lower())
            for field in fields
        )

    # min() returns the first minimal element, matching the original
    # "index of the smallest distance" tie-breaking.
    return min(candidates, key=total_edit_distance)
def are_elements_almost_equal(a1, a2, max_hist_dist=4, max_edit_dist=2):
    """Tell whether a1 and a2 are close enough to be treated as equal.

    Both of the following must hold for the result to be True:
      - text_analysis.word_hist_diff_total(a1, a2) <= max_hist_dist
      - text_analysis.lev(a1, a2) <= max_edit_dist

    The histogram check runs first; the edit distance is computed only
    when the histogram check passes.

    Sample use:
    >>> are_elements_almost_equal('andrzej', 'endrzaj', 4, 2)
    True
    >>> are_elements_almost_equal('andrzej', 'endrzaje', 4, 2)
    False
    >>> are_elements_almost_equal('andrzej', 'endzraj', 4, 2)
    False
    """
    # Guard: reject as soon as the histogram distance is over the limit.
    if text_analysis.word_hist_diff_total(a1, a2) > max_hist_dist:
        return False

    # Accept only if the edit distance does not exceed its limit either.
    return not (text_analysis.lev(a1, a2) > max_edit_dist)