class TestRankedSearchJaccard(TestCase):
    """Ranked-search behavior of Searcher with the Jaccard measure."""

    def setUp(self) -> None:
        # Index a small fixed vocabulary using character-bigram features.
        db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for word in ("foo", "bar", "fooo", "food", "fool", "follow"):
            db.add(word)
        self.searcher = Searcher(db, JaccardMeasure())

    def test_ranked_search_example1(self):
        # Only the two closest strings clear the 0.5 threshold.
        expected = OrderedDict([("foo", 0.75), ("fooo", 0.6)])
        self.assertEqual(self.searcher.ranked_search("fo", 0.5), expected)

    def test_ranked_search_example2(self):
        # A looser threshold admits "food" and "fool" as well.
        expected = OrderedDict([
            ("foo", 0.75),
            ("fooo", 0.6),
            ("food", 0.3333333333333333),
            ("fool", 0.3333333333333333),
        ])
        self.assertEqual(self.searcher.ranked_search("fo", 0.3), expected)
class TestRankedSearchCosine(TestCase):
    """Ranked-search behavior of Searcher with the cosine measure."""

    def setUp(self) -> None:
        # Index a small fixed vocabulary using character-bigram features.
        db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for word in ("foo", "bar", "fooo", "food", "fool", "follow"):
            db.add(word)
        self.searcher = Searcher(db, CosineMeasure())

    def test_ranked_search_example1(self):
        # Four strings score at least 0.5 against the query "fo".
        expected = OrderedDict([
            ("foo", 0.8660254037844387),
            ("fooo", 0.7745966692414834),
            ("food", 0.5163977794943222),
            ("fool", 0.5163977794943222),
        ])
        self.assertEqual(self.searcher.ranked_search("fo", 0.5), expected)

    def test_ranked_search_example2(self):
        # Raising the threshold to 0.6 drops "food" and "fool".
        expected = OrderedDict([
            ("foo", 0.8660254037844387),
            ("fooo", 0.7745966692414834),
        ])
        self.assertEqual(self.searcher.ranked_search("fo", 0.6), expected)
class SimString_UMLS(object):
    """SimString-based matcher over a pickled UMLS alias database.

    Loads a SimString DictDatabase and an alias -> CUI mapping from disk,
    then answers similarity queries at three levels: raw aliases
    (``match``), concepts (``match_cuis``) and semantic types
    (``match_sts``).
    """

    def __init__(self, umls_db, db_path, cui_mapping_path, alpha=0.5):
        """Store the UMLS handle and threshold, then load both pickles.

        :param umls_db: object exposing ``get_sts(cui)``.
        :param db_path: path to the pickled SimString database.
        :param cui_mapping_path: path to the pickled alias -> CUI mapping.
        :param alpha: similarity threshold passed to ranked_search.
        """
        self.db = None
        self.umls_db = umls_db
        self.cui_mapping = None
        self.searcher = None
        self.alpha = alpha
        self.load(db_path, cui_mapping_path)

    def load(self, db_path, cui_mapping_path):
        """Unpickle the database and mapping, then build the searcher."""
        # NOTE(review): pickle.load is only safe on trusted files; these
        # paths are assumed to point at locally produced artifacts.
        logging.info('Loading DB ...')
        with open(db_path, 'rb') as handle:
            self.db = pickle.load(handle)
        logging.info('Loading Mapping ...')
        with open(cui_mapping_path, 'rb') as handle:
            self.cui_mapping = pickle.load(handle)
        logging.info('Creating Searcher ...')
        self.searcher = Searcher(self.db, CosineMeasure())

    @lru_cache(262144)
    def match(self, text):
        """Return ranked (alias, similarity) pairs for *text*.

        The searcher yields (similarity, alias); the pairs are flipped
        to (alias, similarity) to be consistent with other matchers.
        """
        # NOTE(review): lru_cache on an instance method keys on self and
        # keeps the instance alive for the cache's lifetime.
        hits = self.searcher.ranked_search(text, alpha=self.alpha)
        return [(alias, sim) for sim, alias in hits]

    def match_cuis(self, text):
        """Expand alias matches into (cui, similarity) pairs.

        Each CUI is reported once, with the score of the first (i.e. best
        ranked) alias that produced it.
        """
        seen = set()
        expanded = []
        for alias, sim in self.match(text):
            for cui in self.cui_mapping[alias]:
                if cui in seen:
                    continue
                seen.add(cui)
                expanded.append((cui, sim))
        return expanded

    def match_sts(self, text):
        """Return (semantic_type, best_similarity) pairs.

        Each semantic type keeps the maximum score among its CUIs; the
        result is sorted by score then type name, descending.
        """
        best = {}
        for cui, sim in self.match_cuis(text):
            for st in self.umls_db.get_sts(cui):
                prev = best.get(st)
                best[st] = sim if prev is None else max(prev, sim)
        return sorted(best.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
def output_similar_strings_of_each_line(path):
    """Print, for every line of *path*, its near-matches from the same file.

    Each output row is the query line, a tab, then a comma-separated list
    of "<score rounded to 5 places> <matched string>" entries for cosine
    matches scoring at least 0.8.
    """
    # First pass: index every line as a character-bigram entry.
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as fh:
        for raw in fh:
            db.add(raw.rstrip('\r\n'))

    # Second pass: query each line against the full index.
    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as fh:
        for raw in fh:
            query = raw.rstrip('\r\n')
            matches = [
                str(round(score, 5)) + ' ' + text
                for score, text in searcher.ranked_search(query, 0.8)
            ]
            print("\t".join([query, ",".join(matches)]))
class TestSearcher(TestCase):
    """Exact and ranked cosine searches over the prefix chain a..abcde."""

    strings = ["a", "ab", "abc", "abcd", "abcde"]

    def setUp(self):
        db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for entry in self.strings:
            db.add(entry)
        self.searcher = Searcher(db, CosineMeasure())

    def test_search1(self):
        self.assertEqual(self.searcher.search("a", 1.0), ["a"])

    def test_search2(self):
        # Lower thresholds admit the longer prefixes of "ab".
        for alpha, expected in ((0.5, ["ab", "abc", "abcd"]),
                                (1.0, ["ab"]),
                                (0.9, ["ab"])):
            self.assertEqual(self.searcher.search("ab", alpha), expected)

    def test_search3(self):
        for alpha in (1.0, 0.9):
            self.assertEqual(self.searcher.search("abc", alpha), ["abc"])

    def test_search4(self):
        for alpha in (1.0, 0.9):
            self.assertEqual(self.searcher.search("abcd", alpha), ["abcd"])

    def test_ranked_search(self):
        self.assertEqual(self.searcher.ranked_search("abcd", 1.0),
                         OrderedDict([("abcd", 1.0)]))
        expected = OrderedDict([
            ("ab", 1.0),
            ("abc", 0.5773502691896258),
            ("abcd", 0.5163977794943222),
            ("abcde", 0.47140452079103173),
        ])
        self.assertEqual(self.searcher.ranked_search("ab", 0.41), expected)
def output_similar_strings_of_each_line(path):
    """For each line in the file at *path*, print its similar lines.

    Output format per row: the query line, a tab, then comma-separated
    "<rounded score> <candidate>" entries for cosine matches >= 0.8.
    """
    db = DictDatabase(CharacterNgramFeatureExtractor(2))

    # Build the index from every line of the file.
    with open(path, 'r') as source:
        for line in source:
            db.add(line.rstrip('\r\n'))

    searcher = Searcher(db, CosineMeasure())

    # Re-read the file and report matches for each line.
    with open(path, 'r') as source:
        for line in source:
            query = line.rstrip('\r\n')
            pieces = []
            for score, candidate in searcher.ranked_search(query, 0.8):
                pieces.append(str(round(score, 5)) + ' ' + candidate)
            print("\t".join([query, ",".join(pieces)]))
class GESSimpleMatcher:
    '''
    Simple string matcher for GES health problems (pathologies).

    Only considers similarity between strings, nothing very sophisticated.
    Based on code by Fabián Villena (https://fabianvillena.cl).

    Currently uses a feature extractor that combines characters and words
    and has a few GES-text-specific tweaks.

    TODO:
    - try slightly more sophisticated matching techniques
    - complete the documentation
    '''

    def __init__(
            self,
            base_ges_data='ges_utils/data/ges-health-problems.json',
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4,
            n_words=None,
            special_words=None):
        """Build the SimString database from the GES problem/disease JSON.

        :param base_ges_data: path to a JSON mapping problem -> diseases.
        :param no_ges_str: label returned for queries with no match.
        :param alpha: similarity threshold used by the searcher.
        :param n_chars: character n-gram size for the feature extractor.
        :param n_words: word n-gram sizes; defaults to [2].
        :param special_words: specially-treated words; defaults to ['vih'].
        """
        # Fix: avoid mutable default arguments ([2], ['vih']) shared
        # across all instances; the effective defaults are unchanged.
        if n_words is None:
            n_words = [2]
        if special_words is None:
            special_words = ['vih']

        self.alpha = alpha

        with open(base_ges_data, 'r', encoding='utf-8') as f:
            self.__ges_dict = json.load(f)

        # feature extractor
        extractor = GESSyntacticFeatureExtractor(
            n_chars=n_chars,
            n_words=n_words,
            special_words=special_words
        )
        self.__db = DictDatabase(extractor)

        # Cache of raw query string -> problem id.
        self.__cache = {}

        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}

        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1

        # For now the ids are simply the order of the problems in the json.
        # TODO: decide whether the ids should come from some standard source.
        for i, problem in enumerate(self.__ges_dict):
            problem_id = i + 1
            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id

            # register the problem itself as if it were a disease too
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)

            # add to the DB
            self.__db.add(problem)

            for disease in self.__ges_dict[problem]:
                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)

                # add to the DB
                self.__db.add(disease)

        # TODO: add extra data to match diseases and problems
        self.__searcher = Searcher(self.__db, CosineMeasure())

    def get_ranking_ges_diseases(self, raw_string):
        """Return ranked (similarity, disease) pairs for *raw_string*."""
        return self.__searcher.ranked_search(raw_string, alpha=self.alpha)

    def get_ges_problem(self, raw_string):
        """Return the GES problem name best matching *raw_string*."""
        return self.__problems[self.get_ges_id(raw_string)]

    def get_ges_id(self, raw_string):
        """Return the id of the best-matching GES problem, or -1 if none."""
        # if we already computed it, return the cached value
        if raw_string in self.__cache:
            return self.__cache[raw_string]

        ranking = self.get_ranking_ges_diseases(raw_string)
        if not ranking:
            self.__cache[raw_string] = -1
            return -1

        (_, disease) = ranking[0]
        problem_ids = self.__ids_from_disease[disease]
        # NOTE(review): set iteration order is arbitrary, so with several
        # candidate ids this pick is nondeterministic — behavior kept as
        # in the original; consider min() or most_common() instead.
        problem_id = list(problem_ids)[0]
        self.__cache[raw_string] = problem_id
        return problem_id

    def get_possible_ges_ids(self, raw_string):
        """Return candidate problem ids ordered by how many matched diseases vote for them."""
        problem_ids = []
        # collect ids from all candidate diseases above the threshold
        for disease in self.__searcher.search(raw_string, alpha=self.alpha):
            problem_ids.extend(self.__ids_from_disease[disease])
        counter = Counter(problem_ids)
        return [problem_id for problem_id, _ in counter.most_common()]

    def get_ges_id_prev(self, raw_string):
        """Previous id-lookup strategy (vote-based); shares the query cache."""
        # Fix: the cache was probed with hash(raw_string) but populated
        # with raw_string itself, so lookups never hit (and int keys could
        # collide with strings). Use the raw string consistently, matching
        # get_ges_id.
        if raw_string in self.__cache:
            return self.__cache[raw_string]

        ids_list = self.get_possible_ges_ids(raw_string)
        if not ids_list:
            self.__cache[raw_string] = -1
            return -1
        self.__cache[raw_string] = ids_list[0]
        return ids_list[0]

    def problem_from_id(self, id_problem):
        """Return the problem name for a given id."""
        return self.__problems[id_problem]

    def id_from_problem(self, problem):
        """Return the id for a given problem name."""
        return self.__ids[problem]

    def clean_cache(self):
        """Drop all memoized query results."""
        self.__cache = {}