Esempio n. 1
0
class TestSearcher(TestCase):
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        # Index every fixture string in a fresh character-bigram database.
        database = DictDatabase(CharacterNgramFeatureExtractor(2))
        for entry in self.strings:
            database.add(entry)
        self.searcher = Searcher(database, CosineMeasure())

    def test_search(self):
        # Exact match only at threshold 1.0; lower thresholds widen the hits.
        self.assertEqual(self.searcher.search('a', 1.0), ['a'])
        self.assertEqual(self.searcher.search('ab', 1.0), ['ab'])
        self.assertEqual(self.searcher.search('ab', 0.9), ['ab'])
        self.assertEqual(
            self.searcher.search('ab', 0.5), ['ab', 'abc', 'abcd'])
Esempio n. 2
0
class TestSearcher(TestCase):
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        # Build the searcher once per test over a bigram-indexed database.
        db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for item in self.strings:
            db.add(item)
        self.searcher = Searcher(db, CosineMeasure())

    def test_search(self):
        # High thresholds return only the exact string.
        for threshold in (1.0, 0.9):
            self.assertEqual(self.searcher.search('ab', threshold), ['ab'])
        self.assertEqual(self.searcher.search('a', 1.0), ['a'])
        # A looser threshold also admits the longer supersequences.
        self.assertEqual(self.searcher.search('ab', 0.5),
                         ['ab', 'abc', 'abcd'])
Esempio n. 3
0
def ssdb_supstring_exists(s, dbname, threshold=DEFAULT_THRESHOLD):
    """Given a string s and a DB name, returns whether at least one string in
    the associated simstring DB likely contains s as an (approximate)
    substring."""

    if threshold == 1.0:
        # optimized (not hugely, though) for this common case
        __import_simstring()
        db = ssdb_open(dbname)

        try:
            if SIMSTRING_BINARY:
                __set_db_measure(db, 'overlap')
                db.threshold = threshold

                result = db.retrieve(s)
            else:
                searcher = Searcher(db, OverlapMeasure())
                result = searcher.search(s, threshold)
        finally:
            # FIX: close the DB even if retrieval raises (it was leaked
            # on the error path before).
            db.close()

        # At threshold 1.0 a hit counts only if s is a literal substring.
        return any(s in r for r in result)
    else:
        # naive implementation for everything else
        return len(ssdb_supstring_lookup(s, dbname, threshold)) != 0
Esempio n. 4
0
def output_similar_strings_of_each_line(path, measure):
    """Profile searching every line of *path* against a DB built from it."""
    with open(path, "r") as handle:
        strings = [raw.rstrip("\r\n") for raw in handle]

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for text in strings:
        db.add(text)

    # Round-trip the database through disk before searching.
    db.save("companies.db")
    dbl = DictDatabase.load("companies.db")
    searcher = Searcher(dbl, measure)

    profiler.start()
    for text in strings:
        result = searcher.search(text, 0.8)
    profiler.stop()

    profiler.print()
    profiler.open_in_browser()
Esempio n. 5
0
 def _(bm):
     """Benchmark LeftOverlapMeasure search over the first lines of *path*."""
     searcher = Searcher(db, LeftOverlapMeasure(db))
     with open(path, "r") as handle:
         for index, raw in enumerate(handle):
             if index >= SEARCH_COUNT_LIMIT:
                 break
             query = raw.rstrip("\r\n")
             result = searcher.search(query, 0.8)
Esempio n. 6
0
 def _(bm):
     """Benchmark CosineMeasure search over the first lines of *path*."""
     searcher = Searcher(db, CosineMeasure())
     with open(path, 'r') as handle:
         for index, raw in enumerate(handle):
             if index >= SEARCH_COUNT_LIMIT:
                 break
             query = raw.rstrip('\r\n')
             result = searcher.search(query, 0.8)
Esempio n. 7
0
class TestSearcher(TestCase):
    strings = ["a", "ab", "abc", "abcd", "abcde"]

    def setUp(self):
        # One bigram-indexed database shared by all assertions in a test.
        database = DictDatabase(CharacterNgramFeatureExtractor(2))
        for text in self.strings:
            database.add(text)
        self.searcher = Searcher(database, CosineMeasure())

    def test_search1(self):
        self.assertEqual(self.searcher.search("a", 1.0), ["a"])

    def test_search2(self):
        self.assertEqual(
            self.searcher.search("ab", 0.5), ["ab", "abc", "abcd"])
        self.assertEqual(self.searcher.search("ab", 1.0), ["ab"])
        self.assertEqual(self.searcher.search("ab", 0.9), ["ab"])

    def test_search3(self):
        for threshold in (1.0, 0.9):
            self.assertEqual(self.searcher.search("abc", threshold), ["abc"])

    def test_search4(self):
        for threshold in (1.0, 0.9):
            self.assertEqual(self.searcher.search("abcd", threshold), ["abcd"])

    def test_ranked_search(self):
        self.assertEqual(self.searcher.ranked_search("abcd", 1.0),
                         OrderedDict({"abcd": 1.0}))
        expected = OrderedDict({
            "ab": 1.0,
            "abc": 0.5773502691896258,
            "abcd": 0.5163977794943222,
            "abcde": 0.47140452079103173,
        })
        self.assertEqual(self.searcher.ranked_search("ab", 0.41), expected)
Esempio n. 8
0
    def similar_words_top_k(self, query, measure=CosineMeasure(), initial_threshold=0.99, dec_step=0.01, k=3):
        """Return up to *k* words from self.db that are similar to *query*.

        The similarity threshold starts at *initial_threshold* and is
        lowered by *dec_step* until at least *k* candidates are found or
        the threshold reaches 0.1.  If more than *k* candidates remain,
        *k* of them are sampled at random (reproducibly, seed 42).

        NOTE(review): the default *measure* instance is created once at
        definition time and shared across calls — confirm it is stateless.
        """
        searcher = Searcher(self.db, measure)
        threshold = initial_threshold
        similar_words = []
        while True:
            similar_words = searcher.search(query, threshold)

            if len(similar_words) >= k or threshold <= 0.1:
                break
            threshold -= dec_step

        # FIX: compare against the k parameter (was a hard-coded 3).
        if len(similar_words) > k:
            # FIX: was np.random.choice(42), which drew and discarded a
            # random sample; seeding was clearly the intent.
            np.random.seed(42)
            return np.random.choice(similar_words, k, replace=False).tolist()
        else:
            return similar_words
Esempio n. 9
0
def ssdb_supstring_lookup(s,
                          dbname,
                          threshold=DEFAULT_THRESHOLD,
                          with_score=False):
    """Given a string s and a DB name, returns the strings in the associated
    simstring DB that likely contain s as an (approximate) substring.

    If with_score is True, returns pairs of (str,score) where score is
    the fraction of n-grams in s that are also found in the matched
    string.
    """
    db = ssdb_open(dbname)
    try:
        if SIMSTRING_BINARY:
            __set_db_measure(db, 'overlap')
            db.threshold = threshold

            result = db.retrieve(s)
        else:
            searcher = Searcher(db, OverlapMeasure())
            result = searcher.search(s, threshold)
    finally:
        # FIX: close the DB even when retrieval raises (was leaked on error).
        db.close()

    # The simstring overlap measure is symmetric and thus does not
    # differentiate between substring and superstring matches.
    # Replicate a small bit of the simstring functionality (mostly the
    # ngrams() function) to filter to substrings only.
    s_ngrams = ngrams(s)
    filtered = []
    for r in result:
        if s in r:
            # avoid calculation: simple containment => score=1
            if with_score:
                filtered.append((r, 1.0))
            else:
                filtered.append(r)
        else:
            r_ngrams = ngrams(r)
            overlap = s_ngrams & r_ngrams
            if len(overlap) >= len(s_ngrams) * threshold:
                if with_score:
                    # FIX: guard against ZeroDivisionError when s is so
                    # short that its n-gram set is empty (the >= test above
                    # is then trivially true); treat that as a full match.
                    if s_ngrams:
                        score = 1.0 * len(overlap) / len(s_ngrams)
                    else:
                        score = 1.0
                    filtered.append((r, score))
                else:
                    filtered.append(r)

    return filtered
Esempio n. 10
0
def ssdb_lookup(s,
                dbname,
                measure=DEFAULT_SIMILARITY_MEASURE,
                threshold=DEFAULT_THRESHOLD):
    """Given a string and a DB name, returns the strings matching in the
    associated simstring DB."""
    db = ssdb_open(dbname)

    try:
        if SIMSTRING_BINARY:
            __set_db_measure(db, measure)
            db.threshold = threshold

            result = db.retrieve(s)
        else:
            searcher = Searcher(db, __get_pure_measure(measure))
            result = searcher.search(s, threshold)
    finally:
        # FIX: close the DB even when retrieval raises (was leaked on error).
        db.close()

    return result
Esempio n. 11
0
class GESSimpleMatcher:
    '''
    Simple matcher for GES (Chilean health-guarantee) pathologies.  It only
    considers string similarity, nothing very sophisticated.  Based on code
    by Fabián Villena (https://fabianvillena.cl).
    It currently uses a feature extractor that combines characters and words
    and has some tweaks specific to GES texts.
    TODO:
        - try slightly more sophisticated matching techniques
        - complete the documentation
    '''
    def __init__(
            self, 
            base_ges_data='ges_utils/data/ges-health-problems.json', 
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4, 
            n_words=[2], 
            special_words=['vih']
        ):
        # NOTE(review): the mutable defaults (n_words, special_words) are
        # shared across instances; safe only while never mutated — confirm.

        # Similarity threshold passed to every search.
        self.alpha = alpha

        with open(base_ges_data, 'r', encoding='utf-8') as f:
            self.__ges_dict = json.load(f)

        # Feature extractor combining character and word n-grams.
        extractor = GESSyntacticFeatureExtractor(
                        n_chars=n_chars, 
                        n_words=n_words, 
                        special_words=special_words
                    )
        self.__db = DictDatabase(extractor)

        # Cache: raw query string -> resolved problem id.
        self.__cache = {}

        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}

        # Sentinel entry for "no GES problem matched".
        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1

        # For now the ids are just the order of the problems in the json.
        # TODO: decide whether the ids should come from a standard source.
        for i, problem in enumerate(self.__ges_dict):

            problem_id = i + 1

            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id

            # Register the problem itself as if it were a disease too.
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)

            # Add it to the searchable database.
            self.__db.add(problem)

            for disease in self.__ges_dict[problem]:

                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)

                # Add the disease to the database.
                self.__db.add(disease)

        # TODO: add extra data to match more diseases and problems.

        self.__searcher = Searcher(self.__db, CosineMeasure())

    def get_ranking_ges_diseases(self, raw_string):
        """Return ranked (score, disease) candidates for *raw_string*."""
        ranking = self.__searcher.ranked_search(raw_string, alpha=self.alpha)
        return ranking

    def get_ges_problem(self, raw_string):
        """Return the GES problem name matched by *raw_string*."""
        problem_id = self.get_ges_id(raw_string)
        problem = self.__problems[problem_id]
        return problem

    def get_ges_id(self, raw_string):
        """Return the GES problem id for *raw_string*, or -1 if no match."""
        # Serve from the cache when we already computed this query.
        if raw_string in self.__cache:
            return self.__cache[raw_string]

        # Otherwise compute it from the ranked candidates.
        ranking = self.get_ranking_ges_diseases(raw_string)

        if ranking:
            # Take the top-ranked disease and pick one of its problem ids.
            (v, disease) = ranking[0]
            problem_ids = self.__ids_from_disease[disease]
            problem_id = list(problem_ids)[0]
            self.__cache[raw_string] = problem_id
            return problem_id

        else:
            self.__cache[raw_string] = -1
            return -1

    def get_possible_ges_ids(self, raw_string):
        """Return candidate problem ids ordered by how often they matched."""
        to_search = raw_string

        problem_ids = []

        # Look up the candidate diseases.
        candidate_diseases = self.__searcher.search(to_search, alpha=self.alpha)

        for disease in candidate_diseases:
            problem_ids.extend(self.__ids_from_disease[disease])

        problem_ids_counter = Counter(problem_ids)
        ordered_ids = [i for i, _ in problem_ids_counter.most_common()]

        return ordered_ids

    def get_ges_id_prev(self, raw_string):
        """Previous implementation of get_ges_id, kept for reference."""
        # FIX: the cache used to be probed with hash(raw_string) but filled
        # with raw_string itself, so lookups never hit; key consistently on
        # the raw string.
        if raw_string in self.__cache:
            return self.__cache[raw_string]

        ids_list = self.get_possible_ges_ids(raw_string)
        if not ids_list:
            self.__cache[raw_string] = -1
            return -1
        else:
            self.__cache[raw_string] = ids_list[0]
            return ids_list[0]

    def problem_from_id(self, id_problem):
        """Return the problem name for a problem id."""
        return self.__problems[id_problem]

    def id_from_problem(self, problem):
        """Return the problem id for a problem name."""
        return self.__ids[problem]

    def clean_cache(self):
        """Drop all cached query results."""
        self.__cache = {}
Esempio n. 12
0
def make_change_image_dict(drink_names):
    """Match *drink_names* against TheCocktailDB entries and return a dict
    mapping a matched drink name to its cocktail-image URL.

    Candidates are scored both with difflib ratios and with simstring
    cosine similarity; a log of the matching is written to ./search_log.txt.
    """
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    with open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig") as ff:
        json_data2 = json.load(ff)

    # Lists of strings to compare against each other (ASCII punctuation
    # replaced by spaces).
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]): d["drinks"][0]["strDrinkThumb"] for d in json_data2}
    TCD_name_db = list(TCD_db.keys())
    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    # The searcher holds no per-query state; build it once instead of once
    # per candidate name as before.
    searcher = Searcher(db, CosineMeasure())

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        i = 1.0
        # Similarity scores fall in the range 0.0 - 1.0.
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # Update the running average score.
                    d = [(d[0] * d[1] + s) / (d[1] + 1), d[1] + 1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])

        temp = []
        # Sweep the simstring threshold downwards so each hit is first seen
        # at (roughly) its actual similarity.
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # Update the running average score.
                        d = [(d[0] * d[1] + i) / (d[1] + 1), d[1] + 1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> "+str2, file=f)
            print(">> "+str2)
            # Pick the best-scoring candidate for this cocktail name.
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if M < value_list[0]:
                    name = key
                    M = value_list[0]
            print("  "+name+": "+str(M), file=f)
            if M != 0:
                if M >= 0.76:
                    print("  "+name+": "+str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print("  "+name+": "+str(M))
                    print("out", file=f)
                    print("out")

        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length))

    # FIX: the original called exit() here, which terminated the whole
    # program and made the return statement unreachable.
    return change_image_dict
def similarity(word):
    """Return matches for the NFKC-normalized *word* at threshold 0.65."""
    normalized = normalize('NFKC', word)
    matches = Searcher(db, CosineMeasure()).search(normalized, 0.65)
    return np.array(matches)
Esempio n. 14
0
 def search_term_sims(self, term: str) -> List[str]:
     """Return database strings cosine-similar to *term* (threshold 0.8)."""
     finder = Searcher(self.db, CosineMeasure())
     return finder.search(term, 0.8)