Ejemplo n.º 1
0
def get_high_recall_matches_for_df(post_df):
    """Run the high-recall UMLS matcher over every row of *post_df*.

    Builds one Hebrew and one English cosine searcher up front, then
    returns a list with one match result per dataframe row (same order).
    """
    heb_db, eng_db, umls_data = get_umls_data()
    heb_searcher = Searcher(heb_db, CosineMeasure())
    eng_searcher = Searcher(eng_db, CosineMeasure())
    return [
        get_english_and_hebrew_matches(eng_searcher, heb_searcher, post_row, umls_data)
        for _, post_row in post_df.iterrows()
    ]
Ejemplo n.º 2
0
def main():
    """Build the Hebrew/English searchers once and process each community."""
    heb_db, eng_db, umls_data = get_umls_data()

    heb_searcher = Searcher(heb_db, CosineMeasure())
    eng_searcher = Searcher(eng_db, CosineMeasure())

    # Same three communities, same order as before.
    for community in (SCLEROSIS, DIABETES, DEPRESSION):
        handle_community(community, heb_searcher, eng_searcher, umls_data)

    print("Done")
Ejemplo n.º 3
0
class TestCosine(TestCase):
    """Unit tests for CosineMeasure feature-size bounds and similarity."""

    measure = CosineMeasure()

    def test_min_feature_size(self):
        for query_size, alpha, expected in [(5, 1.0, 5), (5, 0.5, 2)]:
            self.assertEqual(self.measure.min_feature_size(query_size, alpha), expected)

    def test_max_feature_size(self):
        for query_size, alpha, expected in [(5, 1.0, 5), (5, 0.5, 20)]:
            self.assertEqual(self.measure.max_feature_size(query_size, alpha), expected)

    def test_minimum_common_feature_count(self):
        cases = [
            (5, 5, 1.0, 5),
            (5, 20, 1.0, 10),
            (5, 5, 0.5, 3),
        ]
        for query_size, candidate_size, alpha, expected in cases:
            self.assertEqual(
                self.measure.minimum_common_feature_count(query_size, candidate_size, alpha),
                expected)

    def test_similarity(self):
        x = [1, 2, 3]
        y = [1, 2, 3, 4]
        # Identical feature lists score 1.0; a superset scores slightly less.
        self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.87)

        # Duplicated features must not break self-similarity.
        z = [1, 1, 2, 3]
        self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
Ejemplo n.º 4
0
 def _(bm):
     """Benchmark body: cosine-search each of the first SEARCH_COUNT_LIMIT lines."""
     searcher = Searcher(db, CosineMeasure())
     with open(path, 'r') as handle:
         for line_no, raw_line in enumerate(handle):
             if line_no >= SEARCH_COUNT_LIMIT:
                 break
             query = raw_line.rstrip('\r\n')
             result = searcher.search(query, 0.8)
Ejemplo n.º 5
0
def use_demo_ontology():
    '''
    Point the module-level searcher and term-to-CUI mapping
    at the bundled demo ontology.
    '''
    global simstring_searcher, term_to_cui

    term_to_cui = demo_mappings
    simstring_searcher = Searcher(demo_database, CosineMeasure())
Ejemplo n.º 6
0
 def setUp(self) -> None:
     """Populate a small character-bigram database and build the searcher under test."""
     db = DictDatabase(CharacterNgramFeatureExtractor(2))
     for word in ("foo", "bar", "fooo", "food", "fool", "follow"):
         db.add(word)
     self.searcher = Searcher(db, CosineMeasure())
Ejemplo n.º 7
0
def setup_custom_ontology(request):
    '''
    Build the module-level searcher and term-to-CUI mapping from
    an ontology uploaded in the request body (one entry per line).
    '''
    global simstring_searcher, term_to_cui

    raw_lines = request.POST['ontologyData'].split('\n')
    database, term_to_cui = construct_ontology(raw_lines)
    simstring_searcher = Searcher(database, CosineMeasure())

    return HttpResponse(None)
Ejemplo n.º 8
0
    def load(self, db_path, cui_mapping_path):
        """Load the pickled simstring DB and CUI mapping, then build the searcher.

        NOTE(review): pickle.load executes arbitrary code from the file —
        only load DB/mapping files from a trusted source.
        """
        logging.info('Loading DB ...')
        with open(db_path, 'rb') as db_file:
            self.db = pickle.load(db_file)

        logging.info('Loading Mapping ...')
        with open(cui_mapping_path, 'rb') as mapping_file:
            self.cui_mapping = pickle.load(mapping_file)

        logging.info('Creating Searcher ...')
        self.searcher = Searcher(self.db, CosineMeasure())
Ejemplo n.º 9
0
def setup_preloaded_ontology(selected_ontology):
    '''
    Activate a pre-loaded ontology for automated
    mapping suggestions.
    '''
    global simstring_searcher, term_to_cui

    # Only the UMLS ontology is bundled; any other value leaves the
    # module-level globals untouched.
    if selected_ontology == 'umls':
        term_to_cui = umls_mappings
        simstring_searcher = Searcher(umls_database, CosineMeasure())

    return HttpResponse(None)
Ejemplo n.º 10
0
def output_similar_strings_of_each_line(path):
    """Index every line of *path*, then print each line with its >= 0.8 cosine matches.

    Output format per line: "<query>\t<score match>,<score match>,...".
    """
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as handle:
        for raw in handle:
            db.add(raw.rstrip('\r\n'))

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as handle:
        for raw in handle:
            query = raw.rstrip('\r\n')
            matches = []
            for score, text in searcher.ranked_search(query, 0.8):
                matches.append(str(round(score, 5)) + ' ' + text)
            print("\t".join([query, ",".join(matches)]))
Ejemplo n.º 11
0
    def similar_words_top_k(self, query, measure=CosineMeasure(), initial_threshold=0.99, dec_step=0.01, k=3):
        """Search for words similar to *query*, relaxing the threshold until >= k hits.

        The cosine threshold starts at *initial_threshold* and is lowered by
        *dec_step* until at least *k* matches are found or the 0.1 floor is
        reached. If more than *k* matches come back, *k* of them are sampled
        (deterministically, via a fixed seed).

        NOTE(review): the shared default `measure=CosineMeasure()` is evaluated
        once at definition time; harmless only if the measure is stateless.
        """
        searcher = Searcher(self.db, measure)
        threshold = initial_threshold
        similar_words = []
        while True:
            similar_words = searcher.search(query, threshold)

            # Stop once we have enough candidates, or the threshold floor is hit.
            if len(similar_words) >= k or threshold <= 0.1:
                break
            threshold -= dec_step

        if len(similar_words) > k:  # FIX: was hard-coded 3, ignoring the k parameter
            np.random.seed(42)  # FIX: was np.random.choice(42) — a discarded draw, not a seed
            return np.random.choice(similar_words, k, replace=False).tolist()
        else:
            return similar_words
Ejemplo n.º 12
0
class TestCosine(TestCase):
    """Unit tests for CosineMeasure over n-gram feature lists of strings."""

    measure = CosineMeasure()

    def test_min_feature_size(self):
        for query_size, alpha, expected in [(5, 1.0, 5), (5, 0.5, 2)]:
            self.assertEqual(self.measure.min_feature_size(query_size, alpha), expected)

    def test_max_feature_size(self):
        for query_size, alpha, expected in [(5, 1.0, 5), (5, 0.5, 20)]:
            self.assertEqual(self.measure.max_feature_size(query_size, alpha), expected)

    def test_minimum_common_feature_count(self):
        cases = [
            (5, 5, 1.0, 5),
            (5, 20, 1.0, 10),
            (5, 5, 0.5, 3),
        ]
        for query_size, candidate_size, alpha, expected in cases:
            self.assertEqual(
                self.measure.minimum_common_feature_count(query_size, candidate_size, alpha),
                expected)

    def test_similarity(self):
        x = ["a", "ab", "bc", "c"]
        y = ["a", "ab", "bc", "cd", "e"]
        self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67)

        # Duplicated features must not break self-similarity.
        z = ["a", "ab", "ba", "ab", "a"]
        self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
        self.assertEqual(round(self.measure.similarity(x, z), 2), 0.58)
        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67)

        # Trigrams (with quotes) of "methyl sulfone" vs "methyl sulphone",
        # the worked example from the SimString paper.
        a = [
            ' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su',
            'sul', 'ulf', 'lfo', 'fon', 'one', 'ne"', 'e" '
        ]
        b = [
            ' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su',
            'sul', 'ulp', 'lph', 'pho', 'hon', 'one', 'ne"', 'e" '
        ]
        # NOTE(review): an earlier comment questioned whether this matches the
        # paper's quoted 0.788; the value asserted here IS 0.788 — verify once.
        self.assertEqual(
            round(self.measure.similarity(a, b), 3),
            0.788)
def similarity(word):
    """Return cosine matches (as an ndarray) for the NFKC-normalized *word* at threshold 0.65."""
    normalized = normalize('NFKC', word)
    matches = Searcher(db, CosineMeasure()).search(normalized, 0.65)
    return np.array(matches)
Ejemplo n.º 14
0
 def setUp(self):
     """Index every entry of self.strings into a bigram DB and create the searcher."""
     database = DictDatabase(CharacterNgramFeatureExtractor(2))
     for entry in self.strings:
         database.add(entry)
     self.searcher = Searcher(database, CosineMeasure())
Ejemplo n.º 15
0
    def __init__(
            self, 
            base_ges_data='ges_utils/data/ges-health-problems.json', 
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4, 
            n_words=[2], 
            special_words=['vih']
        ):
        """Load the GES health-problem JSON and build the disease/problem lookup tables
        plus a simstring cosine searcher over problem and disease names.

        NOTE(review): the mutable defaults n_words=[2] and special_words=['vih'] are
        shared across instances — harmless only if never mutated; confirm.
        """

        self.alpha = alpha

        with open(base_ges_data,'r',encoding='utf-8') as f:
            self.__ges_dict = json.load(f)
        
        # Syntactic feature extractor (character n-grams + word n-grams).
        extractor = GESSyntacticFeatureExtractor(
                        n_chars=n_chars, 
                        n_words=n_words, 
                        special_words=special_words
                    )
        self.__db = DictDatabase(extractor)
        
        # Cache of previous lookups.
        self.__cache = {}
        
        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}
        
        # Sentinel entry for "no GES problem".
        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1
        
        # For now the ids are simply the order of the problems in the JSON.
        # TODO: decide whether the ids should come from some standard source.
        for i, problem in enumerate(self.__ges_dict):
            
            problem_id = i+1
            
            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id
            
            # Register the problem itself as if it were a disease too.
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)
            
            # Add to the search database.
            self.__db.add(problem)
            
            for disease in self.__ges_dict[problem]:
                
                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)
                
                # Add to the search database.
                self.__db.add(disease)
        
        # TODO: add extra data sources to improve disease/problem matching.

        self.__searcher = Searcher(self.__db, CosineMeasure())
Ejemplo n.º 16
0
def make_change_image_dict(drink_names):
    """Match drink names against TheCocktailDB entries and return a mapping
    from matched drink name to its thumbnail image URL.

    Scores each (TCD name, local name) pair with both difflib sequence
    matching and simstring cosine search, keeps a running-average score per
    pair, logs all decisions to ./search_log.txt, and accepts pairs whose
    best average score is >= 0.76.
    """
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    ff = open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig")
    json_data2 = json.load(ff)
    ff.close()

    # Names to compare against each other, with ASCII punctuation blanked out.
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db ={re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]): d["drinks"][0]["strDrinkThumb"] for d in json_data2}
    TCD_name_db = list(TCD_db.keys())
    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    # FIX: the database never changes below, so build the searcher once
    # instead of constructing a new one on every loop iteration.
    searcher = Searcher(db, CosineMeasure())

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        i = 1.0
        flag = False

        # Pass 1: difflib ratio (0.0 - 1.0); keep pairs scoring above 0.75.
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if (str1 in result_dict[str2]):
                    d = result_dict[str2][str1]
                    # Update the running average: [mean, sample count].
                    d = [(d[0]*d[1]+s)/(d[1]+1), d[1]+1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s ,1])

        # Pass 2: simstring cosine search, lowering the threshold from 1.0
        # to 0.65; each match is scored at the first threshold it appears.
        temp = []
        while i >= 0.65:
            result = searcher.search(str2, i)
            if (len(result)):
                flag = True
                for str1 in result:
                    if (str1 in temp): continue
                    temp += [str1]
                    if (str1 in result_dict[str2]):
                        d = result_dict[str2][str1]
                        # Update the running average: [mean, sample count].
                        d = [(d[0]*d[1]+i)/(d[1]+1), d[1]+1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i ,1])
            i -= 0.001

        if (flag):
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> "+str2, file=f)
            print(">> "+str2)
            # Pick the candidate with the highest average score.
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if (M < value_list[0]):
                    name = key
                    M = value_list[0]
            print("  "+name+": "+str(M), file=f)
            if (M != 0):
                if (M >= 0.76):
                    print("  "+name+": "+str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print("  "+name+": "+str(M))
                    print("out", file=f)
                    print("out")

        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length))

    # FIX: removed a leftover debugging exit() that terminated the process
    # here and made the return statement unreachable.
    return change_image_dict
Ejemplo n.º 17
0
 def search_term_sims(self, term: str) -> List[str]:
     """Return database strings whose cosine similarity to *term* is >= 0.8."""
     cosine_searcher = Searcher(self.db, CosineMeasure())
     return cosine_searcher.search(term, 0.8)
Ejemplo n.º 18
0
# Read in branded foods CSV and clean it
df = pd.read_csv('branded_food.csv')
all_ingredients_final = get_cleaned_ingredients_list(df)
# Get a count for all the ingredients to be used by Peter Norvig Implementation
# (word frequencies drive the probability() spelling-correction ranking).
ingredients_count = Counter(all_ingredients_final)

##############################################
# Peter Norvig SimString Implementation Code #
##############################################

# Populate database with all ingredients, indexed by character bigrams.
db = DictDatabase(CharacterNgramFeatureExtractor(2))
for ingredient in all_ingredients_final:
    db.add(ingredient)
# Create searcher object to be used by candidates function
searcher = Searcher(db, CosineMeasure())

# Functions


def probability(word, N=sum(ingredients_count.values())):
    """Return the relative frequency of *word* in the ingredients corpus.

    Correctly spelled words usually have a higher count — and therefore a
    higher probability — than their misspellings. N is computed once at
    definition time (the Norvig idiom), so it stays fixed across calls.
    """
    return ingredients_count[word] / N


def candidates(word, searcher):
    """ 
  Obtain a list of candidates using our searcher for a given word
    results = medgate_trial_json(lower, upper, clean_terms, raw_terms)

    with open(output_file, 'w+') as f:
        json.dump(results, f)


# Load the spaCy model, downloading it on first use if it is missing.
try:
    nlp = spacy.load('en_core_web_md')
except Exception:  # FIX: bare except also swallowed KeyboardInterrupt/SystemExit
    os.system('python -m spacy download en_core_web_md')
    nlp = spacy.load('en_core_web_md')

# Stopword list and pickled simstring database live next to this script.
stopwords = set(
    open(os.path.join(sys.path[0], 'stopwords.txt')).read().split('\n'))
database = load_pickle(os.path.join(sys.path[0], 'db.pickle'), 'rb')
searcher = Searcher(database, CosineMeasure())

# Input directory of letters (finds all .txt files and ignores rest)
letter_dir = os.path.join(sys.path[0], 'letter_directory/')
letter_type = '.txt'

# Read in letters
letters = get_letters_incl_spacy(letter_dir, letter_type)

# Cosine similarity thresholds used when matching terms.
lower_threshold = 0.95
upper_threshold = 1.00

# Output directory comes from the environment; create it if missing.
output_dir = os.path.join(sys.path[0],os.environ["output_dir"])
os.makedirs(output_dir, exist_ok=True)