def get_high_recall_matches_for_df(post_df):
    heb_db, eng_db, umls_data = get_umls_data()
    heb_searcher = Searcher(heb_db, CosineMeasure())
    eng_searcher = Searcher(eng_db, CosineMeasure())
    all_high_recall_matcher_found = []
    for row_idx, post_row in post_df.iterrows():
        high_recall_matcher_found = get_english_and_hebrew_matches(
            eng_searcher, heb_searcher, post_row, umls_data)
        all_high_recall_matcher_found.append(high_recall_matcher_found)
    return all_high_recall_matcher_found
def main():
    heb_db, eng_db, umls_data = get_umls_data()
    heb_searcher = Searcher(heb_db, CosineMeasure())
    eng_searcher = Searcher(eng_db, CosineMeasure())
    handle_community(SCLEROSIS, heb_searcher, eng_searcher, umls_data)
    handle_community(DIABETES, heb_searcher, eng_searcher, umls_data)
    handle_community(DEPRESSION, heb_searcher, eng_searcher, umls_data)
    print("Done")
class TestCosine(TestCase):
    measure = CosineMeasure()

    def test_min_feature_size(self):
        self.assertEqual(self.measure.min_feature_size(5, 1.0), 5)
        self.assertEqual(self.measure.min_feature_size(5, 0.5), 2)

    def test_max_feature_size(self):
        self.assertEqual(self.measure.max_feature_size(5, 1.0), 5)
        self.assertEqual(self.measure.max_feature_size(5, 0.5), 20)

    def test_minimum_common_feature_count(self):
        self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 5)
        self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 10)
        self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3)

    def test_similarity(self):
        x = [1, 2, 3]
        y = [1, 2, 3, 4]
        self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.87)
        z = [1, 1, 2, 3]
        self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
def _(bm):
    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for i, line in enumerate(lines):
            if i >= SEARCH_COUNT_LIMIT:
                break
            strings = line.rstrip('\r\n')
            result = searcher.search(strings, 0.8)
def use_demo_ontology():
    '''
    Specify the demo ontology to be used for the automated mapping suggestions
    '''
    global simstring_searcher, term_to_cui
    simstring_searcher = Searcher(demo_database, CosineMeasure())
    term_to_cui = demo_mappings
def setUp(self) -> None:
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    db.add("foo")
    db.add("bar")
    db.add("fooo")
    db.add("food")
    db.add("fool")
    db.add("follow")
    self.searcher = Searcher(db, CosineMeasure())
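# A hypothetical companion test (not part of the original snippet) sketching how the
# searcher built in setUp above might be exercised. It only assumes simstring-pure's
# Searcher.search, which returns database strings at or above the given cosine threshold.
def test_search_returns_similar_strings(self):
    results = self.searcher.search("foo", 0.8)
    self.assertIn("foo", results)      # exact match, similarity 1.0
    self.assertNotIn("bar", results)   # shares no character bigrams with "foo"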
def setup_custom_ontology(request):
    '''
    Setup custom ontology for automated mapping suggestions
    '''
    global simstring_searcher, term_to_cui
    ontology_data = request.POST['ontologyData'].split('\n')
    database, term_to_cui = construct_ontology(ontology_data)
    simstring_searcher = Searcher(database, CosineMeasure())
    return HttpResponse(None)
def load(self, db_path, cui_mapping_path):
    logging.info('Loading DB ...')
    with open(db_path, 'rb') as db_f:
        self.db = pickle.load(db_f)
    logging.info('Loading Mapping ...')
    with open(cui_mapping_path, 'rb') as mapping_f:
        self.cui_mapping = pickle.load(mapping_f)
    logging.info('Creating Searcher ...')
    self.searcher = Searcher(self.db, CosineMeasure())
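# A minimal sketch (an assumption, not taken from the original project) of how the pickled
# database and CUI mapping consumed by load() above might be produced; build_db_files and
# the (term, cui) input pairs are illustrative names only.
import pickle

from simstring.database.dict import DictDatabase
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor


def build_db_files(term_cui_pairs, db_path, cui_mapping_path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    cui_mapping = {}
    for term, cui in term_cui_pairs:
        db.add(term)              # index the surface form for approximate lookup
        cui_mapping[term] = cui   # remember which concept it maps to
    with open(db_path, 'wb') as db_f:
        pickle.dump(db, db_f)
    with open(cui_mapping_path, 'wb') as mapping_f:
        pickle.dump(cui_mapping, mapping_f)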
def setup_preloaded_ontology(selected_ontology):
    '''
    Setup user-specified, pre-loaded ontology for automated mapping suggestions
    '''
    global simstring_searcher, term_to_cui
    if selected_ontology == 'umls':
        simstring_searcher = Searcher(umls_database, CosineMeasure())
        term_to_cui = umls_mappings
    return HttpResponse(None)
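# Hypothetical sketch (not part of the original views) of how the module-level
# simstring_searcher and term_to_cui set up above might be consumed to produce mapping
# suggestions; suggest_cuis, the None guard, and the 0.7 threshold are all assumptions.
def suggest_cuis(text, threshold=0.7):
    if simstring_searcher is None:
        return []
    # ranked_search yields (score, term) pairs, best matches first
    return [(score, term, term_to_cui[term])
            for score, term in simstring_searcher.ranked_search(text, threshold)]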
def output_similar_strings_of_each_line(path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            db.add(strings)

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            result = [
                str(round(x[0], 5)) + ' ' + x[1]
                for x in searcher.ranked_search(strings, 0.8)
            ]
            print("\t".join([strings, ",".join(result)]))
def similar_words_top_k(self, query, measure=CosineMeasure(), initial_threshold=0.99, dec_step=0.01, k=3):
    """Search for similar words using character n-gram cosine similarity,
    lowering the threshold until at least k candidates are found."""
    searcher = Searcher(self.db, measure)
    t = initial_threshold
    similar_words = []
    while True:
        similar_words = searcher.search(query, t)
        if len(similar_words) >= k or t <= 0.1:
            break
        t -= dec_step
    if len(similar_words) > k:
        np.random.seed(42)  # fixed seed so the sampled top-k is reproducible
        return np.random.choice(similar_words, k, replace=False).tolist()
    else:
        return similar_words
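# A self-contained sketch (an assumption, not from the original code) of the same
# threshold-lowering idea without the surrounding class: start strict and relax the
# cosine threshold until at least k candidates are found; the toy vocabulary and the
# query "appel" are illustrative only.
from simstring.database.dict import DictDatabase
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

db = DictDatabase(CharacterNgramFeatureExtractor(2))
for word in ["apple", "apply", "ample", "maple", "grape"]:
    db.add(word)
searcher = Searcher(db, CosineMeasure())

t, k = 0.99, 3
candidates = []
while True:
    candidates = searcher.search("appel", t)
    if len(candidates) >= k or t <= 0.1:
        break
    t -= 0.01
print(candidates)  # matches found once the threshold is low enough to yield k hits (or the 0.1 floor is reached)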
class TestCosine(TestCase): measure = CosineMeasure() def test_min_feature_size(self): self.assertEqual(self.measure.min_feature_size(5, 1.0), 5) self.assertEqual(self.measure.min_feature_size(5, 0.5), 2) def test_max_feature_size(self): self.assertEqual(self.measure.max_feature_size(5, 1.0), 5) self.assertEqual(self.measure.max_feature_size(5, 0.5), 20) def test_minimum_common_feature_count(self): self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 1.0), 5) self.assertEqual(self.measure.minimum_common_feature_count(5, 20, 1.0), 10) self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3) def test_similarity(self): x = ["a", "ab", "bc", "c"] y = ["a", "ab", "bc", "cd", "e"] self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0) self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67) z = ["a", "ab", "ba", "ab", "a"] self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0) self.assertEqual(round(self.measure.similarity(x, z), 2), 0.58) self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67) # Test as per paper trigrams with quotes of methyl sulphone and methyl sulfone a = [ ' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su', 'sul', 'ulf', 'lfo', 'fon', 'one', 'ne"', 'e" ' ] b = [ ' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su', 'sul', 'ulp', 'lph', 'pho', 'hon', 'one', 'ne"', 'e" ' ] self.assertEqual( round(self.measure.similarity(a, b), 3), 0.788) #BUG? Disagrees with paper that claims should be 0.788
def similarity(word):
    searcher = Searcher(db, CosineMeasure())
    return np.array(searcher.search(normalize('NFKC', word), 0.65))
def setUp(self):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        db.add(string)
    self.searcher = Searcher(db, CosineMeasure())
def __init__(
    self,
    base_ges_data='ges_utils/data/ges-health-problems.json',
    no_ges_str='UNK',
    alpha=0.2,
    n_chars=4,
    n_words=[2],
    special_words=['vih']
):
    self.alpha = alpha
    with open(base_ges_data, 'r', encoding='utf-8') as f:
        self.__ges_dict = json.load(f)

    # feature extractor
    extractor = GESSyntacticFeatureExtractor(
        n_chars=n_chars,
        n_words=n_words,
        special_words=special_words
    )
    self.__db = DictDatabase(extractor)

    # cache
    self.__cache = {}

    self.__problems_from_disease = defaultdict(set)
    self.__ids_from_disease = defaultdict(set)
    self.__problems = {}
    self.__ids = {}
    self.__problems[-1] = no_ges_str
    self.__ids[no_ges_str] = -1

    # For now the ids are simply the order of the problems in the JSON file
    # TODO: decide whether the ids should come from some standard source
    for i, problem in enumerate(self.__ges_dict):
        problem_id = i + 1
        self.__problems[problem_id] = problem
        self.__ids[problem] = problem_id
        # register each problem as if it were a disease as well
        self.__problems_from_disease[problem].add(problem)
        self.__ids_from_disease[problem].add(problem_id)
        # add to the database
        self.__db.add(problem)
        for disease in self.__ges_dict[problem]:
            self.__problems_from_disease[disease].add(problem)
            self.__ids_from_disease[disease].add(problem_id)
            # add to the database
            self.__db.add(disease)

    # TODO: add extra data for matching diseases and problems
    self.__searcher = Searcher(self.__db, CosineMeasure())
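# Hypothetical companion method (an assumption, not part of the original class) sketching
# how the searcher and lookup tables built in __init__ above might be combined to map a
# free-text disease mention to GES problem ids. It is meant to live inside the same class
# (the double-underscore attributes rely on that); ges_problem_ids and the 0.8 threshold
# are illustrative names only.
def ges_problem_ids(self, mention, threshold=0.8):
    if mention in self.__cache:
        return self.__cache[mention]
    ids = set()
    for match in self.__searcher.search(mention, threshold):
        ids |= self.__ids_from_disease[match]
    self.__cache[mention] = ids
    return ids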
def make_change_image_dict(drink_names):
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    ff = open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig")
    json_data2 = json.load(ff)
    ff.close()

    # Lists of strings whose similarity will be compared against each other
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]): d["drinks"][0]["strDrinkThumb"]
              for d in json_data2}
    TCD_name_db = list(TCD_db.keys())

    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        searcher = Searcher(db, CosineMeasure())
        i = 1.0  # similarity is computed and returned in the range 0.0-1.0
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # update the running average
                    d = [(d[0] * d[1] + s) / (d[1] + 1), d[1] + 1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])
        temp = []
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # update the running average
                        d = [(d[0] * d[1] + i) / (d[1] + 1), d[1] + 1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> " + str2, file=f)
            print(">> " + str2)
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if M < value_list[0]:
                    name = key
                    M = value_list[0]
            print("    " + name + ": " + str(M), file=f)
            if M != 0:
                if M >= 0.76:
                    print("    " + name + ": " + str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print("    " + name + ": " + str(M))
                    print("out", file=f)
                    print("out")
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length))
    # NOTE: the original code called exit() here, which made the return below unreachable
    return change_image_dict
def search_term_sims(self, term: str) -> List[str]:
    searcher = Searcher(self.db, CosineMeasure())
    return searcher.search(term, 0.8)
# Read in branded foods CSV and clean it
df = pd.read_csv('branded_food.csv')
all_ingredients_final = get_cleaned_ingredients_list(df)

# Get a count for all the ingredients to be used by Peter Norvig Implementation
ingredients_count = Counter(all_ingredients_final)

##############################################
# Peter Norvig SimString Implementation Code #
##############################################

# Populate database with all ingredients
db = DictDatabase(CharacterNgramFeatureExtractor(2))
for ingredient in all_ingredients_final:
    db.add(ingredient)

# Create searcher object to be used by candidates function
searcher = Searcher(db, CosineMeasure())

# Functions
def probability(word, N=sum(ingredients_count.values())):
    """
    Returns the probability of the word appearing in the text
    Usually, correctly spelled words will have a higher count and therefore probability than their misspellings
    """
    return ingredients_count[word] / N

def candidates(word, searcher):
    """
    Obtain a list of candidates using our searcher for a given word
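# The snippet above is cut off mid-docstring. A hedged sketch (an assumption, not the
# original implementation) of how candidates() might continue, following Norvig's
# candidates/correction pattern with the simstring searcher as the candidate generator;
# correction() and the 0.8 threshold are illustrative.
def candidates(word, searcher):
    """Obtain a list of candidates using our searcher for a given word."""
    matches = searcher.search(word, 0.8)
    return matches or [word]  # fall back to the word itself if nothing matches

def correction(word, searcher):
    """Pick the candidate with the highest corpus probability."""
    return max(candidates(word, searcher), key=probability)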
results = medgate_trial_json(lower, upper, clean_terms, raw_terms)
with open(output_file, 'w+') as f:
    json.dump(results, f)

try:
    nlp = spacy.load('en_core_web_md')
except:
    os.system('python -m spacy download en_core_web_md')
    nlp = spacy.load('en_core_web_md')

stopwords = set(
    open(os.path.join(sys.path[0], 'stopwords.txt')).read().split('\n'))

database = load_pickle(os.path.join(sys.path[0], 'db.pickle'), 'rb')
searcher = Searcher(database, CosineMeasure())

# Input directory of letters (finds all .txt files and ignores rest)
letter_dir = os.path.join(sys.path[0], 'letter_directory/')
letter_type = '.txt'

# Read in letters
letters = get_letters_incl_spacy(letter_dir, letter_type)

# Cosine thresholds
lower_threshold = 0.95
upper_threshold = 1.00

# Output directory
output_dir = os.path.join(sys.path[0], os.environ["output_dir"])
os.makedirs(output_dir, exist_ok=True)