def now(config):
    """Return the InLoco cities sheet joined with place identifiers.

    State names and state+city keys from the downloaded sheet are replaced by
    their closest fuzzy match among the known places, then the place-id
    columns are merged in with a left join on the matched key.

    Args:
        config: Configuration mapping. ``config["br"]["inloco"]["replace"]``
            maps an old city name to a dict with ``state_name`` and
            ``correct_name``.

    Returns:
        The cities DataFrame with a ``state_city_match`` score column plus the
        state, health-region and city columns taken from the places table.
    """

    def _get_city_match(name):
        # Explicitly renamed cities bypass fuzzy matching and score 1.
        if name in replaces:
            return (1, replaces[name])
        best = cities.get(name)[0]
        return (best[0], best[1])

    # Get places ids
    df_places_id = get_places_id.now(config)
    df = get_googledrive_df(os.getenv("INLOCO_CITIES_ID"))
    time.sleep(2)

    # Get states closest matches
    states = fuzzyset.FuzzySet()
    for state_name in df_places_id["state_name"].unique():
        states.add(state_name)
    df["state_name"] = df["state_name"].apply(lambda x: states.get(x)[0][1])

    # Get cities closest matches by state+city name
    cities = fuzzyset.FuzzySet()
    df_places_id["state_city"] = (
        df_places_id["state_name"] + df_places_id["city_name"]
    )
    for state_city in df_places_id["state_city"].drop_duplicates():
        cities.add(state_city)

    # Cities with changed names
    replaces = {}
    for name, v in config["br"]["inloco"]["replace"].items():
        replaces[v["state_name"] + name] = v["state_name"] + v["correct_name"]

    df["state_city"] = df["state_name"] + df["city_name"]
    matched_pairs = df["state_city"].apply(_get_city_match)
    df["state_city_match"], df["state_city"] = zip(*matched_pairs)

    # Merge to get places ids
    del df["state_name"]
    del df["city_name"]
    place_columns = [
        "state_city",
        "state_num_id",
        "state_name",
        "health_region_name",
        "health_region_id",
        "city_name",
        "city_id",
    ]
    df = df.merge(
        df_places_id[place_columns].drop_duplicates(),
        on=["state_city"],
        how="left",
    )
    del df["state_city"]
    return df
def __init__(self):
    """Initialise the provider with empty state and default settings."""
    super(ResultsProvider, self).__init__()

    # Aggregation state
    self.cumulative = {}
    self.track_percentiles = [0.0, 50.0, 90.0, 95.0, 99.0, 99.9, 100.0]
    self.listeners = []

    # Buffer sizing
    self.buffer_len = 2
    self.min_buffer_len = 2
    self.max_buffer_len = float('inf')
    self.buffer_multiplier = 2
    self.buffer_scale_idx = None
    self.histogram_max = 1.0

    # Fuzzy sets of already-seen error messages and labels
    self.known_errors = fuzzyset.FuzzySet(use_levenshtein=True)
    self.max_error_count = 100
    self.known_labels = fuzzyset.FuzzySet(use_levenshtein=True)
    self.generalize_labels = 100
def __init__(self, people, text_detection, ocr, pixel_threshold=0.5, link_threshold=0.5):
    """Build the name lookup tables and start the worker thread.

    Args:
        people: Iterable of full names, with tokens separated by single spaces.
        text_detection: Stored as ``self.text_detection``.
        ocr: Stored as ``self.ocr``.
        pixel_threshold: Stored as ``self.pixel_threshold``.
        link_threshold: Stored as ``self.link_threshold``.
    """
    self.prev_badge = ''
    self.prev_time = 0
    self.pixel_threshold = pixel_threshold
    self.link_threshold = link_threshold
    self.text_detection = text_detection
    self.ocr = ocr

    charset, _ = read_charset()
    self.chrset_index = charset

    self.names_db = fuzzyset.FuzzySet()
    self.data_db = {}
    for person in people:
        tokens = person.split(' ')

        # Individual name parts longer than one character are matchable.
        for token in tokens:
            if len(token) > 1:
                self.names_db.add(token)

        # Every ordering of the parts, with and without spaces, maps back to
        # the original full name.
        for ordering in permutations(tokens):
            spaced = ' '.join(ordering)
            fused = ''.join(ordering)
            self.names_db.add(spaced)
            self.names_db.add(fused)
            self.data_db[spaced] = person
            self.data_db[fused] = person

    self.queue = queue.Queue(maxsize=1)
    self.worker = threading.Thread(target=self.run)
    self.worker.start()
def build_structures():
    """Build the benchmark structures, each loaded with every city name.

    Returns:
        A tuple of ``(key, label, structure)`` triples: four fuzzy sets
        (Python and C implementations, each with and without Levenshtein
        scoring) followed by a plain dict used as the reference.
    """
    opts = (
        ('a', 'FuzzySet', fuzzyset.FuzzySet()),
        # The "(no leven)" variants must actually disable Levenshtein scoring,
        # otherwise they are identical to the default entries.
        ('b', 'FuzzySet (no leven)', fuzzyset.FuzzySet(use_levenshtein=False)),
        ('c', 'cFuzzySet', cfuzzyset.cFuzzySet()),
        ('d', 'cFuzzySet (no leven)', cfuzzyset.cFuzzySet(use_levenshtein=False)),
    )
    ref = {}
    # Context manager so the gzip handle is closed even if loading fails.
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        for line in input_file:
            line = line.rstrip()
            for _, _, structure in opts:
                structure.add(line)
            ref[line] = line
    return opts + (('ref', 'reference (dict)', ref), )
def find_food(food):
    """Find the USDA food whose leading description best matches ``food``.

    Rows whose ``long_desc`` contains ``food`` are fetched from ``usda.sql3``.
    For each row, the lower-cased text before the first comma is scored
    against ``food``; only rows where that text starts with ``food`` can win.

    Args:
        food: Search term.

    Returns:
        ``(best_id, best_food)``; ``(-1, '')`` when nothing qualifies. The
        result is also printed.
    """
    search_clause = '%' + food + '%'
    strmatch = fuzzyset.FuzzySet()
    strmatch.add(food)

    best_score = -1
    best_food = ''
    best_id = -1

    conn = sqlite3.connect('usda.sql3')
    try:
        c = conn.cursor()
        c.execute('SELECT id, long_desc FROM food WHERE long_desc LIKE ?',
                  (search_clause, ))
        for food_id, long_desc in c:
            food_name = long_desc.lower().split(',')[0]
            # Look up once and reuse the result.
            matches = strmatch.get(food_name)
            if matches is None:
                continue
            score = matches[0][0]
            if score > best_score and food_name.startswith(food):
                best_score = score
                best_food = long_desc
                best_id = food_id
    finally:
        # The connection was previously leaked on every call.
        conn.close()

    print(str(best_id) + " " + best_food)
    return (best_id, best_food)
def sm_fuzzy_match(street, ed):
    """Fuzzy-match a street name against the Steve Morse streets of its ED.

    Relies on ``sm_ed_st_dict``, ``sm_all_streets_fuzzyset`` and
    ``problem_EDs`` from the surrounding scope.

    Args:
        street: Street name to match.
        ed: Enumeration district; non-string values are converted with
            ``str(int(ed))``, strings are used unchanged.

    Returns:
        ``[match, score, True]`` when the best match within the ED equals the
        best match among all streets and scores at least 0.5, otherwise
        ``['', '', False]``.
    """
    nomatch = ['', '', False]

    # The original `ed != str` compared the value with the type itself, so it
    # was always true and string EDs such as '12A' crashed in int().
    if not isinstance(ed, str):
        ed = str(int(ed))

    # Return null if street is blank
    if street == '':
        return nomatch

    # Microdata ED may not be in Steve Morse; if so, record it as a problem ED
    try:
        sm_ed_streets = sm_ed_st_dict[ed]
        sm_ed_streets_fuzzyset = fuzzyset.FuzzySet(sm_ed_streets)
    except Exception:
        problem_EDs.append(ed)
        return nomatch

    # Step 1: Find best match among streets associated with microdata ED
    try:
        best_match_ed = sm_ed_streets_fuzzyset[street][0]
    except Exception:
        return nomatch

    # Step 2: Find best match among all streets
    try:
        best_match_all = sm_all_streets_fuzzyset[street][0]
    except Exception:
        return nomatch

    # Step 3: If both best matches are the same, return as best match
    if best_match_ed[1] == best_match_all[1] and best_match_ed[0] >= 0.5:
        return [best_match_ed[1], best_match_ed[0], True]
    return nomatch
def __init__(self, documents):
    """Fit a tf-idf model on ``documents`` and index its vocabulary.

    Args:
        documents: Sequence of documents. When it is empty (falsy) only
            ``self.documents`` is set; ``tfidf``, ``tfs`` and ``fuzzyset``
            are not created.
    """
    self.documents = np.array(documents)
    if documents:
        self.tfidf = TfidfVectorizer(tokenizer=self.tokenize_document,
                                     stop_words=stop_word_set)
        self.tfs = self.tfidf.fit_transform(self.documents)
        # get_feature_names() was removed in scikit-learn 1.2. Prefer its
        # replacement and fall back only on releases that predate it.
        feature_names = getattr(self.tfidf, 'get_feature_names_out', None)
        if feature_names is None:
            feature_names = self.tfidf.get_feature_names
        self.fuzzyset = fuzzyset.FuzzySet(feature_names(),
                                          use_levenshtein=False)
def compare(guess, answer):
    """Tell whether ``guess`` is fuzzily close to ``answer``.

    Returns:
        ``True`` when the mean score of the matches returned for ``guess``
        exceeds 0.3; ``None`` otherwise, including when there is no match.
    """
    matcher = fuzzyset.FuzzySet()
    matcher.add(answer)
    matches = matcher.get(guess)
    if not matches:
        return None
    mean_score = sum(match[0] for match in matches) / len(matches)
    return True if mean_score > 0.3 else None
def getid(msgs):
    """Return the ID of the ``data.json`` entry whose name best matches ``msgs``.

    Args:
        msgs: Text to match; converted with ``str()``.

    Returns:
        The best entry's ``ID`` as a string, or ``'0'`` when no entry's
        ``Name`` produces a fuzzy match.
    """
    with open('data.json') as f:
        data = json.load(f)

    matcher = fuzzyset.FuzzySet()
    matcher.add(str(msgs))

    # The original seeded the comparison with sys.maxint (absent on Python 3)
    # and compared the whole match list against it. Track the numeric score.
    best_score = None
    best_id = 0
    for entry in data:
        matches = matcher.get(str(entry['Name']))
        if not matches:
            continue
        score = matches[0][0]
        if best_score is None or score > best_score:
            best_score = score
            best_id = entry['ID']
    return str(best_id)
def transformed_mean_score(transformed_set, target_set):
    """Average the fuzzy similarity between matching rows of two frames.

    Row by row, the first column of ``transformed_set`` is scored against the
    first column of ``target_set``. Rows with no match add nothing to the
    total but still count in the denominator.

    Returns:
        Sum of the best scores divided by the row count of ``transformed_set``.
    """
    total = 0
    for index, row in transformed_set.iterrows():
        matcher = fuzzyset.FuzzySet()
        matcher.add(row[0])
        result = matcher.get(target_set.iloc[index, 0])
        if result is not None:
            total += result[0][0]
    return total / transformed_set.shape[0]
def fuzzy_match_function(street, ed, ed_st_dict, all_streets_fuzzyset, check_too_similar=False):
    """Fuzzy-match a street within its ED and against all streets.

    Args:
        street: Street name to match.
        ed: Enumeration district; converted with ``str()`` before lookup.
        ed_st_dict: Mapping from ED string to the streets of that ED.
        all_streets_fuzzyset: Fuzzy set built from every known street.
        check_too_similar: When true, reject an agreed match if any street in
            the ED differs from it by exactly one character.

    Returns:
        ``[match, score, True]`` when both searches agree and the score is at
        least 0.5; ``[match, score, False]`` with the higher-scoring candidate
        when they disagree; ``['', '', False]`` when no match is available.
    """
    nomatch = ['', '', False]
    ed = str(ed)

    # Return null if street is blank
    if street == '':
        return nomatch

    # Microdata ED may not be in Steve Morse
    try:
        ed_streets = ed_st_dict[ed]
        ed_streets_fuzzyset = fuzzyset.FuzzySet(ed_streets)
    except Exception:
        return nomatch

    # Step 1: Find best match among streets associated with microdata ED
    try:
        best_match_ed = ed_streets_fuzzyset[street][0]
    except Exception:
        return nomatch

    # Step 2: Find best match among all streets
    try:
        best_match_all = all_streets_fuzzyset[street][0]
    except Exception:
        return nomatch

    # Step 3: If both best matches are the same, return as best match
    if best_match_ed[1] == best_match_all[1] and best_match_ed[0] >= 0.5:
        if check_too_similar:
            # The original iterated the undefined name `sm_ed_streets`; the
            # streets of this ED are in `ed_streets`.
            too_similar = sum(
                diff_by_one_char(st, best_match_ed[1]) for st in ed_streets)
            if too_similar != 0:
                return nomatch
        return [best_match_ed[1], best_match_ed[0], True]

    # Step 4: Otherwise return the higher score (to help manual cleaning)
    if best_match_all[0] < best_match_ed[0]:
        return [best_match_ed[1], best_match_ed[0], False]
    return [best_match_all[1], best_match_all[0], False]
def getDayTimeAlt(AT, show):
    """Look up a show's broadcast slot on MAL and convert it from JST to IST.

    The watchlist title with the highest fuzzy score against ``show`` is
    accepted when that score is at least 0.6.

    Args:
        AT: MAL access token.
        show: Show name to look for.

    Returns:
        ``(day, time, alt_names)``: ``day`` is the ``dayMapping`` value shifted
        to IST, ``time`` is the IST start time as an ``HHMM`` integer, and
        ``alt_names`` is the list of synonyms plus the MAL title. Returns
        ``(None, None, None)`` when the show is not found.
    """
    # Get broadcast time of show from MAL
    mal_watchlist = mal.User.getAnimeList(
        AT, "watching", ['broadcast', 'alternative_titles'])[0]["data"]

    fset = fuzzyset.FuzzySet()
    fset.add(show)

    max_prob, max_prob_idx = 0, 0
    for idx, item in enumerate(mal_watchlist):
        result = fset.get(item['node']['title'])
        current_prob = result[0][0] if result is not None else 0
        if current_prob > max_prob:
            max_prob = current_prob
            max_prob_idx = idx

    if max_prob < 0.6:
        print("\033[91m[-] Anime not found in watchlist! Ignoring.\033[0m")
        return None, None, None

    node = mal_watchlist[max_prob_idx]['node']
    day = dayMapping[node['broadcast']['day_of_the_week']]
    jst = int(node['broadcast']['start_time'].replace(":", ""))
    jst_minutes = (jst // 100) * 60 + jst % 100

    # IST is 3h30m behind JST. Only slots strictly before 03:30 JST fall on
    # the previous day; 03:30 JST is 00:00 IST the same day, so the original
    # `<= 330` was off by one.
    offset = 3 * 60 + 30
    if jst_minutes < offset:
        day = (day + 6) % 7
    hr, mi = divmod((jst_minutes - offset) % (24 * 60), 60)
    ist = hr * 100 + mi

    # Get list of alternative titles
    alt_names = node['alternative_titles']['synonyms']
    alt_names.append(node['title'])
    return day, ist, alt_names
def predicttypesusingcolumnnames():
    """Predict a semantic type for each dataset column from its name.

    The second dot-separated field of every entry in the module-level ``data``
    is run through ``process`` and matched against a fixed list of type
    labels. The lower-cased best label is appended to the module-level
    ``predictions``.
    """
    types = [row.split('.')[1] for row in data]
    list_to_match = [
        "Person name", "Last name", "First name", "Middle name", "Full name",
        "Business name", "Phone Number", "Address", "Street name", "City",
        "Neighborhood", "Latitude Longitude", "Zip", "Borough", "School name",
        "Vehicle Color", "Vehicle Car make", "City agency", "Areas of study",
        "Subjects", "School Levels", "College/University names", "Websites",
        "Building Classification", "Vehicle Type", "Type of location", "dba"
    ]

    fz = fuzzyset.FuzzySet()
    for label in list_to_match:
        fz.add(label.lower())

    for column_name in types:
        best = fz.get(process(column_name))
        predictions.append(best[0][1].lower())
def match_name_list(results_dedupe, df):
    """Match name list per issue to LOC name list.

    Args:
        results_dedupe: Iterable of names to look up.
        df: DataFrame with ``name``, ``name heading`` and ``URI`` columns.

    Returns:
        ``(name_list_all, name_list_highmatch)``. Both hold
        ``[name, name heading, URI, score]`` rows. ``name_list_all`` has one
        row per input name, with blanks when nothing matched;
        ``name_list_highmatch`` keeps only matches scoring above 0.85.
    """
    import re  # local import: used only to escape the matched name below

    name_list_all = []
    name_list_highmatch = []

    # Create the set of terms we would like to match against in a fuzzy way
    fz = fuzzyset.FuzzySet()
    for term in df['name'].tolist():
        fz.add(term)

    for name in results_dedupe:
        # matches is a list of tuples (prob, name)
        matches = fz.get(name)
        max_match = max(matches, key=lambda x: x[0]) if matches else None
        if not max_match:
            name_list_all.append([name, '', '', ''])
            continue

        # str.match treats its argument as a regex. Escape the matched name so
        # characters such as "(", "." or "?" are compared literally instead
        # of mis-matching or raising re.error.
        match_df = df[df['name'].str.match(re.escape(max_match[1]))]
        if len(match_df) < 1:
            name_list_all.append([name, '', '', ''])
            continue

        match_df = return_likely_year_match_df(match_df)
        best_row = match_df.iloc[0]
        entry = [name, best_row['name heading'], best_row['URI'], max_match[0]]
        name_list_all.append(entry)
        if max_match[0] > 0.85:
            # Separate list object, as before, so the two results stay independent.
            name_list_highmatch.append(list(entry))

    return name_list_all, name_list_highmatch
def setCorrectWatchlist(season):
    """Rewrite the configured watchlist using this season's show titles.

    Each watchlist entry whose best fuzzy match among ``season`` scores at
    least 0.48 is re-keyed under the matched season title. Other entries are
    reported and dropped. The updated config is written to
    ``data/config.json``.

    Args:
        season: Iterable of show titles airing this season.
    """
    config = readConfig()
    watchlist = config['watchlist']

    # Add shows from current season to fuzzyset list
    fset = fuzzyset.FuzzySet()
    for show in season:
        fset.add(show)

    # Generating correct watchlist
    fset_watchlist = {}
    for show, value in watchlist.items():
        # get() returns None when nothing is similar; treat that as "not
        # airing" instead of crashing on None[0]. Look up only once.
        match = fset.get(show)
        if match and match[0][0] >= 0.48:
            fset_watchlist[match[0][1]] = value
        else:
            print(
                "\033[91m[-] {} does not seem to be airing this season! Ignoring..\033[0m"
                .format(show))

    config['watchlist'] = fset_watchlist
    with open('data/config.json', 'w') as f:
        json.dump(config, f, indent=4)
def fuzzmatch(source_file, target_file):
    """Pair up rows of two frames whose first columns are fuzzily similar.

    Every source row is compared with every target row on the first column of
    each frame. For each pair scoring above 0.125, the source row is added to
    the first result and the target row to the second, so a row can appear
    more than once.

    Args:
        source_file: DataFrame; its first column is the source key.
        target_file: DataFrame; its first column is the target key.

    Returns:
        ``(source_new_df, target_new_df)`` with the original columns and the
        matched rows in pairing order.
    """
    source_key = source_file.columns[0]
    target_key = target_file.columns[0]
    fuzzy_threshold = 0.125

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call; collect the rows and build each frame once.
    source_rows = []
    target_rows = []
    for _, row_source in source_file.iterrows():
        fuzzymatch = fuzzyset.FuzzySet()
        fuzzymatch.add(row_source[source_key])
        for _, row_target in target_file.iterrows():
            fuzzyval = fuzzymatch.get(row_target[target_key])
            if fuzzyval is None:
                continue
            if fuzzyval[0][0] > fuzzy_threshold:
                source_rows.append(row_source)
                target_rows.append(row_target)

    source_new_df = pd.DataFrame(source_rows, columns=list(source_file.columns))
    target_new_df = pd.DataFrame(target_rows, columns=list(target_file.columns))
    return source_new_df, target_new_df
def get_term_vectors_for_articles_fuzzy(self, tokens, sim_threshold=0.8, gram_size=6, max_len_diff=5, use_levenshtein=True, composite_size=2):
    """Fuzzy-match token n-grams to article names and fetch their term vectors.

    The article fuzzy set is created on first use, either unpickled from
    ``<db_path>_article_fs.pickle`` or built from the ``articles`` table and
    then pickled. Composite tokens of ``composite_size`` down to 1 words are
    matched against it.

    Args:
        tokens: Sequence of word tokens.
        sim_threshold: Base similarity threshold; up to 0.15 is added
            depending on the matched word's length (capped at 15).
        gram_size: Gram size used when building the fuzzy set.
        max_len_diff: Largest allowed length difference between a token and
            its matched article name.
        use_levenshtein: Assigned to the fuzzy set's ``use_levenshtein``.
        composite_size: Largest number of consecutive tokens joined together.

    Returns:
        ``(vectors, token_article_mapping)``: ``vectors[article][term]`` is
        the tf-idf value, and the mapping goes from each accepted composite
        token to its matched article name.
    """
    cursor = self.conn.cursor()

    if self.fuzzyset is None:
        fs_path = pl.Path(self.db_path + "_article_fs.pickle")
        if self.load_cached_fs and fs_path.exists():
            print("Loading Fuzzy Set from disk")
            # NOTE(review): pickle.load runs arbitrary code from the file;
            # only safe if the cache file is trusted.
            with fs_path.open("rb") as fs_file:
                self.fuzzyset = pickle.load(fs_file)
        else:
            print("Creating Fuzzy Set")
            all_articles_query = """SELECT articles.article FROM articles """
            cursor.execute(all_articles_query)
            article_set = fs.FuzzySet(gram_size_lower=gram_size,
                                      gram_size_upper=gram_size,
                                      use_levenshtein=False)
            for article, in cursor:
                article_set.add(article)
            self.fuzzyset = article_set
            with fs_path.open("wb") as fs_file:
                pickle.dump(article_set, fs_file)
            print("Finished creating Fuzzy Set")

    self.fuzzyset.use_levenshtein = use_levenshtein

    vectors = defaultdict(dict)
    token_article_mapping = {}
    for size in range(composite_size, 0, -1):
        composites = [
            " ".join(tokens[start:start + size])
            for start in range(0, (len(tokens) + 1 - size))
        ]

        matched_articles = []
        for composite in composites:
            match = self.fuzzyset.get(composite)
            if not match:
                continue
            sim, word = match[0]
            len_dif = abs(len(word) - len(composite))
            word_len = min(len(word), 15)
            length_adjusted_threshold = sim_threshold + 0.15 * (word_len / 15)
            if sim >= length_adjusted_threshold and len_dif <= max_len_diff:
                token_article_mapping[composite] = word
                matched_articles.append(word)

        for token_batch in batch(matched_articles, self.batchsize):
            param_placeholders = ", ".join(["?"] * len(token_batch))
            statement = terms_for_articles_statement.format(param_placeholders)
            cursor.execute(statement, token_batch)
            for term, article, tf_idf in cursor:
                vectors[article][term] = tf_idf

    return vectors, token_article_mapping
'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", " ", '_' ] names_db = fuzzyset.FuzzySet() data_db = {} def read_charset(): charset = {} inv_charset = {} for i, v in enumerate(ENGLISH_CHAR_MAP): charset[i] = v inv_charset[v] = i return charset, inv_charset chrset_index = {}
def find_fuzzy_matches(df, city, street, all_streets, ed_st_dict, source):
    """Add fuzzy street matches to ``df`` for records lacking an exact match.

    Args:
        df: DataFrame with ``current_match``, ``current_match_bool``, ``ed``
            and the ``street`` column.
        city: Unused here; kept for interface compatibility.
        street: Name of the column holding the street to match.
        all_streets: Iterable of every candidate street name.
        ed_st_dict: Mapping from ED string to the streets of that ED.
        source: Suffix for the new ``fuzzy_match_*`` columns and the report.

    Returns:
        ``(df, fuzzy_info)``. ``df`` gains the ``fuzzy_match_<source>``,
        ``fuzzy_match_score_<source>`` and ``fuzzy_match_bool_<source>``
        columns and updated ``current_match`` columns; ``fuzzy_info`` is
        ``[num_fuzzy_matches]``.
    """

    # Fuzzy matching algorithm
    def fuzzy_match_function(street, ed, ed_st_dict, all_streets_fuzzyset, check_too_similar=False):
        nomatch = ['', '', False]
        ed = str(ed)
        # Return null if street is blank
        if street == '':
            return nomatch
        # Microdata ED may not be in Steve Morse
        try:
            ed_streets = ed_st_dict[ed]
            ed_streets_fuzzyset = fuzzyset.FuzzySet(ed_streets)
        except Exception:
            return nomatch
        # Step 1: Find best match among streets associated with microdata ED
        try:
            best_match_ed = ed_streets_fuzzyset[street][0]
        except Exception:
            return nomatch
        # Step 2: Find best match among all streets
        try:
            best_match_all = all_streets_fuzzyset[street][0]
        except Exception:
            return nomatch
        # Step 3: If both best matches are the same, return as best match
        if best_match_ed[1] == best_match_all[1] and best_match_ed[0] >= 0.5:
            if check_too_similar:
                # The original iterated the undefined name `sm_ed_streets`.
                too_similar = sum(
                    diff_by_one_char(st, best_match_ed[1]) for st in ed_streets)
                if too_similar != 0:
                    return nomatch
            return [best_match_ed[1], best_match_ed[0], True]
        # Step 4: Otherwise return the higher score (to help manual cleaning)
        if best_match_all[0] < best_match_ed[0]:
            return [best_match_ed[1], best_match_ed[0], False]
        return [best_match_all[1], best_match_all[0], False]

    # Helper function (dictionary is built only for cases without validated
    # exact matches)
    def get_fuzzy_match(exact_match, fuzzy_match_dict, street, ed):
        # "Unnamed" streets must never be fuzzy matched
        if exact_match or 'Unnamed' in street:
            return ['', '', False]
        return fuzzy_match_dict[street, ed]

    # Set var names
    fuzzy_match = 'fuzzy_match_' + source
    fuzzy_bool = 'fuzzy_match_bool_' + source
    fuzzy_score = 'fuzzy_match_score_' + source

    # Create all street fuzzyset only once
    all_streets_fuzzyset = fuzzyset.FuzzySet(all_streets)

    # Create dictionary based on Street-ED pairs for faster lookup
    df_no_exact_match = df[~df['current_match_bool']]
    df_grouped = df_no_exact_match.groupby([street, 'ed'])
    fuzzy_match_dict = {}
    for st_ed, _ in df_grouped:
        fuzzy_match_dict[st_ed] = fuzzy_match_function(
            st_ed[0], st_ed[1], ed_st_dict, all_streets_fuzzyset)

    # Compute current number of residuals (before current_match is updated)
    num_records = len(df)
    num_current_residual_cases = num_records - len(df[df['current_match_bool']])

    # Get fuzzy matches
    df[fuzzy_match], df[fuzzy_score], df[fuzzy_bool] = zip(*df.apply(
        lambda x: get_fuzzy_match(x['current_match_bool'], fuzzy_match_dict,
                                  x[street], x['ed']), axis=1))

    # Update current match
    df['current_match'], df['current_match_bool'] = zip(*df.apply(
        lambda x: update_current_match(x['current_match'],
                                       x['current_match_bool'],
                                       x[fuzzy_match], x[fuzzy_bool],
                                       x[street]), axis=1))

    # Generate dashboard information
    num_fuzzy_matches = np.sum(df[fuzzy_bool])
    fuzzy_info = [num_fuzzy_matches]
    # With no unmatched cases the percentage used to divide by zero.
    if num_current_residual_cases:
        pct = round(100 * float(num_fuzzy_matches) / float(num_current_residual_cases), 1)
    else:
        pct = 0.0
    print("Fuzzy matches (using " + source + "): " + str(num_fuzzy_matches) +
          " of " + str(num_current_residual_cases) + " unmatched cases (" +
          str(pct) + "%)")
    return df, fuzzy_info
def process():
    """Worker loop: detect badge text in queued frames and recognise names.

    While the module-level ``runned`` flag is true, the frame in
    ``to_process`` is taken under ``lock`` and passed to the multi-model
    driver. The detected text entries are matched against a fixed set of
    names, and the result is published through the module-level ``result``
    and ``last_processed``. When a name is found, an overlay is drawn and,
    if ``output_dir`` is set, a crop is saved.
    """
    global to_process, new_count, result, last_processed

    names = fuzzyset.FuzzySet()
    for known_name in ('stas khirman', 'khirman stas', 'stas', 'khirman'):
        names.add(known_name)

    serving = multimodel.MultiModelDriver(init_hook=face_badge.init_hook,
                                          process=face_badge.process_internal)
    kwargs = {'ml-serving-drivers': ['openvino', 'tensorflow', 'tensorflow']}
    serving.load_model(
        ['./vidos/faces/face-detection.xml', './vidos/m1', './vidos/m2'],
        **kwargs)

    i_name = 1
    while runned:
        lock.acquire(blocking=True)
        frame = to_process
        if frame is None:
            lock.release()
            continue

        print('start frame')
        to_process = None
        results = serving.predict_hooks({
            'pixel_threshold': 0.5,
            'link_threshold': 0.5,
            'image': frame
        })
        frame = results['output']
        table = results['table_output']

        found_name = None
        candidates = []
        for e in table:
            text = e['name']
            if len(text) <= 2:
                continue
            found = names.get(text)
            if (found is not None) and (len(found) > 0) and found[0][0] > 0.7:
                text = found[0][1]
                if ' ' in text:
                    # A multi-word match is taken as the final answer.
                    found_name = (found[0][0], text)
                    candidates = []
                    break
                candidates.append(text)

        if (found_name is None) and len(candidates) > 0:
            found_name = choose_one(names, candidates)

        if found_name is not None:
            add_overlays(frame, found_name[0], found_name[1])
            # `e` is the entry the loop above stopped on.
            to_save = e['image'][:, :, ::-1]
            if output_dir != '':
                name = found_name[1].replace(" ", "_")
                to_dir = '{}/{}'.format(output_dir, name)
                if not os.path.exists(to_dir):
                    os.mkdir(to_dir)
                fname = '{}/auto_{}_{}.jpg'.format(to_dir, int(time.time()),
                                                   i_name)
                logging.info('Save new picture: {}'.format(fname))
                cv2.imwrite(fname, to_save)
                new_count = 1

        result = frame
        i_name += 1
        last_processed = frame
        lock.release()
        print('stop frame')
def test_type_directInstantiation(self):
    """A FuzzySet built with no arguments is an instance of FuzzySet."""
    instance = fuzzyset.FuzzySet()
    self.assertIsInstance(instance, fuzzyset.FuzzySet)
import fuzzyset
from fuzzyset import *

# Load the test phrases, one per line. The context manager closes the file,
# which the original left open.
with open(
        "/Users/dongxinyuan/Desktop/Projektpraktikum Information Service Engineering/data/rawdata/split/test.txt",
        "r") as f:
    testlist = f.read().split("\n")

# The constructor already adds every item of the iterable, so the follow-up
# loop that re-added each line was a no-op and has been dropped.
a = fuzzyset.FuzzySet(testlist)

print(a.get("history"))
def compare_st_to_grid(info, year):
    """Classify a microdata street name against the street grid of its EDs.

    Args:
        info: ``(stname, ed_list)`` pair: the street name and the EDs it
            appears in.
        year: 1930 or 1940; selects the ED polygon layer to join with the grid.

    Returns:
        A one-row DataFrame with columns ``original``, ``exact``,
        ``type_fix``, ``fuzzy``, ``check_grid`` and ``correct_st``. At most
        one of the four flags is 1, and ``correct_st`` is filled only for
        ``type_fix`` and ``fuzzy`` results.

    Raises:
        ValueError: If ``year`` is neither 1930 nor 1940.
    """
    stname, ed_list = info

    def _row(exact=0, type_fix=0, fuzzy=0, check_grid=0, correct_st=''):
        # Single place that builds the one-row result frame.
        return pd.DataFrame({
            'original': [stname],
            'exact': [exact],
            'type_fix': [type_fix],
            'fuzzy': [fuzzy],
            'check_grid': [check_grid],
            'correct_st': [correct_st]
        })

    # convert ed_list to all strings, make lower to match letters in polygon files
    ed_list = [str(a).lower() for a in ed_list]

    # spatial join the grid with EDs the microdata street appears in
    if year == 1930:
        joined = gpd.sjoin(
            ed_poly_30.loc[ed_poly_30.ED_edit.isin(ed_list)], grid_geo)
    elif year == 1940:
        joined = gpd.sjoin(
            ed_poly_40.loc[ed_poly_40.ED_num.isin(ed_list)], grid_geo)
    else:
        # Previously `joined` was left unbound and failed later with an
        # unrelated NameError.
        raise ValueError("year must be 1930 or 1940, got %r" % (year,))

    # create new df out of unique street names in grid
    df = pd.DataFrame({
        'grid': joined.st30.dropna().unique().tolist() +
                joined.st40.dropna().unique().tolist()
    })
    df = df.loc[df.grid != '']
    # drop direction from all grid names
    df['nodir'] = df.grid.apply(drop_dir).apply(
        lambda x: x.decode('utf-8', 'ignore'))
    # separate type from all grid names
    df['notype'] = df.nodir.apply(drop_type).apply(
        lambda x: x.decode('utf-8', 'ignore'))
    df['type'] = df.nodir.apply(st_type)
    df = df.loc[df.type.apply(lambda x: x is not None)]
    df['type'] = df['type'].apply(lambda x: x.group().replace(
        ' ', '')).apply(lambda x: x.decode('utf-8', 'ignore'))

    # create a list of no-direction street names from the entire grid
    all_streets = (grid_geo.st30.dropna().unique().tolist() +
                   grid_geo.st40.dropna().unique().tolist())
    all_streets = list(set(map(drop_dir, all_streets)))

    # CHECK 1: microdata street appears exactly in the entire grid
    stname_nodir = drop_dir(stname).decode('utf-8', 'ignore')
    if stname_nodir in all_streets:
        return _row(exact=1)

    # CHECK 2: same name, and the grid type is one of St, Ave, Road
    stname_nodir_notype = drop_type(stname_nodir)
    try:
        stname_type = st_type(stname_nodir).group().replace(' ', '')
    except Exception:
        # if the street type cannot be extracted, it should be looked for in grid
        return _row(check_grid=1)

    if stname_nodir_notype in df['notype'].tolist():
        check = df.loc[(df.notype == stname_nodir_notype) & (
            df.type.isin(['St', 'Ave', 'Road']))].reset_index()
        # accept only when there is exactly one option to choose from
        if len(check) == 1:
            return _row(type_fix=1, correct_st=check.iloc[0]['nodir'])
        return _row(check_grid=1)

    # CHECK 3: very close fuzzy name match with an exactly matching type
    notype_fs = fuzzyset.FuzzySet(df.notype.unique())
    match = notype_fs.get(stname_nodir_notype)
    # if match fails, case must be checked in grid
    if match is None:
        return _row(check_grid=1)

    if match[0][0] > 0.8:
        check = df.loc[df.notype == match[0][1]].reset_index()
        # confirm only one matching street, and that it has the same type
        if len(check) == 1 and stname_type == check.iloc[0]['type']:
            return _row(fuzzy=1, correct_st=check.iloc[0]['nodir'])

    # all checks failed: mark the street to be checked in the grid
    return _row(check_grid=1)
import pandas as pd
import fuzzyset
import sys
import util.util as util
import util.constants as c

# N-gram comparison table and a fuzzy set over its ``ngram`` column.
db = pd.read_pickle(f'resources/compare_{c.N_NGRAM}.db')
match_set = fuzzyset.FuzzySet(db.ngram)


def get_matches(seq):
    """Find database rows whose n-gram best matches ``seq``.

    Matching rows are discarded when the row ``c.N_NGRAM`` positions later
    lies past the end of ``db``, belongs to a different track, or when the
    row's track is one of ``c.reference_songs``.

    Returns:
        ``(None, None)`` when no usable row remains. As visible here the
        function otherwise falls off the end and returns ``None``.
    """
    ngram = midi_to_ngram(seq)
    best_match = match_set.get(ngram)[0][1]
    matches = db[db.ngram == best_match]

    # Information comes from *next* ngram
    def _unusable(i):
        return (i + c.N_NGRAM >= len(db)
                or db.iloc[i].track != db.iloc[i + c.N_NGRAM].track
                or db.iloc[i].track in c.reference_songs)

    to_drop = [i for i, _ in matches.iterrows() if _unusable(i)]
    matches = matches.drop(to_drop)
    next_matches = db.iloc[matches.index + c.N_NGRAM]
    if matches.empty:
        return None, None
def updateList(filename):
    """Interactively bump the watched-episode count on MAL for a file's anime.

    After confirmation, the anime name is parsed from ``filename`` and an
    access token is obtained (fresh login or cached in
    ``data/loginData.json``). The name is fuzzy-matched against every title
    of every show in the user's "watching" list. When the best score is at
    least 0.5, that show's watched count is incremented. If that completes
    the show, it is marked completed and the user is asked for a score.

    Args:
        filename: Name of the video file that was just watched.
    """
    toUpdate = input("[*] Update list on MAL? (y/n): ")
    if toUpdate != 'y':
        print("\n")
        return
    print("[*] Preparing to update list")

    # PARSER HERE - Get Anime name from filename
    try:
        d = ap.Parse(filename)
        animename = d.getParsedValues()['anime']
    except Exception:
        print("\033[91m[-] Unsupported filename format/Not an anime file! Skipping.\033[0m")
        return

    # Get loginData from login json file
    with open("data/loginData.json", "r") as f:
        loginData = json.load(f)

    # If loginData is empty or the access_token has expired, do a fresh login
    if (not loginData) or ((time.time() - float(loginData['access_token'][1])) > float(loginData['expires_in'])):
        print("[*] Doing a fresh login")
        # Get login credentials from config
        info = credentialCheck()
        # Log in and store the token together with the current timestamp
        loginInfo = mal.User.login(info[0], info[1])
        loginInfo['access_token'] = [loginInfo['access_token'], str(time.time())]
        with open("data/loginData.json", "w+") as f:
            json.dump(loginInfo, f, indent=4)
        AT = loginInfo['access_token'][0]
    else:
        credentialCheck()
        print("[*] Grabbing existing Access Token from file")
        AT = loginData['access_token'][0]

    # Get User's watchlist
    animeInfo = mal.User.getAnimeList(AT, "watching", ["alternative_titles", "num_episodes", "my_list_status"])

    # Flatten the page-wise responses into one list of shows
    aniList = []
    for res in animeInfo:
        for item in res['data']:
            node = item['node']
            alt_titles = node['alternative_titles']
            aniList.append({
                # Original, English, Japanese and all other alternate titles
                'names': [node['title'], alt_titles['en'], alt_titles['ja']] + list(alt_titles['synonyms']),
                'id': str(node['id']),
                'episodes': str(node['num_episodes']),
                'status': node['my_list_status']['status'],
            })

    fset = fuzzyset.FuzzySet()
    fset.add(animename)

    # Best score per show across all of its names; 0 when nothing matched
    probValues = []
    for show in aniList:
        probList = []
        for name in show['names']:
            # Skip blank and missing titles. The original `name is not ""`
            # was an identity test on a literal and let None through.
            if not name:
                continue
            fuzzyInfo = fset.get(name)
            # get() returns None when the strings are completely different
            if fuzzyInfo is not None:
                probList.append(fuzzyInfo[0][0])
        probValues.append(max(probList) if probList else 0)

    # An empty watchlist is "not found" instead of max() raising ValueError
    if probValues and max(probValues) >= 0.5:
        toUpdate_idx = probValues.index(max(probValues))
        toUpdate_name = aniList[toUpdate_idx]['names'][0]
        toUpdate_ID = aniList[toUpdate_idx]['id']
        toUpdate_Eps = int(aniList[toUpdate_idx]['episodes'])
        toUpdate_status = aniList[toUpdate_idx]['status']

        # Get previously watched episodes from list
        for res in animeInfo:
            for show in res['data']:
                if show['node']['title'] == toUpdate_name:
                    oldVal = int(show['node']['my_list_status']['num_episodes_watched'])

        # Update list with previously watched episodes + 1
        mal.User.updateList(AT, toUpdate_ID, {"num_watched_episodes": oldVal + 1})
        print("\033[92m[+]\033[0m \033[93m{}\033[0m \033[92mwas updated!\033[0m \033[96m{} --> {}\033[0m".format(toUpdate_name, oldVal, oldVal + 1))

        if (oldVal + 1) == toUpdate_Eps:
            mal.User.updateList(AT, toUpdate_ID, {"status": "completed"})
            print("\033[92m[+] Anime Completed!\033[0m Status updated: \033[96m{} --> completed\033[0m".format(toUpdate_status))
            try:
                score = int(input("[*] Score? (1-10): "))
                if 1 <= score <= 10:
                    mal.User.updateList(AT, toUpdate_ID, {"score": score})
                    print("\033[92m[*] Score updated.\033[0m")
                    print("\033[92m[*] Done.\033[0m")
                else:
                    print("\033[91m[*] Skipped.\033[0m")
            except Exception:
                print("\033[91m[-] Invalid input.\033[0m")
    else:
        print("\033[91m[-] This show does not seem to be in your watchlist! Skipping.\033[0m\n")
        return
    print("")
def find_fuzzy_matches(df, city, street, sm_all_streets, sm_ed_st_dict, check_too_similar=False):
    """Fuzzy-match streets without an exact match against Steve Morse streets.

    Args:
        df: DataFrame with ``ed``, the ``street`` column and
            ``exact_match_bool<post>``.
        city: City name, used only in the timing message.
        street: Name of the street column. ``post`` is derived from the part
            of its third ``_``-separated field before ``HN`` and is empty
            when there is no such field.
        sm_all_streets: Iterable of all Steve Morse street names.
        sm_ed_st_dict: Mapping from ED to the streets of that ED.
        check_too_similar: When true, reject an agreed match if another
            street in the ED differs from it by exactly one character.

    Returns:
        ``(df, fuzzy_info)``. ``df`` gains ``fuzzy_match_sm<post>``,
        ``fuzzy_match_sm_score<post>`` and ``fuzzy_match_sm_bool<post>``;
        ``fuzzy_info`` is ``[num_fuzzy_matches, minutes_taken, problem_EDs]``.
    """
    try:
        post = '_' + street.split('_')[2].split('HN')[0]
    except (IndexError, AttributeError):
        post = ''

    num_records = len(df)
    cprint("Fuzzy matching algorithm for %s \n" % (street), attrs=['underline'], file=AnsiToWin32(sys.stdout))
    start = time.time()

    # Create a set of all streets for fuzzy matching (create once, call on)
    sm_all_streets_fuzzyset = fuzzyset.FuzzySet(sm_all_streets)

    # Keep track of problem EDs
    problem_EDs = []

    # Function to check if street names differ by exactly one character
    def diff_by_one_char(st1, st2):
        if len(st1) != len(st2):
            return False
        return sum(a != b for a, b in zip(st1, st2)) == 1

    # Fuzzy matching algorithm
    def sm_fuzzy_match(street, ed):
        nomatch = ['', '', False]
        # Return null if street is blank
        if street == '':
            return nomatch
        # Microdata ED may not be in Steve Morse; record it as a problem ED
        try:
            sm_ed_streets = sm_ed_st_dict[ed]
            sm_ed_streets_fuzzyset = fuzzyset.FuzzySet(sm_ed_streets)
        except Exception:
            problem_EDs.append(ed)
            return nomatch
        # Step 1: Find best match among streets associated with microdata ED
        try:
            best_match_ed = sm_ed_streets_fuzzyset[street][0]
        except Exception:
            return nomatch
        # Step 2: Find best match among all streets
        try:
            best_match_all = sm_all_streets_fuzzyset[street][0]
        except Exception:
            return nomatch
        # Step 3: If both best matches are the same, return as best match
        if best_match_ed[1] == best_match_all[1] and best_match_ed[0] >= 0.5:
            if check_too_similar:
                too_similar = sum(
                    diff_by_one_char(st, best_match_ed[1])
                    for st in sm_ed_streets)
                if too_similar != 0:
                    return nomatch
            return [best_match_ed[1], best_match_ed[0], True]
        # Step 4: Otherwise return the higher score (to help manual cleaning)
        if best_match_all[0] < best_match_ed[0]:
            return [best_match_ed[1], best_match_ed[0], False]
        return [best_match_all[1], best_match_all[0], False]

    # Create dictionary based on Street-ED pairs for faster lookup
    df_no_exact_match = df[~(df['exact_match_bool' + post])]
    df_grouped = df_no_exact_match.groupby([street, 'ed'])
    sm_fuzzy_match_dict = {}
    for st_ed, _ in df_grouped:
        sm_fuzzy_match_dict[st_ed] = sm_fuzzy_match(st_ed[0], st_ed[1])

    # Helper function (dictionary is built only for cases without validated
    # exact matches)
    def get_fuzzy_match(exact_match, street, ed):
        # "Unnamed" streets must never be fuzzy matched
        if exact_match or 'Unnamed' in street:
            return ['', '', False]
        return sm_fuzzy_match_dict[street, ed]

    # Get fuzzy matches
    df['fuzzy_match_sm' + post], df['fuzzy_match_sm_score' + post], df['fuzzy_match_sm_bool' + post] = zip(
        *df.apply(lambda x: get_fuzzy_match(x['exact_match_bool' + post], x[street], x['ed']), axis=1))

    # Compute number of cases without exact match
    num_current_residual_cases = num_records - len(df[df['exact_match_bool' + post]])

    # Generate dashboard information
    num_fuzzy_matches = np.sum(df['fuzzy_match_sm_bool' + post])
    end = time.time()
    fuzzy_matching_time = round(float(end - start) / 60, 1)
    fuzzy_info = [num_fuzzy_matches, fuzzy_matching_time, problem_EDs]

    # When every record already had an exact match the percentage used to
    # divide by zero.
    if num_current_residual_cases:
        pct = round(100 * float(num_fuzzy_matches) / float(num_current_residual_cases), 1)
    else:
        pct = 0.0
    cprint("Fuzzy matches (using microdata ED): " + str(num_fuzzy_matches) + " of " + str(num_current_residual_cases) + " unmatched cases (" + str(pct) + "%)\n", file=AnsiToWin32(sys.stdout))
    cprint("Fuzzy matching for %s took %s\n" % (city, fuzzy_matching_time), 'cyan', attrs=['dark'], file=AnsiToWin32(sys.stdout))
    return df, fuzzy_info
def find_fuzzy_matches(df, city, street, sm_all_streets, sm_ed_st_dict):
    """Flag exact and fuzzy Steve Morse (SM) street matches for every row.

    A street is fuzzy-matched only when the best candidate among the streets
    listed for the row's ED is also the best candidate among all SM streets
    and its score is at least 0.5.

    Args:
        df: DataFrame with at least the columns ``st_edit``, ``ed`` and the
            column named by ``street``. Modified in place and returned.
        city: Unused here; kept so existing callers keep working.
        street: Name of the column holding the street name to match.
        sm_all_streets: Collection of all SM street names.
        sm_ed_st_dict: Mapping from ED label (string) to that ED's SM streets.

    Returns:
        ``df`` with the added columns ``st_edit_exact_match``,
        ``st_edit_fuzzy_match``, ``st_edit_fuzzy_match_score``,
        ``st_edit_fuzzy_match_bool`` and ``st_edit_matched``.
    """
    # Sentinel for "no fuzzy match"; immutable so sharing it is safe.
    no_match = ('', '', False)

    num_records = df['st_edit'].notnull().sum()

    #
    # Identify exact matches to exclude from fuzzy match search
    #
    df['st_edit_exact_match'] = df[street].apply(lambda s: s in sm_all_streets)
    print("Exact matches: %s of %s" % (str(df['st_edit_exact_match'].sum()), str(num_records)))

    #
    # Find the best matching Steve Morse street name
    #

    # Fuzzy set over all streets (created once, queried many times)
    sm_all_streets_fuzzyset = fuzzyset.FuzzySet(sm_all_streets)

    # One fuzzy set per ED, built lazily; None marks an ED that could not be
    # resolved (missing from sm_ed_st_dict or with unusable street data).
    ed_fuzzysets = {}

    def normalize_ed(ed):
        """Return the ED as the string key used by ``sm_ed_st_dict``."""
        # The original test `ed != str` compared against the type object and
        # was always true, so the int round-trip ran unconditionally and
        # crashed on non-numeric labels. Keep the round-trip where it works
        # (12.0 -> "12") and fall back to the plain string otherwise.
        try:
            return str(int(ed))
        except (TypeError, ValueError, OverflowError):
            return str(ed)

    def fuzzyset_for_ed(ed):
        """Return the cached fuzzy set for ``ed``, or None if unavailable."""
        if ed not in ed_fuzzysets:
            try:
                ed_fuzzysets[ed] = fuzzyset.FuzzySet(sm_ed_st_dict[ed])
            except Exception:
                # Microdata ED may not be in Steve Morse: treat as unmatched.
                ed_fuzzysets[ed] = None
        return ed_fuzzysets[ed]

    def best_candidate(fuzzy_set, street_name):
        """Return the top ``(score, name)`` candidate, or None if none."""
        try:
            return fuzzy_set[street_name][0]
        except Exception:
            # Best effort, as in the original: any lookup failure (no
            # candidate, non-text input) simply means "no match".
            return None

    def sm_fuzzy_match(street_name, ed):
        """Return ``(matched_name, score, True)`` or the no-match sentinel."""
        ed = normalize_ed(ed)

        # Return null if street is blank
        if street_name == '':
            return no_match

        ed_fuzzyset = fuzzyset_for_ed(ed)
        if ed_fuzzyset is None:
            return no_match

        # Step 1: best match among streets associated with the microdata ED
        best_match_ed = best_candidate(ed_fuzzyset, street_name)
        if best_match_ed is None:
            return no_match

        # Step 2: best match among all streets
        best_match_all = best_candidate(sm_all_streets_fuzzyset, street_name)
        if best_match_all is None:
            return no_match

        # Step 3: accept only when both searches agree and the score is high
        if best_match_ed[1] == best_match_all[1] and best_match_ed[0] >= 0.5:
            return (best_match_ed[1], best_match_ed[0], True)
        return no_match

    # Resolve each distinct (street, ED) pair once for faster lookup
    df_st_edit = df[~df['st_edit_exact_match']]
    sm_fuzzy_match_dict = {}
    for st_ed, _ in df_st_edit.groupby([street, 'ed']):
        sm_fuzzy_match_dict[st_ed] = sm_fuzzy_match(st_ed[0], st_ed[1])

    def get_fuzzy_match(exact_match, street_name, ed):
        """Look up the precomputed fuzzy match for one row."""
        # Rows with a validated exact match need no fuzzy match
        if exact_match:
            return no_match
        try:
            # Need to make sure "Unnamed" street doesn't get fuzzy matched
            if 'Unnamed' in street_name:
                return no_match
        except TypeError:
            # Missing / non-text street (e.g. NaN) cannot be matched.
            return no_match
        # groupby drops null keys, so rows with a null street or ED have no
        # dictionary entry; treat them as unmatched instead of raising.
        return sm_fuzzy_match_dict.get((street_name, ed), no_match)

    # Get fuzzy matches (column-wise iteration; also works for an empty frame)
    matches = [
        get_fuzzy_match(is_exact, street_name, ed)
        for is_exact, street_name, ed in zip(
            df['st_edit_exact_match'], df[street], df['ed'])
    ]
    df['st_edit_fuzzy_match'] = [m[0] for m in matches]
    df['st_edit_fuzzy_match_score'] = [m[1] for m in matches]
    df['st_edit_fuzzy_match_bool'] = [m[2] for m in matches]

    print("Fuzzy matches: %s of %s" % (str(df['st_edit_fuzzy_match_bool'].sum()), str(len(df))))
    print("Unmatched cases: %s of %s" % (str(
        len(df) - df['st_edit_fuzzy_match_bool'].sum()
        - df['st_edit_exact_match'].sum()), str(len(df))))

    # Start from the edited street and overwrite with the fuzzy match where
    # there was no exact match but a fuzzy one was accepted.
    df['st_edit_matched'] = df['st_edit']
    df.loc[~df['st_edit_exact_match'] & df['st_edit_fuzzy_match_bool'],
           'st_edit_matched'] = df['st_edit_fuzzy_match']

    return df
import fuzzyset

# Seed a fuzzy set with the reference terms we want to match against.
fz = fuzzyset.FuzzySet()
for term in ("Diane Abbott", "Boris Johnson"):
    fz.add(term)

# Query the set with a sample term to see which reference terms it resembles.
sample_term = 'Boris Johnstone'
fz.get(sample_term)  # , fz.get('Diana Abbot'), fz.get('Joanna Lumley')
from django.http import HttpResponse, JsonResponse import os, mimetypes import fuzzyset THRESHOLD = 0.3 DIFFERENCE_THRESHOLD = 0 # Index the lines in all our subtitles once upon deploy directory = os.path.dirname(os.path.realpath(__file__)) + "/../data" index = 0 sets = {} for filename in os.listdir(directory): if filename[0] == ".": continue subfile = open(directory + "/" + filename) fset = fuzzyset.FuzzySet() count = -1 for line in subfile: count += 1 line = line.strip() if count <= 1: continue elif line == "": count = -1 continue fset.add(line) index += 1 sets[filename] = fset def subs(request):
def process():
    """Worker loop: detect text in pending frames and match it to known names.

    While the module-level ``runned`` flag is truthy, takes the frame stored
    in ``to_process`` (under ``lock``), runs the text-detection model
    (``./m1``) and the text-recognition model (``./m2``) on it, fuzzy-matches
    the recognised strings against the known names, draws the detected boxes
    and any matched name onto the frame, writes the annotated frame to
    ``results/result_<n>.jpg`` and publishes it through the ``result`` and
    ``last_processed`` globals.
    """
    global chrset_index, to_process, result, last_processed

    size = 1024  # longest image side fed to the detection model
    charset, _ = read_charset()
    chrset_index = charset

    names = fuzzyset.FuzzySet()
    names.add('stas khirman')
    names.add('khirman stas')
    names.add('stas')
    names.add('khirman')

    drv1 = driver.load_driver('tensorflow')
    serving1 = drv1()
    serving1.load_model('./m1')
    drv2 = driver.load_driver('tensorflow')
    serving2 = drv2()
    serving2.load_model('./m2')

    i_name = 1
    # NOTE(review): when no frame is pending the loop releases and immediately
    # re-acquires the lock (busy wait); behaviour intentionally left as-is.
    while runned:
        lock.acquire(blocking=True)
        # try/finally guarantees the lock is released even if inference or
        # drawing raises; otherwise every other user of the lock would block
        # forever.
        try:
            frame = to_process
            if frame is None:
                continue
            print('start frame')
            to_process = None

            # Scale down so the longer side is at most `size`, keeping the
            # aspect ratio, then snap both sides with fix_length(..., 32).
            w = frame.shape[1]
            h = frame.shape[0]
            if w > h:
                if w > size:
                    ratio = size / float(w)
                    h = int(float(h) * ratio)
                    w = size
            else:
                if h > size:
                    ratio = size / float(h)
                    w = int(float(w) * ratio)
                    h = size
            w = fix_length(w, 32)
            h = fix_length(h, 32)

            # Reverse the channel order for the models (presumably BGR -> RGB).
            original = frame[:, :, ::-1].copy()
            image = cv2.resize(original, (w, h))
            image = image.astype(np.float32) / 255.0
            image = np.expand_dims(image, 0)

            outputs = serving1.predict({'image': image})
            cls = outputs['pixel_pos_scores'][0]
            links = outputs['link_pos_scores'][0]
            mask = decodeImageByJoin(cls, links, 0.5, 0.1)
            bboxes = maskToBoxes(mask, (original.shape[1], original.shape[0]))

            found_name = None
            candidates = []
            for bbox in bboxes:
                # np.intp replaces np.int0, which was removed in NumPy 2.0.
                box = cv2.boxPoints(bbox).astype(np.intp)
                # Pad the axis-aligned bounds by 2px, clamped to the image.
                maxp = np.max(box, axis=0) + 2
                minp = np.min(box, axis=0) - 2
                y1 = max(0, minp[1])
                y2 = min(original.shape[0], maxp[1])
                x1 = max(0, minp[0])
                x2 = min(original.shape[1], maxp[0])
                text_img = original[y1:y2, x1:x2, :]
                if text_img.shape[0] < 4 or text_img.shape[1] < 4:
                    # Crop too small to recognise.
                    continue

                text_img = norm_image_for_text_prediction(text_img, 32, 320)
                text_img = np.expand_dims(text_img, 0)
                text = serving2.predict({'images': text_img})
                text = text['output'][0]
                text = get_text(text)
                if len(text) <= 2:
                    continue

                print('text: {}'.format(text))
                found = names.get(text)
                if found is None or len(found) == 0:
                    continue

                print(found[0])
                if found[0][0] > 0.7:
                    text = found[0][1]
                    if ' ' in text:
                        # A multi-word match is taken as the final answer.
                        found_name = (found[0][0], text)
                        candidates = []
                        break
                    else:
                        # Single-word match: keep it as a candidate only.
                        candidates.append(text)

            if (found_name is None) and len(candidates) > 0:
                found_name = choose_one(names, candidates)

            # Outline every detected text box on the image.
            for bbox in bboxes:
                box = cv2.boxPoints(bbox).astype(np.intp)
                original = cv2.drawContours(original, [box], 0, (255, 0, 0), 2)

            # Restore the original channel order for saving and publishing.
            frame = np.ascontiguousarray(original[:, :, ::-1], np.uint8)
            if found_name is not None:
                add_overlays(frame, found_name[0], found_name[1])
            cv2.imwrite('results/result_{}.jpg'.format(i_name), frame)
            result = frame
            i_name += 1
            last_processed = frame
        finally:
            lock.release()
        print('stop frame')