def resolve_name_from_imdb(self, name):
    """Resolve *name* against the sorted IMDb title list via fuzzy matching.

    Returns an ``(imdb_name, score, year)`` tuple.  First tries a fast
    pass restricted to titles sharing the query's first letter and
    returns immediately on an exact (score == 100) match; otherwise
    falls back to a full token-sort search over all titles.
    """
    if name:  # guard: name[0] would raise IndexError on an empty query
        this_letter = name[0]
        lo = bisect_left(self.imdb_movies, this_letter)
        hi = bisect_left(self.imdb_movies, chr(ord(this_letter) + 1))
        # BUGFIX: the original required lo != 0 and hi != 0, which wrongly
        # skipped the fast path whenever the letter range started at index
        # 0; a non-empty slice (hi > lo) is the only real requirement.
        if hi > lo:
            imdb_name, score = process.extractOne(name, self.imdb_movies[lo:hi])
            if score == 100:
                # figure out the year too
                idx = self.imdb_movies.index(imdb_name)
                return (imdb_name, score, self.imdb_movies_year[idx])
    print("Could not get an exact match, will perform a full search...")
    imdb_name, score = process.extractOne(name, self.imdb_movies,
                                          scorer=fuzz.token_sort_ratio)
    # figure out the year too
    idx = self.imdb_movies.index(imdb_name)
    return (imdb_name, score, self.imdb_movies_year[idx])
def testWithScorer(self):
    query = "new york mets at chicago cubs"
    choices = [
        "new york mets vs chicago cubs",
        "chicago cubs at new york mets",
        "atlanta braves vs pittsbugh pirates",
        "new york yankees vs boston red sox",
    ]
    choices_dict = {
        1: "new york mets vs chicago cubs",
        2: "chicago cubs vs chicago white sox",
        3: "philladelphia phillies vs atlanta braves",
        4: "braves vs mets",
    }
    # The default scorer prefers the more "complete" match of choices[1].
    best = process.extractOne(query, choices)
    self.assertEqual(best[0], choices[1])
    # In this hypothetical example we care about ordering, so quick ratio
    # (QRatio) as a custom scorer should pick the word-order match instead.
    scorer = fuzz.QRatio
    best = process.extractOne(query, choices, scorer=scorer)
    self.assertEqual(best[0], choices[0])
    # Dict-shaped choices must work too.
    best = process.extractOne(query, choices_dict)
    self.assertEqual(best[0], choices_dict[1])
def map_all(self):
    '''
    Map every row of the mapping master file onto the registry elements.

    Vocabulary (kept from the original author's notes):
      - "mapkeys" are the columns of a mapping file: boilerplate such as
        'field name', 'source code', 'source value', etc.
      - each Element object's "mapdict" attribute (e.g. ihr.race.mapdict)
        is the dictionary of valueset value -> target mapping.
      - "element_keys" is a convenience list of that mapdict's KEYS with
        all nan values removed, because nan values trip up the fuzzy
        matching algorithm (extractOne).
    '''
    for row in self.mapmaster:
        if row[closest_match('field_name', self.mapkeys)] not in self.regobject.elements:
            continue
        mapdict_of_element = getattr(getattr(self.regobject, row['field name']), 'mapdict')
        # FIX: the comprehension variable used to shadow the loop variable
        # `x`; renamed so the row dict is never clobbered.
        element_keys = [k for k in mapdict_of_element.keys() if str(k) != 'nan']
        print(mapdict_of_element)
        # (removed a no-op expression statement that discarded the result
        # of closest_match('yes', ...) — leftover debug code)
        code = row[closest_match('source_code', self.mapkeys)]
        value = row[closest_match('source_value', self.mapkeys)]
        try:
            if process.extractOne(str(code), element_keys)[1] > 50:
                try:
                    mapdict_of_element[code] = row[closest_match('omop_concept_id', self.mapkeys)]
                except Exception:
                    handle_it()
            else:
                if process.extractOne(str(value), element_keys)[1] > 50:
                    try:
                        mapdict_of_element[value] = row[closest_match('omop_concept_id', self.mapkeys)]
                    except Exception:
                        handle_it()
                # NOTE(review): in the original layout this printed even
                # when the value-based mapping succeeded — confirm intent.
                print(str(row['field name']) + ", " + str(code) + " cannot be mapped")
        except Exception:
            handle_it()
def play_item(item, item_type=None):
    """Play *item* from the configured media directory.

    Dispatches on *item_type*: playlist folders for music/audiobooks/
    podcasts, tracks for radio.  With no (or an unknown) type, scans the
    whole media dir and picks whichever of the best track/playlist fuzzy
    match scores higher.
    """
    print('play_item')
    conf = settings.Config.get_config()
    media_dir = conf['media_dir']
    # FIX: three identical playlist branches collapsed into one lookup.
    playlist_dirs = {
        'muzyka': '/Music',
        'audiobook': '/Audiobooks',
        'podcast': '/Podcasts',
    }
    if item_type in playlist_dirs:
        albums, names = m3uparser.parseFolderForPlaylists(
            media_dir + playlist_dirs[item_type])
        load_best_playlist(albums, names, item)
    elif item_type == 'radio':
        tracks, titles = m3uparser.parseFolderForTracks(media_dir + '/Radio')
        load_best_track(tracks, titles, item)
    else:
        # try to play without a type
        print(media_dir)
        tracks, titles = m3uparser.parseFolderForTracks(media_dir)
        albums, names = m3uparser.parseFolderForPlaylists(media_dir)
        title = process.extractOne(item, titles)
        print(str(title))
        name = process.extractOne(item, names)
        print(str(name))
        # Compare match scores: track wins ties' complement (strictly better).
        if title[1] > name[1]:
            load_best_track(tracks, titles, item)
        else:
            load_best_playlist(albums, names, item)
def decodeName(self, card):
    """Decode a raw card listing into ``{'name': ..., 'id': ...}``.

    Returns None for full sets and for promo variants that are not
    individual cards.  The (possibly misspelled) card name is fuzzy
    matched against the per-set name list when available, otherwise
    against all known names.
    """
    if card['set'] == 'Full Sets':
        return None
    name = card['desc']
    if card['set'] == 'Promos':
        # Event-specific promos and pre-releases are not tracked.
        if name.find('Baltimare') != -1 or name.find('SDCC') != -1:
            return None
        if name.find('Pre-Release') != -1:
            return None
        if name.startswith('Lady Justice Volunteer Promo'):
            return {'name': 'Lady Justice, Judge & Jury', 'id': 'pf16PR'}
    card_set = card['set']  # renamed: `set` shadowed the builtin
    # Try to find the card ID. Should always be the last number.
    card_id = None  # renamed: `id` shadowed the builtin
    try:
        card_id = re.findall(r'[FPfp]?[^ \t\n#0-9]?[0-9]{1,3}', name)[-1]
        name = name.replace(card_id, '').strip()
        if name[-1] == '-':
            name = name[:-1]
        name = name.strip()
    except IndexError:
        # No number found, or the name was consumed entirely; keep what
        # we have (narrowed from a bare except).
        pass
    # Find the closest matching name. Things may be misspelled ;/
    if card_set not in self.namesBySet:
        extract = process.extractOne(name, self.allNames)
    else:
        extract = process.extractOne(name, self.namesBySet[card_set])
    return {'name': extract[0], 'id': card_id}
def preprocess_nonurban_text(text, intersections, threshold=80):
    """Extract a probable intersection name from free-text *text*.

    Looks for the first of several Hebrew location keywords (junction,
    interchange, entrance, road); the text after the keyword is kept
    only if it fuzzily matches a known intersection above *threshold*,
    otherwise the original text is returned unchanged.

    FIX: four byte-identical keyword branches collapsed into one loop
    (same keyword precedence as the original if/elif chain).
    """
    for keyword in ('צומת', 'מחלף', 'כניסה ל', 'כביש'):
        if keyword in text:
            candidate = text.split(keyword)[1].strip()
            suspected_intersection = process.extractOne(
                candidate,
                intersections.intersection,
                scorer=fuzz.token_set_ratio,
                score_cutoff=threshold)
            # Below the cutoff extractOne returns None: keep original text.
            return text if suspected_intersection is None else candidate
    return text
def mapTitle(title, pub_year):
    """Map *title* (published in *pub_year*) to a Marvel series row.

    Loads the series list CSV, keeps rows whose start/end year range
    covers *pub_year*, strips a trailing "(YYYY)" from each series title
    into 'newtitle', then fuzzy-matches *title* (and, failing that, a
    wordSwap()-translated variant) against the cleaned titles.  Returns
    the matching row dict when the score is >= 90, else None.
    """
    mtitles = []
    with open('/Users/petertamisin/demo/marvel_series_list.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:
            # Keep only series whose publication window covers pub_year.
            if (int(pub_year) >= int(line['startyear']) and
                    int(pub_year) <= int(line['endyear'])):
                debug(4, line)
                # Remove Series Dates from title
                if (line['title'].endswith(')') and line['title'].rfind('(') > 0):
                    idx = line['title'].rfind('(')
                    line['newtitle'] = line['title'][0:idx]
                    line['newtitle'] = line['newtitle'].rstrip()
                    debug(4, line['newtitle'])
                # NOTE(review): rows whose title has no "(...)" suffix never
                # get a 'newtitle' key, so the generator below would raise
                # KeyError for them — confirm whether an else-branch setting
                # newtitle = title is missing.
                mtitles.append(line)
    match = (process.extractOne(title, sorted(set(d['newtitle'] for d in mtitles), reverse=True)))
    score = match[1]
    if score >= 90:
        for mline in mtitles:
            if (mline["newtitle"] == match[0]):
                debug(4, mline)
                return mline
    else:
        # try match again with translated title
        match = (process.extractOne(wordSwap(title), sorted(set(d['newtitle'] for d in mtitles), reverse=True)))
        score = match[1]
        if score >= 90:
            for mline in mtitles:
                if (mline["newtitle"] == match[0]):
                    debug(4, mline)
                    return mline
        debug(2, 'MISMATCH:' + ''.join(str(e) for e in match) + '~' + wordSwap(title))
    # Implicit None: no sufficiently close series found.
    return
def predict_income(city, job, age, education, gender): city_names = city_name_map.keys() job_names = job_title_map.keys() standardized_city_name = process.extractOne(city, city_names)[0] standardized_job_name = process.extractOne(job, job_names)[0] given = [city_name_map[standardized_city_name], job_title_map[standardized_job_name], education] inputs = [] for field in fields: if field in given: print field inputs.append(1) else: inputs.append(0) age = float(age) age_std = (age - 16) / (85 - 16) inputs[fields.index('age')] = age_std inputs[fields.index('female')] = gender_map[gender] return np.dot(inputs, coefficients) * 1.5
def normalize_country(country):
    """Normalize a free-text country string to a canonical country name.

    Returns None for n/a-like or 'null' values; 'UK'/'Russia' for their
    common variants; a pycountry historic-country name when one matches;
    otherwise the best fuzzy match against current country names, or the
    input unchanged as a last resort.
    """
    if process.extractBests(country, choices=['N/A', 'n/a'], score_cutoff=HIGH_CUTOFF):
        return None
    if country == 'null':
        return None
    if country in ['USA', 'UK']:
        return country
    if process.extractOne(country, choices=['United Kingdom', 'UK'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['England', 'Wales', 'Scotland'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['Russia', 'Russian Federation'], score_cutoff=LOW_CUTOFF):
        return 'Russia'
    try:
        return pycountry.historic_countries.get(name=country.capitalize()).name
    except KeyError:
        pass
    try:
        return pycountry.historic_countries.get(alpha2=country.upper()).name
    except KeyError:
        pass
    countries = [c.name for c in pycountry.countries]
    best_match = process.extractOne(country, choices=countries, score_cutoff=90)
    # BUGFIX: extractOne returns a (name, score) tuple; the original
    # returned that whole tuple here instead of the matched name.
    return best_match[0] if best_match else country
def guess_column_names(columnname):
    '''
    An attempt to standardize and rename column headers for manipulation later.
    Input: column name (String)
    Output: Corrected name (String)
    '''
    # List of possible "correct" column headers.
    # (FIX: the original listed 'Work Street 2' twice.)
    correct_headers = ['First Name', 'Last Name', 'Fullname', 'Student Name', 'Job Title',
                       'Title', 'ID', 'Institution', 'School', 'Company', 'Company Name1', 'Company Name2',
                       'Organization Name', 'Department', 'Division',
                       'Email Address', 'Street Address', 'Street 1', 'Dorm Address 1', 'Dorm Address 2',
                       'Dorm Address 3', 'Dorm Address 4', 'Address 1', 'Street 2', 'Address 2', 'Address',
                       'Street 3', 'Address 3', 'Street 4', 'Address 4',
                       'Work Street 1', 'Work Street 2', 'Work Street 3', 'Work Street 4',
                       'Zipcode', 'Home Zipcode', 'Work_City', 'Dorm Postalplus4', 'HOME_FOREIGN_CITYZIP',
                       'WORK_FOREIGN_CITYZIP', 'Work_State', 'Work_Country',
                       'Postal', 'City', 'County', 'State', 'Country']
    # Exact match: nothing to correct.
    if columnname in correct_headers:
        return columnname
    # Very long names are usually prefixed junk; match on the tail only.
    if len(columnname) > 20:
        new_name, score = process.extractOne(columnname[-15:], correct_headers)
        return new_name
    # Otherwise fuzzy-match the whole name, keeping the original when the
    # best match is weak (score below 80).
    new_name, score = process.extractOne(columnname, correct_headers)
    return columnname if score < 80 else new_name
def fuzzThis(countries, choices): from fuzzywuzzy import fuzz from fuzzywuzzy import process for country in countries: print country print process.extractOne(country, choices)
def find_string(string):
    """Return the best fuzzy match for *string* from ratio.txt and
    companies.txt, joined with a space."""
    def best_line(path):
        # Read the file once and pick the closest line to the query.
        with open(path) as handle:
            return process.extractOne(string, handle.read().splitlines())[0]

    return best_line('ratio.txt') + " " + best_line('companies.txt')
def match_move(self, char, move, vt, data): ''' Main helper function that handles matching the move. Uses the reverse mapping of the common name, input command and short form converter to increase the chances of a better match. ''' # First find the char they want. char_match, char_ratio = process.extractOne(char, data.keys()) if char_ratio < self.char_ratio_thresh: return False # They might have supplied the move name in shortened format # so convert it to how the frame data dump expects. result = re.search(self.short_regex, move) if result: matched = result.group(0) # Slice to the second last char because the matched move might # be 'cr. 'or 'cr ' but the mapping only contains cr. move = re.sub( self.short_regex, self.short_mapping[matched[:-1]], move ) # Use the reverse mapping to determine which move they # were looking for. moves = data[char_match]['reverse_mapping'] move_match, move_ratio = process.extractOne(move, moves.keys()) if move_ratio < self.move_ratio_thresh: return False move = data[char_match]['reverse_mapping'][move_match] # Check if the matched name was a char stat or a move. if 'char_stat' in move: return char_match, move_match, move else: # Find the move they want. if vt: # The move might not have any difference in vtrigger # so just return the normal version. try: move_data = data[char_match]['moves'][self.vt_mappings[vt]][move] except KeyError: move_data = data[char_match]['moves']['normal'][move] else: try: move_data = data[char_match]['moves']['normal'][move] # Might be a vtrigger only move. except KeyError: try: move_data = data[char_match]['moves']['vtOne'][move] except KeyError: move_data = data[char_match]['moves']['vtTwo'][move] return char_match, move, move_data
def test_simplematch(self):
    query = 'a, b'
    candidates = ['a, b']
    # Both full-ratio and partial-ratio scorers must give a perfect
    # score for an exact match.
    self.assertEqual(
        process.extractOne(query, candidates, scorer=fuzz.ratio),
        ('a, b', 100))
    self.assertEqual(
        process.extractOne(query, candidates, scorer=fuzz.partial_ratio),
        ('a, b', 100))
def fuzzy_match(inputlist, choices):
    """Return the best fuzzy match from *choices* for a single string,
    or a list of best matches when given a list of strings."""
    if isinstance(inputlist, str):
        return process.extractOne(inputlist, choices)[0]
    return [process.extractOne(query, choices)[0] for query in inputlist]
def mapProjectsToLDAP(project_list, project_type, tenant_list=False):
    """Create a payload for ldap_updater module calls.

    Generate a list of dictionaries mapping Insightly properties to LDAP
    attributes.

    Args:
        project_list (List): A list of projects as JSON from Insightly to be
            converted into LDAP-like dictionaries.
        project_type (List): A description of the type of project, one of
            'SDA', 'FPA' or 'FPA (CRA)'.
        tenant_list (List, optional): A list of tenants as JSON from
            Insightly, i.e. projects on the 'OpenStack Tenant' category.

    Returns:
        List: The project list converted into dictionaries with the relevant
        LDAP attributes, including nested tenants.
    """
    # NOTE(review): the filter/map pipelines below are consumed as lists
    # (e.g. the [:1] slice on 'owner'), which presumes Python 2 semantics
    # where map()/filter() return lists — confirm before porting to Py3.
    return map(lambda p: {
        'o': str(p['PROJECT_ID']),
        'description': project_type,
        'cn': sanitize(p['PROJECT_NAME']),
        # owner: at most one linked contact whose role fuzzily matches a
        # technical role (score_cutoff=80).
        'owner': mapContactsToLDAP(filter(lambda owner: owner['CONTACT_ID'] in
                                          map(lambda c: c['CONTACT_ID'],
                                              filter(lambda o: o['CONTACT_ID'] is not None and
                                                     extractOne(str(o['ROLE']), TECH_ROLE, score_cutoff=80),
                                                     p['LINKS'])),
                                          USERS)
                                   )[:1],
        # seeAlso: every linked contact with an admin-like role.
        'seeAlso': mapContactsToLDAP(filter(lambda admin: admin['CONTACT_ID'] in
                                            map(lambda c: c['CONTACT_ID'],
                                                filter(lambda a: a['CONTACT_ID'] is not None and
                                                       extractOne(str(a['ROLE']), ADMIN_ROLE, score_cutoff=80),
                                                       p['LINKS'])),
                                            USERS)
                                     ),
        # member: every linked contact regardless of role.
        'member': mapContactsToLDAP(filter(lambda member: member['CONTACT_ID'] in
                                           map(lambda c: c['CONTACT_ID'],
                                               filter(lambda m: m['CONTACT_ID'] is not None,
                                                      p['LINKS'])),
                                           USERS)
                                    ),
        # tenants: recurse over second-level project links when a tenant
        # list was supplied; tagged with the OS_TENANT type.
        'tenants': mapProjectsToLDAP(filter(lambda t: t['PROJECT_ID'] in
                                            map(lambda sp: sp['SECOND_PROJECT_ID'],
                                                filter(lambda l: l['SECOND_PROJECT_ID'] is not None,
                                                       p['LINKS'])),
                                            tenant_list),
                                     project_type + [LU.OS_TENANT]) if tenant_list else [],
    }, project_list) if project_list else []
def get_fuzzy_player(player_name, wiki, real=False): """Get player id using fuzzy string match. Not available in UI mode.""" if real: fishname, ratio = process.extractOne(player_name, wiki.keys()) real_id = wiki.get(fishname) print u"It's {0}% {1}{2}{3}!".format(ratio, style.BOLD, real_id, style.END) else: fish_id, prob = process.extractOne(player_name, wiki.values()) print 'It is {0}% {1}!'.format(prob, fish_id) print '' for k,v in wiki.items(): if fuzz.ratio(v, fish_id) >= 85: print u'{0} --> {1}'.format(k, v)
def _getLDAPCompatibleAccount(self, account):
    """Return a copy of *account* shaped for LDAP: sets the objectClass
    and translates a truthy-looking 'isHidden' flag into
    employeeType='hidden'."""
    ldap_account = account.copy()
    ldap_account['objectClass'] = 'inetOrgPerson'
    hidden_flags = ldap_account.pop('isHidden')
    # extractOne returns a truthy tuple when any flag fuzzily matches 'True'.
    if extractOne('True', hidden_flags, score_cutoff=75):
        ldap_account['employeeType'] = 'hidden'
    return ldap_account
def main_parser(self, record, REFNO, TYPE, people, doctype):
    """Parse a raw credit *record* into name/role/locations/tracks fields.

    Parsed results are memoised in ``self.cache`` keyed by the record
    text; the name is fuzzy-matched against the supplied *people* list.
    """
    record = record.replace('=', '')
    data = {'name': [], 'role': [], 'locations': [], 'tracks': [], 'TYPE': TYPE, 'REFNO': REFNO}
    if record in self.cache.keys():
        # Cache hit: reuse previously parsed fields.
        data['name'] = self.cache[record]['name']
        data['role'] = self.cache[record]['role']
        data['locations'] = self.cache[record]['locations']
        data['tracks'] = self.cache[record]['tracks']
    elif len(people):
        data['name'] = fuzzyproc.extractOne(record, people)[0]
        # Track designations such as "A1, B2" anywhere in the record.
        tracks = re.findall('((?:A|B)\d+(\,*\s*(?:A|B)*\d+)*)', record)
        if len(tracks):
            for y in tracks[0][0].split(','):
                data['tracks'].append(y.strip())
            record = record.replace(tracks[0][0], '\n').strip()
        # Remaining comma-separated tokens that are not part of the name
        # are classified as roles or locations.
        tokens = filter(lambda x: x not in data['name'].split(), [y.strip() for y in record.split(',')])
        for item in tokens:
            if item in self.roles:
                data['role'].append(item)
            elif item in self.locations:
                data['locations'].append(item)
        # NOTE(review): when tracks were stripped above, `record` was
        # rewritten, so this caches under a DIFFERENT key than the one
        # looked up at the top of the function — confirm intent.
        self.cache[record] = data
    else:
        self.cache[record] = data
    data['role'] = list(set(data['role']))
    # Sound-recording creators default to 'performer' when no role parsed.
    if len(data['role']) == 0 and TYPE == 'CREATOR' and doctype == 'Sound Recording':
        data['role'].append('performer')
    return data
def testNullStrings(self):
    query = "new york mets at chicago cubs"
    choices = [
        None,
        "new york mets vs chicago cubs",
        "new york yankees vs boston red sox",
        None,
        None,
    ]
    # None entries must be skipped by the matcher rather than crash it.
    best = process.extractOne(query, choices)
    self.assertEqual(best[0], choices[1])
def process(service):
    """Parse text into commands."""
    text = service.data[ATTR_TEXT]
    match = REGEX_TURN_COMMAND.match(text)
    if not match:
        logger.error("Unable to process: %s", text)
        return
    name, command = match.groups()
    entities = {state.entity_id: state.name for state in hass.states.all()}
    # BUGFIX: extractOne returns None when no choice meets score_cutoff;
    # the original indexed [2] unconditionally and raised TypeError on a
    # miss before the guard below could run.
    result = fuzzyExtract.extractOne(name, entities, score_cutoff=65)
    entity_ids = result[2] if result else None
    if not entity_ids:
        logger.error(
            "Could not find entity id %s from text %s", name, text)
        return
    if command == 'on':
        hass.services.call(core.DOMAIN, SERVICE_TURN_ON, {
            ATTR_ENTITY_ID: entity_ids,
        }, blocking=True)
    elif command == 'off':
        hass.services.call(core.DOMAIN, SERVICE_TURN_OFF, {
            ATTR_ENTITY_ID: entity_ids,
        }, blocking=True)
    else:
        logger.error('Got unsupported command %s from text %s', command, text)
def _disableAndNotify(self, dn, ldap_conn):
    # Fetch the account entry; ldap_search returns (dn, attrs) pairs, so
    # [0][1] is the attribute dict of the first (only) result.
    account = ldap_conn.ldap_search(dn, _ldap.SCOPE_BASE, attrlist=['employeeType', 'cn', 'mail'])[0][1]
    # Only act when the entry exists and is not already (fuzzily) marked
    # disabled — extractOne with score_cutoff returns None on a miss.
    if account and ('employeeType' not in account or not extractOne(account['employeeType'][0], ['disabled'], score_cutoff=80)):
        ldap_conn.ldap_update(dn, [(_ldap.MOD_REPLACE, 'employeeType', 'disabled')])
        # NOTE(review): under Python 3 this map() is lazy and the mails
        # would never be sent; presumably Python 2 code — confirm.
        map(lambda e: self.mailer.sendCannedMail(e, self.mailer.CANNED_MESSAGES['disabled_account'], account['cn'][0]), account['mail'])
def count_in_category(x='call_type', filter_dict=None, model=DEFAULT_MODEL, app=DEFAULT_APP, sort=True, limit=1000):
    """
    Count records per discrete (categorical) value of a field.

    Returns a (field values, counts) pair of lists, or None when the
    query yields nothing.

    >>> x, y = count_in_category(x='call_type', filter_dict={'model__startswith': 'LC60'}, limit=5, sort=1)
    >>> len(x) == len(y) == 5
    True
    >>> y[1] >= y[0]
    True
    """
    sort = sort_prefix(sort)
    model = get_model(model, app)
    # Tolerate misspelled field names via fuzzy matching.
    x = fuzzy.extractOne(str(x), model._meta.get_all_field_names())[0]
    qs = model.objects.filter(**(filter_dict or {})).values(x).annotate(y=models.Count(x))
    if sort is not None:
        qs = qs.order_by(sort + 'y')
    qs = qs.all()
    if limit:
        qs = qs[:int(limit)]
    counts = normalize_choices(util.sod_transposed(qs), field_name=x, app=app, human_readable=True)
    if not counts:
        return None
    counts = consolidated_counts(counts, field_name=x, count_name='y')
    if sort is not None:
        counts = sorted_dict_of_lists(counts, field_names=['y', x], reverse=bool(sort))
    return counts[x], counts['y']
def getPersonsRef(person):
    """Fuzzy-match *person* against the known choices.

    Returns (matched name, Person ID, score); below a score of 91 a
    trailing MISSING marker string is appended to flag a weak match.
    """
    match = process.extractOne(person, choices)
    result = (match[0], personsDict[match[0]]['Person ID'], match[1])
    if match[1] > 90:
        return result
    return result + ('-----------------!---MISSING---!!',)
def count_in_date(x='date_time', filter_dict=None, model=DEFAULT_MODEL, app=DEFAULT_APP, sort=True, limit=100000):
    """
    Count records per calendar date of a date/datetime field.

    Returns a (date bins, counts) pair of lists.

    >>> from django.db import connection
    >>> connection.close()
    >>> x, y = count_in_date(x='date', filter_dict={'model__icontains': 'LC5'}, limit=5, sort=1)
    >>> len(x) == len(y) == 5
    True
    >>> y[1] >= y[0]
    True
    """
    sort = sort_prefix(sort)
    model = get_model(model, app)
    # Tolerate misspelled field names via fuzzy matching.
    x = fuzzy.extractOne(str(x), model._meta.get_all_field_names())[0]
    qs = model.objects.filter(**(filter_dict or {}))
    # Bin rows by SQL date() of the chosen column.
    qs = qs.extra({'date_bin_for_counting': 'date(%s)' % x})
    qs = qs.values('date_bin_for_counting')
    qs = qs.annotate(count_of_records_per_date_bin=models.Count('pk'))
    # FIXME: this duplicates the dict-of-lists sort below
    if sort is not None:
        qs = qs.order_by(sort + 'date_bin_for_counting')
    qs = qs.all()
    if limit:
        qs = qs[:int(limit)]
    table = util.sod_transposed(qs)
    if sort is not None:
        table = sorted_dict_of_lists(table, field_names=['count_of_records_per_date_bin', 'date_bin_for_counting'], reverse=bool(sort))
    return table['date_bin_for_counting'], table['count_of_records_per_date_bin']
def location_guesses(self):
    """Build ModelGuess objects for user-entered locations and attach a
    best-effort planting_methods pk (or -1 sentinel) to each."""
    # Existing Location rows keyed by name, for fuzzy pk resolution.
    db_locations = dict([(l.name, l) for l in models.Location.objects.all()])
    existing = self.progress["locations"]
    guesses = self.make_model_guesses(names=self.locations(), existing=existing, name_model=db_locations)
    # Known planting methods keyed by their text, for fuzzy matching below.
    db_methods = dict([(pm.planting_methods, pm) for pm in models.PlantingMethod.objects.all()])
    for guess in guesses:
        name = guess.name
        planting_methods = -1
        # `existing` may be None or lack the name: TypeError/KeyError both
        # mean "no stored progress for this name".
        try:
            existing_name = existing[name]
        except (KeyError, TypeError):
            existing_name = {}
        try:
            planting_methods = existing_name["planting_methods"]
        except (KeyError, TypeError):
            planting_methods = -1
        if guess.ratio and guess.ratio > 98:
            # Near-certain location match: fall back to the blank
            # PlantingMethod row if one exists.
            try:
                planting_methods = models.PlantingMethod.objects.get(planting_methods="").pk
            except:
                planting_methods = -1
        elif db_methods:
            # Otherwise fuzzy-match the name against known method texts.
            db_methodname, ratio = fuzz_process.extractOne(name, db_methods.keys())
            if ratio > 85:
                planting_methods = db_methods[db_methodname].pk
        guess.set_extra({"planting_methods": planting_methods})
    return guesses
def get_model(model=DEFAULT_MODEL, app=DEFAULT_APP): """ >>> from django.db import connection >>> connection.close() >>> get_model('WikiI').__name__.startswith('WikiItem') True >>> connection.close() >>> isinstance(get_model('master'), models.base.ModelBase) True >>> connection.close() >>> get_model(get_model('CaseMaster', DEFAULT_APP)).objects.count() >= 0 True """ # print 'get_model' + repr(model) + ' app ' + repr(app) if isinstance(model, models.base.ModelBase): return model app = get_app(app) try: model_object = models.get_model(app, model) if model_object: return model_object except: pass app = get_app(app) if not app: return None model_names = [mc.__name__ for mc in models.get_models(app)] if app and model and model_names: return models.get_model(app.__package__.split('.')[-1], fuzzy.extractOne(str(model), model_names)[0])
def nc_att_get(self, attribute, variable=None):
    """
    Get attribute from NetCDF file. Default is to find into global attributes.
    If attribute key is not found, get the closest key name instead.

    :param str attribute: The attribute key to get
    :param str variable: The variable from which to find the attribute. Global is None.
    :returns: The attribute value
    :rtype: *str*
    :raises NoNetCDFAttribute: When no (close enough) attribute exists
    """
    with ncopen(self.ffp) as nc:
        if variable:
            attrs = nc.variables[variable].__dict__
        else:
            attrs = nc.__dict__
        if attribute in attrs.keys():
            return attrs[attribute]
        try:
            # BUGFIX: extractOne must be given the key names, not the dict
            # itself (a dict makes it return a 3-tuple, which broke this
            # 2-tuple unpacking and silently disabled the fuzzy fallback).
            key, score = process.extractOne(attribute, list(attrs), scorer=fuzz.partial_ratio)
        except Exception:
            raise NoNetCDFAttribute(attribute, self.ffp)
        if score >= 80:
            # BUGFIX: message now names the requested attribute instead of
            # hard-coded "frequency", and the value is looked up with []
            # (the original called the dict: attrs(key) -> TypeError).
            Print.warning('Consider "{}" attribute instead of "{}"'.format(key, attribute))
            return attrs[key]
        raise NoNetCDFAttribute(attribute, self.ffp)
def match_players_official(dfm, daily_projections, official_ids=None):
    """Map each projected player name to its closest official player name.

    :param dfm: DataFrame with a 'PLAYER_NAME' column of official names
    :param daily_projections: dict of DataFrames with a 'Player' column
    :param official_ids: optional dict to update in place
    :returns: dict of projection name -> best-matching official name
    """
    # FIX: `if not official_ids` also replaced a caller-supplied *empty*
    # dict, so the caller never saw the results; test for None explicitly.
    if official_ids is None:
        official_ids = {}
    # Hoisted out of the loop: the candidate list is invariant per call.
    candidates = dfm['PLAYER_NAME'].tolist()
    for player in daily_projections[yesterday_string]['Player'].unique():
        official_ids[player] = process.extractOne(player, candidates)[0]
    return official_ids
def make_model_guesses(self, names=None, existing=None, name_model=None):
    """
    names: list of names the user input
    existing: dictionary of {name: {pk: int}} that we've previously stored
    name_model: dictionary of {model.name: model} where model has member pk ('model.pk')

    returns a list of ModelGuess; pk is -1 when no stored or fuzzy match
    (ratio > 85) is found, and ratio is None unless fuzzy matching ran.
    """
    guesses = []
    for name in names:
        pk = -1
        # `existing` may be None or lack the name: both mean "no record".
        try:
            existing_name = existing[name]
        except (KeyError, TypeError):
            existing_name = None
        if existing_name:
            # FIX: narrowed a bare except to the same (KeyError, TypeError)
            # pattern used above — a bare except hid real errors.
            try:
                pk = existing_name["pk"]
            except (KeyError, TypeError):
                pk = -1
        ratio = None
        if pk == -1 and name_model:
            # No stored pk: fuzzy-match against known model names.
            db_name, ratio = fuzz_process.extractOne(name, name_model.keys())
            if ratio > 85:
                pk = name_model[db_name].pk
        guesses.append(ModelGuess(name, pk, ratio))
    return guesses
def get_citation_context(cits, sects, title2acl_ids, year2titles, author_last2titles):
    """Match citations to ACL ids and collect their section contexts.

    :param cits: parsed citations (each with title/book_title/date/authors/contexts)
    :param sects: paper sections, each with 'text', 'generic' and 'title'
    :param title2acl_ids: dict of paper title -> list of ACL ids
    :param year2titles: dict of year -> titles published that year
    :param author_last2titles: dict of author name part -> titles
    :returns: list of (acl_id, (sect_generic, sect_title, citStr)) tuples
    """
    cits_with_context = []
    for cit in cits:
        if cit['title'] is None or cit['book_title'] is None or cit['date'] is None:
            continue
        # Find section context: locate each citation string in the sections.
        sect_contexts = []
        for context in cit['contexts']:
            for sect in sects:
                if context.get('citStr') in sect['text']:  # found!
                    sect_contexts.append((sect['generic'], sect['title'], context.get('citStr')))
        if len(sect_contexts) == 0:
            continue
        # Filter for ACL proceedings
        # TODO could be improved
        if 'ACL' in cit['book_title'] or 'Linguistics' in cit['book_title']:
            year_candidates = set(year2titles[cit['date']])  # papers from the same year
            if len(year_candidates) > 0:
                # papers from authors with same name
                # note: all name parts are used, bc we do not know what the
                # first or last name is.
                author_names = [
                    name for author in cit['authors'] for name in author.split()
                ]
                author_candidates = []
                for name in author_names:
                    if name in author_last2titles:
                        author_candidates += author_last2titles[name]
                author_candidates = set(author_candidates)
                if len(author_candidates) > 0:
                    # candidate must be in both sets
                    candidates = year_candidates & author_candidates
                    if len(candidates) > 0:
                        match_title, score = process.extractOne(cit['title'], candidates)
                        # BUGFIX: fuzzywuzzy scores are on a 0-100 scale; the
                        # original compared against .95, which accepted almost
                        # every candidate.  The intended threshold is 95.
                        if score > 95 and match_title in title2acl_ids:
                            for acl_id in title2acl_ids[match_title]:
                                # Citation found in bib
                                for sc in sect_contexts:
                                    cits_with_context.append((acl_id, sc))
    return cits_with_context
def company_list(request):
    """Django view: look up tickers on the EDGAR Online API, scrape each
    company's latest 10-K cash-flow sheet from sec.gov, persist it and
    render the company list page.

    NOTE(review): this view is in a broken/debugging state — see the
    `return HttpResponse("hi")` below, which makes the entire scraping
    pipeline after it unreachable (and `b.find(...)` would fail anyway,
    since `b` is a list of soups, not a soup).
    """
    session = requests.Session()
    # Spoof a browser user agent so sec.gov serves the pages.
    session.headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }

    def make_soup(url):
        # Fetch the page with the session's headers and parse it.
        page = session.get(url, verify=False, headers=session.headers)
        html = page.content
        soup = BeautifulSoup(html, "html.parser")
        return soup

    data_list = []
    url_collection = []
    html_table = []
    excel_link = []
    indexlink = []
    company_inf = []
    cik = []
    primarysymbol = []
    companyname = []
    markettier = []
    sicdescription = []
    if request.method == 'POST':
        # Ticker either comes from the form field or from an uploaded CSV
        # (newlines collapsed into a comma-separated list).
        try:
            ticker = request.POST['test']
            print(request.POST['test'])
        except:
            csv_file1 = request.FILES["csv_file1"]
            c = csv_file1.read().decode("utf-8")
            ticker = c.replace('\r', ",", (c.count('\r') - 1)).replace(
                '\n', "").replace("\r", "")
        # The API provides company metadata: cik, ticker symbol,
        # entity id, market tier, etc.
        url = urllib.request.urlopen(
            'https://datafied.api.edgar-online.com/v2/companies?primarysymbols='
            + ticker + '&appkey=a76c61e85f9225192ce5cbbd0b22fb52').read()
        print(url)
        # Convert the JSON payload to a dictionary.
        list_of_data = json.loads(url)
        print(list_of_data)
        # totalrows == 0 means the ticker did not match EDGAR's database.
        y = int(list_of_data['result']['totalrows'])
        if y == 0:
            messages.success(
                request,
                "Unmatched Ticker Symbol or No Available Financial Data."
            )
            return redirect('EaganJones:company_list')  # show error on search page
        for i in range(0, y):
            data = {
                "cik": str(list_of_data['result']['rows'][i]['values'][0]['value']),
                "companyname": str(list_of_data['result']['rows'][i]['values'][1]['value']),
                "entityid": str(list_of_data['result']['rows'][i]['values'][2]['value']),
                "primaryexchange": str(list_of_data['result']['rows'][i]['values'][3]['value']),
                "marketoperator": str(list_of_data['result']['rows'][i]['values'][4]['value']),
                "markettier": str(list_of_data['result']['rows'][i]['values'][5]['value']),
                "primarysymbol": str(list_of_data['result']['rows'][i]['values'][6]['value']),
                "siccode": str(list_of_data['result']['rows'][i]['values'][7]['value']),
                "sicdescription": str(list_of_data['result']['rows'][i]['values'][8]['value']),
            }
            # NOTE(review): these rebind the module-style accumulators to
            # the LAST row's values on every iteration — confirm intent.
            companyname = data.get("companyname", "")
            cik = data.get("cik", "")
            primarysymbol = data.get("primarysymbol", "")
            markettier = data.get("markettier", "")
            sicdescription = data.get("sicdescription", "")
            # All API data goes into data_list for the detail page.
            data_list.append(data)
        # Split the user-entered tickers and build each company's EDGAR
        # 10-K profile URL.
        ticker_list = ticker.split(",")
        print("bu ticker list" + str(ticker.split(",")))
        for i in ticker_list:
            url2 = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + i + '&type=10-k&dateb=&owner=exclude&count=40'
            url_collection.append(url2)
            print("bunlar urller" + str(url2))
        print(url_collection)
        # Scraping starts from this url collection.
        b = []
        for z in url_collection:
            souped_link = make_soup(z)  # parse the link
            b.append(souped_link)
        print(b)
        # NOTE(review): debugging short-circuit — everything below this
        # return is unreachable.
        return HttpResponse("hi")
        # NOTE(review): `b` is a list here; b.find() would raise even if
        # the early return above were removed.
        table = b.find("table", {"class": "tableFile2"})
        indexlink_list = []  # links that contain 10-K filings
        for row in table.find_all("tr"):
            cells = row.findAll("td")
            # A filing row has exactly 5 cells; an empty table means the
            # company is registered but has no data.
            if len(cells) == 5:
                # Match the exact '10-K' type (a '10-K' search also pulls
                # '10-KA' rows).
                if cells[0].text.strip() == '10-K':
                    link = cells[1].find("a", {"id": "documentsbutton"})['href']
                    url = "https://www.sec.gov" + link
                    indexlink_list.append(url)
        indexlink = indexlink_list[0]  # latest 10-K filing link
        print(indexlink_list)
        souped_button = make_soup(indexlink)
        table2 = souped_button.find("div", {"id": "seriesDiv"})
        # Link behind the "interactive" button.
        tables_page = "https://www.sec.gov" + table2.find("a")["href"]
        souped_excel_button = make_soup(tables_page)
        excel_button = souped_excel_button.find("td").find_all("a")[1]['href']
        # Excel link from the "view excel document" button; the workbook
        # holds all data from the latest 10-K filing.
        excel_link = "https://www.sec.gov" + excel_button
        print(excel_link)
        excel_sheet_name = pd.ExcelFile(excel_link).sheet_names
        # The workbook has many sheets; the one we want is sometimes named
        # "cash flows statements", sometimes "consolidated statements of
        # cash" — pick whichever fuzzy match scores higher.
        print(excel_sheet_name)
        choice_one = process.extractOne("CASH FLOWS STATEMENTS", excel_sheet_name)
        choice_two = process.extractOne("CONSOLIDATED STATEMENTS OF CASH", excel_sheet_name)
        if choice_two[1] > choice_one[1]:
            cash_flows_sheet = choice_two[0]
        else:
            cash_flows_sheet = choice_one[0]
        print(choice_one)
        print(choice_two)
        df = pd.read_excel(excel_link, sheet_name=cash_flows_sheet, na_filter=False)
        print(df)
        html_table = df.to_html(index=False)
        json_table = df.to_json()
        print(json_table)
        # Persist and display the scraped table.
        rf = Companies.objects.get_or_create(cik=cik,
                                             primarysymbol=primarysymbol,
                                             companyname=companyname,
                                             jsonnn=json_table,
                                             table=html_table,
                                             markettier=markettier,
                                             sicdescription=sicdescription)
        print(rf)
        company_inf = Companies.objects.filter(
            primarysymbol__iexact=data.get("primarysymbol", ""))
        print(company_inf)
        context = {
            'data_list': data_list,
            'excel_link': excel_link,
            'html_table': html_table,
            'company_inf': company_inf
        }
        messages.success(request, "Data Parsed")
        return render(request, "company_list.html", context)
    else:
        Companies()
        return render(request, "company_list.html", {})
def merge_nursinghome_data(nyt, hifld, manual_merge_tab):
    '''
    Merge NYT nursing-home data with HIFLD facility data.

    Matching strategy, in order of preference:
      1. exact match on (Name, City, State);
      2. fuzzy name match (fuzz.WRatio >= 87) within the same City/State;
      3. the same with more aggressively cleaned (level-2) names;
      4. last-resort fuzzy match with plain fuzz.ratio;
      5. manual corrections from manual_merge_tab, then de-duplication.

    Parameters
    ----------
    nyt : cleaned nyt nursing homes data
    hifld : cleaned hifld nursing homes data
    manual_merge_tab : merge table with merges to manually correct

    Returns
    -------
    data frame with merged nursing homes data
    '''
    try:
        from fuzzywuzzy import process, fuzz
    except ImportError:
        sys.exit("""You need fuzzywuzzy. Install it from https://pypi.org/project/fuzzywuzzy/ or run pip install fuzzywuzzy.""")

    # clean names and cities for better merge
    nyt = clean_nh_cities(nyt)
    nyt = clean_nh_names(nyt, level = 1)
    nyt2 = clean_nh_names(copy.deepcopy(nyt), level = 2)  # more ambitious cleaning
    hifld = clean_nh_cities(hifld)
    hifld = clean_nh_names(hifld, level = 1)
    hifld2 = clean_nh_names(copy.deepcopy(hifld), level = 2)  # more ambitious cleaning

    # fuzzy merging: build one matched HIFLD Fid (or NaN) per NYT row
    matched_fid = []
    for i in range(nyt.shape[0]):
        name = nyt.loc[i, "Name"]
        name2 = nyt2.loc[i, "Name"]  # level-2 cleaned variant of the same row
        city = nyt.loc[i, "City"]
        state = nyt.loc[i, "State"]

        # get exact matches on name + city + state
        matched_all = hifld.loc[(hifld["Name"] == name) &\
                                (hifld["City"] == city) &\
                                (hifld["State"] == state)]
        if matched_all.shape[0] == 1:  # one exact match
            fid = matched_all.iloc[0]["Fid"]
        elif matched_all.shape[0] > 1:  # more than one exact match
            if matched_all.Name.iloc[0] == "CHRISTIAN HEALTH CARE CENTER":
                fid = 6658  # manual merge
            else:
                # NOTE(review): `fid` is NOT assigned in this branch, so the
                # append below reuses the previous iteration's value (or raises
                # NameError on the first row). Confirm this case never occurs
                # beyond the hard-coded facility above.
                print("Multiple exact matches for: " + name)
        else:  # if no exact match, do fuzzy matching
            # first try exact matching on city and state only
            hifld_matched = hifld.loc[(hifld["City"] == city) & (hifld["State"] == state)]
            if hifld_matched.shape[0] > 0:
                matched = process.extractOne(name, hifld_matched["Name"], scorer=fuzz.WRatio)
                if matched[1] >= 87:  # if meet threshold requirement, found match
                    matched_fids = hifld_matched.loc[hifld_matched["Name"] == matched[0]]
                else:
                    # try using names that are even more abbreviated/cleaned
                    hifld2_matched = hifld2.loc[(hifld2["City"] == city) & (hifld2["State"] == state)]
                    matched = process.extractOne(name2, hifld2_matched["Name"], scorer=fuzz.WRatio)
                    if matched[1] >= 87:  # if meet threshold requirement, found match
                        matched_fids = hifld2_matched.loc[hifld2_matched["Name"] == matched[0]]
                    else:
                        # finally try using different distance metric
                        matched = process.extractOne(name2, hifld2_matched["Name"], scorer=fuzz.ratio)
                        matched_fids = hifld2_matched.loc[hifld2_matched["Name"] == matched[0]]

                # get (a single) matched FID
                if matched_fids.shape[0] == 1:
                    fid = matched_fids["Fid"].iloc[0]
                else:  # if multiple matched FIDs
                    # NOTE(review): `not x == 0` parses as `not (x == 0)`, which
                    # for a boolean x is x itself -- so this branch actually runs
                    # when isna().all() is True (ALL populations NaN), the
                    # opposite of the trailing comment, and np.nanmax would then
                    # warn/raise on an all-NaN slice. Confirm intent.
                    if not matched_fids["Population"].isna().all() == 0:  # not all nans in pop field
                        # choose one with large population
                        fid = matched_fids.loc[matched_fids["Population"] ==\
                                               np.nanmax(matched_fids["Population"])]["Fid"].iloc[0]
                    else:  # all nans in population field
                        fid = matched_fids["Fid"].iloc[0]  # take first one
            else:  # do manual merge later
                fid = np.NaN
        matched_fid.append(fid)
    nyt["Matched FID"] = matched_fid

    # fix some nursing home matched FIDs manually
    for i in range(manual_merge_tab.shape[0]):
        name = manual_merge_tab.Name.iloc[i]
        fid = manual_merge_tab.FID.iloc[i]
        city = manual_merge_tab.City.iloc[i]
        state = manual_merge_tab.State.iloc[i]
        idx = (nyt.Name == name) & (nyt.City == city) & (nyt.State == state)
        # chained-indexing assignment; works here but noisy in newer pandas
        nyt["Matched FID"].loc[idx] = fid

    # take entry with max #cases to deal with duplicates in nyt
    nyt_duplicated_ls = []
    for fid in nyt["Matched FID"].loc[nyt["Matched FID"].duplicated()].unique():
        if fid == -999:  # presumably the "no match" sentinel; leave those rows alone
            continue
        nyt_duplicated = nyt.loc[nyt["Matched FID"] == fid]
        nyt_duplicated = nyt_duplicated.loc[nyt_duplicated["Cases_2020-05-11"] ==\
                                            np.max(nyt_duplicated["Cases_2020-05-11"])]
        nyt_duplicated_ls.append(nyt_duplicated)
        nyt = nyt.loc[nyt["Matched FID"] != fid]
    # NOTE(review): pd.concat raises if nyt_duplicated_ls is empty (no dupes) --
    # confirm duplicates always exist in practice.
    nyt_duplicated = pd.concat(nyt_duplicated_ls, axis = 0, sort = False)
    nyt = pd.concat([nyt, nyt_duplicated], axis = 0, sort = False)
    #nyt.to_csv("full_merge_table.csv", index=False)

    # merge with hifld
    # NOTE(review): astype(int) fails if any Matched FID is still NaN --
    # presumably manual_merge_tab assigns -999 to every unmatched row; verify.
    nyt["Matched FID"] = nyt["Matched FID"].astype(int)
    nyt = nyt.rename(columns = {"Name": "NYT Name", "City": "NYT City", "State": "NYT State"})
    nh = pd.merge(hifld, nyt, left_on = "Fid", right_on = "Matched FID", how = "right")
    nh = nh.replace(-999, np.NaN)  # restore NaN for the sentinel value
    return nh
"Armed Forces Europe": "AE", "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY", "RHODE ISLAND": "RI", "DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX", "MAINE": "ME" } states = list(state_to_code.keys()) print(fuzz.ratio('Python Package', 'PythonPackage')) print(process.extract('Mississippi', states)) print(process.extract('Mississipi', states, limit=1)) print(process.extractOne('Mississipi', states)) data.apply(find_state_code, axis=1) print('Before Correct State:\n', data['state']) data['state'] = data.apply(correct_state, axis=1) print('After Correct State:\n', data['state']) data.insert(5, 'State Code', np.nan) data['State Code'] = data.apply(fill_state_code, axis=1) print(data) # group by print('==============group by================') print(data.groupby('State Code')) print('All Columns:\n') print(data.groupby('State Code').sum()) print('Short Columns:\n')
def compare_and_find_best_match(self, player_name, team_player_tag_objs, category, player_position):
    """Resolve a scraped player name to the best-matching roster name.

    Runs a cascade of matchers, returning as soon as one is confident:
      1. Jaro distance == 1.0 (exact after cleaning);
      2. Jaro > JW_NINTY_SCORE with matching first characters;
      3. a single Jaro >= JW_ACCEPT_SCORE candidate that also passes the
         category/position context check;
      4. fuzzywuzzy extractBests at FULL_100_PERCENT_MATCH, or a single
         >= NINTY_PERCENT_MATCH candidate with context;
      5. token_set_ratio at 100 (optionally context-disambiguated) or
         >= EIGHTY_FIVE_PERCENT_MATCH with position compatibility;
      6. double-metaphone phonetic match;
      7. plain extractOne >= NINTY_PERCENT_MATCH, then exact-text and
         final fallback matchers.

    :param player_name: raw name scraped from the source page.
    :param team_player_tag_objs: dict mapping roster player name -> player
        object (objects expose at least a ``pos`` attribute).
    :param category: stat category used for context disambiguation.
    :param player_position: position string ('NA' means unknown).
    :return: matched roster name (a key of team_player_tag_objs) or None.
    """
    last_name = self._remove_start_end_commas(player_name.strip())

    # Stage 1-3 input: Jaro distance of the query against every roster name.
    jw_dist_dict = dict()
    for obj_player_name in team_player_tag_objs.keys():
        if len(obj_player_name.strip()) == 0:
            continue  # skip blank roster entries
        jw = distance.get_jaro_distance(last_name, obj_player_name,
                                        winkler=False, scaling=0.1)
        jw_dist_dict[obj_player_name] = jw
    # Highest Jaro score first.
    sorted_jw_dist_dict = sorted(jw_dist_dict.items(),
                                 key=operator.itemgetter(1), reverse=True)
    # Stage 1: perfect Jaro score wins outright.
    if len(sorted_jw_dist_dict) > 0 and float(
            sorted_jw_dist_dict[0][1]) == 1.0:
        return sorted_jw_dist_dict[0][0]

    # Stages 2-3: inspect only the five closest Jaro candidates.
    top_5_players = sorted_jw_dist_dict[:5]
    jw_list_above_75 = list()
    for top_5_player in top_5_players:
        # Stage 2: very high Jaro + same leading characters -> accept.
        if float(top_5_player[1]
                 ) > JW_NINTY_SCORE and self._compare_first_chars(
                     last_name, top_5_player[0]):
            return top_5_player[0]
        # Stage 3 candidates: acceptable Jaro + leading chars + context.
        if float(top_5_player[1]) >= JW_ACCEPT_SCORE and self._compare_first_chars(last_name, top_5_player[0]) and \
                self._compare_player_context(category, team_player_tag_objs.get(top_5_player[0]), player_position):
            jw_list_above_75.append(top_5_player)
    # Only accept stage 3 when it is unambiguous.
    if len(jw_list_above_75) == 1:
        return jw_list_above_75[0][0]

    # Stage 4: fuzzywuzzy's default scorer over the whole roster.
    matches_best = process.extractBests(last_name,
                                        team_player_tag_objs.keys(),
                                        limit=5)
    if len(matches_best) > 0 and int(
            matches_best[0][1]) == FULL_100_PERCENT_MATCH:
        return matches_best[0][0]
    mb_list_above_90 = list()
    for match_best in matches_best:
        if match_best[1] >= NINTY_PERCENT_MATCH and self._compare_first_chars(last_name, match_best[0]) and \
                self._compare_player_context(category, team_player_tag_objs.get(match_best[0]), player_position):
            mb_list_above_90.append(match_best)
    if len(mb_list_above_90) == 1:
        return mb_list_above_90[0][0]
    else:
        # Stage 5: token_set_ratio over the same five candidates.
        set_ratio_100_percent_list = list()
        set_ratio_90_percent_list = list()
        for mb_player in matches_best:
            set_ratio = fuzz.token_set_ratio(last_name, mb_player[0])
            if set_ratio == FULL_100_PERCENT_MATCH:
                set_ratio_100_percent_list.append(mb_player[0])
            # >= 85% also requires first-char and position compatibility.
            if set_ratio >= EIGHTY_FIVE_PERCENT_MATCH and self._compare_first_chars(
                    last_name, mb_player[0]):
                matched_player = team_player_tag_objs.get(mb_player[0])
                if matched_player.pos == 'NA' or player_position == 'NA' or matched_player.pos.lower() == player_position.lower() \
                        or self._is_positions_available_in_one_category(matched_player.pos, player_position):
                    set_ratio_90_percent_list.append(mb_player[0])
        if len(set_ratio_100_percent_list) == 1:
            return set_ratio_100_percent_list[0]
        elif len(set_ratio_100_percent_list) > 1:
            # Several 100% token-set matches: disambiguate with context.
            set_ratio_100_percent_context_list = list()
            for set_ratio_100_percent in set_ratio_100_percent_list:
                matched_player = team_player_tag_objs.get(
                    set_ratio_100_percent)
                if self._compare_first_chars(last_name, set_ratio_100_percent) and \
                        (matched_player.pos == 'NA' or player_position == 'NA' or matched_player.pos.lower() == player_position.lower()
                         or self._is_positions_available_in_one_category(matched_player.pos, player_position)):
                    set_ratio_100_percent_context_list.append(
                        set_ratio_100_percent)
            if len(set_ratio_100_percent_context_list) == 1:
                return set_ratio_100_percent_context_list[0]
        # Ambiguous 100% list falls through to the >= 85% list (first wins).
        if len(set_ratio_90_percent_list) > 0:
            return set_ratio_90_percent_list[0]

        # Stage 6: phonetic comparison of the remaining candidates.
        double_metaphone_match = self._compare_double_metaphone(
            last_name, matches_best)
        if double_metaphone_match is not None:
            return double_metaphone_match

        # Stage 7: last-resort single best fuzzy match over the roster.
        match_one = process.extractOne(last_name, team_player_tag_objs.keys())
        if match_one is not None and match_one[
                1] >= NINTY_PERCENT_MATCH and self._compare_first_chars(
                    last_name, match_one[0]):
            return match_one[0]
        # To handle 'Rahming, T.J.' and 'RAHMING,TJ', because first one is giving 3 token and second is returning 2 tokens,
        # after sorting order is getting disturbed.
        exact_match = self._compare_exact_text_match(last_name, match_one)
        if exact_match:
            return match_one[0]
        final_match = self._compare_final_match(last_name, matches_best)
        if final_match is not None:
            return final_match
        return None
def fuzzy_account(debug, dfl, dfr):
    # Match each account row of dfl (left) against dfr (right), preferring
    # exact name matches, then exact address matches, then a trigram-based
    # fuzzy search over combined name+address.  One result row per dfl row.
    #
    # NOTE: Python 2 code (print statements).
    #
    # :param debug: bool -- only changes the banner message text.
    # :param dfl: left DataFrame; needs Id, Country, NameStrip, AddressStrip.
    # :param dfr: right DataFrame; same columns as dfl (mutated: gains a
    #             'NameAddress' column, as does dfl).
    # :return: DataFrame with left/right ids, names, addresses, match
    #          statuses and probabilities.
    msg_1 = 'Searching for FUZZY Matches...'
    msg_2 = '(Debug Mode Active)\n' if debug else '(Debug Mode Inactive)\n'
    print '%s\n%s\n%s' % (msg_1, '~' * len(msg_1), msg_2)
    # Pre-compute the combined name+address search key on both sides.
    dfl['NameAddress'] = dfl['NameStrip'] + ' ' + dfl['AddressStrip']
    dfr['NameAddress'] = dfr['NameStrip'] + ' ' + dfr['AddressStrip']
    matching_records = []
    for idx, row in dfl.iterrows():
        print idx
        # Default result: nothing matched -> caller should create a new record.
        status_i = AccountMatch('No Name', 'N/A', 0, 'No Address', 'N/A', 0, -1, 'Create New')
        # Only consider right-hand rows from the same country.
        rhs_i = dfr[dfr['Country'] == row.Country]

        # Search for Exact Name Match
        print 'Searching For Name Matches...'
        name_matches = rhs_i[rhs_i['NameStrip'] == row.NameStrip].reset_index(drop=True)
        # Exact Name Match(es) Found
        if not name_matches.empty:
            print '...%d Exact Name Match(es) Found:\n' % len(name_matches)
            # Select Closest Address Match among the exact-name rows
            match = process.extractOne(row.AddressStrip, name_matches.AddressStrip)
            name_matches = name_matches[name_matches.AddressStrip == match[0]].reset_index(drop=True)
            status_i.update_id_action(name_matches['Id'].loc[0], 'Verify')
            status_i.toggle('Address', 'Partial Address', match[0], match[1])
            status_i.toggle('Name', 'Exact Name', name_matches['NameStrip'].loc[0], 100)
        # Exact Name Match Not Found, Search for Exact Address Match
        else:
            msg_4 = '...No Name Matches, Trying Address...'
            print '%s\n%s' % (msg_4, '-' * len(msg_4))
            address_matches = rhs_i[rhs_i['AddressStrip'] == row.AddressStrip].reset_index(drop=True)
            # Exact Address Found
            if not address_matches.empty:
                print '...%d Exact Address Match(es) Found:\n' % len(address_matches)
                # Select Closest Name Match among the exact-address rows
                match = process.extractOne(row.NameStrip, address_matches.NameStrip)
                address_matches = address_matches[address_matches.NameStrip == match[0]].reset_index(drop=True)
                status_i.update_id_action(address_matches[address_matches.NameStrip == match[0]]['Id'].loc[0], 'Verify')
                status_i.toggle('Name', 'Partial Name', match[0], match[1])
                status_i.toggle('Address', 'Exact Address', address_matches['AddressStrip'].loc[0], 100)
            else:
                # Neither exact name nor exact address: trigram fuzzy search.
                print 'Neither Name nor Address Found: '
                # Build all 3-character n-grams of the name, dropping ' '/'+'.
                trigrams = [''.join(i) for i in find_ngrams(row.NameStrip, 3)]
                trigrams = [i.replace(' ', '').replace('+', '') for i in trigrams if len(i) > 2]
                print trigrams
                # NOTE(review): the trigrams are joined into a regex
                # alternation; regex metacharacters in names would need
                # escaping -- confirm NameStrip is sanitised upstream.
                trigram_matches = rhs_i[rhs_i['NameStrip'].str.contains('|'.join(trigrams), na=False)]
                if not trigram_matches.empty:
                    print 'Trigram Matches Found'
                    trigram_matches['NameAddress'] = trigram_matches['NameStrip'] + ' ' + trigram_matches[
                        'AddressStrip']
                    print row.AddressStrip
                    # Best combined name+address candidate...
                    best_match = process.extractOne(row.NameAddress, trigram_matches['NameAddress'])
                    best = trigram_matches[trigram_matches['NameAddress'] == best_match[0]].reset_index(drop=True)
                    status_i.update_id_action(best['Id'].loc[0], 'Verify')
                    # ...then score its name and address parts separately.
                    best_name = process.extractOne(row.NameStrip, best['NameStrip'])
                    best_address = process.extractOne(row.AddressStrip, best['AddressStrip'])
                    print best_name
                    print best_address
                    status_i.toggle('Name', 'Partial Name', best_name[0], best_name[1])
                    status_i.toggle('Address', 'Partial Address', best_address[0], best_address[1])
                else:
                    print 'No Trigram Matches Found'
                    pass
        print row.Id, '|', row.NameStrip, '|', row.AddressStrip
        print status_i.id_best, '|', status_i.name_best, '|', status_i.address_best
        print status_i.name_status, status_i.name_prob, status_i.address_status, status_i.address_prob, '\n'
        matching_records.append([row.Id, status_i.id_best
                                 , row.NameStrip, status_i.name_best, status_i.name_status, status_i.name_prob
                                 , row.AddressStrip, status_i.address_best, status_i.address_status, status_i.address_prob])
    # Assemble the per-row records into the result frame.
    df = pd.DataFrame()
    df = df.append(matching_records)
    df.columns = ['Id_L', 'Id_R', 'Name_L', 'Name_R', 'NameStatus', 'NameProb',
                  'Address_L', 'Address_R', 'AddressStatus', 'AddressProb']
    return df
import csv with open("classnames.tsv") as f: reader = csv.reader(f, delimiter="\t") real_names = set(row[0] for row in reader) sources = { "small-07": "124M-01-07-500.txt", "small-08": "124M-01-08-500.txt", "small-09": "124M-01-09-500.txt", "medium-07": "355M-01-07-1000.txt", "medium-08": "355M-01-08-1000.txt", "medium-09": "355M-01-09-1000.txt" } fake_names = {} for source, file in sources.items(): with open(file) as f: for name in [l.strip() for l in f.readlines()]: match, score = process.extractOne(name, real_names, scorer=fuzz.ratio) if score < 80: fake_names[name] = source with open("fakeclasses.tsv", "w") as f: writer = csv.writer(f, delimiter="\t") for name, source in fake_names.items(): writer.writerow([name, source])
def recommender_final(movie_name):
    """Return [titles, scores] recommendations for a fuzzy-matched movie title.

    Uses the keyword/tag cosine-similarity matrix when the matched title
    exists in ``movies_with_tags``; otherwise falls back to the user-ratings
    KNN model, which is less accurate but covers a larger movie universe.
    """
    # Resolve the user's query against the full catalogue first.
    full_movie_name = process.extractOne(movie_name, df_movies['title'])[0]

    if (movies_with_tags['title'] == full_movie_name).any():
        # Keyword path: rank every movie by cosine similarity to the query.
        # extractOne on a Series also yields the matched row's index at [2].
        idx = process.extractOne(movie_name, movies_with_tags['title'])[2]
        ranked = sorted(enumerate(cosine_sim[idx]),
                        key=lambda pair: pair[1], reverse=True)
        top = ranked[0:6]
        titles = [get_title_from_index(pos) for pos, _ in top]
        # Express each similarity as a percentage, rounded to 2 decimals.
        scores = [round(sim * 100, 2) for _, sim in top]
        return [titles, scores]

    # Ratings fallback: nearest neighbours in the user-rating space.
    idx = process.extractOne(movie_name, df_movies['title'])[2]
    distances, indices = model_knn.kneighbors(mat_movies_users[idx])
    titles = [df_movies['title'][i] for i in indices[0]]
    # Convert each distance to a percentage-style closeness score.
    scores = [round(100 - d * 100, 2) for d in distances[0]]
    return [titles, scores]
def showmedia_site(message, app_string, site_string): log_message_env(message) if sdk.tenant_id: # get list of apps. appdef_n2id = idname.generate_appdefs_map(key_val='display_name', value_val='id') app_list = appdef_n2id.keys() # get list of sites. sites_n2id = idname.generate_sites_map(key_val='name', value_val='id') site_list = sites_n2id.keys() # fuzzy match app_choice, app_percent = process.extractOne(app_string, app_list) site_choice, site_percent = process.extractOne(site_string, site_list) # perfect match, just get.. if app_percent == 100 and site_percent == 100: message.react(GOOD_RESPONSE) app_id = appdef_n2id[app_choice] site_id = sites_n2id[site_choice] # good guess match.. elif app_percent > 50 and site_percent > 50: message.react(GOOD_RESPONSE) message.reply( "I think you meant *{0}* at *{1}*, looking that up..".format( app_choice, site_choice)) app_id = appdef_n2id[app_choice] site_id = sites_n2id[site_choice] # if only one is good, or both are bad. else: message.react(BAD_RESPONSE) if app_percent <= 50: message.reply( "I couldn't find a media application that matched what you asked for ({0}). " "Try asking me about \"What apps are there?\".".format( app_string)) if site_percent <= 50: message.reply( "I couldn't find a site that matched what you asked for ({0}). " "Try asking me about \"What sites are there?\".".format( site_string)) return # Figure out the links/do all the work now. attachments = render_site_media_paths(app_id, site_id, sdk, global_id2n) # check if successful, add title if attachments[0].get('pretext') != "Sorry, couldn't query the media application info for this site at the " \ "moment. Please try later.": message.reply("*Path status for {0} at {1}:*".format( app_choice, site_choice)) # now, send it message.send_webapi('', json.dumps(attachments)) else: message.react(BAD_RESPONSE) message.send(CGX_API_ERROR_MSG)
def _attach_legistar_details_to_event(
    event: Dict[str, Any],
    ignore_minutes_items: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Query for and attach the best matching legistar event information to the provided event details.

    Parameters
    ----------
    event: Dict[str, Any]
        The parsed event details from the SeattleChannel website.
    ignore_minutes_items: Optional[List[str]]
        A list of minute item names to ignore when parsing the minutes items from legistar.
        Useful for minute items that are so commonly used they lack specific value.

    Returns
    -------
    joined: Dict[str, Any]
        The base event details object combined with the found legistar data.
        NOTE(review): returns None (not a Dict) when no legistar body name is
        at least a 50% match -- callers must handle that case.
    """
    # Get all legistar events surrounding the provided event date
    legistar_events = legistar_event_tools.get_legistar_events_for_timespan(
        "seattle",
        event["event_datetime"],
        event["event_datetime"] + timedelta(days=1)
    )
    log.debug("Pulled legistar details for event: {}".format(event["source_uri"]))

    # Fast return for only one event returned
    if len(legistar_events) == 1:
        selected_event = legistar_events[0]
    else:
        # Reduce events to not include cancelled events
        cancelled_reduced = [e for e in legistar_events if e["EventAgendaStatusName"] != "Cancelled"]

        # Get body names
        available_bodies = set([e["EventBodyName"] for e in cancelled_reduced])

        # Check if the Seattle Channel body name (basically a "display name") is present in the list
        # If so, choose the events with that exact body name
        if event["body"] in available_bodies:
            legistar_events = [e for e in cancelled_reduced if e["EventBodyName"] == event["body"]]
        # No exact match available, find the closest body name by text diff
        else:
            # Returns the closest name and the score that made it the closest
            # NOTE(review): if available_bodies is empty (e.g. every event was
            # cancelled or none were returned), extractOne yields None and this
            # unpack raises TypeError -- confirm upstream guarantees events.
            closest_body_name, score = process.extractOne(event["body"], available_bodies)

            # For reasons somewhat unknown to me, SeattleChannel has videos for events that don't exist in legistar
            # We can somewhat detect this by filtering out body names that are drastically different
            # In the case that the closest body name is less than a 50% match, return None to be cleaned up after
            # The body names shouldn't be _that_ different which is why we are just ignoring for now
            if score < 50:
                return None

            # Otherwise, use the found body name
            legistar_events = [e for e in cancelled_reduced if e["EventBodyName"] == closest_body_name]

        # Run agenda matching against the events
        agenda_match_details = legistar_event_tools.get_matching_legistar_event_by_minutes_match(
            event["minutes_items"],
            legistar_events
        )

        # Add the details
        selected_event = agenda_match_details.selected_event

    # Parse details
    if ignore_minutes_items is None:
        ignore_minutes_items = []
    parsed_details = legistar_event_tools.parse_legistar_event_details(selected_event, ignore_minutes_items)

    # Format the event details: legistar fields plus the SeattleChannel URIs
    formatted_event_details = {
        **parsed_details,
        "source_uri": event["source_uri"],
        "video_uri": event["video_uri"],
        "caption_uri": event["caption_uri"]
    }

    log.debug("Attached legistar event details for event: {}".format(formatted_event_details["source_uri"]))
    return formatted_event_details
def enqueue_audio_playlist(self, arg): """Add all audio tracks in a Plex playlist to the playback queue. :param arg: a playlist search term """ logging.info("arg : %s", arg) print_msg("[Plex] [Playlist search in server] : '{0}'. ".format( self.base_url)) try: count = len(self.queue) playlist_title = "" playlist = None try: playlist = self._plex.playlist(title=arg) if playlist: playlist_title = playlist.title print_wrn("[Plex] Playing '{0}'.".format(playlist_title)) for item in list(playlist.items()): if item.TYPE == "track": track = item track_info = TrackInfo(track, track.artist(), track.album()) self._add_to_playback_queue(track_info) if count == len(self.queue): print_wrn( "[Plex] '{0}' No audio tracks found.".format( playlist_title)) raise ValueError except (NotFound): pass if count == len(self.queue): playlist_dict = dict() playlist_titles = list() playlists = self._plex.playlists() for pl in playlists: playlist_titles.append(pl.title) playlist_dict[pl.title] = pl if len(playlist_titles) > 1: playlist_title = process.extractOne(arg, playlist_titles)[0] playlist = playlist_dict[playlist_title] elif len(playlist_titles) == 1: playlist_title = playlist_titles[0] playlist = playlist_dict[playlist_title] if playlist: print_adv("[Plex] '{0}' not found. " "Playing '{1}' instead.".format( arg, playlist_title)) for item in list(playlist.items()): if item.TYPE == "track": track = item track_info = TrackInfo(track, track.artist(), track.album()) self._add_to_playback_queue(track_info) if count == len(self.queue): print_wrn( "[Plex] '{0}' No audio tracks found.".format( playlist_title)) self._finalise_play_queue(count, arg) except (ValueError, NotFound): raise ValueError( str("Playlist not found or no audio tracks in playlist : %s" % arg))
def fuzzyexact(df_left, df_right, id_col=None, key=None, block1=None,
               block2=None, threshold=80):
    '''Fuzzy match rows of df_left against rows of df_right.

    A composite match key is built for each frame by concatenating the
    whitespace-stripped values of the columns named in `key`.  Any number of
    key columns is supported (generalising the previous hard-coded 1-4 column
    if/elif ladder).  Matching can optionally be "blocked" on exact equality
    of one or two columns to shrink the fuzzy search space.

    Parameters
    ----------
    df_left, df_right : pandas.DataFrame
        Frames to match.  Both gain a new 'key' column (mutated in place, as
        before).
    id_col : str, optional
        Column of df_right appended to the result for easy lookup.
    key : list of str
        Column names used to build the match key (required).
    block1, block2 : str, optional
        Columns that must match exactly before fuzzy comparison.  Fix: giving
        only block2 previously raised UnboundLocalError; it now blocks on
        block2 alone.
    threshold : int
        Minimum fuzzywuzzy score (0-100) for a match to be kept.

    Returns
    -------
    pandas.DataFrame
        df_left plus 'Match' and 'Score' columns (empty strings where no
        candidate met the threshold), plus id_col when requested.
        NOTE: results are aligned positionally, so df_left is assumed to have
        a default RangeIndex (unchanged from the original behaviour).

    Raises
    ------
    ValueError
        If `key` is None or empty (previously an opaque TypeError).
    '''
    if not key:
        raise ValueError("'key' must be a non-empty list of column names")

    # Build the concatenated, space-stripped match key for both frames.
    df_left['key'] = _concat_key(df_left, key)
    df_right['key'] = _concat_key(df_right, key)

    # run fuzzy matching
    matched = {'Match': [], 'Score': []}
    for index, row in df_left.iterrows():
        # Restrict the candidate pool with the blocking columns, if any.
        if block1 is not None and block2 is not None:
            df_right_reduced = df_right[(df_right[block1] == row[block1])
                                        & (df_right[block2] == row[block2])]
        elif block1 is not None:
            df_right_reduced = df_right[df_right[block1] == row[block1]]
        elif block2 is not None:
            # Fix: this combination previously left df_right_reduced unbound.
            df_right_reduced = df_right[df_right[block2] == row[block2]]
        else:
            df_right_reduced = df_right.copy()

        # Best candidate at or above the threshold, else record blanks.
        match = None
        if len(df_right_reduced.index) > 0:
            match = process.extractOne(row['key'], df_right_reduced['key'],
                                       score_cutoff=threshold)
        if match is not None:
            matched['Match'].append(match[0])
            matched['Score'].append(match[1])
        else:
            matched['Match'].append('')
            matched['Score'].append('')
    matched = pd.DataFrame(matched)
    finl = pd.concat([df_left, matched], axis=1)

    # append ID column from df_right to allow for easy lookup
    if id_col is not None:
        ids = df_right.copy()
        ids = ids[['key', id_col]]
        finl = finl.merge(ids, left_on='Match', right_on='key', how='left',
                          suffixes=('', '_y'))
        finl.drop(['key_y'], axis=1, inplace=True)
    return finl


def _concat_key(df, cols):
    '''Concatenate the space-stripped string values of `cols` into one Series.'''
    out = df[cols[0]].str.replace(' ', '')
    for col in cols[1:]:
        out = out + df[col].str.replace(' ', '')
    return out
def guess_id(self, name): """@return: id, name, score""" name, score = process.extractOne(name, self._all_name_list) return self._roster[name], name, score
def find_potential_checkouts_v2(df_chkout, stmt_amt, stmt_bank, stmt_desc):
    # Match one bank-statement line (amount, bank category, free-text
    # description) against candidate checkout records.  Returns a 10-tuple:
    # (chkoutid, pmax_name, pmax_score, chkout_candidates,
    #  p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores)
    # where chkoutid is the matched id, a diagnostic string
    # ('Amount / Bank wrong' / 'Many names found'), or None, and the p*
    # lists hold fuzzy-score diagnostics under three different scorers.
    #
    # Definitions:
    ################
    # _ab : subset of amt & bank
    # _abn: subset of amt, bank & exact name
    chkoutid, pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None, None

    # Step 1:
    # Filter potential checkouts by proof amount & bank
    ################
    potential_chkouts_ab = df_chkout[
        (df_chkout['proof_amount'] == stmt_amt)
        & (df_chkout['[A] script_bank_cat'] == stmt_bank)]
    #
    # Step 2: Further filter potential checkouts if proof cust name is in description
    if len(potential_chkouts_ab.index) == 0:
        # Situation 1: No Amt Bank match
        chkoutid = 'Amount / Bank wrong'
        pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None
    else:
        # Situation 2: Amt & Bank match, proceed to confirm using name (proof name ~ stmt desc)
        # NOTE(review): assignment on a slice of df_chkout -- pandas may emit
        # SettingWithCopyWarning here; confirm callers don't rely on the
        # cleaned column propagating (or not) back to df_chkout.
        potential_chkouts_ab[
            '[B] proof_cust_name_clean'] = potential_chkouts_ab[
                '[B] proof_cust_name_clean'].fillna('').str.lower(
                ).str.replace(
                    '\"', '')  # (1) fills na with un-matchable name (2) cleans it
        # Rows whose cleaned customer name appears verbatim in the description.
        potential_chkouts_abn = potential_chkouts_ab[potential_chkouts_ab[
            '[B] proof_cust_name_clean'].map(lambda x: x in stmt_desc)]
        #
        if len(potential_chkouts_abn.index) == 1:
            # Situation 2a: Single match using Amt, Bank & exact Name
            chkoutid = potential_chkouts_abn['checkoutid'].item(
            )  # use subset of amount, bank & name
            pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None
        elif len(potential_chkouts_abn.index) == 0:
            ########################
            ### WORK IN PROGRESS ###
            ########################
            # Situation 2b: (amt & bank --> some candidates, no exact match with name --> 2 options: possibility of approx match / no match at all)
            chkout_candidates = potential_chkouts_ab['checkoutid'].tolist(
            )  # check subset of amount & bank
            # Best approximate name (token_set_ratio, minimum score 50).
            pmax = process.extractOne(
                stmt_desc,
                potential_chkouts_ab['[B] proof_cust_name_clean'].tolist(),
                scorer=fuzz.token_set_ratio,
                score_cutoff=50)
            if pmax is None:
                pmax_name, pmax_score = None, None
            else:
                pmax_name = pmax[0]
                pmax_score = pmax[1]
            try:
                # .item() raises ValueError if several rows share the best
                # name -- treated as "no confident match" below.
                chkoutid = potential_chkouts_ab[
                    potential_chkouts_ab['[B] proof_cust_name_clean'] ==
                    str(pmax_name)]['checkoutid'].item()
            except ValueError:
                chkoutid = None
            # Diagnostic score lists under three scorers (default, sort, set).
            p0 = list(
                process.extractWithoutOrder(
                    stmt_desc,
                    potential_chkouts_ab['[B] proof_cust_name_clean'].tolist())
            )
            p0_names = [x[0] for x in p0]
            p0_scores = [x[1] for x in p0]
            p1 = list(
                process.extractWithoutOrder(
                    stmt_desc,
                    potential_chkouts_ab['[B] proof_cust_name_clean'].tolist(),
                    scorer=fuzz.token_sort_ratio)
            )  # Note: this is using token_sort_ratio
            p1_names = [x[0] for x in p1]
            p1_scores = [x[1] for x in p1]
            p2 = list(
                process.extractWithoutOrder(
                    stmt_desc,
                    potential_chkouts_ab['[B] proof_cust_name_clean'].tolist(),
                    scorer=fuzz.token_set_ratio)
            )  # Note: this is using token_set_ratio
            p2_names = [x[0] for x in p2]
            p2_scores = [x[1] for x in p2]
            ########################
            ### WORK IN PROGRESS ###
            ########################
        else:
            # More than one exact-name candidate: ambiguous, flag for review.
            chkoutid = 'Many names found'
            pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None
    #
    return (chkoutid, pmax_name, pmax_score, chkout_candidates, p0_names,
            p0_scores, p1_names, p1_scores, p2_names, p2_scores)
print('string1 ="'+ string1+'"') print('string2 ="'+ string2+'"') print('string3 ="'+ string3+'"') print('The difference between string1 and string2 is:', fuzz.ratio(string1, string2)) print('The difference between string2 and string1 is:', fuzz.ratio(string2, string3)) print('The difference between string2 and string 3 is: ', fuzz.ratio(string2, string3)) print('The partial difference between string1 and string2 is:', fuzz.partial_ratio(string1, string2)) print('The partial difference between string2 and string1 is:', fuzz.partial_ratio(string2, string3)) print('The partial difference between string2 and string 3 is: ', fuzz.partial_ratio(string2, string3)) print('Example 2 from datacamp.com: ') Str1 = "The supreme court case of Nixon vs The United States" Str2 = "Nixon v. United States" Ratio = fuzz.ratio(Str1.lower(),Str2.lower()) Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower()) Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2) Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2) print('simple ratio', Ratio) print('partial ratio', Partial_Ratio) print('sorted token ratio', Token_Sort_Ratio) print('set token ratio', Token_Set_Ratio) print('Example 3 for process module: ') str2Match = "apple inc" strOptions = ["Apple Inc.","apple park","apple incorporated","iphone"] Ratios = process.extract(str2Match,strOptions) print(Ratios) # You can also select the string with the highest matching percentage highest = process.extractOne(str2Match,strOptions) print(highest)
def fuzzy(media, lib, scorer=fuzz.QRatio): """ Use Fuzzy Wuzzy to return highest scoring item. """ if isinstance(lib, list) and len(lib) > 0: return fw.extractOne(media, lib, scorer=scorer) else: return ["", 0]
attribute3 = rowlist[4] attribute2 = rowlist[3] if (attribute2 != " number"): #Switch with attribute 1 #IF THERE IS AN EXACT MATCH for entity in entity_list: if (attribute1 == entity): continue if (attribute3 == entity): continue else: Ratios = process.extract(attribute1, entity_list) highest = process.extractOne(attribute1, entity_list) rowlist[2] = highest[0] # print("MATCH", highest[0], attribute1) # Ratios = process.extract(attribute2,entity_list) # highest = process.extractOne(attribute2,entity_list) # rowlist[3] = highest[0] # print("MATCH", highest[0], attribute2) Ratios = process.extract(attribute3, entity_list) highest = process.extractOne(attribute3, entity_list) rowlist[4] = highest[0] # print("MATCH", highest[0], attribute3) # i+=1 print(rowlist)
def lerArquivos(request):
    """Read the domain and e-mail CSV lists, classify each e-mail as having a
    valid or misspelled domain, fuzzy-correct the misspelled domains, and
    return summary counts.

    :param request: HTTP request; ``request.POST['email']`` may carry one extra
        e-mail to append to the list before processing.
    :return: dict with per-domain correction counts (``countDomains``) and the
        totals ``totalEmails`` / ``totalCertos`` / ``totalErrados`` plus the
        appended e-mail (``emailAdicionado``).
    """
    dominios = 'C:/xampp/htdocs/desafio_emails/domain_list.csv'
    emails = 'C:/xampp/htdocs/desafio_emails/email_list.csv'
    dadosDominios = []
    novoDadosDominios = []
    dadosEmails = []
    novosDadosEmails = []
    lista_dominios = pd.read_csv(dominios, names=['Domain'])
    lista_emails = pd.read_csv(emails, names=["E-mails"])
    total = lista_emails.shape[0]
    # Optionally append one user-supplied e-mail before processing.
    if request.POST['email'] != "":
        emailAdicionado = request.POST['email']
        lista_emails.loc[total + 1] = emailAdicionado
        total = lista_emails.shape[0]
    else:
        emailAdicionado = ""
    # Flatten both single-column frames into plain lists, stripping quotes.
    for (i, row) in lista_dominios.itertuples():
        dadosDominios.append(row)
    for dominio in dadosDominios:
        itemDominio = dominio
        for y in ["'"]:
            item = itemDominio.replace(y, "")
        novoDadosDominios.append(item)
    for (j, linha) in lista_emails.itertuples():
        dadosEmails.append(linha)
    for emailList in dadosEmails:
        itemEmail = emailList
        for z in ["'"]:
            addEmail = itemEmail.replace(z, "")
        novosDadosEmails.append(addEmail)
    # Split e-mails into those whose domain is known and those whose is not.
    listEmailCerto = []
    listEmailErrado = []
    for i in novosDadosEmails:
        resultado = i.split("@")
        email = i
        if resultado[1] in novoDadosDominios:
            listEmailCerto.append(email)
        else:
            listEmailErrado.append(email)
    totalErrados = len(listEmailErrado)
    totalCertos = len(listEmailCerto)
    corrigidos = []
    gmail = []      # bucket 0
    hotmail = []    # bucket 1
    hotmailBr = []  # bucket 2
    hotmailMX = []  # bucket 3
    hotmailAr = []  # bucket 4
    msn = []        # bucket 5
    # Fuzzy-correct each bad domain to the closest known one and bucket it.
    for i in listEmailErrado:
        resultadoErrado = i.split("@")
        z = process.extractOne(resultadoErrado[1], novoDadosDominios,
                               scorer=fuzz.token_sort_ratio)
        resultadoErrado[1] = z[0]
        corrigidos.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        if z[0] == 'gmail.com':
            gmail.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'hotmail.com.br':
            hotmailBr.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'hotmail.com.mx':
            hotmailMX.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'hotmail.com.ar':
            hotmailAr.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'msn.com':
            msn.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        else:
            hotmail.append(resultadoErrado[0] + '@' + resultadoErrado[1])
    # Build {login: [login_length, domain]} summaries for both groups.
    # (Removed dead writes `login[1] = ""` — the lists were discarded anyway.)
    dictErrado = {}
    dictCerto = {}
    for emailCorrigido in corrigidos:
        login = emailCorrigido.split("@")
        tamLogin = len(login[0])
        dictErrado[login[0]] = [tamLogin, login[1]]
    dataFrameErrados = pd.DataFrame(data=dictErrado)
    datasetErrados = dataFrameErrados.T.reset_index()
    # Encode each domain as a small integer so the groupby output is compact.
    for (i, row) in datasetErrados[1].iteritems():
        if row == "gmail.com":
            datasetErrados[1][i] = 0
        if row == "hotmail.com":
            datasetErrados[1][i] = 1
        if row == "hotmail.com.br":
            datasetErrados[1][i] = 2
        if row == "hotmail.com.mx":
            datasetErrados[1][i] = 3
        if row == "hotmail.com.ar":
            datasetErrados[1][i] = 4
        if row == "msn.com":
            datasetErrados[1][i] = 5
    groupByErrados = datasetErrados.groupby(
        [datasetErrados[0], datasetErrados[1]], as_index=False).size()
    print(groupByErrados)
    for emailCerto in listEmailCerto:
        loginCerto = emailCerto.split("@")
        tamLoginCerto = len(loginCerto[0])
        dictCerto[loginCerto[0]] = [tamLoginCerto, loginCerto[1]]
    dataFrameCerto = pd.DataFrame(data=dictCerto)
    datasetCerto = dataFrameCerto.T.reset_index()
    for (a, rows) in datasetCerto[1].iteritems():
        if rows == "gmail.com":
            datasetCerto[1][a] = 0
        if rows == "hotmail.com":
            datasetCerto[1][a] = 1
        if rows == "hotmail.com.br":
            datasetCerto[1][a] = 2
        if rows == "hotmail.com.mx":
            datasetCerto[1][a] = 3
        if rows == "hotmail.com.ar":
            datasetCerto[1][a] = 4
        if rows == "msn.com":
            datasetCerto[1][a] = 5
    print(datasetCerto.groupby([datasetCerto[0], datasetCerto[1]]).size())
    # plotarGraficos(domains, countDomains, descricaoX, descricaoY)
    # plotarGraficos(dictCerto.values(), descricaoX, domains, 'Tamanho do login')
    # Bug fix: removed a stray, syntactically broken fragment of commented-out
    # code ("'countDomains': countDomains,'''") and the duplicated
    # countDomains assignment that surrounded it.
    countDomains = [len(gmail), len(hotmail), len(hotmailBr),
                    len(hotmailMX), len(hotmailAr), len(msn)]
    return {'countDomains': countDomains,
            'totalEmails': total,
            'totalCertos': totalCertos,
            'totalErrados': totalErrados,
            'emailAdicionado': emailAdicionado}
def sanitize_command(command):
    """Return the entry of the global ``commands`` list closest to *command*.

    Returns the empty string when the best fuzzy match scores 0 (no overlap
    at all).
    """
    sanitized = process.extractOne(command, commands)
    if sanitized[1] == 0:
        # Bug fix: the original assigned '' here but fell through and
        # implicitly returned None instead of the empty string.
        return ''
    return sanitized[0]
# (continuation) tail of the food keyword list — its opening "keys = ["
# lies before this excerpt.
'milk', 'spaghetti', 'ramen', 'steak', 'drink', 'bread', 'potato',
    'barbecue', 'wings', 'burrito', 'pasta', 'pizza', 'vegetable', 'burger',
    'hot dog', 'chicken', 'fish', 'rice'
]
# Cooking-time values aligned index-for-index with the keys above.
values = [
    40, 90, 80, 140, 40, 40, 70, 140, 115, 85, 90, 75, 50, 105, 115, 85,
    120, 90
]
items = {k: v for k, v in zip(keys, values)}
# print the plaintext to screen for convenience
i = len(resp['labelAnnotations']) - 1
bestmatch = "dish"
# Walk the label annotations from last to first; later assignments overwrite
# earlier ones, so the surviving bestmatch is the FIRST annotation whose
# fuzzy score against the keys reaches 70.
while (i >= 0):
    t = resp['labelAnnotations'][i]
    result = process.extractOne(t['description'], keys)
    if result[1] >= 70:
        bestmatch = result[0]
    i = i - 1
# print("They are ")
first_string = bestmatch + ";"
# print("time is")
if bestmatch != "dish":
    foodtime = items[bestmatch]
else:
    foodtime = 80  # fallback cooking time when no label matched
#save the size response to a size.json file
for idx, resp in enumerate(sizeresponse.json()['responses']):
    # save to JSON file
    imgname = image_filenames[idx]
    jpath = join(RESULTS_DIR, basename(imgname) + 'size' + '.json')
    # NOTE(review): this loop body continues beyond this excerpt.
def process_urban(text, streets, cities, threshold_city=70,
                  threshold_street=50, ratio=0.85):
    """Resolve free *text* to an :class:`UrbanAddress` (city + street).

    First fuzzy-matches the city (``fuzz.partial_ratio`` above
    *threshold_city*), then scores that city's candidate streets with a blend
    of textual similarity and normalized average accident count.

    :param text: free-form location text.
    :param streets: DataFrame with columns ``city``, ``street1``, ``street2``,
        ``avg_accidents``, ``yishuv_name``, ``street1_hebrew``,
        ``street2_hebrew`` (schema inferred from usage below).
    :param cities: iterable of known city names.
    :param ratio: weight of textual similarity vs. accident frequency.
    :return: UrbanAddress (possibly city-only) or None when no city matched.
    """
    text = preprocess_urban_text(text, cities)
    suspected_city = process.extractOne(text, cities,
                                        scorer=fuzz.partial_ratio,
                                        score_cutoff=threshold_city)
    if suspected_city is not None:
        # extractOne returns (match, score); keep only the city name.
        suspected_city = suspected_city[0]
        streets_in_city = streets.loc[streets.city == suspected_city]
        relevant_streets_1 = streets_in_city.loc[
            (streets_in_city.street1 != 'NaN')].street1
        relevant_streets_2 = streets_in_city.loc[
            (streets_in_city.street2 != 'NaN')].street2
        # pd.concat replaces the deprecated Series.append (removed in
        # pandas 2.x); result is identical.
        relevant_streets = pd.concat(
            [relevant_streets_1, relevant_streets_2]).drop_duplicates()
        relevant_streets_scores = relevant_streets.apply(
            lambda x: streets_in_city.loc[(streets_in_city.street1 == x) | (
                streets_in_city.street2 == x)].avg_accidents.max())
        relevant_streets = pd.DataFrame({
            'street': relevant_streets.tolist(),
            'avg_accidents': relevant_streets_scores.tolist()
        })
        suspected_streets = process.extract(
            text,
            list(set(relevant_streets.street.dropna().tolist())),
            scorer=fuzz.token_set_ratio,
            limit=3)
        if len(suspected_streets) > 0:
            relevant_streets_scores = relevant_streets.loc[
                relevant_streets.street.isin([
                    candidate[0] for candidate in suspected_streets
                ])].copy()
            # Normalize accident counts to [0, 1] within the candidate set.
            relevant_streets_scores.avg_accidents = (
                relevant_streets_scores.avg_accidents /
                relevant_streets_scores.avg_accidents.max()).copy()

            def _combined_score(street_name):
                # Weighted blend of textual similarity and accident frequency.
                # Bug fix: the original concatenated suspected_city[0] — the
                # FIRST CHARACTER of the city name (it had already been reduced
                # to a string above) — instead of the full city name.
                similarity = fuzz.token_set_ratio(
                    text, suspected_city + ' ' + street_name)
                accidents = relevant_streets_scores.loc[
                    relevant_streets_scores.street ==
                    street_name].avg_accidents.iloc[0]
                return (ratio * similarity) + ((1 - ratio) * 100 * accidents)

            suspected_streets = [
                (candidate[0], _combined_score(candidate[0]))
                for candidate in suspected_streets
                if candidate is not None
                and _combined_score(candidate[0]) > threshold_street
            ]
            if len(suspected_streets) > 0:
                suspected_street = max(suspected_streets, key=lambda x: x[1])
                suspected_street = suspected_street[0]
                if suspected_street in streets_in_city.street1.tolist():
                    suspected_street = streets_in_city.loc[
                        streets_in_city.street1 == suspected_street].iloc[0]
                    return UrbanAddress(
                        city=suspected_street.yishuv_name,
                        street=suspected_street.street1_hebrew)
                else:
                    suspected_street = streets_in_city.loc[
                        streets_in_city.street2 == suspected_street].iloc[0]
                    return UrbanAddress(
                        city=suspected_street.yishuv_name,
                        street=suspected_street.street2_hebrew)
        # City matched but no street cleared the threshold: city-only address.
        return UrbanAddress(city=streets.loc[
            streets.city == suspected_city].yishuv_name.iloc[0])
    return None
def fuzzy(sentence):
    """Return the entry of the global ``words`` list best matching *sentence*.

    Uses fuzzywuzzy's default scorer via ``process.extractOne``.
    """
    # Cleanup: dropped an unused process.extract() call and the pointless
    # str2match/strOptions aliases — they duplicated the lookup below.
    highest = process.extractOne(sentence, words)
    return highest[0]
def get_stat_df(live_team, refresh):
    """Assemble the dashboard payloads (odds table, live-stats table, status
    text and prediction graph) for the live game of *live_team*.

    :param live_team: string shaped like ``"[championship] team…"`` — the
        championship is read from between the brackets, the team name after
        the first space.
    :param refresh: unused in this body — presumably a dash callback trigger;
        confirm against the caller.
    :return: 6-tuple (odds columns, odds records, stats columns, stats
        records, status text, graph figure dict).
    """
    championship = live_team[1:live_team.find(']')]
    live_team = live_team[live_team.find(' ') + 1:].rstrip()
    # NOTE(review): championship_url is never read below.
    championship_url = MATCHENDIRECT_URLS_DICT[championship]
    to_return1 = ([], [])
    to_return2 = ([], [])
    to_return3 = ''
    to_return4 = {}
    # Module-level state shared across callback invocations.
    global x_y
    x_y = [[], []]
    text_to_add = ''
    game_score = ''
    good_game = None
    game_name = ''
    whole_df = []
    global final_time
    final_time = None
    # Find the bet page for this team and pull its odds table.
    for try_game in fetch_bet_urls(BET_URLS_DICT[championship]):
        if live_team in get_game_teams(try_game):
            good_game = try_game
            whole_df = get_odds(good_game)
            if whole_df.empty:
                whole_df.insert(loc=0,
                                column=get_game_name(good_game),
                                value='Côtes indisponibles',
                                allow_duplicates=True)
                text_to_add = " On ne trouve pas les côtes du match en question !"
                to_return1 = ([{
                    'name': col,
                    'id': col
                } for col in whole_df.columns], whole_df.to_dict('records'))
            else:
                whole_df.insert(loc=0,
                                column=get_game_name(good_game),
                                value='Côtes :',
                                allow_duplicates=True)
                to_return1 = ([{
                    'name': col,
                    'id': col
                } for col in whole_df.columns], whole_df.to_dict('records'))
                min_odd = whole_df.min(axis=1)
                print(min_odd.values)
    # Fetch this week's matchendirect page and collect live-game links.
    page = process_url(MATCHENDIRECT_URLS_DICT[championship] +
                       str(datetime.date.today().isocalendar()[0]) + '-' +
                       str(datetime.date.today().isocalendar()[1]) + '/')
    target_page = page.findAll('tr', {'class': 'sl'})
    url_list = []
    if len(target_page) > 1:
        for elem in target_page:
            url_list.append(elem.find('a', href=True)['href'])
        # Fuzzy-pick the live-game URL closest to the team name.
        link = 'https://www.matchendirect.fr/' + process.extractOne(
            live_team, url_list)[0]
        try:
            whole_df2 = infos_game(link=link, to_csv=False)
        except:
            # NOTE(review): bare except silently swallows scraping failures.
            pass
        else:
            if whole_df2 is not None:
                [x_y[0], x_y[1], predicted_score,
                 final_time] = update_graph(whole_df,
                                            whole_df2,
                                            previous_x=[x_y[0]],
                                            previous_y=[x_y[1]],
                                            final_time=final_time)
                info_game = whole_df2.to_csv('file')
                # NOTE(review): 'y' is fed x_y[0] — same series as 'x';
                # presumably x_y[1] was intended. Confirm before changing.
                to_return4 = {
                    'data': [{
                        'x': x_y[0],
                        'y': x_y[0]
                    }],
                    'layout': {
                        'title':
                        f"Notre prédiction pour {game_name} : score de {predicted_score}. <br> C'est un bon moment pour parier si {x_y[0][-1].values[0]} > 1 !"
                    }
                }
                to_return2 = ([{
                    'name': col,
                    'id': col
                } for col in whole_df2.columns], whole_df2.to_dict('records'))
                # NOTE(review): both placeholders read iloc[0]['Buts']; the
                # single-game branch below uses iloc[0] and iloc[1] — this one
                # likely repeats the same team's score by mistake.
                game_score = f" Le score est actuellement de {whole_df2.iloc[0]['Buts']} - {whole_df2.iloc[0]['Buts']} !\n"
    elif len(target_page) == 1:
        # Only one live game this week: take its link directly.
        link = 'https://www.matchendirect.fr/' + page.find(('tr'), {
            'class': 'sl'
        }).find('a', href=True)['href']
        try:
            whole_df2 = infos_game(link=link, to_csv=False)
        except:
            pass
        else:
            if whole_df2 is not None:
                [x_y[0], x_y[1], predicted_score,
                 final_time] = update_graph(whole_df,
                                            whole_df2,
                                            previous_x=[x_y[0]],
                                            previous_y=[x_y[1]],
                                            final_time=final_time)
                info_game = whole_df2.to_csv('file')
                to_return4 = {
                    'data': [{
                        'x': x_y[0],
                        'y': x_y[0]
                    }],
                    'layout': {
                        'title':
                        f"Notre prédiction pour {game_name} : score de {predicted_score}. <br> C'est un bon moment pour parier si {x_y[0][-1].values[0]} > 1 !"
                    }
                }
                whole_df2.insert(loc=0,
                                 column=get_game_name(good_game),
                                 value=[
                                     get_game_teams(good_game)[0],
                                     get_game_teams(good_game)[1]
                                 ],
                                 allow_duplicates=True)
                to_return2 = (
                    [{
                        'name': col,
                        'id': col
                    } for col in whole_df2.columns],
                    whole_df2.to_dict('records'),
                )
                game_score = f" Le score est actuellement de {whole_df2.iloc[0]['Buts']} - {whole_df2.iloc[1]['Buts']} !\n"
    if good_game is not None:
        game_name = get_game_name(good_game)
    # Build the status message from the current game minute.
    # NOTE(review): whole_df2 may be unbound here if no branch above ran; the
    # bare except then treats the NameError as "half-time".
    try:
        game_time = int(
            whole_df2.index.get_level_values("Minute").values[0][:-1])
    except:
        to_return3 = f"C'est la mi-temps du match {game_name} !\n" + game_score + " Regardez ce que recommande notre modèle..."
    else:
        to_return3 = f"C'est la {game_time}e minute du match {game_name} !\n" + game_score
        if game_time < 20:
            to_return3 += " Il est encore trop tôt pour prédire l'avenir..."
        elif game_time >= 89:
            to_return3 += " Il est trop tard pour aller parier sur Betclic !"
        else:
            to_return3 += " Regardez ce que recommande notre modèle..."
    to_return3 += text_to_add
    to_return1 = list(to_return1)
    to_return2 = list(to_return2)
    return (to_return1[0], to_return1[1], to_return2[0], to_return2[1],
            to_return3, to_return4)
for i in lyrs: print i.rstrip("\n") if i.startswith("Root") and len(lyrs) > 1: lyrs.remove(i) elif i.startswith("Root") or i.startswith("UnmatchedService") and len(lyrs) == 1: url = i.split("|||")[-1].rstrip("\n") resource_name = os.path.split(f)[0].lstrip(START_PATH + os.path.sep) r = requests.get(url+"?f=json") if r.status_code == 200: rj = r.json() if rj.has_key("layers"): layers_dict = {lyr["name"]: lyr["id"] for lyr in rj["layers"]} match = process.extractOne(resource_name, layers_dict.keys()) print resource_name, match go_ahead = raw_input("Match? (Y/N/O): ") if go_ahead.lower() == "y": lyr_index = layers_dict[match[0]] lyrs.append("MatchedService|||" + url.rstrip("/") + "/" + str(lyr_index) +"\n") lyrs.remove(i) elif go_ahead.lower() == "o": new_url = raw_input("Enter service url: ") lyrs.append("MatchedService|||" + new_url + "\n") else: lyrs.append("UnmatchedService|||" + url.rstrip("/") + "\n") lyrs.remove(i) elif i.startswith("UnmatchedService") and len(lyrs) == 2:
def get_fuzzy_images(word: str):
    """Fuzzy-match *word* against the stored keywords.

    Returns the best-matching keyword when its score exceeds 93, else None.
    """
    stored = [row.keyword for row in db.query(Keyword).all()]
    best, confidence = processfuzz.extractOne(word, stored)
    return best if confidence > 93 else None
# Derive a surname column, index the FWCI frame by full author name, and
# drop spreadsheet artefacts ("Unnamed" columns, all-NaN rows, '-' cells).
fwci_clean['surname'] = fwci_clean['Author'].str.split(',').str[0]
fwci_clean = fwci_clean.set_index('Author')
fwci_clean.drop(
    [col for col in fwci_clean.columns.tolist() if 'Unnamed' in str(col)],
    axis=1,
    inplace=True)
fwci_clean.dropna(axis=0, how='all', inplace=True)
fwci_clean.replace('-', np.nan, inplace=True)
retrieved_authors = fwci_clean.index.tolist()
# Fuzzy-map each awardee name to a retrieved author, accepting the hit only
# when the surnames agree exactly (case-insensitive).
surname_mapper = {}
for index in awardees.index.tolist():
    author = awardees.loc[index, 'name']
    author_surname = awardees.loc[index, 'surname']
    # process.extract(author, retrieved_authors)
    # NOTE(review): extractOne returns (match, score) — the second element
    # bound to `match` here is actually the score, and it is unused.
    result, match = process.extractOne(author, retrieved_authors)
    result_surname = result.split(',')[0]
    if result_surname.lower() == author_surname.lower():
        surname_mapper[author] = result
    else:
        logger.info(f'No match found for {author}')
awardees['match_name'] = awardees['name'].map(surname_mapper)
len(awardees['match_name'].dropna()
    )  # approximately 88% of awardees had matched names!
# use matched name to find FWCI in the year of award from the fwci df
search_dict = dict(
    zip(
        awardees.dropna(how='any', subset=['match_name'])['match_name'],
        awardees.dropna(how='any', subset=['match_name'])['Year']))
def process(source):
    """
    process(collector_input)

    Data process that:

     * Retrieve facet key, values pairs from file or directory attributes

    :param str source: The file full path to process or the dataset ID
    :returns: 1 on success, 0 on any failure (failures are logged, not raised)

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        if pctx.directory or pctx.dataset_id or pctx.dataset_list:
            # Get attributes from directory format or dataset_id format
            attributes = re.match(pctx.pattern, source).groupdict()
        else:
            # Get attributes from NetCDF global attributes
            attributes = dict()
            with ncopen(source) as nc:
                for attr in nc.ncattrs():
                    attributes[attr] = nc.getncattr(attr)
            # Get attributes from filename, overwriting existing ones
            match = re.search(pctx.pattern, source)
            if not match:
                raise ExpressionNotMatch(source, pctx.pattern)
            attributes.update(match.groupdict())
        # Get source values from attributes
        for facet in pctx.facets:
            if facet in pctx.set_keys.keys():
                try:
                    # Rename attribute key
                    attributes[facet] = attributes.pop(pctx.set_keys[facet])
                except KeyError:
                    raise NoNetCDFAttribute(pctx.set_keys[facet], source)
            elif facet in attributes.keys():
                # Facet exists in attribute keys
                pass
            else:
                # Find closest NetCDF attributes in terms of partial string comparison
                key, score = extractOne(facet, attributes.keys(),
                                        scorer=partial_ratio)
                if score >= 80:
                    # Rename attribute key
                    attributes[facet] = attributes.pop(key)
                    Print.debug(
                        'Consider "{}" attribute instead of "{}" facet'.format(
                            key, facet))
                else:
                    # NOTE(review): facet is NOT in pctx.set_keys on this
                    # branch, so pctx.set_keys[facet] may itself raise
                    # KeyError instead of NoNetCDFAttribute — confirm.
                    raise NoNetCDFAttribute(pctx.set_keys[facet], source)
            # Record this facet's value in the shared source_values mapping.
            # NOTE(review): placement inside the facet loop inferred from the
            # use of `facet` below — verify against the original layout.
            with pctx.lock:
                s = pctx.source_values[0]
                s[facet].add(attributes[facet])
                pctx.source_values[0] = s
        msg = TAGS.SUCCESS + 'Deserialize {}'.format(COLORS.HEADER(source))
        with pctx.lock:
            Print.info(msg)
        return 1
    except KeyboardInterrupt:
        raise
    except Exception:
        # Log the full traceback and signal failure without stopping the pool.
        exc = traceback.format_exc().splitlines()
        msg = TAGS.FAIL + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return 0
    finally:
        # Always advance the shared progress counter, success or not.
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rHarvesting facets values from data: ')
            msg += '{}% | {}/{} {}'.format(percentage, pctx.progress.value,
                                           pctx.nbsources,
                                           SOURCE_TYPE[pctx.source_type])
            Print.progress(msg)
def find_state_code(row):
    """Print the closest state match (score >= 80) for row['state'], skipping
    rows whose state field is 0."""
    if row['state'] == 0:
        return
    print(process.extractOne(row['state'], states, score_cutoff=80))
def _get_best_fuzzy(text, sentences):
    """Return fuzzywuzzy's (match, score) pair for *text* over *sentences*."""
    from fuzzywuzzy import process

    best_match = process.extractOne(text, sentences)
    return best_match
def fix_selections(self, selections):
    """Snap self.home and self.away to their closest runner names, then
    refresh the price selections with those runners."""
    runners = selections.values()
    for side in ('home', 'away'):
        best_runner, _score = process.extractOne(getattr(self, side), runners)
        setattr(self, side, best_runner)
    self.update_price_selections(runners)