def get_cpe_matches(
        self,
        cert_name: str,
        cert_candidate_cpe_vendors: List[str],
        cert_candidate_versions: List[str],
        relax_version: bool = False,
        n_max_matches: int = 10,
        threshold: int = 60) -> Optional[List[Tuple[float, CPE]]]:
    """Fuzzy-match a certificate name against candidate CPE entries.

    Candidates are narrowed by vendor and version first, then scored with
    rapidfuzz against both the CPE title and item name. If nothing scores
    above ``threshold``, the search is retried once with the wildcard
    version ``'-'`` (unless ``relax_version`` is already set).

    :param cert_name: certificate name to match
    :param cert_candidate_cpe_vendors: vendors used to pre-filter candidates
    :param cert_candidate_versions: versions used to pre-filter candidates
    :param relax_version: internal flag marking the relaxed-version retry
    :param n_max_matches: maximum number of matches returned
    :param threshold: minimum (exclusive) fuzzy score to keep a candidate
    :return: list of (score, CPE) pairs sorted by score descending,
        or None when nothing matches even with a relaxed version
    """
    # Unicode-aware: collapse every non-alphanumeric character to a space.
    replace_non_letter_non_numbers_with_space = re.compile(r"(?ui)\W")

    def sanitize_matched_string(string: str) -> str:
        # Strip trademark symbols, lowercase, normalise punctuation to spaces.
        string = string.replace('®', '').replace('™', '').lower()
        return replace_non_letter_non_numbers_with_space.sub(' ', string)

    candidates = self.get_candidate_cpe_items(cert_candidate_cpe_vendors, cert_candidate_versions)

    sanitized_cert_name = sanitize_matched_string(cert_name)
    reasonable_matches = []
    for c in candidates:
        sanitized_title = sanitize_matched_string(c.title)
        sanitized_item_name = sanitize_matched_string(c.item_name)

        # Score against both the CPE title and the item name, using
        # token-set (word-overlap) and partial (substring) ratios.
        set_match_title = fuzz.token_set_ratio(sanitized_cert_name, sanitized_title)
        partial_match_title = fuzz.partial_ratio(sanitized_cert_name, sanitized_title)
        set_match_item = fuzz.token_set_ratio(sanitized_cert_name, sanitized_item_name)
        partial_match_item = fuzz.partial_ratio(sanitized_cert_name, sanitized_item_name)

        # A candidate's score is the best of the four similarity measures.
        potential = max([
            set_match_title, partial_match_title, set_match_item, partial_match_item
        ])

        if potential > threshold:
            reasonable_matches.append((potential, c))

    if reasonable_matches:
        # Best matches first; trim to the requested number.
        reasonable_matches = sorted(reasonable_matches, key=operator.itemgetter(0), reverse=True)
        # possibly filter short titles to avoid false positives
        # reasonable_matches = list(filter(lambda x: len(x[1].item_name) > 4, reasonable_matches))
        return reasonable_matches[:n_max_matches]

    # No match with the exact versions: retry once with the CPE wildcard
    # version '-' before giving up.
    if not reasonable_matches and not relax_version:
        return self.get_cpe_matches(cert_name, cert_candidate_cpe_vendors, ['-'], relax_version=True, n_max_matches=n_max_matches, threshold=threshold)

    return None
def _crossref_score(txt, r):
    """Similarity score between free text *txt* and a Crossref record *r*.

    Sums token-set ratios over the author family names, the first title and
    the abstract, counting only fields present in the record. Higher score
    means higher similarity.
    """
    from rapidfuzz.fuzz import token_set_ratio

    fields = []
    if 'author' in r:
        family_names = [p['family'] for p in r.get('author', []) if 'family' in p]
        fields.append(' '.join(family_names))
    if 'title' in r:
        fields.append(r['title'][0])
    if 'abstract' in r:
        fields.append(r['abstract'])
    return sum(token_set_ratio(field_text, txt) for field_text in fields)
def search_score(self, keywords, keys=None):
    """Calculate the matching score of the herb for the given keywords.

    :param keywords: keyword or list of keywords to search for;
        a single value is wrapped into a one-element list
    :type keywords: list, tuple, set, or a single value
    :param keys: keys of the herb metadata to search in;
        defaults to ["name", "id", "repository", "tags", "description"]
    :type keys: list, optional
    :return: the best fuzzy token-set score over all keyword/field pairs,
        or 0 when there is nothing to compare
    """
    if keys is None:
        keys = ["name", "id", "repository", "tags", "description"]
    if not isinstance(keywords, (list, tuple, set)):
        keywords = [keywords]

    # Restrict metadata to the requested keys and flatten nested values so
    # each field can be fuzzy-matched as plain text.
    herb_for_search = {
        key: val for key, val in self.herb_meta_json.items() if key in keys
    }
    herb_for_search = _flatten_dict(herb_for_search)

    # Score every (field, keyword) pair; the overall score is the best one.
    keywords_scores = [
        fuzz.token_set_ratio(val, keyword)
        for keyword in keywords
        for val in herb_for_search.values()
    ]
    return max(keywords_scores, default=0)
def display_label_search(self, query, n=10, **kwargs):
    '''Search display labels (place names).'''
    data_products = self.get_data_products()
    profiles = data_products['demographicprofiles']
    # Rank every demographic profile by fuzzy similarity to the query,
    # most similar first, and keep the top n.
    ranked = sorted(
        profiles,
        key=lambda profile: fuzz.token_set_ratio(query, profile.name),
        reverse=True,
    )
    return ranked[:n]
def _scholar_score(txt, bib):
    """Similarity score between free text *txt* and a scholar bib entry.

    Adds up token-set ratios for whichever of title, author and abstract
    are present in *bib*; higher score means higher similarity.
    """
    from rapidfuzz.fuzz import token_set_ratio

    total = 0
    for field in ('title', 'author', 'abstract'):
        if field in bib:
            total += token_set_ratio(bib[field], txt)
    return total
def compare_cells(cell1, cell2, comparison_type, ignore_case):
    """Compare two spreadsheet cells according to *comparison_type*.

    comparison_type may be 'fuzzy_string' (RapidFuzz token-set match against
    CELL_THRESHOLD), 'int', 'float', or anything else for exact string
    comparison (trimmed, optionally case-insensitive). Two empty/falsy cells
    are equal; one empty cell never equals a non-empty one. Any conversion
    or matching error falls back to plain string comparison.

    Fixes vs. original: bare `except:` narrowed to `except Exception:`;
    fallback stringifies cells before `.lower()` (the original raised
    AttributeError on non-str cells); fuzzy branch returns an explicit bool
    instead of falling through to an implicit None.
    """
    if not cell1 and not cell2:
        return True
    if not cell1 or not cell2:
        return False

    def _string_equal():
        # Shared fallback: compare trimmed string forms.
        a, b = str(cell1), str(cell2)
        if ignore_case:
            a, b = a.lower(), b.lower()
        return a.strip() == b.strip()

    try:
        if comparison_type == 'fuzzy_string':
            # With score_cutoff, RapidFuzz returns 0 (falsy) below the cutoff.
            return bool(fuzz.token_set_ratio(cell1, cell2, score_cutoff=CELL_THRESHOLD))
        if comparison_type == 'int':
            return int(cell1) == int(cell2)
        if comparison_type == 'float':
            return float(cell1) == float(cell2)
        return _string_equal()
    except Exception:
        return _string_equal()
def fpartial(x):
    """Token-set similarity between the alias and data fields of row *x*.

    Whole-word matching is the best practice for short-alias -> string
    mapping: uber <-> "pubermarkt test" scores 57 here, versus 42 for plain
    fuzz.ratio (which also reflects string length) and 100 for
    fuzz.partial_token_set_ratio (best for building distance matrices).
    """
    alias_text = str(x["alias"])
    data_text = str(x["data"])
    return fuzz.token_set_ratio(alias_text, data_text)


#%%
def management(l_args, s_ticker):
    """Print a stock's management team scraped from Business Insider.

    For each manager prints name, title, a Google-search link and, when a
    fuzzy name match against the insider-trade links succeeds, a link to
    the Insider Activity page.

    Parameters
    ----------
    l_args : list
        remaining argparse arguments
    s_ticker : str
        stock ticker
    """
    parser = argparse.ArgumentParser(prog='mgmt', description="""Print management team. Namely: Name, Title, Information from google and (potentially) Insider Activity page. [Source: Business Insider]""")

    try:
        (ns_parser, l_unknown_args) = parser.parse_known_args(l_args)
        if l_unknown_args:
            print(f"The following args couldn't be interpreted: {l_unknown_args}\n")
            return

        url_market_business_insider = f"https://markets.businessinsider.com/stocks/{s_ticker.lower()}-stock"
        text_soup_market_business_insider = BeautifulSoup(requests.get(url_market_business_insider).text, "lxml")

        l_titles = list()
        for s_title in text_soup_market_business_insider.findAll('td', {'class': 'table__td text-right'}):
            # Keep textual titles only; numeric/USD cells belong to other tables.
            if any(c.isalpha() for c in s_title.text.strip()) and ('USD' not in s_title.text.strip()):
                l_titles.append(s_title.text.strip())

        l_names = list()
        for s_name in text_soup_market_business_insider.findAll('td', {'class': 'table__td table--allow-wrap'}):
            l_names.append(s_name.text.strip())

        # The name column also carries rows from other tables; the managers
        # are the last len(l_titles) entries.
        df_management = pd.DataFrame({'Name': l_names[-len(l_titles):], 'Title': l_titles}, columns=['Name', 'Title'])
        df_management['Info'] = '-'
        df_management['Insider Activity'] = '-'
        df_management = df_management.set_index('Name')

        for s_name in df_management.index:
            # BUG FIX: single .loc[row, col] assignment -- the original chained
            # .loc[row][col] = ... writes to a temporary copy and may not stick.
            df_management.loc[s_name, 'Info'] = f"http://www.google.com/search?q={s_name} {s_ticker.upper()}".replace(' ', '%20')

        s_url_base = "https://markets.businessinsider.com"
        for insider in text_soup_market_business_insider.findAll('a', {'onclick': "silentTrackPI()"}):
            for s_name in df_management.index:
                if fuzz.token_set_ratio(s_name, insider.text.strip()) > 70:
                    df_management.loc[s_name, 'Insider Activity'] = s_url_base + insider.attrs['href']

        for ind in df_management.index:
            # Pad name and title to the widest entry so the output aligns.
            s_name = f"{ind}{(max([len(x) for x in df_management.index])-len(ind))*' '}"
            s_title = f"{df_management['Title'][ind]}{(max([len(x) for x in df_management['Title']])-len(df_management['Title'][ind]))*' '}"
            s_management = f"""{s_name} {s_title}
{df_management['Info'][ind]}"""
            print(s_management)
            if df_management['Insider Activity'][ind] not in '-':
                print(f"{df_management['Insider Activity'][ind]}")
            print("")

    except Exception:
        # Best effort: keep the CLI alive on scrape/parse failures
        # (was a bare `except:`, which also swallowed KeyboardInterrupt).
        print("")
        return
def search_by_true_name(self, name, threshold=80):
    """Find all items whose true name closely matches the query.

    Args:
        name: Name to search by.
        threshold: Minimum (exclusive) RapidFuzz score for a match.

    Returns:
        List of (NameItem, ratio, token_set_ratio) triplets, sorted by
        plain ratio in descending order.
    """
    results = []
    for candidate in self.items:
        # Compare against the item's true name: exact-ish character ratio,
        # plus a case-insensitive token-set ratio for word-order tolerance.
        plain_score = fuzz.ratio(candidate.true_name, name)
        token_score = fuzz.token_set_ratio(candidate.true_name.lower(), name.lower())
        if plain_score > threshold or token_score > threshold:
            results.append((candidate, plain_score, token_score))
    results.sort(key=lambda triplet: triplet[1], reverse=True)
    return results
def iterate(self, uid, start_date=None, end_date=None, class_name=None):
    """Collect events from calendar *uid* within a date window.

    Parameters
    ----------
    uid : hashable
        Key of the calendar in ``self.calendars``.
    start_date, end_date : str, optional
        Parseable date strings; default to "now" and the end of that
        same day, respectively, in ``self.tmzn``.
    class_name : iterable of str, optional
        When given, only events whose summary fuzzy-matches one of these
        names above ``self.fuzz_threshold`` are kept.

    Returns
    -------
    list
        Matching vevent components, in calendar order.
    """
    date1 = (self.tmzn.localize(dtparse(start_date).replace(tzinfo=None))
             if start_date else datetime.now(tz=self.tmzn))
    date2 = (self.tmzn.localize(dtparse(end_date).replace(tzinfo=None))
             if end_date else date1.replace(hour=23, minute=59, second=59))

    class_list = []
    for event in self.calendars[uid].walk("vevent"):
        event_start = event["dtstart"].dt.astimezone(self.tmzn)
        if not (date1 <= event_start <= date2):
            continue
        if class_name:
            # BUG FIX: the original extend(...) added the same event once per
            # matching class name, producing duplicates; append at most once.
            summary = event["summary"].lower()
            if any(fuzz.token_set_ratio(c_name.lower(), summary) > self.fuzz_threshold
                   for c_name in class_name):
                class_list.append(event)
        else:
            class_list.append(event)
    return class_list
def featurize(df):
    """Add string-similarity and phonetic features for name pairs.

    The first two columns are treated as the two names to compare (renamed
    to 'a' and 'b'; with exactly three columns the third is kept as
    'target').  Appends normalised-text columns, fuzzy-match scores, an IPA
    score, a scaled Levenshtein distance, phonetic-encoding equality flags
    and one similarity column per configured algorithm, then returns the
    mutated DataFrame.
    """
    # Normalise column names: 3 cols -> a/b/target, 2 cols -> a/b,
    # otherwise rename only the first two columns.
    if len(df.columns)==3:
        df.columns=['a', 'b', 'target']
    elif len(df.columns)==2:
        df.columns=['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b' })
    # ASCII-fold, lowercase and strip non-letters so every metric below
    # compares plain alphabetic strings.
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)
    # Fuzzy string-similarity scores.
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A,row.TM_B), axis=1)
    # Pronunciation-based score -- presumably an IPA-level comparison;
    # sum_ipa is a project helper (TODO confirm semantics).
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A,row.TM_B), axis=1)
    # Jellyfish levenshtein
    df['levenshtein']= df.apply(lambda row: jellyfish.levenshtein_distance(row.TM_A,row.TM_B), axis=1)
    # Scale Levenshtein column to [0, 1] within this DataFrame.
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(df['levenshtein'].values.reshape(-1,1))
    # Jellyfish phoneme features: 1 when both names share the same
    # phonetic encoding, else 0.
    df['metaphone'] = df.apply(
        lambda row: 1 if jellyfish.metaphone(row.TM_A)==jellyfish.metaphone(row.TM_B) else 0, axis=1)
    df['nysiis'] = df.apply(
        lambda row: 1 if jellyfish.nysiis(row.TM_A)==jellyfish.nysiis(row.TM_B) else 0, axis=1)
    df['mtch_rtng_cdx'] = df.apply(
        lambda row: 1 if jellyfish.match_rating_codex(row.TM_A)==jellyfish.match_rating_codex(row.TM_B) else 0, axis=1)
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A)==pshp_soundex_first.encode(row.TM_B) else 0, axis=1)
    # One similarity column per configured algorithm -- assumes module-level
    # `algos` and `algo_names` stay index-aligned; TODO confirm.
    for i, algo in enumerate(algos):
        df[algo_names[i]] = df.apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)
    return df
def get_one(self, uid, class_name=None):
    """Return the next upcoming event(s) from calendar *uid*.

    Without *class_name*, the result holds the first event starting from
    now.  With *class_name*, it holds, for each given name, the first
    upcoming event whose summary fuzzy-matches that name above
    ``self.fuzz_threshold`` (so at most one entry per name).
    """
    now = datetime.now(tz=self.tmzn)
    upcoming = []
    if not class_name:
        for event in self.calendars[uid].walk("vevent"):
            if event["dtstart"].dt.astimezone(self.tmzn) >= now:
                upcoming.append(event)
                break
        return upcoming
    for wanted in class_name:
        wanted_lower = wanted.lower()
        for event in self.calendars[uid].walk("vevent"):
            in_future = event["dtstart"].dt.astimezone(self.tmzn) >= now
            if in_future and fuzz.token_set_ratio(
                    wanted_lower, event["summary"].lower()) > self.fuzz_threshold:
                upcoming.append(event)
                break
    return upcoming
def get_management(ticker: str) -> pd.DataFrame:
    """Get company managers from Business Insider

    Parameters
    ----------
    ticker : str
        Stock ticker

    Returns
    -------
    pd.DataFrame
        Dataframe of managers indexed by name, with Title, a Google-search
        Info link and (when fuzzy-matched) an Insider Activity link;
        empty when the page has no Management section.
    """
    url_market_business_insider = (
        f"https://markets.businessinsider.com/stocks/{ticker.lower()}-stock"
    )
    text_soup_market_business_insider = BeautifulSoup(
        requests.get(
            url_market_business_insider, headers={"User-Agent": get_user_agent()}
        ).text,
        "lxml",
    )

    # Map each underlined section header to its table so the Management
    # table can be located regardless of page ordering.
    found_h2s = {}
    for next_h2 in text_soup_market_business_insider.findAll(
        "h2", {"class": "header-underline"}
    ):
        next_table = next_h2.find_next_sibling("table", {"class": "table"})
        if next_table:
            found_h2s[next_h2.text] = next_table

    if found_h2s.get("Management") is None:
        print(f"No management information in Business Insider for {ticker}")
        print("")
        return pd.DataFrame()

    l_titles = []
    for s_title in found_h2s["Management"].findAll(
        "td", {"class": "table__td text-right"}
    ):
        # Keep textual titles only; numeric/USD cells belong to other columns.
        if any(c.isalpha() for c in s_title.text.strip()) and (
            "USD" not in s_title.text.strip()
        ):
            l_titles.append(s_title.text.strip())

    l_names = []
    for s_name in found_h2s["Management"].findAll(
        "td", {"class": "table__td table--allow-wrap"}
    ):
        l_names.append(s_name.text.strip())

    # The name column also carries other rows; managers are the last
    # len(l_titles) entries.
    df_management = pd.DataFrame(
        {"Name": l_names[-len(l_titles) :], "Title": l_titles},
        columns=["Name", "Title"],
    )
    df_management["Info"] = "-"
    df_management["Insider Activity"] = "-"
    df_management = df_management.set_index("Name")

    for s_name in df_management.index:
        # BUG FIX: single .loc[row, col] assignment -- the original chained
        # .loc[row][col] = ... writes to a temporary copy and may not stick.
        df_management.loc[
            s_name, "Info"
        ] = f"http://www.google.com/search?q={s_name} {ticker.upper()}".replace(
            " ", "%20"
        )

    s_url_base = "https://markets.businessinsider.com"
    for insider in text_soup_market_business_insider.findAll(
        "a", {"onclick": "silentTrackPI()"}
    ):
        for s_name in df_management.index:
            if fuzz.token_set_ratio(s_name, insider.text.strip()) > 70:  # type: ignore
                df_management.loc[s_name, "Insider Activity"] = (
                    s_url_base + insider.attrs["href"]
                )

    return df_management
def management(other_args: List[str], ticker: str):
    """Display company's managers

    Prints each manager's name, title, a Google-search link and, when a
    fuzzy name match against the insider-trade links succeeds, a link to
    the Insider Activity page. [Source: Business Insider]

    Parameters
    ----------
    other_args : List[str]
        argparse other args
    ticker : str
        Stock ticker
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        prog="mgmt",
        description="""
            Print management team. Namely: Name, Title, Information from google and
            (potentially) Insider Activity page. [Source: Business Insider]
        """,
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        url_market_business_insider = (
            f"https://markets.businessinsider.com/stocks/{ticker.lower()}-stock"
        )
        text_soup_market_business_insider = BeautifulSoup(
            requests.get(
                url_market_business_insider, headers={"User-Agent": get_user_agent()}
            ).text,
            "lxml",
        )

        # Map each underlined section header to its table so the Management
        # table can be located regardless of page ordering.
        found_h2s = {}
        for next_h2 in text_soup_market_business_insider.findAll(
            "h2", {"class": "header-underline"}
        ):
            next_table = next_h2.find_next_sibling("table", {"class": "table"})
            if next_table:
                found_h2s[next_h2.text] = next_table

        if found_h2s.get("Management") is None:
            print(f"No management information in Business Insider for {ticker}")
            print("")
            return

        l_titles = []
        for s_title in found_h2s["Management"].findAll(
            "td", {"class": "table__td text-right"}
        ):
            # Keep textual titles only; numeric/USD cells belong elsewhere.
            if any(c.isalpha() for c in s_title.text.strip()) and (
                "USD" not in s_title.text.strip()
            ):
                l_titles.append(s_title.text.strip())

        l_names = []
        for s_name in found_h2s["Management"].findAll(
            "td", {"class": "table__td table--allow-wrap"}
        ):
            l_names.append(s_name.text.strip())

        # Managers are the last len(l_titles) entries of the name column.
        df_management = pd.DataFrame(
            {"Name": l_names[-len(l_titles) :], "Title": l_titles},
            columns=["Name", "Title"],
        )
        df_management["Info"] = "-"
        df_management["Insider Activity"] = "-"
        df_management = df_management.set_index("Name")

        for s_name in df_management.index:
            # BUG FIX: single .loc[row, col] assignment -- the original
            # chained .loc[row][col] = ... writes to a temporary copy and
            # may silently not stick.
            df_management.loc[
                s_name, "Info"
            ] = f"http://www.google.com/search?q={s_name} {ticker.upper()}".replace(
                " ", "%20"
            )

        s_url_base = "https://markets.businessinsider.com"
        for insider in text_soup_market_business_insider.findAll(
            "a", {"onclick": "silentTrackPI()"}
        ):
            for s_name in df_management.index:
                if fuzz.token_set_ratio(s_name, insider.text.strip()) > 70:
                    df_management.loc[s_name, "Insider Activity"] = (
                        s_url_base + insider.attrs["href"]
                    )

        for ind in df_management.index:
            # Pad name and title to the widest entry so the output aligns.
            s_name = f"{ind}{(max([len(x) for x in df_management.index])-len(ind))*' '}"
            df_mgmt_title = df_management["Title"]
            spaces = max(len(x) for x in df_mgmt_title) - len(df_mgmt_title[ind])
            s_title = f"{df_mgmt_title[ind]}{spaces * ' '}"
            s_management = f"""{s_name} {s_title}
{df_management['Info'][ind]}"""
            print(s_management)
            if df_management["Insider Activity"][ind] not in "-":
                print(f"{df_management['Insider Activity'][ind]}")
            print("")

    except Exception as e:
        print(e, "\n")
def test_token_ratio(s1, s2):
    """token_ratio must equal the better of token_sort_ratio and token_set_ratio."""
    sort_score = fuzz.token_sort_ratio(s1, s2)
    set_score = fuzz.token_set_ratio(s1, s2)
    assert fuzz.token_ratio(s1, s2) == max(sort_score, set_score)
def testTokenSetRatio(self):
    # Token-set matching ignores word order and repetition, so the s4/s5
    # fixture pair must score a perfect 100.
    score = fuzz.token_set_ratio(self.s4, self.s5)
    self.assertEqual(score, 100)
def findMatchingDownloadedFile(torrentDataRootName, torrentDataFilesize, torrentDataFilePath, isDisc=False, isTV=False):
    """Locate a file under args.ROOT_PATH that matches a torrent's listed file.

    Matches first by fuzzy filename against top-level files, then by fuzzy
    root-name against directory listings (walking their contents, with extra
    disc-layout and SxxExx-tag checks).  A candidate must also be within the
    size tolerance of the listed file size.

    Returns the local path of the matching file, or None.
    """
    torrentDataFilename = os.path.basename(torrentDataFilePath)

    # Maximum difference, in bytes, between the downloaded and listed file
    # sizes; exact match required for TV episodes and files under 100 MB.
    MAX_FILESIZE_DIFFERENCE = 2 * 1000000
    if isTV or torrentDataFilesize < 100 * 1000000:
        MAX_FILESIZE_DIFFERENCE = 0

    def _size_ok(local_size):
        # True when the local size is within the allowed tolerance.
        return abs(local_size - torrentDataFilesize) <= MAX_FILESIZE_DIFFERENCE

    for listing in os.listdir(args.ROOT_PATH):
        listingPath = os.path.join(args.ROOT_PATH, listing)
        # RapidFuzz returns 0 (falsy) when the score is below score_cutoff.
        if os.path.isfile(listingPath) and fuzz.token_set_ratio(
                listing, torrentDataFilename, score_cutoff=80):
            localFilesize = get_file_size(listingPath)
            if localFilesize is None:  # was `== None`
                # NOTE(review): this aborts the whole search on one unreadable
                # file, while the directory branch `continue`s -- confirm intended.
                return None
            if _size_ok(localFilesize):
                return listingPath
        elif fuzz.token_set_ratio(listing, torrentDataRootName, score_cutoff=85):
            for root, dirs, filenames in os.walk(listingPath):
                for filename in filenames:
                    localFilePath = os.path.join(root, filename)
                    localFilesize = get_size(localFilePath)
                    if localFilesize is None:  # was `== None`
                        continue
                    if isDisc and areRootPathsSimilar(
                            localFilePath, listingPath, torrentDataFilePath
                    ) and filename == torrentDataFilename:
                        if _size_ok(localFilesize):
                            return localFilePath
                    elif re.search(SEASON_EP_RE, torrentDataFilePath,
                                   re.IGNORECASE) and fuzz.token_set_ratio(
                                       filename, torrentDataFilename, score_cutoff=95):
                        # TV episode: additionally require identical SxxExx tags.
                        season_ep_str_torrent = getSeasonEpisodeStr(torrentDataFilePath)
                        season_ep_str_filename = getSeasonEpisodeStr(filename)
                        if season_ep_str_torrent == season_ep_str_filename and _size_ok(localFilesize):
                            return localFilePath
                    elif fuzz.token_set_ratio(filename, torrentDataFilename, score_cutoff=95):
                        if _size_ok(localFilesize):
                            return localFilePath
    return None