def strDistance(name1, name2):
    """Return a combined fuzzy-similarity measure for two name strings.

    :param name1: first name string (may be ``None``)
    :param name2: second name string (may be ``None``)
    :returns: ``(mean_score, all_scores)`` where ``mean_score`` is the
        integer mean of four fuzzywuzzy scorers and ``all_scores`` is the
        raw 4-tuple of individual scores; returns the bare int ``0`` when
        either input is ``None``.
        NOTE(review): the ``None`` case returns an int while the normal
        case returns a tuple -- callers must handle both shapes; kept
        as-is for backward compatibility.
    """
    # BUG FIX: use identity comparison `is None` rather than `== None`
    # (PEP 8); `== None` can misfire on objects overriding __eq__.
    if name1 is None or name2 is None:
        return 0
    allr = (fuzz.partial_ratio(name1, name2),
            fuzz.UWRatio(name1, name2),
            fuzz.partial_token_set_ratio(name1, name2),
            fuzz.partial_token_sort_ratio(name1, name2))
    # Integer mean of the four scorers, plus the individual scores.
    return (sum(allr) // len(allr), allr)
def identify(groupby_articles, args):
    '''
    :param groupby_articles: pandas groupby object by article_id
    :param args: argparser object (uses args.output_csv and args.threshold)

    Given a pandas groupby object, this module identifies duplicate
    comment pairs (same author, similar text) and writes the duplicate
    pairs to the output csv named in args.output_csv.
    '''
    csv_header_list = ['article_id', 'author', 'comment_counter1', 'comment1',
                       'comment_counter2', 'comment2', 'weighted_score',
                       'token_sort_score']
    # Placeholder comments that carry no real content and must never pair.
    placeholder_texts = ("< this comment did not meet civility standards >",
                         "This comment has been deleted")

    def _as_text(value):
        # Normalise bytes (legacy py2-style data) or any other object to str.
        # BUG FIX: replaces the original bare `except:` that duplicated the
        # entire scoring logic in its handler; only the decode step can
        # legitimately fail here, so catch just those exceptions.
        try:
            return value.decode('utf-8')
        except (AttributeError, UnicodeDecodeError):
            return str(value)

    with open(args.output_csv, 'w', newline='', encoding='utf-8') as duplicatecsv:
        writer = csv.writer(duplicatecsv)
        writer.writerow(csv_header_list)
        for arts in range(len(groupby_articles)):
            for i in groupby_articles[arts].itertuples():
                for m in groupby_articles[arts].itertuples():
                    # Only comments by the same author can be duplicates.
                    if i.author != m.author:
                        continue
                    # Visit each unordered pair exactly once.
                    if m.Index <= i.Index:
                        continue
                    text_i = _as_text(i.text)
                    text_m = _as_text(m.text)
                    # Skip pairs whose lengths differ by more than 2x.
                    if (len(text_i) <= len(text_m) / 2
                            or len(text_m) <= len(text_i) / 2):
                        continue
                    if i.text in placeholder_texts or m.text in placeholder_texts:
                        continue
                    score = fuzz.UWRatio(text_i, text_m)
                    tsort_score = fuzz.token_sort_ratio(text_i, text_m,
                                                        force_ascii=False)
                    # Both scorers must clear the threshold to count as dup.
                    if score >= args.threshold and tsort_score >= args.threshold:
                        writer.writerow([i.article_id, i.author,
                                         i.comment_counter, i.text,
                                         m.comment_counter, m.text,
                                         score, tsort_score])
    print('Output file written: ', args.output_csv)
def compute_title_penalty(medianame, title):
    """Compute a score penalty reflecting how far *medianame* is from *title*.

    Returns 0 for an exact (case-insensitive) match. Otherwise the penalty
    is scaled from const.SCORE_PENALTY_TITLE by the fuzzy distance; large
    penalties are softened by a word-by-word alternative when *medianame*
    has no more words than *title*.
    """
    medianame = medianame.lower()
    title = title.lower()
    # Guard clause: identical names incur no penalty.
    if medianame == title:
        return 0
    whole_ratio = fuzz.UWRatio(medianame, title) / float(100)
    penalty = int(round(const.SCORE_PENALTY_TITLE * (1 - whole_ratio), 0))
    if penalty >= 15:
        name_words = medianame.split()
        title_words = title.split()
        if len(name_words) <= len(title_words):
            # Base alternative on the fraction of title words covered.
            coverage = float(len(name_words)) / len(title_words)
            alt_penalty = max(5, int(round((1.0 - coverage) * 15 - 5)))
            per_word = const.SCORE_PENALTY_TITLE / len(name_words)
            # Add a per-word penalty for each positional word mismatch.
            for idx, word in enumerate(name_words):
                word_ratio = fuzz.ratio(word, title_words[idx]) / float(100)
                alt_penalty = alt_penalty + int(per_word * (1 - word_ratio))
            # Keep whichever estimate is kinder.
            penalty = min(penalty, alt_penalty)
    return penalty
def _report_best(label, scorer, raw_data, broker_data):
    """For every raw item, print its best-scoring broker entry under *scorer*.

    BUG FIX / cleanup: the original repeated this loop verbatim three times,
    used py2-only `print` statements and `xrange`, and its final print was
    truncated (the `top_ratio` format argument was missing, so the last
    statement could not run). Factored into one helper with py2/py3
    compatible syntax; printed text is unchanged.
    """
    for item in raw_data:
        top_ratio = 0
        top_index = 0
        for index in range(len(broker_data)):
            ratio = scorer(item, broker_data[index])
            if top_ratio < ratio:
                top_ratio = ratio
                top_index = index
        print("{0}: {1} : {2} - {3}%".format(
            label, item, broker_data[top_index], top_ratio))


# NOTE(review): raw_data / broker_data are defined outside this view.
_report_best("UQRatio", fuzz.UQRatio, raw_data, broker_data)
_report_best("UWRatio", fuzz.UWRatio, raw_data, broker_data)
_report_best("WRatio", fuzz.WRatio, raw_data, broker_data)
def criteria_scorer(supp, crit):
    """Sum of UWRatio similarities over the 'purpose' and 'office' fields."""
    total = 0
    for field in ('purpose', 'office'):
        total += fuzz.UWRatio(supp[field], crit[field])
    return total
def _cartesian_product(df_arSft_grp, df_cpeSoft_grp):
    """Generate candidate matches between CPE products and SCCM software.

    Iterates the cartesian product of the SCCM software groupby
    (``df_arSft_grp``, keyed by vendor/DisplayName0/Version0) and the CPE
    software groupby (``df_cpeSoft_grp``, keyed by vendor), prunes pairs
    with quick release/title heuristics, and returns a DataFrame of fuzzy
    matching features for the surviving pairs (plus name-length features).

    NOTE(review): the body references ``self`` (for logging) although
    ``self`` is not a parameter -- presumably this def is nested inside a
    method; confirm against the enclosing (out-of-view) scope.
    """
    self.logger.info('\n\nEntering cartesian_product\n\n')
    # list of product tuples to check
    lst_dict = []
    t0 = time()
    self.logger.info('\n\nStarting generation of '
                     'cartesian product of CPE products '
                     'to SCCM software ... \n'
                     '*** This can take some time - '
                     'maybe 5 min or more.\n\n')
    n = 0  # SCCM software inputs processed
    m = 0  # candidate matches emitted
    for key, df_ar_grp in df_arSft_grp:
        # split out vendor / SCCM DisplayName0 / SCCM Version0 strings
        (t_ar_vndrX, t_ar_dsply0, t_ar_ver0) = key
        # microsoft will be handled separately as service bulletins
        if t_ar_vndrX == 'microsoft':
            continue
        # some cisco webex products are also hard to match
        if t_ar_vndrX == 'cisco':
            if t_ar_ver0 == '-' and 'webex' in t_ar_dsply0.lower():
                continue
        # get the corresponding CPE data for this vendor
        try:
            df_cpe_grp = df_cpeSoft_grp.get_group((t_ar_vndrX))
        except KeyError as e:
            self.logger.critical(
                '\n\n***matchsft.py cartes product loop -'
                ' KeyError: {0}\n\n'.format(e))
            continue
        for (t_cpeix, t_cpe_vdr_X, t_cpe_sft_X, t_cpe_relX,
             t_cpe_titleX, t_cpe23_name,
             t_cve_name) in df_cpe_grp.itertuples():
            # 'normal' CPE release.  BUG FIX: this assignment was commented
            # out in the original, leaving t_cpe_relX_tmp unbound
            # (NameError) or stale from a previous iteration for every
            # non-java product.
            t_cpe_relX_tmp = t_cpe_relX
            # but .... java - an exception (as always!)
            if t_cpe_vdr_X in ['oracle', 'sun']:
                if t_cpe_sft_X == 'jre' or t_cpe_sft_X == 'jdk':
                    t_cpe_relX_tmp = _fix_java_rel(t_cpe23_name)
            # don't consider vendor name in fuzzy matching
            t_cpe_titleX_tmp = t_cpe_titleX.lower().replace(
                t_cpe_vdr_X, ' ')
            t_ar_dsply0_tmp = t_ar_dsply0.lower().replace(
                t_cpe_vdr_X, ' ')
            ######
            # Apply quick heuristics to reduce the number of
            # possible matches
            ######
            # 1) Release #'s should at least partially match
            fz_rel_ratio = fz.ratio(t_cpe_relX_tmp, t_ar_ver0)
            fz_rel_ptl_ratio = fz.partial_ratio(
                t_cpe_relX_tmp, t_ar_ver0)
            if (t_cpe_relX_tmp != '-') and (t_ar_ver0 != '-'):
                # If release data is specified, then check that
                # there is at least a partial match
                if fz_rel_ratio < 90 or fz_rel_ptl_ratio < 100:
                    continue
            # 2) There should be at least one occurence of one word in
            # the cpe full name somewhere in sccm full name
            fz_ptl_tok_set_ratio = fz.partial_token_set_ratio(
                t_cpe_titleX_tmp, t_ar_dsply0_tmp, force_ascii=False)
            if fz_ptl_tok_set_ratio < 70:
                continue
            ######
            # calculate fuzzy matching statistics for this match
            ######
            lst_dict.append({
                'vendor_X': t_cpe_vdr_X,
                'software_X': t_cpe_sft_X,
                'Version0': t_ar_ver0,
                'release_X': t_cpe_relX,
                'title_X': t_cpe_titleX,
                'DisplayName0': t_ar_dsply0,
                'fz_ratio': fz.ratio(t_cpe_titleX_tmp, t_ar_dsply0_tmp),
                'fz_ptl_ratio': fz.partial_ratio(t_cpe_titleX_tmp,
                                                 t_ar_dsply0_tmp),
                'fz_tok_set_ratio': fz_ptl_tok_set_ratio,
                'fz_ptl_tok_sort_ratio': fz.token_sort_ratio(
                    t_cpe_titleX_tmp, t_ar_dsply0_tmp, force_ascii=False),
                'fz_uwratio': fz.UWRatio(t_cpe_titleX_tmp, t_ar_dsply0_tmp),
                'fz_rel_ratio': fz_rel_ratio,
                'fz_rel_ptl_ratio': fz_rel_ptl_ratio,
                't_cve_name': t_cve_name
            })
            m = m + 1
        # NOTE(review): original indentation was lost in the source; n is
        # placed at the outer-loop level per the log text ("sccm sft i/p")
        # -- confirm against the original file.
        n = n + 1
        if n % 100 < 1:
            self.logger.debug(
                '---Working ar: '
                'sccm sft i/p: {0} '
                ', potential matches output: {1}\n'.format(n, m))
        # # debug code to speed thru looping process
        # if n > 2000:
        #     break
    duration = time() - t0
    self.logger.info('\n\nDone in {0} sec.\n\n'.format(duration))
    df_match = pd.DataFrame(lst_dict)
    if df_match.empty:
        self.logger.info(
            '\n\nResulting cartesian product is empty\n\n')
        return (df_match)
    else:
        df_match1 = df_match[[
            'vendor_X', 'software_X', 'title_X', 'DisplayName0',
            'release_X', 'Version0', 'fz_ratio', 'fz_ptl_ratio',
            'fz_tok_set_ratio', 'fz_ptl_tok_sort_ratio', 'fz_uwratio',
            'fz_rel_ratio', 'fz_rel_ptl_ratio', 't_cve_name'
        ]]
        # add in length of names as features
        df_match1['titlX_len'] = df_match1['title_X'].apply(len)
        df_match1['DsplyNm0_len'] = df_match1['DisplayName0'].apply(len)
        self.logger.info('\n\n Results of matching: \n'
                         '# matches: {0}\n'
                         '# vendors: {1}\n'
                         '# CPE software: {2}\n'
                         '# SCCM inventory products: {3}\n'
                         '{4}\n{5}'.format(
                             df_match1['t_cve_name'].count(),
                             df_match1['vendor_X'].nunique(),
                             df_match1['software_X'].nunique(),
                             df_match1['DisplayName0'].nunique(),
                             df_match1.shape, df_match1.columns))
        return (df_match1)
# -*- coding: utf-8 -*-
# BUG FIX: the original mixed a py3 print() call with py2 `print`
# statements (a SyntaxError on Python 3) and had a malformed coding
# cookie (`# --coding:utf-8 --`), which Python would not recognise.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

text1 = u"百度是一家高科技公司"
text2 = u"发丝发生发生的v发生的!"

# Compare the two strings under several fuzzywuzzy scorers.
print(float(fuzz.ratio(text1, text2)) / 100)
print(fuzz.partial_ratio(text1, text2))
print(fuzz.token_sort_ratio(text1, text2, force_ascii=True))
print(fuzz.token_set_ratio(text1, text2, force_ascii=False))
print(fuzz.UWRatio(text1, text2))
def testFuzzy(self):
    """Print every fuzzywuzzy scorer on the same hospital-name pair.

    Each scorer is applied twice -- short vs. long name, then long vs.
    short -- to show whether the scorer is symmetric.
    """
    short_name = 'MISSION HOSPITAL'
    long_name = 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'
    # (label, scorer) pairs in the same order as the original prints.
    scorers = (
        ('ratio', fuzz.ratio),
        ('partial_ratio', fuzz.partial_ratio),
        ('token_sort_ratio', fuzz.token_sort_ratio),
        ('partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('token_set_ratio', fuzz.token_set_ratio),
        ('partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('QRatio', fuzz.QRatio),
        ('UQRatio', fuzz.UQRatio),
        ('WRatio', fuzz.WRatio),
        ('UWRatio', fuzz.UWRatio),
    )
    for label, scorer in scorers:
        print(label, scorer(short_name, long_name))
        print(label, scorer(long_name, short_name))
# NOTE(review): fragment of a pairwise firm-comparison loop; `ele1`, `ele2`
# and `writer3` are bound by the enclosing (out-of-view) scope -- presumably
# ele1/ele2 iterate (code, name) records and writer3 is a csv writer; confirm.
kod1 = ele1[0]   # firm code of first record
firm1 = ele1[1]  # firm name of first record
kod2 = ele2[0]   # firm code of second record
firm2 = ele2[1]  # firm name of second record
# score = fuzz.token_set_ratio(firm1, firm2)
# Compute the full battery of fuzzywuzzy similarity scores for this pair.
score_r = fuzz.ratio(firm1, firm2)
score_pr = fuzz.partial_ratio(firm1, firm2)
score_tsor = fuzz.token_sort_ratio(firm1, firm2)
score_tser = fuzz.token_set_ratio(firm1, firm2)
score_ptsor = fuzz.partial_token_sort_ratio(firm1, firm2)
score_ptser = fuzz.partial_token_set_ratio(firm1, firm2)
score_qr = fuzz.QRatio(firm1, firm2)
score_uqr = fuzz.UQRatio(firm1, firm2)
score_wr = fuzz.WRatio(firm1, firm2)
score_uwr = fuzz.UWRatio(firm1, firm2)
# print('kod1:' + kod1)
# print('firm1:' + firm1)
# print('kod2:' + kod2)
# print('firm2:' + firm2)
# print('score:' + str(score))
# if score_r > 90 or score_pr > 90 or score_tsor > 90 or score_tser > 90 or score_ptsor > 90 or score_ptser > 90 \
# or score_qr > 90 or score_uqr > 90 or score_wr > 90 or score_uwr > 90:
# Only token_set_ratio gates the output row, but all scores are written.
if score_tser > 90:
    temp3 = (
        kod1, firm1, kod2, firm2, score_r, score_pr, score_tsor, score_tser,
        score_ptsor, score_ptser, score_qr, score_uqr, score_wr, score_uwr)
    writer3.writerow(temp3)
def _cartesian_product(p_df_arPub, p_df_cpeVen): self.logger.info('\n\nEntering cartesian_product\n\n') # Force copy-by-value df_arPub = p_df_arPub.copy() df_cpeVen = p_df_cpeVen.copy() # List of name tuples to check lst_dict = [] t0 = time() mycount = 0 self.logger.info('\n\nStarting generation of ' 'cartesian product of NIST vendors ' 'to SCCM publishers ... \n' ' *** This can take some time ' '- up to 20 min for large prod datasets.\n\n') # Build the the cartisan product of the two sets of names (CPE, # SCCM/WMI) by iterating through the input dataframes for (t_cpeix, t_cpeVen_orig, t_cpevend_toks, t_cpeVen) in df_cpeVen.itertuples(): # Ignore cpe vendors that are 1 character long (e.g. 'X') if (len(t_cpeVen) < 2): self.logger.debug('cpeVen too short - continuing\n') continue for (t_arix, t_arPub0_orig, t_arpub_toks, t_arPub0) in df_arPub.itertuples(): # quick heuristics: # a) 1st word of cpe Vendor string has to be in the # tokenized wmi Publisher0 string somewhere # b) condensed cpe name has to be shorter than the full # WMI 'Publisher0' name if len(t_cpeVen) > len(t_arPub0): # self.logger.debug('arPub0 too short - continuing' continue # Look for at least one occurence of one word in cpeVen # somewhere in arPub if fz.partial_token_set_ratio( t_cpeVen, t_arPub0, force_ascii=False) < 100: continue # Calculate fuzzy matching statistics as "features" for # the subsequent ML classification lst_dict.append({ 'publisher0': t_arPub0_orig, 'pub0_cln': t_arPub0, 'vendor_X': t_cpeVen_orig, 'ven_cln': t_cpeVen, 'fz_ratio': fz.ratio(t_cpeVen, t_arPub0), 'fz_ptl_ratio': fz.partial_ratio(t_cpeVen, t_arPub0), 'fz_tok_set_ratio': fz.token_set_ratio(t_cpeVen, t_arPub0, force_ascii=False), 'fz_ptl_tok_sort_ratio': fz.partial_token_sort_ratio(t_cpeVen, t_arPub0, force_ascii=False), 'fz_uwratio': fz.UWRatio(t_cpeVen, t_arPub0) }) mycount = mycount + 1 if mycount % 1000 == 0: self.logger.debug( '# entries produced: {0}\n'.format(mycount)) # # debug code to shorten loop for testing 
# if mycount > 1000: # break # # debug code to speed thru loops # if mycount > 1000: # break duration = time() - t0 self.logger.info('\n\n*** Done in {0} sec\n\n'.format(duration)) df_match = pd.DataFrame(lst_dict) if df_match.empty: self.logger.info('Resulting cartesian product is empty.\n\n') else: self.logger.info('\n\n Vendor match dataframe: \nCounts {0},\n' 'Columns: {1}\n\n'.format( df_match.shape, df_match.columns)) return (df_match)
# print(t1) # print(t2) # print(t3) # print(t4) # a = fuzz.partial_ratio(t1,t2) # b = fuzz.ratio(t1,t2) # c = fuzz.token_set_ratio(t1,t2) # d = fuzz.partial_token_set_ratio(t1,t2) # e = fuzz.QRatio(t1,t2) # f = fuzz.WRatio(t1,t2) # g = fuzz.UWRatio(t1,t2) # print(a,b,c,d,e,f,g) s1 = filtering(str_list=["방카슈랑스 계약사항 중에서 변경하고 싶은 부분이 있습니다."], noun=False) s2 = filtering(str_list=["스마트알림 메시지 데이터는 얼마동안 볼 수 있나요?"], noun=False) s3 = "후 스마트 경우 수신 해외 서비스 외국 알림 신청 메시지 출국" print(fuzz.token_set_ratio(s3, s2)) print(fuzz.QRatio(s3, s2)) print(fuzz.UWRatio(s3, s2)) print(fuzz.WRatio(s3, s2)) # testlist = [{ # 'category' : 1, 'value' : [1,2] # },{ # 'category' : 2, 'value' : [1,2,3] # }] # # print(testlist) # # print([value['value'].append(4) for value in testlist if value.get('category')==2]) # # print(testlist)