Example #1
import csv
from fuzzywuzzy import fuzz


def strDistance(name1, name2):
    if name1 is None or name2 is None:
        return 0

    # Average several fuzz scorers; returns (mean score, tuple of individual scores).
    allr = (fuzz.partial_ratio(name1, name2), fuzz.UWRatio(name1, name2),
            fuzz.partial_token_set_ratio(name1, name2),
            fuzz.partial_token_sort_ratio(name1, name2))
    return (sum(allr) // len(allr), allr)

def identify(groupby_articles, args):
    '''
    :param groupby_articles: sequence of per-article pandas DataFrames (grouped by article_id)
    :param args: argparse namespace (expects output_csv and threshold)
    Given the grouped comments, identifies near-duplicate comment pairs written by the
    same author and writes each pair to the CSV file named in args.output_csv.
    '''
    csv_header_list = ['article_id','author','comment_counter1','comment1','comment_counter2','comment2','weighted_score','token_sort_score']
    with open(args.output_csv,'w',newline='',encoding='utf-8') as duplicatecsv:
        writer = csv.writer(duplicatecsv)
        writer.writerow(csv_header_list)

        # Compare every pair of comments within an article, restricted to the same author.
        for arts in range(len(groupby_articles)):
            for i in groupby_articles[arts].itertuples():
                for m in groupby_articles[arts].itertuples():
                    if i.author != m.author:
                        continue
                    if m.Index <= i.Index:  # consider each unordered pair only once
                        continue
                    try:
                        if len(i.text.decode('utf-8'))<=len(m.text.decode('utf-8'))/2 or len(m.text.decode('utf-8'))<=len(i.text.decode('utf-8'))/2:
                            continue
                        if any(match == "< this comment did not meet civility standards >" or match == "This comment has been deleted" for match in [i.text,m.text]):
                            continue
                        score = fuzz.UWRatio(i.text.decode('utf-8'),m.text.decode('utf-8'))
                        tsort_score = fuzz.token_sort_ratio(i.text.decode('utf-8'),m.text.decode('utf-8'),force_ascii=False)
                        if score>=args.threshold and tsort_score>=args.threshold:
                            writer.writerow([i.article_id,i.author,i.comment_counter,i.text,m.comment_counter,m.text,score,tsort_score])
                    except (AttributeError, UnicodeDecodeError):
                        # text is already a str (or not utf-8); fall back to plain str()
                        if len(str(i.text))<=len(str(m.text))/2 or len(str(m.text))<=len(str(i.text))/2:
                            continue
                        if any(match == "< this comment did not meet civility standards >" or match == "This comment has been deleted" for match in [i.text,m.text]):
                            continue
                        score = fuzz.UWRatio(str(i.text),str(m.text))
                        tsort_score = fuzz.token_sort_ratio(str(i.text),str(m.text),force_ascii=False)
                        if score>=args.threshold and tsort_score>=args.threshold:
                            writer.writerow([i.article_id,i.author,i.comment_counter,i.text,m.comment_counter,m.text,score,tsort_score])
            #print(arts)
    print('Output file written: ', args.output_csv)
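
A minimal driver sketch for identify(); the file name, column layout, and defaults below are assumptions, not part of the original example:

import argparse

import pandas as pd

# Hypothetical argparse setup mirroring the attributes identify() expects.
parser = argparse.ArgumentParser()
parser.add_argument('--output_csv', default='duplicates.csv')
parser.add_argument('--threshold', type=int, default=90)
args = parser.parse_args([])  # use the defaults above

# Assumed input: a comments.csv with article_id, author, comment_counter, text columns.
df = pd.read_csv('comments.csv')
# identify() indexes groupby_articles like a list, so materialise the groups first.
groups = [grp for _, grp in df.groupby('article_id')]
identify(groups, args)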
Example #3
def compute_title_penalty(medianame, title):
    '''Return a penalty between 0 and const.SCORE_PENALTY_TITLE that grows as
    medianame and title diverge, based on fuzzy string similarity.'''
    medianame = medianame.lower()
    title = title.lower()
    if medianame != title:
        diffratio = fuzz.UWRatio(medianame, title)/float(100) #difflib.SequenceMatcher(None, medianame, title).ratio()
        penalty = int(round(const.SCORE_PENALTY_TITLE * (1 - diffratio), 0))
        if penalty >= 15:
            medianameparts = medianame.split()
            titleparts = title.split()
            if len(medianameparts) <= len(titleparts):
                i = 0
                penaltyalt = max(5, int(round((1.0 - (float(len(medianameparts)) / len(titleparts))) * 15 - 5)))
                penaltyperpart = const.SCORE_PENALTY_TITLE / len(medianameparts)
                for mediaNamePart in medianameparts:
                    partdiffratio = fuzz.ratio(mediaNamePart, titleparts[i])/float(100) #difflib.SequenceMatcher(None, mediaNamePart, titleparts[i]).ratio()
                    penaltyalt = penaltyalt + int(penaltyperpart * (1 - partdiffratio))
                    i = i + 1
                penalty = min(penalty, penaltyalt)
        return penalty
    return 0
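
A quick usage sketch; the results depend on const.SCORE_PENALTY_TITLE, which is defined elsewhere in that project, and the titles below are made up:

# Assumes const.SCORE_PENALTY_TITLE is importable from the snippet's own module.
print(compute_title_penalty('mission hospital', 'mission hospital regional medical center'))
print(compute_title_penalty('the office', 'the office'))  # identical names -> 0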
Example #4
for item in raw_data:
    top_ratio = 0
    top_index = 0
    for index in xrange(len(broker_data)):
        ratio = fuzz.UQRatio(item, broker_data[index])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = index
    print "UQRatio: {0} : {1} - {2}%".format(item, broker_data[top_index],
                                             top_ratio)

for item in raw_data:
    top_ratio = 0
    top_index = 0
    for index in xrange(len(broker_data)):
        ratio = fuzz.UWRatio(item, broker_data[index])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = index
    print "UWRatio: {0} : {1} - {2}%".format(item, broker_data[top_index],
                                             top_ratio)

for item in raw_data:
    top_ratio = 0
    top_index = 0
    for index in xrange(len(broker_data)):
        ratio = fuzz.WRatio(item, broker_data[index])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = index
    print "WRatio: {0} : {1} - {2}%".format(item, broker_data[top_index],
Example #5
def criteria_scorer(supp, crit):
    return sum(fuzz.UWRatio(supp[k], crit[k])
               for k in ('purpose', 'office'))
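
A small call sketch for criteria_scorer, assuming fuzz is imported as in the other examples; the records and their values are made up, but both dicts must carry 'purpose' and 'office' keys:

supp = {'purpose': 'office supplies', 'office': 'New York HQ'}
crit = {'purpose': 'supplies for the office', 'office': 'NY headquarters'}
print(criteria_scorer(supp, crit))  # sum of two UWRatio scores, so 0-200

The fragment below is a separate helper from the same example; it is nested inside a class method and reaches self.logger through its closure.
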
        def _cartesian_product(df_arSft_grp, df_cpeSoft_grp):

            # Loop thru the data to find potential matches

            self.logger.info('\n\nEntering cartesian_product\n\n')

            # list of product tuples to check
            lst_dict = []

            t0 = time()

            self.logger.info('\n\nStarting generation of '
                             'cartesian product of CPE products '
                             'to SCCM software ... \n'
                             '*** This can take some time - '
                             'maybe 5 min or more.\n\n')
            n = 0
            m = 0
            for key, df_ar_grp in df_arSft_grp:

                # split out vendor / SCCM DisplayName0 / SCCM Version0 strings
                (t_ar_vndrX, t_ar_dsply0, t_ar_ver0) = key

                # microsoft will be handled separately as service bulletins
                if t_ar_vndrX == 'microsoft':
                    continue

                # some cisco webex products are also hard to match
                if t_ar_vndrX == 'cisco':
                    if t_ar_ver0 == '-' and 'webex' in t_ar_dsply0.lower():
                        continue

                # get the corresponding CPE data for this vendor
                try:
                    df_cpe_grp = df_cpeSoft_grp.get_group((t_ar_vndrX))

                except KeyError as e:
                    self.logger.critical(
                        '\n\n***matchsft.py cartesian product loop -'
                        ' KeyError: {0}\n\n'.format(e))
                    continue

                for (t_cpeix, t_cpe_vdr_X, t_cpe_sft_X, t_cpe_relX,
                     t_cpe_titleX, t_cpe23_name,
                     t_cve_name) in df_cpe_grp.itertuples():

                    # 'normal' CPE release #
                    t_cpe_relX_tmp = t_cpe_relX

                    # but .... java - an exception (as always!)
                    if t_cpe_vdr_X in ['oracle', 'sun']:
                        if t_cpe_sft_X == 'jre' or t_cpe_sft_X == 'jdk':
                            t_cpe_relX_tmp = _fix_java_rel(t_cpe23_name)

                    # don't consider vendor name in fuzzy matching

                    t_cpe_titleX_tmp = t_cpe_titleX.lower().replace(
                        t_cpe_vdr_X, ' ')
                    t_ar_dsply0_tmp = t_ar_dsply0.lower().replace(
                        t_cpe_vdr_X, ' ')
                    ######
                    #   Apply quick heuristics to reduce the number of
                    #   possible matches
                    ######

                    # 1) Release #'s should at least partially match

                    fz_rel_ratio = fz.ratio(t_cpe_relX_tmp, t_ar_ver0)
                    fz_rel_ptl_ratio = fz.partial_ratio(
                        t_cpe_relX_tmp, t_ar_ver0)

                    if (t_cpe_relX_tmp != '-') and (t_ar_ver0 != '-'):

                        # If release data is specified, then check that
                        # there is at least a partial match

                        if fz_rel_ratio < 90 or fz_rel_ptl_ratio < 100:
                            continue

                    # 2) There should be at least one occurrence of one word in
                    # the cpe full name somewhere in sccm full name

                    fz_ptl_tok_set_ratio = fz.partial_token_set_ratio(
                        t_cpe_titleX_tmp, t_ar_dsply0_tmp, force_ascii=False)

                    if fz_ptl_tok_set_ratio < 70:
                        continue

                    ######
                    # calculate fuzzy matching statistics for this match
                    ######

                    lst_dict.append({
                        'vendor_X':
                        t_cpe_vdr_X,
                        'software_X':
                        t_cpe_sft_X,
                        'Version0':
                        t_ar_ver0,
                        'release_X':
                        t_cpe_relX,
                        'title_X':
                        t_cpe_titleX,
                        'DisplayName0':
                        t_ar_dsply0,
                        'fz_ratio':
                        fz.ratio(t_cpe_titleX_tmp, t_ar_dsply0_tmp),
                        'fz_ptl_ratio':
                        fz.partial_ratio(t_cpe_titleX_tmp, t_ar_dsply0_tmp),
                        'fz_tok_set_ratio':
                        fz_ptl_tok_set_ratio,
                        'fz_ptl_tok_sort_ratio':
                        fz.token_sort_ratio(t_cpe_titleX_tmp,
                                            t_ar_dsply0_tmp,
                                            force_ascii=False),
                        'fz_uwratio':
                        fz.UWRatio(t_cpe_titleX_tmp, t_ar_dsply0_tmp),
                        'fz_rel_ratio':
                        fz_rel_ratio,
                        'fz_rel_ptl_ratio':
                        fz_rel_ptl_ratio,
                        't_cve_name':
                        t_cve_name
                    })
                    m = m + 1

                n = n + 1
                if n % 100 < 1:
                    self.logger.debug(
                        '---Working ar: '
                        'sccm sft i/p: {0} '
                        ', potential matches output: {1}\n'.format(n, m))
                # # debug code to speed thru looping process
                # if n > 2000:
                #     break

            duration = time() - t0
            self.logger.info('\n\nDone in {0} sec.\n\n'.format(duration))

            df_match = pd.DataFrame(lst_dict)

            if df_match.empty:
                self.logger.info(
                    '\n\nResulting cartesian product is empty\n\n')
                return (df_match)

            else:
                df_match1 = df_match[[
                    'vendor_X', 'software_X', 'title_X', 'DisplayName0',
                    'release_X', 'Version0', 'fz_ratio', 'fz_ptl_ratio',
                    'fz_tok_set_ratio', 'fz_ptl_tok_sort_ratio', 'fz_uwratio',
                    'fz_rel_ratio', 'fz_rel_ptl_ratio', 't_cve_name'
                ]].copy()  # copy so the new length columns below do not trigger SettingWithCopyWarning

                # add in length of names as features

                df_match1['titlX_len'] = df_match1['title_X'].apply(len)
                df_match1['DsplyNm0_len'] = df_match1['DisplayName0'].apply(
                    len)

                self.logger.info('\n\n Results of matching: \n'
                                 '# matches: {0}\n'
                                 '# vendors: {1}\n'
                                 '# CPE software: {2}\n'
                                 '# SCCM inventory products: {3}\n'
                                 '{4}\n{5}'.format(
                                     df_match1['t_cve_name'].count(),
                                     df_match1['vendor_X'].nunique(),
                                     df_match1['software_X'].nunique(),
                                     df_match1['DisplayName0'].nunique(),
                                     df_match1.shape, df_match1.columns))
                return (df_match1)
Example #7
# -*- coding: utf-8 -*-
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

text1 = u"百度是一家高科技公司"
text2 = u"发丝发生发生的v发生的!"

print(float(fuzz.ratio(text1, text2)) / 100)
print(fuzz.partial_ratio(text1, text2))
print(fuzz.token_sort_ratio(text1, text2, force_ascii=True))
print(fuzz.token_set_ratio(text1, text2, force_ascii=False))
print(fuzz.UWRatio(text1, text2))
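
For Unicode input like the strings above, UWRatio/UQRatio are expected to be thin wrappers that disable ASCII folding, so the following checks should hold on a recent fuzzywuzzy release (an assumption worth verifying locally):

assert fuzz.UWRatio(text1, text2) == fuzz.WRatio(text1, text2, force_ascii=False)
assert fuzz.UQRatio(text1, text2) == fuzz.QRatio(text1, text2, force_ascii=False)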
Example #8
    def testFuzzy(self):
        print(
            'ratio',
            fuzz.ratio('MISSION HOSPITAL',
                       'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'ratio',
            fuzz.ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                       'MISSION HOSPITAL'))

        print(
            'partial_ratio',
            fuzz.partial_ratio('MISSION HOSPITAL',
                               'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'partial_ratio',
            fuzz.partial_ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                               'MISSION HOSPITAL'))

        print(
            'token_sort_ratio',
            fuzz.token_sort_ratio('MISSION HOSPITAL',
                                  'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'token_sort_ratio',
            fuzz.token_sort_ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                                  'MISSION HOSPITAL'))

        print(
            'partial_token_sort_ratio',
            fuzz.partial_token_sort_ratio(
                'MISSION HOSPITAL',
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'partial_token_sort_ratio',
            fuzz.partial_token_sort_ratio(
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                'MISSION HOSPITAL',
            ))

        print(
            'token_set_ratio',
            fuzz.token_set_ratio('MISSION HOSPITAL',
                                 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'token_set_ratio',
            fuzz.token_set_ratio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                                 'MISSION HOSPITAL'))

        print(
            'partial_token_set_ratio',
            fuzz.partial_token_set_ratio(
                'MISSION HOSPITAL',
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'partial_token_set_ratio',
            fuzz.partial_token_set_ratio(
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                'MISSION HOSPITAL',
            ))

        print(
            'QRatio',
            fuzz.QRatio('MISSION HOSPITAL',
                        'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'QRatio',
            fuzz.QRatio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                        'MISSION HOSPITAL'))

        print(
            'UQRatio',
            fuzz.UQRatio('MISSION HOSPITAL',
                         'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'UQRatio',
            fuzz.UQRatio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                         'MISSION HOSPITAL'))

        print(
            'WRatio',
            fuzz.WRatio('MISSION HOSPITAL',
                        'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'WRatio',
            fuzz.WRatio(
                'MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                'MISSION HOSPITAL',
            ))

        print(
            'UWRatio',
            fuzz.UWRatio('MISSION HOSPITAL',
                         'MISSION HOSPITAL REGIONAL MEDICAL CENTER'))
        print(
            'UWRatio',
            fuzz.UWRatio('MISSION HOSPITAL REGIONAL MEDICAL CENTER',
                         'MISSION HOSPITAL'))

        pass
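
The same sweep can be written more compactly; a sketch intended to print the same scorer/value pairs as the calls above, assuming fuzz is imported as in the test module:

pairs = [('MISSION HOSPITAL', 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'),
         ('MISSION HOSPITAL REGIONAL MEDICAL CENTER', 'MISSION HOSPITAL')]
scorers = ('ratio', 'partial_ratio', 'token_sort_ratio', 'partial_token_sort_ratio',
           'token_set_ratio', 'partial_token_set_ratio',
           'QRatio', 'UQRatio', 'WRatio', 'UWRatio')
for name in scorers:
    for a, b in pairs:
        # Look each scorer up by name on the fuzz module and print its score.
        print(name, getattr(fuzz, name)(a, b))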
Example #9
        kod1 = ele1[0]
        firm1 = ele1[1]
        kod2 = ele2[0]
        firm2 = ele2[1]
        # score = fuzz.token_set_ratio(firm1, firm2)

        score_r = fuzz.ratio(firm1, firm2)
        score_pr = fuzz.partial_ratio(firm1, firm2)
        score_tsor = fuzz.token_sort_ratio(firm1, firm2)
        score_tser = fuzz.token_set_ratio(firm1, firm2)
        score_ptsor = fuzz.partial_token_sort_ratio(firm1, firm2)
        score_ptser = fuzz.partial_token_set_ratio(firm1, firm2)
        score_qr = fuzz.QRatio(firm1, firm2)
        score_uqr = fuzz.UQRatio(firm1, firm2)
        score_wr = fuzz.WRatio(firm1, firm2)
        score_uwr = fuzz.UWRatio(firm1, firm2)

        # print('kod1:' + kod1)
        # print('firm1:' + firm1)
        # print('kod2:' + kod2)
        # print('firm2:' + firm2)
        # print('score:' + str(score))

        # if score_r > 90 or score_pr > 90 or score_tsor > 90 or score_tser > 90 or score_ptsor > 90 or score_ptser > 90 \
        #         or score_qr > 90 or score_uqr > 90 or score_wr > 90 or score_uwr > 90:

        if score_tser > 90:
            temp3 = (
                kod1, firm1, kod2, firm2, score_r, score_pr, score_tsor, score_tser, score_ptsor, score_ptser, score_qr,
                score_uqr, score_wr, score_uwr)
            writer3.writerow(temp3)
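
For reference, a self-contained sketch that computes the same panel of scores for a single pair of firm names (the names are made up):

from fuzzywuzzy import fuzz

firm1, firm2 = 'Acme Industries GmbH', 'ACME Industries'
# Build a dict of every scorer used above, keyed by the fuzz function name.
scores = {name: getattr(fuzz, name)(firm1, firm2)
          for name in ('ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio',
                       'partial_token_sort_ratio', 'partial_token_set_ratio',
                       'QRatio', 'UQRatio', 'WRatio', 'UWRatio')}
print(scores)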
Example #10
        def _cartesian_product(p_df_arPub, p_df_cpeVen):

            self.logger.info('\n\nEntering cartesian_product\n\n')

            # Force copy-by-value
            df_arPub = p_df_arPub.copy()
            df_cpeVen = p_df_cpeVen.copy()

            # List of name tuples to check
            lst_dict = []
            t0 = time()
            mycount = 0
            self.logger.info('\n\nStarting generation of '
                             'cartesian product of NIST vendors '
                             'to SCCM publishers ... \n'
                             ' *** This can take some time '
                             '- up to 20 min for large prod datasets.\n\n')

            # Build the cartesian product of the two sets of names (CPE,
            # SCCM/WMI) by iterating through the input dataframes

            for (t_cpeix, t_cpeVen_orig, t_cpevend_toks,
                 t_cpeVen) in df_cpeVen.itertuples():

                #   Ignore cpe vendors that are 1 character long (e.g. 'X')

                if (len(t_cpeVen) < 2):
                    self.logger.debug('cpeVen too short - continuing\n')
                    continue

                for (t_arix, t_arPub0_orig, t_arpub_toks,
                     t_arPub0) in df_arPub.itertuples():

                    # quick heuristics:
                    #   a) 1st word of cpe Vendor string has to be in the
                    #            tokenized wmi Publisher0 string somewhere
                    #   b) condensed cpe name has to be shorter than the full
                    #           WMI 'Publisher0' name

                    if len(t_cpeVen) > len(t_arPub0):
                        # self.logger.debug('arPub0 too short - continuing'
                        continue

                    # Look for at least one occurrence of one word in cpeVen
                    #       somewhere in arPub
                    if fz.partial_token_set_ratio(
                            t_cpeVen, t_arPub0, force_ascii=False) < 100:
                        continue

                    # Calculate fuzzy matching statistics as "features" for
                    # the subsequent ML classification

                    lst_dict.append({
                        'publisher0':
                        t_arPub0_orig,
                        'pub0_cln':
                        t_arPub0,
                        'vendor_X':
                        t_cpeVen_orig,
                        'ven_cln':
                        t_cpeVen,
                        'fz_ratio':
                        fz.ratio(t_cpeVen, t_arPub0),
                        'fz_ptl_ratio':
                        fz.partial_ratio(t_cpeVen, t_arPub0),
                        'fz_tok_set_ratio':
                        fz.token_set_ratio(t_cpeVen,
                                           t_arPub0,
                                           force_ascii=False),
                        'fz_ptl_tok_sort_ratio':
                        fz.partial_token_sort_ratio(t_cpeVen,
                                                    t_arPub0,
                                                    force_ascii=False),
                        'fz_uwratio':
                        fz.UWRatio(t_cpeVen, t_arPub0)
                    })
                    mycount = mycount + 1
                    if mycount % 1000 == 0:
                        self.logger.debug(
                            '# entries produced: {0}\n'.format(mycount))

                #     # debug code to shorten loop for testing
                #     if mycount > 1000:
                #         break

                # # debug code to speed thru loops
                # if mycount > 1000:
                #     break

            duration = time() - t0
            self.logger.info('\n\n*** Done in {0} sec\n\n'.format(duration))
            df_match = pd.DataFrame(lst_dict)

            if df_match.empty:
                self.logger.info('Resulting cartesian product is empty.\n\n')

            else:
                self.logger.info('\n\n Vendor match dataframe: \nCounts {0},\n'
                                 'Columns: {1}\n\n'.format(
                                     df_match.shape, df_match.columns))
            return (df_match)
Example #11
# print(t1)
# print(t2)
# print(t3)
# print(t4)
# a = fuzz.partial_ratio(t1,t2)
# b = fuzz.ratio(t1,t2)
# c = fuzz.token_set_ratio(t1,t2)
# d = fuzz.partial_token_set_ratio(t1,t2)
# e = fuzz.QRatio(t1,t2)
# f = fuzz.WRatio(t1,t2)
# g = fuzz.UWRatio(t1,t2)
# print(a,b,c,d,e,f,g)
s1 = filtering(str_list=["방카슈랑스 계약사항 중에서 변경하고 싶은 부분이 있습니다."], noun=False)
s2 = filtering(str_list=["스마트알림 메시지 데이터는 얼마동안 볼 수 있나요?"], noun=False)
s3 = "후 스마트 경우 수신 해외 서비스 외국 알림 신청 메시지 출국"
print(fuzz.token_set_ratio(s3, s2))
print(fuzz.QRatio(s3, s2))
print(fuzz.UWRatio(s3, s2))
print(fuzz.WRatio(s3, s2))
# testlist = [{
#     'category' : 1, 'value' : [1,2]
# },{
#     'category' : 2, 'value' : [1,2,3]
# }]
#
# print(testlist)
#
# print([value['value'].append(4) for value in testlist if value.get('category')==2])
#
# print(testlist)