Ejemplo n.º 1
0
def prepare_distinct(path, out, nlp):
    """Write named-entity count and overlap features for each question pair.

    Reads the CSV at ``path`` row by row, cleans the ``question1`` and
    ``question2`` fields (lower-cased, passed through ``remove_punctuation``),
    runs each through ``nlp`` and writes one line per input row to ``out``.

    Output columns, in order: ``ent_max``, ``ent_min``, ``ent_diff``,
    ``ent_jaccard``.

    Args:
        path: Path of the input CSV; must have ``question1`` and
            ``question2`` columns.
        out: Path of the output CSV; it is overwritten.
        nlp: Callable applied to each cleaned question. Its result must expose
            ``.ents`` whose items have a ``.text`` attribute (presumably a
            spaCy pipeline -- confirm against the caller).
    """
    print(path)
    start = datetime.now()
    header = ','.join(['ent_max', 'ent_min', 'ent_diff', 'ent_jaccard'])
    with open(out, 'w') as outfile:
        outfile.write(header + '\n')
        # Open the input in its own context manager so the handle is closed
        # even when a row fails midway.
        with open(path) as infile:
            for count, row in enumerate(DictReader(infile, delimiter=',')):
                if count % 100000 == 0:
                    print('finished %s' % count)
                q1 = unicode(remove_punctuation(str(row['question1']).lower()))
                q2 = unicode(remove_punctuation(str(row['question2']).lower()))
                q1_ent = [ent.text for ent in nlp(q1).ents]
                q2_ent = [ent.text for ent in nlp(q2).ents]

                q1_len = len(q1_ent)
                q2_len = len(q2_ent)
                ent_max = max(q1_len, q2_len)
                ent_min = min(q1_len, q2_len)
                ent_diff = ent_max - ent_min
                try:
                    ent_jaccard = get_jaccard(q1_ent, q2_ent)
                except Exception:
                    # Best-effort: -1 marks rows where the overlap could not
                    # be computed. Exception (not a bare except) keeps
                    # Ctrl-C and SystemExit working.
                    ent_jaccard = -1

                outfile.write(
                    '%s,%s,%s,%s\n' % (ent_max, ent_min, ent_diff, ent_jaccard))
        end = datetime.now()
    print('times: %s' % (end - start))
def _entropy_sum(qids, df_dict, n_qids):
    """Return the sum of ``-p * log(p)`` over ``qids``, p = df / n_qids.

    Ids missing from ``df_dict`` are given a document frequency of 1.
    """
    total = 0.0
    for q in qids:
        p = df_dict.get(q, 1) / n_qids
        total += -p * log(p)
    return total


def prepare_hash_df(path, out, neighbour_dict, df_dict):
    """Write neighbour-set entropy and overlap features for each row.

    For every row of the CSV at ``path`` the neighbour collections of
    ``question1_hash`` and ``question2_hash`` are looked up in
    ``neighbour_dict`` (empty when the hash is unknown) and one line is
    written to ``out``.

    Output columns, in order: ``max_entropy``, ``min_entropy``, ``jaccard``,
    ``intersection`` (size of the shared neighbour set) and
    ``intersection_entropy``.

    Args:
        path: Path of the input CSV; must have ``question1_hash`` and
            ``question2_hash`` columns.
        out: Path of the output CSV; it is overwritten.
        neighbour_dict: Mapping from question hash (looked up as ``str``) to
            an iterable of neighbour ids. Its size is the normaliser for the
            frequencies.
        df_dict: Mapping from neighbour id to its frequency count; missing
            ids count as 1.
    """
    # float() keeps the df / n_qids division fractional on Python 2.
    n_qids = float(len(neighbour_dict))
    print(path)
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write(
            'max_entropy,min_entropy,jaccard,intersection,intersection_entropy\n'
        )
        # Open the input in its own context manager so the handle is closed.
        with open(path) as infile:
            for count, row in enumerate(DictReader(infile, delimiter=',')):
                if count % 100000 == 0:
                    print('finished %s' % count)
                q1_df = neighbour_dict.get(str(row['question1_hash']), [])
                q2_df = neighbour_dict.get(str(row['question2_hash']), [])

                h_q1 = _entropy_sum(q1_df, df_dict, n_qids)
                h_q2 = _entropy_sum(q2_df, df_dict, n_qids)
                qmax = max(h_q1, h_q2)
                qmin = min(h_q1, h_q2)

                intersection = set(q1_df).intersection(set(q2_df))
                h_intersection = _entropy_sum(intersection, df_dict, n_qids)

                jaccard = get_jaccard(q1_df, q2_df)

                outfile.write(
                    '%s,%s,%s,%s,%s\n' %
                    (qmax, qmin, jaccard, len(intersection), h_intersection))

    # Taken once after the loop: the original set it inside the loop, so an
    # input without data rows crashed on the print below.
    end = datetime.now()
    print('times: %s' % (end - start))
def prepare_ngram_interaction(path, out, ngram='unigram'):
    """Compute n-gram interaction features for each sentence pair.

    Loads the CSV at ``path`` with pandas, whitespace-splits the
    ``sen1_<ngram>`` and ``sen2_<ngram>`` columns of every row, derives the
    fourteen features listed below and saves them to ``out`` without the
    index column.

    Args:
        path: Path of the input CSV.
        out: Path of the output CSV.
        ngram: Suffix selecting the input columns; it is also appended to
            every output column name.
    """
    source = pd.read_csv(path)
    prefixes = (
        'jaccard_',
        'dice_',
        'count_s1_in_s2_',
        'ratio_s1_in_s2_',
        'count_of_sen1_',
        'count_of_sen2_',
        'count_of_unique_sen1_',
        'count_of_unique_sen2_',
        'ratio_of_unique_sen1_',
        'ratio_of_unique_sen2_',
        'count_of_digit_sen1_',
        'count_of_digit_sen2_',
        'ratio_of_digit_sen1_',
        'ratio_of_digit_sen2_',
    )
    features = DataFrame(columns=[prefix + ngram for prefix in prefixes])

    for idx, record in source.iterrows():
        left = str(record['sen1_%s' % ngram]).split()
        right = str(record['sen2_%s' % ngram]).split()

        # Values are listed in the same order as the column prefixes above.
        features.loc[idx] = [
            get_jaccard(left, right),
            get_dice(left, right),
            get_count_s1_in_s2(left, right),
            get_ratio_s1_in_s2(left, right),
            get_count_of_sen(left),
            get_count_of_sen(right),
            get_count_of_unique_sen(left),
            get_count_of_unique_sen(right),
            get_ratio_of_unique_sen(left),
            get_ratio_of_unique_sen(right),
            get_count_of_digit(left),
            get_count_of_digit(right),
            get_ratio_of_digit(left),
            get_ratio_of_digit(right),
        ]

    features.to_csv(out, index=False)
def generate_ngram_inter(path,out):
    """Write token-overlap and per-sentence statistics for each pair.

    Reads the CSV at ``path``, whitespace-splits the ``sen1`` and ``sen2``
    fields of every row and writes one line of 24 features to ``out``: the
    pairwise overlap measures, the per-sentence statistics, and the min/max
    of each per-sentence statistic across the pair.

    Args:
        path: Path of the input CSV; must have ``sen1`` and ``sen2`` columns.
        out: Path of the output CSV; it is overwritten.
    """
    print('generate basic features,data path is',path)
    start = datetime.now()
    columns = [
        'jaccard', 'dice',
        'count_q1_in_q2', 'ratio_q1_in_q2',
        'count_of_sen1', 'count_of_sen2',
        'count_of_unique_sen1', 'count_of_unique_sen2',
        'ratio_of_unique_sen1', 'ratio_of_unique_sen2',
        'count_of_digit_sen1', 'count_of_digit_sen2',
        'ratio_of_digit_sen1', 'ratio_of_digit_sen2',
        'count_of_sen_min', 'count_of_sen_max',
        'count_of_unique_sen_min', 'count_of_unique_sen_max',
        'ratio_of_unique_sen_min', 'ratio_of_unique_sen_max',
        'count_of_digit_sen_min', 'count_of_digit_sen_max',
        'ratio_of_digit_sen_min', 'ratio_of_digit_sen_max',
    ]
    with open(out, 'w') as outfile:
        outfile.write(','.join(columns) + '\n')
        # Open the input in its own context manager so the handle is closed.
        with open(path) as infile:
            for row in DictReader(infile, delimiter=','):
                sen1 = str(row['sen1']).split()
                sen2 = str(row['sen2']).split()

                jaccard = get_jaccard(sen1, sen2)
                dice = get_dice(sen1, sen2)

                count_q1_in_q2 = get_count_q1_in_q2(sen1, sen2)
                ratio_q1_in_q2 = get_ratio_q1_in_q2(sen1, sen2)

                count_of_sen1 = get_count_of_sen(sen1)
                count_of_sen2 = get_count_of_sen(sen2)

                count_of_unique_sen1 = get_count_of_unique_sen(sen1)
                count_of_unique_sen2 = get_count_of_unique_sen(sen2)

                ratio_of_unique_sen1 = get_ratio_of_unique_sen(sen1)
                ratio_of_unique_sen2 = get_ratio_of_unique_sen(sen2)

                count_of_digit_sen1 = get_count_of_digit(sen1)
                count_of_digit_sen2 = get_count_of_digit(sen2)

                ratio_of_digit_sen1 = get_ratio_of_digit(sen1)
                ratio_of_digit_sen2 = get_ratio_of_digit(sen2)

                # Same order as ``columns``: raw values first, then min/max.
                values = (
                    jaccard, dice,
                    count_q1_in_q2, ratio_q1_in_q2,
                    count_of_sen1, count_of_sen2,
                    count_of_unique_sen1, count_of_unique_sen2,
                    ratio_of_unique_sen1, ratio_of_unique_sen2,
                    count_of_digit_sen1, count_of_digit_sen2,
                    ratio_of_digit_sen1, ratio_of_digit_sen2,
                    min(count_of_sen1, count_of_sen2),
                    max(count_of_sen1, count_of_sen2),
                    min(count_of_unique_sen1, count_of_unique_sen2),
                    max(count_of_unique_sen1, count_of_unique_sen2),
                    min(ratio_of_unique_sen1, ratio_of_unique_sen2),
                    max(ratio_of_unique_sen1, ratio_of_unique_sen2),
                    min(count_of_digit_sen1, count_of_digit_sen2),
                    max(count_of_digit_sen1, count_of_digit_sen2),
                    min(ratio_of_digit_sen1, ratio_of_digit_sen2),
                    max(ratio_of_digit_sen1, ratio_of_digit_sen2),
                )
                outfile.write(','.join('%s' % v for v in values) + '\n')
        end = datetime.now()

    print('times:',end-start)
Ejemplo n.º 5
0
def prepare_ngram_interaction(path, out, ngram='unigram'):
    """Write n-gram overlap and min/max statistics for each question pair.

    Reads the CSV at ``path``, whitespace-splits the ``question1_<ngram>``
    and ``question2_<ngram>`` fields of every row and writes one line of 14
    features to ``out``: jaccard, dice, count/ratio of q1 in q2, then the
    min and max across the pair of the n-gram count, unique count, unique
    ratio, digit count and digit ratio.

    NOTE(review): the header labels the last ten columns ``..._question1`` /
    ``..._question2`` although the values written are the pair's min / max.
    The header text is kept unchanged because downstream readers may key on
    these names -- confirm before renaming.

    Args:
        path: Path of the input CSV.
        out: Path of the output CSV; it is overwritten.
        ngram: Suffix selecting which ``question{1,2}_<ngram>`` columns to
            read.
    """
    print(path)
    start = datetime.now()
    q1_column = 'question1_%s' % ngram
    q2_column = 'question2_%s' % ngram
    with open(out, 'w') as outfile:
        outfile.write(
            'jaccard,dice,count_q1_in_q2,ratio_q1_in_q2,count_of_question1,count_of_question2,count_of_unique_question1,count_of_unique_question2,ratio_of_unique_question1,ratio_of_unique_question2,count_of_digit_question1,count_of_digit_question2,ratio_of_digit_question1,ratio_of_digit_question2\n'
        )
        # Open the input in its own context manager so the handle is closed.
        with open(path) as infile:
            for count, row in enumerate(DictReader(infile, delimiter=',')):
                if count % 100000 == 0:
                    print('finished %s' % count)
                q1_ngram = str(row[q1_column]).split()
                q2_ngram = str(row[q2_column]).split()

                jaccard = get_jaccard(q1_ngram, q2_ngram)
                dice = get_dice(q1_ngram, q2_ngram)

                count_q1_in_q2 = get_count_q1_in_q2(q1_ngram, q2_ngram)
                ratio_q1_in_q2 = get_ratio_q1_in_q2(q1_ngram, q2_ngram)

                count_of_question1 = get_count_of_question(q1_ngram)
                count_of_question2 = get_count_of_question(q2_ngram)

                count_of_unique_question1 = get_count_of_unique_question(q1_ngram)
                count_of_unique_question2 = get_count_of_unique_question(q2_ngram)

                ratio_of_unique_question1 = get_ratio_of_unique_question(q1_ngram)
                ratio_of_unique_question2 = get_ratio_of_unique_question(q2_ngram)

                count_of_digit_question1 = get_count_of_digit(q1_ngram)
                count_of_digit_question2 = get_count_of_digit(q2_ngram)

                ratio_of_digit_question1 = get_ratio_of_digit(q1_ngram)
                ratio_of_digit_question2 = get_ratio_of_digit(q2_ngram)

                outfile.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                    jaccard,
                    dice,
                    count_q1_in_q2,
                    ratio_q1_in_q2,
                    min(count_of_question1, count_of_question2),
                    max(count_of_question1, count_of_question2),
                    min(count_of_unique_question1, count_of_unique_question2),
                    max(count_of_unique_question1, count_of_unique_question2),
                    min(ratio_of_unique_question1, ratio_of_unique_question2),
                    max(ratio_of_unique_question1, ratio_of_unique_question2),
                    min(count_of_digit_question1, count_of_digit_question2),
                    max(count_of_digit_question1, count_of_digit_question2),
                    min(ratio_of_digit_question1, ratio_of_digit_question2),
                    max(ratio_of_digit_question1, ratio_of_digit_question2),
                ))
        end = datetime.now()

    print('times: %s' % (end - start))