コード例 #1
0
def create_similarity_features6(train, row, id=None):
    """Compute smoothed n-gram hit ratios of the row's title against query texts.

    For each level 1, 2, 3, 4 a pair of collections is fetched from
    ``get_str_for_query6`` (presumably title texts and description texts of
    training rows for the same query at that level -- TODO confirm against
    the helper).  The row's processed title is turned into four n-gram
    collections via ``get_ngrams`` with sizes 2, 4, 6 and 8.  Every n-gram
    collection is then scored against every fetched collection.

    Args:
        train: Passed through unchanged to ``get_str_for_query6``.
        row: Mapping-like record; the keys ``'query'``, ``'product_title'``
            and ``'product_description'`` are read.
        id: Optional identifier forwarded to ``clr_print`` and
            ``get_str_for_query6``.

    Returns:
        A list of 32 numbers (4 n-gram sizes x 8 collections), ordered by
        n-gram size first and, within a size, by level with the first
        element of each pair before the second.  Each value is
        ``(1 + total count of the n-grams in the collection's items)
        / (number of n-grams + 1)``.
    """
    clr_print(id, row['product_title'])

    query = row['query']
    title = row['product_title']
    description = row['product_description']

    # Flattened as [first_1, second_1, first_2, second_2, ...] so the
    # feature order is fixed by level and by position within the pair.
    reference_texts = []
    for level in (1, 2, 3, 4):
        first, second = get_str_for_query6(train, query, title, description, level, id)
        reference_texts.append(first)
        reference_texts.append(second)

    own_title = ' '.join(process_str(title))
    # The description is still processed here, but its result is not used
    # by any of the features computed below.
    _own_description = ' '.join(process_str(description))

    ngram_sets = [get_ngrams(own_title, size) for size in (2, 4, 6, 8)]

    features = []
    for ngrams in ngram_sets:
        for texts in reference_texts:
            # Start at 1 and divide by len + 1: add-one smoothing that also
            # keeps the division safe when there are no n-grams at all.
            hits = sum((text.count(ngram) for ngram in ngrams for text in texts), 1)
            features.append(hits / (len(ngrams) + 1))
    return features
コード例 #2
0
def create_similarity_features7(train, row, id=None):
    """Count tokens shared between the row's product names and query token sets.

    ``get_str_for_query7`` is called once per level 1, 2, 3, 4; each result
    is intersected (``&``) with the set built from
    ``process_str_replace(row['product_names'])``.

    Args:
        train: Passed through unchanged to ``get_str_for_query7``.
        row: Mapping-like record; the keys ``'product_title'`` (only for the
            progress print), ``'query'`` and ``'product_names'`` are read.
            NOTE(review): the features use ``'product_names'`` while the
            print uses ``'product_title'`` -- confirm that column exists.
        id: Optional identifier forwarded to ``clr_print`` and
            ``get_str_for_query7``.

    Returns:
        A 4-tuple with the intersection sizes for levels 1 through 4.
    """
    clr_print(id, row['product_title'])

    query = row['query']
    tokens_by_level = [
        get_str_for_query7(train, query, level, id) for level in (1, 2, 3, 4)
    ]
    own_tokens = set(process_str_replace(row['product_names']))

    return tuple(len(level_tokens & own_tokens) for level_tokens in tokens_by_level)