def create_similarity_features6(train, row, id=None):
    """Compute n-gram occurrence features for one row's product title.

    For each of four n-gram collections derived from the processed title
    (``get_ngrams`` called with 2, 4, 6 and 8) and each of eight reference
    text groups (the title/description pair returned by
    ``get_str_for_query6`` for selectors 1 through 4), the n-gram
    occurrences are counted across every text in the group.  The count
    starts at 1 and is divided by ``len(ngrams) + 1``, so an empty n-gram
    collection never causes a division by zero.

    Args:
        train: Passed through unchanged to ``get_str_for_query6``.
        row: Mapping-like record; the keys ``'query'``, ``'product_title'``
            and ``'product_description'`` are read.
        id: Passed through to ``clr_print`` and ``get_str_for_query6``.

    Returns:
        A list of 32 values ordered by n-gram collection (outer) and then
        by reference group (title, description for selector 1, then 2, ...).
    """
    # NOTE(review): loop nesting was reconstructed from a whitespace-collapsed
    # source line; the normalise-and-append step is assumed to run once per
    # (n-gram collection, reference group) pair -- confirm against history.
    title = row['product_title']
    clr_print(id, title)
    query = row['query']
    description = row['product_description']

    # Interleaved as [titles_1, descriptions_1, titles_2, descriptions_2, ...].
    reference_groups = []
    for selector in (1, 2, 3, 4):
        titles, descriptions = get_str_for_query6(
            train, query, title, description, selector, id
        )
        reference_groups += [titles, descriptions]

    own_title = ' '.join(process_str(title))
    # The joined description is not used below; the call is kept so that any
    # effect or error raised by process_str on the description is preserved.
    own_description = ' '.join(process_str(description))

    # All four n-gram collections are built before any counting starts.
    ngram_collections = [get_ngrams(own_title, size) for size in (2, 4, 6, 8)]

    features = []
    for ngrams in ngram_collections:
        for group in reference_groups:
            hits = 1 + sum(
                text.count(ngram) for ngram in ngrams for text in group
            )
            features.append(hits / (len(ngrams) + 1))
    return features
def create_similarity_features7(train, row, id=None):
    """Count tokens shared between this row's product names and reference sets.

    ``get_str_for_query7`` is called for selectors 1 through 4 with the row's
    query.  Each result is intersected (``&``) with the set built from
    ``process_str_replace(row['product_names'])`` and the size of every
    intersection is reported.

    Args:
        train: Passed through unchanged to ``get_str_for_query7``.
        row: Mapping-like record; the keys ``'product_title'``, ``'query'``
            and ``'product_names'`` are read.
        id: Passed through to ``clr_print`` and ``get_str_for_query7``.

    Returns:
        A 4-tuple of intersection sizes, one per selector, in order 1..4.
    """
    clr_print(id, row['product_title'])

    # presumably each result is a set of tokens (it must support `&` with a
    # set) -- verify against get_str_for_query7.
    reference_sets = [
        get_str_for_query7(train, row['query'], selector, id)
        for selector in (1, 2, 3, 4)
    ]

    # NOTE(review): this reads 'product_names' while the log line above uses
    # 'product_title' -- confirm that column exists on the incoming rows.
    own_tokens = set(process_str_replace(row['product_names']))

    return tuple(len(reference & own_tokens) for reference in reference_sets)