def generate_word_counting_features(df):
    """Add token-count, unique-token and digit-token features to ``df`` in place.

    For every field in (search_term, title, description) and every n-gram
    size in (unigram, bigram, trigram) the column ``<field>_<gram>`` is read
    and these columns are written:

    * ``count_of_<field>_<gram>``        -- ``len`` of the token sequence
    * ``count_of_unique_<field>_<gram>`` -- ``len`` of its ``set``
    * ``ratio_of_unique_<field>_<gram>`` -- unique count / total count, via
      ``feature_utils.try_divide``

    Per field, two digit features are derived from the unigram column:

    * ``count_of_digit_in_<field>`` -- number of tokens for which
      ``str.isdigit`` is true
    * ``ratio_of_digit_in_<field>`` -- that count / ``count_of_<field>_unigram``

    Finally ``description_missing`` is set to 1 where ``description_unigram``
    is empty and 0 otherwise.

    Args:
        df: table exposing ``apply(..., axis=1)`` and column assignment
            (presumably a pandas DataFrame -- confirm against the caller).
            It must already contain the nine ``<field>_<gram>`` columns.

    Returns:
        None. ``df`` is modified in place.
    """
    print("generate word counting features")
    feat_names = ["search_term", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    def count_digit(tokens):
        # Same accumulation as before: floats are summed, so a non-empty
        # match yields a float and no match yields the int 0.
        return sum(1. for w in tokens if w.isdigit())

    for feat_name in feat_names:
        for gram in grams:
            source_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ratio_col = "ratio_of_unique_%s_%s" % (feat_name, gram)

            # word count
            print("Generating count_of_{0}_{1} feature...".format(feat_name, gram))
            df[count_col] = df.apply(lambda x: len(x[source_col]), axis=1)

            print("Generating count_of_unique_{0}_{1} feature...".format(feat_name, gram))
            df[unique_col] = df.apply(lambda x: len(set(x[source_col])), axis=1)

            print("Generating ratio_of_unique_{0}_{1} feature...".format(feat_name, gram))
            df[ratio_col] = df.apply(
                lambda x: feature_utils.try_divide(x[unique_col], x[count_col]),
                axis=1)

        # digit count (uses the unigram count column written by the loop above)
        unigram_col = feat_name + "_unigram"
        digit_count_col = "count_of_digit_in_%s" % feat_name
        digit_ratio_col = "ratio_of_digit_in_%s" % feat_name
        unigram_count_col = "count_of_%s_unigram" % (feat_name)

        print("Generating count_of_digit_in_{0} feature...".format(feat_name))
        df[digit_count_col] = df.apply(lambda x: count_digit(x[unigram_col]), axis=1)

        print("Generating ratio_of_digit_in_{0} feature...".format(feat_name))
        df[digit_ratio_col] = df.apply(
            lambda x: feature_utils.try_divide(x[digit_count_col], x[unigram_count_col]),
            axis=1)

    # description missing indicator
    # The previous check compared the cell with "" directly; a list of tokens
    # never equals a string, so the flag could not fire for list-valued cells.
    # Testing the length gives the same answer for string cells and the
    # intended answer for token lists.
    print("Generating description_missing feature...")
    df["description_missing"] = df.apply(
        lambda x: int(len(x["description_unigram"]) == 0), axis=1)
def generate_intersect_word_position_features(df):
    """Add position statistics of one field's unigrams relative to another.

    For each ordered pair of distinct fields (target, obs) drawn from
    (search_term, title, description), ``get_position_list`` is called per
    row with the target and obs unigram columns. The resulting sequence is
    reduced with NumPy's min, mean, median, max and std into the columns
    ``pos_of_<obs>_unigram_in_<target>_<stat>``.

    Each of those columns is then divided by ``count_of_<obs>_unigram``
    through ``feature_utils.try_divide`` and stored as
    ``normalized_pos_of_<obs>_unigram_in_<target>_<stat>``.

    Args:
        df: table exposing ``apply(..., axis=1)`` and column assignment
            (presumably a pandas DataFrame -- confirm against the caller).
            It must already hold the ``<field>_unigram`` and
            ``count_of_<field>_unigram`` columns.

    Returns:
        None. ``df`` is modified in place.
    """
    print("generate intersect word position features")
    fields = ["search_term", "title", "description"]
    # Reducers listed in the order their columns are created.
    reducers = [
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    ]

    for gram in ["unigram"]:
        for target_name in fields:
            for obs_name in fields:
                if target_name == obs_name:
                    continue

                target_col = target_name + "_" + gram
                obs_col = obs_name + "_" + gram
                positions = df.apply(
                    lambda row: get_position_list(row[target_col], obs=row[obs_col]),
                    axis=1)

                stem = "pos_of_%s_%s_in_%s" % (obs_name, gram, target_name)

                # stats feat on pos
                for suffix, reducer in reducers:
                    raw_col = stem + "_" + suffix
                    print("Generating " + raw_col)
                    # The lambda runs immediately, so it sees this iteration's reducer.
                    df[raw_col] = positions.apply(lambda p: reducer(p))

                # stats feat on normalized_pos
                length_col = "count_of_%s_%s" % (obs_name, gram)
                for suffix, _ in reducers:
                    raw_col = stem + "_" + suffix
                    norm_col = "normalized_" + raw_col
                    print("Generating " + norm_col)
                    df[norm_col] = df.apply(
                        lambda row: feature_utils.try_divide(row[raw_col], row[length_col]),
                        axis=1)
def generate_intersect_word_count(df):
    """Add n-gram overlap counts and ratios between the text fields, in place.

    For every n-gram size in (unigram, bigram, trigram) and every ordered
    pair of distinct fields (obs, target) from (search_term, title,
    description):

    * ``count_of_<obs>_<gram>_in_<target>`` -- number of tokens of
      ``<obs>_<gram>`` (duplicates counted) that occur in ``<target>_<gram>``
    * ``ratio_of_<obs>_<gram>_in_<target>`` -- that count divided by
      ``count_of_<obs>_<gram>`` via ``feature_utils.try_divide``

    Per n-gram size, four further ratios are built from those columns:

    * ``title_<gram>_in_search_term_div_search_term_<gram>``
    * ``title_<gram>_in_search_term_div_search_term_<gram>_in_title``
    * ``description_<gram>_in_search_term_div_search_term_<gram>``
    * ``description_<gram>_in_search_term_div_search_term_<gram>_in_description``

    NOTE(review): this file defines another function with this exact name
    further down. At import time the later definition rebinds the name, so
    this implementation is not reachable through it. Confirm which one
    callers expect before renaming either.

    Args:
        df: table exposing ``apply(..., axis=1)`` and column assignment
            (presumably a pandas DataFrame -- confirm against the caller).
            It must already hold the ``<field>_<gram>`` and
            ``count_of_<field>_<gram>`` columns.

    Returns:
        None. ``df`` is modified in place.
    """
    print("generate intersect word counting features")
    feat_names = ["search_term", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]

    def shared_token_count(obs_tokens, target_tokens):
        # Build the lookup set once per row instead of once per token.
        lookup = set(target_tokens)
        return sum(1. for w in obs_tokens if w in lookup)

    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name == obs_name:
                    continue

                obs_col = obs_name + "_" + gram
                target_col = target_name + "_" + gram
                count_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                ratio_col = "ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                obs_count_col = "count_of_%s_%s" % (obs_name, gram)

                ## query
                print("Generating count_of_{0}_{1}_in_{2} feature...".format(obs_name, gram, target_name))
                # list(...) keeps the original positional (index-agnostic) assignment.
                df[count_col] = list(
                    df.apply(lambda x: shared_token_count(x[obs_col], x[target_col]), axis=1))

                print("Generating ratio_of_{0}_{1}_in_{2} feature...".format(obs_name, gram, target_name))
                df[ratio_col] = df.apply(
                    lambda x: feature_utils.try_divide(x[count_col], x[obs_count_col]),
                    axis=1)

        ## some other feat
        # (output column, numerator column, denominator column)
        derived = [
            ("title_%s_in_search_term_div_search_term_%s" % (gram, gram),
             "count_of_title_%s_in_search_term" % gram,
             "count_of_search_term_%s" % gram),
            ("title_%s_in_search_term_div_search_term_%s_in_title" % (gram, gram),
             "count_of_title_%s_in_search_term" % gram,
             "count_of_search_term_%s_in_title" % gram),
            ("description_%s_in_search_term_div_search_term_%s" % (gram, gram),
             "count_of_description_%s_in_search_term" % gram,
             "count_of_search_term_%s" % gram),
            ("description_%s_in_search_term_div_search_term_%s_in_description" % (gram, gram),
             "count_of_description_%s_in_search_term" % gram,
             "count_of_search_term_%s_in_description" % gram),
        ]
        for out_col, numerator_col, denominator_col in derived:
            print("Generating %s feature..." % out_col)
            df[out_col] = df.apply(
                lambda x: feature_utils.try_divide(x[numerator_col], x[denominator_col]),
                axis=1)
def generate_intersect_word_count(df):
    """Add brand token counts and search-term/brand overlap features, in place.

    For every n-gram size in (unigram, bigram, trigram):

    * ``count_of_brand_<gram>`` -- ``len`` of ``brand_<gram>``
    * ``count_of_search_term_<gram>_in_brand_<gram>`` -- number of tokens of
      ``search_term_<gram>`` (duplicates counted) that occur in
      ``brand_<gram>``
    * ``ratio_of_search_term_<gram>_in_brand_<gram>`` -- that count divided
      by ``count_of_brand_<gram>`` via ``feature_utils.try_divide``

    The overlap count used to test membership in ``title_<gram>``, which
    made the "in_brand" column a copy of the search-term/title overlap. It
    now looks tokens up in ``brand_<gram>`` as the column name states, and
    the progress message names the column that is actually written.

    NOTE(review): an earlier function in this file has the same name, and
    this definition replaces it at import time. The name is left unchanged
    here so existing callers keep working; consider giving the two functions
    distinct names.
    NOTE(review): the ratio divides by the brand token count, whereas the
    other ``ratio_of_<obs>_..._in_<target>`` features in this file divide by
    the observed field's count -- confirm this is intended.

    Args:
        df: table exposing ``apply(..., axis=1)`` and column assignment
            (presumably a pandas DataFrame -- confirm against the caller).
            It must already hold the ``brand_<gram>`` and
            ``search_term_<gram>`` columns.

    Returns:
        None. ``df`` is modified in place.
    """
    print("generate intersect word counting features")
    grams = ["unigram", "bigram", "trigram"]

    def shared_token_count(obs_tokens, target_tokens):
        # Build the lookup set once per row instead of once per token.
        lookup = set(target_tokens)
        return sum(1. for w in obs_tokens if w in lookup)

    for gram in grams:
        brand_col = "brand_" + gram
        query_col = "search_term_" + gram
        brand_count_col = "count_of_brand_%s" % (gram)
        overlap_col = "count_of_search_term_%s_in_brand_%s" % (gram, gram)
        ratio_col = "ratio_of_search_term_%s_in_brand_%s" % (gram, gram)

        # word count
        print("Generating count_of_brand_{0} feature...".format(gram))
        df[brand_count_col] = df.apply(lambda x: len(x[brand_col]), axis=1)

        # search term
        print("Generating count_of_search_term_{0}_in_brand_{1} feature...".format(gram, gram))
        # list(...) keeps the original positional (index-agnostic) assignment.
        df[overlap_col] = list(
            df.apply(lambda x: shared_token_count(x[query_col], x[brand_col]), axis=1))

        print("Generating ratio_of_search_term_{0}_in_brand_{1} feature...".format(gram, gram))
        df[ratio_col] = df.apply(
            lambda x: feature_utils.try_divide(x[overlap_col], x[brand_count_col]),
            axis=1)