## one-hot encode the label: one 0/1 indicator column per class value,
## named median_relevance_1 .. median_relevance_<config.n_classes>
for i in range(config.n_classes):
    dfTrain["median_relevance_%d" % (i+1)] = 0
    # Use one .loc assignment rather than chained indexing
    # (dfTrain[col][mask] = 1): the chained form may write to a temporary
    # copy and leave the column all zeros.
    dfTrain.loc[dfTrain["median_relevance"] == (i+1), "median_relevance_%d" % (i+1)] = 1

## query ids
# Number the distinct queries starting from 1. The ids are built over the
# union of train and test queries so that every query looked up below is
# present in the mapping. An earlier variant numbered training queries only:
#for i,q in enumerate(np.unique(dfTrain["query"]), start=1):
qid_dict = dict()
for i, q in enumerate(
        np.unique(np.append(dfTrain['query'].unique(), dfTest['query'].unique())),
        start=1):
    qid_dict[q] = i

## insert query id
# Materialise the ids as lists: on Python 3 map() returns a lazy iterator,
# not a per-row list. A query missing from qid_dict still raises KeyError.
dfTrain["qid"] = [qid_dict[q] for q in dfTrain["query"]]
dfTest["qid"] = [qid_dict[q] for q in dfTest["query"]]


## clean text
def clean(line):
    """Pass one item through clean_text using config.drop_html_flag."""
    return clean_text(line, drop_html_flag=config.drop_html_flag)


# axis=1 makes apply() call clean() once per row of the frame.
dfTrain = dfTrain.apply(clean, axis=1)
dfTest = dfTest.apply(clean, axis=1)

## unique chars (disabled)
#uniq_char = set(''.join(''.join(dfTrain.product_title.tolist()).split())) | set(''.join(''.join(dfTest.product_title.tolist()).split()))
#reject = [x for x in uniq_char if not x.isalnum()]
#reject = '|'.join(reject)

print("Done.")


###############
## Save Data ##
###############
print("Save data...")
# Load the original training data; missing cells become empty strings so the
# text cleaning below always receives a value.
dfTrain = pd.read_csv(config.original_train_data_path, encoding='utf8').fillna("")
# number of train samples
num_train = dfTrain.shape[0]
# print() call form: the Python 2 statement `print num_train` is a
# SyntaxError on Python 3; this form prints the same value on both.
print(num_train)
print("Done.")


######################
## Pre-process Data ##
######################
print("Pre-process data...")


## clean text
def clean(line):
    """Pass one text value through clean_text using config.drop_html_flag."""
    return clean_text(line, drop_html_flag=config.drop_html_flag)


# Clean both question columns value by value, overwriting them in place.
dfTrain.loc[:, "question1"] = list(map(clean, dfTrain["question1"]))
dfTrain.loc[:, "question2"] = list(map(clean, dfTrain["question2"]))
print("Done.")

# Dump the cleaned training frame for manual inspection.
# NOTE(review): the ./check_csv directory is not created here — presumably
# it must already exist; confirm.
dfTrain.to_csv('./check_csv/check.train.csv', index=False, encoding='utf-8')


###############
## Save Data ##
###############
print("Save data...")
# Load the remaining raw tables (df_train is read before this point).
df_test = pd.read_csv(config.path_raw + config.file_test, engine='python')
df_product = pd.read_csv(config.path_raw + config.file_product_descriptions, engine='python')
df_attr = pd.read_csv(config.path_raw + config.file_attributes, engine='python')
# number of train/test samples
num_train, num_test = df_train.shape[0], df_test.shape[0]
print("Done.")


######################
## Pre-process Data ##
######################
print("Pre-process data...")


## clean text
def clean(line):
    """Pass one text value through nlp_utils.clean_text."""
    return nlp_utils.clean_text(line)


# Columns are addressed with brackets rather than attribute access, so the
# assignments cannot be mistaken for setting a plain attribute on the frame.
print("Processing " + config.path_raw + config.file_product_descriptions)
df_product["product_description"] = df_product["product_description"].apply(clean)

print("Processing " + config.path_raw + config.file_train)
df_train["search_term"] = df_train["search_term"].apply(clean)
df_train["product_title"] = df_train["product_title"].apply(clean)

print("Processing " + config.path_raw + config.file_test)
df_test["search_term"] = df_test["search_term"].apply(clean)
df_test["product_title"] = df_test["product_title"].apply(clean)

print("Extracting brand...")
# Keep only the "MFG Brand Name" attribute rows, reduced to
# (product_uid, brand); missing brand values become empty strings.
df_brand = (
    df_attr.loc[df_attr["name"] == "MFG Brand Name", ["product_uid", "value"]]
    .rename(columns={"value": "brand"})
    .fillna('')
)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df_brand.info()