Beispiel #1
0
## one-hot encode the relevance label:
## median_relevance_<k> is 1 where median_relevance == k and 0 elsewhere,
## for k = 1..config.n_classes
for i in range(config.n_classes):
    label = i + 1
    column = "median_relevance_%d" % label
    dfTrain[column] = 0
    # A single .loc call writes into dfTrain itself; chained indexing
    # (df[col][mask] = 1) may assign to a temporary copy instead.
    dfTrain.loc[dfTrain["median_relevance"] == label, column] = 1

## query ids
## ids start at 1 and follow the sorted order of the distinct queries found
## in train and test combined, so every query of either frame has an id
all_queries = np.unique(np.append(dfTrain['query'].unique(), dfTest['query'].unique()))
qid_dict = {q: i for i, q in enumerate(all_queries, start=1)}

## insert query id
## built as lists: a bare map() is a lazy iterator on Python 3 and is not a
## valid column value there
dfTrain["qid"] = [qid_dict[q] for q in dfTrain["query"]]
dfTest["qid"] = [qid_dict[q] for q in dfTest["query"]]


## clean text
def clean(line):
    """Run clean_text on one row using the configured drop_html_flag."""
    return clean_text(line, drop_html_flag=config.drop_html_flag)


# axis=1 hands each row to clean(); the frames are rebuilt from its results.
dfTrain = dfTrain.apply(clean, axis=1)
dfTest = dfTest.apply(clean, axis=1)

## unique chars (disabled)
#uniq_char = set(''.join(''.join(dfTrain.product_title.tolist()).split())) | set(''.join(''.join(dfTest.product_title.tolist()).split()))
#reject = [x for x in uniq_char if not x.isalnum()]
#reject = '|'.join(reject)

print("Done.")


###############
## Save Data ##
###############
print("Save data...")
Beispiel #2
0

# Load the raw training data; missing cells become empty strings.
dfTrain = pd.read_csv(config.original_train_data_path, encoding='utf8').fillna("")
# number of train samples
num_train = dfTrain.shape[0]
# Function-call form so this line parses on Python 3 as well as Python 2.
print(num_train)
print("Done.")

######################
## Pre-process Data ##
######################
print("Pre-process data...")


## clean text
def clean(line):
    """Run clean_text on one value using the configured drop_html_flag."""
    return clean_text(line, drop_html_flag=config.drop_html_flag)


# Both question columns get the same element-wise cleaning.
for column in ("question1", "question2"):
    dfTrain.loc[:, column] = [clean(text) for text in dfTrain[column]]

print("Done.")


# Dump the cleaned frame for manual inspection.
# NOTE(review): to_csv does not create ./check_csv — confirm it exists.
dfTrain.to_csv('./check_csv/check.train.csv', index=False, encoding='utf-8')

###############
## Save Data ##
###############
print("Save data...")
    # Raw input locations, built once and reused for loading and logging.
    test_path = config.path_raw + config.file_test
    product_path = config.path_raw + config.file_product_descriptions

    df_test = pd.read_csv(test_path, engine='python')
    df_product = pd.read_csv(product_path, engine='python')
    df_attr = pd.read_csv(config.path_raw + config.file_attributes, engine='python')

    # number of train/test samples
    # NOTE(review): df_train is defined above this excerpt — presumably read
    # from config.file_train; confirm.
    num_train, num_test = df_train.shape[0], df_test.shape[0]

    print("Done.")

    ######################
    ## Pre-process Data ##
    ######################
    print("Pre-process data...")

    ## clean text
    clean = nlp_utils.clean_text

    print("Processing " + product_path)
    df_product["product_description"] = df_product["product_description"].apply(clean)

    print("Processing " + config.path_raw + config.file_train)
    for column in ("search_term", "product_title"):
        df_train[column] = df_train[column].apply(clean)

    print("Processing " + test_path)
    for column in ("search_term", "product_title"):
        df_test[column] = df_test[column].apply(clean)

    print("Extracting brand...")
    # Keep only the "MFG Brand Name" attribute rows, reduced to
    # (product_uid, brand), with missing values replaced by ''.
    is_brand = df_attr["name"] == "MFG Brand Name"
    df_brand = (
        df_attr.loc[is_brand, ["product_uid", "value"]]
        .rename(columns={"value": "brand"})
        .fillna('')
    )
    # info() writes its summary to stdout and returns None, so wrapping it
    # in print() would emit a stray "None" line.
    df_brand.info()