def merge_att(prd):
    """Merge the precomputed attribute CSVs (brand, color/power/material,
    size/weight) onto the product table and add tokenized helper columns.

    Parameters
    ----------
    prd : pandas.DataFrame
        Product table with a 'pid' column.

    Returns
    -------
    pandas.DataFrame
        `prd` with the attribute columns merged in (missing attribute
        values filled with '') plus ``*_`` token-list columns.
    """
    # (csv file, columns whose NaNs become '' after the left merge)
    att_files = [
        ("01att_brand.csv", []),
        ("02att_color_power_material.csv",
         ['att_color', 'att_power', 'att_material']),
        ("03att_size_weight.csv", ['att_size', 'att_weight']),
    ]
    for infile, fill_cols in att_files:
        df = pd.read_csv(hd.path_sub + infile)
        prd = pd.merge(prd, df, on='pid', how='left')
        for col in fill_cols:
            # assign back instead of chained `prd.col.fillna(inplace=True)`,
            # which pandas warns may operate on a copy (SettingWithCopy)
            prd[col] = prd[col].fillna('')
    # brand still needs stemming; the att_* columns were stemmed upstream
    prd['p_brand_'] = prd.p_brand.map(lambda x: hd.str_stem(x)).str.split()
    for col in ['att_color', 'att_power', 'att_material',
                'att_size', 'att_weight']:
        prd[col + '_'] = prd[col].str.split()
    return prd
def get_att_sub(base, name):
    """Left-merge one stemmed, tokenized attribute column onto `base`.

    The attribute table is filtered to rows for `name` and to pids present
    in `base`; when a pid has several rows for the attribute, all of its
    token lists are concatenated into a single row before merging.

    Parameters
    ----------
    base : pandas.DataFrame
        Must contain a 'pid' column.
    name : str
        Attribute name to extract from the attributes table.

    Returns
    -------
    pandas.DataFrame
        `base` with an extra `name` column of token lists
        (NaN where the pid has no such attribute).
    """
    pid = base.pid.tolist()
    att = hd.load_att()
    att = att[att.name == name]
    del att['name']
    # flat labels — the original nested-list form ([['pid', name]])
    # silently builds a one-level MultiIndex instead of a plain Index
    att.columns = ['pid', name]
    att = att[att.pid.isin(pid)]
    att[name] = att[name].map(lambda x: hd.str_stem(x)).str.split()
    # every occurrence of a pid after its first one
    dup = att[att.pid.duplicated()]
    if len(dup) > 0:
        for i, j in zip(dup.pid.tolist(), dup.index.tolist()):
            # .at avoids the chained-indexing write (att[name][j] = ...),
            # which can modify a copy instead of `att`
            att.at[j, name] = sum(att[att.pid == i][name].tolist(), [])
        # each duplicated pid's last row now holds the full concatenation
        att.drop_duplicates(subset='pid', keep='last', inplace=True)
    return pd.merge(base, att, on='pid', how='left')
# NOTE(review): extraction artifact — this chunk is collapsed onto one physical
# line (the first '#' comments out everything after it) and its final loop
# ("for j in train.index: if i in train.s_[j]:") is truncated mid-body, so the
# code is left byte-identical. It appears to select OS-specific data paths,
# load and stem train.csv, and begin building a per-search-token
# frequency/relevance table — TODO confirm against the original file.
if os.name != 'nt': 'Mac' path_org = "/Users/Kazuki/home-depot/ORG//" path_sub = "/Users/Kazuki/home-depot/SUB//" else: 'Win' path_org = "D:\COMPE\KAGGLE\home-depot\ORG\\" path_sub = "D:\COMPE\KAGGLE\home-depot\SUB\\" infile = "train.csv" train = pd.read_csv(path_org + infile) #['id', 'product_uid', 'product_title', 'search_term', 'relevance'] train.columns = [['id', 'pid', 't', 's', 'r']] train['t_'] = train.t.map(lambda x: hd.str_stem(x)).str.split() train['s_'] = train.s.map(lambda x: hd.str_stem(x)).str.split() #============================================================================== # #============================================================================== s_table = hd.mk_freq_table(train.s_) s_table = s_table.to_frame() li = [] for i in s_table.index: sum_r = 0 cnt = 0 for j in train.index: if i in train.s_[j]:
# --- pick an "item" word for every search term and every product title ---
# Prefer the dictionary match against 06sort_items_*.csv; fall back to the
# rule-based guess only where the dictionary match came back empty.
items = pd.read_csv(hd.path_sub + '06sort_items_query.csv', index_col=0)['0'].tolist()
train['s_item_myrule'] = train.s_.map(lambda x: hd.get_sitem(x))
train['s_item_list'] = train.s_.map(
    lambda x: hd.get_item_list_in_list(items, x))
# vectorized fallback instead of per-row writes through `.values[i]`,
# which bypass pandas and can silently modify a copy
train['s_item'] = train['s_item_list'].where(train['s_item_list'] != '',
                                             train['s_item_myrule'])

items = pd.read_csv(hd.path_sub + '06sort_items_title.csv', index_col=0)['0'].tolist()
prd['t_item_myrule'] = hd.hdp.pred_item_in_title(prd)
prd['t_item_myrule'] = prd.t_item_myrule.map(lambda x: hd.str_stem(x))
prd['t_item_list'] = prd.t_.map(lambda x: hd.get_item_list_in_list(items, x))
prd['t_item'] = prd['t_item_list'].where(prd['t_item_list'] != '',
                                         prd['t_item_myrule'])

# one row per (query, product) pair with the product features attached
merged = pd.merge(train, prd, on='pid', how='left')
#==============================================================================
#
#==============================================================================
#['id', 'product_uid', 'product_title', 'search_term']
test.columns = [['id', 'pid', 't', 's']]

"""train test"""
# one stacked frame with every (query, product) row from train and test
tt = pd.concat([train, test])
tt.reset_index(drop=True, inplace=True)

"""product_table"""
# one row per product: drop query-specific columns and duplicate pids
prd = pd.concat([train, test])
prd = prd.drop_duplicates(subset='pid')
prd.drop(['id', 's', 'r'], axis=1, inplace=True)

""""============================ SEARCH TERMS FEATURES ============================"""
print('========== SEARCH ==========')
tt['slen'] = tt.s.str.len()                       # raw query length (chars)
tt['sfix'] = tt.s.map(hd.fix_typo_google)         # typo-corrected query
tt['s_'] = tt.sfix.map(hd.str_stem).str.split()   # stemmed query tokens
tt['s_'] = tt.s_.map(hd.fix_typo)
# flag = 1 when the typo-fix pipeline changed the tokenization, else 0
tt['typo'] = 1 - (tt.s.map(hd.str_stem).str.split() == tt.s_) * 1
# tt['s_'] = tt.s_.map(lambda x:del_ng(x))
tt['s_len'] = tt.s_.str.len()                     # token count
del tt['t']
# release the large source frames before the product pass
del test
del train
del desc

""""============================ PRODUCT FEATURES ============================"""
print('========== PRODUCT ==========')
prd = prd.sort_values(by='pid')
"TITLE"
prd.reset_index(drop=True, inplace=True)
# description table: pid -> description text
desc.columns = [['pid', 'd']]

infile = "test.csv"
test = pd.read_csv(path_org + infile)
#['id', 'product_uid', 'product_title', 'search_term']
test.columns = [['id', 'pid', 't', 's']]

"""product_table"""
# one row per product, query-specific columns removed
prd = pd.concat([train, test])
del train
del test
prd = prd.drop_duplicates(subset='pid')
prd.drop(['id', 's', 'r'], axis=1, inplace=True)
prd = prd.sort_values(by='pid')

"TITLE"
# stemmed title tokens
prd.loc[:, 't_'] = prd.t.map(hd.str_stem).str.split()
prd.reset_index(drop=True, inplace=True)

# attach the brand attribute for the pids we actually have
name = 'MFG Brand Name'
pid = list(prd.pid)
att = hd.load_att()
att = att.loc[att.name == name]
del att['name']
att.columns = [['pid', name]]
att = att.loc[att.pid.isin(pid)]
att[name] = att[name].map(hd.str_stem)
prd = pd.merge(prd, att, on='pid', how='left')
# guard against pids that had several brand rows
prd = prd.drop_duplicates(subset='pid')
prd.reset_index(drop=True, inplace=True)
# NOTE(review): extraction artifact — this chunk is collapsed onto one physical
# line and starts mid-loop: the enclosing `for`, and the `color`/`power`/`mate`
# accumulators it references, are not in view, so the code is left
# byte-identical. It appears to finish joining per-product color/power/material
# strings, drop the raw attribute columns, stem att_material, and write the
# result to 02att_color_power_material — TODO confirm against the original file.
if prd['att_material'][i] != -1: mate += prd['att_material'][i] li_color.append(' '.join(color)) li_power.append(' '.join(power)) li_mate.append(' '.join(mate)) prd['att_color'] = li_color prd['att_power'] = li_power prd['att_material'] = li_mate prd.drop(['Color','Color Family','Color/Finish','Finish','Finish Family', 'Mount Type','Fuel Type','Power Type','Bulb Type'],axis=1, inplace=True) prd['att_material'] = prd.att_material.map(lambda x:hd.str_stem(x)) hd.out(prd[['pid','att_color','att_power','att_material']],'02att_color_power_material')
# NOTE(review): extraction artifact — this chunk is collapsed onto one physical
# line, begins mid-docstring (the closing `\"\"\"` of a module header opened
# outside this view) and ends inside `flaten_list` (truncated at `for j in i:`),
# so the code is left byte-identical. Visible work: imports, rebuilding the
# tt/prd tables from cached CSVs with re-stemmed token columns, plus the helper
# `is_contain_w` (membership test) and the start of a list-flattening helper —
# TODO confirm the missing parts against the original file.
@author: Kazuki """ import homedepot as hd #comment out syn_tbl import numpy as np import pandas as pd import hdpath as hdp tt, prd = hd.load_all(onlytrain=True, mk_csv=True) hd.out(tt, 'tt') hd.out(prd, 'prd') po, ps = hdp.load() tt = pd.read_csv(ps + 'tt.csv') tt['s_'] = tt.sfix.map(lambda x: hd.str_stem(x)).str.split() tt['s_'] = tt.s_.map(lambda x: hd.fix_typo(x)) prd = pd.read_csv(ps + 'prd.csv') prd['t_'] = prd.t.map(lambda x: hd.str_stem(x)).str.split() def is_contain_w(li, w): if w in li: return True return False def flaten_list(lili): li = [] for i in lili: for j in i: