# -*- coding: utf-8 -*-
"""
Created on Mon Mar 14 09:52:48 2016

@author: onodera

Build per-row "item" columns for the query table (``train``) and the
product table (``prd``).

For ``train`` the final ``s_item`` column takes the value found by
``hd.get_item_list_in_list`` (``s_item_list``) and falls back to the
rule-based value from ``hd.get_sitem`` (``s_item_myrule``) wherever the
list lookup returned an empty string.
"""
import homedepot as hd  # comment out syn_tbl
import pandas as pd
import numpy as np

train, prd = hd.load_all(onlytrain=True, mk_csv=True)

# ---------------------------------------------------------------------------
# query side: columns derived from train.s_
# ---------------------------------------------------------------------------
items = pd.read_csv(hd.path_sub + '06sort_items_query.csv',
                    index_col=0)['0'].tolist()

train['s_item_myrule'] = train.s_.map(hd.get_sitem)
train['s_item_list'] = train.s_.map(
    lambda x: hd.get_item_list_in_list(items, x))

# Prefer the list-based match; use the rule-based one only where the list
# lookup produced ''.  This is index-aligned, so unlike the former
# `for i in train.index: ... .values[i] = ...` loop it does not assume that
# the index labels are the positions 0..n-1, and it does not depend on
# `.values` returning a writable view of the column.
train['s_item'] = train['s_item_list'].where(
    train['s_item_list'] != '', train['s_item_myrule'])

# ---------------------------------------------------------------------------
# product side: columns derived from prd.t_
# ---------------------------------------------------------------------------
# `items` is deliberately rebound to the title item list from here on.
items = pd.read_csv(hd.path_sub + '06sort_items_title.csv',
                    index_col=0)['0'].tolist()

prd['t_item_myrule'] = hd.hdp.pred_item_in_title(prd)
prd['t_item_myrule'] = prd.t_item_myrule.map(hd.str_stem)
prd['t_item_list'] = prd.t_.map(
    lambda x: hd.get_item_list_in_list(items, x))
prd['t_item'] = ''

# NOTE(review): the visible source stops immediately after the loop header
# `for i in prd.index:` -- the loop body is not available in this chunk, so
# it is not reproduced here rather than guessed.  Until it is restored,
# prd['t_item'] stays ''.  TODO: recover the body from the original script.
"""
Created on Mon Feb 29 13:13:41 2016

@author: onodera

Load the combined table with ``hd.load_all(mod=4)``, collect the ids of the
unlabeled rows (``r`` is NaN) whose ``s_len`` exceeds 4, extract the labels
of the labeled rows, and reduce the merged table to its non-object columns
(excluding ``id`` and ``pid``).
"""
import time

import numpy as np
import pandas as pd

import homedepot as hd

try:
    # Python 3.4+: reload lives in importlib.
    from importlib import reload
except ImportError:
    # Python 2: reload is a builtin, nothing to import.
    pass

reload(hd)

#seed = 1457665531 # fix
seed = int(time.time())
np.random.seed(seed)

tt, prd = hd.load_all(mod=4)

# Sanity check on the number of loaded rows before going any further.
if len(tt) == 240760:
    # One combined mask instead of tt[mask_a][mask_b]: the second mask in the
    # chained form has the length of the full frame, not of the filtered one,
    # which forces pandas to reindex it (and emit a warning).
    test_id = tt.loc[np.isnan(tt.r) & (tt.s_len > 4), 'id'].tolist()
else:
    # ValueError is still an Exception, so broad handlers keep working; the
    # message now says what actually went wrong.
    raise ValueError(
        "unexpected row count from hd.load_all(mod=4): "
        "expected 240760, got %d" % len(tt))

#==============================================================================
# train test
#==============================================================================
# Labels come from the rows that do have a value in `r`.
labels = np.array(tt[~np.isnan(tt.r)].r)

tt = pd.merge(tt, prd, on='pid', how='left')

# Keep every non-object column except the two identifier columns.
col = [x for x in tt.columns
       if tt[x].dtype != 'O' and x not in ['id', 'pid']]
tt = tt[col]

# `col` becomes the list of feature columns; `tt` itself still contains `r`.
col.remove('r')
# NOTE(review): this fragment starts part-way through a script -- `np` and
# `pd` are used below but their imports are not visible in this chunk.
import time

import homedepot as hd
import xgboost as xgb

try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the original module.
    from sklearn.cross_validation import train_test_split

try:
    # Python 3.4+: reload lives in importlib.
    from importlib import reload
except ImportError:
    # Python 2: reload is a builtin, nothing to import.
    pass

reload(hd)

#seed = 1457665531 # fix
seed = int(time.time())
np.random.seed(seed)

# Submission frame, loaded once before the loop, with its 'relevance'
# column dropped in place.
sub = hd.load_sub()
sub.drop(['relevance'], axis=1, inplace=True)

words = [1, 2, 3, 4]

# NOTE(review): the original indentation is lost and the source is cut off
# after the last statement below; everything from here on is assumed to be
# the body of the per-word loop -- confirm against the original script.
for w in words:
    tt, prd = hd.load_all(word=w)

    # ids of the rows without a value in `r`, kept as a one-column frame
    test_id = tt[np.isnan(tt.r)]['id'].to_frame()

    #raise Exception("SUCCESS!!!")

    #==========================================================================
    # train test
    #==========================================================================
    # Labels come from the rows that do have a value in `r`.
    labels = np.array(tt[~np.isnan(tt.r)].r)

    tt = pd.merge(tt, prd, on='pid', how='left')

    # Keep every non-object column except the two identifier columns.
    col = [
        x for x in tt.columns
        if tt[x].dtype != 'O' and x not in ['id', 'pid']
    ]
    tt = tt[col]

    # `col` becomes the list of feature columns; `tt` still contains `r`.
    col.remove('r')

    # Bare string kept from the original; it is a no-op expression statement.
    'train'