def Multiply_Divide(train, test, features):
    """Build interaction features for every unordered pair of ``features``.

    Each pair produced by ``combinations(features, 2)`` is passed, together
    with its running index, to ``interaction_features`` (defined elsewhere
    in the project).  The frames returned by one call are fed into the next.

    ``combinations`` yields pairs in input order, for example::

        combinations(['A', 'B', 'C'], 2)  ->  AB AC BC
        combinations(range(4), 3)         ->  012 013 023 123

    Returns:
        ``(train, test, feature_names)`` where ``feature_names`` is the
        flat list of every name reported by ``interaction_features``, in
        call order.
    """
    collected = []
    for pair_index, (left, right) in enumerate(combinations(features, 2)):
        train, test, new_names = interaction_features(
            train, test, left, right, pair_index
        )
        collected.extend(new_names)
    return train, test, collected
# Keep the test ids, then remove the id column from the test frame.
test_id = test['id']
del test['id']

# Column groups, selected by a substring of the column name.
cat_fea = [x for x in list(train) if 'cat' in x]
bin_fea = [x for x in list(train) if 'bin' in x]

# Per-row count of cells equal to -1, stored as float.
# NOTE(review): -1 is presumably the dataset's missing-value marker -- confirm.
train['missing'] = (train == -1).sum(axis=1).astype(float)
test['missing'] = (test == -1).sum(axis=1).astype(float)

# include interactions
# NOTE(review): this call site expects interaction_features to return
# exactly (train, test) -- confirm against its definition.
for e, (x, y) in enumerate(
        combinations([
            'ps_car_13', 'ps_ind_03', 'ps_reg_03',
            'ps_ind_15', 'ps_reg_01', 'ps_ind_01',
        ], 2)):
    train, test = interaction_features(train, test, x, y, e)

# Every column whose name contains neither 'cat' nor 'calc'.
num_features = [c for c in list(train) if ('cat' not in c and 'calc' not in c)]
# NOTE(review): 'missing' presumably already matches the filter above, so
# this append likely adds a second entry.  Kept because code outside this
# fragment may depend on the list as-is.
num_features.append('missing')
inter_fea = [x for x in list(train) if 'inter' in x]

#train['cat_sum'] = train[cat_fea].sum(axis=1)
#test['cat_sum'] = test[cat_fea].sum(axis=1)
#X = train.as_matrix()
#X_test = test.as_matrix()
#print(X.shape, X_test.shape)

#ohe
# Sparse output is the encoder's default.  The explicit `sparse=True` keyword
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed, so the
# default is relied on to stay compatible with old and new releases.
ohe = OneHotEncoder()
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` returns the
# same NumPy array on every pandas version.
train_cat = train[cat_fea].values
# Select the numeric columns in the frame's own column order.
train_num = train[[x for x in list(train) if x in num_features]]
# Keep the training ids, then remove the target and id columns from the frame.
train_id = train['id']
del train['target'], train['id']

# Load the test set and give its id column the same treatment.
test = pd.read_csv("../input/test.csv")
test_id = test['id']
del test['id']

# Column groups, selected by a substring of the column name.
cat_fea = [col for col in train if 'cat' in col]
bin_fea = [col for col in train if 'bin' in col]

# Per-row count of cells equal to -1, stored as float (train first, then test).
for frame in (train, test):
    frame['missing'] = (frame == -1).sum(axis=1).astype(float)

# include interactions: one call per unordered pair of these columns
interaction_columns = [
    'ps_car_13', 'ps_ind_03', 'ps_reg_03',
    'ps_ind_15', 'ps_reg_01', 'ps_ind_01',
]
for e, (x, y) in enumerate(combinations(interaction_columns, 2)):
    train, test = interaction_features(train, test, x, y, e)

# Columns whose name contains neither 'cat' nor 'calc', plus 'missing'.
num_features = [
    col for col in train
    if not ('cat' in col or 'calc' in col)
]
num_features += ['missing']
inter_fea = [col for col in train if 'inter' in col]

#train['cat_sum'] = train[cat_fea].sum(axis=1)
#test['cat_sum'] = test[cat_fea].sum(axis=1)

# Attach precomputed columns stored as pickles in the input directory.
# Only files whose name contains one of the markers below are loaded; each
# pickle is unpacked into a (train values, test values) pair and stored
# under the file's name.
path = "../input/"
num_features_comb = []
wanted_markers = (
    'ps_reg_02___ps_car_07_cat',
    'ps_reg_01___ps_car_13___ps_car_15',
)
for p in os.listdir(path):
    if not any(marker in p for marker in wanted_markers):
        continue
    print(p)
    x, xt = pd.read_pickle(path + p)
    train[p] = x
    test[p] = xt