def main():
    """Cross-validate a RandomForest on preprocessed, feature-selected data.

    Reads ../files/train.csv, preprocesses it, extracts and selects
    features, then runs stratified 5-fold CV, logging per-fold and mean
    accuracy and logloss.
    """
    logging.info("[Normalized + Feature Selection] Features: Mean, Std")
    print("Reading data...")  # parenthesized: same output on Py2, valid on Py3
    X, Y = utils.read_data("../files/train.csv")
    print("Preprocessing...")
    X = preprocess(X)
    print("Extracting Features...")
    X = extractFeatures(X)
    Y = [int(x) for x in Y]
    X, Y = np.array(X), np.array(Y)
    # sorted class labels; passed to utils.logloss to map prob columns to classes
    classMap = sorted(set(Y))
    accs = []
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    logging.info(rf)
    print("Selecting Features...")
    X = selectFeatures(X, Y, rf)
    folds = 5
    stf = cross_validation.StratifiedKFold(Y, folds)
    logging.info("CV Folds: " + str(folds))
    loss = []
    print("Testing...")
    for i, (train, test) in enumerate(stf):
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        rf.fit(X_train, y_train)
        predicted = rf.predict(X_test)
        probs = rf.predict_proba(X_test)
        # clip probabilities away from 0/1 so logloss stays finite
        probs = [[min(max(x, 0.001), 0.999) for x in y] for y in probs]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy(predicted, y_test))
        # [-1] instead of [len(xs) - 1]
        logging.info("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        logging.info("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    logging.info("Mean Accuracy: " + str(np.mean(accs)))
    logging.info("Mean Loss: " + str(np.mean(loss)))
def main():
    """Train a RandomForest benchmark and write test-set probabilities.

    Reads the train/test CSVs (dropping the first column of each row),
    fits a 100-tree forest, clips predicted probabilities into
    [0.001, 0.999] and writes them to ../files/rf_benchmark.csv.
    """
    training, target = utils.read_data("../files/train.csv")
    training = [x[1:] for x in training]  # drop leading (id) column
    target = [float(x) for x in target]
    test, throwaway = utils.read_data("../files/test.csv")
    test = [x[1:] for x in test]
    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(training, target)
    predicted_probs = rf.predict_proba(test)
    # clip away from 0/1 so downstream logloss scoring stays finite
    predicted_probs = [[min(max(x, 0.001), 0.999) for x in y]
                       for y in predicted_probs]
    # NOTE(review): the original called utils.logloss(predicted_probs, test),
    # scoring predictions against the test FEATURE rows — the test set has no
    # labels here, so that call was meaningless and has been removed.
    predicted_probs = [["%f" % x for x in y] for y in predicted_probs]
    utils.write_delimited_file("../files/rf_benchmark.csv", predicted_probs)
def _cost_function(self, xt, wt, y):
    """Regularized cost for one sample.

    Combines the logloss of the prediction from self._get_p(xt, wt)
    against label y with an L1 term (lambda1 * |w|) and an L2 term
    (lambda2 * w**2) over the weights.
    """
    prob = self._get_p(xt, wt)
    data_term = logloss([y], [prob])
    reg_l1 = self.lambda1 * np.abs(wt)
    reg_l2 = self.lambda2 * np.square(np.asarray(wt))
    return data_term + reg_l1 + reg_l2
def generate_pred_with_validation(all_data, xgb_param, xgb_feature, n_trees, day_test=31):
    """Train an XGBoost model on days [17, day_test) and validate on day_test.

    Relies on module-level `day_values` / `cvrt_value` arrays aligned row-wise
    with `all_data`. Prints shapes and the validation logloss; training uses
    early stopping (50 rounds) on the validation set.

    Args:
        all_data: DataFrame holding the feature columns.
        xgb_param: dict of xgboost parameters.
        xgb_feature: list of feature column names.
        n_trees: maximum number of boosting rounds.
        day_test: hold-out validation day (default 31).

    Raises:
        ValueError: when the training slice is empty or misaligned.
    """
    train_mask = np.logical_and(day_values >= 17, day_values < day_test)
    valid_mask = day_values == day_test
    # .loc replaces DataFrame.ix, deprecated in pandas 0.20 and removed in 1.0;
    # for a boolean mask + column list the two are equivalent.
    xt1 = all_data.loc[train_mask, xgb_feature]
    yt1 = cvrt_value[train_mask]
    xv1 = all_data.loc[valid_mask, xgb_feature]
    yv1 = cvrt_value[valid_mask]
    if xt1.shape[0] <= 0 or xt1.shape[0] != yt1.shape[0]:
        print(xt1.shape, xv1.shape)
        raise ValueError('wrong shape!')
    dtrain = xgb.DMatrix(xt1, label=yt1)
    dvalid = xgb.DMatrix(xv1, label=yv1)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    print(xt1.shape, yt1.shape)
    plst = list(xgb_param.items()) + [('eval_metric', 'logloss')]
    xgb1 = xgb.train(plst, dtrain, n_trees, watchlist, early_stopping_rounds=50)
    # reuse yv1 rather than recomputing cvrt_value[valid_mask]
    print('-' * 30, utils.logloss(xgb1.predict(dvalid), yv1))
def _cost_function(self, xt, wt, y):
    """Return the regularized cost for one sample.

    Combines the logloss of the prediction from self._get_p(xt, wt)
    against label y with an L1 term (lambda1 * |w|) and an L2 term
    (lambda2 * w**2) over the weights.
    """
    pt = self._get_p(xt, wt)
    ll = logloss([y], [pt])
    # elementwise penalties over the weight vector
    l1 = self.lambda1 * np.abs(wt)
    l2 = self.lambda2 * (np.array(wt) ** 2)
    J = ll + l1 + l2
    return J
def _cost_function(self, xt, wt, y):
    """Per-weight regularized cost.

    Uses the last element of self._get_p(xt, wt) as the prediction, and
    returns a list with one entry per weight in wt: the shared logloss
    plus that weight's L1 (lambda1 * |w|) and L2 (lambda2 * w*w) penalty.
    """
    prob = self._get_p(xt, wt)[-1]
    data_loss = logloss([y], prob)
    return [data_loss + self.lambda1 * np.abs(w) + self.lambda2 * w * w
            for w in wt]
def main():
    """Dummy-baseline CV on train_10.csv.

    Scores a degenerate model that predicts probability 0.001 for every
    class and label 1 for every sample, over stratified 5-fold CV.
    """
    X, Y = utils.read_data("../files/train_10.csv")
    n_target = len(set(Y))
    # list() so len()/indexing also work on Py3, where map() is lazy
    Y = list(map(int, Y))
    folds = 5
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    accs = []
    # sorted class labels; passed to utils.logloss to map prob columns to classes
    classMap = sorted(set(Y))
    X, Y = np.array(X), np.array(Y)
    print("Testing...")  # parenthesized: same output on Py2, valid on Py3
    for i, (train, test) in enumerate(stf):
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        # constant dummy probabilities for every test sample
        probs = [[0.001 for x in range(n_target)] for y in range(len(y_test))]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy([1] * len(y_test), y_test))
        print("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        print("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    print("Mean Accuracy: " + str(np.mean(accs)))
    print("Mean Loss: " + str(np.mean(loss)))
def main():
    """Dummy-baseline CV on train_10.csv.

    Scores a degenerate model that predicts probability 0.001 for every
    class and label 1 for every sample, over stratified 5-fold CV.
    """
    X, Y = utils.read_data("../files/train_10.csv")
    n_target = len(set(Y))
    # list() so len()/indexing also work on Py3, where map() is lazy
    Y = list(map(int, Y))
    folds = 5
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    accs = []
    # sorted class labels; passed to utils.logloss to map prob columns to classes
    classMap = sorted(set(Y))
    X, Y = np.array(X), np.array(Y)
    print("Testing...")  # parenthesized: same output on Py2, valid on Py3
    for i, (train, test) in enumerate(stf):
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        # constant dummy probabilities for every test sample
        probs = [[0.001 for x in range(n_target)] for y in range(len(y_test))]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy([1] * len(y_test), y_test))
        print("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        print("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    print("Mean Accuracy: " + str(np.mean(accs)))
    print("Mean Loss: " + str(np.mean(loss)))
def main():
    """Stratified 5-fold CV of an SVC (with probabilities) on train_10.csv."""
    X, Y = utils.read_data("../files/train_10.csv")
    # list() so StratifiedKFold/len() also work on Py3, where map() is lazy
    Y = list(map(int, Y))
    folds = 5
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    svc = svm.SVC(probability=True)
    accs = []
    # sorted class labels; passed to utils.logloss to map prob columns to classes
    classMap = sorted(set(Y))
    X, Y = np.array(X), np.array(Y)
    print("Testing...")  # parenthesized: same output on Py2, valid on Py3
    for i, (train, test) in enumerate(stf):
        X_train, X_test = X[train], X[test]
        y_train, y_test = Y[train], Y[test]
        svc.fit(X_train, y_train)
        predicted = svc.predict(X_test)
        probs = svc.predict_proba(X_test)
        # clip probabilities away from 0/1 so logloss stays finite
        probs = [[min(max(x, 0.001), 0.999) for x in y] for y in probs]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy(predicted, y_test))
        print("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        print("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    print("Mean Accuracy: " + str(np.mean(accs)))
    print("Mean Loss: " + str(np.mean(loss)))
def from_str(cls, s):
    """Build an instance from a loss name; only 'crossentropy' is supported.

    Raises:
        ValueError: for any other name.
    """
    if s != 'crossentropy':
        raise ValueError('Nope.')
    # the lambda defers the logloss name lookup until the loss is evaluated
    return cls(lambda y_true, y_pred: logloss(y_true, y_pred), s)
#p1 = calcLeaveOneOut2(df1, vn, 'label', n_ks[vn], 0, 0.25, mean0=pred_prev) p1 = calcLeaveOneOut2(df1, vn, 'label', 100, 0, 0.25, mean0=pred_prev) pred = pred_prev * p1 print (day_v, i, vn, "change = ", ((pred - pred_prev)**2).mean()) pred_prev = pred del pred gc.collect() pred1 = df1.label.values.mean() for vn in vns: print ("="*20, "merge", day_v, vn) diff1 = mergeLeaveOneOut2(df1, df2, vn) pred1 *= diff1 exp2_dict[vn][days_npa == day_v] = diff1 pred1 *= df1.label.values.mean() / pred1.mean() print ("logloss = ", logloss(pred1, df2.label.values)) del df1 del df2 gc.collect() for vn in vns: fea_data['exp2_'+vn] = exp2_dict[vn] #trick feature print ('to count prev/current/next hour by appID ...') feature_list = ['appID','userID','creativeID','positionID','adID','sitesetID','advertiserID'] for feature in feature_list: cntDualKey(fea_data,feature,None,'day_hour','day_hour_prev',fill_na=0) cntDualKey(fea_data,feature,None,'day_hour','day_hour',fill_na=0) cntDualKey(fea_data,feature,None,'day_hour','day_hour_next',fill_na=0)