Esempio n. 1
0
def main():
    """Cross-validate a random forest on the training set and log the scores.

    Reads ``../files/train.csv``, passes the features through ``preprocess``,
    ``extractFeatures`` and ``selectFeatures``, then runs stratified 5-fold
    cross-validation.  Accuracy and log loss are logged for every fold,
    followed by their means.  Progress messages go to stdout.
    """
    logging.info("[Normalized + Feature Selection] Features: Mean, Std")
    print("Reading data...")
    X, Y = utils.read_data("../files/train.csv")
    print("Preprocessing...")
    X = preprocess(X)
    print("Extracting Features...")
    X = extractFeatures(X)
    Y = [int(label) for label in Y]
    X, Y = np.array(X), np.array(Y)
    classMap = sorted(set(Y))
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    logging.info(rf)
    print("Selecting Features...")
    X = selectFeatures(X, Y, rf)
    folds = 5
    # NOTE(review): this is the legacy ``StratifiedKFold(y, n_folds)`` call
    # form that is iterated directly; newer scikit-learn releases expose the
    # splitter differently -- confirm against the installed version.
    stf = cross_validation.StratifiedKFold(Y, folds)
    logging.info("CV Folds: %s", folds)
    accs = []
    loss = []
    print("Testing...")
    for i, (train, test) in enumerate(stf):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        rf.fit(X_train, y_train)
        predicted = rf.predict(X_test)
        probs = rf.predict_proba(X_test)
        # Clamp probabilities away from 0 and 1 before they are scored.
        probs = [[min(max(p, 0.001), 0.999) for p in row] for row in probs]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy(predicted, y_test))
        logging.info("Accuracy(Fold %d): %s", i, accs[-1])
        logging.info("Loss(Fold %d): %s", i, loss[-1])
    logging.info("Mean Accuracy: %s", np.mean(accs))
    logging.info("Mean Loss: %s", np.mean(loss))
Esempio n. 2
0
def main():
    """Fit a random-forest benchmark and write its clamped probabilities.

    Trains on ``../files/train.csv`` (first column of every row dropped),
    predicts class probabilities for ``../files/test.csv``, prints a log-loss
    figure and writes the probabilities, formatted with ``%f``, to
    ``../files/rf_benchmark.csv``.
    """
    training, target = utils.read_data("../files/train.csv")
    training = [row[1:] for row in training]
    target = [float(value) for value in target]
    # The second value returned for the test file is not used.
    test, _ = utils.read_data("../files/test.csv")
    test = [row[1:] for row in test]

    rf = RandomForestClassifier(n_estimators=100, min_samples_split=2)
    rf.fit(training, target)
    predicted_probs = rf.predict_proba(test)
    # Clamp probabilities away from 0 and 1 before they are scored.
    predicted_probs = [[min(max(p, 0.001), 0.999) for p in row]
                       for row in predicted_probs]
    # NOTE(review): the loss is computed against the test *feature* rows, not
    # against labels -- confirm this is what utils.logloss expects here.
    print(utils.logloss(predicted_probs, test))
    predicted_probs = [["%f" % p for p in row] for row in predicted_probs]
    utils.write_delimited_file("../files/rf_benchmark.csv",
                               predicted_probs)
Esempio n. 3
0
def main():
    """Run stratified 5-fold cross-validation of a random forest.

    The training data from ``../files/train.csv`` is preprocessed, reduced to
    extracted features and filtered by ``selectFeatures`` before the folds are
    evaluated.  Per-fold accuracy and log loss, and their means, are written
    to the log; progress messages are printed to stdout.
    """
    logging.info("[Normalized + Feature Selection] Features: Mean, Std")
    print("Reading data...")
    X, Y = utils.read_data("../files/train.csv")
    print("Preprocessing...")
    X = preprocess(X)
    print("Extracting Features...")
    X = extractFeatures(X)
    Y = [int(label) for label in Y]
    X, Y = np.array(X), np.array(Y)
    classMap = sorted(set(Y))
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    logging.info(rf)
    print("Selecting Features...")
    X = selectFeatures(X, Y, rf)
    folds = 5
    # NOTE(review): legacy ``StratifiedKFold(y, n_folds)`` call form, iterated
    # directly -- confirm it matches the installed scikit-learn version.
    stf = cross_validation.StratifiedKFold(Y, folds)
    logging.info("CV Folds: %s", folds)
    accs = []
    loss = []
    print("Testing...")
    for i, (train, test) in enumerate(stf):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        rf.fit(X_train, y_train)
        predicted = rf.predict(X_test)
        probs = rf.predict_proba(X_test)
        # Keep every probability inside [0.001, 0.999] before scoring.
        probs = [[min(max(p, 0.001), 0.999) for p in row] for row in probs]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy(predicted, y_test))
        logging.info("Accuracy(Fold %d): %s", i, accs[-1])
        logging.info("Loss(Fold %d): %s", i, loss[-1])
    logging.info("Mean Accuracy: %s", np.mean(accs))
    logging.info("Mean Loss: %s", np.mean(loss))
Esempio n. 4
0
 def _cost_function(self, xt, wt, y):
     """Return the log loss of the prediction plus L1 and L2 weight penalties.

     The prediction is ``self._get_p(xt, wt)``.  The penalties are scaled by
     ``self.lambda1`` and ``self.lambda2`` and are computed element-wise over
     ``wt``, so the sum broadcasts against the weights.
     """
     prediction = self._get_p(xt, wt)
     data_term = logloss([y], [prediction])
     l1_penalty = self.lambda1 * np.abs(wt)
     l2_penalty = self.lambda2 * (np.array(wt) ** 2)
     return data_term + l1_penalty + l2_penalty
Esempio n. 5
0
def generate_pred_with_validation(all_data,
                                  xgb_param,
                                  xgb_feature,
                                  n_trees,
                                  day_test=31):
    """Train XGBoost on earlier days and print the log loss on ``day_test``.

    Rows whose module-level ``day_values`` entry lies in ``[17, day_test)``
    form the training set; rows where it equals ``day_test`` form the
    validation set.  Labels are taken from the module-level ``cvrt_value``.

    Args:
        all_data: frame indexed as ``all_data.loc[row_mask, columns]``.
        xgb_param: dict of XGBoost parameters; ``eval_metric='logloss'`` is
            appended to it for this run.
        xgb_feature: columns of ``all_data`` used as model features.
        n_trees: value passed to ``xgb.train`` as the number of rounds;
            training may stop earlier (``early_stopping_rounds=50``).
        day_test: day used for validation.

    Raises:
        ValueError: if the training set is empty or its features and labels
            have different row counts.
    """
    train_mask = np.logical_and(day_values >= 17, day_values < day_test)
    valid_mask = day_values == day_test

    # ``DataFrame.ix`` was removed in pandas 1.0; ``.loc`` is the equivalent
    # for a boolean row mask combined with column labels.
    # NOTE(review): assumes xgb_feature holds column labels, not positions.
    xt1 = all_data.loc[train_mask, xgb_feature]
    yt1 = cvrt_value[train_mask]

    xv1 = all_data.loc[valid_mask, xgb_feature]
    yv1 = cvrt_value[valid_mask]

    if xt1.shape[0] <= 0 or xt1.shape[0] != yt1.shape[0]:
        print(xt1.shape, xv1.shape)
        raise ValueError('wrong shape!')

    dtrain = xgb.DMatrix(xt1, label=yt1)
    dvalid = xgb.DMatrix(xv1, label=yv1)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    print(xt1.shape, yt1.shape)
    plst = list(xgb_param.items()) + [('eval_metric', 'logloss')]
    xgb1 = xgb.train(plst,
                     dtrain,
                     n_trees,
                     watchlist,
                     early_stopping_rounds=50)
    # Score against the validation labels already sliced above.
    print('-' * 30, utils.logloss(xgb1.predict(dvalid), yv1))
Esempio n. 6
0
 def _cost_function(self, xt, wt, y):
     """Compute the regularised cost for weights ``wt`` on one sample.

     Adds the log loss of ``self._get_p(xt, wt)`` against ``y`` to an L1 term
     (``self.lambda1 * |wt|``) and an L2 term (``self.lambda2 * wt ** 2``),
     both evaluated element-wise over the weights.
     """
     p_hat = self._get_p(xt, wt)
     loss_term = logloss([y], [p_hat])
     abs_term = self.lambda1 * np.abs(wt)
     sq_term = self.lambda2 * (np.array(wt) ** 2)
     return loss_term + abs_term + sq_term
Esempio n. 7
0
 def _cost_function(self, xt, wt, y):
     """Return one regularised cost value per weight in ``wt``.

     The log loss is evaluated once, on the last element of
     ``self._get_p(xt, wt)``; each weight then contributes its own L1
     (``self.lambda1``) and L2 (``self.lambda2``) penalty on top of it.
     """
     last_p = self._get_p(xt, wt)[-1]
     ll = logloss([y], last_p)
     return [
         ll + self.lambda1 * np.abs(w) + self.lambda2 * w * w
         for w in wt
     ]
Esempio n. 8
0
 def _cost_function(self, xt, wt, y):
     """Build the list of per-weight costs for a single sample.

     Every entry is the shared log loss (computed from the last element of
     ``self._get_p(xt, wt)``) plus that weight's L1 and L2 penalties, scaled
     by ``self.lambda1`` and ``self.lambda2`` respectively.
     """
     final_p = self._get_p(xt, wt)[-1]
     base_loss = logloss([y], final_p)
     return [
         base_loss + self.lambda1 * np.abs(weight) + self.lambda2 * weight * weight
         for weight in wt
     ]
Esempio n. 9
0
def main():
    """Score a constant baseline with stratified 5-fold cross-validation.

    Labels are read from ``../files/train_10.csv``.  For every fold the
    baseline assigns probability 0.001 to each class and always predicts
    class ``1``; the features are never used.  Per-fold and mean accuracy and
    log loss are printed.
    """
    X, Y = utils.read_data("../files/train_10.csv")
    n_target = len(set(Y))
    # A list (not ``map``) so the labels can be traversed more than once on
    # Python 3, where ``map`` returns a one-shot iterator.
    Y = [int(label) for label in Y]
    folds = 5
    # NOTE(review): legacy ``StratifiedKFold(y, n_folds)`` call form, iterated
    # directly -- confirm it matches the installed scikit-learn version.
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    accs = []
    classMap = sorted(set(Y))
    Y = np.array(Y)
    print("Testing...")
    for i, (train, test) in enumerate(stf):
        y_test = Y[test]
        probs = [[0.001] * n_target for _ in range(len(y_test))]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy([1] * len(y_test), y_test))
        print("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        print("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    print("Mean Accuracy: " + str(np.mean(accs)))
    print("Mean Loss: " + str(np.mean(loss)))
Esempio n. 10
0
def main():
    """Evaluate a fixed-output baseline over five stratified folds.

    Uses the labels in ``../files/train_10.csv`` only: each test row gets a
    probability of 0.001 for every class and a predicted label of ``1``.
    Accuracy and log loss are printed per fold, then averaged.
    """
    X, Y = utils.read_data("../files/train_10.csv")
    n_target = len(set(Y))
    # Materialise the labels; on Python 3 ``map`` would be exhausted after
    # its first traversal.
    Y = [int(label) for label in Y]
    folds = 5
    # NOTE(review): legacy ``StratifiedKFold(y, n_folds)`` call form, iterated
    # directly -- confirm it matches the installed scikit-learn version.
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    accs = []
    classMap = sorted(set(Y))
    Y = np.array(Y)
    print("Testing...")
    for i, (train, test) in enumerate(stf):
        y_test = Y[test]
        probs = [[0.001] * n_target for _ in range(len(y_test))]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy([1] * len(y_test), y_test))
        print("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        print("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    print("Mean Accuracy: " + str(np.mean(accs)))
    print("Mean Loss: " + str(np.mean(loss)))
Esempio n. 11
0
def main():
    """Cross-validate a probabilistic SVC with five stratified folds.

    Reads ``../files/train_10.csv``, fits ``svm.SVC(probability=True)`` on
    each training split and prints accuracy and log loss for each fold,
    followed by their means.
    """
    X, Y = utils.read_data("../files/train_10.csv")
    # A list (not ``map``) so the labels survive repeated traversal on
    # Python 3, where ``map`` returns a one-shot iterator.
    Y = [int(label) for label in Y]
    folds = 5
    # NOTE(review): legacy ``StratifiedKFold(y, n_folds)`` call form, iterated
    # directly -- confirm it matches the installed scikit-learn version.
    stf = cross_validation.StratifiedKFold(Y, folds)
    loss = []
    svc = svm.SVC(probability=True)
    accs = []
    classMap = sorted(set(Y))
    X, Y = np.array(X), np.array(Y)
    print("Testing...")
    for i, (train, test) in enumerate(stf):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        svc.fit(X_train, y_train)
        predicted = svc.predict(X_test)
        probs = svc.predict_proba(X_test)
        # Clamp probabilities away from 0 and 1 before they are scored.
        probs = [[min(max(p, 0.001), 0.999) for p in row] for row in probs]
        loss.append(utils.logloss(probs, y_test, classMap))
        accs.append(utils.accuracy(predicted, y_test))
        print("Accuracy(Fold {0}): ".format(i) + str(accs[-1]))
        print("Loss(Fold {0}): ".format(i) + str(loss[-1]))
    print("Mean Accuracy: " + str(np.mean(accs)))
    print("Mean Loss: " + str(np.mean(loss)))
Esempio n. 12
0
 def from_str(cls, s):
     """Construct an instance from the name of a loss function.

     Only ``'crossentropy'`` is recognised; it produces ``cls(fn, s)`` where
     ``fn`` forwards its two arguments to ``logloss``.

     Raises:
         ValueError: for any other value of ``s``.
     """
     if s == 'crossentropy':
         return cls(lambda x, y: logloss(x, y), s)
     raise ValueError('Nope.')
Esempio n. 13
0
            #p1 = calcLeaveOneOut2(df1, vn, 'label', n_ks[vn], 0, 0.25, mean0=pred_prev)
            p1 = calcLeaveOneOut2(df1, vn, 'label', 100, 0, 0.25, mean0=pred_prev)
            pred = pred_prev * p1
            print (day_v, i, vn, "change = ", ((pred - pred_prev)**2).mean())
            pred_prev = pred
        del pred
        gc.collect() 
        pred1 = df1.label.values.mean()
        for vn in vns:
            print ("="*20, "merge", day_v, vn)
            diff1 = mergeLeaveOneOut2(df1, df2, vn)
            pred1 *= diff1
            exp2_dict[vn][days_npa == day_v] = diff1

        pred1 *= df1.label.values.mean() / pred1.mean()
        print ("logloss = ", logloss(pred1, df2.label.values))
    del df1
    del df2
    gc.collect()

# Copy every exp2_dict column into fea_data under an 'exp2_' prefix.
for vn in vns:
    fea_data['exp2_' + vn] = exp2_dict[vn]


# Trick feature: for each key column, call cntDualKey with 'day_hour' paired
# against the previous, current and next hour columns, in that order.
print('to count prev/current/next hour by appID ...')
feature_list = [
    'appID',
    'userID',
    'creativeID',
    'positionID',
    'adID',
    'sitesetID',
    'advertiserID',
]
for feature in feature_list:
    for hour_col in ('day_hour_prev', 'day_hour', 'day_hour_next'):
        cntDualKey(fea_data, feature, None, 'day_hour', hour_col, fill_na=0)