Ejemplo n.º 1
0
 def test_logLoss(self):
     """Check ``metrics.log_loss`` against a table of known values.

     The cases are: predictions identical to the labels (loss 0), one
     fully confident wrong prediction (infinite loss), a mixed set of
     probabilities, and scalar inputs.
     """
     # Each row is (first argument, second argument, expected loss).
     cases = (
         ([1, 1, 0, 0], [1, 1, 0, 0], 0),
         ([1, 1, 0, 0], [1, 1, 1, 0], np.inf),
         ([1, 1, 1, 0, 0, 0],
          [0.5, 0.1, 0.01, 0.9, 0.75, 0.001],
          1.881797068998267),
         (1, 0.5, -np.log(0.5)),
     )
     for actual, predicted, expected in cases:
         self.assertAlmostEqual(metrics.log_loss(actual, predicted),
                                expected)
Ejemplo n.º 2
0
    def fit(self, X, y, cv=None, **fit_params):
        """Fit the calibration coefficients, then the wrapped classifier.

        The rows of ``X`` and ``y`` are shuffled together, then for every
        ``(train_index, test_index)`` pair in ``cv`` the classifier is fit on
        the training part and its column-1 ``predict_proba`` output on the
        test part is collected.  ``fit_platt_logreg`` is run on the pooled
        scores and labels to obtain ``self.a`` and ``self.b``.  Finally the
        classifier is refit on all of the (shuffled) data.

        Parameters
        ----------
        X : array supporting ``X[indices, :]`` row indexing and ``.shape``
        y : array supporting integer-array indexing and ``.size``
        cv : iterable of ``(train_index, test_index)`` pairs, optional
            Defaults to ``KFold(y.size, k=5)``.  The indices are applied to
            the shuffled data.
        **fit_params
            Forwarded to ``self._set_params``.

        Returns
        -------
        self
        """
        self._set_params(**fit_params)

        # Shuffle X and y with one shared permutation so rows stay aligned.
        indices = np.arange(X.shape[0])
        np.random.shuffle(indices)
        X = X[indices, :]
        y = y[indices]
        if cv is None:
            cv = KFold(y.size, k=5)

        clf = self.classifier
        score_list = []
        y_list = []
        for train_index, test_index in cv:
            # Same text as the former ``print a, b`` statement, but valid on
            # both Python 2 and Python 3.
            print("{0} {1}".format(train_index.shape, test_index.shape))
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            # Keep the scores as a column vector, one row per test sample.
            score = clf.predict_proba(X_test)[:, 1].reshape(-1, 1)
            score_list.append(score)
            y_list.append(y_test)

        yy = np.concatenate(y_list)
        scores = np.concatenate(score_list)

        self.a, self.b = fit_platt_logreg(scores, yy)
        # Sigmoid of the calibrated score.  The previous expression
        # ``1./1. + np.exp(...)`` evaluated to ``1 + exp(-z)`` because of
        # operator precedence, so the reported loss was meaningless.
        calibrated = 1. / (1. + np.exp(-(self.a * scores + self.b)))
        print("Optimistic Log-loss: {0:f}".format(log_loss(yy, calibrated)))
        self.classifier.fit(X, y)
        return self
Ejemplo n.º 3
0
def create_prediction_file(data_version, fold=99, is_test=False):
    """Pair the raw VW predictions with their row IDs and save them as CSV.

    The raw prediction file ``Predictions/tmp/temp_prediction.txt`` (under
    ``parent_dir``) is read line by line; the first space-separated token of
    each line is taken as the prediction.  The predictions are matched by
    position with the rows of the corresponding data file and written to the
    output CSV under a ``VW_P<data_version>`` column.  The raw prediction
    file is deleted at the end.

    Parameters
    ----------
    data_version
        Data set version; its ``repr`` is embedded in directory, file and
        column names.
    fold
        Fold identifier (only used when ``is_test`` is false); its ``repr``
        is embedded in the file names and it fills the ``Fold`` column.
    is_test
        When true, read ``P<v>/P<v>_test.csv`` and write ``ID`` plus the
        prediction column to ``Predictions/tests/``.  Otherwise read
        ``P<v>/P<v>_Fold_<fold>_valid.csv``, write ``ID``, ``target``,
        ``Fold`` and the prediction column to
        ``Predictions/fold_predictions/`` and print the fold's log loss.

    Raises
    ------
    AssertionError
        If the data file and the prediction file have different row counts.
    """
    version_tag = "P" + repr(data_version)
    model_version = "VW_" + version_tag

    if is_test:
        # Test predictions go straight to the test predictions directory.
        base_name = version_tag + "_test.csv"
        data_file = os.path.join(parent_dir, version_tag, base_name)
        out_file = os.path.join(parent_dir, "Predictions", "tests", base_name)
        id_columns = ["ID"]
    else:
        # Per-fold predictions are stored separately so they can later be
        # merged into a single file for the second layer.
        fold_tag = version_tag + "_Fold_" + repr(fold)
        data_file = os.path.join(parent_dir, version_tag,
                                 fold_tag + "_valid.csv")
        out_file = os.path.join(parent_dir, "Predictions", "fold_predictions",
                                fold_tag + "_eval.csv")
        id_columns = ["ID", "target"]

    # The raw prediction file generated by VW.
    text_file_name = os.path.join(parent_dir, "Predictions", "tmp",
                                  "temp_prediction.txt")

    # Stage the predictions as a one-column CSV.  Text mode is used because
    # str values are written, and both handles are closed deterministically.
    with open(out_file, "w") as prediction_csv:
        prediction_csv.write("raw\n")
        with open(text_file_name) as raw_predictions:
            for line in raw_predictions:
                row = line.strip().split(" ")
                prediction_csv.write("%s\n" % row[0])

    # Copy the selection so that adding columns does not act on a slice of
    # the frame returned by read_csv.
    id_df = pd.read_csv(data_file)[id_columns].copy()
    if not is_test:
        id_df["Fold"] = np.repeat(fold, id_df.shape[0])
    preds_df = pd.read_csv(out_file)
    assert (
        id_df.shape[0] == preds_df.shape[0]
    ), "data file and prediction file has differring number of rows..."
    id_df[model_version] = np.array(preds_df["raw"])
    # Overwrite the staged one-column file with the final joined table.
    id_df.to_csv(out_file, index=False)

    if not is_test:
        ll = log_loss(np.array(id_df["target"]),
                      np.array(id_df[model_version]))
        print("******************************************************************************")
        print("***** data version : {0} | fold : {1} | fold sample: {2} | log loss {3} ******".format(
            data_version, fold, id_df.shape[0], np.round(ll, 7)))
        print("******************************************************************************")
    os.remove(text_file_name)
def create_prediction_file(data_version, fold=99, is_test=False):
    """Pair the raw VW predictions with their row IDs and save them as CSV.

    The raw prediction file ``Predictions/tmp/temp_prediction.txt`` (under
    ``parent_dir``) is read line by line; the first space-separated token of
    each line is taken as the prediction.  The predictions are matched by
    position with the rows of the corresponding data file and written to the
    output CSV under a ``VW_P<data_version>`` column.  The raw prediction
    file is deleted at the end.

    Parameters
    ----------
    data_version
        Data set version; its ``repr`` is embedded in directory, file and
        column names.
    fold
        Fold identifier (only used when ``is_test`` is false); its ``repr``
        is embedded in the file names and it fills the ``Fold`` column.
    is_test
        When true, read ``P<v>/P<v>_test.csv`` and write ``ID`` plus the
        prediction column to ``Predictions/tests/``.  Otherwise read
        ``P<v>/P<v>_Fold_<fold>_valid.csv``, write ``ID``, ``target``,
        ``Fold`` and the prediction column to
        ``Predictions/fold_predictions/`` and print the fold's log loss.

    Raises
    ------
    AssertionError
        If the data file and the prediction file have different row counts.
    """
    version_tag = "P" + repr(data_version)
    model_version = "VW_" + version_tag

    if is_test:
        # Test predictions go straight to the test predictions directory.
        base_name = version_tag + "_test.csv"
        data_file = os.path.join(parent_dir, version_tag, base_name)
        out_file = os.path.join(parent_dir, "Predictions", "tests", base_name)
        id_columns = ["ID"]
    else:
        # Per-fold predictions are stored separately so they can later be
        # merged into a single file for the second layer.
        fold_tag = version_tag + "_Fold_" + repr(fold)
        data_file = os.path.join(parent_dir, version_tag,
                                 fold_tag + "_valid.csv")
        out_file = os.path.join(parent_dir, "Predictions", "fold_predictions",
                                fold_tag + "_eval.csv")
        id_columns = ["ID", "target"]

    # The raw prediction file generated by VW.
    text_file_name = os.path.join(parent_dir, "Predictions", "tmp",
                                  "temp_prediction.txt")

    # Stage the predictions as a one-column CSV.  Text mode is used because
    # str values are written, and both handles are closed deterministically.
    with open(out_file, "w") as prediction_csv:
        prediction_csv.write("raw\n")
        with open(text_file_name) as raw_predictions:
            for line in raw_predictions:
                row = line.strip().split(" ")
                prediction_csv.write("%s\n" % row[0])

    # Copy the selection so that adding columns does not act on a slice of
    # the frame returned by read_csv.
    id_df = pd.read_csv(data_file)[id_columns].copy()
    if not is_test:
        id_df["Fold"] = np.repeat(fold, id_df.shape[0])
    preds_df = pd.read_csv(out_file)
    assert (
        id_df.shape[0] == preds_df.shape[0]
    ), "data file and prediction file has differring number of rows..."
    id_df[model_version] = np.array(preds_df["raw"])
    # Overwrite the staged one-column file with the final joined table.
    id_df.to_csv(out_file, index=False)

    if not is_test:
        ll = log_loss(np.array(id_df["target"]),
                      np.array(id_df[model_version]))
        print("******************************************************************************")
        print("***** data version : {0} | fold : {1} | fold sample: {2} | log loss {3} ******".format(
            data_version, fold, id_df.shape[0], np.round(ll, 7)))
        print("******************************************************************************")
    os.remove(text_file_name)
Ejemplo n.º 5
0
                            n_estimators=2100,
                            subsample=0.9,
                            colsample_bytree=0.45,
                            objective="binary:logistic",
                            silent=False,
                            min_child_weight=1,
                            nthread=-1)

        # Fit on this fold's training split.  Both the training and the
        # validation split are passed as eval_set with the "logloss" metric;
        # verbose=200 presumably reports it every 200 rounds -- confirm
        # against the installed xgboost version.
        bst.fit(X_train,
                y_train,
                eval_metric="logloss",
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                verbose=200)

        # Column 1 of predict_proba is used as the score that is compared
        # with the "target" column.
        preds = bst.predict_proba(X_valid)[:, 1]
        ll = log_loss(validationSet["target"], preds)
        # Collect this fold's predictions, keyed by ID, in eval_matrix.
        df = pd.DataFrame({"ID": validationSet["ID"], pred_name: preds})
        eval_matrix = eval_matrix.append(df, ignore_index=True)
        print "fold : {} | logloss: {}".format(i + 1, ll)
        # Drop the per-fold objects and force a garbage collection
        # (presumably to limit memory before the next iteration).
        del trainingSet, validationSet, bst, preds, ll, X_train, X_valid, y_train, y_valid
        gc.collect()

    X_train = train[feature_names].copy()
    y_train = np.array(train["target"].copy())
    bst = XGBClassifier(max_depth=8,
                        learning_rate=0.01,
                        n_estimators=2100,
                        subsample=0.9,
                        colsample_bytree=0.45,
                        objective="binary:logistic",
                        silent=False,
     # Rows of `train` selected by `idx` form the validation split.
     validationSet = train[idx]
     
     # Random forest with a fixed seed (random_state = 112); bootstrap
     # sampling and out-of-bag scoring are both disabled.
     rf = RandomForestClassifier(n_estimators = 2000,
                                 criterion = "entropy",
                                 max_depth = 50,
                                 max_features = 0.8,
                                 min_samples_split = 3,
                                 bootstrap = False,
                                 oob_score = False,
                                 random_state = 112,
                                 verbose = 0,
                                 n_jobs = -1)
     
     rf.fit(trainingSet[feature_names], np.array(trainingSet["target"]))                          
     # Column 1 of predict_proba is used as the score that is compared with
     # the "target" column.
     preds = rf.predict_proba(validationSet[feature_names])[:, 1]
     ll = log_loss(np.array(validationSet["target"]), preds)
     print "# Data_version : {0} | Fold : {1} | log_loss : {2}".format(i+1, j+1, ll)
     # One row per validation sample: fold number, ID, true target and the
     # model's prediction stored under the `model_version` column.
     df = pd.DataFrame({"Fold" : np.repeat((j + 1), validationSet.shape[0]) ,"ID" : validationSet["ID"], "ground_truth" : validationSet["target"], 
                             model_version : preds})
     # NOTE(review): `data_version` and `fold` are not assigned in the visible
     # code, while the log line above uses i+1 / j+1 -- confirm they refer to
     # the same values.
     tmp_name = "P" + `data_version` + "_Fold_" + `fold` + "_valid.csv"
     tmp_file = train_prediction_path + "tmp/" + tmp_name
     # The fold's predictions are saved to their own file and also appended
     # to eval_matrix.
     df.to_csv(tmp_file, index  = False)
     eval_matrix = eval_matrix.append(df, ignore_index = True)
     del rf, trainingSet, validationSet, ll, df
     
 # generate test meta features
 # train on all training instances
 rf = RandomForestClassifier(n_estimators = 2000,
                                 criterion = "entropy",
                                 max_depth = 50,
                                 max_features = 0.8,
     # Gradient-boosted classifier with a binary logistic objective.
     bst = XGBClassifier(max_depth=8,
                         learning_rate = 0.01,
                         n_estimators=2100,
                         subsample=0.9,
                         colsample_bytree=0.45,
                         objective="binary:logistic",
                         silent = False,
                         min_child_weight=1,                       
                         nthread=-1)
                             
     # Fit on this fold's training split.  Both the training and the
     # validation split are passed as eval_set with the "logloss" metric;
     # verbose=200 presumably reports it every 200 rounds -- confirm against
     # the installed xgboost version.
     bst.fit(X_train, y_train, eval_metric= "logloss",
             eval_set=[(X_train, y_train), (X_valid, y_valid)],
                       verbose=200)
                      
     # Column 1 of predict_proba is used as the score that is compared with
     # the "target" column.
     preds = bst.predict_proba(X_valid)[:, 1]
     ll = log_loss(validationSet["target"], preds)
     # Collect this fold's predictions, keyed by ID, in eval_matrix.
     df = pd.DataFrame({"ID" : validationSet["ID"], pred_name : preds})
     eval_matrix = eval_matrix.append(df, ignore_index = True)
     print "fold : {} | logloss: {}".format(i+1, ll)        
     # Drop the per-fold objects and force a garbage collection (presumably
     # to limit memory before the next iteration).
     del trainingSet, validationSet, bst, preds, ll, X_train, X_valid, y_train, y_valid
     gc.collect()
 
 X_train = train[feature_names].copy()
 y_train = np.array(train["target"].copy())
 bst = XGBClassifier(max_depth=8,
                         learning_rate = 0.01,
                         n_estimators=2100,
                         subsample=0.9,
                         colsample_bytree=0.45,
                         objective="binary:logistic",
                         silent = False,
Ejemplo n.º 8
0
 def test_logLoss(self):
     """Verify ``metrics.log_loss`` on four reference inputs."""
     check = self.assertAlmostEqual

     # Predictions identical to the labels give zero loss.
     exact = metrics.log_loss([1, 1, 0, 0], [1, 1, 0, 0])
     check(exact, 0)

     # A single fully confident wrong prediction gives infinite loss.
     one_wrong = metrics.log_loss([1, 1, 0, 0], [1, 1, 1, 0])
     check(one_wrong, np.inf)

     # Mixed probabilities, compared with a precomputed reference value.
     mixed = metrics.log_loss([1, 1, 1, 0, 0, 0],
                              [0.5, 0.1, 0.01, 0.9, 0.75, 0.001])
     check(mixed, 1.881797068998267)

     # Scalar inputs are accepted as well.
     scalar = metrics.log_loss(1, 0.5)
     check(scalar, -np.log(0.5))