def gen_sub():
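    """Train a single XGBoost classifier on the full training set and write the
    test-set class probabilities to submission.csv."""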
    train = train_data.copy()
    train_idx = [i for i in range(train.shape[0])]
    test = test_data.copy()
    test_idx = [i + train.shape[0] for i in range(test.shape[0])]
    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)
    xgb_clf = xgboostClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        nthread=12,
        eta=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=1.0,
        colsample_bylevel=0.8,
        min_child_weight=1,
        silent=1,
        num_rounds=1700,
        seed=RS,
    )
    print("Trainning:...")
    xgb_clf.fit(X_train, y)

    preds = xgb_clf.predict_proba(X_test)
    sub = pd.DataFrame(preds)
    sub.columns = ["high", "medium", "low"]
    sub["listing_id"] = test.listing_id.values
    sub.to_csv("submission.csv", index=False)
def validation_score(early_stop=False):
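    """Run stratified k-fold cross-validation with a single XGBoost classifier on
    the training data and return the list of per-fold multi-class log losses.
    With early_stop=True only the first fold is evaluated (quick sanity check)."""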
    clf = xgboostClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        nthread=3,
        eta=0.04,
        max_depth=6,
        subsample=0.7,
        colsample_bytree=1.0,
        colsample_bylevel=0.7,
        min_child_weight=1,
        silent=1,
        num_rounds=700,
        seed=RS,
    )
    print("*** Validation start ***")
    data = train_data.copy()
    y = data["interest_level"].apply(lambda x: target_num_map[x])
    del data["interest_level"]

    # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True)
    skf = StratifiedKFold(n_splits=3, shuffle=False)
    cv_scores = []
    i = 0
    for train_idx, val_idx in skf.split(data, y):
        i += 1
        X = data.copy()
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)
        clf.fit(X_train, y_train)
        # clf.fit_CV(X_train, X_val, y_train, y_val)
        y_val_pred = clf.predict_proba(X_val)
        loss = log_loss(y_val, y_val_pred)
        print("Iteration {}'s loss: {}".format(i, loss))
        cv_scores.append(loss)
        if early_stop:
            break
    print("*** Validation finished ***\n")
    return cv_scores
def validation_avg_score(clfs):
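    """Cross-validate an ensemble: fit each classifier in clfs on every fold,
    average their predicted probabilities, report per-model and averaged log loss,
    and return the per-fold losses of the averaged prediction."""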
    print("*** Validation start ***")
    data = train_data.copy()
    y = data["interest_level"].apply(lambda x: target_num_map[x])
    del data["interest_level"]

    # skf = StratifiedKFold(n_splits=5, random_state=RS, shuffle=True)
    skf = StratifiedKFold(n_splits=3)
    cv_scores = {i: [] for i in range(len(clfs))}
    cv_scores["Avg"] = []
    i = 0
    for train_idx, val_idx in skf.split(data, y):
        i += 1
        X = data.copy()
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)
        preds = []
        for j, clf in enumerate(clfs):
            clf.fit(X_train, y_train)
            y_val_pred = clf.predict_proba(X_val)
            loss = log_loss(y_val, y_val_pred)
            cv_scores[j].append(loss)
            preds.append(y_val_pred)
            print("clf_{}, Iteration {}'s loss: {}".format(j, i, loss))
        preds = np.array(preds)
        avg_pred = np.mean(preds, axis=0)
        loss = log_loss(y_val, avg_pred)
        cv_scores["Avg"].append(loss)
        print("Iteration {}'s Avg loss: {}".format(i, loss))
    for i in range(len(clfs)):
        print("clf_{} validation loss : {}".format(i, np.mean(cv_scores[i])))
    print("Average validation loss : {}".format(np.mean(cv_scores["Avg"])))
    print("*** Validation finished ***\n")
    return cv_scores["Avg"]
def genAvgSub(clfs):
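    """Fit each classifier in clfs on the full training set, average their test-set
    probabilities, and write the blended predictions to submission.csv."""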
    train = train_data.copy()
    train_idx = [i for i in range(train.shape[0])]
    test = test_data.copy()
    test_idx = [i + train.shape[0] for i in range(test.shape[0])]
    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)
    print("Trainning:...")
    preds = []
    for i in range(len(clfs)):
        print("Clf_{} fiting".format(i))
        clfs[i].fit(X_train, y)
        print("Clf_{} predicting".format(i))
        pred = clfs[i].predict_proba(X_test)
        preds.append(pred)
    sub = pd.DataFrame(np.mean(preds, axis=0))
    sub.columns = ["high", "medium", "low"]
    sub["listing_id"] = test.listing_id.values
    sub.to_csv("submission.csv", index=False)
    print("Train done.")
def stacking(clfs):
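    """Build out-of-fold meta-features for a StackNet-style second level.

    For each of several fold seeds, run 10-fold CV, average the base models'
    validation probabilities into per-row meta-features, then average those
    meta-features across seeds. Finally refit the base models on the full
    training set, append the meta-features to the original feature matrices,
    and write train_stacknet.csv, test_stacknet.csv and a submission file."""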
    print("Stacking")
    train = train_data.copy()
    test = test_data.copy()
    y = train["interest_level"].apply(lambda x: target_num_map[x])
    del train["interest_level"]
    train_stackers = []
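    # Average the out-of-fold meta-features over several fold seeds so the
    # second-level features depend less on a single fold assignment.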
    for RS in [0, 1, 2, 64, 128, 256, 512, 1024, 2048, 4096]:
        skf = StratifiedKFold(n_splits=10, random_state=RS, shuffle=True)
        #Create Arrays for meta
        train_stacker = [[0.0 for s in range(3)]
                         for k in range(0, (train.shape[0]))]
        cv_scores = {i: [] for i in range(len(clfs))}
        cv_scores["Avg"] = []
        print("Begin 10-flod cross validation")
        cnt = 0
        for train_idx, val_idx in skf.split(train, y):
            cnt += 1
            X = train.copy()
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)
            preds = []
            for k, clf in enumerate(clfs):
                clf.fit(X_train, y_train)
                y_val_pred = clf.predict_proba(X_val)
                loss = log_loss(y_val, y_val_pred)
                preds.append(y_val_pred)
                cv_scores[k].append(loss)
                print("Clf_{} iteration {}'s loss: {}".format(k, cnt, loss))
            preds = np.array(preds)
            avg_pred = np.mean(preds, axis=0)
            avg_loss = log_loss(y_val, avg_pred)
            cv_scores["Avg"].append(avg_loss)
            print("Iteration {}'s Avg loss: {}".format(cnt, avg_loss))
            # copy the fold's averaged predictions into the out-of-fold meta-feature rows
            for no, real_idx in enumerate(val_idx):
                train_stacker[real_idx] = list(avg_pred[no])
        for i in range(len(clfs)):
            print("clf_{} validation loss : {}".format(i,
                                                       np.mean(cv_scores[i])))
        print("Average validation loss : {}".format(np.mean(cv_scores["Avg"])))
        train_stackers.append(train_stacker)
    train_stacker = np.mean(train_stackers, axis=0)
    print("*** Validation finished ***\n")

    # Refit the base models on the full training data and build the test-set meta-features.
    test_stacker = [[0.0 for s in range(3)] for k in range(0, (test.shape[0]))]
    train_idx = [i for i in range(train.shape[0])]
    test_idx = [i + train.shape[0] for i in range(test.shape[0])]
    data = pd.concat([train, test]).reset_index()
    X_train, X_test, feats = coreProcess(data, y, train_idx, test_idx)
    print(X_train.shape, len(train_stacker))
    print("Begin predicting")
    preds = []
    for i in range(len(clfs)):
        print("Clf_{} fiting".format(i))
        clfs[i].fit(X_train, y)
        print("Clf_{} predicting".format(i))
        pred = clfs[i].predict_proba(X_test)
        preds.append(pred)
    preds = np.mean(preds, axis=0)
    for pr in range(0, len(preds)):
        for d in range(0, 3):
            test_stacker[pr][d] = (preds[pr][d])
    print("merging columns")
    #stack xgboost predictions
    X_train = np.column_stack((X_train.toarray(), train_stacker))
    # stack id to test
    X_test = np.column_stack((X_test.toarray(), test_stacker))
    # stack target to train
    X = np.column_stack((y, X_train))
    ids = test.listing_id.values
    X_test = np.column_stack((ids, X_test))
    np.savetxt("./train_stacknet.csv", X, delimiter=",", fmt='%.5f')
    np.savetxt("./test_stacknet.csv", X_test, delimiter=",", fmt='%.5f')
    print("Write results...")
    output_file = "submission_{}.csv".format(np.mean(cv_scores["Avg"]))
    print("Writing submission to %s" % output_file)
    with open(output_file, "w") as f:
        f.write("listing_id,high,medium,low\n")  # the header
        for g in range(0, len(test_stacker)):
            f.write("%s" % (ids[g]))
            for prediction in test_stacker[g]:
                f.write(",%f" % (prediction))
            f.write("\n")
    print("Done.")
attributes = full.columns[1:]
skf = StratifiedKFold(n_splits=3, shuffle=True)
data = full[attributes].copy()
utils = preprcessUtil.Utils()
y = full['timestamp'].apply(utils.processTime)
cv_scores = []
i = 0

for train_idx, val_idx in skf.split(data, y):
    i += 1
    X = data.copy()
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    X_train, X_val, feats = coreProcess(X, y_train, train_idx, val_idx)

    clf.fit(X_train, y_train)
    y_val_prob = clf.predict_proba(X_val)  # class probabilities, needed for log loss
    y_val_pred = clf.predict(X_val)        # hard class labels for accuracy/precision/recall
    pred = y_val_pred
    label = y_val.values
    correct = 0
    for j in range(len(pred)):
        if pred[j] == label[j]:
            correct += 1
    accuracy = 100.0 * correct / len(pred)  # percentage of correctly classified validation samples
    loss = log_loss(y_val, y_val_prob)

    precision = precision_score(label, pred, average='binary')
    recall = recall_score(label, pred, average='binary')
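    # record and report this fold's metrics
    cv_scores.append(loss)
    print("Fold {}: accuracy={:.2f}%, log loss={:.4f}, precision={:.4f}, recall={:.4f}".format(
        i, accuracy, loss, precision, recall))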