from pprint import pprint
import os
import csv
import cPickle as pickle

import numpy as np
import scipy as sp

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path
from util_eval import multiclass_log_loss, multiclass_accuracy

import xgboost as xgb

# load data
(all_xs, all_ys) = pickle.load(open(full_path("train_xs_ys_np.p"), "rb"))
# shift the 1-based class labels down to the 0-based range xgboost expects
all_ys = all_ys - 1.0

# set up split portion of train and test data
test_percentage = 0.1
all_num = len(all_ys)
train_num = int(round((1.0 - test_percentage) * all_num))
test_num = all_num - train_num

# prepare a reproducible random shuffle index
np.random.seed(0)
random_idx = np.arange(all_num)
np.random.shuffle(random_idx)

# prepare train and test dataset
train_xs = all_xs[random_idx[:train_num]]
train_ys = all_ys[random_idx[:train_num]]
test_xs = all_xs[random_idx[train_num:]]
test_ys = all_ys[random_idx[train_num:]]
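# A minimal sketch (an addition, not the original tail of this script) of
# training xgboost on the shuffled split and scoring it with the helpers
# imported above. The objective/eta/depth/round values are illustrative, and
# multiclass_log_loss / multiclass_accuracy are assumed to take
# (true labels, predicted probability matrix).
dtrain = xgb.DMatrix(train_xs, label=train_ys)
dtest = xgb.DMatrix(test_xs, label=test_ys)
params = {"objective": "multi:softprob",
          "num_class": int(all_ys.max()) + 1,
          "eta": 0.1,
          "max_depth": 6,
          "eval_metric": "mlogloss"}
booster = xgb.train(params, dtrain, num_boost_round=200,
                    evals=[(dtrain, "train"), (dtest, "test")])
pred_probs = booster.predict(dtest)
print("log loss: %.5f" % multiclass_log_loss(test_ys, pred_probs))
print("accuracy: %.5f" % multiclass_accuracy(test_ys, pred_probs))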
import os
import csv
import time
import cPickle as pickle

from dateutil import parser as dateparser

import numpy as np
import scipy as sp

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path

# One-off preprocessing, kept commented out: parse each post's
# PostCreationDate in train.csv into a Unix timestamp and cache the result
# as a pickle so later runs can just load the file below.
# reader = csv.DictReader(open(os.path.join(data_path, 'train.csv')))
# post_times = [int(time.mktime(dateparser.parse(datum['PostCreationDate']).timetuple()))
#               for datum in reader]
# post_times = np.array(post_times)
# pickle.dump(post_times, open(full_path("post_times_37.p"), "wb"),
#             protocol=pickle.HIGHEST_PROTOCOL)
post_times = np.array(pickle.load(open(full_path("post_times_37.p"), "rb")))

sort_arg = np.argsort(post_times)

def is_sorted(l):
    return all(l[i] <= l[i+1] for i in xrange(len(l)-1))

print(is_sorted(post_times))
print(is_sorted(post_times[sort_arg]))
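# Expected output: False then True. Assuming the raw CSV rows are not
# already in chronological order, the first check fails, and the second
# confirms that sort_arg yields an ascending time order.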

# re-order the cached feature/label arrays chronologically so a plain
# head/tail split becomes a past/future split
(train_xs, train_ys) = pickle.load(open(full_path("train_37_xs_ys_np.p"), "rb"))
train_xs = train_xs[sort_arg]
train_ys = train_ys[sort_arg]
pickle.dump((train_xs, train_ys),
            open(full_path("train_37_xs_ys_np_sorted.p"), "wb"),
            protocol=pickle.HIGHEST_PROTOCOL)
import os
import csv
import cPickle as pickle

import numpy as np
import scipy as sp

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path
from util_eval import multiclass_log_loss, multiclass_accuracy

import xgboost as xgb
import itertools

# load data (already sorted by post creation time)
(all_xs, all_ys) = pickle.load(open(full_path("train_37_xs_ys_np_sorted.p"), "rb"))
# shift the 1-based class labels down to the 0-based range xgboost expects
all_ys = all_ys - 1.0

# set up split portion of train and test data
test_percentage = 0.1
all_num = len(all_ys)
train_num = int(round((1.0 - test_percentage) * all_num))
test_num = all_num - train_num

# prepare train and test dataset: a time-ordered split (no shuffling),
# training on earlier posts and holding out the most recent 10% as test
train_xs = all_xs[:train_num]
train_ys = all_ys[:train_num]
test_xs = all_xs[train_num:]
test_ys = all_ys[train_num:]

# setup param grid
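# A sketch of the itertools-based grid search the imports above suggest; the
# parameter values here are illustrative guesses, not the originals, and
# multiclass_log_loss is assumed to take (true labels, probability matrix).
# Each combination trains a booster on the earlier posts and is scored by
# log loss on the held-out most recent 10%.
dtrain = xgb.DMatrix(train_xs, label=train_ys)
dtest = xgb.DMatrix(test_xs, label=test_ys)
num_class = int(all_ys.max()) + 1
best_params, best_loss = None, float("inf")
for eta, max_depth, subsample in itertools.product([0.05, 0.1, 0.3],
                                                   [4, 6, 8],
                                                   [0.5, 1.0]):
    params = {"objective": "multi:softprob",
              "num_class": num_class,
              "eta": eta,
              "max_depth": max_depth,
              "subsample": subsample,
              "eval_metric": "mlogloss"}
    booster = xgb.train(params, dtrain, num_boost_round=200)
    loss = multiclass_log_loss(test_ys, booster.predict(dtest))
    print("eta=%s depth=%s subsample=%s logloss=%.5f"
          % (eta, max_depth, subsample, loss))
    if loss < best_loss:
        best_params, best_loss = params, loss
print("best: %s (logloss=%.5f)" % (best_params, best_loss))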
from __future__ import print_function
from pprint import pprint
import os
import csv
import cPickle as pickle

import numpy as np
import scipy as sp
import sklearn
from sklearn.ensemble import GradientBoostingClassifier as GBC

from feature_label_builder import FeatureLabelBuilder, get_feature_keys
from feature_label_builder import data_path, full_path

(train_xs, train_ys) = pickle.load(open(full_path("train_xs_ys_np.p"), "rb"))

# gradient boosting with aggressive row subsampling: each of the 500 trees
# is fit on a 0.1% sample of the rows (subsample=0.001), considering
# sqrt(n_features) candidate features per split (max_features='auto')
classifier = GBC(n_estimators=500,
                 learning_rate=0.15,
                 subsample=0.001,
                 max_features='auto',
                 min_samples_leaf=9,
                 verbose=1)
classifier.fit(train_xs, train_ys)

# persist the fitted model (filename encodes n_estimators, learning_rate,
# subsample, min_samples_leaf)
pickle.dump(classifier,
            open(full_path("classifier_500_0.15_0.001_9.p"), "wb"),
            protocol=pickle.HIGHEST_PROTOCOL)

# keys = get_feature_keys()
# for key, val in zip(keys, classifier.feature_importances_):
#     print(key, val)
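# A sketch (not in the original script) of inspecting the fitted model:
# reload the pickle written above and rank the learned feature importances
# by weight, paired with the names from get_feature_keys().
# clf = pickle.load(open(full_path("classifier_500_0.15_0.001_9.p"), "rb"))
# ranked = sorted(zip(get_feature_keys(), clf.feature_importances_),
#                 key=lambda kv: kv[1], reverse=True)
# pprint(ranked[:10])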