Example #1
def make_KL_slip(fault,num_modes,eigenvals,V,mean_slip,max_slip,lognormal=True,maxiter=5,seed=12345):
    '''
    Build a stochastic slip map from the first num_modes Karhunen-Loeve modes.
    '''
    from numpy import sqrt,exp
    from numpy.random import randn
    from numpy.random import seed as random_seed
    
    iterations = 0
    success = False

    while True:
        # Generate random numbers; honour the seed if one was given
        if seed is not None:
            random_seed(seed)
        if len(fault) > num_modes:
            z = randn(num_modes)
        else:  # if there are fewer subfaults than requested modes, use all modes
            z = randn(len(fault))
        KL_slip = mean_slip.copy()  # start with the mean slip
        # add in the terms of the K-L expansion:
        for k in range(len(z)):
            KL_slip += z[k] * sqrt(eigenvals[k]) * V[:, k]
        # exponentiate for lognormal:
        if lognormal:
            KL_slip = exp(KL_slip)
        # if the max_slip condition is met, we are done
        if KL_slip.max() <= max_slip:
            success = True
            break
        iterations += 1
        if iterations > maxiter:
            print('... ... ... improper eigenvalues, recalculating...')
            break
    

    
    return KL_slip,success
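A minimal usage sketch for the function above, with made-up subfault count, eigenvalues, eigenvectors and fault array (all inputs are hypothetical, for illustration only):

import numpy as np

# Hypothetical inputs: 50 subfaults, 10 K-L modes.
n_sub, n_modes = 50, 10
eigenvals = np.linspace(1.0, 0.1, n_sub)            # decaying eigenvalue spectrum
V = np.linalg.qr(np.random.randn(n_sub, n_sub))[0]  # orthonormal columns as eigenvectors
mean_slip = np.full(n_sub, 1.0)                     # (log-)mean slip per subfault
fault = np.zeros((n_sub, 3))                        # stand-in fault array; only len(fault) is used

slip, ok = make_KL_slip(fault, n_modes, eigenvals, V, mean_slip,
                        max_slip=60.0, lognormal=True, seed=12345)
print(slip.shape, slip.max(), ok)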
Example #2
from __future__ import print_function, division, unicode_literals

import os
from collections import Counter
import re
import numpy as np
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros  # pylint:disable=no-name-in-module

from keras.models import Sequential
from keras.engine.training import slice_X
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout
from keras.layers import recurrent

random_seed(123)  # Reproducibility

# Parameters for the model and dataset
NUMBER_OF_ITERATIONS = 20000
EPOCHS_PER_ITERATION = 5
RNN = recurrent.LSTM
INPUT_LAYERS = 2
OUTPUT_LAYERS = 2
AMOUNT_OF_DROPOUT = 0.3
BATCH_SIZE = 500
HIDDEN_SIZE = 700
INITIALIZATION = "he_normal"  # : Gaussian initialization scaled by fan_in (He et al., 2014)
MAX_INPUT_LEN = 40
MIN_INPUT_LEN = 3
INVERTED = True
AMOUNT_OF_NOISE = 0.2 / MAX_INPUT_LEN
def work(out_csv_file,
         estimator,
         nest,
         njobs,
         nfolds,
         cv_grid,
         minimizer,
         nbuckets,
         mvector,
         imputer,
         clf_kwargs,
         int_fold):

    from numpy.random import seed as random_seed
    random_seed(1)


    from zipfile import ZipFile
    from pandas import read_csv,factorize
    from numpy import rint,clip,savetxt,stack

    if KAGGLE:
        train = read_csv("../input/train.csv")
        test = read_csv("../input/test.csv")
    else:
        train = read_csv(ZipFile("../../data/train.csv.zip", 'r').open('train.csv'))
        test = read_csv(ZipFile("../../data/test.csv.zip", 'r').open('test.csv'))

#    gmm17_train = read_csv('GMM_17_full_train.csv')
#    gmm17_test = read_csv('GMM_17_full_test.csv')
#    gmm6_train = read_csv('GMM_6_full_train.csv')
#    gmm6_test = read_csv('GMM_6_full_test.csv')
#
#    train['GMM17'] = gmm17_train['Response']
#    test['GMM17'] = gmm17_test['Response']
#    train['GMM6'] = gmm6_train['Response']
#    test['GMM6'] = gmm6_test['Response']

    # combine train and test
    all_data = train.append(test)

#    G_vectors = read_csv('../../data/G_vectors.csv')
#    #all_data = all_data.join(G_vectors.drop(['G3'], axis=1))
#    all_data = all_data.join(
#        G_vectors[['G8', 'G11', 'G12', 'G13', 'G17', 'G18', 'G19', 'G20']])

    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    all_data[DISCRETE] = imp.fit_transform(all_data[DISCRETE])
#    from numpy import bincount
#    for col in all_data[DISCRETE]:
#        top = bincount(all_data[col].astype(int)).argmax()
#        all_data[col] -= top
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    all_data[CONTINUOUS] = imp.fit_transform(all_data[CONTINUOUS])
#    all_data[BOOLEANS] = all_data[BOOLEANS] + 1e6


#    from sklearn.preprocessing import StandardScaler
#    from sklearn.decomposition import PCA
#    std = StandardScaler(copy=True)
#    all_data[CONTINUOUS] = std.fit_transform(all_data[CONTINUOUS])
#    pca = PCA(whiten=False, copy=True)
#    all_data[CONTINUOUS] = pca.fit_transform(all_data[CONTINUOUS])


    # create any new variables
    all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
    all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]


    # factorize categorical variables
    all_data['Product_Info_2'] = factorize(all_data['Product_Info_2'])[0]# + 1
    all_data['Product_Info_2_char'] = factorize(all_data['Product_Info_2_char'])[0]# + 1
    all_data['Product_Info_2_num'] = factorize(all_data['Product_Info_2_num'])[0]# + 1

    """
    Both: 0.65576
    BmiAge: 0.65578
    MedCount: 0.65638
    None: 0.65529
    """
    all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']
    med_keyword_columns = all_data.columns[all_data.columns.str.startswith('Medical_Keyword_')]
    all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)


    """
    print('BOOLEANS:')
    for col in all_data[BOOLEANS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('DISCRETE:')
    for col in all_data[DISCRETE]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('CONTINUOUS:')
    for col in all_data[CONTINUOUS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('NOMINALS:')
    for col in all_data[NOMINALS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    return
    """

    # Use -1 for any others
    if imputer is None:
        all_data.fillna(-1, inplace=True)
    else:
        all_data['Response'].fillna(-1, inplace=True)

    # fix the dtype on the label column
    all_data['Response'] = all_data['Response'].astype(int)

    # split train and test
    train = all_data[all_data['Response'] > 0].copy()
    test = all_data[all_data['Response'] < 1].copy()

    #dropped_cols = ['Id', 'Response', 'Medical_History_10', 'Medical_History_24']#, 'Medical_History_32']
    dropped_cols = ['Id', 'Response']

    train_y = train['Response'].values
    train_X = train.drop(dropped_cols, axis=1)
    test_X = test.drop(dropped_cols, axis=1)

    if imputer is not None:
        from sklearn.preprocessing import Imputer
        imp = Imputer(missing_values='NaN', strategy=imputer, axis=0)
        train_X = imp.fit_transform(train_X)
        test_X = imp.transform(test_X)

    prudential_kwargs = \
    {
        'objective': 'reg:linear',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        'n_buckets': nbuckets,
        'initial_params': mvector,
        'minimizer': minimizer,
        'scoring': NegQWKappaScorer
    }
    if estimator == 'PrudentialRegressorCVO2FO' or estimator == 'PrudentialRegressorCVO2':
        prudential_kwargs['int_fold'] = int_fold
        pass

    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        prudential_kwargs[k] = v
    clf = globals()[estimator](**prudential_kwargs)
    print(estimator, clf.get_params())

    if nfolds > 1:
        param_grid = {
                    'n_estimators': [700],
                    'max_depth': [6],
                    'colsample_bytree': [0.67],
                    'subsample': [0.9],
                    'min_child_weight': [240],
                    #'initial_params': [[-0.71238755, -1.4970176, -1.73800531, -1.13361266, -0.82986203, -0.06473039, 0.69008725, 0.94815881]]
                    }
        for k, v in cv_grid.items():
            param_grid[k] = v

        from sklearn.metrics import make_scorer
        MIN, MAX = (1, 8)
        qwkappa = make_scorer(Kappa, weights='quadratic',
                              min_rating=MIN, max_rating=MAX)

        from sklearn.cross_validation import StratifiedKFold
        from sklearn.grid_search import GridSearchCV
        grid = GridSearchCV(estimator=clf,
                            param_grid=param_grid,
                            cv=StratifiedKFold(train_y, n_folds=nfolds),
                            scoring=qwkappa, n_jobs=1,
                            verbose=1,
                            refit=False)
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print('  {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

        pass

    else:
        clf.fit(train_X, train_y)


        final_test_preds = clf.predict(test_X)
        final_test_preds = rint(clip(final_test_preds, 1, 8))

        savetxt(out_csv_file,
                stack(zip(test['Id'].values, final_test_preds), axis=1).T,
                delimiter=',',
                fmt=['%d', '%d'],
                header='"Id","Response"', comments='')

        importance = clf.xgb.booster().get_fscore()
        import operator
        print(sorted(importance.items()), "\n")
        importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
        print(importance, "\n")
        features = [k for k, _ in importance]
        print(len(features), features)

    return
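The work() above leans on pandas.factorize to turn the Product_Info_2 string columns into integer codes before handing them to the model. A minimal illustration of what factorize returns (made-up values):

from pandas import factorize

codes, uniques = factorize(['D3', 'A1', 'D3', 'E1'])
print(codes)    # [0 1 0 2]
print(uniques)  # ['D3' 'A1' 'E1']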
Example #4
    def randn_data(self, seed, shape):
        """
        Build a block of testing data from numpy.random.randn.
        """
        random_seed(seed)
        return randn(*shape)
Example #5
    def randn_data(self, seed, shape):
        """
        Build a block of testing data from numpy.random.randn.
        """
        random_seed(seed)
        return randn(*shape)
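Both randn_data helpers rely on the fact that re-seeding NumPy's global RNG makes randn deterministic; a minimal check of that property:

from numpy.random import randn, seed as random_seed

random_seed(0)
a = randn(4, 2)
random_seed(0)
b = randn(4, 2)
assert (a == b).all()  # identical seed, identical block of test data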
Example #6
    def fit(self, X, Y, X_val, Y_val, epochs=100):
        """Fit the model to data matrix X and target(s) y.

        :param X: array-like, shape (n_samples, n_channels, n_eeg_samples)
            The input data.
        :param Y: array-like, shape (n_samples,)
            The target values (class labels).
        :param X_val: array-like, shape (n_samples, n_channels, n_eeg_samples)
            The validation input data.
        :param Y_val: array-like, shape (n_samples,)
            The validation target values (class labels).
        :param epochs: int
            Number of training epochs.

        :return: self : the trained RecCnn model.
        """

        # for reproducibility
        # note: it still won't be too reproducible if you use GPUs, for more see:
        #       https://github.com/keras-team/keras/issues/2479#issuecomment-213987747
        random_seed(self.seed)
        set_random_seed(self.seed)

        # class weights relative to class 0: w_0 == 1, w_1 == count(class 0) / count(class 1)
        w = len(np.extract(Y == 0, Y))
        w_0 = 1 / (len(np.extract(Y == 0, Y)) / w)
        w_1 = 1 / (len(np.extract(Y == 1, Y)) / w)
        class_weight = {0: w_0, 1: w_1}

        # Save best model only, based on the training loss. Saves model to topology specific file in working directory
        saveBestModel = ModelCheckpoint(self.path + '.h5',
                                        monitor='loss',
                                        verbose=1,
                                        save_best_only=True,
                                        mode='auto')

        # Log the training metrics in a topology specific file in the working directory
        csv_logger = CSVLogger(self.path + '_log.csv',
                               append=True,
                               separator=';')

        if self.recurrent:
            self.model = self.build_model_rcnn(X,
                                               depth=self.conv_depth,
                                               num_features=self.num_features)
        else:
            self.model = self.build_model_cnn(X,
                                              depth=self.conv_depth,
                                              num_features=self.num_features)

        if self.save_model:
            self.model.fit(x=X,
                           y=Y,
                           batch_size=64,
                           epochs=epochs,
                           verbose=1,
                           class_weight=class_weight,
                           shuffle=True,
                           validation_data=(X_val, Y_val),
                           callbacks=[saveBestModel, csv_logger])
        else:
            self.model.fit(x=X,
                           y=Y,
                           batch_size=64,
                           epochs=epochs,
                           verbose=1,
                           class_weight=class_weight,
                           shuffle=True,
                           validation_data=(X_val, Y_val))

        return self
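The fit() above seeds both NumPy and the TensorFlow backend before building the model. A minimal sketch of that seeding pattern (TF 1.x API; under TF 2.x the second call is tf.random.set_seed):

import numpy as np
import tensorflow as tf

SEED = 1234
np.random.seed(SEED)      # NumPy RNG (shuffling, noise, numpy-side initialisation)
tf.set_random_seed(SEED)  # graph-level TF seed; tf.random.set_seed(SEED) in TF 2.x
# As the comment in fit() notes, GPU nondeterminism can still prevent exact reproducibility.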
Example #7
import sys
import os
import struct
import argparse
import collections

from os import listdir
from os.path import isfile, join
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

import tensorflow as tf
from tensorflow.core.example import example_pb2

random_seed(123)

# for separating the sentences in the .bin files
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

VOCAB_SIZE = 60000
CHUNK_SIZE = 1000

def ParseStory(story_file):
  lines = []
  with open(story_file, "r") as f:
    for line in f:
      if line.strip() != '':
        lines.append(line.strip())
      
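The example is cut off here. Judging from the struct and example_pb2 imports, pipelines like this usually serialize each record into a length-prefixed tf.Example; a sketch of that pattern (write_example and its arguments are hypothetical names, not part of the original file):

import struct
from tensorflow.core.example import example_pb2

def write_example(writer, article, abstract):
  # article/abstract must be bytes under Python 3 (encode before calling)
  tf_example = example_pb2.Example()
  tf_example.features.feature['article'].bytes_list.value.extend([article])
  tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
  serialized = tf_example.SerializeToString()
  # length-prefixed framing: 8-byte record length, then the record itself
  writer.write(struct.pack('q', len(serialized)))
  writer.write(struct.pack('%ds' % len(serialized), serialized))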
Example #8
from __future__ import print_function, division, unicode_literals


import os
from collections import Counter
import re
import numpy as np
from numpy.random import choice as random_choice, randint as random_randint, shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros # pylint:disable=no-name-in-module

from keras.models import Sequential, slice_X
from keras.layers.core import Activation, TimeDistributedDense, RepeatVector, Dropout
from keras.layers import recurrent

random_seed(123) # Reproducibility

# Parameters for the model and dataset
NUMBER_OF_ITERATIONS = 20000
EPOCHS_PER_ITERATION = 5
RNN = recurrent.LSTM
INPUT_LAYERS = 2
OUTPUT_LAYERS = 2
AMOUNT_OF_DROPOUT = 0.3
BATCH_SIZE = 500
HIDDEN_SIZE = 700
INITIALIZATION = "he_normal" # : Gaussian initialization scaled by fan_in (He et al., 2014)
MAX_INPUT_LEN = 40
MIN_INPUT_LEN = 3
INVERTED = True
AMOUNT_OF_NOISE = 0.2 / MAX_INPUT_LEN
def work(estimator,
         nest,
         njobs,
         nfolds,
         cv_grid,
         clf_kwargs,
         do_hyperopt):

    from numpy.random import seed as random_seed
    random_seed(1)

    from pandas import read_csv

    train = read_csv('../../data/example_data.csv')

    train = train.drop(TO_DROP, axis=1)

    for col in FACTORIZABLE:
        from pandas import factorize
        train[col] = factorize(train[col])[0]
        pass

    train = ParseDates(train, ['TRANSACTION_DATE', 'CUSTOMER_FIRST_ORDER_DATE'])

    symbols = {}
    for col in [
        'PRICE_METHOD', 'ORDER_SOURCE', 'CUSTOMER_ACCOUNT_TYPE',
        'CUSTOMER_MANAGED_LEVEL', 'CUSTOMER_TYPE2', 'CUSTOMER_TYPE1',
        'CUSTOMER_ZIP', 'CUSTOMER_NUMBER'
        ]:
        uniq = set(train[col])
        symbols[col] = list(uniq)
        pass

    grouped = train.groupby(['PRODUCT_NUMBER', 'CUSTOMER_SEGMENT1'])
    samples = []
    for k, df in grouped:
        #print('{' + '"{}", \'{}\''.format(k[0], 'B' if k[1] else 'A') + '},')
        sample = Fractions(df, symbols, [
        'PRICE_METHOD', 'ORDER_SOURCE', 'CUSTOMER_ACCOUNT_TYPE',
        'CUSTOMER_MANAGED_LEVEL', 'CUSTOMER_TYPE2', 'CUSTOMER_TYPE1',
        #'CUSTOMER_ZIP', 'CUSTOMER_NUMBER'
        ])
        ATTRIBUTES2 = ['PRODUCT_CLASS_ID1',
              'BRAND', # binary
              'PRODUCT_SALES_UNIT', # binary
              'PRODUCT_UNIT_OF_MEASURE',
              'SPECIAL_PART'
              ]
        sample = sample.append(df.iloc[0][ATTRIBUTES2])

        ########################
        boxes_sold = df['TOTAL_BOXES_SOLD']

        pcost1 = df['PRODUCT_COST1'].abs()
        pcost1_per_item = pcost1 / boxes_sold
        pcost1_mean = pcost1_per_item.mean()
        pcost1_std = pcost1_per_item.std()
        sample.set_value('PCOST1_REL_STD', pcost1_std / pcost1_mean)
        sample.set_value('PCOST1_REL_MAX', pcost1_per_item.max() / pcost1_mean)
        sample.set_value('PCOST1_REL_MIN', pcost1_per_item.min() / pcost1_mean)

        price = df['PRODUCT_PRICE'].abs()
        price_mean = price.mean()
        price_std = price.std()
        sample.set_value('PRICE_REL_STD', price_std / price_mean)
        sample.set_value('PRICE_REL_MAX', price.max() / price_mean)
        sample.set_value('PRICE_REL_MIN', price.min() / price_mean)

        if sample['PRODUCT_UNIT_OF_MEASURE'] < 2:
            commision = price / pcost1_per_item
        else:
            commision = df['GROSS_SALES'].abs() / pcost1
        commision_mean = commision.mean()
        commision_std = commision.std()
        sample.set_value('COMMN_MEAN', commision_mean)
        sample.set_value('COMMN_REL_STD', commision_std / commision_mean)
        sample.set_value('COMMN_REL_MAX', commision.max() / commision_mean)
        sample.set_value('COMMN_REL_MIN', commision.min() / commision_mean)

#        tx_days = df['TRANSACTION_DATE_1'].combine(
#                df[['TRANSACTION_DATE_2', 'TRANSACTION_DATE_3']],
#                 func=lambda y, m_d: (y - 1970)* 365 + m_d['TRANSACTION_DATE_2'] * 30 + m_d['TRANSACTION_DATE_3'] - 1)
#        sample.set_value('FIRST_TX', tx_days.min())
#        august2014 = (2014 - 1970) * 365 + 8 * 30
#        sample.set_value('LAST_365D_TX', (tx_days > august2014).sum())

#        monthly = df['TRANSACTION_DATE_2'].value_counts(normalize=True)
#        monthly = monthly.reindex([i + 1 for i in range(12)], fill_value=0.)
#        sample.set_value('TX_Q1', monthly[[1, 2, 3]].sum())
#        sample.set_value('TX_Q2', monthly[[4, 5, 6]].sum())
#        sample.set_value('TX_Q3', monthly[[7, 8, 9]].sum())
#        sample.set_value('TX_Q4', monthly[[10, 11, 12]].sum())
#        sample = sample.append(monthly.rename(lambda i: 'TX_M_' + str(i)))
#
#        tx_days = df['TRANSACTION_DATE_1'].combine(
#                df[['TRANSACTION_DATE_2', 'TRANSACTION_DATE_3']],
#                 func=lambda y, m_d: y * 365 + m_d['TRANSACTION_DATE_2'] * 30 + m_d['TRANSACTION_DATE_3'])
#        tx_days.sort()
#        delta_tx_days = tx_days.diff()
#        means_delta_tx_days = delta_tx_days.mean()
#        sample.set_value('DTX_DAYS_MEAN', means_delta_tx_days)
#        sample.set_value('DTX_DAYS_REL_STD', delta_tx_days.std() / means_delta_tx_days)
        ########################

#        #most frequent customer
#        custcounts = df['CUSTOMER_NUMBER'].value_counts()
#        topcust = custcounts.index[0]
#        sample.set_value('TOP_CUST', topcust)
#        # most frequent zip
#        zipcounts = df['CUSTOMER_ZIP'].value_counts()
#        topzip = zipcounts.index[0]
#        sample.set_value('TOP_ZIP', topzip)

#        # number of unique transactions
#        sample.set_value('NTRANS', len(df))
#        # number of unique customers
#        custcounts = df['CUSTOMER_NUMBER'].value_counts()
#        sample.set_value('NCUST', len(custcounts))

        #sample = sample.append(df.iloc[0][['SPECIAL_PART']])
        samples.append(sample)
        pass

    from pandas import DataFrame
    train_df = DataFrame.from_records(samples)
    train_y = train_df['SPECIAL_PART'].values
    train_X = train_df.drop(['SPECIAL_PART'], axis=1)
    train_keys = [k for k, _ in grouped]

    from numpy import digitize
    train_y = digitize(train_y, [0.5])

    avnet_kwargs = \
    {
        #'objective': 'reg:logistic',
        'objective': 'rank:pairwise',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        #'cache_opt': 1,
        'missing': float('nan')
        #'scoring': NegQWKappaScorer
    }
    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        avnet_kwargs[k] = v
        pass

    # create model instance
    from xgb_sklearn import XGBClassifier
    if estimator == 'XGBClassifier':
        clf = XGBClassifier(**avnet_kwargs)
        pass
    else:
        clf = globals()[estimator](**avnet_kwargs)
        pass

    from sklearn.metrics import make_scorer
    tco_scorer = make_scorer(AvnetScorer)

    if do_hyperopt:
        def objective(space):
            param_grid = {'objective': ['binary:logistic']}
            #param_grid = {'objective': ['binary:logitraw']}
            #param_grid = {'objective': ['rank:pairwise']}
            #param_grid = {'objective': ['rank:pairwise'], 'booster_type': ['gblinear']}
            for k, v in space.items():
                if k in ['n_estimators', 'max_depth', 'min_child_weight', 'num_pairwise']:
                    v = int(v)
                    pass
                param_grid[k] = [v]
                pass

            from sklearn.cross_validation import StratifiedKFold, LeaveOneOut
            from sklearn.grid_search import GridSearchCV

            from sklearn.cross_validation import _PartitionIterator
            class CustomLOO(_PartitionIterator):
                def __init__(self, train_keys):
                    ids = set(t[0] for t in train_keys)
                    self.n_folds = len(ids)
                    self.n = len(train_keys)

                    from numpy import zeros, array
                    test_folds = zeros(len(train_keys))
                    for i, k in enumerate(ids):
                        mask = [t[0] == k for t in train_keys]
                        test_folds[array(mask)] = i
                        pass
                    self.test_folds = test_folds
                    pass

                #def _iter_test_indices(self):
                #    return range(self.n_folds)
                def _iter_test_masks(self):
                    for i in range(self.n_folds):
                        yield self.test_folds == i

                def __len__(self):
                    return self.n_folds
                pass

            grid = GridSearchCV(estimator=clf,
                            param_grid=param_grid,
                            #cv=StratifiedKFold(train_y, n_folds=nfolds),
                            #cv=LeaveOneOut(91),
                            cv=CustomLOO(train_keys),
                            scoring=tco_scorer,
                            n_jobs=1,
                            #verbose=2,
                            refit=False)
            grid.fit(train_X, train_y)

            print('best score: {:.5f}  best params: {}'.format(grid.best_score_, grid.best_params_))
            return -grid.best_score_

        from sys import path as sys_path
        sys_path.insert(0, './hyperopt')
        from hyperopt import fmin, tpe, hp

        # cheatsheet:
        # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
        space = {
            'n_estimators': hp.quniform("x_n_estimators", 2, 32, 1),
            'max_depth': hp.quniform("x_max_depth", 1, 24, 1),
            'min_child_weight': hp.quniform ('x_min_child', 1, 16, 1),
            #'gamma': hp.uniform ('x_gamma', 0.0, 2.0),
            'scale_pos_weight': hp.uniform ('x_scale_pos_weight', 0.2, 1.0),

            #'num_pairsample': hp.quniform ('x_num_pairsample', 1, 20, 1),
            #'learning_rate': hp.uniform ('x_learning_rate', 0.03, 0.06),

            'subsample': hp.uniform ('x_subsample', 0.8, 1.0),
            'colsample_bytree': hp.uniform ('x_colsample_bytree', 0.3, 1.0)
            }
        print(clf)
        print(space)
        best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=1000,
            )
        print(best)
        pass

    return

    """
best score: 956593.40659
best params: {'colsample_bytree': 0.6964853661142929, 'min_child_weight': 2, 'n_estimators': 160, 'subsample': 0.9904670890953792, 'objective': 'rank:pairwise', 'max_depth': 8, 'gamma': 0.663344866861138}
{'x_gamma': 0.66334486686113803, 'x_min_child': 2.0, 'x_max_depth': 8.0, 'x_subsample': 0.99046708909537917, 'x_colsample_bytree': 0.6964853661142929, 'x_n_estimators': 160.0}

    """



    """ Model crossvalidation """
    if (False
        #or True
        ):
        param_grid = {
                    #'objective': ['binary:logitraw'],
                    'objective': ['rank:pairwise'],
                    #'booster': ['gblinear'],
                    'n_estimators': [580],

                    'max_depth': [6],
                    'min_child_weight': [45],
                    'gamma': [0.],

                    'subsample': [0.85],
                    'colsample_bytree': [0.65],

                    'learning_rate': [0.045],
                    }
        for k, v in cv_grid.items():
            param_grid[k] = v

        from sklearn.cross_validation import StratifiedKFold
        from sklearn.grid_search import GridSearchCV
        grid = GridSearchCV(estimator=clf,
                                param_grid=param_grid,
                                cv=StratifiedKFold(train_y, n_folds=nfolds),
                                scoring='roc_auc',
                                n_jobs=1,
                                verbose=2,
                                refit=False)
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print('  {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

    """ Print feature importances """
    if (False
        #or True
        ):
        clf.fit(train_X, train_y)
        feature_names = train_X.columns.values.tolist()
        from numpy import zeros
        feature_importances = zeros(len(feature_names))
        importances = clf.booster().get_fscore()
        for i, feat in enumerate(feature_names):
            if feat in importances:
                feature_importances[i] += importances[feat]
                pass
            pass
        import operator
        sorted_importances = sorted(zip(feature_names, feature_importances), key=operator.itemgetter(1), reverse=True)
        for k, v in sorted_importances:
            print("{}\t{}".format(v, k))
            pass
        print([k for k, v in sorted_importances if v == 0])
        pass

    """ Hyperopt """


    return
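Both work() variants in this example wrap GridSearchCV inside a hyperopt objective. A minimal, self-contained sketch of the fmin pattern itself (the quadratic objective below is purely illustrative):

from hyperopt import fmin, tpe, hp

def objective(space):
    x = space['x']
    return (x - 3.0) ** 2  # hyperopt minimizes the returned value

space = {'x': hp.uniform('x', -10.0, 10.0)}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)
print(best)  # e.g. {'x': 2.97...}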
def work(estimator, nest, njobs, nfolds, cv_grid, clf_kwargs):

    from numpy.random import seed as random_seed
    random_seed(1)

    from pandas import read_csv

    all_data = read_csv("../../data/demographic_membership_training.csv")

    train_y = all_data['DEMO_X'].values
    train_X = all_data.drop(['CONSUMER_ID', 'DEMO_X'], axis=1)

    from pandas import factorize
    train_X['GENDER'][train_X['GENDER'] == 'U'] = float('nan')

    for col in FACTORIZABLE:
        from pandas import isnull

        missing = isnull(train_X[col])

        train_X[col] = factorize(train_X[col])[0]  ## NANs become -1
        train_X[col][missing] = float('nan')

        from numpy import isnan
        print("NANs after factorization", sum(train_X[col].apply(isnan)))
        pass

    train_X = OneHot(train_X, NOMINALS)

    #train_X['**PROP_PAGE_IMPRESSIONS_DWELL'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['TOTAL_DWELL']
    #train_X['**PROP_VOD_VIEWS_DWELL'] = train_X['VOD_VIEWS_DWELL'] / train_X['TOTAL_DWELL']
    #
    #    train_X['**FLAG_WARD_WKDAY_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKDAY_')]].sum(axis=1)
    #    train_X['**FLAG_WARD_WKEND_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKEND_')]].sum(axis=1)
    #    train_X['**FLAG_UNI_CLUSTER_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_UNI_CLUSTER_')]].sum(axis=1)
    #    train_X['**INTERESTS_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('INTEREST_')]].sum(axis=1)
    #    train_X['**AGE_25'] = train_X['AGE'] < 25
    #    train_X['**AGE_30'] = train_X['AGE'] < 30
    #    train_X['**AGE_35'] = train_X['AGE'] < 35
    #    train_X['**AGE_40'] = train_X['AGE'] < 40
    #    train_X['**AGE_45'] = train_X['AGE'] < 45
    #train_X['**PAGE_IMP_DWELL_PER_DAY'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['REGISTRATION_DAYS']
    #train_X['**LATE_PAGE_VIEWS_PER_DAY'] = train_X['LATE_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**TOTAL_DWELL_PER_DAY'] = train_X['TOTAL_DWELL'] / train_X['REGISTRATION_DAYS']
    #train_X['**AFTERNOON_PAGE_VIEWS_PER_DAY'] = train_X['AFTERNOON_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**PAGE_IMPRESSION_VISITS_PER_DAY'] = train_X['PAGE_IMPRESSION_VISITS'] / train_X['REGISTRATION_DAYS']
    #train_X['**LUNCHTIME_PAGE_VIEWS_PER_DAY'] = train_X['LUNCHTIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**NIGHT_TIME_PAGE_VIEWS_PER_DAY'] = train_X['NIGHT_TIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #    train_X['**BREAKFAST_PAGE_VIEWS_PER_DAY'] = train_X['BREAKFAST_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #    train_X['**VIDEO_STOPS_PER_DAY'] = train_X['VIDEO_STOPS'] / train_X['REGISTRATION_DAYS']

    #    TO_DROP = [
    #        'VIEWS_AFF4', 'FLAG_WARD_WKDAY_10_16', 'FLAG_WARD_WKDAY_17_19',
    #        'FLAG_WARD_WKDAY_20_24', 'FLAG_WARD_WKEND_10_13', 'FLAG_WARD_WKEND_14_20',
    #        'FLAG_UNI_CLUSTER_15', 'FLAG_UNI_CLUSTER_23', 'FLAG_UNI_CLUSTER_28',
    #        'FLAG_UNI_CLUSTER_29', 'FLAG_UNI_CLUSTER_33', 'FLAG_WEBSITE',
    #        'FLAG_BREAKFAST_VIEWS', 'FLAG_LUNCHTIME_VIEWS', 'FLAG_AFTERNOON_VIEWS',
    #        'FLAG_CATCHUP_VIEWS', 'FLAG_ARCHIVE_VIEWS', 'FLAG_AFF3', 'FLAG_AFF4',
    #        'REGISTRATION_ROUTE_3', 'REGISTRATION_ROUTE_4', 'REGISTRATION_CONTEXT_3',
    #        'REGISTRATION_CONTEXT_6', 'REGISTRATION_CONTEXT_8', 'REGISTRATION_CONTEXT_9',
    #        'REGISTRATION_CONTEXT_10', 'REGISTRATION_CONTEXT_11', 'REGISTRATION_CONTEXT_12',
    #        'REGISTRATION_CONTEXT_13', 'REGISTRATION_CONTEXT_14', 'REGISTRATION_CONTEXT_15',
    #        'REGISTRATION_CONTEXT_16', 'REGISTRATION_CONTEXT_17', 'REGISTRATION_CONTEXT_18',
    #        'REGISTRATION_CONTEXT_19', 'REGISTRATION_CONTEXT_20', 'REGISTRATION_CONTEXT_21',
    #        'REGISTRATION_CONTEXT_22', 'REGISTRATION_CONTEXT_23', 'REGISTRATION_CONTEXT_24',
    #        'REGISTRATION_CONTEXT_25', 'REGISTRATION_CONTEXT_26', 'REGISTRATION_CONTEXT_27',
    #        'MIGRATED_USER_TYPE_5', 'TOD_CENTRE_3', 'CONTENT_CENTRE_1', 'CONTENT_CENTRE_2',
    #        'CONTENT_CENTRE_4', 'CONTENT_CENTRE_5', 'CONTENT_CENTRE_6', 'CONTENT_CENTRE_7',
    #        'CONTENT_CENTRE_8', 'CONTENT_CENTRE_9', 'CONTENT_CENTRE_12','CONTENT_CENTRE_13',
    #        'CONTENT_CENTRE_15']
    #    TO_DROP += [
    #        'SOCIAL_AUTH_TWITTER', 'FLAG_WARD_WKEND_3_9', 'FLAG_UNI_CLUSTER_7',
    #        'FLAG_UNI_CLUSTER_13', 'FLAG_UNI_CLUSTER_21', 'FLAG_UNI_CLUSTER_22',
    #        'FLAG_UNI_CLUSTER_25', 'FLAG_ANDROID', 'FLAG_LATE_PEAK_VIEWS',
    #        'FLAG_NIGHT_TIME_VIEWS', 'FLAG_AFF1', 'FLAG_AFF2', 'MIGRATED_USER_TYPE_4',
    #        'CONTENT_CENTRE_10', 'CONTENT_CENTRE_14', 'CONTENT_CENTRE_16']
    #    TO_DROP += [
    #        'FLAG_WARD_WKDAY_3_9', 'FLAG_UNI_CLUSTER_5', 'FLAG_UNI_CLUSTER_8',
    #        'FLAG_UNI_CLUSTER_9', 'FLAG_UNI_CLUSTER_17', 'FLAG_UNI_CLUSTER_26',
    #        'FLAG_MORNING_VIEWS', 'FLAG_EARLY_PEAK_VIEWS']
    #    TO_DROP += [
    #        'FLAG_WARD_WKEND_1_2', 'FLAG_WARD_WKEND_21_24', 'FLAG_UNI_CLUSTER_1',
    #        'FLAG_UNI_CLUSTER_14', 'FLAG_MAIN', 'FLAG_OTHER_VIEWS', 'CONTENT_CENTRE_11']
    #    TO_DROP += ['FLAG_UNI_CLUSTER_12', 'FLAG_UNI_CLUSTER_19', 'FLAG_UNI_CLUSTER_27']
    #    TO_DROP += ['FLAG_UNI_CLUSTER_11', 'FLAG_UNI_CLUSTER_24'] ## 814000 ?
    #    TO_DROP += ['FLAG_UNI_CLUSTER_2', 'FLAG_UNI_CLUSTER_16']
    #    TO_DROP += ['FLAG_UNI_CLUSTER_10', 'FLAG_UNI_CLUSTER_31', 'FLAG_POST_PEAK_VIEWS', 'TOD_CENTRE_2']
    #    TO_DROP += ['FLAG_UNI_CLUSTER_30']
    #    train_X = train_X.drop(TO_DROP, axis=1)

    #    train_X.fillna(-1, inplace=True)

    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV

    if (False
            #or True
        ):
        skf = StratifiedKFold(train_y, n_folds=nfolds)
        from numpy import asarray
        selection = asarray(['-'] * len(train_y))
        symbol = 0
        for train_index, test_index in skf:
            selection[test_index] = chr(symbol + 48)
            symbol += 1
            pass
        print(''.join(selection))
        return

    muse_kwargs = \
    {
        #'objective': 'reg:logistic',
        'objective': 'rank:pairwise',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        'missing': float('nan')
        #'scoring': NegQWKappaScorer
    }

    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        muse_kwargs[k] = v
        pass

    #clf = globals()[estimator](**muse_kwargs)
    #from xgboost import XGBClassifier
    clf = XGBClassifier(**muse_kwargs)
    #clf = MillenialsClassifier(**muse_kwargs)

    from sklearn.metrics import make_scorer
    tco_scorer = make_scorer(MinPRScorer)
    """
binary:logistic
    grid scores:
  mean: 787812.76918, std: 1297.55109, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 1}
  mean: 789084.73195, std: 1925.75110, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 3}
  mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
  mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
  mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
  mean: 788168.38281, std: 928.87371, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80}
best score: 789958.10747
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}

reg:logistic (same as above)
grid scores:
  mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
  mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
  mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
best score: 789958.10747
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}


grid scores:
  mean: 786388.90860, std: 906.72660, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 300, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}
  mean: 789050.88848, std: 1708.63378, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}
  mean: 789454.57872, std: 2059.68811, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}
best score: 789454.57872
best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}

======================================================
rank:pairwise
grid scores:
  mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
best score: 806358.37855
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}

grid scores:
  mean: 750119.43597, std: 9120.06057, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 4, 'min_child_weight': 20}
  mean: 809673.54959, std: 4784.35577, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20}
  mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20}
  mean: 794998.50356, std: 2029.93836, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 20, 'min_child_weight': 20}
  mean: 794548.01245, std: 2062.41505, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 50, 'min_child_weight': 20}
best score: 809673.54959
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20}
>>> 'max_depth': 6, 'min_child_weight': 20

grid scores:
  mean: 802508.37926, std: 4201.47242, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 10}
  mean: 793935.52998, std: 7607.45918, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 20}
  mean: 784568.74090, std: 7161.04235, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 40}
  mean: 802325.99222, std: 1833.64884, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
  mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
  mean: 808437.63308, std: 3881.55687, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40}
  mean: 798618.25778, std: 2948.03146, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 10}
  mean: 802665.25722, std: 2350.85430, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 20}
  mean: 806720.10926, std: 2543.82598, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 40}
  mean: 795701.38488, std: 2962.99442, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 10}
  mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20}
  mean: 803385.26027, std: 2271.86591, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 40}
best score: 808437.63308
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40}
>>> 'max_depth': 7, 'min_child_weight': 40

grid scores:
  mean: 782028.41606, std: 9637.64116, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 50}
  mean: 769010.75894, std: 6079.16367, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 60}
  mean: 760914.24094, std: 9643.26515, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 80}
  mean: 807557.88495, std: 4219.13250, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 50}
  mean: 801663.63876, std: 7556.18492, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 60}
  mean: 784727.73532, std: 7314.95469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 80}
  mean: 811735.94787, std: 3476.37280, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
  mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60}
  mean: 806342.26320, std: 7227.93062, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80}
best score: 812694.98649
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60}
>>> 'max_depth': 7, 'min_child_weight': 60

    grid scores:
  mean: 811261.01220, std: 1387.81968, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 55}
  mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60}
  mean: 813522.63431, std: 5054.98775, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65}
  mean: 811147.14498, std: 1469.98812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 70}
  mean: 810716.38989, std: 3383.29928, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 55}
  mean: 810977.37920, std: 3039.52816, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 60}
  mean: 809034.76724, std: 4751.72859, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 65}
  mean: 810902.03165, std: 3741.53151, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 70}
best score: 813522.63431
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65}
>>> 'max_depth': 7, 'min_child_weight': 65
    """
    """  ONE HOT   ***
grid scores:
  mean: 808785.95756, std: 3732.33890, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 807606.64678, std: 6685.61758, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 798856.94075, std: 8380.25083, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 1.0, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812217.95285
best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'subsample': 0.8

grid scores:
  mean: 807015.44496, std: 1745.61053, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811229.10339, std: 2626.00511, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809107.13182, std: 3766.10287, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 806700.86249, std: 1673.53048, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 808803.17397, std: 2141.65596, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 810538.01687, std: 4532.84397, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 808655.96400, std: 2917.13000, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809115.42707, std: 2625.94051, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809472.09619, std: 3144.06367, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 811229.10339
best params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'colsample_bytree': 0.6, 'subsample': 0.8

grid scores:
  mean: 804112.81441, std: 8632.61400, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 807667.00596, std: 2109.00871, params: {'colsample_bytree': 0.67, 'learning_rate': 0.06, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812217.95285
best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'learning_rate': 0.045

grid scores:
  mean: 810936.40241, std: 2661.32895, params: {'colsample_bytree': 0.67, 'learning_rate': 0.04, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811906.13100, std: 3339.45916, params: {'colsample_bytree': 0.67, 'learning_rate': 0.05, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812217.95285
best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'learning_rate': 0.045

grid scores:
  mean: 811208.11854, std: 3435.62254, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809690.89826, std: 2189.70344, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 808096.24696, std: 3457.47957, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811478.50583, std: 2864.64292, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'learning_rate': 0.045, 'colsample_bytree': 0.65, 'subsample': 0.85

grid scores:
  mean: 810759.12376, std: 1528.10128, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 50, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811977.26246, std: 4991.06664, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 80, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> no change

grid scores:
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 810059.35718, std: 1824.61383, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 1.0}
  mean: 810675.99076, std: 3153.29552, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 2.0}
  mean: 809731.68599, std: 4091.21405, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 4}
  mean: 751547.94084, std: 10351.41628, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 10}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
>>> 'gamma': 0

grid scores:
  mean: 786632.68012, std: 7192.58756, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.4, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 746308.67706, std: 11236.07404, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.3, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 695506.87990, std: 7719.89820, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.2, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 802604.50597, std: 2869.94386, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 798443.86047, std: 1863.64175, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
>>> 'base_score': 0.5

    """

    param_grid = {
        #'objective': ['binary:logitraw'],
        'objective': ['rank:pairwise'],
        #'booster': ['gblinear'],
        'n_estimators': [580],
        'max_depth': [6],
        'min_child_weight': [45, 50, 55],
        'gamma': [0.],
        'subsample': [0.85],
        'colsample_bytree': [0.65],
        'learning_rate': [0.045],
    }
    for k, v in cv_grid.items():
        param_grid[k] = v

    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        cv=StratifiedKFold(train_y, n_folds=nfolds),
                        scoring=tco_scorer,
                        n_jobs=1,
                        verbose=2,
                        refit=False)

    if (False
            #or True
        ):
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print('  {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

    if (False
            #or True
        ):
        clf.fit(train_X, train_y)
        feature_names = train_X.columns.values.tolist()
        from numpy import zeros
        feature_importances = zeros(len(feature_names))
        importances = clf.booster().get_fscore()
        for i, feat in enumerate(feature_names):
            if feat in importances:
                feature_importances[i] += importances[feat]
                pass
            pass
        import operator
        sorted_importances = sorted(zip(feature_names, feature_importances),
                                    key=operator.itemgetter(1),
                                    reverse=True)
        for k, v in sorted_importances:
            print("{}\t{}".format(v, k))
            pass
        print([k for k, v in sorted_importances if v == 0])
        pass

    if (False or True):

        def objective(space):
            #param_grid = {'objective': ['binary:logistic']}
            #param_grid = {'objective': ['binary:logitraw']}
            param_grid = {
                'objective': ['rank:pairwise'],
                'booster_type': ['gblinear']
            }
            for k, v in space.items():
                if k in [
                        'n_estimators', 'max_depth', 'min_child_weight',
                        'num_pairwise'
                ]:
                    v = int(v)
                    pass
                param_grid[k] = [v]
                pass

            grid = GridSearchCV(estimator=clf,
                                param_grid=param_grid,
                                cv=StratifiedKFold(train_y, n_folds=nfolds),
                                scoring=tco_scorer,
                                n_jobs=1,
                                verbose=2,
                                refit=False)
            grid.fit(train_X, train_y)

            print('best score: {:.5f}  best params: {}'.format(
                grid.best_score_, grid.best_params_))
            return -grid.best_score_

        from sys import path as sys_path
        sys_path.insert(0, './hyperopt')
        from hyperopt import fmin, tpe, hp

        # cheatsheet:
        # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
        space = {
            'n_estimators': hp.quniform("x_n_estimators", 500, 800, 10),
            'max_depth': hp.quniform("x_max_depth", 4, 8, 1),
            'min_child_weight': hp.quniform('x_min_child', 45, 240, 5),
            'gamma': hp.uniform('x_gamma', 0.0, 2.0),
            #'scale_pos_weight': hp.uniform ('x_scale_pos_weight', 0.5, 1.0),
            'num_pairsample': hp.quniform('x_num_pairsample', 1, 4, 1),
            'subsample': hp.uniform('x_subsample', 0.4, 1.0),
            'colsample_bytree': hp.uniform('x_colsample_bytree', 0.4, 1.0)
        }
        best = fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
        )
        print(best)
        pass

    return
Exemple #11
0
def work(out_csv_file, estimator, nest, njobs, nfolds, cv_grid, minimizer,
         nbuckets, mvector, imputer, clf_kwargs, int_fold):

    from numpy.random import seed as random_seed
    random_seed(1)

    from zipfile import ZipFile
    from pandas import read_csv, factorize
    from numpy import rint, clip, savetxt, stack

    if KAGGLE:
        train = read_csv("../input/train.csv")
        test = read_csv("../input/test.csv")
    else:
        train = read_csv(
            ZipFile("../../data/train.csv.zip", 'r').open('train.csv'))
        test = read_csv(
            ZipFile("../../data/test.csv.zip", 'r').open('test.csv'))


#    gmm17_train = read_csv('GMM_17_full_train.csv')
#    gmm17_test = read_csv('GMM_17_full_test.csv')
#    gmm6_train = read_csv('GMM_6_full_train.csv')
#    gmm6_test = read_csv('GMM_6_full_test.csv')
#
#    train['GMM17'] = gmm17_train['Response']
#    test['GMM17'] = gmm17_test['Response']
#    train['GMM6'] = gmm6_train['Response']
#    test['GMM6'] = gmm6_test['Response']

    # combine train and test
    all_data = train.append(test)

    #    G_vectors = read_csv('../../data/G_vectors.csv')
    #    #all_data = all_data.join(G_vectors.drop(['G3'], axis=1))
    #    all_data = all_data.join(
    #        G_vectors[['G8', 'G11', 'G12', 'G13', 'G17', 'G18', 'G19', 'G20']])

    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    all_data[DISCRETE] = imp.fit_transform(all_data[DISCRETE])
    #    from numpy import bincount
    #    for col in all_data[DISCRETE]:
    #        top = bincount(all_data[col].astype(int)).argmax()
    #        all_data[col] -= top
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    all_data[CONTINUOUS] = imp.fit_transform(all_data[CONTINUOUS])
    #    all_data[BOOLEANS] = all_data[BOOLEANS] + 1e6

    #    from sklearn.preprocessing import StandardScaler
    #    from sklearn.decomposition import PCA
    #    std = StandardScaler(copy=True)
    #    all_data[CONTINUOUS] = std.fit_transform(all_data[CONTINUOUS])
    #    pca = PCA(whiten=False, copy=True)
    #    all_data[CONTINUOUS] = pca.fit_transform(all_data[CONTINUOUS])

    # create any new variables
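    # split Product_Info_2 (a letter followed by a digit, e.g. 'D3') into its
    # character and numeric parts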
    all_data['Product_Info_2_char'] = all_data.Product_Info_2.str[0]
    all_data['Product_Info_2_num'] = all_data.Product_Info_2.str[1]

    # factorize categorical variables
    all_data['Product_Info_2'] = factorize(
        all_data['Product_Info_2'])[0]  # + 1
    all_data['Product_Info_2_char'] = factorize(
        all_data['Product_Info_2_char'])[0]  # + 1
    all_data['Product_Info_2_num'] = factorize(
        all_data['Product_Info_2_num'])[0]  # + 1
    """
    Both: 0.65576
    BmiAge: 0.65578
    MedCount: 0.65638
    None: 0.65529
    """
    all_data['BMI_Age'] = all_data['BMI'] * all_data['Ins_Age']
    med_keyword_columns = all_data.columns[all_data.columns.str.startswith(
        'Medical_Keyword_')]
    all_data['Med_Keywords_Count'] = all_data[med_keyword_columns].sum(axis=1)
    """
    print('BOOLEANS:')
    for col in all_data[BOOLEANS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('DISCRETE:')
    for col in all_data[DISCRETE]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('CONTINUOUS:')
    for col in all_data[CONTINUOUS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    print('NOMINALS:')
    for col in all_data[NOMINALS]:
        print(col, all_data[col].dtype, min(all_data[col]), max(all_data[col]), float(sum(all_data[col] == 0)) / len(all_data[col]))
    return
    """

    # Use -1 for any others
    if imputer is None:
        all_data.fillna(-1, inplace=True)
    else:
        all_data['Response'].fillna(-1, inplace=True)

    # fix the dtype on the label column
    all_data['Response'] = all_data['Response'].astype(int)

    # split train and test
    train = all_data[all_data['Response'] > 0].copy()
    test = all_data[all_data['Response'] < 1].copy()

    #dropped_cols = ['Id', 'Response', 'Medical_History_10', 'Medical_History_24']#, 'Medical_History_32']
    dropped_cols = ['Id', 'Response']

    train_y = train['Response'].values
    train_X = train.drop(dropped_cols, axis=1)
    test_X = test.drop(dropped_cols, axis=1)

    if imputer is not None:
        from sklearn.preprocessing import Imputer
        imp = Imputer(missing_values='NaN', strategy=imputer, axis=0)
        train_X = imp.fit_transform(train_X)
        test_X = imp.transform(test_X)

    prudential_kwargs = \
    {
        'objective': 'reg:linear',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        'n_buckets': nbuckets,
        'initial_params': mvector,
        'minimizer': minimizer,
        'scoring': NegQWKappaScorer
    }
    if estimator == 'PrudentialRegressorCVO2FO' or estimator == 'PrudentialRegressorCVO2':
        prudential_kwargs['int_fold'] = int_fold
        pass

    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        prudential_kwargs[k] = v
    clf = globals()[estimator](**prudential_kwargs)
    print(estimator, clf.get_params())

    if nfolds > 1:
        param_grid = {
            'n_estimators': [700],
            'max_depth': [6],
            'colsample_bytree': [0.67],
            'subsample': [0.9],
            'min_child_weight': [240],
            #'initial_params': [[-0.71238755, -1.4970176, -1.73800531, -1.13361266, -0.82986203, -0.06473039, 0.69008725, 0.94815881]]
        }
        for k, v in cv_grid.items():
            param_grid[k] = v

        from sklearn.metrics import make_scorer
        MIN, MAX = (1, 8)
        qwkappa = make_scorer(Kappa,
                              weights='quadratic',
                              min_rating=MIN,
                              max_rating=MAX)

        from sklearn.cross_validation import StratifiedKFold
        from sklearn.grid_search import GridSearchCV
        grid = GridSearchCV(estimator=clf,
                            param_grid=param_grid,
                            cv=StratifiedKFold(train_y, n_folds=nfolds),
                            scoring=qwkappa,
                            n_jobs=1,
                            verbose=1,
                            refit=False)
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print('  {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

        pass

    else:
        clf.fit(train_X, train_y)

        final_test_preds = clf.predict(test_X)
        final_test_preds = rint(clip(final_test_preds, 1, 8))

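        # write the submission file: Id plus the rounded Response predictions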
        savetxt(out_csv_file,
                stack(zip(test['Id'].values, final_test_preds), axis=1).T,
                delimiter=',',
                fmt=['%d', '%d'],
                header='"Id","Response"',
                comments='')

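        # report the booster's feature importance (fscore), sorted by score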
        importance = clf.xgb.booster().get_fscore()
        import operator
        print(sorted(importance.items()), "\n")
        importance = sorted(importance.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        print(importance, "\n")
        features = [k for k, _ in importance]
        print(len(features), features)

    return
import os
from os.path import isfile, join

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# from progressbar import progressbar

import tensorflow as tf
from tensorflow.core.example import example_pb2

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

random_seed(21)  # Reproducibility

#Define the Global Variables
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('command', 'text_to_binary',
                           'Either text_to_vocabulary or text_to_binary. '
                           'Specify FLAGS.in_folder accordingly.')
tf.app.flags.DEFINE_string('in_folder', '', 'path to input json data file')
tf.app.flags.DEFINE_string('out_files', '', 'comma separated paths to files')  # specify the output files on the command line
tf.app.flags.DEFINE_string('split', '', 'comma separated fractions of data')  # specify during the terminal command call
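# NOTE: illustrative sketch only (the names below are hypothetical, not part of
# the original script). Given the flag descriptions above, the comma-separated
# flags would typically be unpacked before use roughly like this:
#   output_file_names = FLAGS.out_files.split(',')
#   split_fractions = [float(s) for s in FLAGS.split.split(',')]
#   assert len(output_file_names) == len(split_fractions)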
# tf.app.flags.DEFINE_string('vocab_file','data/vocabulary','path to output the vocab of the training corpus')
# tf.app.flags.DEFINE_integer('body_len', 30000, 'Define the length of body to consider')
# tf.app.flags.DEFINE_integer('abs_len', 1500, 'Define the length of abstract to consider')
tf.app.flags.DEFINE_integer('max_words', 500000000, 'Define the max number of words to consider in vocab')
#
# UNKNOWN_TOKEN = '<UNK>'
def work(estimator,
         nest,
         njobs,
         nfolds,
         cv_grid,
         clf_kwargs):

    from numpy.random import seed as random_seed
    random_seed(1)

    from pandas import read_csv

    all_data = read_csv("../../data/demographic_membership_training.csv")

    train_y = all_data['DEMO_X'].values
    train_X = all_data.drop(['CONSUMER_ID', 'DEMO_X'], axis=1)

    from pandas import factorize
    # treat unknown gender ('U') as missing; assign via .loc to avoid chained indexing
    train_X.loc[train_X['GENDER'] == 'U', 'GENDER'] = float('nan')

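    # integer-encode each FACTORIZABLE column while preserving missing values
    # (factorize() alone would silently map NaN to -1)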
    for col in FACTORIZABLE:
        from pandas import isnull

        missing = isnull(train_X[col])

        train_X[col] = factorize(train_X[col])[0]  ## NaNs become -1
        train_X.loc[missing, col] = float('nan')  ## restore the missing entries as NaN

        from numpy import isnan
        print("NANs after factorization", sum(train_X[col].apply(isnan)))
        pass

    train_X = OneHot(train_X, NOMINALS)
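    # NOTE: illustrative sketch only -- OneHot() is not defined in this snippet.
    # A minimal pandas-based equivalent that expands each nominal column into
    # dummy indicator columns could look like:
    #   def OneHot(frame, columns):
    #       from pandas import concat, get_dummies
    #       dummies = get_dummies(frame[columns].astype(str), prefix=columns)
    #       return concat([frame.drop(columns, axis=1), dummies], axis=1)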

    #train_X['**PROP_PAGE_IMPRESSIONS_DWELL'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['TOTAL_DWELL']
    #train_X['**PROP_VOD_VIEWS_DWELL'] = train_X['VOD_VIEWS_DWELL'] / train_X['TOTAL_DWELL']
#
#    train_X['**FLAG_WARD_WKDAY_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKDAY_')]].sum(axis=1)
#    train_X['**FLAG_WARD_WKEND_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_WARD_WKEND_')]].sum(axis=1)
#    train_X['**FLAG_UNI_CLUSTER_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('FLAG_UNI_CLUSTER_')]].sum(axis=1)
#    train_X['**INTERESTS_COUNT'] = train_X[train_X.columns[train_X.columns.str.startswith('INTEREST_')]].sum(axis=1)
#    train_X['**AGE_25'] = train_X['AGE'] < 25
#    train_X['**AGE_30'] = train_X['AGE'] < 30
#    train_X['**AGE_35'] = train_X['AGE'] < 35
#    train_X['**AGE_40'] = train_X['AGE'] < 40
#    train_X['**AGE_45'] = train_X['AGE'] < 45
    #train_X['**PAGE_IMP_DWELL_PER_DAY'] = train_X['PAGE_IMPRESSIONS_DWELL'] / train_X['REGISTRATION_DAYS']
    #train_X['**LATE_PAGE_VIEWS_PER_DAY'] = train_X['LATE_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**TOTAL_DWELL_PER_DAY'] = train_X['TOTAL_DWELL'] / train_X['REGISTRATION_DAYS']
    #train_X['**AFTERNOON_PAGE_VIEWS_PER_DAY'] = train_X['AFTERNOON_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**PAGE_IMPRESSION_VISITS_PER_DAY'] = train_X['PAGE_IMPRESSION_VISITS'] / train_X['REGISTRATION_DAYS']
    #train_X['**LUNCHTIME_PAGE_VIEWS_PER_DAY'] = train_X['LUNCHTIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
    #train_X['**NIGHT_TIME_PAGE_VIEWS_PER_DAY'] = train_X['NIGHT_TIME_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
#    train_X['**BREAKFAST_PAGE_VIEWS_PER_DAY'] = train_X['BREAKFAST_PAGE_VIEWS'] / train_X['REGISTRATION_DAYS']
#    train_X['**VIDEO_STOPS_PER_DAY'] = train_X['VIDEO_STOPS'] / train_X['REGISTRATION_DAYS']

#    TO_DROP = [
#        'VIEWS_AFF4', 'FLAG_WARD_WKDAY_10_16', 'FLAG_WARD_WKDAY_17_19',
#        'FLAG_WARD_WKDAY_20_24', 'FLAG_WARD_WKEND_10_13', 'FLAG_WARD_WKEND_14_20',
#        'FLAG_UNI_CLUSTER_15', 'FLAG_UNI_CLUSTER_23', 'FLAG_UNI_CLUSTER_28',
#        'FLAG_UNI_CLUSTER_29', 'FLAG_UNI_CLUSTER_33', 'FLAG_WEBSITE',
#        'FLAG_BREAKFAST_VIEWS', 'FLAG_LUNCHTIME_VIEWS', 'FLAG_AFTERNOON_VIEWS',
#        'FLAG_CATCHUP_VIEWS', 'FLAG_ARCHIVE_VIEWS', 'FLAG_AFF3', 'FLAG_AFF4',
#        'REGISTRATION_ROUTE_3', 'REGISTRATION_ROUTE_4', 'REGISTRATION_CONTEXT_3',
#        'REGISTRATION_CONTEXT_6', 'REGISTRATION_CONTEXT_8', 'REGISTRATION_CONTEXT_9',
#        'REGISTRATION_CONTEXT_10', 'REGISTRATION_CONTEXT_11', 'REGISTRATION_CONTEXT_12',
#        'REGISTRATION_CONTEXT_13', 'REGISTRATION_CONTEXT_14', 'REGISTRATION_CONTEXT_15',
#        'REGISTRATION_CONTEXT_16', 'REGISTRATION_CONTEXT_17', 'REGISTRATION_CONTEXT_18',
#        'REGISTRATION_CONTEXT_19', 'REGISTRATION_CONTEXT_20', 'REGISTRATION_CONTEXT_21',
#        'REGISTRATION_CONTEXT_22', 'REGISTRATION_CONTEXT_23', 'REGISTRATION_CONTEXT_24',
#        'REGISTRATION_CONTEXT_25', 'REGISTRATION_CONTEXT_26', 'REGISTRATION_CONTEXT_27',
#        'MIGRATED_USER_TYPE_5', 'TOD_CENTRE_3', 'CONTENT_CENTRE_1', 'CONTENT_CENTRE_2',
#        'CONTENT_CENTRE_4', 'CONTENT_CENTRE_5', 'CONTENT_CENTRE_6', 'CONTENT_CENTRE_7',
#        'CONTENT_CENTRE_8', 'CONTENT_CENTRE_9', 'CONTENT_CENTRE_12','CONTENT_CENTRE_13',
#        'CONTENT_CENTRE_15']
#    TO_DROP += [
#        'SOCIAL_AUTH_TWITTER', 'FLAG_WARD_WKEND_3_9', 'FLAG_UNI_CLUSTER_7',
#        'FLAG_UNI_CLUSTER_13', 'FLAG_UNI_CLUSTER_21', 'FLAG_UNI_CLUSTER_22',
#        'FLAG_UNI_CLUSTER_25', 'FLAG_ANDROID', 'FLAG_LATE_PEAK_VIEWS',
#        'FLAG_NIGHT_TIME_VIEWS', 'FLAG_AFF1', 'FLAG_AFF2', 'MIGRATED_USER_TYPE_4',
#        'CONTENT_CENTRE_10', 'CONTENT_CENTRE_14', 'CONTENT_CENTRE_16']
#    TO_DROP += [
#        'FLAG_WARD_WKDAY_3_9', 'FLAG_UNI_CLUSTER_5', 'FLAG_UNI_CLUSTER_8',
#        'FLAG_UNI_CLUSTER_9', 'FLAG_UNI_CLUSTER_17', 'FLAG_UNI_CLUSTER_26',
#        'FLAG_MORNING_VIEWS', 'FLAG_EARLY_PEAK_VIEWS']
#    TO_DROP += [
#        'FLAG_WARD_WKEND_1_2', 'FLAG_WARD_WKEND_21_24', 'FLAG_UNI_CLUSTER_1',
#        'FLAG_UNI_CLUSTER_14', 'FLAG_MAIN', 'FLAG_OTHER_VIEWS', 'CONTENT_CENTRE_11']
#    TO_DROP += ['FLAG_UNI_CLUSTER_12', 'FLAG_UNI_CLUSTER_19', 'FLAG_UNI_CLUSTER_27']
#    TO_DROP += ['FLAG_UNI_CLUSTER_11', 'FLAG_UNI_CLUSTER_24'] ## 814000 ?
#    TO_DROP += ['FLAG_UNI_CLUSTER_2', 'FLAG_UNI_CLUSTER_16']
#    TO_DROP += ['FLAG_UNI_CLUSTER_10', 'FLAG_UNI_CLUSTER_31', 'FLAG_POST_PEAK_VIEWS', 'TOD_CENTRE_2']
#    TO_DROP += ['FLAG_UNI_CLUSTER_30']
#    train_X = train_X.drop(TO_DROP, axis=1)

#    train_X.fillna(-1, inplace=True)

    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import GridSearchCV

    if (False
        #or True
        ):
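        # debug helper: print which CV fold each training sample lands in,
        # one character per sample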
        skf = StratifiedKFold(train_y, n_folds=nfolds)
        from numpy import asarray
        selection = asarray(['-'] * len(train_y))
        symbol = 0
        for train_index, test_index in skf:
            selection[test_index] = chr(symbol + 48)
            symbol += 1
            pass
        print(''.join(selection))
        return

    muse_kwargs = \
    {
        #'objective': 'reg:logistic',
        'objective': 'rank:pairwise',
        'learning_rate': 0.045,
        'min_child_weight': 50,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'max_depth': 7,
        'n_estimators': nest,
        'nthread': njobs,
        'seed': 0,
        'missing': float('nan')
        #'scoring': NegQWKappaScorer
    }

    # override kwargs with any changes
    for k, v in clf_kwargs.items():
        muse_kwargs[k] = v
        pass

    #clf = globals()[estimator](**muse_kwargs)
    from xgboost import XGBClassifier
    clf = XGBClassifier(**muse_kwargs)
    #clf = MillenialsClassifier(**muse_kwargs)

    from sklearn.metrics import make_scorer
    tco_scorer = make_scorer(MinPRScorer)

    """
binary:logistic
    grid scores:
  mean: 787812.76918, std: 1297.55109, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 1}
  mean: 789084.73195, std: 1925.75110, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 3}
  mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
  mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
  mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
  mean: 788168.38281, std: 928.87371, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80}
best score: 789958.10747
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}

reg:logistic (same as above)
grid scores:
  mean: 789651.63043, std: 1855.11841, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
  mean: 789958.10747, std: 1305.40202, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
  mean: 788739.11423, std: 952.60469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
best score: 789958.10747
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}


grid scores:
  mean: 786388.90860, std: 906.72660, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 300, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}
  mean: 789050.88848, std: 1708.63378, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}
  mean: 789454.57872, std: 2059.68811, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}
best score: 789454.57872
best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 700, 'subsample': 0.9, 'objective': 'binary:logistic', 'max_depth': 7}

======================================================
rank:pairwise
grid scores:
  mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
best score: 806358.37855
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}

grid scores:
  mean: 750119.43597, std: 9120.06057, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 4, 'min_child_weight': 20}
  mean: 809673.54959, std: 4784.35577, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20}
  mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20}
  mean: 794998.50356, std: 2029.93836, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 20, 'min_child_weight': 20}
  mean: 794548.01245, std: 2062.41505, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 50, 'min_child_weight': 20}
best score: 809673.54959
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 20}
>>> 'max_depth': 6, 'min_child_weight': 20

grid scores:
  mean: 802508.37926, std: 4201.47242, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 10}
  mean: 793935.52998, std: 7607.45918, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 20}
  mean: 784568.74090, std: 7161.04235, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 40}
  mean: 802325.99222, std: 1833.64884, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 10}
  mean: 806358.37855, std: 4488.86812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 20}
  mean: 808437.63308, std: 3881.55687, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40}
  mean: 798618.25778, std: 2948.03146, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 10}
  mean: 802665.25722, std: 2350.85430, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 20}
  mean: 806720.10926, std: 2543.82598, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 40}
  mean: 795701.38488, std: 2962.99442, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 10}
  mean: 798151.02989, std: 2162.04583, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 20}
  mean: 803385.26027, std: 2271.86591, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 10, 'min_child_weight': 40}
best score: 808437.63308
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 40}
>>> 'max_depth': 7, 'min_child_weight': 40

grid scores:
  mean: 782028.41606, std: 9637.64116, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 50}
  mean: 769010.75894, std: 6079.16367, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 60}
  mean: 760914.24094, std: 9643.26515, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 5, 'min_child_weight': 80}
  mean: 807557.88495, std: 4219.13250, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 50}
  mean: 801663.63876, std: 7556.18492, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 60}
  mean: 784727.73532, std: 7314.95469, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 6, 'min_child_weight': 80}
  mean: 811735.94787, std: 3476.37280, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 50}
  mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60}
  mean: 806342.26320, std: 7227.93062, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 80}
best score: 812694.98649
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60}
>>> 'max_depth': 7, 'min_child_weight': 60

    grid scores:
  mean: 811261.01220, std: 1387.81968, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 55}
  mean: 812694.98649, std: 4262.04853, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 60}
  mean: 813522.63431, std: 5054.98775, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65}
  mean: 811147.14498, std: 1469.98812, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 70}
  mean: 810716.38989, std: 3383.29928, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 55}
  mean: 810977.37920, std: 3039.52816, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 60}
  mean: 809034.76724, std: 4751.72859, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 65}
  mean: 810902.03165, std: 3741.53151, params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 8, 'min_child_weight': 70}
best score: 813522.63431
best params: {'n_estimators': 500, 'subsample': 0.9, 'colsample_bytree': 0.67, 'max_depth': 7, 'min_child_weight': 65}
>>> 'max_depth': 7, 'min_child_weight': 65
    """

    """  ONE HOT   ***
grid scores:
  mean: 808785.95756, std: 3732.33890, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 807606.64678, std: 6685.61758, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 798856.94075, std: 8380.25083, params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 1.0, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812217.95285
best params: {'colsample_bytree': 0.67, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'subsample': 0.8

grid scores:
  mean: 807015.44496, std: 1745.61053, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811229.10339, std: 2626.00511, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809107.13182, std: 3766.10287, params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 806700.86249, std: 1673.53048, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 808803.17397, std: 2141.65596, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 810538.01687, std: 4532.84397, params: {'colsample_bytree': 0.7, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 808655.96400, std: 2917.13000, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809115.42707, std: 2625.94051, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809472.09619, std: 3144.06367, params: {'colsample_bytree': 0.8, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.9, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 811229.10339
best params: {'colsample_bytree': 0.6, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'colsample_bytree': 0.6, 'subsample': 0.8

grid scores:
  mean: 804112.81441, std: 8632.61400, params: {'colsample_bytree': 0.67, 'learning_rate': 0.03, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 807667.00596, std: 2109.00871, params: {'colsample_bytree': 0.67, 'learning_rate': 0.06, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812217.95285
best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'learning_rate': 0.045

grid scores:
  mean: 810936.40241, std: 2661.32895, params: {'colsample_bytree': 0.67, 'learning_rate': 0.04, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811906.13100, std: 3339.45916, params: {'colsample_bytree': 0.67, 'learning_rate': 0.05, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812217.95285
best params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'learning_rate': 0.045

grid scores:
  mean: 811208.11854, std: 3435.62254, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 809690.89826, std: 2189.70344, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 808096.24696, std: 3457.47957, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.75, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812217.95285, std: 2332.88180, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811478.50583, std: 2864.64292, params: {'colsample_bytree': 0.67, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> 'learning_rate': 0.045, 'colsample_bytree': 0.65, 'subsample': 0.85

grid scores:
  mean: 810759.12376, std: 1528.10128, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 50, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
  mean: 811977.26246, std: 4991.06664, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 80, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7}
>>> no change

grid scores:
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 810059.35718, std: 1824.61383, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 1.0}
  mean: 810675.99076, std: 3153.29552, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 2.0}
  mean: 809731.68599, std: 4091.21405, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 4}
  mean: 751547.94084, std: 10351.41628, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 10}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
>>> 'gamma': 0

grid scores:
  mean: 786632.68012, std: 7192.58756, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.4, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 746308.67706, std: 11236.07404, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.3, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 695506.87990, std: 7719.89820, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.2, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 812432.77112, std: 4630.29708, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 802604.50597, std: 2869.94386, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.7, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
  mean: 798443.86047, std: 1863.64175, params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.8, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
best score: 812432.77112
best params: {'colsample_bytree': 0.65, 'learning_rate': 0.045, 'min_child_weight': 65, 'n_estimators': 500, 'subsample': 0.85, 'base_score': 0.5, 'objective': 'rank:pairwise', 'max_depth': 7, 'gamma': 0.0}
>>> 'base_score': 0.5

    """

    param_grid = {
        #'objective': ['binary:logitraw'],
        'objective': ['rank:pairwise'],
        #'booster': ['gblinear'],
        'n_estimators': [580],
        'max_depth': [6],
        'min_child_weight': [45, 50, 55],
        'gamma': [0.],
        'subsample': [0.85],
        'colsample_bytree': [0.65],
        'learning_rate': [0.045],
    }
    for k, v in cv_grid.items():
        param_grid[k] = v


    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        cv=StratifiedKFold(train_y, n_folds=nfolds),
                        scoring=tco_scorer,
                        n_jobs=1,
                        verbose=2,
                        refit=False)

    if (False
        #or True
        ):
        grid.fit(train_X, train_y)
        print('grid scores:')
        for item in grid.grid_scores_:
            print('  {:s}'.format(item))
        print('best score: {:.5f}'.format(grid.best_score_))
        print('best params:', grid.best_params_)

    if (False
        #or True
        ):
        clf.fit(train_X, train_y)
        feature_names = train_X.columns.values.tolist()
        from numpy import zeros
        feature_importances = zeros(len(feature_names))
        importances = clf.booster().get_fscore()
        for i, feat in enumerate(feature_names):
            if feat in importances:
                feature_importances[i] += importances[feat]
                pass
            pass
        import operator
        sorted_importances = sorted(zip(feature_names, feature_importances), key=operator.itemgetter(1), reverse=True)
        for k, v in sorted_importances:
            print("{}\t{}".format(v, k))
            pass
        print([k for k, v in sorted_importances if v == 0])
        pass

    if (False
        or True
        ):
        def objective(space):
            #param_grid = {'objective': ['binary:logistic']}
            #param_grid = {'objective': ['binary:logitraw']}
            param_grid = {'objective': ['rank:pairwise']}  #'booster': ['gblinear']
            for k, v in space.items():
                if k in ['n_estimators', 'max_depth', 'min_child_weight', 'num_pairsample']:
                    v = int(v)
                    pass
                param_grid[k] = [v]
                pass

            grid = GridSearchCV(estimator=clf,
                                param_grid=param_grid,
                                cv=StratifiedKFold(train_y, n_folds=nfolds),
                                scoring=tco_scorer,
                                n_jobs=1,
                                verbose=2,
                                refit=False)
            grid.fit(train_X, train_y)

            print('best score: {:.5f}  best params: {}'.format(grid.best_score_, grid.best_params_))
            return -grid.best_score_

        from sys import path as sys_path
        sys_path.insert(0, './hyperopt')
        from hyperopt import fmin, tpe, hp

        # cheatsheet:
        # https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
        space = {
            'n_estimators': hp.quniform("x_n_estimators", 500, 800, 10),
            'max_depth': hp.quniform("x_max_depth", 4, 8, 1),
            'min_child_weight': hp.quniform('x_min_child', 45, 240, 5),
            'gamma': hp.uniform('x_gamma', 0.0, 2.0),
            #'scale_pos_weight': hp.uniform('x_scale_pos_weight', 0.5, 1.0),
            'num_pairsample': hp.quniform('x_num_pairsample', 1, 4, 1),
            'subsample': hp.uniform('x_subsample', 0.4, 1.0),
            'colsample_bytree': hp.uniform('x_colsample_bytree', 0.4, 1.0)
        }
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=100)
        print(best)
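        # NOTE: illustrative sketch only (not in the original). fmin() returns the
        # best point keyed by the hp labels above (e.g. 'x_n_estimators'), so it
        # has to be mapped back onto estimator parameter names, with the
        # integer-valued ones re-cast, before it can be reused, e.g.:
        #   best_params = {
        #       'n_estimators': int(best['x_n_estimators']),
        #       'max_depth': int(best['x_max_depth']),
        #       'min_child_weight': int(best['x_min_child']),
        #       'gamma': best['x_gamma'],
        #       'num_pairsample': int(best['x_num_pairsample']),
        #       'subsample': best['x_subsample'],
        #       'colsample_bytree': best['x_colsample_bytree'],
        #   }
        #   clf = XGBClassifier(**dict(muse_kwargs, **best_params))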
        pass


    return
import sys
from time import time
import os
from os.path import isfile, join
import itertools

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow.core.example import example_pb2

from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle

random_seed(21)  #Reproducibility

#Define the Global Variables
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
    'command', 'text_to_binary', 'Either text_to_vocabulary or text_to_binary. '
    'Specify FLAGS.in_folder accordingly.')
tf.app.flags.DEFINE_string('in_folder', '', 'path to input json data file')
tf.app.flags.DEFINE_string(
    'out_files', '', 'comma separated paths to files'
)  # specify the output files on the command line
tf.app.flags.DEFINE_string(
    'split', '',
    'comma separated fractions of data')  #specify during terminal command call
# tf.app.flags.DEFINE_string('vocab_file','data/vocabulary','path to output the vocab of the training corpus')
# tf.app.flags.DEFINE_integer('body_len', 30000, 'Define the length of body to consider')
Exemple #15
0
'''

import argparse
import numpy as np
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Dropout
from keras.layers import recurrent
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, TensorBoard, CSVLogger, LambdaCallback
from numpy.random import seed as random_seed
from numpy.random import randint as random_randint
import os
import pickle

from data import DataSet

random_seed(42)  # Reproducibility

# Parameters for the model and dataset
DATASET_FILENAME = 'data/dataset/news.2011.en.shuffled'
NUMBER_OF_EPOCHS = 2
RNN = recurrent.LSTM
INPUT_LAYERS = 2
OUTPUT_LAYERS = 2
AMOUNT_OF_DROPOUT = 0.3
BATCH_SIZE = 32
SAMPLES_PER_EPOCH = 65536
HIDDEN_SIZE = 700
INITIALIZATION = "he_normal"  # : Gaussian initialization scaled by fan_in (He et al., 2014)
NUMBER_OF_CHARS = 100  # 75
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
INVERTED = True