def main(stack_setting_):

    """
     [rawdata2filterdata Step]
      1. Reading raw datasets
      2. Dropping useless feature columns in the training set
      3. Dropping useless feature columns in the test set
    """

    raw_train_path = os.path.join(Config.get_string('data.path'), 
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'), 
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])
    print("= Reading raw datasets ...")

    names = ("age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, TARGET").split(', ')

    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)#, index_col=0, sep=','
    #raw_train['TARGET'] = (raw_train['TARGET'].values == ' >50K').astype(np.int32)
    #raw_train = raw_train.apply(lambda x: pd.factorize(x)[0])
    train_path = os.path.join(Config.get_string('data.path'), 
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')


    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)#, index_col=0, sep=','
    #raw_test['TARGET'] = (raw_test['TARGET'].values == ' >50K').astype(np.int32)
    #raw_test = raw_test.apply(lambda x: pd.factorize(x)[0])
    test_path = os.path.join(Config.get_string('data.path'), 
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
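
# A minimal sketch of the nested stack_setting_ dict that main() above reads;
# only the key layout is taken from the code itself, the folder and file
# names are hypothetical placeholders.
example_stack_setting = {
    '0-Level': {
        'folder': 'input',
        'raw': {'train': 'adult_raw_train.csv', 'test': 'adult_raw_test.csv'},
        'train': 'train.csv',
        'test': 'test.csv',
    },
}
#main(example_stack_setting)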
Example #2
def write_cv_res_csv(cv_out, cv_csv_out):
    cv_out = os.path.join(Config.get_string('data.path'),'output',cv_out)
    cv_csv_out = os.path.join(Config.get_string('data.path'),'output',cv_csv_out)
    param_keys, param_vals, scores = cp.load(open(cv_out, 'rb'))
    assert len(param_vals) == len(scores), 'Error: param value list length does not match score list length!'
    assert len(param_keys) == len(param_vals[0]), 'Error: param key count and value count do not match!'
    if isinstance(param_vals[0], dict):
        param_keys = param_vals[0].keys()
        param_vals = [param.values() for param in param_vals]
    f = open(cv_csv_out, 'w')
    f.write('idx,')
    for key in param_keys:
        f.write('{0},'.format(key))
    for i in xrange(len(scores[0])):
        f.write('score_{0},'.format(i))
    f.write('score_mean,score_std\n')
    for i, params in enumerate(param_vals):
        f.write('{},'.format(i))
        for p in params:
            f.write('{0},'.format(p))
        for s in scores[i]:
            f.write('{0},'.format(s))
        f.write('{0},{1}\n'.format(scores[i].mean(), scores[i].std()))
    f.close()
    pass
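
# A usage sketch for write_cv_res_csv() above. The pickle behind cv_out is
# assumed to hold the (param_keys, param_vals, scores) triple that the loader
# unpacks, with one numpy score vector per parameter configuration; the
# concrete values here are illustrative only.
import cPickle as cp
import numpy as np

demo_keys = ['learning_rate', 'max_depth']
demo_vals = [(0.1, 4), (0.05, 6)]
demo_scores = [np.array([0.81, 0.83, 0.82]), np.array([0.84, 0.85, 0.83])]
with open('demo-scores.pkl', 'wb') as fh:
    cp.dump((demo_keys, demo_vals, demo_scores), fh, protocol=2)
#write_cv_res_csv('demo-scores.pkl', 'demo-scores.csv')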
    def __init__(self,
                 data_folder,
                 train_fname=None,
                 test_fname=None,
                 k_fold_=None):
        self.random_state = 325243  # do not change it for different l2 models!
        #self.random_state = 98754  # do not change it for different l2 models!
        if not train_fname:
            sys.stderr.write('Do not set train_meta_feature\n')
            sys.exit()
        if not test_fname:
            sys.stderr.write('Do not set test_meta_feature\n')
            sys.exit()
        train_fname = os.path.join(Config.get_string('data.path'), data_folder,
                                   train_fname)
        test_fname = os.path.join(Config.get_string('data.path'), data_folder,
                                  test_fname)

        # load train data
        train = pd.read_csv(train_fname)
        self.train_id = train.values
        self.train_y = train.label.values
        self.train_x = train.drop(['label'], axis=1)
        # load test data
        test = pd.read_csv(test_fname)
        self.test_id = test.values
        self.test_y = test.label.values
        self.test_x = test.drop(['label'], axis=1)
        #print self.train_x.head()
        #print self.test_x.head()

        if k_fold_ is None:
            self.k_fold_ = 5
        else:
            self.k_fold_ = k_fold_
def main(stack_setting_):

    """
     [rawdata2filterdata Step]
      1. Reading raw datasets
      2. Dropping useless feature columns in the training set
      3. Dropping useless feature columns in the test set
    """

    raw_train_path = os.path.join(Config.get_string('data.path'), 
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'), 
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])
    print("= Reading raw datasets ...")

    names = ("age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, TARGET").split(', ')
    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)#, index_col=0, sep=','
    raw_train['TARGET'] = (raw_train['TARGET'].values == ' >50K').astype(np.int32)
    raw_train = raw_train.apply(lambda x: pd.factorize(x)[0])
    train_path = os.path.join(Config.get_string('data.path'), 
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')


    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)#, index_col=0, sep=','
    raw_test['TARGET'] = (raw_test['TARGET'].values == ' >50K').astype(np.int32)
    raw_test = raw_test.apply(lambda x: pd.factorize(x)[0])
    test_path = os.path.join(Config.get_string('data.path'), 
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
Example #5
    def __init__(self,
                 wrapper_class,
                 experiment,
                 model_param_keys,
                 model_param_space,
                 cv_out=None,
                 cv_pred_out=None,
                 refit_pred_out=None,
                 dump_round=10,
                 use_lower=0,
                 n_folds=5):
        """
        Constructor of bayes search.
        Support search on a set of model parameters, and record the cv result of each param configuration.

        :param wrapper_class: model wrapper type string like 'XgboostModel' or 'SklearnModel'
        :param experiment: experiment object of ExperimentL1 or ExperimentL2
        :param model_param_keys: list of model param keys. eg. ['paramA', 'paramB', 'paramC']
        :param model_param_space: list of model param space
        :param cv_out: Output pickle file name of cross validation score results.
        :param cv_pred_out: prediction of cross validation each fold.
        :param refit_pred_out: refit on full train set and predict on test set.
        :return: None
        """

        self.wrapper_class = wrapper_class
        self.experiment = experiment
        self.model_param_keys = model_param_keys
        self.model_param_space = model_param_space
        self.integer_params = set()
        self.n_folds = n_folds
        for k, v in model_param_space.iteritems():
            vstr = str(v)
            if vstr.find('quniform') >= 0 \
                    or vstr.find('qloguniform') >= 0\
                    or vstr.find('qnormal') >= 0\
                    or vstr.find('qlognormal') >= 0:
                #if v == hp.quniform or v == hp.qlognormal or v == hp.qnormal:
                self.integer_params.add(k)
            pass
        self.param_vals_list = []
        self.preds_list = []
        self.scores_list = []
        self.refit_preds_list = []
        self.model_name = self.wrapper_class.__name__

        self.cv_out = os.path.join(Config.get_string('data.path'), 'output',
                                   cv_out) if cv_out else None
        self.cv_pred_out = os.path.join(Config.get_string('data.path'),
                                        'output',
                                        cv_pred_out) if cv_pred_out else None
        self.refit_pred_out = os.path.join(
            Config.get_string('data.path'), 'output',
            refit_pred_out) if refit_pred_out else None

        self.eval_round = 0
        self.dump_round = dump_round
        self.trials = Trials()
        self.use_lower = use_lower
        pass
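
# A sketch of a model_param_space that the constructor above would scan. Keys
# whose hyperopt expression prints as quniform/qloguniform/qnormal/qlognormal
# end up in self.integer_params and are later rounded to ints; the parameter
# names below are illustrative.
from hyperopt import hp

example_space = {
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),            # integer param
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),  # integer param
}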
def nn_param_avg_submission(prefix, top_k=1):
    exp = ExperimentL1()
    score_fname = os.path.join(Config.get_string('data.path'), 'output',prefix+ '-scores.pkl')
    refit_pred_fname =os.path.join(Config.get_string('data.path'), 'output',prefix+ '-refit-preds.pkl')
    preds = get_top_model_avg_preds(score_fname, refit_pred_fname, topK=top_k)
    submission_fname = os.path.join(Config.get_string('data.path'), 'submission', 'avg-{}-refit-preds{}.csv'.format(prefix, top_k))
    save_submissions(submission_fname, exp.test_id, preds)
    def __init__(self, wrapper_class, experiment, model_param_keys, model_param_vals,
                 cv_folder=None,
                 cv_out=None, cv_pred_out=None, refit_pred_out=None):
        """
        Constructor of grid search.
        Support search on a set of model parameters, and record the cv result of each param configuration.

        :param wrapper_class: model wrapper type string like 'XgboostModel' or 'SklearnModel'
        :param experiment: experiment object of ExperimentL1 at 1-Level or ExperimentL2 at 2-Level
        :param model_param_keys: list of model param keys. eg. ['paramA', 'paramB', 'paramC']
        :param model_param_vals: list of model param values (iterable). eg. [['valAa', 'valAb'], [0.1, 0.2], (1, 2, 3)]
        :param cv_out: Output pickle file name of cross validation score results.
        :param cv_pred_out: prediction of cross validation each fold.
        :param refit_pred_out: refit on full train set and predict on test set.
        :return: (best parameters, best score)
        """

        self.wrapper_class = wrapper_class
        self.experiment = experiment
        self.model_param_keys = model_param_keys
        self.model_param_vals = model_param_vals
        self.str_match = re.compile(r'loss')

        if wrapper_class == SklearnModel:
            self.model_name = model_param_vals[0]
        else:
            self.model_name = 'xgb'
        self.cv_out = os.path.join(Config.get_string('data.path'), cv_folder, cv_out) if cv_out else None
        self.cv_pred_out = os.path.join(Config.get_string('data.path'), cv_folder, cv_pred_out) if cv_pred_out else None
        self.refit_pred_out = os.path.join(Config.get_string('data.path'), cv_folder, refit_pred_out) if refit_pred_out else None
def ridge_blend(stack_setting_, best_param_):

    folder = stack_setting_['2-Level']['blending']['folder']
    blend_weight_fname = stack_setting_['2-Level']['blending']['weight']
    blend_weight_fname = os.path.join(Config.get_string('data.path'), folder, blend_weight_fname)
    linear_weight = pd.read_csv(blend_weight_fname)

    folder = stack_setting_['1-Level']['meta_features']['folder']
    test_fname = stack_setting_['1-Level']['meta_features']['test']
    test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
    test = pd.read_csv(test_fname)

    folder = stack_setting_['2-Level']['blending']['folder']
    model_fname = stack_setting_['2-Level']['blending']['model']
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'rb') as gf:
        model = cPickle.load(gf)

    y_test = test.label.values
    X_test = test.drop(['label'], axis = 1)
    del test

    y_predict = model.predict(X_test)

    #return mean_squared_error(y_test, y_predict)
    return precision_recall(y_test, y_predict)
    def __init__(self, 
                 data_folder,
                 train_fname=None, test_fname=None,
                 k_fold_=None):
        #self.random_state = 325243  # do not change it for different l1 models!
        self.random_state = 98754  # do not change it for different l1 models!
        if not train_fname:
            train_fname = 'filtered_train.csv'
        if not test_fname:
            test_fname = 'filtered_test.csv'
        train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
        test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

        # load train data
        with gzip.open(train_fname, 'rb') as gf:
            self.train_x, self.train_y = cPickle.load(gf)

        # load test data
        with gzip.open(test_fname, 'rb') as gf:
            self.test_x, self.test_y = cPickle.load(gf)

        if k_fold_ is None:
            self.k_fold_ = 5
        else:
            self.k_fold_ = k_fold_
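
# The constructor above expects each file to be a gzip'd cPickle of an
# (X, y) pair, despite the .csv default names. A minimal sketch of writing
# such a file with toy data (shapes and dtypes are assumptions):
import gzip
import cPickle
import numpy as np

toy_x = np.random.rand(100, 5).astype(np.float32)
toy_y = np.random.randint(0, 2, size=100).astype(np.int32)
with gzip.open('filtered_train.csv', 'wb') as gf:
    cPickle.dump((toy_x, toy_y), gf, cPickle.HIGHEST_PROTOCOL)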
    def __init__(self, 
                 data_folder,
                 train_fname=None, test_fname=None,
                 k_fold_=None):
        #self.random_state = 325243  # do not change it for different l1 models!
        self.random_state = 98754  # do not change it for different l1 models!
        if not train_fname:
            train_fname = 'filtered_train.csv'
        if not test_fname:
            test_fname = 'filtered_test.csv'
        train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
        test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)
        # load train data
        train = pd.read_csv(train_fname, dtype=np.float32)
        train.sort(columns='ID', inplace=1)
        self.train_id = train.values
        self.train_y = train.TARGET.values
        self.train_x = train.drop(['ID', 'TARGET'], axis=1)


        # load test data
        test = pd.read_csv(test_fname, dtype=np.float32)
        test.sort(columns='ID', inplace=1)
        self.test_id = test.values
        self.test_y = test.TARGET.values
        #self.test_x = test.drop(['ID'], axis=1)
        self.test_x = test.drop(['ID', 'TARGET'], axis=1)
        #print self.train_x.head()
        #print self.test_x.head()

        if k_fold_ is None:
            self.k_fold_ = 5
        else:
            self.k_fold_ = k_fold_
Example #12
def get_top_cv_and_test_preds(out_fname_prefix, top_k=10, use_lower=0):
    """
    Get the top-k cross-validation predictions on the train set and refit predictions on the test set from experiment_l1 results.
    You can use numpy.hstack to join results from different models.
    :param out_fname_prefix: prefix identifying a given experiment (L1)
    :param top_k: number of top-scoring parameter configurations to keep
    :param use_lower: if truthy, rank by mean score minus its std (a lower confidence bound)
    :return: top k cv preds and refit preds (numpy arrays)
    """
    from utils.config_utils import Config
    # file names
    score_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix+'-scores.pkl')
    pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix +'-preds.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix +'-refit-preds.pkl')
    # load pickle files
    param_keys, param_vals, scores = cp.load(open(score_fname, 'rb'))
    refit_preds = cp.load(open(refit_pred_fname, 'rb'))
    preds = cp.load(open(pred_fname, 'rb'))
    # calculate top results
    scores = np.asarray(scores)
    idxs = np.arange(len(scores))
    mscores = scores.mean(axis=1)
    if use_lower:
        mscores -= scores.std(axis=1)  # per-model std, matching the per-model mean above
    idxs = sorted(idxs, key=lambda x:mscores[x], reverse=1)[:top_k]
    preds = np.transpose(np.asarray(preds)[idxs])
    refit_preds = np.transpose(np.asarray(refit_preds)[idxs])
    return preds, refit_preds
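
# As the docstring suggests, results from several L1 runs can be joined
# column-wise with numpy.hstack to build L2 meta-features; the prefixes below
# are hypothetical:
#
#   xgb_cv, xgb_refit = get_top_cv_and_test_preds('xgb-run', top_k=5)
#   nn_cv, nn_refit = get_top_cv_and_test_preds('nn-run', top_k=5)
#   l2_train = np.hstack([xgb_cv, nn_cv])
#   l2_test = np.hstack([xgb_refit, nn_refit])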
Example #13
def get_cv_and_test_preds(out_fname_prefix, idxs):
    from utils.config_utils import Config
    pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix +'-preds.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', out_fname_prefix +'-refit-preds.pkl')
    refit_preds = cp.load(open(refit_pred_fname, 'rb'))
    preds = cp.load(open(pred_fname, 'rb'))
    preds = np.transpose(np.asarray(preds)[idxs])
    refit_preds = np.transpose(np.asarray(refit_preds)[idxs])
    return preds, refit_preds
def save_l2_submission(prefix='stacking-xgb'):
    import os
    exp = ExperimentL1()
    score_fname = os.path.join(Config.get_string('data.path'), 'output', prefix+'-scores.pkl')
    refit_pred_fname =os.path.join(Config.get_string('data.path'), 'output', prefix+'-refit-preds.pkl')
    topK = 1
    preds = get_top_model_avg_preds(score_fname, refit_pred_fname, topK=topK)
    submission_fname = os.path.join(Config.get_string('data.path'), 'submission',
                                    prefix+'-refit-preds{}.csv'.format(topK))
    save_submissions(submission_fname, exp.test_id, preds)
Example #15
    def write2csv_meta_feature(self, model, meta_folder, meta_train_fname,
                               meta_test_fname, meta_header, best_param_):

        kfold = cross_validation.StratifiedKFold(
            self.train_y,
            n_folds=5,
            shuffle=True,
            random_state=self.random_state)

        model.set_params(**best_param_)

        transform_train = np.zeros((self.train_x.shape[0], 2),
                                   dtype=np.float32)

        transform_test = np.zeros((self.test_x.shape[0], 2), dtype=np.float32)

        # transform train data
        for i, (train_idx, test_idx) in enumerate(kfold):
            print(' [Meta Feature] --------- fold {0} ---------- '.format(i))
            train_x = self.train_x.iloc[train_idx]
            train_y = self.train_y[train_idx]
            test_x = self.train_x.iloc[test_idx]
            test_y = self.train_y[test_idx]
            model.fit(train_x, train_y)
            #transform_train[test_idx, 0] = model.predict_proba(test_x)[:,1].astype(np.float32)
            transform_train[test_idx, 0] = self.get_proba(model, test_x).astype(np.float32)
            transform_train[test_idx, 1] = test_y.astype(np.int32)

        meta_train_fname = os.path.join(Config.get_string('data.path'),
                                        meta_folder, meta_train_fname)
        np.savetxt(meta_train_fname,
                   transform_train,
                   delimiter=',',
                   header=meta_header,
                   comments='',
                   fmt='%1.10e,%d')
        del transform_train

        # transform test data
        model.fit(self.train_x, self.train_y)
        #transform_test = model.predict_proba(self.test_x)[:,1].astype(np.float32)
        transform_test[:, 0] = self.get_proba(model,
                                              self.test_x).astype(np.float32)
        transform_test[:, 1] = self.test_y.astype(np.int32)
        meta_test_fname = os.path.join(Config.get_string('data.path'),
                                       meta_folder, meta_test_fname)
        np.savetxt(meta_test_fname,
                   transform_test,
                   delimiter=',',
                   header=meta_header,
                   comments='',
                   fmt='%1.10e,%d')
        del transform_test
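
    # self.get_proba is called above but not shown in this listing. A minimal
    # sketch of what it presumably does (positive-class probability, with a
    # scaled decision function as a fallback; the fallback is an assumption):
    def get_proba(self, model, X):
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(X)[:, 1]
        scores = model.decision_function(X)
        # map raw decision scores into [0, 1] as a probability proxy
        return (scores - scores.min()) / (scores.max() - scores.min() + 1e-12)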
Example #16
def make_hold_out_backup(stack_setting_):
    """
     input
      train
     output
      train, ptrain, ptest
    """

    split_ratio = stack_setting_['2-Level']['blending']['hold_out_ratio']

    # train
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder,
                               train_fname)
    train = pd.read_csv(train_fname)

    nrows = len(train.index)
    #a,b = int(nrows * split_ratio), nrows - int(nrows * split_ratio)
    a = nrows - int(nrows * split_ratio)
    train, hold_out = train[:a], train[a:]

    # train data for(meta_feature, label)
    train.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['2-Level']['blending']['folder'],
                              stack_setting_['2-Level']['blending']['train']),
                 index=False)

    nrows = len(hold_out.index)
    a = int(nrows * 0.5)  # split the hold-out set in half: p_train and p_test
    p_train, p_test = hold_out[:a], hold_out[a:]
    p_train.to_csv(os.path.join(
        Config.get_string('data.path'),
        stack_setting_['2-Level']['blending']['folder'],
        stack_setting_['2-Level']['blending']['ptrain']),
                   index=False)
    p_test.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['2-Level']['blending']['folder'],
                               stack_setting_['2-Level']['blending']['ptest']),
                  index=False)

    print '----------- train data -----------'
    print train['label'].value_counts()
    print '----------- p_train_data -----------'
    print p_train['label'].value_counts()
    print '----------- p_test_data -----------'
    print p_test['label'].value_counts()

    return True
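
# A worked example of the split above: with 10000 rows and
# hold_out_ratio = 0.2, a = 10000 - int(10000 * 0.2) = 8000, so rows [0:8000)
# stay in the blending train set and rows [8000:10000) form the hold-out,
# which is then halved into p_train and p_test (1000 rows each).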
Example #17
def make_hold_out(stack_setting_):
    """
     input
      train
     output
      train, ptrain, ptest
    """

    split_ratio = stack_setting_['2-Level']['blending']['hold_out_ratio']

    # train
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder,
                               train_fname)

    #
    meta_train_at_blend = os.path.join(
        Config.get_string('data.path'),
        stack_setting_['2-Level']['blending']['folder'],
        stack_setting_['2-Level']['blending']['train'])
    meta_ptrain_at_blend = os.path.join(
        Config.get_string('data.path'),
        stack_setting_['2-Level']['blending']['folder'],
        stack_setting_['2-Level']['blending']['ptrain'])
    meta_ptest_at_blend = os.path.join(
        Config.get_string('data.path'),
        stack_setting_['2-Level']['blending']['folder'],
        stack_setting_['2-Level']['blending']['ptest'])
    meta_test_at_blend = os.path.join(
        Config.get_string('data.path'),
        stack_setting_['2-Level']['blending']['folder'],
        'meta_test_at_blend.csv')

    # 1. split train file into train and hold out
    fs = File_Split(test_size=split_ratio)
    fs.__iter__(fname=train_fname,
                train_fname=meta_train_at_blend,
                test_fname=meta_test_at_blend)
    del fs

    # 2. split
    fs = File_Split(test_size=0.5)
    fs.__iter__(fname=meta_test_at_blend,
                train_fname=meta_ptrain_at_blend,
                test_fname=meta_ptest_at_blend)
    del fs

    return True
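
# File_Split is not defined in this listing. A minimal sketch matching the
# calls above (a sequential head/tail split of a CSV file); the
# implementation details are assumptions:
import pandas as pd

class File_Split(object):
    def __init__(self, test_size=0.2):
        self.test_size = test_size

    def __iter__(self, fname, train_fname, test_fname):
        df = pd.read_csv(fname)
        a = len(df.index) - int(len(df.index) * self.test_size)
        df[:a].to_csv(train_fname, index=False)
        df[a:].to_csv(test_fname, index=False)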
Example #18
def xgb_submmision(exp, param=None):
    if not param:
        param = {
            'colsample_bytree': 0.6923529515220681,
            'silent': 1,
            'model_type': XGBClassifier,
            'learning_rate': 0.014582411837608816,
            'nthread': 4,
            'min_child_weight': 6.0,
            'n_estimators': 400,
            'subsample': 0.5530324529773664,
            'seed': 9438,
            'objective': 'binary:logistic',
            'max_depth': 8.0
        }
    xgb_model = SklearnModel(param)
    final_preds = exp.fit_fullset_and_predict(xgb_model)
    submission_path = os.path.join(Config.get_string('data.path'),
                                   'submission')
    fname = os.path.join(submission_path,
                         xgb_model.to_string().split("-")[0] + '_res.csv')
    #fname = os.path.join(submission_path, 'xgb_bayes_param_res.csv')
    #print final_preds
    #print exp.test_id
    save_submissions(fname, exp.test_id, final_preds)
Example #19
def write_cv_res_csv(cv_out, cv_csv_out):
    cv_out = os.path.join(Config.get_string('data.path'), 'output',
                          cv_out) if cv_out else None

    param_keys, param_vals, scores = cp.load(open(cv_out, 'rb'))
    assert len(param_vals) == len(scores), \
        'Error: param value list length does not match score list length!'
    assert len(param_keys) == len(param_vals[0]), \
        'Error: param key count and value count do not match!'
    if isinstance(param_vals[0], dict):
        param_keys = param_vals[0].keys()
        param_vals = [param.values() for param in param_vals]
    f = open(cv_csv_out, 'w')
    for key in param_keys:
        f.write('{0},'.format(key))
    for i in xrange(len(scores[0])):
        f.write('score_{0},'.format(i))
    f.write('score_mean,score_std\n')
    for i, params in enumerate(param_vals):
        for p in params:
            f.write('{0},'.format(p))
        for s in scores[i]:
            f.write('{0},'.format(s))
        f.write('{0},{1}\n'.format(scores[i].mean(), scores[i].std()))
    f.close()
def get_xgb_feature_importance_plot(best_param_, experiment_, 
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1. fit a classifier with the best parameters on the full train set
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except KeyError:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({'name':index2feature.keys(),
                        'score':index2feature.values()})
    fis = fis.sort('score', ascending=False)
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)#, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
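
# Example of the pruning rule above: with more than 20 features and
# score_threshold=0.8, the cut-off becomes the 80th percentile of the
# positive f-scores, so roughly the top 20% of features survive the
# fis.query() filter before plotting.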
    def write2csv_meta_feature(self,
                               model,
                               meta_folder, meta_train_fname, meta_test_fname, meta_header,
                               best_param_):

        kfold = cross_validation.StratifiedKFold(self.train_y, 
                                                 n_folds=self.k_fold_, 
                                                 shuffle=True, 
                                                 random_state=self.random_state)

        # set model with best parameter
        model.set_params(**best_param_)

        transform_train = np.zeros((self.train_x.shape[0],2), dtype=np.float32)        
        transform_test = np.zeros((self.test_x.shape[0], 2), dtype=np.float32)

        # transform train data
        for i, (train_idx, test_idx) in enumerate(kfold):
            print (' [Meta Feature] --------- fold {0} ---------- '.format(i))
            train_x = self.train_x.iloc[train_idx]
            train_y = self.train_y[train_idx]
            test_x = self.train_x.iloc[test_idx]
            test_y = self.train_y[test_idx]
            model.fit(train_x, train_y)
            #transform_train[test_idx, 0] = model.predict_proba(test_x)[:,1].astype(np.float32)
            transform_train[test_idx, 0] = self.get_proba(model, test_x).astype(np.float32)
            transform_train[test_idx, 1] = test_y.astype(np.int32)

        meta_train_fname = os.path.join(Config.get_string('data.path'), 
                                        meta_folder, 
                                        meta_train_fname)
        np.savetxt(meta_train_fname, transform_train, delimiter=',',
                   header=meta_header, comments='', fmt='%1.10e,%d')
        del transform_train


        # transform test data
        model.fit(self.train_x, self.train_y)
        #transform_test = model.predict_proba(self.test_x)[:,1].astype(np.float32)
        transform_test[:,0] = self.get_proba(model, self.test_x).astype(np.float32) # predict label prob
        transform_test[:,1] = self.test_y.astype(np.int32) # true label
        meta_test_fname = os.path.join(Config.get_string('data.path'), 
                                       meta_folder, 
                                       meta_test_fname)
        np.savetxt(meta_test_fname, transform_test, delimiter=',',
                   header=meta_header, comments='', fmt='%1.10e,%d')
        del transform_test
def make_hold_out_backup(stack_setting_):
    """
     input
      train
     output
      train, ptrain, ptest
    """

    split_ratio = stack_setting_['2-Level']['blending']['hold_out_ratio']

    # train
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
    train = pd.read_csv(train_fname)

    nrows = len(train.index)
    #a,b = int(nrows * split_ratio), nrows - int(nrows * split_ratio)
    a = nrows - int(nrows * split_ratio)
    train, hold_out = train[:a], train[a:]

    # train data for(meta_feature, label)
    train.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['2-Level']['blending']['folder'],
                              stack_setting_['2-Level']['blending']['train']), index=False)


    nrows = len(hold_out.index)
    a = int(nrows * 0.5)  # split the hold-out set in half: p_train and p_test
    p_train, p_test = hold_out[:a], hold_out[a:]
    p_train.to_csv(os.path.join(Config.get_string('data.path'),
                                stack_setting_['2-Level']['blending']['folder'],
                                stack_setting_['2-Level']['blending']['ptrain']), index=False)
    p_test.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['2-Level']['blending']['folder'],
                               stack_setting_['2-Level']['blending']['ptest']), index=False)

    print '----------- train data -----------'
    print train['label'].value_counts()
    print '----------- p_train_data -----------'
    print p_train['label'].value_counts()
    print '----------- p_test_data -----------'
    print p_test['label'].value_counts()


    return True
Example #23
def main():
    if len(sys.argv) != 3:
        print 'Usage: python submit_utils.py <model-prefix> <model-idxs>'
        exit()
    from utils.config_utils import Config
    model_prefix = sys.argv[1]
    score_fname = os.path.join(Config.get_string('data.path'), 'output', model_prefix + '-scores.pkl')
    refit_pred_fname = os.path.join(Config.get_string('data.path'), 'output', model_prefix + '-refit-preds.pkl')
    model_idxs =  sys.argv[2].strip()
    idxs = [int(s) for s in model_idxs.split(',')]
    preds = get_selected_model_avg_preds(score_fname, refit_pred_fname, idxs)
    from experiment.stacking.experiment_l1 import ExperimentL1
    exp = ExperimentL1()
    submission_fname = os.path.join(Config.get_string('data.path'), 'submission',
                                    '{}-{}-submission.csv'.format(model_prefix, model_idxs))
    save_submissions(submission_fname, exp.test_id, preds)
    pass
    def __init__(self, wrapper_class, experiment, model_param_keys, model_param_space,
                 cv_out=None, cv_pred_out=None, refit_pred_out=None, dump_round=10, use_lower=0,n_folds=5):
        """
        Constructor of bayes search.
        Support search on a set of model parameters, and record the cv result of each param configuration.

        :param wrapper_class: model wrapper type string like 'XgboostModel' or 'SklearnModel'
        :param experiment: experiment object of ExperimentL1 or ExperimentL2
        :param model_param_keys: list of model param keys. eg. ['paramA', 'paramB', 'paramC']
        :param model_param_space: list of model param space
        :param cv_out: Output pickle file name of cross validation score results.
        :param cv_pred_out: prediction of cross validation each fold.
        :param refit_pred_out: refit on full train set and predict on test set.
        :return: None
        """

        self.wrapper_class = wrapper_class
        self.experiment = experiment
        self.model_param_keys = model_param_keys
        self.model_param_space = model_param_space
        self.integer_params = set()
        self.n_folds = n_folds
        for k, v in model_param_space.iteritems():
            vstr = str(v)
            if vstr.find('quniform') >= 0 \
                    or vstr.find('qloguniform') >= 0\
                    or vstr.find('qnormal') >= 0\
                    or vstr.find('qlognormal') >= 0:
            #if v == hp.quniform or v == hp.qlognormal or v == hp.qnormal:
                self.integer_params.add(k)
            pass
        self.param_vals_list = []
        self.preds_list = []
        self.scores_list = []
        self.refit_preds_list = []
        self.model_name = self.wrapper_class.__name__

        self.cv_out = os.path.join(Config.get_string('data.path'), 'output', cv_out) if cv_out else None
        self.cv_pred_out = os.path.join(Config.get_string('data.path'), 'output', cv_pred_out) if cv_pred_out else None
        self.refit_pred_out = os.path.join(Config.get_string('data.path'), 'output', refit_pred_out) if refit_pred_out else None

        self.eval_round = 0
        self.dump_round = dump_round
        self.trials = Trials()
        self.use_lower=use_lower
        pass
Example #25
def load_mnist_labels(filename):
    filename = os.path.join(Config.get_string('data.path'), 'input', filename)
    if not os.path.exists(filename):
        download(filename)
    # Read the labels in Yann LeCun's binary format.
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    # The labels are vectors of integers now, that's exactly what we want.
    return data
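
# download() is referenced above but not part of this listing. A minimal
# sketch for fetching the MNIST archives (the URL layout follows the classic
# Lasagne/MNIST tutorial and is an assumption here):
def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
    import urllib
    print("Downloading %s" % filename)
    urllib.urlretrieve(source + os.path.basename(filename), filename)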
def make_hold_out(stack_setting_):
    """
     input
      train
     output
      train, ptrain, ptest
    """

    split_ratio = stack_setting_['2-Level']['blending']['hold_out_ratio']

    # train
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)

    # 
    meta_train_at_blend = os.path.join(Config.get_string('data.path'),
                                       stack_setting_['2-Level']['blending']['folder'],
                                       stack_setting_['2-Level']['blending']['train'])
    meta_ptrain_at_blend = os.path.join(Config.get_string('data.path'),
                                        stack_setting_['2-Level']['blending']['folder'],
                                        stack_setting_['2-Level']['blending']['ptrain'])
    meta_ptest_at_blend = os.path.join(Config.get_string('data.path'),
                                       stack_setting_['2-Level']['blending']['folder'],
                                       stack_setting_['2-Level']['blending']['ptest'])
    meta_test_at_blend = os.path.join(Config.get_string('data.path'),
                                      stack_setting_['2-Level']['blending']['folder'],
                                      'meta_test_at_blend.csv')

    # 1. split train file into train and hold out 
    fs = File_Split(test_size=split_ratio)
    fs.__iter__(fname = train_fname,
                train_fname = meta_train_at_blend,
                test_fname = meta_test_at_blend)
    del fs

    # 2. split 
    fs = File_Split(test_size=0.5)
    fs.__iter__(fname = meta_test_at_blend,
                train_fname = meta_ptrain_at_blend,
                test_fname = meta_ptest_at_blend)
    del fs

    return True
Example #27
    def __init__(self, data_folder, train_fname=None, test_fname=None):
        #self.random_state = 325243  # do not change it for different l1 models!
        self.random_state = 98754  # do not change it for different l1 models!
        if not train_fname:
            train_fname = 'filtered_train.csv'
        if not test_fname:
            test_fname = 'filtered_test.csv'
        train_fname = os.path.join(Config.get_string('data.path'), data_folder,
                                   train_fname)
        test_fname = os.path.join(Config.get_string('data.path'), data_folder,
                                  test_fname)

        # load train data
        with gzip.open(train_fname, 'rb') as gf:
            self.train_x, self.train_y = cPickle.load(gf)

        # load test data
        with gzip.open(test_fname, 'rb') as gf:
            self.test_x, self.test_y = cPickle.load(gf)
Example #28
def load_raw_data(dataset_name):
    pkl_fname = os.path.join(Config.get_string('data.path'), 'input',
                             dataset_name + '.pkl')
    if not os.path.exists(pkl_fname):
        path = os.path.join(Config.get_string('data.path'), 'input',
                            dataset_name)
        order_df = load_order_data(path)
        traffic_df = load_traffic_data(path)
        weather_df = load_weather_data(path)
        cluster_map = pd.read_csv(os.path.join(path,
                                               'cluster_map/cluster_map'),
                                  sep='\t',
                                  names=['district_hash', 'district_id'])
        poi_data, poi_cnt = load_poi_data(path)
        data = order_df, traffic_df, weather_df, cluster_map, poi_data, poi_cnt
        cp.dump(data, open(pkl_fname, 'wb'), protocol=2)
    else:
        data = cp.load(open(pkl_fname, 'rb'))
    return data
Example #29
def principal_component_analysis(x_train):
    """
    Principal Component Analysis (PCA) identifies the combination
    of attributes (principal components, or directions in the feature space)
    that account for the most variance in the data.

    Let's calculate the 2 first principal components of the training data,
    and then create a scatter plot visualizing the training data examples
    projected on the calculated components.
    """

    # Extract the variable to be predicted
    y_train = x_train["TARGET"]
    x_train = x_train.drop(labels="TARGET", axis=1)
    classes = np.sort(np.unique(y_train))
    labels = ["Satisfied customer", "Unsatisfied customer"]

    # Normalize each feature to unit norm (vector length)
    x_train_normalized = normalize(x_train, axis=0)

    # Run PCA
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(x_train_normalized)

    # Visualize
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1)
    colors = [(0.0, 0.63, 0.69), 'black']
    markers = ["o", "D"]
    for class_ix, marker, color, label in zip(classes, markers, colors,
                                              labels):
        ax.scatter(x_train_projected[np.where(y_train == class_ix), 0],
                   x_train_projected[np.where(y_train == class_ix), 1],
                   marker=marker,
                   color=color,
                   edgecolor='whitesmoke',
                   linewidth=1,
                   alpha=0.9,
                   label=label)
        ax.legend(loc='best')
    plt.title("Scatter plot of the training data examples projected on the "
              "2 first principal components")
    plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" %
               (pca.explained_variance_ratio_[0] * 100.0))
    plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" %
               (pca.explained_variance_ratio_[1] * 100.0))
    #plt.show()

    #plt.savefig("../data/pca.pdf", format='pdf')
    #plt.savefig("../data/pca.png", format='png')
    plt.savefig(os.path.join(Config.get_string('data.path'), 'graph',
                             'pca.png'),
                format='png')
    plt.close()
def xgb_submmision(exp, param=None):
    if not param:
        param = {'colsample_bytree': 0.6923529515220681, 'silent': 1, 'model_type':XGBClassifier, 'learning_rate': 0.014582411837608816, 'nthread': 4, 'min_child_weight': 6.0, 'n_estimators': 400, 'subsample': 0.5530324529773664, 'seed': 9438, 'objective': 'binary:logistic', 'max_depth': 8.0}
    xgb_model = SklearnModel(param)
    final_preds = exp.fit_fullset_and_predict(xgb_model)
    submission_path = os.path.join(Config.get_string('data.path'), 'submission')
    fname = os.path.join(submission_path, xgb_model.to_string().split("-")[0] + '_res.csv')
    #fname = os.path.join(submission_path, 'xgb_bayes_param_res.csv')
    #print final_preds
    #print exp.test_id
    save_submissions(fname, exp.test_id, final_preds)
def dump_stacking_setting(stack_setting_):

    text = json.dumps(stack_setting_, sort_keys=True, ensure_ascii=False, indent=4)

    data_folder = stack_setting_['setting']['folder']
    fname = stack_setting_['setting']['name']
    fname = os.path.join(Config.get_string('data.path'), data_folder, fname)
    with open(fname, 'w') as fh:
        fh.write(text.encode('utf-8'))

    return True
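
# Usage sketch: dump_stacking_setting() serializes the whole nested settings
# dict to pretty-printed JSON under data.path. The keys below are the only
# ones it touches; the folder and file names are placeholders.
#
#   setting = {'setting': {'folder': 'config', 'name': 'stack_setting.json'}}
#   dump_stacking_setting(setting)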
Example #32
    def __init__(self,
                 wrapper_class,
                 experiment,
                 model_param_keys,
                 model_param_vals,
                 cv_folder=None,
                 cv_out=None,
                 cv_pred_out=None,
                 refit_pred_out=None):
        """
        Constructor of grid search.
        Support search on a set of model parameters, and record the cv result of each param configuration.

        :param wrapper_class: model wrapper type string like 'XgboostModel' or 'SklearnModel'
        :param experiment: experiment object of ExperimentL1 at 1-Level or ExperimentL2 at 2-Level
        :param model_param_keys: list of model param keys. eg. ['paramA', 'paramB', 'paramC']
        :param model_param_vals: list of model param values (iterable). eg. [['valAa', 'valAb'], [0.1, 0.2], (1, 2, 3)]
        :param cv_out: Output pickle file name of cross validation score results.
        :param cv_pred_out: prediction of cross validation each fold.
        :param refit_pred_out: refit on full train set and predict on test set.
        :return: (best parameters, best score)
        """

        self.wrapper_class = wrapper_class
        self.experiment = experiment
        self.model_param_keys = model_param_keys
        self.model_param_vals = model_param_vals
        self.str_match = re.compile(r'loss')

        if wrapper_class == SklearnModel:
            self.model_name = model_param_vals[0]
        else:
            self.model_name = 'xgb'
        self.cv_out = os.path.join(Config.get_string('data.path'), cv_folder,
                                   cv_out) if cv_out else None
        self.cv_pred_out = os.path.join(Config.get_string('data.path'),
                                        cv_folder,
                                        cv_pred_out) if cv_pred_out else None
        self.refit_pred_out = os.path.join(
            Config.get_string('data.path'), cv_folder,
            refit_pred_out) if refit_pred_out else None
def get_optimal_blend_weigth(exp_, best_param_,
                             folder, fname, model_fname):
    clf = RidgeClassifier()
    X_test, y_test = exp_.get_test_data()
    clf.set_params(**best_param_)
    clf.fit(X_test, y_test)

    # dump optimal linear weights to csv
    names = np.append(np.array(['intercept'], dtype='S100'), X_test.columns.values)
    coefs = np.append(clf.intercept_, clf.coef_).astype(np.float64)
    optimal_linear_weight = pd.DataFrame(coefs.reshape(1,len(coefs)), columns=names)
    optimal_linear_weight.to_csv(os.path.join(Config.get_string('data.path'),
                                              folder,
                                              fname), index=False)

    # dump ridge model to gzip'd cPickle
    model_fname = os.path.join(Config.get_string('data.path'), folder, model_fname)
    with gzip.open(model_fname, 'wb') as gf:
        cPickle.dump(clf, gf, cPickle.HIGHEST_PROTOCOL)
    
    return True
Example #35
def principal_component_analysis(x_train):

    """
    Principal Component Analysis (PCA) identifies the combination
    of attributes (principal components, or directions in the feature space)
    that account for the most variance in the data.

    Let's calculate the 2 first principal components of the training data,
    and then create a scatter plot visualizing the training data examples
    projected on the calculated components.
    """

    # Extract the variable to be predicted
    y_train = x_train["TARGET"]
    x_train = x_train.drop(labels="TARGET", axis=1)
    classes = np.sort(np.unique(y_train))
    labels = ["Satisfied customer", "Unsatisfied customer"]

    # Normalize each feature to unit norm (vector length)
    x_train_normalized = normalize(x_train, axis=0)

    # Run PCA
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(x_train_normalized)

    # Visualize
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1)
    colors = [(0.0, 0.63, 0.69), 'black']
    markers = ["o", "D"]
    for class_ix, marker, color, label in zip(
            classes, markers, colors, labels):
        ax.scatter(x_train_projected[np.where(y_train == class_ix), 0],
                   x_train_projected[np.where(y_train == class_ix), 1],
                   marker=marker, color=color, edgecolor='whitesmoke',
                   linewidth=1, alpha=0.9, label=label)
        ax.legend(loc='best')
    plt.title(
        "Scatter plot of the training data examples projected on the "
        "2 first principal components")
    plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[0] * 100.0))
    plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[1] * 100.0))
    #plt.show()

    #plt.savefig("../data/pca.pdf", format='pdf')
    #plt.savefig("../data/pca.png", format='png')
    plt.savefig(os.path.join(Config.get_string('data.path'), 'graph', 'pca.png'), format='png')
    plt.close()
Example #36
def load_mnist_images(filename):
    filename = os.path.join(Config.get_string('data.path'), 'input', filename)
    if not os.path.exists(filename):
        download(filename)
    # Read the inputs in Yann LeCun's binary format.
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    # The inputs are vectors now, we reshape them to monochrome 2D images,
    # following the shape convention: (examples, channels, rows, columns)
    data = data.reshape(-1, 1, 28, 28)
    # The inputs come as bytes, we convert them to float32 in range [0,1].
    # (Actually to range [0, 255/256], for compatibility to the version
    # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
    return data / np.float32(256)
Example #37
def dump_stacking_setting(stack_setting_):

    text = json.dumps(stack_setting_,
                      sort_keys=True,
                      ensure_ascii=False,
                      indent=4)

    data_folder = stack_setting_['setting']['folder']
    fname = stack_setting_['setting']['name']
    fname = os.path.join(Config.get_string('data.path'), data_folder, fname)
    with open(fname, 'w') as fh:
        fh.write(text.encode('utf-8'))

    return True
    def __init__(self, 
                 data_folder,
                 train_fname=None, test_fname=None):
        self.random_state = 325243  # do not change it for different l2 models!
        #self.random_state = 98754  # do not change it for different l2 models!
        if not train_fname:
            sys.stderr.write('Do not set train_meta_feature\n')
            sys.exit()
        if not test_fname:
            sys.stderr.write('Do not set test_meta_feature\n')
            sys.exit()
        train_fname = os.path.join(Config.get_string('data.path'), data_folder, train_fname)
        test_fname = os.path.join(Config.get_string('data.path'), data_folder, test_fname)

        # load train data
        train = pd.read_csv(train_fname)
        self.train_id = train.values
        self.train_y = train.label.values
        self.train_x = train.drop(['label'], axis=1)
        # load test data
        test = pd.read_csv(test_fname)
        self.test_id = test.values
        self.test_y = test.label.values
        self.test_x = test.drop(['label'], axis=1)
Example #39
    def __init__(self, data_folder, train_fname=None, test_fname=None):
        #self.random_state = 325243  # do not change it for different l1 models!
        self.random_state = 98754  # do not change it for different l1 models!
        if not train_fname:
            train_fname = 'filtered_train.csv'
        if not test_fname:
            test_fname = 'filtered_test.csv'
        train_fname = os.path.join(Config.get_string('data.path'), data_folder,
                                   train_fname)
        test_fname = os.path.join(Config.get_string('data.path'), data_folder,
                                  test_fname)
        # load train data
        train = pd.read_csv(train_fname)
        train.sort(columns='ID', inplace=1)
        self.train_id = train.values
        self.train_y = train.TARGET.values
        self.train_x = train.drop(['ID', 'TARGET'], axis=1)
        # load test data
        test = pd.read_csv(test_fname)
        test.sort(columns='ID', inplace=1)
        self.test_id = test.values
        self.test_y = test.TARGET.values
        #self.test_x = test.drop(['ID'], axis=1)
        self.test_x = test.drop(['ID', 'TARGET'], axis=1)
Example #40
    def __init__(self, data_set, abits, wbits, network_type, seed):
        self.network_type = network_type
        self.abits = abits
        self.wbits = wbits
        self.data_set = data_set
        self.seed = seed
        self.model = Sequential()
        cfDeep = self.myCF(self)
        if self.data_set == 'mnist':
            cfg = 'config_MNIST'
        if self.data_set == 'fashion':
            cfg = 'config_FASHION'
        if self.data_set == 'cifar10':
            cfg = 'config_CIFAR-10'
        self.cf = Config(cfg, cmd_args=cfDeep.myDict)
        print("Dataset: " + str("%s_pic/" % self.data_set))
        assure_path_exists("%s_pic/" % self.data_set)
Example #41
def remove_feat_identicals(data_frame):
    # Find feature vectors having the same values in the same order and
    # remove all but one of those redundant features.
    print("")
    print("Deleting identical features...")
    n_features_originally = data_frame.shape[1]
    # Find the names of identical features by going through all the
    # combinations of features (each pair is compared only once).
    feat_names_delete = []
    for feat_1, feat_2 in itertools.combinations(
            iterable=data_frame.columns, r=2):
        if np.array_equal(data_frame[feat_1], data_frame[feat_2]):
            feat_names_delete.append(feat_2)
    feat_names_delete = np.unique(feat_names_delete)
    # Delete the identical features
    data_frame = data_frame.drop(labels=feat_names_delete, axis=1)
    n_features_deleted = len(feat_names_delete)
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame


if __name__ == "__main__":
    train_data_path = os.path.join(Config.get_string('data.path'), 'input', 'train.csv')
    x_train = pd.read_csv(filepath_or_buffer=train_data_path,
                          index_col=0, sep=',')
    x_train = remove_feat_constants(x_train)
    x_train = remove_feat_identicals(x_train)
    principal_component_analysis(x_train)
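
# remove_feat_constants is called in __main__ above but not shown in this
# listing. A minimal sketch of the presumable behavior (drop zero-variance
# columns), analogous to remove_feat_identicals:
def remove_feat_constants(data_frame):
    print("")
    print("Deleting constant features...")
    feat_names_delete = [col for col in data_frame.columns
                         if data_frame[col].nunique() == 1]
    return data_frame.drop(labels=feat_names_delete, axis=1)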
Example #42
def main(stack_setting_):

    # for train set
    fname = os.path.join(Config.get_string('data.path'),
                         stack_setting_['0-Level']['folder'],
                         stack_setting_['0-Level']['filter']['train'])
    train = pd.read_csv(fname)

    # for test dataset
    fname = os.path.join(Config.get_string('data.path'),
                         stack_setting_['0-Level']['folder'],
                         stack_setting_['0-Level']['filter']['test'])
    test = pd.read_csv(fname)

    print("= Stats Summary in train and test set ")
    train1 = extend_df(train.copy())
    test1 = extend_df(test.copy())
    train1.to_csv(os.path.join(
        Config.get_string('data.path'), stack_setting_['0-Level']['folder'],
        stack_setting_['0-Level']['raw_extend']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['raw_extend']['test']),
                 index=False)

    print("= Scailing in train and test set ")
    train1, test1 = scale(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['scaled']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['scaled']['test']),
                 index=False)

    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(
        Config.get_string('data.path'), stack_setting_['0-Level']['folder'],
        stack_setting_['0-Level']['scaled_extend']['train']),
                  index=False)
    test1.to_csv(os.path.join(
        Config.get_string('data.path'), stack_setting_['0-Level']['folder'],
        stack_setting_['0-Level']['scaled_extend']['test']),
                 index=False)

    print("= Standardizing train and test set ")
    train1, test1 = standard(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'),
                               stack_setting_['0-Level']['folder'],
                               stack_setting_['0-Level']['standard']['train']),
                  index=False)
    test1.to_csv(os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['standard']['test']),
                 index=False)

    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(
        Config.get_string('data.path'), stack_setting_['0-Level']['folder'],
        stack_setting_['0-Level']['standard_extend']['train']),
                  index=False)
    test1.to_csv(os.path.join(
        Config.get_string('data.path'), stack_setting_['0-Level']['folder'],
        stack_setting_['0-Level']['standard_extend']['test']),
                 index=False)

    return True
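
# `extend_df`, `scale`, and `standard` are project helpers that do not appear
# in this excerpt. A minimal sketch of `extend_df`, assuming it appends simple
# per-row summary statistics over the numeric feature columns:
import numpy as np
import pandas as pd

def extend_df(df):
    feats = df.drop(labels=['TARGET'], axis=1, errors='ignore')
    feats = feats.select_dtypes(include=[np.number])
    df = df.copy()
    df['feat_sum'] = feats.sum(axis=1)
    df['feat_mean'] = feats.mean(axis=1)
    df['feat_nonzero'] = (feats != 0).sum(axis=1)
    return df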
Example #43
import random  # needed for the random.seed call below

import numpy as np  # needed for the np.random.seed call below
from sklearn import preprocessing
import scipy.special as special
from pandas import DataFrame, Series
from tqdm import tqdm
import time

# import sys
# sys.path.extend('../')

from utils.data_utils import preprocess
from utils.config_utils import Config

from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.io as scio

cfg = Config()

np.random.seed(cfg.seed)
random.seed(cfg.seed)
"""
Feature Extraction Tools

TF-IDF + W2V + Multi-label + Onehot + Click multiply + Time Sequence + Shuffle

"""


def tfidf(log, pivot_key, out_key, flag, max_df=0.99):
    """
    TF-IDF Features
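    # The original body is truncated in this excerpt. A minimal sketch,
    # assuming `log` holds one `out_key` token per row for each `pivot_key`;
    # `flag` is left unused here:
    sentences = log.groupby(pivot_key)[out_key].agg(
        lambda xs: ' '.join(str(x) for x in xs))
    vectorizer = TfidfVectorizer(max_df=max_df)
    tfidf_mat = vectorizer.fit_transform(sentences.values)
    return sentences.index.values, tfidf_mat
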
def main():
    # for train set
    fname = os.path.join(Config.get_string('data.path'), 'input', 'filtered_train.csv')
    train = pd.read_csv(fname)

    # for test dataset
    fname = os.path.join(Config.get_string('data.path'), 'input', 'filtered_test.csv')
    test = pd.read_csv(fname)



    # for extended
    print('--- extending raw dataset ---')
    train1 = extend_df(train.copy())
    test1 = extend_df(test.copy())
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'raw_extend_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'raw_extend_test.csv'), index=0)

    # for scaled
    print('--- scaling raw dataset to [0, 1] ---')
    train1, test1 = scale(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_test.csv'), index=0)

    # for extended scaled
    print('--- extending scaled dataset ---')
    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_extend_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'scaled_extend_test.csv'), index=0)

    # for normalized data
    print('--- standardizing dataset ---')
    train1, test1 = standard(train, test)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_test.csv'), index=0)

    # for pca
    print "--- transforming pca dataset  ----"
    train2, test2 = pca(train1, test1, components=100)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca100_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca100_test.csv'), index=0)

    train2, test2 = pca(train1, test1, components=200)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca200_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca200_test.csv'), index=0)

    # for pca extend
    print "--- standard -> standard + pca  ----"
    train2, test2 = pca_extend(train1, test1, components=10)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca10_and_standard_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca10_and_standard_test.csv'), index=0)

    train2, test2 = pca_extend(train1, test1, components=20)
    train2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca20_and_standard_train.csv'), index=0)
    test2.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'pca20_and_standard_test.csv'), index=0)

    del train2
    del test2

    # for extended normalized data
    print('--- extending standard dataset ---')
    train1 = extend_df(train1)
    test1 = extend_df(test1)
    train1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_extend_train.csv'), index=0)
    test1.to_csv(os.path.join(Config.get_string('data.path'), 'input', 'standard_extend_test.csv'), index=0)
    pass
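
# `pca` and `pca_extend` are referenced above but not shown. A minimal sketch
# of `pca_extend`, assuming it appends the first `components` principal
# components to the standardized features:
import pandas as pd
from sklearn.decomposition import PCA

def pca_extend(train, test, components=10):
    # Fit PCA on the train features, project both sets, and append the
    # component columns to the original frames.
    pca_model = PCA(n_components=components)
    pca_train = pca_model.fit_transform(train)
    pca_test = pca_model.transform(test)
    cols = ['pca_%d' % i for i in range(components)]
    train_ext = pd.concat([train.reset_index(drop=True),
                           pd.DataFrame(pca_train, columns=cols)], axis=1)
    test_ext = pd.concat([test.reset_index(drop=True),
                          pd.DataFrame(pca_test, columns=cols)], axis=1)
    return train_ext, test_ext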
def combine_meta_features(stack_setting_):

    #data_folder = stack_setting_['1-Level' ]['meta_features']
    #fname = stack_setting_['setting']['name']
    #fname = os.path.join(Config.get_string('data.path'), data_folder, fname)

    train_merge = []
    test_merge = []
    for model_name in stack_setting_['1-Level'].keys():
        try:
            if model_name == 'gbdt_linear':
                # train
                folder = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['train']
                cmd = "ls %s%s/%s*%s*.%s" % (Config.get_string('data.path'),
                                             folder,
                                             "_".join(".".join(train_fname.split('.')[:-1]).split("_")[:-1]), 
                                             ".".join(train_fname.split('.')[:-1]).split("_")[-1], 
                                             train_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    train = pd.read_csv(line.rstrip('\n'))
                    col_name = train.columns.values[:-1]
                    X_train = train[col_name]
                    col_name = train.columns.values[-1]
                    y_train = train[col_name]
                    train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['lower']['meta_feature']['test']
                cmd = "ls %s%s/%s*%s*.%s" % (Config.get_string('data.path'),
                                             folder,
                                             "_".join(".".join(test_fname.split('.')[:-1]).split("_")[:-1]), 
                                             ".".join(test_fname.split('.')[:-1]).split("_")[-1], 
                                             test_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    test = pd.read_csv(line.rstrip('\n'))
                    col_name = test.columns.values[:-1]
                    X_test = test[col_name]
                    col_name = test.columns.values[-1]
                    y_test = test[col_name]
                    test_merge.append(X_test)
            else:
                # train
                folder = stack_setting_['1-Level'][model_name]['meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['meta_feature']['train']
                train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
                #data1.columns.values
                train = pd.read_csv(train_fname)
                col_name = train.columns.values[:-1]
                X_train = train[col_name]
                col_name = train.columns.values[-1]
                y_train = train[col_name]
                train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['meta_feature']['test']
                test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
                #data1.columns.values
                test = pd.read_csv(test_fname)
                col_name = test.columns.values[:-1]
                X_test = test[col_name]
                col_name = test.columns.values[-1]
                y_test = test[col_name]
                test_merge.append(X_test)

        except Exception:
            # entries without the expected meta-feature files
            # (e.g. the 'meta_features' output section itself) are skipped
            pass

    train_merge.append(y_train)
    train_merge = pd.concat(train_merge, ignore_index=False, axis=1)
    #print train_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder, train_fname)
    train_merge.to_csv(train_fname, index=False)

    test_merge.append(y_test)
    test_merge = pd.concat(test_merge, ignore_index=False, axis=1)
    #print test_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    test_fname = stack_setting_['1-Level']['meta_features']['test']
    test_fname = os.path.join(Config.get_string('data.path'), folder, test_fname)
    test_merge.to_csv(test_fname, index=False)
    return True
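
# The `ls` subprocess round-trip above is fragile (it depends on shell quoting
# and on bytes-vs-str handling of the pipe output). glob can walk the same
# "<prefix>*<suffix>*.<ext>" pattern in-process; a sketch with an illustrative
# helper name:
import glob

def iter_meta_feature_files(data_path, folder, fname):
    stem, ext = os.path.splitext(fname)
    prefix, _, suffix = stem.rpartition('_')
    pattern = os.path.join(data_path, folder,
                           '%s*%s*%s' % (prefix, suffix, ext))
    for path in sorted(glob.glob(pattern)):
        yield path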
def main_transform(stack_setting_):

    """
     [rawdata2filterdata Step]
      1. Reading raw datasets
      2. Dropping useless feat columns in training set
      3. Dropping useless feat columns in test set
    """

    raw_train_path = os.path.join(Config.get_string('data.path'), 
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'), 
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])
    print("= Reading raw datasets ...")

    names = ("age, workclass, fnlwgt, education, education-num, material, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, country, TARGET").split(', ')

    # train
    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)#, index_col=0, sep=','
    train_labels = pd.DataFrame((raw_train['TARGET'].values == ' >50K').astype(np.int32), 
                                columns=['TARGET'])
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    #print 'train summary'
    #print raw_train[['age', 'hours-per-week', 'fnlwgt']].describe()
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_train['%s_%s' % (key, 'log')] = np.log(raw_train[key] + 1.0).astype(np.float32)
        raw_train = raw_train.drop(labels=[key], axis=1)
    raw_train['TARGET'] = train_labels; del train_labels

    # test
    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)#, index_col=0, sep=','
    test_labels = pd.DataFrame((raw_test['TARGET'].values == ' >50K').astype(np.int32), 
                                columns=['TARGET'])
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    print('test summary-1')
    print(raw_test.describe())
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_test['%s_%s' % (key, 'log')] = np.log(raw_test[key] + 1.0).astype(np.float32)
        raw_test = raw_test.drop(labels=[key], axis=1)
    print('\n')
    print('test summary-2')
    print(raw_test.describe())
    raw_test['TARGET'] = test_labels; del test_labels


    # main
    print("= Transform Categorical Variables into One-Hot-Encoding ...")

    categoricals = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
    combined = pd.concat([raw_train, raw_test])
    #print combined.head()
    #combined_categoricals = combined[categoricals]

    clf = onehot_encode()
    clf.fit(combined, categoricals)
    # dump train
    raw_train = clf.transform(raw_train, categoricals)
    train_label = raw_train.TARGET.values
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    raw_train['TARGET'] = pd.DataFrame(train_label); del train_label
    train_path = os.path.join(Config.get_string('data.path'), 
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')

    # dump test
    raw_test = clf.transform(raw_test, categoricals)
    test_label = raw_test.TARGET.values
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    raw_test['TARGET'] = pd.DataFrame(test_label); del test_label
    test_path = os.path.join(Config.get_string('data.path'), 
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
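
# `onehot_encode` is project code that is not shown here. A minimal stand-in,
# assuming `fit` records the category levels seen in the combined frame and
# `transform` replaces each categorical column with matching indicator columns:
class onehot_encode(object):
    def fit(self, df, categoricals):
        self.levels_ = {col: sorted(df[col].dropna().unique())
                        for col in categoricals}
        return self

    def transform(self, df, categoricals):
        out = df.copy()
        for col in categoricals:
            for level in self.levels_[col]:
                out['%s=%s' % (col, level)] = (out[col] == level).astype('int32')
            out = out.drop(labels=[col], axis=1)
        return out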
Example #47
def remove_feat_identicals(data_frame):
    # Variant of the helper above that also returns the names of the
    # dropped columns so the same deletions can be replayed on the test set.
    n_features_originally = data_frame.shape[1]
    feat_names_delete = []
    for feat_1, feat_2 in itertools.combinations(
            iterable=data_frame.columns, r=2):
        if np.array_equal(data_frame[feat_1], data_frame[feat_2]):
            feat_names_delete.append(feat_2)
    feat_names_delete = np.unique(feat_names_delete)
    # Delete the identical features
    data_frame = data_frame.drop(labels=feat_names_delete, axis=1)
    n_features_deleted = len(feat_names_delete)
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))
    return data_frame, feat_names_delete


if __name__=='__main__':
    data_path = os.path.join(Config.get_string('data.path'), 'input')
    raw_train_path = os.path.join(data_path, 'train.csv')
    raw_test_path = os.path.join(data_path, 'test.csv')
    print("= Reading raw datasets ...")
    raw_train = pd.read_csv(raw_train_path, index_col=0, sep=',')
    raw_test = pd.read_csv(raw_test_path, index_col=0, sep=',')

    print("= Droping useless feat columns in training set ")
    raw_train, feat_to_delete = remove_feat_constants(raw_train)
    raw_train, temp = remove_feat_identicals(raw_train)
    feat_to_delete.extend(temp)

    print("= Droping useless feat columns in test set:")
    print feat_to_delete
    raw_test = raw_test.drop(feat_to_delete, axis=1)
Example #48
import os

from seq2seq_regression.Seq2SeqRegression import train_on_plouffe_copy
from utils.config_utils import Config, flat_dict, flat_dict_helper

if __name__ == "__main__":
    config_path = os.getcwd()
    config = Config(config_path)

    cmd_args = config.config_init_parser()
    load_params = 0

    if cmd_args.config_file is None:
        sess_args = vars(cmd_args)
        load_params = 1
    else:
        yml_args = config.config_parse_yaml()
        sess_args = flat_dict(yml_args)

    train_on_plouffe_copy(sess_args, load_params)
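
# `flat_dict` comes from utils.config_utils and is not shown. A common shape
# for such a helper, flattening nested YAML sections into one level of keys
# (an assumption, not the project's verified behavior):
def flat_dict(nested, out=None):
    out = {} if out is None else out
    for key, val in nested.items():
        if isinstance(val, dict):
            flat_dict(val, out)
        else:
            out[key] = val
    return out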
def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):

    """
     upper model is GBDT or Random Forest
     lower model is Linear Classifier
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()

    if num_proc is None:
        num_proc = 6


    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state', 'subsample', 'max_features', 'max_leaf_nodes', 'learning_rate', 'max_depth', 'min_samples_leaf']

    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0], [0.1], [5], [20], [0.1], [2], [8]]


    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 is model-agnostic; the dataset, by contrast, is fixed
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'], 
                       test_fname = stack_setting_['0-Level']['test'])

    # GridSearch wraps a single model; the model is determined by its parameters
    #gs = GridSearch(SklearnModel, exp, upper_param_keys, upper_param_vals,
    #                cv_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['folder'],
    #                cv_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_out'], 
    #                cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_pred_out'], 
    #                refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['refit_pred_out'])
    #upper_best_param, upper_best_score = gs.search_by_cv()


    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_train_fname = os.path.join(Config.get_string('data.path'), 
                                     model_folder, 
                                     model_train_fname)
    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    model_test_fname = os.path.join(Config.get_string('data.path'), 
                                    model_folder, 
                                    model_test_fname)
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))
    if os.path.isfile(model_train_fname) is False and \
            os.path.isfile(model_test_fname) is False:
        #upper_param_dict['model_type'] == [GradientBoostingClassifier]
        del upper_param_dict['model_type']        
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict, 
                              verbose = 10, 
                              scoring = "f1",#scoring = "precision" or "recall"
                              n_jobs = num_proc, cv = 5)
        
        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print(upper_best_params)
        del clf_cv
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder']
        graph_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name']
        graph_fname = os.path.join(Config.get_string('data.path'), 
                                   graph_folder, 
                                   graph_fname)
        gs = GridSpec(2,2)
        ax1 = plt.subplot(gs[0,1])
        ax2 = plt.subplot(gs[1,1])
        ax3 = plt.subplot(gs[:,0])

        ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        ax1.set_xlabel('Number of weak learners (Boosting Iterations)')
        ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax1.legend(loc="best")       

        # dump for the transformated feature
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        if type(X_train) == pd.core.frame.DataFrame:
            clf.fit(X_train.as_matrix().astype(np.float32), y_train)
        elif type(X_train) == np.ndarray:
            clf.fit(X_train.astype(np.float32), y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)

        if type(X_train) == pd.core.frame.DataFrame:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        elif type(X_train) == np.ndarray:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensembles
        score_threshold=0.8
        index2feature = dict(zip(np.arange(len(X_train.columns.values)), X_train.columns.values))
        feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]]
        feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index]
        fis = pd.DataFrame(
            {'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
             'score':feature_importances_score}
            )
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature_Importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        #print clf.toarray().shape
        # >(26049, 100)
        #input_features = 26049, weak_learners = 100
        #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0]
        #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:]

        ## feature transformation : get test data from train trees
        #print transformated_train_features.shape, X_train.shape
        #print transformated_test_features.shape, X_test.shape

        transformated_train_features = clf.one_hot_encoding
        if type(X_test) == pd.core.frame.DataFrame:
            transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32), 
                                                        y_test)
        elif type(X_test) == np.ndarray:
            transformated_test_features = clf.transform(X_test, y_test)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        #model_train_fname = os.path.join(Config.get_string('data.path'), 
        #                                 model_folder, 
        #                                 model_train_fname)
        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train], 
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        #model_test_fname = os.path.join(Config.get_string('data.path'), 
        #                                model_folder, 
        #                                model_test_fname)
        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)


    """
    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    if lower_param_dict['model_type'] == [LogisticRegression]:

        # grid search for lower model : Linear Classifier
        # ExperimentL1_1 has model free. On the other hand, data is fix
        model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                             train_fname = model_train_fname, 
                             test_fname = model_test_fname)
        # GridSearch has a single model. model is dertermined by param
        gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                        cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                        cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], 
                        cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], 
                        refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
        lower_best_param, lower_best_score = gs.search_by_cv()
        print lower_best_param
    

        # get meta_feature
        exp.write2csv_meta_feature(
            model = LogisticRegression(),
            meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
            meta_train_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'],
            meta_test_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'],
            meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
            best_param_ = lower_best_param
            )
    """

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # ExperimentL1_1 is model-agnostic; the dataset, by contrast, is fixed
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'

    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'

    else:
        sys.stderr.write("You should input lower liner model\n")
        sys.exit()

    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                         train_fname = model_train_fname, 
                         test_fname = model_test_fname)
    # GridSearch wraps a single model; the model is determined by its parameters
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                    cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], 
                    cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], 
                    refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print(lower_best_param)

    # get meta_feature
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
        best_param_ = lower_best_param
        )

    ## best parameters for GBDT and another sklearn classifier
    ## NOTE: upper_best_params is only bound when the cached model files are
    ## rebuilt above; with the cache present this return raises a NameError.
    #return best_param, best_score
    return upper_best_params, lower_best_param
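
# `TreeTransform` above is project code. The leaf-encoding idea it implements
# can be sketched with plain scikit-learn: each sample becomes the one-hot
# identity of the leaf it reaches in every boosted tree (names here are
# illustrative, not the project's API):
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

def leaf_one_hot(X_train, y_train, X_test, **gbdt_params):
    gbdt = GradientBoostingClassifier(**gbdt_params)
    gbdt.fit(X_train, y_train)
    # apply() returns, per sample, the leaf index reached in every tree
    train_leaves = gbdt.apply(X_train)[:, :, 0]
    test_leaves = gbdt.apply(X_test)[:, :, 0]
    enc = OneHotEncoder(handle_unknown='ignore')
    return enc.fit_transform(train_leaves), enc.transform(test_leaves)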
Example #50
def combine_meta_features(stack_setting_):

    #data_folder = stack_setting_['1-Level' ]['meta_features']
    #fname = stack_setting_['setting']['name']
    #fname = os.path.join(Config.get_string('data.path'), data_folder, fname)

    train_merge = []
    test_merge = []
    for model_name in stack_setting_['1-Level'].keys():
        try:
            if model_name == 'gbdt_linear':
                # train
                folder = stack_setting_['1-Level'][model_name]['lower'][
                    'meta_feature']['folder']
                train_fname = stack_setting_['1-Level'][model_name]['lower'][
                    'meta_feature']['train']
                cmd = "ls %s%s/%s*%s*.%s" % (
                    Config.get_string('data.path'), folder, "_".join(".".join(
                        train_fname.split('.')[:-1]).split("_")[:-1]),
                    ".".join(train_fname.split('.')[:-1]).split("_")[-1],
                    train_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    train = pd.read_csv(line.rstrip('\n'))
                    col_name = train.columns.values[:-1]
                    X_train = train[col_name]
                    col_name = train.columns.values[-1]
                    y_train = train[col_name]
                    train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['lower'][
                    'meta_feature']['folder']
                test_fname = stack_setting_['1-Level'][model_name]['lower'][
                    'meta_feature']['test']
                cmd = "ls %s%s/%s*%s*.%s" % (
                    Config.get_string('data.path'), folder, "_".join(".".join(
                        test_fname.split('.')[:-1]).split("_")[:-1]), ".".join(
                            test_fname.split('.')[:-1]).split("_")[-1],
                    test_fname.split('.')[-1])
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                for line in iter(p.stdout.readline, b''):
                    test = pd.read_csv(line.rstrip('\n'))
                    col_name = test.columns.values[:-1]
                    X_test = test[col_name]
                    col_name = test.columns.values[-1]
                    y_test = test[col_name]
                    test_merge.append(X_test)
            else:
                # train
                folder = stack_setting_['1-Level'][model_name]['meta_feature'][
                    'folder']
                train_fname = stack_setting_['1-Level'][model_name][
                    'meta_feature']['train']
                train_fname = os.path.join(Config.get_string('data.path'),
                                           folder, train_fname)
                #data1.columns.values
                train = pd.read_csv(train_fname)
                col_name = train.columns.values[:-1]
                X_train = train[col_name]
                col_name = train.columns.values[-1]
                y_train = train[col_name]
                train_merge.append(X_train)

                # test
                folder = stack_setting_['1-Level'][model_name]['meta_feature'][
                    'folder']
                test_fname = stack_setting_['1-Level'][model_name][
                    'meta_feature']['test']
                test_fname = os.path.join(Config.get_string('data.path'),
                                          folder, test_fname)
                #data1.columns.values
                test = pd.read_csv(test_fname)
                col_name = test.columns.values[:-1]
                X_test = test[col_name]
                col_name = test.columns.values[-1]
                y_test = test[col_name]
                test_merge.append(X_test)

        except Exception:
            # entries without the expected meta-feature files
            # (e.g. the 'meta_features' output section itself) are skipped
            pass

    train_merge.append(y_train)
    train_merge = pd.concat(train_merge, ignore_index=False, axis=1)
    #print train_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    train_fname = stack_setting_['1-Level']['meta_features']['train']
    train_fname = os.path.join(Config.get_string('data.path'), folder,
                               train_fname)
    train_merge.to_csv(train_fname, index=False)

    test_merge.append(y_test)
    test_merge = pd.concat(test_merge, ignore_index=False, axis=1)
    #print test_merge.head(10)
    folder = stack_setting_['1-Level']['meta_features']['folder']
    test_fname = stack_setting_['1-Level']['meta_features']['test']
    test_fname = os.path.join(Config.get_string('data.path'), folder,
                              test_fname)
    test_merge.to_csv(test_fname, index=False)
    return True
Example #51
                print(checkpoint_name)
                #print(len(name_list))
                #exit()
                
                train_from_config(lr,
                                  batch_size,
                                  num_nodes,
                                  dataset_size,
                                  teacher_forcing,
                                  checkpoint_name,
                                  log_dir_num,
                                  log_dir_path,
                                  train_option,
                                  sys.argv)
                
                log_dir_num += 1
                #print(checkpoint_name_idx)
                if checkpoint_name_idx < len(name_list)-1:
                    checkpoint_name_idx += 1
                else:
                    checkpoint_name_idx = 0


if __name__=="__main__":
    config_path = os.getcwd()
    config = Config(config_path)
    yml_args = config.config_parse_yaml()
    sess_args = flat_dict(yml_args)

    train_many_jobs(sess_args)
import argparse

# the parser setup is truncated in this excerpt; the flag spelling below is
# an assumption reconstructed from `arguments.config_path` further down
parser = argparse.ArgumentParser()
parser.add_argument('--config_path', action='store',
                    default=None,
                    help='Configuration file')
parser.add_argument('-o', '--override', action='store', nargs='*', default=[])

arguments = parser.parse_args()
override_dir = {}

for s in arguments.override:
    s_s = s.split("=")
    k = s_s[0].strip()
    v = "=".join(s_s[1:]).strip()
    override_dir[k] = v
arguments.override = override_dir

cfg = arguments.config_path
cf = Config(cfg, cmd_args=arguments.override)

# if necessary, only use the CPU for debugging
if cf.cpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = cf.cuda

# ## Construct the network
print('Construct the Network\n')
model = build_model(cf)

print('loading data\n')
train_data, val_data, test_data = load_dataset(cf.dataset, cf)

print('setting up the network and creating callbacks\n')
Example #53
def main_transform(stack_setting_):
    """
     [rawdata2filterdata Step]
      1. Reading raw datasets
      2. Dropping useless feat columns in training set
      3. Dropping useless feat columns in test set
    """

    raw_train_path = os.path.join(Config.get_string('data.path'),
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'),
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])
    print("= Reading raw datasets ...")

    names = (
        "age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, TARGET"
    ).split(', ')

    # train
    raw_train = pd.read_csv(raw_train_path, names=names,
                            skiprows=1)  #, index_col=0, sep=','
    train_labels = pd.DataFrame(
        (raw_train['TARGET'].values == ' >50K').astype(np.int32),
        columns=['TARGET'])
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    #print 'train summary'
    #print raw_train[['age', 'hours-per-week', 'fnlwgt']].describe()
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_train['%s_%s' % (key, 'log')] = np.log(raw_train[key] +
                                                   1.0).astype(np.float32)
        raw_train = raw_train.drop(labels=[key], axis=1)
    raw_train['TARGET'] = train_labels
    del train_labels

    # test
    raw_test = pd.read_csv(raw_test_path, names=names,
                           skiprows=1)  #, index_col=0, sep=','
    test_labels = pd.DataFrame(
        (raw_test['TARGET'].values == ' >50K').astype(np.int32),
        columns=['TARGET'])
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    print('test summary-1')
    print(raw_test.describe())
    for key in ['fnlwgt', 'capital-gain', 'capital-loss']:
        raw_test['%s_%s' % (key, 'log')] = np.log(raw_test[key] + 1.0).astype(
            np.float32)
        raw_test = raw_test.drop(labels=[key], axis=1)
    print('\n')
    print('test summary-2')
    print(raw_test.describe())
    raw_test['TARGET'] = test_labels
    del test_labels

    # main
    print("= Transform Categorical Variables into One-Hot-Encoding ...")

    categoricals = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country'
    ]
    combined = pd.concat([raw_train, raw_test])
    #print combined.head()
    #combined_categoricals = combined[categoricals]

    clf = onehot_encode()
    clf.fit(combined, categoricals)
    # dump train
    raw_train = clf.transform(raw_train, categoricals)
    train_label = raw_train.TARGET.values
    raw_train = raw_train.drop(labels=['TARGET'], axis=1)
    raw_train['TARGET'] = pd.DataFrame(train_label)
    del train_label
    train_path = os.path.join(Config.get_string('data.path'),
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')

    # dump test
    raw_test = clf.transform(raw_test, categoricals)
    test_label = raw_test.TARGET.values
    raw_test = raw_test.drop(labels=['TARGET'], axis=1)
    raw_test['TARGET'] = pd.DataFrame(test_label)
    del test_label
    test_path = os.path.join(Config.get_string('data.path'),
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
def get_ridge_plot(best_param_, experiment_, 
                   param_keys_, param_vals_,
                   png_folder,
                   png_fname,
                   score_threshold=0.8):

    parameters = dict(zip(param_keys_, param_vals_))
    del parameters['model_type']

    clf = RidgeClassifier()
    X_train, y_train = experiment_.get_train_data()
    clf.set_params(**best_param_)
    clf.fit(X_train, y_train)    
    best_alpha = best_param_['alpha']
    result = {'alphas':[],
              'coefs':np.zeros( (len(parameters['alpha']), len(X_train.columns.values) + 1) ),
              'scores':[],
              'score':None}


    for i, alpha in enumerate(parameters.get('alpha',None)):
        result['alphas'].append(alpha)
        best_param_['alpha'] = alpha
        clf.set_params(**best_param_)
        clf.fit(X_train, y_train)

        # regularization path
        tmp = np.array([0 for j in xrange(len(X_train.columns.values) + 1)], dtype=np.float32)
        if best_param_['fit_intercept']:
            tmp = np.append(clf.intercept_, clf.coef_)
        else:
            tmp[1:] = clf.coef_
        result['coefs'][i,:] = tmp
        result['scores'].append(experiment_.get_proba(clf, X_train))
    del X_train, y_train

    # 2. map column indices to feature names
    tmp_len = len(experiment_.get_data_col_name())
    index2feature = dict(zip(np.arange(1, tmp_len + 1), 
                             experiment_.get_data_col_name()))
    if best_param_['fit_intercept']:
        index2feature[0] = 'intercept'

    # 3. plot
    gs = GridSpec(2,2)
    ax1 = plt.subplot(gs[:,0])
    ax2 = plt.subplot(gs[0,1])
    ax3 = plt.subplot(gs[1,1])


    # 3.1 feature importance
    labels = np.append(np.array(['intercept'], dtype='S100'), experiment_.get_data_col_name())
    nrows, ncols = result['coefs'].shape
    for ncol in xrange(ncols):
        ax1.plot(np.array(result['alphas']), result['coefs'][:,ncol], label = labels[ncol])
    ax1.legend(loc='best')
    ax1.set_xscale('log')
    ax1.set_title("Regularization Path:%1.3e" % (best_alpha))
    ax1.set_xlabel("alpha", fontsize=10)

    # 3.2 PDF
    X_test, y_test = experiment_.get_test_data()
    result['score'] = clf.decision_function(X_test)
    sns.distplot(result['score'], kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF : Decision_Function")


    # 3.3 CDF
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(result['score'], bins=num_bins, normed=True)
    except:
        counts, bin_edges = np.histogram(result['score'], normed=True)

    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Decision_Function:Confidence_Score", fontsize=10)


    png_fname = os.path.join(Config.get_string('data.path'), png_folder, png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
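
# RidgeClassifier exposes decision_function but no predict_proba; if a
# probability-like score is wanted for the PDF/CDF panels above, a sigmoid
# squash of the margin is a common, uncalibrated stand-in (clf and X_test as
# in the function body):
import scipy.special as special

proba_like = special.expit(clf.decision_function(X_test))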
Example #55
def get_xgb_feature_importance_plot(best_param_,
                                    experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1.
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({
        'name': index2feature.keys(),
        'score': index2feature.values()
    })
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(
            score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(
        x='score',
        y='name',
        data=fis,
        #ax=ax1,
        color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)
    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), png_folder,
                             png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)  #, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
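
# get_fscore() keys are internal booster names like 'f12'. To label the plot
# with real column names, they can be mapped back by position (a sketch,
# assuming train_X is a DataFrame whose columns are in training order):
name_map = {'f%d' % i: col for i, col in enumerate(train_X.columns)}
fis['name'] = fis['name'].map(lambda key: name_map.get(key, key))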
Example #56
def get_rf_feature_importance_plot(best_param_,
                                   experiment_,
                                   png_folder,
                                   png_fname,
                                   score_threshold=0.8):

    # 1.
    best_param_['oob_score'] = True

    # 2.
    train_X, train_y = experiment_.get_train_data()
    clf = RandomForestClassifier()
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)

    index2feature = dict(
        zip(np.arange(len(train_X.columns.values)), train_X.columns.values))
    feature_importances_index = [
        str(j) for j in clf.feature_importances_.argsort()[::-1]
    ]
    feature_importances_score = [
        clf.feature_importances_[int(j)] for j in feature_importances_index
    ]
    fis = pd.DataFrame({
        'name': [
            index2feature.get(int(key), 'Null')
            for key in feature_importances_index
        ],
        'score':
        feature_importances_score
    })
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(
            score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 3. plot
    gs = GridSpec(2, 2)
    ax1 = plt.subplot(gs[:, 0])
    ax2 = plt.subplot(gs[0, 1])
    ax3 = plt.subplot(gs[1, 1])

    # 3.1 feature importance
    sns.barplot(x='score', y='name', data=fis, ax=ax1, color="blue")
    #ax1.set_title("Feature_Importance", fontsize=10)
    ax1.set_ylabel("Feature", fontsize=10)
    ax1.set_xlabel("Feature_Importance", fontsize=10)

    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:, 1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = 100
    try:
        counts, bin_edges = np.histogram(confidence_score,
                                         bins=num_bins,
                                         normed=True)
    except:
        counts, bin_edges = np.histogram(confidence_score, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), png_folder,
                             png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
override_dir = {}
#arguments.override=
#for s in arguments.override:
#    s_s = s.split("=")
#    k = s_s[0].strip()
#    v = "=".join(s_s[1:]).strip()
#    override_dir[k]=v
#arguments.override = override_dir
override_dir['lr'] = 0.01
override_dir['wbits'] = 4
override_dir['abits'] = 4
override_dir['network_type'] = 'full-qnn'

#config_path
cfg = "config_CIFAR-10"
cf = Config(cfg, cmd_args=override_dir)

# if necessary, only use the CPU for debugging
#if cf.cpu:
#    os.environ["CUDA_VISIBLE_DEVICES"] = ""

# ## Construct the network
print('Construct the Network\n')

model = build_model(cf)

print('setting up the network and creating callbacks\n')

early_stop = EarlyStopping(monitor='loss',
                           min_delta=0.001,
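                           patience=10,  # assumption: the original call is
                           mode='min')   # truncated here; these are
                                         # illustrative EarlyStopping arguments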