def work(in_h5, out_csv_file, nest, njobs):

    # The pipe-style operators used below (P.Pipe, P.first, ...) are assumed
    # to come from the classic `pipe` package (pipe 1.x).
    import pipe as P
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack, clip)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBRegressor

    # Column indices of the nominal (categorical) features, handed to
    # OneHotEncoder via categorical_features.
    nominal_cidx = [
        0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 52,
        53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72,
        73, 74, 75, 76, 77
    ]

    data = (
        (in_h5, )
        | h5open
        | as_key('file')
        | as_key(
            'train_X', lambda d: (d['file'], )
            | getitem('train_X')
            | as_array
            | P.first)
        | as_key(
            'train_y', lambda d: (d['file'], )
            | getitem('train_y')
            | as_array
            | P.first)
        | as_key(
            'test_X', lambda d: (d['file'], )
            | getitem('test_X')
            | as_array
            | P.first)
        | as_key(
            'train_labels', lambda d: (d['file'], )
            | getitem('train_labels')
            | as_array
            | P.first)
        | as_key(
            'test_labels', lambda d: (d['file'], )
            | getitem('test_labels')
            | as_array
            | P.first)
        | as_key(
            'one_hot', lambda _:
            (OneHotEncoder(categorical_features=nominal_cidx, sparse=False), ))
        | as_key(
            'train_X', lambda d: (d['train_X'].copy(), )
            | fit_transform(d['one_hot'])
            | P.first)
        | as_key(
            'test_X', lambda d: (d['test_X'].copy(), )
            | transform(d['one_hot'])
            | P.first)
        | del_key('one_hot')
        | as_key('std_scaler', lambda _: (StandardScaler(), ))
        | as_key(
            'train_X', lambda d: (d['train_X'].copy(), )
            | fit_transform(d['std_scaler'])
            | P.first)
        | as_key(
            'test_X', lambda d: (d['test_X'].copy(), )
            | transform(d['std_scaler'])
            | P.first)
        | del_key('std_scaler')
        | as_key(
            'XGBReg',
            lambda d: (
                XGBRegressor(
                    seed=1,
                    n_estimators=nest,  #n_jobs=njobs,
                    #verbose=1,
                    #max_features=1.0, min_samples_leaf=1.0,
                    max_depth=50), )
            | fit((d['train_X'], ), (d['train_y'], ))
            | P.first)
        | as_key(
            'y_hat', lambda d: (d['test_X'], )
            | predict((d['XGBReg'], ))
            | clip(1, 8)
            | P.first)
        | del_key('XGBReg')
        | P.first)

    ((data['test_labels'], data['y_hat'])
     | stack(axis=1)
     | savetxt(out_csv_file,
               delimiter=',',
               fmt=['%d', '%d'],
               header='"Id","Response"',
               comments='')
     | P.first)

    return
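
# Usage sketch (hypothetical file names; the nest value is arbitrary).  Note
# that n_jobs is commented out inside XGBRegressor above, so the njobs
# argument is effectively unused in this variant:
#
#     work('processed-data.h5', 'xgb-submission.csv', nest=500, njobs=4)
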
def work(in_h5,
         out_csv_file,
         nest,
         njobs):

    import pipe as P  # pipe-style operators (P.first, ...), as above
    from h5pipes import h5open
    from pypipes import getitem, as_key, del_key
    from nppipes import (as_array, fit_transform, transform, fit, predict,
                         savetxt, stack)
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import ExtraTreesRegressor


    # Column indices of the nominal (categorical) features, handed to
    # OneHotEncoder via categorical_features.
    nominal_cidx = [0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                    24, 25, 26, 27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43, 44,
                    45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62,
                    63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77]

    data = (
        (in_h5,)
        | h5open
        | as_key('file')
        | as_key('train_X', lambda d:
            (d['file'],)
            | getitem('train_X')
            | as_array
            | P.first
            )
        | as_key('train_y', lambda d:
            (d['file'],)
            | getitem('train_y')
            | as_array
            | P.first
            )
        | as_key('test_X', lambda d:
            (d['file'],)
            | getitem('test_X')
            | as_array
            | P.first
            )
        | as_key('train_labels', lambda d:
            (d['file'],)
            | getitem('train_labels')
            | as_array
            | P.first
            )
        | as_key('test_labels', lambda d:
            (d['file'],)
            | getitem('test_labels')
            | as_array
            | P.first
            )

        | as_key('one_hot', lambda _:
            (OneHotEncoder(categorical_features=nominal_cidx, sparse=False),))
        | as_key('train_X', lambda d:
            (d['train_X'].copy(),)
            | fit_transform(d['one_hot'])
            | P.first
            )
        | as_key('test_X', lambda d:
            (d['test_X'].copy(),)
            | transform(d['one_hot'])
            | P.first
            )
        | del_key('one_hot')

        | as_key('std_scaler', lambda _: (StandardScaler(),))
        | as_key('train_X', lambda d:
            (d['train_X'].copy(),)
            | fit_transform(d['std_scaler'])
            | P.first
            )
        | as_key('test_X', lambda d:
            (d['test_X'].copy(),)
            | transform(d['std_scaler'])
            | P.first
            )
        | del_key('std_scaler')

        # Despite the 'RFReg' key name, this variant fits an ExtraTreesRegressor.
        | as_key('RFReg', lambda d:
            (ExtraTreesRegressor(random_state=1,
                                 n_estimators=nest, n_jobs=njobs,
                                 verbose=1,
                                 max_features=1.0,
                                 # 1 (int), not 1.0: in scikit-learn >= 0.18 a
                                 # float is read as a fraction of samples per
                                 # leaf, and 1.0 would prevent any splits.
                                 min_samples_leaf=1,
                                 max_depth=50),)
            | fit((d['train_X'],), (d['train_y'],))
            | P.first
            )
        | as_key('y_hat', lambda d:
            (d['test_X'],)
            | predict((d['RFReg'],))
            | P.first
            )
        | del_key('RFReg')

        | P.first
    )

    (
        (data['test_labels'], data['y_hat'])
        | stack(axis=1)
        | savetxt(out_csv_file,
                  delimiter=',',
                  fmt=['%d', '%d'],
                  header='"Id","Response"', comments='')
        | P.first
    )

    return
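
# Usage sketch (hypothetical file names; nest/njobs values are arbitrary).
# Unlike the XGBoost variant above, this one writes raw predictions without
# clip(1, 8):
#
#     work('processed-data.h5', 'et-submission.csv', nest=500, njobs=4)
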
def work():

    import pipe as P  # pipe-style operators (P.first, ...), as above
    from h5pipes import h5open
    from pypipes import getitem, as_key
    from nppipes import as_array
    from skll import kappa

    data = (('raw-data.h5', )
            | h5open
            | as_key('file')
            | as_key(
                'train_X', lambda d: (d['file'], )
                | getitem('train_X')
                | as_array
                | P.first)
            | as_key(
                'train_y', lambda d: (d['file'], )
                | getitem('train_y')
                | as_array
                | P.first)
            | as_key(
                'test_X', lambda d: (d['file'], )
                | getitem('test_X')
                | as_array
                | P.first)
            | as_key(
                'train_labels', lambda d: (d['file'], )
                | getitem('train_labels')
                | as_array
                | P.first)
            | as_key(
                'test_labels', lambda d: (d['file'], )
                | getitem('test_labels')
                | as_array
                | P.first)
            | P.first)

    nominal_cidx = [
        0, 1, 2, 4, 5, 6, 12, 13, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 29, 30, 31, 32, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 52,
        53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72,
        73, 74, 75, 76, 77
    ]

    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(categorical_features=nominal_cidx, sparse=False)
    data['train_X'] = enc.fit_transform(data['train_X'])
    data['test_X'] = enc.transform(data['test_X'])

    from sklearn.preprocessing import StandardScaler
    ss = StandardScaler()
    data['train_X'] = ss.fit_transform(data['train_X'])
    data['test_X'] = ss.transform(data['test_X'])

    #    from sklearn.neighbors import KNeighborsClassifier
    #    clf = KNeighborsClassifier(weights='uniform', n_neighbors=5)

    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(random_state=1, n_estimators=10, n_jobs=1)
    # rfc is only referenced by the commented-out AdaBoostClassifier below.
    rfc = RandomForestClassifier(random_state=1, n_jobs=3)

    #from sklearn.ensemble import GradientBoostingClassifier
    #clf = GradientBoostingClassifier(n_estimators=10)
    #from sklearn.ensemble import AdaBoostClassifier
    #clf = AdaBoostClassifier(rfc, n_estimators=30, random_state=1)
    #from sklearn.ensemble import ExtraTreesClassifier
    #clf = ExtraTreesClassifier(n_jobs=3, n_estimators=50, random_state=1)

    from sklearn.metrics import make_scorer
    # Quadratic-weighted Cohen's kappa: rewards close ordinal agreement and
    # penalises large misorderings more heavily than small ones.
    qwkappa = make_scorer(kappa, weights='quadratic')
    #    from sklearn.cross_validation import cross_val_score
    #    scores = cross_val_score(clf, data['train_X'], data['train_y'], cv=10,
    #                            scoring=qwkappa, n_jobs=2)
    #    print("Kappa: {:.5f} (+/- {:.5f})".format(scores.mean(), scores.std()))

    from sklearn.grid_search import GridSearchCV
    grid = GridSearchCV(
        estimator=clf,
        param_grid={
            'n_estimators': [10, 20, 50],
            'criterion': ['gini', 'entropy'],
            #'max_depth': [3, 4, 5, 7, 10]
        },
        cv=10,
        scoring=qwkappa,
        n_jobs=2,
        verbose=2)
    grid.fit(data['train_X'], data['train_y'])
    print('grid scores:', grid.grid_scores_)
    print('best score:', grid.best_score_)
    print('best params:', grid.best_params_)
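
    # Note: sklearn.grid_search (and grid.grid_scores_) were removed in
    # scikit-learn 0.20+.  A sketch of the same search against the modern
    # API, assuming an otherwise unchanged setup:
    #
    #     from sklearn.model_selection import GridSearchCV
    #     grid = GridSearchCV(estimator=clf,
    #                         param_grid={'n_estimators': [10, 20, 50],
    #                                     'criterion': ['gini', 'entropy']},
    #                         cv=10, scoring=qwkappa, n_jobs=2, verbose=2)
    #     grid.fit(data['train_X'], data['train_y'])
    #     print('cv results:', grid.cv_results_)  # replaces grid_scores_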

    pass
def work(in_train_arch,
         in_test_arch,
         in_train_csv,
         in_test_csv,
         out_h5):

    import pipe as P  # pipe-style operators (P.Pipe, P.first, P.select, ...)
    from pypipes import unzip, as_key, del_key, getitem, setitem
    from nppipes import (genfromtxt,
                         place, astype, as_columns, label_encoder,
                         fit_transform, transform, stack
                         )
    from nppipes import take as np_take
    from numpy.core.defchararray import strip
    from numpy import s_, mean, in1d, putmask
    from collections import Counter
    from h5pipes import h5new


    @P.Pipe
    def replace_missing_with(iterable, ftor):
        """Impute NaNs column-by-column with ftor() of the observed values."""
        from numpy import isnan
        for item in iterable:
            for i in range(item.shape[1]):
                mask = isnan(item[:, i])
                # Fill the missing entries with a statistic (e.g. mean)
                # computed over the non-missing entries of the same column.
                item[mask, i] = ftor(item[~mask, i])
            yield item
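
    # Minimal illustration (not part of the pipeline): the NaN in column 0
    # is replaced by the mean of that column's observed values, here 1.0.
    #
    #     from numpy import array, nan
    #     x = ((array([[1.0, 2.0], [nan, 4.0]]),)
    #          | replace_missing_with(mean)
    #          | P.first)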


    # Columns with missing values, mean-imputed below.
    missing_cidx = [11, 14, 16, 28, 33, 34, 35, 36, 37, 46, 51, 60, 68]
    # Nominal columns whose test data contains labels never seen in training;
    # those labels are remapped to the most common training label below.
    unseen_nominal_cidx = [2, 12, 38, 69, 74]
    # Nominal columns whose test labels all occur in the training data.
    seen_nominal_cidx = [0, 1, 4, 5, 6, 13, 15, 17, 18, 19, 20, 21, 22, 23,
                         24, 25, 26, 27, 29, 30, 31, 32, 39, 40, 41, 42, 43,
                         44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58,
                         59, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 75,
                         76, 77]
    nominal_cidx = seen_nominal_cidx + unseen_nominal_cidx


    data = (
        in_train_arch
        | unzip(in_train_csv)
        | genfromtxt(delimiter=',', dtype=str)
        | place(lambda d: d == '', 'nan')
        | as_key('train')
        | as_key('train_col_names', lambda d: strip(d['train'][0], '"'))
        | as_key('train_labels',    lambda d: d['train'][1:, 0].astype(int))
        | as_key('train_X',         lambda d: d['train'][1:, 1:-1])
        | as_key('train_y',         lambda d: d['train'][1:, -1].astype(int))
        | del_key('train')


        | as_key('test', lambda d:
                in_test_arch
                | unzip(in_test_csv)
                | genfromtxt(delimiter=',', dtype=str)
                | place(lambda d: d == '', 'nan')
                | P.first
                )
        | as_key('test_col_names', lambda d: strip(d['test'][0], '"'))
        | as_key('test_labels',    lambda d: d['test'][1:, 0].astype(int))
        | as_key('test_X',         lambda d: d['test'][1:, 1:])
        | del_key('test')

        | as_key('train_X', lambda d:
                (d['train_X'],)
                | np_take(missing_cidx, axis=1)
                | astype(float)

                | replace_missing_with(mean)

                | astype(str)
                | setitem(d['train_X'].copy(), s_[:, missing_cidx])
                | P.first
                )

        | as_key('label_encoders', lambda d:
                len(nominal_cidx)
                | label_encoder
                | P.as_tuple
                )

        | as_key('train_X', lambda d:
                (d['train_X'],)
                | np_take(nominal_cidx, axis=1)
                | as_columns
                | fit_transform(d['label_encoders'])
                | stack(axis=1)
                | setitem(d['train_X'].copy(), s_[:, nominal_cidx])
                | P.first
                )

        | as_key('test_X', lambda d:
                (d['test_X'],)
                | np_take(seen_nominal_cidx, axis=1)
                | as_columns
                | transform(d['label_encoders'][:-len(unseen_nominal_cidx)])
                | stack(axis=1)
                | setitem(d['test_X'].copy(), s_[:, seen_nominal_cidx])
                | P.first
                )

        | as_key('test_X', lambda d:
                (d['test_X'],)
                | np_take(unseen_nominal_cidx, axis=1)
                | as_key('test_unseen_nominals_features')

                | as_key('test_unseen_nominals', lambda d2:
                        zip(d2['test_unseen_nominals_features'].T,
                            d['label_encoders'][-len(unseen_nominal_cidx):])
                        | P.select(lambda t: list(set(t[0]) - set(t[1].classes_)))
                        | P.as_list
                        )

                | as_key('train_most_common_nominals', lambda d2:
                        zip(d['train_X'][:, unseen_nominal_cidx].T.astype(int),
                            d['label_encoders'][-len(unseen_nominal_cidx):])
                        | P.select(lambda t: t[1].inverse_transform(t[0]))
                        | P.select(lambda s: Counter(s).most_common(1)[0][0])
                        | P.as_list
                        )

                | as_key('test_corrected_features', lambda d2:
                        zip(d2['test_unseen_nominals_features'].copy().T,
                            d2['test_unseen_nominals'],
                            d2['train_most_common_nominals'])
                        # putmask() rewrites the unseen labels in-place and
                        # returns None, so `or t[0].T` evaluates to the
                        # corrected column.
                        | P.select(lambda t: putmask(t[0], in1d(t[0], t[1]), t[2]) or t[0].T)
                        | stack(axis=1)
                        | P.first
                        )

                | getitem('test_corrected_features')
                | as_columns
                | transform(d['label_encoders'][-len(unseen_nominal_cidx):])
                | stack(axis=1)
                | setitem(d['test_X'].copy(), s_[:, unseen_nominal_cidx])
                | P.first
                )

        | del_key('label_encoders')

        | as_key('test_X', lambda d:
                (d['test_X'],)
                | np_take(missing_cidx, axis=1)
                | astype(float)

                | replace_missing_with(mean)

                | astype(str)
                | setitem(d['test_X'].copy(), s_[:, missing_cidx])
                | P.first
                )

        | P.first
        )

    #print(data.keys())

    (
        (out_h5,)
        | h5new
        | as_key('train_X',         lambda _: data['train_X'].astype(float))
        | as_key('train_y',         lambda _: data['train_y'].astype(float))
        | as_key('test_X',          lambda _: data['test_X'].astype(float))
        | as_key('train_labels',    lambda _: data['train_labels'])
        | as_key('test_labels',     lambda _: data['test_labels'])
        | P.first
    )

    return
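
# Usage sketch (hypothetical archive and file names): produce the HDF5 file
# consumed by the model-fitting work() variants above:
#
#     work('train.csv.zip', 'test.csv.zip',
#          'train.csv', 'test.csv', 'processed-data.h5')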