コード例 #1
0
ファイル: train_dual.py プロジェクト: fagan2888/trains
def main():
    """
    Get data from db and save it as csv
    """
    # Helpers
    bq = bqhandler.BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)

    # Configuration
    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    # Initialise classifier
    if hasattr(options, 'classifier_file'):
        classifier = io.load_scikit_model(options.classifier_file)
    else:
        if options.classifier == 'svc':
            params = {
                'kernel': options.kernel,
                'gamma': options.gamma,
                'C': options.penalty,
                'probability': options.probability
            }
            classifier = SVC(**params)
        elif options.classifier == 'rfc':
            classifier = RandomForestClassifier(n_jobs=-1)
        else:
            raise (
                'Model not specificied or wrong. Add "classifier: bgm" to config file.'
            )

    # Initialise regression model
    if options.regression == 'rfr':
        model = RandomForestRegressor(
            n_estimators=options.n_estimators,
            n_jobs=-1,
            min_samples_leaf=options.min_samples_leaf,
            min_samples_split=options.min_samples_split,
            max_features=options.max_features,
            max_depth=options.max_depth,
            bootstrap=options.bootstrap)
        regressor = _trans.Regressor(model=model)
    else:
        raise (
            'Model not specificied or wrong. Add "classifier: bgm" to config file.'
        )

    # Initialise transformer
    transformer = _trans.Selector(classifier=classifier)

    # Initialise pipeline
    pipe = Pipeline([('selector', transformer), ('regression', regressor)])

    sum_columns = ['delay']
    if options.reason_code_table is not None:
        sum_columns = ['count']

    # Pick only selected month
    where = {}
    if options.pick_month is not None:
        where = {'EXTRACT(MONTH from time)': options.pick_month}

    logging.info('Reading data...')
    bq.set_params(loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  reason_code_table=options.reason_code_table,
                  where=where)

    data = bq.get_rows(starttime, endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    logging.info('Processing {} rows...'.format(len(data)))

    # Filter only timesteps with large distribution in the whole network
    if options.filter_delay_limit is not None:
        data = io.filter_delay_with_limit(data, options.filter_delay_limit)

    # Binary class
    logging.info('Adding binary class to the dataset with limit {}...'.format(
        options.delay_limit))
    data['class'] = data['delay'].map(lambda x: 1
                                      if x > options.delay_limit else -1)

    # Separate train and validation sets
    data_train, data_test = train_test_split(data, test_size=0.3)

    # Balance
    if options.balance:
        logging.info('Balancing training data...')
        count = data_train.groupby('class').size().min()
        # SVC can't handle more than 50 000 samples
        if options.classifier == 'svc': count = min(count, 50000)
        data_train = pd.concat([
            data_train.loc[data_train['class'] == -1].sample(n=count),
            data_train.loc[data_train['class'] == 1].sample(n=count)
        ])

    logging.info('Train data:')
    io.log_class_dist(data_train.loc[:, 'class'].values, labels=[-1, 1])
    logging.info('Test data:')
    io.log_class_dist(data_test.loc[:, 'class'].values, labels=[-1, 1])

    # Adding month
    if options.month:
        logging.info('Adding month to the datasets...')
        data_train['month'] = data_train.loc[:, 'time'].map(lambda x: x.month)
        data_test['month'] = data_test.loc[:, 'time'].map(lambda x: x.month)
        options.feature_params.append('month')

    data_train.set_index('time', inplace=True)

    y_train = data_train.loc[:, ['delay', 'class']].astype(np.int32).values
    y_test = data_test.loc[:, ['delay', 'class']].astype(np.int32).values
    X_train = data_train.loc[:,
                             options.feature_params].astype(np.float32).values
    X_test = data_test.loc[:, options.feature_params].astype(np.float32).values

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # io.log_class_dist(y_train[:,1], [-1,1])

    # If asked, save used train and test splits into big query
    if options.save_data:
        tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_train'
        bq.nparray_to_table([X_train, y_train],
                            [options.feature_params, ['delay', 'class']],
                            options.project, options.feature_dataset, tname)
        tname = options.model + '_' + options.feature_dataset + '_' + options.config_name + '_test'
        bq.nparray_to_table([X_test, y_test],
                            [options.feature_params, ['delay', 'class']],
                            options.project, options.feature_dataset, tname)

    if options.normalize:
        logging.info('Normalizing data...')
        if hasattr(options, 'xscaler_file'):
            scaler = io.load_scikit_model(options.xscaler_file)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        else:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        raise ("No param_grid set for given model ({})".format(
            options.regression))

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")
        fname = options.output_path + '/random_search_cv_results.txt'
        report_cv_results(random_search.cv_results_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        sys.exit()
    else:
        logging.info('Training...')
        transformer.set_y(y_train[:, 0])
        regressor.set_classifier(transformer)
        pipe.fit(X_train, y_train[:, 1])

    # Metrics
    y_pred_proba = pipe.steps[0][1].predict_proba(X_test)
    y_pred = pipe.steps[0][1].predict(X_test, type='int')

    io.save_scikit_model(pipe,
                         filename=options.save_file,
                         ext_filename=options.save_file)

    # Classification performance
    y_test_class = y_test[:, 1]

    acc = accuracy_score(y_test_class, y_pred)
    precision = precision_score(y_test_class, y_pred, average='micro')
    recall = recall_score(y_test_class, y_pred, average='micro')
    f1 = f1_score(y_test_class, y_pred, average='micro')

    logging.info('Classification accuracy: {}'.format(acc))
    logging.info('Classification precision: {}'.format(precision))
    logging.info('Classification recall: {}'.format(recall))
    logging.info('Classification F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, [-1, 1])

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class,
                              y_pred,
                              np.arange(2),
                              filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(
        options.output_path)
    viz.plot_confusion_matrix(y_test_class,
                              y_pred,
                              np.arange(2),
                              True,
                              filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve.png'.format(options.output_path)
    viz.prec_rec_curve(y_test_class, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc.png'.format(options.output_path)
    viz.plot_binary_roc(y_test_class, y_pred_proba, filename=fname)

    if options.normalize:
        fname = options.save_path + '/xscaler.pkl'
        io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    if options.regression == 'rfr':
        fname = options.output_path + '/rfc_feature_importance.png'
        viz.rfc_feature_importance(pipe.steps[1][1].get_feature_importances(),
                                   fname,
                                   feature_names=options.feature_params)

    # Regression performance
    #y_pred = pipe.steps[1][1].predict(X_test)
    y_test_reg = y_test[:, 0]
    pipe.set_params(selector__full=True)
    y_pred = pipe.predict(X_test, full=True)

    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred))
    mae = mean_absolute_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)

    logging.info('Regression RMSE: {}'.format(rmse))
    logging.info('Regression MAE: {}'.format(mae))
    logging.info('Regression R2 score: {}'.format(r2))

    error_data = {
        'acc': [acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1],
        'rmse': [rmse],
        'mae': [mae],
        'r2': [r2]
    }
    fname = '{}/training_time_classification_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Sorting is actually not necessary. It's been useful for debugging.
        test_data.sort_values(by=['time', 'trainstation'], inplace=True)
        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info(
            'Adding binary class to the test dataset with limit {}...'.format(
                options.delay_limit))
        #logging.info('Adding binary class to the dataset with limit {}...'.format(limit))
        #data['class'] = data['count'].map(lambda x: 1 if x > options.delay_count_limit else -1)
        test_data['class'] = test_data['delay'].map(
            lambda x: 1 if x > options.delay_limit else -1)
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=[-1, 1])

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        times = [('2014-01-01', '2014-02-01'), ('2016-06-01', '2016-07-01'),
                 ('2017-02-01', '2017-03-01'), ('2011-02-01', '2017-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(
                    test_data, options.feature_params, pipe.steps[0][1],
                    scaler, start, end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
コード例 #2
0
def main():
    """
    Get data from db and save it as csv
    """
    # Helpers
    bq = bqhandler.BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)
    predictor = Predictor(io, ModelLoader(io), options)

    ### OPTIONS ################################################################

    # Configuration
    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(options.feature_dataset,
                                                                  starttime.strftime('%Y-%m-%d'),
                                                                  endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    ### MODELS #################################################################

    # Initialise classifier
    if hasattr(options, 'classifier_file'):
        classifier = io.load_scikit_model(options.classifier_file)
    else:
        if options.classifier == 'svc':
            params = {'kernel': options.kernel, 'gamma': options.gamma, 'C': options.penalty, 'probability': options.probability}
            #classifier = SVC(**params)
            classifier = SVCClassifier(params, limit=options.class_limit)
        elif options.classifier == 'graphsvc':

            classifier  = GraphSVCClassifier()
            graph_data = pd.read_csv(options.graph_data, names=['date', 'start_hour', 'src', 'dst', 'type', 'sum_delay','sum_ahead','add_delay','add_ahead','train_count'])
            classifier.fetch_connections(graph_data)

        elif options.classifier == 'gaussiannb':
            classifier = GaussianNBClassifier()
        elif options.classifier == 'lstm':
            num_of_features = len(options.feature_params)
            if options.month: num_of_features += 1
            class_weight=None
            if hasattr(options, 'class_weight'):
                class_weight=eval(options.class_weight)
            params = {'length': options.time_steps, 'batch_size': options.batch_size, 'epochs': options.epochs, 'num_of_features': num_of_features, 'log_dir': options.log_dir, 'class_weight':class_weight}
            classifier = LSTMClassifier(**params)
        else:
            raise('Model not specificied or wrong. Add "classifier: bgm" to config file.')

    # Initialise regression model
    if options.regression == 'rfr':
        regressor = RandomForestRegressor(n_estimators=options.n_estimators,
                                      n_jobs=-1,
                                      min_samples_leaf=options.min_samples_leaf,
                                      min_samples_split=options.min_samples_split,
                                      max_features=options.max_features,
                                      max_depth=options.max_depth,
                                      bootstrap=options.bootstrap
                                      )
        #regressor = _trans.Regressor(model=model)
    else:
        raise('Model not specificied or wrong. Add "classifier: bgm" to config file.')

    # Initialise transformer
    #transformer = _trans.Selector(classifier=classifier)

    # Initialise pipeline
    #pipe = Pipeline(
    #    [('selector', transformer),
    #     ('regression', regressor)]
    #)

    ### DATA ###################################################################

    sum_columns = ['delay']
    if 'train_count' in options.meta_params:
        sum_columns.append('train_count')

    # Pick only selected month
    where = {}
    if options.pick_month is not None:
        where = {'EXTRACT(MONTH from time)': options.pick_month}

    logging.info('Reading data...')
    bq.set_params(loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  locations=options.locations,
                  only_winters=options.only_winters,
                  reason_code_table=options.reason_code_table,
                  where=where)

    data = bq.get_rows(starttime,
                       endtime)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    data['delay'] = data.loc[:, 'delay'].replace(-99, np.nan)
    data.sort_values(by=['trainstation', 'time'], inplace=True)

    logging.info('Processing {} rows...'.format(len(data)))

    # Filter only timesteps with large distribution in the whole network
    if options.filter_delay_limit is not None:
        data = io.filter_delay_with_limit(data, options.filter_delay_limit)

    # Binary class
    logging.info('Adding binary class to the dataset with limit {}...'.format(options.delay_limit))
    def set_class(x):
        if x > options.delay_limit:
            return binary_labels[1]
        elif x < options.delay_limit:
            return binary_labels[0]
        return np.nan
    data['class'] = data['delay'].map(lambda x: set_class(x))

    # Separate train and validation sets
    data_train, data_test = train_test_split(data, test_size=0.3, shuffle=False)

    # Balance
    if options.balance:
        logging.info('Balancing training data...')
        count = data_train.groupby('class').size().min()
        # SVC can't handle more than 50 000 samples
        if options.classifier == 'svc': count = min(count, 50000)
        data_train = pd.concat([data_train.loc[data_train['class'] == 0].sample(n=count),
        data_train.loc[data_train['class'] == 1].sample(n=count)])

    logging.info('Train data:')
    io.log_class_dist(data_train.loc[:, 'class'].values, labels=binary_labels)
    logging.info('Test data:')
    io.log_class_dist(data_test.loc[:, 'class'].values, labels=binary_labels)

    # Adding month
    if options.month:
        logging.info('Adding month to the datasets...')
        data_train['month'] = data_train.loc[:,'time'].map(lambda x: x.month)
        data_test['month'] = data_test.loc[:,'time'].map(lambda x: x.month)
        options.feature_params.append('month')

    #data_train.set_index('time', inplace=True)
    #y_train_class = data_train.loc[:,['class']].astype(np.int32).values.ravel()
    #y_train_delay = data_train.loc[:,['delay']].astype(np.int32).values.ravel()
    y_train_class = data_train.loc[:,['class']].values.ravel()
    y_train_delay = data_train.loc[:,['delay']].values.ravel()
    #y_test_class = data_test.loc[:,['class']].astype(np.int32).values.ravel()
    #y_test_delay = data_test.loc[:,['delay']].astype(np.int32).values.ravel()
    y_test_class = data_test.loc[:,['class']].values.ravel()
    y_test_delay = data_test.loc[:,['delay']].values.ravel()

    X_train = data_train.loc[:,options.feature_params].astype(np.float32).values
    X_test = data_test.loc[:,options.feature_params].astype(np.float32).values

    if options.smote:
        logging.info('Smoting...')
        sm = SMOTE()
        X_train_class, y_class = sm.fit_resample(X_train, y_train_class)
        io.log_class_dist(y_class, labels=binary_labels)

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # io.log_class_dist(y_train[:,1], [-1,1])

    # If asked, save used train and test splits into big query
    if options.save_data:
        tname = options.model+'_'+options.feature_dataset+'_'+options.config_name+'_train'
        columns = [options.feature_params, ['delay'], ['class']]
        bq.nparray_to_table([X_train, y_train_class, y_train_delay],
                            columns,
                            options.project,
                            options.feature_dataset,
                            tname
                            )
        tname = options.model+'_'+options.feature_dataset+'_'+options.config_name+'_test'
        bq.nparray_to_table([X_test, y_test_class, y_test_delay],
                            columns,
                            options.project,
                            options.feature_dataset,
                            tname
                            )

    if options.normalize:
        logging.info('Normalizing data...')
        #scale=(0,1)
        if hasattr(options, 'xscaler_file'):
            xscaler = io.load_scikit_model(options.xscaler_file)
            X_train = xscaler.transform(X_train)
            X_test = xscaler.transform(X_test)
        else:
            xscaler = MinMaxScaler(feature_range=(-1,1))
            #xscaler = StandardScaler()
            X_train = xscaler.fit_transform(X_train)
            X_test = xscaler.transform(X_test)
            fname = options.save_path+'/xscaler.pkl'
            io.save_scikit_model(xscaler, fname, fname)

        if hasattr(options, 'yscaler_file'):
            yscaler = io.load_scikit_model(options.yscaler_file)
            y_train_delay = yscaler.transform(y_train_delay)
            y_test_delay = yscaler.transform(y_test_delay)
        else:
            #yscaler = MinMaxScaler(feature_range=(0,1))
            yscaler=StandardScaler()
            y_train_delay = yscaler.fit_transform(y_train_delay.reshape(-1,1)).ravel()
            y_test_delay = yscaler.transform(y_test_delay.reshape(-1,1)).ravel()
            fname = options.save_path+'/yscaler.pkl'
            io.save_scikit_model(yscaler, fname, fname)


    data_train.loc[:,options.feature_params].to_csv('data/x_train.csv', index=False)
    data_test.loc[:,options.feature_params].to_csv('data/x_test.csv', index=False)
    data_train.loc[:,['class']].fillna(-99).astype(np.int).to_csv('data/y_train.csv', index=False)
    data_test.loc[:,['class']].fillna(-99).astype(np.int).to_csv('data/y_test.csv', index=False)
    sys.exit()

    ### TRAIN ##################################################################

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        raise("No param_grid set for given model ({})".format(options.regression))

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)
        logging.info("RandomizedSearchCV done.")
        fname = options.output_path+'/random_search_cv_results.txt'
        report_cv_results(random_search.cv_results_, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
        sys.exit()
    else:
        logging.info('Training classifier...')

        if options.classifier == 'graphsvc':
            classifier.fit(X_train, y_train_class, stations=data_train.loc[:, 'trainstation'].values)
        else:
            history = classifier.fit(X_train, y_train_class, X_test, y_test_class)

        # Save classifier
        if options.classifier == 'lstm':
            history_fname = options.save_path+'/history.pkl'
            fname = options.save_path+'/classifier.h5'
            io.save_keras_model(fname, history_fname, classifier, history.history)
        else:
            fname = options.save_path+'/classifier.pkl'
            io.save_scikit_model(classifier, filename=fname, ext_filename=fname)

        # Drop data with no delay information
        X_train = X_train[~np.isnan(y_train_delay)]
        y_train_delay = y_train_delay[~np.isnan(y_train_delay)]
        y_train_class = y_train_class[~np.isnan(y_train_class)]

        y_pred_train_bin = classifier.predict(X_train, type='bool')

        # debug
        #y_pred_train_bin
        #indices = np.random.choice(np.arange(y_pred_train_bin.size),
        #                           replace=False,
        #                           size=int(y_pred_train_bin.size * 0.2))
        #y_pred_train_bin[indices] = True

        #print('y_pred_train_bin: {}'.format(y_pred_train_bin.shape))
        #print('y_train_delay: {}'.format(y_train_delay.shape))
        #print('y_train_class: {}'.format(y_train_class.shape))

        # Pick only severe values
        #y_train_delay_ = y_train_delay[(len(y_train_class)-len(y_pred_train_bin)):]
        #X_train_ = X_train[(len(y_train_class)-len(y_pred_train_bin)):]
        y_train_delay_ = y_train_delay[(len(y_train_delay)-len(y_pred_train_bin)):]
        X_train_ = X_train[(len(y_train_delay)-len(y_pred_train_bin)):]
        #print('y_train_delay_: {}'.format(y_train_delay_.shape))
        y_train_severe = y_train_delay_[y_pred_train_bin]
        X_train_severe = X_train_[y_pred_train_bin]

        logging.info('Training regressor...')
        regressor.fit(X_train_severe, y_train_severe)

    # Save regressor
    io.save_scikit_model(regressor, filename=options.save_file, ext_filename=options.save_file)

    # Learning history
    # fname = options.output_path+'/learning_over_time.png'
    # viz.plot_nn_perf(history.history, metrics={'Error': {'mean_squared_error': 'MSE',
    #                                                      'mean_absolute_error': 'MAE'}},
    #                                                      filename=fname)

    ### RESULTS FOR VALIDATION SET #############################################

    # Drop data with missing delay
    X_test = X_test[~np.isnan(y_test_class)]
    y_test_class = y_test_class[~np.isnan(y_test_class)]
    data_test = data_test[~np.isnan(data_test.delay)]

    # Metrics
    #y_pred_proba = classifier.predict_proba(X_test)
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.y_pred_proba

    #y_test_delay = y_test_delay[~np.isnan(y_test_delay)]

    # Classification performance
    # LSTM don't have first timesteps
    y_test_class = y_test_class[(len(X_test)-len(y_pred)):]

    acc = accuracy_score(y_test_class, y_pred)
    precision = precision_score(y_test_class, y_pred, average='micro')
    recall = recall_score(y_test_class, y_pred, average='micro')
    f1 = f1_score(y_test_class, y_pred, average='micro')

    logging.info('Classification accuracy: {}'.format(acc))
    logging.info('Classification precision: {}'.format(precision))
    logging.info('Classification recall: {}'.format(recall))
    logging.info('Classification F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, binary_labels)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test_class, y_pred, np.arange(2), True, filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve_validation.png'.format(options.output_path)
    viz.prec_rec_curve(y_test_class, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc_validation.png'.format(options.output_path)
    viz.plot_binary_roc(y_test_class, y_pred_proba, filename=fname)

    if options.regression == 'rfr':
        fname = options.output_path+'/rfc_feature_importance.png'
        viz.rfc_feature_importance(regressor.feature_importances_, fname, feature_names=options.feature_params)

    # Regression performance
    y_pred_reg, y_test_reg = predictor.pred(data=data_test)
    #y_test_reg = y_test[(len(y_test)-len(y_pred)):,0]
    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred))
    mae = mean_absolute_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)

    logging.info('Regression RMSE: {}'.format(rmse))
    logging.info('Regression MAE: {}'.format(mae))
    logging.info('Regression R2 score: {}'.format(r2))

    error_data = {'acc': [acc],
                  'precision': [precision],
                  'recall': [recall],
                  'f1': [f1],
                  'rmse': [rmse],
                  'mae': [mae],
                  'r2': [r2]}
    fname = '{}/training_time_classification_validation_errors.csv'.format(options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)






    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                reason_code_table=options.reason_code_table,
                                locations=options.locations,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K','L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

         # Sorting is actually not necessary. It's been useful for debugging.
        #test_data.sort_values(by=['time', 'trainstation'], inplace=True)

        # Filter only timesteps with large distribution in the whole network
        if options.filter_delay_limit is not None:
            test_data = io.filter_delay_with_limit(test_data, options.filter_delay_limit)

        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info('Adding binary class to the test dataset with limit {}...'.format(options.delay_limit))
        test_data['class'] = test_data['delay'].map(lambda x: binary_labels[1] if x > options.delay_limit else binary_labels[0])
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=binary_labels)

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        times = [('2014-01-01', '2014-02-01'), ('2016-06-01', '2016-07-01'), ('2017-02-01', '2017-03-01'), ('2011-02-01', '2011-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(test_data, options.feature_params, classifier, xscaler, start, end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))
コード例 #3
0
def main():
    """
    Get data from db and save it as csv
    """

    bq = BQHandler()
    io = IO(gs_bucket=options.gs_bucket)
    viz = Viz(io)

    starttime, endtime = io.get_dates(options)
    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    if options.model == 'bgm':
        model = BayesianGaussianMixture(
            weight_concentration_prior_type="dirichlet_process",
            n_components=options.n_components)
    elif options.model == 'gaussiannb':
        model = GaussianNB()
    elif options.model == 'rfc':
        model = RandomForestClassifier(n_jobs=-1)
    elif options.model == 'svc':
        params = {'kernel': 'rbf', 'gamma': 0.5, 'C': 1, 'probability': True}
        model = SVC(**params)
    else:
        raise (
            'Model not specificied or wrong. Add for example "model: bgm" to config file.'
        )

    if options.pca:
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

    sum_columns = ['delay']
    if options.reason_code_table is not None:
        sum_columns = ['count']

    logging.info('Reading data...')
    data = bq.get_rows(starttime,
                       endtime,
                       loc_col='trainstation',
                       project=options.project,
                       dataset=options.feature_dataset,
                       table=options.feature_table,
                       parameters=all_param_names,
                       reason_code_table=options.reason_code_table,
                       only_winters=options.only_winters)

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=sum_columns,
                                aggs=aggs)

    # Sorting is actually not necessary. It's been useful for debugging.
    data.sort_values(by=['time', 'trainstation'], inplace=True)
    data.set_index('time', inplace=True)

    logging.info('Data contain {} rows...'.format(len(data)))

    logging.info('Adding binary class to the dataset with limit {}...'.format(
        options.delay_limit))
    #logging.info('Adding binary class to the dataset with limit {}...'.format(limit))
    #data['class'] = data['count'].map(lambda x: 1 if x > options.delay_count_limit else -1)
    data['class'] = data['delay'].map(lambda x: 1
                                      if x > options.delay_limit else -1)
    io.log_class_dist(data.loc[:, 'class'].values, labels=[-1, 1])

    if options.balance:
        logging.info('Balancing dataset...')
        count = data.groupby('class').size().min()
        data = pd.concat([
            data.loc[data['class'] == -1].sample(n=count),
            data.loc[data['class'] == 1].sample(n=count)
        ])
        io.log_class_dist(data.loc[:, 'class'].values, labels=[-1, 1])

    if options.month:
        logging.info('Adding month to the dataset...')
        data['month'] = data.index.map(lambda x: x.month)
        options.feature_params.append('month')

    target = data.loc[:, 'class'].astype(np.int32).values.ravel()
    features = data.loc[:, options.feature_params].astype(np.float32).values

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.3)

    if options.normalize:
        logging.info('Normalizing data...')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    logging.debug('Features shape after pre-processing: {}'.format(
        X_train.shape))

    if options.cv:
        logging.info('Doing random search for hyper parameters...')
        if options.model == 'bgm':
            param_grid = {
                "n_components": [1, 2, 4, 8, 16],
                "covariance_type": ['full', 'tied', 'diag', 'spherical'],
                "init_params": ['kmeans', 'random']
            }
        elif options.model == 'rfc':
            raise ("Not implemented. Get back to work!")
        elif options.model == 'svc':

            features_compinations = [
                [
                    'lat', 'lon', 'pressure', 'max_temperature',
                    'min_temperature', 'mean_temperature', 'mean_dewpoint',
                    'mean_humidity', 'mean_winddirection', 'mean_windspeedms',
                    'max_windgust', 'max_precipitation1h', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'max_temperature', 'min_temperature',
                    'mean_temperature', 'mean_dewpoint', 'mean_humidity',
                    'mean_winddirection', 'mean_windspeedms', 'max_windgust',
                    'max_precipitation1h', 'max_snowdepth', 'max_n', 'min_vis',
                    'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_windgust',
                    'max_precipitation1h', 'max_snowdepth', 'max_n', 'min_vis',
                    'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation3h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'max_n', 'min_vis', 'min_clhb', 'max_precipitation1h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_dewpoint',
                    'mean_winddirection', 'mean_windspeedms', 'max_snowdepth',
                    'min_vis', 'max_precipitation1h'
                ],
                [
                    'pressure', 'min_temperature', 'mean_winddirection',
                    'mean_windspeedms', 'max_snowdepth', 'max_precipitation1h'
                ]
            ]

            param_grid = {
                "C": [0.0001, 0.001, 0.01, 0.1, 1],
                "kernel": ['rbf', 'poly'],
                "degree": [2, 3],
                "gamma": [0.5],
                "coef0": [0.1],
                "probability": [True],
                "features": features_compinations
            }

            from lib.svc import SVCF
            model = SVCF(all_features=options.feature_params)
        else:
            raise ("No param_grid set for given model ({})".format(
                options.model))

        print(model.get_params().keys())

        ftwo_scorer = make_scorer(fbeta_score, beta=2)
        scoring = {
            'accuracy': 'accuracy',
            'precision': 'precision',
            'recall': 'recall',
            'f1': 'f1',
            'f2': ftwo_scorer
        }

        random_search = RandomizedSearchCV(model,
                                           param_distributions=param_grid,
                                           n_iter=int(options.n_iter_search),
                                           verbose=1,
                                           scoring=scoring,
                                           refit='recall',
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)

        logging.info("RandomizedSearchCV done.")
        scores = ['accuracy', 'precision', 'recall', 'f1', 'f2']
        fname = options.output_path + '/random_search_cv_results.txt'
        io.report_cv_results(random_search.cv_results_,
                             scores=scores,
                             filename=fname,
                             ext_filename=fname)
        model = random_search.best_estimator_

        io.save_scikit_model(model,
                             filename=options.save_file,
                             ext_filename=options.save_file)
        if options.normalize:
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    else:
        logging.info('Training...')
        model.fit(X_train, y_train)

        # Save model and xscaler (no reason to save xscaler before the model has fitted as well)
        io.save_scikit_model(model,
                             filename=options.save_file,
                             ext_filename=options.save_file)
        if options.normalize:
            fname = options.save_path + '/xscaler.pkl'
            io.save_scikit_model(scaler, filename=fname, ext_filename=fname)

    # Metrics
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)
    # We want [-1,1] classes as y values are
    y_pred[y_pred == 0] = -1

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    logging.info('Accuracy: {}'.format(acc))
    logging.info('Precision: {}'.format(precision))
    logging.info('Recall: {}'.format(recall))
    logging.info('F1 score: {}'.format(f1))
    io.log_class_dist(y_pred, labels=[-1, 1])

    error_data = {
        'acc': [acc],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1]
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)

    # Confusion matrices
    fname = '{}/confusion_matrix_validation.png'.format(options.output_path)
    viz.plot_confusion_matrix(y_test, y_pred, np.arange(2), filename=fname)

    fname = '{}/confusion_matrix_validation_normalised.png'.format(
        options.output_path)
    viz.plot_confusion_matrix(y_test,
                              y_pred,
                              np.arange(2),
                              True,
                              filename=fname)

    # Precision-recall curve
    fname = '{}/precision-recall-curve.png'.format(options.output_path)
    viz.prec_rec_curve(y_test, y_pred_proba, filename=fname)

    # ROC
    fname = '{}/roc.png'.format(options.output_path)
    viz.plot_binary_roc(y_test, y_pred_proba, filename=fname)

    ############################################################################
    # EVALUATE
    ############################################################################
    if options.evaluate:
        logging.info('Loading test data...')
        test_data = bq.get_rows(dt.datetime.strptime('2010-01-01', "%Y-%m-%d"),
                                dt.datetime.strptime('2019-01-01', "%Y-%m-%d"),
                                loc_col='trainstation',
                                project=options.project,
                                dataset=options.feature_dataset,
                                table=options.test_table,
                                parameters=all_param_names)

        test_data = io.filter_train_type(labels_df=test_data,
                                         train_types=['K', 'L'],
                                         sum_types=True,
                                         train_type_column='train_type',
                                         location_column='trainstation',
                                         time_column='time',
                                         sum_columns=['delay'],
                                         aggs=aggs)

        # Sorting is actually not necessary. It's been useful for debugging.
        test_data.sort_values(by=['time', 'trainstation'], inplace=True)
        test_data.set_index('time', inplace=True)
        logging.info('Test data contain {} rows...'.format(len(test_data)))

        logging.info(
            'Adding binary class to the test dataset with limit {}...'.format(
                options.delay_limit))
        #logging.info('Adding binary class to the dataset with limit {}...'.format(limit))
        #data['class'] = data['count'].map(lambda x: 1 if x > options.delay_count_limit else -1)
        test_data['class'] = test_data['delay'].map(
            lambda x: 1 if x > options.delay_limit else -1)
        io.log_class_dist(test_data.loc[:, 'class'].values, labels=[-1, 1])

        if options.month:
            logging.info('Adding month to the test dataset...')
            test_data['month'] = test_data.index.map(lambda x: x.month)

        times = [('2011-02-01', '2011-03-01'), ('2016-06-01', '2016-07-01'),
                 ('2017-02-01', '2017-03-01'), ('2011-02-01', '2017-03-01')]
        for start, end in times:
            try:
                y_pred_proba, y_pred, y = predict_timerange(
                    test_data, options.feature_params, model, scaler, start,
                    end)
                perf_metrics(y_pred_proba, y_pred, y, start, end, viz, io)
            except EmptyDataError:
                logging.info('No data for {} - {}'.format(start, end))