Code Example #1
import itertools
import sys
import warnings

import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.pipeline import make_pipeline


def evaluate_model(dataset, pipeline_components, pipeline_parameters):
    # Load the gzipped TSV dataset; every column except 'class' is a feature
    input_data = pd.read_csv(dataset, compression='gzip', sep='\t')
    features = input_data.drop('class', axis=1).values.astype(float)
    labels = input_data['class'].values

    # Expand the per-component parameter lists into every combination of
    # settings: one dict mapping component class -> keyword arguments per run
    pipelines = [
        dict(zip(pipeline_parameters.keys(), list(parameter_combination)))
        for parameter_combination in itertools.product(
            *pipeline_parameters.values())
    ]

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        for pipe_parameters in pipelines:
            pipeline = []
            for component in pipeline_components:
                if component in pipe_parameters:
                    args = pipe_parameters[component]
                    pipeline.append(component(**args))
                else:
                    pipeline.append(component())

            try:
                clf = make_pipeline(*pipeline)
                cv_predictions = cross_val_predict(estimator=clf,
                                                   X=features,
                                                   y=labels,
                                                   cv=StratifiedKFold(
                                                       n_splits=10,
                                                       shuffle=True,
                                                       random_state=90483257))
                accuracy = accuracy_score(labels, cv_predictions)
                macro_f1 = f1_score(labels, cv_predictions, average='macro')
                balanced_accuracy = balanced_accuracy_score(
                    labels, cv_predictions)
            except KeyboardInterrupt:
                sys.exit(1)
            # This is a catch-all to make sure that the evaluation won't crash due to a bad parameter
            # combination or bad data. Turn this off when debugging!
            except Exception:
                continue

            # The final pipeline component is the classifier; record its
            # parameter settings for the output row
            classifier_class = pipeline_components[-1]
            param_string = ','.join([
                '{}={}'.format(parameter, value) for parameter, value in
                pipe_parameters[classifier_class].items()
            ])

            # One tab-separated row: dataset name (minus the '.tsv.gz'
            # suffix), classifier name, parameters, and the three scores
            out_text = '\t'.join([
                dataset.split('/')[-1][:-7], classifier_class.__name__,
                param_string,
                str(accuracy),
                str(macro_f1),
                str(balanced_accuracy)
            ])

            print(out_text)
            sys.stdout.flush()
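
A minimal usage sketch for evaluate_model: pipeline_parameters maps each component class to a list of keyword-argument dicts, and every combination across components is evaluated. The classes, parameter values, and dataset path below are illustrative assumptions, not taken from the source.

# Hypothetical invocation; classes, values, and path are assumptions.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler

evaluate_model(
    dataset='datasets/spambase.tsv.gz',               # hypothetical path
    pipeline_components=[RobustScaler, LogisticRegression],
    pipeline_parameters={
        # Components without an entry are constructed with default arguments
        LogisticRegression: [{'C': 0.1}, {'C': 1.0}, {'C': 10.0}],
    },
)
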
Code Example #2
        try:
            # Build the pipeline: the chosen preprocessor followed by a
            # LinearSVC with the current hyperparameter combination
            clf = make_pipeline(
                preprocessor,
                LinearSVC(C=C,
                          loss=loss,
                          penalty=penalty,
                          dual=dual,
                          fit_intercept=fit_intercept,
                          random_state=324089))
            # 10-fold CV score for the pipeline
            cv_predictions = cross_val_predict(estimator=clf,
                                               X=features,
                                               y=labels,
                                               cv=10)
            accuracy = accuracy_score(labels, cv_predictions)
            macro_f1 = f1_score(labels, cv_predictions, average='macro')
            balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)
        except KeyboardInterrupt:
            sys.exit(1)
        # Catch-all so a bad parameter combination doesn't crash the sweep.
        # Turn this off when debugging!
        except Exception:
            continue

        param_string = ''
        param_string += 'preprocessor={},'.format(
            preprocessor.__class__.__name__)
        param_string += 'C={},'.format(C)
        param_string += 'loss={},'.format(loss)
        param_string += 'penalty={},'.format(penalty)
        param_string += 'dual={},'.format(dual)
        param_string += 'fit_intercept={}'.format(fit_intercept)

        out_text = '\t'.join([dataset.split('/')[-1][:-7],
                              'LinearSVC',
                              param_string,
                              str(accuracy),
                              str(macro_f1),
                              str(balanced_accuracy)])

        print(out_text)
        sys.stdout.flush()
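
The fragment above uses preprocessor, C, loss, penalty, dual, and fit_intercept without defining them; a minimal sketch of the kind of enclosing grid sweep that could supply them (the value lists are assumptions, not from the source):

# Assumed hyperparameter sweep; value lists are illustrative only.
import itertools
from sklearn.preprocessing import RobustScaler, StandardScaler

param_grid = itertools.product(
    [StandardScaler(), RobustScaler()],  # preprocessor
    [0.01, 0.1, 1.0, 10.0],              # C
    ['hinge', 'squared_hinge'],          # loss
    ['l2'],                              # penalty
    [True, False],                       # dual
    [True, False],                       # fit_intercept
)

for preprocessor, C, loss, penalty, dual, fit_intercept in param_grid:
    pass  # the try/except block from Code Example #2 would run here
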
Code Example #3
    # Load the gzipped TSV dataset, as in Code Example #1
    input_data = pd.read_csv(dataset, compression='gzip', sep='\t')
    features = input_data.drop('class', axis=1).values.astype(float)
    labels = input_data['class'].values

    try:
        # Create the pipeline for the model
        clf = make_pipeline(StandardScaler(),
                            GradientBoostingClassifier(loss=loss,
                                                       learning_rate=learning_rate,
                                                       n_estimators=n_estimators,
                                                       max_depth=max_depth,
                                                       max_features=max_features,
                                                       random_state=324089))
        # 10-fold CV score for the pipeline
        cv_predictions = cross_val_predict(estimator=clf, X=features, y=labels, cv=10)
        accuracy = accuracy_score(labels, cv_predictions)
        macro_f1 = f1_score(labels, cv_predictions, average='macro')
        balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)
    except KeyboardInterrupt:
        sys.exit(1)
    # Catch-all so a bad parameter combination or bad data doesn't crash the
    # evaluation. Turn this off when debugging!
    except Exception:
        continue

    param_string = ''
    param_string += 'loss={},'.format(loss)
    param_string += 'learning_rate={},'.format(learning_rate)
    param_string += 'n_estimators={},'.format(n_estimators)
    param_string += 'max_depth={},'.format(max_depth)
    param_string += 'max_features={}'.format(max_features)

    out_text = '\t'.join([dataset.split('/')[-1][:-7],
                          'GradientBoostingClassifier',
                          param_string,
                          str(accuracy),
                          str(macro_f1),
                          str(balanced_accuracy)])

    print(out_text)
    sys.stdout.flush()