コード例 #1
ファイル: __main__.py プロジェクト: sb123456789sb/AlphaPy
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Load the train and test partitions, build and save the feature
    matrix, fit every algorithm in ``model.algolist``, then generate
    metrics, plots, and the saved best model.

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match,
        then this exception is raised.
    KeyError
        If the scorer named in the model specifications is not one of
        the available ``scorers``.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    predict_mode = model.specs['predict_mode']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows     : %d", X_test.shape[0])
    logger.info("Number of Testing Columns  : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data so that every feature transform below
    # is applied to both partitions at once; split_point marks the row
    # where the test data begins.

    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match." %
            (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        # NOTE(review): the body of this branch is missing from the
        # available source; presumably it invoked a crosstab helper on
        # the model. Restore the original call here.
        pass

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
        except KeyError:
            logger.info("Algorithm %s not found", algo)
            # Skip this algorithm: without an estimator the steps below
            # would hit unbound names on the first iteration, or silently
            # reuse the previous algorithm's estimator on later ones.
            continue
        scoring = estimator.scoring
        est = estimator.estimator
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots (test plots only when test labels were found)

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
コード例 #2
ファイル: __main__.py プロジェクト: sb123456789sb/AlphaPy
def prediction_pipeline(model):
    r"""AlphaPy Prediction Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    None : None

    Notes
    -----
    The saved model is loaded from disk, and predictions are made
    on the new testing data. The predictions are stored on the model
    under the ``('BEST', Partition.predict)`` key and then saved.

    """

    logger.info("Predict Mode")

    # Unpack the model specifications

    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    model_type = model.specs['model_type']
    rfe = model.specs['rfe']
    separator = model.specs['separator']

    # Get the prediction data (labels, if any, are ignored)

    partition = Partition.predict
    X_predict, _ = get_data(model, partition)

    # Load feature_map
    model = load_feature_map(model, directory)

    # Log feature statistics

    logger.info("Feature Statistics")
    logger.info("Number of Prediction Rows    : %d", X_predict.shape[0])
    logger.info("Number of Prediction Columns : %d", X_predict.shape[1])

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X_predict)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Create initial features
    all_features = create_features(model, all_features)

    # Generate interactions
    all_features = create_interactions(model, all_features)

    # Remove low-variance features
    all_features = remove_lv_features(model, all_features)

    # Load the univariate support vector, if any

    if feature_selection:
        logger.info("Getting Univariate Support")
        try:
            support = model.feature_map['uni_support']
        except KeyError:
            logger.info("No Univariate Support")
        else:
            # Column selection by support vector; assumes all_features
            # supports 2-D positional indexing here -- TODO confirm.
            all_features = all_features[:, support]
            logger.info("New Feature Count : %d", all_features.shape[1])

    # Load the RFE support vector, if any

    if rfe:
        logger.info("Getting RFE Support")
        try:
            support = model.feature_map['rfe_support']
        except KeyError:
            logger.info("No RFE Support")
        else:
            all_features = all_features[:, support]
            logger.info("New Feature Count : %d", all_features.shape[1])

    # Load predictor
    predictor = load_predictor(directory)

    # Make predictions

    logger.info("Making Predictions")
    tag = 'BEST'
    model.preds[(tag, partition)] = predictor.predict(all_features)
    if model_type == ModelType.classification:
        # Keep the second predict_proba column for each row.
        # NOTE(review): the assignment target was truncated in the
        # available source; ``model.probas`` is reconstructed by analogy
        # with ``model.preds`` -- confirm the attribute name.
        model.probas[(tag, partition)] = \
            predictor.predict_proba(all_features)[:, 1]

    # Get date stamp to record file creation

    d = datetime.now()
    f = "%Y%m%d"
    timestamp = d.strftime(f)

    # Save predictions
    save_predictions(model, tag, partition)