Esempio n. 1
0
def fit(
    X: pd.DataFrame,
    y: pd.Series,
    output_dir: str,
    class_order: Optional[List[str]] = None,
    row_weights: Optional[np.ndarray] = None,
    **kwargs,
):
    """
    This hook must be implemented with your fitting code, for running DRUM in the fit mode.

    This hook MUST ALWAYS be implemented for custom tasks.
    For inference models, this hook can stick around unimplemented, and won’t be triggered.

    Parameters
    ----------
    X: pd.DataFrame - training data to perform fit on
    y: pd.Series - target data to perform fit on
    output_dir: the path to write output. This is the path provided in '--output' parameter of the
        'drum fit' command.
    class_order : A two element long list dictating the order of classes which should be used for
        modeling. Class order will always be passed to fit by DataRobot for classification tasks,
        and never otherwise. When models predict, they output a likelihood of one class, with a
        value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order
        dictates that the first element in the list will be the 0 class, and the second will be the
        1 class.
    row_weights: An array of non-negative numeric values which can be used to dictate how important
        a row is. Row weights is only optionally used, and there will be no filtering for which
        custom models support this. There are two situations when values will be passed into
        row_weights, during smart downsampling and when weights are explicitly provided by the user
    kwargs: Added for forwards compatibility

    Returns
    -------
    Nothing
    """

    print("Fitting Preprocessing pipeline")
    preprocessor = dense_preprocessing_pipeline.fit(X)
    lb = LabelEncoder().fit(y)

    # write out the class labels file
    print("Serializing preprocessor and class labels")
    with open(os.path.join(output_dir, "class_labels.txt"), mode="w") as f:
        f.write("\n".join(str(label) for label in lb.classes_))
    with open(os.path.join(output_dir, "preprocessor.pkl"), mode="wb") as f:
        pickle.dump(preprocessor, f)

    print("Transforming input data")
    X = preprocessor.transform(X)
    y = lb.transform(y)

    estimator, optimizer, criterion = build_classifier(X, len(lb.classes_))
    print("Training classifier")
    train_classifier(X, y, estimator, optimizer, criterion)
    artifact_name = "artifact.pth"
    save_torch_model(estimator, output_dir, artifact_name)
Esempio n. 2
0
def fit(
    X: pd.DataFrame,
    y: pd.Series,
    output_dir: str,
    class_order: Optional[List[str]] = None,
    row_weights: Optional[np.ndarray] = None,
    **kwargs,
) -> None:
    """
    This hook must be implemented with your fitting code, for running drum in the fit mode.
    This hook MUST ALWAYS be implemented for custom training models.
    For inference models, this hook can stick around unimplemented, and won’t be triggered.
    Parameters
    ----------
    X: pd.DataFrame - training data to perform fit on
    y: pd.Series - target data to perform fit on
    output_dir: the path to write output. This is the path provided in '--output' parameter of the
        'drum fit' command.
    class_order : A two element long list dictating the order of classes which should be used for
        modeling. Class order will always be passed to fit by DataRobot for classification tasks,
        and never otherwise. When models predict, they output a likelihood of one class, with a
        value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order
        dictates that the first element in the list will be the 0 class, and the second will be the
        1 class.
    row_weights: An array of non-negative numeric values which can be used to dictate how important
        a row is. Row weights is only optionally used, and there will be no filtering for which
        custom models support this. There are two situations when values will be passed into
        row_weights, during smart downsampling and when weights are explicitly provided by the user
    kwargs: Added for forwards compatibility
    Returns
    -------
    Nothing
    """
    # keep only numeric features
    X_train = subset_data(X)
    # Feel free to delete which ever one of these you aren't using
    if class_order:
        estimator, optimizer, criterion = build_classifier(
            X_train, len(class_order))
        train_classifier(X_train, y, estimator, optimizer, criterion)
        artifact_name = "torch_class.pth"
    else:
        estimator, optimizer, criterion = build_regressor(X_train)
        train_regressor(X_train, y, estimator, optimizer, criterion)
        artifact_name = "torch_reg.pth"

    # NOTE: We currently set a 10GB limit to the size of the serialized model
    save_torch_model(estimator, output_dir, artifact_name)