예제 #1
0
def get_data_metadata(X: DataFrame, y: Series) -> dict:
    """Collect summary metadata about a feature frame and its label series.

    Args:
        X: Feature data. Its shape and null count are recorded, and feature
            metadata is derived both from ``X`` itself and from
            ``convert_to_raw(X)``.
        y: Label values; passed to ``infer_problem_type``.

    Returns:
        A dict with the keys:

        * ``'num_rows'`` / ``'num_cols'``: the two entries of ``X.shape``.
        * ``'num_null'``: total number of null cells across all of ``X``.
        * ``'num_classes'``: ``len(y.unique())`` when the problem type is
          ``'binary'`` or ``'multiclass'``, otherwise ``None``.
        * ``'problem_type'``: the value returned by ``infer_problem_type``.
        * ``'feature_metadata'``: ``FeatureMetadata.from_df(X)``.
        * ``'feature_metadata_raw'``: ``FeatureMetadata.from_df`` applied to
          the raw-converted copy of ``X``.
    """
    X_raw = convert_to_raw(X)

    feature_metadata_orig = FeatureMetadata.from_df(X)
    feature_metadata_raw = FeatureMetadata.from_df(X_raw)

    num_rows, num_cols = X.shape
    # First sum() gives per-column null counts, second sum() totals them.
    num_null = X.isnull().sum().sum()

    try:
        problem_type = infer_problem_type(y, silent=True)
    except TypeError:
        # TODO: Remove, only here for legacy compatibility
        # Presumably older infer_problem_type signatures lack the `silent`
        # keyword; an unexpected keyword argument raises TypeError. Catching
        # only TypeError (rather than a bare except) lets KeyboardInterrupt,
        # SystemExit and genuine inference errors propagate unchanged.
        problem_type = infer_problem_type(y)
    if problem_type in ['binary', 'multiclass']:
        num_classes = len(y.unique())
    else:
        # Class count is only recorded for classification problem types.
        num_classes = None

    data_metadata = {
        'num_rows': num_rows,
        'num_cols': num_cols,
        'num_null': num_null,
        'num_classes': num_classes,
        'problem_type': problem_type,
        'feature_metadata': feature_metadata_orig,
        'feature_metadata_raw': feature_metadata_raw,
    }
    # TODO: class imbalance
    # TODO: has_text
    # TODO: has_special
    # TODO: memory size

    return data_metadata
################

# Load the training and test sets used throughout this demo.
train_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')  # file_path can also be a local CSV file; returns a Pandas DataFrame
test_data = task.Dataset(file_path='https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')  # another Pandas DataFrame
label_column = 'class'  # name of the column we want to predict
train_data = train_data.head(1000)  # keep only the first 1000 rows to make the demo faster

#############################################
# Training custom model outside of task.fit #
#############################################

# Separate features and labels
X_train = train_data.drop(columns=[label_column])
y_train = train_data[label_column]

problem_type = infer_problem_type(y=y_train)  # Infer the problem type from the labels (or specify it directly instead)
naive_bayes_model = NaiveBayesModel(path='AutogluonModels/', name='CustomNaiveBayes', problem_type=problem_type)

# Construct a LabelCleaner to convert labels to floats/integers for model training/inference.
# It can also be used to inverse_transform predictions back to the original label values.
label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y_train)
y_train_clean = label_cleaner.transform(y_train)

naive_bayes_model.fit(X_train=X_train, y_train=y_train_clean)  # Fit the custom model on the cleaned labels

# To save the model to disk and load it back, do the following:
# load_path = naive_bayes_model.path
# naive_bayes_model.save()
# del naive_bayes_model
# naive_bayes_model = NaiveBayesModel.load(path=load_path)

# Prepare test data