def start_project_with_settings(fake_jobs_df):
    '''
    Create a DataRobot project from fake_jobs_df and run quick autopilot on it.

    All configuration is read from the module-level ``ts_setting`` mapping
    (keys used: "downsampling", "holdout_pct", "validation_pct",
    "project_name", "target").

    :param fake_jobs_df: already enriched dataset; its "job_id" column is
        dropped before the data is uploaded
    :return: project, returned once wait_for_autopilot has completed
    '''
    cfg = ts_setting

    # Modeling options and partitioning are assembled before anything is
    # uploaded, so a missing setting fails before any remote call is made.
    modeling_options = dr.AdvancedOptions(
        response_cap=0.7,
        blueprint_threshold=2,
        smart_downsampled=True,
        majority_downsampling_rate=cfg["downsampling"],
    )
    tvh_partition = dr.StratifiedTVH(
        cfg["holdout_pct"], cfg["validation_pct"], seed=0)

    # The identifier column is removed from the frame that gets uploaded.
    upload_frame = fake_jobs_df.drop(columns=["job_id"])
    uploaded = dr.Dataset.create_from_in_memory_data(data_frame=upload_frame)
    project = uploaded.create_project(project_name=cfg["project_name"])

    target_kwargs = {
        "target": cfg["target"],
        "mode": dr.enums.AUTOPILOT_MODE.QUICK,
        "partitioning_method": tvh_partition,
        "advanced_options": modeling_options,
        "worker_count": -1,
    }
    project.set_target(**target_kwargs)

    # Holdout is unlocked right after the target is set, before waiting.
    project.unlock_holdout()
    project.wait_for_autopilot(verbosity=dr.VERBOSITY_LEVEL.SILENT)
    return project
# Example #2
# 0
def run_cp_project(df, proj_name, target, unique_index, window_col, workers):
    """
    Create a DataRobot project with a monotonic-increasing constraint and run
    full autopilot on it.

    df: data passed to dr.Project.create
    proj_name: name of the project
    target: target column; the positive class is fixed to 1
    unique_index: column used as the single group-partition key
    window_col: the only feature placed in the 'mono_up' feature list
    workers: worker count applied after the target is set

    Returns the project once wait_for_autopilot has returned.
    """
    project = dr.Project.create(df, proj_name, max_wait=9999)

    # Group partitioning on unique_index: no holdout, 20% validation.
    partitioning = dr.GroupTVH(
        holdout_pct=0,
        validation_pct=20,
        partition_key_cols=[unique_index],
    )

    # Feature list holding the single column constrained to be monotonically
    # increasing; only monotonic blueprints are included.
    increasing_list = project.create_featurelist(
        name='mono_up', features=[window_col])
    monotonic_options = dr.AdvancedOptions(
        monotonic_increasing_featurelist_id=increasing_list.id,
        only_include_monotonic_blueprints=True,
    )

    project.set_target(
        target=target,
        positive_class=1,
        partitioning_method=partitioning,
        mode=dr.AUTOPILOT_MODE.FULL_AUTO,
        max_wait=9999,
        advanced_options=monotonic_options,
    )
    project.set_worker_count(workers)
    project.wait_for_autopilot()
    return project
# Example #3
# 0
def run_dr_project(mlbench_project):
    """
    Run a DataRobot modeling project described by MLbench dataset metadata.

    mlbench_project: mapping with the keys 'train_dataset', 'name',
        'target_name' and 'metric'

    Returns the DataRobot project once wait_for_autopilot has returned.
    """
    logger.info('DataRobot: Creating project...')
    project = dr.Project.create(
        mlbench_project['train_dataset'],
        project_name=mlbench_project['name'],
    )

    logger.info('DataRobot: Aim...')
    # Stratified cross-validation (holdout_pct=20, reps=5) with the
    # accuracy-optimized option switched on.
    target_kwargs = {
        'target': mlbench_project['target_name'],
        'metric': mlbench_project['metric'],
        'partitioning_method': dr.StratifiedCV(holdout_pct=20, reps=5),
        'advanced_options': dr.AdvancedOptions(accuracy_optimized_mb=True),
        'worker_count': MAX_DATAROBOT_WORKERS,
    }
    project.set_target(**target_kwargs)

    logger.info('DataRobot: Waiting for autopilot...')
    project.wait_for_autopilot()

    return project
# See the metrics available for the 'was_delayed' target (optional).
# This is a bare expression: its value is discarded unless the snippet runs in
# an interactive session (REPL / notebook) that echoes the result.
project.get_metrics('was_delayed')['available_metrics']

# Custom feature list (optional): here 'myfeatures' simply contains every
# column of df.
featurelist = project.create_featurelist('myfeatures', list(df.columns.values))

# for other advanced options see the docs, e.g.
# https://datarobot-public-api-client.readthedocs-hosted.com/en/v2.17.0/autodoc/api_reference.html#advanced-options-api

# Run full autopilot on 'was_delayed' with the custom feature list, the AUC
# metric and the accuracy-optimized option ("more accurate models"), then
# block until autopilot finishes.
# NOTE(review): worker_count=-1 presumably requests all available workers -
# confirm against the client documentation.
project.set_target(
    target='was_delayed',
    featurelist_id=featurelist.id,
    metric='AUC',
    advanced_options=dr.AdvancedOptions(accuracy_optimized_mb=True),
    mode=dr.AUTOPILOT_MODE.FULL_AUTO,
    worker_count=-1)
project.wait_for_autopilot()

# Run a custom model - e.g. Fasttext word embeddings.
# Keep every blueprint that has at least one process whose name contains the
# substring 'Fasttext'.
blueprints = project.get_blueprints()
fasttext = [
    bp for bp in blueprints if any('Fasttext' in p for p in bp.processes)
]
for f in fasttext:
    # Train each selected blueprint at sample_pct=64 with cross-validation
    # scoring; the project passes its own id as source_project_id.
    job = project.train(f,
                        sample_pct=64,
                        source_project_id=project.id,
                        scoring_type=dr.enums.SCORING_TYPE.cross_validation)
    # Wait for this training job before starting the next one. `model` is
    # rebound on every iteration, so only the last model is still referenced
    # once the loop ends.
    model = dr.models.modeljob.wait_for_async_model_creation(project.id, job)
# Example #5
# 0
def _optional_int(value):
    """Return ``int(value)``, or None when *value* is None."""
    return None if value is None else int(value)


def _duration_kwargs(steps, field, factor):
    """Express *steps* project time steps as minutes/hours/days keyword args.

    Returns None when *steps* is None so the caller can leave the duration
    unset instead of building an invalid one.
    """
    if steps is None:
        return None
    parts = {'minutes': 0, 'hours': 0, 'days': 0}
    parts[field] = steps * factor
    return parts


def create_dr_project(df, project_name, ts_settings, **advanced_options):
    """
    Kickoff single DataRobot project

    df: pandas df. NOTE: the ``date_col`` column is converted to datetime
        in place, so the caller's frame is modified.
    project_name: name of project
    ts_settings: dictionary of parameters for time series project. Missing
        keys fall back to the defaults listed in ``settings`` below.
        ``validation_duration`` / ``holdout_duration`` are counted in project
        time steps (as reported by ``get_timestep``); weeks are converted to
        7 days and months to 31 days. ``num_backtests`` takes precedence over
        its alias ``number_of_backtests``. ``backtests`` is accepted but not
        forwarded.
    advanced_options: keyword overrides for ``dr.AdvancedOptions``

    Returns:
    --------
    DataRobot project object

    Raises:
    -------
    ValueError if the time unit reported by ``get_timestep`` is not one of
    minute, hour, day, week, month

    """

    print('Building Next Project \n...\n')

    #######################
    # Get Advanced Options
    #######################
    opts = {
        'weights': None,
        'response_cap': None,
        'blueprint_threshold': None,
        'seed': None,
        'smart_downsampled': False,
        'majority_downsampling_rate': None,
        'offset': None,
        'exposure': None,
        'accuracy_optimized_mb': None,
        'scaleout_modeling_mode': None,
        'events_count': None,
        'monotonic_increasing_featurelist_id': None,
        'monotonic_decreasing_featurelist_id': None,
        'only_include_monotonic_blueprints': None,
    }
    opts.update(advanced_options)

    # These options are always handed to AdvancedOptions.
    always_forwarded = (
        'weights',
        'seed',
        'monotonic_increasing_featurelist_id',
        'monotonic_decreasing_featurelist_id',
        'only_include_monotonic_blueprints',
        'accuracy_optimized_mb',
        'smart_downsampled',
    )
    # These used to be accepted and then silently dropped. They are forwarded
    # only when the caller actually set them, so calls that never used them
    # build exactly the same AdvancedOptions as before.
    forwarded_when_set = (
        'response_cap',
        'blueprint_threshold',
        'majority_downsampling_rate',
        'offset',
        'exposure',
        'scaleout_modeling_mode',
        'events_count',
    )
    adv_kwargs = {name: opts[name] for name in always_forwarded}
    for name in forwarded_when_set:
        if opts[name] is not None:
            adv_kwargs[name] = opts[name]
    adv_opts = dr.AdvancedOptions(**adv_kwargs)

    ############################
    # Get Datetime Specification
    ############################
    settings = {
        'max_date': None,
        'known_in_advance': None,
        'num_backtests': None,
        'validation_duration': None,
        'holdout_duration': None,
        'holdout_start_date': None,
        'disable_holdout': False,
        'number_of_backtests': None,
        'backtests': None,
        'use_cross_series_features': None,
        'aggregation_type': None,
        'cross_series_group_by_columns': None,
        'calendar_id': None,
        'use_time_series': False,
        'series_id': None,
        'metric': None,
        'target': None,
        'mode': dr.AUTOPILOT_MODE.FULL_AUTO,  # MANUAL #QUICK
        'date_col': None,
        'fd_start': None,
        'fd_end': None,
        'fdw_start': None,
        'fdw_end': None,
    }
    settings.update(ts_settings)

    date_col = settings['date_col']
    df[date_col] = pd.to_datetime(df[date_col])

    if settings['max_date'] is None:
        settings['max_date'] = df[date_col].max()
    else:
        settings['max_date'] = pd.to_datetime(settings['max_date'])

    # Read from the defaulted `settings`, not the raw `ts_settings`, so a
    # caller that omits 'known_in_advance' does not hit a KeyError.
    if settings['known_in_advance']:
        settings['known_in_advance'] = [
            dr.FeatureSettings(feat_name, known_in_advance=True)
            for feat_name in settings['known_in_advance']
        ]

    # Update validation and holdout duration, start, and end date
    project_time_unit, _ = get_timestep(df, settings)

    # time unit -> (duration field, number of that field per time step)
    unit_scaling = {
        'minute': ('minutes', 1),
        'hour': ('hours', 1),
        'day': ('days', 1),
        'week': ('days', 7),
        'month': ('days', 31),
    }
    if project_time_unit not in unit_scaling:
        raise ValueError(f'{project_time_unit} is not a supported timestep')
    field, factor = unit_scaling[project_time_unit]

    validation_parts = _duration_kwargs(settings['validation_duration'],
                                        field, factor)
    holdout_parts = _duration_kwargs(settings['holdout_duration'], field,
                                     factor)

    if validation_parts is None:
        validation_duration = None
    else:
        validation_duration = (
            dr.partitioning_methods.construct_duration_string(
                **validation_parts))

    if settings['disable_holdout']:
        settings['holdout_duration'] = None
        settings['holdout_start_date'] = None
    elif holdout_parts is not None:
        settings['holdout_start_date'] = settings['max_date'] - dt.timedelta(
            **holdout_parts)
        settings['holdout_duration'] = (
            dr.partitioning_methods.construct_duration_string(
                **holdout_parts))
    # Otherwise no holdout length was supplied: the holdout duration stays
    # None and any caller-supplied holdout_start_date is passed through.

    # A missing series id must become None rather than [None]; an id that is
    # already a list/tuple of columns is used as-is instead of being nested.
    series_id = settings['series_id']
    if series_id is None:
        multiseries_cols = None
    elif isinstance(series_id, (list, tuple)):
        multiseries_cols = list(series_id)
    else:
        multiseries_cols = [series_id]

    # 'num_backtests' wins; 'number_of_backtests' is honored as an alias.
    num_backtests = settings['num_backtests']
    if num_backtests is None:
        num_backtests = settings['number_of_backtests']

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        feature_settings=settings['known_in_advance'],
        validation_duration=validation_duration,
        datetime_partition_column=date_col,
        use_time_series=settings['use_time_series'],
        disable_holdout=settings['disable_holdout'],
        holdout_start_date=settings['holdout_start_date'],
        holdout_duration=settings['holdout_duration'],
        multiseries_id_columns=multiseries_cols,
        # Window bounds are optional: None is passed through unchanged
        # instead of crashing in int().
        forecast_window_start=_optional_int(settings['fd_start']),
        forecast_window_end=_optional_int(settings['fd_end']),
        feature_derivation_window_start=_optional_int(settings['fdw_start']),
        feature_derivation_window_end=_optional_int(settings['fdw_end']),
        number_of_backtests=num_backtests,
        calendar_id=settings['calendar_id'],
        use_cross_series_features=settings['use_cross_series_features'],
        aggregation_type=settings['aggregation_type'],
        cross_series_group_by_columns=settings[
            'cross_series_group_by_columns'],
    )

    ################
    # Create Project
    ################
    # Announce the upload before it starts; it can take a long time.
    print(f'Creating project {project_name} ...')

    project = dr.Project.create(project_name=project_name,
                                sourcedata=df,
                                max_wait=14400,
                                read_timeout=14400)

    #################
    # Start Autopilot
    #################
    project.set_target(
        target=settings['target'],
        metric=settings['metric'],
        mode=settings['mode'],
        advanced_options=adv_opts,
        worker_count=-1,
        partitioning_method=time_partition,
        max_wait=14400,
    )

    return project