proj = dr.Project.create(sourcedata=filename,
                         project_name=project_name,
                         max_wait=3600)
print('Project ID: {}'.format(proj.id))


# ## Identify Known-In-Advance Features
# This dataset has five columns that will always be known-in-advance and available for prediction.

# In[5]:


known_in_advance = ['Marketing', 'Near_Xmas', 'Near_BlackFriday',
                    'Holiday', 'DestinationEvent']

feature_settings = [dr.FeatureSettings(feat_name,
                                       known_in_advance=True)
                    for feat_name in known_in_advance]


# ## Create a Partition Specification
# This problem has a time component to it, and it would be bad practice to train on data from the present and predict on the past. We could manually add a column to the dataset to indicate which rows should be used for training, test, and validation, but it is straightforward to allow DataRobot to do it automatically. This dataset contains sales data from multiple individual stores so we use `multiseries_id_columns` to tell DataRobot there are actually multiple time series in this file and to indicate the column that identifies the series each row belongs to.

# In[6]:


time_partition = dr.DatetimePartitioningSpecification(
    datetime_partition_column='Date',
    multiseries_id_columns=['Store'],
    use_time_series=True,
    feature_settings=feature_settings,
)
Ejemplo n.º 2
0
def create_dr_project(df, project_name, ts_settings, **advanced_options):
    """
    Kickoff single DataRobot project

    df: pandas df
    project_name: name of project
    ts_settings: dictionary of parameters for time series project

    Returns:
    --------
    DataRobot project object

    """

    print(f'Building Next Project \n...\n')

    #######################
    # Get Advanced Options
    #######################
    opts = {
        'weights': None,
        'response_cap': None,
        'blueprint_threshold': None,
        'seed': None,
        'smart_downsampled': False,
        'majority_downsampling_rate': None,
        'offset': None,
        'exposure': None,
        'accuracy_optimized_mb': None,
        'scaleout_modeling_mode': None,
        'events_count': None,
        'monotonic_increasing_featurelist_id': None,
        'monotonic_decreasing_featurelist_id': None,
        'only_include_monotonic_blueprints': None,
    }

    for opt in advanced_options.items():
        opts[opt[0]] = opt[1]

    opts = dr.AdvancedOptions(
        weights=opts['weights'],
        seed=opts['seed'],
        monotonic_increasing_featurelist_id=opts[
            'monotonic_increasing_featurelist_id'],
        monotonic_decreasing_featurelist_id=opts[
            'monotonic_decreasing_featurelist_id'],
        only_include_monotonic_blueprints=opts[
            'only_include_monotonic_blueprints'],
        accuracy_optimized_mb=opts['accuracy_optimized_mb'],
        smart_downsampled=opts['smart_downsampled'],
    )

    ############################
    # Get Datetime Specification
    ############################
    settings = {
        'max_date': None,
        'known_in_advance': None,
        'num_backtests': None,
        'validation_duration': None,
        'holdout_duration': None,
        'holdout_start_date': None,
        'disable_holdout': False,
        'number_of_backtests': None,
        'backtests': None,
        'use_cross_series_features': None,
        'aggregation_type': None,
        'cross_series_group_by_columns': None,
        'calendar_id': None,
        'use_time_series': False,
        'series_id': None,
        'metric': None,
        'target': None,
        'mode': dr.AUTOPILOT_MODE.FULL_AUTO,  # MANUAL #QUICK
        'date_col': None,
        'fd_start': None,
        'fd_end': None,
        'fdw_start': None,
        'fdw_end': None,
    }

    for s in ts_settings.items():
        settings[s[0]] = s[1]

    df[settings['date_col']] = pd.to_datetime(df[settings['date_col']])

    if settings['max_date'] is None:
        settings['max_date'] = df[settings['date_col']].max()
    else:
        settings['max_date'] = pd.to_datetime(settings['max_date'])

    if ts_settings['known_in_advance']:
        settings['known_in_advance'] = [
            dr.FeatureSettings(feat_name, known_in_advance=True)
            for feat_name in settings['known_in_advance']
        ]

    # Update validation and holdout duration, start, and end date
    project_time_unit, project_time_step = get_timestep(df, settings)

    validation_durations = {'minute': 0, 'hour': 0, 'day': 0, 'month': 0}
    holdout_durations = {'minute': 0, 'hour': 0, 'day': 0, 'month': 0}

    if project_time_unit == 'minute':
        validation_durations['minute'] = settings['validation_duration']
        holdout_durations['minute'] = settings['holdout_duration']

    elif project_time_unit == 'hour':
        validation_durations['hour'] = settings['validation_duration']
        holdout_durations['hour'] = settings['holdout_duration']

    elif project_time_unit == 'day':
        validation_durations['day'] = settings['validation_duration']
        holdout_durations['day'] = settings['holdout_duration']

    elif project_time_unit == 'week':
        validation_durations['day'] = settings['validation_duration'] * 7
        holdout_durations['day'] = settings['holdout_duration'] * 7

    elif project_time_unit == 'month':
        validation_durations['day'] = settings['validation_duration'] * 31
        holdout_durations['day'] = settings['holdout_duration'] * 31

    else:
        raise ValueError(f'{project_time_unit} is not a supported timestep')

    if settings['disable_holdout']:
        settings['holdout_duration'] = None
        settings['holdout_start_date'] = None
    else:
        settings['holdout_start_date'] = settings['max_date'] - dt.timedelta(
            minutes=holdout_durations['minute'],
            hours=holdout_durations['hour'],
            days=holdout_durations['day'],
        )

        settings[
            'holdout_duration'] = dr.partitioning_methods.construct_duration_string(
                minutes=holdout_durations['minute'],
                hours=holdout_durations['hour'],
                days=holdout_durations['day'],
            )

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        feature_settings=settings['known_in_advance'],
        # gap_duration = dr.partitioning_methods.construct_duration_string(years=0, months=0, days=0),
        validation_duration=dr.partitioning_methods.construct_duration_string(
            minutes=validation_durations['minute'],
            hours=validation_durations['hour'],
            days=validation_durations['day'],
        ),
        datetime_partition_column=settings['date_col'],
        use_time_series=settings['use_time_series'],
        disable_holdout=settings[
            'disable_holdout'],  # set this if disable_holdout is set to False
        holdout_start_date=settings['holdout_start_date'],
        holdout_duration=settings[
            'holdout_duration'],  # set this if disable_holdout is set to False
        multiseries_id_columns=[settings['series_id']],
        forecast_window_start=int(settings['fd_start']),
        forecast_window_end=int(settings['fd_end']),
        feature_derivation_window_start=int(settings['fdw_start']),
        feature_derivation_window_end=int(settings['fdw_end']),
        number_of_backtests=settings['num_backtests'],
        calendar_id=settings['calendar_id'],
        use_cross_series_features=settings['use_cross_series_features'],
        aggregation_type=settings['aggregation_type'],
        cross_series_group_by_columns=settings[
            'cross_series_group_by_columns'],
    )

    ################
    # Create Project
    ################
    project = dr.Project.create(project_name=project_name,
                                sourcedata=df,
                                max_wait=14400,
                                read_timeout=14400)

    print(f'Creating project {project_name} ...')

    #################
    # Start Autopilot
    #################
    project.set_target(
        target=settings['target'],
        metric=settings['metric'],
        mode=settings['mode'],
        advanced_options=opts,
        worker_count=-1,
        partitioning_method=time_partition,
        max_wait=14400,
    )

    return project