Example No. 1
def _get_training_data(country, prediction_date, all_features,
                       global_parameters: GlobalParameters, environment,
                       influx_exporter):
    unprepared_training_df = pd.DataFrame(columns=[
        GIVER_ID, ACTION_CODE, FEEDBACK, COUNTRY, ASSIGNED_DATE, RECEIVER_ID,
        MODEL_ID, MATCHING_DATE, LOGPROB
    ])
    if global_parameters.experimental_group > 0.0:
        unprepared_training_df = fetch_training_data(
            country, prediction_date, global_parameters.feedback_weeks,
            all_features)
        if environment != Environment.DEVELOPMENT:
            _log_event_stats(unprepared_training_df, influx_exporter)
    else:
        log.info(
            'Size of the experimental group is 0%. No events will be collected.'
        )
    if unprepared_training_df.empty:
        log.info(
            'Training data is empty. The actions will be selected uniformly at random.'
        )

    training_df, training_meta_data = prepare_training_data(
        all_features, prediction_date, unprepared_training_df)

    return training_df, training_meta_data
Example No. 2
    def write_string(self, string, file_id, bucket, partition_params):
        path = project_parameters.compile_path(
            partition_params, file_id, 'txt')
        local_path = self._create_local_path(bucket, path)
        log.info(f'Writing {file_id} to {local_path}')
        with open(local_path, 'w') as f:
            f.write(string)
Example No. 3
    def _random_forest_segmentation(self,
                                    dataset) -> List[List[Tuple[str, int]]]:
        log.debug('Dataset before running tree algorithm:\n'
                  f'{dataset.head().to_string()}\n')

        max_gain = 0.0
        max_segmentation: List[List[Tuple[str, int]]] = []
        log.info('Creating segmentation...')
        for _ in range(self._parameters.segment_num_trees):

            dataset.loc[:, 'norm_feedback'] = min_max_normalize(
                dataset['feedback'])

            segmentation_builder = TreeSegmentationBuilder(
                dataset.drop('feedback', axis=1), self._parameters)
            tree = segmentation_builder.build_tree(self._has_experian)
            segments = _get_segments(tree)
            segmentation_df = self._segment_dataframe(dataset, segments)
            tree_gain_vs_no_action = self._evaluate_segmentation(
                segmentation_df)

            if tree_gain_vs_no_action > max_gain:
                max_gain = tree_gain_vs_no_action
                max_segmentation = segments

        log.debug(
            f'Expected gain of the best tree vs no action = {max_gain}\n')
        log.info(f'Segments: {max_segmentation}')

        return max_segmentation
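The helper min_max_normalize used above is not shown in these snippets. A minimal sketch of what it is assumed to do, rescaling the feedback column linearly into the [0, 1] range, could look like this (the constant-column guard is an assumption, not the project's actual code):

import pandas as pd

def min_max_normalize(series: pd.Series) -> pd.Series:
    # Assumed behaviour: map the smallest value to 0 and the largest to 1.
    span = series.max() - series.min()
    if span == 0:
        # Constant column: return zeros (assumption; the real helper may differ).
        return pd.Series(0.0, index=series.index)
    return (series - series.min()) / span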
Example No. 4
def _log_config(model_id, country, prediction_date, environment,
                global_parameters, model_parameters):
    print_header('CONFIG')
    config_str = _config_str(model_id, country, prediction_date,
                             global_parameters, model_parameters, environment)
    log.info(config_str)
    print_footer()
    writers.writer.write_string(
        config_str, RUN_CONFIG_FILE_ID, AWS_PARAMETERS.s3_core_data_bucket,
        partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))
Example No. 5
    def _predict(self, predict_df):
        predict_df = self._feature_pipeline.transform(predict_df)
        predict_df = predict_df[[GIVER_ID, SEGMENT]]
        predict_df = self._model_pipeline.transform(predict_df)

        log.info(
            f'Segments in prediction data: {", ".join(predict_df.segment.unique())}'
        )

        return predict_df
Example No. 6
    def fit(self, df: pd.DataFrame):
        original_df = df.copy()
        df = df.drop([GIVER_ID], axis=1)
        self._input_feature_columns = get_feature_columns(
            df, self._non_feature_columns)
        self._segmentation = self._random_forest_segmentation(df)
        if not _valid_segmentation(self._segmentation):
            log.info('Could not find a confident segmentation.')
        df = self._segment_dataframe(original_df, self._segmentation)
        log.info(f'Final segmentation: {", ".join(df.segment.unique())}')
        return df
Example No. 7
    def validate(self, global_parameters: GlobalParameters):
        log.info('Validating model output...')
        self._validate_group(
            CONTROL, 0, 'not_control',
            1 - global_parameters.control - global_parameters.other_action)
        self._validate_group(CONTROL, 1, 'control', global_parameters.control)
        self._validate_group(
            CONTROL, 2, 'other_action', global_parameters.other_action)
        explore_percentage = (
            (1 - global_parameters.control - global_parameters.other_action)
            * global_parameters.exploration)
        self._validate_group(
            EXPLORATION, 0, 'not_explore', 1 - explore_percentage)
        self._validate_group(EXPLORATION, 1, 'explore', explore_percentage)
        log.info('Done!')
        return self._output_df
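To make the group-size check concrete, here is a small worked computation of explore_percentage with purely hypothetical parameter values (control = 0.10, other_action = 0.05, exploration = 0.20):

control = 0.10        # hypothetical value
other_action = 0.05   # hypothetical value
exploration = 0.20    # hypothetical value

# Share of customers left once the control and other_action groups are removed.
remaining = 1 - control - other_action        # 0.85
# The exploration group is taken as a fraction of that remainder.
explore_percentage = remaining * exploration  # 0.85 * 0.20 = 0.17
# So 17% of all customers would be explored and the remaining 68% (0.85 - 0.17)
# would receive the model's greedy action.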
Example No. 8
def get_raw_prediction_data(country, ref_date, features):
    complete_prediction_data_query = compile_prediction_data_query(
        country, ref_date, features)

    writer.write_string(
        complete_prediction_data_query, 'prediction_data_query',
        AWS_PARAMETERS.s3_core_data_bucket,
        partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))

    log.info('Fetching data for customers who will be assigned actions...')
    unprepared_prediction_df = plds.db.run_dwh_query(
        complete_prediction_data_query)
    unprepared_prediction_df = unprepared_prediction_df.drop_duplicates(
        subset=[GIVER_ID])

    return unprepared_prediction_df[prediction_data_columns(features)]
Example No. 9
def send_all_event_stats(events_per_country_date):
    log.info(
        f'Exporting event statistics to InfluxDB, {INFLUXDB_HOST}:{INFLUXDB_DATABASE}'
    )
    points = []
    for key, value in events_per_country_date.to_dict().items():
        points.append({
            'measurement': 'happy_all_events',
            'tags': {
                'country': key[0]
            },
            'fields': {
                'count': value
            },
            'time': key[1].isoformat()
        })
    plds.grokana.send_to_influxdb(points)
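send_all_event_stats assumes events_per_country_date is a pandas Series whose index combines a country code and a date, so that .to_dict() yields (country, date) tuples as keys. A hedged sketch of how such an input could be built for a local test (column names and values are hypothetical):

import pandas as pd

# Hypothetical raw events; only the grouping shape matters here.
events = pd.DataFrame({
    'country': ['DE', 'DE', 'AT'],
    'date': pd.to_datetime(['2021-03-01', '2021-03-01', '2021-03-02']),
})
# Series indexed by (country, date), counting events per pair.
events_per_country_date = events.groupby(['country', 'date']).size()
send_all_event_stats(events_per_country_date)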
Example No. 10
def fetch_training_data(country, prediction_date, feedback_weeks, features):
    dataset_query = compile_dataset_query(
        project_parameters.model_version, country, prediction_date,
        feedback_weeks, internal_config.EVENT_TABLE_IDENTIFIER, features)
    writer.write_string(
        dataset_query, 'dataset_query', AWS_PARAMETERS.s3_core_data_bucket,
        partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))

    log.info('Fetching training data...')
    unprepared_training_df = plds.db.run_dwh_query(dataset_query)

    unprepared_training_df = unprepared_training_df.drop_duplicates(
        subset=[MODEL_ID, GIVER_ID, RECEIVER_ID])

    if unprepared_training_df.empty:
        log.info('No valid events found.')

    return unprepared_training_df
Example No. 11
    def send_action_distribution(self, model_action_distribution):
        log.info(
            f'Exporting action distribution to InfluxDB, {INFLUXDB_HOST}:{INFLUXDB_DATABASE}'
        )
        points = []
        for key, value in model_action_distribution.to_dict().items():
            points.append({
                'measurement': 'happy_action',
                'tags': {
                    'country': self.country,
                    'model_id': self.model_id,
                    'action': key
                },
                'fields': {
                    'value': value
                },
                'time': self.elaboration_date.isoformat()
            })
        plds.grokana.send_to_influxdb(points)
Example No. 12
def fetch_simulation_data(country, prediction_date, feedback_weeks,
                          model_versions, features):
    log.info('Fetching dataset for simulation...')
    datasets = []
    for model_version in model_versions:
        dataset_query = compile_dataset_query(
            model_version, country, prediction_date, feedback_weeks,
            internal_config.EVENT_TABLE_IDENTIFIER, features)
        writer.write_string(
            dataset_query, 'dataset_query', AWS_PARAMETERS.s3_core_data_bucket,
            partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))

        unprepared_training_df = plds.db.run_dwh_query(dataset_query)
        unprepared_training_df = unprepared_training_df.drop_duplicates(
            subset=[MODEL_ID, GIVER_ID, RECEIVER_ID])
        datasets.append(unprepared_training_df)

        if unprepared_training_df.empty:
            log.info('No valid events found.')

    dataset = pd.concat(datasets)
    return dataset
Example No. 13
def load_run_config_from_local(country, path):
    log.info(f'Loading run config from {path}')
    run_config = toml.load(path)
    return load_config(run_config, country)
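load_run_config_from_local assumes path points to a TOML file that toml.load can parse and that load_config knows how to interpret for the given country. The file contents below are purely illustrative placeholders, not the project's actual schema:

# run_config.toml (hypothetical contents):
#
#   [DE]
#   control = 0.1
#   other_action = 0.05
#   exploration = 0.2

config = load_run_config_from_local('DE', 'run_config.toml')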
Example No. 14
def run_model(country, prediction_date_str, environment,
              model_parameters: ModelParameters,
              global_parameters: GlobalParameters):
    prediction_date = datetime.strptime(prediction_date_str, '%Y-%m-%d')
    model_id = f'model-{prediction_date_str}-{country}-{random_str(4)}'
    set_partition_parameters(country, model_id)
    _log_config(model_id, country, prediction_date_str, environment,
                global_parameters, model_parameters)

    influx_exporter = InfluxExporter(country, prediction_date, model_id)

    # Seed numpy from the prediction date string so a rerun for the same date
    # reproduces the same random draws.
    np.random.seed(string_seed(prediction_date_str))

    feature_set = FeatureSet.create_for_country(country)

    all_features = feature_set.all_features()
    training_df, training_meta_data = _get_training_data(
        country, prediction_date, all_features, global_parameters, environment,
        influx_exporter)
    log.info('Training data value counts:')
    log.info(training_df.action_code.value_counts())

    log.info(f'Training data shape = {training_df.shape}')
    log.info(f'Training data columns = {list(training_df.columns)}')

    agent = SegmentedEpsGreedyAgent(feature_set, NON_FEATURE_COLUMNS,
                                    global_parameters.actions,
                                    global_parameters.default_action,
                                    global_parameters.experimental_group,
                                    model_parameters, training_meta_data)
    agent.train(training_df)

    prediction_df = get_raw_prediction_data(country, prediction_date_str,
                                            all_features)
    _validate_prediction_data(prediction_df)
    log.info(f'Prediction data shape = {prediction_df.shape}')
    log.info(f'Prediction data columns = {list(prediction_df.columns)}')
    prediction_df, other_groups_df = set_experiment_groups(
        prediction_df, global_parameters)
    log.info(
        f'Customers remaining after control and exploration = {len(prediction_df)}'
    )

    predictions_df = agent.predict(prediction_df)

    model_output_df = _get_model_output(predictions_df, other_groups_df,
                                        country, prediction_date_str, model_id,
                                        global_parameters, influx_exporter)

    if environment == Environment.PRODUCTION:
        write_final_output(
            model_output_df,
            f'model_id={model_id}/country={country}/{model_id}.csv')
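A hedged sketch of how run_model might be invoked; the argument values are placeholders and the parameter objects are assumed to be GlobalParameters / ModelParameters instances built elsewhere (for example via the run-config loaders in Examples No. 13 and 15):

# Hypothetical invocation with placeholder values.
run_model(country='DE',
          prediction_date_str='2021-03-01',
          environment=Environment.DEVELOPMENT,
          model_parameters=model_parameters,
          global_parameters=global_parameters)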
Example No. 15
def load_run_config_from_s3(country):
    s3_key = project_parameters.compile_path({}, 'run_config', 'toml')
    log.info(f'Loading run config from {s3_key}')
    with S3FileSystem().open(f'{AWS_PARAMETERS.s3_config_bucket}/{s3_key}') as f:
        return load_config(toml.loads(f.read().decode()), country)
Example No. 16
def print_header(heading):
    log.info(heading)
    log.info('=' * 50)
Example No. 17
def print_footer():
    log.info('=' * 50)
Example No. 18
    def log_stats(self):
        model_output = self._output_df
        print_header('OUTPUT')
        log.info(f'Control value distribution:\n'
                 f'{model_output.control.value_counts().sort_index()}\n')
        log.info(f'Exploration value distribution:\n'
                 f'{model_output.exploration.value_counts().sort_index()}\n')

        logprob_value_counts = model_output.logprob \
            .apply(round, ndigits=3) \
            .value_counts() \
            .nlargest(10) \
            .sort_index(ascending=False)
        log.info(f'''Logprob value distribution:\n{logprob_value_counts}\n''')

        log.info(f'Action distribution for all customers:\n'
                 f'{model_output.action_code.value_counts().sort_index()}\n')

        log.info(
            f'Action distribution for exploration group:\n'
            f'{model_output[model_output["exploration"] == 1].action_code.value_counts().sort_index()}\n'
        )

        model_action_distribution = self.model_action_distribution()
        log.info(
            f'''Action distribution for experimental group:\n{model_action_distribution.sort_index()}\n'''
        )

        print_footer()
Example No. 19
    def write_dataframe(self, df, file_id, bucket, partition_params):
        res = df.to_csv(index=False)
        path = project_parameters.compile_path(
            partition_params, file_id, 'csv')
        log.info(f'Writing {file_id} to {path}')
        _put_object_to_s3(res, bucket, path)
Example No. 20
    def write_dataframe(self, df, file_id, bucket, partition_params):
        path = project_parameters.compile_path(
            partition_params, file_id, 'csv')
        local_path = self._create_local_path(bucket, path)
        log.info(f'Writing {file_id} to {local_path}')
        df.to_csv(local_path, index=False)
Example No. 21
    def write_string(self, string, file_id, bucket, partition_params):
        path = project_parameters.compile_path(
            partition_params, file_id, 'txt')
        log.info(f'Writing {file_id} to {path}')
        _put_object_to_s3(string, bucket, path)
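The helper _put_object_to_s3 used by the S3 writers (Examples No. 19 and 21) is not part of these snippets. A minimal sketch of what such a helper could look like with boto3; this is an assumption about its implementation, not the project's actual code:

import boto3

def _put_object_to_s3(body: str, bucket: str, path: str):
    # Assumed implementation: upload the string as an object under the given key.
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket, Key=path, Body=body.encode('utf-8'))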