def _get_training_data(country, prediction_date, all_features,
                       global_parameters: GlobalParameters, environment,
                       influx_exporter):
    """Fetch training events for the experimental group, falling back to an
    empty frame (actions chosen uniformly at random) when none exist."""
    unprepared_training_df = pd.DataFrame(columns=[
        GIVER_ID, ACTION_CODE, FEEDBACK, COUNTRY, ASSIGNED_DATE, RECEIVER_ID,
        MODEL_ID, MATCHING_DATE, LOGPROB
    ])
    if global_parameters.experimental_group > 0.0:
        unprepared_training_df = fetch_training_data(
            country, prediction_date, global_parameters.feedback_weeks,
            all_features)
        if environment != Environment.DEVELOPMENT:
            _log_event_stats(unprepared_training_df, influx_exporter)
    else:
        log.info(
            'Size of the experimental group is 0%. No events will be collected.'
        )
    if unprepared_training_df.empty:
        log.info(
            'Training data is empty. The actions will be selected uniformly at random.'
        )
    training_df, training_meta_data = prepare_training_data(
        all_features, prediction_date, unprepared_training_df)
    return training_df, training_meta_data
def write_string(self, string, file_id, bucket, partition_params):
    path = project_parameters.compile_path(partition_params, file_id, 'txt')
    local_path = self._create_local_path(bucket, path)
    log.info(f'Writing {file_id} to {local_path}')
    with open(local_path, 'w') as f:
        f.write(string)
def _random_forest_segmentation(self, dataset) -> List[List[Tuple[str, int]]]:
    """Build several candidate segmentation trees and keep the one with the
    highest expected gain versus taking no action."""
    log.debug('Dataset before running tree algorithm:\n'
              f'{dataset.head().to_string()}\n')
    max_gain = 0.0
    max_segmentation: List[List[Tuple[str, int]]] = []
    log.info('Creating segmentation...')
    for _ in range(self._parameters.segment_num_trees):
        dataset.loc[:, 'norm_feedback'] = min_max_normalize(
            dataset['feedback'])
        segmentation_builder = TreeSegmentationBuilder(
            dataset.drop('feedback', axis=1), self._parameters)
        tree = segmentation_builder.build_tree(self._has_experian)
        segments = _get_segments(tree)
        segmentation_df = self._segment_dataframe(dataset, segments)
        tree_gain_vs_no_action = self._evaluate_segmentation(segmentation_df)
        if tree_gain_vs_no_action > max_gain:
            max_gain = tree_gain_vs_no_action
            max_segmentation = segments
    log.debug(f'Expected gain of the best tree vs no action = {max_gain}\n')
    log.info(f'Segments: {max_segmentation}')
    return max_segmentation
def _log_config(model_id, country, prediction_date, environment,
                global_parameters, model_parameters):
    print_header('CONFIG')
    config_str = _config_str(model_id, country, prediction_date,
                             global_parameters, model_parameters, environment)
    log.info(config_str)
    print_footer()
    writers.writer.write_string(
        config_str, RUN_CONFIG_FILE_ID, AWS_PARAMETERS.s3_core_data_bucket,
        partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))
def _predict(self, predict_df):
    predict_df = self._feature_pipeline.transform(predict_df)
    predict_df = predict_df[[GIVER_ID, SEGMENT]]
    predict_df = self._model_pipeline.transform(predict_df)
    log.info(
        f'Segments in prediction data: {", ".join(predict_df.segment.unique())}'
    )
    return predict_df
def fit(self, df: pd.DataFrame):
    original_df = df.copy()
    df = df.drop([GIVER_ID], axis=1)
    self._input_feature_columns = get_feature_columns(
        df, self._non_feature_columns)
    self._segmentation = self._random_forest_segmentation(df)
    if not _valid_segmentation(self._segmentation):
        log.info('Could not find a confident segmentation.')
    df = self._segment_dataframe(original_df, self._segmentation)
    log.info(f'Final segmentation: {", ".join(df.segment.unique())}')
    return df
def validate(self, global_parameters: GlobalParameters):
    log.info('Validating model output...')
    self._validate_group(
        CONTROL, 0, 'not_control',
        1 - global_parameters.control - global_parameters.other_action)
    self._validate_group(CONTROL, 1, 'control', global_parameters.control)
    self._validate_group(CONTROL, 2, 'other_action',
                         global_parameters.other_action)
    explore_percentage = (
        (1 - global_parameters.control - global_parameters.other_action)
        * global_parameters.exploration)
    self._validate_group(EXPLORATION, 0, 'not_explore',
                         1 - explore_percentage)
    self._validate_group(EXPLORATION, 1, 'explore', explore_percentage)
    log.info('Done!')
    return self._output_df
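# Worked example of the expected group shares validated above (the values
# are illustrative, not defaults from this codebase): with control = 0.10,
# other_action = 0.05 and exploration = 0.20, the experimental share is
# 1 - 0.10 - 0.05 = 0.85, so explore_percentage = 0.85 * 0.20 = 0.17 and
# the 'not_explore' group is validated against 1 - 0.17 = 0.83.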
def get_raw_prediction_data(country, ref_date, features):
    complete_prediction_data_query = compile_prediction_data_query(
        country, ref_date, features)
    writer.write_string(
        complete_prediction_data_query, 'prediction_data_query',
        AWS_PARAMETERS.s3_core_data_bucket,
        partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))
    log.info('Fetching data for customers which will be assigned actions...')
    unprepared_prediction_df = plds.db.run_dwh_query(
        complete_prediction_data_query)
    unprepared_prediction_df = unprepared_prediction_df.drop_duplicates(
        subset=[GIVER_ID])
    return unprepared_prediction_df[prediction_data_columns(features)]
def send_all_event_stats(events_per_country_date):
    log.info(
        f'Exporting event statistics to InfluxDB, {INFLUXDB_HOST}:{INFLUXDB_DATABASE}'
    )
    points = []
    for key, value in events_per_country_date.to_dict().items():
        points.append({
            'measurement': 'happy_all_events',
            'tags': {
                'country': key[0]
            },
            'fields': {
                'count': value
            },
            'time': key[1].isoformat()
        })
    plds.grokana.send_to_influxdb(points)
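# send_all_event_stats expects its input to be keyed by (country, date)
# pairs: key[0] becomes the 'country' tag and key[1] must support
# .isoformat(). One way to build such an input (an assumption, not shown
# in this module):
#   events_per_country_date = events_df.groupby([COUNTRY, ASSIGNED_DATE]).size()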
def fetch_training_data(country, prediction_date, feedback_weeks, features):
    dataset_query = compile_dataset_query(
        project_parameters.model_version, country, prediction_date,
        feedback_weeks, internal_config.EVENT_TABLE_IDENTIFIER, features)
    writer.write_string(
        dataset_query, 'dataset_query', AWS_PARAMETERS.s3_core_data_bucket,
        partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))
    log.info('Fetching training data...')
    unprepared_training_df = plds.db.run_dwh_query(dataset_query)
    unprepared_training_df = unprepared_training_df.drop_duplicates(
        subset=[MODEL_ID, GIVER_ID, RECEIVER_ID])
    if unprepared_training_df.empty:
        log.info('No valid events found.')
    return unprepared_training_df
def send_action_distribution(self, model_action_distribution):
    log.info(
        f'Exporting action distribution to InfluxDB, {INFLUXDB_HOST}:{INFLUXDB_DATABASE}'
    )
    points = []
    for key, value in model_action_distribution.to_dict().items():
        points.append({
            'measurement': 'happy_action',
            'tags': {
                'country': self.country,
                'model_id': self.model_id,
                'action': key
            },
            'fields': {
                'value': value
            },
            'time': self.elaboration_date.isoformat()
        })
    plds.grokana.send_to_influxdb(points)
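# Here each key is an action code and each value its share of the
# experimental group. A plausible way to produce the input, assuming it is
# derived from the model output (not confirmed by this module):
#   model_action_distribution = output_df.action_code.value_counts(normalize=True)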
def fetch_simulation_data(country, prediction_date, feedback_weeks,
                          model_versions, features):
    log.info('Fetching dataset for simulation...')
    datasets = []
    for model_version in model_versions:
        dataset_query = compile_dataset_query(
            model_version, country, prediction_date, feedback_weeks,
            internal_config.EVENT_TABLE_IDENTIFIER, features)
        writer.write_string(
            dataset_query, 'dataset_query',
            AWS_PARAMETERS.s3_core_data_bucket,
            partitions(COUNTRY_PARTITION_KEY, MODEL_ID_PARTITION_KEY))
        unprepared_training_df = plds.db.run_dwh_query(dataset_query)
        unprepared_training_df = unprepared_training_df.drop_duplicates(
            subset=[MODEL_ID, GIVER_ID, RECEIVER_ID])
        datasets.append(unprepared_training_df)
        if unprepared_training_df.empty:
            log.info('No valid events found.')
    dataset = pd.concat(datasets)
    return dataset
def load_run_config_from_local(country, path):
    log.info(f'Loading run config from {path}')
    run_config = toml.load(path)
    return load_config(run_config, country)
def run_model(country, prediction_date_str, environment,
              model_parameters: ModelParameters,
              global_parameters: GlobalParameters):
    """End-to-end run: fetch training data, train the agent, assign actions
    to the prediction population and persist the model output."""
    prediction_date = datetime.strptime(prediction_date_str, '%Y-%m-%d')
    model_id = f'model-{prediction_date_str}-{country}-{random_str(4)}'
    set_partition_parameters(country, model_id)
    _log_config(model_id, country, prediction_date_str, environment,
                global_parameters, model_parameters)
    influx_exporter = InfluxExporter(country, prediction_date, model_id)
    np.random.seed(string_seed(prediction_date_str))
    feature_set = FeatureSet.create_for_country(country)
    all_features = feature_set.all_features()
    training_df, training_meta_data = _get_training_data(
        country, prediction_date, all_features, global_parameters,
        environment, influx_exporter)
    log.info('Training data value counts:')
    log.info(training_df.action_code.value_counts())
    log.info(f'Training data shape = {training_df.shape}')
    log.info(f'Training data columns = {list(training_df.columns)}')
    agent = SegmentedEpsGreedyAgent(feature_set, NON_FEATURE_COLUMNS,
                                    global_parameters.actions,
                                    global_parameters.default_action,
                                    global_parameters.experimental_group,
                                    model_parameters, training_meta_data)
    agent.train(training_df)
    prediction_df = get_raw_prediction_data(country, prediction_date_str,
                                            all_features)
    _validate_prediction_data(prediction_df)
    log.info(f'Prediction data shape = {prediction_df.shape}')
    log.info(f'Prediction data columns = {list(prediction_df.columns)}')
    prediction_df, other_groups_df = set_experiment_groups(
        prediction_df, global_parameters)
    log.info(
        f'Customers remaining after control and exploration = {len(prediction_df)}'
    )
    predictions_df = agent.predict(prediction_df)
    model_output_df = _get_model_output(predictions_df, other_groups_df,
                                        country, prediction_date_str,
                                        model_id, global_parameters,
                                        influx_exporter)
    if environment == Environment.PRODUCTION:
        write_final_output(
            model_output_df,
            f'model_id={model_id}/country={country}/{model_id}.csv')
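# Minimal invocation sketch. The country code and date below are
# illustrative, and unpacking load_run_config_from_s3 into a
# (global_parameters, model_parameters) pair is an assumption about
# load_config's return value, not something this module confirms.
if __name__ == '__main__':
    global_parameters, model_parameters = load_run_config_from_s3('SE')
    run_model('SE', '2024-01-01', Environment.DEVELOPMENT, model_parameters,
              global_parameters)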
def load_run_config_from_s3(country):
    s3_key = project_parameters.compile_path({}, 'run_config', 'toml')
    log.info(f'Loading run config from {s3_key}')
    with S3FileSystem().open(
            f'{AWS_PARAMETERS.s3_config_bucket}/{s3_key}') as f:
        return load_config(toml.loads(f.read().decode()), country)
def print_header(heading):
    log.info(heading)
    log.info('=' * 50)
def print_footer():
    log.info('=' * 50)
def log_stats(self):
    model_output = self._output_df
    print_header('OUTPUT')
    log.info(f'Control value distribution:\n'
             f'{model_output.control.value_counts().sort_index()}\n')
    log.info(f'Exploration value distribution:\n'
             f'{model_output.exploration.value_counts().sort_index()}\n')
    logprob_value_counts = model_output.logprob \
        .apply(round, ndigits=3) \
        .value_counts() \
        .nlargest(10) \
        .sort_index(ascending=False)
    log.info(f'Logprob value distribution:\n{logprob_value_counts}\n')
    log.info(f'Action distribution for all customers:\n'
             f'{model_output.action_code.value_counts().sort_index()}\n')
    log.info(
        f'Action distribution for exploration group:\n'
        f'{model_output[model_output["exploration"] == 1].action_code.value_counts().sort_index()}\n'
    )
    model_action_distribution = self.model_action_distribution()
    log.info(f'Action distribution for experimental group:\n'
             f'{model_action_distribution.sort_index()}\n')
    print_footer()
def write_dataframe(self, df, file_id, bucket, partition_params):
    res = df.to_csv(index=False)
    path = project_parameters.compile_path(partition_params, file_id, 'csv')
    log.info(f'Writing {file_id} to {path}')
    _put_object_to_s3(res, bucket, path)
def write_dataframe(self, df, file_id, bucket, partition_params):
    path = project_parameters.compile_path(partition_params, file_id, 'csv')
    local_path = self._create_local_path(bucket, path)
    log.info(f'Writing {file_id} to {local_path}')
    df.to_csv(local_path, index=False)
def write_string(self, string, file_id, bucket, partition_params):
    path = project_parameters.compile_path(partition_params, file_id, 'txt')
    log.info(f'Writing {file_id} to {path}')
    _put_object_to_s3(string, bucket, path)