def rescore_metrics(data_set_path):
    """Re-score the current customer data and save the resulting segment data.

    Reloads the loading matrix, the score parameters and the current
    customer data for ``data_set_path``, verifies that all three refer to the
    same set of metrics, runs the skew and fat-tail transforms on the flagged
    columns, and then hands the data to the scoring, grouping and saving
    helpers.

    Args:
        data_set_path: Path identifying the data set; it is forwarded
            unchanged to every helper called here.

    Raises:
        AssertionError: If the index of the score parameters or of the
            loading matrix does not contain exactly the same names as the
            columns of the current data.
    """
    load_mat_df = reload_churn_data(
        data_set_path, 'load_mat', '6.4', is_customer_data=False)
    score_df = reload_churn_data(
        data_set_path, 'score_params', '7.5', is_customer_data=False)
    current_data = reload_churn_data(
        data_set_path, 'current', '8.2', is_customer_data=True)

    # The saved parameters are only meaningful if they describe exactly the
    # metrics present in the data being re-scored (order is not checked).
    assert set(score_df.index.values) == set(current_data.columns.values), \
        "Data to re-score does not match transform params"
    assert set(load_mat_df.index.values) == set(current_data.columns.values), \
        "Data to re-score does not match load matrix"

    # The return values are discarded, so the transforms presumably modify
    # current_data in place -- NOTE(review): confirm against the helpers.
    transform_skew_columns(
        current_data, score_df[score_df['skew_score']].index.values)
    transform_fattail_columns(
        current_data, score_df[score_df['fattail_score']].index.values)

    scaled_data = score_current_data(current_data, score_df, data_set_path)
    grouped_data = group_current_data(scaled_data, load_mat_df, data_set_path)
    save_segment_data(grouped_data, current_data, load_mat_df, data_set_path)
def rescore_metrics(data_set_path):
    """Re-score the current customer data and write the group scores to CSV.

    Reloads the current customer data together with the loading matrix,
    score parameters and summary statistics for ``data_set_path``. The data
    is clipped to the saved 1st/99th percentile values, transformed, scaled
    with the saved mean and standard deviation, and multiplied by the
    loading matrix. The result is written next to the data set with the
    ``.csv`` suffix replaced by ``_current_groupscore.csv``.

    Args:
        data_set_path: Path of the data set CSV; the output path is derived
            from it by replacing ``'.csv'``.

    Raises:
        AssertionError: If the index of the score parameters, loading matrix
            or summary statistics does not contain exactly the same names as
            the columns of the current data.
    """
    current_data = reload_churn_data(
        data_set_path, 'current', '8.2', is_customer_data=True)
    load_mat_df = reload_churn_data(
        data_set_path, 'load_mat', '6.4', is_customer_data=False)
    score_df = reload_churn_data(
        data_set_path, 'score_params', '7.5', is_customer_data=False)
    stats = reload_churn_data(
        data_set_path, 'summarystats', '5.2', is_customer_data=False)
    # The summary stats carry a row for the outcome label, which is not a
    # metric of the current data; remove it before comparing.
    stats.drop('is_churn', inplace=True)

    assert set(score_df.index.values) == set(current_data.columns.values), \
        "Data to re-score does not match transform params"
    assert set(load_mat_df.index.values) == set(current_data.columns.values), \
        "Data to re-score does not match load matrix"
    assert set(stats.index.values) == set(current_data.columns.values), \
        "Data to re-score does not match summary stats"

    # Return values are discarded, so these helpers presumably modify
    # current_data in place -- NOTE(review): confirm against the helpers.
    clip_hi_cols(current_data, stats['99pct'])
    clip_lo_cols(current_data, stats['1pct'])
    transform_skew_columns(
        current_data, score_df[score_df['skew_score']].index.values)
    # Fat-tailed columns are selected by their own flag. Selecting by
    # 'skew_score' here would transform the skewed columns a second time and
    # leave the fat-tailed ones untouched.
    transform_fattail_columns(
        current_data, score_df[score_df['fattail_score']].index.values)

    # Put the columns in the same order as the score parameters before
    # scaling with the saved mean and standard deviation.
    current_data = current_data[score_df.index.values]
    scaled_data = (current_data - score_df['mean']) / score_df['std']

    # Column order must match the row order of the loading matrix, because
    # the matrix product below is positional.
    scaled_data = scaled_data[load_mat_df.index.values]
    grouped_ndarray = np.matmul(scaled_data.to_numpy(), load_mat_df.to_numpy())
    current_data_grouped = pd.DataFrame(
        grouped_ndarray,
        columns=load_mat_df.columns.values,
        index=current_data.index,
    )

    score_save_path = data_set_path.replace('.csv', '_current_groupscore.csv')
    current_data_grouped.to_csv(score_save_path, header=True)
    print('Saving results to %s' % score_save_path)
def rescore_wcats(data_set_path, categories, groups):
    """Re-score the current data set that has categorical fields.

    Builds and aligns the dummy variables for the current data, re-scores
    the non-categorical metrics with the parameters saved for the
    ``_nocat`` data set, merges the grouped scores with the dummies on their
    index, writes the merged frame to ``*_current_groupscore.csv`` and passes
    the grouped scores on to ``save_segment_data_wcats``.

    Args:
        data_set_path: Path of the data set CSV; related file names are
            derived from it by replacing ``'.csv'``.
        categories: Forwarded unchanged to ``save_segment_data_wcats``.
        groups: Forwarded unchanged to ``dummy_variables``.

    Raises:
        AssertionError: If the index of the score parameters or of the
            loading matrix does not contain exactly the same names as the
            columns of the non-categorical current data.
    """
    # Dummy variables for the current data.
    dummy_variables(
        data_set_path.replace('.csv', '_current.csv'), groups, current=True)
    dummies_df = reload_churn_data(
        data_set_path, 'current_dummies_groupscore', '10.7',
        is_customer_data=True)
    align_dummies(dummies_df, data_set_path)

    # Parameters come from the "_nocat" data set; the data to score does not.
    nocat_path = data_set_path.replace('.csv', '_nocat.csv')
    loadings = reload_churn_data(
        nocat_path, 'load_mat', '6.4', is_customer_data=False)
    params = reload_churn_data(
        nocat_path, 'score_params', '7.5', is_customer_data=False)
    nocat_df = reload_churn_data(
        data_set_path, 'current_nocat', '10.7', is_customer_data=True)

    metric_names = set(nocat_df.columns.values)
    consistency_checks = (
        (params, "Data to re-score does not match transform params"),
        (loadings, "Data to re-score does not match loading matrix"),
    )
    for reference_df, message in consistency_checks:
        assert set(reference_df.index.values) == metric_names, message

    skew_metrics = params[params['skew_score']].index.values
    transform_skew_columns(nocat_df, skew_metrics)
    fattail_metrics = params[params['fattail_score']].index.values
    transform_fattail_columns(nocat_df, fattail_metrics)

    scaled = score_current_data(nocat_df, params, data_set_path)
    grouped = group_current_data(scaled, loadings, data_set_path)

    # Grouped scores and dummies are joined on their shared index.
    combined = grouped.merge(dummies_df, left_index=True, right_index=True)
    combined.to_csv(
        data_set_path.replace('.csv', '_current_groupscore.csv'), header=True)

    current_df = reload_churn_data(
        data_set_path, 'current', '10.7', is_customer_data=True)
    save_segment_data_wcats(
        grouped, current_df, loadings, data_set_path, categories)
def clipped_scores(data_set_path,skew_thresh=4.0):
    """Clip, transform and standardise the metrics of a churn data set.

    Reads the data set and its ``*_summarystats.csv`` companion, clips each
    metric using the saved ``1pct``/``99pct`` statistics, transforms the
    columns whose skew exceeds ``skew_thresh``, and converts every metric to
    a score by subtracting its mean and dividing by its standard deviation.
    The scores (with ``is_churn`` re-attached) go to ``*_scores.csv`` and the
    per-metric parameters go to ``*_score_params.csv``.

    Args:
        data_set_path: Path of the data set CSV; it must contain the
            ``account_id``, ``observation_date`` and ``is_churn`` columns.
        skew_thresh: Metrics whose ``skew`` statistic is above this value are
            transformed; the skew transform is used when ``min >= 0`` and the
            fat-tail transform when ``min < 0``.

    Raises:
        AssertionError: If the summary statistics file does not exist.
    """
    churn_data = pd.read_csv(data_set_path).set_index(
        ['account_id', 'observation_date'])
    # Work on an independent copy of the metrics so the outcome column in
    # churn_data stays available for re-attachment below.
    data_scores = churn_data.drop(columns=['is_churn']).copy()

    stat_path = data_set_path.replace('.csv', '_summarystats.csv')
    assert os.path.isfile(stat_path), \
        'You must running listing 5.2 first to generate stats'
    # Row 'is_churn' describes the outcome, not a metric.
    stats = pd.read_csv(stat_path, index_col=0).drop('is_churn')

    clip_hi_cols(data_scores, stats['99pct'])
    clip_lo_cols(data_scores, stats['1pct'])

    above_thresh = stats['skew'] > skew_thresh
    skewed_columns = above_thresh & (stats['min'] >= 0)
    fattail_columns = above_thresh & (stats['min'] < 0)
    transform_skew_columns(data_scores, skewed_columns[skewed_columns].index)
    transform_fattail_columns(
        data_scores, fattail_columns[fattail_columns].index)

    mean_vals = data_scores.mean()
    std_vals = data_scores.std()
    data_scores = (data_scores - mean_vals) / std_vals
    data_scores['is_churn'] = churn_data['is_churn']

    score_save_path = data_set_path.replace('.csv', '_scores.csv')
    data_scores.to_csv(score_save_path, header=True)
    print('Saving results to %s' % score_save_path)

    param_save_path = data_set_path.replace('.csv', '_score_params.csv')
    param_columns = {
        'skew_score': skewed_columns,
        'fattail_score': fattail_columns,
        'mean': mean_vals,
        'std': std_vals,
    }
    pd.DataFrame(param_columns).to_csv(param_save_path, header=True)
    print('Saving params to %s' % param_save_path)