def make_table_details(df, model_spec):
    """Return the column table of per-row details for a single model spec.

    The rows of df whose 'model_spec' value equals model_spec are ordered by
    'effectivedatetime' and turned into one detail line each, holding the
    values of the reported columns in a fixed order.

    Side effect: a CSV restricted to the reported columns is written to
    control.path['out_details_csv'].

    Returns the value produced by columns_table() for those detail lines.
    """
    print('starting make_table_details()')

    column_names = [
        'trace_index',
        'effectivedatetime',
        'quantity',
        'actual',
        'prediction',
        'absolute_error',
    ]

    def as_cell(row, name):
        'return the value of row[name] as it should appear in a detail line'
        value = row[name]
        if name == 'effectivedatetime':
            # pandas Timestamps don't convert to %s formats correctly
            return str(value)
        return value

    chronological = df[df['model_spec'] == model_spec].sort_values(
        by=['effectivedatetime'],
        axis='index',
    )
    details = [
        [as_cell(row, name) for name in column_names]
        for _, row in chronological.iterrows()
    ]

    # write a csv
    # NOTE(review): the CSV is built from every row of df (sorted by
    # 'absolute_error'), not only the rows for model_spec, and it has no
    # 'model_spec' column -- confirm that this is intended.
    by_error = df.sort_values(by='absolute_error')
    csv_frame = by_error[column_names]
    with open(control.path['out_details_csv'], 'w') as f:
        csv_frame.to_csv(f)

    return columns_table(column_defs(column_names), details)
def write_1_way(summary_name, column_name, path_name):
    """Emit the two-column table for one single-key summary.

    summary[summary_name] is ordered with sort_by_value() and each
    (key, value) pair becomes a detail row under the columns column_name and
    'mean_absolute_error'.  The resulting column table is handed to
    write_the_lines() together with headings and path[path_name].

    summary, headings and path are taken from the surrounding scope.
    """
    column_definitions = (
        column_def(column_name),
        column_def('mean_absolute_error'),
    )
    ordered = sort_by_value(summary[summary_name])
    rows = []
    for key, value in ordered.iteritems():
        rows.append((key, value))
    table = columns_table(column_definitions, rows)
    write_the_lines(headings, table, path[path_name])
def write_2_ways(summary_name, column_names, path_name):
    """Emit the three-column table for one two-level summary.

    summary[summary_name] is walked in sort_by_key() order; each inner mapping
    is walked in sort_by_value() order.  Every (outer key, inner key, inner
    value) triple becomes a detail row under the columns column_names[0],
    column_names[1] and 'mean_absolute_error'.  The resulting column table is
    handed to write_the_lines() together with headings and path[path_name].

    summary, headings and path are taken from the surrounding scope.
    """
    column_definitions = (
        column_def(column_names[0]),
        column_def(column_names[1]),
        column_def('mean_absolute_error'),
    )
    rows = []
    for outer_key, inner in sort_by_key(summary[summary_name]).iteritems():
        for inner_key, value in sort_by_value(inner).iteritems():
            rows.append((outer_key, inner_key, value))
    table = columns_table(column_definitions, rows)
    write_the_lines(headings, table, path[path_name])
def make_table_tradetype_modelspec(df):
    """Return the column table of error statistics per (trade_type, model_spec).

    For every combination of a distinct 'model_spec' value and a distinct
    'trade_type' value in df, the matching 'absolute_error' values are reduced
    to a count, a mean and the pair returned by make_confidence_interval().
    The rows are ordered by trade_type, then by mean_absolute_error.

    Side effect: the ordered statistics are written as CSV to
    control.path['out_accuracy_targetfeature_modelspec_csv'].

    Returns the value produced by columns_table() for the detail lines.
    """
    def summarize_pairs(frame):
        'return DataFrame with one statistics row per (model_spec, trade_type)'
        summary_columns = [
            'model_spec',
            'trade_type',
            'n_prints',
            'mean_absolute_error',
            'mae_ci05',
            'mae_ci95',
        ]
        result = pd.DataFrame(columns=summary_columns)
        trade_types = set(frame['trade_type'])
        for model_spec in set(frame['model_spec']):
            for trade_type in trade_types:
                mask = ((frame['model_spec'] == model_spec) &
                        (frame['trade_type'] == trade_type))
                errors = frame.loc[mask, 'absolute_error']
                n_prints = len(errors)
                mean_absolute_error = errors.mean()
                ci_05, ci_95 = make_confidence_interval(errors)
                # append as the next row; len(result) is the next unused label
                result.loc[len(result)] = [
                    model_spec,
                    trade_type,
                    n_prints,
                    mean_absolute_error,
                    ci_05,
                    ci_95,
                ]
        return result

    print('starting make_table_tradetype_modelspec()')
    ordered = summarize_pairs(df).sort_values(
        by=['trade_type', 'mean_absolute_error'],
        axis='index',
    )

    # the table leads with trade_type, unlike the CSV, which keeps the frame's order
    column_names = (
        'trade_type',
        'model_spec',
        'n_prints',
        'mean_absolute_error',
        'mae_ci05',
        'mae_ci95',
    )
    details = [
        [row[name] for name in column_names]
        for _, row in ordered.iterrows()
    ]

    with open(control.path['out_accuracy_targetfeature_modelspec_csv'], 'w') as f:
        ordered.to_csv(f)

    return columns_table(column_defs(column_names), details)
def report_on_best_model(n, best_modelspec):
    """Return the feature-importance report for one model spec.

    The rows of importances whose 'model_spec' equals best_modelspec are
    grouped by 'feature_name' and the 'feature_importance' values of each
    group are averaged.

    Side effect: the per-feature means, ordered by descending absolute mean
    importance, are written as CSV to control.path['out_importance_%d_csv' % n].

    Returns the value produced by make_report() for the column table of
    (feature_name, absolute mean importance, mean importance) rows.
    """
    # Dict[feature_name, feature_importance list]
    importances_by_feature = collections.defaultdict(list)
    matching = importances[importances['model_spec'] == best_modelspec]
    for _, row in matching.iterrows():
        importances_by_feature[row['feature_name']].append(
            row['feature_importance'])

    details = []  # rows for the column table
    data = collections.defaultdict(list)  # columns for the CSV
    for feature_name, values in importances_by_feature.iteritems():
        mean_importance = sum(values) / float(len(values))
        magnitude = abs(mean_importance)
        details.append((feature_name, magnitude, mean_importance))
        data['model_spec'].append(best_modelspec)
        data['feature_name'].append(feature_name)
        data['abs_mean_importance'].append(magnitude)
        data['mean_importance'].append(mean_importance)

    csv_columns = [
        'model_spec',
        'feature_name',
        'abs_mean_importance',
        'mean_importance',
    ]
    ranked_frame = pd.DataFrame(data=data).sort_values(
        by='abs_mean_importance',
        ascending=False,
    )
    csv_frame = ranked_frame[csv_columns]
    with open(control.path['out_importance_%d_csv' % n], 'w') as f:
        csv_frame.to_csv(f)

    table_columns = column_defs((
        'feature_name',
        'absolute_feature_importance',
        'feature_importance',
    ))
    # sort descending by absolute mean importance
    ranked_details = sorted(details, key=lambda detail: detail[1], reverse=True)
    ct = columns_table(table_columns, ranked_details)
    return make_report(n, best_modelspec, [], ct)
def report_on_best_model():
    """Print the importances recorded for the best model of each target.

    make_best() supplies, per (modelname, targetfeaturename), a mapping from n
    to a sequence of (meanerror, modelspecstr) pairs; only the first pair of
    the n == 1 entry is used.  For that model spec, every importance found in
    importances_by for each predicted feature name becomes one detail row of
    (query index, model spec, target feature, prediction feature, importance).

    The headings and the resulting column table are printed with pp().
    Nothing is returned.
    """
    best = make_best()
    importances_by, predictedfeaturenames = summarize_importances(importances)
    headings = [
        'Importances of the best models',
        'Excluding features with zero importance',
        # NOTE(review): the two adjacent literals below concatenate into one
        # heading with a trailing space; possibly a missing comma -- confirm.
        'By model name and target feature' ' ',
    ]
    pp(headings)

    # produce the detail lines for the best model for each (modelname, targetfeature)
    details = []
    for (modelname, targetfeaturename), d in best.iteritems():
        for n, meanerrors_modelspecstrs in d.iteritems():
            # NOTE(review): assumes n == 1 marks the best entry and that all
            # of the work below is restricted to it -- confirm.
            if n != 1:
                continue
            meanerror, modelspecstr = meanerrors_modelspecstrs[0]
            print('%s %s' % (targetfeaturename, modelspecstr))
            for predictionfeaturename in predictedfeaturenames:
                by_query = importances_by[(modelspecstr, targetfeaturename,
                                           predictionfeaturename)]
                for queryindex, importance in by_query.iteritems():
                    # list.append takes exactly one argument: add the row as a tuple
                    details.append((
                        queryindex,
                        modelspecstr,
                        targetfeaturename,
                        predictionfeaturename,
                        importance,
                    ))

    # column_defs takes one sequence of column names, as at its other call sites
    ct = columns_table(
        column_defs((
            'query_index',
            'model_spec',
            'target_feature',
            'prediction_feature',
            'importance',
        )),
        details,
    )
    pp(ct)