예제 #1
0
 def make_table_details(df, model_spec):
     """Return the column table of detail rows for one model spec.

     Rows of ``df`` whose ``model_spec`` column equals ``model_spec`` are
     ordered by ``effectivedatetime`` and turned into one table line each,
     holding the values of the detail columns listed below.

     Side effect: the detail columns of ``df``, ordered by
     ``absolute_error``, are written as CSV to
     ``control.path['out_details_csv']``.
     """
     print('starting make_table_details()')
     column_names = [
         'trace_index', 'effectivedatetime', 'quantity', 'actual',
         'prediction', 'absolute_error'
     ]

     def as_cell(row, column_name):
         # pandas Timestamps don't convert to %s formats correctly, so the
         # timestamp column is rendered as text up front
         cell = row[column_name]
         if column_name == 'effectivedatetime':
             return str(cell)
         return cell

     matching = df[df['model_spec'] == model_spec]
     chronological = matching.sort_values(by=['effectivedatetime'],
                                          axis='index')
     details = [[as_cell(row, column_name) for column_name in column_names]
                for _, row in chronological.iterrows()]

     # write a csv
     # NOTE(review): the CSV is built from all of df, not only the rows
     # matching model_spec -- confirm that this is intended.
     csv_frame = df.sort_values(by='absolute_error')[column_names]
     with open(control.path['out_details_csv'], 'w') as f:
         csv_frame.to_csv(f)
     return columns_table(column_defs(column_names), details)
예제 #2
0
 def write_1_way(summary_name, column_name, path_name):
     """Write the table for a summary keyed on a single column.

     The table has one row per entry of ``summary[summary_name]``, taken in
     the order produced by ``sort_by_value``: the entry's key goes under
     ``column_name`` and its value under ``mean_absolute_error``.  The table
     is handed to ``write_the_lines`` along with ``headings`` and
     ``path[path_name]``.
     """
     column_definitions = (
         column_def(column_name),
         column_def('mean_absolute_error'),
     )
     rows = []
     for key, error in sort_by_value(summary[summary_name]).iteritems():
         rows.append((key, error))
     table = columns_table(column_definitions, rows)
     write_the_lines(headings, table, path[path_name])
예제 #3
0
 def write_2_ways(summary_name, column_names, path_name):
     """Write the table for a summary keyed on two nested columns.

     ``summary[summary_name]`` is walked in the order produced by
     ``sort_by_key``; each of its values is itself walked in the order
     produced by ``sort_by_value``.  Every (outer key, inner key, inner
     value) triple becomes one row under ``column_names[0]``,
     ``column_names[1]`` and ``mean_absolute_error``.  The table is handed
     to ``write_the_lines`` along with ``headings`` and ``path[path_name]``.
     """
     column_definitions = (
         column_def(column_names[0]),
         column_def(column_names[1]),
         column_def('mean_absolute_error'),
     )
     rows = []
     for outer_key, inner in sort_by_key(summary[summary_name]).iteritems():
         for inner_key, error in sort_by_value(inner).iteritems():
             rows.append((outer_key, inner_key, error))
     table = columns_table(column_definitions, rows)
     write_the_lines(headings, table, path[path_name])
예제 #4
0
    def make_table_tradetype_modelspec(df):
        """Return the column table of error statistics per trade type and model spec.

        One detail line is computed for every combination of a distinct
        ``model_spec`` and a distinct ``trade_type`` found in ``df``.  The
        lines are ordered by ``trade_type`` and then ``mean_absolute_error``.

        Side effect: the ordered detail lines are written as CSV to
        ``control.path['out_accuracy_targetfeature_modelspec_csv']``.
        """

        def make_detail_lines(df):
            """Return a DataFrame with one row per (model_spec, trade_type) pair.

            Each row holds the number of ``absolute_error`` values for the
            pair, their mean, and the two bounds returned by
            ``make_confidence_interval`` for them.
            """
            detail_columns = [
                'model_spec',
                'trade_type',
                'n_prints',
                'mean_absolute_error',
                'mae_ci05',
                'mae_ci95',
            ]
            result = pd.DataFrame(columns=detail_columns)
            for model_spec in set(df['model_spec']):
                # the model-spec half of the mask is the same for every
                # trade type, so it is computed once per model spec
                has_model_spec = df['model_spec'] == model_spec
                for trade_type in set(df['trade_type']):
                    selected = has_model_spec & (df['trade_type'] == trade_type)
                    errors = df['absolute_error'].loc[selected]
                    n_prints = len(errors)
                    mean_absolute_error = errors.mean()
                    ci_05, ci_95 = make_confidence_interval(errors)
                    # append as the next row of the result frame
                    next_position = len(result)
                    result.loc[next_position] = [
                        model_spec, trade_type, n_prints,
                        mean_absolute_error, ci_05, ci_95
                    ]
            return result

        print('starting make_table_tradetype_modelspec()')
        unsorted_detail_lines = make_detail_lines(df)
        sorted_detail_lines = unsorted_detail_lines.sort_values(
            by=['trade_type', 'mean_absolute_error'],
            axis='index',
        )
        column_names = ('trade_type', 'model_spec', 'n_prints',
                        'mean_absolute_error', 'mae_ci05', 'mae_ci95')
        details = [[row[column_name] for column_name in column_names]
                   for _, row in sorted_detail_lines.iterrows()]
        csv_path = control.path['out_accuracy_targetfeature_modelspec_csv']
        with open(csv_path, 'w') as f:
            sorted_detail_lines.to_csv(f)
        return columns_table(column_defs(column_names), details)
예제 #5
0
    def report_on_best_model(n, best_modelspec):
        """Report the mean feature importances recorded for one model spec.

        The rows of ``importances`` whose ``model_spec`` equals
        ``best_modelspec`` are grouped by ``feature_name`` and the
        ``feature_importance`` values of each group are averaged.

        Side effect: the means are written, ordered by descending absolute
        value, as CSV to ``control.path['out_importance_%d_csv' % n]``.

        Returns whatever ``make_report`` builds from ``n``,
        ``best_modelspec``, an empty list and the column table of the same
        means (also ordered by descending absolute value).
        """
        # Dict[feature_name, feature_importance list]
        by_feature = collections.defaultdict(list)
        relevant = importances[importances['model_spec'] == best_modelspec]
        for _, row in relevant.iterrows():
            feature_name = row['feature_name']
            by_feature[feature_name].append(row['feature_importance'])

        # one (feature_name, |mean|, mean) tuple per feature
        details = []
        for feature_name, values in by_feature.iteritems():
            mean_importance = sum(values) / float(len(values))
            details.append(
                (feature_name, abs(mean_importance), mean_importance))

        # the same figures, column-wise, for the CSV
        data = collections.defaultdict(list)
        for feature_name, abs_mean_importance, mean_importance in details:
            data['model_spec'].append(best_modelspec)
            data['feature_name'].append(feature_name)
            data['abs_mean_importance'].append(abs_mean_importance)
            data['mean_importance'].append(mean_importance)

        frame = pd.DataFrame(data=data)
        ranked_frame = frame.sort_values(by='abs_mean_importance',
                                         ascending=False)
        csv_columns = [
            'model_spec', 'feature_name', 'abs_mean_importance',
            'mean_importance'
        ]
        with open(control.path['out_importance_%d_csv' % n], 'w') as f:
            ranked_frame[csv_columns].to_csv(f)

        def abs_importance(detail):
            return detail[1]

        table_columns = column_defs(('feature_name',
                                     'absolute_feature_importance',
                                     'feature_importance'))
        # sort descending by absolute mean importance
        ranked_details = sorted(details, key=abs_importance, reverse=True)
        ct = columns_table(table_columns, ranked_details)
        return make_report(n, best_modelspec, [], ct)
예제 #6
0
 def report_on_best_model():
     """Print the importances table for the best models.

     For every ``(modelname, targetfeaturename)`` entry of ``make_best()``,
     the first ``(meanerror, modelspecstr)`` pair stored under key ``1`` is
     used.  For that model spec and target feature, and for each name in
     the predicted-feature names returned by ``summarize_importances``, one
     detail line ``(query_index, model_spec, target_feature,
     prediction_feature, importance)`` is produced per recorded importance.

     The headings and the resulting column table are printed with ``pp``;
     the target feature and model spec of each selected model are printed
     as well.  Nothing is returned.

     Raises:
         KeyError: if ``importances_by`` has no entry for a (model spec,
             target feature, prediction feature) combination, unless the
             mapping supplies defaults.
     """
     best = make_best()
     importances_by, predictedfeaturenames = summarize_importances(
         importances)
     headings = [
         'Importances of the best models',
         'Excluding features with zero importance',
         'By model name and target feature',
         # separate list element: without the comma above, this literal is
         # silently concatenated onto the previous heading
         ' ',
     ]
     pp(headings)
     # produce the detail lines for the best model for each
     # (modelname, targetfeature)
     details = []
     for (modelname, targetfeaturename), d in best.iteritems():
         for n, meanerrors_modelspecstrs in d.iteritems():
             if n != 1:
                 continue
             meanerror, modelspecstr = meanerrors_modelspecstrs[0]
             print('%s %s' % (targetfeaturename, modelspecstr))
             for predictionfeaturename in predictedfeaturenames:
                 i = importances_by[(modelspecstr, targetfeaturename,
                                     predictionfeaturename)]
                 for queryindex, importance in i.iteritems():
                     # list.append takes exactly one argument, so the
                     # detail line is appended as a single tuple
                     details.append((queryindex, modelspecstr,
                                     targetfeaturename,
                                     predictionfeaturename, importance))
     ct = columns_table(
         # column_defs receives the column names as one sequence
         column_defs(('query_index', 'model_spec', 'target_feature',
                      'prediction_feature', 'importance')),
         details,
     )
     pp(ct)