Ejemplo n.º 1
0
class ReportWithColumnsTable(object):
    """A Report made of free-form header lines followed by a ColumnsTable.

    Detail rows are collected through append_detail(); the table (plus its
    legend) is rendered into the report only when write() is called.
    """

    def __init__(self,
                 header_lines,
                 column_defs,
                 print_as_spaces,
                 verbose=True):
        """Create the report and write its header.

        header_lines: iterable of lines appended to the report, in order
        column_defs: column definitions, passed through to ColumnsTable
        print_as_spaces: callable (column_name, value) -> bool; when it
          returns a true value, None is handed to the table instead of value
        verbose: passed through to ColumnsTable
        """
        self._report = Report()
        self._header(header_lines)
        self._ct = ColumnsTable(column_defs, verbose)
        self._print_as_spaces = print_as_spaces

    def _header(self, header_lines):
        """Append each header line to the report."""
        for line in header_lines:
            self._report.append(line)

    def append_detail(self, **kwds):
        """Append one detail row; kwds maps column name -> value.

        Values flagged by print_as_spaces (presumably NaNs -- confirm against
        the caller) are replaced by None before reaching the table.
        """
        # items() instead of iteritems(): works on both Python 2 and 3
        with_spaces = {
            k: (None if self._print_as_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        """Append the legend and all table lines to the report, then write it to path."""
        # the table lives in self._ct; the former self._t never existed and
        # made every call to write() fail with AttributeError
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        self._report.write(path)
Ejemplo n.º 2
0
 def make_report(n_best, n_worst):
     """Return a Report: the header followed by every line of the details table.

     Relies on names from the enclosing scope: data, control, make_header
     and make_details.
     """
     result = Report()
     make_header(result)
     table = make_details(data, control.test_months, n_best, n_worst)
     for text in table.iterlines():
         result.append(text)
     return result
Ejemplo n.º 3
0
 def make_report(n_best, n_worst):
     """Return a Report (header + detail lines) and produce the plot as a side effect.

     Relies on names from the enclosing scope: data, control, make_header,
     make_details and make_plt.
     """
     result = Report()
     make_header(result)
     # make_details hands back the table plus extra data consumed by make_plt
     table, plot_info = make_details(data, control.test_months, n_best, n_worst)
     for text in table.iterlines():
         result.append(text)
     # the plot is made only after the text report has been assembled
     make_plt(data, plot_info, n_best, n_worst)
     return result
Ejemplo n.º 4
0
 def make_report(data, cities, sorted_by_tag):
     """Return a Report of price statistics by city.

     data: object with a .date attribute supporting min() and max()
     cities: passed through to make_column_table
     sorted_by_tag: text describing the sort order, shown in the header
     """
     report = Report()
     report.append('Price Statistics by City')
     report.append('Sorted by %s' % sorted_by_tag)
     first_date = data.date.min()
     last_date = data.date.max()
     report.append('Transactions from %s to %s' % (first_date, last_date))
     report.append(' ')
     table = make_column_table(cities, data)
     for text in table.iterlines():
         report.append(text)
     return report
Ejemplo n.º 5
0
def make_report(summary):
    """Return a Report with one formatted line per row of summary.

    summary: object with iterrows() yielding (row_name, row) pairs, where row
      is indexable by 'min', '50%', 'mean', 'max', 'number_distinct',
      'number_nan' and 'std'.
    """
    header_fmt = '%40s %8s %8s %8s %8s %8s %8s %8s'
    detail_fmt = '%40s %8.0f %8.0f %8.0f %8.0f %8d %8d %8.0f'
    headings = ('numeric feature', 'min', 'median', 'mean',
                'max', 'distinct', 'NaN', 'std')
    # row keys, listed in the order of the value columns in detail_fmt
    stat_keys = ('min', '50%', 'mean', 'max',
                 'number_distinct', 'number_nan', 'std')

    report = Report()
    report.append(header_fmt % headings)
    for feature, stats in summary.iterrows():
        values = (feature,) + tuple(stats[key] for key in stat_keys)
        report.append(detail_fmt % values)
    return report
Ejemplo n.º 6
0
class ChartCDReport(object):
    """Text report of MAE by month for the best models and their hyperparameters.

    Detail rows are collected in a ColumnsTable; the table and its legend are
    rendered into the underlying Report when write() is called.
    """

    def __init__(self, column_definitions, test):
        """Create the table and write the report header.

        column_definitions: object providing defs_for_columns(*names) and
          replace_by_spaces(column_name, value)
        test: when true, write() appends a '** TESTING: DISCARD' marker line
        """
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        cd = self._column_definitions.defs_for_columns(
            'validation_month',
            'rank',
            'median_absolute_error',
            'median_price',
            'model',
            'n_months_back',
            'max_depth',
            'n_estimators',
            'max_features',
            'learning_rate',
            'alpha',
            'l1_ratio',
            'units_X',
            'units_y',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)
        self._header()

    def append(self, line):
        """Append a free-form line directly to the report (not to the table)."""
        self._report.append(line)

    def write(self, path):
        """Render legend and table lines into the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def _header(self):
        """Append the title lines to the report."""
        self._report.append(
            'Median Absolute Error (MAE) by month for best-performing models and their hyperparameters'
        )
        self._report.append(' ')

    def append_detail(self, **kwds):
        """Append one table row; kwds maps column name -> value.

        Values for which column_definitions.replace_by_spaces(name, value) is
        true are passed to the table as None.
        """
        # items() instead of iteritems(): works on both Python 2 and 3
        with_spaces = {
            k:
            (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)
Ejemplo n.º 7
0
def make_report(summary):
    """Return a Report: a heading line, then one formatted line per summary row.

    summary: object whose iterrows() yields (row_name, row) pairs; each row is
      indexed by 'min', '50%', 'mean', 'max', 'number_distinct', 'number_nan'
      and 'std'.
    """
    heading_format = '%40s %8s %8s %8s %8s %8s %8s %8s'
    row_format = '%40s %8.0f %8.0f %8.0f %8.0f %8d %8d %8.0f'
    # keys into each row, in the order the values appear on the line
    columns = ('min', '50%', 'mean', 'max', 'number_distinct', 'number_nan', 'std')

    result = Report()
    heading = ('numeric feature', 'min', 'median', 'mean', 'max', 'distinct', 'NaN', 'std')
    result.append(heading_format % heading)
    for name, row in summary.iterrows():
        fields = [name]
        for column in columns:
            fields.append(row[column])
        result.append(row_format % tuple(fields))
    return result
Ejemplo n.º 8
0
 def _make_report(self, counters):
     """Return a Report listing, for each path, every tag and its count.

     counters: dict mapping path -> dict mapping tag -> integer count
     """
     r = Report()
     r.append('Records retained while reducing input file')
     # items() instead of iteritems(): works on both Python 2 and 3
     for path, counter in counters.items():
         r.append(' ')
         r.append('path %s' % path)
         for tag, value in counter.items():
             r.append('%30s: %d' % (tag, value))
     return r
Ejemplo n.º 9
0
def make_chart_a(control, data):
    """Return a Report of the best model's MAE for each month and feature group.

    control: supplies test_months and feature_groups
    data: indexed as data[month][features]; each entry has .model and .mae
    """
    def write_title(out):
        for text in ('Median Absolute Errors for Most Accurate Models',
                     'By Month',
                     'By Feature Group',
                     ' '):
            out.append(text)

    def build_table(data, control):
        """Return the filled ColumnsTable, legend and feature-group key included."""
        column_specs = (
            ('month', 6, '%6s', ('', 'month'), 'training month'),
            ('features', 8, '%8s', ('features', 'group'), 'group of features'),
            ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
            ('mae', 6, '%6.0f', ('', 'mae'), 'mae of best model in month using features'),
        )
        table = ColumnsTable(column_specs, verbose=True)
        for month in control.test_months:
            for features in control.feature_groups:
                best = data[month][features]
                table.append_detail(
                    month=month,
                    features=features,
                    model=best.model,
                    mae=best.mae,
                )
            # an empty detail acts as a blank separator after each month
            table.append_detail()
        table.append_legend()

        # spell out the feature-group abbreviations after the legend
        for text in (' ',
                     'Features groups;',
                     's    : only size features',
                     'sw   : only size and wealth features',
                     'swp  : only size, wealth, and property features',
                     'swpn : all features: size, wealth, property, and neighborhood'):
            table.append_line(text)
        return table

    result = Report()
    write_title(result)
    for text in build_table(data, control).iterlines():
        result.append(text)
    return result
Ejemplo n.º 10
0
def make_chart_b(control, data):
    """Return a Report ranking features by their mean importance across test months.

    control: supplies test_months and arg.features
    data: indexed by ReductionKey(test_month); each entry has an .importances
      mapping with a "feature_importances" sequence
    """

    def write_title(out):
        for text in (
            "Mean Probability of a Feature Being Included in a Decision Tree",
            "Across the Entire Ensemble of Decisions Trees",
            "For Most Accurate Model in Each Training Month",
            " ",
        ):
            out.append(text)

    def mean_importance_by_feature(test_months):
        "return dict[feature_name] = float, the mean over test_months of the feature's importance"
        names = Features().ege_names(control.arg.features)
        means = {}
        for position, name in enumerate(names):
            # one slot per test month, all for the feature at this position
            per_month = np.zeros(len(test_months))
            for slot, test_month in enumerate(test_months):
                importances = data[ReductionKey(test_month)].importances["feature_importances"]
                per_month[slot] = importances[position]
            means[name] = np.mean(per_month)
        return means

    def build_table(data, test_months):
        "return a ColumnsTable with one row per feature, largest mean importance first"
        table = ColumnsTable(
            (
                ("mean_prob", 5, "%5.2f", ("mean", "prob"), "mean probability feature appears in a decision tree"),
                ("feature_name", 40, "%40s", (" ", "feature name"), "name of feature"),
            ),
            verbose=True,
        )
        means = mean_importance_by_feature(test_months)
        ranked = sorted(means.items(), key=lambda pair: pair[1], reverse=True)
        for name, mean_value in ranked:
            # the mean is scaled by 100 before being shown
            table.append_detail(mean_prob=mean_value * 100.0, feature_name=name)
        table.append_legend()
        return table

    result = Report()
    write_title(result)
    table = build_table(data, control.test_months)
    for text in table.iterlines():
        result.append(text)
    return result
Ejemplo n.º 11
0
    def make_report(title, ordered_cities):
        """Return a Report: title, blank line, then one table row per city.

        title: first line of the report
        ordered_cities: cities in the order their rows should appear

        Reads median_prices, median_prices_indices, n_trades and
        n_trades_indices (each indexed by city) from the enclosing scope.
        """
        column_specs = (
            ('city', 30, '%30s', ('', '', '', '', '', 'City'), 'city name'),
            ('median_price', 7, '%7.0f',
             ('', '', '', '', 'median', 'price'),
             'median price in city'),
            ('median_price_index', 7, '%7.2f',
             ('median', 'price', '/', 'overall', 'median', 'price'),
             'median price as fraction of overall median price'),
            ('n_trades', 7, '%7.0f',
             ('', '', '', '', 'number', 'trades'),
             'number of trades across all months'),
            ('n_trades_index', 7, '%7.2f',
             ('number', 'trades', '/ ', 'overall', 'median', 'trades'),
             'median number trades as fraction of overall median number of trades'),
        )
        table = ColumnsTable(column_specs)
        for city in ordered_cities:
            table.append_detail(
                city=city,
                median_price=median_prices[city],
                median_price_index=median_prices_indices[city],
                n_trades=n_trades[city],
                n_trades_index=n_trades_indices[city],
            )
        table.append_legend(40)

        result = Report()
        result.append(title)
        result.append(' ')
        for text in table.iterlines():
            result.append(text)
        return result
Ejemplo n.º 12
0
def make_table_stats(data, control, in_report_p):
    """Return a Report of monthly price statistics for the selected months.

    data: table with .month and .price columns, supporting boolean-mask selection
    control: unused here
    in_report_p: callable (year, month) -> bool; only months for which it is
      true get a row

    Ratios compare a month with the previously reported month and are None
    for the first reported month.
    """
    report = Report()
    report.append('Prices by Month')
    report.append('')
    column_specs = (
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f',
         (' ', ' mean', 'price'), 'mean price in dollars'),
        ('median_price', 6, '%6.0f',
         (' ', 'median', 'price'), 'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f',
         (' mean', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f',
         ('median', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d',
         ('number', 'of', 'trades'), 'number of trades in the month'),
    )
    table = ColumnsTable(column_specs)

    previous_mean = None
    previous_median = None
    for year in range(2003, 2010):
        # only January through March are visited for 2009
        last_month = 3 if year == 2009 else 12
        for month in range(1, last_month + 1):
            if not in_report_p(year, month):
                continue
            month_prices = data[data.month == Month(year, month)].price
            mean_price = month_prices.mean()
            median_price = month_prices.median()

            if previous_mean is None:
                mean_ratio = None
            else:
                mean_ratio = mean_price / previous_mean
            if previous_median is None:
                median_ratio = None
            else:
                median_ratio = median_price / previous_median

            table.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=mean_ratio,
                median_price_ratio=median_ratio,
                number_trades=len(month_prices),
            )
            previous_mean = mean_price
            previous_median = median_price

    table.append_legend()
    for text in table.iterlines():
        report.append(text)
    return report
Ejemplo n.º 13
0
class ChartBReport(object):
    """Text report of MAE for the k best models of one validation month.

    Detail rows are collected in a ColumnsTable; the table and its legend are
    rendered into the underlying Report when write() is called.
    """

    def __init__(self, validation_month, k, column_definitions, test):
        """Write the header and create the table.

        validation_month: shown in the header
        k: number of best models, shown in the header (formatted with %d)
        column_definitions: object providing defs_for_columns(*names) and
          replace_by_spaces(column_name, value)
        test: when true, write() appends a '**TESTING: DISCARD' marker line
        """
        self._report = Report()
        self._header(validation_month, k)
        self._column_definitions = column_definitions
        self._test = test
        cd = self._column_definitions.defs_for_columns(
            'median_absolute_error',
            'model',
            'n_months_back',
            'max_depth',
            'n_estimators',
            'max_features',
            'learning_rate',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def _header(self, validation_month, k):
        """Append the title lines to the report."""
        def a(line):
            self._report.append(line)

        a('MAE for %d best-performing models and their hyperparameters' % k)
        a('Validation month: %s' % validation_month)
        a(' ')

    def append_detail(self, **kwds):
        """Append one table row; kwds maps column name -> value.

        Values for which column_definitions.replace_by_spaces(name, value) is
        true (presumably NaNs -- confirm) are passed to the table as None.
        """
        # items() instead of iteritems(): works on both Python 2 and 3
        with_spaces = {
            k:
            (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        """Render legend and table lines into the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('**TESTING: DISCARD')
        self._report.write(path)
Ejemplo n.º 14
0
def make_chart_stats(data, control, filter_f):
    """Return Report with statistics for years and months that obey the filter.

    data: indexed by make_reduction_key(year, month); each entry is indexed by
      'mean', 'median' and 'count'
    control: unused here
    filter_f: callable (year, month) -> bool; only months for which it is true
      get a row

    Ratios compare a month with the previously reported month and are None
    for the first reported month.
    """
    r = Report()
    r.append('Prices by Month')
    r.append('')
    ct = ColumnsTable((
            ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
            ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
            ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'), 'mean price in dollars'),
            ('median_price', 6, '%6.0f', (' ', 'median', 'price'), 'median price in dollars'),
            ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'), 'ratio of price in current month to prior month'),
            ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'), 'ratio of price in current month to prior month'),
            ('number_trades', 6, '%6d', ('number', 'of', 'trades'), 'number of trades in the month'),
            ))

    prior_mean_price = None
    prior_median_price = None
    # range() instead of xrange(): works on both Python 2 and 3
    for year in range(2003, 2010):
        for month in range(1, 13):
            if filter_f(year, month):
                value = data[make_reduction_key(year, month)]
                mean_price = value['mean']
                median_price = value['median']
                number_trades = value['count']
                ct.append_detail(
                        year=year,
                        month=month,
                        mean_price=mean_price,
                        median_price=median_price,
                        mean_price_ratio=None if prior_mean_price is None else mean_price / prior_mean_price,
                        median_price_ratio=None if prior_median_price is None else median_price / prior_median_price,
                        number_trades=number_trades,
                        )
                prior_mean_price = mean_price
                prior_median_price = median_price
    ct.append_legend()
    for line in ct.iterlines():
        r.append(line)
    return r
Ejemplo n.º 15
0
def make_chart_a(control, data):
    """Return a Report of the best model's MAE per month and feature group.

    As a side effect, two bar-chart figures are saved to
    control.path_out_plt_a_mae_all and control.path_out_plt_a_mae_2007.

    control: supplies test_months, feature_groups and the two output paths
    data: indexed as data[month][features]; each entry has .model and .mae
    """

    def make_header(report):
        report.append('Median Absolute Errors for Most Accurate Models')
        report.append('By Month')
        report.append('By Feature Group')
        report.append(' ')

    def make_details(data, control):
        """Return (ColumnsTable, list of [month, features, model, mae] records)."""

        def append_feature_group_description(ct):
            ct.append_line(' ')
            ct.append_line('Features groups;')
            ct.append_line('s    : only size features')
            ct.append_line('sw   : only size and wealth features')
            ct.append_line('swp  : only size, wealth, and property features')
            ct.append_line(
                'swpn : all features: size, wealth, property, and neighborhood'
            )

        ct = ColumnsTable(
            (
                ('month', 6, '%6s', ('', 'month'), 'training month'),
                ('features', 8, '%8s',
                 ('features', 'group'), 'group of features'),
                ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
                ('mae', 6, '%6.0f',
                 ('', 'mae'), 'mae of best model in month using features'),
            ),
            verbose=True,
        )
        my_info = []
        for month in control.test_months:
            for features in control.feature_groups:
                mae_model = data[month][features]
                ct.append_detail(
                    month=month,
                    features=features,
                    model=mae_model.model,
                    mae=mae_model.mae,
                )
                my_info.append(
                    [month, features, mae_model.model, mae_model.mae])

            ct.append_detail()  # blank line separates each month
        ct.append_legend()
        append_feature_group_description(ct)

        return ct, my_info

    def make_plots(info):
        """Save the 'maeall' and 'mae2007' figures built from the flat record list."""
        # regroup the flat records into one chunk of 4 per month; this assumes
        # exactly 4 feature groups per month, in the order s, sw, swp, swpn
        # range() instead of xrange(): works on both Python 2 and 3
        info = [info[i:i + 4] for i in range(0, len(info), 4)]

        def make_subplot1(validation_month, data):
            # element 3 of each record is the mae
            y = [data[k][3] for k in (0, 1, 2, 3)]
            plt.title(validation_month)
            plt.bar([1, 2, 3, 4], y)
            plt.yticks(size='xx-small')
            plt.ylim(0, 140000)
            # label each bar with its feature group
            plt.xticks([1.2, 2.2, 3.2, 4.6], ['s', 'sw', 'swp', 'swpn'],
                       size='medium')

        def make_subplot2(validation_month, data):
            # element 3 of each record is the mae
            y = [data[k][3] for k in (0, 1, 2, 3)]
            plt.title(validation_month)
            plt.bar([1, 2, 3, 4], y)
            plt.yticks([])  # no tick marks on the y axis
            # label each bar with its feature group
            plt.xticks([1.4, 2.4, 3.4, 4.4], ['s', 'sw', 'swp', 'swpn'],
                       rotation=-70,
                       size='xx-small')
            plt.ylim(0, 140000)

        def make_figures(path, data, kind):
            """Save one grid of per-month bar charts to path.

            kind: 'maeall' (6 x 6 grid starting at the first month) or
              'mae2007' (3 x 4 grid starting at the 13th month)
            Raises ValueError for any other kind.
            """
            if kind == 'maeall':
                rows, cols = 6, 6
            elif kind == 'mae2007':
                rows, cols = 3, 4
            else:
                # previously an unknown kind left rows/cols unbound
                raise ValueError('unknown kind of figure: %r' % (kind,))
            axes_number = 0

            plt.figure()  # new figure
            for row in range(1, rows + 1):
                for col in range(1, cols + 1):
                    if kind == 'maeall':
                        tempData = data[axes_number]
                    else:
                        # skip the first 12 months (presumably those before
                        # the 2007 validation months -- confirm)
                        tempData = data[axes_number + 12]
                    validation_month = tempData[0][0]
                    axes_number += 1  # count across rows
                    plt.subplot(rows, cols, axes_number)
                    if kind == 'maeall':
                        make_subplot2(validation_month, tempData)
                    else:
                        make_subplot1(validation_month, tempData)
                    # annotate the bottom-left subplot only
                    if row == rows and col == 1:
                        plt.xlabel('features')
                        plt.ylabel('mae ($)')

            if kind == 'mae2007':
                plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
            else:
                plt.tight_layout(pad=0.1, w_pad=0.1, h_pad=0.1)

            plt.savefig(path)
            plt.close()

        make_figures(control.path_out_plt_a_mae_all, info, 'maeall')
        make_figures(control.path_out_plt_a_mae_2007, info, 'mae2007')

    report = Report()
    make_header(report)
    his_details, my_info = make_details(data, control)
    make_plots(my_info)
    for line in his_details.iterlines():
        report.append(line)
    return report
Ejemplo n.º 16
0
class ChartFReport(object):
    """Text report comparing errors of the ensemble and of the best model.

    Detail rows are collected in a ColumnsTable; the table and its legend are
    rendered into the underlying Report when write() is called.
    """

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        """Write the header and create the table.

        k: number of best models considered, shown in the header (%d)
        ensemble_weighting: shown in the header
        column_definitions: object providing defs_for_columns(*names) and
          replace_by_spaces(column_name, value)
        test: when true, write() appends a '** TESTING: DISCARD' marker line
        """
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        self._header(k, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'validation_month',
            'mae_index0',
            'mae_ensemble',
            'mae_best_next_month',
            'median_price',
            'fraction_median_price_next_month_index0',
            'fraction_median_price_next_month_ensemble',
            'fraction_median_price_next_month_best',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        """Render legend and table lines into the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        """Append one table row; kwds maps column name -> value.

        Values for which column_definitions.replace_by_spaces(name, value) is
        true are passed to the table as None.
        """
        # items() instead of iteritems(): works on both Python 2 and 3
        with_spaces = {
            k:
            (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def _header(self, k, ensemble_weighting):
        """Append the title lines to the report."""
        self._report.append(
            'Comparison of Errors of Ensemble and Best Model That Know the Future'
        )
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
Ejemplo n.º 17
0
 def make_report(ct):
     """Return a Report: a title line followed by every line of the table ct.

     ct: object whose iterlines() yields the lines to include
     """
     result = Report()
     result.append('Count and Median Price of Transactions in 2007 by City')
     for text in ct.iterlines():
         result.append(text)
     return result
Ejemplo n.º 18
0
class ChartGReport(object):
    """Text report with one line per ensemble size K and its MARR.

    Inherits from object, like the sibling report classes, so it is a
    new-style class on Python 2 as well.
    """

    def __init__(self):
        self.report = Report()
        self.format_header = '%4s %7s'
        self.format_detail = '%4d %6.3f%%'
        self._header()

    def detail(self, k, marr):
        """Append one line for k; marr is a fraction and is shown as a percentage."""
        self.report.append(self.format_detail % (k, marr * 100.0))

    def _header(self):
        """Append the title and the column headings."""
        self.report.append('Hyperparameter K')
        self.report.append(' ')
        self.report.append(self.format_header % ('K', 'MARR'))

    def write(self, path):
        """Append the legend, then write the report to path."""
        self.report.append('Legend:')
        self.report.append('K: number of models in ensemble')
        self.report.append('MARR: Median Absolute Relative Regret')
        self.report.write(path)

    def append(self, line):
        """Append a free-form line to the report."""
        self.report.append(line)
Ejemplo n.º 19
0
class ChartEReport(object):
    """Text report on the best models, separately and as an ensemble.

    Detail rows are collected in a ColumnsTable; the table and its legend are
    rendered into the underlying Report when write() is called.
    """

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        """Write the header and create the table.

        k: number of best models considered, shown in the header (%d)
        ensemble_weighting: shown in the header
        column_definitions: object providing defs_for_columns(*names) and
          replace_by_spaces(column_name, value)
        test: when true, write() appends a '** TESTING: DISCARD' marker line
        """
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        self._header(k, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'validation_month',
            'model',
            'n_months_back',
            'n_estimators',
            'max_features',
            'max_depth',
            'learning_rate',
            'rank',
            'weight',
            'mae_validation',
            'mae_query',
            'mae_ensemble',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        """Render legend and table lines into the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        """Append one table row; kwds maps column name -> value.

        Values for which column_definitions.replace_by_spaces(name, value) is
        true are passed to the table as None.
        """
        # items() instead of iteritems(): works on both Python 2 and 3
        with_spaces = {
            k:
            (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def _header(self, k, ensemble_weighting):
        """Append the title lines to the report."""
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
Ejemplo n.º 20
0
class ChartHReport(object):
    """Text report on the best models and the ensemble for one validation month.

    Detail rows and preformatted lines are collected in a ColumnsTable; the
    table and its legend are rendered into the underlying Report when write()
    is called.
    """

    def __init__(self, k, validation_month, ensemble_weighting,
                 column_definitions, test):
        """Write the header and create the table.

        k: number of best models considered, shown in the header (%d)
        validation_month: shown in the header
        ensemble_weighting: shown in the header
        column_definitions: object providing defs_for_columns(*names) and
          replace_by_spaces(column_name, value)
        test: when true, write() appends a '** TESTING: DISCARD' marker line
        """
        self._column_definitions = column_definitions
        self._report = Report()
        self._test = test
        self._header(k, validation_month, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'description',
            'mae_validation',
            'mae_query',
            'mare_validation',
            'mare_query',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        """Render legend and table lines into the report, then write it to path."""
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        """Append one table row; kwds maps column name -> value.

        Values for which column_definitions.replace_by_spaces(name, value) is
        true are passed to the table as None.
        """
        # items() instead of iteritems(): works on both Python 2 and 3
        with_spaces = {
            k:
            (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def preformatted_line(self, line):
        """Echo line to stdout and append it verbatim to the table."""
        # print(line) with a single argument prints the same text on Python 2
        # and 3; the former print statement was a syntax error on Python 3
        print(line)
        self._ct.append_line(line)

    def _header(self, k, validation_month, ensemble_weighting):
        """Append the title lines to the report."""
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('For validation month %s' % validation_month)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
Ejemplo n.º 21
0
def make_chart_b(control, data):
    """Return a Report ranking features by mean importance across test months.

    As a side effect, a bar chart of the features whose scaled mean importance
    is at least 1 is saved to control.path_out_chart_b_pdf.

    control: supplies test_months, arg.features and path_out_chart_b_pdf
    data: indexed by ReductionKey(test_month); each entry has an .importances
      mapping expected to hold a 'feature_importances' sequence
    """
    def make_header(report):
        report.append('Mean Probability of a Feature Being Included in a Decision Tree')
        report.append('Across the Entire Ensemble of Decisions Trees')
        report.append('For Most Accurate Model in Each Training Month')
        report.append(' ')

    def make_mean_importance_by_feature(test_months):
        'return dict[feature_name] = float, the mean importance of the feature'
        feature_names = Features().ege_names(control.arg.features)
        mean_importance = {}  # key = feature_name
        for feature_index, feature_name in enumerate(feature_names):
            # build vector of feature_importances for feature_name
            feature_importances = np.zeros(len(test_months))  # for feature_name
            for month_index, test_month in enumerate(test_months):
                month_importances = data[ReductionKey(test_month)]  # for each feature
                # The check must come BEFORE the lookup below: it used to sit
                # after it, so a missing key raised KeyError first and these
                # diagnostics could never run.
                if 'feature_importances' not in month_importances.importances:
                    # single-argument print() calls give the same output on
                    # Python 2 and 3
                    print('chart b sees an unexpected ensemble model')
                    print('test_month %s' % (test_month,))
                    print('month_importances %s' % (month_importances,))
                    print('entering debugger')
                    pdb.set_trace()
                all_feature_importances = month_importances.importances['feature_importances']
                feature_importances[month_index] = all_feature_importances[feature_index]
            mean_importance[feature_name] = np.mean(feature_importances)
        return mean_importance

    def make_details(data, test_months):
        """Return (ColumnsTable, feature names to plot, their scaled mean importances)."""
        columns_table = ColumnsTable((
            ('mean_prob', 5, '%5.2f', ('mean', 'prob'), 'mean probability feature appears in a decision tree'),
            ('feature_name', 40, '%40s', (' ', 'feature name'), 'name of feature'),
            ),
            verbose=True)
        my_prob = []
        my_featname = []
        mean_importance = make_mean_importance_by_feature(test_months)
        for feature_name in sorted(mean_importance, key=mean_importance.get, reverse=True):
            mean_prob = mean_importance[feature_name] * 100.0
            columns_table.append_detail(
                mean_prob=mean_prob,
                feature_name=feature_name,
            )
            # only features at or above 1 (after scaling by 100) are plotted
            if mean_prob >= 1:
                my_prob.append(mean_prob)
                my_featname.append(feature_name)
        columns_table.append_legend()
        return columns_table, my_featname, my_prob

    def make_plt(feats, probs):
        """Save a bar chart of probs, one bar per name in feats."""
        plt.bar(range(len(feats)), probs, color='blue')
        labels = feats
        plt.xticks([x+.6 for x in range(len(feats))], labels, rotation=-70, size='small')

        plt.yticks(size='xx-small')
        plt.ylabel('Probability Feature in a Decision Tree (%)')
        plt.xlabel('Features That Occur More Than 1 Percent of Time')
        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
        plt.savefig(control.path_out_chart_b_pdf)
        plt.close()

    report = Report()
    make_header(report)
    details, my_feats, my_probs = make_details(data, control.test_months)
    make_plt(my_feats, my_probs)
    for line in details.iterlines():
        report.append(line)
    return report
Ejemplo n.º 22
0
def makefile(control):
    '''write file valavm.makefile with these targets
    valavm-{feature_group}-{locality}-{system}
    all

    control.arg.makefile is a whitespace-separated list of
    "system_name hardware_threads" pairs; the months are dealt out to the
    systems round-robin, each system receiving hardware_threads months per
    round. The result is written to control.path_out_makefile.

    NOTE(review): no "all" target is emitted by this code, although the
    summary above lists one.

    Raises ValueError when the pairs are malformed or no system has a
    positive number of hardware threads.
    '''
    months = ('200512', '200601', '200602', '200603', '200604', '200605',
              '200606', '200607', '200608', '200609', '200610', '200611',
              '200612', '200701', '200702', '200703', '200704', '200705',
              '200706', '200707', '200708', '200709', '200710', '200711',
              '200712', '200801', '200802', '200803', '200804', '200805',
              '200806', '200807', '200808', '200809', '200810', '200811',
              '200812', '200901', '200902')

    def make_jobs(args):
        'return dict[system_name] = number of hardware threads'
        if len(args) % 2 != 0:
            raise ValueError(
                'expected "system_name hardware_threads" pairs, got %r' % (args,))
        jobs = {}
        for index in range(0, len(args), 2):
            system_name = args[index]
            hardware_threads = args[index + 1]
            jobs[system_name] = int(hardware_threads)
        return jobs

    def make_system_generator(jobs):
        'yield system names forever, each jobs[system] times per round'
        # Systems without threads never yield anything; dropping them leaves
        # the sequence unchanged, and if none remain the cycle below would
        # otherwise spin forever without producing a value.
        systems = [system for system in jobs if jobs[system] > 0]
        if not systems:
            raise ValueError('no system has a positive number of hardware threads')
        for system in itertools.cycle(systems):
            for _ in range(jobs[system]):
                yield system

    def make_system_months(jobs, months):
        'return dict[system] = list of months assigned to that system'
        result = collections.defaultdict(list)
        system_generator = make_system_generator(jobs)
        for month in months:
            # next(gen) instead of gen.next(): works on both Python 2 and 3
            system = next(system_generator)
            result[system].append(month)
        return result

    def make_variable(feature_group, locality, system, month):
        lhs = 'valavm-%s-all-%s-%s' % (feature_group, locality, system)
        rhs = '../data/working/valavm/%s-all-%s/%s' % (feature_group, locality,
                                                       month)
        line = '%s += %s' % (lhs, rhs)
        return line

    def make_target(feature_group, locality, system):
        var = 'valavm-%s-all-%s-%s' % (feature_group, locality, system)
        line = '%s : $(%s)' % (var, var)
        return line

    def make_rule(feature_group, locality, month):
        'return (target line, recipe line) for one output file'
        target_lhs = '../data/working/valavm/%s-all-%s/%s' % (feature_group,
                                                              locality, month)
        target_rhs = 'valavm.py ' + control.path_in_samples
        target_line = '%s : %s' % (target_lhs, target_rhs)

        recipe_line = '\t~/anaconda2/bin/python valavm.py %s-all-%s-%s' % (
            feature_group, locality, month)

        return (target_line, recipe_line)

    # split() rather than split(' '): repeated or trailing blanks no longer
    # produce empty tokens
    args = control.arg.makefile.split()
    jobs = make_jobs(args)
    system_months = make_system_months(jobs, months)

    report_variables = Report()
    report_variables.append('# valavm variables')
    report_targets = Report()
    report_targets.append('# valavm targets')
    report_rules = Report()
    report_rules.append('# valavm rules')
    # for now, only implement hps 'all'
    for feature_group in ('s', 'sw', 'swp', 'swpn'):
        for locality in ('city', 'global'):
            for system in jobs:
                for month in system_months[system]:
                    report_variables.append(
                        make_variable(feature_group, locality, system, month))
                    report_rules.append_lines(
                        make_rule(feature_group, locality, month))
                report_targets.append(
                    make_target(feature_group, locality, system))
    # the three sections go into one file: variables, then targets, then rules
    report_variables.append_report(report_targets)
    report_variables.append_report(report_rules)
    report_variables.write(control.path_out_makefile)
    return