Exemple #1
0
    def make_column_table(cities, data):
        '''Return a ColumnsTable of price statistics with one row per city.

        For each entry of cities, the rows of data whose city equals that entry
        are selected and the mean, median, standard deviation and count of
        their price are appended as a detail row.  A final '* all cities *' row
        is computed over every price in data, then the legend is appended.
        '''
        # NOTE(review): each header here is a plain string, not a 1-tuple;
        # confirm that ColumnsTable accepts a bare string header.
        table = ColumnsTable((
            ('city', 30, '%30s', 'city', 'name of city'),
            ('mean', 7, '%7.0f', 'mean', 'mean price across time periods'),
            ('median', 7, '%7.0f', 'median',
             'median price across time periods'),
            ('stddev', 7, '%7.0f', 'stddev',
             'standard deviation of prices across time periods'),
            ('count', 7, '%7.0f', 'count',
             'number of transactions across time periods'),
        ))

        def add_row(label, prices):
            'append the summary statistics of prices under the given label'
            table.append_detail(
                city=label,
                mean=prices.mean(),
                median=prices.median(),
                stddev=prices.std(),
                count=len(prices),
            )

        for city in cities:
            add_row(city, data[data.city == city].price)

        # the summary row covers all the cities
        add_row('* all cities *', data.price)

        table.append_legend()
        return table
Exemple #2
0
class ReportWithColumnsTable(object):
    '''Report made of header lines followed by the lines of a ColumnsTable.

    Header lines are appended to the report on construction.  Detail rows are
    collected in the table; write() appends the table's legend, copies every
    table line into the report and writes the report to a path.
    '''

    def __init__(self,
                 header_lines,
                 column_defs,
                 print_as_spaces,
                 verbose=True):
        '''Create the report, emit the header lines and build the empty table.

        header_lines: iterable of lines appended to the report, in order
        column_defs: column definitions passed to ColumnsTable
        print_as_spaces: callable (column_name, value); when it returns a true
          value, the value is replaced by None before reaching the table
        verbose: passed to ColumnsTable
        '''
        self._report = Report()
        self._header(header_lines)
        self._ct = ColumnsTable(column_defs, verbose)
        self._print_as_spaces = print_as_spaces

    def _header(self, header_lines):
        'append each header line to the report'
        for line in header_lines:
            self._report.append(line)

    def append_detail(self, **kwds):
        '''Append one detail row to the table.

        Each keyword is a column name; a value is passed as None whenever
        print_as_spaces(column_name, value) is true.
        '''
        # items() exists on both Python 2 and Python 3 (iteritems does not)
        with_spaces = {
            k: (None if self._print_as_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        '''Append the legend, copy the table into the report and write it.'''
        # The table is held in self._ct; the attribute self._t used here
        # before was never assigned, so write() always raised AttributeError.
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        self._report.write(path)
Exemple #3
0
 def __init__(self, header_lines, column_defs, print_as_spaces, verbose=True):
     '''Create the report, write its header lines and build the empty table.

     header_lines is passed to self._header; column_defs and verbose are
     passed to ColumnsTable; print_as_spaces is stored for later use.
     '''
     self._report = Report()
     # the header is emitted before the table is created
     self._header(header_lines)
     self._print_as_spaces = print_as_spaces
     self._ct = ColumnsTable(column_defs, verbose)
Exemple #4
0
 def make_details(data, test_months, n_best, n_worst):
     'return a ColumnTable'
     extra_info = []
     feature_names = Features().ege_names(control.arg.features)
     columns_table = ColumnsTable((
         ('test_month', 6, '%6s', ('test', 'month'), 'test month'),
         ('nth', 2, '%2d', (' ', 'n'), 'rank of feature (1 ==> more frequently included)'),
         ('probability', 4, '%4.1f', (' ', 'prob'), 'probability feature appears in a decision tree'),
         ('feature_name', 40, '%40s', (' ', 'feature name'), 'name of feature'),
         ),
         verbose=True)
     for test_month in test_months:
         value = data[ReductionKey(test_month)]
         if 'feature_importances' not in value.importances:
             # one month has an ensemble model
             # skip that month
             print 'chart a sees an unexpected ensemble model'
             print 'test_month', test_month
             print 'value', value
             print 'value.importance', value.importances
             print 'skipping the test month'
             print 'entering debugger'
             pdb.set_trace()
         importances = value.importances['feature_importances']
         assert value.importances['features_group'] == control.arg.features, value
         model = value.model
         assert type(model) == ResultKeyGbr or type(model) == ResultKeyRfr
         sorted_indices = importances.argsort()  # sorted first lowest, last highest
         for nth_best in xrange(n_best):
             if nth_best == len(feature_names):
                 break
             index = sorted_indices[len(importances) - nth_best - 1]
             columns_table.append_detail(
                 test_month=test_month,
                 nth=nth_best + 1,
                 probability=importances[index] * 100.0,
                 feature_name=feature_names[index]
                 )
             extra_info.append([test_month, nth_best+1, importances[index]*100.0, feature_names[index]])
         for nth in xrange(n_worst):
             break  # skip, for now
             if nth == len(feature_names):
                 break
             nth_worst = n_worst - nth - 1
             index = sorted_indices[nth_worst]
             columns_table.append_detail(
                 test_month=test_month,
                 nth=len(importances) - nth_worst,
                 probability=importances[index] * 100.0,
                 feature_name=feature_names[index]
                 )
         if n_best > 1 or n_worst > 1:
             # insert blank line between test_months if more than 1 row in a month
             columns_table.append_detail()
     columns_table.append_legend()
     return columns_table, extra_info
 def __init__(self, k, validation_month, ensemble_weighting,
              column_definitions, test):
     '''Store the arguments, write the header and create the empty table.

     k, validation_month and ensemble_weighting are passed to self._header;
     column_definitions supplies the definitions of the table's columns;
     test is stored in self._test.
     '''
     self._test = test
     self._report = Report()
     self._column_definitions = column_definitions
     self._header(k, validation_month, ensemble_weighting)
     column_names = (
         'description',
         'mae_validation',
         'mae_query',
         'mare_validation',
         'mare_query',
     )
     self._ct = ColumnsTable(
         columns=self._column_definitions.defs_for_columns(*column_names),
         verbose=True,
     )
Exemple #6
0
 def make_details(data, test_months):
     """Return a ColumnsTable of features in decreasing mean importance.

     The mean importances come from make_mean_importance_by_feature(test_months)
     and are shown multiplied by 100.  The data argument is not used.
     """
     table = ColumnsTable(
         (
             ("mean_prob", 5, "%5.2f", ("mean", "prob"),
              "mean probability feature appears in a decision tree"),
             ("feature_name", 40, "%40s", (" ", "feature name"),
              "name of feature"),
         ),
         verbose=True,
     )
     mean_importance = make_mean_importance_by_feature(test_months)
     most_important_first = sorted(
         mean_importance,
         key=mean_importance.get,
         reverse=True,
     )
     for name in most_important_first:
         percent = mean_importance[name] * 100.0
         table.append_detail(mean_prob=percent, feature_name=name)
     table.append_legend()
     return table
Exemple #7
0
 def __init__(self, validation_month, k, column_definitions, test):
     '''Write the header, store the arguments and create the empty table.

     validation_month and k are passed to self._header; column_definitions
     supplies the definitions of the table's columns; test is stored in
     self._test.
     '''
     self._report = Report()
     # the header is written before the remaining attributes are set
     self._header(validation_month, k)
     self._test = test
     self._column_definitions = column_definitions
     column_names = (
         'median_absolute_error',
         'model',
         'n_months_back',
         'max_depth',
         'n_estimators',
         'max_features',
         'learning_rate',
     )
     self._ct = ColumnsTable(
         columns=self._column_definitions.defs_for_columns(*column_names),
         verbose=True,
     )
Exemple #8
0
class ChartCDReport(object):
    '''Report of median absolute errors by month, with a table of models.

    The title is written on construction; rows are collected in a
    ColumnsTable and copied into the report by write().
    '''

    def __init__(self, column_definitions, test):
        '''Store the arguments, build the empty table and write the title.

        column_definitions supplies the column definitions and the
        replace_by_spaces test; when test is true, write() adds a marker line.
        '''
        self._test = test
        self._column_definitions = column_definitions
        self._report = Report()
        column_names = (
            'validation_month',
            'rank',
            'median_absolute_error',
            'median_price',
            'model',
            'n_months_back',
            'max_depth',
            'n_estimators',
            'max_features',
            'learning_rate',
            'alpha',
            'l1_ratio',
            'units_X',
            'units_y',
        )
        self._ct = ColumnsTable(
            columns=self._column_definitions.defs_for_columns(*column_names),
            verbose=True,
        )
        self._header()

    def _header(self):
        'append the title and a line holding one space to the report'
        add = self._report.append
        add('Median Absolute Error (MAE) by month for best-performing models and their hyperparameters')
        add(' ')

    def append(self, line):
        'append a line directly to the report'
        self._report.append(line)

    def append_detail(self, **kwds):
        '''Append one row to the table; keywords are column names.

        A value is passed as None when
        column_definitions.replace_by_spaces(name, value) is true.
        '''
        row = {}
        for name, value in kwds.iteritems():
            blank = self._column_definitions.replace_by_spaces(name, value)
            row[name] = None if blank else value
        self._ct.append_detail(**row)

    def write(self, path):
        '''Append the legend, copy the table into the report and write it.'''
        table = self._ct
        table.append_legend()
        for line in table.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)
 def __init__(self, k, ensemble_weighting, column_definitions, test):
     '''Store the arguments, write the header and create the empty table.

     k and ensemble_weighting are passed to self._header; column_definitions
     supplies the definitions of the table's columns; test is stored in
     self._test.
     '''
     self._test = test
     self._report = Report()
     self._column_definitions = column_definitions
     self._header(k, ensemble_weighting)
     column_names = (
         'validation_month',
         'mae_index0',
         'mae_ensemble',
         'mae_best_next_month',
         'median_price',
         'fraction_median_price_next_month_index0',
         'fraction_median_price_next_month_ensemble',
         'fraction_median_price_next_month_best',
     )
     self._ct = ColumnsTable(
         columns=self._column_definitions.defs_for_columns(*column_names),
         verbose=True,
     )
Exemple #10
0
    def make_report(title, ordered_cities):
        '''Return a Report: the title, a spacer line, then a table of cities.

        One row is appended per entry of ordered_cities, in that order.  The
        row values are looked up by city in median_prices,
        median_prices_indices, n_trades and n_trades_indices, which are not
        defined in this function.
        '''
        table = ColumnsTable((
            ('city', 30, '%30s', ('', '', '', '', '', 'City'), 'city name'),
            ('median_price', 7, '%7.0f',
             ('', '', '', '', 'median', 'price'),
             'median price in city'),
            ('median_price_index', 7, '%7.2f',
             ('median', 'price', '/', 'overall', 'median', 'price'),
             'median price as fraction of overall median price'),
            ('n_trades', 7, '%7.0f',
             ('', '', '', '', 'number', 'trades'),
             'number of trades across all months'),
            ('n_trades_index', 7, '%7.2f',
             ('number', 'trades', '/ ', 'overall', 'median', 'trades'),
             'median number trades as fraction of overall median number of trades'),
        ))
        for city in ordered_cities:
            table.append_detail(
                city=city,
                median_price=median_prices[city],
                median_price_index=median_prices_indices[city],
                n_trades=n_trades[city],
                n_trades_index=n_trades_indices[city],
            )
        table.append_legend(40)

        report = Report()
        report.append(title)
        report.append(' ')
        for table_line in table.iterlines():
            report.append(table_line)
        return report
Exemple #11
0
    def make_details(data, control):
        '''Return a ColumnsTable with one row per (test month, feature group).

        Rows are read from data[month][features] for every month in
        control.test_months and every group in control.feature_groups; an
        empty row follows each month.  The legend and a description of the
        feature groups are appended after the rows.
        '''
        feature_group_description = (
            ' ',
            'Features groups;',
            's    : only size features',
            'sw   : only size and wealth features',
            'swp  : only size, wealth, and property features',
            'swpn : all features: size, wealth, property, and neighborhood',
        )
        table = ColumnsTable(
            (
                ('month', 6, '%6s', ('', 'month'), 'training month'),
                ('features', 8, '%8s', ('features', 'group'),
                 'group of features'),
                ('model', 5, '%5s', ('best', 'model'),
                 'family of best model'),
                ('mae', 6, '%6.0f', ('', 'mae'),
                 'mae of best model in month using features'),
            ),
            verbose=True,
        )
        for month in control.test_months:
            for features in control.feature_groups:
                best = data[month][features]
                table.append_detail(
                    month=month,
                    features=features,
                    model=best.model,
                    mae=best.mae,
                )
            # an empty row separates each month
            table.append_detail()
        table.append_legend()
        for line in feature_group_description:
            table.append_line(line)

        return table
class ChartEReport(object):
    '''Report on the best models taken separately and as an ensemble.

    Header lines are written on construction; rows are collected in a
    ColumnsTable and copied into the report by write().
    '''

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        '''Store the arguments, write the header and build the empty table.

        k and ensemble_weighting appear in the header; column_definitions
        supplies the column definitions and the replace_by_spaces test; when
        test is true, write() adds a marker line.
        '''
        self._test = test
        self._column_definitions = column_definitions
        self._report = Report()
        self._header(k, ensemble_weighting)
        column_names = (
            'validation_month',
            'model',
            'n_months_back',
            'n_estimators',
            'max_features',
            'max_depth',
            'learning_rate',
            'rank',
            'weight',
            'mae_validation',
            'mae_query',
            'mae_ensemble',
        )
        self._ct = ColumnsTable(
            columns=self._column_definitions.defs_for_columns(*column_names),
            verbose=True,
        )

    def _header(self, k, ensemble_weighting):
        'append the title and the parameter lines to the report'
        add = self._report.append
        add('Performance of Best Models Separately and as an Ensemble')
        add(' ')
        add('Considering Best K = %d models' % k)
        add('Ensemble weighting: %s' % ensemble_weighting)

    def detail_line(self, **kwds):
        '''Append one row to the table; keywords are column names.

        A value is passed as None when
        column_definitions.replace_by_spaces(name, value) is true.
        '''
        row = {}
        for name, value in kwds.iteritems():
            blank = self._column_definitions.replace_by_spaces(name, value)
            row[name] = None if blank else value
        self._ct.append_detail(**row)

    def write(self, path):
        '''Append the legend, copy the table into the report and write it.'''
        table = self._ct
        table.append_legend()
        for line in table.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)
 def __init__(self, k, ensemble_weighting, column_definitions, test):
     '''Store the arguments, write the header and create the empty table.

     k and ensemble_weighting are passed to self._header; column_definitions
     supplies the definitions of the table's columns; test is stored in
     self._test.
     '''
     self._test = test
     self._report = Report()
     self._column_definitions = column_definitions
     self._header(k, ensemble_weighting)
     column_names = (
         'validation_month',
         'model',
         'n_months_back',
         'n_estimators',
         'max_features',
         'max_depth',
         'learning_rate',
         'rank',
         'weight',
         'mae_validation',
         'mae_query',
         'mae_ensemble',
     )
     self._ct = ColumnsTable(
         columns=self._column_definitions.defs_for_columns(*column_names),
         verbose=True,
     )
Exemple #14
0
class ChartBReport(object):
    '''Report of the MAE of the k best models for one validation month.

    Header lines are written on construction; rows are collected in a
    ColumnsTable and copied into the report by write().
    '''

    def __init__(self, validation_month, k, column_definitions, test):
        '''Write the header, store the arguments and build the empty table.

        validation_month and k appear in the header; column_definitions
        supplies the column definitions and the replace_by_spaces test; when
        test is true, write() adds a marker line.
        '''
        self._report = Report()
        self._header(validation_month, k)
        self._test = test
        self._column_definitions = column_definitions
        column_names = (
            'median_absolute_error',
            'model',
            'n_months_back',
            'max_depth',
            'n_estimators',
            'max_features',
            'learning_rate',
        )
        self._ct = ColumnsTable(
            columns=self._column_definitions.defs_for_columns(*column_names),
            verbose=True,
        )

    def _header(self, validation_month, k):
        'append the title, the validation month and a spacer to the report'
        add = self._report.append
        add('MAE for %d best-performing models and their hyperparameters' % k)
        add('Validation month: %s' % validation_month)
        add(' ')

    def append_detail(self, **kwds):
        '''Append one row to the table; keywords are column names.

        A value is passed as None when
        column_definitions.replace_by_spaces(name, value) is true.
        '''
        row = {}
        for name, value in kwds.iteritems():
            blank = self._column_definitions.replace_by_spaces(name, value)
            row[name] = None if blank else value
        self._ct.append_detail(**row)

    def write(self, path):
        '''Append the legend, copy the table into the report and write it.'''
        table = self._ct
        table.append_legend()
        for line in table.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('**TESTING: DISCARD')
        self._report.write(path)
class ChartFReport(object):
    '''Report comparing the ensemble with the best model that knows the future.

    Header lines are written on construction; rows are collected in a
    ColumnsTable and copied into the report by write().
    '''

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        '''Store the arguments, write the header and build the empty table.

        k and ensemble_weighting appear in the header; column_definitions
        supplies the column definitions and the replace_by_spaces test; when
        test is true, write() adds a marker line.
        '''
        self._test = test
        self._column_definitions = column_definitions
        self._report = Report()
        self._header(k, ensemble_weighting)
        column_names = (
            'validation_month',
            'mae_index0',
            'mae_ensemble',
            'mae_best_next_month',
            'median_price',
            'fraction_median_price_next_month_index0',
            'fraction_median_price_next_month_ensemble',
            'fraction_median_price_next_month_best',
        )
        self._ct = ColumnsTable(
            columns=self._column_definitions.defs_for_columns(*column_names),
            verbose=True,
        )

    def _header(self, k, ensemble_weighting):
        'append the title and the parameter lines to the report'
        add = self._report.append
        add('Comparison of Errors of Ensemble and Best Model That Know the Future')
        add(' ')
        add('Considering Best K = %d models' % k)
        add('Ensemble weighting: %s' % ensemble_weighting)

    def detail_line(self, **kwds):
        '''Append one row to the table; keywords are column names.

        A value is passed as None when
        column_definitions.replace_by_spaces(name, value) is true.
        '''
        row = {}
        for name, value in kwds.iteritems():
            blank = self._column_definitions.replace_by_spaces(name, value)
            row[name] = None if blank else value
        self._ct.append_detail(**row)

    def write(self, path):
        '''Append the legend, copy the table into the report and write it.'''
        table = self._ct
        table.append_legend()
        for line in table.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)
Exemple #16
0
 def make_details(data, test_months, n_best, n_worst):
     """Return a ColumnsTable listing the top features for each test month.

     For each test month, up to n_best features are appended in decreasing
     order of importance, with the importance shown multiplied by 100.  The
     loop for the n_worst features is currently disabled by its leading break.
     """
     names = Features().ege_names(control.arg.features)
     column_defs = (
         ("test_month", 6, "%6s", ("test", "month"), "test month"),
         ("nth", 2, "%2d", (" ", "n"),
          "rank of feature (1 ==> more frequently included)"),
         ("probability", 4, "%4.1f", (" ", "prob"),
          "probability feature appears in a decision tree"),
         ("feature_name", 40, "%40s", (" ", "feature name"),
          "name of feature"),
     )
     table = ColumnsTable(column_defs, verbose=True)
     for test_month in test_months:
         reduction = data[ReductionKey(test_month)]
         scores = reduction.importances["feature_importances"]
         assert reduction.importances["features_group"] == control.arg.features, reduction
         model_type = type(reduction.model)
         assert model_type == ResultKeyGbr or model_type == ResultKeyRfr
         ascending = scores.argsort()  # lowest importance first, highest last
         for offset in xrange(n_best):
             if offset == len(names):
                 break  # no more feature names to show
             index = ascending[len(scores) - offset - 1]
             table.append_detail(
                 test_month=test_month,
                 nth=offset + 1,
                 probability=scores[index] * 100.0,
                 feature_name=names[index],
             )
         for offset in xrange(n_worst):
             break  # skip, for now
             if offset == len(names):
                 break
             nth_worst = n_worst - offset - 1
             index = ascending[nth_worst]
             table.append_detail(
                 test_month=test_month,
                 nth=len(scores) - nth_worst,
                 probability=scores[index] * 100.0,
                 feature_name=names[index],
             )
         if n_best > 1 or n_worst > 1:
             # a blank row separates the months when a month can have several rows
             table.append_detail()
     table.append_legend()
     return table
Exemple #17
0
 def __init__(self, column_definitions, test):
     '''Store the arguments, create the empty table, then write the header.

     column_definitions supplies the definitions of the table's columns;
     test is stored in self._test.
     '''
     self._test = test
     self._column_definitions = column_definitions
     self._report = Report()
     column_names = (
         'validation_month',
         'rank',
         'median_absolute_error',
         'median_price',
         'model',
         'n_months_back',
         'max_depth',
         'n_estimators',
         'max_features',
         'learning_rate',
         'alpha',
         'l1_ratio',
         'units_X',
         'units_y',
     )
     self._ct = ColumnsTable(
         columns=self._column_definitions.defs_for_columns(*column_names),
         verbose=True,
     )
     # the header is written last, once the table exists
     self._header()
Exemple #18
0
def make_table_stats(data, control, in_report_p):
    '''Return a Report of monthly price statistics for the selected months.

    Months run from January 2003 through March 2009; a month gets a row only
    when in_report_p(year, month) is true.  Each ratio column divides the
    month's value by that of the previously reported month and is None for
    the first reported month.  The control argument is not used.
    '''
    report = Report()
    report.append('Prices by Month')
    report.append('')
    table = ColumnsTable((
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'),
         'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'),
         'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'),
         'number of trades in the month'),
    ))

    previous_mean = None
    previous_median = None
    for year in (2003, 2004, 2005, 2006, 2007, 2008, 2009):
        # 2009 stops after March
        final_month = 3 if year == 2009 else 12
        for month in range(1, final_month + 1):
            if not in_report_p(year, month):
                continue
            prices = data[data.month == Month(year, month)].price
            mean_price = prices.mean()
            median_price = prices.median()
            number_trades = len(prices)
            if previous_mean is None:
                mean_ratio = None
            else:
                mean_ratio = mean_price / previous_mean
            if previous_median is None:
                median_ratio = None
            else:
                median_ratio = median_price / previous_median
            table.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=mean_ratio,
                median_price_ratio=median_ratio,
                number_trades=number_trades,
            )
            previous_mean = mean_price
            previous_median = median_price
    table.append_legend()
    for table_line in table.iterlines():
        report.append(table_line)
    return report
Exemple #19
0
    def make_details(data, control):
        '''Return (ColumnsTable, my_info) with one row per month and group.

        Rows are read from data[month][features] for every month in
        control.test_months and every group in control.feature_groups; an
        empty table row follows each month.  my_info holds the same rows as
        [month, features, model, mae] lists.  The legend and a description of
        the feature groups are appended to the table after the rows.
        '''
        feature_group_description = (
            ' ',
            'Features groups;',
            's    : only size features',
            'sw   : only size and wealth features',
            'swp  : only size, wealth, and property features',
            'swpn : all features: size, wealth, property, and neighborhood',
        )
        column_defs = (
            ('month', 6, '%6s', ('', 'month'), 'training month'),
            ('features', 8, '%8s', ('features', 'group'),
             'group of features'),
            ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
            ('mae', 6, '%6.0f', ('', 'mae'),
             'mae of best model in month using features'),
        )
        table = ColumnsTable(column_defs, verbose=True)
        my_info = []
        for month in control.test_months:
            for features in control.feature_groups:
                best = data[month][features]
                model = best.model
                mae = best.mae
                table.append_detail(
                    month=month,
                    features=features,
                    model=model,
                    mae=mae,
                )
                my_info.append([month, features, model, mae])

            # an empty row separates each month
            table.append_detail()
        table.append_legend()
        for line in feature_group_description:
            table.append_line(line)

        return table, my_info
Exemple #20
0
 def make_column_table(df):
     '''Return a ColumnsTable with one row per row of df.

     The 'city', 'count' and 'median_price' entries of each row yielded by
     df.iterrows() are copied into the table, then the legend is appended.
     '''
     column_defs = (
         ('city', 30, '%30s', ('', 'city'), 'city in Los Angeles Country'),
         ('count', 6, '%6d', (' ', 'count'),
          'number of transactions in 2007'),
         ('median_price', 7, '%7.0f', ('median', 'price'), 'median price'),
     )
     table = ColumnsTable(columns=column_defs)
     for _, row in df.iterrows():
         city = row['city']
         count = row['count']
         median_price = row['median_price']
         table.append_detail(
             city=city,
             count=count,
             median_price=median_price,
         )
     table.append_legend()
     return table
class ChartHReport(object):
    '''Report on the best models taken separately and as an ensemble.

    Header lines are written on construction.  Rows and preformatted lines
    are collected in a ColumnsTable and copied into the report by write().
    '''

    def __init__(self, k, validation_month, ensemble_weighting,
                 column_definitions, test):
        '''Store the arguments, write the header and build the empty table.

        k, validation_month and ensemble_weighting appear in the header;
        column_definitions supplies the column definitions and the
        replace_by_spaces test; when test is true, write() adds a marker line.
        '''
        self._column_definitions = column_definitions
        self._report = Report()
        self._test = test
        self._header(k, validation_month, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'description',
            'mae_validation',
            'mae_query',
            'mare_validation',
            'mare_query',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        '''Append the legend, copy the table into the report and write it.'''
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        '''Append one row to the table; keywords are column names.

        A value is passed as None when
        column_definitions.replace_by_spaces(name, value) is true.
        '''
        # items() exists on both Python 2 and Python 3 (iteritems does not)
        with_spaces = {
            k:
            (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def preformatted_line(self, line):
        '''Print the line and append it unchanged to the table.'''
        # The call form with a single argument prints the same text on
        # Python 2 and keeps the module importable on Python 3.
        print(line)
        self._ct.append_line(line)

    def _header(self, k, validation_month, ensemble_weighting):
        'append the title and the parameter lines to the report'
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('For validation month %s' % validation_month)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
Exemple #22
0
 def make_details(data, test_months):
     '''Return (ColumnsTable, feature names, probabilities).

     The table lists every feature in decreasing order of its mean importance
     from make_mean_importance_by_feature(test_months), shown multiplied by
     100.  The two returned lists hold, in the same order, only the features
     whose scaled importance is at least 1.  The data argument is not used.
     '''
     column_defs = (
         ('mean_prob', 5, '%5.2f', ('mean', 'prob'),
          'mean probability feature appears in a decision tree'),
         ('feature_name', 40, '%40s', (' ', 'feature name'),
          'name of feature'),
     )
     table = ColumnsTable(column_defs, verbose=True)
     kept_probabilities = []
     kept_names = []
     mean_importance = make_mean_importance_by_feature(test_months)
     most_important_first = sorted(
         mean_importance,
         key=mean_importance.get,
         reverse=True,
     )
     for name in most_important_first:
         percent = mean_importance[name] * 100.0
         table.append_detail(mean_prob=percent, feature_name=name)
         # written as ">= 1" so a NaN importance is never kept
         if percent >= 1:
             kept_probabilities.append(percent)
             kept_names.append(name)
     table.append_legend()
     return table, kept_names, kept_probabilities
Exemple #23
0
def make_chart_stats(data, control, filter_f):
    '''Return a Report of monthly price statistics for the selected months.

    Every month of 2003 through 2009 is considered; a month gets a row only
    when filter_f(year, month) is true.  The statistics are read from
    data[make_reduction_key(year, month)] under the keys 'mean', 'median' and
    'count'.  Each ratio column divides the month's value by that of the
    previously reported month and is None for the first reported month.  The
    control argument is not used.
    '''
    report = Report()
    report.append('Prices by Month')
    report.append('')
    column_defs = (
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'),
         'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'),
         'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'),
         'number of trades in the month'),
    )
    table = ColumnsTable(column_defs)

    previous_mean = None
    previous_median = None
    for year in xrange(2003, 2010):
        for month in xrange(1, 13):
            if not filter_f(year, month):
                continue
            stats = data[make_reduction_key(year, month)]
            mean_price = stats['mean']
            median_price = stats['median']
            number_trades = stats['count']
            if previous_mean is None:
                mean_ratio = None
            else:
                mean_ratio = mean_price / previous_mean
            if previous_median is None:
                median_ratio = None
            else:
                median_ratio = median_price / previous_median
            table.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=mean_ratio,
                median_price_ratio=median_ratio,
                number_trades=number_trades,
            )
            previous_mean = mean_price
            previous_median = median_price
    table.append_legend()
    for table_line in table.iterlines():
        report.append(table_line)
    return report