class ReportWithColumnsTable(object):
    'a Report made of header lines followed by the lines of a ColumnsTable'

    def __init__(self, header_lines, column_defs, print_as_spaces, verbose=True):
        '''start the report with header_lines and create an empty table

        header_lines: iterable of lines appended to the report immediately
        column_defs: passed through to ColumnsTable
        print_as_spaces: callable (name, value) -> truthy when the value
            should be handed to the table as None
        verbose: passed through to ColumnsTable
        '''
        self._report = Report()
        self._header(header_lines)
        self._ct = ColumnsTable(column_defs, verbose)
        self._print_as_spaces = print_as_spaces

    def _header(self, header_lines):
        'append each header line to the report'
        for line in header_lines:
            self._report.append(line)

    def append_detail(self, **kwds):
        'append one detail row; values flagged by print_as_spaces become None'
        # replace NaN with None
        # items() (not iteritems()) so this runs under both Python 2 and 3
        with_spaces = {
            k: (None if self._print_as_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        'append the legend and all table lines to the report, then write it to path'
        # the table is stored in self._ct; the original read self._t, which
        # is never assigned and raised AttributeError
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        self._report.write(path)
def make_report(n_best, n_worst):
    '''return a Report holding the header followed by the detail lines

    data, control, make_header, and make_details are not parameters; they
    are resolved from the surrounding scope.
    '''
    result = Report()
    make_header(result)
    table = make_details(data, control.test_months, n_best, n_worst)
    add_line = result.append
    for text in table.iterlines():
        add_line(text)
    return result
def make_report(n_best, n_worst):
    '''return a Report with header and detail lines; also call make_plt

    data, control, make_header, make_details, and make_plt are resolved
    from the surrounding scope. make_details returns a pair: the object
    whose lines are reported and extra information forwarded to make_plt.
    '''
    result = Report()
    make_header(result)
    made = make_details(data, control.test_months, n_best, n_worst)
    table, plot_info = made
    for text in table.iterlines():
        result.append(text)
    # the plot is produced only after every detail line has been appended
    make_plt(data, plot_info, n_best, n_worst)
    return result
def make_report(data, cities, sorted_by_tag):
    '''return a Report: four heading lines, then the lines of the column table

    The heading names the sort order (sorted_by_tag) and the minimum and
    maximum of data.date; the table comes from make_column_table(cities, data).
    '''
    report = Report()
    emit = report.append
    emit('Price Statistics by City')
    emit('Sorted by %s' % sorted_by_tag)
    first_date = data.date.min()
    last_date = data.date.max()
    emit('Transactions from %s to %s' % (first_date, last_date))
    emit(' ')
    table = make_column_table(cities, data)
    for text in table.iterlines():
        emit(text)
    return report
def make_report(summary):
    '''return a Report with a heading line and one line per row of summary

    summary must provide iterrows() yielding (row name, row) pairs, where
    each row is indexable by 'min', '50%', 'mean', 'max', 'number_distinct',
    'number_nan', and 'std'.
    '''
    report = Report()
    header_format = '%40s %8s %8s %8s %8s %8s %8s %8s'
    detail_format = '%40s %8.0f %8.0f %8.0f %8.0f %8d %8d %8.0f'
    headings = ('numeric feature', 'min', 'median', 'mean', 'max', 'distinct', 'NaN', 'std')
    report.append(header_format % headings)
    # row keys, in the order their values are printed
    statistic_keys = ('min', '50%', 'mean', 'max', 'number_distinct', 'number_nan', 'std')
    for feature, stats in summary.iterrows():
        values = (feature,) + tuple(stats[key] for key in statistic_keys)
        report.append(detail_format % values)
    return report
class ChartCDReport(object):
    'report of median absolute error by month for the best models and their hyperparameters'

    def __init__(self, column_definitions, test):
        '''create the report, its columns table, and the header lines

        column_definitions: object providing defs_for_columns(*names) and
            replace_by_spaces(name, value)
        test: when truthy, write() appends a line marking the output as test output
        '''
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        cd = self._column_definitions.defs_for_columns(
            'validation_month', 'rank', 'median_absolute_error', 'median_price',
            'model', 'n_months_back', 'max_depth', 'n_estimators',
            'max_features', 'learning_rate', 'alpha', 'l1_ratio',
            'units_X', 'units_y',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)
        self._header()

    def append(self, line):
        'append a free-form line directly to the report'
        self._report.append(line)

    def write(self, path):
        'append the legend and table lines to the report, then write it to path'
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def _header(self):
        'append the title and a blank line to the report'
        self._report.append(
            'Median Absolute Error (MAE) by month for best-performing models and their hyperparameters'
        )
        self._report.append(' ')

    def append_detail(self, **kwds):
        'append one row to the table; values flagged by replace_by_spaces become None'
        # items() (not iteritems()) so this runs under both Python 2 and 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)
def make_report(summary):
    '''return a Report summarizing numeric features, one line per feature

    The first line is a column heading. Each (name, row) pair produced by
    summary.iterrows() then becomes one formatted line built from the row's
    'min', '50%', 'mean', 'max', 'number_distinct', 'number_nan', and 'std'
    entries.
    '''
    def heading_line():
        'return the formatted column heading'
        return '%40s %8s %8s %8s %8s %8s %8s %8s' % (
            'numeric feature', 'min', 'median', 'mean', 'max', 'distinct', 'NaN', 'std',
        )

    def detail_line(name, row):
        'return the formatted statistics for one feature'
        return '%40s %8.0f %8.0f %8.0f %8.0f %8d %8d %8.0f' % (
            name,
            row['min'],
            row['50%'],
            row['mean'],
            row['max'],
            row['number_distinct'],
            row['number_nan'],
            row['std'],
        )

    out = Report()
    out.append(heading_line())
    for name, row in summary.iterrows():
        out.append(detail_line(name, row))
    return out
def _make_report(self, counters):
    '''return a Report listing, for each path, every counter tag and its value

    counters: dict-like keyed by path; each value is itself dict-like,
        keyed by tag, with values that are formatted using %d
    '''
    report = Report()
    report.append('Records retained while reducing input file')
    # items() (not iteritems()) so this runs under both Python 2 and 3
    for path, counter in counters.items():
        report.append(' ')
        report.append('path %s' % path)
        for tag, value in counter.items():
            report.append('%30s: %d' % (tag, value))
    return report
def make_chart_a(control, data):
    '''return a Report of the best model's MAE per test month and feature group

    control: provides test_months and feature_groups (both iterated)
    data: indexed as data[month][features]; each entry has .model and .mae
    '''
    heading_lines = (
        'Median Absolute Errors for Most Accurate Models',
        'By Month',
        'By Feature Group',
        ' ',
    )
    feature_group_lines = (
        ' ',
        'Features groups;',
        's : only size features',
        'sw : only size and wealth features',
        'swp : only size, wealth, and property features',
        'swpn : all features: size, wealth, property, and neighborhood',
    )
    column_definitions = (
        ('month', 6, '%6s', ('', 'month'), 'training month'),
        ('features', 8, '%8s', ('features', 'group'), 'group of features'),
        ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
        ('mae', 6, '%6.0f', ('', 'mae'), 'mae of best model in month using features'),
    )

    def build_table():
        'return a ColumnsTable with the details, the legend, and the feature-group notes'
        table = ColumnsTable(column_definitions, verbose=True)
        for month in control.test_months:
            for features in control.feature_groups:
                best = data[month][features]
                table.append_detail(
                    month=month,
                    features=features,
                    model=best.model,
                    mae=best.mae,
                )
            # an empty detail acts as a blank line between months
            table.append_detail()
        table.append_legend()
        for text in feature_group_lines:
            table.append_line(text)
        return table

    result = Report()
    for text in heading_lines:
        result.append(text)
    for text in build_table().iterlines():
        result.append(text)
    return result
def make_chart_b(control, data):
    """return a Report ranking features by their mean importance across test months

    control: provides arg.features and test_months
    data: indexed by ReductionKey(test_month); each entry exposes
        importances["feature_importances"], indexed by feature position
    """
    heading_lines = (
        "Mean Probability of a Feature Being Included in a Decision Tree",
        "Across the Entire Ensemble of Decisions Trees",
        "For Most Accurate Model in Each Training Month",
        " ",
    )

    def mean_importance_by_feature(test_months):
        "return dict[feature_name] = float, the mean importance of the feature"
        names = Features().ege_names(control.arg.features)
        n_months = len(test_months)
        means = {}
        for position, name in enumerate(names):
            # one slot per test month, filled in below
            per_month = np.zeros(n_months)
            for slot, test_month in enumerate(test_months):
                reduction = data[ReductionKey(test_month)]
                importances = reduction.importances["feature_importances"]
                per_month[slot] = importances[position]
            means[name] = np.mean(per_month)
        return means

    def build_table(test_months):
        "return a ColumnsTable with features in decreasing order of mean importance"
        table = ColumnsTable(
            (
                ("mean_prob", 5, "%5.2f", ("mean", "prob"), "mean probability feature appears in a decision tree"),
                ("feature_name", 40, "%40s", (" ", "feature name"), "name of feature"),
            ),
            verbose=True,
        )
        means = mean_importance_by_feature(test_months)
        ranked = sorted(means, key=means.get, reverse=True)
        for name in ranked:
            # reported as a percentage
            table.append_detail(mean_prob=means[name] * 100.0, feature_name=name)
        table.append_legend()
        return table

    result = Report()
    for text in heading_lines:
        result.append(text)
    table = build_table(control.test_months)
    for text in table.iterlines():
        result.append(text)
    return result
def make_report(title, ordered_cities):
    '''return a Report: title, blank line, then one table row per city

    Rows appear in the order given by ordered_cities. The per-city values
    come from median_prices, median_prices_indices, n_trades, and
    n_trades_indices, which are resolved from the surrounding scope and
    indexed by city.
    '''
    column_definitions = (
        ('city', 30, '%30s', ('', '', '', '', '', 'City'), 'city name'),
        ('median_price', 7, '%7.0f', ('', '', '', '', 'median', 'price'), 'median price in city'),
        ('median_price_index', 7, '%7.2f',
         ('median', 'price', '/', 'overall', 'median', 'price'),
         'median price as fraction of overall median price'),
        ('n_trades', 7, '%7.0f', ('', '', '', '', 'number', 'trades'), 'number of trades across all months'),
        ('n_trades_index', 7, '%7.2f',
         ('number', 'trades', '/ ', 'overall', 'median', 'trades'),
         'median number trades as fraction of overall median number of trades'),
    )
    table = ColumnsTable(column_definitions)
    for city in ordered_cities:
        table.append_detail(
            city=city,
            median_price=median_prices[city],
            median_price_index=median_prices_indices[city],
            n_trades=n_trades[city],
            n_trades_index=n_trades_indices[city],
        )
    table.append_legend(40)

    report = Report()
    report.append(title)
    report.append(' ')
    for text in table.iterlines():
        report.append(text)
    return report
def make_table_stats(data, control, in_report_p):
    '''return Report with statistics for years and months that obey the filter

    data: supports data.month == Month(...) and data[selector].price
    control: not used by this function
    in_report_p: callable (year, month) -> truthy when the month is reported

    Each ratio compares a month with the previously reported month; the
    first reported month has no ratio (None).
    '''
    def candidate_periods():
        'yield (year, month) for every month of 2003 through 2008, then 2009 months 1 through 3'
        for candidate_year in range(2003, 2010):
            final_month = 3 if candidate_year == 2009 else 12
            for candidate_month in range(1, final_month + 1):
                yield candidate_year, candidate_month

    report = Report()
    report.append('Prices by Month')
    report.append('')
    table = ColumnsTable((
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'), 'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'), 'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'), 'number of trades in the month'),
    ))

    previous_mean = None
    previous_median = None
    for year, month in candidate_periods():
        if not in_report_p(year, month):
            continue
        in_month = data.month == Month(year, month)
        prices = data[in_month].price
        mean_price = prices.mean()
        median_price = prices.median()
        if previous_mean is None:
            mean_ratio = None
        else:
            mean_ratio = mean_price / previous_mean
        if previous_median is None:
            median_ratio = None
        else:
            median_ratio = median_price / previous_median
        table.append_detail(
            year=year,
            month=month,
            mean_price=mean_price,
            median_price=median_price,
            mean_price_ratio=mean_ratio,
            median_price_ratio=median_ratio,
            number_trades=len(prices),
        )
        previous_mean = mean_price
        previous_median = median_price

    table.append_legend()
    for text in table.iterlines():
        report.append(text)
    return report
class ChartBReport(object):
    'report of the MAE of the k best models for one validation month'

    def __init__(self, validation_month, k, column_definitions, test):
        '''create the report with its header lines and an empty columns table

        validation_month: shown in the header
        k: number of best models, shown in the header (formatted with %d)
        column_definitions: object providing defs_for_columns(*names) and
            replace_by_spaces(name, value)
        test: when truthy, write() appends a line marking the output as test output
        '''
        self._report = Report()
        self._header(validation_month, k)
        self._column_definitions = column_definitions
        self._test = test
        cd = self._column_definitions.defs_for_columns(
            'median_absolute_error', 'model', 'n_months_back', 'max_depth',
            'n_estimators', 'max_features', 'learning_rate',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def _header(self, validation_month, k):
        'append the title, the validation month, and a blank line to the report'
        def a(line):
            self._report.append(line)

        a('MAE for %d best-performing models and their hyperparameters' % k)
        a('Validation month: %s' % validation_month)
        a(' ')

    def append_detail(self, **kwds):
        'append one row to the table; values flagged by replace_by_spaces become None'
        # replace NaN with None
        # items() (not iteritems()) so this runs under both Python 2 and 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def write(self, path):
        'append the legend and table lines to the report, then write it to path'
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('**TESTING: DISCARD')
        self._report.write(path)
def make_chart_stats(data, control, filter_f):
    '''return Report with statistics for years and months that obey the filter

    data: indexed by make_reduction_key(year, month); each value is indexed
        by 'mean', 'median', and 'count'
    control: not used by this function
    filter_f: callable (year, month) -> truthy when the month is reported

    Every month of 2003 through 2009 is offered to filter_f. Each ratio
    compares a month with the previously reported month; the first reported
    month has no ratio (None).
    '''
    r = Report()
    r.append('Prices by Month')
    r.append('')
    ct = ColumnsTable((
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'), 'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'), 'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'), 'number of trades in the month'),
    ))
    prior_mean_price = None
    prior_median_price = None
    # range() (not xrange()) so this runs under both Python 2 and 3
    for year in range(2003, 2010):
        for month in range(1, 13):
            if not filter_f(year, month):
                continue
            value = data[make_reduction_key(year, month)]
            mean_price = value['mean']
            median_price = value['median']
            number_trades = value['count']
            ct.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=None if prior_mean_price is None else mean_price / prior_mean_price,
                median_price_ratio=None if prior_median_price is None else median_price / prior_median_price,
                number_trades=number_trades,
            )
            prior_mean_price = mean_price
            prior_median_price = median_price
    ct.append_legend()
    for line in ct.iterlines():
        r.append(line)
    return r
def make_chart_a(control, data):
    '''return a Report of the best model's MAE per month and feature group; also write two figures

    control: provides test_months, feature_groups, path_out_plt_a_mae_all,
        and path_out_plt_a_mae_2007
    data: indexed as data[month][features]; each entry has .model and .mae

    The plotting code groups the detail rows four at a time and labels the
    bars 's', 'sw', 'swp', 'swpn', so it assumes control.feature_groups holds
    those four groups in that order.
    '''
    def make_header(report):
        'append the title lines to the report'
        report.append('Median Absolute Errors for Most Accurate Models')
        report.append('By Month')
        report.append('By Feature Group')
        report.append(' ')

    def make_details(data, control):
        'return (ColumnsTable, list of [month, features, model, mae] rows)'
        def append_feature_group_description(ct):
            ct.append_line(' ')
            ct.append_line('Features groups;')
            ct.append_line('s : only size features')
            ct.append_line('sw : only size and wealth features')
            ct.append_line('swp : only size, wealth, and property features')
            ct.append_line(
                'swpn : all features: size, wealth, property, and neighborhood'
            )

        ct = ColumnsTable(
            (
                ('month', 6, '%6s', ('', 'month'), 'training month'),
                ('features', 8, '%8s', ('features', 'group'), 'group of features'),
                ('model', 5, '%5s', ('best', 'model'), 'family of best model'),
                ('mae', 6, '%6.0f', ('', 'mae'), 'mae of best model in month using features'),
            ),
            verbose=True,
        )
        my_info = []
        for month in control.test_months:
            for features in control.feature_groups:
                mae_model = data[month][features]
                ct.append_detail(
                    month=month,
                    features=features,
                    model=mae_model.model,
                    mae=mae_model.mae,
                )
                my_info.append(
                    [month, features, mae_model.model, mae_model.mae])
            ct.append_detail()  # blank line separates each month
        ct.append_legend()
        append_feature_group_description(ct)
        return ct, my_info

    def make_plots(info):
        'write the all-months figure and the 12-month figure from the detail rows'
        # regroup the flat rows into one chunk of four rows per month
        # range() (not xrange()) so this runs under both Python 2 and 3
        info = [info[i:i + 4] for i in range(0, len(info), 4)]

        def make_subplot1(validation_month, data):
            'draw one month as a bar chart with y tick labels and medium x labels'
            y = [data[k][3] for k in (0, 1, 2, 3)]
            plt.title(validation_month)
            plt.bar([1, 2, 3, 4], y)
            # the reduction is sorted by increasing mae
            plt.yticks(size='xx-small')
            plt.ylim(0, 140000)
            plt.xticks([1.2, 2.2, 3.2, 4.6],
                       ['s', 'sw', 'swp', 'swpn'],
                       size='medium')
            # no ticks on x axis
            return

        def make_subplot2(validation_month, data):
            'draw one month as a bar chart without y ticks and with small rotated x labels'
            y = [data[k][3] for k in (0, 1, 2, 3)]
            plt.title(validation_month)
            plt.bar([1, 2, 3, 4], y)
            # the reduction is sorted by increasing mae
            plt.yticks([])
            plt.xticks([1.4, 2.4, 3.4, 4.4],
                       ['s', 'sw', 'swp', 'swpn'],
                       rotation=-70,
                       size='xx-small')
            # no ticks on x axis
            plt.ylim(0, 140000)
            return

        def make_figures(path, data, kind):
            '''write a grid of per-month bar charts to path

            kind 'maeall': 6 x 6 grid using chunks 0 through 35
            kind 'mae2007': 3 x 4 grid using chunks 12 through 23
            '''
            if kind == 'maeall':
                rows, cols, first_chunk, draw = 6, 6, 0, make_subplot2
            elif kind == 'mae2007':
                rows, cols, first_chunk, draw = 3, 4, 12, make_subplot1
            else:
                # previously an unknown kind failed later on an unbound local
                raise ValueError('unknown kind of figure: %s' % kind)

            plt.figure()  # new figure
            axes_number = 0
            for row in range(1, rows + 1):
                for col in range(1, cols + 1):
                    month_rows = data[first_chunk + axes_number]
                    validation_month = month_rows[0][0]
                    axes_number += 1  # count across rows
                    plt.subplot(rows, cols, axes_number)
                    draw(validation_month, month_rows)
                    # annotate the first plot of the bottom row only
                    if row == rows and col == 1:
                        plt.xlabel('features')
                        plt.ylabel('mae ($)')
            if kind == 'mae2007':
                plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
            else:
                plt.tight_layout(pad=0.1, w_pad=0.1, h_pad=0.1)
            plt.savefig(path)
            plt.close()

        make_figures(control.path_out_plt_a_mae_all, info, 'maeall')
        make_figures(control.path_out_plt_a_mae_2007, info, 'mae2007')

    report = Report()
    make_header(report)
    his_details, my_info = make_details(data, control)
    make_plots(my_info)
    for line in his_details.iterlines():
        report.append(line)
    return report
class ChartFReport(object):
    'report comparing errors of the ensemble and of the best model that knows the future'

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        '''create the report with its header lines and an empty columns table

        k: number of best models, shown in the header (formatted with %d)
        ensemble_weighting: shown in the header
        column_definitions: object providing defs_for_columns(*names) and
            replace_by_spaces(name, value)
        test: when truthy, write() appends a line marking the output as test output
        '''
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        self._header(k, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'validation_month', 'mae_index0', 'mae_ensemble',
            'mae_best_next_month', 'median_price',
            'fraction_median_price_next_month_index0',
            'fraction_median_price_next_month_ensemble',
            'fraction_median_price_next_month_best',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        'append the legend and table lines to the report, then write it to path'
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        'append one row to the table; values flagged by replace_by_spaces become None'
        # items() (not iteritems()) so this runs under both Python 2 and 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def _header(self, k, ensemble_weighting):
        'append the title, a blank line, k, and the ensemble weighting to the report'
        self._report.append(
            'Comparison of Errors of Ensemble and Best Model That Know the Future'
        )
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
def make_report(ct):
    'return a Report: a title line followed by every line from ct.iterlines()'
    result = Report()
    add_line = result.append
    add_line('Count and Median Price of Transactions in 2007 by City')
    for text in ct.iterlines():
        add_line(text)
    return result
class ChartGReport():
    'report with one line per value of hyperparameter K and its MARR'

    def __init__(self):
        'create the report, set the line formats, and append the header lines'
        self.report = Report()
        self.format_header = '%4s %7s'
        self.format_detail = '%4d %6.3f%%'
        self._header()

    def _header(self):
        'append the title, a blank line, and the column headings'
        headings = self.format_header % ('K', 'MARR')
        for text in ('Hyperparameter K', ' ', headings):
            self.report.append(text)

    def append(self, line):
        'append a free-form line to the report'
        self.report.append(line)

    def detail(self, k, marr):
        'append one detail line; marr is multiplied by 100 and shown with a percent sign'
        percent = marr * 100.0
        self.report.append(self.format_detail % (k, percent))

    def write(self, path):
        'append the legend lines, then write the report to path'
        legend_lines = (
            'Legend:',
            'K: number of models in ensemble',
            'MARR: Median Absolute Relative Regret',
        )
        for text in legend_lines:
            self.report.append(text)
        self.report.write(path)
class ChartEReport(object):
    'report of the performance of the best models separately and as an ensemble'

    def __init__(self, k, ensemble_weighting, column_definitions, test):
        '''create the report with its header lines and an empty columns table

        k: number of best models, shown in the header (formatted with %d)
        ensemble_weighting: shown in the header
        column_definitions: object providing defs_for_columns(*names) and
            replace_by_spaces(name, value)
        test: when truthy, write() appends a line marking the output as test output
        '''
        self._column_definitions = column_definitions
        self._test = test
        self._report = Report()
        self._header(k, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'validation_month', 'model', 'n_months_back', 'n_estimators',
            'max_features', 'max_depth', 'learning_rate', 'rank', 'weight',
            'mae_validation', 'mae_query', 'mae_ensemble',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        'append the legend and table lines to the report, then write it to path'
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        'append one row to the table; values flagged by replace_by_spaces become None'
        # items() (not iteritems()) so this runs under both Python 2 and 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def _header(self, k, ensemble_weighting):
        'append the title, a blank line, k, and the ensemble weighting to the report'
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
class ChartHReport(object):
    'report of the performance of the best models and the ensemble for one validation month'

    def __init__(self, k, validation_month, ensemble_weighting, column_definitions, test):
        '''create the report with its header lines and an empty columns table

        k: number of best models, shown in the header (formatted with %d)
        validation_month: shown in the header
        ensemble_weighting: shown in the header
        column_definitions: object providing defs_for_columns(*names) and
            replace_by_spaces(name, value)
        test: when truthy, write() appends a line marking the output as test output
        '''
        self._column_definitions = column_definitions
        self._report = Report()
        self._test = test
        self._header(k, validation_month, ensemble_weighting)
        cd = self._column_definitions.defs_for_columns(
            'description', 'mae_validation', 'mae_query',
            'mare_validation', 'mare_query',
        )
        self._ct = ColumnsTable(columns=cd, verbose=True)

    def write(self, path):
        'append the legend and table lines to the report, then write it to path'
        self._ct.append_legend()
        for line in self._ct.iterlines():
            self._report.append(line)
        if self._test:
            self._report.append('** TESTING: DISCARD')
        self._report.write(path)

    def detail_line(self, **kwds):
        'append one row to the table; values flagged by replace_by_spaces become None'
        # items() (not iteritems()) so this runs under both Python 2 and 3
        with_spaces = {
            k: (None if self._column_definitions.replace_by_spaces(k, v) else v)
            for k, v in kwds.items()
        }
        self._ct.append_detail(**with_spaces)

    def preformatted_line(self, line):
        'echo line to stdout and append it to the table as an already-formatted line'
        # print(line) with one argument behaves the same under Python 2 and 3
        print(line)
        self._ct.append_line(line)

    def _header(self, k, validation_month, ensemble_weighting):
        'append the title, a blank line, k, the validation month, and the weighting'
        self._report.append(
            'Performance of Best Models Separately and as an Ensemble')
        self._report.append(' ')
        self._report.append('Considering Best K = %d models' % k)
        self._report.append('For validation month %s' % validation_month)
        self._report.append('Ensemble weighting: %s' % ensemble_weighting)
def make_chart_b(control, data):
    '''return a Report ranking features by mean importance; also save a bar chart

    control: provides arg.features, test_months, and path_out_chart_b_pdf
    data: indexed by ReductionKey(test_month); each entry exposes
        importances['feature_importances'], indexed by feature position
    '''
    def make_header(report):
        'append the title lines to the report'
        report.append('Mean Probability of a Feature Being Included in a Decision Tree')
        report.append('Across the Entire Ensemble of Decisions Trees')
        report.append('For Most Accurate Model in Each Training Month')
        report.append(' ')

    def make_mean_importance_by_feature(test_months):
        'return dict[feature_name] = float, the mean importance of the feature'
        feature_names = Features().ege_names(control.arg.features)
        mean_importance = {}  # key = feature_name
        for feature_index, feature_name in enumerate(feature_names):
            # build vector of feature_importances for feature_name
            feature_importances = np.zeros(len(test_months))
            for month_index, test_month in enumerate(test_months):
                month_importances = data[ReductionKey(test_month)]
                # Check for the key BEFORE indexing. The original indexed
                # first, so a missing key failed before these diagnostics
                # could ever be printed.
                if 'feature_importances' not in month_importances.importances:
                    print('chart b sees an unexpected ensemble model')
                    print('test_month %s' % (test_month,))
                    print('month_importances %s' % (month_importances,))
                    print('entering debugger')
                    pdb.set_trace()
                all_feature_importances = month_importances.importances['feature_importances']
                feature_importances[month_index] = all_feature_importances[feature_index]
            mean_importance[feature_name] = np.mean(feature_importances)
        return mean_importance

    def make_details(data, test_months):
        '''return (ColumnsTable, feature names to plot, their probabilities)

        Every feature gets a table row; only features whose probability is
        at least 1 percent are returned for plotting.
        '''
        columns_table = ColumnsTable((
            ('mean_prob', 5, '%5.2f', ('mean', 'prob'), 'mean probability feature appears in a decision tree'),
            ('feature_name', 40, '%40s', (' ', 'feature name'), 'name of feature'),
        ), verbose=True)
        my_prob = []
        my_featname = []
        mean_importance = make_mean_importance_by_feature(test_months)
        for feature_name in sorted(mean_importance, key=mean_importance.get, reverse=True):
            percent = mean_importance[feature_name] * 100.0
            columns_table.append_detail(
                mean_prob=percent,
                feature_name=feature_name,
            )
            if percent >= 1:
                my_prob.append(percent)
                my_featname.append(feature_name)
        columns_table.append_legend()
        return columns_table, my_featname, my_prob

    def make_plt(feats, probs):
        'save a bar chart of probs labelled by feats to control.path_out_chart_b_pdf'
        positions = range(len(feats))
        plt.bar(positions, probs, color='blue')
        plt.xticks([x + .6 for x in positions], feats, rotation=-70, size='small')
        plt.yticks(size='xx-small')
        plt.ylabel('Probability Feature in a Decision Tree (%)')
        plt.xlabel('Features That Occur More Than 1 Percent of Time')
        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
        plt.savefig(control.path_out_chart_b_pdf)
        plt.close()

    report = Report()
    make_header(report)
    details, my_feats, my_probs = make_details(data, control.test_months)
    make_plt(my_feats, my_probs)
    for line in details.iterlines():
        report.append(line)
    return report
def makefile(control):
    '''write file valavm.makefile with these targets
    valavm-{feature_group}-{locality}-{system}
    all

    control.arg.makefile is a whitespace-separated string of alternating
    system names and hardware thread counts. The months are dealt out to
    the systems round-robin, each system receiving as many consecutive
    months per round as it has threads. The result is written to
    control.path_out_makefile.

    Raises ValueError when the argument string does not hold name/count
    pairs or when no system has a positive thread count.
    '''
    months = ('200512',
              '200601', '200602', '200603', '200604', '200605', '200606',
              '200607', '200608', '200609', '200610', '200611', '200612',
              '200701', '200702', '200703', '200704', '200705', '200706',
              '200707', '200708', '200709', '200710', '200711', '200712',
              '200801', '200802', '200803', '200804', '200805', '200806',
              '200807', '200808', '200809', '200810', '200811', '200812',
              '200901', '200902')

    def make_jobs(args):
        'return dict[system_name] = int(hardware_threads) from the alternating args'
        if len(args) % 2 != 0:
            # previously an odd count failed with a bare IndexError
            raise ValueError(
                'expected alternating system names and thread counts, got %s' % (args,))
        jobs = {}
        for index in range(0, len(args), 2):
            system_name = args[index]
            hardware_threads = args[index + 1]
            jobs[system_name] = int(hardware_threads)
        return jobs

    def make_system_generator(jobs):
        'yield system names forever, each repeated jobs[system] times per round'
        for system in itertools.cycle(list(jobs)):
            for _ in range(jobs[system]):
                yield system

    def make_system_months(jobs, months):
        'return dict[system] = list of the months assigned to that system'
        if not any(threads > 0 for threads in jobs.values()):
            # without this guard the generator never yields: it would spin
            # forever (all counts zero) or stop immediately (no systems)
            raise ValueError('no system has a positive number of hardware threads')
        result = collections.defaultdict(list)
        system_generator = make_system_generator(jobs)
        for month in months:
            # next(gen) (not gen.next()) so this runs under Python 2 and 3
            system = next(system_generator)
            result[system].append(month)
        return result

    def make_variable(feature_group, locality, system, month):
        'return a line appending one output path to the make variable for the system'
        lhs = 'valavm-%s-all-%s-%s' % (feature_group, locality, system)
        rhs = '../data/working/valavm/%s-all-%s/%s' % (feature_group, locality, month)
        line = '%s += %s' % (lhs, rhs)
        return line

    def make_target(feature_group, locality, system):
        'return a line making the target depend on the same-named variable'
        var = 'valavm-%s-all-%s-%s' % (feature_group, locality, system)
        line = '%s : $(%s)' % (var, var)
        return line

    def make_rule(feature_group, locality, month):
        'return (target line, recipe line) for building one output path'
        target_lhs = '../data/working/valavm/%s-all-%s/%s' % (feature_group, locality, month)
        target_rhs = 'valavm.py ' + control.path_in_samples
        target_line = '%s : %s' % (target_lhs, target_rhs)
        recipe_line = '\t~/anaconda2/bin/python valavm.py %s-all-%s-%s' % (
            feature_group, locality, month)
        return (target_line, recipe_line)

    # split() (not split(' ')) tolerates repeated or surrounding whitespace
    args = control.arg.makefile.split()
    jobs = make_jobs(args)
    system_months = make_system_months(jobs, months)

    report_variables = Report()
    report_variables.append('# valavm variables')
    report_targets = Report()
    report_targets.append('# valavm targets')
    report_rules = Report()
    report_rules.append('# valavm rules')

    # for now, only implement hps 'all'
    for feature_group in ('s', 'sw', 'swp', 'swpn'):
        for locality in ('city', 'global'):
            for system in jobs:
                for month in system_months[system]:
                    report_variables.append(
                        make_variable(feature_group, locality, system, month))
                    report_rules.append_lines(
                        make_rule(feature_group, locality, month))
                report_targets.append(
                    make_target(feature_group, locality, system))

    # one output file: variables first, then targets, then rules
    report_variables.append_report(report_targets)
    report_variables.append_report(report_rules)
    report_variables.write(control.path_out_makefile)
    return