def test_can_return_week(self):
    # A week added under its week number must come back from get_week()
    # with the day entry that was stored in it.
    m = Month(2012, 10)
    w = Week()
    w.set_day(0, TimeRange(Time(7), Time(17)))
    m.add_week(40, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(
        m.get_week(40).get_day(0),
        TimeRange(Time(7), Time(17)),
    )
def test_can_enter_times_into_first_incomplete_week(self):
    # Week 44 is added to November 2012 with entries on week days 0 and 3;
    # the test expects day 1 of the month to carry the entry of week day 3.
    m = Month(2012, 11)
    w = Week()
    w.set_day(0, TimeRange(Time(7), Time(17)))
    w.set_day(3, TimeRange(Time(8), Time(18)))
    m.add_week(44, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_day(1), TimeRange(Time(8), Time(18)))
def test_can_give_total_time_for_month(self):
    # The same week (07:00-17:10 on week day 4) is added twice, as week 40
    # and as week 42; the month total is expected to be Hours(20, 20).
    m = Month(2012, 10)
    w = Week()
    w.set_day(4, TimeRange(Time(7), Time(17, 10)))
    m.add_week(40, w)
    m.add_week(42, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_total_time(), Hours(20, 20))
def make_subplot(validation_month, reduction, relevant_median_prices): 'mutate the default axes' # draw one line for each model family for model in ('en', 'gb', 'rf'): y = [ v.mae for k, v in reduction[validation_month].iteritems() if k.model == model ] plt.plot(y, label=model) # the reduction is sorted by increasing mae plt.yticks(size='xx-small') if Month(validation_month) not in relevant_median_prices: print validation_month print relevant_median_prices print 'should not happen' pdb.set_trace() plt.title( 'yr mnth %s med price %6.0f' % (validation_month, relevant_median_prices[Month(validation_month)]), loc='right', fontdict={ 'fontsize': 'xx-small', 'style': 'italic', }, ) plt.xticks([]) # no ticks on x axis return
def test_can_return_half_week(self):
    # Week 44 is added to November 2012 with entries on week days 2 and 3;
    # the week returned by the month is expected to have no entry on day 2.
    m = Month(2012, 11)
    w = Week()
    w.set_day(2, TimeRange(Time(7), Time(17)))
    w.set_day(3, TimeRange(Time(8), Time(18)))
    m.add_week(44, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_week(44).get_day(2), None)
def test_can_enter_times_from_last_week_in_month(self):
    # Week 44 is added to October 2012 with entries on week days 2 and 3;
    # day 31 must carry the entry, and the non-existent day 32 must not.
    m = Month(2012, 10)
    w = Week()
    w.set_day(2, TimeRange(Time(7), Time(17)))
    w.set_day(3, TimeRange(Time(7), Time(17)))
    m.add_week(44, w)
    # assertEqual/assertNotEqual: the *Equals aliases are deprecated and
    # removed in 3.12
    self.assertEqual(m.get_day(31), TimeRange(Time(7), Time(17)))
    self.assertNotEqual(m.get_day(32), TimeRange(Time(7), Time(17)))
def test_returns_correctly_annotated_week_for_start_of_month(self):
    # A fully populated week 44 is added to November 2012; the annotated
    # week handed back must report earliest day-in-month 1 and valid week
    # days 3 through 6.
    m = Month(2012, 11)
    w = Week()
    # range() instead of xrange(): identical for 7 items, works on 2 and 3
    for i in range(7):
        w.set_day(i, TimeRange(Time(7), Time(17)))
    m.add_week(44, w)
    aw = m.get_week(44)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(aw.get_earliest_day_in_month(), 1)
    self.assertEqual(aw.get_first_valid_day(), 3)
    self.assertEqual(aw.get_last_valid_day(), 6)
def generate_montly_report(self, month_number, hours):
    # Build a Month for self.year and fill it with one schedule per week
    # number reported by Month.get_week_numbers().
    report = Month(self.year, month_number)
    for week_number in report.get_week_numbers():
        # week 1 listed under December is taken from self.next_year
        rolls_over = month_number == 12 and week_number == 1
        source = self.next_year if rolls_over else self
        schedule = source.get_schedule_for_week(week_number, hours)
        report.add_week(week_number, schedule)
    return report
def split_train_validate(n_months_back):
    '''return (train, validate) where
    - validate holds the samples selected by in_month(validation month)
    - train holds the samples selected by between_months() from the
      validation month decremented by n_months_back through the validation
      month decremented by 1

    reads control.arg.validation_month and samples from the enclosing scope
    '''
    target_month = Month(control.arg.validation_month)
    selector = SampleSelector(samples)
    validate_part = selector.in_month(target_month)
    first_train_month = target_month.decrement(n_months_back)
    last_train_month = target_month.decrement(1)
    train_part = selector.between_months(first_train_month, last_train_month)
    return train_part, validate_part
def split_train_validate(n_months_back, samples, validation_month):
    '''return (train, validate) where
    - validate holds the samples selected by in_month(validation month)
    - train holds the samples selected by between_months() from the
      validation month decremented by n_months_back through the validation
      month decremented by 1

    validation_month is anything the Month constructor accepts
    '''
    target_month = Month(validation_month)
    selector = SampleSelector(samples)
    validate_part = selector.in_month(target_month)
    first_train_month = target_month.decrement(n_months_back)
    last_train_month = target_month.decrement(1)
    train_part = selector.between_months(first_train_month, last_train_month)
    return train_part, validate_part
def by_month(dates, xs):
    'return dict[Month] = [x], pairing dates and xs positionally'
    grouped = collections.defaultdict(list)
    for when, value in itertools.izip(dates, xs):
        grouped[Month(when)].append(value)
    return grouped
def make_data(control):
    '''return DataFrame with columns price, city, date, month

    reads control.path_in_samples (only the first 10 rows when control.test
    is set); date is a datetime.date and month a Month, both derived from
    the numeric yyyymmdd sale date column
    '''
    def to_datetime_date(yyyymmdd):
        # peel off the year, then the month; what is left is the day
        year = int(yyyymmdd / 10000.0)
        without_year = yyyymmdd - year * 10000.0
        month = int(without_year / 100.0)
        day = int(without_year - month * 100)
        return datetime.date(year, month, day)

    row_limit = 10 if control.test else None
    transactions = pd.read_csv(control.path_in_samples, nrows=row_limit)
    sale_dates = [to_datetime_date(raw) for raw in transactions[t.sale_date]]
    sale_months = [Month(d.year, d.month) for d in sale_dates]
    columns = {
        'price': transactions[t.price],
        'city': transactions[t.city],
        'date': sale_dates,
        'month': sale_months,
    }
    return pd.DataFrame(columns)
def fit_and_predict(training_samples, query_samples, hps, control):
    '''return (predictions, attributes, n_training_samples)

    fits the model named by control.arg.model on the training samples chosen
    by select_in_time_period_and_in_city and predicts the query samples;
    raises FittingError when no training sample is selected
    '''
    arg = control.arg

    def X_y(df):
        return Features().extract_and_transform(df, hps['units_X'], hps['units_y'])

    # training data ends in the month before the prediction month
    last_training_month = Month(arg.prediction_month).decrement(1)
    relevant = select_in_time_period_and_in_city(
        training_samples,
        last_training_month,
        hps['n_months_back'],
        arg.neighborhood,
    )
    if len(relevant) == 0:
        raise FittingError(
            'no relevant samples hps:%s neighborhood: %s prediction_month %s' % (
                HPs.to_str(hps),
                arg.neighborhood,
                arg.prediction_month,
            ))
    X_train, y_train = X_y(relevant)
    X_query, actuals = X_y(query_samples)

    # anything that is not 'en' or 'gb' is fitted as a random forest
    if arg.model == 'en':
        fitter = fit_en
    elif arg.model == 'gb':
        fitter = fit_gb
    else:
        fitter = fit_rf
    fitted = fitter(X_train, y_train, hps, control.random_seed)

    if arg.model == 'en':
        attributes = {'coef_': fitted.coef_, 'intercept_': fitted.intercept_}
    else:
        attributes = {'feature_importances_': fitted.feature_importances_}
    predictions = fitted.predict(X_query)
    return predictions, attributes, len(relevant)
def select_in_time_period(df, last_month_str, n_months_back):
    '''return the rows of DataFrame df whose sale date is in the time period

    the period runs from day 1 of the month n_months_back before
    last_month_str through the last day of last_month_str; sale dates are
    compared as yyyymmdd floats
    '''
    first_month = Month(last_month_str).decrement(n_months_back)
    lower_bound = float(first_month.as_int() * 100 + 1)

    # last day of the month = the day before day 1 of the following month
    following = Month(last_month_str).increment()
    final_day = datetime.date(following.year, following.month, 1) - datetime.timedelta(1)
    upper_bound = final_day.year * 10000.0 + final_day.month * 100.0 + final_day.day

    sale_dates = df[layout_transactions.sale_date]
    assert isinstance(sale_dates.iloc[0], float)  # dates must be floats
    selected = (sale_dates >= lower_bound) & (sale_dates <= upper_bound)
    return df.loc[selected]
def process_dirname(dirpath, dirname, reduction, reduction_2007, reduction_200701, no_data, test):
    '''mutate the reductions and no_data to include info in the transactions and predictions files in dirname

    dirname is split on '-' into training_data, neighborhood, model and
    month; directories whose model is 'gb' are skipped.

    Every 3-tuple (hps_str, predictions, fitted_attributes) unpickled from
    predictions-attributes.pickle contributes one entry hps_str -> predictions
    (exponentiated when the HPs say units_y is 'log'). The resulting dict is
    stored under the key (Fitted, transaction ids) in reduction, and also in
    reduction_2007 / reduction_200701 when the month falls in 2007 / 2007-01.
    dirname is added to no_data when the file yields no record before EOF or
    when unpickling raises ValueError. When test is true, reading stops after
    10 records.
    '''
    verbose = False
    if verbose:
        print dirname
    training_data, neighborhood, model, month_str = dirname.split('-')
    if model == 'gb':
        print 'for now, skipping gb', dirname
        return
    month = Month(month_str)
    in_2007 = month.year == 2007
    in_200701 = month.year == 2007 and month.month == 1
    fitted = Fitted(training_data, neighborhood, model)
    transaction_ids_raw = read_transaction_ids(dirpath, dirname)
    # normalize every transaction id; the tuple becomes part of the dict key
    transaction_ids_list = []
    for transaction_id_raw in transaction_ids_raw:
        canonical = TransactionId.canonical(transaction_id_raw)
        transaction_ids_list.append(canonical)
    transaction_ids = tuple(transaction_ids_list)
    path = os.path.join(dirpath, dirname, 'predictions-attributes.pickle')
    n_records_processed = 0
    # NOTE(review): the pickle is opened in text mode ('r'); binary pickle
    # protocols and Python 3 need 'rb' -- confirm how the file was written.
    # NOTE(review): unpickling executes arbitrary code; only safe if these
    # files are produced by this project.
    with open(path, 'r') as f:
        unpickler = pickle.Unpickler(f)
        dirname_reduction = {}
        try:
            # the file holds a sequence of pickled records; read until EOF
            while True:
                obj = unpickler.load()
                if len(obj) == 3:
                    hps_str, predictions, fitted_attributes = obj
                    # convert from log domain to natural units
                    units_y = HPs.from_str(hps_str)['units_y']
                    predictions_restated = np.exp(predictions) if units_y == 'log' else predictions
                    if verbose:
                        print hps_str
                    dirname_reduction[hps_str] = predictions_restated
                else:
                    print 'error:', obj
                n_records_processed += 1
                if test and n_records_processed >= 10:
                    if verbose:
                        print 'test: stop after', n_records_processed
                    break
        except EOFError as e:
            # normal termination of the read loop
            if verbose:
                print 'EOFError (%s) for %s after %d records processed' % (e, dirname, n_records_processed)
            if n_records_processed == 0:
                no_data.add(dirname)
        except ValueError as e:
            print '%s' % e
            no_data.add(dirname)
        reduction_key = (fitted, transaction_ids)
        reduction[reduction_key] = dirname_reduction
        if in_2007:
            reduction_2007[reduction_key] = dirname_reduction
        if in_200701:
            reduction_200701[reduction_key] = dirname_reduction
    # progress line: one per directory
    print dirname, n_records_processed, len(reduction), len(reduction_2007), len(no_data)
def make_test_train(test_time_period, train_n_months_back, trade_month_column_name, samples):
    '''return dataframes for testing and training; see valavm.do_val.fit_and_run

    test rows have a trade month equal to test_time_period; train rows fall
    in the train_n_months_back months strictly before it. Both frames must
    be non-empty.
    '''
    assert isinstance(test_time_period, Month), test_time_period
    trade_months = samples[trade_month_column_name]
    assert trade_months.dtype == np.dtype('int64')

    test_yyyymm = test_time_period.as_int()
    test_df = samples[trade_months == test_yyyymm]

    earliest_train = Month(test_time_period).decrement(train_n_months_back)
    is_train = (earliest_train.as_int() <= trade_months) & (trade_months < test_yyyymm)
    train_df = samples[is_train]

    assert len(test_df) > 0
    assert len(train_df) > 0
    return test_df, train_df
def test_1(self):
    # make_test_train must put exactly the test month in the test frame and
    # the requested number of preceding months in the training frame.
    def vp(x):
        # flip the condition to dump intermediate frames while debugging
        if False:
            pprint(x)

    yyyymm = 'trade_month'
    x = 'x'
    trade_months = (200702, 200701, 200612, 200611, 200610)
    samples = pd.DataFrame([
        {yyyymm: trade_month, x: position}
        for position, trade_month in enumerate(trade_months)
    ])
    vp(samples)

    # (test month, months back) -> expected number of training rows
    for test_month, months_back, expected_train in (
            (200702, 1, 1),
            (200701, 2, 2),
    ):
        test, train = make_test_train(Month(test_month), months_back, yyyymm, samples)
        vp(test)
        vp(train)
        assert len(test) == 1
        assert len(train) == expected_train
def make_table_stats(data, control, in_report_p):
    '''return Report with statistics for years and months that obey the filter

    in_report_p(year, month) decides whether a month gets a detail line;
    the ratio columns compare a month with the previously reported month and
    are None for the first reported month
    '''
    def ratio(current, prior):
        return None if prior is None else current / prior

    report = Report()
    report.append('Prices by Month')
    report.append('')
    table = ColumnsTable((
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'), 'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'), 'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'), 'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'), 'number of trades in the month'),
    ))
    last_mean = None
    last_median = None
    for year in range(2003, 2010):
        # only the first three months of 2009 are considered
        final_month = 3 if year == 2009 else 12
        for month in range(1, final_month + 1):
            if not in_report_p(year, month):
                continue
            prices = data[data.month == Month(year, month)].price
            this_mean = prices.mean()
            this_median = prices.median()
            table.append_detail(
                year=year,
                month=month,
                mean_price=this_mean,
                median_price=this_median,
                mean_price_ratio=ratio(this_mean, last_mean),
                median_price_ratio=ratio(this_median, last_median),
                number_trades=len(prices),
            )
            last_mean = this_mean
            last_median = this_median
    table.append_legend()
    for line in table.iterlines():
        report.append(line)
    return report
def make_control(argv): 'return a Bunch' print argv parser = argparse.ArgumentParser() parser.add_argument('invocation') parser.add_argument('samples', choices=['all', 'train']) parser.add_argument('model', choices=['en', 'gb', 'rf']) parser.add_argument('transaction_month') parser.add_argument('neighborhood') parser.add_argument('--test', action='store_true') parser.add_argument('--trace', action='store_true') arg = parser.parse_args(argv) arg.me = arg.invocation.split('.')[0] if arg.trace: pdb.set_trace() # convert arg.neighborhood into arg.all and arg.city arg.city = (None if arg.neighborhood == 'all' else arg.neighborhood.replace('_', ' ')) random_seed = 123 random.seed(random_seed) prior_month = Month(arg.transaction_month).decrement().as_str() in_dir = '%s-%s-%s-%s' % (arg.samples, arg.model, prior_month, arg.neighborhood) out_dir = '%s-%s-%s-%s' % (arg.samples, arg.model, arg.transaction_month, arg.neighborhood) dir_working = Path().dir_working() output_dir = (os.path.join(dir_working, arg.me + '-test', out_dir, '') if arg.test else os.path.join(dir_working, arg.me, out_dir, '')) dirutility.assure_exists(output_dir) return Bunch( arg=arg, path_in_fitted=os.path.join(dir_working, 'fit', in_dir, ''), path_in_samples=os.path.join(dir_working, 'samples2', arg.samples + '.csv'), path_out_file=os.path.join(output_dir, 'predictions.pickle'), path_out_log=os.path.join(output_dir, '0log.txt'), random_seed=random_seed, timer=Timer(), )
def make_control(argv): 'return a Bunch' print argv parser = argparse.ArgumentParser() parser.add_argument('invocation') parser.add_argument('data', choices=['all', 'train']) parser.add_argument('model', choices=['en', 'gb', 'rf']) parser.add_argument('last_month') parser.add_argument('neighborhood') parser.add_argument('--test', action='store_true') parser.add_argument('--trace', action='store_true') parser.add_argument('--dry', action='store_true') # don't write output arg = parser.parse_args(argv) arg.me = arg.invocation.split('.')[0] if arg.trace: pdb.set_trace() arg.last = Month(arg.last_month) # convert to Month and validate value # convert arg.neighborhood into arg.all and arg.city arg.city = ( None if arg.neighborhood == 'all' else arg.neighborhood.replace('_', ' ') ) random_seed = 123 random.seed(random_seed) dir_working = Path().dir_working() fit_dir = ( os.path.join(dir_working, arg.me + '-test') if arg.test else os.path.join(dir_working, arg.me) ) last_dir = '%s-%s-%s-%s' % (arg.data, arg.model, arg.last_month, arg.neighborhood) path_out_dir = os.path.join(fit_dir, last_dir, '') dirutility.assure_exists(path_out_dir) return Bunch( arg=arg, path_in_dir=os.path.join(dir_working, 'samples2', ''), path_out_dir=path_out_dir, path_out_log=os.path.join(path_out_dir, '0log.txt'), random_seed=random_seed, timer=Timer(), )
def __init__(self, name):
    # Build the twelve months of the (Gregorian) year `name`, an int.
    # Each Month receives its name, the list of its Day objects (weekday
    # name, day of month, None) and this year; afterwards neighbouring
    # months are linked with setPrev/setNext.
    self.name = name
    self.months = []
    monthNames = [
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december'
    ]
    weekdays = [
        'Sunday', 'Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday'
    ]

    # Gregorian leap year rule:
    # https://en.wikipedia.org/wiki/Leap_year#Algorithm
    isLeap = (name % 4 == 0 and name % 100 != 0) or name % 400 == 0
    monthLengths = [31, 29 if isLeap else 28, 31, 30, 31, 30,
                    31, 31, 30, 31, 30, 31]

    # Weekday of 1 January (0 = Sunday) by Gauss's algorithm:
    # https://en.wikipedia.org/wiki/Determination_of_the_day_of_the_week
    # Pure integer arithmetic, so the result does not depend on whether
    # `/` floors (Python 2) or not (Python 3). The previous float-based
    # formula was only right for 2000-2099 and only with floor division.
    prior = name - 1
    weekOffset = (1 + 5 * (prior % 4) + 4 * (prior % 100)
                  + 6 * (prior % 400)) % 7

    dayCount = 0  # days of the year emitted so far
    for i in range(12):
        days = []
        for j in range(monthLengths[i]):
            days.append(
                Day(weekdays[(dayCount + weekOffset) % 7], j + 1, None))
            dayCount += 1
        self.months.append(Month(monthNames[i], days, self))

    # link each month to its neighbours; january has no previous month and
    # december no next month within this year
    for i in range(12):
        if i > 0:
            self.months[i].setPrev(self.months[i - 1])
        if i < 11:
            self.months[i].setNext(self.months[i + 1])
def make_prices_volumes(data):
    '''return (median_prices, volumes), each a dict[(year, month)] = number

    covers January 2003 through March 2009
    '''
    median_prices = {}
    volumes = {}
    for year in range(2003, 2010):
        # only the first three months of 2009 are covered
        final_month = 3 if year == 2009 else 12
        for month in range(1, final_month + 1):
            month_prices = data[data.month == Month(year, month)].price
            key = (year, month)
            median_prices[key] = month_prices.median()
            volumes[key] = len(month_prices)
    return median_prices, volumes
def do_testbest(control, samples, best): 'determine accuracy of the HPs sets found to be best by valavm (as reported by chart-06)' result = {} for test_period, value in best.iteritems(): forecast_month = Month(test_period).increment() series = value[0] print test_period, forecast_month print series avm = make_avm(forecast_month, control.random_seed, series) test_df, train_df = make_test_train( forecast_month, series.n_months_back, layout_transactions.yyyymm, samples, ) avm.fit( train_df ) # the AVM object knows how to extract the train and test samples predictions = avm.predict(test_df) actuals = samples[layout_transactions.price] result[forecast_month] = valavm.ResultValue(actuals, predictions) return result
def test_can_add_week_52_of_previous_year_to_january(self):
    # Week 52 is added to January 2012 with an entry on week day 6; the
    # entry is expected to show up as day 1 of the month.
    m = Month(2012, 1)
    w = Week()
    w.set_day(6, TimeRange(Time(7), Time(10)))
    m.add_week(52, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_day(1), TimeRange(Time(7), Time(10)))
def make_chart_cd(reduction, median_prices, control, detail_line_indices, report_id):
    '''write the chart c pdf and the chart cd text report

    For every month in control.validation_months_long that is present in
    reduction, one detail line is appended per index in detail_line_indices
    (the key at that position of the month's results), and the MAE of the
    month's first key is collected for the figure. Months missing from
    reduction are recorded in control.exceptions and skipped.

    The figure has two bar charts over the validation months: the MAE and
    the MAE divided by the month's median price. It is saved to
    control.path_out_c_pdf; the report is written to
    control.path_out_cd % report_id.
    '''
    r = ChartCDReport(control.column_definitions, control.test)
    my_validation_months = []
    my_price = []
    my_mae = []
    for validation_month in control.validation_months_long:
        median_price = median_prices[Month(validation_month)]
        if validation_month not in reduction:
            control.exceptions.append('reduction is missing month %s' % validation_month)
            continue
        month_result_keys = reduction[validation_month].keys()
        my_validation_months.append(validation_month)
        my_price.append(median_price)
        for detail_line_index in detail_line_indices:
            if detail_line_index >= len(month_result_keys):
                continue  # this can happen when using samples
            # NOTE(review): debugging aid -- a bare except that drops into
            # pdb; the lookup is then repeated unguarded
            try:
                k = month_result_keys[detail_line_index]
            except:
                pdb.set_trace()
            k = month_result_keys[detail_line_index]
            v = reduction[validation_month][k]
            r.append_detail(
                validation_month=validation_month,
                rank=detail_line_index + 1,
                median_absolute_error=v.mae,
                median_price=median_price,
                model=k.model,
                n_months_back=k.n_months_back,
                max_depth=k.max_depth,
                n_estimators=k.n_estimators,
                max_features=k.max_features,
                learning_rate=k.learning_rate,
                alpha=k.alpha,
                l1_ratio=k.l1_ratio,
                units_X=k.units_X[:3],
                units_y=k.units_y[:3],
            )
        # MAE of the month's first key (presumably the best model, if the
        # reduction is sorted by increasing MAE -- confirm)
        my_mae.append(reduction[validation_month][month_result_keys[0]].mae)

    fig = plt.figure()
    # top chart: MAE by validation month
    fig1 = fig.add_subplot(211)
    fig1.bar(range(len(my_validation_months)), my_mae, color='blue')
    labels = my_validation_months
    plt.xticks([x + .6 for x in range(len(my_validation_months))], labels, rotation=-70, size='xx-small')
    plt.yticks(size='xx-small')
    plt.xlabel('Year-Month')
    plt.ylabel('Median Absolute Error ($)')
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    # bottom chart: MAE relative to the median price
    # NOTE(review): int(m) / int(p) floors to 0 under Python 2 unless the
    # module enables true division -- confirm
    fig2 = fig.add_subplot(212)
    fig2.bar(
        range(len(my_validation_months)),
        [int(m) / int(p) for m, p in zip(my_mae, my_price)],
        color='blue',
    )
    plt.xticks(
        [x + .6 for x in range(len(my_validation_months))],
        labels,
        rotation=-70,
        size='xx-small',
    )
    plt.yticks(size='xx-small')
    plt.xlabel('Year-Month')
    plt.ylabel('Absolute Relative Error')
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
    plt.savefig(control.path_out_c_pdf)
    plt.close()
    r.write(control.path_out_cd % report_id)
    return
def test_returns_annotated_weeks(self):
    # A plain Week is stored, but get_week() must hand back exactly an
    # AnnotatedWeek.
    m = Month(2012, 10)
    w = Week()
    w.set_day(0, TimeRange(Time(7), Time(17)))
    m.add_week(40, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(type(m.get_week(40)), AnnotatedWeek)
def make_figure2(validation_month): '''make and write figure for the validation month Part 1: for the validation month one bar for each of the first 50 best models the height of the bar is the MAE in ($) Part 2: produce a 2-up chart, where the top chart is as in part 1 and the bottom chart has as y axis the absolute relative error ''' print 'creating figure b', validation_month # plt.suptitle('Loss by Test Period, Tree Max Depth, N Trees') # overlays the subplots bar_color = {'gb': 'white', 'rf': 'black', 'en': 'red'} models, maes = make_models_maes(validation_month) assert len(models) == len(maes) assert len(models) > 0 # the reduction is sorted by increasing mae # Jonathan fig = plt.figure() fig1 = fig.add_subplot(211) plt.title( 'Validation Month: %s' % (validation_month), loc='right', fontdict={ 'fontsize': 'large', 'style': 'italic' }, ) for i, model in enumerate(models): fig1.bar(i, maes[i], color=bar_color[model]) plt.yticks(size='xx-small') plt.xticks([]) plt.xlabel('Models in order of increasing MAE') plt.ylabel('MAE ($)') white_patch = mpatches.Patch( facecolor='white', edgecolor='black', lw=1, label="Gradient Boosting", ) black_patch = mpatches.Patch( facecolor='black', edgecolor='black', lw=1, label="Random Forest", ) plt.legend(handles=[white_patch, black_patch], loc=2) plt.ylim(0, 180000) fig2 = fig.add_subplot(212) for i, model in enumerate(models): fig2.bar(i, maes[i] / median_price[Month(validation_month)], color=bar_color[model]) plt.yticks(size='xx-small') plt.xticks([]) plt.xlabel('Models in order of increasing MAE') plt.ylabel('Absolute Relative Error') plt.ylim(0, .3) white_patch = mpatches.Patch( facecolor='white', edgecolor='black', lw=1, label="Gradient Boosting", ) black_patch = mpatches.Patch( facecolor='black', edgecolor='black', lw=1, label="Random Forest", ) plt.legend(handles=[white_patch, black_patch], loc=2) plt.savefig(control.path_out_b_pdf % int(validation_month)) plt.close()
def test_can_give_week_numbers_for_month(self):
    # October 2012 is expected to span week numbers 40 through 44.
    m = Month(2012, 10)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_week_numbers(), [40, 41, 42, 43, 44])
def median_price_for_year_month(year, month):
    'return median of prices in the (year, month); NaN if it has no transactions'
    # months and prices come from the enclosing scope
    wanted = Month(year, month)
    return prices[months == wanted].median()
def test_can_return_week(self):
    # A week added under its week number must come back from get_week()
    # with the day entry that was stored in it.
    m = Month(2012, 10)
    w = Week()
    w.set_day(0, TimeRange(Time(7), Time(17)))
    m.add_week(40, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(
        m.get_week(40).get_day(0),
        TimeRange(Time(7), Time(17)),
    )
def test_returns_correct_week_range_for_january(self):
    # January 2012 is expected to start in week 52 of the previous year
    # and then run through weeks 1 to 5.
    m = Month(2012, 1)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_week_numbers(), [52, 1, 2, 3, 4, 5])
def make_chart_efh(k, reduction, actuals, median_price, control):
    '''Write charts e and f, return (median absolute regret, median absolute relative regret)

    For every validation month, the first k keys of the following (query)
    month's reduction are listed in charts e and h and combined into an
    ensemble whose weights are exp(-MAE/100000), using the MAE from the
    validation month. The ensemble's MAE in the query month is compared with
    the MAE of the query month's first key. Chart e is written per
    validation month and as a pdf, chart f once at the end.

    NOTE(review): this function still contains debugging code: an
    unconditional pdb.set_trace() in the chart f loop and a conditional one
    when k == 5.
    '''
    def interesting():
        return k == 5

    def trace_if_interesting():
        if interesting():
            print 'k', k
            pdb.set_trace()
            return True
        else:
            return False

    ensemble_weighting = 'exp(-MAE/100000)'
    mae = {}  # validation_month -> Bunch(index0, ensemble, best_next_month)
    debug = False
    my_validation_months = []
    my_ensemble_mae = []
    my_best_mae = []
    my_price = []
    for validation_month in control.validation_months:
        e = ChartEReport(k, ensemble_weighting, control.column_definitions, control.test)
        # NOTE(review): h receives detail lines but is never written in this
        # function
        h = ChartHReport(k, ensemble_weighting, control.column_definitions, control.test)
        if debug:
            print validation_month
            pdb.set_trace()
        query_month = Month(validation_month).increment(1).as_str()
        if query_month not in reduction:
            control.exceptions.append('%s not in reduction (charts ef)' % query_month)
            # NOTE(review): control.exception (singular) looks like a typo
            # for control.exceptions -- confirm
            print control.exception
            continue
        cum_weighted_predictions = None
        cum_weights = 0
        mae_validation = None
        check_key_order(reduction[validation_month])
        # write lines for the k best individual models
        # accumulate info needed to build the ensemble model
        index0_mae = None
        for index, query_month_key in enumerate(reduction[query_month].keys()):
            # print only k rows
            if index >= k:
                break
            print index, query_month_key
            validation_month_value = reduction[validation_month][query_month_key]
            print query_month
            query_month_value = reduction[query_month][query_month_key]
            if mae_validation is not None and False:  # turn off this test for now
                trace_unless(
                    mae_validation <= validation_month_value.mae,
                    'should be non-decreasing',
                    mae_previous=mae_validation,
                    mae_next=validation_month_value.mae,
                )
            mae_validation = validation_month_value.mae
            mae_query = query_month_value.mae
            if index == 0:
                index0_mae = mae_query
            # the weight decays exponentially with the validation-month MAE
            eta = 1.0
            weight = math.exp(-eta * (mae_validation / 100000.0))
            e.detail_line(
                validation_month=validation_month,
                model=query_month_key.model,
                n_months_back=query_month_key.n_months_back,
                n_estimators=query_month_key.n_estimators,
                max_features=query_month_key.max_features,
                max_depth=query_month_key.max_depth,
                learning_rate=query_month_key.learning_rate,
                rank=index + 1,
                mae_validation=mae_validation,
                weight=weight,
                mae_query=mae_query,
            )
            h.detail_line(
                validation_month=validation_month,
                model_description=short_model_description(query_month_key),
                mae_validation=mae_validation,
                mae_query=mae_query,
            )
            # need the mae of the ensemble
            # need the actuals and predictions? or is this already computed
            predictions_next = query_month_value.predictions
            if cum_weighted_predictions is None:
                cum_weighted_predictions = weight * predictions_next
            else:
                cum_weighted_predictions += weight * predictions_next
            cum_weights += weight
        # write line comparing the best individual model in the next month
        # to the ensemble model
        trace_if_interesting()
        # weighted average of the individual predictions
        ensemble_predictions = cum_weighted_predictions / cum_weights
        ensemble_rmse, ensemble_mae, ensemble_ci95_low, ensemble_ci95_high = errors.errors(
            actuals[query_month],
            ensemble_predictions,
        )
        best_key = reduction[query_month].keys()[0]
        best_value = reduction[query_month][best_key]
        e.detail_line(
            validation_month=validation_month,
            mae_ensemble=ensemble_mae,
            model=best_key.model,
            n_months_back=best_key.n_months_back,
            n_estimators=best_key.n_estimators,
            max_features=best_key.max_features,
            max_depth=best_key.max_depth,
            learning_rate=best_key.learning_rate,
        )
        h.detail_line(
            validation_month=validation_month,
            model_description='ensemble',
            mae_query=ensemble_mae,
        )
        my_validation_months.append(validation_month)
        my_ensemble_mae.append(ensemble_mae)
        my_best_mae.append(best_value.mae)
        e.write(control.path_out_e_txt % (k, validation_month))
        mae[validation_month] = Bunch(
            index0=index0_mae,
            ensemble=ensemble_mae,
            best_next_month=best_value.mae,
        )

    # rebuild the plotting series from mae, discarding the lists filled above
    my_ensemble_mae = []
    my_best_mae = []
    my_price = []
    for month in my_validation_months:
        my_ensemble_mae.append(mae[month].ensemble)
        my_best_mae.append(mae[month].best_next_month)
        my_price.append(median_price[Month(month)])

    width = 0.35  # bar width; the two series are drawn side by side
    fig = plt.figure()
    # top chart: MAE in dollars
    fig1 = fig.add_subplot(211)
    fig1.bar(
        [x + width for x in range(len(my_validation_months))],
        my_best_mae,
        width,
        color='white',
    )
    fig1.bar(
        range(len(my_validation_months)),
        my_ensemble_mae,
        width,
        color='black',
    )
    plt.ylim(0, 180000)
    labels = my_validation_months
    plt.xticks(
        [x + .4 for x in range(len(my_validation_months))],
        labels,
        rotation=-70,
        size='xx-small',
    )
    plt.ylabel('MAE ($)')
    plt.xlabel('Year-Month')
    white_patch = mpatches.Patch(
        facecolor='white',
        edgecolor='black',
        hatch='',
        lw=1,
        label="MAE of Best Model in Validation Month",
    )
    black_patch = mpatches.Patch(
        facecolor='black',
        edgecolor='black',
        hatch='',
        lw=1,
        label="MAE of Ensemble of " + str(k) + " Best Models in Validation Month",
    )
    plt.legend(handles=[white_patch, black_patch], loc=2)
    # bottom chart: MAE relative to the median price
    # NOTE(review): int(m) / int(p) floors to 0 under Python 2 unless the
    # module enables true division -- confirm
    fig2 = fig.add_subplot(212)
    fig2.bar(
        [x + width for x in range(len(my_validation_months))],
        [int(m) / int(p) for m, p in zip(my_best_mae, my_price)],
        width,
        color='white',
    )
    fig2.bar(
        range(len(my_validation_months)),
        [int(m) / int(p) for m, p in zip(my_ensemble_mae, my_price)],
        width,
        color='black',
    )
    plt.ylim(0, .5)
    labels = my_validation_months
    plt.xticks(
        [x + .4 for x in range(len(my_validation_months))],
        labels,
        rotation=-70,
        size='xx-small',
    )
    plt.ylabel('Absolute Relative Error')
    plt.xlabel('Year-Month')
    white_patch = mpatches.Patch(
        facecolor='white',
        edgecolor='black',
        hatch='',
        lw=1,
        label="ARE of Best Model in Validation Month",
    )
    black_patch = mpatches.Patch(
        facecolor='black',
        edgecolor='black',
        hatch='',
        lw=1,
        label="ARE of Ensemble of " + str(k) + " Best Models in Validation Month",
    )
    plt.legend(handles=[white_patch, black_patch], loc=2)
    plt.tight_layout(pad=0.8, w_pad=0.8, h_pad=1.0)
    plt.savefig(control.path_out_e_pdf % k)
    plt.close()

    # chart f: regret of the ensemble relative to the best query-month model
    f = ChartFReport(k, ensemble_weighting, control.column_definitions, control.test)
    regrets = []
    relative_errors = []
    for validation_month in control.validation_months:
        query_month = Month(validation_month).increment(1).as_str()
        print query_month
        print "need to define best_next_month --> best_query_month"
        # NOTE(review): unconditional breakpoint; the lookup below reuses
        # query_month_key left over from the first loop and its result is
        # not used afterwards
        pdb.set_trace()
        query_month_value = reduction[query_month][query_month_key]
        regret = mae[validation_month].ensemble - mae[validation_month].best_next_month
        regrets.append(regret)
        relative_error = regret / median_price[Month(validation_month)]
        relative_errors.append(relative_error)
        median_price_next = median_price[Month(query_month)]
        f.detail_line(
            validation_month=validation_month,
            mae_index0=mae[validation_month].index0,
            mae_ensemble=mae[validation_month].ensemble,
            mae_best_next_month=mae[validation_month].best_next_month,
            median_price=median_price[Month(validation_month)],
            fraction_median_price_next_month_index0=mae[validation_month].index0 / median_price_next,
            fraction_median_price_next_month_ensemble=mae[validation_month].ensemble / median_price_next,
            fraction_median_price_next_month_best=mae[validation_month].best_next_month / median_price_next,
        )
    median_absolute_regret = np.median(np.abs(regrets))
    median_absolute_relative_regret = np.median(np.abs(relative_errors))
    f.write(control.path_out_f % k)
    return median_absolute_regret, median_absolute_relative_regret
def test_can_add_week_1_of_next_year_to_december(self):
    # Week 1 is added to December 2012 with an entry on week day 0; the
    # entry is expected to show up as day 31 of the month.
    m = Month(2012, 12)
    w = Week()
    w.set_day(0, TimeRange(Time(7), Time(10)))
    m.add_week(1, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_day(31), TimeRange(Time(7), Time(10)))
def test_returns_correct_week_range_for_december(self):
    # December 2012 is expected to span weeks 48 to 52 and then week 1 of
    # the following year.
    m = Month(2012, 12)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_week_numbers(), [48, 49, 50, 51, 52, 1])
# Turn every row of the month's frame into a Transaction, store it in the
# database and accumulate the month's savings and spendings on the way.
for idx, trans in month.iterrows():
    # progress output roughly every ten percent
    percent_done = (parsed_transacts / number_of_rows) * 100
    if 0 < percent_done % 10 < 1:
        print("currently at", percent_done, "percent")

    current_trans = Transaction(
        trans['Buchungstag'],
        trans['Buchungstext'],
        trans['Betrag'],
        trans['Währung'],
    )
    description = current_trans.descr
    is_savings = "ANLEGEN" in description

    if is_savings:
        current_trans.isSavings = True
        months_savings += 0 - current_trans.amount  # move to analyzer
    if "PayPal" in description or "Auszahlung" in description:
        current_trans.toBeReviewed = True
    # money going out counts as spending unless it went into savings
    if current_trans.amount < 0 and not is_savings:  # move to analyzer
        totalSpendings += current_trans.amount  # move to analyzer

    db_transactions.insert_one(current_trans.__dict__)
    parsed_transacts += 1
    months_transactions.append(current_trans)  # move to analyzer

# bundle the totals and the transactions into the Month record of the
# current year and month, then persist it
current_month = Month(
    key.year,
    key.month,
    months_savings,
    totalSpendings,
    months_transactions,
)  # move to analyzer
db_months.insert_one(current_month.__dict__)  # move to analyzer
def make_control(argv): # return a Bunch print argv parser = argparse.ArgumentParser() parser.add_argument('invocation') parser.add_argument('--data', help='reduce input and create data file in WORKING', action='store_true') parser.add_argument('--test', help='set internal test flag', action='store_true') arg = parser.parse_args(argv) # arg = Bunch.from_namespace(parser.parse_args(argv)) base_name = arg.invocation.split('.')[0] arg.me = base_name random_seed = 123 random.seed(random_seed) dir_working = Path().dir_working() # assure output directory exists def create_dir(path1, path2): result_path = os.path.join(path1, path2) dirutility.assure_exists(result_path) return result_path dir_chart01 = (create_dir(dir_working, arg.me + '-test') if arg.test else create_dir(dir_working, arg.me)) dir_date_price = create_dir(dir_chart01, 'date_price') dir_median_price = create_dir(dir_chart01, 'median_price') dir_prices_volume = create_dir(dir_chart01, 'prices_volume') all_months = [ Month(year, month) for year in (2003, 2004, 2005, 2006, 2007, 2008, 2009) for month in ((1, 2, 3) if year == 2009 else (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)) ] return Bunch( all_months=all_months, arg=arg, base_name=base_name, debug=False, path_in_interesting_cities=os.path.join(dir_working, 'interesting_cities.txt'), path_in_samples=os.path.join(dir_working, 'samples2', 'train.csv'), path_out_dir_date_price=dir_date_price, path_out_dir_median_price=dir_median_price, path_out_dir_prices_volume=dir_prices_volume, path_out_log=os.path.join(dir_chart01, '0log.txt'), path_out_price_statistics_city_name=os.path.join( dir_chart01, 'price-statistics-city-name.txt'), path_out_price_statistics_count=os.path.join( dir_chart01, 'price-statistics-count.txt'), path_out_price_statistics_median_price=os.path.join( dir_chart01, 'price-statistics-median-price.txt'), path_out_price_volume=os.path.join(dir_chart01, 'price-volume.pdf'), path_out_stats_all=os.path.join(dir_chart01, 'price-stats-all.txt'), 
path_out_stats_count_by_city_in_2007=os.path.join( dir_chart01, 'count-by-city-in-2007.txt'), path_out_stats_2006_2008=os.path.join(dir_chart01, 'price-stats-2006-2008.txt'), path_reduction=os.path.join(dir_chart01, '0data.pickle'), random_seed=random_seed, test=arg.test, )
def test_can_enter_times_from_second_week(self):
    # Week 41 is added to October 2012 with an entry on week day 0; the
    # entry is expected to show up as day 8 of the month.
    m = Month(2012, 10)
    w = Week()
    w.set_day(0, TimeRange(Time(7), Time(17)))
    m.add_week(41, w)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12
    self.assertEqual(m.get_day(8), TimeRange(Time(7), Time(17)))