コード例 #1
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_return_week(self):
     # A week stored under its week number must be retrievable through
     # get_week() with the day entry that was set on it.
     m = Month(2012, 10)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     m.add_week(40, w)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(
         m.get_week(40).get_day(0), TimeRange(Time(7), Time(17)))
コード例 #2
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_enter_times_into_first_incomplete_week(self):
     # Week 44 is added to November 2012; the entry stored at week-day
     # index 3 is expected to show up as day 1 of the month.
     m = Month(2012, 11)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     w.set_day(3, TimeRange(Time(8), Time(18)))
     m.add_week(44, w)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(m.get_day(1), TimeRange(Time(8), Time(18)))
コード例 #3
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_give_total_time_for_month(self):
     # The same week (one entry, 07:00 to 17:10 on day index 4) is added
     # under two week numbers; the expected monthly total is Hours(20, 20).
     m = Month(2012, 10)
     w = Week()
     w.set_day(4, TimeRange(Time(7), Time(17, 10)))
     m.add_week(40, w)
     m.add_week(42, w)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(m.get_total_time(), Hours(20, 20))
コード例 #4
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_enter_times_into_first_incomplete_week(self):
     # The entry set at week-day index 3 of week 44 must be reported as
     # day 1 of November 2012.
     m = Month(2012, 11)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     w.set_day(3, TimeRange(Time(8), Time(18)))
     m.add_week(44, w)
     # Use assertEqual: the assertEquals alias is deprecated.
     self.assertEqual(m.get_day(1), TimeRange(Time(8), Time(18)))
コード例 #5
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_give_total_time_for_month(self):
     # One week with a single 07:00 to 17:10 entry is registered for weeks
     # 40 and 42; get_total_time() is expected to return Hours(20, 20).
     m = Month(2012, 10)
     w = Week()
     w.set_day(4, TimeRange(Time(7), Time(17, 10)))
     m.add_week(40, w)
     m.add_week(42, w)
     # Use assertEqual: the assertEquals alias is deprecated.
     self.assertEqual(m.get_total_time(), Hours(20, 20))
コード例 #6
0
 def make_subplot(validation_month, reduction, relevant_median_prices):
     'mutate the default axes'
     # draw one line for each model family
     for model in ('en', 'gb', 'rf'):
         y = [
             v.mae for k, v in reduction[validation_month].iteritems()
             if k.model == model
         ]
         plt.plot(y,
                  label=model)  # the reduction is sorted by increasing mae
         plt.yticks(size='xx-small')
         if Month(validation_month) not in relevant_median_prices:
             print validation_month
             print relevant_median_prices
             print 'should not happen'
             pdb.set_trace()
         plt.title(
             'yr mnth %s med price %6.0f' %
             (validation_month,
              relevant_median_prices[Month(validation_month)]),
             loc='right',
             fontdict={
                 'fontsize': 'xx-small',
                 'style': 'italic',
             },
         )
         plt.xticks([])  # no ticks on x axis
     return
コード例 #7
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
    def test_can_return_half_week(self):
        # Week 44 is added to November 2012 with entries on day indices 2
        # and 3; the week handed back by the month must report no entry for
        # day index 2 (presumably because that day lies outside November).
        m = Month(2012, 11)
        w = Week()
        w.set_day(2, TimeRange(Time(7), Time(17)))
        w.set_day(3, TimeRange(Time(8), Time(18)))
        m.add_week(44, w)

        # assertIsNone replaces the deprecated assertEquals(..., None).
        self.assertIsNone(m.get_week(44).get_day(2))
コード例 #8
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
    def test_can_return_half_week(self):
        # The week returned for week 44 of November 2012 must have no entry
        # at day index 2, even though the week that was added has one there.
        m = Month(2012, 11)
        w = Week()
        w.set_day(2, TimeRange(Time(7), Time(17)))
        w.set_day(3, TimeRange(Time(8), Time(18)))
        m.add_week(44, w)

        # Identity check against None instead of the deprecated
        # assertEquals alias.
        self.assertIsNone(m.get_week(44).get_day(2))
コード例 #9
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_enter_times_from_last_week_in_month(self):
     # Week 44 is added to October 2012 with entries on day indices 2 and
     # 3: the first must appear as day 31, the second must not leak into
     # a (non-existent) day 32.
     m = Month(2012, 10)
     w = Week()
     w.set_day(2, TimeRange(Time(7), Time(17)))
     w.set_day(3, TimeRange(Time(7), Time(17)))
     m.add_week(44, w)
     # assertEqual/assertNotEqual replace the deprecated *Equals aliases.
     self.assertEqual(m.get_day(31), TimeRange(Time(7), Time(17)))
     self.assertNotEqual(m.get_day(32), TimeRange(Time(7), Time(17)))
コード例 #10
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_returns_correctly_annotated_week_for_start_of_month(self):
     # A fully populated week 44 is added to November 2012; the annotated
     # week returned by the month must report day-of-month 1 as its
     # earliest day and day indices 3..6 as its valid range.
     m = Month(2012, 11)
     w = Week()
     # range() instead of xrange() so the test also runs on Python 3.
     for i in range(7):
         w.set_day(i, TimeRange(Time(7), Time(17)))
     m.add_week(44, w)
     aw = m.get_week(44)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(aw.get_earliest_day_in_month(), 1)
     self.assertEqual(aw.get_first_valid_day(), 3)
     self.assertEqual(aw.get_last_valid_day(), 6)
コード例 #11
0
ファイル: Year.py プロジェクト: remar/bfiller
    def generate_montly_report(self, month_number, hours):
        """Return a Month of self.year filled with one schedule per week.

        Every week number reported by the month gets the schedule produced
        by get_schedule_for_week(week, hours).  The only exception is week
        1 when month_number is 12: its schedule is requested from
        self.next_year instead of from this object.
        """
        report = Month(self.year, month_number)
        for week_number in report.get_week_numbers():
            spills_into_next_year = month_number == 12 and week_number == 1
            scheduler = self.next_year if spills_into_next_year else self
            report.add_week(
                week_number,
                scheduler.get_schedule_for_week(week_number, hours))

        return report
コード例 #12
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_returns_correctly_annotated_week_for_start_of_month(self):
     # With all seven days of week 44 populated, the week returned for
     # November 2012 must start at day-of-month 1 and mark day indices
     # 3 through 6 as valid.
     m = Month(2012, 11)
     w = Week()
     # range() works on both Python 2 and 3; xrange() is Python 2 only.
     for i in range(7):
         w.set_day(i, TimeRange(Time(7), Time(17)))
     m.add_week(44, w)
     aw = m.get_week(44)
     # Use assertEqual: the assertEquals alias is deprecated.
     self.assertEqual(aw.get_earliest_day_in_month(), 1)
     self.assertEqual(aw.get_first_valid_day(), 3)
     self.assertEqual(aw.get_last_valid_day(), 6)
コード例 #13
0
    def generate_montly_report(self, month_number, hours):
        """Build the Month for month_number of self.year, week by week.

        For each week number the month lists, a schedule is generated with
        get_schedule_for_week(week, hours) and added to the month.  Week 1
        of a December report is generated by self.next_year; every other
        week is generated by this object.  Returns the populated Month.
        """
        month_report = Month(self.year, month_number)
        for week in month_report.get_week_numbers():
            if month_number == 12 and week == 1:
                owner = self.next_year
            else:
                owner = self
            month_report.add_week(
                week, owner.get_schedule_for_week(week, hours))

        return month_report
コード例 #14
0
ファイル: valavm.py プロジェクト: rlowrance/re-avm
 def split_train_validate(n_months_back):
     '''Return (train, validate) samples for the configured validation month.

     - validate holds only the transactions in the validation month, which
       is read from control.arg.validation_month
     - train holds only the transactions in the n_months_back months that
       precede the validation month

     Both control and samples come from the enclosing scope.
     '''
     target_month = Month(control.arg.validation_month)
     selector = SampleSelector(samples)
     validate_set = selector.in_month(target_month)
     # the training window ends with the month just before the target
     window_first = target_month.decrement(n_months_back)
     window_last = target_month.decrement(1)
     train_set = selector.between_months(window_first, window_last)
     return train_set, validate_set
コード例 #15
0
ファイル: valavm.py プロジェクト: seyi/re-avm
def split_train_validate(n_months_back, samples, validation_month):
    '''Return (train, validate) subsets of samples.

    - validate holds only the transactions in validation_month
    - train holds only the transactions in the n_months_back months that
      precede validation_month

    validation_month is converted with Month() before use.
    '''
    target_month = Month(validation_month)
    selector = SampleSelector(samples)
    validate_set = selector.in_month(target_month)
    # the training window ends with the month just before the target
    window_first = target_month.decrement(n_months_back)
    window_last = target_month.decrement(1)
    train_set = selector.between_months(window_first, window_last)
    return train_set, validate_set
コード例 #16
0
def by_month(dates, xs):
    '''Group the values in xs by the Month of the date at the same position.

    Returns a defaultdict(list) keyed by Month(date); each list keeps the
    values in their input order.  Pairing stops at the shorter input.
    '''
    grouped = collections.defaultdict(list)
    for when, value in itertools.izip(dates, xs):
        grouped[Month(when)].append(value)
    return grouped
コード例 #17
0
def make_data(control):
    '''Read the samples csv and return a DataFrame of price, city, date, month.

    The csv at control.path_in_samples is read in full, or only its first
    10 rows when control.test is true.  The sale date column holds numbers
    of the form YYYYMMDD; each is converted to a datetime.date ('date'
    column) and to a Month(year, month) ('month' column).
    '''

    def to_datetime_date(x):
        'convert a YYYYMMDD number to a datetime.date'
        year = int(x / 10000.0)
        month_and_day = x - year * 10000.0
        month = int(month_and_day / 100.0)
        day = int(month_and_day - month * 100)
        return datetime.date(year, month, day)

    row_limit = 10 if control.test else None
    transactions = pd.read_csv(control.path_in_samples, nrows=row_limit)

    dates = []
    for encoded in transactions[t.sale_date]:
        dates.append(to_datetime_date(encoded))
    months = []
    for sale_date in dates:
        months.append(Month(sale_date.year, sale_date.month))

    columns = {
        'price': transactions[t.price],
        'city': transactions[t.city],
        'date': dates,
        'month': months,
    }
    return pd.DataFrame(columns)
コード例 #18
0
def fit_and_predict(training_samples, query_samples, hps, control):
    '''Fit the configured model and predict the query samples.

    The training samples are first narrowed to the hps['n_months_back']
    window ending in the month before control.arg.prediction_month and to
    control.arg.neighborhood.  The model family is chosen by
    control.arg.model: 'en', 'gb', or anything else for the rf fitter.

    Returns (predictions, attributes, n_training_samples), where attributes
    holds coef_ and intercept_ for 'en' and feature_importances_ otherwise.

    Raises FittingError when no training sample survives the narrowing.
    '''
    def X_y(df):
        'return the (X, y) pair for df in the units requested by hps'
        return Features().extract_and_transform(df, hps['units_X'], hps['units_y'])

    last_training_month = Month(control.arg.prediction_month).decrement(1)
    relevant_training_samples = select_in_time_period_and_in_city(
        training_samples,
        last_training_month,
        hps['n_months_back'],
        control.arg.neighborhood,
    )
    if len(relevant_training_samples) == 0:
        details = (
            HPs.to_str(hps),
            control.arg.neighborhood,
            control.arg.prediction_month,
        )
        raise FittingError(
            'no relevant samples hps:%s neighborhood: %s prediction_month %s' % details)

    X_train, y_train = X_y(relevant_training_samples)
    X_query, _actuals = X_y(query_samples)  # the actuals are not needed here

    is_elastic_net = control.arg.model == 'en'
    if is_elastic_net:
        fitter = fit_en
    elif control.arg.model == 'gb':
        fitter = fit_gb
    else:
        fitter = fit_rf
    fitted = fitter(X_train, y_train, hps, control.random_seed)

    if is_elastic_net:
        attributes = {'coef_': fitted.coef_, 'intercept_': fitted.intercept_}
    else:
        attributes = {'feature_importances_': fitted.feature_importances_}

    predictions = fitted.predict(X_query)
    return predictions, attributes, len(relevant_training_samples)
コード例 #19
0
def select_in_time_period(df, last_month_str, n_months_back):
    '''Return the rows of df whose sale date falls in the time period.

    The period starts on day 1 of the month n_months_back before
    last_month_str and ends on the last day of last_month_str.  Sale dates
    are compared as floats of the form YYYYMMDD, so the sale date column
    must hold floats (asserted on its first element).
    '''
    # lower bound: day 1 of the first month (as_int() presumably is YYYYMM)
    first_month = Month(last_month_str).decrement(n_months_back)
    first_date_float = float(first_month.as_int() * 100 + 1)

    # upper bound: the day before the first day of the following month
    following_month = Month(last_month_str).increment()
    first_of_following = datetime.date(following_month.year, following_month.month, 1)
    period_end = first_of_following - datetime.timedelta(1)
    last_date_float = period_end.year * 10000.0 + period_end.month * 100.0 + period_end.day

    sale_dates = df[layout_transactions.sale_date]
    assert isinstance(sale_dates.iloc[0], float)

    in_period = (sale_dates >= first_date_float) & (sale_dates <= last_date_float)
    return df.loc[in_period]
コード例 #20
0
ファイル: fit-predict-reduce2.py プロジェクト: seyi/re-avm
def process_dirname(dirpath, dirname, reduction, reduction_2007, reduction_200701, no_data, test):
    '''Fold the predictions stored under dirpath/dirname into the reduction dicts.

    dirname is split on '-' into (training_data, neighborhood, model, month);
    directories whose model is 'gb' are skipped for now.

    Mutates
    - reduction: gains key (Fitted, transaction_ids) -> {hps_str: predictions}
    - reduction_2007: gains the same entry when the month is in 2007
    - reduction_200701: gains the same entry when the month is January 2007
    - no_data: gains dirname when the pickle file yields no record before
      EOF or when reading it raises ValueError

    When test is true, reading stops after 10 records.  Returns None.
    '''
    verbose = False
    if verbose:
        print dirname
    # NOTE(review): unpacking requires exactly four '-'-separated parts; a
    # neighborhood containing '-' would raise ValueError here -- confirm
    # that directory names never do
    training_data, neighborhood, model, month_str = dirname.split('-')
    if model == 'gb':
        print 'for now, skipping gb', dirname
        return
    month = Month(month_str)
    in_2007 = month.year == 2007
    in_200701 = month.year == 2007 and month.month == 1
    fitted = Fitted(training_data, neighborhood, model)
    # put every transaction id in canonical form; the tuple becomes part of
    # the reduction key
    transaction_ids_raw = read_transaction_ids(dirpath, dirname)
    transaction_ids_list = []
    for transaction_id_raw in transaction_ids_raw:
        canonical = TransactionId.canonical(transaction_id_raw)
        transaction_ids_list.append(canonical)
    transaction_ids = tuple(transaction_ids_list)
    path = os.path.join(dirpath, dirname, 'predictions-attributes.pickle')
    n_records_processed = 0
    # NOTE(review): the file is opened in text mode ('r'); pickle data is
    # normally read in binary mode ('rb') -- confirm how the file was written
    # NOTE(review): unpickling is only safe for files this project produced
    with open(path, 'r') as f:
        unpickler = pickle.Unpickler(f)
        dirname_reduction = {}
        try:
            # the file holds a sequence of pickled records; keep loading
            # until the unpickler signals the end with EOFError
            while True:
                obj = unpickler.load()
                if len(obj) == 3:
                    # fitted_attributes is unpacked but not used below
                    hps_str, predictions, fitted_attributes = obj
                    # convert from log domain to natural units
                    units_y = HPs.from_str(hps_str)['units_y']
                    predictions_restated = np.exp(predictions) if units_y == 'log' else predictions
                    if verbose:
                        print hps_str
                    dirname_reduction[hps_str] = predictions_restated
                else:
                    print 'error:', obj
                # records that are not 3-tuples are counted as processed too
                n_records_processed += 1
                if test and n_records_processed >= 10:
                    if verbose:
                        print 'test: stop after', n_records_processed
                    break
        except EOFError as e:
            # normal end of the file; only an empty file counts as no data
            if verbose:
                print 'EOFError (%s) for %s after %d records processed' % (e, dirname, n_records_processed)
            if n_records_processed == 0:
                no_data.add(dirname)
        except ValueError as e:
            print '%s' % e
            no_data.add(dirname)
    # the entry is stored even when it is empty or only partially read
    reduction_key = (fitted, transaction_ids)
    reduction[reduction_key] = dirname_reduction
    if in_2007:
        reduction_2007[reduction_key] = dirname_reduction
    if in_200701:
        reduction_200701[reduction_key] = dirname_reduction
    print dirname, n_records_processed, len(reduction), len(reduction_2007), len(no_data)
コード例 #21
0
ファイル: make_test_train.py プロジェクト: rlowrance/re-avm
def make_test_train(test_time_period, train_n_months_back, trade_month_column_name, samples):
    '''Split samples into (test_df, train_df); see valavm.do_val.fit_and_run.

    - test_df: rows whose trade month equals test_time_period
    - train_df: rows whose trade month lies in the train_n_months_back
      months before test_time_period

    test_time_period must be a Month and the trade month column must have
    dtype int64 (its values are compared against Month.as_int()).  Both
    returned frames are asserted to be non-empty.
    '''
    assert isinstance(test_time_period, Month), test_time_period
    month_column = samples[trade_month_column_name]
    assert month_column.dtype == np.dtype('int64')
    test_month = test_time_period.as_int()

    test_df = samples[month_column == test_month]

    train_start = Month(test_time_period).decrement(train_n_months_back).as_int()
    is_training_row = (month_column >= train_start) & (month_column < test_month)
    train_df = samples[is_training_row]

    assert len(test_df) > 0
    assert len(train_df) > 0

    return test_df, train_df
コード例 #22
0
def make_test_train(test_time_period, train_n_months_back,
                    trade_month_column_name, samples):
    '''Return (test_df, train_df) cut from samples; see valavm.do_val.fit_and_run.

    test_df keeps the rows traded in test_time_period (a Month); train_df
    keeps the rows traded in the train_n_months_back months before it.
    The column named trade_month_column_name must be int64 because it is
    compared with the values of Month.as_int().  An empty test or train
    frame trips an assertion.
    '''
    assert isinstance(test_time_period, Month), test_time_period
    traded_in = samples[trade_month_column_name]
    assert traded_in.dtype == np.dtype('int64')
    test_month = test_time_period.as_int()

    test_df = samples[traded_in == test_month]

    earliest = Month(test_time_period).decrement(train_n_months_back)
    not_too_old = traded_in >= earliest.as_int()
    before_test = traded_in < test_month
    train_df = samples[not_too_old & before_test]

    assert len(test_df) > 0
    assert len(train_df) > 0

    return test_df, train_df
コード例 #23
0
    def test_1(self):
        # Five samples, one per trade month from 200702 back to 200610;
        # check the sizes of the test and train frames for two cuts.
        verbose = False

        def vp(x):
            # pretty-print only when debugging this test
            if verbose:
                pprint(x)

        yyyymm = 'trade_month'
        x = 'x'
        trade_months = (200702, 200701, 200612, 200611, 200610)
        samples = pd.DataFrame([
            {yyyymm: trade_month, x: position}
            for position, trade_month in enumerate(trade_months)
        ])
        vp(samples)

        # (test month, months back, expected number of training rows)
        cases = (
            (200702, 1, 1),
            (200701, 2, 2),
        )
        for test_month, n_months_back, expected_train_size in cases:
            test, train = make_test_train(
                Month(test_month), n_months_back, yyyymm, samples)
            vp(test)
            vp(train)
            assert len(test) == 1
            assert len(train) == expected_train_size
コード例 #24
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_enter_times_from_last_week_in_month(self):
     # Of the two entries in week 44 (day indices 2 and 3), only the first
     # belongs to October 2012: it must be day 31, and there must be no
     # matching entry for day 32.
     m = Month(2012, 10)
     w = Week()
     w.set_day(2, TimeRange(Time(7), Time(17)))
     w.set_day(3, TimeRange(Time(7), Time(17)))
     m.add_week(44, w)
     # Use assertEqual/assertNotEqual: the *Equals aliases are deprecated.
     self.assertEqual(m.get_day(31), TimeRange(Time(7), Time(17)))
     self.assertNotEqual(m.get_day(32), TimeRange(Time(7), Time(17)))
コード例 #25
0
def make_table_stats(data, control, in_report_p):
    '''Return a Report of monthly price statistics.

    Months run from January 2003 through March 2009; a month is reported
    only when in_report_p(year, month) is true.  Each detail line carries
    the mean and median price, their ratios to the previously reported
    month (None for the first reported month), and the number of trades.
    The control argument is not used.
    '''
    r = Report()
    r.append('Prices by Month')
    r.append('')
    column_definitions = (
        ('year', 4, '%4d', (' ', ' ', 'year'), 'year of transaction'),
        ('month', 5, '%5d', (' ', ' ', 'month'), 'month of transaction'),
        ('mean_price', 6, '%6.0f', (' ', ' mean', 'price'),
         'mean price in dollars'),
        ('median_price', 6, '%6.0f', (' ', 'median', 'price'),
         'median price in dollars'),
        ('mean_price_ratio', 6, '%6.3f', (' mean', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('median_price_ratio', 6, '%6.3f', ('median', ' price', ' ratio'),
         'ratio of price in current month to prior month'),
        ('number_trades', 6, '%6d', ('number', 'of', 'trades'),
         'number of trades in the month'),
    )
    ct = ColumnsTable(column_definitions)

    # statistics of the most recently reported month, for the ratios
    prior_mean_price = None
    prior_median_price = None
    for year in range(2003, 2010):
        final_month = 3 if year == 2009 else 12  # the data end in March 2009
        for month in range(1, final_month + 1):
            if not in_report_p(year, month):
                continue
            prices = data[data.month == Month(year, month)].price
            mean_price = prices.mean()
            median_price = prices.median()
            number_trades = len(prices)

            if prior_mean_price is None:
                mean_price_ratio = None
            else:
                mean_price_ratio = mean_price / prior_mean_price
            if prior_median_price is None:
                median_price_ratio = None
            else:
                median_price_ratio = median_price / prior_median_price

            ct.append_detail(
                year=year,
                month=month,
                mean_price=mean_price,
                median_price=median_price,
                mean_price_ratio=mean_price_ratio,
                median_price_ratio=median_price_ratio,
                number_trades=number_trades,
            )
            prior_mean_price = mean_price
            prior_median_price = median_price
    ct.append_legend()
    for table_line in ct.iterlines():
        r.append(table_line)
    return r
コード例 #26
0
ファイル: predict.py プロジェクト: seyi/re-avm
def make_control(argv):
    'return a Bunch'

    print argv
    parser = argparse.ArgumentParser()
    parser.add_argument('invocation')
    parser.add_argument('samples', choices=['all', 'train'])
    parser.add_argument('model', choices=['en', 'gb', 'rf'])
    parser.add_argument('transaction_month')
    parser.add_argument('neighborhood')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--trace', action='store_true')
    arg = parser.parse_args(argv)

    arg.me = arg.invocation.split('.')[0]

    if arg.trace:
        pdb.set_trace()

    # convert arg.neighborhood into arg.all and arg.city
    arg.city = (None if arg.neighborhood == 'all' else
                arg.neighborhood.replace('_', ' '))

    random_seed = 123
    random.seed(random_seed)

    prior_month = Month(arg.transaction_month).decrement().as_str()
    in_dir = '%s-%s-%s-%s' % (arg.samples, arg.model, prior_month,
                              arg.neighborhood)
    out_dir = '%s-%s-%s-%s' % (arg.samples, arg.model, arg.transaction_month,
                               arg.neighborhood)

    dir_working = Path().dir_working()
    output_dir = (os.path.join(dir_working, arg.me + '-test', out_dir, '') if
                  arg.test else os.path.join(dir_working, arg.me, out_dir, ''))
    dirutility.assure_exists(output_dir)

    return Bunch(
        arg=arg,
        path_in_fitted=os.path.join(dir_working, 'fit', in_dir, ''),
        path_in_samples=os.path.join(dir_working, 'samples2',
                                     arg.samples + '.csv'),
        path_out_file=os.path.join(output_dir, 'predictions.pickle'),
        path_out_log=os.path.join(output_dir, '0log.txt'),
        random_seed=random_seed,
        timer=Timer(),
    )
コード例 #27
0
ファイル: fit.py プロジェクト: seyi/re-avm
def make_control(argv):
    'return a Bunch'

    print argv
    parser = argparse.ArgumentParser()
    parser.add_argument('invocation')
    parser.add_argument('data', choices=['all', 'train'])
    parser.add_argument('model', choices=['en', 'gb', 'rf'])
    parser.add_argument('last_month')
    parser.add_argument('neighborhood')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--trace', action='store_true')
    parser.add_argument('--dry', action='store_true')     # don't write output
    arg = parser.parse_args(argv)
    arg.me = arg.invocation.split('.')[0]

    if arg.trace:
        pdb.set_trace()

    arg.last = Month(arg.last_month)  # convert to Month and validate value

    # convert arg.neighborhood into arg.all and arg.city
    arg.city = (
        None if arg.neighborhood == 'all' else
        arg.neighborhood.replace('_', ' ')
    )

    random_seed = 123
    random.seed(random_seed)

    dir_working = Path().dir_working()
    fit_dir = (
        os.path.join(dir_working, arg.me + '-test') if arg.test else
        os.path.join(dir_working, arg.me)
    )
    last_dir = '%s-%s-%s-%s' % (arg.data, arg.model, arg.last_month, arg.neighborhood)
    path_out_dir = os.path.join(fit_dir, last_dir, '')
    dirutility.assure_exists(path_out_dir)

    return Bunch(
        arg=arg,
        path_in_dir=os.path.join(dir_working, 'samples2', ''),
        path_out_dir=path_out_dir,
        path_out_log=os.path.join(path_out_dir, '0log.txt'),
        random_seed=random_seed,
        timer=Timer(),
    )
コード例 #28
0
    def __init__(self, name):
        """Build the twelve months of the Gregorian calendar year `name`.

        name is the year number as an int.  self.months ends up holding
        twelve Month objects in calendar order, each created from its
        lower-case English name, its list of Day objects and this year.
        Every Day receives its weekday name, its 1-based day of the month
        and None as third argument.  Neighbouring months are linked with
        setPrev()/setNext(); January gets no previous month and December
        no next month.
        """
        self.name = name
        self.months = []
        monthNames = [
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december'
        ]
        weekdays = [
            'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
            'Saturday'
        ]

        # Gregorian leap-year rule, see
        # https://en.wikipedia.org/wiki/Leap_year#Algorithm
        isLeapYear = (name % 4 == 0 and name % 100 != 0) or name % 400 == 0
        monthLengths = [
            31, 29 if isLeapYear else 28, 31, 30, 31, 30,
            31, 31, 30, 31, 30, 31
        ]

        # Weekday of 1 January as an index into weekdays (0 = Sunday).
        # 1 January of year 1 is a Monday (index 1) in the proleptic
        # Gregorian calendar, and each elapsed year shifts the weekday by
        # one day (365 % 7 == 1) plus one more per elapsed leap day, see
        # https://en.wikipedia.org/wiki/Determination_of_the_day_of_the_week
        # Only integer arithmetic is used: the previous float formula gave
        # wrong weekdays outside the 2000s (e.g. 1999 and 2100) and relied
        # on Python 2 integer division.
        elapsedYears = name - 1
        elapsedLeapDays = (
            elapsedYears // 4 - elapsedYears // 100 + elapsedYears // 400)
        weekOffset = (1 + elapsedYears + elapsedLeapDays) % 7

        # dayCount is the number of days of the year created so far
        dayCount = 0
        for monthName, monthLength in zip(monthNames, monthLengths):
            days = []
            for dayOfMonth in range(1, monthLength + 1):
                weekday = weekdays[(dayCount + weekOffset) % 7]
                days.append(Day(weekday, dayOfMonth, None))
                dayCount += 1
            self.months.append(Month(monthName, days, self))

        # link every month to its neighbours within the year
        lastIndex = len(self.months) - 1
        for i, month in enumerate(self.months):
            if i > 0:
                month.setPrev(self.months[i - 1])
            if i < lastIndex:
                month.setNext(self.months[i + 1])
コード例 #29
0
    def make_prices_volumes(data):
        '''Return (median_prices, volumes), two dicts keyed by (year, month).

        Months run from January 2003 through March 2009.  For each month,
        median_prices holds the median of the price column and volumes the
        number of rows, both taken over the rows of data whose month
        column equals Month(year, month).
        '''
        median_prices = {}
        volumes = {}
        for year in range(2003, 2010):
            final_month = 3 if year == 2009 else 12  # data end in March 2009
            for month in range(1, final_month + 1):
                month_prices = data[data.month == Month(year, month)].price
                year_month = (year, month)
                median_prices[year_month] = month_prices.median()
                volumes[year_month] = len(month_prices)
        return median_prices, volumes
コード例 #30
0
ファイル: testbest.py プロジェクト: josephjlee/re-avm
def do_testbest(control, samples, best):
    'determine accuracy of the HPs sets found to be best by valavm (as reported by chart-06)'
    result = {}
    for test_period, value in best.iteritems():
        forecast_month = Month(test_period).increment()
        series = value[0]
        print test_period, forecast_month
        print series
        avm = make_avm(forecast_month, control.random_seed, series)
        test_df, train_df = make_test_train(
            forecast_month,
            series.n_months_back,
            layout_transactions.yyyymm,
            samples,
        )
        avm.fit(
            train_df
        )  # the AVM object knows how to extract the train and test samples
        predictions = avm.predict(test_df)
        actuals = samples[layout_transactions.price]
        result[forecast_month] = valavm.ResultValue(actuals, predictions)
    return result
コード例 #31
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_add_week_52_of_previous_year_to_january(self):
     # Week 52 is added to January 2012; the entry stored at week-day
     # index 6 is expected to be reported as day 1 of the month.
     m = Month(2012, 1)
     w = Week()
     w.set_day(6, TimeRange(Time(7), Time(10)))
     m.add_week(52, w)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(m.get_day(1), TimeRange(Time(7), Time(10)))
コード例 #32
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_add_week_52_of_previous_year_to_january(self):
     # The entry at week-day index 6 of week 52 must surface as day 1 of
     # January 2012.
     m = Month(2012, 1)
     w = Week()
     w.set_day(6, TimeRange(Time(7), Time(10)))
     m.add_week(52, w)
     # Use assertEqual: the assertEquals alias is deprecated.
     self.assertEqual(m.get_day(1), TimeRange(Time(7), Time(10)))
コード例 #33
0
ファイル: chart06_make_chart_cd.py プロジェクト: seyi/re-avm
def make_chart_cd(reduction, median_prices, control, detail_line_indices,
                  report_id):
    '''Write the chart c/d text report and the two-panel error figure.

    Arguments
    - reduction: dict[validation_month] -> mapping of model key -> value
      with a .mae attribute; the first key of each mapping is charted as
      the month's result (presumably the mapping is ordered by increasing
      MAE -- confirm against the code that builds it)
    - median_prices: mapping from Month to the median price of that month
    - control: supplies validation_months_long, column_definitions, test,
      the exceptions list, path_out_c_pdf and path_out_cd
    - detail_line_indices: positions within each month's mapping that get
      a detail line; positions past the end of the mapping are skipped
    - report_id: substituted into control.path_out_cd

    Months missing from reduction are recorded in control.exceptions and
    left out of the report and the figure.  Returns None.
    '''
    r = ChartCDReport(control.column_definitions, control.test)
    my_validation_months = []
    my_price = []
    my_mae = []
    for validation_month in control.validation_months_long:
        median_price = median_prices[Month(validation_month)]

        if validation_month not in reduction:
            control.exceptions.append('reduction is missing month %s' %
                                      validation_month)
            continue
        month_results = reduction[validation_month]
        # materialize the keys so that they can be indexed by position even
        # where keys() returns a view instead of a list
        month_result_keys = list(month_results.keys())
        my_validation_months.append(validation_month)
        my_price.append(median_price)
        for detail_line_index in detail_line_indices:
            if detail_line_index >= len(month_result_keys):
                continue  # this can happen when using samples
            k = month_result_keys[detail_line_index]
            v = month_results[k]
            r.append_detail(
                validation_month=validation_month,
                rank=detail_line_index + 1,
                median_absolute_error=v.mae,
                median_price=median_price,
                model=k.model,
                n_months_back=k.n_months_back,
                max_depth=k.max_depth,
                n_estimators=k.n_estimators,
                max_features=k.max_features,
                learning_rate=k.learning_rate,
                alpha=k.alpha,
                l1_ratio=k.l1_ratio,
                units_X=k.units_X[:3],
                units_y=k.units_y[:3],
            )
        my_mae.append(month_results[month_result_keys[0]].mae)

    bar_positions = range(len(my_validation_months))
    tick_positions = [x + .6 for x in bar_positions]
    labels = my_validation_months

    # top panel: MAE in dollars for each validation month
    fig = plt.figure()
    fig1 = fig.add_subplot(211)
    fig1.bar(bar_positions, my_mae, color='blue')
    plt.xticks(tick_positions,
               labels,
               rotation=-70,
               size='xx-small')

    plt.yticks(size='xx-small')
    plt.xlabel('Year-Month')
    plt.ylabel('Median Absolute Error ($)')
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

    # bottom panel: MAE relative to the month's median price
    # NOTE(review): both operands are truncated to int, so without true
    # division in effect the ratio is an integer (0 whenever mae < price)
    # -- confirm that this file enables `from __future__ import division`
    fig2 = fig.add_subplot(212)
    fig2.bar(
        bar_positions,
        [int(m) / int(p) for m, p in zip(my_mae, my_price)],
        color='blue',
    )
    plt.xticks(
        tick_positions,
        labels,
        rotation=-70,
        size='xx-small',
    )

    plt.yticks(size='xx-small')
    plt.xlabel('Year-Month')
    plt.ylabel('Absolute Relative Error')

    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

    plt.savefig(control.path_out_c_pdf)
    plt.close()

    r.write(control.path_out_cd % report_id)
    return
コード例 #34
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_returns_annotated_weeks(self):
     # get_week() must hand back an AnnotatedWeek (exact type), not the
     # plain Week that was added.
     m = Month(2012, 10)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     m.add_week(40, w)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(type(m.get_week(40)), AnnotatedWeek)
コード例 #35
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_returns_annotated_weeks(self):
     # The object returned by get_week() must be exactly of type
     # AnnotatedWeek.
     m = Month(2012, 10)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     m.add_week(40, w)
     # Use assertEqual: the assertEquals alias is deprecated.
     self.assertEqual(type(m.get_week(40)), AnnotatedWeek)
コード例 #36
0
    def make_figure2(validation_month):
        '''make and write figure for the validation month
        Part 1:
        for the validation month
        one bar for each of the first 50 best models
        the height of the bar is the MAE in ($)
        Part 2:
        produce a 2-up chart, where the top chart is as in part 1
        and the bottom chart has as y axis the absolute relative error
        '''

        print 'creating figure b', validation_month

        # plt.suptitle('Loss by Test Period, Tree Max Depth, N Trees')  # overlays the subplots
        bar_color = {'gb': 'white', 'rf': 'black', 'en': 'red'}
        models, maes = make_models_maes(validation_month)
        assert len(models) == len(maes)
        assert len(models) > 0
        # the reduction is sorted by increasing mae
        # Jonathan
        fig = plt.figure()
        fig1 = fig.add_subplot(211)

        plt.title(
            'Validation Month: %s' % (validation_month),
            loc='right',
            fontdict={
                'fontsize': 'large',
                'style': 'italic'
            },
        )
        for i, model in enumerate(models):
            fig1.bar(i, maes[i], color=bar_color[model])
        plt.yticks(size='xx-small')
        plt.xticks([])
        plt.xlabel('Models in order of increasing MAE')
        plt.ylabel('MAE ($)')

        white_patch = mpatches.Patch(
            facecolor='white',
            edgecolor='black',
            lw=1,
            label="Gradient Boosting",
        )
        black_patch = mpatches.Patch(
            facecolor='black',
            edgecolor='black',
            lw=1,
            label="Random Forest",
        )

        plt.legend(handles=[white_patch, black_patch], loc=2)
        plt.ylim(0, 180000)

        fig2 = fig.add_subplot(212)
        for i, model in enumerate(models):
            fig2.bar(i,
                     maes[i] / median_price[Month(validation_month)],
                     color=bar_color[model])

        plt.yticks(size='xx-small')
        plt.xticks([])
        plt.xlabel('Models in order of increasing MAE')
        plt.ylabel('Absolute Relative Error')
        plt.ylim(0, .3)

        white_patch = mpatches.Patch(
            facecolor='white',
            edgecolor='black',
            lw=1,
            label="Gradient Boosting",
        )
        black_patch = mpatches.Patch(
            facecolor='black',
            edgecolor='black',
            lw=1,
            label="Random Forest",
        )

        plt.legend(handles=[white_patch, black_patch], loc=2)
        plt.savefig(control.path_out_b_pdf % int(validation_month))
        plt.close()
コード例 #37
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_give_week_numbers_for_month(self):
     # October 2012 is expected to span week numbers 40 through 44.
     m = Month(2012, 10)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(m.get_week_numbers(), [40, 41, 42, 43, 44])
コード例 #38
0
 def median_price_for_year_month(year, month):
     '''Return the median of the prices whose month equals Month(year, month).

     Both months and prices come from the enclosing scope.  The result is
     NaN when there are no transactions in the (year, month).
     '''
     month_prices = prices[months == Month(year, month)]
     return month_prices.median()
コード例 #39
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_return_week(self):
     # The week returned for week number 40 must expose the day entry
     # that was set before the week was added to the month.
     m = Month(2012, 10)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     m.add_week(40, w)
     # Use assertEqual: the assertEquals alias is deprecated.
     self.assertEqual(m.get_week(40).get_day(0), TimeRange(Time(7), Time(17)))
コード例 #40
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_returns_correct_week_range_for_january(self):
     # January 2012 is expected to start in week 52 of the previous year
     # and then run through weeks 1 to 5.
     m = Month(2012, 1)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(m.get_week_numbers(), [52, 1, 2, 3, 4, 5])
コード例 #41
0
def make_chart_efh(k, reduction, actuals, median_price, control):
    '''Write charts e and f; return the median absolute (relative) regrets.

    For every validation month the "query month" is the following month.
    The first k model keys of reduction[query_month] are listed in report e
    and combined into an ensemble whose weights are
    exp(-MAE_validation / 100000).  The ensemble's MAE on the query month is
    compared with the MAE of the first model in reduction[query_month].

    Args:
        k: number of individual models listed and combined into the ensemble.
        reduction: mapping month string -> mapping model key -> value; the
            values expose .mae and .predictions, the keys expose .model,
            .n_months_back, .n_estimators, .max_features, .max_depth and
            .learning_rate.
        actuals: mapping month string -> actual values handed to
            errors.errors() together with the ensemble predictions.
        median_price: mapping Month -> median price of that month.
        control: supplies validation_months, column_definitions, test,
            exceptions (a list that is appended to) and the output path
            templates path_out_e_txt, path_out_e_pdf and path_out_f.

    Returns:
        Tuple (median_absolute_regret, median_absolute_relative_regret),
        where regret = ensemble MAE - MAE of the first query-month model and
        the relative regret divides by the validation month's median price.

    Validation months whose query month is missing from `reduction` are
    recorded in control.exceptions and left out of the charts and of the
    returned statistics.
    '''
    def interesting():
        return k == 5

    def trace_if_interesting():
        # debugging hook: break into pdb only for the k being investigated
        if interesting():
            print('k %s' % (k,))
            pdb.set_trace()
            return True
        else:
            return False

    ensemble_weighting = 'exp(-MAE/100000)'
    mae = {}
    debug = False  # set to True to stop in pdb inside the loops below
    my_validation_months = []  # only the months that were actually processed
    for validation_month in control.validation_months:
        e = ChartEReport(k, ensemble_weighting, control.column_definitions,
                         control.test)
        # NOTE(review): h receives detail lines below, but no h.write(...)
        # call appears in this function -- confirm whether that is intended.
        h = ChartHReport(k, ensemble_weighting, control.column_definitions,
                         control.test)
        if debug:
            print(validation_month)
            pdb.set_trace()
        query_month = Month(validation_month).increment(1).as_str()
        if query_month not in reduction:
            control.exceptions.append('%s not in reduction (charts ef)' %
                                      query_month)
            # the list is named `exceptions` (see the append just above)
            print(control.exceptions)
            continue
        cum_weighted_predictions = None
        cum_weights = 0
        mae_validation = None
        check_key_order(reduction[validation_month])
        # write lines for the k best individual models
        # accumulate info needed to build the ensemble model
        index0_mae = None
        for index, query_month_key in enumerate(reduction[query_month].keys()):
            # print only k rows
            if index >= k:
                break
            print('%s %s' % (index, query_month_key))
            validation_month_value = reduction[validation_month][
                query_month_key]
            print(query_month)
            query_month_value = reduction[query_month][query_month_key]
            if mae_validation is not None and False:  # turn off this test for now
                trace_unless(
                    mae_validation <= validation_month_value.mae,
                    'should be non-decreasing',
                    mae_previous=mae_validation,
                    mae_next=validation_month_value.mae,
                )
            mae_validation = validation_month_value.mae

            mae_query = query_month_value.mae
            if index == 0:
                index0_mae = mae_query
            eta = 1.0
            # the weight depends on the validation-month error, not on the
            # query-month error
            weight = math.exp(-eta * (mae_validation / 100000.0))
            e.detail_line(
                validation_month=validation_month,
                model=query_month_key.model,
                n_months_back=query_month_key.n_months_back,
                n_estimators=query_month_key.n_estimators,
                max_features=query_month_key.max_features,
                max_depth=query_month_key.max_depth,
                learning_rate=query_month_key.learning_rate,
                rank=index + 1,
                mae_validation=mae_validation,
                weight=weight,
                mae_query=mae_query,
            )

            h.detail_line(
                validation_month=validation_month,
                model_description=short_model_description(query_month_key),
                mae_validation=mae_validation,
                mae_query=mae_query,
            )
            # need the mae of the ensemble
            # need the actuals and predictions? or is this already computed
            predictions_next = query_month_value.predictions
            if cum_weighted_predictions is None:
                cum_weighted_predictions = weight * predictions_next
            else:
                cum_weighted_predictions += weight * predictions_next
            cum_weights += weight
        # write line comparing the best individual model in the next month
        # to the ensemble model
        trace_if_interesting()
        ensemble_predictions = cum_weighted_predictions / cum_weights
        ensemble_rmse, ensemble_mae, ensemble_ci95_low, ensemble_ci95_high = errors.errors(
            actuals[query_month],
            ensemble_predictions,
        )
        # list(...) keeps the "first key" lookup valid whether keys() returns
        # a list or a view
        best_key = list(reduction[query_month].keys())[0]
        best_value = reduction[query_month][best_key]
        e.detail_line(
            validation_month=validation_month,
            mae_ensemble=ensemble_mae,
            model=best_key.model,
            n_months_back=best_key.n_months_back,
            n_estimators=best_key.n_estimators,
            max_features=best_key.max_features,
            max_depth=best_key.max_depth,
            learning_rate=best_key.learning_rate,
        )
        h.detail_line(
            validation_month=validation_month,
            model_description='ensemble',
            mae_query=ensemble_mae,
        )
        my_validation_months.append(validation_month)

        e.write(control.path_out_e_txt % (k, validation_month))
        mae[validation_month] = Bunch(
            index0=index0_mae,
            ensemble=ensemble_mae,
            best_next_month=best_value.mae,
        )

    # series for the bar charts, in the order the months were processed
    my_ensemble_mae = [mae[month].ensemble for month in my_validation_months]
    my_best_mae = [
        mae[month].best_next_month for month in my_validation_months
    ]
    my_price = [median_price[Month(month)] for month in my_validation_months]

    width = 0.35

    fig = plt.figure()
    fig1 = fig.add_subplot(211)
    fig1.bar(
        [x + width for x in range(len(my_validation_months))],
        my_best_mae,
        width,
        color='white',
    )
    fig1.bar(
        range(len(my_validation_months)),
        my_ensemble_mae,
        width,
        color='black',
    )

    plt.ylim(0, 180000)

    labels = my_validation_months
    plt.xticks(
        [x + .4 for x in range(len(my_validation_months))],
        labels,
        rotation=-70,
        size='xx-small',
    )

    plt.ylabel('MAE ($)')
    plt.xlabel('Year-Month')

    white_patch = mpatches.Patch(
        facecolor='white',
        edgecolor='black',
        hatch='',
        lw=1,
        label="MAE of Best Model in Validation Month",
    )
    black_patch = mpatches.Patch(
        facecolor='black',
        edgecolor='black',
        hatch='',
        lw=1,
        label="MAE of Ensemble of " + str(k) +
        " Best Models in Validation Month",
    )
    plt.legend(handles=[white_patch, black_patch], loc=2)

    fig2 = fig.add_subplot(212)

    # float division: the ratios are fractions below 1, so integer division
    # (the Python 2 meaning of int / int) would floor every bar to 0
    fig2.bar(
        [x + width for x in range(len(my_validation_months))],
        [float(m) / float(p) for m, p in zip(my_best_mae, my_price)],
        width,
        color='white',
    )
    fig2.bar(
        range(len(my_validation_months)),
        [float(m) / float(p) for m, p in zip(my_ensemble_mae, my_price)],
        width,
        color='black',
    )
    plt.ylim(0, .5)
    labels = my_validation_months
    plt.xticks(
        [x + .4 for x in range(len(my_validation_months))],
        labels,
        rotation=-70,
        size='xx-small',
    )

    plt.ylabel('Absolute Relative Error')
    plt.xlabel('Year-Month')

    white_patch = mpatches.Patch(
        facecolor='white',
        edgecolor='black',
        hatch='',
        lw=1,
        label="ARE of Best Model in Validation Month",
    )
    black_patch = mpatches.Patch(
        facecolor='black',
        edgecolor='black',
        hatch='',
        lw=1,
        label="ARE of Ensemble of " + str(k) +
        " Best Models in Validation Month",
    )
    plt.legend(handles=[white_patch, black_patch], loc=2)

    plt.tight_layout(pad=0.8, w_pad=0.8, h_pad=1.0)
    plt.savefig(control.path_out_e_pdf % k)

    plt.close()

    f = ChartFReport(k, ensemble_weighting, control.column_definitions,
                     control.test)
    regrets = []
    relative_errors = []
    # Only months that produced an entry in `mae` can be reported; months
    # skipped above (query month missing from reduction) have no entry.
    for validation_month in my_validation_months:
        query_month = Month(validation_month).increment(1).as_str()
        if debug:
            print(query_month)
            # TODO: need to define best_next_month  --> best_query_month
            print("need to define best_next_month  --> best_query_month")
            pdb.set_trace()
        month_mae = mae[validation_month]
        regret = month_mae.ensemble - month_mae.best_next_month
        regrets.append(regret)
        relative_error = regret / median_price[Month(validation_month)]
        relative_errors.append(relative_error)
        median_price_next = median_price[Month(query_month)]
        f.detail_line(
            validation_month=validation_month,
            mae_index0=month_mae.index0,
            mae_ensemble=month_mae.ensemble,
            mae_best_next_month=month_mae.best_next_month,
            median_price=median_price[Month(validation_month)],
            fraction_median_price_next_month_index0=(
                month_mae.index0 / median_price_next),
            fraction_median_price_next_month_ensemble=(
                month_mae.ensemble / median_price_next),
            fraction_median_price_next_month_best=(
                month_mae.best_next_month / median_price_next),
        )
    median_absolute_regret = np.median(np.abs(regrets))
    median_absolute_relative_regret = np.median(np.abs(relative_errors))
    f.write(control.path_out_f % k)
    return median_absolute_regret, median_absolute_relative_regret
コード例 #42
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_add_week_1_of_next_year_to_december(self):
     # Week 1 added to December 2012 is expected to supply the times for
     # day 31 through the week's day 0.
     # assertEqual replaces the deprecated assertEquals alias, which no
     # longer exists on unittest.TestCase in Python 3.12+.
     m = Month(2012, 12)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(10)))
     m.add_week(1, w)
     self.assertEqual(m.get_day(31), TimeRange(Time(7), Time(10)))
コード例 #43
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_returns_correct_week_range_for_december(self):
     # December 2012 is expected to cover weeks 48 through 52 and end in
     # week 1 (of the following year).
     # assertEqual replaces the deprecated assertEquals alias, which no
     # longer exists on unittest.TestCase in Python 3.12+.
     m = Month(2012, 12)
     self.assertEqual(m.get_week_numbers(), [48, 49, 50, 51, 52, 1])
コード例 #44
0
ファイル: parser.py プロジェクト: luccalb/getthebag
        # iterate over every transaction and parse it into a proper Transaction object
        for idx, trans in month.iterrows():
            # progress report: fires only while the percentage done lies
            # strictly between a multiple of 10 and that multiple plus 1
            if 0 < ((parsed_transacts / number_of_rows) * 100) % 10 < 1:
                print("currently at",
                      ((parsed_transacts / number_of_rows) * 100), "percent")

            # the column names are those of the input rows (German headers)
            current_trans = Transaction(trans['Buchungstag'],
                                        trans['Buchungstext'], trans['Betrag'],
                                        trans['Währung'])

            # descriptions containing "ANLEGEN" are flagged as savings; the
            # amount is added with its sign flipped
            # NOTE(review): presumably savings transfers carry negative
            # amounts, making months_savings positive -- confirm
            if "ANLEGEN" in current_trans.descr:
                current_trans.isSavings = True
                months_savings += 0 - current_trans.amount  # move to analyzer

            # PayPal and "Auszahlung" entries are marked for review
            if "PayPal" in current_trans.descr or "Auszahlung" in current_trans.descr:
                current_trans.toBeReviewed = True

            # negative amounts that are not savings accumulate into
            # totalSpendings (the sum therefore stays negative)
            if current_trans.amount < 0 and not "ANLEGEN" in current_trans.descr:  # move to analyzer
                totalSpendings += current_trans.amount  # move to analyzer

            # store the transaction's attribute dict as the database document
            db_transactions.insert_one(current_trans.__dict__)
            parsed_transacts += 1

            months_transactions.append(current_trans)  # move to analyzer

        # add all the transactions to the current month of the current year
        current_month = Month(key.year, key.month, months_savings,
                              totalSpendings,
                              months_transactions)  # move to analyzer
        db_months.insert_one(current_month.__dict__)  # move to analyzer
コード例 #45
0
def make_control(argv):
    '''Parse argv and return a Bunch holding the run's configuration.

    Side effects: prints argv, seeds the `random` module with a fixed seed,
    and makes sure the output directories exist.
    '''
    print(argv)

    # command line: a positional invocation name plus two boolean switches
    parser = argparse.ArgumentParser()
    parser.add_argument('invocation')
    switches = (
        ('--data', 'reduce input and create data file in WORKING'),
        ('--test', 'set internal test flag'),
    )
    for switch, help_text in switches:
        parser.add_argument(switch, help=help_text, action='store_true')
    arg = parser.parse_args(argv)
    # the base name is the invocation up to its first '.'
    base_name = arg.invocation.split('.')[0]
    arg.me = base_name

    random_seed = 123
    random.seed(random_seed)

    dir_working = Path().dir_working()

    def create_dir(path1, path2):
        # join the two parts, make sure the directory exists, return its path
        result_path = os.path.join(path1, path2)
        dirutility.assure_exists(result_path)
        return result_path

    # test runs write into a separate '<me>-test' directory
    dir_suffix = '-test' if arg.test else ''
    dir_chart01 = create_dir(dir_working, arg.me + dir_suffix)
    dir_date_price = create_dir(dir_chart01, 'date_price')
    dir_median_price = create_dir(dir_chart01, 'median_price')
    dir_prices_volume = create_dir(dir_chart01, 'prices_volume')

    def in_chart01(file_name):
        # path of an output file directly inside the chart directory
        return os.path.join(dir_chart01, file_name)

    # every month from 2003-01 through 2009-03, in chronological order
    all_months = []
    for year in range(2003, 2010):
        last_month = 3 if year == 2009 else 12
        for month in range(1, last_month + 1):
            all_months.append(Month(year, month))

    return Bunch(
        all_months=all_months,
        arg=arg,
        base_name=base_name,
        debug=False,
        path_in_interesting_cities=os.path.join(dir_working,
                                                'interesting_cities.txt'),
        path_in_samples=os.path.join(dir_working, 'samples2', 'train.csv'),
        path_out_dir_date_price=dir_date_price,
        path_out_dir_median_price=dir_median_price,
        path_out_dir_prices_volume=dir_prices_volume,
        path_out_log=in_chart01('0log.txt'),
        path_out_price_statistics_city_name=in_chart01(
            'price-statistics-city-name.txt'),
        path_out_price_statistics_count=in_chart01(
            'price-statistics-count.txt'),
        path_out_price_statistics_median_price=in_chart01(
            'price-statistics-median-price.txt'),
        path_out_price_volume=in_chart01('price-volume.pdf'),
        path_out_stats_all=in_chart01('price-stats-all.txt'),
        path_out_stats_count_by_city_in_2007=in_chart01(
            'count-by-city-in-2007.txt'),
        path_out_stats_2006_2008=in_chart01('price-stats-2006-2008.txt'),
        path_reduction=in_chart01('0data.pickle'),
        random_seed=random_seed,
        test=arg.test,
    )
コード例 #46
0
ファイル: MonthTest.py プロジェクト: remar/bfiller
 def test_can_enter_times_from_second_week(self):
     # Times set on day 0 of week 41 are expected to show up as day 8 of
     # October 2012.
     # assertEqual replaces the deprecated assertEquals alias, which no
     # longer exists on unittest.TestCase in Python 3.12+.
     m = Month(2012, 10)
     w = Week()
     w.set_day(0, TimeRange(Time(7), Time(17)))
     m.add_week(41, w)
     self.assertEqual(m.get_day(8), TimeRange(Time(7), Time(17)))