def predict(self, business_forecast, training_data=pd.DataFrame()):
    """Predict per-bin lesson counts by summing per-timezone distributions.

    For each forecast entry, the rows of ``training_data`` whose ``user_tz``
    equals the entry's ``user_tz`` are wrapped in an ``ActualSchedule``.
    Its bin counts are offset by ``self.smoother``, normalised so they sum
    to one, and weighted by ``entry['frequency'] * entry['schedule_type']``.
    The weighted distributions of all entries are added together.

    Args:
        business_forecast: iterable of mappings providing the keys
            'user_tz', 'frequency' and 'schedule_type'.
        training_data: frame exposing a ``user_tz`` column. The default
            empty frame has no such column, so the default is only usable
            together with an empty forecast.

    Returns:
        A length-6 numpy array with the accumulated weighted distributions
        (all zeros when the forecast is empty).
    """
    sum_bins = np.zeros(6)
    for entry in business_forecast:
        same_tz_rows = training_data[training_data.user_tz == entry['user_tz']]
        # Build the smoothed counts once so bins() is not recomputed for the
        # denominator.
        # assumes bins() yields six counts that broadcast against
        # np.zeros(6) -- TODO confirm against ActualSchedule.
        smoothed = ActualSchedule(same_tz_rows).bins() + self.smoother
        distribution = smoothed / smoothed.sum()
        sum_bins += distribution * entry['frequency'] * entry['schedule_type']
    return sum_bins
def predict(self, business_forecast, training_data=pd.DataFrame()):
    """Predict per-bin lesson counts by summing per-schedule-type distributions.

    For each forecast entry, the rows of ``self.training_data`` whose
    ``schedule_type`` equals the entry's ``schedule_type`` are wrapped in an
    ``ActualSchedule``. Its bin counts get 0.01 added to every bin, are
    normalised so they sum to one, and are weighted by
    ``entry['frequency'] * entry['schedule_type']``.

    Args:
        business_forecast: iterable of mappings providing the keys
            'schedule_type' and 'frequency'.
        training_data: accepted but not read by this method; the rows come
            from ``self.training_data`` instead.

    Returns:
        A length-6 numpy array with the accumulated weighted distributions
        (all zeros when the forecast is empty).
    """
    sum_bins = np.zeros(6)
    # Constant additive offset applied to each of the six bins.
    smoother = np.array([0.01] * 6)
    for entry in business_forecast:
        same_type_rows = self.training_data[
            self.training_data.schedule_type == entry['schedule_type']]
        # Build the smoothed counts once so bins() is not recomputed for the
        # denominator.
        smoothed = ActualSchedule(same_type_rows).bins() + smoother
        distribution = smoothed / smoothed.sum()
        sum_bins += distribution * entry['frequency'] * entry['schedule_type']
    return sum_bins
def predict(self, business_forecast, training_data=pd.DataFrame()):
    """Scale the training-data bin distribution to the forecast's lesson count.

    The rows returned by ``self.filtered_training_data(business_forecast)``
    are wrapped in an ``ActualSchedule``; its bins are divided by their sum
    and multiplied by
    ``self.num_business_forecast_lessons(business_forecast)``.

    Args:
        business_forecast: forecast handed to both ``self`` helpers.
        training_data: accepted but not read by this method.

    Returns:
        The normalised bins multiplied by the forecast lesson count.
    """
    # Experiment notes kept from the original author:
    #   - try company: show chi-square test
    #   - try timezone: show chi-square test; timezone generally is not
    #     statistically significant (2x and 3x), significant for 4x and 5x
    #     (the original note trails off here)
    #   - try company and timezone: too difficult, already pretty accurate
    filtered_rows = self.filtered_training_data(business_forecast)
    counts = ActualSchedule(filtered_rows).bins()
    # NOTE(review): no smoothing here, so an all-zero bins() would divide by
    # zero -- confirm whether the filter can return no rows.
    share = counts / counts.sum()
    return share * self.num_business_forecast_lessons(business_forecast)
def errors(self):
    """Back-test the model over a sliding training window and tabulate errors.

    For every offset ``i`` in
    ``range(self.num_months() - self.training_data_span_months)`` a fresh
    model from ``self._model()`` is fitted on
    ``self.training_data(i, span)`` and asked to predict the data returned
    by ``self.test_data(i + span)``. The prediction and the actual bins are
    printed and compared with ``mean_absolute_error``.

    Returns:
        A DataFrame indexed by ``self.year_month_index(i + span)`` with the
        columns 'errors', 'test_data_size' (sum of the actual bins),
        'predictions' and 'actuals'. It has no rows when the loop range is
        empty.
    """
    indices = []  # row labels, formatted YYYY-M (e.g. 2016-8, 2016-9)
    predictions = []
    errors = []  # mean absolute errors; lower is better
    test_data_size = []
    actuals = []
    span = self.training_data_span_months
    for i in range(self.num_months() - span):
        model = self._model()
        training_data = self.training_data(i, span)
        model.fit(training_data)
        td = self.test_data(i + span)

        # Single-argument print(...) emits the same text on Python 2 and is
        # also valid syntax on Python 3.
        print("Generating prediction...")
        prediction = model.predict(
            BusinessForecast(td).convert(), training_data=training_data)
        actual = ActualSchedule(td).bins()
        print("prediction ")
        print(prediction)
        print("\n")
        print("actual")
        print(actual)
        print("\n")

        error = mean_absolute_error(actual, prediction)
        errors.append(error)
        index = self.year_month_index(i + span)
        indices.append(index)
        print("time: {}, error: {}".format(index, error))

        predictions.append(prediction)
        actuals.append(actual)
        test_data_size.append(actual.sum())

    return pd.DataFrame({
        'errors': errors,
        'test_data_size': test_data_size,
        'predictions': predictions,
        'actuals': actuals
    }).set_index([indices])
def errors(self):
    """Evaluate the model month by month and collect the results in a frame.

    The loop runs over
    ``range(self.num_months() - self.training_data_span_months)``. In each
    round a new model from ``self._model()`` is fitted on
    ``self.training_data(offset, span)``; it then predicts the data from
    ``self.test_data(offset + span)``. Prediction and actual bins are
    printed, and their ``mean_absolute_error`` is recorded.

    Returns:
        A DataFrame with the columns 'errors', 'test_data_size' (sum of the
        actual bins), 'predictions' and 'actuals', indexed by the labels
        from ``self.year_month_index(offset + span)``. It has no rows when
        the loop range is empty.
    """
    indices = []  # row labels, formatted YYYY-M (e.g. 2016-8, 2016-9)
    predictions = []
    errors = []  # mean absolute errors; lower is better
    test_data_size = []
    actuals = []
    span = self.training_data_span_months
    for offset in range(self.num_months() - span):
        # The month evaluated in this round sits right after the training
        # window; it selects both the test data and the row label.
        target_month = offset + span

        m = self._model()
        training_data = self.training_data(offset, span)
        m.fit(training_data)
        td = self.test_data(target_month)

        # print(...) with one argument behaves identically on Python 2 and
        # parses on Python 3, unlike the bare print statement.
        print("Generating prediction...")
        forecast = BusinessForecast(td).convert()
        prediction = m.predict(forecast, training_data=training_data)
        actual = ActualSchedule(td).bins()
        print("prediction ")
        print(prediction)
        print("\n")
        print("actual")
        print(actual)
        print("\n")

        error = mean_absolute_error(actual, prediction)
        errors.append(error)
        index = self.year_month_index(target_month)
        indices.append(index)
        print("time: {}, error: {}".format(index, error))

        predictions.append(prediction)
        actuals.append(actual)
        test_data_size.append(actual.sum())

    results = pd.DataFrame({
        'errors': errors,
        'test_data_size': test_data_size,
        'predictions': predictions,
        'actuals': actuals
    })
    return results.set_index([indices])
def converts_it_to_an_actual_schedule(self):
    """Check the bins ActualSchedule produces for a five-row fixture.

    The fixture has five user summaries with schedule_type 4 and the lesson
    columns l1..l4 (time and day). The six values of ``bins()`` are
    expected to be 10, 10, 0, 0, 0, 0.
    """
    unique_user_summaries = pd.DataFrame({
        'user_tz': [
            "Brasilia",
            "Brasilia",
            "Pacific (US & Canada)",
            "Eastern (US & Canada)",
            "Brasilia",
        ],
        'l1_time': range(0, 5),
        'l1_day': range(0, 5),
        'l2_time': range(1, 6),
        'l2_day': range(0, 5),
        'l3_time': range(2, 7),
        'l3_day': range(0, 5),
        'l4_time': range(3, 8),
        'l4_day': range(0, 5),
        'schedule_type': [4, 4, 4, 4, 4],
    })

    bins = ActualSchedule(unique_user_summaries).bins()

    # Expected count for each bin, checked in bin order.
    for position, expected_count in enumerate([10, 10, 0, 0, 0, 0]):
        expect(bins[position]).to.equal(expected_count)
def converts_it_to_an_actual_schedule(self):
    """Verify ActualSchedule.bins() for five schedule_type-4 user summaries.

    Each row carries four lessons (l1..l4). Lesson ``n`` uses times
    ``n-1 .. n+3`` across the five rows and days ``0 .. 4``. The resulting
    bins must equal 10, 10, 0, 0, 0, 0.
    """
    args = {
        'user_tz': [
            "Brasilia",
            "Brasilia",
            "Pacific (US & Canada)",
            "Eastern (US & Canada)",
            "Brasilia",
        ],
    }
    # Lesson columns are added in the order l1_time, l1_day, l2_time, ...
    for lesson in (1, 2, 3, 4):
        args['l%d_time' % lesson] = range(lesson - 1, lesson + 4)
        args['l%d_day' % lesson] = range(0, 5)
    args['schedule_type'] = [4] * 5

    unique_user_summaries = pd.DataFrame(args)
    schedule = ActualSchedule(unique_user_summaries)
    bins = schedule.bins()

    expected_bins = (10, 10, 0, 0, 0, 0)
    for slot in range(len(expected_bins)):
        expect(bins[slot]).to.equal(expected_bins[slot])