class Trainer:
    """Train a pool of models against one week of observed statistics.

    Each generation, every model in the pool predicts seven daily values
    starting at ``start_training``; the predictions are scored with a root
    mean square log error against the observed values and the pool then
    breeds the next generation.

    Args:
        pool_size: The number of models in each generation.
        start_training: The date (in whatever form ``parse_day`` accepts)
            of the first day that is predicted and scored.
        country: Country-level data object; used as the modelled zone when
            ``region`` is None.
        region: Optional region-level data object; takes precedence over
            ``country`` when given.
        covid_data: The object containing all case, fatality, and movement
            data.  A new ``DataGenerator`` is created when omitted.
        statistic: What metric to create a model for, either 'Cases' or
            'Fatalities'.
    """

    # Number of worker processes used to score a generation.  A count of 4
    # was found to be the most effective; higher values slow the program.
    WORKER_COUNT = 4

    # Number of consecutive days that are predicted and scored.
    HORIZON_DAYS = 7

    def __init__(self,
                 pool_size,
                 start_training,
                 country,
                 region=None,
                 covid_data=None,
                 statistic='Cases'):
        # The object containing all case, fatality, and movement data
        self.covid_data = (
            covid_data if covid_data is not None else DataGenerator())

        # The country and possibly specific region to generate a model for
        self.country = country
        self.region = region

        # The number of models for each generation
        self.pool_size = pool_size

        # What metric to create a model for, either 'Cases' or 'Fatalities'
        self.statistic = statistic

        # The first day that is predicted and scored
        self.start_training = start_training

        # The object which handles all the models in a generation
        self.pool = Pool(pool_size)

        # The actual data which the predictions will be compared to
        self.test_case = []
        self.generate_test_case()

    @property
    def _zone(self):
        """Return the data object being modelled: the region if set, else the country."""
        return self.country if self.region is None else self.region

    def _baseline(self):
        """Return the zone's cumulative statistic on the ``start_training`` day."""
        return self._zone.cumulative[parse_day(
            self.start_training)][self.statistic]

    def generate_test_case(self):
        """Store the zone's observed daily statistic for the scored week.

        Reads ``HORIZON_DAYS`` consecutive daily values, beginning at the
        ``start_training`` day, into ``self.test_case``.
        """
        day_one = parse_day(self.start_training)
        daily = self._zone.daily
        self.test_case = [
            daily[day_one + offset][self.statistic]
            for offset in range(self.HORIZON_DAYS)
        ]

    def train(self, generations):
        """Seed the pool, evolve it, and return the ten best models.

        Args:
            generations: How many generations to evaluate.  The final
                generation is scored but not bred from, so the returned
                models carry scores.

        Returns:
            The first ten models of the pool after sorting.
        """
        self.pool.seed_pool()

        # evaluates and creates a new generation
        for generation in range(generations):
            self.threaded_evaluate()
            if generation < generations - 1:
                self.pool.next_generation()

        # Returns the top 10 models
        self.pool.sort()
        return self.pool.pool[:10]

    def threaded_evaluate(self):
        """Score every model of the pool in parallel worker processes.

        The scored models come back as copies from the workers, so the
        pool's model list is replaced with the returned list.
        """
        with multiprocessing.Pool(self.WORKER_COUNT) as worker_pool:
            # map() blocks until every model has been scored, so no explicit
            # close()/join() is needed before the context manager tears the
            # workers down.
            scored_models = worker_pool.map(self.thread, self.pool.pool)

        # replaces the pool with the new models
        self.pool.pool = scored_models

    def thread(self, model):
        """Score a single model; this is the unit of work run in a worker.

        Args:
            model: The model to evaluate.

        Returns:
            The same model with its ``score`` attribute set.
        """
        prediction = self.predict(model, self.start_training)
        model.score = self.rmsle(prediction, self._baseline())
        return model

    def rmsle(self, predicted, baseline):
        """Root mean square log error of cumulative predictions.

        Predicted and observed daily values are clamped at zero, accumulated
        on top of ``baseline``, and compared day by day on a log scale.

        Args:
            predicted: Predicted daily values; paired with ``self.test_case``
                and truncated to the shorter of the two.
            baseline: The cumulative total both running sums start from.

        Returns:
            The square root of the mean squared log error.

        Raises:
            ZeroDivisionError: If ``predicted`` or ``self.test_case`` is
                empty, so there is nothing to average.
        """
        squared_log_errors = []
        predicted_cumulative = baseline
        actual_cumulative = baseline

        for predict, actual in zip(predicted, self.test_case):
            # Negative daily values are treated as zero
            predicted_cumulative += max(predict, 0)
            actual_cumulative += max(actual, 0)

            log_error = (math.log(predicted_cumulative + 1.0) -
                         math.log(actual_cumulative + 1.0))
            squared_log_errors.append(log_error**2)

        return math.sqrt(sum(squared_log_errors) / len(squared_log_errors))

    def evaluate(self, start_date):
        """Score every model of the pool sequentially, in place.

        Args:
            start_date: The first day each model predicts.

        NOTE(review): the baseline is taken from ``self.start_training``
        rather than ``start_date``, matching ``self.test_case``; confirm
        that callers always pass the same date.
        """
        # generates a week of predicted values for each model
        for model in self.pool.pool:
            model_predictions = self.predict(model, start_date)
            # Uses the modelled zone so country-level trainers
            # (region=None) work as well.
            model.score = self.rmsle(model_predictions, self._baseline())

    def predict(self, model, start_date):
        """Generate a week of daily predictions from ``model``.

        Args:
            model: The model to predict with.
            start_date: The first predicted day, in ``parse_day`` form.

        Returns:
            A list of ``HORIZON_DAYS`` predicted daily values.
        """
        int_date = parse_day(start_date)
        zone = self._zone

        if self.statistic == 'Cases':
            return self._predict_cases(model, zone, int_date)
        return self._predict_fatalities(model, zone, int_date)

    def _predict_cases(self, model, zone, int_date):
        """Predict daily cases, feeding each prediction into the next day."""
        previous_cases = zone.daily[int_date][self.statistic]
        lag = model.mobility_lag
        week_prediction = []

        for day in range(self.HORIZON_DAYS):
            # Mobility is averaged over the 5 days ending `lag` days before
            # the predicted day.
            window_end = int_date + day - lag
            mobility_stats = [
                self._window_mean(category, window_end - 4, window_end)
                for category in zone.categories.values()
            ]

            prediction = model.predict(previous_cases, mobility_stats,
                                       zone.infection_rate(),
                                       zone.population)
            week_prediction.append(prediction)
            previous_cases = prediction

        return week_prediction

    @staticmethod
    def _predict_fatalities(model, zone, int_date):
        """Predict daily fatalities from the mean of 3 days of earlier cases."""
        week_prediction = []

        for day in range(Trainer.HORIZON_DAYS):
            # Three consecutive days starting 8 days before the predicted day
            window_start = int_date - 8 + day
            week_ago_cases = sum(
                zone.daily[i]['Cases']
                for i in range(window_start, window_start + 3))

            week_prediction.append(
                int(model.fatality_ratio * week_ago_cases / 3))

        return week_prediction

    @staticmethod
    def _window_mean(category, first_day, last_day):
        """Average a category's 'value' over an inclusive day range.

        Days with no usable entry are skipped so gaps in the data do not
        abort the prediction.

        Raises:
            ZeroDivisionError: If no day in the range has a value.
        """
        values = []
        for date in range(first_day, last_day + 1):
            try:
                values.append(category[date]['value'])
            except (LookupError, TypeError):
                # Missing day or empty entry: best-effort, skip it.  Other
                # exceptions (and KeyboardInterrupt) are not swallowed.
                continue
        return sum(values) / len(values)