import calendar
import pickle
import time

# load_data, get_data_day and predict are project-local helpers assumed to be
# in scope (see utils.load_data).


def run_predict(year=2016, max_days=10, dirname="../Data/training", list_days=None):
    """
    year: year to be evaluated
    max_days: number of past days allowed to predict a given day (set to 10 on the platform)
    dirname: path to the dataset
    list_days: list of days to be evaluated (if None, the full year is evaluated)
    """
    overall_start = time.time()  # mark starting time
    data = load_data(year=year, dirname=dirname)  # load all data files
    sites = data["sites"]  # get sites info
    day_results = {}
    if list_days is None:
        if calendar.isleap(year):  # check whether the year is a leap year
            list_days = range(366)
        else:
            list_days = range(365)
    for day in list_days:
        # extract the year's datasets, limited to the past max_days for each day
        chimeres_day, geops_day, meteo_day, concentrations_day = get_data_day(
            day, data, max_days=max_days, year=year)
        # run the prediction for this day
        day_results[day] = predict(day, sites, chimeres_day, geops_day,
                                   meteo_day, concentrations_day)
    overall_time_spent = time.time() - overall_start  # total computation time
    pickle.dump(day_results, open('submission/results.pk', 'wb'))  # save results
    pickle.dump(overall_time_spent, open('submission/time.pk', 'wb'))  # save computation time
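# Hypothetical stub showing the interface run_predict expects from predict:
# it receives the day index, the sites table and the day's restricted
# datasets, and returns the day's results in whatever format the platform
# consumes. This naive "persistence" placeholder simply echoes the observed
# concentrations; it is not the challenge's actual baseline.
def predict(day, sites, chimeres_day, geops_day, meteo_day, concentrations_day):
    return concentrations_day  # no-change prediction sketch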
import numpy as np
import matplotlib.pyplot as plt

# load_data, get_common_sites, get_data_day and convert_pol are project-local
# helpers assumed to be in scope.


def analyze_covar(hours):
    all_data = load_data(dirname="../Data/training", year=2016)
    common_sites = get_common_sites(all_data)
    site = common_sites.pop()
    data_pol = dict()
    # sample ~50 distinct days, keeping a margin at both ends of the year
    days = np.unique(np.random.randint(low=2, high=360, size=50))
    print(days)
    for day in days:
        data_day = get_data_day(day, all_data, max_days=1, year=2016)
        for pol in ["PM10", "PM25", "O3", "NO2"]:
            concentrations_pol_site = data_day[3][pol][
                data_day[3][pol].idPolair == site]
            previous_data = np.array(concentrations_pol_site.Valeur)
            if np.isnan(previous_data).any():
                # fill missing measurements with the CHIMERE model values;
                # site '33374' maps to the CHIMERE id 15114
                if site != '33374':
                    chimeres_site = data_day[0][pol].loc[
                        data_day[0][pol].idPolair == float(site)]
                else:
                    chimeres_site = data_day[0][pol].loc[
                        data_day[0][pol].idPolair == 15114.]
                inds = np.where(np.isnan(previous_data))
                previous_data[inds] = chimeres_site['val'].iloc[inds]
            if day == days[0]:
                data_pol[pol] = np.array([previous_data[0:hours]])
            else:
                data_pol[pol] = np.append(data_pol[pol],
                                          [previous_data[0:hours]], axis=0)
    # scatter-plot every pair of hours against each other, one colour per pollutant
    for y in range(hours):
        for x in range(hours):
            ax = plt.subplot(hours, hours, y * hours + x + 1)
            for pol in ["PM10", "PM25", "O3", "NO2"]:
                yaxis = data_pol[pol][:, y]
                xaxis = data_pol[pol][:, x]
                fmt = 'C' + str(convert_pol(pol)) + '.'  # avoid shadowing the built-in format
                ax.plot(xaxis, yaxis, fmt, label=pol)
            ax.legend()
    plt.show()
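# Example usage (illustrative): inspect the joint behaviour of the first
# 6 hours over ~50 randomly sampled days; this draws a 6x6 grid of
# hour-vs-hour scatter plots, one colour per pollutant.
analyze_covar(6)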
import calendar
import pickle
import time

import numpy as np
from scipy.spatial.distance import mahalanobis

# load_data, get_common_sites, get_data_day, get_ground_truth_day and predict
# are project-local helpers assumed to be in scope.


def run_predict(year=2016, max_days=10, dirname="../Data/training", list_days=None):
    """
    Variant of run_predict that also scores the predictions against the
    ground truth.

    year: year to be evaluated
    max_days: number of past days allowed to predict a given day (set to 10 on the platform)
    dirname: path to the dataset
    list_days: list of days to be evaluated (if None, the full year is evaluated)
    """
    overall_start = time.time()  # mark starting time
    data = load_data(year=year, dirname=dirname)  # load all data files
    sites = data["sites"]  # get sites info
    common_sites = get_common_sites(data)  # sites common to all pollutants
    day_results = {}
    day_scores = []
    if list_days is None:
        if calendar.isleap(year):  # check whether the year is a leap year
            list_days = range(366)
        else:
            list_days = range(365)
    for day in list_days:
        print(day)
        # extract the year's datasets, limited to the past max_days for each day
        chimeres_day, geops_day, meteo_day, concentrations_day = get_data_day(
            day, data, max_days=max_days, year=year)
        # run the prediction for this day
        results_oldformat, results, results_covar = predict(
            day, sites, common_sites, chimeres_day, geops_day, meteo_day,
            concentrations_day)
        ground_truth = get_ground_truth_day(day, data, common_sites, year=year)
        # mean over sites; the first 6 hours are known, so they are removed
        # from the score (24 = 6 hours * 4 pollutants)
        score = np.mean([
            mahalanobis(ground_truth[i][24:], results[i][24:],
                        results_covar[i][24:, 24:])**2
            + np.log(np.linalg.det(results_covar[i][24:, 24:]))
            for i in common_sites
        ])
        day_scores.append(score)
        day_results[day] = results_oldformat
    tot_score = np.mean(day_scores)  # mean over days
    print(tot_score)
    overall_time_spent = time.time() - overall_start  # total computation time
    pickle.dump(day_results, open('submission/results.pk', 'wb'))  # save results
    pickle.dump(tot_score, open('submission/score.pk', 'wb'))  # save score
    pickle.dump(overall_time_spent, open('submission/time.pk', 'wb'))  # save computation time
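# Quick sanity check of the score expression on synthetic data. Note that
# scipy.spatial.distance.mahalanobis takes the *inverse* covariance matrix as
# its third argument, so the scoring above implicitly treats results_covar as
# storing precision matrices; the names below are illustrative stand-ins.
gt = np.array([1.0, 2.0, 3.0])    # stand-in ground truth
pred = np.array([1.1, 1.9, 3.2])  # stand-in prediction
VI = np.eye(3)                    # stand-in (inverse) covariance
print(mahalanobis(gt, pred, VI)**2 + np.log(np.linalg.det(VI)))  # ~0.06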
import numpy as np
from progressbar import ProgressBar  # progressbar2 package

# load_data, get_common_sites, get_data_day and convert_data_day are
# project-local helpers assumed to be in scope.


def generate(year):
    all_data = load_data(dirname="../Data/training", year=year)
    day = 3
    print(all_data.keys())
    print(all_data['chimeres'].keys())
    print(all_data['chimeres']['NO2'].keys())
    training_data = None
    pbar = ProgressBar()
    sites = all_data['sites']
    idPolairs = get_common_sites(all_data)
    for day in pbar(range(day, 365)):
        data_day = get_data_day(day, all_data, max_days=3, year=year)
        chimeres_day = data_day[0]
        geops_day = data_day[1]
        meteo_day = data_day[2]
        concentrations_day = data_day[3]
        print("day: ", day)
        for pol in ["PM10", "PM25", "O3", "NO2"]:
            for idPolair in idPolairs:
                data = convert_data_day(day, pol, idPolair, sites, chimeres_day,
                                        geops_day, meteo_day, concentrations_day)
                data = np.expand_dims(data, axis=0)
                if training_data is not None:
                    training_data = np.append(training_data, data, axis=0)
                else:
                    training_data = data
    print(training_data.shape)
    np.savez('./tmp/training_data_' + str(year), a=training_data)
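# Performance note (illustrative sketch): np.append copies the whole array on
# every call, so the accumulation in generate() is quadratic in the number of
# samples. An equivalent, linear-time pattern is to collect the per-sample
# arrays in a list and concatenate once at the end:
chunks = []
for _ in range(3):                   # stand-in for the day/pollutant/site loops
    chunks.append(np.zeros((1, 5)))  # stand-in for a convert_data_day sample
training_data = np.concatenate(chunks, axis=0)
print(training_data.shape)  # (3, 5)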
import logging

import prediction
from utils import load_data, get_data_day  # get_data_day assumed to live in utils too

logging.basicConfig(level=logging.INFO)

####################################
# Use the functions separately if you want to test your predict function.

# Get data from files.
# CHANGE DIRNAME TO WHERE YOU STORE YOUR DATA.
# load_data returns a dictionary:
# {"sites": sites, "chimeres": chimeres, "geops": geops,
#  "meteo": meteo, "concentrations": concentrations}
# See the utils.load_data function for more details.
all_data = load_data(dirname="../Data/training")

# From all_data, extract only the data allowed for day=3
# (i.e. January 4, since Python counts from 0).
data_day = get_data_day(3, all_data)

# Apply the predict function for day=3.
prediction.predict(3,
                   sites=all_data['sites'],
                   chimeres_day=data_day[0],
                   geops_day=data_day[1],
                   meteo_day=data_day[2],
                   concentrations_day=data_day[3])

####################################
# OR run run_predict, which calls your daily predict function;
# here on the first 50 days of the year.
prediction.run_predict(list_days=range(50), dirname="../Data/training")