# Example #1
def run_predict(year=2016,
                max_days=10,
                dirname="../Data/training",
                list_days=None):
    """Run the daily ``predict`` function over a set of days and save results.

    Parameters
    ----------
    year : int
        Year to be evaluated.
    max_days : int
        Number of past days allowed to predict a given day (set to 10 on the
        platform).
    dirname : str
        Path to the dataset.
    list_days : iterable of int or None
        Days to be evaluated; if None the full year is evaluated.
    """
    overall_start = time.time()  # <== Mark starting time
    data = load_data(year=year, dirname=dirname)  # load all data files
    sites = data["sites"]  # get sites info
    day_results = {}
    if list_days is None:
        # Evaluate every day of the year, accounting for leap years.
        list_days = range(366 if calendar.isleap(year) else 365)
    for day in list_days:
        # Extraction of the year datasets, limited to the past max_days
        # for each day.
        chimeres_day, geops_day, meteo_day, concentrations_day = get_data_day(
            day, data, max_days=max_days, year=year)
        day_results[day] = predict(day, sites, chimeres_day, geops_day,
                                   meteo_day,
                                   concentrations_day)  # do the prediction

    overall_time_spent = time.time() - overall_start  # end computation time
    # Use context managers so the file handles are always closed — the
    # original `pickle.dump(..., open(...))` pattern leaked them.
    with open('submission/results.pk', 'wb') as f:
        pickle.dump(day_results, f)  # save results
    with open('submission/time.pk', 'wb') as f:
        pickle.dump(overall_time_spent, f)  # save computation time
# Example #2
def analyze_covar(hours):
    """Visualize the hour-to-hour covariance structure of pollutant data.

    For one (arbitrary) common monitoring site, samples up to 50 random days
    of 2016, collects the first ``hours`` hourly concentration values per
    pollutant (filling NaN measurements with CHIMERE model values), then draws
    an ``hours`` x ``hours`` grid of scatter plots (hour y vs hour x) for all
    four pollutants.

    Parameters
    ----------
    hours : int
        Number of leading hourly values to compare pairwise.
    """
    all_data = load_data(dirname="../Data/training", year=2016)

    common_sites = get_common_sites(all_data)
    site = common_sites.pop()  # analyze a single arbitrary common site

    data_pol = dict()

    # Up to 50 distinct random days (np.unique removes duplicates).
    days = np.unique(np.random.randint(low=2, high=360, size=50))
    print(days)

    for day in days:

        data_day = get_data_day(day, all_data, max_days=1, year=2016)

        for pol in ["PM10", "PM25", "O3", "NO2"]:

            concentrations_pol_site = data_day[3][pol][data_day[3]
                                                       [pol].idPolair == site]
            previous_data = np.array(concentrations_pol_site.Valeur)

            if np.isnan(previous_data).any():
                # Fill missing measurements with CHIMERE model values.
                # NOTE(review): site '33374' appears to map to model id
                # 15114 — TODO confirm this special case.
                if site != '33374':
                    chimeres_site = data_day[0][pol].loc[
                        data_day[0][pol].idPolair == float(site)]
                else:
                    chimeres_site = data_day[0][pol].loc[
                        data_day[0][pol].idPolair == 15114.]

                inds = np.where(np.isnan(previous_data))

                previous_data[inds] = chimeres_site['val'].iloc[inds]

            if day == days[0]:
                data_pol[pol] = np.array([previous_data[0:hours]])
            else:

                data_pol[pol] = np.append(data_pol[pol],
                                          [previous_data[0:hours]],
                                          axis=0)

    # Grid of scatter plots: hour y against hour x, one color per pollutant.
    for y in range(hours):
        for x in range(hours):
            ax = plt.subplot(hours, hours, y * hours + x + 1)
            for pol in ["PM10", "PM25", "O3", "NO2"]:

                yaxis = data_pol[pol][:, y]
                xaxis = data_pol[pol][:, x]

                # 'C<n>.' = matplotlib color-cycle index + dot marker.
                # Renamed from `format` to avoid shadowing the builtin.
                fmt = 'C' + str(convert_pol(pol)) + '.'
                ax.plot(xaxis, yaxis, fmt, label=pol)
    ax.legend()  # legend on the last subplot only (original behavior)
    plt.show()
def run_predict(year=2016,
                max_days=10,
                dirname="../Data/training",
                list_days=None):
    """Run the daily ``predict``, score it against ground truth, and save.

    Parameters
    ----------
    year : int
        Year to be evaluated.
    max_days : int
        Number of past days allowed to predict a given day (set to 10 on the
        platform).
    dirname : str
        Path to the dataset.
    list_days : iterable of int or None
        Days to be evaluated; if None the full year is evaluated.
    """
    overall_start = time.time()  # <== Mark starting time
    data = load_data(year=year, dirname=dirname)  # load all data files
    sites = data["sites"]  # get sites info

    # get common sites for all pollutants
    common_sites = get_common_sites(data)
    day_results = {}
    day_scores = []
    if list_days is None:
        # Evaluate every day of the year, accounting for leap years.
        list_days = range(366 if calendar.isleap(year) else 365)
    for day in list_days:
        print(day)
        # Extraction of the year datasets, limited to the past max_days
        # for each day.
        chimeres_day, geops_day, meteo_day, concentrations_day = get_data_day(
            day, data, max_days=max_days, year=year)
        results_oldformat, results, results_covar = predict(
            day, sites, common_sites, chimeres_day, geops_day, meteo_day,
            concentrations_day)  # do the prediction
        ground_truth = get_ground_truth_day(day, data, common_sites, year=year)
        # Mean over sites; the first 6 hours (24 rows = 6 hours * 4
        # pollutants) are known, hence excluded from the score.
        score = np.mean([
            mahalanobis(ground_truth[i][24:], results[i][24:],
                        results_covar[i][24:, 24:])**2 +
            np.log(np.linalg.det(results_covar[i][24:, 24:]))
            for i in common_sites
        ])
        day_scores.append(score)
        day_results[day] = results_oldformat
    tot_score = np.mean(day_scores)  # mean over days
    print(tot_score)
    overall_time_spent = time.time() - overall_start  # end computation time
    # Use context managers so the file handles are always closed — the
    # original `pickle.dump(..., open(...))` pattern leaked them.
    with open('submission/results.pk', 'wb') as f:
        pickle.dump(day_results, f)  # save results
    with open('submission/score.pk', 'wb') as f:
        pickle.dump(tot_score, f)  # save score
    with open('submission/time.pk', 'wb') as f:
        pickle.dump(overall_time_spent, f)  # save computation time
# Example #4
def generate(year):
    """Build and save the training array for ``year``.

    For each day (index 3 through 364), each pollutant, and each site common
    to all pollutants, converts the day's data into one training sample and
    stacks every sample into a single array saved to
    ``./tmp/training_data_<year>.npz`` under key ``'a'``.

    Parameters
    ----------
    year : int
        Year of the dataset to process.
    """
    all_data = load_data(dirname="../Data/training", year=year)

    start_day = 3  # first day with enough history for max_days=3

    print(all_data.keys())
    print(all_data['chimeres'].keys())
    print(all_data['chimeres']['NO2'].keys())

    # Collect samples in a list and concatenate once at the end: repeated
    # np.append on a growing array is O(n^2) in total copying.
    samples = []
    pbar = ProgressBar()
    sites = all_data['sites']
    idPolairs = get_common_sites(all_data)

    for day in pbar(range(start_day, 365)):
        data_day = get_data_day(day, all_data, max_days=3, year=year)
        chimeres_day = data_day[0]
        geops_day = data_day[1]
        meteo_day = data_day[2]
        concentrations_day = data_day[3]

        print("day: ", day)
        for pol in ["PM10", "PM25", "O3", "NO2"]:
            for idPolair in idPolairs:
                data = convert_data_day(day, pol, idPolair, sites,
                                        chimeres_day, geops_day, meteo_day,
                                        concentrations_day)
                samples.append(np.expand_dims(data, axis=0))
        print(len(samples))  # number of samples collected so far
    training_data = np.concatenate(samples, axis=0) if samples else None
    print(None if training_data is None else training_data.shape)
    np.savez('./tmp/training_data_' + str(year), a=training_data)
import logging
logging.basicConfig(level=logging.INFO)

###################################
# Use the functions separately if you want to test your predict function.

# Load the dataset from files.
# CHANGE DIRNAME TO WHERE YOU STORE YOUR DATA
all_data = load_data(dirname="../Data/training")
# load_data returns a dictionary:
# {"sites": ..., "chimeres": ..., "geops": ..., "meteo": ..., "concentrations": ...}
# (see utils.load_data for details)

# From all_data, extract only the data allowed for day index 3
# (January 4th — Python day indices start at 0).
chimeres_day, geops_day, meteo_day, concentrations_day = get_data_day(
    3, all_data)

# Apply the predict function for day 3.
prediction.predict(3,
                   sites=all_data['sites'],
                   chimeres_day=chimeres_day,
                   geops_day=geops_day,
                   meteo_day=meteo_day,
                   concentrations_day=concentrations_day)

##############################################

# Alternatively, let run_predict call the daily predict function for you —
# here on the first 50 days of the year.
prediction.run_predict(list_days=range(50), dirname="../Data/training")