# Assumed imports; project-level names (helpers, loss, crps, convert_dataset,
# build_model, numpy_path, logdir, expname, batchsize, learning_rate, epochs,
# initial_epochs, train_model) are defined elsewhere in this repository.
import os
from datetime import datetime

import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam


def main():
    start = datetime.now()

    # get the data
    train_data = helpers.load_data(numpy_path, 'train_set.npy')
    valid_data = helpers.load_data(numpy_path, 'valid_set.npy')
    test_data = helpers.load_data(numpy_path, 'test_set.npy')

    # extract labels, countries and months from the test set
    test_data_labels = np.array([item[0] for item in test_data[:, 2]])
    test_data_countries = np.array([item[0] for item in test_data[:, 0]])
    test_data_month = test_data[:, 5]

    # convert the data
    train_dataset, train_shape = convert_dataset(train_data,
                                                 batchsize=batchsize,
                                                 shuffle=1000,
                                                 shape=True)
    valid_dataset = convert_dataset(valid_data, batchsize=1000, shuffle=100)
    test_dataset = convert_dataset(test_data, batchsize=1000)

    # build the model
    model = build_model(train_shape[1], train_shape[2])

    # Print Model
    # modelprovider.printModel(model, dir=os.path.join(
    #     logdir, expname), name=expname+".png")

    # compile the model
    lossfn = loss.crps_cost_function
    opt = Adam(learning_rate=learning_rate, amsgrad=True)
    model.compile(loss=lossfn, optimizer=opt)

    # checkpoint directory
    checkpoint_dir = os.path.join(logdir, expname, 'checkpoints/')

    # train an ensemble of 10 models, each with new initial weights
    print('[INFO] Starting training')
    predictions = []
    for i in range(1, 11):
        print('Round number: ' + str(i))
        # rebuild the model so each round starts from fresh initial weights
        model = build_model(train_shape[1], train_shape[2])

        # compile with a fresh optimizer; reusing one Adam instance across
        # rounds would carry over its step count and slot variables
        opt = Adam(learning_rate=learning_rate, amsgrad=True)
        model.compile(loss=lossfn, optimizer=opt)

        # checkpoint callbacks
        # save weights after every epoch ('Versuch' is German for 'trial')
        cp_callback_versuch = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/') +
            "checkpoint_{epoch}",
            monitor='val_loss',
            save_weights_only=True,
            mode='min',
            verbose=0)
        # best checkpoint
        cp_callback = tf.keras.callbacks.ModelCheckpoint(
            os.path.join(checkpoint_dir, 'round-' + str(i) + '/checkpoint'),
            monitor='val_loss',
            save_weights_only=True,
            mode='min',
            save_best_only=True,
            verbose=0)

        # train the model; train_dataset and valid_dataset are already
        # batched, so fit() is not given a batch_size
        if train_model:
            model.fit(
                train_dataset,
                epochs=epochs,
                initial_epoch=initial_epochs,
                verbose=1,
                validation_data=valid_dataset,
                callbacks=[cp_callback, cp_callback_versuch],
            )

        # load the best checkpoint of round i
        model.load_weights(
            os.path.join(checkpoint_dir,
                         'round-' + str(i) + '/checkpoint')).expect_partial()

        # predict on the (already batched) test dataset
        predictions.append(model.predict(test_dataset, verbose=0))

    # convert to a numpy array of shape (rounds, samples, 2)
    predictions = np.array(predictions)
    # make sure the predicted std (second output column) is positive
    predictions[:, :, 1] = np.abs(predictions[:, :, 1])
    # average the predictions over the 10 rounds
    mean_predictions = np.mean(predictions, 0)
    # calculate the CRPS for each record in the test set
    test_crps = crps.norm_data(test_data_labels, mean_predictions)

    # print the results per country and a PIT histogram
    helpers.printIntCountries(test_data_labels, test_data_countries,
                              mean_predictions)
    helpers.printHist(helpers.datasetPIT(mean_predictions, test_data_labels))

    # save the raw predictions of all rounds
    np.save(os.path.join(logdir, expname, 'prediction'), predictions)
    print('[INFO] Elapsed time:', datetime.now() - start)
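
For reference, loss.crps_cost_function is defined elsewhere in the project. A minimal sketch of what such a loss presumably computes, the closed-form CRPS of a Gaussian forecast (as in Rasp & Lerch, 2018), assuming the network outputs two columns (mu, sigma) as the indexing predictions[:, :, 1] above suggests:

import math

import tensorflow as tf


def crps_cost_function(y_true, y_pred):
    # Closed-form CRPS of N(mu, sigma):
    #   CRPS = sigma * (z * (2 * Phi(z) - 1) + 2 * phi(z) - 1 / sqrt(pi))
    # with z = (y - mu) / sigma and phi/Phi the standard normal pdf/cdf.
    mu = y_pred[:, 0]
    sigma = tf.abs(y_pred[:, 1])      # keep the scale parameter positive
    z = (tf.squeeze(y_true) - mu) / sigma
    pdf = tf.exp(-0.5 * tf.square(z)) / math.sqrt(2.0 * math.pi)
    cdf = 0.5 * (1.0 + tf.math.erf(z / math.sqrt(2.0)))
    crps = sigma * (z * (2.0 * cdf - 1.0) + 2.0 * pdf
                    - 1.0 / math.sqrt(math.pi))
    return tf.reduce_mean(crps)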

Example 2

# Assumed imports; helpers, verificationRank and the globals file and
# numpy_path are defined elsewhere in the project, and ps is presumably
# properscoring (which provides crps_ensemble).
import datetime

import numpy as np
import pandas as pd
import properscoring as ps


def read():
    start = datetime.datetime.now()
    print('[INFO] starting reading...')
    # read all data; the climatological training window is built per test
    # case further below
    df = pd.read_csv(file, index_col=0)

    # cases initialized between 2014 and 2017 form the test set
    test_set = df[(df['init date'] > '2014-01-01')
                  & (df['init date'] < '2017-12-31')].to_numpy()

    test_data = helpers.load_data(numpy_path, 'test_set.npy')
    test_data_labels = np.array([item[0] for item in test_data[:, 2]])
    test_data_countries = np.array([item[0] for item in test_data[:, 0]])
    test_data_month = test_data[:, 5]

    scores = []
    ranks  = []
    length = []
    print('[INFO] starting calculating')
    for case in test_set:
        date = datetime.datetime.strptime(case[0], '%Y-%m-%d')
        delta = datetime.timedelta(days=16)
        ensemble = []
        # climatological ensemble: all observations within +/-16 days of the
        # same calendar date in the years 1998-2012, for the same country
        for year in range(1998, 2013):
            anchor = datetime.datetime.strptime(
                str(year) + '-' + date.strftime('%m-%d'), '%Y-%m-%d')
            window_start = (anchor - delta).strftime('%Y-%m-%d')
            window_end = (anchor + delta).strftime('%Y-%m-%d')
            train_set = df[(df['init date'] > window_start)
                           & (df['init date'] < window_end)
                           & (df['country'] == case[1])].to_numpy()
            ensemble.extend(train_set[:, 2])
        score = ps.crps_ensemble(case[2], ensemble)
        rank = verificationRank(case[2], ensemble)
        length.append(len(ensemble))
        ranks.append(rank)
        scores.append(score)

    # sanity output: highest observed rank and the distinct ensemble sizes
    print(max(ranks))
    print(list(dict.fromkeys(length)))
    scores = np.array(scores)
    # rank histogram over all test cases
    helpers.printHist(ranks, r=(0, 276))
    print(('all', round(scores.mean(), 2)))
    # mean CRPS per country (the ids are presumably country codes),
    # joined with '&' into a table row
    result = str(scores.mean()) + '&'
    for i in [8, 16, 2, 5, 20]:
        mask = test_data_countries == i
        filter_data = scores[mask]
        if len(filter_data) > 0:
            item = str(round(filter_data.mean(), 2))
        else:
            item = str(0)
        result += item + '&'
    print(result)

    # mean CRPS per calendar month
    for i in range(1, 13):
        mask = test_data_month == i
        filter_data = scores[mask]
        if len(filter_data) > 0:
            item = (i, round(filter_data.mean(), 2))
        else:
            item = (i, 0)
        print(item)
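
verificationRank is also defined elsewhere. A minimal sketch of what it presumably computes, the rank of the observation among the pooled ensemble members, which feeds the rank histogram printed above; the 1-based convention and the random tie-breaking here are assumptions:

import numpy as np


def verificationRank(obs, ensemble):
    # rank of obs within {obs} + ensemble: 1 if obs is below every
    # member, len(ensemble) + 1 if it is above every member
    ensemble = np.asarray(ensemble, dtype=float)
    below = np.sum(ensemble < obs)    # members strictly below obs
    ties = np.sum(ensemble == obs)    # members equal to obs
    # break ties uniformly at random (a common rank-histogram convention)
    return int(below + np.random.randint(0, ties + 1) + 1)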