Esempio n. 1
0
    data_test = np.array(data_test)
Y_test = np.array(Y_test, dtype=np.float)

# --
# X_train = data_train
# X_test = data_test
X_train = dict()
X_test = dict()
# --

print('Constructing time series data set ..')
# for rnn
X_train['rnn'] = construct_time_steps(data_train[:-1], time_steps)
X_test['rnn'] = construct_time_steps(data_test[:-1], time_steps)

X_train['concatenate'] = concatenate_time_steps(data_train[:-1], time_steps)
Y_train = Y_train[time_steps:]

X_test['concatenate'] = concatenate_time_steps(data_test[:-1], time_steps)
Y_test = Y_test[time_steps:]

# --

Y_real = np.copy(Y_test)

Y_train = higher(Y_train, interval_hours)
Y_test = higher(Y_test, interval_hours)

# Y_train = highest(Y_train, degree=degree)
# Y_test = highest(Y_test, degree=degree)
def ensemble_model(target_kind, local, city, target_site, training_year,
                   testing_year, training_duration, testing_duration,
                   interval_hours, data, is_training):
    """Train (or load) a stacked xgboost + LSTM forecaster and predict.

    Two base regressors (an xgboost regressor on concatenated time steps and
    a dropout LSTM on sequences) are fitted or loaded; their predictions are
    appended to the flat features and fed to a second-level xgboost regressor
    and an xgboost classifier (high-alert yes/no).

    Relies on names defined elsewhere in the module: ``pollution_site_map``,
    ``root_path``, ``high_alert``, ``low_alert``, ``read_data_sets``,
    ``missing_check``, ``data_coordinate_angle``, ``construct_time_steps``,
    ``concatenate_time_steps``, ``time_spent_printer``, ``cPickle``, ``xgb``
    and the Keras layer classes.

    Args:
        target_kind: Pollutant to forecast. Only ``'PM2.5'`` is supported.
        local: First-level key into ``pollution_site_map``.
        city: Second-level key into ``pollution_site_map``.
        target_site: Site whose ``target_kind`` value is the regression target.
        training_year: Year range as ``'first-last'``, e.g. ``'2014-2015'``.
        testing_year: Year range as ``'first-last'``; both years are expected
            to be equal (otherwise the user is prompted, see below).
        training_duration: ``'begin-finish'`` string; each half must contain a
            ``'/'`` and the part before it is used as the month in file names.
        testing_duration: Same format as ``training_duration``.
        interval_hours: Size of the look-ahead window (anything ``int()``
            accepts); the target becomes the maximum over that window.
        data: Raw feature rows to predict on. Only read when ``is_training``
            is false.
        is_training: When true, read data, fit and save every model and print
            evaluation metrics; when false, load the saved normalisation
            parameters and models and only predict.

    Returns:
        Array of de-normalised ensemble predictions, one per test sample.

    Raises:
        ValueError: If ``target_kind`` is unsupported, or a range/duration
            string lacks its ``'-'`` / ``'/'`` separator.

    Note:
        Several sanity checks call ``input()`` and therefore block waiting for
        the user to press Enter instead of raising.
    """
    print('is_training(%s) = %s' % (target_site, is_training))

    # List of site names (Chinese strings in the map), e.g. the five sites
    # romanised as Zhongshan, Guting, Shilin, Songshan, Wanhua.
    site_list = pollution_site_map[local][city]

    def split_range(text):
        """Split ``'a-b'`` at the first dash into ``['a', 'b']``."""
        cut = text.index('-')  # ValueError when there is no dash
        return [text[:cut], text[cut + 1:]]

    def parse_vector(text):
        """Parse the ``str()`` form of a 1-D numpy array back into floats."""
        cleaned = text.replace('[', '').replace(']', '').replace('\n', '')
        return np.array([tok for tok in cleaned.split(' ') if tok != ''],
                        dtype=float)

    def expand_wind_direction(rows, mean_X, std_X):
        """Replace every site's WIND_DIREC column by two coordinate columns.

        ``rows`` is a list of normalised feature lists and is modified in
        place. The wind value is de-normalised before being handed to
        ``data_coordinate_angle`` (presumably an angle -> (x, y) conversion;
        it must return something indexable with 0 and 1).

        NOTE(review): for sites after the first, the value is popped at
        ``specific_index + j`` but the two coordinates are inserted at
        ``specific_index``, so they land a few columns before the popped
        slot. This ordering is kept exactly as it was because previously
        saved models were trained on it.
        """
        wind_column = pollution_kind.index('WIND_DIREC')
        kinds_per_site = len(pollution_kind)
        for row in rows:
            for j in range(len(site_list)):
                specific_index = wind_column + j * kinds_per_site
                coordin = data_coordinate_angle(
                    (row.pop(specific_index + j)) *
                    std_X[specific_index] + mean_X[specific_index])
                row.insert(specific_index, coordin[1])
                row.insert(specific_index, coordin[0])
        return rows

    # change format from   2014-2015   to   ['2014', '2015']
    training_year = split_range(training_year)
    testing_year = split_range(testing_year)
    training_duration = split_range(training_duration)
    testing_duration = split_range(testing_duration)
    interval_hours = int(interval_hours)

    # Collapse a range such as ['2015', '2015'] to a single year.
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)
    else:
        input(
            'The range of testing year should not more than one year or crossing the bound of years.'
        )

    # Fill in the years lying between the first and last training year.
    rangeofYear = int(training_year[-1]) - int(training_year[0])
    for i in range(rangeofYear):
        if not (str(i + int(training_year[0])) in training_year):
            training_year.insert(i, str(i + int(training_year[0])))

    # Training Parameters
    # WIND_DIREC is a special feature: it has to be converted before use and
    # can only be an input feature, not a target.
    if target_kind == 'PM2.5':
        pollution_kind = [
            'PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC'
        ]
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError('unsupported target_kind: %r' % (target_kind, ))
    data_update = False
    seed = 0

    # Network Parameters
    # One extra column per site when WIND_DIREC is present, because it is
    # expanded from one value into two.
    input_size = (len(site_list) * len(pollution_kind) +
                  len(site_list)) if 'WIND_DIREC' in pollution_kind else (
                      len(site_list) * len(pollution_kind))
    time_steps = 12
    output_size = 1

    # Unused, but the lookup still rejects a duration without '/'.
    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/%sh/%s/" % (local, city, interval_hours,
                                                  target_kind)
    training_begining = training_duration[0][:training_duration[0].index('/')]
    training_deadline = training_duration[-1][:training_duration[-1].index('/'
                                                                           )]
    print('site: %s' % target_site)
    print('Training for %s/%s to %s/%s' %
          (training_year[0], training_duration[0], training_year[-1],
           training_duration[-1]))
    print('Testing for %s/%s to %s/%s' %
          (testing_year[0], testing_duration[0], testing_year[-1],
           testing_duration[-1]))

    # for interval
    def ave(X, Y, interval_hours):
        """Replace Y[i] by the mean of Y[i : i + interval_hours] (in place).

        Samples without a full window are cut from both X and Y. Currently
        not called; kept as an alternative to ``higher``.
        """
        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y)):
            # check the reserve data is enough or not
            if (len(Y) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y[i] += Y[i + j + 1]
            Y[i] /= interval_hours
        if deadline:
            X = X[:deadline]
            Y = Y[:deadline]
        return X, Y

    # for interval
    def higher(X, Y, interval_hours):
        """Replace Y[i] by the largest value in Y[i : i + interval_hours].

        Works in place on Y; samples without a full window are cut from both
        X and Y. A NaN in the first slot of a window is kept, because no
        comparison against NaN succeeds.

        NOTE(review): when Y is shorter than the window, ``deadline`` stays 0
        and nothing is cut or changed.
        """
        reserve_hours = 1  # choose the first n number of biggest
        if interval_hours > reserve_hours:
            deadline = 0
            for i in range(len(Y)):
                # check the reserve data is enough or not
                if (len(Y) - i) < interval_hours:
                    deadline = i
                    break  # not enough
                higher_list = []
                for j in range(interval_hours):
                    if len(higher_list) < reserve_hours:
                        higher_list.append(Y[i + j])
                    elif Y[i + j] > higher_list[0]:
                        higher_list[0] = Y[i + j]
                    higher_list = sorted(higher_list)
                Y[i] = np.array(higher_list).sum() / reserve_hours
            if deadline:
                X = X[:deadline]
                Y = Y[:deadline]
        return X, Y

    if is_training:
        # reading data
        print('Reading data .. ')
        start_time = time.time()
        print('preparing training set ..')
        # The target site is appended last, so its columns are the trailing
        # len(pollution_kind) ones; they are split off as the target.
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0],
                                 finish=training_duration[-1],
                                 feature_selection=pollution_kind,
                                 update=data_update)
        X_train = missing_check(X_train)
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set ..')
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0],
                                finish=testing_duration[-1],
                                feature_selection=pollution_kind,
                                update=data_update)
        # Test targets are taken before missing_check, so missing targets
        # survive here and are dropped further below.
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data .. ok, ', end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize (test data uses the training statistics)
        print('Normalize ..')
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_train = np.array([(x_train - mean_X_train) / std_X_train
                            for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.")
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f  std_y_train: %f' %
              (mean_y_train, std_y_train))

        # The statistics are stored as one comma-joined string of str()
        # values; the loading branch parses that exact format.
        with open(folder + "%s_parameter.pickle" % target_site, 'wb') as fw:
            cPickle.dump(
                str(mean_X_train) + ',' + str(std_X_train) + ',' +
                str(mean_y_train) + ',' + str(std_y_train), fw)

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            # Each matrix is expanded on its own, so test rows are converted
            # even when there are more of them than training rows.
            X_train = np.array(
                expand_wind_direction(X_train.tolist(), mean_X_train,
                                      std_X_train))
            X_test = np.array(
                expand_wind_direction(X_test.tolist(), mean_X_train,
                                      std_X_train))
        # Builtin float: the np.float alias no longer exists in NumPy >= 1.24.
        Y_test = np.array(Y_test, dtype=float)

        # --

        print('Constructing time series data set ..')
        # for rnn
        X_rnn_train = construct_time_steps(X_train[:-1], time_steps)
        X_rnn_test = construct_time_steps(X_test[:-1], time_steps)

        X_train = concatenate_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]

        X_test = concatenate_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]

        [X_train, Y_train] = higher(X_train, Y_train, interval_hours)
        [X_test, Y_test] = higher(X_test, Y_test, interval_hours)
        X_rnn_train = X_rnn_train[:len(X_train)]
        X_rnn_test = X_rnn_test[:len(X_test)]

        # Drop test samples whose target is missing: NaN (and anything not
        # above -10000) fails the comparison. All offending rows are removed
        # in one pass instead of rescanning after every single deletion.
        missing = np.flatnonzero(~(Y_test > -10000))
        if missing.size:
            Y_test = np.delete(Y_test, missing, 0)
            X_test = np.delete(X_test, missing, 0)
            X_rnn_test = np.delete(X_rnn_test, missing, 0)
        Y_test = np.array(Y_test, dtype=float)

        # --

        X_rnn_train = np.array(X_rnn_train)
        X_rnn_test = np.array(X_rnn_test)
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        # Re-seeding before each shuffle applies the same permutation to all
        # three arrays (they have the same length), keeping rows aligned.
        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

        np.random.seed(seed)
        np.random.shuffle(X_rnn_train)

    else:  # is_training = false
        # mean and std saved by a previous training run
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(folder + "%s_parameter.pickle" % target_site, 'rb') as fr:
            [mean_X_train, std_X_train, mean_y_train,
             std_y_train] = (cPickle.load(fr)).split(',')
        mean_X_train = parse_vector(mean_X_train)
        std_X_train = parse_vector(std_X_train)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)

        # reading data
        print('preparing testing set ..')
        X_test = data
        X_test = missing_check(np.array(X_test))

        # normalize
        print('Normalize ..')
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            X_test = np.array(
                expand_wind_direction(X_test.tolist(), mean_X_train,
                                      std_X_train))

        # --

        print('Constructing time series data set ..')
        X_rnn_test = construct_time_steps(X_test, time_steps)
        X_test = concatenate_time_steps(X_test, time_steps)

        # --

        X_rnn_test = np.array(X_rnn_test)
        X_test = np.array(X_test)

    # -- xgboost --
    print('- xgboost -')

    filename = ("xgboost_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    print(filename)

    if is_training:
        xgb_model = xgb.XGBRegressor().fit(X_train, Y_train)

        with open(folder + filename, 'wb') as fw:
            cPickle.dump(xgb_model, fw)
    else:
        with open(folder + filename, 'rb') as fr:
            xgb_model = cPickle.load(fr)

    xgb_pred = xgb_model.predict(X_test)

    # -- rnn --
    print('- rnn -')

    filename = ("sa_DropoutLSTM_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    print(filename)

    # Network Parameters (time_steps is already set above)
    hidden_size = 20

    print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size")
    print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128"]
    args = [float(a) for a in param[1:]]
    print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size = args
    batch_size = int(batch_size)

    # --

    # The network is always rebuilt; only its weights are saved and loaded.
    print('Build rnn model...')
    start_time = time.time()
    rnn_model = Sequential()

    # layer 1
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None,
                           input_shape=(time_steps, input_size)))
    rnn_model.add(
        LSTM(hidden_size,
             W_regularizer=l2(weight_decay),
             U_regularizer=l2(weight_decay),
             b_regularizer=l2(weight_decay),
             dropout_W=p_W,
             dropout_U=p_U))  # return_sequences=True
    rnn_model.add(Dropout(p_dense))

    # output layer
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None))
    rnn_model.add(
        Dense(output_size,
              W_regularizer=l2(weight_decay),
              b_regularizer=l2(weight_decay)))

    optimiser = 'adam'
    rnn_model.compile(loss='mean_squared_error', optimizer=optimiser)

    final_time = time.time()
    time_spent_printer(start_time, final_time)

    if is_training:
        print("Train...")
        start_time = time.time()
        rnn_model.fit(X_rnn_train, Y_train, batch_size=batch_size, epochs=50)

        # Potentially save weights
        rnn_model.save_weights(folder + filename, overwrite=True)

        final_time = time.time()
        time_spent_printer(start_time, final_time)

    else:
        print('loading model ..')
        rnn_model.load_weights(folder + filename)

    rnn_pred = rnn_model.predict(X_rnn_test, batch_size=500, verbose=1)
    final_time = time.time()
    time_spent_printer(start_time, final_time)

    # --  ensemble --

    print('stacking ..')
    if is_training:
        # Second-level features: flat inputs plus both base predictions.
        xgb_output = xgb_model.predict(X_train).reshape(len(X_train), 1)
        rnn_output = rnn_model.predict(X_rnn_train, batch_size=500, verbose=1)
        ensemble_X_train = np.hstack((X_train, xgb_output, rnn_output))

        # Binary alert label on the de-normalised target: 1 = high, 0 = low.
        Y_alert_train = [
            1 if (y * std_y_train + mean_y_train) > high_alert else 0
            for y in Y_train
        ]

    xgb_pred = xgb_pred.reshape(len(X_test), 1)
    rnn_pred = rnn_pred.reshape(len(X_test), 1)
    ensemble_X_test = np.hstack((X_test, xgb_pred, rnn_pred))

    print('- ensemble -')
    filename = ("ensemble_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    filename2 = ("classification_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                 (target_site, training_year[0], training_begining,
                  training_year[-1], training_deadline, interval_hours))

    # NOTE: this local name shadows the enclosing function inside its body.
    if is_training:
        ensemble_model = xgb.XGBRegressor().fit(ensemble_X_train, Y_train)
        classification_model = xgb.XGBClassifier().fit(ensemble_X_train,
                                                       Y_alert_train)

        with open(folder + filename, 'wb') as fw:
            cPickle.dump(ensemble_model, fw)

        with open(folder + filename2, 'wb') as fw2:
            cPickle.dump(classification_model, fw2)
    else:
        with open(folder + filename, 'rb') as fr:
            ensemble_model = cPickle.load(fr)

        with open(folder + filename2, 'rb') as fr2:
            classification_model = cPickle.load(fr2)

    pred = ensemble_model.predict(ensemble_X_test)
    alert_pred = classification_model.predict(ensemble_X_test)

    # --

    # Undo the target normalisation.
    predictions = mean_y_train + std_y_train * pred

    if is_training:
        print('rmse: %.5f' % (np.mean((Y_test - predictions)**2, 0)**0.5))

        def target_level(target, kind='PM2.5'):
            """Map one PM2.5 value to a level 1-10; out-of-range values -> 1."""
            if kind == 'PM2.5':
                if (target >= 0) and (target < 11.5):  # 0-11
                    return 1
                elif (target >= 11.5) and (target < 23.5):  # 12-23
                    return 2
                elif (target >= 23.5) and (target < 35.5):  # 24-35
                    return 3
                elif (target >= 35.5) and (target < 41.5):  # 36-41
                    return 4
                elif (target >= 41.5) and (target < 47.5):  # 42-47
                    return 5
                elif (target >= 47.5) and (target < 53.5):  # 48-53
                    return 6
                elif (target >= 53.5) and (target < 58.5):  # 54-58
                    return 7
                elif (target >= 58.5) and (target < 64.5):  # 59-64
                    return 8
                elif (target >= 64.5) and (target < 70.5):  # 65-70
                    return 9
                elif target >= 70.5:  # others(71+)
                    return 10
                else:
                    print('error value: %d' % target)
                    return 1

        pred_label = np.zeros(len(predictions))
        real_target = np.zeros(len(Y_test))

        pred_label_true = 0.
        pred_label_false = 0.

        four_label_true = 0.0
        four_label_false = 0.0

        # Accuracy on the ten levels and on four coarse groups
        # (levels 1-3, 4-6, 7-9 and 10).
        for i in range(len(predictions)):
            pred_label[i] = target_level(predictions[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == pred_label[i]:
                pred_label_true += 1
            else:
                pred_label_false += 1

            # four label
            if (real_target[i] >= 1
                    and real_target[i] <= 3) and (pred_label[i] >= 1
                                                  and pred_label[i] <= 3):
                four_label_true += 1
            elif (real_target[i] >= 4
                  and real_target[i] <= 6) and (pred_label[i] >= 4
                                                and pred_label[i] <= 6):
                four_label_true += 1
            elif (real_target[i] >= 7
                  and real_target[i] <= 9) and (pred_label[i] >= 7
                                                and pred_label[i] <= 9):
                four_label_true += 1
            elif (real_target[i] >= 10) and (pred_label[i] >= 10):
                four_label_true += 1
            else:
                four_label_false += 1

        print('Ten level accuracy: %.5f' %
              (pred_label_true / (pred_label_true + pred_label_false)))
        print('Four level accuracy: %.5f' %
              (four_label_true / (four_label_true + four_label_false)))
        print('--')

        # --

        # Contingency counts, a/b/c/d = hit / false alarm / miss / correct
        # rejection, for: regression vs high_alert (h*), regression vs
        # low_alert (l*), the classifier (alert_*), and "either one says
        # high" (integration_*; counted but not printed).
        ha = 0.0  # observation high, predict high
        hb = 0.0  # observation low, predict high
        hc = 0.0  # observation high, predict low
        hd = 0.0  # observation low, predict low
        la = 0.0
        lb = 0.0
        lc = 0.0
        ld = 0.0
        alert_a = 0.0
        alert_b = 0.0
        alert_c = 0.0
        alert_d = 0.0
        integration_a = 0.0
        integration_b = 0.0
        integration_c = 0.0
        integration_d = 0.0

        for each_value in range(len(Y_test)):
            if Y_test[each_value] >= high_alert:  # observation high
                # regression
                if predictions[each_value] >= high_alert:  # forecast high
                    ha += 1
                else:
                    hc += 1

                # classification
                if alert_pred[each_value]:  # [1, 0] = [high, low]
                    alert_a += 1
                else:
                    alert_c += 1

                # integration
                if alert_pred[each_value] or (predictions[each_value] >=
                                              high_alert):
                    integration_a += 1
                else:
                    integration_c += 1

            else:  # observation low
                # regression
                if predictions[each_value] >= high_alert:
                    hb += 1
                else:
                    hd += 1

                # classification
                if alert_pred[each_value]:
                    alert_b += 1
                else:
                    alert_d += 1

                # integration
                if alert_pred[each_value] or (predictions[each_value] >=
                                              high_alert):
                    integration_b += 1
                else:
                    integration_d += 1

            # --------------------------------------------------------

            if Y_test[each_value] >= low_alert:  # observation at/above low
                if predictions[each_value] >= low_alert:
                    la += 1
                else:
                    lc += 1
            else:  # observation below low_alert
                if predictions[each_value] >= low_alert:
                    lb += 1
                else:
                    ld += 1

        print('high label: (%d, %d, %d, %d)' % (ha, hb, hc, hd))
        print('low label: (%d, %d, %d, %d)' % (la, lb, lc, ld))
        print('alert: (%d, %d, %d, %d)' % (alert_a, alert_b, alert_c, alert_d))

    return predictions
Esempio n. 3
0
        X_test = np.array(X_test)
    Y_test = np.array(Y_test, dtype=np.float)

    # ---------------------------------------------- Data Frame --------------------------------------------------------

    print('Constructing time series data set ..', end='')
    # for cnn
    X_cnn_train = construct_time_steps(X_cnn_train[:-1], cnn_time_steps)
    X_cnn_test = construct_time_steps(X_cnn_test[:-1], cnn_time_steps)

    # for rnn
    # X_rnn_train = construct_time_steps(X_train[:-1], time_steps)
    # X_rnn_test = construct_time_steps(X_test[:-1], time_steps)

    # for others
    X_train = concatenate_time_steps(X_train[:-1], time_steps)
    X_test = concatenate_time_steps(X_test[:-1], time_steps)

    # --

    if cnn_time_steps > time_steps:
        X_train = X_train[cnn_time_steps - time_steps:]
        # X_rnn_train = X_rnn_train[cnn_time_steps-time_steps:]
        Y_train = Y_train[cnn_time_steps:]

        X_test = X_test[cnn_time_steps - time_steps:]
        # X_rnn_test = X_rnn_test[cnn_time_steps-time_steps:]
        Y_test = Y_test[cnn_time_steps:]
    else:
        X_cnn_train = X_cnn_train[time_steps - cnn_time_steps:]
        Y_train = Y_train[time_steps:]