Exemple #1
0
def data_collection(local, city, feature_selection, msg):
    """Assemble one feature row per publish time from the records in `msg`.

    For every publish time (in sorted order) the row holds, station by
    station, one value per entry of `feature_selection`.  Stations come from
    `pollution_site_map[local][city]`.  A station missing from a publish
    time contributes 'NaN' for each selected feature.  The finished matrix
    is passed through `missing_check` before being returned.
    """
    # Selection names whose key inside a station record is spelled differently.
    record_keys = {
        'WIND_SPEED': 'WindSpeed',
        'WIND_DIREC': 'WindDirec',
        'PM2.5': 'PM2_5',
    }
    # Fixed ambient temperature substituted per region (no reading is used).
    region_temperature = {'北部': 18, '中部': 20, '高屏': 22}

    stations = pollution_site_map[local][city]

    rows = []
    for stamp in sorted(msg.keys()):
        snapshot = msg[stamp]
        row = []
        for station in stations:
            station_key = station.decode('utf8')
            for feature in feature_selection:
                if station_key not in snapshot:
                    row.append('NaN')
                    continue

                record = snapshot[station_key]
                if feature in record_keys:
                    source_key = record_keys[feature]
                    if source_key in record:
                        row.append(record[source_key])
                    else:
                        row.append('NaN')
                elif feature == 'AMB_TEMP':
                    # An unlisted region appends nothing for this feature.
                    if local in region_temperature:
                        row.append(region_temperature[local])
                elif feature == 'RH':
                    # Relative humidity is a constant placeholder.
                    row.append(50)
                else:
                    row.append(record[feature])
        rows.append(row)

    return missing_check(rows)
Exemple #2
0
# Record when data loading starts; two separate timestamps are kept
# (start_time and initial_time) for use by code outside this fragment.
print('Loading data .. ')
start_time = time.time()
initial_time = time.time()

# Load training data, where: size(X_train) = (data_size, map_l, map_w, map_h), not sequentialized yet.
print('Preparing training dataset ..')
X_train = read_data_map(path=data_path,
                        site=target_site,
                        feature_selection=pollution_kind,
                        date_range=np.atleast_1d(training_year),
                        beginning=training_duration[0],
                        finish=training_duration[-1],
                        update=data_update)
# missing_check is defined elsewhere; presumably it repairs/fills missing
# readings -- confirm against its definition.
X_train = missing_check(X_train)
# Targets are read from the map cell (center_i, center_j); the channel of each
# target pollutant is 6 + its position in pollution_kind.
# NOTE(review): the offset 6 presumably skips leading non-pollutant channels
# produced by read_data_map -- confirm.
Y_train = np.array(X_train)[:, center_i, center_j,
                            [6 + pollution_kind.index(i) for i in target_kind]]

# Load testing data, where: size(X_test) = (data_size, map_l, map_w, map_h), not sequentialized yet.
print('Preparing testing dataset ..')
X_test = read_data_map(path=data_path,
                       site=target_site,
                       feature_selection=pollution_kind,
                       date_range=np.atleast_1d(testing_year),
                       beginning=testing_duration[0],
                       finish=testing_duration[-1],
                       update=data_update)
X_test = missing_check(X_test)
# Same centre-cell / channel selection as for the training targets.
Y_test = np.array(X_test)[:, center_i, center_j,
                          [6 + pollution_kind.index(i) for i in target_kind]]
Exemple #3
0
# Report the configured training and testing periods (year/duration bounds).
print('Training for %s/%s to %s/%s' % (training_year[0], training_duration[0], training_year[-1], training_duration[-1]))
print('Testing for %s/%s to %s/%s' % (testing_year[0], testing_duration[0], testing_year[-1], testing_duration[-1]))

if is_training:
    print('Training ..')
else:
    print('Testing ..')

# Read the data; start_time is used below to report how long reading took.
print('Reading data .. ')
start_time = time.time()
print('preparing training set ..')
# The target site is appended last, so its readings occupy the final
# len(pollution_kind) columns of every row (see the slicing below).
X_train = read_data_sets(sites=site_list+[target_site], date_range=np.atleast_1d(training_year),
                         beginning=training_duration[0], finish=training_duration[-1],
                         feature_selection=pollution_kind, update=data_update)
X_train = missing_check(X_train)
# Split each row: last len(pollution_kind) columns -> target-site readings,
# of which only the target_kind column is kept as Y; the rest is the input X.
Y_train = np.array(X_train)[:, -len(pollution_kind):]
Y_train = Y_train[:, pollution_kind.index(target_kind)]
X_train = np.array(X_train)[:, :-len(pollution_kind)]

print('preparing testing set ..')
X_test = read_data_sets(sites=site_list + [target_site], date_range=np.atleast_1d(testing_year),
                        beginning=testing_duration[0], finish=testing_duration[-1],
                        feature_selection=pollution_kind, update=data_update)
# Unlike the training set, the test targets are taken from the rows as read,
# before missing_check; only the test inputs are passed through missing_check.
Y_test = np.array(X_test)[:, -len(pollution_kind):]
Y_test = Y_test[:, pollution_kind.index(target_kind)]
X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

final_time = time.time()
print('Reading data .. ok, ', end='')
time_spent_printer(start_time, final_time)
Exemple #4
0
    def load_data(self,
                  data_file,
                  site_list,
                  target_site,
                  target_kind,
                  training_year,
                  training_duration,
                  pollution_kind,
                  SEQ_LENGTH_1,
                  SEQ_LENGTH_2,
                  data_update=False):
        """Read the site data and write the target windows to `data_file`.

        Rows are read with `read_data_sets` for `site_list` plus
        `target_site` (appended last) and passed through `missing_check`.
        The `target_kind` column of the target site forms the target series,
        which is cut into sliding windows of `SEQ_LENGTH_2` consecutive
        values.  Each window is written to `data_file` as one line of
        space-separated values.

        The input matrix is also expanded (wind direction -> two coordinates)
        and turned into time steps of length `SEQ_LENGTH_1`, but within this
        method it is only stored (as `self.X`) when
        `SEQ_LENGTH_1 < SEQ_LENGTH_2`.
        """
        print('Reading data .. ')
        kinds_per_site = len(pollution_kind)
        X = missing_check(
            read_data_sets(sites=site_list + [target_site],
                           date_range=np.atleast_1d(training_year),
                           beginning=training_duration[0],
                           finish=training_duration[-1],
                           feature_selection=pollution_kind,
                           update=data_update))

        # The target site's readings are the trailing columns of every row.
        Y = np.array(X)[:, -kinds_per_site:]
        Y = Y[:, pollution_kind.index(target_kind)]

        # Sliding windows over the target series; stop at the first start
        # position that no longer leaves room for a full window.
        target_windows = []
        for start in range(len(Y)):
            if (start + (SEQ_LENGTH_2 - 1)) >= len(Y):
                break
            target_windows.append(
                [Y[start + offset] for offset in range(SEQ_LENGTH_2)])

        X = np.array(X)[:, :-kinds_per_site]

        # feature process: replace each site's wind direction by the two
        # values returned from data_coordinate_angle.
        if 'WIND_DIREC' in pollution_kind:
            direction_offset = pollution_kind.index('WIND_DIREC')
            site_count = len(site_list)
            X = X.tolist()
            for row in X:
                for site_no in range(site_count):
                    # Every site already processed made the row one column
                    # longer, hence the extra `+ site_no` shift.
                    position = (direction_offset + site_no * kinds_per_site +
                                site_no)
                    coordin = data_coordinate_angle(row.pop(position))
                    row[position:position] = [coordin[0], coordin[1]]
            X = np.array(X)

        X = construct_time_steps(X[:-1], SEQ_LENGTH_1)

        # NOTE(review): only this branch stores the inputs (on self.X); in the
        # other cases X stays local -- confirm this asymmetry is intended.
        if SEQ_LENGTH_1 < SEQ_LENGTH_2:
            self.X = X[0:len(target_windows)]
        elif SEQ_LENGTH_1 > SEQ_LENGTH_2:
            target_windows = target_windows[:len(X)]

        with open(data_file, 'w') as f:
            for window in target_windows:
                f.write(' '.join([str(value) for value in window]))
                f.write('\n')
def ensemble_model(target_kind, local, city, target_site, training_year,
                   testing_year, training_duration, testing_duration,
                   interval_hours, data, is_training):
    """Train, or load and apply, the stacked XGBoost + LSTM forecaster.

    Two base models (an XGBoost regressor on concatenated time steps and a
    dropout LSTM on sequential time steps) are stacked: their predictions are
    appended to the flat inputs and fed to a second XGBoost regressor plus an
    XGBoost classifier (high-alert yes/no).  Models and the normalisation
    statistics are stored under
    ``root_path + "model/<local>/<city>/<interval_hours>h/<target_kind>/"``.

    Args:
        target_kind: pollutant to predict; only 'PM2.5' is supported.
        local, city: keys into the module-level `pollution_site_map`.
        target_site: name of the site being predicted.
        training_year, testing_year: strings of the form 'start-end'
            (e.g. '2014-2015'); they are split on the first '-'.
        training_duration, testing_duration: strings of the form
            'start-end' whose parts contain a '/' (e.g. '1/1-12/31').
        interval_hours: horizon in hours; converted with `int()`.
        data: input rows used as the test set when `is_training` is false
            (ignored when training).
        is_training: when true, read the data sets, fit and save all models
            and print evaluation statistics; otherwise load the saved
            statistics and models and only predict on `data`.

    Returns:
        The de-normalised ensemble predictions
        (``mean_y_train + std_y_train * pred``).

    Raises:
        ValueError: if `target_kind` is not supported, or if a year/duration
            string lacks the expected '-' or '/' separator.

    Notes:
        Reads the module-level names `pollution_site_map`, `root_path`,
        `high_alert` and `low_alert`.  Several sanity checks call `input()`
        with a message, which blocks until the operator responds.
    """
    print('is_training(%s) = %s' % (target_site, is_training))

    site_list = pollution_site_map[local][city]  # station names of the city

    # change format from   2014-2015   to   ['2014', '2015']
    training_year = [
        training_year[:training_year.index('-')],
        training_year[training_year.index('-') + 1:]
    ]
    testing_year = [
        testing_year[:testing_year.index('-')],
        testing_year[testing_year.index('-') + 1:]
    ]

    training_duration = [
        training_duration[:training_duration.index('-')],
        training_duration[training_duration.index('-') + 1:]
    ]
    testing_duration = [
        testing_duration[:testing_duration.index('-')],
        testing_duration[testing_duration.index('-') + 1:]
    ]
    interval_hours = int(interval_hours)

    # Collapse 'YYYY-YYYY' with identical bounds to a single year.
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)
    else:
        input(
            'The range of testing year should not more than one year or crossing the bound of years.'
        )

    # Fill in any years lying between the first and last training year.
    rangeofYear = int(training_year[-1]) - int(training_year[0])
    for i in range(rangeofYear):
        if not (str(i + int(training_year[0])) in training_year):
            training_year.insert(i, str(i + int(training_year[0])))

    # Training Parameters
    # WIND_DIREC is a specific feature, that need to be processed, and it can only be element of input vector now.
    if target_kind == 'PM2.5':
        pollution_kind = [
            'PM2.5', 'O3', 'AMB_TEMP', 'RH', 'WIND_SPEED', 'WIND_DIREC'
        ]
    else:
        # Previously this fell through and crashed later with an unbound
        # local; fail early with an explicit message instead.
        raise ValueError('unsupported target_kind: %r' % (target_kind,))
    data_update = False
    seed = 0

    # Network Parameters
    # Each site's wind direction is expanded into two values, adding one
    # extra column per site.
    input_size = (len(site_list) * len(pollution_kind) +
                  len(site_list)) if 'WIND_DIREC' in pollution_kind else (
                      len(site_list) * len(pollution_kind))
    time_steps = 12
    output_size = 1

    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/%sh/%s/" % (local, city, interval_hours,
                                                  target_kind)
    training_begining = training_duration[0][:training_duration[0].index('/')]
    training_deadline = training_duration[-1][:training_duration[-1].index('/'
                                                                           )]
    print('site: %s' % target_site)
    print('Training for %s/%s to %s/%s' %
          (training_year[0], training_duration[0], training_year[-1],
           training_duration[-1]))
    print('Testing for %s/%s to %s/%s' %
          (testing_year[0], testing_duration[0], testing_year[-1],
           testing_duration[-1]))

    def ave(X, Y, interval_hours):
        """Replace Y[i] by the mean of Y[i : i + interval_hours] (in place).

        Rows without enough following values are cut off from X and Y.
        Currently unused; `higher` is the aggregator applied below.
        """
        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y)):
            # check the reserve data is enough or not
            if (len(Y) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y[i] += Y[i + j + 1]
            Y[i] /= interval_hours
        if deadline:
            X = X[:deadline]
            Y = Y[:deadline]
        return X, Y

    def higher(X, Y, interval_hours):
        """Replace Y[i] by the maximum of Y[i : i + interval_hours] (in place).

        Rows without a full window of following values are cut off from X
        and Y.  With interval_hours <= 1 the inputs are returned unchanged.
        """
        reserve_hours = 1  # choose the first n number of biggest
        if interval_hours > reserve_hours:
            deadline = 0
            for i in range(len(Y)):
                # check the reserve data is enough or not
                if (len(Y) - i) < interval_hours:
                    deadline = i
                    break  # not enough
                higher_list = []
                for j in range(interval_hours):
                    if len(higher_list) < reserve_hours:
                        higher_list.append(Y[i + j])
                    elif Y[i + j] > higher_list[0]:
                        higher_list[0] = Y[i + j]
                    higher_list = sorted(higher_list)
                Y[i] = np.array(higher_list).sum() / reserve_hours
            if deadline:
                X = X[:deadline]
                Y = Y[:deadline]
        return X, Y

    if is_training:
        # reading data
        print('Reading data .. ')
        start_time = time.time()
        print('preparing training set ..')
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0],
                                 finish=training_duration[-1],
                                 feature_selection=pollution_kind,
                                 update=data_update)
        X_train = missing_check(X_train)
        # The target site is last: its readings are the trailing columns.
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set ..')
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0],
                                finish=testing_duration[-1],
                                feature_selection=pollution_kind,
                                update=data_update)
        # Test targets are taken before missing_check; rows whose target is
        # missing are dropped further below instead.
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data .. ok, ', end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize (test inputs use the training statistics)
        print('Normalize ..')
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_train = np.array([(x_train - mean_X_train) / std_X_train
                            for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.")
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f  std_y_train: %f' %
              (mean_y_train, std_y_train))

        # Persist the statistics as one comma-joined string; the inference
        # branch below parses this exact textual format.
        with open(folder + "%s_parameter.pickle" % target_site, 'wb') as fw:
            cPickle.dump(
                str(mean_X_train) + ',' + str(std_X_train) + ',' +
                str(mean_y_train) + ',' + str(std_y_train), fw)

        # feature process: de-normalise each wind direction and replace it
        # by the two values returned from data_coordinate_angle.
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_train = X_train.tolist()
            X_test = X_test.tolist()
            for i in range(len(X_train)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    # `+ j`: every site already processed grew the row by one.
                    coordin = data_coordinate_angle(
                        (X_train[i].pop(specific_index + j)) *
                        std_X_train[specific_index] +
                        mean_X_train[specific_index])
                    # NOTE(review): inserted at specific_index (without + j);
                    # kept as is because saved models depend on this column
                    # order -- confirm before changing.
                    X_train[i].insert(specific_index, coordin[1])
                    X_train[i].insert(specific_index, coordin[0])
                    if i < len(X_test):
                        coordin = data_coordinate_angle(
                            (X_test[i].pop(specific_index + j)) *
                            std_X_train[specific_index] +
                            mean_X_train[specific_index])
                        X_test[i].insert(specific_index, coordin[1])
                        X_test[i].insert(specific_index, coordin[0])
            X_train = np.array(X_train)
            X_test = np.array(X_test)
        # Builtin float replaces the removed `np.float` alias (same dtype).
        Y_test = np.array(Y_test, dtype=float)

        # --

        print('Constructing time series data set ..')
        # for rnn
        X_rnn_train = construct_time_steps(X_train[:-1], time_steps)
        X_rnn_test = construct_time_steps(X_test[:-1], time_steps)

        X_train = concatenate_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]

        X_test = concatenate_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]

        [X_train, Y_train] = higher(X_train, Y_train, interval_hours)
        [X_test, Y_test] = higher(X_test, Y_test, interval_hours)
        X_rnn_train = X_rnn_train[:len(X_train)]
        X_rnn_test = X_rnn_test[:len(X_test)]

        # delete data which have missing values
        i = 0
        while i < len(Y_test):
            # A missing target (e.g. NaN) fails this comparison.
            if not (Y_test[i] > -10000):
                Y_test = np.delete(Y_test, i, 0)
                X_test = np.delete(X_test, i, 0)
                X_rnn_test = np.delete(X_rnn_test, i, 0)
                # The next element has shifted into position i; re-check it
                # rather than rescanning from the start.
                continue
            i += 1
        Y_test = np.array(Y_test, dtype=float)

        # --

        X_rnn_train = np.array(X_rnn_train)
        X_rnn_test = np.array(X_rnn_test)
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        # Re-seeding before each shuffle applies the same permutation to all
        # three training arrays, keeping rows aligned.
        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

        np.random.seed(seed)
        np.random.shuffle(X_rnn_train)

    else:  # is_training = false
        # mean and std saved by a previous training run
        # NOTE: pickle files must come from a trusted location; unpickling
        # untrusted data can execute arbitrary code.
        with open(folder + "%s_parameter.pickle" % target_site, 'rb') as fr:
            [mean_X_train, std_X_train, mean_y_train,
             std_y_train] = (cPickle.load(fr)).split(',')
        # Parse the str(ndarray) text back into numeric arrays.
        mean_X_train = mean_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        while '' in mean_X_train:
            mean_X_train.pop(mean_X_train.index(''))
        mean_X_train = np.array(mean_X_train, dtype=float)
        std_X_train = std_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        while '' in std_X_train:
            std_X_train.pop(std_X_train.index(''))
        std_X_train = np.array(std_X_train, dtype=float)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)

        # reading data
        print('preparing testing set ..')
        X_test = data
        X_test = missing_check(np.array(X_test))

        # normalize
        print('Normalize ..')
        if 0 in std_X_train:
            input("Denominator can't be 0.")
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        # feature process (same wind-direction expansion as in training)
        if 'WIND_DIREC' in pollution_kind:
            index_of_kind = pollution_kind.index('WIND_DIREC')
            length_of_kind_list = len(pollution_kind)
            len_of_sites_list = len(site_list)
            X_test = X_test.tolist()
            for i in range(len(X_test)):
                for j in range(len_of_sites_list):
                    specific_index = index_of_kind + j * length_of_kind_list
                    coordin = data_coordinate_angle(
                        (X_test[i].pop(specific_index + j)) *
                        std_X_train[specific_index] +
                        mean_X_train[specific_index])
                    X_test[i].insert(specific_index, coordin[1])
                    X_test[i].insert(specific_index, coordin[0])
            X_test = np.array(X_test)

        # --

        print('Constructing time series data set ..')
        X_rnn_test = construct_time_steps(X_test, time_steps)
        X_test = concatenate_time_steps(X_test, time_steps)

        # --

        X_rnn_test = np.array(X_rnn_test)
        X_test = np.array(X_test)

    # -- xgboost --
    print('- xgboost -')

    filename = ("xgboost_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    print(filename)

    if is_training:
        xgb_model = xgb.XGBRegressor().fit(X_train, Y_train)

        with open(folder + filename, 'wb') as fw:
            cPickle.dump(xgb_model, fw)
    else:
        with open(folder + filename, 'rb') as fr:
            xgb_model = cPickle.load(fr)

    xgb_pred = xgb_model.predict(X_test)

    # -- rnn --
    print('- rnn -')

    filename = ("sa_DropoutLSTM_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    print(filename)

    # Network Parameters
    hidden_size = 20

    print("Expected args: p_W, p_U, p_dense, p_emb, weight_decay, batch_size")
    print("Using default args:")
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128"]
    args = [float(a) for a in param[1:]]
    print(args)
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size = args
    batch_size = int(batch_size)

    # --

    print('Build rnn model...')
    start_time = time.time()
    rnn_model = Sequential()

    # layer 1
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None,
                           input_shape=(time_steps, input_size)))
    rnn_model.add(
        LSTM(hidden_size,
             W_regularizer=l2(weight_decay),
             U_regularizer=l2(weight_decay),
             b_regularizer=l2(weight_decay),
             dropout_W=p_W,
             dropout_U=p_U))  # return_sequences=True
    rnn_model.add(Dropout(p_dense))

    # output layer
    rnn_model.add(
        BatchNormalization(epsilon=0.001,
                           mode=0,
                           axis=-1,
                           momentum=0.99,
                           weights=None,
                           beta_init='zero',
                           gamma_init='one',
                           gamma_regularizer=None,
                           beta_regularizer=None))
    rnn_model.add(
        Dense(output_size,
              W_regularizer=l2(weight_decay),
              b_regularizer=l2(weight_decay)))

    optimiser = 'adam'
    rnn_model.compile(loss='mean_squared_error', optimizer=optimiser)

    final_time = time.time()
    time_spent_printer(start_time, final_time)

    if is_training:
        print("Train...")
        start_time = time.time()
        rnn_model.fit(X_rnn_train, Y_train, batch_size=batch_size, epochs=50)

        # Potentially save weights
        rnn_model.save_weights(folder + filename, overwrite=True)

        final_time = time.time()
        time_spent_printer(start_time, final_time)

    else:
        print('loading model ..')
        rnn_model.load_weights(folder + filename)

    rnn_pred = rnn_model.predict(X_rnn_test, batch_size=500, verbose=1)
    final_time = time.time()
    time_spent_printer(start_time, final_time)

    # --  ensemble --

    print('stacking ..')
    if is_training:
        # Second-level inputs: flat features plus both base-model outputs.
        xgb_output = xgb_model.predict(X_train).reshape(len(X_train), 1)
        rnn_output = rnn_model.predict(X_rnn_train, batch_size=500, verbose=1)
        ensemble_X_train = np.hstack((X_train, xgb_output, rnn_output))

        # Binary alert labels from the de-normalised training targets.
        Y_alert_train = [y * std_y_train + mean_y_train for y in Y_train]
        for element in range(len(Y_train)):
            if Y_alert_train[element] > high_alert:
                Y_alert_train[element] = 1  # [1, 0] = [high, low]
            else:
                Y_alert_train[element] = 0

    xgb_pred = xgb_pred.reshape(len(X_test), 1)
    rnn_pred = rnn_pred.reshape(len(X_test), 1)
    ensemble_X_test = np.hstack((X_test, xgb_pred, rnn_pred))

    print('- ensemble -')
    filename = ("ensemble_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                (target_site, training_year[0], training_begining,
                 training_year[-1], training_deadline, interval_hours))
    filename2 = ("classification_%s_training_%s_m%s_to_%s_m%s_interval_%s" %
                 (target_site, training_year[0], training_begining,
                  training_year[-1], training_deadline, interval_hours))

    if is_training:
        ensemble_model = xgb.XGBRegressor().fit(ensemble_X_train, Y_train)
        classification_model = xgb.XGBClassifier().fit(ensemble_X_train,
                                                       Y_alert_train)

        with open(folder + filename, 'wb') as fw:
            cPickle.dump(ensemble_model, fw)

        with open(folder + filename2, 'wb') as fw2:
            cPickle.dump(classification_model, fw2)
    else:
        with open(folder + filename, 'rb') as fr:
            ensemble_model = cPickle.load(fr)

        with open(folder + filename2, 'rb') as fr2:
            classification_model = cPickle.load(fr2)

    pred = ensemble_model.predict(ensemble_X_test)
    alert_pred = classification_model.predict(ensemble_X_test)

    # --

    # Undo the target normalisation.
    predictions = mean_y_train + std_y_train * pred

    if is_training:
        print('rmse: %.5f' % (np.mean((Y_test - predictions)**2, 0)**0.5))

        def target_level(target, kind='PM2.5'):
            """Map a PM2.5 value to its level 1..10 (1 for invalid values)."""
            if kind == 'PM2.5':
                if (target >= 0) and (target < 11.5):  # 0-11
                    return 1
                elif (target >= 11.5) and (target < 23.5):  # 12-23
                    return 2
                elif (target >= 23.5) and (target < 35.5):  # 24-35
                    return 3
                elif (target >= 35.5) and (target < 41.5):  # 36-41
                    return 4
                elif (target >= 41.5) and (target < 47.5):  # 42-47
                    return 5
                elif (target >= 47.5) and (target < 53.5):  # 48-53
                    return 6
                elif (target >= 53.5) and (target < 58.5):  # 54-58
                    return 7
                elif (target >= 58.5) and (target < 64.5):  # 59-64
                    return 8
                elif (target >= 64.5) and (target < 70.5):  # 65-70
                    return 9
                elif target >= 70.5:  # others(71+)
                    return 10
                else:
                    print('error value: %d' % target)
                    return 1

        pred_label = np.zeros(len(predictions))
        real_target = np.zeros(len(Y_test))

        pred_label_true = 0.
        pred_label_false = 0.

        four_label_true = 0.0
        four_label_false = 0.0

        # calculate the accuracy of ten level
        for i in range(len(predictions)):
            pred_label[i] = target_level(predictions[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == pred_label[i]:
                pred_label_true += 1
            else:
                pred_label_false += 1

            # four label: levels grouped as 1-3, 4-6, 7-9 and 10
            if (real_target[i] >= 1
                    and real_target[i] <= 3) and (pred_label[i] >= 1
                                                  and pred_label[i] <= 3):
                four_label_true += 1
            elif (real_target[i] >= 4
                  and real_target[i] <= 6) and (pred_label[i] >= 4
                                                and pred_label[i] <= 6):
                four_label_true += 1
            elif (real_target[i] >= 7
                  and real_target[i] <= 9) and (pred_label[i] >= 7
                                                and pred_label[i] <= 9):
                four_label_true += 1
            elif (real_target[i] >= 10) and (pred_label[i] >= 10):
                four_label_true += 1
            else:
                four_label_false += 1

        print('Ten level accuracy: %.5f' %
              (pred_label_true / (pred_label_true + pred_label_false)))
        print('Four level accuracy: %.5f' %
              (four_label_true / (four_label_true + four_label_false)))
        print('--')

        # --

        # Confusion counts: a = observed high / predicted high,
        # b = observed low / predicted high, c = observed high / predicted
        # low, d = observed low / predicted low.
        ha = 0.0  # regression vs. high_alert
        hb = 0.0
        hc = 0.0
        hd = 0.0
        la = 0.0  # regression vs. low_alert
        lb = 0.0
        lc = 0.0
        ld = 0.0
        alert_a = 0.0  # classifier vs. high_alert
        alert_b = 0.0
        alert_c = 0.0
        alert_d = 0.0
        integration_a = 0.0  # classifier OR regression vs. high_alert
        integration_b = 0.0
        integration_c = 0.0
        integration_d = 0.0

        for each_value in range(len(Y_test)):
            if Y_test[each_value] >= high_alert:  # observation high
                # regression
                if predictions[
                        each_value] >= high_alert:  # forecast high(with tolerance)
                    ha += 1
                else:
                    hc += 1

                # classification
                if alert_pred[each_value]:  # [1, 0] = [high, low]
                    alert_a += 1
                else:
                    alert_c += 1

                # integration
                if alert_pred[each_value] or (predictions[each_value] >=
                                              high_alert):
                    integration_a += 1
                else:
                    integration_c += 1

            else:  # observation low
                # regression
                if predictions[each_value] >= high_alert:
                    hb += 1
                else:
                    hd += 1

                # classification
                if alert_pred[each_value]:
                    alert_b += 1
                else:
                    alert_d += 1

                # integration
                if alert_pred[each_value] or (predictions[each_value] >=
                                              high_alert):
                    integration_b += 1
                else:
                    integration_d += 1

            # --------------------------------------------------------

            if Y_test[each_value] >= low_alert:  # observation higher
                if predictions[each_value] >= low_alert:
                    la += 1
                else:
                    lc += 1
            else:  # observation very low
                if predictions[each_value] >= low_alert:
                    lb += 1
                else:
                    ld += 1

        print('high label: (%d, %d, %d, %d)' % (ha, hb, hc, hd))
        print('low label: (%d, %d, %d, %d)' % (la, lb, lc, ld))
        print('alert: (%d, %d, %d, %d)' % (alert_a, alert_b, alert_c, alert_d))

    return predictions
Exemple #6
0

# Read the raw training and testing rows (this fragment runs unconditionally;
# an `is_training` guard was commented out in the original).
# start_time / initial_time are captured for timing code outside this fragment.
print('Reading data .. ')
start_time = time.time()
initial_time = time.time()
print('preparing training set ..')

# The target site is appended last, so its readings occupy the final
# len(pollution_kind) columns of every row.
raw_data_train = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(training_year),
                                beginning=training_duration[0],
                                finish=training_duration[-1],
                                feature_selection=pollution_kind,
                                update=data_update)
raw_data_train = missing_check(raw_data_train)
# Training target: the target_kind column of the target site; the remaining
# leading columns are kept as the raw training inputs.
Y_train = np.array(raw_data_train)[:, -len(pollution_kind):]
Y_train = Y_train[:, pollution_kind.index(target_kind)]
raw_data_train = np.array(raw_data_train)[:, :-len(pollution_kind)]

print('preparing testing set ..')

raw_data_test = read_data_sets(sites=site_list + [target_site],
                               date_range=np.atleast_1d(testing_year),
                               beginning=testing_duration[0],
                               finish=testing_duration[-1],
                               feature_selection=pollution_kind,
                               update=data_update)
# Test targets are taken from the rows as read, before any missing_check.
Y_test = np.array(raw_data_test)[:, -len(pollution_kind):]
Y_test = Y_test[:, pollution_kind.index(target_kind)]
raw_data_test = missing_check(
def _rnn_expand_wind_direc(rows, pollution_kind, site_count, mean_X_train,
                           std_X_train):
    """Swap every site's WIND_DIREC value for a coordinate pair, in place.

    ``rows`` is a list of mutable feature lists holding normalised values.
    For each of the ``site_count`` sites the WIND_DIREC entry is removed,
    de-normalised with that column's training mean / std, passed to
    ``data_coordinate_angle`` and the first two items of its result are
    inserted back, so every row grows by one element per site.
    """
    index_of_kind = pollution_kind.index('WIND_DIREC')
    length_of_kind_list = len(pollution_kind)
    for row in rows:
        for j in range(site_count):
            # Column of this site's WIND_DIREC in the un-expanded layout;
            # the mean / std vectors still use that layout.
            specific_index = index_of_kind + j * length_of_kind_list
            # Every site handled so far made the row one element longer, so
            # the value to remove sits j positions further right.
            coordin = data_coordinate_angle(
                row.pop(specific_index + j) * std_X_train[specific_index] +
                mean_X_train[specific_index])
            # NOTE(review): the pair is inserted at specific_index, i.e. j
            # positions before the slot the value was popped from. This is
            # kept exactly as it was so the column order stays the same.
            row.insert(specific_index, coordin[1])
            row.insert(specific_index, coordin[0])


def rnn(pollution_kind, local, city, target_site, training_year, testing_year,
        training_duration, testing_duration, interval_hours, data,
        is_training):
    """Train, or load and apply, a DropoutLSTM regressor for PM2.5.

    The target feature is fixed to 'PM2.5', so ``pollution_kind`` has to
    contain it.  When ``is_training`` is true the data sets are read with
    ``read_data_sets``, the normalisation statistics are pickled, the model
    is fitted (checkpoints go to an ``.hdf5`` file) and evaluation figures are
    printed and plotted.  Otherwise the stored statistics and weights are
    loaded and ``data`` is used as the model input.

    Args:
        pollution_kind: list of feature names selected per site.
        local: first key into ``pollution_site_map``; also part of the model
            folder path.
        city: second key into ``pollution_site_map``; also part of the model
            folder path.
        target_site: site whose value is predicted; appended to the site list
            when reading data and embedded in the model file name.
        training_year: string "(start year)-(end year)", e.g. "2014-2015".
        testing_year: same format as ``training_year``.
        training_duration: string "(start date)-(end date)", e.g.
            "1/1-12/31".
        testing_duration: same format as ``training_duration``; the month of
            its start date is embedded in the model file name.
        interval_hours: number of hours the label is averaged over (converted
            with ``int``).
        data: input rows for prediction; only read when ``is_training`` is
            false.  NOTE(review): the code treats it as a sequence of feature
            rows in the same column layout as the training features --
            confirm against the caller.
        is_training: truthy to train and evaluate, falsy to predict.

    Returns:
        ``mean_y_train + std_y_train * prob_mean``: the de-normalised mean of
        50 stochastic forward passes over the test input.

    Raises:
        ValueError: if a year / duration string has no '-' or the testing
            start date has no '/'.
    """
    print('is_training(%s) = %s' % (target_site, is_training))

    site_list = pollution_site_map[local][city]

    # change format from   2014-2015   to   ['2014', '2015']
    training_year = [
        training_year[:training_year.index('-')],
        training_year[training_year.index('-') + 1:]
    ]
    testing_year = [
        testing_year[:testing_year.index('-')],
        testing_year[testing_year.index('-') + 1:]
    ]

    # same split for the durations:   1/1-12/31   ->   ['1/1', '12/31']
    training_duration = [
        training_duration[:training_duration.index('-')],
        training_duration[training_duration.index('-') + 1:]
    ]
    testing_duration = [
        testing_duration[:testing_duration.index('-')],
        testing_duration[testing_duration.index('-') + 1:]
    ]
    # predict the label of average data of many hours later, default is 1
    interval_hours = int(interval_hours)

    # A range that starts and ends in the same year is a single year.
    if training_year[0] == training_year[1]:
        training_year.pop(1)
    if testing_year[0] == testing_year[1]:
        testing_year.pop(1)

    # Training Parameters
    # WIND_DIREC is a specific feature, that need to be processed, and it can
    # only be element of input vector now.
    target_kind = 'PM2.5'
    data_update = False
    seed = 0

    # Network Parameters
    # WIND_DIREC is expanded into two values, i.e. one extra input per site.
    input_size = (len(site_list) * len(pollution_kind) +
                  len(site_list)) if 'WIND_DIREC' in pollution_kind else (
                      len(site_list) * len(pollution_kind))
    time_steps = 12
    hidden_size = 20
    output_size = 1

    # Hyper-parameters: p_W, p_U, p_dense, p_emb, weight_decay, batch_size,
    # maxlen (the leading "" stands in for a program name).
    param = ["", "0.5", "0.5", "0.5", "0.5", "1e-6", "128", "200"]
    args = [float(a) for a in param[1:]]
    p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen = args
    batch_size = int(batch_size)
    maxlen = int(maxlen)
    testing_month = testing_duration[0][:testing_duration[0].index('/')]
    folder = root_path + "model/%s/%s/" % (local, city)
    filename = (
        "sa_DropoutLSTM_pW_%.2f_pU_%.2f_pDense_%.2f_pEmb_%.2f_reg_%f_batch_size_%d_cutoff_%d_epochs_%s_%sm_%sh"
        % (p_W, p_U, p_dense, p_emb, weight_decay, batch_size, maxlen,
           target_site, testing_month, interval_hours))
    print(filename)

    if is_training:
        # reading data
        print('Reading data for %s .. ' % target_site)
        start_time = time.time()
        print('preparing training set for %s ..' % target_site)
        X_train = read_data_sets(sites=site_list + [target_site],
                                 date_range=np.atleast_1d(training_year),
                                 beginning=training_duration[0],
                                 finish=training_duration[-1],
                                 feature_selection=pollution_kind,
                                 update=data_update)
        X_train = missing_check(X_train)
        # The trailing len(pollution_kind) columns are split off as labels
        # (only the target_kind column is used); the rest are the inputs.
        Y_train = np.array(X_train)[:, -len(pollution_kind):]
        Y_train = Y_train[:, pollution_kind.index(target_kind)]
        X_train = np.array(X_train)[:, :-len(pollution_kind)]

        print('preparing testing set for %s..' % target_site)
        X_test = read_data_sets(sites=site_list + [target_site],
                                date_range=np.atleast_1d(testing_year),
                                beginning=testing_duration[0],
                                finish=testing_duration[-1],
                                feature_selection=pollution_kind,
                                update=data_update)
        # Test labels are taken before missing_check, so missing entries stay
        # visible and the affected samples can be dropped further down.
        Y_test = np.array(X_test)[:, -len(pollution_kind):]
        Y_test = Y_test[:, pollution_kind.index(target_kind)]
        X_test = missing_check(np.array(X_test)[:, :-len(pollution_kind)])

        final_time = time.time()
        print('Reading data for %s.. ok, ' % target_site, end='')
        time_spent_printer(start_time, final_time)

        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        # input() is used as a blocking warning: execution continues after
        # the prompt is answered.
        if (len(X_train) < time_steps) or (len(X_test) < time_steps):
            input('time_steps(%d) too long.' % time_steps)

        # normalize (test inputs use the training statistics)
        print('Normalize for %s ..' % target_site)
        mean_X_train = np.mean(X_train, axis=0)
        std_X_train = np.std(X_train, axis=0)
        if 0 in std_X_train:
            input("Denominator can't be 0.(%s)" % target_site)
        X_train = np.array([(x_train - mean_X_train) / std_X_train
                            for x_train in X_train])
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        mean_y_train = np.mean(Y_train)
        std_y_train = np.std(Y_train)
        if not std_y_train:
            input("Denominator can't be 0.(%s)" % target_site)
        Y_train = [(y - mean_y_train) / std_y_train for y in Y_train]
        print('mean_y_train: %f  std_y_train: %f (%s)' %
              (mean_y_train, std_y_train, target_site))

        # Persist the statistics as one comma-joined string; the prediction
        # branch below parses exactly this format.
        with open(folder + filename + ".pickle", 'wb') as fw:
            cPickle.dump(
                str(mean_X_train) + ',' + str(std_X_train) + ',' +
                str(mean_y_train) + ',' + str(std_y_train), fw)

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            X_train = X_train.tolist()
            X_test = X_test.tolist()
            # Each set is expanded on its own, so every test row is handled
            # even when there are more test rows than training rows.
            _rnn_expand_wind_direc(X_train, pollution_kind, len(site_list),
                                   mean_X_train, std_X_train)
            _rnn_expand_wind_direc(X_test, pollution_kind, len(site_list),
                                   mean_X_train, std_X_train)
            X_train = np.array(X_train)
            X_test = np.array(X_test)
        Y_test = np.array(Y_test, dtype=float)

        # --
        print('Constructing time series data set for %s ..' % target_site)
        X_train = construct_time_steps(X_train[:-1], time_steps)
        Y_train = Y_train[time_steps:]
        # Each label becomes the mean of itself and the next reserve_hours
        # labels; samples without enough following labels are cut off.
        reserve_hours = interval_hours - 1
        deadline = 0
        for i in range(len(Y_train)):
            # check the reserve data is enough or not
            if (len(Y_train) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y_train[i] += Y_train[i + j + 1]
            Y_train[i] /= interval_hours
        if deadline:
            X_train = X_train[:deadline]
            Y_train = Y_train[:deadline]

        X_test = construct_time_steps(X_test[:-1], time_steps)
        Y_test = Y_test[time_steps:]
        deadline = 0
        for i in range(len(Y_test)):
            # check the reserve data is enough or not
            if (len(Y_test) - i - 1) < reserve_hours:
                deadline = i
                break  # not enough
            for j in range(reserve_hours):
                Y_test[i] += Y_test[i + j + 1]
            Y_test[i] /= interval_hours
        if deadline:
            X_test = X_test[:deadline]
            Y_test = Y_test[:deadline]

        # delete data which have missing values: a missing (NaN) label fails
        # the "> -10000" comparison.  All such samples are removed in one go.
        missing = np.flatnonzero(~(Y_test > -10000))
        Y_test = np.delete(Y_test, missing, 0)
        X_test = np.delete(X_test, missing, 0)
        Y_test = np.array(Y_test, dtype=float)
        # --
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        X_test = np.array(X_test)

        # Re-seeding before each shuffle applies the same permutation to the
        # inputs and the labels.
        np.random.seed(seed)
        np.random.shuffle(X_train)
        np.random.seed(seed)
        np.random.shuffle(Y_train)

    # ------------------------------------
    else:
        # Restore the statistics written by the training branch.
        with open(folder + filename + ".pickle", 'rb') as fr:
            [mean_X_train, std_X_train, mean_y_train,
             std_y_train] = (cPickle.load(fr)).split(',')
        # The vectors were stored via str(ndarray): strip brackets and line
        # breaks, then drop the empty tokens left by repeated spaces.
        mean_X_train = mean_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        mean_X_train = [token for token in mean_X_train if token != '']
        mean_X_train = np.array(mean_X_train, dtype=float)
        std_X_train = std_X_train.replace('[', '').replace(']', '').replace(
            '\n', '').split(' ')
        std_X_train = [token for token in std_X_train if token != '']
        std_X_train = np.array(std_X_train, dtype=float)
        mean_y_train = float(mean_y_train)
        std_y_train = float(std_y_train)

        # input data
        X_test = data

        # normalize
        print('Normalize for %s ..' % target_site)
        X_test = np.array([(x_test - mean_X_train) / std_X_train
                           for x_test in X_test])

        # feature process
        if 'WIND_DIREC' in pollution_kind:
            X_test = X_test.tolist()
            _rnn_expand_wind_direc(X_test, pollution_kind, len(site_list),
                                   mean_X_train, std_X_train)
            # NOTE(review): the rows are wrapped into a batch of one only in
            # this branch; without WIND_DIREC, X_test keeps its shape --
            # confirm this is intended.
            X_test = np.array([X_test])

    print('Build model for %s ..' % target_site)
    start_time = time.time()
    model = Sequential()
    model.add(
        DropoutLSTM(input_size,
                    hidden_size,
                    truncate_gradient=maxlen,
                    W_regularizer=l2(weight_decay),
                    U_regularizer=l2(weight_decay),
                    b_regularizer=l2(weight_decay),
                    p_W=p_W,
                    p_U=p_U))
    model.add(Dropout(p_dense))
    model.add(
        Dense(hidden_size,
              output_size,
              W_regularizer=l2(weight_decay),
              b_regularizer=l2(weight_decay)))

    # optimiser = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=False)
    optimiser = 'adam'
    model.compile(loss='mean_squared_error', optimizer=optimiser)
    final_time = time.time()
    time_spent_printer(start_time, final_time)

    # --

    if is_training:
        print("Train for %s .." % target_site)
        start_time = time.time()
        checkpointer = ModelCheckpoint(filepath=folder + filename + ".hdf5",
                                       verbose=1,
                                       append_epoch_name=False,
                                       save_every_X_epochs=50)
        # Training labels are normalised, so they are scaled back before
        # being handed to ModelTest; the test labels never were normalised.
        modeltest_1 = ModelTest(X_train[:100],
                                mean_y_train +
                                std_y_train * np.atleast_2d(Y_train[:100]).T,
                                test_every_X_epochs=1,
                                verbose=0,
                                loss='euclidean',
                                mean_y_train=mean_y_train,
                                std_y_train=std_y_train,
                                tau=0.1)
        modeltest_2 = ModelTest(X_test,
                                np.atleast_2d(Y_test).T,
                                test_every_X_epochs=1,
                                verbose=0,
                                loss='euclidean',
                                mean_y_train=mean_y_train,
                                std_y_train=std_y_train,
                                tau=0.1)
        model.fit(X_train,
                  Y_train,
                  batch_size=batch_size,
                  nb_epoch=251,
                  callbacks=[checkpointer, modeltest_1, modeltest_2])

        final_time = time.time()
        time_spent_printer(start_time, final_time)

        # --

        print("Test for %s .." % target_site)
        # RMSE on the training set, in de-normalised units.
        standard_prob = model.predict(X_train, batch_size=500, verbose=1)
        print(
            np.mean(((mean_y_train + std_y_train * np.atleast_2d(Y_train).T) -
                     (mean_y_train + std_y_train * standard_prob))**2, 0)**0.5)

        # --

        # Test set: one deterministic pass plus the mean of T stochastic
        # passes; the RMSE of both is printed.
        standard_prob = model.predict(X_test, batch_size=500, verbose=1)
        T = 50
        prob = np.array([
            model.predict_stochastic(X_test, batch_size=500, verbose=0)
            for _ in range(T)
        ])
        prob_mean = np.mean(prob, 0)
        print(
            np.mean((np.atleast_2d(Y_test).T -
                     (mean_y_train + std_y_train * standard_prob))**2, 0)**0.5)
        print(
            np.mean((np.atleast_2d(Y_test).T -
                     (mean_y_train + std_y_train * prob_mean))**2, 0)**0.5)

        standard_prob_pred = np.zeros(len(standard_prob))
        prob_mean_pred = np.zeros(len(prob_mean))
        real_target = np.zeros(len(Y_test))

        standard_prob_true = 0.
        standard_prob_false = 0.
        prob_mean_true = 0.
        prob_mean_false = 0.

        # calculate the accuracy of ten level
        for i in range(len(prob_mean)):
            # The deterministic prediction gets its own level, so the two
            # accuracies printed below can actually differ.
            standard_prob_pred[i] = target_level(mean_y_train +
                                                 std_y_train * standard_prob[i])
            prob_mean_pred[i] = target_level(mean_y_train +
                                             std_y_train * prob_mean[i])
            real_target[i] = target_level(Y_test[i])

            if real_target[i] == standard_prob_pred[i]:
                standard_prob_true += 1
            else:
                standard_prob_false += 1

            if real_target[i] == prob_mean_pred[i]:
                prob_mean_true += 1
            else:
                prob_mean_false += 1

        print('standard_prob_accuracy(%s): %.5f' %
              (target_site, standard_prob_true /
               ((standard_prob_true + standard_prob_false))))
        print('prob_mean_accuracy(%s): %.5f' %
              (target_site,
               (prob_mean_true / (prob_mean_true + prob_mean_false))))

        print('--')

        ha = 0.0  # observation high, predict high
        hb = 0.0  # observation low, predict high
        hc = 0.0  # observation high, predict low
        hd = 0.0  # observation low, predict low
        vha = 0.0  # observation very high, predict very high
        vhb = 0.0  # observation not very high, predict very high
        vhc = 0.0  # observation very high, predict not very high
        vhd = 0.0  # observation not very high, predict not very high
        two_label_true = 0.0
        two_label_false = 0.0
        # statistic of status of prediction by forecast & observation
        for each_label in np.arange(len(real_target)):
            if real_target[each_label] >= 7:  # observation high
                if prob_mean_pred[each_label] >= 7:
                    ha += 1
                    two_label_true += 1
                else:
                    hc += 1
                    two_label_false += 1
            else:  # observation low
                if prob_mean_pred[each_label] >= 7:
                    hb += 1
                    two_label_false += 1
                else:
                    hd += 1
                    two_label_true += 1

            if real_target[each_label] >= 10:  # observation very high
                if prob_mean_pred[each_label] >= 10:
                    vha += 1
                else:
                    vhc += 1
            else:  # observation low
                if prob_mean_pred[each_label] >= 10:
                    vhb += 1
                else:
                    vhd += 1

        print('Two level accuracy of %s : %f' %
              (target_site,
               (two_label_true / (two_label_true + two_label_false))))
        print('high label of %s: (%d, %d, %d, %d)' %
              (target_site, ha, hb, hc, hd))
        print('very high label of %s: (%d, %d, %d, %d)' %
              (target_site, vha, vhb, vhc, vhd))

        # plot the real trend and trend of prediction
        prediction = mean_y_train + std_y_train * prob_mean
        plt.plot(np.arange(len(prediction)),
                 Y_test[:len(prediction)],
                 c='gray')
        plt.plot(np.arange(len(prediction)), prediction, color='pink')

        plt.xticks(np.arange(0, len(prediction), 24))
        plt.yticks(np.arange(0, max(Y_test), 10))
        plt.grid(True)
        plt.rc('axes', labelsize=4)

    else:
        print('loading model for %s ..' % target_site)
        model.load_weights(folder + filename + ".hdf5")

        # The deterministic result is not used afterwards; the call is kept
        # because it prints progress (verbose=1).
        standard_prob = model.predict(X_test, batch_size=1, verbose=1)
        T = 50
        prob = np.array([
            model.predict_stochastic(X_test, batch_size=1, verbose=0)
            for _ in range(T)
        ])
        prob_mean = np.mean(prob, 0)

    return mean_y_train + std_y_train * prob_mean