Example #1
def test(path_test, input_size, hidden_size, batch_size, save_dir, model_name,
         maxlen):
    db = read_data(path_test)

    X = create_sequences(db[:-maxlen], win_size=maxlen, step=maxlen)
    X = np.reshape(X, (X.shape[0], X.shape[1], input_size))

    # build the model: 1 layer LSTM
    print('Build model...')
    model = Sequential()
    model.add(
        LSTM(hidden_size,
             return_sequences=False,
             input_shape=(maxlen, input_size)))
    model.add(Dense(maxlen))

    model.load_weights(save_dir + model_name)
    model.compile(loss='mse', optimizer='adam')

    prediction = model.predict(X, batch_size, verbose=1)
    prediction = prediction.flatten()
    # prediction_container = np.array(prediction).flatten()
    Y = db[maxlen:]
    plt.plot(prediction, label='prediction')
    plt.plot(Y, label='true')
    plt.legend()
    plt.show()
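
The `create_sequences` helper used throughout these time-series snippets is not shown on this page; a minimal sliding-window sketch consistent with the `win_size`/`step` arguments above (the original implementation may differ) could look like this:

import numpy as np

def create_sequences(values, win_size, step):
    # Hypothetical helper: cut a 1-D series into fixed-length windows
    # of win_size samples, advancing by step samples between windows.
    windows = [values[i:i + win_size]
               for i in range(0, len(values) - win_size + 1, step)]
    return np.array(windows)
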
def prepare_datasets(input_data_files, max_len):
    """
    Reads the input data and prepares the train and test files
    :param input_data_files: tuple containing (<path_to_data_file>, <path_to_labels_file>)
    :param max_len: maximum length of the sentence (number of words) in input data
    :return:
    """

    sentences, sentence_labels = utils.read_data(input_data_files[0],
                                                 input_data_files[1])
    vocab = get_vocabulary(sentences)
    word_to_idx = dict()
    word_to_idx["__PAD__"] = 0

    data = np.zeros(shape=(len(sentences), max_len), dtype=np.int64)
    labels = np.zeros(shape=(len(sentences)), dtype=np.int64)

    for idx, w in enumerate(vocab):
        word_to_idx[w] = idx + 1

    for i in range(len(sentences)):
        labels[i] = sentence_labels[i]
        offset = max_len - len(sentences[i])
        for j in range(len(sentences[i])):
            data[i][offset + j] = word_to_idx[sentences[i][j]]

    np.save(constants.SENTIMENT_DATA_PATH, data)
    np.save(constants.SENTIMENT_LABELS_PATH, labels)

    with open(constants.WORD_TO_IDX_PATH, "wb") as w_idx_fp:
        pickle.dump(word_to_idx, w_idx_fp)

    print "saved the train data, labels, and test data, labels"

    pass
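
`get_vocabulary` is likewise external to this snippet; assuming `sentences` is a list of token lists, a plausible minimal version simply collects the unique words (sorted here so the `word_to_idx` mapping is deterministic):

def get_vocabulary(sentences):
    # Hypothetical helper: unique tokens across all tokenized sentences.
    vocab = set()
    for sentence in sentences:
        vocab.update(sentence)
    return sorted(vocab)
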
Example #3
def train_normal_model(path_train, input_size, hidden_size, batch_size,
                       early_stopping_patience, val_percentage, save_dir,
                       model_name, maxlen):
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    db = read_data(path_train)
    train_x = db[:-maxlen]
    train_y = db[maxlen:]

    X = create_sequences(train_x, maxlen, maxlen)
    y = create_sequences(train_y, maxlen, maxlen)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    y = np.reshape(y, (y.shape[0], y.shape[1], 1))
    #
    # preparing the callbacks
    check_pointer = callbacks.ModelCheckpoint(filepath=save_dir + model_name,
                                              verbose=1,
                                              save_best_only=True)
    early_stop = callbacks.EarlyStopping(patience=early_stopping_patience,
                                         verbose=1)

    # build the model: 1 layer LSTM
    print('Build model...')
    model = Sequential()
    # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE
    # note: in a situation where your input sequences have a variable length,
    # use input_shape=(None, nb_feature).
    model.add(LSTM(hidden_size, input_shape=(maxlen, input_size)))
    # For the decoder's input, we repeat the encoded input for each time step
    model.add(RepeatVector(maxlen))
    # The decoder RNN could be multiple layers stacked or a single layer
    model.add(LSTM(hidden_size, return_sequences=True))

    # For each step of the output sequence, predict a single output value
    model.add(TimeDistributed(Dense(1)))

    model.compile(loss='mae', optimizer='adam')
    model.summary()

    model.fit(X,
              y,
              batch_size=batch_size,
              nb_epoch=50,
              validation_split=val_percentage,
              callbacks=[check_pointer, early_stop])

    return model
Example #4
def creat_url():
    """
    生产url
    :return:
    """
    for city_data in read_data('other/city_code.txt'):
        page_index = 0
        city_data = eval(city_data)  # each line is expected to be a Python dict literal
        while page_index < 55:
            page_index += 1
            url = ('https://appv3.qichacha.net/app/v1/base/getNewCompanys'
                   f'?province={city_data["provinceCode"]}'
                   f'&cityCode={city_data["Value"]}'
                   f'&pageIndex={page_index}'
                   f'&timestamp={tim}'
                   f'&sign={sign}'
                   '&platform=other&app_channel=qq')
            yield url, city_data['Desc'], city_data['provinceName']
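
A minimal sketch of how this generator could be consumed; the driver loop below is an assumption and not part of the original code:

if __name__ == '__main__':
    # Hypothetical driver: print each generated URL with its city and province.
    for url, city_name, province_name in creat_url():
        print(province_name, city_name, url)
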
Example #5
def train_normal_model(path_train, input_size, hidden_size, batch_size,
                       early_stopping_patience, val_percentage, save_dir,
                       model_name, maxlen):

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    db = read_data(path_train)
    train_x = db[:-maxlen]
    train_y = db[maxlen:]

    X = create_sequences(train_x, maxlen, maxlen)
    y = create_sequences(train_y, maxlen, maxlen)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))

    # preparing the callbacks
    check_pointer = callbacks.ModelCheckpoint(filepath=save_dir + model_name,
                                              verbose=1,
                                              save_best_only=True)
    early_stop = callbacks.EarlyStopping(patience=early_stopping_patience,
                                         verbose=1)

    # build the model: 1 layer LSTM
    print('Build model...')
    model = Sequential()
    model.add(
        LSTM(hidden_size,
             return_sequences=False,
             input_shape=(maxlen, input_size)))
    model.add(Dense(maxlen))

    model.compile(loss='mse', optimizer='adam')
    model.summary()

    model.fit(X,
              y,
              batch_size=batch_size,
              nb_epoch=100,
              validation_split=val_percentage,
              callbacks=[check_pointer, early_stop])

    return model
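
A hedged example of wiring this training function to the matching `test` function from Example #1; the paths and hyperparameters below are illustrative placeholders, not values from the original project:

if __name__ == '__main__':
    # Hypothetical driver; all paths and hyperparameters are placeholders.
    save_dir = 'checkpoints/'
    model_name = 'lstm_dense.h5'
    train_normal_model('data/train_series.txt', input_size=1, hidden_size=64,
                       batch_size=32, early_stopping_patience=5,
                       val_percentage=0.1, save_dir=save_dir,
                       model_name=model_name, maxlen=140)
    test('data/test_series.txt', input_size=1, hidden_size=64, batch_size=32,
         save_dir=save_dir, model_name=model_name, maxlen=140)
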
Example #6
def test(path_test, input_size, hidden_size, batch_size, save_dir, model_name,
         maxlen):
    db = read_data(path_test)
    X = create_sequences(db, maxlen, maxlen)
    y = create_sequences(db, maxlen, maxlen)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    y = np.reshape(y, (y.shape[0], y.shape[1], 1))

    # build the model: 1 layer LSTM
    print('Build model...')
    model = Sequential()
    # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE
    # note: in a situation where your input sequences have a variable length,
    # use input_shape=(None, nb_feature).
    model.add(LSTM(hidden_size, input_shape=(maxlen, input_size)))
    # For the decoder's input, we repeat the encoded input for each time step
    model.add(RepeatVector(maxlen))
    # The decoder RNN could be multiple layers stacked or a single layer
    model.add(LSTM(hidden_size, return_sequences=True))

    # For each step of the output sequence, predict a single output value
    model.add(TimeDistributed(Dense(1)))

    model.load_weights(save_dir + model_name)

    model.compile(loss='mae', optimizer='adam')
    model.summary()

    prediction = model.predict(
        X,
        batch_size,
        verbose=1,
    )
    prediction = prediction.flatten()
    # prediction_container = np.array(prediction).flatten()
    plt.plot(prediction.flatten()[:4000], label='prediction')
    plt.plot(y.flatten()[maxlen:4000 + maxlen], label='true')
    plt.legend()
    plt.show()

    store_prediction_and_ground_truth(model)
Example #7
    desc = args.description

    model_name = 'LR-Cate'
    model_file = model_name + '-Sample' + '-' + version + '.model' if USE_SAMPLE else model_name + '-' + version + '.model'
    model_metainfo_file = model_name + '-Sample' + '-' + version + '.json' if USE_SAMPLE else model_name + '-' + version + '.json'
    sub_file = 'Sub-' + model_name + '-Sample' + '-' + version + '.txt' if USE_SAMPLE else 'Sub-' + model_name + '-' + version + '.txt'

    if os.path.exists(model_file):
        print('A model with the same version already exists.')
        sys.exit(-1)

    feature_store_path = '../sample/features' if USE_SAMPLE else '../data/features'

    CATE_TRAIN_FILE = 'ensemble_cate_feature_train'
    CATE_TRAIN_FILE = CATE_TRAIN_FILE + '_sample' + '.' + fmt if USE_SAMPLE else CATE_TRAIN_FILE + '.' + fmt
    ensemble_train = read_data(
        os.path.join(feature_store_path, CATE_TRAIN_FILE), fmt)

    CATE_TEST_FILE = 'ensemble_cate_feature_test'
    CATE_TEST_FILE = CATE_TEST_FILE + '_sample' + '.' + fmt if USE_SAMPLE else CATE_TEST_FILE + '.' + fmt
    ensemble_test = read_data(os.path.join(feature_store_path, CATE_TEST_FILE),
                              fmt)

    print(ensemble_train.info())
    print(ensemble_test.info())

    all_features = list(ensemble_train.columns.values)
    print("all original features")

    print(all_features)
    y = ensemble_train[y_label].values
Example #8
96/96 [==============================] - 15s - loss: 0.5074 - acc: 0.8854 - val_loss: 0.5017 - val_acc: 0.9048
Epoch 4/20
96/96 [==============================] - 16s - loss: 0.4007 - acc: 0.8854 - val_loss: 0.3971 - val_acc: 0.9048
Epoch 5/20
96/96 [==============================] - 15s - loss: 0.3400 - acc: 0.8958 - val_loss: 0.3234 - val_acc: 0.9286
Epoch 6/20
96/96 [==============================] - 15s - loss: 0.2773 - acc: 0.9167 - val_loss: 0.3014 - val_acc: 0.9286
Epoch 7/20
96/96 [==============================] - 15s - loss: 0.2409 - acc: 0.9167 - val_loss: 0.2914 - val_acc: 0.9286
Epoch 8/20
96/96 [==============================] - 15s - loss: 0.2181 - acc: 0.9375 - val_loss: 0.2629 - val_acc: 0.9286
'''

method = 'hks'
x_data, y_data = read_data(descriptor_dir='shrec11-kp',
                           method=method,
                           descriptor_rows=KP_DESCRIPTOR_ROWS,
                           descriptor_cols=KP_DESCRIPTOR_COLS)

(train_x, val_x, train_y, val_y) = split_data(x_data,
                                              y_data,
                                              split_percentage=0.7)

train_x = train_x.reshape((-1, KP_UNITS))
val_x = val_x.reshape((-1, KP_UNITS))

mlp_model = MLP(input_units=KP_UNITS,
                output_units=OUTPUT_UNITS,
                hidden_layers=(10000, ),
                activations=('relu', 'softmax'))

scores = mlp_model.train(train_x,
Example #9
                    '--format',
                    help='store pandas feature format, csv, pkl')

args = parser.parse_args()

if __name__ == '__main__':

    USE_SAMPLE = args.sample
    fmt = args.format if args.format else 'csv'
    feature_store_path = '../sample/features' if USE_SAMPLE else '../data/features'
    if not os.path.exists(feature_store_path):
        os.mkdir(feature_store_path)

    FACE_FEATURE_FILE = 'face_feature'
    FACE_FEATURE_FILE = FACE_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else FACE_FEATURE_FILE + '.' + fmt
    face_data = read_data(os.path.join(feature_store_path, FACE_FEATURE_FILE),
                          fmt)

    TEXT_FEATURE_FILE = 'text_feature'
    TEXT_FEATURE_FILE = TEXT_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else TEXT_FEATURE_FILE + '.' + fmt
    text_data = read_data(os.path.join(feature_store_path, TEXT_FEATURE_FILE),
                          fmt)

    TRAIN_USER_INTERACT = '../sample/train_interaction.txt' if USE_SAMPLE else '../data/train_interaction.txt'
    TEST_INTERACT = '../sample/test_interaction.txt' if USE_SAMPLE else '../data/test_interaction.txt'

    user_item_train = pd.read_csv(TRAIN_USER_INTERACT,
                                  sep='\t',
                                  header=None,
                                  names=[
                                      'user_id', 'photo_id', 'click', 'like',
                                      'follow', 'time', 'playing_time',
Example #10
                                  sep='\t',
                                  header=None,
                                  names=[
                                      'user_id', 'photo_id', 'click', 'like',
                                      'follow', 'time', 'playing_time',
                                      'duration_time'
                                  ])
    user_item_test = pd.read_csv(
        TEST_INTERACT,
        sep='\t',
        header=None,
        names=['user_id', 'photo_id', 'time', 'duration_time'])

    PHOTO_FEATURE_FILE = 'photo_feature'
    PHOTO_FEATURE_FILE = PHOTO_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else PHOTO_FEATURE_FILE + '.' + fmt
    photo_data = read_data(
        os.path.join(feature_store_path, PHOTO_FEATURE_FILE), fmt)

    USER_FEATURE_FILE = 'user_feature'
    USER_FEATURE_FILE = USER_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else USER_FEATURE_FILE + '.' + fmt
    users = read_data(os.path.join(feature_store_path, USER_FEATURE_FILE), fmt)

    user_item_train = pd.merge(user_item_train,
                               users,
                               how='inner',
                               on=['user_id'])

    user_item_train = pd.merge(user_item_train,
                               photo_data,
                               how='left',
                               on=['photo_id'])