Example #1
def data_processing(dataset_dir):
    """
		Data Processing
	"""
    train, validation, OO, NO, ON, NN, \
    pre_treatment, drug_fingerprint_info \
    = utils.data_processing(dataset_dir,
          parameters.train_data_path,
          parameters.validation_data_path,
          parameters.OO_data_path,
          parameters.NO_data_path,
          parameters.ON_data_path,
          parameters.NN_data_path,
          parameters.pre_treatment_data_path,
          parameters.drug_fingerprint_data_path
          )
    """
		Printing Data Statistics
	"""
    utils.print_statistics(len(train), len(validation), len(OO), len(NO),
                           len(ON), len(NN), len(pre_treatment),
                           len(drug_fingerprint_info))

    return train, validation, OO, NO, ON, NN, \
    pre_treatment, drug_fingerprint_info
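
The utils.print_statistics helper called above is not part of this listing; a minimal sketch of a helper compatible with that call, assuming it only prints the eight counts (the real utils.print_statistics may format or log things differently):

def print_statistics(n_train, n_validation, n_OO, n_NO, n_ON, n_NN,
                     n_pre_treatment, n_drug_fingerprints):
    """Print the sizes of every split and auxiliary table."""
    print(f'train: {n_train}  validation: {n_validation}')
    print(f'OO: {n_OO}  NO: {n_NO}  ON: {n_ON}  NN: {n_NN}')
    print(f'pre-treatment records: {n_pre_treatment}  '
          f'drug fingerprints: {n_drug_fingerprints}')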
Example #2
        len(weight_new.index),
        1,
    ], 1)[:, 0],
                     index=weight_new.index)
    unit[weight_new[weight_new < 0].index] *= -1.0
    stop_info = (df[stock].iloc[i - 1] -
                 max_new[stock]) / max_new[stock] * unit[stock]
    if len(stop_info[stop_info < -re]) != 0:
        weight_new[stop_info[stop_info < -re].index] = 0
        weight_new = weight_new / weight_new.sum()
        flag = 1
    return weight_new, flag


if __name__ == "__main__":
    df = ut.data_processing(file_address, asset_data_files)
    print(df.head())
    unit = np.full([
        len(df.index),
        1,
    ], 1)[:, 0]
    df['rebalancing'] = pd.Series()
    df['stoploss'] = pd.Series()
    df['nav'] = pd.Series(unit, index=df.index)
    weight_new = []
    max_new = []
    reb_index = 0
    for i in range(return_period, len(df.index)):
        if i < data_need:
            continue
        # record max price
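
The ut.data_processing call in the main block above, which builds the price DataFrame df, is not shown in this listing; a minimal sketch of a loader with the same call signature, assuming file_address is a directory and every entry of asset_data_files is a CSV with date and close columns (both column names are assumptions):

import os

import pandas as pd


def data_processing(file_address, asset_data_files):
    """Read one CSV per asset and join the close prices on a shared date index."""
    frames = []
    for file_name in asset_data_files:
        prices = pd.read_csv(os.path.join(file_address, file_name),
                             parse_dates=['date'], index_col='date')
        # keep one close-price column per asset, named after the file
        frames.append(prices['close'].rename(os.path.splitext(file_name)[0]))
    return pd.concat(frames, axis=1).sort_index().dropna()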
Example #3
        print(f'Loading checkpoint {args.checkpoint}')
        state_dict = torch.load(args.checkpoint, map_location=device)
        model.load_state_dict(state_dict)

    # load dataset
    data_df = pd.read_csv(args.data_path)
    data_folder = os.path.dirname(args.data_path)

    dataset = AudioDataset(data_folder, data_df)

    print('Setup loaders')
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    loader = DataLoader(dataset=dataset,
                        batch_size=hparams['batch_size'],
                        shuffle=False,
                        collate_fn=lambda x: data_processing(
                            x, text_transform, audio_transforms),
                        **kwargs)

    blank_id = len(text_transform)
    preds = []

    print('Making prediction')
    data_len = len(loader)
    for i, batch in enumerate(loader):
        print(f'{i}/{data_len}')
        spectrograms, labels, input_lengths, label_lengths = batch
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        output = model(spectrograms)  # (batch, time, n_class)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1)  # (time, batch, n_class)
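
This example (and Example #4 below) hands a data_processing collate function to the DataLoader without showing it; a minimal sketch of a collate function matching the four-tuple unpacked above, assuming each dataset item is a (waveform, transcript) pair and that text_transform exposes a text_to_int method (both assumptions):

import torch
import torch.nn as nn


def data_processing(data, text_transform, audio_transforms):
    """Collate (waveform, transcript) pairs into padded spectrogram and label batches."""
    spectrograms, labels, input_lengths, label_lengths = [], [], [], []
    for waveform, transcript in data:
        spec = audio_transforms(waveform).squeeze(0).transpose(0, 1)  # (time, n_mels)
        label = torch.tensor(text_transform.text_to_int(transcript), dtype=torch.long)
        spectrograms.append(spec)
        labels.append(label)
        input_lengths.append(spec.shape[0] // 2)  # assumes the model halves the time axis
        label_lengths.append(len(label))
    # pad to the longest item and reshape to (batch, 1, n_mels, time) for a CNN front end
    spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
    spectrograms = spectrograms.unsqueeze(1).transpose(2, 3)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)
    return spectrograms, labels, input_lengths, label_lengths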
Example #4
                     config['stride'], config['dropout']).to(device)

    # load dataset
    data_df = pd.read_csv(os.path.join(config['data_path'], 'train.csv'))
    train_df, val_df = train_test_split(data_df,
                                        test_size=config['val_fraction'])

    train_dataset = AudioDataset(config['data_path'], train_df)
    val_dataset = AudioDataset(config['data_path'], val_df)

    print('Setup loaders')
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              collate_fn=lambda x: data_processing(
                                  x, text_transform, train_audio_transforms),
                              **kwargs)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=config['batch_size'],
                            shuffle=False,
                            collate_fn=lambda x: data_processing(
                                x, text_transform, valid_audio_transforms),
                            **kwargs)

    optimizer = optim.AdamW(model.parameters(), config['learning_rate'])
    blank_id = len(text_transform)
    criterion = nn.CTCLoss(blank=blank_id).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer,
                                              max_lr=config['learning_rate'],
                                              steps_per_epoch=int(
                                                  len(train_loader)),
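
The listing is cut off before the training loop that this optimizer, CTC criterion and OneCycleLR schedule feed into; a minimal sketch of such a loop under the same batch layout as Example #3 (the loop and the helper name train_epoch are assumptions, not part of the original code):

import torch.nn.functional as F


def train_epoch(model, loader, criterion, optimizer, scheduler, device):
    """Run one epoch of CTC training over batches produced by data_processing."""
    model.train()
    for spectrograms, labels, input_lengths, label_lengths in loader:
        spectrograms, labels = spectrograms.to(device), labels.to(device)

        optimizer.zero_grad()
        output = model(spectrograms)       # (batch, time, n_class)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1)    # (time, batch, n_class), as CTCLoss expects

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()
        optimizer.step()
        scheduler.step()                   # OneCycleLR is stepped once per batch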
Example #5
def model_train():
    input_shape = 60

    input_data = load_data()
    data_processing()
    with open(CONSTANTS[1], 'rb') as f:
        word_dictionary = pickle.load(f)
    with open(CONSTANTS[2], 'rb') as f:
        inverse_word_dictionary = pickle.load(f)
    with open(CONSTANTS[3], 'rb') as f:
        label_dictionary = pickle.load(f)
    with open(CONSTANTS[4], 'rb') as f:
        output_dictionary = pickle.load(f)
    vocab_size = len(word_dictionary.keys())
    label_size = len(label_dictionary.keys())

    # Group the token-level rows into (word, pos, label) triples per sentence
    aggregate_function = lambda frame: [
        (word, pos, label)
        for word, pos, label in zip(frame['word'].values.tolist(),
                                    frame['pos'].values.tolist(),
                                    frame['tag'].values.tolist())
    ]

    grouped_input_data = input_data.groupby('sent_no').apply(
        aggregate_function)
    sentences = [sentence for sentence in grouped_input_data]

    x = [[word_dictionary[word[0]] for word in sent] for sent in sentences]
    x = pad_sequences(maxlen=input_shape, sequences=x, padding='post', value=0)
    y = [[label_dictionary[word[2]] for word in sent] for sent in sentences]
    y = pad_sequences(maxlen=input_shape, sequences=y, padding='post', value=0)
    y = [
        np_utils.to_categorical(label, num_classes=label_size + 1)
        for label in y
    ]

    train_end = int(len(x) * 0.9)
    train_x, train_y = x[0:train_end], np.array(y[0:train_end])
    test_x, test_y = x[train_end:], np.array(y[train_end:])

    # Hyperparameters
    activation = 'selu'
    out_act = 'softmax'
    n_units = 100
    batch_size = 32
    epochs = 10
    output_dim = 20

    # Model training
    lstm_model = create_Bi_LSTM(vocab_size, label_size, input_shape,
                                output_dim, n_units, out_act, activation)
    lstm_model.fit(train_x,
                   train_y,
                   epochs=epochs,
                   batch_size=batch_size,
                   verbose=1)

    # Save the model
    model_save_path = CONSTANTS[0]
    lstm_model.save(model_save_path)

    # Evaluate sentence by sentence on the test split
    N = test_x.shape[0]
    avg_accuracy = 0
    for start, end in zip(range(0, N, 1), range(1, N + 1, 1)):
        sentence = [
            inverse_word_dictionary[i] for i in test_x[start] if i != 0
        ]
        y_predict = lstm_model.predict(test_x[start:end])
        input_sequences, output_sequences = [], []
        for i in range(0, len(y_predict[0])):
            output_sequences.append(np.argmax(y_predict[0][i]))
            input_sequences.append(np.argmax(test_y[start][i]))

        scores = lstm_model.evaluate(test_x[start:end], test_y[start:end])
        print('Test Accuracy: loss = %0.6f accuracy = %0.2f%%' %
              (scores[0], scores[1] * 100))
        avg_accuracy += scores[1]
        output_sequences = ' '.join([
            output_dictionary[key] for key in output_sequences if key != 0
        ]).split()
        input_sequences = ' '.join([
            output_dictionary[key] for key in input_sequences if key != 0
        ]).split()
        output_input_comparison = pd.DataFrame(
            [sentence, output_sequences, input_sequences]).T
        print(output_input_comparison.dropna())
        print('\n\n')

    avg_accuracy /= N
    print("Model accuracy: %.2f%%." % (avg_accuracy * 100))
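
load_data() and data_processing() are called at the top of this example but not shown; a minimal sketch of a data_processing step that could produce the four dictionaries pickled at CONSTANTS[1] to CONSTANTS[4], assuming the corpus DataFrame has word and tag columns as the grouping code above implies (everything else is an assumption):

import pickle


def data_processing():
    """Build word and label lookup tables from the corpus and pickle them."""
    corpus = load_data()  # DataFrame with at least 'word' and 'tag' columns

    words = sorted(set(corpus['word'].values.tolist()))
    labels = sorted(set(corpus['tag'].values.tolist()))

    # index 0 is reserved for padding, so real ids start at 1
    word_dictionary = {word: i + 1 for i, word in enumerate(words)}
    inverse_word_dictionary = {i + 1: word for i, word in enumerate(words)}
    label_dictionary = {label: i + 1 for i, label in enumerate(labels)}
    output_dictionary = {i + 1: label for i, label in enumerate(labels)}

    tables = [word_dictionary, inverse_word_dictionary,
              label_dictionary, output_dictionary]
    for path, table in zip(CONSTANTS[1:5], tables):
        with open(path, 'wb') as f:
            pickle.dump(table, f)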