Example #1
# Imports assumed by this example. Policy, draw_train_episode, draw_eval_episode
# and logging are project-specific helpers defined elsewhere in the repository.
import time

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
def test(config):
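    """Roll a trained policy through held-out test episodes and write bid/ask/action rows to a CSV."""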
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    policy = Policy()
    # map_location=device covers both the CPU-only and the CUDA case.
    policy.load_state_dict(torch.load(config.model_path, map_location=device))
    policy.to(device)
    policy.eval()  # inference mode (disables dropout)

    d = {'bid_price': [0], 'ask_price': [0], 'action': [0]}
    df = pd.DataFrame(data=d)
    rewards_over_time = []

    with torch.no_grad():
        accumulative_reward_test = 0
        print_reward = []
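        # Each test episode draws one evaluation window and steps the policy through it.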
        for j in range(config.num_of_test):
            current_reward = 0
            ask = np.zeros((1, 1))
            bid = np.zeros((1, 1))
            previous_action = torch.tensor([0.0]).to(device)
            # Redraw until the episode is long enough to cover the full timespan.
            while ask.shape[0] <= config.timespan and bid.shape[0] <= 3600:
                target_bid, target_ask, feature_span = draw_eval_episode(
                    config.week_num, config.lag, config.currency,
                    config.min_history, j, config.offset)
                bid, ask = target_bid * 1e3, target_ask * 1e3
            for t in range(config.timespan):  # Don't infinite loop while learning
                state = feature_span[t]
                save_action = policy(state.float(),
                                     0.1 * previous_action).to(device)

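                # Force a flat position on the episode's last step; `action` below is the change in position.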
                if t == config.timespan - 1:
                    save_action = 0
                action = save_action - previous_action

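                # Buys (action > 0) fill at the ask, sells at the bid; reward is the negative cash flow.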
                price = 0
                if action > 0:
                    price = ask[t]
                elif action < 0:
                    price = bid[t]
                reward = torch.sum(torch.tensor(-1.).float() * action *
                                   price).to(device)
                accumulative_reward_test += reward
                current_reward += reward

                d = {
                    'bid_price': [bid[t]],
                    'ask_price': [ask[t]],
                    'action': [action.item()]
                }
                temp_df = pd.DataFrame(data=d)
                # DataFrame.append was removed in pandas 2.0; concatenate instead.
                df = pd.concat([df, temp_df], ignore_index=True)
                previous_action = save_action
            print_reward.append(current_reward)
            print("episode_reward", current_reward)
        print("Testing on {} datapoint and return is {}".format(
            config.num_of_test, accumulative_reward_test))
        rewards_over_time.append(accumulative_reward_test)

    # Save the CSV file; the currency pair is sliced out of the fixed-length model filename.
    currency_pair = config.model_path[-27:-21]

    saved_path = 'deep/result/' + currency_pair + '_week_' + str(
        config.week_num) + '.csv'
    # saved_path = currency_pair + '_train.csv'
    print('Saving the csv file ...')
    df.to_csv(saved_path, index=False)


def train_eval(config):
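    """Train the policy episode by episode, evaluate after each epoch, and checkpoint the best model."""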
    # TODO: change the optimizer, add dropout, use the new data from Iris.
    # start = time.time()  # starting time (profiling)
    # NOTE: `device` and `policy` appear to be module-level globals in the original;
    # they are defined here so this example is self-contained.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    policy = Policy().to(device)
    optimizer = optim.SGD(policy.parameters(), lr=config.init_lr)
    # eps = np.finfo(np.float32).eps.item()
    rewards_over_time = []

    NUM_OF_EVAL_DATA = config.num_of_eval

    _time = time.strftime("%Y%m%d-%H%M%S")
    PATH = './deep/best_model_' + config.currency + '_week' + str(
        config.week_num) + '_' + _time + '_dropout.pth'
    log_path = './deep/log_' + config.currency + '_week' + str(
        config.week_num) + '_' + _time + '_dropout.txt'

    best_accumulative_return = float('-inf')  # so the first evaluation always checkpoints
    #load_model_and_overhead = time.time() - start
    #print ('load model and overhead {}'.format(load_model_and_overhead))
    for epoch in range(config.num_of_epoch):
        for i_episode in range(config.num_of_episode):
            #start_episode = time.time()
            ask = torch.zeros((1, 1)).to(device)
            bid = torch.zeros((1, 1)).to(device)
            previous_action = torch.tensor([0.0]).to(device)
            while (ask.shape[0] <= config.timespan
                   and bid.shape[0] <= config.timespan):
                target_bid, target_ask, feature_span = draw_train_episode(
                    config.week_num, config.lag, config.currency,
                    config.min_history)
                bid, ask = target_bid * 1e3, target_ask * 1e3
            #finish_draw = time.time()
            #print('Time to draw features is {}'.format(finish_draw - start_episode))
            for t in range(config.timespan):  # Don't infinite loop while learning
                state = feature_span[t]
                save_action = policy(state.float(),
                                     0.1 * previous_action).to(device)

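                # Force a flat position on the last training step of the episode.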
                if t == config.timespan - 1:
                    save_action = 0

                action = save_action - previous_action

                price = 0
                #print('The action is', action, type(action))
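                # Execution price: buys fill at the ask, sells at the bid.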
                if action > 0:
                    price = ask[t]
                elif action < 0:
                    price = bid[t]
                reward = torch.sum(torch.tensor(-1.).float() * action *
                                   price).to(device)

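                # Accumulate the differentiable per-step reward; it feeds the loss directly.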
                policy.rewards += reward

                previous_action = save_action
            #after_an_hour = time.time()
            #print ('after an hour of training is {}'.format(after_an_hour - finish_draw))
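            # Gradient step: minimize the negative average per-step reward.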
            optimizer.zero_grad()
            loss = -policy.rewards / config.timespan
            loss.backward(retain_graph=True)
            optimizer.step()
            #print('Time used to backprop {}'.format(time.time() - after_an_hour))
            if i_episode % 10 == 0:
                to_log = 'Epoch: {} Episode: {} Training loss: {}'.format(
                    epoch, i_episode, loss.item())
                logging(to_log, log_path)
            policy.rewards = 0

        # Evaluate after each epoch (config.num_of_episode training episodes).
        policy.eval()

        with torch.no_grad():
            accumulative_reward_test = 0
            for j in range(NUM_OF_EVAL_DATA):
                current_reward = 0
                ask = np.zeros((1, 1))
                bid = np.zeros((1, 1))
                previous_action = torch.tensor([0.0]).to(device)
                while (ask.shape[0] <= config.timespan
                       and bid.shape[0] <= config.timespan):
                    target_bid, target_ask, feature_span = draw_eval_episode(
                        config.week_num, config.lag, config.currency,
                        config.min_history, j, config.offset)
                    bid, ask = target_bid * 1e3, target_ask * 1e3
                for t in range(config.timespan):  # Don't infinite loop while learning
                    state = feature_span[t]
                    save_action = policy(state.float(), 0.1 * previous_action)

                    if t == config.timespan - 1:
                        save_action = 0
                    action = save_action - previous_action

                    price = 0
                    if action > 0:
                        price = ask[t]
                    elif action < 0:
                        price = bid[t]
                    reward = torch.sum(
                        torch.tensor(-1.).float() * action * price).to(device)
                    accumulative_reward_test += reward
                    current_reward += reward
                    previous_action = save_action
            to_log = "Evaluating on {} datapoint and return is {}".format(
                NUM_OF_EVAL_DATA, accumulative_reward_test)
            logging(to_log, log_path)

            rewards_over_time.append(accumulative_reward_test)

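            # Checkpoint whenever the mean evaluation return improves.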
            if (accumulative_reward_test * 1.0 / NUM_OF_EVAL_DATA >
                    best_accumulative_return):
                torch.save(policy.state_dict(), PATH)
                best_accumulative_return = accumulative_reward_test * 1.0 / NUM_OF_EVAL_DATA
        logging("=======================================================",
                log_path)

        policy.train()

    with open(config.reward_file, 'w') as filehandle:
        for listitem in rewards_over_time:
            # Rewards accumulate as torch scalars; write them out as plain floats.
            filehandle.write('{}\n'.format(float(listitem)))