Code Example #1
    def step(self):
        if len(self.clauses) == 0:
            return torch.tensor(0, dtype=torch.float32)

        binding_dict, new_state = self.interp.state_query(
            self.state, self.clauses[-1])
        self.state = new_state

        # update the success and possible fields
        self.check_possible(binding_dict)

        if not self.possible:
            self.obj_poss_left.append(0)
        elif not "var_0" in binding_dict.keys():
            self.obj_poss_left.append(self.obj_nums)
            self.update_data(binding_dict)
        else:
            self.obj_poss_left.append(len(binding_dict["var_0"]))
            self.update_data(binding_dict)

        self.check_success(binding_dict)

        # TODO: the reward function needs to be updated
        reward = get_reward(self)

        # print(reward)
        return torch.tensor(reward, dtype=torch.float32)
Code Example #2
    def step(self):
        if len(self.clauses) == 0:
            return torch.tensor(0, dtype=torch.float32)

        # binding_dict = query(self.graph.scene, self.clauses, self.config)
        next_clause = self.clauses[-1]
        useful = self.interp.useful_check(self.interp_state, next_clause)
        logging.info(f"useful: {useful}")

        binding_dict, new_state = self.interp.state_query(
            self.interp_state, next_clause)
        self.interp_state = new_state

        # update the success and possible fields
        self.check_possible(binding_dict)

        if not self.possible:
            self.obj_poss_left.append(0)
        elif not "var_0" in binding_dict.keys():
            self.obj_poss_left.append(self.obj_nums)
        else:
            self.obj_poss_left.append(len(binding_dict["var_0"]))
        self.check_success(binding_dict)

        done = self.success or (not self.possible)

        if not useful:
            reward = torch.tensor(-1, dtype=torch.float32)
        else:
            reward = torch.tensor(get_reward(self), dtype=torch.float32)

        return reward
Code Example #3
    def step(self, action_idx):

        is_uncertain = self.is_uncertain
        next_clause = self.actions[action_idx]

        if self.ref_flag:
            for element in next_clause:
                if isinstance(element, int):
                    if element not in self.ref:
                        self.ref.append(element)

            self.unreachable = self.unreachable_dict[str(sorted(self.ref))]

        # if 'red' in next_clause or 'blue' in next_clause:
        #     print('here')

        self.idx_selected.append(action_idx)
        self.clauses.append(next_clause)

        useful = self.interp.useful_check(self.state, next_clause)
        logging.info(f"useful: {useful}")
        binding_dict, new_state = self.interp.state_query(
            self.state, next_clause)
        self.state = new_state

        # update the success and possible fields
        self.check_possible(binding_dict)

        if not self.possible:
            self.obj_poss_left.append(0)
        elif not "var_0" in binding_dict.keys():
            self.obj_poss_left.append(self.obj_nums)
            self.update_data(binding_dict)
        else:
            self.obj_poss_left.append(len(binding_dict["var_0"]))
            self.update_data(binding_dict)

        # update whether done or not
        self.check_success(binding_dict)
        done = self.success or (not self.possible)

        if not useful:
            reward = torch.tensor(-1, dtype=torch.float32)
        else:
            reward = torch.tensor(get_reward(self), dtype=torch.float32)

        self.update_data(binding_dict)
        info = {}

        logging.info(f"selected: {self.idx_selected}")
        logging.info(f"done: {done}")
        logging.info(f"success: {self.success}")

        if self.ref_flag:
            state = self.get_state(self.unreachable)
        else:
            state = self.get_state()

        return state, reward, done, info
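All three step() variants above delegate the scalar reward to an external get_reward(); its implementation is not shown on this page. Purely as an illustration of how the fields it can see (success, possible, obj_poss_left, obj_nums) might be turned into a shaped reward, a minimal hypothetical version could look like the sketch below; the actual project may use a different scheme.

def get_reward(env):
    # Hypothetical shaping, for illustration only.
    if env.success:                      # the clauses uniquely identify the target
        return 1.0
    if not env.possible:                 # no binding can satisfy the clauses any more
        return -1.0
    if len(env.obj_poss_left) < 2:       # not enough history yet to measure progress
        return 0.0
    # small bonus proportional to how many candidate objects the last clause removed
    eliminated = env.obj_poss_left[-2] - env.obj_poss_left[-1]
    return 0.1 * eliminated / max(env.obj_nums, 1)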
Code Example #4
File: mdp.py  Project: MonaHe123/coding-exercise
def compute_q(MDP, V, s, a):
    S, A, R, P, gamma = MDP
    # sum over the possible successor states after taking action a in state s
    q_sa = 0.0
    for s_prime in S:
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
    q_sa = get_reward(R, s, a) + gamma * q_sa
    return q_sa
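This compute_q (and the variants in the later examples) performs a one-step lookahead over successor states. With get_prob(P, s, a, s') returning the transition probability, get_value(V, s') the state value, and get_reward(R, s, a) the expected immediate reward, the quantity being computed is

    q_{\pi}(s, a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \, v_{\pi}(s')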
Code Example #5
    def _expansion_simulation(self, leaf_id, win_index):
        leaf_board = self.tree[leaf_id]['board']
        current_player = self.tree[leaf_id]['player']

        if win_index == 0:
            # expansion
            actions = utils.valid_actions(leaf_board)

            for action in actions:
                action_index = action[1]
                child_id = leaf_id + (action_index, )
                child_board = utils.get_board(child_id, self.board_size)
                next_turn = utils.get_turn(child_id)

                self.tree[child_id] = {
                    'board': child_board,
                    'player': next_turn,
                    'parent': leaf_id,
                    'child': [],
                    'n': 0.,
                    'w': 0.,
                    'q': 0.
                }

                self.tree[leaf_id]['child'].append(action_index)

            if self.tree[leaf_id]['parent']:
                # simulation
                board_sim = leaf_board.copy()
                turn_sim = current_player

                while True:
                    actions_sim = utils.valid_actions(board_sim)
                    action_sim = actions_sim[np.random.choice(
                        len(actions_sim))]
                    coord_sim = action_sim[0]

                    if turn_sim == 0:
                        board_sim[coord_sim] = 1
                    else:
                        board_sim[coord_sim] = -1

                    win_idx_sim = utils.check_win(board_sim, self.win_mark)

                    if win_idx_sim == 0:
                        turn_sim = abs(turn_sim - 1)

                    else:
                        reward = utils.get_reward(win_idx_sim, leaf_id)
                        return reward
            else:
                # the root node is not simulated
                reward = 0.
                return reward
        else:
            # terminal nodes are not expanded
            reward = 1.
            return reward
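Each expanded child stores the usual MCTS statistics: visit count n, accumulated value w, and mean value q. The selection phase is not shown in this snippet, but with these fields a backup would typically maintain q = w / n, and a standard UCT-style selection rule (an assumption about this project, not something visible in the code) would score a child as

    \mathrm{UCT}(\text{child}) = \frac{w}{n} + c \sqrt{\frac{\ln n_{\text{parent}}}{n}}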
Code Example #6
    def get_her_transitions(self, stnd_replay):
        '''
        Params:
        @ stnd_replay : the base transitions that actually occurred
        '''
        new_transitions = []
        for i in range(len(stnd_replay)):
            try:
                samples = random.sample(stnd_replay[i + 1:], self.k)
            except ValueError:
                # random.sample raises ValueError when fewer than k
                # transitions remain; fall back to everything remaining
                samples = stnd_replay[i + 1:]

            for sample in samples:
                new_goal = np.asarray(sample[3][:3])

                for transition in stnd_replay[i:]:
                    # If the current transition reaches the new goal state,
                    # set the reward to 0.0 and break out of the loop;
                    # otherwise create a new transition with the new goal
                    # and add it to the trajectory.
                    new_state = transition[0][:-3]
                    new_next_state = transition[3][:-3]

                    if (np.all(new_next_state[:3] == new_goal)):
                        new_reward = 0.0
                        break
                    else:
                        new_reward = get_reward(self.env, new_next_state[:3],
                                                new_goal)

                    ## normalize values before concatenating
                    new_goal = normalizer(new_goal)
                    new_state = normalizer(new_state, 5.0)
                    new_next_state = normalizer(new_next_state, 5.0)
                    action = normalizer(transition[1], 5.0)

                    new_state = np.concatenate((new_state, new_goal), axis=0)
                    new_next_state = np.concatenate((new_next_state, new_goal),
                                                    axis=0)

                    new_transition = [
                        new_state, action, new_reward, new_next_state
                    ]

                    new_transitions.append(new_transition)

        return (new_transitions)
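The slicing above implies a particular replay layout: each entry of stnd_replay is [state, action, reward, next_state], the last three entries of a state vector carry the goal, and the first three entries of the goal-stripped next state are the achieved position. A hypothetical call, assuming an agent instance of the surrounding class with k and env already set, might look like:

import numpy as np

# hypothetical 9-D states: 6 observation dims followed by a 3-D goal (an assumption)
state, next_state = np.zeros(9), np.ones(9)
action, reward = np.zeros(4), -1.0
stnd_replay = [[state, action, reward, next_state]]

her_transitions = agent.get_her_transitions(stnd_replay)  # agent is hypothetical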
Code Example #7
def compute_q(MDP, V, s, a):
    '''Compute the action value q(s, a) of the state-action pair (s, a)
    for the given MDP and value function V.
    Formula 2.16
    '''
    S, A, R, P, gamma = MDP
    q_sa = 0
    for s_prime in S:
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)

    q_sa = get_reward(R, s, a) + gamma * q_sa
    return q_sa
Code Example #8
def compute_q(MDP, V, s, a):
    '''
    Compute the action value q(s, a) of the state-action pair (s, a) for the
    given MDP and value function V, following the formula
    $$ q_{\pi}(s, a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \, v_{\pi}(s') $$
    '''
    S, A, R, P, gamma = MDP
    q_sa = 0
    for s_prime in S:
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
    # the immediate reward and the discount are applied once, outside the sum
    q_sa = get_reward(R, s, a) + gamma * q_sa
    return q_sa
Code Example #9
File: test.py  Project: 15779235038/algorithm
def compute_q(MDP, V, s, a):
    '''Compute the action value q(s, a) of the state-action pair (s, a)
    for the given MDP and value function V.
    '''
    S, A, R, P, gamma = MDP
    q_sa = 0

    print('For the current action ' + str(a) +
          ', compute its action value by summing over all successor states')
    for s_prime in S:
        print('From state ' + str(s) + ', action ' + str(a) +
              ': transition probability ' + str(get_prob(P, s, a, s_prime)) +
              ' and successor value ' + str(get_value(V, s_prime)))
        q_sa += get_prob(P, s, a, s_prime) * get_value(V, s_prime)
        print('State ' + str(s) + ', action ' + str(a) +
              ': running sum ' + str(q_sa))
    q_sa = get_reward(R, s, a) + gamma * q_sa

    print('The computed action value is ' + str(q_sa))
    return q_sa
Code Example #10
File: main.py  Project: TheButlah/ProjectMARS
def main():
    np.random.seed(seed)

    # Initialize the model architecture
    model = QMap((dx, dy, n_features),
                 n_actions,
                 seed=seed,
                 load_model=save_path)

    game = pm.Game(dx=dx,
                   dy=dy,
                   number_of_turns=episode_length,
                   default_capacity=25,
                   servable_distance=3.0,
                   initial_cost=50.0,
                   operating_cost=25.0,
                   profit_margin=5.0,
                   unserviced_penalty=1.0)

    display = pm.GameDisplay(game, box_size, pop_scale)

    # The initial game state

    total_return = 0
    prev_serviced = [0]  # List for easy mutable argument
    for step in range(episode_length):
        s = get_state(game)
        assert (s.shape[1] == dx and s.shape[2] == dy
                and s.shape[3] == n_features)

        display.update()

        # predicted action-value and action taken
        q, a = [np.squeeze(item) for item in model.predict_q(s)]

        take_action(game, a)  # actually take the selected action

        r = get_reward(game, prev_serviced)  # See what the reward is
        total_return += r

        print("Q-Value: %f, Action selected: %s, Reward: %d" % (q, str(a), r))
Code Example #11
def main(lamb, R):
    #read in file
    tmp = []
    with open("../session.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    data = np.asarray(tmp)
    tmp = []

    with open("../user_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    user = np.asarray(tmp)
    tmp = []

    with open("../app_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    app = np.asarray(tmp)

    #initialization
    pool_size = app.shape[0]
    #user_row_n = user.shape[1]

    #feature length
    app_row_n = app.shape[1]
    d = app_row_n - 1

    session_n = int(max(data[:, 0]))
    train_ratio = 0.5
    gamma = 1
    # lamb = 0.1
    theta = np.zeros(d)
    beta = 1
    delta = 0.9
    V = lamb * np.eye(d)
    X = np.zeros((1, d), dtype=np.float)
    Y = np.zeros(1)
    x_feature = np.zeros((pool_size, d), dtype=np.float)
    UCB = np.zeros(pool_size, dtype=np.float)
    session = np.zeros(session_n, dtype=np.float)
    K = 5
    click_n = 0

    idex = np.random.permutation(session_n)
    tr_idx = idex[:int(round(session_n * train_ratio))]
    ts_idx = idex[int(round(session_n * train_ratio)):]

    # open file for storing log info
    cur_time = strftime("%Y%m%d_", localtime())
    logFileName = '../LogFile/main_newtr_nouser' + cur_time + '.txt'
    logFile = open(logFileName, 'a+')
    print >> logFile, '\n\n'
    print >> logFile, '=' * 50
    print >> logFile, 'experiment parameters: lambda=%f, R=%f' % (lamb, R)
    print >> logFile, '\n\n'

    #app = app[:,1:]
    expl = np.zeros(pool_size)
    #train
    for i in range(tr_idx.shape[0]):
        record = np.zeros(1)
        #user_feature = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == tr_idx[i])]
            #user_feature = user[np.where(user[:, 0] == record[0, 0])][0,1:]
        except IndexError:
            continue
        else:
            # app_dict = {}
            # val = []
            # for r in range(record.shape[0]):
            # if record[r,1] not in app_dict:
            # app_dict[record[r,1]] = record[r,2]
            # else if record[r,1] == 11 or record[r,1] == 10:
            # app_dict[record[r,1]] = record[r,2]
            # for a in app_dict:
            # app_feature = app[np.where(app[:,0] == a)][0,1:]
            # x_feature[record[r,1]] = np.outer(user_feature, app_feature).reshape(1, d)
            # x_feature[a] = np.divide(x_feature[a], np.linalg.norm(x_feature[a]))
            # if app_dict[a]==61:
            # val.append(0)
            # else:
            # val.append(1)
            # val.np.asarray(val)
            # x_t = x_feature
            # w =np.array(val.reshape(val.shape[1],1))
            # [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)

            val = []
            idx = []
            for a in range(pool_size):
                #get all apps from the session data of one user
                if app[a, 0] in record[:, 1]:
                    idx.append(a)
                    x_feature[a] = app[a, 1:]
                    x_feature[a] = np.divide(x_feature[a],
                                             np.linalg.norm(x_feature[a]))
                    #get the user's feedback to one app
                    feedback = record[np.where(record[:, 1] == app[a, 0])][:,
                                                                           2]
                    if 10 in feedback or 11 in feedback:
                        val.append(1)
                    else:
                        val.append(0)
            val = np.asarray(val)
            print val
            idx = np.asarray(idx)
            x_t = x_feature[idx, :]
            w = np.array(val.reshape(val.shape[0], 1))
            # print w
            [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb,
                                                       delta, R)
    #test
    cnt = 0
    result = [0]
    while cnt < 4000:
        i = random.randint(0, ts_idx.shape[0] - 1)
        #for i in range(ts_idx.shape[0]):
        record = np.zeros(1)
        #u = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == ts_idx[i])]
            #u = user[np.where(user[:, 0] == record[0, 0])][0,1:]
        except IndexError:
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = app[a, 1:]
                x_feature[a] = np.divide(x_feature[a],
                                         np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                # print "this is app "+str(a)+" UCB is " + str(UCB[a])
            action = UCB.argsort()[-K:][::-1]
            for ii in range(K):
                if expl[action[ii]] == 0:
                    expl[action[ii]] = 1
            # print expl
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        cnt = cnt + 1
                        idx.append(j)
                        val.append(reward[j])
                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                # print w
                [V, X, Y, theta,
                 beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                click_n = click_n + utils.get_reward(w)
                result.append((float(click_n) / cnt))
            else:
                continue

    print >> logFile, "the reward is: %s" % result
    print >> logFile, "cnt is: %s" % cnt
    expl_n = sum(expl)
    expl_n_rate = float(expl_n) / pool_size
    print >> logFile, "expl_n is %s" % str(expl_n)
    print >> logFile, "expl_rate is %s" % str(expl_n_rate)

    logFile.close()
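Both this example and the next rank the arm pool by a linear UCB index and play the top K. The helper utils.getUCB(theta, x_feature[a], beta, V) is not shown here; given its arguments, a standard LinUCB/C3-UCB style score consistent with this call would be the one below, though whether getUCB uses exactly this form is an assumption.

    \mathrm{UCB}(a) = \theta^{\top} x_a + \beta \sqrt{x_a^{\top} V^{-1} x_a}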
Code Example #12
def main(lamb, R):
    #read in file
    tmp = []
    with open("../session.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    data = np.asarray(tmp)
    tmp = []

    with open("../user_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    user = np.asarray(tmp)
    tmp = []

    with open("../app_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    app = np.asarray(tmp)

    #initialization
    pool_size = app.shape[0]
    user_row_n = user.shape[1]
    app_row_n = app.shape[1]
    d = int((user_row_n - 1) * (app_row_n - 1))
    session_n = int(max(data[:, 0]))
    train_ratio = 0.7
    gamma = 1
    # lamb = 0.1
    theta = np.zeros(d)
    beta = 1
    delta = 0.9
    V = lamb * np.eye(d)
    X = np.zeros((1, d), dtype=np.float)
    Y = np.zeros(1)
    x_feature = np.zeros((pool_size, d), dtype=np.float)
    UCB = np.zeros(pool_size, dtype=np.float)
    session = np.zeros(session_n, dtype=np.float)
    K = 5
    click_n = 0

    idex = np.random.permutation(session_n)
    tr_idx = idex[:int(round(session_n * train_ratio))]
    ts_idx = idex[int(round(session_n * train_ratio)):]

    # open file for storing log info
    cur_time = strftime("%Y%m%d_", localtime())
    logFileName = '../LogFile/main_' + cur_time + '.txt'
    logFile = open(logFileName, 'a+')
    print >> logFile, '\n\n'
    print >> logFile, '=' * 50
    print >> logFile, 'experiment parameters: lambda=%f, R=%f' % (lamb, R)
    print >> logFile, '\n\n'

    #train
    app = app[:, 1:]
    expl = np.zeros(pool_size)
    for i in range(tr_idx.shape[0]):
        record = np.zeros(1)
        u = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == tr_idx[i])]
            u = user[np.where(user[:, 0] == record[0, 0])][0, 1:]
        except IndexError:
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = np.outer(u, app[a, :]).reshape(1, d)
                x_feature[a] = np.divide(x_feature[a],
                                         np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                # print "this is app "+str(a)+" UCB is " + str(UCB[a])
            action = UCB.argsort()[-K:][::-1]
            for ii in range(K):
                if expl[action[ii]] == 0:
                    expl[action[ii]] = 1
            #print expl
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        idx.append(j)
                        val.append(reward[j])

                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                # print w
                [V, X, Y, theta,
                 beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)

            else:
                continue
    #test
    cnt = 0
    result = [0]
    for i in range(ts_idx.shape[0]):
        record = np.zeros(1)
        u = np.zeros(1)
        try:
            record = data[np.where(data[:, 0] == ts_idx[i])]
            u = user[np.where(user[:, 0] == record[0, 0])][0, 1:]
        except IndexError:
            continue
        else:
            for a in range(pool_size):
                x_feature[a] = np.outer(u, app[a, :]).reshape(1, d)
                x_feature[a] = np.divide(x_feature[a],
                                         np.linalg.norm(x_feature[a]))
                UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                # print "this is app "+str(a)+" UCB is " + str(UCB[a])
            action = UCB.argsort()[-K:][::-1]
            for ii in range(K):
                if expl[action[ii]] == 0:
                    expl[action[ii]] = 1
            # print expl
            reward = match_app.match(record, action)
            idx = []
            val = []
            if reward is not None:
                for j in reward:
                    if reward[j] == 1 or reward[j] == 0:
                        cnt = cnt + 1
                        idx.append(j)
                        val.append(reward[j])
                idx = np.asarray(idx)
                val = np.asarray(val)
                x_t = x_feature[idx, :]
                w = np.array(val.reshape(x_t.shape[0], 1))
                # print w
                [V, X, Y, theta,
                 beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                click_n = click_n + utils.get_reward(w)
                result.append((float(click_n) / cnt))
            else:
                continue

    print >> logFile, "the reward is: %s" % result
    print >> logFile, "cnt is: %s" % cnt
    expl_n = sum(expl)
    expl_n_rate = float(expl_n) / pool_size
    print >> logFile, "expl_n is %s" % str(expl_n)
    print >> logFile, "expl_rate is %s" % str(expl_n_rate)

    logFile.close()
Code Example #13
File: main.py  Project: AppRec/c3ucb
        continue
    else:
        for a in range(pool_size):
            x_feature[a] = np.outer(u, app[a, :]).reshape(1, d)
            x_feature[a] = np.divide(x_feature[a],
                                     np.linalg.norm(x_feature[a]))
            UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
        action = UCB.argsort()[-K:][::-1]
        reward = match_app.match(record, action)
        idx = []
        val = []
        if reward is not None:
            for j in reward:
                if reward[j] == 1 or reward[j] == 0:
                    cnt = cnt + 1
                    idx.append(j)
                    val.append(reward[j])
            idx = np.asarray(idx)
            val = np.asarray(val)
            x_t = x_feature[idx, :]
            w = np.array(val.reshape(x_t.shape[0], 1))
            [V, X, Y, theta, beta] = utils.update_stat(V, x_t, X, Y, w, lamb,
                                                       delta)
            click_n = click_n + utils.get_reward(w)
            result.append((float(click_n) / cnt))
        else:
            continue

print "the reward is: %s" % result
print "cnt is: %s" % cnt
Code Example #14
def train(train_batch, train_label, hidden_size, test_length=5):
    '''
    ##############################
    # modified from the original code by Nurakhmetov (2019)
    references:
    Nurakhmetov, D. (2019). Reinforcement Learning Applied to Adaptive Classification Testing. 
    In Theoretical and Practical Advances in Computer-based Educational Measurement (pp. 325-336). Springer, Cham.    
    ###############################
    '''
    batch_size = 10

    tests = []
    tests_train = []
    policy = Policy(n_tests=18, n_scores=1401, hidden_size=hidden_size)
    optimizer = optim.Adam(policy.parameters())
    criterion = nn.CrossEntropyLoss(reduce=False)

    (score, test) = (None, None)
    tests, scores = [], []
    rewards = []
    # the recurrent hidden state is assumed to have width hidden_size
    hidden = Variable(torch.zeros(batch_size, hidden_size), volatile=True)

    for t in range(test_length):
        logits, value, hidden, _ = policy(test, score, hidden, batch_size)
        probs = nn.functional.softmax(logits)  #sample next item
        next_test = torch.multinomial(probs, 1)

        test = next_test.data.squeeze(1)
        score, test = utils.test_score(train_batch, test)

        masks = []
        for prev_test in tests:
            mask = prev_test.squeeze(1).eq(test).unsqueeze(1)
            masks.append(mask)
        if len(masks) > 0:
            masks = torch.cat(masks, 1)
            masks = masks.sum(1).gt(0)
            masks = -1 * masks.float()
            rewards.append(masks.unsqueeze(1))

        tests.append(test.unsqueeze(1))
        scores.append(score.unsqueeze(1))

        score = Variable(score.unsqueeze(1), volatile=True)
        test = Variable(test.unsqueeze(1), volatile=True)

        tests_train.append(tests)

    saved_log_probs = []
    saved_values = []

    hidden = Variable(torch.zeros(batch_size, hidden_size))
    logits, value, hidden, _ = policy(None, None, hidden, batch_size)
    log_probs = nn.functional.log_softmax(logits)

    for test, score in zip(tests, scores):

        log_prob = log_probs.gather(1, Variable(test))
        saved_log_probs.append(log_prob)
        saved_values.append(value)

        logits, value, hidden, clf_logits = policy(Variable(test),
                                                   Variable(score), hidden,
                                                   batch_size)
        log_probs = nn.functional.log_softmax(logits)

    loss = nn.functional.cross_entropy(clf_logits, Variable(train_label))

    clf_rewards = []
    for clf_logit, targ in zip(clf_logits.data, train_label):
        reward = -criterion(Variable(clf_logit.unsqueeze(0)),
                            Variable(torch.LongTensor([targ]))).data
        clf_rewards.append(reward.unsqueeze(0))
    clf_rewards = torch.cat(clf_rewards, 0).unsqueeze(-1)

    rewards.append(clf_rewards)
    returns = utils.get_reward(rewards)

    saved_log_probs = torch.cat(saved_log_probs, 1)
    saved_values = torch.cat(saved_values, 1)

    advantages = Variable(returns) - saved_values

    critic_loss = advantages.pow(2).mean()
    actor_loss = -(saved_log_probs * Variable(advantages.data)).mean()

    optimizer.zero_grad()
    (critic_loss + actor_loss + loss).backward()
    optimizer.step()

    return tests_train
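The single backward pass at the end of this example combines three terms that are all visible in the code: a classification loss on clf_logits, a critic loss on the squared advantage, and a REINFORCE-style actor loss, where the returns G_t come from utils.get_reward(rewards):

    A_t = G_t - V_t, \qquad
    L_{\text{critic}} = \operatorname{mean}(A_t^2), \qquad
    L_{\text{actor}} = -\operatorname{mean}\bigl(\log \pi(a_t \mid s_t)\, A_t\bigr)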
Code Example #15
File: main_BT_nouser.py  Project: AppRec/c3ucb
def main(lamb, R):
    tmp = []
    with open("../user_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    user = np.asarray(tmp)

    tmp = []
    with open("../app_feature.txt", "r") as f:
        for line in f:
            line = line.strip().rstrip(',').split(',')
            line = list(map(float, line))
            tmp.append(line)
        f.close()
    app = np.asarray(tmp)

    #initialization
    B = 3
    reward_acc = 0
    cnt_acc = 0
    cur_time = strftime("%Y%m%d_", localtime())
    logFileName = '../LogFile/main_BT_nouser_' + cur_time + '.txt'
    logFile = open(logFileName, 'a+')
    print >> logFile, '\n\n'
    print >> logFile, '=' * 3
    print >> logFile, 'experiment parameters: lambda=%f, R=%f' % (lamb, R)
    print >> logFile, '\n\n'
    for t in range(0, B):
        print "Round " + str(t + 1) + " starts!"
        data = readSess(t)
        sids = readDict(t)
        start = time.clock()
        pool_size = app.shape[0]
        user_row_n = user.shape[1]
        app_row_n = app.shape[1]
        d = app_row_n - 1
        session_n = sids.shape[0]
        train_ratio = 0.7
        gamma = 1
        #lamb = 0.5
        theta = np.zeros(d)
        beta = 1
        delta = 0.9
        V = lamb * np.eye(d)
        X = np.zeros((1, d), dtype=np.float)
        Y = np.zeros(1)
        x_feature = np.zeros((pool_size, d), dtype=np.float)
        UCB = np.zeros(pool_size, dtype=np.float)
        K = 5
        click_n = 0

        sids = np.random.permutation(sids)
        tr_idx = sids[:int(round(session_n * train_ratio))]
        ts_idx = sids[int(round(session_n * train_ratio)):]

        #train
        app = app[:, 1:]
        expl = np.zeros(pool_size)
        for i in range(tr_idx.shape[0]):
            record = np.zeros(1)
            #u = np.zeros(1)
            try:
                record = data[np.where(data[:, 0] == float(tr_idx[i]))]
                #u = user[np.where(user[:, 0] == record[0, 0])][0,1:]
            except IndexError:
                continue
            else:
                for a in range(pool_size):
                    x_feature[a] = app[a, :]
                    x_feature[a] = np.divide(x_feature[a],
                                             np.linalg.norm(x_feature[a]))
                    UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                    #print "this is app "+str(a)+" UCB is " + str(UCB[a])
                action = UCB.argsort()[-K:][::-1]
                for ii in range(K):
                    if expl[action[ii]] == 0:
                        expl[action[ii]] = 1
                # print expl
                reward = match_app.match(record, action)
                idx = []
                val = []
                if reward is not None:
                    for j in reward:
                        if reward[j] == 1 or reward[j] == 0:
                            idx.append(j)
                            val.append(reward[j])
                    idx = np.asarray(idx)
                    val = np.asarray(val)
                    x_t = x_feature[idx, :]
                    w = np.array(val.reshape(x_t.shape[0], 1))
                    print w
                    [V, X, Y, theta,
                     beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)

                else:
                    continue

        #test
        cnt = 0
        result = [0]
        for i in range(ts_idx.shape[0]):
            record = np.zeros(1)
            #u = np.zeros(1)
            try:
                record = data[np.where(data[:, 0] == float(ts_idx[i]))]
                #u = user[np.where(user[:, 0] == record[0, 0])][0,1:]
            except IndexError:
                continue
            else:
                for a in range(pool_size):
                    x_feature[a] = app[a, :]
                    x_feature[a] = np.divide(x_feature[a],
                                             np.linalg.norm(x_feature[a]))
                    UCB[a] = utils.getUCB(theta, x_feature[a], beta, V)
                    #print "this is app "+str(a)+" UCB is " + str(UCB[a])
                action = UCB.argsort()[-K:][::-1]
                for ii in range(K):
                    if expl[action[ii]] == 0:
                        expl[action[ii]] = 1
                print expl
                reward = match_app.match(record, action)
                idx = []
                val = []
                if reward is not None:
                    for j in reward:
                        if reward[j] == 1 or reward[j] == 0:
                            cnt = cnt + 1
                            idx.append(j)
                            val.append(reward[j])
                    idx = np.asarray(idx)
                    val = np.asarray(val)
                    x_t = x_feature[idx, :]
                    w = np.array(val.reshape(x_t.shape[0], 1))
                    print w
                    [V, X, Y, theta,
                     beta] = utils.update_stat(V, x_t, X, Y, w, lamb, delta, R)
                    click_n = click_n + utils.get_reward(w)
                    result.append((float(click_n) / cnt))
                else:
                    continue

        print >> logFile, "the reward is: %s" % result[-1]
        print >> logFile, "cnt is: %s" % cnt
        expl_n = sum(expl)
        expl_n_rate = float(expl_n / pool_size)
        print >> logFile, "expl_n is %s" % str(expl_n)
        print >> logFile, "expl_rate is %s" % str(expl_n_rate)
        reward_acc += result[-1]
        cnt_acc += cnt

    reward_avg = reward_acc / B
    cnt_avg = cnt_acc / B
    print >> logFile, "the average cnt is: %s" % cnt_avg
    print >> logFile, "the average reward is: %s" % reward_avg

    logFile.close()
Code Example #16
    state_counts = defaultdict(lambda: 0)

    # The initial game state
    s = get_state(game, state_counts)
    assert(s.shape[1] == dx and s.shape[2] == dy and s.shape[3] == n_features)

    loss_avg = 0
    total_return = 0
    prev_serviced = [0]  # List for easy mutable argument
    for step in range(episode_length):

      q, a = compute_action_value(model, s)  # action-value and action taken

      take_action(game, a)  # actually take the selected action
      s_prime = get_state(game, state_counts)  # See what the new state is
      r = get_reward(game, prev_serviced)  # See what the reward is
      total_return += r

      # Retroactively predict the value of the previous state using the reward
      # The terminal state should have a value of 0, so we add a check for that
      if step < episode_length - 1:
        q_prime, _ = model.predict_q(s_prime)
        q_prime *= gamma
        q_prime += r
      else:
        q_prime = np.array(r, dtype=np.float32, ndmin=1)

      mu = get_mu(state_counts, s)  # Weight for importance of `s`

      # Gradient update for model towards q_prime
      loss = model.update(s, a, q_prime, mu, lr=lr)
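The target built in this loop is the usual one-step TD/Q-learning target, with the terminal step clamped to the raw reward (model.predict_q(s_prime) is assumed to return the action value of the action the model would take in s'):

    q' = r + \gamma \, \hat{Q}(s', a') \quad \text{(non-terminal steps)}, \qquad q' = r \quad \text{(final step)}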
Code Example #17
    def train_controller(self):
        """Fixes the shared parameters and updates the controller parameters.

        The controller is updated with a score function gradient estimator
        (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl
        is computed on a minibatch of validation data.

        A moving average baseline is used.

        The controller is trained for 2000 steps per epoch (i.e.,
        first (Train Shared) phase -> second (Train Controller) phase).
        """

        self.controller.train()
        # TODO(brendan): Why can't we call shared.eval() here? Leads to loss
        # being uniformly zero for the controller.
        # self.shared.eval()

        avg_reward_base = None
        baseline = None
        adv_history = []
        entropy_history = []
        reward_history = []
        genomes_all = []

        # hidden = self.shared.init_hidden(self.args.batch_size)
        total_loss = 0
        genome_epochs = args.genome_epochs

        for step in range(args.controller_max_step):

            bp_args = []
            log_probs_all = []

            for i in range(args.nprocs):
                # sample models
                genome, log_probs, entropies = self.controller.sample()

                # keep track of all log_probs for each genome
                log_probs_all.append(log_probs)

                # calculate reward
                np_entropies = entropies.data.cpu().numpy()

                # append entropies for all models
                entropy_history.extend(np_entropies)

                # rewards, valid_loss, genome_model = utils.get_reward(genome, np_entropies, self.traits, self.x, self.y, self.x_val, self.y_val)
                bp_args.append((genome, np_entropies, self.traits, self.x,
                                self.y, self.x_val, self.y_val, genome_epochs))

            if args.nprocs == 1:
                rewards_batch = [utils.get_reward(*bp_args[0])]
            else:
                rewards_batch = self.pool.starmap(utils.get_reward, bp_args)

            for i, (rewards, valid_loss, genome, model,
                    bp_iters) in enumerate(rewards_batch):

                genomes_all.append((genome, float(valid_loss),
                                    self.controller_step, bp_iters))

                # check for the best model
                self.n_models += 1
                if self.best_genome is None or genome.fitness > self.best_genome.fitness:
                    self.best_genome = genome
                    self.best_model = model

                # set the trained weights back to the set of vocab genes
                try:
                    # print('*'*100)
                    for module in model.modules[:-1]:
                        id_ = module.id_
                        gene = model.genome.nodes[id_]
                        key_orig = gene.key_orig
                        gene_orig = self.vocab.get(key_orig, None)
                        #print(gene_orig.parameters.keys())
                        gene_orig.save_parameters(module.function.state_dict())
                        #print(self.vocab[key_orig].parameters.keys())
                except:
                    ipdb.set_trace()
                """
                hist = {'time': time.time(),
                        'loss': float(valid_loss),
                        'model': str(model),
                        'num_params': utils.num_params(model)}
                self.history.append(hist)
                
                # write history to JSON for further analysis
                with open(self.history_file, 'a') as fout:
                    fout.write(json.dumps(hist) + '\n')
                """

                # VIP: you get one reward per entropy
                # reward_history.extend(rewards)
                """
                I have to do mean of entropies here because the number of 
                entropies varies based on network depth:
                    R = 10 - valid_loss
                    rewards = R + 1e-4 * entropies
                """
                rewards = np.mean(rewards)
                reward_history.append(rewards)

                # moving average baseline
                if baseline is None:
                    baseline = rewards
                else:
                    decay = 0.95  # ema_baseline_decay (very important)
                    baseline = decay * baseline + (1 - decay) * rewards

                adv = rewards - baseline
                # adv_history.extend(adv)
                adv_history.append(adv)

                # policy loss
                # loss = -log_probs_all[i] * utils.get_variable(adv, args.cuda, requires_grad=False)
                loss = -log_probs_all[i] * utils.get_variable(
                    [adv], args.cuda, requires_grad=False)

                # if args.entropy_mode == 'regularizer':
                #     loss -= args.entropy_coeff * entropies

                loss = loss.sum()  # or loss.mean()

                # update
                self.controller_optim.zero_grad()
                loss.backward()

                if args.controller_grad_clip > 0:
                    torch.nn.utils.clip_grad_norm(
                        self.controller.model.parameters(),
                        args.controller_grad_clip)

                self.controller_optim.step()
                self.controller_step += 1
                assert self.controller_step == self.n_models, (
                    self.controller_step, self.n_models)

                total_loss += utils.to_item(loss.data)

                # if ((step % args.log_step) == 0) and (step > 0):
                # if ((step * args.nprocs % args.log_step) == 0) and (step > 0):
                if self.controller_step % args.log_step == 0:

                    logger.info('-' * 100)
                    logger.info(
                        'summarizing and resetting: reward_history, adv_history, entropy_history'
                    )
                    logger.info(
                        'step: {}, controller_step: {}, n_models: {}, max_layers: {}'
                        .format(step, self.controller_step, self.n_models,
                                self.controller.max_layers))
                    logger.info(
                        'len(reward_history): {}, len(adv_history): {}, len(entropy_history): {}, len(genomes_all): {}'
                        .format(len(reward_history), len(adv_history),
                                len(entropy_history), len(genomes_all)))

                    self._summarize_controller_train(total_loss, adv_history,
                                                     entropy_history,
                                                     reward_history,
                                                     avg_reward_base)

                    reward_history, adv_history, entropy_history = [], [], []
                    total_loss = 0

                    # update max number of layers (do it here so stats are comparable)
                    self.controller.set_max_layers(self.controller.max_layers +
                                                   1)

                    self.controller.save()

                if self.controller_step % args.log_step_genome == 0:
                    self._summarize_best_genome(genomes_all)
                    genomes_all = []

                # check for stopping
                elapsed_time = time.time() - self.start_time
                if args.max_time is not None:
                    if elapsed_time >= args.max_time:
                        logger.info(
                            'Stopping b/c max time exceeded: {}'.format(
                                elapsed_time))
                        return True
                else:
                    n_gens = int(np.round(self.n_models /
                                          args.log_step_genome))
                    if n_gens >= args.max_generations + 1:
                        logger.info(
                            'Stopping b/c max_generations: {}/{}. Run time: {}'
                            .format(n_gens, args.max_generations,
                                    elapsed_time))
                        return True

        logger.info('Controller finished training within time: {}'.format(
            elapsed_time))
        return True
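The controller update in this example is plain REINFORCE with an exponential-moving-average baseline, exactly as written in the loop: with decay 0.95, per-genome reward R (the mean of the per-entropy rewards), and sampled genome log-probabilities log π(g),

    b \leftarrow 0.95\, b + 0.05\, R, \qquad A = R - b, \qquad L = -\sum \log \pi(g)\, A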