simulation.py
import os
import sys

import simplejson

import policy
from policy import imax
import rewards


def construct_policy(simu_params):
    """Instantiate the policy class named in simu_params from the policy module."""
    policy_class_name = simu_params['policy_class_name']
    policy_args = simu_params['policy_args']
    policy_kwargs = simu_params['policy_kwargs']
    policy_class = getattr(policy, policy_class_name)
    return policy_class(*policy_args, **policy_kwargs)


def construct_iter_rewards(simu_params):
    """Build the reward iterator named in simu_params from the rewards module."""
    reward_gen_name = simu_params['reward_gen_name']
    reward_gen_args = simu_params['reward_gen_args']
    reward_gen_kwargs = simu_params['reward_gen_kwargs']
    reward_generator = getattr(rewards, reward_gen_name)
    return reward_generator(*reward_gen_args, **reward_gen_kwargs)

def mean(xs):
    """Arithmetic mean of an iterable (works on generators, which have no len())."""
    s = 0
    num_el = 0
    for x in xs:
        s += x
        num_el += 1
    return float(s) / num_el

class Simulation(object):

    def __init__(self, simu_params, max_time=100, num_sims=10):
        self.max_time = max_time
        self.num_sims = num_sims
        self.simu_params = simu_params
        self.run_to_rewards = []
        self.run_to_policy_rewards = []

    def init(self):
        pass

    def _run_once(self, verbose=True):
        """Run a single bandit simulation for up to max_time steps."""
        policy = construct_policy(self.simu_params)
        iter_rewards = construct_iter_rewards(self.simu_params)
        # Pre-allocate result lists instead of appending; each slot is
        # reassigned (never mutated), so the repeated empty list is harmless.
        rewards = [[]] * self.max_time
        policy_rewards = [0.0] * self.max_time
        reward_sums = [0.0] * policy.num_arms
        arm_choices = []
        for t, rewards_t in enumerate(iter_rewards):
            if t >= self.max_time:
                break
            rewards[t] = rewards_t
            arm = policy.choose_arm()
            arm_choices.append(arm)
            reward = rewards_t[arm]
            policy_rewards[t] = reward
            policy.update(arm, reward)
            for i, r in enumerate(rewards_t):
                reward_sums[i] += r
            if verbose and t % 1000 == 0:
                print("%s: distro %s ran %s time steps" % (
                    self.simu_params['name'], self.simu_params['distro_name'], t),
                    file=sys.stderr)
        return rewards, reward_sums, policy_rewards, arm_choices
    def run(self, verbose=True):
        """Run num_sims independent simulations and average the per-step results."""
        # opt_arm_s = [[]] * self.num_sims
        opt_arm_rwd_s = [[]] * self.num_sims
        policy_rewards_s = [[]] * self.num_sims
        for s in range(self.num_sims):
            rewards, reward_sums, policy_rewards, arm_choices = self._run_once(verbose=verbose)
            # The best arm in hindsight is the one with the largest total reward.
            best_arm, best_sum = imax(reward_sums)
            # opt_arm_s[s] = [float(chosen_arm == best_arm) for chosen_arm in arm_choices]
            opt_arm_rwd_s[s] = [row[best_arm] for row in rewards if len(row)]
            policy_rewards_s[s] = policy_rewards
        # opt_arm = []
        opt_arm_rwd = []
        mean_policy_reward = []
        # Average across simulations at each time step.
        for t in range(len(opt_arm_rwd_s[0])):
            # opt_arm.append(mean(col[t] for col in opt_arm_s))
            opt_arm_rwd.append(mean(col[t] for col in opt_arm_rwd_s))
            mean_policy_reward.append(mean(col[t] for col in policy_rewards_s))
        self.run_data = (opt_arm_rwd, mean_policy_reward)
    def save(self):
        """Dump the simulation parameters and averaged results to a JSON file."""
        data = [self.simu_params, self.run_data]
        fname = "records/%s_%s.json" % (self.simu_params['name'], self.simu_params['distro_name'])
        fname = fname.replace(' ', '_')
        # Use a context manager so the file handle is closed after writing.
        with open(fname, 'w') as f:
            simplejson.dump(data, f)
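

# A minimal usage sketch of the simu_params schema consumed above. The keys
# match what construct_policy, construct_iter_rewards, _run_once, and save()
# read; the 'EpsilonGreedy' class and 'bernoulli_rewards' generator names are
# assumptions for illustration, not names confirmed by policy.py or rewards.py.
if __name__ == "__main__":
    simu_params = {
        'name': 'example_run',
        'distro_name': 'bernoulli',
        'policy_class_name': 'EpsilonGreedy',    # hypothetical class in the policy module
        'policy_args': [],
        'policy_kwargs': {'num_arms': 3, 'epsilon': 0.1},
        'reward_gen_name': 'bernoulli_rewards',  # hypothetical generator in the rewards module
        'reward_gen_args': [[0.2, 0.5, 0.8]],
        'reward_gen_kwargs': {},
    }
    sim = Simulation(simu_params, max_time=1000, num_sims=5)
    sim.run(verbose=False)
    sim.save()  # writes records/example_run_bernoulli.json (the records/ directory must exist)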