def act_Probabilistic(self):
    # !! Must not modify q_estimation here, otherwise it would affect the following Qs !!
    # The softmax is therefore applied here, writing into choice_prob only.
    if '_CK' in self.forager:  # With a choice kernel
        self.choice_prob[:, self.time] = softmax(
            np.vstack([self.q_estimation[:, self.time], self.choice_kernel[:, self.time]]),
            np.vstack([self.softmax_temperature, self.choice_softmax_temperature]),
            bias=self.bias_terms)  # Updated softmax function that accepts two value/temperature pairs
    else:
        self.choice_prob[:, self.time] = softmax(self.q_estimation[:, self.time],
                                                 self.softmax_temperature, bias=self.bias_terms)

    if self.if_fit_mode:
        self.predictive_choice_prob[:, self.time] = self.choice_prob[:, self.time]
        choice = None  # No need to make a specific choice in fitting mode
    else:
        choice = choose_ps(self.choice_prob[:, self.time])
        self.choice_history[0, self.time] = choice

    return choice
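# --- Hedged sketch (not part of the original module): a minimal stand-alone version of the
#     softmax() helper that act_Probabilistic() relies on. It accepts either a single value
#     vector with one temperature, or a stacked [Q; choice_kernel] array with per-row
#     temperatures, plus an additive bias on the logits. The name softmax_sketch and the way
#     the two components are combined are assumptions; the repo's own softmax() may differ.
#     Assumes numpy is imported as np, as elsewhere in this module.
def softmax_sketch(values, temperatures, bias=0):
    values = np.atleast_2d(values)                              # shape: (n_components, k)
    temperatures = np.atleast_1d(temperatures).reshape(-1, 1)   # one temperature per component
    logits = np.sum(values / temperatures, axis=0) + bias       # combine components, add bias
    logits -= np.max(logits)                                    # numerical stability
    exp_logits = np.exp(logits)
    return exp_logits / np.sum(exp_logits)

# Illustrative call: two options, Q values plus a choice kernel, each with its own temperature
# p = softmax_sketch(np.vstack([[0.6, 0.2], [1.0, 0.0]]), [0.3, 1.0], bias=np.array([0.1, 0]))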
def negLL_slide_win(fit_value, *args):
    '''Negative log-likelihood function for the sliding window'''
    # Unpack arguments
    Q_0, choices, rewards = args
    learn_rate, softmax_temperature, biasL = fit_value
    bias_terms = np.array([biasL, 0])

    trial_n_win = np.shape(choices)[1]
    Q_win = np.zeros_like(rewards)           # K_arm * trial_n
    choice_prob_win = np.zeros_like(rewards)

    # -- Mini-simulation inside this sliding window (light version of RW1972) --
    for t in range(trial_n_win):
        Q_old = Q_0 if t == 0 else Q_win[:, t - 1]

        # Update Q
        choice_this = choices[0, t]
        Q_win[choice_this, t] = Q_old[choice_this] + learn_rate * (
            rewards[choice_this, t] - Q_old[choice_this])      # Chosen side
        Q_win[1 - choice_this, t] = Q_old[1 - choice_this]     # Unchosen side

        # Update choice_prob
        choice_prob_win[:, t] = softmax(Q_win[:, t], softmax_temperature, bias=bias_terms)

    # Get the actual likelihood of each observed choice
    likelihood_each_trial = choice_prob_win[choices[0, :], range(trial_n_win)]

    # Deal with numerical precision: clip tiny/negative values to avoid log(0) = -inf,
    # while keeping zero-likelihood trials strongly penalized (so their number stays informative)
    likelihood_each_trial[(likelihood_each_trial <= 0) & (likelihood_each_trial > -1e-5)] = 1e-16
    likelihood_each_trial[likelihood_each_trial > 1] = 1

    negLL = -np.sum(np.log(likelihood_each_trial))

    return negLL
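# --- Hedged usage sketch (not in the original repo): how negLL_slide_win() might be minimized
#     for a single window with scipy. The synthetic data, starting values, and bounds below are
#     illustrative assumptions only; it reuses the module-level numpy (np) and scipy.optimize
#     (optimize) imports assumed elsewhere in this file.
def _demo_fit_one_window(n_win=10, seed=0):
    rng = np.random.default_rng(seed)
    choices_win = rng.integers(0, 2, size=(1, n_win))        # 1 x n_win choices (0 = LEFT, 1 = RIGHT)
    rewards_win = np.zeros((2, n_win))
    rewards_win[choices_win[0], range(n_win)] = rng.integers(0, 2, size=n_win)  # rewards only on the chosen side
    Q_0 = np.zeros(2)                                        # initial Q for this window

    res = optimize.minimize(
        negLL_slide_win,
        x0=[0.3, 0.4, 0.0],                                  # [learn_rate, softmax_temperature, biasL]
        args=(Q_0, choices_win, rewards_win),
        method='L-BFGS-B',
        bounds=optimize.Bounds([0, 1e-2, -5], [1, 15, 5]))
    return res.x                                             # fitted [learn_rate, softmax_temperature, biasL]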
def step(self, choice):
    # =============================================================================
    #  Generate the reward and make the state transition
    #  (i.e., prepare the reward for the next trial)
    # =============================================================================
    # These four lines work for both varying reward probability and varying amplitude
    reward = self.reward_available[choice, self.time]
    self.reward_history[choice, self.time] = reward  # Note that according to Sutton & Barto's convention,
                                                     # this update should belong to time t+1, but here I use t for simplicity.

    reward_available_after_choice = self.reward_available[:, self.time].copy()  # An intermediate reward status. Note the .copy()!
    reward_available_after_choice[choice] = 0  # The reward is depleted at the chosen lick port.

    self.time += 1  # Time ticks here !!!
    if self.time == self.n_trials:
        return  # Session terminates

    if not self.if_varying_amplitude:  # Varying reward probability
        # For the next reward status, the "or" statement ensures the baiting property, gated by self.if_baited.
        self.reward_available[:, self.time] = np.logical_or(
            reward_available_after_choice * self.if_baited,
            np.random.uniform(0, 1, self.k) < self.p_reward[:, self.time]).astype(int)
    else:  # Varying reward amplitude
        # For both the chosen and the unchosen side:
        #   amplitude = 1 - (1 - amp)^(trials since last choice)
        #   ==> next_amp = 1 - (1 - previous_amp) * (1 - p_reward)
        self.reward_available[:, self.time] = 1 - (1 - reward_available_after_choice * self.if_baited) * (1 - self.p_reward[:, self.time])

    # =============================================================================
    #  Update value estimation (or Poisson choice probability)
    # =============================================================================
    # = Forager types:
    #   1. Special foragers
    #     1). 'Random'
    #     2). 'LossCounting': switch to another option when the loss count exceeds a threshold drawn from a Gaussian [from Shahidi 2019]
    #          - 2.1: loss_count_threshold = inf --> Always One Side
    #          - 2.2: loss_count_threshold = 1   --> win-stay-lose-switch
    #          - 2.3: loss_count_threshold = 0   --> Always switch
    #     3). 'IdealpGreedy': knows p_reward + always chooses the largest one
    #     4). 'IdealpHatGreedy': knows p_reward AND p_hat + always chooses the largest p_hat ==> {m,1}, analytical
    #     5). 'IdealpHatOptimal': knows p_reward AND p_hat + always chooses the REAL optimal ==> {m,n}, no analytical solution
    #     6). 'pMatching': to show that pMatching is necessary but not sufficient
    #
    #   2. NLP-like foragers
    #     1). 'Sugrue2004':  income -> exp filter   -> fractional                  -> epsilon-Poisson (epsilon = 0 in their paper; I found it essential)
    #     2). 'Corrado2005': income -> 2-exp filter -> softmax ( = diff + sigmoid) -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
    #     3). 'Iigaya2019':  income -> 2-exp filter -> fractional                  -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
    #
    #   3. RL-like foragers
    #     1). 'SuttonBartoRLBook': return        -> exp filter                          -> epsilon-greedy (epsilon > 0 is essential)
    #     2). 'Bari2019':          return/income -> exp filter (both forgetting)        -> softmax -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)
    #     3). 'Hattori2019':       return/income -> exp filter (choice-dependent forgetting, reward-dependent step_size) -> softmax -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)

    if self.forager in ['LossCounting']:
        if self.loss_count[0, self.time - 1] < 0:  # A switch just happened
            self.loss_count[0, self.time - 1] = -self.loss_count[0, self.time - 1]  # Back to normal (Note that this = 0 in Shahidi 2019)
            if reward:
                self.loss_count[0, self.time] = 0
            else:
                self.loss_count[0, self.time] = 1
        else:
            if reward:
                self.loss_count[0, self.time] = self.loss_count[0, self.time - 1]
            else:
                self.loss_count[0, self.time] = self.loss_count[0, self.time - 1] + 1

    elif self.forager in ['SuttonBartoRLBook', 'Bari2019', 'Hattori2019'] or 'PatternMelioration' in self.forager:  # Local return
        # Note 1: These three foragers only differ in how they handle step size and forget rate.
        # Note 2: It's "return" rather than "income" because the unchosen Q is not updated (when forget_rate = 0, as in SuttonBartoRLBook).
        # Note 3: However, if forget_rate > 0, the unchosen Q is also updated, so it sits somewhere between "return" and "income".
        #         In fact, when step_size = forget_rate, the unchosen Q is updated by exactly the same rule as the chosen Q, so it becomes exactly "income".
        # 'PatternMelioration' = 'SuttonBartoRLBook' (i.e., RW1972) here, because it needs to compute the average RETURN.

        # Reward-dependent step size ('Hattori2019')
        if reward:
            step_size_this = self.step_sizes[1]
        else:
            step_size_this = self.step_sizes[0]

        # Choice-dependent forgetting rate ('Hattori2019')
        # Chosen:   Q(n+1) = (1 - forget_rate_chosen) * Q(n) + step_size * (Reward - Q(n))
        self.q_estimation[choice, self.time] = (1 - self.forget_rates[1]) * self.q_estimation[choice, self.time - 1] \
                                               + step_size_this * (reward - self.q_estimation[choice, self.time - 1])

        # Unchosen: Q(n+1) = (1 - forget_rate_unchosen) * Q(n)
        unchosen_idx = [cc for cc in range(self.k) if cc != choice]
        self.q_estimation[unchosen_idx, self.time] = (1 - self.forget_rates[0]) * self.q_estimation[unchosen_idx, self.time - 1]

        # Softmax in 'Bari2019', 'Hattori2019'
        if self.forager in ['Bari2019', 'Hattori2019']:
            # --- The line below was erroneous!! Must not change q_estimation!! 04/08/2020 ---
            # self.q_estimation[:, self.time] = softmax(self.q_estimation[:, self.time], self.softmax_temperature)
            self.choice_prob[:, self.time] = softmax(self.q_estimation[:, self.time], self.softmax_temperature)

    elif self.forager in ['Sugrue2004', 'Iigaya2019']:  # Fractional local income
        # Note: It's "income" because the computations below do not depend on the current choice.

        # 1. Local income = Reward history + exp filter (Sugrue) or 2-exp filter (Iigaya)
        valid_reward_history = self.reward_history[:, :self.time]  # History till now
        valid_filter = self.history_filter[-self.time:]            # Corresponding filter
        local_income = np.sum(valid_reward_history * valid_filter, axis=1)

        # 2. Poisson choice probability = Fractional local income
        if np.sum(local_income) == 0:
            # 50%-to-50%
            # self.q_estimation[:, self.time] = [1 / self.k] * self.k
            self.choice_prob[:, self.time] = [1 / self.k] * self.k
        else:
            # Local fractional income
            # self.q_estimation[:, self.time] = local_income / np.sum(local_income)
            self.choice_prob[:, self.time] = local_income / np.sum(local_income)

    elif self.forager == 'Corrado2005':  # Softmaxed local income
        # 1. Local income = Reward history + hyperbolic (2-exp) filter
        valid_reward_history = self.reward_history[:, :self.time]  # History till now
        valid_filter = self.history_filter[-self.time:]            # Corresponding filter
        local_income = np.sum(valid_reward_history * valid_filter, axis=1)

        # 2. Poisson choice probability = Softmaxed local income
        #    (Note: equivalent to "difference + sigmoid" in [Corrado et al. 2005] for the two-lickport case)
        # self.q_estimation[:, self.time] = softmax(local_income, self.softmax_temperature)
        self.choice_prob[:, self.time] = softmax(local_income, self.softmax_temperature)

    elif 'FullState' in self.forager:
        # print(', rew = ', reward)
        self.full_state_Qforager.update_Q(reward)  # All magics are in the Class definition

        if self.if_plot_Q:
            go_on = self.full_state_Qforager.plot_Q(self.time, reward, self.p_reward[:, self.time], self.description)
            if not go_on:  # No longer plot
                self.if_plot_Q = False

        if self.if_record_Q and self.time == self.n_trials - 1:  # The last frame; stop recording
            self.full_state_Qforager.writer.cleanup()
            self.full_state_Qforager.writer.finish()

    return reward
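# --- Hedged sketch (illustrative, not part of the original class): the Hattori2019-style value
#     update embedded in step() above, pulled out as a stand-alone function. The function name
#     and the default parameter values are assumptions; the update rule itself mirrors step():
#     reward-dependent step size and choice-dependent forgetting.
def hattori_update_sketch(q_old, choice, reward,
                          step_sizes=(0.2, 0.4),      # (unrewarded, rewarded) learning rates -- assumed values
                          forget_rates=(0.05, 0.0)):  # (unchosen, chosen) forgetting rates -- assumed values
    q_old = np.asarray(q_old, dtype=float)
    q_new = q_old.copy()
    step_size = step_sizes[1] if reward else step_sizes[0]          # reward-dependent step size
    # Chosen side:   Q(n+1) = (1 - forget_chosen) * Q(n) + step_size * (reward - Q(n))
    q_new[choice] = (1 - forget_rates[1]) * q_old[choice] + step_size * (reward - q_old[choice])
    # Unchosen side: Q(n+1) = (1 - forget_unchosen) * Q(n)
    unchosen = [i for i in range(len(q_old)) if i != choice]
    q_new[unchosen] = (1 - forget_rates[0]) * q_old[unchosen]
    return q_new

# Illustrative call: start from Q = [0.3, 0.5], choose LEFT (0), get rewarded
# q_next = hattori_update_sketch([0.3, 0.5], choice=0, reward=1)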
def act(self):
    # =============================================================================
    #  Make a choice for this trial, based on the current value estimation
    #  (or Poisson choice probability)
    # =============================================================================
    # = Forager types:
    #   1. Special foragers
    #     1). 'Random'
    #     2). 'LossCounting': switch to another option when the loss count exceeds a threshold drawn from a Gaussian [from Shahidi 2019]
    #          - 2.1: loss_count_threshold = inf --> Always One Side
    #          - 2.2: loss_count_threshold = 1   --> win-stay-lose-switch
    #          - 2.3: loss_count_threshold = 0   --> Always switch
    #     3). 'IdealpGreedy': knows p_reward + always chooses the largest one
    #     4). 'IdealpHatGreedy': knows p_reward AND p_hat + always chooses the largest p_hat ==> {m,1}, analytical
    #     5). 'IdealpHatOptimal': knows p_reward AND p_hat + always chooses the REAL optimal ==> {m,n}, no analytical solution
    #     6). 'pMatching': to show that pMatching is necessary but not sufficient
    #
    #   2. NLP-like foragers
    #     1). 'Sugrue2004':  income -> exp filter   -> fractional                  -> epsilon-Poisson (epsilon = 0 in their paper; I found it essential)
    #     2). 'Corrado2005': income -> 2-exp filter -> softmax ( = diff + sigmoid) -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
    #     3). 'Iigaya2019':  income -> 2-exp filter -> fractional                  -> epsilon-Poisson (epsilon = 0 in their paper; has the same effect as tau_long??)
    #
    #   3. RL-like foragers
    #     1). 'SuttonBartoRLBook': return        -> exp filter                          -> epsilon-greedy (epsilon > 0 is essential)
    #     2). 'Bari2019':          return/income -> exp filter (both forgetting)        -> softmax -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)
    #     3). 'Hattori2019':       return/income -> exp filter (choice-dependent forgetting, reward-dependent step_size) -> softmax -> epsilon-Poisson (epsilon = 0 in their paper; not necessary)

    if self.forager == 'Random':
        choice = np.random.choice(self.k)

    elif self.forager == 'AlwaysLEFT':
        choice = LEFT

    elif self.forager in ['IdealpHatOptimal', 'IdealpHatGreedy', 'AmB1']:  # Foragers that have the pattern {AmBn}
        choice = self.choice_history[0, self.time]  # Already saved in the optimal sequence

    elif self.forager == 'pMatching':  # Probability matching of the base probabilities p
        choice = choose_ps(self.p_reward[:, self.time])

    elif self.forager == 'LossCounting':
        if self.time == 0:
            choice = np.random.choice(self.k)  # Random on the first trial
        else:
            # Retrieve the last choice
            last_choice = self.choice_history[0, self.time - 1]

            if self.loss_count[0, self.time] >= self.loss_threshold_this:
                # Switch
                choice = LEFT + RIGHT - last_choice

                # Reset the loss-count threshold
                self.loss_count[0, self.time] = -self.loss_count[0, self.time]  # A flag of "a switch happens here"
                self.loss_threshold_this = np.random.normal(self.loss_count_threshold_mean, self.loss_count_threshold_std)
            else:
                # Stay
                choice = last_choice

    elif self.forager == 'IdealpGreedy':
        choice = np.random.choice(np.where(self.p_reward[:, self.time] == self.p_reward[:, self.time].max())[0])

    elif 'PatternMelioration' in self.forager:
        rich_now = np.argmax(self.pattern_now)
        lean_now = 1 - rich_now

        if self.run_length_now[rich_now] < self.pattern_now[rich_now]:  # The rich side is not finished yet
            choice = rich_now                   # Make decision
            self.run_length_now[rich_now] += 1  # Update counter

        elif self.run_length_now[lean_now] < self.pattern_now[lean_now]:  # The rich side has just finished; run the lean side
            # assert(self.pattern_now[lean_now] == 1)  # Only 1 trial for sure
            choice = lean_now
            self.run_length_now[lean_now] += 1  # Update counter

        else:  # Otherwise, this pattern has been finished
            if self.forager == 'PatternMelioration':
                # Update the next pattern
                if np.abs(np.diff(self.q_estimation[:, self.time])) >= self.pattern_meliorate_threshold:
                    # Probability of updating the pattern = step function
                    rich_Q = np.argmax(self.q_estimation[:, self.time])  # Better side indicated by Q

                    if np.all(self.pattern_now == 1):  # Already in {1,1}
                        self.pattern_now[rich_Q] += 1
                    else:  # Only modify the rich side
                        # -- Estimate p_base by Q (no block structure, direct estimation) -- Doesn't work...
                        #    Sampling the lean side is not efficient.
                        # p_base_est_rich = self.q_estimation[rich_now, self.time]
                        # p_base_est_lean = self.q_estimation[lean_now, self.time] / self.pattern_now[rich_Q]
                        # [m, n], _ = self.get_IdealpHatGreedy_strategy([p_base_est_rich, p_base_est_lean])
                        # m = min(m, 15)
                        # if p_base_est_rich > p_base_est_lean:  # Don't change side
                        #     self.pattern_now[[rich_now, lean_now]] = [m, 1]
                        # else:
                        #     self.pattern_now[[rich_now, lean_now]] = [1, m]  # Switch side immediately

                        # -- Block state enables fast switching
                        if rich_Q == rich_now:
                            self.pattern_now[rich_now] += 1
                        else:
                            # Maybe this is a block switch; try a large modification
                            # self.pattern_now = np.flipud(self.pattern_now)  # Flip
                            self.pattern_now = np.array([1, 1])  # Reset
                            self.q_estimation[:, self.time] = 0

                        # -- Not aware of block structure
                        # pattern_step = 1 if (rich_Q == rich_now) else -1  # If the sign of diff_Q is aligned with rich_pattern, then add 1
                        # self.pattern_now[rich_now] += pattern_step

            elif self.forager == 'PatternMelioration_softmax':
                # -- Update_step \propto sigmoid --
                # deltaQ = self.q_estimation[rich_now, self.time] - self.q_estimation[lean_now, self.time]
                # update_step = int(self.pattern_meliorate_softmax_max_step * 2 * (1 / (1 + np.exp(- deltaQ / self.pattern_meliorate_softmax_temp)) - 0.5))  # Max = 10
                # self.pattern_now[rich_now] += update_step
                # if self.pattern_now[rich_now] < 1:
                #     self.pattern_now[lean_now] = 2 - self.pattern_now[rich_now]
                #     self.pattern_now[rich_now] = 1

                # -- Softmax -> get p -> use {floor(p/(1-p)), 1} --
                choice_p = softmax(self.q_estimation[:, self.time], self.pattern_meliorate_softmax_temp)
                rich_Q = np.argmax(choice_p)
                m_est = np.floor(choice_p[rich_Q] / choice_p[1 - rich_Q])
                m_est = np.min([m_est, 10])
                self.pattern_now[[rich_Q, 1 - rich_Q]] = [m_est, 1]

            self.run_length_now = np.array([0, 0])  # Reset counter

            # Make the first trial of the next pattern
            rich_now = np.argmax(self.pattern_now)  # Use the new pattern
            choice = rich_now
            self.run_length_now[rich_now] += 1      # Update counter

    elif 'FullState' in self.forager:
        if self.time == 0:
            choice = self.full_state_Qforager.current_state.which[0]
        else:
            choice = self.full_state_Qforager.act()  # All magics are in the Class definition
            # print('\nTime = ', self.time, ': ', choice, end='')

    else:
        if np.random.rand() < self.epsilon or np.sum(self.reward_history) < self.random_before_total_reward:
            # Forced exploration with probability epsilon (to avoid getting stuck on AlwaysLEFT/RIGHT in Sugrue2004...)
            choice = np.random.choice(self.k)

        else:  # Forager-dependent
            if self.forager == 'SuttonBartoRLBook':  # Greedy
                choice = np.random.choice(np.where(self.q_estimation[:, self.time] == self.q_estimation[:, self.time].max())[0])

            elif self.forager in ['Sugrue2004', 'Corrado2005', 'Iigaya2019', 'Bari2019', 'Hattori2019']:  # Poisson
                # choice = choose_ps(self.q_estimation[:, self.time])
                choice = choose_ps(self.choice_prob[:, self.time])

    self.choice_history[0, self.time] = int(choice)

    return int(choice)
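# --- Hedged sketch (not the repo's implementation): a minimal version of the choose_ps() helper
#     assumed by act() above, i.e., draw one option index according to a probability vector.
#     The name choose_ps_sketch is illustrative only.
def choose_ps_sketch(ps):
    ps = np.asarray(ps, dtype=float)
    ps = ps / ps.sum()  # guard against small normalization errors
    return int(np.random.choice(len(ps), p=ps))

# Illustrative call: sample LEFT (0) with probability 0.7, RIGHT (1) with probability 0.3
# choice = choose_ps_sketch([0.7, 0.3])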
def act_softmax(self, softmax_temp):
    # Generate the next action using a softmax(Q) policy
    next_state_idx = choose_ps(softmax(self.Q[:len(self.next_states)], softmax_temp))
    return next_state_idx  # Return the index of the next state
def plot_Q(self, time=np.nan, reward=np.nan, p_reward=np.nan, description=''):
    # Visualize the value functions Q(s, a)

    # Initialization
    if self.ax == []:
        # Prepare axes
        self.fig, self.ax = plt.subplots(2, 2, sharey=True, figsize=[12, 8])
        plt.subplots_adjust(hspace=0.5, top=0.85)
        self.ax2 = self.ax.copy()
        self.annotate = plt.gcf().text(0.05, 0.9, '', fontsize=13)

        for c in [0, 1]:
            for d in [0, 1]:
                self.ax2[c, d] = self.ax[c, d].twinx()

        # Prepare animation
        if self.if_record_Q:
            metadata = dict(title='FullStateQ', artist='Matplotlib')
            self.writer = FFMpegWriter(fps=25, metadata=metadata)
            self.writer.setup(self.fig, "..\\results\\%s.mp4" % description, 150)

    direction = ['LEFT', 'RIGHT']
    decision = ['Leave', 'Stay']
    X = np.r_[1:np.shape(self.states)[1] - 0.1]  # Ignore the last run_length (must leave)

    # -- Q values and policy --
    for d in [0, 1]:
        # Compute the policy p(a|s)
        if self.if_softmax:
            Qs = np.array([s.Q for s in self.states[d, :-1]])
            ps = []
            for qq in Qs:
                ps.append(softmax(qq, self.softmax_temperature))
            ps = np.array(ps)

        for c in [0, 1]:
            self.ax[c, d].cla()
            self.ax2[c, d].cla()
            self.ax[c, d].set_xlim([0, max(X) + 1])
            self.ax[c, d].set_ylim([-0.05, max(plt.ylim())])

            bar_color = 'r' if c == 0 else 'g'
            self.ax[c, d].bar(X, Qs[:, c], color=bar_color, alpha=0.5)
            self.ax[c, d].set_title(direction[d] + ', ' + decision[c])
            self.ax[c, d].axhline(0, color='k', ls='--')
            if d == 0:
                self.ax[c, d].set_ylabel('Q(s,a)', color='k')

            # self.ax[c, d].set_xticks(np.round(self.ax[c, d].get_xticks()))
            self.ax[c, d].set_xticks(X)

            self.ax2[c, d].plot(X, ps[:, c], bar_color + '-o')
            if d == 1:
                self.ax2[c, d].set_ylabel('P(a|s)', color=bar_color)
            self.ax2[c, d].axhline(0, color=bar_color, ls='--')
            self.ax2[c, d].axhline(1, color=bar_color, ls='--')
            self.ax2[c, d].set_ylim([-0.05, 1.05])

    # -- This state --
    last_state = self.backup_SA[0].which
    current_state = self.current_state.which
    if time > 1:
        self.ax2[0, last_state[0]].plot(last_state[1] + 1, self.last_reward, 'go', markersize=10, alpha=0.5)
    self.ax2[0, current_state[0]].plot(current_state[1] + 1, reward, 'go', markersize=15)
    self.last_reward = reward
    # plt.ylim([-1, 1])

    self.annotate.set_text(
        '%s\nt = %g, p_reward = %s\n%s --> %s, reward = %g\n' %
        (description, time, p_reward, last_state, current_state, reward))

    if self.if_record_Q:
        print(time)
        self.writer.grab_frame()
        return True
    else:
        plt.gcf().canvas.draw()
        return plt.waitforbuttonpress()
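# --- Hedged sketch (illustrative, not part of the original module): the matplotlib FFMpegWriter
#     pattern that plot_Q() relies on for recording, shown stand-alone. The output filename,
#     fps, and demo plot are assumptions; an ffmpeg binary must be available on the system.
def _demo_record_mp4(outfile='demo.mp4', n_frames=30):
    from matplotlib.animation import FFMpegWriter
    fig, ax = plt.subplots()
    writer = FFMpegWriter(fps=25, metadata=dict(title='FullStateQ demo', artist='Matplotlib'))
    writer.setup(fig, outfile, dpi=150)
    x = np.linspace(0, 2 * np.pi, 100)
    for i in range(n_frames):
        ax.cla()
        ax.plot(x, np.sin(x + i / 5))
        writer.grab_frame()  # same call used inside plot_Q()
    writer.finish()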
def fit_dynamic_learning_rate_session_no_bias_free_Q_0(choice_history, reward_history, slide_win=10,
                                                       pool='', x0=[], fixed_sigma='none', method='DE'):
    '''
    Fit RW1972 with a sliding window of 10 trials (Wang, ..., Botvinick, 2018).
    For each sliding window, Q_init is a free parameter and there is no bias term.
    '''
    trial_n = np.shape(choice_history)[1]

    if x0 == []:
        x0 = [0.4, 0.4, 0.5, 0.5]

    # Settings for RW1972
    # ['RW1972_softmax', ['learn_rate', 'softmax_temperature', 'Q_0'], [0, 1e-2, -5], [1, 15, 5]]
    if fixed_sigma == 'global':
        fit_bounds = [[0, x0[1], 0, 0], [1, x0[1], 1, 1]]  # Fix sigma at the globally fitted value
    elif fixed_sigma == 'zeros':
        fit_bounds = [[0, 1e-4, 0, 0], [1, 1e-4, 1, 1]]
    elif fixed_sigma == 'none':
        fit_bounds = [[0, 1e-2, 0, 0], [1, 15, 1, 1]]

    Q = np.zeros(np.shape(reward_history))  # Cache of Q values (using the best fit at each step)
    choice_prob = Q.copy()
    fitted_learn_rate = np.zeros(np.shape(choice_history))
    fitted_sigma = np.zeros(np.shape(choice_history))
    # fitted_Q_0 = np.zeros(np.shape(choice_history))

    for t in tqdm(range(1, trial_n - slide_win), desc='Sliding window', total=trial_n - slide_win):
    # for t in range(1, trial_n - slide_win):  # Start from the second trial
        choice_this = choice_history[:, t:t + slide_win]
        reward_this = reward_history[:, t:t + slide_win]

        if method == 'DE':
            fitting_result = optimize.differential_evolution(
                func=negLL_slide_win_no_bias_free_Q_0,
                args=(choice_this, reward_this),
                bounds=optimize.Bounds(fit_bounds[0], fit_bounds[1]),
                mutation=(0.5, 1), recombination=0.7, popsize=4,
                strategy='best1bin', disp=False,
                workers=1 if pool == '' else 8,  # For DE, use pool to control if_parallel, although we don't use pool for DE
                updating='immediate' if pool == '' else 'deferred')
        else:
            fitting_result = optimize.minimize(
                negLL_slide_win_no_bias_free_Q_0, x0,
                args=(choice_this, reward_this),
                method='L-BFGS-B',
                bounds=optimize.Bounds(fit_bounds[0], fit_bounds[1]))

        # Save parameters
        learn_rate, softmax_temperature, Q_0_L, Q_0_R = fitting_result.x
        fitted_learn_rate[:, t] = learn_rate
        fitted_sigma[:, t] = softmax_temperature
        fitted_Q_0 = np.array([Q_0_L, Q_0_R])

        # Simulate one step to get the first Q from this best fit, as the initial value for the next window
        choice_0 = choice_this[0, 0]
        Q[choice_0, t] = fitted_Q_0[choice_0] + learn_rate * (reward_this[choice_0, 0] - fitted_Q_0[choice_0])  # Chosen side
        Q[1 - choice_0, t] = fitted_Q_0[1 - choice_0]  # Unchosen side
        choice_prob[:, t] = softmax(Q[:, t], softmax_temperature)  # Choice prob (just for validation)

    return fitted_learn_rate, fitted_sigma, fitted_Q_0, Q, choice_prob
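# --- Hedged usage sketch (illustrative only): calling the sliding-window fitter on one session.
#     The random session below is a placeholder; real usage would pass the choice_history /
#     reward_history arrays produced by the bandit task above. The L-BFGS-B method and session
#     length are assumptions chosen to keep the demo fast, and the fit assumes
#     negLL_slide_win_no_bias_free_Q_0 is defined elsewhere in this module.
def _demo_sliding_window_fit(n_trials=60, seed=1):
    rng = np.random.default_rng(seed)
    choice_history = rng.integers(0, 2, size=(1, n_trials))
    reward_history = np.zeros((2, n_trials))
    reward_history[choice_history[0], range(n_trials)] = rng.integers(0, 2, size=n_trials)

    fitted_learn_rate, fitted_sigma, fitted_Q_0, Q, choice_prob = \
        fit_dynamic_learning_rate_session_no_bias_free_Q_0(
            choice_history, reward_history, slide_win=10, fixed_sigma='none', method='L-BFGS-B')
    return fitted_learn_rate, fitted_sigma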