def log_likelihood(fluo, A_log, v, noise, pi0_log, alpha, beta):
    """
    Accumulate a posterior-weighted log-likelihood score over every series.

    :param fluo: List of fluo time series
    :param A_log: Log of transition matrix, indexed [to, from]
    :param v: Emission States
    :param noise: Noise (stddev of signal)
    :param pi0_log: Log of initial state PDF
    :param alpha: List of forward matrices (one per series, indexed [state, t])
    :param beta: List of backward matrices (one per series, indexed [state, t])
    :return: Accumulated score (0 when fluo is empty)
    """
    n_states = len(v)
    total = 0
    for idx, trace in enumerate(fluo):
        fwd = alpha[idx]
        bwd = beta[idx]
        # Log likelihood of the whole sequence, read off the last forward column
        seq_logp = logsumexp(fwd[:, -1])
        for t, obs in enumerate(trace):
            # Emission term: state weight times log-likelihood of observing F(t)
            for k in range(n_states):
                total += math.exp(fwd[k, t] + bwd[k, t] - seq_logp) * log_L_fluo(
                    obs, v[k], noise)
            if t == 0:
                # Term for the sequence starting in state k
                # NOTE(review): the factor includes alpha + beta on top of
                # pi0_log -- confirm this is the intended quantity.
                for k in range(n_states):
                    total += math.exp(fwd[k, t] + bwd[k, t] - seq_logp) * (
                        pi0_log[k] + fwd[k, t] + bwd[k, t])
                continue
            # Term for a transition TO l FROM k
            # NOTE(review): the weight combines the two per-state terms and
            # subtracts seq_logp once -- confirm against the intended formula.
            for k in range(n_states):
                for l in range(n_states):
                    total += math.exp(fwd[l, t] + bwd[l, t] + fwd[k, t - 1] +
                                      bwd[k, t - 1] - seq_logp) * A_log[l, k]
    return total
def beta_alg_cp(fluo_vec, A_log, v, w, noise, pi0_log, pointers, alpha_states,
                cp_fluo, alpha_stack):
    """
    Backward pass over the truncated compound-state stack built by the
    forward (alpha) pass.

    :param fluo_vec: a single time series of fluorescence values
    :param A_log: log of the current transition matrix estimate, indexed [to, from]
    :param v: current estimate of v (not read by this function)
    :param w: system memory (not read by this function)
    :param noise: system noise
    :param pi0_log: log of initial state PDF (not read by this function)
    :param pointers: transition pointers from alpha calculation; pointers[t][l]
        lists the row positions at step t that feed row l at step t + 1
    :param alpha_states: list of states visited by alpha alg, per time step
    :param cp_fluo: fluorescence corresponding to each tracked state, per time step
    :param alpha_stack: list of final state vectors from alpha calc; only its
        length is used, to size the output
    :return: len(alpha_stack) x T array of log probabilities
    """
    T = len(fluo_vec)
    n_rows = len(alpha_stack)
    # Truncated beta array; rows never reached stay at -inf.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    beta_array = np.full((n_rows, T), -np.inf, dtype=float)
    if T == 0:
        # Nothing to initialise or iterate over
        return beta_array
    # Initialization: log(1) for every row at the final time step
    beta_array[:, -1] = 0.0
    # Walk backwards from the second-to-last time step
    for t in range(T - 2, -1, -1):
        obs_next = fluo_vec[t + 1]
        for l, state in enumerate(alpha_states[t + 1]):
            # Part of the update that does not depend on the predecessor row
            tail = beta_array[l, t + 1] + log_L_fluo(obs_next,
                                                     cp_fluo[t + 1][l], noise)
            for p in pointers[t][l]:
                beta_array[p, t] = logsumexp([
                    beta_array[p, t],
                    tail + A_log[state, alpha_states[t][p]]
                ])
    return beta_array
def cpEM_BW(fluo, A_init, v_init, noise, pi0, w, max_stack=100, max_iter=1000,
            eps=10e-4):
    """
    Iteratively re-estimate the transition matrix and emission vector: run the
    truncated forward/backward passes on every series, then update A and v.

    :param fluo: time series of fluorescent intensities (list of lists)
    :param A_init: Initial guess at the system's transition probability matrix (KxK)
    :param v_init: Initial guess at emissions vector (K)
    :param noise: Standard Deviation of fluo emissions (Taken as given at this stage)
    :param pi0: Initial state PDF (Taken as given)
    :param w: memory of system
    :param max_stack: Max num states tracked per time step (<= K^w)
    :param max_iter: Iteration cap; the loop counter starts at 1 and runs while
        it is below max_iter
    :param eps: Termination criteria--the loop stops once the absolute relative
        change in total log likelihood is no larger than eps
    :return: Tuple (A_list, v_list, logL_list). A_list holds the LOG transition
        matrix per iteration (first entry is log(A_init)), v_list the emission
        vectors (first entry is v_init) and logL_list the summed sequence log
        likelihoods (first entry is the -10e7 placeholder)
    """
    pi0_log = np.log(pi0)
    K = len(v_init)
    A_list = [np.log(A_init)]
    v_list = [v_init]
    logL_list = [-10e7]
    stack_depth = min(K**w, max_stack)
    # Relative change in likelihood between consecutive iterations
    delta = 1
    iteration = 1
    total_time = 0
    while iteration < max_iter and abs(delta) > eps:
        loop_start_time = time.time()
        v_curr = v_list[iteration - 1]
        A_log = A_list[iteration - 1]

        # ------------- Fwd Bkwd Algorithm for Each Sequence -------------
        seq_log_probs = []  # likelihood of each sequence under current params
        alpha_arrays = []
        beta_arrays = []
        pointer_list = []  # transition pointers per sequence
        state_list = []  # most recent state per tracked entry
        cp_fluo_list = []  # compound fluorescence per tracked entry
        cp_state_list = []  # state counts per tracked entry
        for fluo_vec in fluo:
            # The current emission estimate must be passed explicitly; there is
            # no local called v in this function.
            alpha_array, s_list, p_list, cf_list, Stack, F_list = alpha_alg_cp(
                fluo_vec=fluo_vec,
                A_log=A_log,
                v=v_curr,
                w=w,
                noise=noise,
                pi0_log=pi0_log,
                max_stack=stack_depth)
            beta_array = beta_alg_cp(
                fluo_vec=fluo_vec,
                A_log=A_log,
                v=v_curr,
                w=w,
                noise=noise,
                pi0_log=pi0_log,
                pointers=p_list,
                alpha_states=s_list,
                cp_fluo=cf_list,
                alpha_stack=Stack)
            # Use last column of the alpha array to get the series probability
            p_seq = logsumexp(alpha_array[:, -1])

            alpha_arrays.append(alpha_array)
            beta_arrays.append(beta_array)
            seq_log_probs.append(p_seq)
            pointer_list.append(p_list)
            state_list.append(s_list)
            cp_fluo_list.append(cf_list)
            cp_state_list.append(F_list)

        # ------------------- Calculate Updated A and v -------------------
        # Update A: collect the log weight of every tracked transition.
        # Index scheme for event ids: K * to_state + from_state
        event_list = []
        event_id = []
        for f, fluo_vec in enumerate(fluo):
            a = alpha_arrays[f]
            b = beta_arrays[f]
            ptrs = pointer_list[f]
            states = state_list[f]
            cp_fluo = cp_fluo_list[f]
            sp = seq_log_probs[f]
            for t in range(len(fluo_vec) - 1):
                for row, sources in enumerate(ptrs[t]):
                    to_state = states[t + 1][row]
                    for r in sources:
                        from_state = states[t][r]
                        event = a[r, t] + b[row, t + 1] + A_log[
                            to_state, from_state] + log_L_fluo(
                                fluo=fluo_vec[t + 1],
                                fluo_est=cp_fluo[t + 1][row],
                                noise=noise)
                        event_list.append(event - sp)
                        event_id.append(K * to_state + from_state)
        event_list = np.array(event_list)
        event_id = np.array(event_id)
        A_log_new = np.zeros((K, K))
        for k in range(K**2):
            # Floor division keeps the row index an integer on Python 2 and 3
            A_log_new[k // K, k % K] = logsumexp(event_list[event_id == k])
        # Normalise each column (fixed from-state) in log space
        A_log_new = A_log_new - logsumexp(A_log_new, axis=0)
        A_new = np.exp(A_log_new)

        # Update v: weighted least squares on the state-count vectors
        wt_full = []
        for f, fluo_vec in enumerate(fluo):
            log_post = alpha_arrays[f] + beta_arrays[f] - seq_log_probs[f]
            # Transpose so entries run time-major, matching the F counts layout
            wt_full += np.transpose(log_post).reshape(
                len(fluo_vec) * stack_depth).tolist()
        F_full = np.array(
            list(chain.from_iterable(chain.from_iterable(cp_state_list))))
        wt_full = np.exp(np.array(wt_full))
        b_full = np.repeat(list(chain.from_iterable(fluo)), stack_depth)
        F_square = np.zeros((K, K))
        b_vec = np.zeros(K)
        for k in range(K):
            F_square[k, :] = np.sum(
                F_full * F_full[:, k][:, np.newaxis] * wt_full[:, np.newaxis],
                axis=0)
            b_vec[k] = np.sum(F_full[:, k] * wt_full * b_full)
        v_new = np.linalg.solve(F_square, b_vec)

        v_list.append(v_new)
        A_list.append(A_log_new)
        logL = np.sum(seq_log_probs)
        logL_list.append(logL)

        # Check relative improvement in logL
        delta = (logL_list[iteration - 1] - logL) / logL_list[iteration - 1]
        if delta < 0:
            print("Warning: Non-monotonic behavior in likelihood")
        # Progress report for every iteration
        print(logL)
        print(abs(delta))
        print(v_new)
        print(A_new)
        loop_time = time.time() - loop_start_time
        print(loop_time)
        total_time += loop_time
        iteration += 1
    print(total_time)
    return (A_list, v_list, logL_list)
def viterbi_compound(fluo, A_log, v, noise, pi0_log, w, cp_array, to_from,
                     cp_init):
    """
    Viterbi decoding over the full set of K**w compound states.

    :param fluo: list of fluorescence time series
    :param A_log: Log of transition matrix, indexed [to, from]
    :param v: State emission vector
    :param noise: system noise
    :param pi0_log: log of initation PDF
    :param w: memory in time steps
    :param cp_array: 2-D lookup indexed [compound state, i] for i in 0..w-1;
        column 0 is read as the most recent simple state
    :param to_from: 2-D lookup; row l lists the compound states allowed to
        precede compound state l
    :param cp_init: iterable of compound states permitted at the first time step
    :return: most likely series of promoter states and compound emissions for
        each fluo vector, as the tuple
        (cp_state_fits, cp_fluo_fits, state_fits, fluo_fits, logL_list)
    """
    # Get state count
    K = len(v)
    n_cp = K**w
    # Calculate fluo values for each compound state
    cp_e_vec = np.zeros(n_cp)
    for s in range(n_cp):
        cp_e_vec[s] = np.sum(np.array([v[cp_array[s, i]] for i in range(w)]))

    # Lists to store fits
    cp_state_fits = []
    cp_fluo_fits = []
    state_fits = []
    fluo_fits = []
    logL_list = []
    for fluo_vec in fluo:
        T = len(fluo_vec)
        # State log likelihoods for each step; unreachable entries stay -inf.
        # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
        v_array = np.full((n_cp, T), -np.inf)
        # Pointers to the best predecessor of each state at each step
        p_array = np.zeros((n_cp, T - 1), dtype='int')
        for t in range(T):
            if t == 0:
                for l in cp_init:
                    v_array[l, t] = log_L_fluo(
                        fluo=fluo_vec[t], fluo_est=cp_e_vec[l],
                        noise=noise) + pi0_log[cp_array[l, 0]]
                continue
            for l in range(n_cp):
                # Convenience lookup vector to simplify indexing
                lk = to_from[l, :]
                # Most recent state in current cp state
                rc = cp_array[l, 0]
                # Kept as an ndarray so adding the emission term below does not
                # depend on log_L_fluo returning a NumPy scalar
                lookback = np.array([
                    v_array[k, t - 1] + A_log[rc, cp_array[k, 0]] for k in lk
                ])
                e_prob = log_L_fluo(
                    fluo=fluo_vec[t], fluo_est=cp_e_vec[l], noise=noise)
                # Best score for the present time point
                v_array[l, t] = np.max(e_prob + lookback)
                # Pointer to most likely previous state
                p_array[l, t - 1] = lk[np.argmax(lookback)]

        # Backtrack to find optimal path
        # Arrays to store compound sequences
        cp_v_fits = np.zeros(T, dtype='int')
        cp_f_fits = np.zeros(T)
        # Arrays to store simple sequences
        v_fits = np.zeros(T, dtype='int')
        f_fits = np.zeros(T)
        cp_v_fits[T - 1] = np.argmax(v_array[:, T - 1])
        cp_f_fits[T - 1] = cp_e_vec[cp_v_fits[T - 1]]
        v_fits[T - 1] = cp_array[cp_v_fits[T - 1], 0]
        f_fits[T - 1] = v[v_fits[T - 1]]
        for t in range(T - 2, -1, -1):
            cp_v_fits[t] = p_array[cp_v_fits[t + 1], t]
            cp_f_fits[t] = cp_e_vec[cp_v_fits[t]]
            v_fits[t] = cp_array[cp_v_fits[t], 0]
            f_fits[t] = v[v_fits[t]]

        cp_state_fits.append(cp_v_fits)
        cp_fluo_fits.append(cp_f_fits)
        state_fits.append(v_fits)
        fluo_fits.append(f_fits)
        logL_list.append(np.max(v_array[:, T - 1]))
    return (cp_state_fits, cp_fluo_fits, state_fits, fluo_fits, logL_list)
def alpha_alg_cp(fluo_vec, A_log, v, w, noise, pi0_log, max_stack):
    """
    Forward pass that tracks at most max_stack compound states per time step.

    :param fluo_vec: a single time series of fluorescence values
    :param A_log: log of the current transition matrix estimate, indexed [to, from]
    :param v: current estimate of v
    :param w: system memory
    :param noise: system noise
    :param pi0_log: log of initial state PDF
    :param max_stack: max num states tracked per step
    :return: Tuple (alpha_array, s_list, p_list, cf_list, Stack, F_list):
        alpha_array is a max_stack x T array of log probabilities (unused rows
        are -inf); s_list holds the most recently added state of each tracked
        entry (len T); p_list holds, per step, the row positions at the
        previous step feeding each entry (len T-1); cf_list holds the compound
        fluorescence of each entry (len T); Stack is the final list of tracked
        state vectors; F_list holds, per step, max_stack lists of per-state
        counts (all-zero for unused rows)
    """
    K = len(v)
    T = len(fluo_vec)
    # Transition pointers (len T-1). Points to positional id NOT state ID
    p_list = []
    # Most recently added state (len T)
    s_list = []
    # Compound fluo values (len T)
    cf_list = []
    # Truncated alpha array. np.inf is used because the np.Inf alias was
    # removed in NumPy 2.0.
    alpha_array = np.full((max_stack, T), -np.inf, dtype=float)
    # Most recent active state vectors: [s_{t}, s_{t-1}, ..., s_{t-w+1}]
    Stack = []
    # Per-step lists of state counts
    F_list = []
    for t in range(T):
        if t == 0:
            s_list.append(list(range(K)))
            cf_list.append([v[s] for s in range(K)])
            alpha_array[:K, 0] = pi0_log + np.array(
                [log_L_fluo(fluo_vec[t], v[s], noise) for s in range(K)])
            Stack = [[s] + [0] * (w - 1) for s in range(K)]
        else:
            obs = fluo_vec[t]
            # Candidate entries produced from the previous step's entries
            new_probs = []
            new_pointers = []
            new_states = []
            new_fluo = []
            new_stack = []
            for l in range(K):
                for k, state in enumerate(s_list[t - 1]):
                    sn = [l] + Stack[k][:-1]
                    if k > 0 and sn == new_stack[-1]:
                        # Same state vector as the candidate just added: merge
                        # the probability mass and remember the extra source
                        new_probs[-1] = logsumexp([
                            log_L_fluo(obs, new_fluo[-1], noise) +
                            A_log[l, state] + alpha_array[k, t - 1],
                            new_probs[-1]
                        ])
                        new_pointers[-1].append(k)
                    else:
                        if t < w:
                            # Window not yet full: nothing drops out
                            new_fluo.append(cf_list[t - 1][k] + v[l])
                        else:
                            # Oldest state leaves the window
                            new_fluo.append(cf_list[t - 1][k] + v[l] -
                                            v[Stack[k][-1]])
                        new_stack.append(sn)
                        new_states.append(l)
                        new_pointers.append([k])
                        new_probs.append(
                            log_L_fluo(obs, new_fluo[-1], noise) +
                            A_log[l, state] + alpha_array[k, t - 1])

            # Keep the max_stack most probable candidates
            new_args = np.argsort(new_probs)[-max_stack:]
            n_arg = len(new_args)
            # Re-sort survivors so entries sharing the same leading states are
            # adjacent; the duplicate check above relies on that at the next step
            stack_strings = [
                ''.join(map(str, new_stack[arg][:-1])) for arg in new_args
            ]
            sort_args = np.argsort(stack_strings)
            sorted_new_args = [new_args[i] for i in sort_args]

            # Update lists
            alpha_array[0:n_arg, t] = [new_probs[a] for a in sorted_new_args]
            p_list.append([new_pointers[a] for a in sorted_new_args])
            s_list.append([new_states[a] for a in sorted_new_args])
            cf_list.append([new_fluo[a] for a in sorted_new_args])
            Stack = [new_stack[a] for a in sorted_new_args]

        # Count how often each state occurs in every tracked state vector.
        # The template is sized by K (not a fixed 3) so every row has K entries.
        counts = [[0] * K for _ in range(max_stack)]
        for i, stk in enumerate(Stack):
            counts[i] = [stk.count(k) for k in range(K)]
        F_list.append(counts)
    return (alpha_array, s_list, p_list, cf_list, Stack, F_list)
def decode_cp(fluo, A_log, pi0_log, v, w, noise, stack_depth, alpha=0,
              log_stack=0):
    """
    Stack decoder: repeatedly extend the best-scoring partial state sequence
    until it covers the whole series.

    :param fluo: list of fluorescence vectors
    :param A_log: log of transition probability matrix, indexed [to, from]
    :param pi0_log: log of initial state PDF
    :param v: emission state vector
    :param w: memory of system
    :param noise: estimated system noise
    :param stack_depth: max num active hypotheses to keep in stack
    :param alpha: fractional num time steps needed to transcribe MS2 loops
    :param log_stack: when truthy, record the best sequence on every 10th
        extension step
    :return: Best guess at optimal path for each input fluo vec, as the tuple
        (seq_out, f_out, v_out, logL_out, stack_register)
    """
    # Get state count
    K = len(v)
    seq_out = []
    f_out = []
    v_out = []
    logL_out = []
    stack_register = []
    # Calculate convolution kernel to apply to fluorescence
    if alpha > 0:
        alpha_vec = [
            (float(i + 1) / alpha + (float(i) / alpha)) / 2.0 * (i < alpha) *
            ((i + 1) <= alpha) + (
                (alpha - i) * (1 + float(i) / alpha) / 2.0 + i + 1 - alpha) *
            (i < alpha) * (i + 1 > alpha) + 1 * (i >= alpha)
            for i in range(w)
        ]
    else:
        alpha_vec = np.array([1.0] * w)
    kernel = np.ones(w) * alpha_vec
    kernel = kernel[::-1]

    n_iter = 0
    for fluo_vec in fluo:
        T = len(fluo_vec)
        # Hypotheses are [score, sequence, fluo estimate], sorted by ascending
        # score so the best one sits at the end of the list
        Stack = []
        # Assume the series is in state 0 for the w steps before the first obs
        for l in range(K):
            ns = np.array([0] * w + [l])
            # z_{t-w+1} ... z_{t-1}, z_{t}: map states to emission values
            # before applying the kernel, as every later step does
            f_init = np.sum(np.array([v[z] for z in ns[1:]]) * kernel)
            ns_score = log_L_fluo(
                fluo=fluo_vec[0], fluo_est=f_init, noise=noise) + pi0_log[l]
            Stack.append([ns_score, ns, f_init])
        Stack.sort(key=itemgetter(0))
        # Parallel list of scores: insertion points are found on scores alone,
        # so tied scores never fall through to comparing the sequences
        scores = [hyp[0] for hyp in Stack]

        while len(Stack[-1][1]) < T + w:
            # Get current best hypothesis
            hypothesis = Stack.pop()
            scores.pop()
            h_seq = hypothesis[1]
            prev = h_seq[-1]
            t = len(h_seq) - w
            # Check possible extensions
            for l in range(K):
                new_seq = list(h_seq) + [l]
                # Only the last w states contribute to the fluorescence
                window = np.array([v[z] for z in new_seq[-w:]])
                f1 = np.sum(kernel * window)
                # Score is the running per-step average log likelihood
                ns_score = (hypothesis[0] * float(t) + log_L_fluo(
                    fluo=fluo_vec[t], fluo_est=f1, noise=noise) +
                            A_log[l, prev]) / float(t + 1)
                pos = bisect.bisect_right(scores, ns_score)
                scores.insert(pos, ns_score)
                Stack.insert(pos, [ns_score, new_seq, f1])
            # Enforce max stack length by dropping the worst hypotheses
            excess = len(Stack) - stack_depth
            if excess > 0:
                del Stack[:excess]
                del scores[:excess]
            if n_iter % 10 == 0 and log_stack:
                stack_register.append(Stack[-1][1])
            n_iter += 1

        best = Stack[-1]
        logL_out.append(T * best[0])
        seq_out.append(best[1][w:])
        v_out.append([v[i] for i in best[1][w:]])
        emissions = [v[z] for z in best[1]]
        f_cp = np.convolve(kernel[::-1], emissions, mode='full')
        # Keep the T values aligned with the observations. An explicit end
        # index is needed because -w + 1 is 0 when w == 1, which would yield
        # an empty slice.
        f_out.append(f_cp[w:w + T])
    return (seq_out, f_out, v_out, logL_out, stack_register)