def plot_unfolding_trajectory(traj, labels, PDB_files):
    """
    Plot the labels of a selected trajectory against simulation time.

    traj is forwarded to utils.get_trajectory as the selector, together with
    labels and PDB_files. The two values it returns are treated as the selected
    files and their labels (presumably in matching order; confirm against
    utils.get_trajectory).

    A new matplotlib figure is opened and the labels are drawn against the
    times reported by utils.get_times for the selected files.
    Nothing is returned.
    """
    selected_files, selected_labels = utils.get_trajectory(labels, PDB_files, traj)
    # if filter_osc==True: l=filter_oscillations(l, thresh=thresh)

    plt.figure()
    time_axis = utils.get_times(selected_files)
    plt.plot(time_axis, selected_labels)

    # NOTE(review): the distinct labels (with np.nan removed) are computed but
    # not used by anything visible in this function.
    distinct_labels = list(set(labels))
    if np.nan in distinct_labels:
        distinct_labels.remove(np.nan)
def get_unfolding_pathways(labels, PDB_files, temperatures='*', verbose=True, goal_state = -1):
    """
    Enumerate the loop-free unfolding pathways taken by the trajectories.

    Loops are ignored, so 0->1->2->1->3 becomes 0->1->3: whenever a trajectory
    returns to a state that is already on its pathway, the pathway is truncated
    back to that state.

    Unique pathways are sorted in order of decreasing flux through them.

    Parameters
    ----------
    labels, PDB_files :
        Passed to utils.get_trajectory together with `temperatures`.
    temperatures :
        Selector forwarded to utils.get_trajectory. You can specify a single
        temperature, for instance '0.925_' (that underscore is important), or
        a list of them. The default '*' is presumably "all temperatures";
        confirm against utils.get_trajectory.
    verbose :
        If True, print every unique pathway that carries at least 1% of the flux.
    goal_state :
        Only pathways that reach the desired state are kept. With the default
        goal_state = -1, a pathway is kept when the trajectory *ends* in the
        largest label present (taken to be the fully unfolded state).
        Otherwise a pathway is kept when `goal_state` appears anywhere in it.

    Returns
    -------
    pathways : list of lists, one kept pathway per trajectory
    unique_pathways : list of lists, distinct pathways by decreasing flux
    fracs : numpy array, fraction of kept pathways following each unique pathway

    A trajectory is the run of snapshots from one time-zero snapshot up to the
    next one. The final trajectory runs to the end of the data and is included.
    """
    from collections import Counter

    PDB_files, labels = utils.get_trajectory(labels, PDB_files, temperatures)
    times = np.array(utils.get_times(PDB_files))
    zero_times = np.where(times == 0)[0]

    # Trajectory i occupies labels[bounds[i]:bounds[i+1]]. Appending len(labels)
    # closes the final trajectory, which would otherwise be silently dropped.
    bounds = list(zero_times) + [len(labels)]

    # Only needed for the default goal; computed once rather than per trajectory.
    if goal_state == -1 and len(labels):
        max_label = np.max(labels)
    else:
        max_label = None

    pathways = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        traj = labels[start:stop]
        pathway = []
        for step, state in enumerate(traj):
            if step == 0:
                # always include the initial state
                pathway.append(state)
            elif state != traj[step - 1]:
                # a change has occurred in the trajectory
                if state in pathway:
                    # revisiting a state closes a loop: cut the loop out
                    pathway = pathway[:pathway.index(state) + 1]
                else:
                    pathway.append(state)

        final_state = traj[-1]
        # only keep track of pathways that reach the goal
        if goal_state == -1 and final_state == max_label:
            pathways.append(pathway)
        elif goal_state in pathway:
            pathways.append(pathway)

    # What fraction of flux goes through each unique pathway?
    # Counter keeps first-seen order, matching a manual "append if new" scan.
    tally = Counter(tuple(path) for path in pathways)
    unique_pathways = [list(path) for path in tally]
    fracs = [count / len(pathways) for count in tally.values()]

    # sort in order of decreasing flux
    order = np.argsort(-np.array(fracs))
    fracs = np.array(fracs)[order]
    unique_pathways = [unique_pathways[o] for o in order]

    if verbose:
        print ('Pathway \t \t fraction of flux')
        for p, path in enumerate(unique_pathways):
            if fracs[p] >= 0.01:
                print('{} \t \t {} \n'.format(path, np.round(fracs[p], 3)))

    return pathways, unique_pathways, fracs
def get_unfolding_transitions(labels, PDB_files, temp='*', verbose=True, min_to_show=0):
    """
    Find unfolding transitions (ex. (0,1)) and rank them by abundance.

    An unfolding transition is a pair of consecutive snapshots within one
    trajectory where the label strictly increases.

    Parameters
    ----------
    labels, PDB_files :
        Passed to utils.get_trajectory together with the temperature selector.
    temp :
        A string is forwarded unchanged, so wildcard notation can be used.
        A number is converted to text, right-padded with zeros to five
        characters, and given a trailing underscore (0.9 -> '0.900_'),
        which is the form utils.get_trajectory is given here.
    verbose :
        If True, print the transitions in order of decreasing count.
    min_to_show :
        A transition is printed only if it occurs strictly more than this
        many times.

    Returns
    -------
    dict mapping each (from_label, to_label) tuple to its number of
    occurrences, in order of first appearance.

    A trajectory is the run of snapshots from one time-zero snapshot up to the
    next one. The final trajectory runs to the end of the data and is included.
    """
    from collections import Counter

    if isinstance(temp, int):
        temp = float(temp)
    if not isinstance(temp, str):
        # convert to a format that get_trajectory can work with
        temp = str(temp)
        while len(temp) < 5:
            temp = '{}0'.format(temp)
        # need this underscore at the end for get_trajectory to be happy
        temp = '{}_'.format(temp)

    PDB_files, labels = utils.get_trajectory(labels, PDB_files, temp)
    times = np.array(utils.get_times(PDB_files))
    zero_times = np.where(times == 0)[0]

    # Trajectory i occupies labels[bounds[i]:bounds[i+1]]. Appending len(labels)
    # closes the final trajectory, which would otherwise be silently dropped.
    bounds = list(zero_times) + [len(labels)]

    transitions = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        traj = labels[start:stop]
        for before, after in zip(traj[:-1], traj[1:]):
            if after > before:
                # an unfolding transition has occurred
                transitions.append((before, after))

    # Counter preserves first-seen order, so `dic` lists transitions in the
    # order they were first encountered.
    dic = dict(Counter(transitions))
    unique_transitions = list(dic)
    counts = list(dic.values())

    # sort in order of decreasing frequency
    order = np.argsort(-np.array(counts))
    counts = np.array(counts)[order]
    unique_transitions = [unique_transitions[o] for o in order]

    if verbose:
        print ('Transition \t \t number ofinstances')
        for p, path in enumerate(unique_transitions):
            if counts[p] > min_to_show:
                print('{} \t \t {} \n'.format(path, counts[p]))

    return dic
def predict_unfolding_at_temperature(temp, data, PDB_files):
    """
    Collect the labels of all trajectories at one temperature into a 2D array.

    Note: the assumption is that at a given temperature all trajectories have
    the same number of snapshots, taken at the same times. The length of the
    first trajectory is used for every row.

    NOTE(review): an earlier version of this docstring described a filter
    option ('First commit' / 'Last commit' / 'Filter osc'). This function takes
    no such argument and applies no filtering.

    Parameters
    ----------
    temp :
        Temperature. It is converted to text, given a trailing '.' if it is a
        single character, and right-padded with zeros to five characters
        (0.8 -> '0.800'). That text plus an underscore is used both as the
        selector given to utils.get_trajectory and as the marker after which
        the trajectory number is read from each file name.
    data, PDB_files :
        Passed to utils.get_trajectory.

    Returns
    -------
    times : utils.get_times evaluated on the files of the first trajectory
    sim_labels : float array of shape (n_trajectories, traj_len), one row per
        trajectory

    Raises IndexError if no files are selected, or if a file name does not
    contain the temperature marker.
    """
    temp = str(temp)
    if len(temp) == 1:
        temp = '{}.'.format(temp)
    while len(temp) < 5:
        # add zeros so that the temperature has five characters
        temp = '{}0'.format(temp)
    tag = '{}_'.format(temp)

    f, trajectories = utils.get_trajectory(data, PDB_files, tag)

    def _trajectory_number(path):
        # text between the temperature marker and the next '.'
        return path.split(tag)[1].split('.')[0]

    # Count the leading files that share the first file's trajectory number to
    # find how long each trajectory is. The scan is bounded by len(f), so a
    # temperature with a single trajectory no longer runs off the end.
    first_number = _trajectory_number(f[0])
    traj_len = 1
    while traj_len < len(f) and _trajectory_number(f[traj_len]) == first_number:
        traj_len += 1

    n_trajectories = len(f) // traj_len
    sim_labels = np.zeros((n_trajectories, traj_len))
    times = utils.get_times(f[0:traj_len])

    for n in range(n_trajectories):
        first = n * traj_len
        sim_labels[n, :] = trajectories[first:first + traj_len]

    return times, sim_labels
def runHMM(score_path, f,s, m,starting_states, plot = True):
    """
    Fit all unfolding trajectories to an HMM and compute pairwise kinetic
    distances between states.

    Parameters are as follows:
    score_path: Path to unfolding simulation substructure scores
    f: thresh for defining whether a substructure is formed. Should be the same
       value as when computing the substructure PMF
    s: The Sanders thresh, the minimum representation a topological
       configuration should have among unfolding snapshots for it to be kept
       (passed to Bernie_elimination)
    m: Misassignment probability used in the HMM. Each state emits its own
       label with probability 1 - m and every other label with probability
       m / (Nlabels - 1)
    starting_states: iterable of topological configurations of the starting
       states, each written with letters (ex. 'abc'). Letter number n of the
       alphabet marks substructure n as formed. Every listed state gets equal
       starting probability
    plot: if True, call plot_distance_map on the distance matrix

    Returns unique_labels, distance, S, key, PDB_files, where S is the HMM's
    predicted state sequence and key maps label numbers to configurations.

    Only the transition matrix is trained (params='t'); emission and starting
    probabilities are fixed to the values constructed here.
    """
    print('Loading scores...')
    scores, PDB_files, _substructures = load_data.load_scores(score_path, f, convert_to_binary=True )
    labels, key = Bernie_elimination(scores, PDB_files, s, plot=False)

    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    NN = len(key[0])
    key_array = np.array(key)

    # Translate each letter-coded starting state into a binary string and look
    # up its label number in key.
    start_states = []
    for starting_state in starting_states:
        ss = ''.join('1' if alphabet[n] in starting_state else '0' for n in range(NN))
        start_states.append(np.where(key_array == ss)[0][0])
    start_states = np.array(start_states)

    times = np.array(utils.get_times(PDB_files))
    zero_times = np.where(times == 0)[0]
    n_starts = len(zero_times)
    #print('Zero times: {}'.format(zero_times))

    # One length per trajectory. The last trajectory runs to the end of the
    # data, so its length is derived from the data rather than assumed; the
    # lengths then always sum to the number of samples handed to the HMM.
    lengths = [zero_times[t + 1] - zero_times[t] for t in range(n_starts - 1)]
    last_start = zero_times[-1] if n_starts else 0
    lengths.append(len(labels) - last_start)
    lengths = np.array(lengths)

    # Apply utils.filter_nans to each trajectory separately.
    for i, start in enumerate(zero_times):
        stop = zero_times[i + 1] if i + 1 < n_starts else None
        labels[start:stop] = utils.filter_nans(labels[start:stop])

    unique_labels = np.unique(labels)
    Nlabels = len(unique_labels)

    P0_val = 1 / len(start_states)
    start_prob = np.zeros(Nlabels)
    start_prob[start_states] = P0_val
    print('The start_prob array is {} \n'.format(start_prob))

    # Prepare the emission probabilities
    emit = ((m) / (Nlabels - 1)) * np.ones((Nlabels, Nlabels))
    emit[np.diag_indices(Nlabels)] = 1 - m
    #start_prob[starting_state]=1

    # Train the HMM
    print('Training HMM...')
    labels = np.array(labels).reshape(-1, 1)
    # NOTE(review): newer hmmlearn releases appear to have changed what
    # MultinomialHMM models and to offer CategoricalHMM for categorical
    # emissions. Confirm against the installed hmmlearn version.
    HMM = hmmlearn.hmm.MultinomialHMM(n_components = Nlabels, params='t', init_params='t')
    HMM.emissionprob_ = emit
    HMM.startprob_ = start_prob
    HMM.fit(labels, lengths=lengths)
    S = HMM.predict(labels, lengths=lengths)
    transmat = HMM.transmat_

    # Construct our distance matrix
    print('Computing kinetic distances...')
    # Number of snapshots observed with each label, counted once up front.
    # Labels are compared against 0..Nlabels-1.
    occupancy = [int(np.count_nonzero(labels == i)) for i in range(Nlabels)]

    distance = np.nan * np.zeros((Nlabels, Nlabels))
    for i in range(Nlabels):
        for j in range(Nlabels):
            P_i_given_ij = occupancy[i] / (occupancy[i] + occupancy[j])
            P_j_given_ij = 1 - P_i_given_ij
            distance[i, j] = P_i_given_ij / transmat[i, j] + P_j_given_ij / transmat[j, i]
            #distance[i,j]= (transmat[i,j]+transmat[i,i]) *P_i_given_ij/transmat[i,j] + (transmat[j,i]+transmat[j,j]) *P_j_given_ij/transmat[j,i]

    distance[np.diag_indices(Nlabels)] = 0  # should be zero, but sometimes isn't due to oscillations

    if plot:
        plot_distance_map(distance, key)
    return unique_labels, distance, S, key, PDB_files


#
#
#def Compute_LL(S, clusters, key, PDB_files):
#    """
#    Computes a log likelihood for a given clustering as follows:
#    For every cluster (ex. [101, 111]), assigns emission probabilities in accordance with how often
#    a given configuration appears within the cluster
#
#    Also computes a transition matrix based on how often you see transitions between each pair of clusters
#    The total log likelihood is the sum of the log probabilities of all observed transitions and emissions
#
#    Vector S tells you your structure assignment at each timepoint, in numbers (which are converted to configurations via the key)
#
#    Not using this at the moment
#    """
#    N=len(key)
#    M=len(clusters)
#
#    if type(clusters[0][0])==str: #converts clusters to a list of lists of numbers, ex. ([[0,1], [2], [3,4]])
#        C=[]
#        for clust in clusters:
#            c=[]
#            for s in clust:
#                i = key.index(s)
#                c.append(i)
#            C.append(c)
#    else:
#        C=clusters
#    X=[] #recreates X, which tells you your cluster assignment at each timepoint
#    for s in S:
#        clust=[c for c in range(len(C)) if s in C[c]][0]
#        X.append(clust)
#    X=np.array(X)
#    Emissions = np.zeros(N) #emissions will be a row vector whose ith entry corresponds to the probability of observing state i given that you are in the cluster to which i has been assigned
#
#    for c, clust in enumerate(C):
#        norm = len(np.where(X==c)[0]) #how many times is cluster observed?
#        for s in clust:
#            count = len(np.where(S==s)[0])
#            Emissions[s]=count/norm
#
#    E_score = np.sum([np.log(Emissions[s]) for s in S]) #score contribution due to emissions
#
#    counts=np.zeros((M, M))
#    norm=np.zeros((M, M))
#
#    #reshape trajectories so you have one list per trajectory that starts at time 0
#    times=np.array(utils.get_times(PDB_files))
#    zero_times=np.where(times==0)[0]
#    sim_labels=[]
#
#    for i, t in enumerate(zero_times):
#        if i!=len(zero_times)-1:
#            sim_labels.append(X[t:zero_times[i+1]])
#        else:
#            sim_labels.append(X[t:])
#
#
#    for traj in sim_labels:
#        for t, l in enumerate(traj):
#            if t<len(traj)-1:
#                norm[int(l),:]+=1
#                counts[int(l), int(traj[t+1])]+=1
#    transmat=np.divide(counts, norm)
#
#    T_score=0 #score due to transitions
#
#    for traj in sim_labels:
#        for t in range(len(traj)-1):
#            transprob = transmat[X[t], X[t+1]]
#            T_score+=(np.log(transprob))
#    return T_score + E_score
def Bernie_elimination(Assignments, PDB_files, Sanders_thresh, plot=True, cumulative=False, keep_zero_times=True):
    """
    Eliminate assignments that do not represent more than some fraction of the
    population, given by Sanders_thresh, and relabel the eliminated snapshots.

    e.g. Sanders_thresh may be The 1 Percent (i.e. Sanders_thresh=0.01)

    Parameters
    ----------
    Assignments :
        Sequence with one configuration per snapshot. Each configuration is a
        string of digits (ex. '101'), one digit per substructure.
    PDB_files :
        Passed to utils.get_times to obtain the time of each snapshot.
    Sanders_thresh :
        Minimum score a configuration needs in order to be accepted.
    plot :
        If True, call utils.histogram(labels, key) at the end.
    cumulative :
        If False, a configuration's score is its fraction of all snapshots.
        If True, configurations are sorted by decreasing representation and the
        score is 1 minus the cumulative fraction up to and including that
        configuration, so we keep however many we need to cumulatively
        represent a fraction 1 - Sanders_thresh of the population. Ties are
        ordered by first appearance in Assignments.
    keep_zero_times :
        If True, configurations that occur at time zero (the beginning of a
        simulation) are never eliminated.

    Returns
    -------
    labels : list with one integer label per snapshot
    key : list of accepted configurations; key[label] is the configuration

    Accepted configurations are numbered by decreasing digit sum (number of
    formed substructures), with natsort ordering among equal sums.

    A rejected snapshot takes the label of the nearest accepted snapshot
    before it or after it in the list, whichever configuration differs from it
    in fewer positions (ties go to the earlier one). If accepted snapshots
    exist on only one side, that side is used.
    NOTE: this search runs over the whole list and takes no account of where
    one trajectory ends and the next begins.

    Raises ValueError if snapshots were rejected but none were accepted.
    """
    from collections import Counter

    rejected_tag = 'BustBigBank'  # placeholder label for rejected configurations

    if Sanders_thresh == 0 and cumulative:
        # to avoid rounding errors when summing the cumulative fraction
        Sanders_thresh = -0.1

    # Representation of every distinct configuration, in first-seen order.
    population = Counter(Assignments)
    unique = list(population)
    n_total = len(Assignments)
    rep = [population[u] / n_total for u in unique]

    # configurations that occur at the beginning of simulations
    times = utils.get_times(PDB_files)
    zero_points = {a for t, a in enumerate(Assignments) if times[t] == 0}

    # We wipe out clusters that represent too small a fraction of the
    # population. Bernie_dic holds, per configuration, either its fraction of
    # the population or 1 minus the cumulative fraction (see docstring).
    if cumulative:
        order = np.argsort(-np.array(rep), kind='stable')
        cum_rep = np.cumsum([rep[o] for o in order])
        Bernie_dic = {unique[o]: 1 - c for o, c in zip(order, cum_rep)}
    else:
        Bernie_dic = dict(zip(unique, rep))

    # Determine which configurations we accept or reject
    Accepted = [
        u for u in unique
        if Bernie_dic[u] >= Sanders_thresh or (keep_zero_times and u in zero_points)
    ]
    accepted_set = set(Accepted)
    BankstoBust = [u for u in unique if u not in accepted_set]

    # We assign a numerical label to all configurations that are accepted.
    # The sort key is the (negative of the) number of formed substructures.
    sums = [-sum(int(digit) for digit in struct) for struct in Accepted]
    ranked = natsort.natsorted(zip(sums, Accepted))
    key = [struct for _, struct in ranked]
    dic = {struct: i for i, struct in enumerate(key)}  # configuration -> label
    for u in BankstoBust:
        dic[u] = rejected_tag  # for now, rejected configurations get no label

    labels = [dic[a] for a in Assignments]
    rejected_indices = [i for i, lab in enumerate(labels) if lab == rejected_tag]
    accepted_indices = np.array(
        [i for i, lab in enumerate(labels) if lab != rejected_tag], dtype=int
    )

    if rejected_indices and len(accepted_indices) == 0:
        raise ValueError('No assignments were accepted, so rejected snapshots cannot be relabeled')

    def _n_differences(a, b):
        # number of positions at which two digit strings differ
        return sum(abs(int(x) - int(y)) for x, y in zip(a, b))

    # Now we fix rejected snapshots. accepted_indices is ascending, so a binary
    # search finds the accepted neighbours on either side of each rejected one.
    for r in rejected_indices:
        pos = int(np.searchsorted(accepted_indices, r))
        prev_struct = Assignments[accepted_indices[pos - 1]] if pos > 0 else None
        next_struct = Assignments[accepted_indices[pos]] if pos < len(accepted_indices) else None

        if next_struct is None:
            # nothing got accepted after this one, so go with the previous one
            chosen = prev_struct
        elif prev_struct is None:
            # nothing got accepted before this one, so go with the next one
            chosen = next_struct
        else:
            # How different is this configuration from the previous and the
            # next accepted configuration?
            curr_struct = Assignments[r]
            d_to_prev = _n_differences(curr_struct, prev_struct)
            d_to_next = _n_differences(curr_struct, next_struct)
            chosen = prev_struct if d_to_prev <= d_to_next else next_struct
        labels[r] = dic[chosen]

    if plot:
        utils.histogram(labels, key)

    return labels, key