Example #1
def plot_unfolding_trajectory(traj, labels, PDB_files):
    """Plot the cluster labels of a single trajectory against simulation time."""
    f, l = utils.get_trajectory(labels, PDB_files, traj)
    #if filter_osc==True: l=filter_oscillations(l, thresh=thresh)
    plt.figure()
    plt.plot(utils.get_times(f), l)
    
    unique=list(set(labels))
    if np.nan in unique: unique.remove(np.nan)
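A hedged usage sketch: labels and PDB_files are assumed to come from an upstream clustering step, and the trajectory tag '0.925_0' is a hypothetical example consistent with the temperature strings used in the later examples.

#Hypothetical usage: plot the label trajectory for one run
plot_unfolding_trajectory('0.925_0', labels, PDB_files)
plt.show()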
Example #2
def get_unfolding_pathways(labels, PDB_files, temperatures='*', verbose=True, goal_state = -1):
    """
    What are all the unfolding pathways that are taken?
    We ignore loops, so 0->1->2->1->3 becomes 0->1->3
    Sorts unfolding pathways in order of decreasing flux through them.
    Can choose to specify only one temperature, for instnace, '0.925_', (that underscore is important) or a list of them
    
    We only keep pathways that reach a desired unfolded state
    By default, this is the fully unfodled state, and so we indicate this with goal_state = -1
    But if you care about pathways that reach some other state, just specify this other state with goal_state keyword
    """
    
    PDB_files, labels=utils.get_trajectory(labels, PDB_files, temperatures)
    
    times=np.array(utils.get_times(PDB_files))
    zero_times=np.where(times==0)[0]
    pathways=[]
    for n, t in enumerate(zero_times[:-1]):
        traj=labels[t:zero_times[n+1]]
        pathway=[]
        for s,p in enumerate(traj):
            if s==0: #always include the initial time
                pathway.append(p)
            else:
                if p!=traj[s-1]: #a change has occurred in the trajectory
                    if p in pathway:
                        ind = pathway.index(p)
                        pathway=pathway[0:ind+1]
                    else:
                        pathway.append(p)
                
        if goal_state ==-1 and p==np.max(labels): #only keep track of pathways that fully unfold
            pathways.append(pathway)
        elif goal_state in pathway:
            pathways.append(pathway)
            
    unique_pathways=[]
    fracs=[]  #what fraction of flux goes through each unique pathway?
    for p in pathways:
        if p not in unique_pathways:
            unique_pathways.append(p)
            fracs.append(len([P for P in pathways if P==p])/len(pathways))
    #sort in order of decreasing flux
    order=np.argsort(-np.array(fracs))
    fracs=np.array(fracs)[order]
    unique_pathways=[unique_pathways[o] for o in order]    

    if verbose:
        print('Pathway \t \t fraction of flux')
        for p, path in enumerate(unique_pathways):
            if fracs[p]>=0.01: print('{} \t \t {} \n'.format(path, np.round(fracs[p], 3)))
    return pathways, unique_pathways, fracs 
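A minimal, self-contained sketch of the loop-pruning rule described in the docstring, on a toy trajectory (no dependence on utils):

#Toy demonstration of the loop-pruning rule: 0->1->2->1->3 becomes 0->1->3
traj = [0, 1, 2, 1, 3]
pathway = []
for s, p in enumerate(traj):
    if s == 0 or p != traj[s - 1]:  #keep the initial state and every change point
        if p in pathway:  #revisiting a state closes a loop, so cut back to its first visit
            pathway = pathway[:pathway.index(p) + 1]
        else:
            pathway.append(p)
print(pathway)  # [0, 1, 3]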
Example #3
def get_unfolding_transitions(labels, PDB_files, temp='*', verbose=True, min_to_show=0):
    """
    FInds unfolding transitions (ex. (0,1)) and ranks by abundance
    Can choose to specify only one temperature or multiple using wildcard notation
    
    min_to_show tells you the minimum number of instances a transition soudl occur if you want it to be printed by this function
    """
    if type(temp)==int: temp=float(temp)
    if type(temp)!=str: #convert to a format that get_trajectory can work with
        temp=str(temp)
        while len(temp)<5: temp = '{}0'.format(temp)
        temp='{}_'.format(temp) #need this underscore at the end for get_trajectory to be happy
    
    PDB_files, labels=utils.get_trajectory(labels, PDB_files, temp)
    
    times=np.array(utils.get_times(PDB_files))
    zero_times=np.where(times==0)[0]
    transitions=[]
    
    for n, tau in enumerate(zero_times[:-1]):
        traj=labels[tau:zero_times[n+1]]
        for t in range(len(traj)-1):
            if traj[t+1]>traj[t]: #an unfolding transition has occurred
                transitions.append((traj[t], traj[t+1]))
    
    unique_transitions=[]
    counts=[]  #number of instances of each unique transition
    
    dic = {}
    for p in transitions:
        if p not in unique_transitions:
            unique_transitions.append(p)
            counts.append(len([P for P in transitions if P==p]))
            dic[p] =  counts[-1]
            
    #sort in order of decreasing frequency
    order=np.argsort(-np.array(counts))
    counts=np.array(counts)[order]
    unique_transitions=[unique_transitions[o] for o in order]    

    if verbose:
        print('Transition \t \t number of instances')
        for p, path in enumerate(unique_transitions):
            if counts[p]>min_to_show: print('{} \t \t {} \n'.format(path, counts[p]))
    
    
    return dic
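A quick standalone check of the temperature formatting performed above (assuming a float input like 0.9):

#Pad the temperature to five characters, then append the underscore that
#get_trajectory expects
temp = str(0.9)                  # '0.9'
while len(temp) < 5:
    temp = '{}0'.format(temp)    # '0.90', then '0.900'
temp = '{}_'.format(temp)
print(temp)                      # '0.900_'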
Example #4
def predict_unfolding_at_temperature(temp, data, PDB_files):
    """
    Function to predict lables for all trajectoires at a given temperature
    Note: The assumption is that at a given temperature, all snapshots are at the same times
    
    Filter should be 'First commit' or 'Last commit' or 'Filter osc' as described in ClusterPCA
    You can also enter None (or anything else besides the options above) in whcih case no filtering is applied
    """
    
    temp=str(temp)
    
    if len(temp)==1:
        temp='{}.'.format(temp)
    while len(temp)<5:  #add zeros so that the temperature is of the form 0.80
        temp='{}0'.format(temp)
    f, trajectories = utils.get_trajectory(data, PDB_files, '{}_'.format(temp) )
    #need to figure out how long all the trajectories are
    #to do this, iterate through the first files until the trajectory number changes
    
    go=True
    i=0
    traj_nums=[]
    while go:
        file=f[i]
        file=file.split('{}_'.format(temp))
        suffix=file[1]
        traj_num=suffix.split('.')[0]
        traj_nums.append(traj_num)
        if traj_nums[i]!=traj_nums[i-1]: #at i=0 this compares the first entry to itself, so the loop continues
            go=False
        else:
            i+=1
    
    traj_len=i
    n_trajectories=int(len(f)/traj_len)
    sim_labels=np.zeros((n_trajectories, traj_len))
    
    times=utils.get_times(f[0:traj_len])
    for n in range(n_trajectories):  
        traj=trajectories[n*traj_len:n*traj_len+traj_len]  
        sim_labels[n,:]=traj
        
    return times, sim_labels
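A small sketch of the length-inference idea above, with hypothetical file names; the '<temp>_<traj>.<time>' naming pattern is an assumption based on the parsing in the function:

#Hypothetical file names of the form '<temp>_<traj>.<time>'
f = ['0.800_0.0', '0.800_0.100', '0.800_0.200',
     '0.800_1.0', '0.800_1.100', '0.800_1.200']

def traj_num(name):
    return name.split('0.800_')[1].split('.')[0]

i = 1
while traj_num(f[i]) == traj_num(f[0]):  #scan until the trajectory number changes
    i += 1
print(i)  # 3 snapshots per trajectory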
Example #5
def runHMM(score_path, f, s, m, starting_states, plot = True):
    """
    Parameters are as follows:
        score_path: Path to unfolding simulation substructure scores
        f: thresh for defining whether a substructure is formed. Should be the same value as when computing the substructure PMF
        s: The Sanders thresh, the minimum representation a topological configuration should have among unfolding snapshots for it to be kept
        m: Misassignment probability used in the HMM
        starting_states: topological configurations of the starting states
    
    By the end, all trajectories are fit to the HMM, and we can generate a plot of pairwise kinetic distances between states, if we wish
    """    
    print('Loading scores...')
    scores, PDB_files, Substructures=load_data.load_scores(score_path, f, convert_to_binary=True )
    
    
    labels, key=Bernie_elimination(scores, PDB_files, s, plot=False)
    
    
    alphabet = 'abcdefghijklmnopqrstuvwxyz' #passed kindergarten
    NN = len(key[0]) 
    
    start_states=[]
    
    for starting_state in starting_states:
        #encode the configuration as a binary string: '1' if the substructure is formed, '0' otherwise
        ss = ''.join('1' if alphabet[n] in starting_state else '0' for n in range(NN))
        start_states.append(np.where(np.array(key)==ss)[0][0])
    
    start_states = np.array(start_states)
    
    times=np.array(utils.get_times(PDB_files))
    zero_times=np.where(times==0)[0]
    
    #print('Zero times: {}'.format(zero_times))
    
    lengths=[zero_times[t+1]-zero_times[t] for t in range(len(zero_times)-1)]
    lengths.append(len(labels)-zero_times[-1]) #need to include the last trajectory, which runs to the end of the data
    lengths=np.array(lengths)
    
    for i, t in enumerate(zero_times):
        if t==zero_times[-1]:
            labels[t:]=utils.filter_nans(labels[t:])
        else:
            nextt=zero_times[i+1]
            labels[t:nextt]=utils.filter_nans(labels[t:nextt])
    
    unique_labels=np.unique(labels)
    Nlabels=len(unique_labels)
    
    
    P0_val=1/len(start_states)
    
    start_prob=np.zeros(Nlabels)    
    start_prob[start_states]=P0_val
    
    print('The start_prob array is {} \n'.format(start_prob))
    
    #Prepare the emission and starting probabilities 
    
    emit=((m)/(Nlabels-1) )*np.ones(( Nlabels, Nlabels))
    emit[np.diag_indices(Nlabels)]=1 - m
    
    #start_prob[starting_state]=1
    
    
    #Train the HMM !!!
    
    print('Training HMM...')
    labels=np.array(labels).reshape(-1,1)
    
    HMM = hmmlearn.hmm.MultinomialHMM(n_components = Nlabels, params='t', init_params='t')
    HMM.emissionprob_=emit
    HMM.startprob_=start_prob
    HMM.fit(labels, lengths=lengths)
    S=HMM.predict(labels, lengths=lengths)
    score=HMM.score(labels, lengths=lengths)
    transmat=HMM.transmat_
    
    
    #Construct our distance matrix and do loop clustering
    
    print('Computing kinetic distances...')
    distance=np.nan*np.zeros((Nlabels,Nlabels))
    for i in range(Nlabels):
        for j in range(Nlabels):
            P_i_given_ij=len(np.where(labels==i)[0])/(len(np.where(labels==i)[0])+len(np.where(labels==j)[0]))
            P_j_given_ij=1-P_i_given_ij
            
            distance[i,j]=  P_i_given_ij/transmat[i,j] + P_j_given_ij/transmat[j,i]
            #distance[i,j]=  (transmat[i,j]+transmat[i,i]) *P_i_given_ij/transmat[i,j] + (transmat[j,i]+transmat[j,j]) *P_j_given_ij/transmat[j,i]
            
    distance[np.diag_indices(Nlabels)]=0  #should be zero, but sometimes isn't due to oscillations
    
    if plot: plot_distance_map(distance, key)
    
    return unique_labels, distance, S, key, PDB_files
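A worked example of the emission matrix constructed above; Nlabels = 3 and m = 0.1 are arbitrary values chosen for illustration. Each state is observed correctly with probability 1 - m, and misread as any one of the other labels with probability m/(Nlabels - 1):

import numpy as np

m, Nlabels = 0.1, 3  #arbitrary values for illustration
emit = (m/(Nlabels-1))*np.ones((Nlabels, Nlabels))
emit[np.diag_indices(Nlabels)] = 1 - m
print(emit)
# [[0.9  0.05 0.05]
#  [0.05 0.9  0.05]
#  [0.05 0.05 0.9 ]]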

#    
#
#def Compute_LL(S, clusters, key, PDB_files):
#    """
#    Computes a log likelihood for a given clustering as follows:
#    For every cluster (ex. [101, 111]), assigns emission probabilities in accordance with how often
#    a given configuration appears within the cluster
#    
#    Also computes a transition matrix based on how often you see transitions between each pair of clusters
#    The total log likelihood is the sum of the log probabilities of all observed transitions and emissions
#    
#    Vector S tells you your structure assignment at each timepoint, in numbers (which are converted to configurations via the key)
#    
#    Not using this at the moment
#    """
#    N=len(key)
#    M=len(clusters)
#
#    if type(clusters[0][0])==str:  #converts clusters to a list of  lists of numbers, ex. ([[0,1], [2], [3,4]])
#        C=[]
#        for clust in clusters:
#            c=[]
#            for s in clust: 
#                i = key.index(s)
#                c.append(i)
#            C.append(c)
#    else:
#        C=clusters
#    X=[]  #recreates X, which tells you your cluster assignment at each timepoint
#    for s in S:
#        clust=[c for c in range(len(C)) if s in C[c]][0]
#        X.append(clust)  
#    X=np.array(X)
#    Emissions = np.zeros(N)  #emissions will be a row vector whose ith entry corresponds to the probability of observing state i given that you are in the cluster to which i has been assigned
#    
#    for c, clust in enumerate(C):
#        norm = len(np.where(X==c)[0]) #how many times is cluster observed?
#        for s in clust:
#            count = len(np.where(S==s)[0])
#            Emissions[s]=count/norm
#    
#    E_score = np.sum([np.log(Emissions[s]) for s in S]) #score contribution due to emissions
#
#    counts=np.zeros((M, M))
#    norm=np.zeros((M, M))
#
#    #reshape trajectories so you have one list per trajectory that starts at time 0
#    times=np.array(utils.get_times(PDB_files))
#    zero_times=np.where(times==0)[0]
#    sim_labels=[]
#    
#    for i, t in enumerate(zero_times):
#        if i!=len(zero_times)-1:
#            sim_labels.append(X[t:zero_times[i+1]])
#        else:
#            sim_labels.append(X[t:])
#    
#            
#    for traj in sim_labels:
#        for t, l in enumerate(traj):
#            if t<len(traj)-1:
#                norm[int(l),:]+=1
#                counts[int(l), int(traj[t+1])]+=1
#    transmat=np.divide(counts, norm)
#    
#    T_score=0  #score due to transitions
#    
#    for traj in sim_labels:
#        for t in range(len(traj)-1):
#            transprob = transmat[int(traj[t]), int(traj[t+1])]  #index with the position within this trajectory, not the global timeline
#            T_score+=(np.log(transprob))
#    return T_score + E_score     
Example #6
def Bernie_elimination(Assignments, PDB_files, Sanders_thresh, plot=True, cumulative=False, keep_zero_times=True):
    """
    Eliminate assignments that do not represent more than some fraction of the population given by Sanders_thresh
    e.g. Sanders_thresh may be The 1 Percent (i.e.Sanders_thresh=0.01)
    
    Can also do this in a cumulative fashion, so that we sort unique assignemnts by representation, and keep however many we need 
    to cumulatively represent fraction of the population 1 - Sanders_thresh
    
    if keep_zero_times=True, then we will never eliminate assignments that occur at the beginning of a simulation
    """     
    if Sanders_thresh==0 and cumulative:
        Sanders_thresh=-0.1  #to avoid rounding errors when summing the cumulative fraction

    unique=list(set(Assignments))
    times=utils.get_times(PDB_files)
    zero_points=list(set([a for t,a in enumerate(Assignments) if times[t]==0]))  #points that occur at beginning of simulations
    rep=[]
    for u in unique:
        rep.append(len([i for i in Assignments if i==u])/len(Assignments))
    
    
    #We wipe out clusters that represent too small a fraction of the population. We do this in one of two fashions
    
    Bernie_dic={} #maps each cluster to the fraction of the population it represents, OR, in cumulative mode,
    #to 1 minus the cumulative fraction accounted for by clusters up to that one when the clusters are sorted in order
    if cumulative:
        sorted_indices=np.argsort(-np.array(rep))
        sorted_points=[unique[s] for s in sorted_indices]
        sorted_rep=[rep[s] for s in sorted_indices]
        cum_rep=[np.sum(sorted_rep[0:i+1]) for i in range(len(unique))]
        
        for i, u in enumerate(sorted_points):
            Bernie_dic[u] = 1 - cum_rep[i]
    else:
        for i,u in enumerate(unique):
            Bernie_dic[u]=rep[i]
    
    #Determine which points we accept or reject
    if keep_zero_times: #Accept points with sufficient representation, AND points that occur at beginnings of simulations
        Accepted=[u for u in unique if Bernie_dic[u]>=Sanders_thresh or u in zero_points]
    else: #Only accept points if they have sufficient representation
        Accepted=[u for u in unique if Bernie_dic[u]>=Sanders_thresh]
    
    BankstoBust=[u for u in unique if u not in Accepted]  #represent less than Sanders_thresh percent of the population

    #We assign a numerical label to all structures that are accepted
    sums=[-sum([int(i) for i in config]) for config in Accepted] #(negative of) number of formed substructures in each config
    sort=natsort.natsorted(zip(sums, Accepted))
    dic={}  #a dictionary to readily map assignments to labels 
    key=[]
    for i, pair in enumerate(sort):
        struct=pair[1]
        dic[struct]=i
        key.append(struct)    
    #####    
    for u in BankstoBust: dic[u]='BustBigBank'   #for now, rejected structures will not be given a label
    
    labels=[dic[a] for a in Assignments] 
    rejected_indices = np.array([i for i in range(len(labels)) if  labels[i]=='BustBigBank'])
    accepted_indices = np.array([i for i in range(len(labels)) if  labels[i]!='BustBigBank'])

    
    #Now we fix rejected structures
    
    for r in rejected_indices:

        prev_accept=accepted_indices[accepted_indices<r][-1]
        future_accepts=accepted_indices[accepted_indices>r]
        
        if len(future_accepts)==0: #no structures got accepted after this one so we go with the previous one
            prev_accept_struct = Assignments[prev_accept]
            labels[r]=dic[prev_accept_struct]
        else:
            next_accept=future_accepts[0]
            prev_accept_struct=Assignments[prev_accept]
            next_accept_struct=Assignments[next_accept]
            curr_struct=Assignments[r]
            
            #How different is this structure from the previous and the next accepted structures?
            d_to_prev = np.sum([ np.abs( int(curr_struct[i])- int(prev_accept_struct[i])) for i in range(len(curr_struct))])
            d_to_next = np.sum([ np.abs( int(curr_struct[i])- int(next_accept_struct[i])) for i in range(len(curr_struct))])
            
            if d_to_prev<=d_to_next:
                labels[r]=dic[prev_accept_struct]
            else:
                labels[r]=dic[next_accept_struct] 
    
    if plot:
        utils.histogram(labels, key)

    return labels, key
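A toy illustration of the reassignment rule at the end of the function, with hypothetical substructure bit-strings: a rejected configuration is mapped to whichever neighbouring accepted configuration is closer in Hamming distance, and ties go to the previous one.

#Hypothetical bit-strings for a rejected snapshot and its accepted neighbours
curr, prev, nxt = '110', '100', '111'
d_to_prev = sum(abs(int(a) - int(b)) for a, b in zip(curr, prev))  # 1
d_to_next = sum(abs(int(a) - int(b)) for a, b in zip(curr, nxt))   # 1
print('previous' if d_to_prev <= d_to_next else 'next')  # ties go to the previous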