def __init__(self, information, projectfile, populationfile,
             assignmentfile_fixed, tmatrixfile, rawdatafile):
    """
    Load every input file the object works on.

    information: stored unchanged as self.Info.
    projectfile: read with Serializer.LoadFromHDF into self.ProjectInfo.
    populationfile: read with loadtxt into self.Population.
    assignmentfile_fixed: read with Serializer.LoadFromHDF into
        self.Assignments; self.StateAssignment is then built from it with
        hct.get_StatesAssignments.
    tmatrixfile: read with mmread into self.Tmatrix.
    rawdatafile: handed to self.getrawdata().

    Any failure while loading is reported on stdout and re-raised.
    """
    try:
        self.Info = information
        self.ProjectInfo = Serializer.LoadFromHDF(projectfile)
        self.Population = loadtxt(populationfile)
        self.Assignments = Serializer.LoadFromHDF(assignmentfile_fixed)
        self.Tmatrix = mmread(tmatrixfile)
        self.StateAssignment = hct.get_StatesAssignments(self.Assignments)
        self.getrawdata(rawdatafile)
    except Exception:
        # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit are not
        # reported as a file-loading problem. The error is still re-raised.
        print("Having trouble with getting required files")
        raise
def CalculateProjectRg(ProjectInfo, Output, returnRgs=False):
    """
    Calculate Radius of gyration for the Project ie. all the Trajectories.

    ProjectInfo: path (str) to the ProjectInfo.h5 file.
    Output: output file (XXX.dat). The Output default will be set in the
        scripts and it is './Rgs.dat'. It is passed through checkoutput()
        before use.
    returnRgs: when true, the computed Rgs are returned; otherwise the
        function returns None.

    Side effect: the working directory is changed to the directory that
    contains ProjectInfo.h5 and is not restored.

    Raises IOError when ProjectInfo is not a string.
    """
    Output = checkoutput(Output)
    if not isinstance(ProjectInfo, str):
        print("Please input the Path to ProjectInfo.h5")
        raise IOError("ProjectInfo must be the path to ProjectInfo.h5")
    print('Calculating the Rg for each trajectory......')
    # Resolve the file to an absolute path BEFORE changing directory;
    # otherwise a relative ProjectInfo path no longer points at the file
    # once the working directory has moved.
    ProjectInfoFile = os.path.realpath(ProjectInfo)
    ProjectInfoPath = os.path.dirname(ProjectInfoFile)
    os.chdir(ProjectInfoPath)
    ProjectInfo = Serializer.LoadFromHDF(ProjectInfoFile)
    Trajfiles = [
        ProjectInfo['TrajFilePath'] + ProjectInfo['TrajFileBaseName'] +
        '%d' % i + ProjectInfo['TrajFileType']
        for i in range(ProjectInfo['NumTrajs'])
    ]
    Rgs = computeRg(Trajfiles)
    print("Save data to %s" % Output)
    savetxt(Output, Rgs)
    print("Done.")
    if returnRgs:
        return Rgs
def get_projectinfo():
    """
    Load '../ProjectInfo.h5' with Serializer.LoadFromHDF and return it.

    Raises IOError (after printing a hint) when the file cannot be read.
    """
    try:
        projectinfo = Serializer.LoadFromHDF('../ProjectInfo.h5')
    except IOError:
        print("Can't find ProjectInfo.h5!")
        # Re-raise: falling through to ``return projectinfo`` would hit an
        # unbound local and hide the real cause.
        raise
    return projectinfo
def get_Trajectory_frame(trajid, frames,
                         Path="/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/sourcedata/Trajectories"):
    """
    Get trajectory frames.

    Loads the trajectory file '<Path>/trj<trajid>.lh5' and returns a list
    with traj['XYZList'][i] for every i in frames.

    trajid: integer trajectory index used in the file name.
    frames: iterable of frame indices.
    Path: directory holding the trj<N>.lh5 files; defaults to the location
        that used to be hard-coded here.
    """
    traj = Serializer.LoadFromHDF('%s/trj%d.lh5' % (Path, trajid))
    return [traj['XYZList'][i] for i in frames]
def calculate_statepopulation_rawdata(AssignmentsFixed):
    """
    Return the fraction of assigned frames found in each state.

    AssignmentsFixed: file loaded with Serializer.LoadFromHDF; its 'Data'
        entry is iterated trajectory by trajectory.

    Returns a float array of length (largest state id + 1) that sums to 1.
    Negative entries (e.g. the -1 padding) are not counted.
    Assumes state ids are integer-valued -- TODO confirm for float 'Data'.
    """
    a = Serializer.LoadFromHDF(AssignmentsFixed)
    trajs = [np.asarray(traj) for traj in a['Data']]
    statenumber = int(max(traj.max() for traj in trajs)) + 1
    p = np.zeros(statenumber)
    for traj in trajs:
        # One counting pass per trajectory instead of a full list scan for
        # every (state, trajectory) pair.
        assigned = traj[traj >= 0].astype(int)
        if assigned.size:
            counts = np.bincount(assigned)
            p[:len(counts)] += counts
    p = p / p.sum()
    return p
def pseudosampling(states, assignment, numberoftrajs, frames, output):
    """
    Build pseudo trajectories for each state and save them.

    states: iterable of state ids.
    assignment: assignment file, loaded with Serializer.LoadFromHDF.
    numberoftrajs, frames: forwarded to pseudotrajs().
    output: file name passed to SaveToHDF for every state.

    Exits the interpreter with status 1 when the assignment file cannot be
    read.
    """
    try:
        fn = Serializer.LoadFromHDF(assignment)
    except IOError:
        print("Can't find Assignment file")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
    for stateid in states:
        a = pseudotrajs(numberoftrajs, stateid, fn, frames)
        # NOTE(review): every state is written to the same `output` name;
        # confirm whether SaveToHDF overwrites or refuses an existing file.
        a.SaveToHDF(output)
        print("Wrote:%s" % output)
def getrawdata(self, rawdatafile):
    """
    Load the raw data into self.RawData.

    The file is first read with Serializer.LoadFromHDF (taking its 'Data'
    entry); if that fails for any reason it is read with loadtxt instead.
    When both attempts fail a message is printed and the loadtxt error is
    re-raised.
    """
    try:
        d = Serializer.LoadFromHDF(rawdatafile)
        self.RawData = d['Data']
    except Exception:
        # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit are not
        # turned into a silent retry with loadtxt.
        try:
            self.RawData = loadtxt(rawdatafile)
        except Exception:
            print("Can not load {}".format(rawdatafile))
            raise
def test():
    """
    Plot mean and standard deviation of the helix count for every state.

    Loads a fixed Assignments.h5, groups frames by state, computes the
    helix counts per state, then draws an error-bar plot saved as
    "Numhelix_states" and shown on screen.
    """
    assignments = Serializer.LoadFromHDF("/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/RMSDCluster4.2/Data/Assignments.h5")
    StatesAsi = get_StatesAssignments(assignments)
    NumHelix_states = compute_numhelix_states(StatesAsi)
    #print "NumHelix_states",NumHelix_states
    #savetxt('NumHelix_states',NumHelix_states)

    # Keys are state ids stored as strings; order them numerically.
    states = sorted(int(key) for key in NumHelix_states.keys())
    per_state = [NumHelix_states['%d' % state] for state in states]
    mean_numhelix_states = [np.mean(counts) for counts in per_state]
    std_numhelix_states = [np.std(counts) for counts in per_state]

    plt.figure()
    plt.errorbar(states, mean_numhelix_states, std_numhelix_states)
    plt.xlabel("State ID")
    plt.ylabel("Number of Helix")
    plt.savefig("Numhelix_states")
    plt.show()
def get_StatesAssignments(AssignmentFiles):
    """
    Group frame indices by state and by trajectory.

    AssignmentFiles: either a path (str), which is loaded with
        Serializer.LoadFromHDF, or an already loaded mapping with a 'Data'
        entry and optionally a 'TrajID' entry.

    Returns {'state': {'trajectory': [frames]}} where both keys are the
    '%d'-formatted ids. Frames assigned to -1 are left out.

    A regular assignments object has no 'TrajID' (only the ones created for
    bootstrap do); in that case 'TrajID' is added to the object itself as
    0..len(Data)-1.
    """
    assignments = AssignmentFiles
    if isinstance(assignments, str):
        assignments = Serializer.LoadFromHDF(assignments)

    try:
        trajids = assignments['TrajID']
    except KeyError:
        assignments['TrajID'] = list(range(len(assignments['Data'])))
        trajids = assignments['TrajID']

    grouped = {}
    for trajid, data in zip(trajids, assignments['Data']):
        for frame, state in enumerate(data):
            if state == -1:
                continue
            by_traj = grouped.setdefault('%d' % state, {})
            by_traj.setdefault('%d' % trajid, []).append(frame)
    return grouped
def bootstrap(AssignmentsFile, numtraj, bootstrapnumber, PathtoSaveFiles):
    """
    Create bootstrap samples of the assignments and save them.

    AssignmentsFile: file loaded with Serializer.LoadFromHDF; its 'Data'
        entry is the pool that trajectories are drawn from.
    numtraj: number of trajectories per sample, or the string 'all'
        (case-insensitive) to use len(Data).
    bootstrapnumber: how many samples to generate.
    PathtoSaveFiles: forwarded to SaveBootstrapFiles.

    Each sample is an integer array of shape (numtraj, len(Data[0]))
    pre-filled with -1 and filled row by row from
    sample_with_replacement(), whose second yielded item is recorded as the
    trajectory id. Exits with status 1 when numtraj is not positive.
    """
    bootstraplist, TrajID = [], []
    File = Serializer.LoadFromHDF(AssignmentsFile)
    datalist = File['Data']
    if isinstance(numtraj, str) and numtraj.lower() == 'all':
        numtraj = len(datalist)
    else:
        numtraj = int(numtraj)
        if numtraj <= 0:
            print("Please input valid number from 1 to %d" % len(datalist))
            sys.exit(1)
    for i in range(bootstrapnumber):
        # ``np.int`` was only an alias of the builtin and no longer exists
        # in current NumPy; the builtin works on every version.
        all_assignments = -1 * np.ones((numtraj, len(datalist[0])), dtype=int)
        trajid = []
        for k, (j, m) in enumerate(sample_with_replacement(datalist, numtraj)):
            all_assignments[k][:] = j[:]
            trajid.append(m)
        TrajID.append(trajid)
        bootstraplist.append(all_assignments)
    SaveBootstrapFiles(bootstraplist, TrajID, PathtoSaveFiles, File, AssignmentsFile)
def compute_numhelix_states(StatesAssignments,
                            TrajPath='/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/sourcedata/Trajectories'):
    """
    Compute the number of helix for the frames of every state.

    StatesAssignments: {'state': {'trajectory': [frames]}} mapping.
    TrajPath: directory holding the trj<ID>_hc.h5 files; defaults to the
        location that used to be hard-coded here.

    Returns {state: list}, where the list is the concatenation of
    count_Helix() results over all trajectories that visit the state.

    NOTE(review): a trajectory file is re-loaded once for every state that
    contains it.
    """
    SA = StatesAssignments
    states = SA.keys()
    numhelix_states = {}
    for n, state in enumerate(states, 1):
        print("Compute number of helix for state %d/%d" % (n, len(states)))
        numhelix_state = []
        for trajid in SA[state].keys():
            TrajFile = '%s/trj%s_hc.h5' % (TrajPath, trajid)
            Traj = Serializer.LoadFromHDF(TrajFile)
            # count_Helix is given only the frames that belong to this state.
            T = {'HCs': [Traj['HCs'][i] for i in SA[state][trajid]]}
            numhelix_state += count_Helix(T)
        numhelix_states[state] = numhelix_state
    return numhelix_states
def get_projectinfo():
    """Load 'RMSDCluster4.2/ProjectInfo.h5' and return the loaded object."""
    return Serializer.LoadFromHDF('RMSDCluster4.2/ProjectInfo.h5')
def get_RMSD():
    """Return the 'Data' entry of 'RMSDCluster4.2/Data/RMSD-pdb-gen0.h5'."""
    return Serializer.LoadFromHDF('RMSDCluster4.2/Data/RMSD-pdb-gen0.h5')['Data']
#----------------------------
#October,17,2012
#Guangfeng Zhou
#Dr.Voelz Lab
#Room 100/102, Beury Hall
#Temple University
import sys,os
sys.path.append('/Users/tud51931/scripts/gfzhou')
from msmhcanalysis import SequenceEntropy_states
from msmbuilder import Serializer
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of per-state sequence entropy against per-state mean RMSD.
stateentropy = SequenceEntropy_states('HCstrings_states_Dihedral5.2.txt')
rmsdfile = Serializer.LoadFromHDF('StateRMSDs_DihedralCluster5.2.h5')
# Entries equal to -1 are masked out so they do not enter the row means.
# The mask is passed with the same shape as the data (it used to be wrapped
# in a list, which added a leading axis).
RMSD = np.ma.array(rmsdfile['Data'], mask=(rmsdfile['Data'] == -1))
statermsd = RMSD.mean(1)

plt.figure()
plt.plot(statermsd, stateentropy, '.')
plt.title('StateSequenceEntropy versus StateRMSD')
plt.ylabel('StateSequenceEntropy')
plt.xlabel('StateRMSD(nm)')
plt.savefig('seqentropy_statermsd_dihedralcluster.png')
#plt.show()
from msmbuilder import Serializer


def draw_index(probs, n_picks=1, UseFastMethod=True):
    """
    Draw n_picks indices, each weighted by the entries of probs.

    probs does not have to be normalised: the uniform random numbers are
    scaled by sum(probs) before being located in the cumulative sum.
    Returns None when UseFastMethod is false (no other method exists).
    """
    if not UseFastMethod:
        return None
    cumulative = np.cumsum(probs)
    total = sum(probs)
    return np.searchsorted(cumulative, np.random.rand(n_picks) * total)


# Transition counts and the assignments that fix the trajectory length.
tcounts = mmread(
    '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/RMSDCluster4.2/lagtime50/tCounts.UnMapped.mtx'
)
Assignment = Serializer.LoadFromHDF(
    '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/RMSDCluster4.2/lagtime50/Assignments.Fixed.h5'
)
trajnum = 100
frames = Assignment['Data'].shape[1]

# Synthetic trajectories, pre-filled with -1.
a = Serializer()
a['Data'] = -1 * np.ones((trajnum, frames))
for traj in range(trajnum):
    print('%d of %d Trajectories' % (traj, trajnum))
    # Every synthetic trajectory starts from state 126.
    startstate = 126
    a['Data'][traj, 0] = startstate
    for step in range(1, frames):
        # Entries of the count matrix whose row is the current state.
        outgoing = tcounts.row == startstate
        outgoing_counts = tcounts.data[outgoing]
        probs = outgoing_counts / sum(outgoing_counts)
        a['Data'][traj, step] = tcounts.col[outgoing][draw_index(probs)[0]]
        startstate = a['Data'][traj, step]
import os, sys
import numpy as np
from msmbuilder import Serializer, Trajectory
import matplotlib.pyplot as plt
sys.path.append("/Users/tud51931/scripts/gfzhou")
import HelixCoilTools as hct

# Script: plot the helix count (Nv) of each trajectory against the step
# index, one PNG per trajectory, and collect all counts in one table.
ProjectInfo = Serializer.LoadFromHDF(
    '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/sourcedata/ProjectInfo.h5'
)
# One row per trajectory, as wide as the longest trajectory; -1 marks the
# cells that are never filled.
Counts = -1 * np.ones(
    (ProjectInfo['NumTrajs'], max(ProjectInfo['TrajLengths'])))
print Counts.shape
Savepath = '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/result/NvOfTrajectory'
plt.figure()
plt.xlabel('Steps')
plt.ylabel('Nv')
plt.hold(False)
# NOTE(review): the trajectory count 93 is hard-coded here while Counts is
# sized from ProjectInfo['NumTrajs'] -- confirm the two agree.
for i in range(0, 93):
    T = Trajectory.LoadFromHDF(
        '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/sourcedata/Trajectories/trj%d_hc.h5'
        % i)
    Hcount = hct.count_Helix(T)
    plt.title('Nv-steps of Traj%d' % i)
    plt.plot(range(len(Hcount)), Hcount, '.')
    print 'Save figure to %s/Nvoftraj%d.png' % (Savepath, i)
    plt.savefig('%s/Nvoftraj%d.png' % (Savepath, i))
    # Store this trajectory's counts at the start of row i.
    Counts[i, :len(Hcount)] = Hcount[:]
# Mask the -1 cells.
Counts_ma = np.ma.array(Counts, mask=[Counts == -1])
from scipy import savetxt
from msmbuilder import Serializer

# Script: collect the RMSD values of every frame, grouped by state.
cutoff = 3.0
metrics = 'rmsd'
# Pick the project directory for the chosen metric and normalise the metric
# name to the spelling used in the cluster directory name.
if metrics.lower() == 'dihedral':
    Path = "/Users/tud51931/projects/MSM/msm/ff03-dihedralhybrid/"
    metrics = 'Dihedral'
elif metrics.lower() == 'rmsd':
    Path = "/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/"
    metrics = 'RMSD'
Path = os.path.join(Path, '%sCluster%0.1f' % (metrics, cutoff))
AssignmentFile = os.path.join(Path, "Data", "Assignments.h5")
A = Serializer.LoadFromHDF(AssignmentFile)
StateAssignment = hct.get_StatesAssignments(AssignmentFiles=A)
RMSDFile = os.path.join(Path, "Data", "RMSD.h5")
RMSD = Serializer.LoadFromHDF(RMSDFile)
# rmsd_allstates maps int(state) to the list of RMSD values of its frames.
rmsd_allstates = {}
for state in StateAssignment.keys():
    rmsd_singlestate = []
    for trajid in StateAssignment[state].keys():
        # Index the trajectory's RMSD row with the state's frame list.
        rmsd_singlestate += list(
            RMSD['Data'][int(trajid)][StateAssignment[state][trajid]])
    rmsd_allstates[int(state)] = rmsd_singlestate
maxstatelength = max([len(i) for i in rmsd_allstates.values()])
# Copy of the RMSD object whose 'Data' is replaced by a (states x longest
# state) table pre-filled with -1.
StateRMSDs = copy.deepcopy(RMSD)
StateRMSDs['Data'] = -1 * np.ones((len(rmsd_allstates), maxstatelength))
for state in rmsd_allstates.keys():
import os, sys
import numpy as np
from msmbuilder import Serializer
from scipy import savetxt, loadtxt
import matplotlib.pyplot as plt

# Flatten the RMSD table into a plain list, leaving out the -1 entries.
try:
    R = Serializer.LoadFromHDF(
        '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/RMSDCluster4.2/Data/RMSD.h5'
    )
    rmsd = [value for row in R['Data'] for value in row if value != -1]
except IOError:
    print("Can't find RMSD.h5, please run CalculateProjectRMSD.py first to get RMSD.h5.")
    sys.exit()

# Same treatment for the helix-count table.
try:
    Nv = loadtxt(
        '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/result/numhelix_alltraj.txt'
    )
    nv = [value for row in Nv for value in row if value != -1]
except IOError:
    print("Can't find numhelix_alltraj.txt, please run computeNumhelix_alltrajs.py first.")
    sys.exit()
help="Input RMSD.h5 file", required=True) parser.add_argument('-rg', '--Rg', help="Input Rgs.dat file", required=True) parser.add_argument('-l', '--Locate', help="Locate states on the Rg-RMSD graph", type=str) parser.add_argument('-o', '--Output', help="Output file (graph) name.Default: Rg-RMSD.png", default="Rg-RMSD.png") args = parser.parse_args() try: R = Serializer.LoadFromHDF(args.RMSD) rmsd = [] for i in range(len(R['Data'])): for j in range(len(R['Data'][i])): if R['Data'][i, j] != -1: rmsd.append(R['Data'][i, j]) except IOError: print "Can't find RMSD.h5, please run CalculateProjectRMSD.py first to get RMSD.h5." raise IOError try: Rgs = loadtxt(args.Rg) rgs = [] for i in range(len(Rgs)): for j in range(len(Rgs[i])): if Rgs[i, j] != -1: rgs.append(Rgs[i, j])
#def GetHCStringsforProject(ProjectInfo)
if __name__ == '__main__':
    # Run settings; the prediction functions read these module-level names.
    tau = 50
    cutoff = 4.2
    metrics = 'rmsd'

    # Choose the project directory for the metric and switch `metrics` to
    # the spelling used in the cluster directory name.
    metric_key = metrics.lower()
    if metric_key == 'dihedral':
        Path = "/Users/tud51931/projects/MSM/msm/ff03-dihedralhybrid"
        metrics = 'Dihedral'
    elif metric_key == 'rmsd':
        Path = "/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter"
        metrics = 'RMSD'
    Path = os.path.join(Path, '%sCluster%0.1f' % (metrics, cutoff))

    # Everything that depends on the lag time lives under lagtime<tau>.
    lagdir = '%s/lagtime%d' % (Path, tau)
    ProjectInfo = Serializer.LoadFromHDF('%s/ProjectInfo.h5' % Path)
    Population = loadtxt('%s/Populations.dat' % lagdir)
    Assignments = Serializer.LoadFromHDF("%s/Assignments.Fixed.h5" % lagdir)
    Tmatrix = mmread('%s/tProb.mtx' % lagdir)
    Gens = '%s/Data/Gens.lh5' % Path

    Nvprediction()
    EEdistanceprediction()
    RMSDprediction()
    Rgprediction()
    #barchartsforStatesEntropy()
    #SequenceEntropy_states()
    #GetRgsforGeneratorFile()
    #GetHCStringsforTrajectory(Gens)
x = (t['XYZList'][i, Atom1, :] - t['XYZList'][i, Atom2, :])[0] x = x.tolist() distance.append(np.dot(x, x)**0.5) distance += [-1] * (LongestTrajLength - len(t['XYZList'])) return distance #------------MAIN--------------- AtomName1 = 'C' ResidueID1 = 1 AtomName2 = 'N' ResidueID2 = 23 path = '/Users/tud51931/projects/MSM/msm/ff03ERR-hybridkcenter/RMSDCluster4.0' Distances = [] ProjectInfo = Serializer.LoadFromHDF('%s/ProjectInfo.h5' % path) LongestTrajLength = max(ProjectInfo['TrajLengths']) os.chdir(path) if os.path.exists('EndtoEndDistances.dat'): print "EndtoEndDistances.dat exists!" sys.exit() print 'Calculating the eeDistance of each trajectory......' for i in range(ProjectInfo['NumTrajs']): trajfile = ProjectInfo['TrajFilePath'] + ProjectInfo[ 'TrajFileBaseName'] + '%d' % i + ProjectInfo['TrajFileType'] print '%d in %d Trajectories' % (i, ProjectInfo['NumTrajs']), trajfile d = calculatedistance(AtomName1, ResidueID1, AtomName2, ResidueID2, trajfile, LongestTrajLength) Distances.append(d) print "Save data to ./EndtoEndDistance.dat" savetxt('EndtoEndDistances.dat', Distances)
def RMSDprediction():
    """
    Plot the MSM-predicted mean RMSD over time against the raw-data mean.

    Reads the module-level names Assignments, Population, Tmatrix, metrics,
    cutoff and tau. The RMSD table is loaded from a fixed RMSD.h5 path.

    Steps:
      1. average the RMSD of all frames belonging to each state;
      2. build the initial state distribution from the first frame of every
         trajectory in Assignments['Data'];
      3. propagate it 140 times with Tmatrix and take the population-weighted
         mean RMSD after each step;
      4. plot that against the per-frame mean of the raw RMSD table (entries
         equal to -1 masked out) and save the figure as
         'RMSD_prediction_<metrics>Cluster<cutoff>_tau<tau>.png'.

    Exits with status 1 when RMSD.h5 cannot be read.
    """
    try:
        R = Serializer.LoadFromHDF(
            '/Users/tud51931/projects/MSM/msm/ff03-hybridkcenter/RMSDCluster4.2/Data/RMSD.h5'
        )
    except IOError:
        print("Can't find RMSD.h5, please run CalculateProjectRMSD.py first to get RMSD.h5.")
        sys.exit(1)
    RMSD = R['Data']

    # RMSD values of every frame, grouped by state ('%d'-formatted keys).
    StatesAsi = hct.get_StatesAssignments(Assignments)
    RMSD_states = {}
    for state in StatesAsi.keys():
        for trajid in StatesAsi[state].keys():
            for frame in StatesAsi[state][trajid]:
                RMSD_states.setdefault(state, []).append(
                    RMSD[int(trajid)][int(frame)])
    states = sorted(int(i) for i in RMSD_states.keys())
    mean_rmsd_states = [np.mean(RMSD_states['%d' % state]) for state in states]
    print(mean_rmsd_states)

    # Initial distribution: where each trajectory starts.
    P0 = np.zeros(len(Population))
    for data in Assignments['Data']:
        P0[data[0]] += 1
    P0 = P0 / P0.sum()

    # Row-vector propagation p <- p T, written as T^T p. A fresh vector is
    # produced at every step, so the stored history never aliases one array
    # (``P0 *= Tmatrix`` relied on version-specific operator deferral).
    n_steps = 140
    Tt = Tmatrix.T
    if hasattr(Tt, 'tocsr'):
        Tt = Tt.tocsr()  # convert once instead of on every product
    populationslist = []
    for k in range(n_steps):
        populationslist.append(P0)
        P0 = Tt.dot(P0)

    RMSD_predicted = np.dot(np.array(populationslist),
                            np.array(mean_rmsd_states).reshape(-1, 1))
    print(RMSD_predicted)
    RMSD_predicted = RMSD_predicted.reshape(1, -1)[0]

    plt.figure()
    # One MSM step spans tau frames (0, 50, ..., 6950 for tau = 50).
    plt.plot(np.arange(n_steps) * tau, RMSD_predicted, 'ro')
    if hasattr(plt, 'hold'):
        # pyplot.hold no longer exists in newer matplotlib, where
        # overplotting is always on.
        plt.hold(True)
    Counts_ma = np.ma.array(RMSD, mask=(RMSD == -1))
    RMSD_mean = Counts_ma.mean(0)
    print(RMSD_mean)
    plt.plot(range(len(RMSD_mean)), RMSD_mean, 'b')
    plt.title('RMSD-steps')
    plt.xlabel('Steps')
    plt.ylabel('RMSD')
    plt.legend(('RMSD_msm', 'RMSD_rawdata'), loc='upper right')
    figname = 'RMSD_prediction_%sCluster%0.1f_tau%d.png' % (metrics, cutoff, tau)
    plt.savefig(figname)
    print("Save to %s" % figname)