def amputareFile(filename): """Specific to MDOUT. Returns True if the last line of the file is a float entry.""" amp = True try: _ = float(unixlike.tailPy(filename, 1)[0]) except ValueError: amp = False return amp
def readDataGOMC(P): """Read in .dat files; return nsnapshots, lv, dhdlt, and u_klt.""" class F: """This is the object to be built on the filename.""" def __init__(self, filename): self.filename = filename def sortedHelper(self): """This function will assist the built-in 'sorted' to sort filenames. Returns a tuple whose first element is an integer while others are strings.""" meat = os.path.basename(self.filename).replace(P.prefix, '').replace( P.suffix, '') l = [i for i in re.split('\.|-|_', meat) if i] try: self.state = l[0] = int( l[0]) # Will be of use for selective MBAR analysis. except: print "\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n" raise return tuple(l) def readHeader(self): self.skip_lines = 0 # Number of lines from the top that are to be skipped. self.lv_names = () # Lambda type names, e.g. 'coul', 'vdw'. snap_size = [ ] # Time from first two snapshots to determine snapshot's size. self.lv = [] # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0). self.bEnergy = False self.bPV = False self.bExpanded = False self.temperature = False print "Reading metadata from %s..." % self.filename with open(self.filename, 'r') as infile: for line in infile: if line.startswith('#'): self.skip_lines += 1 elements = unixlike.trPy(line).split() for i in range(len(elements)): if '#T' == elements[i]: self.temperature = elements[i + 2] elif 'Total_En' == elements[i]: self.bEnergy = True elif 'PV' == elements[i]: self.bPV = True elif 'State' == elements[i]: c1 = elements[i + 3][:4].lower() c2 = elements[i + 5][:3].lower() self.lv_names = c1, c2 elif 'DelE' == elements[i]: self.lv.append(elements[i + 2:i + 4]) else: snap_size.append(float(line.split()[0])) if len(snap_size) > 1: self.snap_size = numpy.diff(snap_size)[0] P.snap_size.append(self.snap_size) break return self.lv def iter_loadtxt(self, state): """Houstonian Joe Kington claims it is faster than numpy.loadtxt: http://stackoverflow.com/questions/8956832/python-out-of-memory-on-large-csv-file-numpy""" def iter_func(): with open(self.filename, 'r') as infile: for _ in range(self.skip_lines): next(infile) for line in infile: line = line.split() for item in line: yield item def slice_data(data, state=state): # Where the dE columns should be stored. if (len(ndE_unique) > 1 and ndE[state] < 4): # If BAR, store shifted 2/3 arrays. s1, s2 = numpy.array((0, ndE[state])) + state - (state > 0) else: # If MBAR or selective MBAR or BAR/MBAR, store all. s1, s2 = (0, K) # Which dhdl columns are to be read. read_dhdl_sta = 1 + self.bEnergy + self.bExpanded read_dhdl_end = read_dhdl_sta + n_components data = data.T dhdlt[state, :, nsnapshots_l[state]:nsnapshots_r[state]] = data[ read_dhdl_sta:read_dhdl_end, :] if not bSelective_MBAR: r1, r2 = (read_dhdl_end, read_dhdl_end + (ndE[state] if not self.bExpanded else K)) if bPV: u_klt[state, s1:s2, nsnapshots_l[state]: nsnapshots_r[state]] = P.beta * (data[r1:r2, :] + data[-1, :]) else: u_klt[state, s1:s2, nsnapshots_l[state]: nsnapshots_r[state]] = P.beta * data[r1:r2, :] else: # can't do slicing; prepare a mask (slicing is thought to be faster/less memory consuming than masking) mask_read_uklt = numpy.array( [0] * read_dhdl_end + [1 if (k in sel_states) else 0 for k in range(ndE[0])] + ([0] if bPV else []), bool) if bPV: u_klt[state, s1:s2, nsnapshots_l[state]: nsnapshots_r[state]] = P.beta * ( data[mask_read_uklt, :] + data[-1, :]) else: u_klt[state, s1:s2, nsnapshots_l[state]: nsnapshots_r[state]] = P.beta * data[ mask_read_uklt, :] return print "Loading in data from %s (%s) ..." % ( self.filename, "all states" if self.bExpanded else 'state %d' % state) data = numpy.fromiter(iter_func(), dtype=float) if not self.len_first == self.len_last: data = data[:-self.len_last] data = data.reshape((-1, self.len_first)) if self.bExpanded: for k in range(K): mask_k = (data[:, 1] == k) data_k = data[mask_k] slice_data(data_k, k) else: slice_data(data) def parseLog(self): """By parsing the .log file of the expanded-ensemble simulation find out the time in ps when the WL equilibration has been reached. Return the greater of WLequiltime and equiltime.""" if not (P.bIgnoreWL): logfilename = self.filename.replace('.xvg', '.log') if not os.path.isfile(logfilename): raise SystemExit( "\nERROR!\nThe .log file '%s' is needed to figure out when the Wang-Landau weights have been equilibrated, and it was not found.\nYou may rerun with the -x flag and the data will be discarded to 'equiltime', not bothering\nwith the extraction of the information on when the WL weights equilibration was reached.\nOtherwise, put the proper log file into the directory which is subject to the analysis." % logfilename) try: with open(logfilename, 'r') as infile: dt = float( unixlike.grepPy(infile, s='delta-t').split()[-1]) WLstep = int( unixlike.grepPy( infile, s='equilibrated').split()[1].replace(':', '')) except: print "\nERROR!\nThe Wang-Landau weights haven't equilibrated yet.\n" print "If you comprehend the consequences,\n rerun with the -x flag and the data\n will be discarded to equiltime.\n" raise WLtime = WLstep * dt else: WLtime = -1 return max(WLtime, P.equiltime) #=================================================================================================== # Preliminaries I: Sort the dhdl.dat files; read in the @-header. #=================================================================================================== datafile_tuple = P.datafile_directory, P.prefix, P.suffix fs = [F(filename) for filename in glob('%s/%s*%s' % datafile_tuple)] n_files = len(fs) #NML: Clean up corrupted lines print 'Checking for corrupted dat files....' xvgs = [filename for filename in sorted(glob('%s/%s*%s' % datafile_tuple))] for f in xvgs: removeCorruptLines(f, f) if not n_files: raise SystemExit( "\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs." % datafile_tuple) if n_files > 1: fs = sorted(fs, key=F.sortedHelper) if P.bSkipLambdaIndex: try: lambdas_to_skip = [ int(l) for l in unixlike.trPy(P.bSkipLambdaIndex, '-').split() ] except: print '\nERROR!\nDo not understand the format of the string that follows -k.\nIt should be a string of lambda indices linked by "-".\n' raise fs = [f for f in fs if not f.state in lambdas_to_skip] n_files = len(fs) lv = [] # *** P.snap_size = [] for nf, f in enumerate(fs): lv.append(f.readHeader()) if nf > 0: if not f.lv_names == lv_names: if not len(f.lv_names) == n_components: raise SystemExit( "\nERROR!\nFiles do not contain the same number of lambda gradient components; I cannot combine the data." ) else: raise SystemExit( "\nERROR!\nThe lambda gradient components have different names; I cannot combine the data." ) if not f.bPV == bPV: raise SystemExit( "\nERROR!\nSome files contain the PV energies, some do not; I cannot combine the files." ) if not f.temperature == temperature: # compare against a string, not a float. raise SystemExit( "\nERROR!\nTemperature is not the same in all .xvg files.") else: P.lv_names = lv_names = f.lv_names temperature = f.temperature if temperature: temperature_float = float(temperature) P.beta *= P.temperature / temperature_float P.beta_report *= P.temperature / temperature_float P.temperature = temperature_float print "Temperature is %s K." % temperature else: print "Temperature not present in dat files. Using %g K." % P.temperature n_components = len(lv_names) bPV = f.bPV P.bExpanded = f.bExpanded #=================================================================================================== # Preliminaries II: Analyze data for validity; build up proper 'lv' and count up lambda states 'K'. #=================================================================================================== ndE = [len(i) for i in lv] # *** ndE_unique = numpy.unique(ndE) # *** # Scenario #1: Each file has all the dE columns -- can use MBAR. if len(ndE_unique) == 1: # [K] if not numpy.array([i == lv[0] for i in lv]).all(): raise SystemExit( "\nERROR!\nArrays of lambda vectors are different; I cannot combine the data." ) else: lv = lv[0] # Handle the case when only some particular files/lambdas are given. if 1 < n_files < len(lv) and not P.bExpanded: bSelective_MBAR = True sel_states = [f.state for f in fs] lv = [lv[i] for i in sel_states] else: bSelective_MBAR = False elif len(ndE_unique) <= 3: bSelective_MBAR = False # Scenario #2: Have the adjacent states only; 2 dE columns for the terminal states, 3 for inner ones. if ndE_unique.tolist() == [2, 3]: lv = [l[i > 0] for i, l in enumerate(lv)] # Scenario #3: Have a mixture of formats (adjacent and all): either [2,3,K], or [2,K], or [3,K]. else: lv = lv[ndE_unique.argmax()] if 'MBAR' in P.methods: print "\nNumber of states is NOT the same for all simulations; I'm assuming that we only evaluate" print "nearest neighbor states, and so cannot use MBAR, removing the method." P.methods.remove('MBAR') print "\nStitching together the dhdl files. I am assuming that the files are numbered in order of" print "increasing lambda; otherwise, results will not be correct." else: print "The files contain the number of the dE columns I cannot deal with; will terminate.\n\n%-10s %s " % ( "# of dE's", "File") for nf, f in enumerate(fs): print "%6d %s" % (ndE[nf], f.filename) raise SystemExit( "\nERROR!\nThere are more than 3 groups of files (%s, to be exact) each having different number of the dE columns; I cannot combine the data." % len(ndE_unique)) lv = numpy.array(lv, float) # *** Lambda vectors. K = len(lv) # *** Number of lambda states. #=================================================================================================== # Preliminaries III: Count up the equilibrated snapshots. #=================================================================================================== equiltime = P.equiltime nsnapshots = numpy.zeros((n_files, K), int) for nf, f in enumerate(fs): f.len_first, f.len_last = (len(line.split()) for line in unixlike.tailPy(f.filename, 2)) bLenConsistency = (f.len_first != f.len_last) if f.bExpanded: equiltime = f.parseLog() equilsnapshots = int(round(equiltime / f.snap_size)) f.skip_lines += equilsnapshots extract_states = (numpy.genfromtxt(f.filename, dtype=float, skip_header=f.skip_lines, skip_footer=1 * bLenConsistency, usecols=1)).astype(int) if np.max(extract_states) > K: # The number of states is actually bigger. we need to make the array larger. # for some reason, resize isn't working. So do it more brute force. old_K = K K = np.max(extract_states) temp_array = numpy.zeros([n_files, K], int) temp_array[:, :old_K] = nsnapshots.copy() nsnapshots = temp_array.copy() c = Counter( extract_states ) # need to make sure states with zero counts are properly counted. # It's OK for some of the expanded files to have no samples as long # at least one has samples for all states for k in range(K): nsnapshots[nf, k] += c[k] #nsnapshots[nf] += numpy.array(Counter(extract_states).values()) else: equilsnapshots = int(equiltime / f.snap_size) f.skip_lines += equilsnapshots nsnapshots[nf, nf] += unixlike.wcPy( f.filename) - f.skip_lines - 1 * bLenConsistency print "First %s ps (%s snapshots) will be discarded due to equilibration from file %s..." % ( equiltime, equilsnapshots, f.filename) #=================================================================================================== # Preliminaries IV: Load in equilibrated data. #=================================================================================================== maxn = max(nsnapshots.sum( axis=0)) # maximum number of the equilibrated snapshots from any state dhdlt = numpy.zeros( [K, n_components, int(maxn)], float ) # dhdlt[k,n,t] is the derivative of energy component n with respect to state k of snapshot t u_klt = numpy.zeros( [K, K, int(maxn)], numpy.float64 ) # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m nsnapshots = numpy.concatenate((numpy.zeros([1, K], int), nsnapshots)) for nf, f in enumerate(fs): nsnapshots_l = nsnapshots[:nf + 1].sum(axis=0) nsnapshots_r = nsnapshots[:nf + 2].sum(axis=0) f.iter_loadtxt(nf) return nsnapshots.sum(axis=0), lv, dhdlt, u_klt
def readDataDesmond(P): class F: def __init__(self, filename): self.filename = filename def sortedHelper(self): meat = os.path.basename(self.filename).replace(P.prefix, '').replace(P.suffix, '') l = [i for i in re.split('\.|-|_', meat) if i] try: self.state = l[0] = int(l[0]) # Will be of use for selective MBAR analysis. except: print("\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n") raise return tuple(l) def get_snapsize(self): self.skip_lines = 0 self.lv_names = () snap_size = [] # Time from first two snapshots to determine snapshot's size. self.lv = [] # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0). with open(self.filename,'r') as infile: for line in infile: snap_size.append(float(line.split()[0])) if len(snap_size) > 1: self.snap_size = numpy.diff(snap_size)[0] P.snap_size.append(self.snap_size) break def iter_loadtxt(self, state): def iter_func(): with open(self.filename, 'r') as infile: for _ in range(self.skip_lines): next(infile) for line in infile: line = line.split() for item in line: yield item def slice_data(data, state=state): #Energies stored in: # Reverse: data[1,:] # Forward: data[2,:] #Desmond unit input: kcal/mol, conversion factor 4.184kJ/kcal #P.beta from alchemical_analysis.py in kJ/mol/K #Return: u_klt contains energies of adjacent lambdas only data = data.T if state == 0: u_klt[state, state+1 , :nsnapshots[state]] = data[ 2 , : ]*4.184*P.beta elif state == K: u_klt[state, state-1 , :nsnapshots[state]] = data[ 2 , : ]*4.184*P.beta else: u_klt[state, state-1, :nsnapshots[state]] = data[ 1 , :]*4.184*P.beta u_klt[state, state+1, :nsnapshots[state]] = data[ 2 , :]*4.184*P.beta return print("Loading in data from %s (%s) ...") % (self.filename, 'state %d' % state) data = numpy.fromiter(iter_func(), dtype=float) if not self.len_first == self.len_last: data = data[: -self.len_last] data = data.reshape((-1, self.len_first)) slice_data(data) #=================================================================================================== # Preliminaries I: Get LV,Snapsize,consistency check, and skip frames #=================================================================================================== datafile_tuple = P.datafile_directory, P.prefix, P.suffix fs = [ F(filename) for filename in glob( '%s/%s*%s' % datafile_tuple ) ] n_files = len(fs) if not n_files: raise SystemExit("\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs." % datafile_tuple) if n_files > 1: fs = sorted(fs, key=F.sortedHelper) ###Set lambda vector and get snapsize lv = [] P.snap_size = [] for nf, f in enumerate(fs): lv.append( [nf,0] ) f.get_snapsize() P.lv_names = lv_names = f.lv_names n_components = len(lv_names) lv = numpy.array(lv, float) # *** Lambda vectors. K = len(lv) # *** Number of lambda states. equiltime = P.equiltime nsnapshots = numpy.zeros(K, int) for nf, f in enumerate(fs): ###Check for consistent timestep??? f.len_first, f.len_last = (len(line.split()) for line in unixlike.tailPy(f.filename, 2)) bLenConsistency = (f.len_first != f.len_last) ###Skip N snapshots equilsnapshots = int(equiltime/f.snap_size) f.skip_lines += equilsnapshots nsnapshots[nf] += unixlike.wcPy(f.filename) - f.skip_lines - 1*bLenConsistency print("First %s ps (%s snapshots) will be discarded due to equilibration from file %s...") % (equiltime, equilsnapshots, f.filename) #=================================================================================================== # Preliminaries: Load in equilibrated data. #=================================================================================================== maxn = max(nsnapshots) # maximum number of the equilibrated snapshots from any state u_klt = numpy.zeros([K,K+1,int(maxn)], numpy.float64) # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m for nf, f in enumerate(fs): f.iter_loadtxt(nf) return nsnapshots, lv, u_klt
def readDataGromacs(P): """Read in .xvg files; return nsnapshots, lv, dhdlt, and u_klt.""" class F: """This is the object to be built on the filename.""" def __init__(self, filename): self.filename = filename def sortedHelper(self): """This function will assist the built-in 'sorted' to sort filenames. Returns a tuple whose first element is an integer while others are strings.""" meat = os.path.basename(self.filename).replace(P.prefix, '').replace(P.suffix, '') l = [i for i in re.split('\.|-|_', meat) if i] try: self.state = l[0] = int(l[0]) # Will be of use for selective MBAR analysis. except: print "\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n" raise return tuple(l) def readHeader(self): self.skip_lines = 0 # Number of lines from the top that are to be skipped. self.lv_names = () # Lambda type names, e.g. 'coul', 'vdw'. snap_size = [] # Time from first two snapshots to determine snapshot's size. self.lv = [] # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0). self.bEnergy = False self.bPV = False self.bExpanded = False self.temperature= False print "Reading metadata from %s..." % self.filename with open(self.filename,'r') as infile: for line in infile: if line.startswith('#'): self.skip_lines += 1 elif line.startswith('@'): self.skip_lines += 1 elements = unixlike.trPy(line).split() if not 'legend' in elements: if 'T' in elements: self.temperature = elements[4] continue if 'Energy' in elements: self.bEnergy = True if 'pV' in elements: self.bPV = True if 'state' in elements: self.bExpanded = True if 'dH' in elements: self.lv_names += elements[7], if 'xD' in elements: self.lv.append(elements[-len(self.lv_names):]) else: snap_size.append(float(line.split()[0])) if len(snap_size) > 1: self.snap_size = numpy.diff(snap_size)[0] P.snap_size.append(self.snap_size) break return self.lv def iter_loadtxt(self, state): """Houstonian Joe Kington claims it is faster than numpy.loadtxt: http://stackoverflow.com/questions/8956832/python-out-of-memory-on-large-csv-file-numpy""" def iter_func(): with open(self.filename, 'r') as infile: for _ in range(self.skip_lines): next(infile) for line in infile: line = line.split() for item in line: yield item def slice_data(data, state=state): # Where the dE columns should be stored. if (len(ndE_unique)>1 and ndE[state]<4): # If BAR, store shifted 2/3 arrays. s1, s2 = numpy.array((0, ndE[state])) + state-(state>0) else: # If MBAR or selective MBAR or BAR/MBAR, store all. s1, s2 = (0, K) # Which dhdl columns are to be read. read_dhdl_sta = 1+self.bEnergy+self.bExpanded read_dhdl_end = read_dhdl_sta + n_components data = data.T dhdlt[state, :, nsnapshots_l[state]:nsnapshots_r[state]] = data[read_dhdl_sta : read_dhdl_end, :] if not bSelective_MBAR: r1, r2 = ( read_dhdl_end, read_dhdl_end + (ndE[state] if not self.bExpanded else K) ) if bPV: u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * ( data[r1:r2, :] + data[-1,:] ) else: u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * data[r1:r2, :] else: # can't do slicing; prepare a mask (slicing is thought to be faster/less memory consuming than masking) mask_read_uklt = numpy.array( [0]*read_dhdl_end + [1 if (k in sel_states) else 0 for k in range(ndE[0])] + ([0] if bPV else []), bool ) if bPV: u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * ( data[mask_read_uklt, :] + data[-1,:] ) else: u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * data[mask_read_uklt, :] return print "Loading in data from %s (%s) ..." % (self.filename, "all states" if self.bExpanded else 'state %d' % state) data = numpy.fromiter(iter_func(), dtype=float) if not self.len_first == self.len_last: data = data[: -self.len_last] data = data.reshape((-1, self.len_first)) if self.bExpanded: for k in range(K): mask_k = (data[:, 1] == k) data_k = data[mask_k] slice_data(data_k, k) else: slice_data(data) def parseLog(self): """By parsing the .log file of the expanded-ensemble simulation find out the time in ps when the WL equilibration has been reached. Return the greater of WLequiltime and equiltime.""" if not(P.bIgnoreWL): logfilename = self.filename.replace('.xvg', '.log') if not os.path.isfile(logfilename): raise SystemExit("\nERROR!\nThe .log file '%s' is needed to figure out when the Wang-Landau weights have been equilibrated, and it was not found.\nYou may rerun with the -x flag and the data will be discarded to 'equiltime', not bothering\nwith the extraction of the information on when the WL weights equilibration was reached.\nOtherwise, put the proper log file into the directory which is subject to the analysis." % logfilename) try: with open(logfilename, 'r') as infile: dt = float(unixlike.grepPy(infile, s='delta-t').split()[-1]) WLstep = int(unixlike.grepPy(infile, s='equilibrated').split()[1].replace(':', '')) except: print "\nERROR!\nThe Wang-Landau weights haven't equilibrated yet.\nIf you comprehend the consequences,\nrerun with the -x flag and the data\nwill be discarded to 'equiltime'.\n" raise WLtime = WLstep * dt else: WLtime = -1 return max(WLtime, P.equiltime) #=================================================================================================== # Preliminaries I: Sort the dhdl.xvg files; read in the @-header. #=================================================================================================== datafile_tuple = P.datafile_directory, P.prefix, P.suffix fs = [ F(filename) for filename in glob( '%s/%s*%s' % datafile_tuple ) ] n_files = len(fs) #NML: Clean up corrupted lines print 'Checking for corrupted xvg files....' xvgs = [filename for filename in sorted(glob( '%s/%s*%s' % datafile_tuple )) ] for f in xvgs: removeCorruptLines(f,f) if not n_files: raise SystemExit("\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs." % datafile_tuple) if n_files > 1: fs = sorted(fs, key=F.sortedHelper) if P.bSkipLambdaIndex: try: lambdas_to_skip = [int(l) for l in unixlike.trPy(P.bSkipLambdaIndex, '-').split()] except: print '\nERROR!\nDo not understand the format of the string that follows -k.\nIt should be a string of lambda indices linked by "-".\n' raise fs = [f for f in fs if not f.state in lambdas_to_skip] n_files = len(fs) lv = [] # *** P.snap_size = [] for nf, f in enumerate(fs): lv.append(f.readHeader()) if nf>0: if not f.lv_names == lv_names: if not len(f.lv_names) == n_components: raise SystemExit("\nERROR!\nFiles do not contain the same number of lambda gradient components; I cannot combine the data.") else: raise SystemExit("\nERROR!\nThe lambda gradient components have different names; I cannot combine the data.") if not f.bPV == bPV: raise SystemExit("\nERROR!\nSome files contain the PV energies, some do not; I cannot combine the files.") if not f.temperature == temperature: # compare against a string, not a float. raise SystemExit("\nERROR!\nTemperature is not the same in all .xvg files.") else: P.lv_names = lv_names = f.lv_names temperature = f.temperature if temperature: temperature_float = float(temperature) P.beta *= P.temperature/temperature_float P.beta_report *= P.temperature/temperature_float P.temperature = temperature_float print "Temperature is %s K." % temperature else: print "Temperature not present in xvg files. Using %g K." % P.temperature n_components = len(lv_names) bPV = f.bPV P.bExpanded = f.bExpanded #=================================================================================================== # Preliminaries II: Analyze data for validity; build up proper 'lv' and count up lambda states 'K'. #=================================================================================================== ndE = [len(i) for i in lv] # *** ndE_unique = numpy.unique(ndE) # *** # Scenario #1: Each file has all the dE columns -- can use MBAR. if len(ndE_unique) == 1: # [K] if not numpy.array([i == lv[0] for i in lv]).all(): raise SystemExit("\nERROR!\nArrays of lambda vectors are different; I cannot combine the data.") else: lv = lv[0] # Handle the case when only some particular files/lambdas are given. if 1 < n_files < len(lv) and not P.bExpanded: bSelective_MBAR = True sel_states = [f.state for f in fs] lv = [lv[i] for i in sel_states] else: bSelective_MBAR = False elif len(ndE_unique) <= 3: bSelective_MBAR = False # Scenario #2: Have the adjacent states only; 2 dE columns for the terminal states, 3 for inner ones. if ndE_unique.tolist() == [2, 3]: lv = [l[i>0] for i,l in enumerate(lv)] # Scenario #3: Have a mixture of formats (adjacent and all): either [2,3,K], or [2,K], or [3,K]. else: lv = lv[ndE_unique.argmax()] if 'MBAR' in P.methods: print "\nNumber of states is NOT the same for all simulations; I'm assuming that we only evaluate" print "nearest neighbor states, and so cannot use MBAR, removing the method." P.methods.remove('MBAR') print "\nStitching together the dhdl files. I am assuming that the files are numbered in order of" print "increasing lambda; otherwise, results will not be correct." else: print "The files contain the number of the dE columns I cannot deal with; will terminate.\n\n%-10s %s " % ("# of dE's", "File") for nf, f in enumerate(fs): print "%6d %s" % (ndE[nf], f.filename) raise SystemExit("\nERROR!\nThere are more than 3 groups of files (%s, to be exact) each having different number of the dE columns; I cannot combine the data." % len(ndE_unique)) lv = numpy.array(lv, float) # *** Lambda vectors. K = len(lv) # *** Number of lambda states. #=================================================================================================== # Preliminaries III: Count up the equilibrated snapshots. #=================================================================================================== equiltime = P.equiltime nsnapshots = numpy.zeros((n_files, K), int) for nf, f in enumerate(fs): f.len_first, f.len_last = (len(line.split()) for line in unixlike.tailPy(f.filename, 2)) bLenConsistency = (f.len_first != f.len_last) if f.bExpanded: equiltime = f.parseLog() equilsnapshots = int(round(equiltime/f.snap_size)) f.skip_lines += equilsnapshots extract_states = (numpy.genfromtxt(f.filename, dtype=float, skip_header=f.skip_lines, skip_footer=1*bLenConsistency, usecols=1)).astype(int) if np.max(extract_states) > K: # The number of states is actually bigger. we need to make the array larger. # for some reason, resize isn't working. So do it more brute force. old_K = K K = np.max(extract_states) temp_array = numpy.zeros([n_files,K],int) temp_array[:,:old_K] = nsnapshots.copy() nsnapshots = temp_array.copy() c = Counter(extract_states) # need to make sure states with zero counts are properly counted. # It's OK for some of the expanded files to have no samples as long # at least one has samples for all states for k in range(K): nsnapshots[nf,k] += c[k] #nsnapshots[nf] += numpy.array(Counter(extract_states).values()) else: equilsnapshots = int(equiltime/f.snap_size) f.skip_lines += equilsnapshots nsnapshots[nf,nf] += unixlike.wcPy(f.filename) - f.skip_lines - 1*bLenConsistency print "First %s ps (%s snapshots) will be discarded due to equilibration from file %s..." % (equiltime, equilsnapshots, f.filename) #=================================================================================================== # Preliminaries IV: Load in equilibrated data. #=================================================================================================== maxn = max(nsnapshots.sum(axis=0)) # maximum number of the equilibrated snapshots from any state dhdlt = numpy.zeros([K,n_components,int(maxn)], float) # dhdlt[k,n,t] is the derivative of energy component n with respect to state k of snapshot t u_klt = numpy.zeros([K,K,int(maxn)], numpy.float64) # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m nsnapshots = numpy.concatenate((numpy.zeros([1, K], int), nsnapshots)) for nf, f in enumerate(fs): nsnapshots_l = nsnapshots[:nf+1].sum(axis=0) nsnapshots_r = nsnapshots[:nf+2].sum(axis=0) f.iter_loadtxt(nf) return nsnapshots.sum(axis=0), lv, dhdlt, u_klt
def readDataDesmond(P): class F: def __init__(self, filename): self.filename = filename def sortedHelper(self): meat = os.path.basename(self.filename).replace(P.prefix, '').replace( P.suffix, '') l = [i for i in re.split('\.|-|_', meat) if i] try: self.state = l[0] = int( l[0]) # Will be of use for selective MBAR analysis. except: print( "\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n" ) raise return tuple(l) def get_snapsize(self): self.skip_lines = 0 self.lv_names = () snap_size = [ ] # Time from first two snapshots to determine snapshot's size. self.lv = [] # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0). with open(self.filename, 'r') as infile: for line in infile: snap_size.append(float(line.split()[0])) if len(snap_size) > 1: self.snap_size = numpy.diff(snap_size)[0] P.snap_size.append(self.snap_size) break def iter_loadtxt(self, state): def iter_func(): with open(self.filename, 'r') as infile: for _ in range(self.skip_lines): next(infile) for line in infile: line = line.split() for item in line: yield item def slice_data(data, state=state): #Energies stored in: # Reverse: data[1,:] # Forward: data[2,:] #Desmond unit input: kcal/mol, conversion factor 4.184kJ/kcal #P.beta from alchemical_analysis.py in kJ/mol/K #Return: u_klt contains energies of adjacent lambdas only data = data.T if state == 0: u_klt[state, state + 1, :nsnapshots[state]] = data[2, :] * 4.184 * P.beta elif state == K: u_klt[state, state - 1, :nsnapshots[state]] = data[2, :] * 4.184 * P.beta else: u_klt[state, state - 1, :nsnapshots[state]] = data[1, :] * 4.184 * P.beta u_klt[state, state + 1, :nsnapshots[state]] = data[2, :] * 4.184 * P.beta return print("Loading in data from %s (%s) ...") % (self.filename, 'state %d' % state) data = numpy.fromiter(iter_func(), dtype=float) if not self.len_first == self.len_last: data = data[:-self.len_last] data = data.reshape((-1, self.len_first)) slice_data(data) #=================================================================================================== # Preliminaries I: Get LV,Snapsize,consistency check, and skip frames #=================================================================================================== datafile_tuple = P.datafile_directory, P.prefix, P.suffix fs = [F(filename) for filename in glob('%s/%s*%s' % datafile_tuple)] n_files = len(fs) if not n_files: raise SystemExit( "\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs." % datafile_tuple) if n_files > 1: fs = sorted(fs, key=F.sortedHelper) ###Set lambda vector and get snapsize lv = [] P.snap_size = [] for nf, f in enumerate(fs): lv.append([nf, 0]) f.get_snapsize() P.lv_names = lv_names = f.lv_names n_components = len(lv_names) lv = numpy.array(lv, float) # *** Lambda vectors. K = len(lv) # *** Number of lambda states. equiltime = P.equiltime nsnapshots = numpy.zeros(K, int) for nf, f in enumerate(fs): ###Check for consistent timestep??? f.len_first, f.len_last = (len(line.split()) for line in unixlike.tailPy(f.filename, 2)) bLenConsistency = (f.len_first != f.len_last) ###Skip N snapshots equilsnapshots = int(equiltime / f.snap_size) f.skip_lines += equilsnapshots nsnapshots[nf] += unixlike.wcPy( f.filename) - f.skip_lines - 1 * bLenConsistency print( "First %s ps (%s snapshots) will be discarded due to equilibration from file %s..." ) % (equiltime, equilsnapshots, f.filename) #=================================================================================================== # Preliminaries: Load in equilibrated data. #=================================================================================================== maxn = max(nsnapshots ) # maximum number of the equilibrated snapshots from any state u_klt = numpy.zeros( [K, K + 1, int(maxn)], numpy.float64 ) # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m for nf, f in enumerate(fs): f.iter_loadtxt(nf) return nsnapshots, lv, u_klt