def __init__(self, filename):
            self.filename = filename
            self.skip_lines = 0  # Number of lines from the top that are to be skipped.
            snap_size = []  # Time from first two snapshots to determine snapshot's size.

            print "Reading metadata from %-*s" % (len_fstring + 1,
                                                  self.filename + ';'),
            with open(self.filename, 'r') as infile:
                for line in infile:

                    if line.startswith('#'):
                        self.skip_lines += 1
                        elements = line.split()
                        if 'lambba_val.val' in elements:
                            self.lv = elements[-1]
                            lv.append(elements[-1:])
                    else:
                        snap_size.append(float(line.split()[0]))
                        if len(snap_size) > 1:
                            self.snap_size = numpy.diff(snap_size)[0]
                            break
                equilsnapshots = int(P.equiltime / self.snap_size)
                self.skip_lines += equilsnapshots
                nsnapshots.append(unixlike.wcPy(infile) + 2 - equilsnapshots)
                print "first %s ps (%s snapshots) will be discarded due to equilibration..." % (
                    P.equiltime, equilsnapshots)
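The snapshot size above is simply the time difference between the first two non-comment rows of the data file. A minimal self-contained sketch of the same idea, using hypothetical file contents:

import numpy

times = []
for line in ["# some header", "0.0  -7.1", "2.0  -7.3", "4.0  -7.2"]:  # hypothetical rows
    if line.startswith('#'):
        continue
    times.append(float(line.split()[0]))
    if len(times) > 1:
        snap_size = numpy.diff(times)[0]  # 2.0 ps between snapshots
        break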
Example no. 3
def readDataGOMC(P):
    """Read in .dat files; return nsnapshots, lv, dhdlt, and u_klt."""
    class F:
        """This is the object to be built on the filename."""
        def __init__(self, filename):
            self.filename = filename

        def sortedHelper(self):
            """This function will assist the built-in 'sorted' to sort filenames.
            Returns a tuple whose first element is an integer while others are strings."""
            meat = os.path.basename(self.filename).replace(P.prefix,
                                                           '').replace(
                                                               P.suffix, '')
            l = [i for i in re.split('\.|-|_', meat) if i]
            try:
                self.state = l[0] = int(
                    l[0])  # Will be of use for selective MBAR analysis.
            except:
                print "\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n"
                raise
            return tuple(l)
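        # e.g. with prefix 'dhdl' and suffix '.dat', 'dhdl.10.dat' sorts by the
        # integer 10, so numeric order wins over lexicographic ('10' < '2').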

        def readHeader(self):
            self.skip_lines = 0  # Number of lines from the top that are to be skipped.
            self.lv_names = ()  # Lambda type names, e.g. 'coul', 'vdw'.
            snap_size = []  # Time from first two snapshots to determine snapshot's size.
            self.lv = []  # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0).

            self.bEnergy = False
            self.bPV = False
            self.bExpanded = False
            self.temperature = False

            print "Reading metadata from %s..." % self.filename
            with open(self.filename, 'r') as infile:
                for line in infile:

                    if line.startswith('#'):
                        self.skip_lines += 1
                        elements = unixlike.trPy(line).split()
                        for i in range(len(elements)):
                            if '#T' == elements[i]:
                                self.temperature = elements[i + 2]
                            elif 'Total_En' == elements[i]:
                                self.bEnergy = True
                            elif 'PV' == elements[i]:
                                self.bPV = True
                            elif 'State' == elements[i]:
                                c1 = elements[i + 3][:4].lower()
                                c2 = elements[i + 5][:3].lower()
                                self.lv_names = c1, c2
                            elif 'DelE' == elements[i]:
                                self.lv.append(elements[i + 2:i + 4])

                    else:
                        snap_size.append(float(line.split()[0]))
                        if len(snap_size) > 1:
                            self.snap_size = numpy.diff(snap_size)[0]
                            P.snap_size.append(self.snap_size)
                            break
            return self.lv

        def iter_loadtxt(self, state):
            """Houstonian Joe Kington claims it is faster than numpy.loadtxt:
         http://stackoverflow.com/questions/8956832/python-out-of-memory-on-large-csv-file-numpy"""
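            # iter_func streams whitespace-separated tokens one at a time so
            # numpy.fromiter can build a flat 1-D array without holding the
            # whole text file in memory; it is reshaped into rows afterwards.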
            def iter_func():
                with open(self.filename, 'r') as infile:
                    for _ in range(self.skip_lines):
                        next(infile)
                    for line in infile:
                        line = line.split()
                        for item in line:
                            yield item

            def slice_data(data, state=state):
                # Where the dE columns should be stored.
                if (len(ndE_unique) > 1 and ndE[state] < 4):
                    # If BAR, store shifted 2/3 arrays.
                    s1, s2 = numpy.array((0, ndE[state])) + state - (state > 0)
                else:
                    # If MBAR or selective MBAR or BAR/MBAR, store all.
                    s1, s2 = (0, K)
                # Which dhdl columns are to be read.
                read_dhdl_sta = 1 + self.bEnergy + self.bExpanded
                read_dhdl_end = read_dhdl_sta + n_components

                data = data.T
                dhdlt[state, :,
                      nsnapshots_l[state]:nsnapshots_r[state]] = data[
                          read_dhdl_sta:read_dhdl_end, :]

                if not bSelective_MBAR:
                    r1, r2 = (read_dhdl_end, read_dhdl_end +
                              (ndE[state] if not self.bExpanded else K))
                    if bPV:
                        u_klt[state, s1:s2, nsnapshots_l[state]:
                              nsnapshots_r[state]] = P.beta * (data[r1:r2, :] +
                                                               data[-1, :])
                    else:
                        u_klt[state, s1:s2, nsnapshots_l[state]:
                              nsnapshots_r[state]] = P.beta * data[r1:r2, :]
                else:  # can't do slicing; prepare a mask (slicing is thought to be faster/less memory consuming than masking)
                    mask_read_uklt = numpy.array(
                        [0] * read_dhdl_end +
                        [1 if (k in sel_states) else 0
                         for k in range(ndE[0])] + ([0] if bPV else []), bool)
                    if bPV:
                        u_klt[state, s1:s2, nsnapshots_l[state]:
                              nsnapshots_r[state]] = P.beta * (
                                  data[mask_read_uklt, :] + data[-1, :])
                    else:
                        u_klt[state, s1:s2, nsnapshots_l[state]:
                              nsnapshots_r[state]] = P.beta * data[
                                  mask_read_uklt, :]
                return

            print "Loading in data from %s (%s) ..." % (
                self.filename,
                "all states" if self.bExpanded else 'state %d' % state)
            data = numpy.fromiter(iter_func(), dtype=float)
            if not self.len_first == self.len_last:
                data = data[:-self.len_last]
            data = data.reshape((-1, self.len_first))

            if self.bExpanded:
                for k in range(K):
                    mask_k = (data[:, 1] == k)
                    data_k = data[mask_k]
                    slice_data(data_k, k)
            else:
                slice_data(data)

        def parseLog(self):
            """By parsing the .log file of the expanded-ensemble simulation
         find out the time in ps when the WL equilibration has been reached.
         Return the greater of WLequiltime and equiltime."""
            if not (P.bIgnoreWL):
                logfilename = self.filename.replace('.xvg', '.log')
                if not os.path.isfile(logfilename):
                    raise SystemExit(
                        "\nERROR!\nThe .log file '%s' is needed to figure out when the Wang-Landau weights have been equilibrated, and it was not found.\nYou may rerun with the -x flag and the data will be discarded to 'equiltime', not bothering\nwith the extraction of the information on when the WL weights equilibration was reached.\nOtherwise, put the proper log file into the directory which is subject to the analysis."
                        % logfilename)
                try:
                    with open(logfilename, 'r') as infile:
                        dt = float(
                            unixlike.grepPy(infile, s='delta-t').split()[-1])
                        WLstep = int(
                            unixlike.grepPy(
                                infile,
                                s='equilibrated').split()[1].replace(':', ''))
                except:
                    print "\nERROR!\nThe Wang-Landau weights haven't equilibrated yet.\n"
                    print "If you comprehend the consequences,\n rerun with the -x flag and the data\n will be discarded to equiltime.\n"
                    raise
                WLtime = WLstep * dt
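                # WLstep * dt converts the step at which the log reports the WL
                # weights as equilibrated into a time in ps, which is compared
                # below against P.equiltime.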
            else:
                WLtime = -1
            return max(WLtime, P.equiltime)

    #===================================================================================================
    # Preliminaries I: Sort the dhdl.dat files; read in the @-header.
    #===================================================================================================

    datafile_tuple = P.datafile_directory, P.prefix, P.suffix
    fs = [F(filename) for filename in glob('%s/%s*%s' % datafile_tuple)]
    n_files = len(fs)

    #NML: Clean up corrupted lines
    print 'Checking for corrupted dat files....'
    xvgs = [filename for filename in sorted(glob('%s/%s*%s' % datafile_tuple))]
    for f in xvgs:
        removeCorruptLines(f, f)

    if not n_files:
        raise SystemExit(
            "\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs."
            % datafile_tuple)
    if n_files > 1:
        fs = sorted(fs, key=F.sortedHelper)

    if P.bSkipLambdaIndex:
        try:
            lambdas_to_skip = [
                int(l) for l in unixlike.trPy(P.bSkipLambdaIndex, '-').split()
            ]
        except:
            print '\nERROR!\nDo not understand the format of the string that follows -k.\nIt should be a string of lambda indices linked by "-".\n'
            raise
        fs = [f for f in fs if not f.state in lambdas_to_skip]
        n_files = len(fs)

    lv = []  # ***
    P.snap_size = []
    for nf, f in enumerate(fs):
        lv.append(f.readHeader())

        if nf > 0:

            if not f.lv_names == lv_names:
                if not len(f.lv_names) == n_components:
                    raise SystemExit(
                        "\nERROR!\nFiles do not contain the same number of lambda gradient components; I cannot combine the data."
                    )
                else:
                    raise SystemExit(
                        "\nERROR!\nThe lambda gradient components have different names; I cannot combine the data."
                    )
            if not f.bPV == bPV:
                raise SystemExit(
                    "\nERROR!\nSome files contain the PV energies, some do not; I cannot combine the files."
                )
            if not f.temperature == temperature:  # compare against a string, not a float.
                raise SystemExit(
                    "\nERROR!\nTemperature is not the same in all .xvg files.")

        else:

            P.lv_names = lv_names = f.lv_names

            temperature = f.temperature
            if temperature:
                temperature_float = float(temperature)
                P.beta *= P.temperature / temperature_float
                P.beta_report *= P.temperature / temperature_float
                P.temperature = temperature_float
                print "Temperature is %s K." % temperature
            else:
                print "Temperature not present in dat files. Using %g K." % P.temperature

            n_components = len(lv_names)
            bPV = f.bPV
            P.bExpanded = f.bExpanded

    #===================================================================================================
    # Preliminaries II: Analyze data for validity; build up proper 'lv' and count up lambda states 'K'.
    #===================================================================================================

    ndE = [len(i) for i in lv]  # ***
    ndE_unique = numpy.unique(ndE)  # ***

    # Scenario #1: Each file has all the dE columns -- can use MBAR.
    if len(ndE_unique) == 1:  # [K]
        if not numpy.array([i == lv[0] for i in lv]).all():
            raise SystemExit(
                "\nERROR!\nArrays of lambda vectors are different; I cannot combine the data."
            )
        else:
            lv = lv[0]
            # Handle the case when only some particular files/lambdas are given.
            if 1 < n_files < len(lv) and not P.bExpanded:
                bSelective_MBAR = True
                sel_states = [f.state for f in fs]
                lv = [lv[i] for i in sel_states]
            else:
                bSelective_MBAR = False

    elif len(ndE_unique) <= 3:
        bSelective_MBAR = False
        # Scenario #2: Have the adjacent states only; 2 dE columns for the terminal states, 3 for inner ones.
        if ndE_unique.tolist() == [2, 3]:
            lv = [l[i > 0] for i, l in enumerate(lv)]
        # Scenario #3: Have a mixture of formats (adjacent and all): either [2,3,K], or [2,K], or [3,K].
        else:
            lv = lv[ndE_unique.argmax()]
        if 'MBAR' in P.methods:
            print "\nNumber of states is NOT the same for all simulations; I'm assuming that we only evaluate"
            print "nearest neighbor states, and so cannot use MBAR, removing the method."
            P.methods.remove('MBAR')
        print "\nStitching together the dhdl files. I am assuming that the files are numbered in order of"
        print "increasing lambda; otherwise, results will not be correct."

    else:
        print "The files contain the number of the dE columns I cannot deal with; will terminate.\n\n%-10s %s " % (
            "# of dE's", "File")
        for nf, f in enumerate(fs):
            print "%6d     %s" % (ndE[nf], f.filename)
        raise SystemExit(
            "\nERROR!\nThere are more than 3 groups of files (%s, to be exact) each having a different number of dE columns; I cannot combine the data."
            % len(ndE_unique))

    lv = numpy.array(lv, float)  # *** Lambda vectors.
    K = len(lv)  # *** Number of lambda states.

    #===================================================================================================
    # Preliminaries III: Count up the equilibrated snapshots.
    #===================================================================================================

    equiltime = P.equiltime
    nsnapshots = numpy.zeros((n_files, K), int)

    for nf, f in enumerate(fs):

        f.len_first, f.len_last = (len(line.split())
                                   for line in unixlike.tailPy(f.filename, 2))
        bLenConsistency = (f.len_first != f.len_last)

        if f.bExpanded:

            equiltime = f.parseLog()
            equilsnapshots = int(round(equiltime / f.snap_size))
            f.skip_lines += equilsnapshots

            extract_states = (numpy.genfromtxt(f.filename,
                                               dtype=float,
                                               skip_header=f.skip_lines,
                                               skip_footer=1 * bLenConsistency,
                                               usecols=1)).astype(int)
            if numpy.max(extract_states) > K:
                # The number of states is actually bigger; we need to make the array larger.
                # For some reason, resize isn't working, so do it by brute force.
                old_K = K
                K = numpy.max(extract_states)
                temp_array = numpy.zeros([n_files, K], int)
                temp_array[:, :old_K] = nsnapshots.copy()
                nsnapshots = temp_array.copy()

            # Counter makes sure states with zero counts are properly counted.
            # It's OK for some of the expanded files to have no samples as long
            # as at least one has samples for all states.
            c = Counter(extract_states)
            for k in range(K):
                nsnapshots[nf, k] += c[k]
            #nsnapshots[nf] += numpy.array(Counter(extract_states).values())

        else:
            equilsnapshots = int(equiltime / f.snap_size)
            f.skip_lines += equilsnapshots
            nsnapshots[nf, nf] += unixlike.wcPy(
                f.filename) - f.skip_lines - 1 * bLenConsistency

        print "First %s ps (%s snapshots) will be discarded due to equilibration from file %s..." % (
            equiltime, equilsnapshots, f.filename)

    #===================================================================================================
    # Preliminaries IV: Load in equilibrated data.
    #===================================================================================================

    maxn = max(nsnapshots.sum(
        axis=0))  # maximum number of the equilibrated snapshots from any state
    dhdlt = numpy.zeros(
        [K, n_components, int(maxn)], float
    )  # dhdlt[k,n,t] is the derivative of energy component n with respect to state k of snapshot t
    u_klt = numpy.zeros(
        [K, K, int(maxn)], numpy.float64
    )  # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m

    nsnapshots = numpy.concatenate((numpy.zeros([1, K], int), nsnapshots))
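    # A zero row is prepended so that, for file nf, the cumulative sums of the
    # first nf+1 and nf+2 rows give the left/right snapshot indices at which
    # that file's data is stored in dhdlt and u_klt.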
    for nf, f in enumerate(fs):
        nsnapshots_l = nsnapshots[:nf + 1].sum(axis=0)
        nsnapshots_r = nsnapshots[:nf + 2].sum(axis=0)
        f.iter_loadtxt(nf)
    return nsnapshots.sum(axis=0), lv, dhdlt, u_klt
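The prepended row of zeros above makes each file's storage window fall out of cumulative sums. A minimal self-contained sketch with made-up counts for three files and two lambda states (hypothetical numbers, not from any simulation):

import numpy

counts = numpy.array([[5, 0], [0, 7], [3, 4]], int)             # per-file, per-state snapshot counts
counts = numpy.concatenate((numpy.zeros([1, 2], int), counts))  # prepend a zero row
for nf in range(3):
    left = counts[:nf + 1].sum(axis=0)    # nsnapshots_l for file nf
    right = counts[:nf + 2].sum(axis=0)   # nsnapshots_r for file nf
    print nf, left, right                 # 0 [0 0] [5 0] / 1 [5 0] [5 7] / 2 [5 7] [8 11]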
Example no. 4
def readDataDesmond(P):

   class F:

      def __init__(self, filename):
         self.filename = filename

      def sortedHelper(self):
         meat = os.path.basename(self.filename).replace(P.prefix, '').replace(P.suffix, '')
         l = [i for i in re.split('\.|-|_', meat) if i]
         try:
            self.state = l[0] = int(l[0]) # Will be of use for selective MBAR analysis.
         except:
            print("\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n")
            raise
         return tuple(l)

      def get_snapsize(self):
         self.skip_lines = 0
         self.lv_names   = ()
         snap_size       = [] # Time from first two snapshots to determine snapshot's size.
         self.lv         = [] # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0).


         with open(self.filename,'r') as infile:
            for line in infile:
               snap_size.append(float(line.split()[0]))
               if len(snap_size) > 1:
                  self.snap_size = numpy.diff(snap_size)[0]
                  P.snap_size.append(self.snap_size)
                  break

      def iter_loadtxt(self, state):
         def iter_func():
            with open(self.filename, 'r') as infile:
               for _ in range(self.skip_lines):
                  next(infile)
               for line in infile:
                  line = line.split()
                  for item in line:
                     yield item

         def slice_data(data, state=state):
            #Energies stored in:
            #   Reverse: data[1,:]
            #   Forward: data[2,:]
            #Desmond writes energies in kcal/mol; conversion factor 4.184 kJ/kcal.
            #P.beta comes from alchemical_analysis.py (kB in kJ/mol/K), so
            #data * 4.184 * P.beta is a reduced energy.
            #Return: u_klt contains energies of adjacent lambdas only.

            data = data.T
            if state == 0:
               u_klt[state, state+1 , :nsnapshots[state]] = data[ 2 , : ]*4.184*P.beta
            elif state == K:
               u_klt[state, state-1 , :nsnapshots[state]] = data[ 2 , : ]*4.184*P.beta
            else:
               u_klt[state, state-1, :nsnapshots[state]] = data[ 1 , :]*4.184*P.beta
               u_klt[state, state+1, :nsnapshots[state]] = data[ 2 , :]*4.184*P.beta
            return

         print("Loading in data from %s (%s) ...") % (self.filename, 'state %d' % state)
         data = numpy.fromiter(iter_func(), dtype=float)
         if not self.len_first == self.len_last:
            data = data[: -self.len_last]
         data = data.reshape((-1, self.len_first))

         slice_data(data)
   #===================================================================================================
   # Preliminaries I: Get lambda vectors and snapshot size; consistency check; skip equilibration frames
   #===================================================================================================

   datafile_tuple = P.datafile_directory, P.prefix, P.suffix
   fs = [ F(filename) for filename in glob( '%s/%s*%s' % datafile_tuple ) ]
   n_files = len(fs)

   if not n_files:
      raise SystemExit("\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs." % datafile_tuple)
   if n_files > 1:
      fs = sorted(fs, key=F.sortedHelper)

   ###Set lambda vector and get snapsize
   lv = []
   P.snap_size = []
   for nf, f in enumerate(fs):
      lv.append( [nf,0] )
      f.get_snapsize()

      P.lv_names = lv_names = f.lv_names
      n_components = len(lv_names)

   lv = numpy.array(lv, float) # *** Lambda vectors.
   K  = len(lv)                # *** Number of lambda states.
   equiltime = P.equiltime
   nsnapshots = numpy.zeros(K, int)

   for nf, f in enumerate(fs):
      ###Check for consistent timestep???
      f.len_first, f.len_last = (len(line.split()) for line in unixlike.tailPy(f.filename, 2))
      bLenConsistency = (f.len_first != f.len_last)

      ###Skip N snapshots
      equilsnapshots  = int(equiltime/f.snap_size)
      f.skip_lines   += equilsnapshots
      nsnapshots[nf] += unixlike.wcPy(f.filename) - f.skip_lines - 1*bLenConsistency
      print("First %s ps (%s snapshots) will be discarded due to equilibration from file %s...") % (equiltime, equilsnapshots, f.filename)

   #===================================================================================================
   # Preliminaries: Load in equilibrated data.
   #===================================================================================================

   maxn  = max(nsnapshots)                                   # maximum number of the equilibrated snapshots from any state
   u_klt = numpy.zeros([K,K+1,int(maxn)], numpy.float64)       # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m
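   # The second dimension is K+1 because slice_data stores only adjacent-state
   # energies; as written, the last state's "forward" column lands at index K.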
   for nf, f in enumerate(fs):
      f.iter_loadtxt(nf)
   return nsnapshots, lv, u_klt
def readDataGromacs(P):
   """Read in .xvg files; return nsnapshots, lv, dhdlt, and u_klt."""
   
   class F:
      """This is the object to be built on the filename."""
 
      def __init__(self, filename):
         self.filename = filename
 
      def sortedHelper(self):
         """This function will assist the built-in 'sorted' to sort filenames.
            Returns a tuple whose first element is an integer while others are strings."""
         meat = os.path.basename(self.filename).replace(P.prefix, '').replace(P.suffix, '')
         l = [i for i in re.split('\.|-|_', meat) if i]
         try:
            self.state = l[0] = int(l[0]) # Will be of use for selective MBAR analysis.
         except:
            print "\nERROR!\nFile's prefix should be followed by a numerical character. Cannot sort the files.\n"
            raise
         return tuple(l)
 
      def readHeader(self):
         self.skip_lines = 0  # Number of lines from the top that are to be skipped.
         self.lv_names   = () # Lambda type names, e.g. 'coul', 'vdw'.
         snap_size       = [] # Time from first two snapshots to determine snapshot's size.
         self.lv         = [] # Lambda vectors, e.g. (0, 0), (0.2, 0), (0.5, 0).
 
         self.bEnergy    = False
         self.bPV        = False
         self.bExpanded  = False
         self.temperature= False
 
         print "Reading metadata from %s..." % self.filename
         with open(self.filename,'r') as infile:
            for line in infile:
 
               if line.startswith('#'):
                  self.skip_lines += 1
 
               elif line.startswith('@'):
                  self.skip_lines += 1
                  elements = unixlike.trPy(line).split()
                  if not 'legend' in elements:
                     if 'T' in elements:
                        self.temperature = elements[4]
                     continue
 
                  if 'Energy' in elements:
                     self.bEnergy = True
                  if 'pV' in elements:
                     self.bPV = True
                  if 'state' in elements:
                     self.bExpanded = True
 
                  if 'dH' in elements:
                     self.lv_names += elements[7],
                  if 'xD' in elements:
                     self.lv.append(elements[-len(self.lv_names):])
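                  # 'dH' legends name the lambda components; 'xD' legends carry
                  # the foreign lambda vectors whose energies become the dE columns.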
 
               else:
                  snap_size.append(float(line.split()[0]))
                  if len(snap_size) > 1:
                     self.snap_size = numpy.diff(snap_size)[0]
                     P.snap_size.append(self.snap_size)
                     break
         return self.lv
 
      def iter_loadtxt(self, state):
         """Houstonian Joe Kington claims it is faster than numpy.loadtxt:
         http://stackoverflow.com/questions/8956832/python-out-of-memory-on-large-csv-file-numpy"""
         
         def iter_func():
            with open(self.filename, 'r') as infile:
               for _ in range(self.skip_lines):
                  next(infile)
               for line in infile:
                  line = line.split()
                  for item in line:
                     yield item
 
         def slice_data(data, state=state):
            # Where the dE columns should be stored.
            if (len(ndE_unique)>1 and ndE[state]<4):
               # If BAR, store shifted 2/3 arrays.
               s1, s2 = numpy.array((0, ndE[state])) + state-(state>0)
            else:
               # If MBAR or selective MBAR or BAR/MBAR, store all.
               s1, s2 = (0, K)
            # Which dhdl columns are to be read.
            read_dhdl_sta = 1+self.bEnergy+self.bExpanded
            read_dhdl_end = read_dhdl_sta + n_components
  
            data = data.T
            dhdlt[state, :, nsnapshots_l[state]:nsnapshots_r[state]] = data[read_dhdl_sta : read_dhdl_end, :]

            if not bSelective_MBAR:
               r1, r2 = ( read_dhdl_end, read_dhdl_end + (ndE[state] if not self.bExpanded else K) )
               if bPV:
                  u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * ( data[r1:r2, :] + data[-1,:] )
               else:
                  u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * data[r1:r2, :]
            else: # can't do slicing; prepare a mask (slicing is thought to be faster/less memory consuming than masking)
               mask_read_uklt = numpy.array( [0]*read_dhdl_end + [1 if (k in sel_states) else 0 for k in range(ndE[0])] + ([0] if bPV else []), bool )
               if bPV:
                  u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * ( data[mask_read_uklt, :] + data[-1,:] )
               else:
                  u_klt[state, s1:s2, nsnapshots_l[state]:nsnapshots_r[state]] = P.beta * data[mask_read_uklt, :]
            return
 
         print "Loading in data from %s (%s) ..." % (self.filename, "all states" if self.bExpanded else 'state %d' % state)
         data = numpy.fromiter(iter_func(), dtype=float)
         if not self.len_first == self.len_last:
            data = data[: -self.len_last]
         data = data.reshape((-1, self.len_first))
         
         if self.bExpanded:
            for k in range(K):
               mask_k = (data[:, 1] == k)
               data_k = data[mask_k]
               slice_data(data_k, k)
         else:
            slice_data(data)
 
      def parseLog(self):
         """By parsing the .log file of the expanded-ensemble simulation
         find out the time in ps when the WL equilibration has been reached.
         Return the greater of WLequiltime and equiltime."""
         if not(P.bIgnoreWL):
            logfilename = self.filename.replace('.xvg', '.log')
            if not os.path.isfile(logfilename):
               raise SystemExit("\nERROR!\nThe .log file '%s' is needed to figure out when the Wang-Landau weights have been equilibrated, and it was not found.\nYou may rerun with the -x flag and the data will be discarded to 'equiltime', not bothering\nwith the extraction of the information on when the WL weights equilibration was reached.\nOtherwise, put the proper log file into the directory which is subject to the analysis." % logfilename)
            try:
               with open(logfilename, 'r') as infile:
                  dt = float(unixlike.grepPy(infile, s='delta-t').split()[-1])
                  WLstep = int(unixlike.grepPy(infile, s='equilibrated').split()[1].replace(':', ''))
            except:
               print "\nERROR!\nThe Wang-Landau weights haven't equilibrated yet.\nIf you comprehend the consequences,\nrerun with the -x flag and the data\nwill be discarded to 'equiltime'.\n"
               raise
            WLtime = WLstep * dt
         else:
            WLtime = -1
         return max(WLtime, P.equiltime)

   #===================================================================================================
   # Preliminaries I: Sort the dhdl.xvg files; read in the @-header.
   #===================================================================================================
   
   datafile_tuple = P.datafile_directory, P.prefix, P.suffix
   fs = [ F(filename) for filename in glob( '%s/%s*%s' % datafile_tuple ) ]
   n_files = len(fs)
   
   #NML: Clean up corrupted lines
   print 'Checking for corrupted xvg files....'
   xvgs = [filename for filename in sorted(glob( '%s/%s*%s' % datafile_tuple )) ]
   for f in xvgs:
      removeCorruptLines(f,f)
      
   if not n_files:
      raise SystemExit("\nERROR!\nNo files found within directory '%s' with prefix '%s' and suffix '%s': check your inputs." % datafile_tuple)
   if n_files > 1:
      fs = sorted(fs, key=F.sortedHelper)
   
   if P.bSkipLambdaIndex:
      try:
         lambdas_to_skip = [int(l) for l in unixlike.trPy(P.bSkipLambdaIndex, '-').split()]
      except:
         print '\nERROR!\nDo not understand the format of the string that follows -k.\nIt should be a string of lambda indices linked by "-".\n'
         raise
      fs = [f for f in fs if not f.state in lambdas_to_skip]
      n_files = len(fs)
   
   lv = []  # *** 
   P.snap_size = []
   for nf, f in enumerate(fs):
      lv.append(f.readHeader())
   
      if nf>0:
   
         if not f.lv_names == lv_names:
            if not len(f.lv_names) == n_components:
               raise SystemExit("\nERROR!\nFiles do not contain the same number of lambda gradient components; I cannot combine the data.")
            else:
               raise SystemExit("\nERROR!\nThe lambda gradient components have different names; I cannot combine the data.")
         if not f.bPV == bPV:
            raise SystemExit("\nERROR!\nSome files contain the PV energies, some do not; I cannot combine the files.")
         if not f.temperature == temperature: # compare against a string, not a float.
            raise SystemExit("\nERROR!\nTemperature is not the same in all .xvg files.")
   
      else:
   
         P.lv_names = lv_names = f.lv_names

         temperature = f.temperature
         if temperature:
            temperature_float = float(temperature)
            P.beta *= P.temperature/temperature_float
            P.beta_report *= P.temperature/temperature_float
            P.temperature = temperature_float
            print "Temperature is %s K." % temperature
         else:
            print "Temperature not present in xvg files. Using %g K." % P.temperature

         n_components = len(lv_names)
         bPV = f.bPV
         P.bExpanded = f.bExpanded

   #===================================================================================================
   # Preliminaries II: Analyze data for validity; build up proper 'lv' and count up lambda states 'K'.
   #===================================================================================================
   
   ndE = [len(i) for i in lv]     # ***
   ndE_unique = numpy.unique(ndE) # ***
   
   # Scenario #1: Each file has all the dE columns -- can use MBAR.
   if len(ndE_unique) == 1: # [K]
      if not numpy.array([i == lv[0] for i in lv]).all():
         raise SystemExit("\nERROR!\nArrays of lambda vectors are different; I cannot combine the data.")
      else:
         lv = lv[0]
         # Handle the case when only some particular files/lambdas are given.
         if 1 < n_files < len(lv) and not P.bExpanded:
            bSelective_MBAR = True
            sel_states = [f.state for f in fs]
            lv = [lv[i] for i in sel_states]
         else:
            bSelective_MBAR = False
   
   elif len(ndE_unique) <= 3:
      bSelective_MBAR = False
      # Scenario #2: Have the adjacent states only; 2 dE columns for the terminal states, 3 for inner ones.
      if ndE_unique.tolist() == [2, 3]:
         lv  = [l[i>0]  for i,l in enumerate(lv)]
      # Scenario #3: Have a mixture of formats (adjacent and all): either [2,3,K], or [2,K], or [3,K].
      else:
         lv = lv[ndE_unique.argmax()]
      if 'MBAR' in P.methods:
         print "\nNumber of states is NOT the same for all simulations; I'm assuming that we only evaluate"
         print "nearest neighbor states, and so cannot use MBAR, removing the method."
         P.methods.remove('MBAR')
      print "\nStitching together the dhdl files. I am assuming that the files are numbered in order of"
      print "increasing lambda; otherwise, results will not be correct."
   
   else:
      print "The files contain the number of the dE columns I cannot deal with; will terminate.\n\n%-10s %s " % ("# of dE's", "File")
      for nf, f in enumerate(fs):
         print "%6d     %s" % (ndE[nf], f.filename)
      raise SystemExit("\nERROR!\nThere are more than 3 groups of files (%s, to be exact) each having a different number of dE columns; I cannot combine the data." % len(ndE_unique))
   
   lv = numpy.array(lv, float) # *** Lambda vectors.
   K  = len(lv)                # *** Number of lambda states.

   #===================================================================================================
   # Preliminaries III: Count up the equilibrated snapshots.
   #===================================================================================================
   
   equiltime = P.equiltime
   nsnapshots = numpy.zeros((n_files, K), int)


   for nf, f in enumerate(fs):

      f.len_first, f.len_last = (len(line.split()) for line in unixlike.tailPy(f.filename, 2))
      bLenConsistency = (f.len_first != f.len_last)
         
      if f.bExpanded:

   
         equiltime       = f.parseLog()
         equilsnapshots  = int(round(equiltime/f.snap_size))
         f.skip_lines   += equilsnapshots
   
         extract_states  = (numpy.genfromtxt(f.filename, dtype=float, skip_header=f.skip_lines, skip_footer=1*bLenConsistency, usecols=1)).astype(int)
         if numpy.max(extract_states) > K:
            # The number of states is actually bigger; we need to make the array larger.
            # For some reason, resize isn't working, so do it by brute force.
            old_K = K
            K = numpy.max(extract_states)
            temp_array = numpy.zeros([n_files,K],int)
            temp_array[:,:old_K] = nsnapshots.copy()
            nsnapshots = temp_array.copy()

         c = Counter(extract_states)  # need to make sure states with zero counts are properly counted.
                                      # It's OK for some of the expanded files to have no samples as long
                                      # as at least one has samples for all states.
         for k in range(K):
            nsnapshots[nf,k] += c[k]
         #nsnapshots[nf] += numpy.array(Counter(extract_states).values())
   
      else:
         equilsnapshots  = int(equiltime/f.snap_size)
         f.skip_lines   += equilsnapshots
         nsnapshots[nf,nf] += unixlike.wcPy(f.filename) - f.skip_lines - 1*bLenConsistency
   
      print "First %s ps (%s snapshots) will be discarded due to equilibration from file %s..." % (equiltime, equilsnapshots, f.filename)
   
   #===================================================================================================
   # Preliminaries IV: Load in equilibrated data.
   #===================================================================================================   
   
   maxn  = max(nsnapshots.sum(axis=0))                       # maximum number of the equilibrated snapshots from any state
   dhdlt = numpy.zeros([K,n_components,int(maxn)], float)    # dhdlt[k,n,t] is the derivative of energy component n with respect to state k of snapshot t
   u_klt = numpy.zeros([K,K,int(maxn)], numpy.float64)       # u_klt[k,m,t] is the reduced potential energy of snapshot t of state k evaluated at state m

   nsnapshots = numpy.concatenate((numpy.zeros([1, K], int), nsnapshots))   
   for nf, f in enumerate(fs):
      nsnapshots_l = nsnapshots[:nf+1].sum(axis=0)
      nsnapshots_r = nsnapshots[:nf+2].sum(axis=0)
      f.iter_loadtxt(nf)
   return nsnapshots.sum(axis=0), lv, dhdlt, u_klt
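A hypothetical driver for readDataGromacs, shown only to make the expected inputs concrete; the attribute names on P are the ones this parser reads in the code above, but the values are placeholders (kB = 0.0083144621 kJ/mol/K):

from argparse import Namespace

kB = 0.0083144621                     # kJ/mol/K
T = 300.0
P = Namespace(
    datafile_directory='.', prefix='dhdl', suffix='.xvg',
    equiltime=100.0,                  # ps discarded as equilibration
    temperature=T,                    # overridden by the xvg header if present
    beta=1.0 / (kB * T),
    beta_report=1.0 / (kB * T),
    methods=['MBAR', 'BAR'],
    bSkipLambdaIndex='',              # or e.g. '1-2' (indices linked by '-')
    bIgnoreWL=False)

nsnapshots, lv, dhdlt, u_klt = readDataGromacs(P)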