Example #1
  def doCovar(self, winsize=.1, slide=.05):
    # Calculate covariance matrices over all temporal data in accordance with the given window parameters
    self.covar = []
    self.fealcov = []
    for i, tr in self.E.trlist.items():
      if i % 100 == 0:
        print(i)
      cov = dr.calc_covar(tr.xyz, winsize, 1, slide=slide)
      if self.space in ['ds', 'dsw']:
        X, C = dr.distance_space(tr), self.E.cent_ds
      else:
        X, C = tr.xyz, self.E.cent_c
      wgt = self.E.cw if self.space in ['cw', 'dsw'] else [1, 1, 1, 1, 1]
      rms = calc_rmsd(X, C, weights=wgt)
      W = int(winsize * 1000)   # window length as a frame count
      S = int(slide * 1000)     # slide offset as a frame count
      feal = [np.mean([FL.feal.atemporal(r) for r in rms[st:st+W]], axis=0)
              for st in range(0, len(tr.xyz), S)]
      for n in range(min(len(cov), len(feal))):
        self.covar.append(cov[n])
        self.fealcov.append(feal[n])
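In doCovar the window and slide arguments are converted to frame counts (multiplied by 1000) and a covariance matrix is computed for each window via dr.calc_covar. As a rough illustration of that sliding-window step, here is a minimal NumPy sketch; the helper name sliding_covar, the flattening of the coordinates, and the toy usage are illustrative assumptions, not taken from the dr/datareduce module.

import numpy as np

def sliding_covar(xyz, winsize=0.1, slide=0.05):
    # xyz: (n_frames, n_atoms, 3) coordinate array
    # Window and slide are converted to frame counts the same way doCovar does.
    W, S = int(winsize * 1000), int(slide * 1000)
    flat = xyz.reshape(len(xyz), -1)              # (n_frames, n_atoms*3)
    covs = []
    for st in range(0, len(flat) - W + 1, S):
        covs.append(np.cov(flat[st:st + W], rowvar=False))
    return covs

# Toy usage: 500 frames of a 5-atom system -> list of (15, 15) covariance matrices
covs = sliding_covar(np.random.rand(500, 5, 3))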
Example #2
  def calc_feal_AB(self, traj):
    """Atemporal (individual frame) feature landscape
    """
    maxd = self.max_rms_dist
    mind = self.min_rms_dist
    ds = DR.distance_space(traj)
    rms = rmsd.calc_rmsd(ds, self.cent_ds)

    feal_list = []
    for i in range(traj.n_frames):
      # Proximity to State: one-hot weight on the nearest centroid
      fealand = [0 for _ in range(5)]
      fealand[np.argmin(rms[i])] = self.scaleto
      # Proximity: scaled, clipped distance to each of the 5 centroids
      for dist in rms[i]:
        fealand.append(self.scaleto * max(maxd - max(dist, mind), 0) / (maxd - mind))

      # Additional Feature Spaces: pairwise relative distances between centroids
      tup = []
      for a in range(4):
        for b in range(a+1, 5):
          rel_dist = rms[i][a] - rms[i][b]
          tup.append(log_reld(rel_dist))

      fealand.extend(tup)
      feal_list.append(fealand)

    return np.array(feal_list)
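The per-frame feature landscape assembled above has 20 entries: 5 one-hot nearest-state features, 5 scaled proximities, and 10 pairwise relative distances. Below is a self-contained sketch for a single frame; the scaleto, mind, and maxd values are illustrative assumptions, and the log_reld scaling from the original is omitted because its definition is not shown here.

import numpy as np

def atemporal_feal(rms, scaleto=10.0, mind=3.0, maxd=15.0):
    # rms: length-5 array of distances from one frame to the 5 state centroids
    feal = np.zeros(5)
    feal[np.argmin(rms)] = scaleto                           # nearest-state one-hot
    prox = [scaleto * max(maxd - max(d, mind), 0) / (maxd - mind) for d in rms]
    rel = [rms[a] - rms[b] for a in range(4) for b in range(a + 1, 5)]
    return np.concatenate([feal, prox, rel])                 # 5 + 5 + 10 = 20 values

print(atemporal_feal(np.array([4.2, 7.9, 6.3, 11.0, 9.5])))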
Example #3
    def calc_feal_AB(self, traj):
        """Atemporal (individual frame) feature landscape
        """
        maxd = self.max_rms_dist
        mind = self.min_rms_dist
        ds = DR.distance_space(traj)
        rms = rmsd.calc_rmsd(ds, self.cent_ds)

        feal_list = []
        for i in range(traj.n_frames):
            # Proximity to State: one-hot weight on the nearest centroid
            fealand = [0 for _ in range(5)]
            fealand[np.argmin(rms[i])] = self.scaleto
            # Proximity: scaled, clipped distance to each of the 5 centroids
            for dist in rms[i]:
                fealand.append(self.scaleto * max(maxd - max(dist, mind), 0) /
                               (maxd - mind))

            # Additional Feature Spaces: pairwise relative distances between centroids
            tup = []
            for a in range(4):
                for b in range(a + 1, 5):
                    rel_dist = rms[i][a] - rms[i][b]
                    tup.append(log_reld(rel_dist))

            fealand.extend(tup)
            feal_list.append(fealand)

        return np.array(feal_list)
Example #4
traj, rmsd = [], []
for tnum in range(42):
  p, d = DE.getHistoricalTrajectory_prot(tnum)
  traj.append(md.load(d, top=p).atom_slice(afilt))

resrms = []
ccent = np.load('../data/bpti-alpha-cartesian-centroid.npy')
for tnum, tr in enumerate(detraj):
  minlist = TS.TimeScape.read_log(home+'/work/timescape/desh_%02d_minima.log'%tnum)
  minima = tr.slice(minlist)
  minima.superpose(topoa)
  for m in minima.xyz:
    resrms.append(LA.norm(ref - m, axis=1))


  ds = DR.distance_space(minima)
  state = np.array([np.argmin(LA.norm(cent-i, axis=1)) for i in ds])
  resrms = [np.zeros(58)]
  for i in range(1, len(minima)):
    resrms.append(LA.norm(minima[i].xyz - minima[i-1].xyz, axis=2))

  for m, s in zip(minima, state):
    resrms.append(LA.norm(m.xyz - ccent[s], axis=0))

for i in range(1, 91116): 
  CM[i] = np.abs(resrms[max(0, i-5):i].mean(0)-resrms[i: min(91116, i+5)].mean(0)) > theta

a, b, Z = 0, 30000, 300
for sc in range(8):
  data = {'State':bL[a:b]}
  for rd in range(7):
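The CM assignment above flags, for each frame, the residues whose RMS values change sharply, by comparing the mean over a few frames before the frame with the mean over a few frames after it. A standalone sketch of that windowed change test follows; the array sizes, the 5-frame windows, and the threshold value are illustrative assumptions.

import numpy as np

n_frames, n_res = 1000, 58
resrms = np.random.rand(n_frames, n_res)       # per-frame, per-residue RMS values
theta = 0.2                                    # change threshold (assumed)
CM = np.zeros((n_frames, n_res), dtype=bool)
for i in range(1, n_frames):
    before = resrms[max(0, i - 5):i].mean(0)   # mean over up to 5 frames before i
    after = resrms[i:min(n_frames, i + 5)].mean(0)
    CM[i] = np.abs(before - after) > theta     # True where the change exceeds theta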
Example #5
def bootstrap_lattice(catalog, num=10, build_new=False):
  ''' Bootstrap After TimeScape has run on source trajectory '''
  home = os.getenv("HOME")
  support = 1
  cutoff  = 8

  start_coord = ['de2586_315', 'de531_20', 'de3765_63', 'de3305_668', 'de1732_139']
  dcdfile = lambda x: home + '/work/data/{0}.dcd'.format(x)
  outloc  = lambda x: home+'/work/jc/denovouniform1/{0}/{0}'.format(x)


  traj_list = {}

  basin_list = catalog.lrange('basin:list', 0, -1)
  if len(basin_list) == 134:
    logging.info('Basin Data already loaded!')
    rms_delta_list = [(i, np.sum(pickle.loads(catalog.get('basin:rmsdelta:'+b)))) for i, b in enumerate(basin_list)]
  else:
    logging.info('Loading all bootstrap data to initialize...')
    basin_list = []
    rms_delta_list = []
    pdb_file = home+'/work/data/alpha.pdb'
    topo = md.load(pdb_file)
    ref_alpha = md.load(home+'/work/' + catalog.get('pdb:ref:0'))
    ref_alpha.atom_slice(ref_alpha.top.select_atom_indices('alpha'), inplace=True)
    res_rms_Kr = FEATURE_SET

    for sc in start_coord:
      dist_space = []
      srcfile = outloc(sc) + '.dcd'
      pdbfile = srcfile.replace('dcd', 'pdb')
      logging.debug('LOADING TRAJ:  %s', srcfile)
      traj = md.load(srcfile, top = pdbfile)
      traj_list[sc] = traj
      alpha = traj.atom_slice(traj.top.select_atom_indices('alpha'))

      logging.info('Grabbing TS data...')
      W = TS.TimeScape.windows(outloc(sc) + '_transitions.log')
      ts_traj = TS.TimeScapeParser(pdbfile, outloc(sc), sc, dcd=srcfile, traj=traj)
      basins = ts_traj.load_basins()

      logging.info("Processing distance space and residue RMS")
      dsa = DR.distance_space(alpha)
      resrmsd = 10*np.array([LA.norm(i-ref_alpha.xyz[0], axis=1) for i in alpha.xyz])
      basin_res_rms = np.zeros(shape=(len(ts_traj.basins), alpha.n_atoms))
      for i, (a,b) in enumerate(W):
        dist_space.append(dsa[a:b].mean(0))
        basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

      basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
      logging.debug('RMS LEN CHECK:  %d =?= %d    -- Updating RMS Delta',len(basins), len(basin_res_rms_delta))


      for i, basin in enumerate(basins):
        pipe = catalog.pipeline()
        bid = basin.id

        # Store on Disk and in redis
        jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
        logging.info('MIN for %s:   Idx# %d  to %s', bid, basin.mindex, jc_filename)
        minima_frame = traj.slice(basin.mindex)  #md.load_frame(src_traj, basin.mindex, top=src_traj.replace('dcd', 'pdb'))
        minima_frame.save_pdb(jc_filename)

        basin_hash = basin.kv()
        basin_hash['pdbfile'] = jc_filename
        logging.info('  Basin: %(id)s  %(start)d - %(end)d   Minima: %(mindex)d    size=%(len)d' % basin_hash)

        pipe.rpush('basin:list', bid)
        pipe.hmset('basin:%s'%bid, basin_hash)
        pipe.set('basin:dmu:'+bid, pickle.dumps(dist_space[i]))
        pipe.set('minima:%s'%bid, pickle.dumps(minima_frame))

        # FOR RESIDUE RMSD
        resrms_d = np.sum(basin_res_rms_delta[i][res_rms_Kr])
        basin_hash['resrms_delta'] = resrms_d
        rms_delta_list.append((len(basin_list), resrms_d))
        basin_list.append(basin_hash)
        pipe.set('basin:rmsdelta:'+bid, pickle.dumps(basin_res_rms_delta[i]))

        pipe.execute()




  # FOR RESIDUE RMSD

  # FOR SEED SAMPLING USING RMS_DELTA

  # Note: skip the first basin



  # Re-Construct the Lattice from 
  if build_new:
    dist_space = 10*np.array(dist_space)
    cm = dist_space < cutoff
    fs = lat.reduced_feature_set(cm, .115)
    dr, cr = dist_space[:, fs], cm[:, fs]


    mfis,lfis = lat.maxminer(cr, 1)
    dlat, ik = lat.derived_lattice(mfis, dr, cr)
    pickle.dump(mfis, open(home + '/work/data/denovo_mfis.p', 'wb'))
    pickle.dump(lfis, open(home + '/work/data/denovo_lfis.p', 'wb'))
    pickle.dump(ik, open(home + '/work/data/denovo_iset.p', 'wb'))
    pickle.dump(dlat, open(home + '/work/data/denovo_dlat.p', 'wb'))

  else:

    logging.info('Loading Pre-Constructed Lattice Data')
    dlat = pickle.load(open(home + '/work/data/denovo_dlat.p', 'rb'))
    mfis = pickle.load(open(home + '/work/data/denovo_mfis.p', 'rb'))
    lfis = pickle.load(open(home + '/work/data/denovo_lfis.p', 'rb'))
    ik = pickle.load(open(home + '/work/data/denovo_iset.p', 'rb'))

  with catalog.pipeline() as pipe:
    pipe.set('lattice:max_fis', pickle.dumps(mfis))
    pipe.set('lattice:low_fis', pickle.dumps(lfis))
    pipe.set('lattice:dlat', pickle.dumps(dlat))
    pipe.set('lattice:iset', pickle.dumps(ik))
    pipe.execute()

  # logging.info('Building Existing lattice object')
  # lattice=lat.Lattice(ds, fs, cutoff, support)
  # lattice.set_fis(max_fis, low_fis)
  # lattice.set_dlat(dlat, Ik)
  # sampler = LatticeSampler(lattice)

  # Sample -- FOR USING LATTICE TO BOOTSTRAP
  # cl,sc,el = lat.clusterlattice(dlat, cr, dr, ik, num_k=8, invert=True)
  # cl_list = sorted(el, key=lambda x: len(x))

  # TODO: Check if fan out > single item clusters
  # start_indices = [clu[0][0] for clu in cl_list[:num]]

  rms_delta_ranked = [x[0] for x in sorted(rms_delta_list, key=lambda i: i[1], reverse=True)]
  start_indices = rms_delta_ranked[:num]

  seedlist = [catalog.lindex('basin:list', i) for i in start_indices]
  sim_init = {key: catalog.get(key) for key in settings.sim_params.keys()}
  global_params = getSimParameters(sim_init, 'seed')
  global_params['psf'] = home+'/work/jc/serial2/de0_0/de0_0.psf'

  for seed in seedlist:
    logging.debug('\nSeeding Job: %s ', seed)
    basin = catalog.hgetall('basin:%s'%seed)
    catalog.rpush('executed', seed)

    # Generate new set of params/coords
    jcID, config = generateFromBasin(basin)

    # Update Additional JC Params and Decision History, as needed
    config.update(global_params)

    # Push to catalog
    logging.info("New Simulation Job Created: %s", jcID)
    for k, v in config.items():
      logging.debug("   %s:  %s", k, str(v))
    catalog.rpush('jcqueue', jcID)
    catalog.hmset(wrapKey('jc', jcID), config)
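In the build_new branch above, the distance space (converted to Angstroms) is thresholded at the cutoff to produce a boolean contact map, a reduced feature set is selected, and the lattice is mined from the reduced matrices. The sketch below illustrates the thresholding together with one plausible reduction rule (keep contacts that are neither always present nor always absent); the 0.115 frequency-band interpretation and the array sizes are assumptions, not necessarily what lat.reduced_feature_set actually does.

import numpy as np

dist_space = 10 * np.random.rand(500, 1653)    # (n_basins, n_atom_pairs), Angstroms
cutoff = 8
cm = dist_space < cutoff                       # boolean contact map
freq = cm.mean(axis=0)                         # fraction of basins with each contact
fs = np.where((freq > 0.115) & (freq < 1 - 0.115))[0]   # "informative" features
dr_feat, cr_feat = dist_space[:, fs], cm[:, fs]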
Example #6
    def execute(self, job):

        # PRE-PROCESS -----------------------------------------------------------
        settings = systemsettings()
        bench = microbench("sim_%s" % settings.name, self.seqNumFromID())
        bench.start()
        stat = StatCollector("sim_%s" % settings.name, self.seqNumFromID())
        mylogical_seqnum = str(self.seqNumFromID())

        # Prepare working directory, input/output files
        conFile = os.path.join(job["workdir"], job["name"] + ".conf")
        logFile = conFile.replace("conf", "log")  # log in same place as config file
        dcdFile = conFile.replace("conf", "dcd")  # dcd in same place as config file
        USE_SHM = True

        SIMULATE_RATIO = settings.SIMULATE_RATIO
        if SIMULATE_RATIO > 1:
            logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
        frame_size = (SIMULATE_RATIO * int(job["interval"])) / (1000)
        logging.info("Frame Size is %f  Using Sim Ratio of 1:%d", frame_size, SIMULATE_RATIO)

        EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
        logging.info("Running Experiment Configuration #%d", EXPERIMENT_NUMBER)

        # # Grab historical basin data's relative time to start (for lineage)
        traj = None

        # EXECUTE SIMULATION ---------------------------------------------------------
        if self.skip_simulation:

            logging.info("1. SKIPPING SIMULATION.....")
            USE_SHM = False

            job["dcd"] = dcdFile
            key = wrapKey("jc", job["name"])
            self.data[key]["dcd"] = dcdFile

        else:
            logging.info("1. Run Simulation")

            # Prepare & source to config file
            with open(self.data["sim_conf_template"], "r") as template:
                source = template.read()

            # >>>>Storing DCD into shared memory on this node

            if USE_SHM:
                ramdisk = gettempdir()
                job["outputloc"] = ramdisk
                dcd_ramfile = os.path.join(ramdisk, job["name"] + ".dcd")
            else:
                job["outputloc"] = ""

            with open(conFile, "w") as sysconfig:
                sysconfig.write(source % job)
                logging.info("Config written to: " + conFile)

            # # Run simulation in parallel
            # if 'parallel' in job:
            #   numnodes = job['parallel']
            #   total_tasks = numnodes * 24
            #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

            # # Run simulation single threaded
            # else:
            #   cmd = 'namd2 %s > %s' % (conFile, logFile)

            # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
            check = executecmd("module list")
            logging.debug("%s", check)

            cmd = "namd2 +p%d %s > %s" % (PARALLELISM, conFile, logFile)

            #  MICROBENCH #1 (file to Lustre)
            # logging.debug("Executing Simulation:\n   %s\n", cmd)
            # bench = microbench()
            # bench.start()
            # stdout = executecmd(cmd)
            # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
            # bench.mark('SimExec:%s' % job['name'])
            # shm_contents = os.listdir('/dev/shm/out')
            # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
            # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
            # logging.info("Copy Complete to Lustre.")
            # bench.mark('CopyLustre:%s' % job['name'])
            # shutil.rmtree(ramdisk)
            # shm_contents = os.listdir('/dev/shm')
            # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
            # bench.show()

            max_expected_obs = int(job["runtime"]) // int(job["dcdfreq"])
            # Retry up to 3 attempts if the sim fails
            MAX_TRY = 3
            for i in range(MAX_TRY, 0, -1):
                min_required_obs = int(max_expected_obs * ((i - 1) / (MAX_TRY)))
                logging.debug("Executing Simulation:\n   %s\n", cmd)
                logging.debug("# Obs Expected to see: %d", max_expected_obs)
                stdout = executecmd(cmd)
                logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
                # Check file for expected data
                if USE_SHM:
                    traj = md.load(dcd_ramfile, top=job["pdb"])
                else:
                    traj = md.load(dcdFile, top=job["pdb"])
                logging.info("Obs Threshold  = %4d", min_required_obs)
                logging.info("#Obs This Traj = %4d", traj.n_frames)
                if traj.n_frames >= min_required_obs:
                    logging.info("Full (enough) Sim Completed")
                    break
                logging.info("Detected a failed Simulation. Retrying the same sim.")
                break

            bench.mark("simulation")
            # bench.mark('SimExec:%s' % job['name'])

            # Internal stats
            sim_length = self.data["sim_step_size"] * int(job["runtime"])
            sim_realtime = bench.delta_last()
            sim_run_ratio = (sim_realtime / 60) / (sim_length / 1000000)
            logging.info("##SIM_RATIO %6.3f  min-per-ns-sim", sim_run_ratio)
            stat.collect("sim_ratio", sim_run_ratio)

            if USE_SHM:
                shm_contents = os.listdir(ramdisk)
                logging.debug("Ramdisk contents (should have files) : %s", str(shm_contents))

                if not os.path.exists(dcd_ramfile):
                    logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
                    time.sleep(10)

                if not os.path.exists(dcd_ramfile):
                    logging.warning("DCD STIILL FILE NOT FOUND!!!!")
                else:
                    logging.info("DCD File was found")

            # #  MICROBENCH #2 (file to Alluxio)
            # allux = AlluxioClient()
            # # copy to Alluxio FS
            # allux.put(ramdisk + job['name'] + '.dcd', '/')
            # logging.info("Copy Complete to Alluxio.")
            # bench.mark('CopyAllux:%s' % job['name'])

            # And copy to Lustre
            # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
            # And copy to Lustre (using zero-copy):
            if USE_SHM:

                # ALT:  X-Mit to Cache Service and let cache write to disk lazily
                src = open(dcd_ramfile, "rb")
                dest = open(dcdFile, "w+b")
                offset = 0
                dcdfilesize = os.path.getsize(dcd_ramfile)
                while True:
                    sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
                    if sent == 0:
                        break
                    offset += sent
                logging.info("Copy Complete to Lustre.")
                bench.mark("CopyLustre:%s" % job["name"])

            # TODO: Update job's metadata
            key = wrapKey("jc", job["name"])
            self.data[key]["dcd"] = dcdFile

        # ANALYSIS   ------- ---------------------------------------------------------
        #  ANALYSIS ALGORITHM
        # 1. With combined Sim-analysis: file is loaded locally from shared mem
        logging.debug("2. Load DCD")

        # Load topology and define filters
        topo = self.protein.pdb
        pdb = md.load(job["pdb"])
        hfilt = pdb.top.select_atom_indices("heavy")
        pfilt = pdb.top.select("protein")
        afilt = pdb.top.select_atom_indices("alpha")

        # Load full higher dim trajectory
        if traj is None:
            if USE_SHM:
                traj = md.load(dcd_ramfile, top=pdb)
            else:
                traj = md.load(dcdFile, top=pdb)

        logging.debug("Trajectory Loaded: %s (%s)", job["name"], str(traj))
        traj_prot = traj.atom_slice(pfilt)
        traj_heavy = traj.atom_slice(hfilt)
        traj_alpha = traj.atom_slice(afilt)

        # Superpose Coordinates to Common Reference
        traj_prot.superpose(topo)

        # Calculate output Distance Space
        # Use of the second side chain atom is discussed in the ref paper on Timescape
        # The atom pairs and selected atoms are in the timescape module
        sc_pairs = side_chain_pairs(traj_prot)
        n_features = len(sc_pairs)

        # Use the CA atoms to calculate distance space
        # NOTE THE CONVERSION FROM NM TO ANG!!!!!!
        dist_space = 10 * DR.distance_space(traj_alpha)

        # Get the frame rate for conversion between source and TimeScape
        #  NOTE: TimeScape is run at a coarser level
        logging.debug("Preprocessing output for TimeScapes: terrain")
        traj_frame_per_ps = SIMULATE_RATIO * int(job["interval"]) / 1000.0  # jc interval is in fs
        ts_frame_per_ps = int(self.data["timescape:rate"])  # this value is in ps
        frame_rate = int(ts_frame_per_ps / traj_frame_per_ps)
        logging.debug(
            "%5.2f fr/ps (Traj)     %5.2f fr/ps (TS)    FrameRate= %4.1f",
            traj_frame_per_ps,
            ts_frame_per_ps,
            frame_rate,
        )
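        # Worked example with assumed numbers: at SIMULATE_RATIO=1 and a 250 fs DCD
        # interval the trajectory holds one frame per 0.25 ps; with timescape:rate
        # set to 4 ps this gives frame_rate = 16, i.e. TimeScape sees every 16th
        # trajectory frame (the same 16x ratio used for the seed data elsewhere).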

        # Execute Timescapes agility program to detect spatial-temporal basins
        output_prefix = os.path.join(job["workdir"], job["name"])
        if not self.skip_timescape:
            # Prep file and save locally in shm
            tmploc = gettempdir()
            ts_out = os.path.join(tmploc, "traj_ts")
            ts_dcd = ts_out + ".dcd"
            ts_pdb = ts_out + ".pdb"
            heavy = traj_heavy.slice(range(0, traj.n_frames, frame_rate))
            heavy.slice(0).save_pdb(ts_pdb)
            heavy.save_dcd(ts_dcd)

            # Gaussian Full Width at Half-Max value affects sliding window size
            # ref:  http://timescapes.biomachina.org/guide.pdf
            gmd_cut1 = int(self.data["timescape:gmd:low"])
            gmd_cut2 = int(self.data["timescape:gmd:hi"])
            gauss_wght_delta = int(self.data["timescape:delta"])

            # Execute timescapes' terrain.py on the pre-processed trajectory
            cmd = "terrain.py %s %s %d %d %d GMD %s" % (
                ts_pdb,
                ts_dcd,
                gmd_cut1,
                gmd_cut2,
                gauss_wght_delta,
                output_prefix,
            )
            logging.info("Running Timescapes:\n  %s", cmd)
            stdout = executecmd(cmd)
            logging.info("TimeScapes COMPLETE:\n%s", stdout)

        # Collect and parse Timescape output
        logging.debug("Parsing Timescapes output")
        ts_parse = TimeScapeParser(job["pdb"], output_prefix, job["name"], dcd=dcdFile, traj=traj, uniqueid=False)
        basin_list = ts_parse.load_basins(frame_ratio=frame_rate)
        n_basins = len(basin_list)

        minima_coords = {}
        basin_rms = {}
        basins = {}

        new_dmu = {}
        new_dsig = {}
        resid_rms_delta = {}

        stat.collect("num_basin", n_basins)
        downstream_list = []

        # FOR BOOTSTRAPPING USING RMSD
        ref_file = os.path.join(settings.workdir, self.data["pdb:ref:0"])
        logging.info("Loading RMSD reference frame from %s", self.data["pdb:ref:0"])
        refframe = md.load(ref_file)
        ref_alpha = refframe.atom_slice(refframe.top.select_atom_indices("alpha"))
        rmsd = 10 * md.rmsd(traj_alpha, ref_alpha)
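        # md.rmsd returns nm; the factor of 10 converts to Angstroms, matching the
        # distance-space conversion noted above.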

        # FOR RESIDUE RMSD
        res_rms_Kr = FEATURE_SET_RESID
        resrmsd = 10 * np.array([LA.norm(i - ref_alpha.xyz[0], axis=1) for i in traj_alpha.xyz])
        basin_res_rms = np.zeros(shape=(len(basin_list), traj_alpha.n_atoms))

        # LOAD CENTROIDS -- todo move to immut
        centroid = pickle.loads(self.catalog.get("centroid:ds"))
        basin_label_list = {}

        # Process each basin
        for i, basin in enumerate(basin_list):
            logging.info("  Processing basin #%2d", i)
            bid = basin.id
            downstream_list.append(bid)

            # Slice out minima coord  & save to disk (for now)
            #  TODO:  Store in memory in cache
            minima_coords[bid] = traj.slice(basin.mindex)
            jc_filename = os.path.join(settings.datadir, "basin_%s.pdb" % bid)
            minima_coords[bid].save_pdb(jc_filename)

            # METRIC CALCULATION
            a, b = basin.start, basin.end
            new_dmu[bid] = np.mean(dist_space[a:b], axis=0)
            new_dsig[bid] = np.std(dist_space[a:b], axis=0)

            # Collect Basin metadata
            basin_hash = basin.kv()
            basin_hash["pdbfile"] = jc_filename
            basin_hash["rmsd"] = np.median(rmsd[a:b])
            basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

            # FOR EXP #16 -- Residue RMSD

            # Set relative time (in ns)
            # basin_hash['time'] = reltime_start + (a * frame_rate) / 1000
            basins[bid] = basin_hash

            # LABEL ATEMPORAL DATA
            # Take every 4th frame
            stride = [dist_space[i] for i in range(a, b, 4)]
            rms = [LA.norm(centroid - i, axis=1) for i in stride]
            label_seq = [np.argmin(i) for i in rms]
            basin_label_list[bid] = label_seq
            logging.info("  Basin Processed: #%s, %d - %d", basin_hash["traj"], basin_hash["start"], basin_hash["end"])

        # RMSD DELTA (RESIDUE)
        basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
        basin_rms_delta_bykey = {basin.id: basin_res_rms_delta[i] for i, basin in enumerate(basin_list)}
        for k in basins.keys():
            basins[k]["resrms_delta"] = np.sum(basin_rms_delta_bykey[k][res_rms_Kr])

        # TODO:  Use Min Index as snapshot, median (or mean) DistSpace vals for each basin?????

        bench.mark("analysis")
        #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
        # try:
        self.wait_catalog()
        # except OverlayNotAvailable as e:
        #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

        # FOR EXPERIMENT METRICS
        src_basin = self.catalog.hgetall("basin:" + job["src_basin"])
        with open("/home-1/[email protected]/ddc/results/{0}_prov.log".format(settings.name), "a") as metric_out:
            for i, basin in enumerate(basin_list):
                bid = basin.id
                label_seq = basin_label_list[bid]
                basin_metric_label = LABEL10(label_seq)
                metric_out.write(
                    "BASIN,%s,%s,%s,%s\n"
                    % (bid, src_basin["label:10"], basin_metric_label, "".join([str(i) for i in label_seq]))
                )

        # Update Index Synchronized data lists
        basin_list_sorted = sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1])))
        for bid in basin_list_sorted:
            with self.catalog.pipeline() as pipe:
                while True:
                    try:
                        logging.debug("Updating %s basin indeces and distance space", len(basins))
                        pipe.rpush("basin:list", bid)
                        pipe.rpush("dspace", pickle.dumps(new_dmu[bid]))
                        basin_index, _ = pipe.execute()
                        break
                    except redis.WatchError as e:
                        logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                        continue
            basins[bid]["dsidx"] = basin_index - 1

        # Update Catalog with 1 Long Atomic Transaction
        with self.catalog.pipeline() as pipe:
            while True:
                try:
                    logging.debug("Update Filelist")
                    pipe.watch(wrapKey("jc", job["name"]))
                    file_idx = pipe.rpush("xid:filelist", job["dcd"]) - 1

                    # HD Points
                    logging.debug("Update HD Points")
                    start_index = pipe.llen("xid:reference")
                    pipe.multi()
                    pipe.rpush("xid:reference", *[(file_idx, x) for x in range(traj.n_frames)])
                    pipe.set("resrms:" + job["name"], resrmsd)

                    # Store all basin data
                    logging.debug("Updating %s basins", len(basins))
                    for bid in sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1]))):
                        pipe.hmset("basin:" + bid, basins[bid])
                        pipe.set("minima:%s" % bid, pickle.dumps(minima_coords[bid]))
                        for i in basin_label_list[bid]:
                            pipe.rpush("basin:labelseq:" + bid, i)

                    pipe.hset("anl_sequence", job["name"], mylogical_seqnum)

                    logging.debug("Executing")
                    pipe.execute()
                    break

                except redis.WatchError as e:
                    logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                    continue

        self.data[key]["xid:start"] = start_index
        self.data[key]["xid:end"] = start_index + traj.n_frames
        bench.mark("catalog")

        # ---- POST PROCESSING
        # Check for total data in downstream queue  & Notify control manager to run
        # 1. Det. what's still running
        job_queue = slurm.currentqueue()
        n_simjobs = -1  # -1 to exclude self
        for j in job_queue:
            if j["name"].startswith("sw"):
                n_simjobs += 1

        logging.info("SIM WORKER has detected %d other simulations running/pending.", len(job_queue))
        remain_percent = n_simjobs / self.data["numresources"]

        # Fault Tolerance: ensure the pipeline persists
        if n_simjobs == 0:
            logging.info("I am the last simulation. Ensuring the controler executes.")
            self.catalog.set("ctl:force", 1)

        # Send downstream notification when less than 10% of jobs are still running
        if not self.skip_notify and remain_percent < 0.1:
            # Notify the control manager 'CM'
            self.notify("ctl")

        if USE_SHM:
            shutil.rmtree(ramdisk)
            shm_contents = os.listdir("/dev/shm")
            logging.debug("Ramdisk contents (should be empty of DDC) : %s", str(shm_contents))

        # For benchmarking:
        bench.show()
        stat.show()

        # Return # of observations (frames) processed
        return downstream_list
Example #7
all_label = np.zeros(shape=(len(jclist), 4500))
sumlog = open(home + '/work/results/trajout_{0}'.format(expname), 'w')
for i, jc in enumerate(jclist[startnum:]):
  if 'dcd' not in jc:
    jc['dcd'] = jc['pdb'].replace('pdb', 'dcd')
  if not os.path.exists(jc['dcd']):
    logging.info("%d %s NO_DCD", i, jc['name'])
    continue
  tr = md.load(jc['dcd'], top=topo, stride=4)
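  # Trajectories are loaded at stride 4 and the first 3 strided frames are dropped
  # below (rms[3:]), so each one is expected to supply 4503 frames and contribute
  # exactly 4500 labels to its row of all_label.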
  if tr.n_frames < 4503:
    logging.info("%d %s Frames: %d", i, jc['name'], tr.n_frames)
    continue
  prot  = tr.atom_slice(pfilt)
  prot.save_dcd(wkdir + '/%s_%03d.dcd' % (expname, (i + startnum)))
  alpha = prot.atom_slice(afilt)
  ds    = DR.distance_space(alpha)
  rms   = [LA.norm(cent-i, axis=1) for i in ds]
  label = np.array([np.argmin(i) for i in rms[3:]])
  all_label[i] = label
  label_str = ''.join([str(i) for i in label])
  red.hset('basin:' + jc['name'], 'label', label_str)
  bincnt = ','.join([str(i) for i in np.bincount(label, minlength=5)])
  src_basin = jc['src_basin'] if 'src_basin' in jc else 'NONE'
  logging.info("%d,%s,%s,%s", i, jc['name'], src_basin, bincnt)
  sumlog.write('%d,%s,%s,%s\n' % (i, jc['name'], src_basin, bincnt))

sumlog.close()
np.save(home + '/work/results/label_{0}'.format(expname), all_label)
logging.info('ALL Done!')

Example #8
def load_historical_Expl(catalog):
  """ Load all DEShaw data into basins for processing """
  settings = systemsettings()
  
  # idx_list      = [0,1,2,3,4,20,23,24,30,32,34,40,41,42]
  idx_list      = [0,34]
  tlist         = {k: 'tr%d.dcd'%k for k in idx_list}
  seed_dir      = os.path.join(settings.WORKDIR, 'seed')
  seed_ts_ratio = 16     # TimeScape ran on 4ps frame rate (16x source)

  # Load topology and ancillary data
  # bpti = Protein(bpti, catalog, load=True)
  pdb_file = os.path.join(seed_dir, 'coord.pdb')
  topo = md.load(pdb_file)
  pfilt = topo.top.select('protein')

  logging.info('Topology loaded %s', topo)

  # ID Side Chain pair atoms for each distance space calc 
  sc_pairs = side_chain_pairs(topo.atom_slice(pfilt))
  logging.info('Identified side chains: %d', len(sc_pairs))


  # Process all source trajectories
  basin_list = []
  C_T, mu_T, sigma_T = [], [], []
  for idx in idx_list:
    logging.info('Processing Seed index: %d', idx)
    # Load SRC seed trajectory & calc distance space  -- TODO: make this optional
    tfile = os.path.join(seed_dir, tlist[idx])
    traj = md.load(tfile, top=topo)
    traj.superpose(topo)
    ds = datareduce.distance_space(traj, pairs=sc_pairs)

    # Push to Catalog
    file_idx = catalog.rpush('xid:filelist', tfile) - 1
    start_index = catalog.llen('xid:reference')
    # TODO: Do I still need to index every frame??????
    catalog.rpush('xid:reference', *[(file_idx, x) for x in range(traj.n_frames)])

    # Process Trajectory as basins
    logging.info("  Seed Loaded. Loading TimeScape Data...")
    seed_name = 'tr%d'%idx
    ts_data_path  = os.path.join(seed_dir, 'TEST', seed_name)
    ts_traj = TimeScapeParser(pdb_file, 
        ts_data_path, seed_name, dcd=tfile, traj=traj)
    basin_list = ts_traj.load_basins(frame_ratio=seed_ts_ratio)
    corr_mat   = ts_traj.correlation_matrix()

    for i, basin in enumerate(ts_traj.basins):
      a, b = basin.start, basin.end
      bid = basin.id
      if a > traj.n_frames:
        logging.info('Finished processing all basins for this Trajectory!')
        break

      # Store on Disk and in redis
      jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
      minima_frame = md.load_frame(tfile, basin.mindex, top=topo) if traj is None else traj.slice(basin.mindex)
      minima_frame.save_pdb(jc_filename)

      C_T.append(np.mean(corr_mat[a:b], axis=0))
      mu_T.append(np.mean(ds[a:b], axis=0))
      sigma_T.append(np.std(ds[a:b], axis=0))
     
      basin_hash = basin.kv()
      basin_hash['pdbfile'] = jc_filename
      basin_hash['dsidx'] = i
      logging.info('  Basin: %(id)s  %(start)d - %(end)d   Minima: %(mindex)d    size=%(len)d' % basin_hash)

      pipe = catalog.pipeline()
      pipe.rpush('basin:list', bid)
      pipe.hmset('basin:%s'%bid, basin_hash)
      pipe.set('minima:%s'%bid, pickle.dumps(minima_frame))
      pipe.execute()

    catalog.set('basin:processed', len(ts_traj.basins))

  catalog.storeNPArray(np.array(C_T), 'corr_vector')
  catalog.storeNPArray(np.array(mu_T), 'dspace_mu')
Example #9
  def execute(self, job):

  # PRE-PROCESS -----------------------------------------------------------
    settings = systemsettings()
    bench = microbench('sim_%s' % settings.name, self.seqNumFromID())
    bench.start()
    stat  = StatCollector('sim_%s' % settings.name, self.seqNumFromID())
    mylogical_seqnum = str(self.seqNumFromID())

    # Prepare working directory, input/output files
    conFile = os.path.join(job['workdir'], job['name'] + '.conf')
    logFile = conFile.replace('conf', 'log')      # log in same place as config file
    dcdFile = conFile.replace('conf', 'dcd')      # dcd in same place as config file
    USE_SHM = True

    SIMULATE_RATIO = settings.SIMULATE_RATIO
    if SIMULATE_RATIO > 1:
      logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
    frame_size = (SIMULATE_RATIO * int(job['interval'])) / (1000)
    logging.info('Frame Size is %f  Using Sim Ratio of 1:%d', \
      frame_size, SIMULATE_RATIO)

    EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
    logging.info('Running Experiment Configuration #%d', EXPERIMENT_NUMBER)

    # # Grab historical basin data's relative time to start (for lineage)
    traj = None

  # EXECUTE SIMULATION ---------------------------------------------------------
    if self.skip_simulation:

      logging.info('1. SKIPPING SIMULATION.....')
      USE_SHM = False

      job['dcd'] = dcdFile
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

    else:
      logging.info('1. Run Simulation')

      # Prepare & source to config file
      with open(self.data['sim_conf_template'], 'r') as template:
        source = template.read()

      # >>>>Storing DCD into shared memory on this node

      if USE_SHM:
        ramdisk = gettempdir()
        job['outputloc'] = ramdisk
        dcd_ramfile = os.path.join(ramdisk, job['name'] + '.dcd')
      else:
        job['outputloc'] = ''

      with open(conFile, 'w') as sysconfig:
        sysconfig.write(source % job)
        logging.info("Config written to: " + conFile)

      # # Run simulation in parallel
      # if 'parallel' in job:
      #   numnodes = job['parallel']
      #   total_tasks = numnodes * 24
      #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

      # # Run simulation single threaded
      # else:
      #   cmd = 'namd2 %s > %s' % (conFile, logFile)

      # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
      check = executecmd('module list')
      logging.debug('%s', check)

      cmd = 'namd2 +p%d %s > %s' % (PARALLELISM, conFile, logFile)

      #  MICROBENCH #1 (file to Lustre)
      # logging.debug("Executing Simulation:\n   %s\n", cmd)
      # bench = microbench()
      # bench.start()
      # stdout = executecmd(cmd)
      # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
      # bench.mark('SimExec:%s' % job['name'])
      # shm_contents = os.listdir('/dev/shm/out')
      # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # logging.info("Copy Complete to Lustre.")
      # bench.mark('CopyLustre:%s' % job['name'])
      # shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
      # bench.show()

      max_expected_obs = int(job['runtime']) // int(job['dcdfreq'])
      # Retry up to 3 attempts if the sim fails
      MAX_TRY = 3
      for i in range(MAX_TRY, 0, -1):
        min_required_obs = int(max_expected_obs * ((i-1)/(MAX_TRY)))
        logging.debug("Executing Simulation:\n   %s\n", cmd)
        logging.debug('# Obs Expected to see: %d', max_expected_obs)
        stdout = executecmd(cmd)
        logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
        # Check file for expected data
        if USE_SHM:
          traj = md.load(dcd_ramfile, top=job['pdb'])
        else:
          traj = md.load(dcdFile, top=job['pdb'])
        logging.info("Obs Threshold  = %4d", min_required_obs)
        logging.info("#Obs This Traj = %4d", traj.n_frames)
        if traj.n_frames >= min_required_obs:
          logging.info('Full (enough) Sim Completed')
          break
        logging.info('Detected a failed Simulation. Retrying the same sim.')
        break
      
      bench.mark('simulation')
      # bench.mark('SimExec:%s' % job['name'])

      # Internal stats
      sim_length = self.data['sim_step_size'] * int(job['runtime'])
      sim_realtime = bench.delta_last()
      sim_run_ratio =  (sim_realtime/60) / (sim_length/1000000)
      logging.info('##SIM_RATIO %6.3f  min-per-ns-sim', sim_run_ratio)
      stat.collect('sim_ratio', sim_run_ratio)

      if USE_SHM:
        shm_contents = os.listdir(ramdisk)
        logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
          time.sleep(10)

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD STIILL FILE NOT FOUND!!!!")
        else:
          logging.info("DCD File was found")

      # #  MICROBENCH #2 (file to Alluxio)
      # allux = AlluxioClient()
      # # copy to Alluxio FS
      # allux.put(ramdisk + job['name'] + '.dcd', '/')
      # logging.info("Copy Complete to Alluxio.")
      # bench.mark('CopyAllux:%s' % job['name'])

      # And copy to Lustre
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # And copy to Lustre (using zero-copy):
      if USE_SHM:

        # ALT:  X-Mit to Cache Service and let cache write to disk lazily
        src  = open(dcd_ramfile, 'rb')
        dest = open(dcdFile, 'w+b')
        offset = 0
        dcdfilesize = os.path.getsize(dcd_ramfile)
        while True:
          sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
          if sent == 0:
            break
          offset += sent
        logging.info("Copy Complete to Lustre.")
        bench.mark('CopyLustre:%s' % job['name'])
      
      # TODO: Update job's metadata
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

  # ANALYSIS   ------- ---------------------------------------------------------
    #  ANALYSIS ALGORITHM
  # 1. With combined Sim-analysis: file is loaded locally from shared mem
    logging.debug("2. Load DCD")

    # Load topology and define filters
    topo = self.protein.pdb
    pdb = md.load(job['pdb'])
    hfilt = pdb.top.select_atom_indices('heavy')
    pfilt = pdb.top.select('protein')
    afilt = pdb.top.select_atom_indices('alpha')

    # Load full higher dim trajectory
    if traj is None:
      if USE_SHM:
        traj = md.load(dcd_ramfile, top=pdb)
      else:
        traj = md.load(dcdFile, top=pdb)

    logging.debug('Trajectory Loaded: %s (%s)', job['name'], str(traj))
    traj_prot = traj.atom_slice(pfilt)
    traj_heavy = traj.atom_slice(hfilt)
    traj_alpha = traj.atom_slice(afilt)

    # Superpose Coordinates to Common Reference
    traj_prot.superpose(topo)

    # Calculate output Distance Space
    # Use of the second side chain atom is discussed in the ref paper on Timescape
    # The atom pairs and selected atoms are in the timescape module
    sc_pairs = side_chain_pairs(traj_prot)
    n_features = len(sc_pairs)

    # Use the CA atoms to calculate distance space
    # NOTE THE CONVERSION FROM NM TO ANG!!!!!!
    dist_space = 10 * DR.distance_space(traj_alpha)

    # Get the frame rate for conversion between source and TimeScape
    #  NOTE: TimeScape is run at a coarser level
    logging.debug('Preprocessing output for TimeScapes: terrain')
    traj_frame_per_ps = SIMULATE_RATIO * int(job['interval']) / 1000.   # jc interval is in fs
    ts_frame_per_ps = int(self.data['timescape:rate'])  # this value is in ps
    frame_rate = int(ts_frame_per_ps / traj_frame_per_ps)
    logging.debug('%5.2f fr/ps (Traj)     %5.2f fr/ps (TS)    FrameRate= %4.1f', traj_frame_per_ps, ts_frame_per_ps, frame_rate)

    # Execute Timescapes agility program to detect spatial-temporal basins
    output_prefix = os.path.join(job['workdir'], job['name'])
    if not self.skip_timescape:
      # Prep file and save locally in shm
      tmploc = gettempdir()
      ts_out = os.path.join(tmploc, 'traj_ts')
      ts_dcd = ts_out + '.dcd'
      ts_pdb = ts_out + '.pdb'
      heavy = traj_heavy.slice(range(0, traj.n_frames, frame_rate))
      heavy.slice(0).save_pdb(ts_pdb)
      heavy.save_dcd(ts_dcd)

      # Gaussian Full Width at Half-Max value affects sliding window size
      # ref:  http://timescapes.biomachina.org/guide.pdf
      gmd_cut1 = int(self.data['timescape:gmd:low'])
      gmd_cut2 = int(self.data['timescape:gmd:hi'])
      gauss_wght_delta = int(self.data['timescape:delta'])

      # Execute timescapes' terrain.py on the pre-processed trajectory
      cmd = 'terrain.py %s %s %d %d %d GMD %s' %\
        (ts_pdb, ts_dcd, gmd_cut1, gmd_cut2, gauss_wght_delta, output_prefix)
      logging.info('Running Timescapes:\n  %s', cmd)
      stdout = executecmd(cmd)
      logging.info('TimeScapes COMPLETE:\n%s', stdout)


    # Collect and parse Timescape output
    logging.debug('Parsing Timescapes output')
    ts_parse = TimeScapeParser(job['pdb'], output_prefix, job['name'], 
      dcd=dcdFile, traj=traj, uniqueid=False)
    basin_list = ts_parse.load_basins(frame_ratio=frame_rate)
    n_basins = len(basin_list)

    minima_coords = {}
    basin_rms = {}
    basins = {}

    new_dmu={}
    new_dsig={}
    resid_rms_delta = {}

    stat.collect('num_basin', n_basins)
    downstream_list = []

    # FOR BOOTSTRAPPING USING RMSD
    ref_file = os.path.join(settings.workdir, self.data['pdb:ref:0'])
    logging.info('Loading RMSD reference frame from %s', self.data['pdb:ref:0'])
    refframe = md.load(ref_file)
    ref_alpha = refframe.atom_slice(refframe.top.select_atom_indices('alpha'))
    rmsd = 10*md.rmsd(traj_alpha, ref_alpha)

    # FOR RESIDUE RMSD
    res_rms_Kr = FEATURE_SET_RESID
    resrmsd = 10*np.array([LA.norm(i-ref_alpha.xyz[0], axis=1) for i in traj_alpha.xyz])
    basin_res_rms = np.zeros(shape=(len(basin_list), traj_alpha.n_atoms))

    # LOAD CENTROIDS -- todo move to immut
    centroid = pickle.loads(self.catalog.get('centroid:ds'))
    basin_label_list = {}

    # Process each basin
    for i, basin in enumerate(basin_list):
      logging.info('  Processing basin #%2d', i)
      bid = basin.id
      downstream_list.append(bid)

      # Slice out minima coord  & save to disk (for now)
      #  TODO:  Store in memory in cache
      minima_coords[bid] = traj.slice(basin.mindex)
      jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
      minima_coords[bid].save_pdb(jc_filename)

      # METRIC CALCULATION
      a, b = basin.start, basin.end
      new_dmu[bid]    = np.mean(dist_space[a:b], axis=0)
      new_dsig[bid]   = np.std(dist_space[a:b], axis=0)

      # Collect Basin metadata
      basin_hash = basin.kv()
      basin_hash['pdbfile'] = jc_filename
      basin_hash['rmsd'] = np.median(rmsd[a:b])
      basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

      # FOR EXP #16 -- Residue RMSD

      # Set relative time (in ns)
      # basin_hash['time'] = reltime_start + (a * frame_rate) / 1000
      basins[bid] = basin_hash

      # LABEL ATEMPORAL DATA
      # Take every 4th frame
      stride = [dist_space[i] for i in range(a,b,4)]
      rms = [LA.norm(centroid - i, axis=1) for i in stride]
      label_seq = [np.argmin(i) for i in rms]
      basin_label_list[bid] = label_seq
      logging.info('  Basin Processed: #%s, %d - %d', basin_hash['traj'], 
        basin_hash['start'], basin_hash['end'])

    # RMSD DELTA (RESIDUE)
    basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
    basin_rms_delta_bykey = {basin.id: basin_res_rms_delta[i] for i, basin in enumerate(basin_list)}
    for k in basins.keys():
      basins[k]['resrms_delta'] = np.sum(basin_rms_delta_bykey[k][res_rms_Kr])


    # TODO:  Use Min Index as snapshot, median (or mean) DistSpace vals for each basin?????

    bench.mark('analysis')
  #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
    # try:
    self.wait_catalog()
    # except OverlayNotAvailable as e:
    #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

  # FOR EXPERIMENT METRICS
    src_basin = self.catalog.hgetall("basin:" + job['src_basin'])
    with open('/home-1/[email protected]/ddc/results/{0}_prov.log'.format(settings.name), 'a') as metric_out:
      for i, basin in enumerate(basin_list):
        bid = basin.id
        label_seq = basin_label_list[bid]
        basin_metric_label = LABEL10(label_seq)
        metric_out.write('BASIN,%s,%s,%s,%s\n'% \
          (bid, src_basin['label:10'], basin_metric_label, ''.join([str(i) for i in label_seq])))

  # Update Index Synchronized data lists
    basin_list_sorted =sorted(basins.keys(), key=lambda x: (x.split('_')[0], int(x.split('_')[1])))
    for bid in basin_list_sorted:
      with self.catalog.pipeline() as pipe:
        while True:
          try:
            logging.debug('Updating %s basin indices and distance space', len(basins))
            pipe.rpush('basin:list', bid)
            pipe.rpush('dspace', pickle.dumps(new_dmu[bid]))
            basin_index,_ = pipe.execute()
            break
          except redis.WatchError as e:
            logging.debug('WATCH ERROR. Someone else is writing to the catalog. Retrying...')
            continue
      basins[bid]['dsidx'] = basin_index - 1

  # Update Catalog with 1 Long Atomic Transaction  
    with self.catalog.pipeline() as pipe:
      while True:
        try:
          logging.debug('Update Filelist')
          pipe.watch(wrapKey('jc', job['name']))
          file_idx = pipe.rpush('xid:filelist', job['dcd']) - 1

          # HD Points
          logging.debug('Update HD Points')
          start_index = pipe.llen('xid:reference')
          pipe.multi()
          pipe.rpush('xid:reference', *[(file_idx, x) for x in range(traj.n_frames)])
          pipe.set('resrms:' + job['name'], resrmsd)

          # Store all basin data
          logging.debug('Updating %s basins', len(basins))
          for bid in sorted(basins.keys(), key=lambda x: (x.split('_')[0], int(x.split('_')[1]))):
            pipe.hmset('basin:'+bid, basins[bid])
            pipe.set('minima:%s'%bid, pickle.dumps(minima_coords[bid]))
            for i in basin_label_list[bid]:
              pipe.rpush('basin:labelseq:'+bid, i)

          pipe.hset('anl_sequence', job['name'], mylogical_seqnum)

          logging.debug('Executing')
          pipe.execute()
          break

        except redis.WatchError as e:
          logging.debug('WATCH ERROR. Someone else is writing to the catalog. Retrying...')
          continue

    self.data[key]['xid:start'] = start_index
    self.data[key]['xid:end'] = start_index + traj.n_frames
    bench.mark('catalog')

  # ---- POST PROCESSING
    # Check for total data in downstream queue  & Notify control manager to run
    # 1. Det. what's still running
    job_queue = slurm.currentqueue()
    n_simjobs = -1    # -1 to exclude self
    for j in job_queue:
      if j['name'].startswith('sw'):
        n_simjobs += 1

    logging.info('SIM WORKER has detected %d other simulations running/pending.', n_simjobs)
    remain_percent = n_simjobs / self.data['numresources']

    # Fault Tolerance: ensure the pipeline persists
    if n_simjobs == 0:
      logging.info('I am the last simulation. Ensuring the controller executes.')
      self.catalog.set('ctl:force', 1)

    # Send downstream notification when less than 10% of jobs are still running
    if not self.skip_notify and remain_percent < .1:
      # Notify the control manager 'CM'
      self.notify('ctl')

    if USE_SHM:
      shutil.rmtree(ramdisk)
      shm_contents = os.listdir('/dev/shm')
      logging.debug('Ramdisk contents (should be empty of DDC) : %s', str(shm_contents))
    
    # For benchmarking:
    bench.show()
    stat.show()

    # Return # of observations (frames) processed
    return downstream_list
Example #10
def bootstrap_lattice(catalog, num=10, build_new=False):
    ''' Bootstrap After TimeScape has run on source trajectory '''
    home = os.getenv("HOME")
    support = 1
    cutoff = 8

    start_coord = [
        'de2586_315', 'de531_20', 'de3765_63', 'de3305_668', 'de1732_139'
    ]
    dcdfile = lambda x: home + '/work/data/{0}.dcd'.format(x)
    outloc = lambda x: home + '/work/jc/denovouniform1/{0}/{0}'.format(x)

    traj_list = {}

    basin_list = catalog.lrange('basin:list', 0, -1)
    if len(basin_list) == 134:
        logging.info('Basin Data already loaded!')
        rms_delta_list = [
            (i, np.sum(pickle.loads(catalog.get('basin:rmsdelta:' + b))))
            for i, b in enumerate(basin_list)
        ]
    else:
        logging.info('Loading all bootstrap data to initialize...')
        basin_list = []
        rms_delta_list = []
        pdb_file = home + '/work/data/alpha.pdb'
        topo = md.load(pdb_file)
        ref_alpha = md.load(home + '/work/' + catalog.get('pdb:ref:0'))
        ref_alpha.atom_slice(ref_alpha.top.select_atom_indices('alpha'),
                             inplace=True)
        res_rms_Kr = FEATURE_SET

        for sc in start_coord:
            dist_space = []
            srcfile = outloc(sc) + '.dcd'
            pdbfile = srcfile.replace('dcd', 'pdb')
            logging.debug('LOADING TRAJ:  %s', srcfile)
            traj = md.load(srcfile, top=pdbfile)
            traj_list[sc] = traj
            alpha = traj.atom_slice(traj.top.select_atom_indices('alpha'))

            logging.info('Grabbing TS data...')
            W = TS.TimeScape.windows(outloc(sc) + '_transitions.log')
            ts_traj = TS.TimeScapeParser(pdbfile,
                                         outloc(sc),
                                         sc,
                                         dcd=srcfile,
                                         traj=traj)
            basins = ts_traj.load_basins()

            logging.info("Processing distance space and residue RMS")
            dsa = DR.distance_space(alpha)
            resrmsd = 10 * np.array(
                [LA.norm(i - ref_alpha.xyz[0], axis=1) for i in alpha.xyz])
            basin_res_rms = np.zeros(shape=(len(ts_traj.basins),
                                            alpha.n_atoms))
            for i, (a, b) in enumerate(W):
                dist_space.append(dsa[a:b].mean(0))
                basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

            basin_res_rms_delta = np.array(
                [rms_delta(i) for i in basin_res_rms.T]).T
            logging.debug('RMS LEN CHECK:  %d =?= %d    -- Updating RMS Delta',
                          len(basins), len(basin_res_rms_delta))

            for i, basin in enumerate(basins):
                pipe = catalog.pipeline()
                bid = basin.id

                # Store on Disk and in redis
                jc_filename = os.path.join(settings.datadir,
                                           'basin_%s.pdb' % bid)
                logging.info('MIN for %s:   Idx# %d  to %s', bid, basin.mindex,
                             jc_filename)
                minima_frame = traj.slice(
                    basin.mindex
                )  #md.load_frame(src_traj, basin.mindex, top=src_traj.replace('dcd', 'pdb'))
                minima_frame.save_pdb(jc_filename)

                basin_hash = basin.kv()
                basin_hash['pdbfile'] = jc_filename
                logging.info(
                    '  Basin: %(id)s  %(start)d - %(end)d   Minima: %(mindex)d    size=%(len)d'
                    % basin_hash)

                pipe.rpush('basin:list', bid)
                pipe.hmset('basin:%s' % bid, basin_hash)
                pipe.set('basin:dmu:' + bid, pickle.dumps(dist_space[i]))
                pipe.set('minima:%s' % bid, pickle.dumps(minima_frame))

                # FOR RESIDUE RMSD
                resrms_d = np.sum(basin_res_rms_delta[i][res_rms_Kr])
                basin_hash['resrms_delta'] = resrms_d
                rms_delta_list.append((len(basin_list), resrms_d))
                basin_list.append(basin_hash)
                pipe.set('basin:rmsdelta:' + bid,
                         pickle.dumps(basin_res_rms_delta[i]))

                pipe.execute()

    # FOR RESIDUE RMSD

    # FOR SEED SAMPLING USING RMS_DELTA

    # Note: skip the first basin

    # Re-Construct the Lattice from
    if build_new:
        dist_space = 10 * np.array(dist_space)
        cm = dist_space < cutoff
        fs = lat.reduced_feature_set(cm, .115)
        dr, cr = dist_space[:, fs], cm[:, fs]

        mfis, lfis = lat.maxminer(cr, 1)
        dlat, ik = lat.derived_lattice(mfis, dr, cr)
        pickle.dump(mfis, open(home + '/work/data/denovo_mfis.p', 'wb'))
        pickle.dump(lfis, open(home + '/work/data/denovo_lfis.p', 'wb'))
        pickle.dump(ik, open(home + '/work/data/denovo_iset.p', 'wb'))
        pickle.dump(dlat, open(home + '/work/data/denovo_dlat.p', 'wb'))

    else:

        logging.info('Loading Pre-Constructed Lattice Data')
        dlat = pickle.load(open(home + '/work/data/denovo_dlat.p', 'rb'))
        mfis = pickle.load(open(home + '/work/data/denovo_mfis.p', 'rb'))
        lfis = pickle.load(open(home + '/work/data/denovo_lfis.p', 'rb'))
        ik = pickle.load(open(home + '/work/data/denovo_iset.p', 'rb'))

    with catalog.pipeline() as pipe:
        pipe.set('lattice:max_fis', pickle.dumps(mfis))
        pipe.set('lattice:low_fis', pickle.dumps(lfis))
        pipe.set('lattice:dlat', pickle.dumps(dlat))
        pipe.set('lattice:iset', pickle.dumps(ik))
        pipe.execute()

    # logging.info('Building Existing lattice object')
    # lattice=lat.Lattice(ds, fs, cutoff, support)
    # lattice.set_fis(max_fis, low_fis)
    # lattice.set_dlat(dlat, Ik)
    # sampler = LatticeSampler(lattice)

    # Sample -- FOR USING LATTICE TO BOOTSTRAP
    # cl,sc,el = lat.clusterlattice(dlat, cr, dr, ik, num_k=8, invert=True)
    # cl_list = sorted(el, key=lambda x: len(x))

    # TODO: Check if fan out > single item clusters
    # start_indices = [clu[0][0] for clu in cl_list[:num]]

    rms_delta_ranked = [
        x[0] for x in sorted(rms_delta_list, key=lambda i: i[1], reverse=True)
    ]
    start_indices = rms_delta_ranked[:num]

    seedlist = [catalog.lindex('basin:list', i) for i in start_indices]
    sim_init = {key: catalog.get(key) for key in settings.sim_params.keys()}
    global_params = getSimParameters(sim_init, 'seed')
    global_params['psf'] = home + '/work/jc/serial2/de0_0/de0_0.psf'

    for seed in seedlist:
        logging.debug('\nSeeding Job: %s ', seed)
        basin = catalog.hgetall('basin:%s' % seed)
        catalog.rpush('executed', seed)

        # Generate new set of params/coords
        jcID, config = generateFromBasin(basin)

        # Update Additional JC Params and Decision History, as needed
        config.update(global_params)

        # Push to catalog
        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in config.items():
            logging.debug("   %s:  %s", k, str(v))
        catalog.rpush('jcqueue', jcID)
        catalog.hmset(wrapKey('jc', jcID), config)
Example #11
def load_historical_Expl(catalog):
    """ Load all DEShaw data into basins for processing """
    settings = systemsettings()

    # idx_list      = [0,1,2,3,4,20,23,24,30,32,34,40,41,42]
    idx_list = [0, 34]
    tlist = {k: 'tr%d.dcd' % k for k in idx_list}
    seed_dir = os.path.join(settings.WORKDIR, 'seed')
    seed_ts_ratio = 16  # TimeScape ran on 4ps frame rate (16x source)

    # Load topology and ancillary data
    # bpti = Protein(bpti, catalog, load=True)
    pdb_file = os.path.join(seed_dir, 'coord.pdb')
    topo = md.load(pdb_file)
    pfilt = topo.top.select('protein')

    logging.info('Topology loaded %s', topo)

    # ID Side Chain pair atoms for each distance space calc
    sc_pairs = side_chain_pairs(topo.atom_slice(pfilt))
    logging.info('Identified side chains: %d', len(sc_pairs))

    # Process all source trajectories
    basin_list = []
    C_T, mu_T, sigma_T = [], [], []
    for idx in idx_list:
        logging.info('Processing Seed index: %d', idx)
        # Load SRC seed trajectory & calc distance space  -- TODO: make this optional
        tfile = os.path.join(seed_dir, tlist[idx])
        traj = md.load(tfile, top=topo)
        traj.superpose(topo)
        ds = datareduce.distance_space(traj, pairs=sc_pairs)

        # Push to Catalog
        file_idx = catalog.rpush('xid:filelist', tfile) - 1
        start_index = catalog.llen('xid:reference')
        # TODO: Do I still need to index every frame??????
        catalog.rpush('xid:reference',
                      *[(file_idx, x) for x in range(traj.n_frames)])

        # Process Trajectory as basins
        logging.info("  Seed Loaded. Loading TimeScape Data...")
        seed_name = 'tr%d' % idx
        ts_data_path = os.path.join(seed_dir, 'TEST', seed_name)
        ts_traj = TimeScapeParser(pdb_file,
                                  ts_data_path,
                                  seed_name,
                                  dcd=tfile,
                                  traj=traj)
        basin_list = ts_traj.load_basins(frame_ratio=seed_ts_ratio)
        corr_mat = ts_traj.correlation_matrix()

        for i, basin in enumerate(ts_traj.basins):
            a, b = basin.start, basin.end
            bid = basin.id
            if a > traj.n_frames:
                logging.info(
                    'Finished processing all basins for this Trajectory!')
                break

            # Store on Disk and in redis
            jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
            minima_frame = md.load_frame(
                tfile, basin.mindex, top=topo) if traj is None else traj.slice(
                    basin.mindex)
            minima_frame.save_pdb(jc_filename)

            C_T.append(np.mean(corr_mat[a:b], axis=0))
            mu_T.append(np.mean(ds[a:b], axis=0))
            sigma_T.append(np.std(ds[a:b], axis=0))

            basin_hash = basin.kv()
            basin_hash['pdbfile'] = jc_filename
            basin_hash['dsidx'] = i
            logging.info(
                '  Basin: %(id)s  %(start)d - %(end)d   Minima: %(mindex)d    size=%(len)d'
                % basin_hash)

            pipe = catalog.pipeline()
            pipe.rpush('basin:list', bid)
            pipe.hmset('basin:%s' % bid, basin_hash)
            pipe.set('minima:%s' % bid, pickle.dumps(minima_frame))
            pipe.execute()

        catalog.set('basin:processed', len(ts_traj.basins))

    catalog.storeNPArray(np.array(C_T), 'corr_vector')
    catalog.storeNPArray(np.array(mu_T), 'dspace_mu')