Example #1
 def notify(self, mthread):
   """ Notifies the Manager process of the given macrothread to begin
   executing (within the next 60 seconds).  If the manager is not
   running, this will launch it locally """
   settings = systemsettings()
   if self.singleuse:
     logging.info("Notification invoked, but running in single use  (not notifying)")
     return
   logging.info("Notifying downstream `%s` to executing in 60 seconds.", mthread)
   fname = self.catalog.hget('macrothread:ctl', 'fname')
   # fname = self.data['macrothread'][mthread]['fname']
   mngr_prefix = self.fname[0] + 'm'
   job_queue = slurm.currentqueue()
   found_jobs = []
   for j in job_queue:
     if j['name'].startswith(mngr_prefix):
       found_jobs.append(j)
   if len(found_jobs) == 0:
     logging.warning('Downstream manager not running. Recovering thread locally...')
     stdout = proc.check_output('python3 %s -c %s' % (fname, settings.name), shell=True)
     logging.info('Manager RECOVERED!\n %s\n------------', stdout)
   else:
     if len(found_jobs) > 1:
        logging.warning('Multiple downstream managers found. Notifying the earliest submitted')
       downstream_mngr = min(found_jobs, key=lambda x: x['submit'])
     else:
       downstream_mngr = found_jobs[0]
     jobid = int(downstream_mngr['jobid'])
     logging.info("Notifying %s Manager (jobid = %s) to start in 60 seconds", mthread, jobid)
     slurm.notify_start(jobid, delay=60)
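The `slurm` module used above is project-local, not a published library. A minimal sketch of what its `currentqueue()` and `notify_start()` helpers might look like, assuming the standard Slurm CLI tools (`squeue`, `scontrol`) are on PATH and the dict fields `name`, `jobid`, and `submit` that `notify()` reads:

# Hypothetical sketch of the project-local `slurm` helper used above.
# Assumes the standard Slurm CLI tools (squeue, scontrol) are available.
import subprocess

def currentqueue(user=None):
    """Return the current Slurm queue as a list of dicts."""
    cmd = ['squeue', '-h', '-o', '%j|%i|%V']   # job name | job id | submit time
    if user:
        cmd += ['-u', user]
    out = subprocess.check_output(cmd).decode()
    jobs = []
    for line in out.splitlines():
        name, jobid, submit = line.strip().split('|')
        jobs.append({'name': name, 'jobid': jobid, 'submit': submit})
    return jobs

def notify_start(jobid, delay=60):
    """Release a held job so it begins roughly `delay` seconds from now."""
    # One plausible mechanism: reset the begin time, then release the hold
    begin = 'now+%d' % delay
    subprocess.check_call(['scontrol', 'update', 'JobId=%d' % int(jobid),
                           'StartTime=%s' % begin])
    subprocess.check_call(['scontrol', 'release', str(jobid)])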
Example #2
    def execute(self, job):

        # PRE-PROCESS -----------------------------------------------------------
        settings = systemsettings()
        bench = microbench("sim_%s" % settings.name, self.seqNumFromID())
        bench.start()
        stat = StatCollector("sim_%s" % settings.name, self.seqNumFromID())
        mylogical_seqnum = str(self.seqNumFromID())

        # Prepare working directory, input/output files
        conFile = os.path.join(job["workdir"], job["name"] + ".conf")
        logFile = conFile.replace("conf", "log")  # log in same place as config file
        dcdFile = conFile.replace("conf", "dcd")  # dcd in same place as config file
        USE_SHM = True
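        # USE_SHM: write the DCD to a node-local ramdisk (/dev/shm) first, then
        # copy it out to Lustre once the simulation completes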

        SIMULATE_RATIO = settings.SIMULATE_RATIO
        if SIMULATE_RATIO > 1:
            logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
        frame_size = (SIMULATE_RATIO * int(job["interval"])) / (1000)
        logging.info("Frame Size is %f  Using Sim Ratio of 1:%d", frame_size, SIMULATE_RATIO)

        EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
        logging.info("Running Experiment Configuration #%d", EXPERIMENT_NUMBER)

        # # Grab historical basin data's relative time to start (for lineage)
        traj = None

        # EXECUTE SIMULATION ---------------------------------------------------------
        if self.skip_simulation:

            logging.info("1. SKIPPING SIMULATION.....")
            USE_SHM = False

            job["dcd"] = dcdFile
            key = wrapKey("jc", job["name"])
            self.data[key]["dcd"] = dcdFile

        else:
            logging.info("1. Run Simulation")

            # Prepare & source to config file
            with open(self.data["sim_conf_template"], "r") as template:
                source = template.read()

            # >>>>Storing DCD into shared memory on this node

            if USE_SHM:
                ramdisk = gettempdir()
                job["outputloc"] = ramdisk
                dcd_ramfile = os.path.join(ramdisk, job["name"] + ".dcd")
            else:
                job["outputloc"] = ""

            with open(conFile, "w") as sysconfig:
                sysconfig.write(source % job)
                logging.info("Config written to: " + conFile)

            # # Run simulation in parallel
            # if 'parallel' in job:
            #   numnodes = job['parallel']
            #   total_tasks = numnodes * 24
            #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

            # # Run simulation single threaded
            # else:
            #   cmd = 'namd2 %s > %s' % (conFile, logFile)

            # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
            check = executecmd("module list")
            logging.debug("%s", check)

            cmd = "namd2 +p%d %s > %s" % (PARALLELISM, conFile, logFile)

            #  MICROBENCH #1 (file to Lustre)
            # logging.debug("Executing Simulation:\n   %s\n", cmd)
            # bench = microbench()
            # bench.start()
            # stdout = executecmd(cmd)
            # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
            # bench.mark('SimExec:%s' % job['name'])
            # shm_contents = os.listdir('/dev/shm/out')
            # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
            # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
            # logging.info("Copy Complete to Lustre.")
            # bench.mark('CopyLustre:%s' % job['name'])
            # shutil.rmtree(ramdisk)
            # shm_contents = os.listdir('/dev/shm')
            # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
            # bench.show()

            max_expected_obs = int(job["runtime"]) // int(job["dcdfreq"])
            # Retry up to 3 attempts if the sim fails
            MAX_TRY = 3
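            # The acceptance threshold degrades with each retry: attempt 1 requires
            # 2/3 of the expected frames, attempt 2 requires 1/3, attempt 3 accepts any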
            for i in range(MAX_TRY, 0, -1):
                min_required_obs = int(max_expected_obs * ((i - 1) / (MAX_TRY)))
                logging.debug("Executing Simulation:\n   %s\n", cmd)
                logging.debug("# Obs Expected to see: %d", max_expected_obs)
                stdout = executecmd(cmd)
                logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
                # Check file for expected data
                if USE_SHM:
                    traj = md.load(dcd_ramfile, top=job["pdb"])
                else:
                    traj = md.load(dcdFile, top=job["pdb"])
                logging.info("Obs Threshold  = %4d", min_required_obs)
                logging.info("#Obs This Traj = %4d", traj.n_frames)
                if traj.n_frames >= min_required_obs:
                    logging.info("Full (enough) Sim Completed")
                    break
                logging.info("Detected a failed Simulation. Retrying the same sim.")
                break

            bench.mark("simulation")
            # bench.mark('SimExec:%s' % job['name'])

            # Internal stats
            sim_length = self.data["sim_step_size"] * int(job["runtime"])
            sim_realtime = bench.delta_last()
            sim_run_ratio = (sim_realtime / 60) / (sim_length / 1000000)
            logging.info("##SIM_RATIO %6.3f  min-per-ns-sim", sim_run_ratio)
            stat.collect("sim_ratio", sim_run_ratio)

            if USE_SHM:
                shm_contents = os.listdir(ramdisk)
                logging.debug("Ramdisk contents (should have files) : %s", str(shm_contents))

                if not os.path.exists(dcd_ramfile):
                    logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
                    time.sleep(10)

                if not os.path.exists(dcd_ramfile):
                    logging.warning("DCD STIILL FILE NOT FOUND!!!!")
                else:
                    logging.info("DCD File was found")

            # #  MICROBENCH #2 (file to Alluxio)
            # allux = AlluxioClient()
            # # copy to Aluxio FS
            # allux.put(ramdisk + job['name'] + '.dcd', '/')
            # logging.info("Copy Complete to Alluxio.")
            # bench.mark('CopyAllux:%s' % job['name'])

            # And copy to Lustre
            # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
            # And copy to Lustre (using zero-copy):
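            # sendfile copies in-kernel (no userspace buffering) and may transfer
            # fewer bytes than requested, hence the offset loop below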
            if USE_SHM:

                # ALT:  X-Mit to Cache Service and let cache write to disk lazily
                dcdfilesize = os.path.getsize(dcd_ramfile)
                with open(dcd_ramfile, "rb") as src, open(dcdFile, "w+b") as dest:
                    offset = 0
                    while True:
                        sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
                        if sent == 0:
                            break
                        offset += sent
                logging.info("Copy Complete to Lustre.")
                bench.mark("CopyLustre:%s" % job["name"])

            # TODO: Update job's metadata
            key = wrapKey("jc", job["name"])
            self.data[key]["dcd"] = dcdFile

        # ANALYSIS ---------------------------------------------------------------
        #  ANALYSIS ALGORITHM
        # 1. With combined Sim-analysis: file is loaded locally from shared mem
        logging.debug("2. Load DCD")

        # Load topology and define filters
        topo = self.protein.pdb
        pdb = md.load(job["pdb"])
        hfilt = pdb.top.select_atom_indices("heavy")
        pfilt = pdb.top.select("protein")
        afilt = pdb.top.select_atom_indices("alpha")

        # Load full higher dim trajectory
        if traj is None:
            if USE_SHM:
                traj = md.load(dcd_ramfile, top=pdb)
            else:
                traj = md.load(dcdFile, top=pdb)

        logging.debug("Trajectory Loaded: %s (%s)", job["name"], str(traj))
        traj_prot = traj.atom_slice(pfilt)
        traj_heavy = traj.atom_slice(hfilt)
        traj_alpha = traj.atom_slice(afilt)

        # Superpose Coordinates to Common Reference
        traj_prot.superpose(topo)

        # Calculate output Distance Space
        # Use of the second side chain atom is discussed in the ref paper on Timescape
        # The atom pairs and selected atoms are in the timescape module
        sc_pairs = side_chain_pairs(traj_prot)
        n_features = len(sc_pairs)

        # Use the CA atoms to calculate distance space
        # NOTE THE CONVERSION FROM NM TO ANG!!!!!!
        dist_space = 10 * DR.distance_space(traj_alpha)

        # Get the frame rate for conversion between the source traj and TimeScapes
        #  NOTE: TimeScapes runs at a coarser granularity
        logging.debug("Preprocessing output for TimeScapes: terrain")
        traj_frame_per_ps = SIMULATE_RATIO * int(job["interval"]) / 1000.0  # jc interval is in fs
        ts_frame_per_ps = int(self.data["timescape:rate"])  # this value is in ps
        frame_rate = int(ts_frame_per_ps / traj_frame_per_ps)
        logging.debug(
            "%5.2f fr/ps (Traj)     %5.2f fr/ps (TS)    FrameRate= %4.1f",
            traj_frame_per_ps,
            ts_frame_per_ps,
            frame_rate,
        )

        # Execute Timescapes agility program to detect spatial-temporal basins
        output_prefix = os.path.join(job["workdir"], job["name"])
        if not self.skip_timescape:
            # Prep file and save locally in shm
            tmploc = gettempdir()
            ts_out = os.path.join(tmploc, "traj_ts")
            ts_dcd = ts_out + ".dcd"
            ts_pdb = ts_out + ".pdb"
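            # Downsample the heavy-atom trajectory to the TimeScapes frame rate before saving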
            heavy = traj_heavy.slice(range(0, traj.n_frames, frame_rate))
            heavy.slice(0).save_pdb(ts_pdb)
            heavy.save_dcd(ts_dcd)

            # Gaussian Full Width at Half-Max value affects sliding window size
            # ref:  http://timescapes.biomachina.org/guide.pdf
            gmd_cut1 = int(self.data["timescape:gmd:low"])
            gmd_cut2 = int(self.data["timescape:gmd:hi"])
            gauss_wght_delta = int(self.data["timescape:delta"])

            # Execute timescapes' terrain.py on the pre-processed trajectory
            cmd = "terrain.py %s %s %d %d %d GMD %s" % (
                ts_pdb,
                ts_dcd,
                gmd_cut1,
                gmd_cut2,
                gauss_wght_delta,
                output_prefix,
            )
            logging.info("Running Timescapes:\n  %s", cmd)
            stdout = executecmd(cmd)
            logging.info("TimeScapes COMPLETE:\n%s", stdout)

        # Collect and parse Timescape output
        logging.debug("Parsing Timescapes output")
        ts_parse = TimeScapeParser(job["pdb"], output_prefix, job["name"], dcd=dcdFile, traj=traj, uniqueid=False)
        basin_list = ts_parse.load_basins(frame_ratio=frame_rate)
        n_basins = len(basin_list)

        minima_coords = {}
        basin_rms = {}
        basins = {}

        new_dmu = {}
        new_dsig = {}
        resid_rms_delta = {}

        stat.collect("num_basin", n_basins)
        downstream_list = []

        # FOR BOOTSTRAPPING USING RMSD
        ref_file = os.path.join(settings.workdir, self.data["pdb:ref:0"])
        logging.info("Loading RMSD reference frame from %s", self.data["pdb:ref:0"])
        refframe = md.load(ref_file)
        ref_alpha = refframe.atom_slice(refframe.top.select_atom_indices("alpha"))
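        # md.rmsd returns nm; scale by 10 for Angstroms (matching dist_space above)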
        rmsd = 10 * md.rmsd(traj_alpha, ref_alpha)

        # FOR RESIDUE RMSD
        res_rms_Kr = FEATURE_SET_RESID
        resrmsd = 10 * np.array([LA.norm(i - ref_alpha.xyz[0], axis=1) for i in traj_alpha.xyz])
        basin_res_rms = np.zeros(shape=(len(basin_list), traj_alpha.n_atoms))

        # LOAD CENTROIDS -- TODO: move to immut
        centroid = pickle.loads(self.catalog.get("centroid:ds"))
        basin_label_list = {}

        # Process each basin
        for i, basin in enumerate(basin_list):
            logging.info("  Processing basin #%2d", i)
            bid = basin.id
            downstream_list.append(bid)

            # Slice out minima coord  & save to disk (for now)
            #  TODO:  Store in memory in cache
            minima_coords[bid] = traj.slice(basin.mindex)
            jc_filename = os.path.join(settings.datadir, "basin_%s.pdb" % bid)
            minima_coords[bid].save_pdb(jc_filename)

            # METRIC CALCULATION
            a, b = basin.start, basin.end
            new_dmu[bid] = np.mean(dist_space[a:b], axis=0)
            new_dsig[bid] = np.std(dist_space[a:b], axis=0)

            # Collect Basin metadata
            basin_hash = basin.kv()
            basin_hash["pdbfile"] = jc_filename
            basin_hash["rmsd"] = np.median(rmsd[a:b])
            basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

            # FOR EXP #16 -- Residue RMSD

            # Set relative time (in ns)
            # basin_hash['time'] = reltime_start + (a * frame_rate) / 1000
            basins[bid] = basin_hash

            # LABEL ATEMPORAL DATA
            # Take every 4th frame
            stride = [dist_space[i] for i in range(a, b, 4)]
            rms = [LA.norm(centroid - i, axis=1) for i in stride]
            label_seq = [np.argmin(i) for i in rms]
            basin_label_list[bid] = label_seq
            logging.info("  Basin Processed: #%s, %d - %d", basin_hash["traj"], basin_hash["start"], basin_hash["end"])

        # RMSD DELTA (RESIDUE)
        basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
        basin_rms_delta_bykey = {basin.id: basin_res_rms_delta[i] for i, basin in enumerate(basin_list)}
        for k in basins.keys():
            basins[k]["resrms_delta"] = np.sum(basin_rms_delta_bykey[k][res_rms_Kr])

        # TODO:  Use Min Index as snapshot, median (or mean) DistSpace vals for each basin?????

        bench.mark("analysis")
        #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
        # try:
        self.wait_catalog()
        # except OverlayNotAvailable as e:
        #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

        # FOR EXPERIMENT METRICS
        src_basin = self.catalog.hgetall("basin:" + job["src_basin"])
        with open("/home-1/[email protected]/ddc/results/{0}_prov.log".format(settings.name), "a") as metric_out:
            for i, basin in enumerate(basin_list):
                bid = basin.id
                label_seq = basin_label_list[bid]
                basin_metric_label = LABEL10(label_seq)
                metric_out.write(
                    "BASIN,%s,%s,%s,%s\n"
                    % (bid, src_basin["label:10"], basin_metric_label, "".join([str(i) for i in label_seq]))
                )

        # Update Index Synchronized data lists
        basin_list_sorted = sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1])))
        for bid in basin_list_sorted:
            with self.catalog.pipeline() as pipe:
                while True:
                    try:
                        logging.debug("Updating %s basin indeces and distance space", len(basins))
                        pipe.rpush("basin:list", bid)
                        pipe.rpush("dspace", pickle.dumps(new_dmu[bid]))
                        basin_index, _ = pipe.execute()
                        break
                    except redis.WatchError as e:
                        logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                        continue
            basins[bid]["dsidx"] = basin_index - 1

        # Update Catalog with 1 Long Atomic Transaction
        with self.catalog.pipeline() as pipe:
            while True:
                try:
                    logging.debug("Update Filelist")
                    pipe.watch(wrapKey("jc", job["name"]))
                    file_idx = pipe.rpush("xid:filelist", job["dcd"]) - 1

                    # HD Points
                    logging.debug("Update HD Points")
                    start_index = pipe.llen("xid:reference")
                    pipe.multi()
                    pipe.rpush("xid:reference", *[(file_idx, x) for x in range(traj.n_frames)])
                    pipe.set("resrms:" + job["name"], resrmsd)

                    # Store all basin data
                    logging.debug("Updating %s basins", len(basins))
                    for bid in sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1]))):
                        pipe.hmset("basin:" + bid, basins[bid])
                        pipe.set("minima:%s" % bid, pickle.dumps(minima_coords[bid]))
                        for i in basin_label_list[bid]:
                            pipe.rpush("basin:labelseq:" + bid, i)

                    pipe.hset("anl_sequence", job["name"], mylogical_seqnum)

                    logging.debug("Executing")
                    pipe.execute()
                    break

                except redis.WatchError as e:
                    logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                    continue

        self.data[key]["xid:start"] = start_index
        self.data[key]["xid:end"] = start_index + traj.n_frames
        bench.mark("catalog")

        # ---- POST PROCESSING
        # Check for total data in downstream queue  & Notify control manager to run
        # 1. Det. what's still running
        job_queue = slurm.currentqueue()
        n_simjobs = -1  # -1 to exclude self
        for j in job_queue:
            if j["name"].startswith("sw"):
                n_simjobs += 1

        logging.info("SIM WORKER has detected %d other simulations running/pending.", len(job_queue))
        remain_percent = n_simjobs / self.data["numresources"]

        # Fault Tolerance: ensure the pipeline persists
        if n_simjobs == 0:
            logging.info("I am the last simulation. Ensuring the controler executes.")
            self.catalog.set("ctl:force", 1)

        # Send downstream notification when less than 10% of jobs are still running
        if not self.skip_notify and remain_percent < 0.1:
            # Notify the control manager 'CM'
            self.notify("ctl")

        if USE_SHM:
            shutil.rmtree(ramdisk)
            shm_contents = os.listdir("/dev/shm")
            logging.debug("Ramdisk contents (should be empty of DDC) : %s", str(shm_contents))

        # For benchmarking:
        bench.show()
        stat.show()

        # Return the list of new basin IDs for downstream processing
        return downstream_list
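Both catalog transactions above follow the same Redis optimistic-locking idiom: WATCH the contended key, perform the reads immediately, queue the writes with MULTI, and retry whenever a concurrent writer triggers a WatchError. A self-contained sketch of that pattern, with key names borrowed from the example and illustrative connection settings:

# Minimal sketch of the WATCH/MULTI/EXEC retry pattern used in execute().
# Key names mirror the example; host/port are illustrative.
import pickle
import redis

r = redis.StrictRedis(host='localhost', port=6379)

def append_frames(dcd_path, n_frames):
    """Atomically append a file and its frame references to the catalog."""
    with r.pipeline() as pipe:
        while True:
            try:
                # Abort the transaction if anyone else touches this key
                pipe.watch('xid:filelist')
                # While in WATCH mode, commands execute immediately
                file_idx = pipe.rpush('xid:filelist', dcd_path) - 1
                start_index = pipe.llen('xid:reference')
                # Commands after multi() are queued and applied atomically
                pipe.multi()
                pipe.rpush('xid:reference',
                           *[pickle.dumps((file_idx, f)) for f in range(n_frames)])
                pipe.execute()
                return start_index
            except redis.WatchError:
                continue  # a concurrent writer won; retry the read-modify-write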