Example #1
File: ctl_expl.py  Project: DaMSL/ddc
    def execute(self, new_basin_list):
        """Executing the Controler Algorithm. Load pre-analyzed lower dimensional
      subspaces, process user query and identify the sampling space with 
      corresponding distribution function for each user query. Calculate 
      convergence rates, run sampler, and then execute fairness policy to
      distribute resources among users' sampled values.
      """
        logging.debug('CTL MT')

        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        np.set_printoptions(precision=4, linewidth=150)

        self.data['timestep'] += 1
        logging.info('TIMESTEP: %d', self.data['timestep'])

        settings = systemsettings()
        bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
        stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

        if self.force_decision:
            new_basin_list = []

        # Connect to the cache
        self.cacheclient = CacheClient(settings.APPL_LABEL)

        # create the "binlist":
        numresources = self.data['numresources']
        explore_factor = float(self.data['sampler:explore'])
        topo = self.protein.top

        # Load new RMS Labels -- load all for now
        bench.start()
        logging.debug('Loading RMS Labels')
        start_index = max(0, self.data['ctl_index_head'])
        logging.debug(" Start_index=%d,  batch_size=%d", start_index,
                      len(new_basin_list))

        # Calculate variable PDF estimations for each subspace via bootstrapping:
        logging.debug(
            "=======================  <SUBSPACE CONVERGENCE> (skip)  ==================="
        )

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )
        stat.collect('new_basin', len(new_basin_list))

        ##### BARRIER
        self.wait_catalog()
        selected_index_list = []

        # QUERY PROCESSING & SAMPLING BELOW to select indices.
        EXPERIMENT_NUMBER = self.experiment_number
        logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)

        # TODO: ABSTRACT FEATURE DIMENSIONALITY
        n_features = 1653

        # Basin List will be the list of basins representing the new job candidates
        selected_basin_list = []
        all_basins = self.data['basin:list']
        num_prev_basins = int(self.data['basin:processed'])

        # Load Previous Distance Space (historical data)
        prev_ds_raw = self.catalog.lrange('dspace', 0, num_prev_basins)
        local_basins = {}
        if len(prev_ds_raw) > 0:
            ds_prev = np.zeros(shape=(len(prev_ds_raw), n_features))
            logging.info("Unpickling distance space to array: %s",
                         ds_prev.shape)
            for i, elm in enumerate(prev_ds_raw):
                ds_prev[i] = pickle.loads(elm)
            logging.info('Prev DS loaded. Size = %d', len(ds_prev))
        else:
            logging.info('NO Prev DS')
            ds_prev = []

        # FOR Supervised Classification (included for all runs for post-mortem processing)
        bin_labels_10 = [
            'T0', 'T1', 'T2', 'T3', 'T4', 'W0', 'W1', 'W2', 'W3', 'W4'
        ]
        bin_labels_25 = [(a, b) for a in range(5) for b in range(5)]
        bin_list_10 = {
            k: [int(i) for i in self.catalog.lrange('bin:10:%s' % k, 0, -1)]
            for k in bin_labels_10
        }
        bin_list_25 = {
            k:
            [int(i) for i in self.catalog.lrange('bin:25:%d_%d' % k, 0, -1)]
            for k in bin_labels_25
        }

        # Merge locally analyzed distance space
        delta_ds = np.zeros(shape=(len(new_basin_list), n_features))
        logging.info('Collecting new data from basins: %s', new_basin_list)
        for i, bid in enumerate(new_basin_list):
            basin = self.data['basin:' + bid]
            global_basin_index = int(basin['dsidx'])
            local_basins[global_basin_index] = basin

            # Update Distance Space
            dmu_ = self.catalog.lindex('dspace', global_basin_index)
            if dmu_ is None:
                print("ERROR!!!!")
                print(global_basin_index, i)
                for k, v in basin.items():
                    print(k, v)
                sys.exit(0)

            delta_ds[i] = pickle.loads(dmu_)

            # Update Supervised Classification
            label_seq = [
                int(i)
                for i in self.catalog.lrange('basin:labelseq:' + bid, 0, -1)
            ]
            local_basins[global_basin_index][
                'label:10'] = label10 = bin_label_10(label_seq)
            local_basins[global_basin_index][
                'label:25'] = label25 = bin_label_25(label_seq)
            bin_list_10[label10].append(global_basin_index)
            bin_list_25[label25].append(global_basin_index)
            self.data['basin:processed'] += 1

        if len(new_basin_list) > 0 and len(prev_ds_raw) > 0:
            dist_space = np.vstack((ds_prev, delta_ds))
        elif len(delta_ds) == 0:
            dist_space = np.array(ds_prev)
        elif len(ds_prev) == 0:
            logging.info('First set of distance coords loaded')
            dist_space = delta_ds
        else:
            logging.error("ERROR! NO DISTANCE SPACE IN THE CATALOG")

        # UNIFORM SAMPLER (BASIC)
        if EXPERIMENT_NUMBER == 12:
            basin_idx_list = []
            candidate_list = [[] for i in range(5)]
            for k, v in bin_list_10.items():
                candidate_list[int(k[1])].extend(v)

            # UNIFORM SAMPLING
            for sel_num in range(numresources):
                # Select Random start state
                start_state = np.random.randint(5)
                # Select random index
                rand_index = np.random.choice(len(candidate_list[start_state]))
                basin_idx = candidate_list[start_state][rand_index]
                basin_idx_list.append(basin_idx)

            for i in basin_idx_list:
                if i < num_prev_basins:
                    logging.info("Select index: %s   (Retrieve from Catalog)",
                                 i)
                    bid = self.data['basin:list'][i]
                    basin = self.catalog.hgetall('basin:%s' % bid)
                else:
                    logging.info(
                        "Select index: %s   (New locally built basin in mem)",
                        i)
                    basin = local_basins[i]
                logging.debug('   BASIN:  %s', basin['id'])
                selected_basin_list.append(basin)

        # BIASED SAMPLER (UPDATED)
        if EXPERIMENT_NUMBER == 13:

            # USING 10-BIN LABELS
            distro = [len(bin_list_10[i]) for i in bin_labels_10]

            # Create and invoke the sampler
            logging.info('Running the biased (umbrella) samplers')
            sampler = BiasSampler(distro)
            samplecount = np.zeros(len(bin_labels_10), dtype=np.int16)

            # Find the first index for each bin:
            explore_direction = 1 if explore_factor < .5 else -1
            for i, b in enumerate(bin_labels_10):
                members = bin_list_10[b]
                if len(members) == 0:
                    idx = 0
                else:
                    idx = int(np.floor(explore_factor * (len(members) - 1)))
                samplecount[i] = idx

            sel_bins = sampler.execute(numresources)

            logging.info(
                'Processing selected bins to find starting candidates')
            candidate_list = {}
            basin_idx_list = []
            for b in sel_bins:
                target_bin = bin_labels_10[b]
                if target_bin not in candidate_list:
                    candidate_list[target_bin] = bin_list_10[target_bin]
                #  TODO:  FULLY IMPLEMENT EXPLORE/EXPLOIT BUT INCL HISTORY/PROVENANCE
                # Lazy Update to centroid -- push to catalog immediately
                # vals = dist_space[bin_list_10[target_bin]]
                # logging.info('Updating Centroid for bin %s,  bindata: %s', target_bin, vals.shape)
                # centroid = np.mean(vals, axis=0)
                # self.catalog.set('bin:10:centroid:%s' % target_bin, pickle.dumps(centroid))
                # dist_center = [LA.norm(centroid - dist_space[i]) for i in bin_list_10[target_bin]]
                # candidate_list[target_bin] = sorted(zip(bin_list_10[target_bin], dist_center), key=lambda x: x[1])
                # basin_idx, basin_diff = candidate_list[target_bin][samplecount[b]]
                # samplecount[b] += explore_direction
                # # Wrap
                # if samplecount[b] == 0:
                #   samplecount = len(candidate_list[target_bin]) - 1
                # if samplecount[b] == len(candidate_list[target_bin]):
                #   samplecount = 0

                # FOR NOW PICK A RANDOM CANDIDATE
                rand_index = np.random.choice(len(candidate_list[target_bin]))
                basin_idx = candidate_list[target_bin][rand_index]
                logging.info('BIAS SAMPLER:\n   Bin: %s\n   basin: %d     Delta from Center: %6.3f  (note: dist not factored in)', \
                  target_bin, basin_idx, 0.)
                basin_idx_list.append(basin_idx)

            for i in basin_idx_list:
                if i < num_prev_basins:
                    logging.info("Select index: %s   (Retrieve from Catalog)",
                                 i)
                    bid = self.data['basin:list'][i]
                    basin = self.catalog.hgetall('basin:%s' % bid)
                else:
                    logging.info(
                        "Select index: %s   (New locally built basin in mem)",
                        i)
                    basin = local_basins[i]
                logging.debug('   BASIN:  %s', basin['id'])
                selected_basin_list.append(basin)

        # LATTICE SAMPLER (WITH HISTORICAL DATA)
        if EXPERIMENT_NUMBER == 14:
            # Merge Existing delta with DEShaw Pre-Processed data:
            logging.info('Merging DEShaw with existing generated data')

            # Set parameters for lattice
            Kr = [
                int(i) for i in self.catalog.lrange('lattice:features', 0, -1)
            ]
            support = int(self.data['lattice:support'])
            dspt = self.catalog.get('lattice:delta_support')
            delta_support = 5 if dspt is None else int(dspt)
            cutoff = float(self.data['lattice:cutoff'])
            logging.info('PARAMS  Kr:%s\n support:%d  dspt:%d  cutoff:%f', Kr,
                         support, delta_support, cutoff)

            # Load existing (base) lattice data
            logging.info(
                "Unpickling max/low FIS and derived lattice EMD values")
            max_fis = pickle.loads(self.catalog.get('lattice:max_fis'))
            low_fis = pickle.loads(self.catalog.get('lattice:low_fis'))
            dlat = pickle.loads(self.catalog.get('lattice:dlat'))

            # TODO: MOVE iset completely into catalog
            # Ik       = pickle.loads(self.catalog.get('lattice:iset'))

            # FOR NOW: Full DEShaw Index is saved on disk  (TODO: MOVE TO CATALOG)
            logging.info(
                "Loading full Itemset from disk (TODO: Det optimization on mem/time)"
            )
            Ik = pickle.load(open(settings.datadir + '/iset.p', 'rb'))
            # Item_set Keys (Ik) are only saved as a delta for space conservation
            if os.path.exists(settings.datadir + '/iset_delta.p'):
                Ik_delta = pickle.load(
                    open(settings.datadir + '/iset_delta.p', 'rb'))
            else:
                Ik_delta = {}
            # Merge previous item set delta with DEShaw index
            logging.info("Merging DEShaw Ik with Delta IK")
            for k, v in Ik_delta.items():
                Ik[k] = np.concatenate((Ik[k], v)) if k in Ik else v

            # Build Base Lattice Object
            base_lattice = lat.Lattice(ds_prev, Kr, cutoff, support)
            base_lattice.set_fis(max_fis, low_fis)
            base_lattice.set_dlat(dlat, Ik)

            if not self.force_decision and len(delta_ds) > 0:
                # Build Delta Lattice Object
                logging.info('Building Delta lattice. Num new items: %d',
                             len(delta_ds))
                delta_lattice = lat.Lattice(delta_ds,
                                            Kr,
                                            cutoff,
                                            delta_support,
                                            invert=invert_vals)
                delta_lattice.maxminer()
                delta_lattice.derive_lattice()

                # Update non-DEShaw delta itemset key index
                logging.info("Updating Itemsets and Distance Space Matrix")
                for k, v in delta_lattice.Ik.items():
                    Ik_delta[k] = np.concatenate(
                        (Ik_delta[k], v)) if k in Ik_delta else v

                # Save Ik delta to disk
                logging.info("Saving Delta Itemset (to disk)")
                pickle.dump(Ik_delta,
                            open(settings.datadir + '/iset_delta.p', 'wb'))

                #  Perform incremental maintenance
                logging.info('Merging Delta lattice with Base Lattice')
                base_lattice.merge(delta_lattice)

            # Create the Sampler object (also make clusters)
            logging.info('Invoking the Lattice Sampler')

            sampler = LatticeSampler(base_lattice)
            basin_id_list = sampler.execute(numresources)
            for index in basin_id_list:
                bid = all_basins[index]
                selected_basin_list.append(
                    self.catalog.hgetall('basin:%s' % bid))

        # LATTICE SAMPLER (DE NOVO)
        if EXPERIMENT_NUMBER == 15:

            # PREPROCESS
            N_features_src = topo.n_residues
            N_features_corr = (N_features_src**2 - N_features_src) // 2
            upt = np.triu_indices(N_features_src, 1)
            old_basin_ids = all_basins[:num_prev_basins]

            # DENOVO Exploratory Bootstrapping (RMSD)
            explore_factor = float(self.data['sampler:explore'])

            # TODO:  Better transition plan from explore to exploit
            self.data['sampler:explore'] *= .75
            executed_basins = self.catalog.lrange('executed', 0, -1)

            if explore_factor > 0:
                logging.info("EXPLORING Most active basins....")
                basindata = [
                    self.catalog.hgetall(bid) for bid in old_basin_ids
                ]
                for bid in new_basin_list:
                    basindata.append(self.data['basin:' + bid])

                basins_with_rms = [b for b in basindata if 'resrms_delta' in b]

                basin_by_rmsd = sorted(basins_with_rms,
                                       key=lambda x: float(x['resrms_delta']),
                                       reverse=True)
                explore_samples = int(np.floor(numresources * explore_factor))
                logging.info('Num to explore: %d  out of %d', explore_samples,
                             len(basin_by_rmsd))
                idx, num_sampled = 0, 0

                while idx < len(
                        basin_by_rmsd) and num_sampled < explore_samples:
                    selb = basin_by_rmsd[idx]
                    if selb['id'] in executed_basins:
                        logging.info(
                            'Selected %s, but it has been executed. Selecting next option',
                            selb['id'])
                    else:
                        selected_basin_list.append(selb)
                        logging.info('  (%d) EXPLORE BASIN:  %s  %f',
                                     num_sampled, selb['id'],
                                     float(selb['resrms_delta']))
                        numresources -= 1
                        num_sampled += 1
                    idx += 1

            # TODO:  Reduced Feature Sets
            #  Using Reduced Feature Set Alg #2 HERE
            support = int(.01 * len(dist_space))
            cutoff = 8

            # RE-Calc the whole lattice:
            logging.info("Building the new lattice")
            BUILD_NEW = not self.catalog.exists('lattice:bootstrapped')
            # TODO: Decision to go from build new to incr maint
            if BUILD_NEW:
                tval = .05
                Kr = lat.reduced_feature_set2(dist_space,
                                              cutoff,
                                              theta=tval,
                                              maxk=25)
                retry = 5
                while len(Kr) < 12 and retry > 0:
                    tval /= 2
                    retry -= 1
                    Kr = lat.reduced_feature_set2(dist_space,
                                                  cutoff,
                                                  theta=tval,
                                                  maxk=25)

                base_lattice = lat.Lattice(dist_space, Kr, cutoff, support)
                base_lattice.maxminer()
                base_lattice.derive_lattice()
                with self.catalog.pipeline() as pipe:
                    pipe.delete('lattice:kr')
                    for i in sorted(Kr):
                        pipe.rpush('lattice:kr', i)
                    pipe.execute()

            else:
                # Load existing (base) lattice data
                max_fis = pickle.loads(self.catalog.get('lattice:max_fis'))
                low_fis = pickle.loads(self.catalog.get('lattice:low_fis'))
                dlat = pickle.loads(self.catalog.get('lattice:dlat'))
                Ik = pickle.loads(self.catalog.get('lattice:iset'))
                num_k = self.catalog.get('lattice:num_k')
                Kr = [int(i) for i in self.catalog.lrange('lattice:kr', 0, -1)]
                if num_k is None:
                    num_k = max(8, min(15, numresources * 2))

                # Build Lattice Object
                logging.info('Building Existing lattice object')
                base_lattice = lat.Lattice(ds_prev, Kr, cutoff, support)
                base_lattice.set_fis(max_fis, low_fis)
                base_lattice.set_dlat(dlat, Ik)

                # Build Delta Lattice Object
                logging.info('Building Delta lattice. Num new items: %d',
                             len(delta_ds))
                delta_lattice = lat.Lattice(delta_ds, Kr, cutoff, 1)
                delta_lattice.maxminer()
                delta_lattice.derive_lattice()

                #  Perform incremental maintenance
                logging.info('Merging Delta lattice with Base Lattice')
                base_lattice.merge(delta_lattice)

            if numresources > 0:
                logging.info('Invoking the Lattice Sampler')
                sampler = LatticeSampler(base_lattice)
                basin_id_list = sampler.execute(numresources)

                # For now retrieve immediately from catalog
                self.wait_catalog()
                for index in basin_id_list:
                    bid = all_basins[index]
                    key = 'basin:%s' % bid
                    # Check to ensure it's not a new basin and that it exists in the DB
                    if self.catalog.exists(key):
                        logging.debug('KEY EXISTS: %s', key)
                        selbasin = self.catalog.hgetall(key)
                    else:
                        logging.debug('NO KEY: %s\n%s', key, self.data[key])
                        selbasin = self.data[key]
                    selected_basin_list.append(selbasin)

        # CORRELATION SAMPLER (KMEANS)
        if EXPERIMENT_NUMBER == 18:
            sampler = CorrelationSampler(cm_all, mu=dist_space)
            basin_id_list = sampler.execute(numresources)
            for bid in basin_id_list:
                selected_basin_list.append(
                    self.catalog.hgetall('basin:%s' % bid))

        bench.mark('GlobalAnalysis')

        # Generate new starting positions
        jcqueue = OrderedDict()
        src_traj_list = []
        for basin in selected_basin_list:
            src_traj_list.append(basin['traj'])
            if basin['traj'].startswith('desh'):
                global_params = getSimParameters(self.data, 'deshaw')
                # fileno = int(basin['traj'][-4:])
                # frame = int(basin['mindex'])
                # jcID, config = generateDEShawJC(fileno, frame)
            else:
                global_params = getSimParameters(self.data, 'gen')
                src_psf = self.catalog.hget('jc_' + basin['traj'], 'psf')
                global_params.update({'psf': src_psf})
                # jcID, config = generateExplJC(basin, jcid=None)

            jcID, config = generateFromBasin(basin)
            config.update(global_params)
            config['name'] = jcID
            logging.info("New Simulation Job Created: %s", jcID)
            for k, v in config.items():
                logging.debug("   %s:  %s", k, str(v))

            #  Add to the output queue & save config info
            jcqueue[jcID] = config
            logging.info("New Job Candidate Completed:  %s   #%d on the Queue",
                         jcID, len(jcqueue))

        stat.collect('src_traj_list', src_traj_list)
        bench.mark('GenInputParams')

        #  POST-PROCESSING  -------------------------------------
        logging.debug(
            "============================  <POST-PROCESSING & OUTPUT>  ============================="
        )
        self.wait_catalog()

        with self.catalog.pipeline() as pipe:
            for basin in selected_basin_list:
                pipe.rpush('executed', basin['id'])
            pipe.execute()

        # Append new distance values
        if EXPERIMENT_NUMBER == 14:
            # Save Ik delta to disk
            logging.info("Saving Delta Itemset (to disk)")
            pickle.dump(Ik_delta, open(settings.datadir + '/iset_delta.p',
                                       'wb'))

            with self.catalog.pipeline() as pipe:
                pipe.set('lattice:max_fis', pickle.dumps(base_lattice.max_fis))
                pipe.set('lattice:low_fis', pickle.dumps(base_lattice.low_fis))
                pipe.set('lattice:dlat', pickle.dumps(base_lattice.dlat))
                pipe.execute()

        if EXPERIMENT_NUMBER == 15:
            with self.catalog.pipeline() as pipe:
                pipe.set('lattice:max_fis', pickle.dumps(base_lattice.max_fis))
                pipe.set('lattice:low_fis', pickle.dumps(base_lattice.low_fis))
                pipe.set('lattice:dlat', pickle.dumps(base_lattice.dlat))
                pipe.execute()
            self.catalog.set('lattice:iset', pickle.dumps(base_lattice.Ik))

        for basin_idx, basin in local_basins.items():
            key = 'basin:' + basin['id']
            with self.catalog.pipeline() as pipe:
                pipe.hset(key, 'label:10', basin['label:10'])
                pipe.hset(key, 'label:25', basin['label:25'])
                pipe.rpush('bin:10:%s' % basin['label:10'], basin_idx)
                pipe.rpush('bin:25:%d_%d' % basin['label:25'], basin_idx)
                pipe.execute()
        self.catalog.set('basin:processed',
                         num_prev_basins + len(local_basins))

        # Clear current queue, mark previously queued jobs for GC, push new queue
        qlen = self.catalog.llen('jcqueue')
        logging.debug('Current queue len:   %s', str(qlen))
        if qlen > 0:
            curqueue = self.catalog.lrange('jcqueue', 0, -1)
            logging.info("Marking %d obsolete jobs for garbage collection",
                         len(curqueue))
            for jc_key in curqueue:
                key = wrapKey('jc', jc_key)
                config = self.catalog.hgetall(key)
                config['gc'] = 0
                # Add the gc job to the state to write back to the catalog (flags it for gc)
                self.addMut(key, config)
            self.catalog.delete('jcqueue')

        # Update cache hit/miss
        # hit = self.cache_hit
        # miss = self.cache_miss
        # logging.info('##CACHE_HIT_MISS %d %d  %.3f', hit, miss, (hit)/(hit+miss))
        # self.catalog.rpush('cache:hit', self.cache_hit)
        # self.catalog.rpush('cache:miss', self.cache_miss)

        self.data['jcqueue'] = list(jcqueue.keys())

        logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))
        # Update each new job with the latest convergence score and save to catalog (TODO: save may not be necessary)
        logging.debug("Updated Job Queue length:  %d",
                      len(self.data['jcqueue']))
        for jcid, config in jcqueue.items():
            # config['converge'] = self.data['converge']
            self.addMut(wrapKey('jc', jcid), config)

        self.notify('sim')

        bench.mark('PostProcessing')
        print('## TS=%d' % self.data['timestep'])
        bench.show()
        stat.show()

        return list(jcqueue.keys())
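Note: the uniform sampler above (the EXPERIMENT_NUMBER == 12 branch) reduces to a two-stage draw: pick one of the five macro-states uniformly at random, then pick one candidate basin index uniformly from that state's bin. Below is a minimal standalone sketch of that idea; the candidate indices and the helper name uniform_two_stage_sample are illustrative placeholders, not part of the project.

import numpy as np

def uniform_two_stage_sample(candidate_list, num_picks, rng=None):
    """Two-stage uniform sampling sketch: uniformly choose a macro-state,
    then uniformly choose one candidate index within that state's bin."""
    rng = np.random.default_rng() if rng is None else rng
    picks = []
    for _ in range(num_picks):
        # Stage 1: uniform over the non-empty macro-states (0..4)
        non_empty = [s for s, c in enumerate(candidate_list) if len(c) > 0]
        state = int(rng.choice(non_empty))
        # Stage 2: uniform over the candidate basin indices in that state
        basin_idx = int(rng.choice(candidate_list[state]))
        picks.append((state, basin_idx))
    return picks

if __name__ == '__main__':
    # Hypothetical candidate basin indices per macro-state (5 states, as in the controller)
    candidates = [[0, 3, 7], [1, 4], [2, 5, 8, 9], [6], [10, 11]]
    print(uniform_two_stage_sample(candidates, num_picks=4))

The controller's version additionally maps each sampled index back to either a catalog-stored basin or a locally built one, which the sketch omits.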
Example #2
File: overlayService.py  Project: DaMSL/ddc
class OverlayService(object):
  """Overlay services provide an abstracted layer of control to 
  allow an underlying implemented service to operate within
  a timedelayed HPC environment
  (TODO:  abstract to other environments)
  """
  __metaclass__ = abc.ABCMeta

  # Delay (in sec) to wait 
  # SERVICE_HEARTBEAT_DELAY = 5
  # SERVICE_STARTUP_DELAY = 5

  def __init__(self, name, port, **kwargs):

    settings = systemsettings()
    settings.applyConfig(name + '.json')

    # For Stat Collection
    self.stat = StatCollector('Overlay')
    self.stat.collect('name', name)

    # DEBUG
    self._host = socket.gethostname()
    self._port = port

    svc_name = kwargs.get('svc_name', type(self).__name__)
    self._name_svc = svc_name
    self._name_app = name
    self.lockfile = '%s_%s.lock' % (self._name_app, self._name_svc)
    logging.info('MY Lock file is: %s', self.lockfile)

    self.numslaves = kwargs.get('numslaves', 0)
    self.launchcmd = None
    self.shutdowncmd = None

    self._pid = None

    self._state = 'INIT'
    self._role  = kwargs.get('role', 'NONE')

    self.aggressive = kwargs.get('isaggressive', False)

    # Check if this is a Slurm job and get the settings
    self.slurm_id = os.getenv('SLURM_JOBID')
    if self.slurm_id is None:
      # Not running as a Slurm Job (set default TTL)
      self.ttl = dt.now().timestamp() + DEFAULT_TTL
      self.jobinfo = None
      self.stat.collect("slurmid","None")
    else:
      logging.info("SLURM JOB:  %s", str(self.slurm_id))
      self.jobinfo = slurm.jobinfo(int(self.slurm_id))
      endtime = dtparser.parse(self.jobinfo['EndTime'])
      self.ttl = endtime.timestamp()
      self.stat.collect("slurmid",self.slurm_id)
    self.stat.collect("ttl",self.ttl)


    # TODO: All service nodes are assumed to run on the same port (on different hosts),
    #  so the master holds only the host name
    self.master = kwargs.get('master', None)
    self.slavelist = {}

    self.terminationFlag = Event()
    self.handoverFlag = Event()
    self.shuttingdown = Event()
    self.state_running = Event()

    # TODO: Should we require basic settings via the systemsettings object
    #   and thus enforce that object on the abstract class?
    self.SERVICE_STARTUP_DELAY = 60
    self.SERVICE_HEARTBEAT_DELAY = 60

    # Register tty signaling to gracefully shut down when halted externally
    #   (e.g. from scancel)
    signal.signal(signal.SIGTERM, self.halt)
    signal.signal(signal.SIGINT, self.halt)
    signal.signal(signal.SIGQUIT, self.halt)
    signal.signal(signal.SIGHUP, self.halt)



  @abc.abstractmethod
  def ping(self, host='localhost', port=None):
    """Check whether the service is running on the given host (and port, if provided)
    """
    pass

  @abc.abstractmethod
  def handover_to(self):
    """Invoked to handover services from this instance to a new master
    """
    pass

  @abc.abstractmethod
  def handover_from(self):
    """Invoked as a callback when another master service is handing over
    service control (i.e. master duties) to this instance
    """
    pass

  def prepare_service(self):
    """Pre-execution processing (e.g. config file / env creation)
    """
    pass

  def tear_down(self):
    """Post-execution processing (e.g. remove temp files)
    """
    pass

  def idle(self):
    """To define an idle detection method and gracefully shutdown if the service
    is idle. If undefined, idle returns False and the system will always assume
    that is it never idle.
    """
    return False

  def launch_slave(self):
    """For multi-node services, the master node is responsible for starting/stopping
    all slave nodes. Launch slave is the method called when the master monitor
    needs to start up a new slave.
    """

  def stop_slave(self):
    """For multi-node services, the master node will invoke this to stop a 
    slave. 
    TODO: policy development for rotating slave nodes and ensuring they 
    are either not all stopping concurrently or do exactly that.
    """

  def start(self):
    """
    """
    logging.info('Overlay Service is Starting: %s', self._name_svc)
    logging.info('  SERVICE_STARTUP_DELAY  = %d', self.SERVICE_STARTUP_DELAY)
    logging.info('  SERVICE_HEARTBEAT_DELAY  = %d\n', self.SERVICE_HEARTBEAT_DELAY)

    # Validate the initialization state. Services first start in a slave state
    #  but change over to Master if no Master is detected. If the master is
    #  online, the service becomes a replica (future enhancements will allow
    #  distributed and multiple slaves/replicas)
    if self._role == 'SLAVE':
      logging.info('[%s] Starting as a slave. ', self._name_svc)
      slave_wait_ttl = self.SERVICE_STARTUP_DELAY * 4
      while slave_wait_ttl > 0:
        host, port, state = self.getconnection()
        logging.debug('SLAVE Coming online: Current Conn  = %s, %s', str(host), str(state))
        if host is None:
          # No Master alive -- become the master
          logging.info('No Master Detected. Becoming the Master.')
          self._role = 'MASTER'
          break
        elif (state == 'RUN' or state == 'EXPIRED') and self.ping(host, port):
          # FLAG MASTER
          logging.info('[%s] Master is Alive. Starting as a slave. ', self._name_svc)
          with open(self.lockfile, 'a') as conn:
            ts = dt.now().timestamp()
            conn.write('%s,%d,%f,%f,%s\n' % (self._host, self._port, ts, self.ttl, 'SLAVE'))
          # Master is ALIVE and RUNNING
          host = None
          self._role = 'REPLICA'
          break
        elif state == 'EXPIRED':
          # Master is ALIVE, but detected as EXPIRED
          logging.info('[%s] Master at %s has expired. Becoming the Master.', self._name_svc, str(host))
          host = None
          self._role = 'MASTER'
          break
        else:
          # Master is either registered as START or it's unavailable. Wait
          logging.warning('[%s] My Master is Not alive. Host is %s.  Waiting to see if it will come online...', self._name_svc, str(host))
          time.sleep(self.SERVICE_STARTUP_DELAY)
          slave_wait_ttl -= self.SERVICE_STARTUP_DELAY
          if slave_wait_ttl < 0:
            logging.warning('[%s] Could not find a master from %s. Taking over as Master.', self._name_svc, self._host)
            if os.path.exists(self.lockfile):
              os.remove(self.lockfile)
            host = None
            self._role = 'MASTER'
            break
      #TODO: 
      # self.masterport = port

    # Check to ensure lock is not already acquired
    while self.master is None:
      logging.info('Checking on the master.... (MY ROLE = %s)', self._role)
      if self._role == 'MASTER':
        try:
          lock = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
          ts = dt.now().timestamp()
          os.write(lock, bytes('%s,%d,%f,%f,%s\n' % (self._host, self._port, ts, self.ttl, 'START'), 'UTF-8'))
          os.close(lock)
          logging.debug("[%s] Lock File not found (I am creating it from %s)", self._name_svc, self._host)
          self._state = 'START'
          self.master = self._host
          break
        except FileExistsError as ex:
          logging.debug("[%s] Lock File exists (someone else has acquired it)", self._name_svc)

      host, port, state = self.getconnection()
      logging.debug('Overlay Service State = %s', str(state))
      if state is None or state == 'EXPIRED':
        logging.error("[%s] BAD state from existing %s. Removing it retrying to lock it.", self._name_svc, self.lockfile)
        if os.path.exists(self.lockfile):
          os.remove(self.lockfile)
        self._role = 'MASTER'
        continue

      if state == 'START':
        logging.debug("[%s] SOMEONE ELSE is starting my service on %s.", self._name_svc, host)
        timeout = self.SERVICE_STARTUP_DELAY
        while timeout > 0 and state == 'START':
          time.sleep(1)
          host, port, state = self.getconnection()
          timeout -= 1
        if state is not None and state == 'RUN':
          logging.info('Found a NEW master on %s.', host)
        else:
          logging.debug("[%s] Detected zombie master. Killing the lockfile and retrying to start locally.", self._name_svc)
          self._role  = 'MASTER'
          if os.path.exists(self.lockfile):
            os.remove(self.lockfile)
          continue

      if state == 'RUN':
        logging.debug("[%s] Checking if service is available on %s, %s", self._name_svc, host, port)
        if self.ping(host, port):
          logging.debug("[%s] Service is ALIVE on %s", self._name_svc, host)
          if self.aggressive:
            logging.debug("[%s] Starting AGGRESSIVE -- will initiate master takeover control protocol", self._name_svc)
            # If starting up as an aggressive replica -- assume immediate replication and control
            #  of service operation. This will cause this node to start as a
            #  replica and initiate the replication/handover protocol. Future improvements
            #  should allow the system to detect the number of replicas (for FT) and
            #  flag handover accordingly
            self._role  = 'REPLICA'
            self.master = host
            self.handoverFlag.set()
            break
          else:
            logging.debug('[%s] Service already running on %s. I am TIMID -- to take over, start me in aggressive mode', self._name_svc, host)
            return None
        else:
          logging.warning('[%s] Service is NOT running. Will attempt to recover and start locally.', self._name_svc)
          self._role  = 'MASTER'
          if os.path.exists(self.lockfile):
            os.remove(self.lockfile)

    self.prepare_service()

    if self.launchcmd is None:
      logging.error("[%s] Launch Command not set. It needs to be defined.", self._name_svc)
      return None

    if self.shutdowncmd is None:
      logging.error("[%s] Shutdown Command not set. It needs to be defined.", self._name_svc)
      return None

    # TODO: Check subproc call here -- should this also be threaded in a python wrap call?
    logging.info("[%s] Launching following command %s", self._name_svc, self.launchcmd)    
    args = shlex.split(self.launchcmd)
    service = proc.Popen(args)

    if service.returncode is not None:
      logging.error("[%s] ERROR starting local service on %s", self._name_svc, self.host)    
      # Exit or Return ???
      return None

    # Ensure service has started locally
    svc_up = None
    timeout = time.time() + self.SERVICE_STARTUP_DELAY
    while not svc_up:
      if time.time() > timeout:
        logging.error("[%s] Timed Out waiting on the server", self._name_svc)    
        break
      time.sleep(1)
      svc_up = self.ping()

    if not svc_up:
      logging.error("[%s] Service never started. You may need to retry.", self._name_svc)    
      return None

    self._state = 'RUN'
    self.state_running.set()
    logging.info("[%s] My Service started local on %s (ROLE=%s). Starting the local monitor.", self._name_svc, self._role, self._host)    
    ts = dt.now().timestamp()

    if self._role  == 'MASTER':
      if os.path.exists(self.lockfile):
        logging.info("[%s] Removing and creating the Lockfile.", self._name_svc)    
        os.remove(self.lockfile)
      else:
        logging.info("[%s] Creating a new Lockfile.", self._name_svc)    
      lock = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
      os.write(lock, bytes('%s,%d,%f,%f,%s\n' % (self._host, self._port, ts, self.ttl, 'RUN'), 'UTF-8'))
      os.close(lock)

    self.service = service

    t = Thread(target=self.monitor)
    t.start()
   
    return t

  def monitor(self):

    MISS_HB_TO = 5  # 5 min timeout

    logging.info("[%s] Monitor Daemon Started.", self._name_svc)
    # TODO: Move to settings
    missed_hb = 0
    allowed_misshb = 3

    # Redundant check to ensure service has started locally (it should be started already)
    alive = False
    timeout = time.time() + self.SERVICE_STARTUP_DELAY
    while not alive:
      if time.time() > timeout:
        logging.error("[%s-Mon] Timed Out waiting on the server", self._name_svc)    
        break
      time.sleep(1)
      alive = self.ping()
    if not alive:
      logging.error("[%s-Mon] Service never started. You may need to retry.", self._name_svc)    
      return

    # TODO:  For multi-node services, the Master monitor process will start up all slaves
    #  as individual jobs  -- FUTURE
    # if self.numslaves > 0:
    #   logging.info("[%s-Mon] Service is up locally on %s. Preparing to launch %d slaves.", self._name_svc, self.numslaves)    

    #   # TODO: Thread each slave with a separate monitor or launcher 
    #   for i in len(range(self.numslaves)):
    #     self.launch_slave()

    #  Blocking loop. Will only exit if the termination flag is set or by an
    #   in-loop check every heartbeat (via missed ping, lost pid, or idle timeout)
    logging.info("[%s-Mon] Service is Operational and READY. Running the event handler loop.", self._name_svc)    
    if self.jobinfo is not None:
      load_time = (dt.now() - dtparser.parse(self.jobinfo['StartTime'])).total_seconds()
      logging.info('Load Time:\nLOAD_TIME,%f', load_time)
      self.stat.collect('LOAD', load_time)

    while not self.terminationFlag.wait(self.SERVICE_HEARTBEAT_DELAY):
      logging.warning("[%s-Mon] Heartbeat. My ROLE = %s", self._name_svc, self._role)
      if self.shuttingdown.is_set():
        logging.warning("[%s-Mon] Detectd Service was flagged to shutdown. Monitor is halting.", self._name_svc)
        break

      # TODO:  try/catch service connection errors here
      if self.service.poll() is not None:
        logging.warning("[%s-Mon] Local Service has STOPPED on %s.", self._name_svc, self._host)
        break

      # Heartbeat
      if not self.ping():
        missed_hb += 1
        logging.warning("[%s-Mon]  MISSED Heartbeat on %s. Lost communicate with service for %d seconds.", 
              self._name_svc, self._host, missed_hb * self.SERVICE_HEARTBEAT_DELAY)
        if missed_hb * self.SERVICE_HEARTBEAT_DELAY > 60 * MISS_HB_TO:
          logging.warning("[%s-Mon]  Lost commo with Server for over %d minutes. Shutting Down",self._name_svc, MISS_HB_TO)
          self.terminationFlag.set()
        continue

      missed_hb = 0

      # CHECK IDLE TIME
      if self.idle():
        logging.info("[%s-Mon] Service is idle. Initiate graceful shutdown.", self._name_svc)    
        break

      if self._role == 'MASTER':
        with open(self.lockfile, 'r') as conn:
          lines = conn.read().strip().split('\n')
        logging.info("[%s-Mon] Checking Lockfile shows %d hosts online", self._name_svc, len(lines))    
        if len(lines) > 1:
          # TODO: Multiple slave
          for i in lines[1:]:
            logging.info('SLAVE detected: %s', i)
            data = i.split(',')
            self.slavelist[data[0]] = data[-1]
            logging.info("[%s-Mon]  Detected a new slave on %s", self._name_svc, data[0])
          self.handoverFlag.set()

      if self.handoverFlag.is_set():
        handover_begin = dt.now()
        # This service has been flagged to handover control to a new master or from an old one
        if self._role == 'MASTER':
          # Handover Control
          next_master = self.handover_to()
          if next_master is None:
            self.handoverFlag.clear()
            continue
          self._role = 'REPLICA'
          logging.info("[%s-Mon]  Handover complete. I am now terminating. My role is now %s.", self._name_svc, self._role)
          # Terminate
          handover_time = (dt.now() - handover_begin).total_seconds()
          logging.info('handover Time:\nHANDOVER_TIME_TO,%f', handover_time)
          self.stat.collect('HANDOVER_TO', handover_time)

          self.terminationFlag.set()

        elif self._role == 'REPLICA':
          # Initiate replica protocol and assume role as master
          self.handover_from()
          logging.info("[%s-Mon]  Handover complete. I am now the master.", self._name_svc)

          # Re-set lock file (TODO: other slaves)
          self._role = 'MASTER'
          ts = dt.now().timestamp()
          ttl = ts + self.ttl
          os.remove(self.lockfile)
          lock = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
          os.write(lock, bytes('%s,%d,%f,%f,%s\n' % (self._host, self._port, ts, ttl, 'RUN'), 'UTF-8'))
          os.close(lock)

          handover_time = (dt.now() - handover_begin).total_seconds()
          logging.info('handover Time:\nHANDOVER_TIME_FROM,%f', handover_time)
          self.stat.collect('HANDOVER_FROM', handover_time)

          self.handoverFlag.clear()

          logging.debug("[%s-Mon] Handover status is %s.", self._name_svc, (self.handoverFlag.is_set()))

        else:
          logging.error("[%s-Mon]  Flagged for handover, but cannot (I am neither a master or a replica -- what am I?", self._name_svc)
          self.terminationFlag.set()

    self.stop()
  
    # Post-termination logic
    logging.info("[%s] Service is shutdown on %s. Monitor is terminating", self._name_svc, self._host)

  def stop(self):
    """
    """
    if not self.shuttingdown.is_set():
      self.shuttingdown.set()
      logging.debug('MY ROLE = %s', self._role)
      if self._role == 'MASTER' and len(self.slavelist) == 0:
        logging.info('[%s] Last master shutdown. Verifying I am the last one', self._name_svc)
        host, port, state = self.getconnection()
        if host == self._host:
          os.remove(self.lockfile)
        else:
          logging.info('[%s] New service detected on %s.', self._name_svc, host)
      else:
        logging.info('[%s] This master is shutting down, but detected someone else.', self._name_svc)

      logging.info('[%s] Service shutting down on %s.', self._name_svc, self._host)
      args = shlex.split(self.shutdowncmd)
      result = proc.call(args)
      logging.info("[%s] shutdown with return code: %s.  Tearing Down remaining work.", self._name_svc, str(result))

      self.tear_down()
      self.stat.show()


  def halt(self, signum, frame):

    logging.warning('[%s] SIGNAL Received: %s', self._name_svc, siglist[signum])
    logging.warning('[%s] Setting Termination Flag and Gracefully stopping the service',self._name_svc)
    self.terminationFlag.set()
    self.stop()

  def getconnection(self):
    try:
      with open(self.lockfile, 'r') as conn:
        nodes = conn.read().strip().split('\n')
        if len(nodes) == 0:
          return None, None, None
        conn_string = nodes[0].split(',')
        if len(conn_string) < 5:
          return None, None, None
        host = conn_string[0]
        port = conn_string[1]
        ttl  = float(conn_string[3])
        state = conn_string[4].strip()
        ts = dt.now()
        if ts.timestamp() > ttl:
          logging.warning("Overlay Service Master has expired. Check lockfile: %s", self.lockfile)
          state = 'EXPIRED'
      return host, port, state
    except FileNotFoundError:
      logging.warning("WARNING!  Lock file is missing.")
      return None, None, None
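The lockfile records that getconnection() parses are the comma-separated lines written in start(): host,port,timestamp,ttl,state. A minimal standalone sketch of parsing one such record and expiring it against its TTL is shown below; the host name and port in the example record are made up for illustration.

from datetime import datetime as dt

def parse_lock_record(line):
    """Parse one 'host,port,ts,ttl,state' lockfile record.
    Returns (host, port, state); the state becomes EXPIRED once the TTL has passed."""
    fields = line.strip().split(',')
    if len(fields) < 5:
        return None, None, None
    host = fields[0]
    port = int(fields[1])
    ttl = float(fields[3])
    state = fields[4].strip()
    if dt.now().timestamp() > ttl:
        state = 'EXPIRED'
    return host, port, state

if __name__ == '__main__':
    now = dt.now().timestamp()
    # Illustrative record: a master registered as RUN with a TTL one hour from now
    record = 'node0421,6379,%f,%f,RUN' % (now, now + 3600)
    print(parse_lock_record(record))

Note that the real lockfile may contain additional slave records appended after the master line; getconnection() above only inspects the first record.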
Example #4
File: ctl_mv.py  Project: DaMSL/ddc
    def execute(self, thru_index):
      """Executing the Controler Algorithm. Load pre-analyzed lower dimensional
      subspaces, process user query and identify the sampling space with 
      corresponding distribution function for each user query. Calculate 
      convergence rates, run sampler, and then execute fairness policy to
      distribute resources among users' sampled values.
      """
      logging.debug('CTL MT')

    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")

      np.set_printoptions(precision=4, linewidth=150)

      self.data['timestep'] += 1
      logging.info('TIMESTEP: %d', self.data['timestep'])

      settings = systemsettings()
      bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
      stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

      # Connect to the cache
      self.cacheclient = CacheClient(settings.APPL_LABEL)

      # create the "binlist":
      numLabels = self.data['numLabels']
      numresources = self.data['numresources']


    # LOAD all new subspaces (?) and values

      ##### BARRIER
      self.wait_catalog()

      # Load new RMS Labels -- load all for now
      bench.start()
      logging.debug('Loading RMS Labels')
      start_index = max(0, self.data['ctlIndexHead'])

      # labeled_pts_rms = self.catalog.lrange('label:rms', self.data['ctlIndexHead'], thru_index)
      logging.debug(" Start_index=%d,  thru_index=%d,   ctlIndexHead=%d", start_index, thru_index, self.data['ctlIndexHead'])

      feallist = [np.fromstring(i) for i in self.catalog.lrange('subspace:feal', 0, -1)]
      num_pts = len(feallist)
      self.data['ctlIndexHead'] = thru_index
      thru_count = self.data['observe:count']

      logging.debug('##NUM_RMS_THIS_ROUND: %d', num_pts)
      stat.collect('numpts', len(feallist))


    # Calculate variable PDF estimations for each subspace via bootstrapping:
      logging.debug("=======================  <SUBSPACE CONVERGENCE>  =========================")

      # Bootstrap current sample for RMS
      logging.info("Feature Landscapes for %d points loaded. Calculating PDF.....", len(feallist))

      #  Static for now
      blocksize = 5000
      mv_convergence = op.bootstrap_block(feallist, blocksize)
      global_landscape = np.mean(feallist, axis=0)
      stat.collect('convergence', mv_convergence)
      stat.collect('globalfeal', global_landscape)
      # logging.info('MV Convergence values:\nCONV,%s', ','.join(['%5.3f'%i for i in mv_convergence]))
      # logging.info('Global Feature Landscape:\nFEAL,%s', ','.join(['%5.3f'%i for i in global_landscape]))
      logging.info('MV Convergence values:\nCONV,%s', str(mv_convergence[-1]))
      logging.info('Global Feature Landscape:\n%s', feal.tostring(global_landscape))

    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

      ##### BARRIER
      self.wait_catalog()
      selected_index_list = []

      # QUERY PROCESSING & SAMPLING BELOW to select indices. 
      EXPERIMENT_NUMBER = self.experiment_number
      logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)


      ###### EXPERIMENT #5:  BIASED (Umbrella) SAMPLER
      if EXPERIMENT_NUMBER == 5:
        if self.catalog.exists('label:deshaw'):
          logging.info("Loading DEShaw historical points.... From Catalog")
          rmslabel = [eval(x) for x in self.catalog.lrange('label:deshaw', 0, -1)]
        else:
          logging.info("Loading DEShaw historical points.... From File (and recalculating)")
          rmslabel = deshaw.labelDEShaw_rmsd()

        deshaw_samples = {b:[] for b in binlist}
        for i, b in enumerate(rmslabel):
          deshaw_samples[b].append(i)

        coord_origin = []

        conv_vals = np.array([v for k, v in sorted(convergence_rms.items())])
        norm_pdf_conv = conv_vals / sum(conv_vals)
        logging.info("Umbrella Samping PDF (Bootstrapping):")
        sampled_distro_perbin = {b: 0 for b in binlist}

        while numresources > 0:
          # First sampling is BIASED
          selected_bin = np.random.choice(len(binlist), p=norm_pdf_conv)
          A, B = binlist[selected_bin]
          sampled_distro_perbin[binlist[selected_bin]] += 1
          if bincounts[selected_bin] is not None and bincounts[selected_bin] > 0:
            # Secondary Sampling is Uniform
            sample_num = np.random.randint(bincounts[selected_bin])
            logging.debug('SAMPLER: selecting sample #%d from bin %s', 
              sample_num, str(binlist[selected_bin]))
            index = self.catalog.lindex('varbin:rms:%d_%d' % binlist[selected_bin], 
              sample_num)
            selected_index_list.append(index)
            coord_origin.append(('sim', index, binlist[selected_bin], '%d-D'%A))
            numresources -= 1
          elif len(deshaw_samples[binlist[selected_bin]]) > 0:
            index = np.random.choice(deshaw_samples[binlist[selected_bin]])
            logging.debug('SAMPLER: selecting DEShaw frame #%d from bin %s', 
              index, str(binlist[selected_bin]))
            # Negation indicates a historical index number
            selected_index_list.append(-index)
            coord_origin.append(('deshaw', index, binlist[selected_bin], '%d-D'%A))
            numresources -= 1
          else:
            logging.info("NO Candidates for bin: %s", binlist[selected_bin])



      ###### EXPERIMENT #10:  MutiVariate Nearest Neighbor (MVNN) SAMPLER
      if EXPERIMENT_NUMBER == 10:

        # Create the KD Tree from all feature landscapes (ignore first 5 features)
        kd = KDTree(100, 15, np.array(feallist), 'median')

        # Collect hypercubes
        hc = kd.getleaves()

        logging.info('KD Tree Stats')
        logging.info('    # HCubes   : %5d', len(hc))
        logging.info('    Largest  HC: %5d', max([v['count'] for k,v in hc.items()]))
        logging.info('    Smallest HC: %5d', min([v['count'] for k,v in hc.items()]))

        for key, hcube in hc.items():
          hc_feal = [feallist[i] for i in hcube['elm']]
          hc[key]['feal'] = np.mean(hc_feal, axis=0)

        #  Determine a scale and/or separate scales for each feature set
        desired = 10 - global_landscape
        logging.info('Desired Landscape:\n%s', feal.tostring(desired))

        #  Calc euclidean dist to each mean HC's feal
        nn = {k: LA.norm(desired[5:] - v['feal'][5:]) for k,v in hc.items()}

        #  Grab top N Neighbors (10 for now)
        neighbors = sorted(nn.items(), key=lambda x: x[1])[:10]
        logging.info('BestFit Landscape:\n%s', feal.tostring(hc[neighbors[0][0]]['feal']))

        ## DATA SAMPLER
        nn_keys = [i for i,w in neighbors]
        nn_wgts = np.array([w for i,w in neighbors])
        nn_wgts /= np.sum(nn_wgts)  # normalize

        coord_origin = []
        while numresources > 0:
          # First sampling is BIASED
          selected_hc = np.random.choice(nn_keys, p=nn_wgts)

          # Second is UNIFORM (within the HCube)
          index = np.random.choice(hc[selected_hc]['elm'])
          selected_index_list.append(index)
          src_state = np.argmax(feallist[index][:5])
          coord_origin.append(('sim', index, src_state, selected_hc))
          logging.info('Sampled Landscape [hc=%s]:\n%s', selected_hc, feal.tostring(feallist[index]))

          numresources -= 1


      elif EXPERIMENT_NUMBER == 11:

        #  Use only right most 10 features (non-normalized ones)
        inventory = np.array([f[10:] for f in feallist])
        desired = 10 - global_landscape
        logging.info('Desired Landscape (NOTE: only including A-B values):\n%s', feal.tostring(desired))

        selected_index_list = mvkp.knapsack(desired[10:], inventory, numresources, 2000000)
        coord_origin = [('sim', index, np.argmax(feallist[index][:5]), 'D') for index in selected_index_list]
        logging.info("KNAPSACK Completed:")
        logging.info('Target Distribution:\n%s', str(desired[10:]))
        logging.info('Selected Feature Landscapes:\n%s', '\n'.join(['%s'%feallist[i] for i in selected_index_list]))

    # Back Project to get new starting Coords for each sample  
      logging.debug("=======================  <INPUT PARAM GENERATION>  =================")
      logging.info('All Indices sampled. Back projecting to high dim coords')
      sampled_set = []
      for i in selected_index_list:
        traj = self.backProjection([i])
        sampled_set.append(traj)
      bench.mark('Sampler')

    # Generate new starting positions
      runtime = self.data['runtime']
      jcqueue = OrderedDict()
      for i, start_traj in enumerate(sampled_set):
        jcID, params = generateNewJC(start_traj)

        # TODO:  Update/check adaptive runtime, starting state
        jcConfig = dict(params,
              name    = jcID,
              runtime = runtime,     # In timesteps
              dcdfreq = self.data['dcdfreq'],           # Frame save rate
              interval = self.data['dcdfreq'] * self.data['sim_step_size'],                       
              temp    = 310,
              timestep = self.data['timestep'],
              gc      = 1,
              origin  = coord_origin[i][0],
              src_index = coord_origin[i][1],
              src_bin  = coord_origin[i][2],
              src_hcube = coord_origin[i][3],
              application   = settings.APPL_LABEL)

        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in jcConfig.items():
          logging.debug("   %s:  %s", k, str(v))

        #  Add to the output queue & save config info
        jcqueue[jcID] = jcConfig
        logging.info("New Job Candidate Completed:  %s   #%d on the Queue", jcID, len(jcqueue))

      bench.mark('GenInputParams')

    #  POST-PROCESSING  -------------------------------------
      logging.debug("============================  <POST-PROCESSING & OUTPUT>  =============================")
          
      self.wait_catalog()

      # Clear current queue, mark previously queued jobs for GC, push new queue
      qlen = self.catalog.llen('jcqueue')
      logging.debug('Current queue len:  %s', str(qlen))
      if qlen > 0:
        curqueue = self.catalog.lrange('jcqueue', 0, -1)
        logging.info("Marking %d obsolete jobs for garbage collection", len(curqueue))
        for jc_key in curqueue:
          key = wrapKey('jc', jc_key)
          config = self.catalog.hgetall(key)
          config['gc'] = 0
          # Add the job to the controller state to write back to the catalog (flags it for GC)
          self.addMut(key, config)
        self.catalog.delete('jcqueue')

      #  CATALOG UPDATES
      self.catalog.rpush('datacount', len(feallist))

      #  EXPR 7 Update:
      if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
        # self.catalog.storeNPArray(np.array(centroid), 'subspace:covar:centroid:%d' % cov_iteration)
        self.catalog.rpush('subspace:covar:thruindex', len(covar_pts))

      # Update cache hit/miss
      hit = self.cache_hit
      miss = self.cache_miss
      logging.info('##CACHE_HIT_MISS %d %d  %.3f', hit, miss, hit / max(1, hit + miss))
      self.catalog.rpush('cache:hit', self.cache_hit)
      self.catalog.rpush('cache:miss', self.cache_miss)

      self.data['jcqueue'] = list(jcqueue.keys())

      logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))
      # Update each new job with latest convergence score and save to catalog (TODO: save may not be necessary)
      logging.debug("Updated Job Queue length:  %d", len(self.data['jcqueue']))
      for jcid, config in jcqueue.items():
        # config['converge'] = self.data['converge']
        self.addMut(wrapKey('jc', jcid), config)
 
      bench.mark('PostProcessing')
      print ('## TS=%d' % self.data['timestep'])
      bench.show()
      stat.show()

      return list(jcqueue.keys())
Example #5
0
File: ctl_expl.py Project: DaMSL/ddc
    def execute(self, new_basin_list):
      """Executing the Controler Algorithm. Load pre-analyzed lower dimensional
      subspaces, process user query and identify the sampling space with 
      corresponding distribution function for each user query. Calculate 
      convergence rates, run sampler, and then execute fairness policy to
      distribute resources among users' sampled values.
      """
      logging.debug('CTL MT')

    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")
      np.set_printoptions(precision=4, linewidth=150)

      self.data['timestep'] += 1
      logging.info('TIMESTEP: %d', self.data['timestep'])

      settings = systemsettings()
      bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
      stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

      if self.force_decision:
        new_basin_list = []

      # Connect to the cache
      self.cacheclient = CacheClient(settings.APPL_LABEL)

      # create the "binlist":
      numresources = self.data['numresources']
      explore_factor = float(self.data['sampler:explore'])
      topo = self.protein.top

      # Load new RMS Labels -- load all for now
      bench.start()
      logging.debug('Loading RMS Labels')
      start_index = max(0, self.data['ctl_index_head'])
      logging.debug(" Start_index=%d,  batch_size=%d", start_index, len(new_basin_list))

    # Calculate variable PDF estimations for each subspace via bootstrapping:
      logging.debug("=======================  <SUBSPACE CONVERGENCE> (skip)  ===================")

    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")
      stat.collect('new_basin', len(new_basin_list))

      ##### BARRIER
      self.wait_catalog()
      selected_index_list = []

      # QUERY PROCESSING & SAMPLING BELOW to select indices. 
      EXPERIMENT_NUMBER = self.experiment_number
      logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)

      # TODO: ABSTRACT FEATURE DIMENSIONALITY
      n_features = 1653

      # Basin list: the basins representing the new job candidates
      selected_basin_list = []
      all_basins = self.data['basin:list']
      num_prev_basins = int(self.data['basin:processed'])

      # Load Previous Distance Space (historical data)
      prev_ds_raw = self.catalog.lrange('dspace', 0, num_prev_basins)
      local_basins = {}
      if  len(prev_ds_raw) > 0:
        ds_prev = np.zeros(shape=(len(prev_ds_raw), n_features))
        logging.info("Unpickling distance space to array: %s", ds_prev.shape)
        for i, elm in enumerate(prev_ds_raw):
          ds_prev[i] = pickle.loads(elm)
        logging.info('Prev DS loaded. Size = %d', len(ds_prev))
      else:
        logging.info('NO Prev DS')
        ds_prev = []

      # FOR Supervised Classification (included for all experiments for post-mortem processing)
      bin_labels_10 = ['T0', 'T1', 'T2', 'T3', 'T4', 'W0', 'W1', 'W2', 'W3', 'W4']
      bin_labels_25 = [(a,b) for a in range(5) for b in range(5)]
      bin_list_10 = {k: [int(i) for i in self.catalog.lrange('bin:10:%s' % k, 0, -1)] for k in bin_labels_10}
      bin_list_25 = {k: [int(i) for i in self.catalog.lrange('bin:25:%d_%d' % k, 0, -1)] for k in bin_labels_25}
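      # Each map ties a label to the global dspace indices of basins previously assigned
      # that label; newly processed basins are appended to these lists below.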

      # Merge locally analyzed distance space
      delta_ds = np.zeros(shape=(len(new_basin_list), n_features))
      logging.info('Collecting new data from basins: %s', new_basin_list)
      for i, bid in enumerate(new_basin_list):
        basin = self.data['basin:'+bid]
        global_basin_index = int(basin['dsidx'])
        local_basins[global_basin_index] = basin

        # Update Distance Space
        dmu_  = self.catalog.lindex('dspace', global_basin_index)
        if dmu_ is None:
          print("ERROR!!!!")
          print(global_basin_index, i)
          for k,v in basin.items():
            print(k,v)
          sys.exit(0)

        delta_ds[i] = pickle.loads(dmu_)

        # Update supervised classification
        label_seq = [int(i) for i in self.catalog.lrange('basin:labelseq:'+bid, 0, -1)]
        local_basins[global_basin_index]['label:10'] = label10 = bin_label_10(label_seq)
        local_basins[global_basin_index]['label:25'] = label25 = bin_label_25(label_seq)
        bin_list_10[label10].append(global_basin_index)
        bin_list_25[label25].append(global_basin_index)
        self.data['basin:processed'] += 1


      if len(new_basin_list) > 0 and len(prev_ds_raw) > 0:
        dist_space = np.vstack((ds_prev, delta_ds))
      elif len(delta_ds) == 0:
        dist_space = np.array(ds_prev)
      elif len(ds_prev) == 0:
        logging.info('First set of distance coords loaded')
        dist_space = delta_ds
      else:
        logging.error("ERROR! NO DISTANCE SPACE IN THE CATALOG")

      # UNIFORM SAMPLER (BASIC)
      if EXPERIMENT_NUMBER == 12:
        basin_idx_list = []
        candidate_list = [[] for i in range(5)]
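        # Merge the T- and W-bins that share the same state digit (k[1]) into a single
        # candidate pool per starting state (0-4).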
        for k,v in bin_list_10.items():
          candidate_list[int(k[1])].extend(v)

        # UNIFORM SAMPLING
        for sel_num in range(numresources):
          # Select Random start state
          start_state = np.random.randint(5)
          # Select random index
          rand_index = np.random.choice(len(candidate_list[start_state]))
          basin_idx = candidate_list[start_state][rand_index]
          basin_idx_list.append(basin_idx)

        for i in basin_idx_list:
          if i < num_prev_basins:
            logging.info("Select index: %s   (Retrieve from Catalog)", i)
            bid = self.data['basin:list'][i]
            basin = self.catalog.hgetall('basin:%s'%bid)
          else:
            logging.info("Select index: %s   (New locally built basin in mem)", i)  
            basin = local_basins[i]
          logging.debug('   BASIN:  %s', basin['id'])
          selected_basin_list.append(basin)

      # BIASED SAMPLER (UPDATED)
      if EXPERIMENT_NUMBER == 13:

        # USING 10-BIN LABELS
        distro = [len(bin_list_10[i]) for i in bin_labels_10]

        # Create and invoke the sampler
        logging.info('Running the biased (umbrella) samplers')
        sampler = BiasSampler(distro)
        samplecount = np.zeros(len(bin_labels_10), dtype=np.int16)

        # Find the first index for each bin:
        explore_direction = 1 if explore_factor < .5 else -1
        for i, label in enumerate(bin_labels_10):
          blist = bin_list_10[label]
          if len(blist) == 0:
            idx = 0
          else:
            idx = int(np.floor(explore_factor * (len(blist) - 1)))
          samplecount[i] = idx

        sel_bins = sampler.execute(numresources)

        logging.info('Processing selected bins to find starting candidates')
        candidate_list = {}
        basin_idx_list = []
        for b in sel_bins:
          target_bin = bin_labels_10[b]
          if target_bin not in candidate_list:
            candidate_list[target_bin] = bin_list_10[target_bin]
          #  TODO:  FULLY IMPLEMENT EXPLORE/EXPLOIT BUT INCL HISTORY/PROVENANCE
            # Lazy Update to centroid -- push to catalog immediately
            # vals = dist_space[bin_list_10[target_bin]]
            # logging.info('Updating Centroid for bin %s,  bindata: %s', target_bin, vals.shape)
            # centroid = np.mean(vals, axis=0)
            # self.catalog.set('bin:10:centroid:%s' % target_bin, pickle.dumps(centroid))
            # dist_center = [LA.norm(centroid - dist_space[i]) for i in bin_list_10[target_bin]]
            # candidate_list[target_bin] = sorted(zip(bin_list_10[target_bin], dist_center), key=lambda x: x[1])
          # basin_idx, basin_diff = candidate_list[target_bin][samplecount[b]]
          # samplecount[b] += explore_direction
          # # Wrap
          # if samplecount[b] == 0:
          #   samplecount = len(candidate_list[target_bin]) - 1
          # if samplecount[b] == len(candidate_list[target_bin]):
          #   samplecount = 0

          # FOR NOW PICK A RANDOM CANDIDATE 
          rand_index = np.random.choice(len(candidate_list[target_bin]))
          basin_idx = candidate_list[target_bin][rand_index]
          logging.info('BIAS SAMPLER:\n   Bin: %s\n   basin: %d     Delta from Center: %6.3f  (note: dist not factored in)', \
            target_bin, basin_idx, 0.)
          basin_idx_list.append(basin_idx)

        for i in basin_idx_list:
          if i < num_prev_basins:
            logging.info("Select index: %s   (Retrieve from Catalog)", i)
            bid = self.data['basin:list'][i]
            basin = self.catalog.hgetall('basin:%s'%bid)
          else:
            logging.info("Select index: %s   (New locally built basin in mem)", i)  
            basin = local_basins[i]
          logging.debug('   BASIN:  %s', basin['id'])
          selected_basin_list.append(basin)

      # LATTICE SAMPLER (WITH HISTORICAL DATA)
      if EXPERIMENT_NUMBER == 14:
        # Merge Existing delta with DEShaw Pre-Processed data:
        logging.info('Merging DEShaw with existing generated data')

        # Set parameters for lattice
        Kr = [int(i) for i in self.catalog.lrange('lattice:features', 0, -1)]
        support = int(self.data['lattice:support'])
        dspt = self.catalog.get('lattice:delta_support')
        delta_support = 5 if dspt is None else int(dspt)
        cutoff  = float(self.data['lattice:cutoff'])
        logging.info('PARAMS  Kr:%s\n support:%d  dspt:%d  cutoff:%f', Kr, support, delta_support, cutoff)

        # Load existing (base) lattice data
        logging.info("Unpickling max/low FIS and derived lattice EMD values")
        max_fis    = pickle.loads(self.catalog.get('lattice:max_fis'))
        low_fis    = pickle.loads(self.catalog.get('lattice:low_fis'))
        dlat       = pickle.loads(self.catalog.get('lattice:dlat'))

        # TODO: MOVE iset completely into catalog        
        # Ik       = pickle.loads(self.catalog.get('lattice:iset'))

        # FOR NOW: Full DEShaw Index is saved on disk  (TODO: MOVE TO CATALOG)
        logging.info("Loading full Itemset from disk (TODO: Det optimization on mem/time)")
        Ik        = pickle.load(open(settings.datadir + '/iset.p', 'rb'))
        # Item_set Keys (Ik) are only saved as a delta for space conservation
        if os.path.exists(settings.datadir + '/iset_delta.p'):
          Ik_delta  = pickle.load(open(settings.datadir + '/iset_delta.p', 'rb'))
        else:
          Ik_delta = {}
        # Merge previous item set delta with DEShaw index
        logging.info("Merging DEShaw Ik with Delta IK")
        for k,v in Ik_delta.items():
          Ik[k] = np.concatenate((Ik[k], v)) if k in Ik else v

        # Build Base Lattice Object
        base_lattice=lat.Lattice(ds_prev, Kr, cutoff, support)
        base_lattice.set_fis(max_fis, low_fis)
        base_lattice.set_dlat(dlat, Ik)

        if not self.force_decision and len(delta_ds) > 0:
          # Build Delta Lattice Object
          logging.info('Building Delta lattice. Num new items: %d', len(delta_ds))
          delta_lattice = lat.Lattice(delta_ds, Kr, cutoff, delta_support, invert=invert_vals)
          delta_lattice.maxminer()
          delta_lattice.derive_lattice()

          # Update non-DEShaw delta itemset key index
          logging.info("Updating Itemsets and Distance Space Matrix")
          for k,v in delta_lattice.Ik.items():
            Ik_delta[k] = np.concatenate((Ik_delta[k], v)) if k in Ik_delta else v

          # Save Ik delta to disk
          logging.info("Saving Delta Itemset (to disk)")
          pickle.dump(Ik_delta, open(settings.datadir + '/iset_delta.p', 'wb'))

          #  Perform incremental maintenance
          logging.info('Merging Delta lattice with Base Lattice')
          base_lattice.merge(delta_lattice)

        # Create the Sampler object (also make clusters)
        logging.info('Invoking the Lattice Sampler')

        sampler = LatticeSampler(base_lattice)
        basin_id_list = sampler.execute(numresources)
        for index in basin_id_list:
          bid = all_basins[index]
          selected_basin_list.append(self.catalog.hgetall('basin:%s'%bid))

      # LATTICE SAMPLER (DE NOVO)
      if EXPERIMENT_NUMBER == 15:

        # PREPROCESS
        N_features_src = topo.n_residues
        N_features_corr = (N_features_src**2 - N_features_src) // 2 
        upt = np.triu_indices(N_features_src, 1)
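        # N_features_corr is the number of unique residue pairs (upper triangle, no diagonal);
        # upt holds their row/column index arrays.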
        old_basin_ids = all_basins[:num_prev_basins]
    
        # DENOVO Exploratory Bootstrapping (RMSD)
        explore_factor = float(self.data['sampler:explore'])

        # TODO:  Better transition plan from explore to exploit
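        # For now the explore fraction simply decays geometrically (25% reduction per
        # controller pass), gradually shifting the resource budget toward exploitation.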
        self.data['sampler:explore'] *= .75   
        executed_basins = self.catalog.lrange('executed', 0, -1)

        if explore_factor > 0:
          logging.info("EXPLORING Most active basins....")
          basindata = [self.catalog.hgetall('basin:%s' % bid) for bid in old_basin_ids]
          for bid in new_basin_list:
            basindata.append(self.data['basin:'+bid])          

          basins_with_rms = [b for b in basindata if 'resrms_delta' in b]

          basin_by_rmsd = sorted(basins_with_rms, key=lambda x: float(x['resrms_delta']), reverse=True)
          explore_samples = int(np.floor(numresources * explore_factor))
          logging.info('Num to explore: %d  out of %d', explore_samples, len(basin_by_rmsd))
          idx, num_sampled = 0, 0

          while idx < len(basin_by_rmsd) and num_sampled < explore_samples:
            selb = basin_by_rmsd[idx]
            if selb['id'] in executed_basins:
              logging.info('Selected %s, but it has been executed. Selecting next option', selb['id'])
            else:
              selected_basin_list.append(selb)
              logging.info('  (%d) EXPLORE BASIN:  %s  %f', num_sampled, selb['id'], float(selb['resrms_delta']))
              numresources -= 1
              num_sampled += 1
            idx += 1

        # TODO:  Reduced Feature Sets
        #  Using Reduced Feature Set Alg #2 HERE
        support = int(.01 * len(dist_space))
        cutoff  = 8

        # RE-Calc the whole lattice:
        logging.info("Building the new lattice")
        BUILD_NEW = not self.catalog.exists('lattice:bootstrapped')
        # TODO: Decision to go from build new to incr maint
        if BUILD_NEW:
          tval = .05
          Kr = lat.reduced_feature_set2(dist_space, cutoff, theta=tval, maxk=25)
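          # If too few features survive, halve theta (up to 5 retries) to loosen the
          # selection threshold until at least 12 features are retained.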
          retry = 5
          while len(Kr) < 12 and retry > 0:
            tval /= 2
            retry -= 1
            Kr = lat.reduced_feature_set2(dist_space, cutoff, theta=tval, maxk=25)

          base_lattice = lat.Lattice(dist_space, Kr, cutoff, support)
          base_lattice.maxminer()
          base_lattice.derive_lattice()
          with self.catalog.pipeline() as pipe:
            pipe.delete('lattice:kr')
            for i in sorted(Kr):
              pipe.rpush('lattice:kr', i)
            pipe.execute()

        else:
          # Load existing (base) lattice data
          max_fis    = pickle.loads(self.catalog.get('lattice:max_fis'))
          low_fis    = pickle.loads(self.catalog.get('lattice:low_fis'))
          dlat       = pickle.loads(self.catalog.get('lattice:dlat'))
          Ik         = pickle.loads(self.catalog.get('lattice:iset'))
          num_k      = self.catalog.get('lattice:num_k')
          Kr         = [int(i) for i in self.catalog.lrange('lattice:kr', 0, -1)]
          if num_k is None:
            num_k = max(8, min(15, numresources*2))

          # Build Lattice Object
          logging.info('Building Existing lattice object')
          base_lattice=lat.Lattice(ds_prev, Kr, cutoff, support)
          base_lattice.set_fis(max_fis, low_fis)
          base_lattice.set_dlat(dlat, Ik)

          # Build Delta Lattice Object
          logging.info('Building Delta lattice. Num new items: %d', len(delta_ds))
          delta_lattice = lat.Lattice(delta_ds, Kr, cutoff, 1)
          delta_lattice.maxminer()
          delta_lattice.derive_lattice()

          #  Perform incremental maintenance
          logging.info('Merging Delta lattice with Base Lattice')
          base_lattice.merge(delta_lattice)

        if numresources > 0:
          logging.info('Invoking the Lattice Sampler')
          sampler = LatticeSampler(base_lattice)
          basin_id_list = sampler.execute(numresources)

          # For now retrieve immediately from catalog
          self.wait_catalog()
          for index in basin_id_list:
            bid = all_basins[index]
            key = 'basin:%s'%bid
            # Check to ensure its not a new basin and that it exists in the DB
            if self.catalog.exists(key):
              logging.debug('KEY EXISTS: %s', key)
              selbasin = self.catalog.hgetall(key)
            else:
              logging.debug('NO KEY: %s\n%s', key, self.data[key])
              selbasin = self.data[key]
            selected_basin_list.append(selbasin)

      # CORRELATION SAMPLER (KMEANS)
      if EXPERIMENT_NUMBER == 18:
        sampler = CorrelationSampler(cm_all, mu=dist_space)
        basin_id_list = sampler.execute(numresources)
        for bid in basin_id_list:
          selected_basin_list.append(self.catalog.hgetall('basin:%s'%bid))

      bench.mark('GlobalAnalysis')

    # Generate new starting positions
      jcqueue = OrderedDict()
      src_traj_list = []
      for basin in selected_basin_list:
        src_traj_list.append(basin['traj'])
        if basin['traj'].startswith('desh'):
          global_params = getSimParameters(self.data, 'deshaw')
          # fileno = int(basin['traj'][-4:])
          # frame = int(basin['mindex'])
          # jcID, config = generateDEShawJC(fileno, frame)
        else:
          global_params = getSimParameters(self.data, 'gen')
          src_psf = self.catalog.hget('jc_' + basin['traj'], 'psf')
          global_params.update({'psf': src_psf})
          # jcID, config = generateExplJC(basin, jcid=None)

        jcID, config = generateFromBasin(basin)
        config.update(global_params)
        config['name'] = jcID
        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in config.items():
          logging.debug("   %s:  %s", k, str(v))

        #  Add to the output queue & save config info
        jcqueue[jcID] = config
        logging.info("New Job Candidate Completed:  %s   #%d on the Queue", jcID, len(jcqueue))

      stat.collect('src_traj_list', src_traj_list)
      bench.mark('GenInputParams')

    #  POST-PROCESSING  -------------------------------------
      logging.debug("============================  <POST-PROCESSING & OUTPUT>  =============================")
      self.wait_catalog()

      with self.catalog.pipeline() as pipe:
        for basin in selected_basin_list:
          pipe.rpush('executed', basin['id'])
        pipe.execute()

      # Append new distance values
      if EXPERIMENT_NUMBER == 14:
        # Save Ik delta to disk
        logging.info("Saving Delta Itemset (to disk)")
        pickle.dump(Ik_delta, open(settings.datadir + '/iset_delta.p', 'wb'))

        with self.catalog.pipeline() as pipe:
          pipe.set('lattice:max_fis', pickle.dumps(base_lattice.max_fis))
          pipe.set('lattice:low_fis', pickle.dumps(base_lattice.low_fis))
          pipe.set('lattice:dlat', pickle.dumps(base_lattice.dlat))
          pipe.execute()

      if EXPERIMENT_NUMBER == 15:
        with self.catalog.pipeline() as pipe:
          pipe.set('lattice:max_fis', pickle.dumps(base_lattice.max_fis))
          pipe.set('lattice:low_fis', pickle.dumps(base_lattice.low_fis))
          pipe.set('lattice:dlat', pickle.dumps(base_lattice.dlat))
          pipe.execute()
        self.catalog.set('lattice:iset', pickle.dumps(base_lattice.Ik))

      for basin_idx, basin in local_basins.items():
        key = 'basin:' + basin['id']
        with self.catalog.pipeline() as pipe:
          pipe.hset(key, 'label:10', basin['label:10'])
          pipe.hset(key, 'label:25', basin['label:25'])
          pipe.rpush('bin:10:%s' % basin['label:10'], basin_idx)
          pipe.rpush('bin:25:%d_%d' % basin['label:25'], basin_idx)
          pipe.execute()
      self.catalog.set('basin:processed', num_prev_basins + len(local_basins))

      # Clear current queue, mark previously queued jobs for GC, push new queue
      qlen = self.catalog.llen('jcqueue')
      logging.debug('Current queue len:  %s', str(qlen))
      if qlen > 0:
        curqueue = self.catalog.lrange('jcqueue', 0, -1)
        logging.info("Marking %d obsolete jobs for garbage collection", len(curqueue))
        for jc_key in curqueue:
          key = wrapKey('jc', jc_key)
          config = self.catalog.hgetall(key)
          config['gc'] = 0
          # Add the job to the controller state to write back to the catalog (flags it for GC)
          self.addMut(key, config)
        self.catalog.delete('jcqueue')


      # Update cache hit/miss
      # hit = self.cache_hit
      # miss = self.cache_miss
      # logging.info('##CACHE_HIT_MISS %d %d  %.3f', hit, miss, (hit)/(hit+miss))
      # self.catalog.rpush('cache:hit', self.cache_hit)
      # self.catalog.rpush('cache:miss', self.cache_miss)

      self.data['jcqueue'] = list(jcqueue.keys())

      logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))
      # Update each new job with latest convergence score and save to catalog (TODO: save may not be necessary)
      logging.debug("Updated Job Queue length:  %d", len(self.data['jcqueue']))
      for jcid, config in jcqueue.items():
        # config['converge'] = self.data['converge']
        self.addMut(wrapKey('jc', jcid), config)
 
      self.notify('sim')

      bench.mark('PostProcessing')
      print ('## TS=%d' % self.data['timestep'])
      bench.show()
      stat.show()

      return list(jcqueue.keys())
Example #6
0
File: simexpl.py Project: DaMSL/ddc
    def execute(self, job):

        # PRE-PROCESS ----------------------------------------------------------
        settings = systemsettings()
        bench = microbench("sim_%s" % settings.name, self.seqNumFromID())
        bench.start()
        stat = StatCollector("sim_%s" % settings.name, self.seqNumFromID())
        mylogical_seqnum = str(self.seqNumFromID())

        # Prepare working directory, input/output files
        conFile = os.path.join(job["workdir"], job["name"] + ".conf")
        logFile = conFile.replace("conf", "log")  # log in same place as config file
        dcdFile = conFile.replace("conf", "dcd")  # dcd in same place as config file
        USE_SHM = True

        SIMULATE_RATIO = settings.SIMULATE_RATIO
        if SIMULATE_RATIO > 1:
            logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
        frame_size = (SIMULATE_RATIO * int(job["interval"])) / (1000)
        logging.info("Frame Size is %f  Using Sim Ratio of 1:%d", frame_size, SIMULATE_RATIO)

        EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
        logging.info("Running Experiment Configuration #%d", EXPERIMENT_NUMBER)

        # # Grab historical basin data's relative time to start (for lineage)
        traj = None

        # EXECUTE SIMULATION ---------------------------------------------------------
        if self.skip_simulation:

            logging.info("1. SKIPPING SIMULATION.....")
            USE_SHM = False

            job["dcd"] = dcdFile
            key = wrapKey("jc", job["name"])
            self.data[key]["dcd"] = dcdFile

        else:
            logging.info("1. Run Simulation")

            # Prepare & source to config file
            with open(self.data["sim_conf_template"], "r") as template:
                source = template.read()

            # >>>>Storing DCD into shared memory on this node

            if USE_SHM:
                ramdisk = gettempdir()
                job["outputloc"] = ramdisk
                dcd_ramfile = os.path.join(ramdisk, job["name"] + ".dcd")
            else:
                job["outputloc"] = ""

            with open(conFile, "w") as sysconfig:
                sysconfig.write(source % job)
                logging.info("Config written to: " + conFile)

            # # Run simulation in parallel
            # if 'parallel' in job:
            #   numnodes = job['parallel']
            #   total_tasks = numnodes * 24
            #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

            # # Run simulation single threaded
            # else:
            #   cmd = 'namd2 %s > %s' % (conFile, logFile)

            # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
            check = executecmd("module list")
            logging.debug("%s", check)

            cmd = "namd2 +p%d %s > %s" % (PARALLELISM, conFile, logFile)

            #  MICROBENCH #1 (file to Lustre)
            # logging.debug("Executing Simulation:\n   %s\n", cmd)
            # bench = microbench()
            # bench.start()
            # stdout = executecmd(cmd)
            # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
            # bench.mark('SimExec:%s' % job['name'])
            # shm_contents = os.listdir('/dev/shm/out')
            # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
            # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
            # logging.info("Copy Complete to Lustre.")
            # bench.mark('CopyLustre:%s' % job['name'])
            # shutil.rmtree(ramdisk)
            # shm_contents = os.listdir('/dev/shm')
            # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
            # bench.show()

            max_expected_obs = int(job["runtime"]) // int(job["dcdfreq"])
            # Retry up to 3 times if the sim fails
            MAX_TRY = 3
            for i in range(MAX_TRY, 0, -1):
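                # The acceptance threshold relaxes on each retry: 2/3 of the expected frames
                # on the first attempt, 1/3 on the second, and 0 on the final attempt.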
                min_required_obs = int(max_expected_obs * ((i - 1) / (MAX_TRY)))
                logging.debug("Executing Simulation:\n   %s\n", cmd)
                logging.debug("# Obs Expected to see: %d", max_expected_obs)
                stdout = executecmd(cmd)
                logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
                # Check file for expected data
                if USE_SHM:
                    traj = md.load(dcd_ramfile, top=job["pdb"])
                else:
                    traj = md.load(dcdFile, top=job["pdb"])
                logging.info("Obs Threshold  = %4d", min_required_obs)
                logging.info("#Obs This Traj = %4d", traj.n_frames)
                if traj.n_frames >= min_required_obs:
                    logging.info("Full (enough) Sim Completed")
                    break
                logging.info("Detected a failed Simulation. Retrying the same sim.")
                break

            bench.mark("simulation")
            # bench.mark('SimExec:%s' % job['name'])

            # Internal stats
            sim_length = self.data["sim_step_size"] * int(job["runtime"])
            sim_realtime = bench.delta_last()
            sim_run_ratio = (sim_realtime / 60) / (sim_length / 1000000)
            logging.info("##SIM_RATIO %6.3f  min-per-ns-sim", sim_run_ratio)
            stat.collect("sim_ratio", sim_run_ratio)

            if USE_SHM:
                shm_contents = os.listdir(ramdisk)
                logging.debug("Ramdisk contents (should have files) : %s", str(shm_contents))

                if not os.path.exists(dcd_ramfile):
                    logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
                    time.sleep(10)

                if not os.path.exists(dcd_ramfile):
                    logging.warning("DCD STIILL FILE NOT FOUND!!!!")
                else:
                    logging.info("DCD File was found")

            # #  MICROBENCH #2 (file to Alluxio)
            # allux = AlluxioClient()
            # # copy to Aluxio FS
            # allux.put(ramdisk + job['name'] + '.dcd', '/')
            # logging.info("Copy Complete to Alluxio.")
            # bench.mark('CopyAllux:%s' % job['name'])

            # And copy to Lustre
            # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
            # And copy to Lustre (using zero-copy):
            if USE_SHM:

                # ALT:  X-Mit to Cache Service and let cache write to disk lazily
                src = open(dcd_ramfile, "rb")
                dest = open(dcdFile, "w+b")
                offset = 0
                dcdfilesize = os.path.getsize(dcd_ramfile)
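                # Zero-copy transfer: sendfile() moves bytes kernel-side from the ramdisk DCD
                # to the Lustre destination, looping until no bytes remain to send.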
                while True:
                    sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
                    if sent == 0:
                        break
                    offset += sent
                logging.info("Copy Complete to Lustre.")
                bench.mark("CopyLustre:%s" % job["name"])

            # TODO: Update job's metadata
            key = wrapKey("jc", job["name"])
            self.data[key]["dcd"] = dcdFile

        # ANALYSIS   ------- ---------------------------------------------------------
        #  ANALYSIS ALGORITHM
        # 1. With combined Sim-analysis: file is loaded locally from shared mem
        logging.debug("2. Load DCD")

        # Load topology and define filters
        topo = self.protein.pdb
        pdb = md.load(job["pdb"])
        hfilt = pdb.top.select_atom_indices("heavy")
        pfilt = pdb.top.select("protein")
        afilt = pdb.top.select_atom_indices("alpha")

        # Load full higher dim trajectory
        if traj is None:
            if USE_SHM:
                traj = md.load(dcd_ramfile, top=pdb)
            else:
                traj = md.load(dcdFile, top=pdb)

        logging.debug("Trajectory Loaded: %s (%s)", job["name"], str(traj))
        traj_prot = traj.atom_slice(pfilt)
        traj_heavy = traj.atom_slice(hfilt)
        traj_alpha = traj.atom_slice(afilt)

        # Superpose Coordinates to Common Reference
        traj_prot.superpose(topo)

        # Calculate output Distance Space
        # Use of the second side chain atom is discussed in the ref paper on Timescape
        # The atom pairs and selected atoms are in the timescape module
        sc_pairs = side_chain_pairs(traj_prot)
        n_features = len(sc_pairs)

        # Use the CA atoms to calculate distance space
        # NOTE THE CONVERSION FROM NM TO ANG!!!!!!
        dist_space = 10 * DR.distance_space(traj_alpha)

        # Get the frame rate for conversion between the source trajectory and TimeScapes
        #  NOTE: TimeScapes is run at a coarser level
        logging.debug("Preprocessing output for TimeScapes: terrain")
        traj_frame_per_ps = SIMULATE_RATIO * int(job["interval"]) / 1000.0  # jc interval is in fs
        ts_frame_per_ps = int(self.data["timescape:rate"])  # this value is in ps
        frame_rate = int(ts_frame_per_ps / traj_frame_per_ps)
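        # frame_rate = number of source-trajectory frames per TimeScapes frame (both values
        # above are effectively ps-per-frame despite their names).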
        logging.debug(
            "%5.2f fr/ps (Traj)     %5.2f fr/ps (TS)    FrameRate= %4.1f",
            traj_frame_per_ps,
            ts_frame_per_ps,
            frame_rate,
        )

        # Execute Timescapes agility program to detect spatial-temporal basins
        output_prefix = os.path.join(job["workdir"], job["name"])
        if not self.skip_timescape:
            # Prep file and save locally in shm
            tmploc = gettempdir()
            ts_out = tmploc + "traj_ts"
            ts_dcd = ts_out + ".dcd"
            ts_pdb = ts_out + ".pdb"
            heavy = traj_heavy.slice(range(0, traj.n_frames, frame_rate))
            heavy.slice(0).save_pdb(ts_pdb)
            heavy.save_dcd(ts_dcd)

            # Gaussian full-width-at-half-max value affects the sliding window size
            # ref:  http://timescapes.biomachina.org/guide.pdf
            gmd_cut1 = int(self.data["timescape:gmd:low"])
            gmd_cut2 = int(self.data["timescape:gmd:hi"])
            gauss_wght_delta = int(self.data["timescape:delta"])

            # Execute timescapes' terrain.py on the pre-processed trajectory
            cmd = "terrain.py %s %s %d %d %d GMD %s" % (
                ts_pdb,
                ts_dcd,
                gmd_cut1,
                gmd_cut2,
                gauss_wght_delta,
                output_prefix,
            )
            logging.info("Running Timescapes:\n  %s", cmd)
            stdout = executecmd(cmd)
            logging.info("TimeScapes COMPLETE:\n%s", stdout)

        # Collect and parse Timescape output
        logging.debug("Parsing Timescapes output")
        ts_parse = TimeScapeParser(job["pdb"], output_prefix, job["name"], dcd=dcdFile, traj=traj, uniqueid=False)
        basin_list = ts_parse.load_basins(frame_ratio=frame_rate)
        n_basins = len(basin_list)

        minima_coords = {}
        basin_rms = {}
        basins = {}

        new_dmu = {}
        new_dsig = {}
        resid_rms_delta = {}

        stat.collect("num_basin", n_basins)
        downstream_list = []

        # FOR BOOTSTRAPPING USING RMSD
        ref_file = os.path.join(settings.workdir, self.data["pdb:ref:0"])
        logging.info("Loading RMSD reference frame from %s", self.data["pdb:ref:0"])
        refframe = md.load(ref_file)
        ref_alpha = refframe.atom_slice(refframe.top.select_atom_indices("alpha"))
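        # md.rmsd returns nanometers; the factor of 10 converts to Angstroms for consistency
        # with the distance space above.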
        rmsd = 10 * md.rmsd(traj_alpha, ref_alpha)

        # FOR RESIDUE RMSD
        res_rms_Kr = FEATURE_SET_RESID
        resrmsd = 10 * np.array([LA.norm(i - ref_alpha.xyz[0], axis=1) for i in traj_alpha.xyz])
        basin_res_rms = np.zeros(shape=(len(basin_list), traj_alpha.n_atoms))

        # LOAD CENTROIDS -- TODO: move to immutable data
        centroid = pickle.loads(self.catalog.get("centroid:ds"))
        basin_label_list = {}

        # Process each basin
        for i, basin in enumerate(basin_list):
            logging.info("  Processing basin #%2d", i)
            bid = basin.id
            downstream_list.append(bid)

            # Slice out minima coord  & save to disk (for now)
            #  TODO:  Store in memory in cache
            minima_coords[bid] = traj.slice(basin.mindex)
            jc_filename = os.path.join(settings.datadir, "basin_%s.pdb" % bid)
            minima_coords[bid].save_pdb(jc_filename)

            # METRIC CALCULATION
            a, b = basin.start, basin.end
            new_dmu[bid] = np.mean(dist_space[a:b], axis=0)
            new_dsig[bid] = np.std(dist_space[a:b], axis=0)

            # Collect Basin metadata
            basin_hash = basin.kv()
            basin_hash["pdbfile"] = jc_filename
            basin_hash["rmsd"] = np.median(rmsd[a:b])
            basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

            # FOR EXP #16 -- Residue RMSD

            # Set relative time (in ns)
            # basin_hash['time'] = reltime_start + (a * frame_rate) / 1000
            basins[bid] = basin_hash

            # LABEL ATEMPORAL DATA
            # Take every 4th frame
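            # Label each strided frame with the index of its nearest centroid in distance space.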
            stride = [dist_space[i] for i in range(a, b, 4)]
            rms = [LA.norm(centroid - i, axis=1) for i in stride]
            label_seq = [np.argmin(i) for i in rms]
            basin_label_list[bid] = label_seq
            logging.info("  Basin Processed: #%s, %d - %d", basin_hash["traj"], basin_hash["start"], basin_hash["end"])

        # RMSD DELTA (RESIDUE)
        basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
        basin_rms_delta_bykey = {basin.id: basin_res_rms_delta[i] for i, basin in enumerate(basin_list)}
        for k in basins.keys():
            basins[k]["resrms_delta"] = np.sum(basin_rms_delta_bykey[k][res_rms_Kr])

        # TODO:  Use Min Index as snapshot, median (or mean) DistSpace vals for each basin?????

        bench.mark("analysis")
        #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
        # try:
        self.wait_catalog()
        # except OverlayNotAvailable as e:
        #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

        # FOR EXPERIMENT METRICS
        src_basin = self.catalog.hgetall("basin:" + job["src_basin"])
        with open("/home-1/[email protected]/ddc/results/{0}_prov.log".format(settings.name), "a") as metric_out:
            for i, basin in enumerate(basin_list):
                bid = basin.id
                label_seq = basin_label_list[bid]
                basin_metric_label = LABEL10(label_seq)
                metric_out.write(
                    "BASIN,%s,%s,%s,%s\n"
                    % (bid, src_basin["label:10"], basin_metric_label, "".join([str(i) for i in label_seq]))
                )

        # Update Index Synchronized data lists
        basin_list_sorted = sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1])))
        for bid in basin_list_sorted:
            with self.catalog.pipeline() as pipe:
                while True:
                    try:
                        logging.debug("Updating %s basin indeces and distance space", len(basins))
                        pipe.rpush("basin:list", bid)
                        pipe.rpush("dspace", pickle.dumps(new_dmu[bid]))
                        basin_index, _ = pipe.execute()
                        break
                    except redis.WatchError as e:
                        logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                        continue
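            # rpush returns the new list length, so basin_index - 1 is this basin's position in
            # 'basin:list' (and, in lockstep, its row in 'dspace'), stored as 'dsidx'.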
            basins[bid]["dsidx"] = basin_index - 1

        # Update Catalog with 1 Long Atomic Transaction
        with self.catalog.pipeline() as pipe:
            while True:
                try:
                    logging.debug("Update Filelist")
                    pipe.watch(wrapKey("jc", job["name"]))
                    file_idx = pipe.rpush("xid:filelist", job["dcd"]) - 1

                    # HD Points
                    logging.debug("Update HD Points")
                    start_index = pipe.llen("xid:reference")
                    pipe.multi()
                    pipe.rpush("xid:reference", *[(file_idx, x) for x in range(traj.n_frames)])
                    pipe.set("resrms:" + job["name"], resrmsd)

                    # Store all basin data
                    logging.debug("Updating %s basins", len(basins))
                    for bid in sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1]))):
                        pipe.hmset("basin:" + bid, basins[bid])
                        pipe.set("minima:%s" % bid, pickle.dumps(minima_coords[bid]))
                        for i in basin_label_list[bid]:
                            pipe.rpush("basin:labelseq:" + bid, i)

                    pipe.hset("anl_sequence", job["name"], mylogical_seqnum)

                    logging.debug("Executing")
                    pipe.execute()
                    break

                except redis.WatchError as e:
                    logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                    continue

        self.data[key]["xid:start"] = start_index
        self.data[key]["xid:end"] = start_index + traj.n_frames
        bench.mark("catalog")

        # ---- POST PROCESSING
        # Check for total data in downstream queue  & Notify control manager to run
        # 1. Det. what's still running
        job_queue = slurm.currentqueue()
        n_simjobs = -1  # -1 to exclude self
        for j in job_queue:
            if j["name"].startswith("sw"):
                n_simjobs += 1

        logging.info("SIM WORKER has detected %d other simulations running/pending.", len(job_queue))
        remain_percent = n_simjobs / self.data["numresources"]

        # Fault Tolerance: ensure the pipeline persists
        if n_simjobs == 0:
            logging.info("I am the last simulation. Ensuring the controler executes.")
            self.catalog.set("ctl:force", 1)

        # Send downstream notification when less than 10% of jobs are still running
        if not self.skip_notify and remain_percent < 0.1:
            # Notify the control manager 'CM'
            self.notify("ctl")

        if USE_SHM:
            shutil.rmtree(ramdisk)
            shm_contents = os.listdir("/dev/shm")
            logging.debug("Ramdisk contents (should be empty of DDC) : %s", str(shm_contents))

        # For benchmarching:
        bench.show()
        stat.show()

        # Return # of observations (frames) processed
        return downstream_list
Example #8
0
File: overlayService.py Project: DaMSL/ddc
class OverlayService(object):
    """Overlay services provide an abstracted layer of control to 
  allow an underlying implemented service to operate within
  a time-delayed HPC environment
  (TODO:  abstract to other environments)
  """
    __metaclass__ = abc.ABCMeta

    # Delay (in sec) to wait
    # SERVICE_HEARTBEAT_DELAY = 5
    # SERVICE_STARTUP_DELAY = 5

    def __init__(self, name, port, **kwargs):

        settings = systemsettings()
        settings.applyConfig(name + '.json')

        # For Stat Collection
        self.stat = StatCollector('Overlay')
        self.stat.collect('name', name)

        # DEBUG
        self._host = socket.gethostname()
        self._port = port

        svc_name = kwargs.get('svc_name', type(self).__name__)
        self._name_svc = svc_name
        self._name_app = name
        self.lockfile = '%s_%s.lock' % (self._name_app, self._name_svc)
        logging.info('MY Lock file is: %s', self.lockfile)

        self.numslaves = kwargs.get('numslaves', 0)
        self.launchcmd = None
        self.shutdowncmd = None

        self._pid = None

        self._state = 'INIT'
        self._role = kwargs.get('role', 'NONE')

        self.aggressive = kwargs.get('isaggressive', False)

        # Check if this is a Slurm job and get the settings
        self.slurm_id = os.getenv('SLURM_JOBID')
        if self.slurm_id is None:
            # Not running as a Slurm Job (set default TTL)
            self.ttl = dt.now().timestamp() + DEFAULT_TTL
            self.jobinfo = None
            self.stat.collect("slurmid", "None")
        else:
            logging.info("SLURM JOB:  %s", str(self.slurm_id))
            self.jobinfo = slurm.jobinfo(int(self.slurm_id))
            endtime = dtparser.parse(self.jobinfo['EndTime'])
            self.ttl = endtime.timestamp()
            self.stat.collect("slurmid", self.slurm_id)
        self.stat.collect("ttl", self.ttl)

        # TODO: All services' nodes are assumed to run on the same port (different hosts),
        #  thus the master holds only the host name
        self.master = kwargs.get('master', None)
        self.slavelist = {}

        self.terminationFlag = Event()
        self.handoverFlag = Event()
        self.shuttingdown = Event()
        self.state_running = Event()

        # TODO: Should we require basic setting via systemsetting object???
        #   and thus, enforce this object on the abstract class?
        self.SERVICE_STARTUP_DELAY = 60
        self.SERVICE_HEARTBEAT_DELAY = 60

        # Register tty signaling to gracefully shutdown when halted externally
        #   (e.g. from scancel)
        signal.signal(signal.SIGTERM, self.halt)
        signal.signal(signal.SIGINT, self.halt)
        signal.signal(signal.SIGQUIT, self.halt)
        signal.signal(signal.SIGHUP, self.halt)

    @abc.abstractmethod
    def ping(self, host='localhost'):
        """Method to check if service is running on the given host
    """
        pass

    @abc.abstractmethod
    def handover_to(self):
        """Invoked to handover services from this instance to a new master
    """
        pass

    @abc.abstractmethod
    def handover_from(self):
        """Invoked as a callback when another master service is handing over
    service control (i.e. master duties) to this instance
    """
        pass

    def prepare_service(self):
        """Pre-execution processing (e.g. config file / env creation)
    """
        pass

    def tear_down(self):
        """Post-execution processing (e.g. remove temp files)
    """
        pass

    def idle(self):
        """To define an idle detection method and gracefully shutdown if the service
    is idle. If undefined, idle returns False and the system will always assume
    that is it never idle.
    """
        return False

    def launch_slave(self):
        """For multi-node services, the master node is responsible for starting/stopping
    all slave nodes. Launch slave is the method called when the master monitor
    needs to start up a new slave.
    """

    def stop_slave(self):
        """For multi-node services, the master node will invoke this to stop a 
    slave. 
    TODO: policy development for rotating slave nodes and ensuring they 
    are either not all stopping concurrently or do exactly that.
    """

    def start(self):
        """
    """
        logging.info('Overlay Service is Starting: %s', self._name_svc)
        logging.info('  SERVICE_STARTUP_DELAY  = %d',
                     self.SERVICE_STARTUP_DELAY)
        logging.info('  SERVICE_HEARTBEAT_DELAY  = %d\n',
                     self.SERVICE_HEARTBEAT_DELAY)

        # Validate the initialization State. Services first start in a Slave State
        #  but change over to Master if no Master is detected. If the master is
        #  online, the service becomes a replica (future enhancements will allow
        #  distributed and multiple slaves/replicas)
        if self._role == 'SLAVE':
            logging.info('[%s] Starting as a slave. ', self._name_svc)
            slave_wait_ttl = self.SERVICE_STARTUP_DELAY * 4
            while slave_wait_ttl > 0:
                host, port, state = self.getconnection()
                logging.debug('SLAVE Coming online: Current Conn  = %s, %s',
                              str(host), str(state))
                if host is None:
                    # No Master alive -- become the master
                    logging.info('No Master Detected. Becoming the Master.')
                    self._role = 'MASTER'
                    break
                elif (state == 'RUN' or state == 'EXPIRED') and self.ping(
                        host, port):
                    # FLAG MASTER
                    logging.info('[%s] Master is Alive. Starting as a slave. ',
                                 self._name_svc)
                    with open(self.lockfile, 'a') as conn:
                        ts = dt.now().timestamp()
                        conn.write(
                            '%s,%d,%f,%f,%s\n' %
                            (self._host, self._port, ts, self.ttl, 'SLAVE'))
                    # Master is ALIVE and RUNNING
                    host = None
                    self._role = 'REPLICA'
                    break
                elif state == 'EXPIRED':
                    # Master is ALIVE, but detected as EXPIRED
                    logging.info(
                        '[%s] Master at %s has expired. Becoming the Master.',
                        self._name_svc, str(host))
                    host = None
                    self._role = 'MASTER'
                    break
                else:
                    # Master is either registered as START or it's unavailable. Wait
                    logging.warning(
                        '[%s] My Master is Not alive. Host is %s.  Waiting to see if it will come online...',
                        self._name_svc, str(host))
                    time.sleep(self.SERVICE_STARTUP_DELAY)
                    slave_wait_ttl -= self.SERVICE_STARTUP_DELAY
                    if slave_wait_ttl < 0:
                        logging.warning(
                            '[%s] Could not find a master from %s. Taking over as Master.',
                            self._name_svc, self._host)
                        if os.path.exists(self.lockfile):
                            os.remove(self.lockfile)
                        host = None
                        self._role = 'MASTER'
                        break
            #TODO:
            # self.masterport = port

        # Check to ensure lock is not already acquired
        while self.master is None:
            logging.info('Checking on the master.... (MY ROLE = %s)',
                         self._role)
            if self._role == 'MASTER':
                try:
                    lock = os.open(self.lockfile,
                                   os.O_CREAT | os.O_EXCL | os.O_WRONLY)
                    ts = dt.now().timestamp()
                    os.write(
                        lock,
                        bytes(
                            '%s,%d,%f,%f,%s\n' %
                            (self._host, self._port, ts, self.ttl, 'START'),
                            'UTF-8'))
                    os.close(lock)
                    logging.debug(
                        "[%s] Lock File not found (I am creating it from %s)",
                        self._name_svc, self._host)
                    self._state = 'START'
                    self.master = self._host
                    break
                except FileExistsError as ex:
                    logging.debug(
                        "[%s] Lock File exists (someone else has acquired it)",
                        self._name_svc)

            host, port, state = self.getconnection()
            logging.debug('Overlay Service State = %s', str(state))
            if state is None or state == 'EXPIRED':
                logging.error(
                    "[%s] BAD state from existing %s. Removing it and retrying the lock.",
                    self._name_svc, self.lockfile)
                if os.path.exists(self.lockfile):
                    os.remove(self.lockfile)
                self._role = 'MASTER'
                continue

            if state == 'START':
                logging.debug(
                    "[%s] SOMEONE ELSE is starting my service on %s.",
                    self._name_svc, host)
                timeout = self.SERVICE_STARTUP_DELAY
                while timeout > 0 and state == 'START':
                    time.sleep(1)
                    host, port, state = self.getconnection()
                    timeout -= 1
                if state is not None and state == 'RUN':
                    logging.info('Found a NEW master on %s.', host)
                else:
                    logging.debug(
                        "[%s] Detected zombie master. Killing the lockfile and retrying to start locally.",
                        self._name_svc)
                    self._role = 'MASTER'
                    if os.path.exists(self.lockfile):
                        os.remove(self.lockfile)
                    continue

            if state == 'RUN':
                logging.debug(
                    "[%s] Checking if service is available on %s, %s",
                    self._name_svc, host, port)
                if self.ping(host, port):
                    logging.debug("[%s] Service is ALIVE on %s",
                                  self._name_svc, host)
                    if self.aggressive:
                        logging.debug(
                            "[%s] Starting AGGRESSIVE -- will initiate master takeover control protocol",
                            self._name_svc)
                        # If starting up as an aggressive replica -- assume immediate replication and control
                        #  of service operation. This will initiate this node to start as a
                        #  replica and initiate the replication/handover protocol. Future improvements
                        #  should allow for the system to detect the # of replicas (for FT) and
                        #  flag handover accordingly
                        self._role = 'REPLICA'
                        self.master = host
                        self.handoverFlag.set()
                        break
                    else:
                        logging.debug(
                            '[%s] Service already running on %s. I am TIMID -- to take over, start me in aggressive mode',
                            self._name_svc, host)
                        return None
                else:
                    logging.warning(
                        '[%s] Service is NOT running. Will attempt to recover and start locally.',
                        self._name_svc)
                    self._role = 'MASTER'
                    if os.path.exists(self.lockfile):
                        os.remove(self.lockfile)

        self.prepare_service()

        if self.launchcmd is None:
            logging.error(
                "[%s] Launch Command not set. It needs to be defined.",
                self._name_svc)
            return None

        if self.shutdowncmd is None:
            logging.error(
                "[%s] Shutdown Command not set. It needs to be defined.",
                self._name_svc)
            return None

        # TODO: Check subproc call here -- should this also be threaded in a python wrap call?
        logging.info("[%s] Launching following command %s", self._name_svc,
                     self.launchcmd)
        args = shlex.split(self.launchcmd)
        service = proc.Popen(args)

        if service.returncode is not None:
            logging.error("[%s] ERROR starting local service on %s",
                          self._name_svc, self._host)
            # Exit or Return ???
            return None

        # Ensure service has started locally
        svc_up = None
        timeout = time.time() + self.SERVICE_STARTUP_DELAY
        while not svc_up:
            if time.time() > timeout:
                logging.error("[%s] Timed Out waiting on the server",
                              self._name_svc)
                break
            time.sleep(1)
            svc_up = self.ping()

        if not svc_up:
            logging.error("[%s] Service never started. You may need to retry.",
                          self._name_svc)
            return None

        self._state = 'RUN'
        self.state_running.set()
        logging.info(
            "[%s] My Service started local on %s (ROLE=%s). Starting the local monitor.",
            self._name_svc, self._role, self._host)
        ts = dt.now().timestamp()

        if self._role == 'MASTER':
            if os.path.exists(self.lockfile):
                logging.info("[%s] Removing and creating the Lockfile.",
                             self._name_svc)
                os.remove(self.lockfile)
            else:
                logging.info("[%s] Creating a new Lockfile.", self._name_svc)
            lock = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            os.write(
                lock,
                bytes(
                    '%s,%d,%f,%f,%s\n' %
                    (self._host, self._port, ts, self.ttl, 'RUN'), 'UTF-8'))
            os.close(lock)

        self.service = service

        t = Thread(target=self.monitor)
        t.start()

        return t

    def monitor(self):

        MISS_HB_TO = 5  # 5 min timeout

        logging.info("[%s] Monitor Daemon Started.", self._name_svc)
        # TODO: Move to settings
        missed_hb = 0
        allowed_misshb = 3

        # Redundant check to ensure service has started locally (it should be started already)
        alive = False
        timeout = time.time() + self.SERVICE_STARTUP_DELAY
        while not alive:
            if time.time() > timeout:
                logging.error("[%s-Mon] Timed Out waiting on the server",
                              self._name_svc)
                break
            time.sleep(1)
            alive = self.ping()
        if not alive:
            logging.error(
                "[%s-Mon] Service never started. You may need to retry.",
                self._name_svc)
            return

        # TODO:  For multi-node services, the Master monitor process will start up all slaves
        #  as individual jobs  -- FUTURE
        # if self.numslaves > 0:
        #   logging.info("[%s-Mon] Service is up locally on %s. Preparing to launch %d slaves.", self._name_svc, self.numslaves)

        #   # TODO: Thread each slave with a separate monitor or launcher
        #   for i in range(self.numslaves):
        #     self.launch_slave()

        #  Blocking loop. Will only exit if the termination flag is set or via an
        #   in-loop check every heartbeat (missed ping, lost pid, or idle timeout)
        logging.info(
            "[%s-Mon] Service is Operational and READY. Running the event handler loop.",
            self._name_svc)
        if self.jobinfo is not None:
            load_time = (
                dt.now() -
                dtparser.parse(self.jobinfo['StartTime'])).total_seconds()
            logging.info('Load Time:\nLOAD_TIME,%f', load_time)
            self.stat.collect('LOAD', load_time)

        while not self.terminationFlag.wait(self.SERVICE_HEARTBEAT_DELAY):
            logging.warning("[%s-Mon] Heartbeat. My ROLE = %s", self._name_svc,
                            self._role)
            if self.shuttingdown.is_set():
                logging.warning(
                    "[%s-Mon] Detectd Service was flagged to shutdown. Monitor is halting.",
                    self._name_svc)
                break

            # TODO:  try/catch service connection errors here
            if self.service.poll() is not None:
                logging.warning("[%s-Mon] Local Service has STOPPED on %s.",
                                self._name_svc, self._host)
                break

            # Heartbeat
            if not self.ping():
                missed_hb += 1
                logging.warning(
                    "[%s-Mon]  MISSED Heartbeat on %s. Lost communicate with service for %d seconds.",
                    self._name_svc, self._host,
                    missed_hb * self.SERVICE_HEARTBEAT_DELAY)
                if missed_hb * self.SERVICE_HEARTBEAT_DELAY > 60 * MISS_HB_TO:
                    logging.warning(
                        "[%s-Mon]  Lost commo with Server for over %d minutes. Shutting Down",
                        self._name_svc, MISS_HB_TO)
                    self.terminationFlag.set()
                continue

            missed_hb = 0

            # CHECK IDLE TIME
            if self.idle():
                logging.info(
                    "[%s-Mon] Service is idle. Initiate graceful shutdown.",
                    self._name_svc)
                break

            if self._role == 'MASTER':
                with open(self.lockfile, 'r') as conn:
                    lines = conn.read().strip().split('\n')
                logging.info(
                    "[%s-Mon] Checking Lockfile shows %d hosts online",
                    self._name_svc, len(lines))
                if len(lines) > 1:
                    # TODO: Multiple slave
                    for i in lines[1:]:
                        logging.info('SLAVE detected: %s', i)
                        data = i.split(',')
                        self.slavelist[data[0]] = data[-1]
                        logging.info("[%s-Mon]  Detected a new slave on %s",
                                     self._name_svc, data[0])
                    self.handoverFlag.set()

            if self.handoverFlag.is_set():
                handover_begin = dt.now()
                # This service has been flagged to handover control to a new master or from an old one
                if self._role == 'MASTER':
                    # Handover Control
                    next_master = self.handover_to()
                    if next_master is None:
                        self.handoverFlag.clear()
                        continue
                    self._role = 'REPLICA'
                    logging.info(
                        "[%s-Mon]  Handover complete. I am now terminating. My role is now %s.",
                        self._name_svc, self._role)
                    # Terminate
                    handover_time = (dt.now() - handover_begin).total_seconds()
                    logging.info('handover Time:\nHANDOVER_TIME_TO,%f',
                                 handover_time)
                    self.stat.collect('HANDOVER_TO', handover_time)

                    self.terminationFlag.set()

                elif self._role == 'REPLICA':
                    # Initiate replica protocol and assume role as master
                    self.handover_from()
                    logging.info(
                        "[%s-Mon]  Handover complete. I am now the master.",
                        self._name_svc)

                    # Re-set lock file (TODO: other slaves)
                    self._role = 'MASTER'
                    ts = dt.now().timestamp()
                    ttl = ts + self.ttl
                    os.remove(self.lockfile)
                    lock = os.open(self.lockfile,
                                   os.O_CREAT | os.O_EXCL | os.O_WRONLY)
                    os.write(
                        lock,
                        bytes(
                            '%s,%d,%f,%f,%s\n' %
                            (self._host, self._port, ts, ttl, 'RUN'), 'UTF-8'))
                    os.close(lock)

                    handover_time = (dt.now() - handover_begin).total_seconds()
                    logging.info('handover Time:\nHANDOVER_TIME_FROM,%f',
                                 handover_time)
                    self.stat.collect('HANDOVER_FROM', handover_time)

                    self.handoverFlag.clear()

                    logging.debug("[%s-Mon] Handover status is %s.",
                                  self._name_svc, (self.handoverFlag.is_set()))

                else:
                    logging.error(
                        "[%s-Mon]  Flagged for handover, but cannot (I am neither a master or a replica -- what am I?",
                        self._name_svc)
                    self.terminationFlag.set()

        self.stop()

        # Post-termination logic
        logging.info("[%s] Service is shutdown on %s. Monitor is terminating",
                     self._name_svc, self._host)

    def stop(self):
        """
    """
        if not self.shuttingdown.is_set():
            self.shuttingdown.set()
            logging.debug('MY ROLE = %s', self._role)
            if self._role == 'MASTER' and len(self.slavelist) == 0:
                logging.info(
                    '[%s] Last master shutdown. Verifying I am the last one',
                    self._name_svc)
                host, port, state = self.getconnection()
                if host == self._host:
                    os.remove(self.lockfile)
                else:
                    logging.info('[%s] New service detected on %s.',
                                 self._name_svc, host)
            else:
                logging.info(
                    '[%s] This master is shutting down, but detected someone else.',
                    self._name_svc)

            logging.info('[%s] Service shutting down on %s.', self._name_svc,
                         self._host)
            args = shlex.split(self.shutdowncmd)
            result = proc.call(args)
            logging.info(
                "[%s] shutdown with return code: %s.  Tearing Down remaining work.",
                self._name_svc, str(result))

            self.tear_down()
            self.stat.show()

    def halt(self, signum, frame):

        logging.warning('[%s] SIGNAL Received: %s', self._name_svc,
                        siglist[signum])
        logging.warning(
            '[%s] Setting Termination Flag and Gracefully stopping the service',
            self._name_svc)
        self.terminationFlag.set()
        self.stop()

    def getconnection(self):
        try:
            with open(self.lockfile, 'r') as conn:
                nodes = conn.read().strip().split('\n')
                if len(nodes) == 0:
                    return None, None, None
                conn_string = nodes[0].split(',')
                if len(conn_string) < 5:
                    return None, None, None
                host = conn_string[0]
                port = conn_string[1]
                ttl = float(conn_string[3])
                state = conn_string[4].strip()
                ts = dt.now()
                if ts.timestamp() > ttl:
                    logging.warning(
                        "Overlay Service Master has expired. Check lockfile: %s",
                        self.lockfile)
                    state = 'EXPIRED'
            return host, port, state
        except FileNotFoundError:
            logging.warning("WARNING!  Lock file is missing.")
            return None, None, None
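
The service bootstrap above (start, monitor, getconnection) hinges on one convention: whichever process manages to create the lockfile with O_CREAT | O_EXCL becomes the master, and everyone else reads the first line of that file and defers to it until the recorded expiry passes. Below is a minimal, self-contained sketch of that convention -- not the project's actual API; the function name, lockfile path, and record layout are illustrative assumptions, and this sketch stores an absolute expiry time rather than a raw TTL.

import os
import socket
from datetime import datetime as dt

LOCKFILE = '/tmp/overlay_demo.lock'   # hypothetical path for this sketch
TTL_SEC = 60.0                        # how long a master entry stays valid

def acquire_master_lock(port=6379):
    """Return True if this process won the lock, False if a live master exists."""
    host = socket.gethostname()
    expires = dt.now().timestamp() + TTL_SEC
    try:
        # O_EXCL makes creation atomic: exactly one competing process succeeds
        fd = os.open(LOCKFILE, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.write(fd, bytes('%s,%d,%f,%s\n' % (host, port, expires, 'START'), 'UTF-8'))
        os.close(fd)
        return True
    except FileExistsError:
        # Someone else holds the lock; defer to them unless their entry expired
        with open(LOCKFILE) as f:
            owner, owner_port, owner_expires, state = f.readline().strip().split(',')
        if dt.now().timestamp() > float(owner_expires):
            os.remove(LOCKFILE)           # stale master: clear the lock and retry once
            return acquire_master_lock(port)
        return False

if __name__ == '__main__':
    print('MASTER' if acquire_master_lock() else 'REPLICA (live master found)')
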
Example #9
0
File: simanl.py Project: DaMSL/ddc
  def execute(self, job):

  # PRE-PROCESS ----------------------------------------------------------
    settings = systemsettings()
    bench = microbench('sim_%s' % settings.name, self.seqNumFromID())
    bench.start()
    stat  = StatCollector('sim_%s' % settings.name, self.seqNumFromID())
    mylogical_seqnum = str(self.seqNumFromID())

    # Prepare working directory, input/output files
    conFile = os.path.join(job['workdir'], job['name'] + '.conf')
    logFile = conFile.replace('conf', 'log')      # log in same place as config file
    dcdFile = conFile.replace('conf', 'dcd')      # dcd in same place as config file
    USE_SHM = True

    ADAPTIVE_CENTROID = False

    SIMULATE_RATIO = settings.SIMULATE_RATIO
    if SIMULATE_RATIO > 1:
      logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
    frame_size = (SIMULATE_RATIO * int(job['interval'])) / (1000)
    logging.info('Frame Size is %f  Using Sim Ratio of 1:%d', \
      frame_size, SIMULATE_RATIO)

    EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
    logging.info('Running Experiment Configuration #%d', EXPERIMENT_NUMBER)

    # TODO: FOR LINEAGE
    # srcA, srcB = eval(job['src_bin'])
    # stat.collect('src_bin', [str(srcA), str(srcB)])

    traj = None

  # EXECUTE SIMULATION ---------------------------------------------------------
    if self.skip_simulation:

      logging.info('1. SKIPPING SIMULATION.....')
      USE_SHM = False

      job['dcd'] = dcdFile
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

    else:
      logging.info('1. Run Simulation')

      # Prepare & source to config file
      with open(self.data['sim_conf_template'], 'r') as template:
        source = template.read()

      # >>>>Storing DCD into shared memory on this node

      if USE_SHM:
        # ramdisk = '/dev/shm/out/'
        ramdisk = '/tmp/ddc/'
        if not os.path.exists(ramdisk):
          os.mkdir(ramdisk)
        job['outputloc'] = ramdisk
        dcd_ramfile = os.path.join(ramdisk, job['name'] + '.dcd')
      else:
        job['outputloc'] = ''

      with open(conFile, 'w') as sysconfig:
        sysconfig.write(source % job)
        logging.info("Config written to: " + conFile)

      # # Run simulation in parallel
      # if 'parallel' in job:
      #   numnodes = job['parallel']
      #   total_tasks = numnodes * 24
      #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

      # # Run simulation single threaded
      # else:
      #   cmd = 'namd2 %s > %s' % (conFile, logFile)

      # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
      check = executecmd('module list')
      logging.debug('%s', check)

      cmd = 'namd2 +p%d %s > %s' % (PARALLELISM, conFile, logFile)

      #  MICROBENCH #1 (file to Lustre)
      # logging.debug("Executing Simulation:\n   %s\n", cmd)
      # bench = microbench()
      # bench.start()
      # stdout = executecmd(cmd)
      # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
      # bench.mark('SimExec:%s' % job['name'])
      # shm_contents = os.listdir('/dev/shm/out')
      # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # logging.info("Copy Complete to Lustre.")
      # bench.mark('CopyLustre:%s' % job['name'])
      # shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
      # bench.show()

      max_expected_obs = int(job['runtime']) // int(job['dcdfreq'])
      # Retry up to 3 attempts if the sim fails
      MAX_TRY = 3
      for i in range(MAX_TRY, 0, -1):
        min_required_obs = int(max_expected_obs * ((i-1)/(MAX_TRY)))
        logging.debug("Executing Simulation:\n   %s\n", cmd)
        logging.debug('# Obs Expected to see: %d', max_expected_obs)
        stdout = executecmd(cmd)
        logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
        # Check file for expected data
        if USE_SHM:
          traj = md.load(dcd_ramfile, top=job['pdb'])
        else:
          traj = md.load(dcdFile, top=job['pdb'])
        logging.info("Obs Threshold  = %4d", min_required_obs)
        logging.info("#Obs This Traj = %4d", traj.n_frames)
        if traj.n_frames >= min_required_obs:
          logging.info('Full (enough) Sim Completed')
          break
        logging.info('Detected a failed Simulation. Retrying the same sim.')
      
      bench.mark('SimExec:%s' % job['name'])

      # Internal stats
      sim_length = self.data['sim_step_size'] * int(job['runtime'])
      sim_realtime = bench.delta_last()
      sim_run_ratio =  (sim_realtime/60) / (sim_length/1000000)
      logging.info('##SIM_RATIO %6.3f  min-per-ns-sim', sim_run_ratio)
      stat.collect('sim_ratio', sim_run_ratio)

      if USE_SHM:
        shm_contents = os.listdir(ramdisk)
        logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
          time.sleep(10)

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD STIILL FILE NOT FOUND!!!!")
        else:
          logging.info("DCD File was found")

      # #  MICROBENCH #2 (file to Alluxio)
      # allux = AlluxioClient()
      # # copy to Aluxio FS
      # allux.put(ramdisk + job['name'] + '.dcd', '/')
      # logging.info("Copy Complete to Alluxio.")
      # bench.mark('CopyAllux:%s' % job['name'])

      # And copy to Lustre
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # And copy to Lustre (using zero-copy):
      if USE_SHM:
        src  = open(dcd_ramfile, 'rb')
        dest = open(dcdFile, 'w+b')
        offset = 0
        dcdfilesize = os.path.getsize(dcd_ramfile)
        while True:
          sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
          if sent == 0:
            break
          offset += sent
        logging.info("Copy Complete to Lustre.")
        bench.mark('CopyLustre:%s' % job['name'])
      
      # TODO: Update job's metadata
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

  # ANALYSIS   ------- ---------------------------------------------------------
    #  ANALYSIS ALGORITHM
  # 1. With combined Sim-analysis: file is loaded locally from shared mem
    logging.debug("2. Load DCD")

    # Load full higher dim trajectory
    # traj = datareduce.filter_heavy(dcd_ramfile, job['pdb'])
    if traj is None:
      if USE_SHM:
        traj = md.load(dcd_ramfile, top=job['pdb'])
      else:
        traj = md.load(dcdFile, top=job['pdb'])

    # Center Coordinates
    traj.center_coordinates()

    bench.mark('File_Load')
    logging.debug('Trajectory Loaded: %s (%s)', job['name'], str(traj))


  #  DIMENSIONALITY REDUCTION --------------------------------------------------
  # 4-A. Subspace Calculation: RMS using Alpha-Filter
    #------ A:  RMSD-ALPHA  ------------------
      #     S_A = rmslist
    logging.info('---- RMSD Calculation against pre-defined centroids ----')
      #  RMSD is calculated on the Ca ('alpha') atoms in distance space,
      #   whereby all pairwise distances are calculated for each frame.
      #   Pairwise distances are represented in Euclidean space, and the
      #   distance to each of the 5 pre-calculated centroids is calculated.

    # 1. Filter to Alpha atoms
    alpha = traj.atom_slice(deshaw.FILTER['alpha'])



    # 2. (IF USED) Convert to distance space: pairwise dist for all atom combinations
    # alpha_dist = dr.distance_space(alpha)

    # 3. Calc RMS for each conform to all centroids
    # Heuristic centroid weight (TODO: make this trained)

    # 4. For adaptive Centroids
    #  Centroids will be pulled & updated.
    logging.info('CENTROID Retrieval & Updating')
    self.wait_catalog()

    #  If they were mutable....
    # logging.info('Acquiring a Lock on the Centroids')
    # centroids = self.catalog.loadNPArray('centroid')
    # thetas = self.catalog.loadNPArray('thetas')
    # lock = self.catalog.lock_acquire('centroid')
    # if lock is None:
    #   logging.info('Could not lock the Centroids. Will use current cached (possibly stale) data.')
    # bench.mark('ConcurrLockCentroid'%(A,B))

    #  Implemented as a Transactional Data Structure....
    if ADAPTIVE_CENTROID:
      centroids = []
      for state in range(numLabels):
        cent_raw  = self.catalog.lrange('centroid:xyz:%d'%state, 0, -1)
        cent_xyz  = [pickle.loads(i) for i in cent_raw]
        cent_npts = [int(i) for i in self.catalog.lrange('centroid:npts:%d'%state, 0, -1)]
        c_sum = np.zeros(shape=cent_xyz[0].shape)
        c_tot = 0
        for x, n in zip(cent_xyz, cent_npts):
          c = x * n
          c_sum += c
          c_tot += n
        centroids.append(c_sum / c_tot)
    else:
      centroids = self.catalog.loadNPArray('centroid')      

    # if EXPERIMENT_NUMBER < 10:
    # 5. Calculate the RMSD for each filtered point to 5 pre-determined centroids
    # cw = [.92, .94, .96, .99, .99]
    cw = [.94, .95, .97, .99, .99]

    numLabels = len(self.data['centroid'])
    numConf = len(traj.xyz)
    stat.collect('numpts',numConf)

    # 4. Account for noise: simple spatial mean filter over a small window,
    #    where the size of the window captures the extent of the noise
    #    (e.g. a 10000fs window => motions under 10ps are considered "noisy").
    #    A standalone sketch of this filter appears after this example.
    noise = self.data['obs_noise']
    stepsize = 500 if 'interval' not in job else int(job['interval'])
    nwidth = noise//(2*stepsize)
    noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)
    rms_filtered = np.array([noisefilt(alpha.xyz, i) for i in range(numConf)])
    # Notes: Delta_S == rmslist
    rmslist_sv = calc_rmsd(rms_filtered, centroids, weights=cw)
      # rmslist = adaptive_rmsd(rms_filtered, centroids, theta)

    # else:
    rmslist = calc_rmsd(alpha, centroids)

    numConf = traj.n_frames
    numLabels = len(centroids)

    # rmslist = calc_rmsd(alpha.xyz, self.data['centroid'], weights=cw)
    logging.debug('  RMS:  %d points projected to %d centroid-distances', \
      numConf, numLabels)


    # 6. Apply Heuristic Labeling -- Single Variate (see the labeling sketch after this example)
    rmslabel = []
    binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
    label_count = {ab: 0 for ab in binlist}
    groupbystate = [[] for i in range(numLabels)]
    groupbybin = {ab: [] for ab in binlist}
    for i, rms in enumerate(rmslist_sv):
      #  Sort RMSD by proximity & set state A as nearest state's centroid
      A, B = np.argsort(rms)[:2]

      #  Calc Absolute proximity between nearest 2 states' centroids
      # THETA calc derived from a static run. It is based on the average std dev of all RMSDs
      #   from a static run of BPTI without solvent. It could be dynamically calculated, but is hard coded here.
      #  The theta is divided by four based on the analysis of DEShaw:
      #   est based on ~3% of DEShaw data in transition (hence )
      # avg_stddev = 0.34119404492089034
      # theta = settings.RMSD_THETA
      ## FOR ADAPTIVE Centroids. Theta is now updated dynamically

      # NOTE: Original formulation was relative. Retained here for reference:
      # Rel vs Abs: Calc relative proximity for top 2 nearest centroids   
      # relproximity = rms[A] / (rms[A] + rms[rs[1]])
      # B = rs[1] if relproximity > (.5 - theta) else A
      # proximity = abs(rms[prox[1]] - rms[A]) / (rms[prox[1]] + rms[A])  #relative
      #proximity = abs(rms[prox[1]] - rms[A])    #abs
      # Update for Adaptive Centroid.
      delta = np.abs(rms[B] - rms[A])

      #  (TODO:  Factor in more than top 2, better noise)
      #  Label secondary sub-state
      # sub_state = B prox[1] if proximity < theta else A
      # For ADAPTIVE Centroids
      if delta < 0.33:
        sub_state = B
      else:
        sub_state = A
      rmslabel.append((A, sub_state))

      # Add this index to the set of indices for this respective label
      #  TODO: Should we evict if binsize is too big???
      # logging.debug('Label for observation #%3d: %s', i, str((A, B)))
      label_count[(A, sub_state)] += 1

      # Group high-dim point by state
      # TODO: Consider grouping by state only or well/transitions (5 vs 10 bins)
      groupbystate[A].append(i)
      groupbybin[(A, sub_state)].append(i)

    # stat.collect('observe', label_count)
    bench.mark('RMS')
    logging.info('Labeled the following:')
    for A in range(numLabels):
      if len(groupbystate[A]) > 0:
        logging.info('label,state,%d,num,%d', A, len(groupbystate[A]))
    for ab in binlist:
      if len(groupbybin[ab]) > 0:
        A, B = ab
        logging.info('label,bin,%d,%d,num,%d', A, B, len(groupbybin[ab]))

    # FEATURE LANDSCAPE -- Multi-Variate

    # Calc Feature landscape for each frame's RMSD
    feal_list = [feal.atemporal(rms) for rms in rmslist]
    logging.info('Calculated Feature Landscape. Aggregate for this traj')
    # For logging purposes
    agg_feal = np.mean(feal_list, axis=0)
    logging.info('CountsMax [C]:  %s', str(agg_feal[:5]))
    logging.info('StateDist [S]:  %s', str(agg_feal[5:10]))
    logging.info('RelDist [A-B]:  %s', str(agg_feal[10:]))

    #  ADAPTIVE CENTROID & THETA CALCULATION
    # if lock is None:
    #   logging.info('Never acquired a lock. Skipping adaptive update (TODO: Mark pts as stale)')
    # else:  
    #   logging.info('Updating Adaptive Centroid')
    
    if ADAPTIVE_CENTROID:
      pipe = self.catalog.pipeline()
      for state in range(numLabels):
        n_pts = len(groupbybin[(state, state)])
        if n_pts == 0:
          logging.info('Skipping State %d Centroid -- Well not visited on this trajectory', state)
          continue

        cent_xyz  = [alpha.xyz[i] for i in groupbybin[(state, state)]]
        cent_npts = len(groupbybin[(state, state)])
        c_sum = np.zeros(shape=alpha.xyz[0].shape)
        for pt in cent_xyz:
          c_sum += pt
        centroid_local = c_sum / n_pts
        centroid_delta = LA.norm(centroids[state] - centroid_local)
        pipe.rpush('centroid:xyz:%d' % state, pickle.dumps(centroid_local))
        pipe.rpush('centroid:npts:%d' % state, n_pts)
        pipe.rpush('centroid:delta:%d' % state, centroid_delta)
      pipe.execute()

  # 4-B. Subspace Calculation: COVARIANCE Matrix, 200ps windows, Full Protein
  #------ B:  Covariance Matrix  -----------------
    if EXPERIMENT_NUMBER > 5:
      # 1. Project Pt to PC's for each conform (top 3 PC's)
      logging.info('---- Covariance Calculation on 200ps windows (Full Protein, Cartesian Space) ----')

      # Calculate Covariance over 200 ps Windows sliding every 100ps
      #  These could be user influenced...
      WIN_SIZE_NS = .2
      SLIDE_AMT_NS = .1
      logging.debug("Calculating Covariance over trajectory. frame_size = %.1f, WINSIZE = %dps, Slide = %dps", 
        frame_size, WIN_SIZE_NS*1000, SLIDE_AMT_NS*1000)
      covar = dr.calc_covar(alpha.xyz, WIN_SIZE_NS, frame_size, slide=SLIDE_AMT_NS)
      bench.mark('CalcCovar')
      stat.collect('numcovar', len(covar))
      logging.debug("Calcualted %d covariance matrices. Storing variances", len(covar)) 


  #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
    # try:
    self.wait_catalog()
    # except OverlayNotAvailable as e:
    #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")


  # Update Catalog with 1 Long Atomic Transaction  
    global_index = []
    with self.catalog.pipeline() as pipe:
      while True:
        try:
          logging.debug('Update Filelist')
          pipe.watch(wrapKey('jc', job['name']))
          file_idx = pipe.rpush('xid:filelist', job['dcd']) - 1
          # HD Points
          logging.debug('Update HD Points')
          for x in range(traj.n_frames):
            # Note: Pipelined insertions "should" return contiguous set of index points
            index = pipe.rpush('xid:reference', (file_idx, x)) - 1
            global_index.append(index - 1) 

          pipe.multi()
          logging.debug('Update RMS Subspace')
          for x in range(traj.n_frames):
            A, B = rmslabel[x]
            index = global_index[x]
            # Labeled Observation (from RMSD)
            pipe.rpush('label:rms', rmslabel[x])
            pipe.rpush('varbin:rms:%d_%d' % (A, B), index)
            # pipe.rpush('lineage:rms:%d_%d:%d_%d' % (srcA, srcB, A, B), index)
            # pipe.rpush('lineage:pca:%s:%d_%d' % (job['src_hcube'], A, B), index)
            pipe.rpush('subspace:rms', bytes(rmslist_sv[x]))
            pipe.rpush('subspace:feal', bytes(feal_list[x]))            

          logging.debug('Update OBS Counts')
          for b in binlist:
            pipe.rpush('observe:rms:%d_%d' % b, label_count[b])
          pipe.incr('observe:count')
          pipe.hset('anl_sequence', job['name'], mylogical_seqnum)

          if EXPERIMENT_NUMBER > 5:
            logging.debug('Update Covar Subspace')
            for i, si in enumerate(covar):
              logging.debug('Update COVAR Pt #%d', i)
              local_index = int(i * frame_size * SLIDE_AMT_NS)
              pipe.rpush('subspace:covar:pts', bytes(si))
              pipe.rpush('subspace:covar:xid', global_index[local_index])
              pipe.rpush('subspace:covar:fidx', (file_idx, local_index))

          logging.debug('Executing')
          pipe.execute()
          break
        except redis.WatchError as e:
          logging.debug('WATCH ERROR')
          continue

    self.data[key]['xid:start'] = global_index[0]
    self.data[key]['xid:end'] = global_index[-1]
    bench.mark('Indx_Update')

  # (Should we Checkpoint here?)

  # 4-C. Subspace Calculation: PCA BY Strata (PER STATE) using Alpha Filter
  #------ C:  GLOBAL PCA by state  -----------------
  #  Update PCA Vectors for each state with new data
    if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
      logging.info('---- PCA per BIN over Alpha Filter in cartesian Space ----')
      # TODO:  This will eventually get moved into a User process macrothread
      #   which will sit in between analysis and controller.
      # For now, we're recalculating using a lock

      # Check if vectors need to be recalculated
      # Connect to reservoir samples
      # TODO: Catalog or Cache?
      reservoir = ReservoirSample('rms', self.catalog)
      # STALENESS_FACTOR = .25   # Recent updates account for 25% of the sample (Just a guess)

      num_inserted = {ab: 0 for ab in binlist}
      num_params = np.prod(alpha.xyz.shape[1:])

      for A, B in binlist:
        num_observations = len(groupbybin[(A,B)])

        if num_observations == 0:
          logging.info('No data received for bin (%d,%d).  Not processing this bin here.', A, B)
          continue

        res_label = '%d_%d' % (A,B)
        updateVectors = False
        kpca_key = 'subspace:pca:kernel:%d_%d' % (A, B)
        kpca = PCAnalyzer.load(self.catalog, kpca_key)
        newkpca = False
        if kpca is None:
          # kpca = PCAKernel(None, 'sigmoid')
          kpca = PCAKernel(6, 'rbf')
          newkpca = True


        logging.info('PCA:  Checking if current vectors for state %d are out of date', A)
        rsize = reservoir.getsize(res_label)
        tsize = kpca.trainsize

        #  KPCA is out of date if the sample size is 50% larger than the previously used training set
        #  Heuristic --- this could be a different "staleness" factor or we could check it some other way
        if newkpca or rsize > (tsize * 1.5):

          #  Should we only use a sample here??? (not now -- perhaps with larger reservoirs or if KPCA is slow)
          traindata = reservoir.get(res_label)
          if newkpca:
            logging.info('New PCA Kernel. Trained on data set of size %d. Current \
              reservoir is %d pts.', tsize, rsize)
            logging.info('Projecting %d points on Kernel PCA for bin (%d,%d)',
              num_observations, A, B)
            traindata = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]),
              dtype=np.float32)
            for i, index in enumerate(groupbybin[(A,B)]):
              np.copyto(traindata[i], alpha.xyz[index])
          else:
            logging.info('PCA Kernel is old (Updating it). Trained on data set of \
              size %d. Current reservoir is %d pts.', tsize, rsize)


          if len(traindata) <= num_params:
            logging.info("Not enough data to calculate PC's (Need at least %d \
              observations). Skipping PCA for Bin (%d,%d)", num_params, A, B)
            hd_pts = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]), dtype=np.float32)
            for i, index in enumerate(groupbybin[(A,B)]):
              np.copyto(hd_pts[i], alpha.xyz[index])
            num_inserted[(A,B)] = reservoir.insert(res_label, hd_pts)
            logging.debug('Updating reservoir Sample for Bin (%d, %d)', A, B)
            continue

          logging.info('   Performing Kernel PCA (Gaussian) for bin (%d,%d) using traindata of size %d', \
            A, B, len(traindata))

          kpca.solve(traindata)

          # NOTE: Pick PCA Algorithm HERE
          # pca = calc_kpca(np.array(traindata), kerneltype='sigmoid')
          # pca = calc_pca(np.array(traindata))
          bench.mark('CalcKPCA_%d_%d'%(A,B))

          # new_vect = pca.alphas_.T
          lock = self.catalog.lock_acquire(kpca_key)
          if lock is None:
            logging.info('Could not lock the PC Kernel for Bin (%d,%d). Not updating', A, B)
          else:
            kpca.store(self.catalog, kpca_key)
            lock = self.catalog.lock_release(kpca_key, lock)
          bench.mark('ConcurrPCAWrite_%d_%d'%(A,B))

          # Project Reservoir Sample to the Kernel and overwrite current set of points
          #  This should only happen up until the reservoir is filled
          #  If we retrained above, be sure to re-project all reservoir points
          if not newkpca:
            logging.info('Clearing and Re-Projecting the entire reservoir of %d points for Bin (%d,%d).', \
              rsize, A, B)
            rsamp_lowdim = kpca.project(traindata)
            pipe = self.catalog.pipeline()
            pipe.delete('subspace:pca:%d_%d'%(A,B))
            for si in rsamp_lowdim:
              pipe.rpush('subspace:pca:%d_%d'%(A,B), bytes(si))
            pipe.execute()


        else:
          logging.info('PCA Kernel is good -- no need to change them')

        bench.mark('start_ProjPCA')
        logging.info('Projecting %d points on Kernel PCA for Bin (%d,%d)', num_observations, A, B)
        hd_pts = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]), dtype=np.float32)
        for i, index in enumerate(groupbybin[(A,B)]):
          np.copyto(hd_pts[i], alpha.xyz[index])
        pc_proj = kpca.project(hd_pts)
        bench.mark('ProjPCA_%d_%d'%(A,B))

        # 2. Append subspace in catalog
        pipe = self.catalog.pipeline()
        for si in pc_proj:
          pipe.rpush('subspace:pca:%d_%d' % (A,B), bytes(si))
        pipe.execute()

        logging.debug('Updating reservoir Sample')
        num_inserted[(A,B)] = reservoir.insert(res_label, hd_pts)

      bench.mark('PCA')
      pipe = self.catalog.pipeline()
      for ab, num in num_inserted.items():
        if num > 0:
          pipe.rpush('subspace:pca:updates:%d_%d' % ab, num)
      pipe.execute()

  # ---- POST PROCESSING
    if USE_SHM:
      shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      shm_contents = os.listdir('/tmp')
      logging.debug('Ramdisk contents (should be empty of DDC) : %s', str(shm_contents))
    
    # For benchmarching:
    # print('##', job['name'], dcdfilesize/(1024*1024*1024), traj.n_frames)
    bench.show()
    stat.show()

    # Return # of observations (frames) processed
    return [numConf]
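
The labeling stage above reduces to two small steps: a sliding mean filter over frames to suppress motions shorter than the noise window, and a nearest-two-centroid rule that marks a frame as a transition when its two closest centroids lie within a threshold of each other. The sketch below shows both steps in isolation under simplifying assumptions -- it uses a plain Frobenius norm on toy random data instead of the project's weighted calc_rmsd, and the function names are illustrative, not part of DaMSL/ddc.

import numpy as np

def mean_filter(frames, nwidth):
    """Replace each frame with the mean of its +/- nwidth neighbors (simple noise filter)."""
    return np.array([np.mean(frames[max(0, i - nwidth):min(i + nwidth + 1, len(frames))], axis=0)
                     for i in range(len(frames))])

def label_frames(frames, centroids, theta=0.33):
    """Label each frame (A, B): A is the nearest centroid; B differs from A only
    when the two nearest centroids are within theta of each other (a transition)."""
    labels = []
    for x in frames:
        dist = np.array([np.linalg.norm(x - c) for c in centroids])
        A, B = np.argsort(dist)[:2]
        labels.append((A, B) if abs(dist[B] - dist[A]) < theta else (A, A))
    return labels

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    frames = rng.normal(size=(100, 58, 3))    # toy trajectory: 100 frames x 58 atoms
    centroids = rng.normal(size=(5, 58, 3))   # 5 toy reference structures
    print(label_frames(mean_filter(frames, nwidth=2), centroids)[:5])
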
Example #10
0
File: simexpl.py Project: DaMSL/ddc
  def execute(self, job):

  # PRE-PROCESS ----------------------------------------------------------
    settings = systemsettings()
    bench = microbench('sim_%s' % settings.name, self.seqNumFromID())
    bench.start()
    stat  = StatCollector('sim_%s' % settings.name, self.seqNumFromID())
    mylogical_seqnum = str(self.seqNumFromID())

    # Prepare working directory, input/output files
    conFile = os.path.join(job['workdir'], job['name'] + '.conf')
    logFile = conFile.replace('conf', 'log')      # log in same place as config file
    dcdFile = conFile.replace('conf', 'dcd')      # dcd in same place as config file
    USE_SHM = True

    SIMULATE_RATIO = settings.SIMULATE_RATIO
    if SIMULATE_RATIO > 1:
      logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
    frame_size = (SIMULATE_RATIO * int(job['interval'])) / (1000)
    logging.info('Frame Size is %f  Using Sim Ratio of 1:%d', \
      frame_size, SIMULATE_RATIO)

    EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
    logging.info('Running Experiment Configuration #%d', EXPERIMENT_NUMBER)

    # # Grab historical basin data's relative time to start (for lineage)
    traj = None

  # EXECUTE SIMULATION ---------------------------------------------------------
    if self.skip_simulation:

      logging.info('1. SKIPPING SIMULATION.....')
      USE_SHM = False

      job['dcd'] = dcdFile
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

    else:
      logging.info('1. Run Simulation')

      # Prepare & source to config file
      with open(self.data['sim_conf_template'], 'r') as template:
        source = template.read()

      # >>>>Storing DCD into shared memory on this node

      if USE_SHM:
        ramdisk = gettempdir()
        job['outputloc'] = ramdisk
        dcd_ramfile = os.path.join(ramdisk, job['name'] + '.dcd')
      else:
        job['outputloc'] = ''

      with open(conFile, 'w') as sysconfig:
        sysconfig.write(source % job)
        logging.info("Config written to: " + conFile)

      # # Run simulation in parallel
      # if 'parallel' in job:
      #   numnodes = job['parallel']
      #   total_tasks = numnodes * 24
      #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

      # # Run simulation single threaded
      # else:
      #   cmd = 'namd2 %s > %s' % (conFile, logFile)

      # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
      check = executecmd('module list')
      logging.debug('%s', check)

      cmd = 'namd2 +p%d %s > %s' % (PARALLELISM, conFile, logFile)

      #  MICROBENCH #1 (file to Lustre)
      # logging.debug("Executing Simulation:\n   %s\n", cmd)
      # bench = microbench()
      # bench.start()
      # stdout = executecmd(cmd)
      # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
      # bench.mark('SimExec:%s' % job['name'])
      # shm_contents = os.listdir('/dev/shm/out')
      # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # logging.info("Copy Complete to Lustre.")
      # bench.mark('CopyLustre:%s' % job['name'])
      # shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
      # bench.show()

      max_expected_obs = int(job['runtime']) // int(job['dcdfreq'])
      # Retry up to 3 attempts if the sim fails
      MAX_TRY = 3
      for i in range(MAX_TRY, 0, -1):
        min_required_obs = int(max_expected_obs * ((i-1)/(MAX_TRY)))
        logging.debug("Executing Simulation:\n   %s\n", cmd)
        logging.debug('# Obs Expected to see: %d', max_expected_obs)
        stdout = executecmd(cmd)
        logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
        # Check file for expected data
        if USE_SHM:
          traj = md.load(dcd_ramfile, top=job['pdb'])
        else:
          traj = md.load(dcdFile, top=job['pdb'])
        logging.info("Obs Threshold  = %4d", min_required_obs)
        logging.info("#Obs This Traj = %4d", traj.n_frames)
        if traj.n_frames >= min_required_obs:
          logging.info('Full (enough) Sim Completed')
          break
        logging.info('Detected a failed Simulation. Retrying the same sim.')
      
      bench.mark('simulation')
      # bench.mark('SimExec:%s' % job['name'])

      # Internal stats
      sim_length = self.data['sim_step_size'] * int(job['runtime'])
      sim_realtime = bench.delta_last()
      sim_run_ratio =  (sim_realtime/60) / (sim_length/1000000)
      logging.info('##SIM_RATIO %6.3f  min-per-ns-sim', sim_run_ratio)
      stat.collect('sim_ratio', sim_run_ratio)

      if USE_SHM:
        shm_contents = os.listdir(ramdisk)
        logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
          time.sleep(10)

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD STIILL FILE NOT FOUND!!!!")
        else:
          logging.info("DCD File was found")

      # #  MICROBENCH #2 (file to Alluxio)
      # allux = AlluxioClient()
      # # copy to Aluxio FS
      # allux.put(ramdisk + job['name'] + '.dcd', '/')
      # logging.info("Copy Complete to Alluxio.")
      # bench.mark('CopyAllux:%s' % job['name'])

      # And copy to Lustre
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # And copy to Lustre (using zero-copy):
      if USE_SHM:

        # ALT:  X-Mit to Cache Service and let cache write to disk lazily
        src  = open(dcd_ramfile, 'rb')
        dest = open(dcdFile, 'w+b')
        offset = 0
        dcdfilesize = os.path.getsize(dcd_ramfile)
        while True:
          sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
          if sent == 0:
            break
          offset += sent
        logging.info("Copy Complete to Lustre.")
        bench.mark('CopyLustre:%s' % job['name'])
      
      # TODO: Update job's metadata
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

  # ANALYSIS   ------- ---------------------------------------------------------
    #  ANALYSIS ALGORITHM
  # 1. With combined Sim-analysis: file is loaded locally from shared mem
    logging.debug("2. Load DCD")

    # Load topology and define filters
    topo = self.protein.pdb
    pdb = md.load(job['pdb'])
    hfilt = pdb.top.select_atom_indices('heavy')
    pfilt = pdb.top.select('protein')
    afilt = pdb.top.select_atom_indices('alpha')

    # Load full higher dim trajectory
    if traj is None:
      if USE_SHM:
        traj = md.load(dcd_ramfile, top=pdb)
      else:
        traj = md.load(dcdFile, top=pdb)

    logging.debug('Trajectory Loaded: %s (%s)', job['name'], str(traj))
    traj_prot = traj.atom_slice(pfilt)
    traj_heavy = traj.atom_slice(hfilt)
    traj_alpha = traj.atom_slice(afilt)

    # Superpose Coordinates to Common Reference
    traj_prot.superpose(topo)

    # Calculate output Distance Space
    # Use of the second side chain atom is discussed in the ref paper on Timescape
    # The atom pairs and selected atoms are in the timescape module
    sc_pairs = side_chain_pairs(traj_prot)
    n_features = len(sc_pairs)

    # Use the CA atoms to calculate distance space
    # NOTE THE CONVERSION FROM NM TO ANG!!!!!!
    dist_space = 10 * DR.distance_space(traj_alpha)

    # Get the frame rate for conversion between the source trajectory and Timescapes
    #  NOTE: Timescapes is run at a coarser level
    logging.debug('Preprocessing output for TimeScapes: terrain')
    traj_frame_per_ps = SIMULATE_RATIO * int(job['interval']) / 1000.   # jc interval is in fs
    ts_frame_per_ps = int(self.data['timescape:rate'])  # this value is in ps
    frame_rate = int(ts_frame_per_ps / traj_frame_per_ps)
    logging.debug('%5.2f fr/ps (Traj)     %5.2f fr/ps (TS)    FrameRate= %4.1f', traj_frame_per_ps, ts_frame_per_ps, frame_rate)

    # Execute Timescapes agility program to detect spatial-temporal basins
    output_prefix = os.path.join(job['workdir'], job['name'])
    if not self.skip_timescape:
      # Prep file and save locally in shm
      tmploc = gettempdir()
      ts_out = os.path.join(tmploc, 'traj_ts')
      ts_dcd = ts_out + '.dcd'
      ts_pdb = ts_out + '.pdb'
      heavy = traj_heavy.slice(range(0, traj.n_frames, frame_rate))
      heavy.slice(0).save_pdb(ts_pdb)
      heavy.save_dcd(ts_dcd)

      # Gaussian Full Width at Half-Max value affects sliding window size
      # ref:  http://timescapes.biomachina.org/guide.pdf
      gmd_cut1 = int(self.data['timescape:gmd:low'])
      gmd_cut2 = int(self.data['timescape:gmd:hi'])
      gauss_wght_delta = int(self.data['timescape:delta'])

      # Execute timescapes' terrain.py on the pre-processed trajectory
      cmd = 'terrain.py %s %s %d %d %d GMD %s' %\
        (ts_pdb, ts_dcd, gmd_cut1, gmd_cut2, gauss_wght_delta, output_prefix)
      logging.info('Running Timescapes:\n  %s', cmd)
      stdout = executecmd(cmd)
      logging.info('TimeScapes COMPLETE:\n%s', stdout)


    # Collect and parse Timescape output
    logging.debug('Parsing Timescapes output')
    ts_parse = TimeScapeParser(job['pdb'], output_prefix, job['name'], 
      dcd=dcdFile, traj=traj, uniqueid=False)
    basin_list = ts_parse.load_basins(frame_ratio=frame_rate)
    n_basins = len(basin_list)

    minima_coords = {}
    basin_rms = {}
    basins = {}

    new_dmu={}
    new_dsig={}
    resid_rms_delta = {}

    stat.collect('num_basin', n_basins)
    downstream_list = []

    # FOR BOOTSTRAPPING USING RMSD
    ref_file = os.path.join(settings.workdir, self.data['pdb:ref:0'])
    logging.info('Loading RMSD reference frame from %s', self.data['pdb:ref:0'])
    refframe = md.load(ref_file)
    ref_alpha = refframe.atom_slice(refframe.top.select_atom_indices('alpha'))
    rmsd = 10*md.rmsd(traj_alpha, ref_alpha)

    # FOR RESIDUE RMSD
    res_rms_Kr = FEATURE_SET_RESID
    resrmsd = 10*np.array([LA.norm(i-ref_alpha.xyz[0], axis=1) for i in traj_alpha.xyz])
    basin_res_rms = np.zeros(shape=(len(basin_list), traj_alpha.n_atoms))

    # LOAD CENTROIDS -- TODO: move to immutable storage
    centroid = pickle.loads(self.catalog.get('centroid:ds'))
    basin_label_list = {}

    # Process each basin
    for i, basin in enumerate(basin_list):
      logging.info('  Processing basin #%2d', i)
      bid = basin.id
      downstream_list.append(bid)

      # Slice out minima coord  & save to disk (for now)
      #  TODO:  Store in memory in cache
      minima_coords[bid] = traj.slice(basin.mindex)
      jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
      minima_coords[bid].save_pdb(jc_filename)

      # METRIC CALCULATION
      a, b = basin.start, basin.end
      new_dmu[bid]    = np.mean(dist_space[a:b], axis=0)
      new_dsig[bid]   = np.std(dist_space[a:b], axis=0)

      # Collect Basin metadata
      basin_hash = basin.kv()
      basin_hash['pdbfile'] = jc_filename
      basin_hash['rmsd'] = np.median(rmsd[a:b])
      basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)

      # FOR EXP #16 -- Residue RMSD

      # Set relative time (in ns)
      # basin_hash['time'] = reltime_start + (a * frame_rate) / 1000
      basins[bid] = basin_hash

      # LABEL ATEMPORAL DATA
      # Take every 4th frame
      stride = [dist_space[i] for i in range(a,b,4)]
      rms = [LA.norm(centroid - i, axis=1) for i in stride]
      label_seq = [np.argmin(i) for i in rms]
      basin_label_list[bid] = label_seq
      logging.info('  Basin Processed: #%s, %d - %d', basin_hash['traj'], 
        basin_hash['start'], basin_hash['end'])

    # RMSD DELTA (RESIDUE)
    basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
    basin_rms_delta_bykey = {basin.id: basin_res_rms_delta[i] for i, basin in enumerate(basin_list)}
    for k in basins.keys():
      basins[k]['resrms_delta'] = np.sum(basin_rms_delta_bykey[k][res_rms_Kr])


    # TODO:  Use Min Index as snapshot, median (or mean) DistSpace vals for each basin?????

    bench.mark('analysis')
  #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
    # try:
    self.wait_catalog()
    # except OverlayNotAvailable as e:
    #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

  # FOR EXPERIMENT METRICS
    src_basin = self.catalog.hgetall("basin:" + job['src_basin'])
    with open('/home-1/[email protected]/ddc/results/{0}_prov.log'.format(settings.name), 'a') as metric_out:
      for i, basin in enumerate(basin_list):
        bid = basin.id
        label_seq = basin_label_list[bid]
        basin_metric_label = LABEL10(label_seq)
        metric_out.write('BASIN,%s,%s,%s,%s\n'% \
          (bid, src_basin['label:10'], basin_metric_label, ''.join([str(i) for i in label_seq])))

  # Update Index Synchronized data lists
    basin_list_sorted = sorted(basins.keys(), key=lambda x: (x.split('_')[0], int(x.split('_')[1])))
    for bid in basin_list_sorted:
      with self.catalog.pipeline() as pipe:
        while True:
          try:
            logging.debug('Updating %d basin indices and distance space', len(basins))
            pipe.rpush('basin:list', bid)
            pipe.rpush('dspace', pickle.dumps(new_dmu[bid]))
            basin_index,_ = pipe.execute()
            break
          except redis.WatchError as e:
            logging.debug('WATCH ERROR. Someone else is writing to the catalog. Retrying...')
            continue
      basins[bid]['dsidx'] = basin_index - 1

  # Update Catalog with 1 Long Atomic Transaction (a standalone sketch of this WATCH/MULTI retry pattern follows this example)
    with self.catalog.pipeline() as pipe:
      while True:
        try:
          logging.debug('Update Filelist')
          pipe.watch(wrapKey('jc', job['name']))
          file_idx = pipe.rpush('xid:filelist', job['dcd']) - 1

          # HD Points
          logging.debug('Update HD Points')
          start_index = pipe.llen('xid:reference')
          pipe.multi()
          pipe.rpush('xid:reference', *[(file_idx, x) for x in range(traj.n_frames)])
          pipe.set('resrms:' + job['name'], resrmsd)

          # Store all basin data
          logging.debug('Updating %s basins', len(basins))
          for bid in sorted(basins.keys(), key=lambda x: (x.split('_')[0], int(x.split('_')[1]))):
            pipe.hmset('basin:'+bid, basins[bid])
            pipe.set('minima:%s'%bid, pickle.dumps(minima_coords[bid]))
            for i in basin_label_list[bid]:
              pipe.rpush('basin:labelseq:'+bid, i)

          pipe.hset('anl_sequence', job['name'], mylogical_seqnum)

          logging.debug('Executing')
          pipe.execute()
          break

        except redis.WatchError as e:
          logging.debug('WATCH ERROR. Someone else is writing to the catalog. Retrying...')
          continue

    self.data[key]['xid:start'] = start_index
    self.data[key]['xid:end'] = start_index + traj.n_frames
    bench.mark('catalog')

  # ---- POST PROCESSING
    # Check for total data in downstream queue  & Notify control manager to run
    # 1. Determine what's still running
    job_queue = slurm.currentqueue()
    n_simjobs = -1    # -1 to exclude self
    for j in job_queue:
      if j['name'].startswith('sw'):
        n_simjobs += 1

    logging.info('SIM WORKER has detected %d other simulations running/pending.', n_simjobs)
    remain_percent = n_simjobs / self.data['numresources']

    # Fault Tolerance: ensure the pipeline persists
    if n_simjobs == 0:
      logging.info('I am the last simulation. Ensuring the controller executes.')
      self.catalog.set('ctl:force', 1)

    # Send downstream notification when less than 10% of jobs are still running
    if not self.skip_notify and remain_percent < .1:
      # Notify the control manager 'CM'
      self.notify('ctl')

    if USE_SHM:
      shutil.rmtree(ramdisk)
      shm_contents = os.listdir('/dev/shm')
      logging.debug('Ramdisk contents (should be empty of DDC) : %s', str(shm_contents))
    
    # For benchmarching:
    bench.show()
    stat.show()

    # Return # of observations (frames) processed
    return downstream_list
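
Both simulation workers above commit their catalog updates with the same optimistic-concurrency idiom: WATCH the key that defines ordering, read what you need in immediate mode, queue the real writes with MULTI, and retry from scratch whenever EXEC raises WatchError because another writer got there first. A stripped-down sketch of that idiom follows; the key names and the local Redis instance are assumptions made for the example, not the project's actual schema.

import pickle
import redis

def append_basin(r, basin_id, dmu):
    """Atomically append a basin id and its distance-space mean, retrying on conflict."""
    with r.pipeline() as pipe:
        while True:
            try:
                pipe.watch('basin:list')             # optimistic lock on the ordering key
                next_idx = pipe.llen('basin:list')   # immediate-mode read while watching
                pipe.multi()                         # start queueing the writes
                pipe.rpush('basin:list', basin_id)
                pipe.rpush('dspace', pickle.dumps(dmu))
                pipe.execute()                       # raises WatchError if the key changed
                return next_idx
            except redis.WatchError:
                continue                             # a concurrent writer won; retry

if __name__ == '__main__':
    r = redis.StrictRedis(host='localhost', port=6379)
    print('basin stored at index', append_basin(r, 'demo_0001', [0.1, 0.2, 0.3]))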