Example #1
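    # Context: this method relies on imports and helpers assumed to be defined in its
    # enclosing module (numpy as np, pickle, os, sys, logging, collections.OrderedDict,
    # and project helpers such as systemsettings, microbench, StatCollector, CacheClient,
    # BiasSampler, LatticeSampler, CorrelationSampler, lat, bin_label_10, bin_label_25,
    # getSimParameters, generateFromBasin, wrapKey, plus the module-level values
    # invert_vals and cm_all referenced below).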
    def execute(self, new_basin_list):
        """Executing the Controler Algorithm. Load pre-analyzed lower dimensional
      subspaces, process user query and identify the sampling space with 
      corresponding distribution function for each user query. Calculate 
      convergence rates, run sampler, and then execute fairness policy to
      distribute resources among users' sampled values.
      """
        logging.debug('CTL MT')

        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        np.set_printoptions(precision=4, linewidth=150)

        self.data['timestep'] += 1
        logging.info('TIMESTEP: %d', self.data['timestep'])

        settings = systemsettings()
        bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
        stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

        if self.force_decision:
            new_basin_list = []

        # Connect to the cache
        self.cacheclient = CacheClient(settings.APPL_LABEL)

        # create the "binlist":
        numresources = self.data['numresources']
        explore_factor = float(self.data['sampler:explore'])
        topo = self.protein.top

        # Load new RMS Labels -- load all for now
        bench.start()
        logging.debug('Loading RMS Labels')
        start_index = max(0, self.data['ctl_index_head'])
        logging.debug(" Start_index=%d,  batch_size=%d", start_index,
                      len(new_basin_list))

        # Calculate variable PDF estimations for each subspace via bootstrapping:
        logging.debug(
            "=======================  <SUBSPACE CONVERGENCE> (skip)  ==================="
        )

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )
        stat.collect('new_basin', len(new_basin_list))

        ##### BARRIER
        self.wait_catalog()
        selected_index_list = []

        # QUERY PROCESSING & SAMPLING BELOW to select indices.
        EXPERIMENT_NUMBER = self.experiment_number
        logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)

        # TODO: ABSTRACT FEATURE DIMENSIONALITY
        n_features = 1653

        # Basin list: the basins representing the new job candidates
        selected_basin_list = []
        all_basins = self.data['basin:list']
        num_prev_basins = int(self.data['basin:processed'])

        # Load Previous Distance Space (historical data)
        prev_ds_raw = self.catalog.lrange('dspace', 0, num_prev_basins)
        local_basins = {}
        if len(prev_ds_raw) > 0:
            ds_prev = np.zeros(shape=(len(prev_ds_raw), n_features))
            logging.info("Unpickling distance space to array: %s",
                         ds_prev.shape)
            for i, elm in enumerate(prev_ds_raw):
                ds_prev[i] = pickle.loads(elm)
            logging.info('Prev DS loaded. Size = %d', len(ds_prev))
        else:
            logging.info('NO Prev DS')
            ds_prev = []

        # For supervised classification (computed for all experiments, for post-mortem processing)
        bin_labels_10 = [
            'T0', 'T1', 'T2', 'T3', 'T4', 'W0', 'W1', 'W2', 'W3', 'W4'
        ]
        bin_labels_25 = [(a, b) for a in range(5) for b in range(5)]
        bin_list_10 = {
            k: [int(i) for i in self.catalog.lrange('bin:10:%s' % k, 0, -1)]
            for k in bin_labels_10
        }
        bin_list_25 = {
            k:
            [int(i) for i in self.catalog.lrange('bin:25:%d_%d' % k, 0, -1)]
            for k in bin_labels_25
        }

        # Merge locally analyzed distance space
        delta_ds = np.zeros(shape=(len(new_basin_list), n_features))
        logging.info('Collecting new data from basins: %s', new_basin_list)
        for i, bid in enumerate(new_basin_list):
            basin = self.data['basin:' + bid]
            global_basin_index = int(basin['dsidx'])
            local_basins[global_basin_index] = basin

            # Update Distance Space
            dmu_ = self.catalog.lindex('dspace', global_basin_index)
            if dmu_ is None:
                print("ERROR!!!!")
                print(global_basin_index, i)
                for k, v in basin.items():
                    print(k, v)
                sys.exit(0)

            delta_ds[i] = pickle.loads(dmu_)

            # Update supervised classification
            label_seq = [
                int(i)
                for i in self.catalog.lrange('basin:labelseq:' + bid, 0, -1)
            ]
            local_basins[global_basin_index][
                'label:10'] = label10 = bin_label_10(label_seq)
            local_basins[global_basin_index][
                'label:25'] = label25 = bin_label_25(label_seq)
            bin_list_10[label10].append(global_basin_index)
            bin_list_25[label25].append(global_basin_index)
            self.data['basin:processed'] += 1

        if len(new_basin_list) > 0 and len(prev_ds_raw) > 0:
            dist_space = np.vstack((ds_prev, delta_ds))
        elif len(delta_ds) == 0:
            dist_space = np.array(ds_prev)
        elif len(ds_prev) == 0:
            logging.info('First set of distance coords loaded')
            dist_space = delta_ds
        else:
            logging.error("ERROR! NO DISTANCE SPACE IN THE CATALOG")

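        # Each EXPERIMENT_NUMBER below selects a different sampling strategy:
        #   12 = uniform, 13 = biased (umbrella), 14 = lattice (with historical DEShaw
        #   data), 15 = lattice (de novo), 18 = correlation sampler (KMeans).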
        # UNIFORM SAMPLER (BASIC)
        if EXPERIMENT_NUMBER == 12:
            basin_idx_list = []
            candidate_list = [[] for i in range(5)]
            for k, v in bin_list_10.items():
                candidate_list[int(k[1])].extend(v)

            # UNIFORM SAMPLING
            for sel_num in range(numresources):
                # Select Random start state
                start_state = np.random.randint(5)
                # Select random index
                rand_index = np.random.choice(len(candidate_list[start_state]))
                basin_idx = candidate_list[start_state][rand_index]
                basin_idx_list.append(basin_idx)

            for i in basin_idx_list:
                if i < num_prev_basins:
                    logging.info("Select index: %s   (Retrieve from Catalog)",
                                 i)
                    bid = self.data['basin:list'][i]
                    basin = self.catalog.hgetall('basin:%s' % bid)
                else:
                    logging.info(
                        "Select index: %s   (New locally built basin in mem)",
                        i)
                    basin = local_basins[i]
                logging.debug('   BASIN:  %s', basin['id'])
                selected_basin_list.append(basin)

        # BIASED SAMPLER (UPDATED)
        if EXPERIMENT_NUMBER == 13:

            # USING 10-BIN LABELS
            distro = [len(bin_list_10[i]) for i in bin_labels_10]

            # Create and invoke the sampler
            logging.info('Running the biased (umbrella) samplers')
            sampler = BiasSampler(distro)
            samplecount = np.zeros(len(bin_labels_10), dtype=np.int16)

            # Find the first index for each bin:
            explore_direction = 1 if explore_factor < .5 else -1
            for i, label in enumerate(bin_labels_10):
                b = bin_list_10[label]
                if len(b) == 0:
                    idx = 0
                else:
                    idx = int(np.floor(explore_factor * (len(b) - 1)))
                samplecount[i] = idx

            sel_bins = sampler.execute(numresources)

            logging.info(
                'Processing selected bins to find starting candidates')
            candidate_list = {}
            basin_idx_list = []
            for b in sel_bins:
                target_bin = bin_labels_10[b]
                if target_bin not in candidate_list:
                    candidate_list[target_bin] = bin_list_10[target_bin]
                #  TODO:  FULLY IMPLEMENT EXPLORE/EXPLOIT BUT INCL HISTORY/PROVENANCE
                # Lazy Update to centroid -- push to catalog immediately
                # vals = dist_space[bin_list_10[target_bin]]
                # logging.info('Updating Centroid for bin %s,  bindata: %s', target_bin, vals.shape)
                # centroid = np.mean(vals, axis=0)
                # self.catalog.set('bin:10:centroid:%s' % target_bin, pickle.dumps(centroid))
                # dist_center = [LA.norm(centroid - dist_space[i]) for i in bin_list_10[target_bin]]
                # candidate_list[target_bin] = sorted(zip(bin_list_10[target_bin], dist_center), key=lambda x: x[1])
                # basin_idx, basin_diff = candidate_list[target_bin][samplecount[b]]
                # samplecount[b] += explore_direction
                # # Wrap
                # if samplecount[b] == 0:
                #   samplecount = len(candidate_list[target_bin]) - 1
                # if samplecount[b] == len(candidate_list[target_bin]):
                #   samplecount = 0

                # FOR NOW PICK A RANDOM CANDIDATE
                rand_index = np.random.choice(len(candidate_list[target_bin]))
                basin_idx = candidate_list[target_bin][rand_index]
                logging.info('BIAS SAMPLER:\n   Bin: %s\n   basin: %d     Delta from Center: %6.3f  (note: dist not factored in)', \
                  target_bin, basin_idx, 0.)
                basin_idx_list.append(basin_idx)

            for i in basin_idx_list:
                if i < num_prev_basins:
                    logging.info("Select index: %s   (Retrieve from Catalog)",
                                 i)
                    bid = self.data['basin:list'][i]
                    basin = self.catalog.hgetall('basin:%s' % bid)
                else:
                    logging.info(
                        "Select index: %s   (New locally built basin in mem)",
                        i)
                    basin = local_basins[i]
                logging.debug('   BASIN:  %s', basin['id'])
                selected_basin_list.append(basin)

        # LATTICE SAMPLER (WITH HISTORICAL DATA)
        if EXPERIMENT_NUMBER == 14:
            # Merge Existing delta with DEShaw Pre-Processed data:
            logging.info('Merging DEShaw with existing generated data')

            # Set parameters for lattice
            Kr = [
                int(i) for i in self.catalog.lrange('lattice:features', 0, -1)
            ]
            support = int(self.data['lattice:support'])
            dspt = self.catalog.get('lattice:delta_support')
            delta_support = 5 if dspt is None else int(dspt)
            cutoff = float(self.data['lattice:cutoff'])
            logging.info('PARAMS  Kr:%s\n support:%d  dspt:%d  cutoff:%f', Kr,
                         support, delta_support, cutoff)

            # Load existing (base) lattice data
            logging.info(
                "Unpickling max/low FIS and derived lattice EMD values")
            max_fis = pickle.loads(self.catalog.get('lattice:max_fis'))
            low_fis = pickle.loads(self.catalog.get('lattice:low_fis'))
            dlat = pickle.loads(self.catalog.get('lattice:dlat'))

            # TODO: MOVE iset completely into catalog
            # Ik       = pickle.loads(self.catalog.get('lattice:iset'))

            # FOR NOW: Full DEShaw Index is saved on disk  (TODO: MOVE TO CATALOG)
            logging.info(
                "Loading full Itemset from disk (TODO: Det optimization on mem/time)"
            )
            Ik = pickle.load(open(settings.datadir + '/iset.p', 'rb'))
            # Item_set Keys (Ik) are only saved as a delta for space conservation
            if os.path.exists(settings.datadir + '/iset_delta.p'):
                Ik_delta = pickle.load(
                    open(settings.datadir + '/iset_delta.p', 'rb'))
            else:
                Ik_delta = {}
            # Merge previous item set delta with DEShaw index
            logging.info("Merging DEShaw Ik with Delta IK")
            for k, v in Ik_delta.items():
                Ik[k] = np.concatenate((Ik[k], v)) if k in Ik else v

            # Build Base Lattice Object
            base_lattice = lat.Lattice(ds_prev, Kr, cutoff, support)
            base_lattice.set_fis(max_fis, low_fis)
            base_lattice.set_dlat(dlat, Ik)

            if not self.force_decision and len(delta_ds) > 0:
                # Build Delta Lattice Object
                logging.info('Building Delta lattice. Num new items: %d',
                             len(delta_ds))
                delta_lattice = lat.Lattice(delta_ds,
                                            Kr,
                                            cutoff,
                                            delta_support,
                                            invert=invert_vals)
                delta_lattice.maxminer()
                delta_lattice.derive_lattice()

                # Update non-DEShaw delta itemset key index
                logging.info("Updating Itemsets and Distance Space Matrix")
                for k, v in delta_lattice.Ik.items():
                    Ik_delta[k] = np.concatenate(
                        (Ik_delta[k], v)) if k in Ik_delta else v

                # Save Ik delta to disk
                logging.info("Saving Delta Itemset (to disk)")
                pickle.dump(Ik_delta,
                            open(settings.datadir + '/iset_delta.p', 'wb'))

                #  Perform incremental maintenance
                logging.info('Merging Delta lattice with Base Lattice')
                base_lattice.merge(delta_lattice)

            # Create the Sampler object (also make clusters)
            logging.info('Invoking the Lattice Sampler')

            sampler = LatticeSampler(base_lattice)
            basin_id_list = sampler.execute(numresources)
            for index in basin_id_list:
                bid = all_basins[index]
                selected_basin_list.append(
                    self.catalog.hgetall('basin:%s' % bid))

        # LATTICE SAMPLER (DE NOVO)
        if EXPERIMENT_NUMBER == 15:

            # PREPROCESS
            N_features_src = topo.n_residues
            N_features_corr = (N_features_src**2 - N_features_src) // 2
            upt = np.triu_indices(N_features_src, 1)
            old_basin_ids = all_basins[:num_prev_basins]

            # DENOVO Exploratory Bootstrapping (RMSD)
            explore_factor = float(self.data['sampler:explore'])

            # TODO:  Better transition plan from explore to exploit
            self.data['sampler:explore'] *= .75
            executed_basins = self.catalog.lrange('executed', 0, -1)

            if explore_factor > 0:
                logging.info("EXPLORING Most active basins....")
                basindata = [
                    self.catalog.hgetall('basin:%s' % bid)
                    for bid in old_basin_ids
                ]
                for bid in new_basin_list:
                    basindata.append(self.data['basin:' + bid])

                basins_with_rms = [b for b in basindata if 'resrms_delta' in b]

                basin_by_rmsd = sorted(basins_with_rms,
                                       key=lambda x: float(x['resrms_delta']),
                                       reverse=True)
                explore_samples = int(np.floor(numresources * explore_factor))
                logging.info('Num to explore: %d  out of %d', explore_samples,
                             len(basin_by_rmsd))
                idx, num_sampled = 0, 0

                while idx < len(
                        basin_by_rmsd) and num_sampled < explore_samples:
                    selb = basin_by_rmsd[idx]
                    if selb['id'] in executed_basins:
                        logging.info(
                            'Selected %s, but it has been executed. Selecting next option',
                            selb['id'])
                    else:
                        selected_basin_list.append(selb)
                        logging.info('  (%d) EXPLORE BASIN:  %s  %f',
                                     num_sampled, selb['id'],
                                     float(selb['resrms_delta']))
                        numresources -= 1
                        num_sampled += 1
                    idx += 1

            # TODO:  Reduced Feature Sets
            #  Using Reduced Feature Set Alg #2 HERE
            support = int(.01 * len(dist_space))
            cutoff = 8

            # RE-Calc the whole lattice:
            logging.info("Building the new lattice")
            BUILD_NEW = not self.catalog.exists('lattice:bootstrapped')
            # TODO: Decision to go from build new to incr maint
            if BUILD_NEW:
                tval = .05
                Kr = lat.reduced_feature_set2(dist_space,
                                              cutoff,
                                              theta=tval,
                                              maxk=25)
                retry = 5
                while len(Kr) < 12 and retry > 0:
                    tval /= 2
                    retry -= 1
                    Kr = lat.reduced_feature_set2(dist_space,
                                                  cutoff,
                                                  theta=tval,
                                                  maxk=25)

                base_lattice = lat.Lattice(dist_space, Kr, cutoff, support)
                base_lattice.maxminer()
                base_lattice.derive_lattice()
                with self.catalog.pipeline() as pipe:
                    pipe.delete('lattice:kr')
                    for i in sorted(Kr):
                        pipe.rpush('lattice:kr', i)
                    pipe.execute()

            else:
                # Load existing (base) lattice data
                max_fis = pickle.loads(self.catalog.get('lattice:max_fis'))
                low_fis = pickle.loads(self.catalog.get('lattice:low_fis'))
                dlat = pickle.loads(self.catalog.get('lattice:dlat'))
                Ik = pickle.loads(self.catalog.get('lattice:iset'))
                num_k = self.catalog.get('lattice:num_k')
                Kr = [int(i) for i in self.catalog.lrange('lattice:kr', 0, -1)]
                if num_k is None:
                    num_k = max(8, min(15, numresources * 2))

                # Build Lattice Object
                logging.info('Building Existing lattice object')
                base_lattice = lat.Lattice(ds_prev, Kr, cutoff, support)
                base_lattice.set_fis(max_fis, low_fis)
                base_lattice.set_dlat(dlat, Ik)

                # Build Delta Lattice Object
                logging.info('Building Delta lattice. Num new items: %d',
                             len(delta_ds))
                delta_lattice = lat.Lattice(delta_ds, Kr, cutoff, 1)
                delta_lattice.maxminer()
                delta_lattice.derive_lattice()

                #  Perform incremental maintenance
                logging.info('Merging Delta lattice with Base Lattice')
                base_lattice.merge(delta_lattice)

            if numresources > 0:
                logging.info('Invoking the Lattice Sampler')
                sampler = LatticeSampler(base_lattice)
                basin_id_list = sampler.execute(numresources)

                # For now retrieve immediately from catalog
                self.wait_catalog()
                for index in basin_id_list:
                    bid = all_basins[index]
                    key = 'basin:%s' % bid
                    # Check whether the basin already exists in the DB (otherwise it is a new, locally built basin)
                    if self.catalog.exists(key):
                        logging.debug('KEY EXISTS: %s', key)
                        selbasin = self.catalog.hgetall(key)
                    else:
                        logging.debug('NO KEY: %s\n%s', key, self.data[key])
                        selbasin = self.data[key]
                    selected_basin_list.append(selbasin)

        # CORRELATION SAMPLER (KMEANS)
        if EXPERIMENT_NUMBER == 18:
            sampler = CorrelationSampler(cm_all, mu=dist_space)
            basin_id_list = sampler.execute(numresources)
            for bid in basin_id_list:
                selected_basin_list.append(
                    self.catalog.hgetall('basin:%s' % bid))

        bench.mark('GlobalAnalysis')

        # Generate new starting positions
        jcqueue = OrderedDict()
        src_traj_list = []
        for basin in selected_basin_list:
            src_traj_list.append(basin['traj'])
            if basin['traj'].startswith('desh'):
                global_params = getSimParameters(self.data, 'deshaw')
                # fileno = int(basin['traj'][-4:])
                # frame = int(basin['mindex'])
                # jcID, config = generateDEShawJC(fileno, frame)
            else:
                global_params = getSimParameters(self.data, 'gen')
                src_psf = self.catalog.hget('jc_' + basin['traj'], 'psf')
                global_params.update({'psf': src_psf})
                # jcID, config = generateExplJC(basin, jcid=None)

            jcID, config = generateFromBasin(basin)
            config.update(global_params)
            config['name'] = jcID
            logging.info("New Simulation Job Created: %s", jcID)
            for k, v in config.items():
                logging.debug("   %s:  %s", k, str(v))

            #  Add to the output queue & save config info
            jcqueue[jcID] = config
            logging.info("New Job Candidate Completed:  %s   #%d on the Queue",
                         jcID, len(jcqueue))

        stat.collect('src_traj_list', src_traj_list)
        bench.mark('GenInputParams')

        #  POST-PROCESSING  -------------------------------------
        logging.debug(
            "============================  <POST-PROCESSING & OUTPUT>  ============================="
        )
        self.wait_catalog()

        with self.catalog.pipeline() as pipe:
            for basin in selected_basin_list:
                pipe.rpush('executed', basin['id'])
            pipe.execute()

        # Append new distance values
        if EXPERIMENT_NUMBER == 14:
            # Save Ik delta to disk
            logging.info("Saving Delta Itemset (to disk)")
            pickle.dump(Ik_delta, open(settings.datadir + '/iset_delta.p',
                                       'wb'))

            with self.catalog.pipeline() as pipe:
                pipe.set('lattice:max_fis', pickle.dumps(base_lattice.max_fis))
                pipe.set('lattice:low_fis', pickle.dumps(base_lattice.low_fis))
                pipe.set('lattice:dlat', pickle.dumps(base_lattice.dlat))
                pipe.execute()

        if EXPERIMENT_NUMBER == 15:
            with self.catalog.pipeline() as pipe:
                pipe.set('lattice:max_fis', pickle.dumps(base_lattice.max_fis))
                pipe.set('lattice:low_fis', pickle.dumps(base_lattice.low_fis))
                pipe.set('lattice:dlat', pickle.dumps(base_lattice.dlat))
                pipe.execute()
            self.catalog.set('lattice:iset', pickle.dumps(base_lattice.Ik))

        for basin_idx, basin in local_basins.items():
            key = 'basin:' + basin['id']
            with self.catalog.pipeline() as pipe:
                pipe.hset(key, 'label:10', basin['label:10'])
                pipe.hset(key, 'label:25', basin['label:25'])
                pipe.rpush('bin:10:%s' % basin['label:10'], basin_idx)
                pipe.rpush('bin:25:%d_%d' % basin['label:25'], basin_idx)
                pipe.execute()
        self.catalog.set('basin:processed',
                         num_prev_basins + len(local_basins))

        # Clear current queue, mark previously queued jobs for GC, push new queue
        qlen = self.catalog.llen('jcqueue')
        logging.debug('Current queue len:   %s', str(qlen))
        if qlen > 0:
            curqueue = self.catalog.lrange('jcqueue', 0, -1)
            logging.info("Marking %d obsolete jobs for garbage collection",
                         len(curqueue))
            for jc_key in curqueue:
                key = wrapKey('jc', jc_key)
                config = self.catalog.hgetall(key)
                config['gc'] = 0
                # Add the job to the state to write back to the catalog (flags it for GC)
                self.addMut(key, config)
            self.catalog.delete('jcqueue')

        # Update cache hit/miss
        # hit = self.cache_hit
        # miss = self.cache_miss
        # logging.info('##CACHE_HIT_MISS %d %d  %.3f', hit, miss, (hit)/(hit+miss))
        # self.catalog.rpush('cache:hit', self.cache_hit)
        # self.catalog.rpush('cache:miss', self.cache_miss)

        self.data['jcqueue'] = list(jcqueue.keys())

        logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))
        # Update each new job with the latest convergence score and save to catalog (TODO: save may not be necessary)
        logging.debug("Updated Job Queue length:  %d",
                      len(self.data['jcqueue']))
        for jcid, config in jcqueue.items():
            # config['converge'] = self.data['converge']
            self.addMut(wrapKey('jc', jcid), config)

        self.notify('sim')

        bench.mark('PostProcessing')
        print('## TS=%d' % self.data['timestep'])
        bench.show()
        stat.show()

        return list(jcqueue.keys())
Example #2
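# Context: this function relies on imports assumed to be provided by its enclosing module
# (os, logging, pickle, numpy as np, collections.defaultdict) and on project helpers such
# as systemsettings, TimeScape, Basin, bin_label_10, and bin_label_25.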
def load_historical_DEShaw(catalog, skip_dspace=False):
  """ Load all DEShaw data into basins for processing """
  settings = systemsettings()
  home = os.getenv('HOME')

  # Read in and parse TimeScape output
  file_pref = home+'/work/timescape/desh_' #'/root/heavy/out/expl'
  basin_list = []
  logging.info('Loading all D.E.Shaw Time Scape data and processing basins')
  # To re-label basin
  atemp_labels = np.load(home+'/work/results/DE_label_full.npy')

  bnum = 0
  bin_labels_10 = ['T0', 'T1', 'T2', 'T3', 'T4', 'W0', 'W1', 'W2', 'W3', 'W4']
  bin_labels_25 = [(a,b) for a in range(5) for b in range(5)]
  bin_list_10 = defaultdict(list)
  bin_list_25 = defaultdict(list)
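  # Iterate over the 42 DEShaw trajectory segments (100k frames each; the last segment
  # holds 25k), assigning each basin a global frame offset of 100000*i.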
  for i in range(42):  
    nframes = 100000 if i < 41 else 25000
    minima_list = TimeScape.read_log(file_pref + '%02d_minima.log'%i)
    window_list = TimeScape.windows(file_pref + '%02d_transitions.log'%i)
    basin_index = 0
    last = None
    offset = 100000*i
    pipe = catalog.pipeline()
    while basin_index < len(minima_list):
      ### MINIMA IS LOCAL TO FULL 2.5us FILE
      ### WINDOW IS GLOBAL INDEX OVER ALL 4.125Mil Frames
      a, b = window_list[basin_index]
      minima = minima_list[basin_index]
      basin_id = '%07d' % (offset + a)
      local_file_num = offset // 1000 + minima // 1000
      local_file_id  = 'desh_%04d' %  (local_file_num)
      local_minima   = (minima + offset) - local_file_num * 1000
      basin = Basin(local_file_id, (a+offset, b+offset), local_minima, uid='%07d' % (offset + a))
      if last is not None:
        basin.prev = last.id
        basin_list[-1].next = basin.id
      basin_list.append(basin)
      last = basin
      basin_index += 1
      basin_hash = basin.kv()
      basin_hash['dsidx'] = bnum
      label_seq = atemp_labels[a+offset:b+offset]
      basin_hash['label:10'] = label10 = bin_label_10(label_seq)
      basin_hash['label:25'] = label25 = bin_label_25(label_seq)
      bin_list_10[label10].append(bnum)
      bin_list_25[label25].append(bnum)
      pipe.rpush('bin:10:%s' % label10, bnum)
      pipe.rpush('bin:25:%d_%d' % label25, bnum)
      pipe.rpush('basin:list', basin_id)
      pipe.hmset('basin:%s'%basin_id, basin_hash)
      bnum += 1
    pipe.execute()
  catalog.set('basin:processed', bnum)

  # logging.info('Loading Pre-Calculated Correlation Matrix and mean/stddev vals')
  # corr_matrix = np.load('data/de_corr_matrix.npy')
  # dmu = np.load('data/de_ds_mu.npy')
  # dsig = np.load('data/de_ds_mu.npy')

  # logging.info("Loading Historical data into catalog:  corr_matrix: %s", corr_matrix.shape)
  # catalog.storeNPArray(corr_matrix, 'desh:coor_vector')
  # catalog.storeNPArray(dmu, 'desh:ds_mu')
  # catalog.storeNPArray(dsigma, 'desh:ds_sigma')

  if settings.EXPERIMENT_NUMBER == 14:

    if not os.path.exists(settings.datadir + '/iset.p'):
      os.symlink(settings.workdir + '/iset.p', settings.datadir + '/iset.p')
    if not os.path.exists(settings.datadir + '/de_ds_mu.npy'):
      os.symlink(settings.workdir + '/de_ds_mu.npy', settings.datadir + '/de_ds_mu.npy')
    if not os.path.exists(settings.datadir + '/de_ds_mu.npy'):
      os.symlink(settings.workdir + '/data/de_ds_mu.npy', settings.datadir + '/de_ds_mu.npy')

    dlat    = open(os.path.join(settings.workdir, 'dlat.p'), 'rb').read()
    max_fis = open(os.path.join(settings.workdir, 'mfis.p'), 'rb').read()
    low_fis = open(os.path.join(settings.workdir, 'lowfis.p'), 'rb').read()

    logging.info('Loading max, low FIS and derived lattice')
    with catalog.pipeline() as pipe:
      pipe.set('lattice:max_fis', max_fis)
      pipe.set('lattice:low_fis', low_fis)
      pipe.set('lattice:dlat', dlat)
      pipe.execute()

    logging.info('Loading raw distance from file')
    de_ds = 10*np.load(settings.datadir + '/de_ds_mu.npy')
    logging.info('Loading raw distance space into catalog')
    with catalog.pipeline() as pipe:
      for elm in de_ds:
        pipe.rpush('dspace', pickle.dumps(elm))
      pipe.execute()

  if settings.EXPERIMENT_NUMBER == 16:

    if not os.path.exists(settings.datadir + '/iset.p'):
      os.symlink(settings.workdir + '/tran_iset.p', settings.datadir + '/iset.p')
    if not os.path.exists(settings.datadir + '/resrms.npy'):
      os.symlink(settings.workdir + '/data/resrms.npy', settings.datadir + '/resrms.npy')

    dlat    = open(os.path.join(settings.workdir, 'tran_dlat.p'), 'rb').read()
    max_fis = open(os.path.join(settings.workdir, 'tran_mfis.p'), 'rb').read()
    low_fis = open(os.path.join(settings.workdir, 'tran_lowfis.p'), 'rb').read()

    logging.info('Loading max, low FIS and derived lattice')
    with catalog.pipeline() as pipe:
      pipe.set('lattice:max_fis', max_fis)
      pipe.set('lattice:low_fis', low_fis)
      pipe.set('lattice:dlat', dlat)
      pipe.execute()

    logging.info('Loading raw distance from file')
    de_ds = np.load(settings.datadir + '/resrms.npy')
    logging.info('Loading raw distance space into catalog')
    with catalog.pipeline() as pipe:
      for elm in de_ds:
        pipe.rpush('dspace', pickle.dumps(elm))   # NOTE DS is RESID_RMSD
      pipe.execute()

  # Calculate Centroids -- for explore/exploit within cluster
  logging.info('Retrieving raw distance space from file')
  de_ds = 10*np.load('data/de_ds_mu.npy')
  logging.info('Calculating bin centroids (for 10-Bin labels)')
  for i, b in enumerate(bin_labels_10):
    centroid = np.mean(de_ds[bin_list_10[b]], axis=0)
    catalog.set('bin:10:centroid:%s' % b, pickle.dumps(centroid))
  logging.info('Calculating bin centroids (for 25-Bin labels)')
  for i, b in enumerate(bin_labels_25):
    centroid = np.mean(de_ds[bin_list_25[b]], axis=0)
    catalog.set('bin:25:centroid:%d_%d' % b, pickle.dumps(centroid))
  logging.info('Loading raw distance space into catalog')
  if not skip_dspace:
    with catalog.pipeline() as pipe:
      for elm in de_ds:
        pipe.rpush('dspace', pickle.dumps(elm))
      pipe.execute()

  logging.info('DEShaw data loaded. ALL Done!')
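
# A minimal invocation sketch (assumptions: `RedisCatalog` is a hypothetical stand-in
# name for the project's catalog client, which must expose the pipeline()/set()/rpush()/
# hmset() calls used above; the DEShaw TimeScape logs and label/distance files must
# already be present under $HOME/work and the configured data directory):
#
#   catalog = RedisCatalog(...)                        # hypothetical constructor
#   load_historical_DEShaw(catalog, skip_dspace=False)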