def bootstrap(self, size):
    """Calculate block-bootstrap convergence over an expanding window of the
    feature landscape: each pass adds `size` more observations and re-runs
    the block bootstrap on the cumulative sample."""
    feal = self.all_feal()
    i = 0
    boot = []
    while i + size < len(feal):
        print(i)  # progress indicator
        boot.append(op.bootstrap_block(feal[:i + size], size))
        i += size
    return boot
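# For illustration: a minimal, self-contained sketch of the block bootstrap
# that op.bootstrap_block is assumed to perform above (the real implementation
# lives in the op module; this name, signature, and return layout are
# assumptions). Whole contiguous blocks are resampled with replacement, and
# the spread of the resampled means serves as the convergence score -- the
# second return value mirrors the c[1] usage in elas_feal below.
def _bootstrap_block_sketch(data, blocksize, n_iter=100, ci=0.95):
    data = np.asarray(data)
    n_blocks = max(1, len(data) // blocksize)
    blocks = [data[b * blocksize:(b + 1) * blocksize] for b in range(n_blocks)]
    means = []
    for _ in range(n_iter):
        # Resample whole blocks (with replacement) and average the result
        choice = np.random.randint(n_blocks, size=n_blocks)
        means.append(np.mean(np.vstack([blocks[c] for c in choice]), axis=0))
    means = np.array(means)
    mu = np.mean(means, axis=0)
    lo, hi = np.percentile(means, [(1 - ci) * 50, 100 - (1 - ci) * 50], axis=0)
    # Relative CI width per feature: smaller values indicate convergence
    err = (hi - lo) / (np.abs(mu) + 1e-12)
    return mu, err, (lo, hi)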
def elas_feal(name, feal_list, max_obs, step=2000):
    """Compute convergence of the feature landscape as a function of elapsed
    wall-clock time for experiment `name`, re-bootstrapping every `step`
    observations up to `max_obs`. Returns a list of (hours, convergence)
    points suitable for plotting."""
    eid = db.get_expid(name)
    plotlist = []
    sw_list = db.runquery('select start,time,numobs from sw where expid=%d order by start' % eid)
    end_ts = lambda x: du.parse(x[0]).timestamp() + x[1]
    ts_0 = du.parse(sw_list[0][0]).timestamp()
    sw = sorted([dict(start=x[0], time=x[1], numobs=x[2], end=end_ts(x) - ts_0)
                 for x in sw_list], key=lambda i: i['end'])
    n = 0
    snum = 0
    nextcalc = step
    while n < max_obs and snum < len(sw):
        n += sw[snum]['numobs']
        if n > nextcalc:
            t = sw[snum]['end'] / 3600.  # elapsed time in hours
            # c = bootstrap_sampler(obs[:min(n,N)], samplesize=.25)
            c = op.bootstrap_block(feal_list[:n], step)
            plotlist.append((t, min(np.max(c[1]), 1.)))
            nextcalc += step
        snum += 1
    return plotlist
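# Usage sketch (hypothetical experiment name and data source): plot the
# convergence trajectory returned by elas_feal. Each point pairs elapsed
# wall-clock hours with the worst-case (max) bootstrap error, capped at 1.
#
#   import matplotlib.pyplot as plt
#   feal_list = [np.fromstring(x) for x in catalog.lrange('subspace:feal', 0, -1)]
#   curve = elas_feal('uniform1', feal_list, max_obs=100000)
#   hours, conv = zip(*curve)
#   plt.plot(hours, conv)
#   plt.xlabel('Wall-clock time (hours)')
#   plt.ylabel('Bootstrap convergence (max rel. error)')
#   plt.savefig('elas_feal_convergence.png')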
def execute(self, thru_index):
    """Execute the Controller algorithm. Load pre-analyzed lower-dimensional
    subspaces, process user queries, and identify the sampling space with a
    corresponding distribution function for each query. Calculate convergence
    rates, run the sampler, and then execute the fairness policy to
    distribute resources among users' sampled values.
    """
    logging.debug('CTL MT')

    # PRE-PROCESSING ---------------------------------------------------------
    logging.debug("============================  <PRE-PROCESS>  =============================")
    np.set_printoptions(precision=4, linewidth=150)

    self.data['timestep'] += 1
    logging.info('TIMESTEP: %d', self.data['timestep'])

    settings = systemsettings()
    bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
    stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

    # Connect to the cache
    self.cacheclient = CacheClient(settings.APPL_LABEL)

    # Create the "binlist" of all (start-state, end-state) transition bins
    numLabels = self.data['numLabels']
    binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
    numresources = self.data['numresources']

    # LOAD all new subspaces (?) and values

    ##### BARRIER
    self.wait_catalog()

    # Load new RMS labels -- load all for now
    bench.start()
    logging.debug('Loading RMS Labels')
    start_index = max(0, self.data['ctlIndexHead'])

    # labeled_pts_rms = self.catalog.lrange('label:rms', self.data['ctlIndexHead'], thru_index)
    logging.debug(" Start_index=%d,  thru_index=%d,  ctlIndexHead=%d",
                  start_index, thru_index, self.data['ctlIndexHead'])

    feallist = [np.fromstring(i) for i in self.catalog.lrange('subspace:feal', 0, -1)]
    num_pts = len(feallist)
    self.data['ctlIndexHead'] = thru_index
    thru_count = self.data['observe:count']

    logging.debug('##NUM_RMS_THIS_ROUND: %d', num_pts)
    stat.collect('numpts', len(feallist))

    # Calculate variable PDF estimations for each subspace via bootstrapping:
    logging.debug("=======================  <SUBSPACE CONVERGENCE>  =========================")

    # Bootstrap the current sample for RMS
    logging.info("Feature Landscapes for %d points loaded. Calculating PDF.....", len(feallist))

    # Static blocksize for now
    blocksize = 5000
    mv_convergence = op.bootstrap_block(feallist, blocksize)
    global_landscape = np.mean(feallist, axis=0)
    stat.collect('convergence', mv_convergence)
    stat.collect('globalfeal', global_landscape)
    # logging.info('MV Convergence values:\nCONV,%s', ','.join(['%5.3f' % i for i in mv_convergence]))
    # logging.info('Global Feature Landscape:\nFEAL,%s', ','.join(['%5.3f' % i for i in global_landscape]))
    logging.info('MV Convergence values:\nCONV,%s', str(mv_convergence[-1]))
    logging.info('Global Feature Landscape:\n%s', feal.tostring(global_landscape))

    # IMPLEMENT USER QUERY with REWEIGHTING:
    logging.debug("=======================  <QUERY PROCESSING>  =========================")

    ##### BARRIER
    self.wait_catalog()
    selected_index_list = []

    # QUERY PROCESSING & SAMPLING BELOW to select indices
    EXPERIMENT_NUMBER = self.experiment_number
    logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)

    ######  EXPERIMENT #5:  BIASED (Umbrella) SAMPLER
    if EXPERIMENT_NUMBER == 5:
        if self.catalog.exists('label:deshaw'):
            logging.info("Loading DEShaw historical points.... From Catalog")
            rmslabel = [eval(x) for x in self.catalog.lrange('label:deshaw', 0, -1)]
        else:
            logging.info("Loading DEShaw historical points.... From File (and recalculating)")
            rmslabel = deshaw.labelDEShaw_rmsd()

        deshaw_samples = {b: [] for b in binlist}
        for i, b in enumerate(rmslabel):
            deshaw_samples[b].append(i)

        coord_origin = []

        # NOTE: convergence_rms and bincounts are assumed to be produced by the
        # per-bin RMS convergence analysis performed earlier in the pipeline.
        conv_vals = np.array([v for k, v in sorted(convergence_rms.items())])
        norm_pdf_conv = conv_vals / sum(conv_vals)
        logging.info("Umbrella Sampling PDF (Bootstrapping):")
        sampled_distro_perbin = {b: 0 for b in binlist}

        while numresources > 0:
            # First sampling is BIASED (by convergence)
            selected_bin = np.random.choice(len(binlist), p=norm_pdf_conv)
            A, B = binlist[selected_bin]
            sampled_distro_perbin[binlist[selected_bin]] += 1
            if bincounts[selected_bin] is not None and bincounts[selected_bin] > 0:
                # Secondary sampling is UNIFORM within the bin
                sample_num = np.random.randint(bincounts[selected_bin])
                logging.debug('SAMPLER: selecting sample #%d from bin %s',
                              sample_num, str(binlist[selected_bin]))
                index = self.catalog.lindex('varbin:rms:%d_%d' % binlist[selected_bin], sample_num)
                selected_index_list.append(index)
                coord_origin.append(('sim', index, binlist[selected_bin], '%d-D' % A))
                numresources -= 1
            elif len(deshaw_samples[binlist[selected_bin]]) > 0:
                index = np.random.choice(deshaw_samples[binlist[selected_bin]])
                logging.debug('SAMPLER: selecting DEShaw frame #%d from bin %s',
                              index, str(binlist[selected_bin]))
                # Negation indicates a historical index number
                selected_index_list.append(-index)
                coord_origin.append(('deshaw', index, binlist[selected_bin], '%d-D' % A))
                numresources -= 1
            else:
                logging.info("NO Candidates for bin: %s", binlist[selected_bin])

    ######  EXPERIMENT #10:  MultiVariate Nearest Neighbor (MVNN) SAMPLER
    if EXPERIMENT_NUMBER == 10:
        # Create the KD-tree from all feature landscapes (ignore first 5 features)
        kd = KDTree(100, 15, np.array(feallist), 'median')

        # Collect hypercubes
        hc = kd.getleaves()
        logging.info('KD Tree Stats')
        logging.info('    # HCubes   : %5d', len(hc))
        logging.info('    Largest  HC: %5d', max([v['count'] for k, v in hc.items()]))
        logging.info('    Smallest HC: %5d', min([v['count'] for k, v in hc.items()]))

        # Mean feature landscape for each hypercube
        for key, hcube in hc.items():
            hc_feal = [feallist[i] for i in hcube['elm']]
            hc[key]['feal'] = np.mean(hc_feal, axis=0)

        # Determine scale and/or separate scales for each feature set
        desired = 10 - global_landscape
        logging.info('Desired Landscape:\n%s', feal.tostring(desired))

        # Calculate the Euclidean distance to each hypercube's mean feal
        nn = {k: LA.norm(desired[5:] - v['feal'][5:]) for k, v in hc.items()}

        # Grab the top N neighbors (10 for now)
        neighbors = sorted(nn.items(), key=lambda x: x[1])[:10]
        logging.info('BestFit Landscape:\n%s', feal.tostring(hc[neighbors[0][0]]['feal']))

        ## DATA SAMPLER
        nn_keys = [i for i, w in neighbors]
        nn_wgts = np.array([w for i, w in neighbors])
        nn_wgts /= np.sum(nn_wgts)  # normalize

        coord_origin = []
        while numresources > 0:
            # First sampling is BIASED (by distance to the desired landscape)
            selected_hc = np.random.choice(nn_keys, p=nn_wgts)

            # Second is UNIFORM (within the HCube)
            index = np.random.choice(hc[selected_hc]['elm'])
            selected_index_list.append(index)
            src_state = np.argmax(feallist[index][:5])
            coord_origin.append(('sim', index, src_state, selected_hc))
            logging.info('Sampled Landscape [hc=%s]:\n%s',
                         selected_hc, feal.tostring(feallist[index]))
            numresources -= 1

    ######  EXPERIMENT #11:  KNAPSACK SAMPLER
    elif EXPERIMENT_NUMBER == 11:
        # Use only the rightmost 10 features (the non-normalized ones)
        inventory = np.array([f[10:] for f in feallist])
        desired = 10 - global_landscape
        logging.info('Desired Landscape (NOTE: only including A-B values):\n%s',
                     feal.tostring(desired))

        selected_index_list = mvkp.knapsack(desired[10:], inventory, numresources, 2000000)
        coord_origin = [('sim', index, np.argmax(feallist[index][:5]), 'D')
                        for index in selected_index_list]
        logging.info("KNAPSACK Completed:")
        logging.info('Target Distribution:\n%s', str(desired[10:]))
        logging.info('Selected Landscapes:\n%s',
                     '\n'.join(['%s' % feallist[i] for i in selected_index_list]))

    # Back-project to get new starting coords for each sample
    logging.debug("=======================  <INPUT PARAM GENERATION>  =================")
    logging.info('All indices sampled. Back-projecting to high-dimensional coords')
    sampled_set = []
    for i in selected_index_list:
        traj = self.backProjection([i])
        sampled_set.append(traj)
    bench.mark('Sampler')

    # Generate new starting positions
    runtime = self.data['runtime']
    jcqueue = OrderedDict()
    for i, start_traj in enumerate(sampled_set):
        jcID, params = generateNewJC(start_traj)

        # TODO: Update/check adaptive runtime, starting state
        jcConfig = dict(params,
                        name=jcID,
                        runtime=runtime,                 # In timesteps
                        dcdfreq=self.data['dcdfreq'],    # Frame save rate
                        interval=self.data['dcdfreq'] * self.data['sim_step_size'],
                        temp=310,
                        timestep=self.data['timestep'],
                        gc=1,
                        origin=coord_origin[i][0],
                        src_index=coord_origin[i][1],
                        src_bin=coord_origin[i][2],
                        src_hcube=coord_origin[i][3],
                        application=settings.APPL_LABEL)

        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in jcConfig.items():
            logging.debug("   %s:  %s", k, str(v))

        # Add to the output queue & save config info
        jcqueue[jcID] = jcConfig
        logging.info("New Job Candidate Completed:  %s   #%d on the Queue", jcID, len(jcqueue))

    bench.mark('GenInputParams')

    # POST-PROCESSING ---------------------------------------------------------
    logging.debug("============================  <POST-PROCESSING & OUTPUT>  =============================")
    self.wait_catalog()

    # Clear the current queue, mark previously queued jobs for GC, push the new queue
    qlen = self.catalog.llen('jcqueue')
    logging.debug('Current queue len: %s', str(qlen))
    if qlen > 0:
        curqueue = self.catalog.lrange('jcqueue', 0, -1)
        logging.info("Marking %d obsolete jobs for garbage collection", len(curqueue))
        for jc_key in curqueue:
            key = wrapKey('jc', jc_key)
            config = self.catalog.hgetall(key)
            config['gc'] = 0
            # Add gc jobs to the state to write back to the catalog (flags them for gc)
            self.addMut(key, config)
        self.catalog.delete('jcqueue')

    # CATALOG UPDATES
    self.catalog.rpush('datacount', len(feallist))

    # EXPR 7 Update:
    if 5 < EXPERIMENT_NUMBER < 10:
        # self.catalog.storeNPArray(np.array(centroid), 'subspace:covar:centroid:%d' % cov_iteration)
        self.catalog.rpush('subspace:covar:thruindex', len(covar_pts))

    # Update cache hit/miss stats
    hit = self.cache_hit
    miss = self.cache_miss
    logging.info('##CACHE_HIT_MISS  %d  %d  %.3f', hit, miss, hit / (hit + miss))
    self.catalog.rpush('cache:hit', self.cache_hit)
    self.catalog.rpush('cache:miss', self.cache_miss)

    self.data['jcqueue'] = list(jcqueue.keys())
    logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))

    # Update each new job with the latest convergence score and save to the
    # catalog (TODO: the save may not be necessary)
    logging.debug("Updated Job Queue length:  %d", len(self.data['jcqueue']))
    for jcid, config in jcqueue.items():
        # config['converge'] = self.data['converge']
        self.addMut(wrapKey('jc', jcid), config)

    bench.mark('PostProcessing')
    print('## TS=%d' % self.data['timestep'])
    bench.show()
    stat.show()
    return list(jcqueue.keys())
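# For illustration: the two-stage sampling pattern shared by experiments #5
# and #10 above, in isolation. Stage 1 draws a region (a transition bin or a
# KD-tree hypercube) with probability proportional to a bias weight such as
# bootstrap uncertainty; stage 2 draws uniformly within the chosen region.
# All names here are hypothetical sketches, not part of the project's API.
def _two_stage_sample(regions, weights, num_samples):
    """regions: dict mapping region key -> list of candidate frame indices
       weights: dict mapping region key -> non-negative bias weight"""
    keys = [k for k in regions if len(regions[k]) > 0]  # skip empty regions
    if not keys:
        raise ValueError('no candidates available in any region')
    pdf = np.array([weights[k] for k in keys], dtype=float)
    pdf /= pdf.sum()
    picks = []
    for _ in range(num_samples):
        key = keys[np.random.choice(len(keys), p=pdf)]  # stage 1: biased
        picks.append(np.random.choice(regions[key]))    # stage 2: uniform
    return picks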