def high_low_check(r, tbin='(0, 4)'):
  print('Pulling data...')
  obslist = r.lrange('label:rms', 0, -1)
  ob04 = [i for i, o in enumerate(obslist) if o == tbin]
  traj = backProjection(r, ob04)
  alpha = datareduce.filter_alpha(traj)
  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()
  print('KDTree2')
  Y = alpha.xyz.reshape(alpha.n_frames, 174)
  kdtree2 = KDTree(50, maxdepth=4, data=Y, method='median')
  hc2 = kdtree2.getleaves()
  # Cross-tabulate leaf membership between the two trees
  hc1k = sorted(hc1.keys())
  hc2k = sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]
  dd = np.zeros(shape=(len(s1), len(s2)))
  print('    ', '  '.join(hc1k))
  for i, a in enumerate(s1):
    print(' ' + hc1k[i], end=' ')
    for j, b in enumerate(s2):
      n = len(a & b)
      print('%4d' % n, end=' ')
      dd[i][j] = n
    print('\n', end=' ')
  return dd
def make_kdtree(feal_list):
  kdtree1 = KDTree(50, maxdepth=4, data=feal_list, method='median')
  hc1 = kdtree1.getleaves()
  for k, v in hc1.items():
    src_pts = []
    for i in v['elm']:
      a, b = tidx[i]
      src_pts.append(rms_val[a][b])
    print(k, np.mean(src_pts, axis=0))
def doTree(self, pc=4, ktype='rbf', leafsize=100, maxdepth=6, split='middle'):
  diag = np.array([np.diag(i) for i in self.covar])
  kpca = PCAKernel(pc, ktype)
  kpca.solve(diag)
  gdata = kpca.project(diag)
  # Create KDTree over reduced-dim subspace (4-PCs)
  gtree = KDTree(leafsize, maxdepth, gdata, split)  # Or Max Gap
  self.hc = gtree.getleaves()
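# A minimal, self-contained sketch of the same reduce-then-partition idea as
# doTree above, using scikit-learn's KernelPCA as a stand-in for the project's
# PCAKernel. The demo covariance matrices, component count, and kernel choice
# are assumptions for illustration; the in-tree KDTree(leafsize, maxdepth,
# gdata, split) call from doTree would then partition the projected points.
import numpy as np
from sklearn.decomposition import KernelPCA

def reduce_covar_diagonals(covar, n_pc=4, kernel='rbf'):
  # One feature vector per covariance matrix: its diagonal (per-dim variance)
  diag = np.array([np.diag(c) for c in covar])
  kpca = KernelPCA(n_components=n_pc, kernel=kernel)
  return kpca.fit_transform(diag)   # shape: (num_matrices, n_pc)

# Example: 20 random 58x58 covariance matrices (174 features above suggests
# 58 alpha carbons x 3 coordinates, but that is an assumption here)
rng = np.random.default_rng(0)
demo_covar = [np.cov(rng.normal(size=(58, 200))) for _ in range(20)]
gdata = reduce_covar_diagonals(demo_covar)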
def kpca_check(red_db, tbin='(0, 4)'):
  if isinstance(red_db, list):
    rlist = red_db
  else:
    rlist = [red_db]
  trajlist = []
  for r in rlist:
    print('Pulling data...')
    obslist = r.lrange('label:rms', 0, -1)
    idxlist = [i for i, o in enumerate(obslist) if o == tbin]
    traj = dh.backProjection(r, idxlist)
    alpha = datareduce.filter_alpha(traj)
    trajlist.append(alpha)
  deidx = lambda i: deidx_cutlist(i, [t.n_frames for t in trajlist])
  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()
  srcidx = [[i[0] for i in db.runquery("select idx from jc where bin='0_4' and expid=%d" % e)]
            for e in range(32, 36)]
  src_traj = [dh.backProjection(r, i) for r, i in zip(rlist, srcidx)]
  src_xyz = [datareduce.filter_alpha(t).xyz for t in src_traj]
  probe_res = [[kdtree1.project(i.reshape(174,)) for i in xyz] for xyz in src_xyz]
  grp_src = []
  for p, s in zip(probe_res, srcidx):
    grp = {}
    for h, i in zip(p, s):
      if h not in grp:
        grp[h] = []
      grp[h].append(i)
    grp_src.append(grp)
  idx_se_map = [{i: (s, e) for i, s, e in
                 db.runquery("select idx, start, end from jc where bin='0_4' and expid=%d" % eid)}
                for eid in range(32, 36)]
def execute(self, thru_index):
  """Executing the Controller Algorithm. Load pre-analyzed lower dimensional
  subspaces, process user query and identify the sampling space with
  corresponding distribution function for each user query. Calculate
  convergence rates, run sampler, and then execute fairness policy to
  distribute resources among users' sampled values.
  """
  logging.debug('CTL MT')

  # PRE-PROCESSING ---------------------------------------------------------
  logging.debug("============================  <PRE-PROCESS>  =============================")
  np.set_printoptions(precision=4, linewidth=150)

  self.data['timestep'] += 1
  logging.info('TIMESTEP: %d', self.data['timestep'])

  settings = systemsettings()
  bench = microbench('ctl_%s' % settings.name, self.seqNumFromID())
  stat = StatCollector('ctl_%s' % settings.name, self.seqNumFromID())

  # Connect to the cache
  self.cacheclient = CacheClient(settings.APPL_LABEL)

  # Create the "binlist" (all (A, B) label pairs):
  numLabels = self.data['numLabels']
  numresources = self.data['numresources']
  binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]

  # LOAD all new subspaces (?) and values
  ##### BARRIER
  self.wait_catalog()

  # Load new RMS Labels -- load all for now
  bench.start()
  logging.debug('Loading RMS Labels')
  start_index = max(0, self.data['ctlIndexHead'])
  # labeled_pts_rms = self.catalog.lrange('label:rms', self.data['ctlIndexHead'], thru_index)
  logging.debug(" Start_index=%d,  thru_index=%d,  ctlIndexHead=%d",
                start_index, thru_index, self.data['ctlIndexHead'])

  feallist = [np.fromstring(i) for i in self.catalog.lrange('subspace:feal', 0, -1)]
  num_pts = len(feallist)
  self.data['ctlIndexHead'] = thru_index
  thru_count = self.data['observe:count']

  logging.debug('##NUM_RMS_THIS_ROUND: %d', num_pts)
  stat.collect('numpts', len(feallist))

  # Calculate variable PDF estimations for each subspace via bootstrapping:
  logging.debug("=======================  <SUBSPACE CONVERGENCE>  =========================")

  # Bootstrap current sample for RMS
  logging.info("Feature Landscapes for %d points loaded. Calculating PDF.....", len(feallist))

  # Static for now
  blocksize = 5000
  mv_convergence = op.bootstrap_block(feallist, blocksize)
  global_landscape = np.mean(feallist, axis=0)
  stat.collect('convergence', mv_convergence)
  stat.collect('globalfeal', global_landscape)
  # logging.info('MV Convergence values:\nCONV,%s', ','.join(['%5.3f'%i for i in mv_convergence]))
  # logging.info('Global Feature Landscape:\nFEAL,%s', ','.join(['%5.3f'%i for i in global_landscape]))
  logging.info('MV Convergence values:\nCONV,%s', str(mv_convergence[-1]))
  logging.info('Global Feature Landscape:\n%s', feal.tostring(global_landscape))

  # IMPLEMENT USER QUERY with REWEIGHTING:
  logging.debug("=======================  <QUERY PROCESSING>  =========================")

  ##### BARRIER
  self.wait_catalog()
  selected_index_list = []

  # QUERY PROCESSING & SAMPLING BELOW to select indices.
  EXPERIMENT_NUMBER = self.experiment_number
  logging.info("RUNNING EXPER CONFIGURATION #%d", EXPERIMENT_NUMBER)

  ###### EXPERIMENT #5:  BIASED (Umbrella) SAMPLER
  if EXPERIMENT_NUMBER == 5:
    if self.catalog.exists('label:deshaw'):
      logging.info("Loading DEShaw historical points.... From Catalog")
      rmslabel = [eval(x) for x in self.catalog.lrange('label:deshaw', 0, -1)]
    else:
      logging.info("Loading DEShaw historical points.... From File (and recalculating)")
      rmslabel = deshaw.labelDEShaw_rmsd()

    deshaw_samples = {b: [] for b in binlist}
    for i, b in enumerate(rmslabel):
      deshaw_samples[b].append(i)

    coord_origin = []
    conv_vals = np.array([v for k, v in sorted(convergence_rms.items())])
    norm_pdf_conv = conv_vals / sum(conv_vals)
    logging.info("Umbrella Sampling PDF (Bootstrapping):")
    sampled_distro_perbin = {b: 0 for b in binlist}

    while numresources > 0:
      # First sampling is BIASED
      selected_bin = np.random.choice(len(binlist), p=norm_pdf_conv)
      A, B = binlist[selected_bin]
      sampled_distro_perbin[binlist[selected_bin]] += 1
      if bincounts[selected_bin] is not None and bincounts[selected_bin] > 0:
        # Secondary Sampling is Uniform
        sample_num = np.random.randint(bincounts[selected_bin])
        logging.debug('SAMPLER: selecting sample #%d from bin %s',
                      sample_num, str(binlist[selected_bin]))
        index = self.catalog.lindex('varbin:rms:%d_%d' % binlist[selected_bin], sample_num)
        selected_index_list.append(index)
        coord_origin.append(('sim', index, binlist[selected_bin], '%d-D' % A))
        numresources -= 1
      elif len(deshaw_samples[binlist[selected_bin]]) > 0:
        index = np.random.choice(deshaw_samples[binlist[selected_bin]])
        logging.debug('SAMPLER: selecting DEShaw frame #%d from bin %s',
                      index, str(binlist[selected_bin]))
        # Negation indicates an historical index number
        selected_index_list.append(-index)
        coord_origin.append(('deshaw', index, binlist[selected_bin], '%d-D' % A))
        numresources -= 1
      else:
        logging.info("NO Candidates for bin: %s", binlist[selected_bin])

  ###### EXPERIMENT #10:  MultiVariate Nearest Neighbor (MVNN) SAMPLER
  if EXPERIMENT_NUMBER == 10:
    # Create the KD Tree from all feature landscapes (ignore first 5 features)
    kd = KDTree(100, 15, np.array(feallist), 'median')
    # Collect hypercubes
    hc = kd.getleaves()

    logging.info('KD Tree Stats')
    logging.info('    # HCubes   : %5d', len(hc))
    logging.info('    Largest  HC: %5d', max([v['count'] for k, v in hc.items()]))
    logging.info('    Smallest HC: %5d', min([v['count'] for k, v in hc.items()]))

    for key, hcube in hc.items():
      hc_feal = [feallist[i] for i in hcube['elm']]
      hc[key]['feal'] = np.mean(hc_feal, axis=0)

    # Det scale and/or sep scales for each feature set
    desired = 10 - global_landscape
    logging.info('Desired Landscape:\n%s', feal.tostring(desired))

    # Calc euclidean dist to each mean HC's feal
    nn = {k: LA.norm(desired[5:] - v['feal'][5:]) for k, v in hc.items()}

    # Grab top N Neighbors (10 for now)
    neighbors = sorted(nn.items(), key=lambda x: x[1])[:10]
    logging.info('BestFit Landscape:\n%s', feal.tostring(hc[neighbors[0][0]]['feal']))

    ## DATA SAMPLER
    nn_keys = [i for i, w in neighbors]
    nn_wgts = np.array([w for i, w in neighbors])
    nn_wgts /= np.sum(nn_wgts)  # normalize

    coord_origin = []
    while numresources > 0:
      # First sampling is BIASED
      selected_hc = np.random.choice(nn_keys, p=nn_wgts)
      # Second is UNIFORM (within the HCube)
      index = np.random.choice(hc[selected_hc]['elm'])
      selected_index_list.append(index)
      src_state = np.argmax(feallist[index][:5])
      coord_origin.append(('sim', index, src_state, selected_hc))
      logging.info('Sampled Landscape [hc=%s]:\n%s', selected_hc, feal.tostring(feallist[index]))
      numresources -= 1

  elif EXPERIMENT_NUMBER == 11:
    # Use only right most 10 features (non-normalized ones)
    inventory = np.array([f[10:] for f in feallist])
    desired = 10 - global_landscape
    logging.info('Desired Landscape (NOTE Only Including A-B values):\n%s', feal.tostring(desired))

    selected_index_list = mvkp.knapsack(desired[10:], inventory, numresources, 2000000)
    coord_origin = [('sim', index, np.argmax(feallist[index][:5]), 'D')
                    for index in selected_index_list]
    logging.info("KNAPSACK Completed:")
    logging.info('Target Distribution:\n%s', str(desired[10:]))
    logging.info('Target Distribution:\n%s', '\n'.join(['%s' % feallist[i] for i in selected_index_list]))

  # Back Project to get new starting Coords for each sample
  logging.debug("=======================  <INPUT PARAM GENERATION>  =================")
  logging.info('All Indices sampled. Back projecting to high dim coords')
  sampled_set = []
  for i in selected_index_list:
    traj = self.backProjection([i])
    sampled_set.append(traj)
  bench.mark('Sampler')

  # Generate new starting positions
  runtime = self.data['runtime']
  jcqueue = OrderedDict()
  for i, start_traj in enumerate(sampled_set):
    jcID, params = generateNewJC(start_traj)

    # TODO: Update/check adaptive runtime, starting state
    jcConfig = dict(params,
        name=jcID,
        runtime=runtime,                  # In timesteps
        dcdfreq=self.data['dcdfreq'],     # Frame save rate
        interval=self.data['dcdfreq'] * self.data['sim_step_size'],
        temp=310,
        timestep=self.data['timestep'],
        gc=1,
        origin=coord_origin[i][0],
        src_index=coord_origin[i][1],
        src_bin=coord_origin[i][2],
        src_hcube=coord_origin[i][3],
        application=settings.APPL_LABEL)

    logging.info("New Simulation Job Created: %s", jcID)
    for k, v in jcConfig.items():
      logging.debug("   %s:  %s", k, str(v))

    # Add to the output queue & save config info
    jcqueue[jcID] = jcConfig
    logging.info("New Job Candidate Completed:  %s   #%d on the Queue", jcID, len(jcqueue))

  bench.mark('GenInputParams')

  #  POST-PROCESSING  -------------------------------------
  logging.debug("============================  <POST-PROCESSING & OUTPUT>  =============================")
  self.wait_catalog()

  # Clear current queue, mark previously queued jobs for GC, push new queue
  qlen = self.catalog.llen('jcqueue')
  logging.debug('Current queue len:   %s', str(qlen))
  if qlen > 0:
    curqueue = self.catalog.lrange('jcqueue', 0, -1)
    logging.info("Marking %d obsolete jobs for garbage collection", len(curqueue))
    for jc_key in curqueue:
      key = wrapKey('jc', jc_key)
      config = self.catalog.hgetall(key)
      config['gc'] = 0
      # Add gc jobs to the state to write back to catalog (flags it for gc)
      self.addMut(key, config)
    self.catalog.delete('jcqueue')

  #  CATALOG UPDATES
  self.catalog.rpush('datacount', len(feallist))

  #  EXPR 7 Update:
  if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
    # self.catalog.storeNPArray(np.array(centroid), 'subspace:covar:centroid:%d' % cov_iteration)
    self.catalog.rpush('subspace:covar:thruindex', len(covar_pts))

  # Update cache hit/miss
  hit = self.cache_hit
  miss = self.cache_miss
  logging.info('##CACHE_HIT_MISS  %d  %d  %.3f', hit, miss, hit / (hit + miss))
  self.catalog.rpush('cache:hit', self.cache_hit)
  self.catalog.rpush('cache:miss', self.cache_miss)

  self.data['jcqueue'] = list(jcqueue.keys())
  logging.debug("   JCQUEUE:  %s", str(self.data['jcqueue']))

  # Update each new job with latest convergence score and save to catalog (TODO: save may not be nec'y)
  logging.debug("Updated Job Queue length:  %d", len(self.data['jcqueue']))
  for jcid, config in jcqueue.items():
    # config['converge'] = self.data['converge']
    self.addMut(wrapKey('jc', jcid), config)

  bench.mark('PostProcessing')
  print('## TS=%d' % self.data['timestep'])
  bench.show()
  stat.show()

  return list(jcqueue.keys())
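# A minimal, self-contained sketch of the two-stage biased sampler used in
# EXPERIMENT #5 above: the first draw picks a bin weighted by its (bootstrap)
# convergence value, the second draw picks uniformly within that bin. The
# convergence numbers and bin contents below are made up for illustration only.
import numpy as np

def umbrella_sample(bin_conv, bin_members, num_draws, rng=None):
  rng = rng or np.random.default_rng()
  bins = sorted(bin_conv.keys())
  pdf = np.array([bin_conv[b] for b in bins], dtype=float)
  pdf /= pdf.sum()                          # normalize convergence values -> PDF
  picks = []
  while len(picks) < num_draws:
    b = bins[rng.choice(len(bins), p=pdf)]  # biased draw over bins
    if bin_members[b]:
      picks.append((b, rng.choice(bin_members[b])))  # uniform draw within the bin
  return picks

demo_conv = {(0, 1): 0.8, (0, 4): 0.15, (2, 2): 0.05}
demo_members = {(0, 1): [3, 9, 21], (0, 4): [5, 7], (2, 2): [11]}
print(umbrella_sample(demo_conv, demo_members, 4))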
class ExprAnl:
  def __init__(self, host='localhost', port=6379, adaptive_cent=False):
    self.r = redis.StrictRedis(port=port, host=host, decode_responses=True)
    # self.rms = [np.fromstring(i) for i in self.r.lrange('subspace:rms', 0, -1)]
    self.seq = ['jc_' + x[0] for x in
                sorted(self.r.hgetall('anl_sequence').items(), key=lambda x: int(x[1]))]
    self.conf = [self.r.hgetall(i) for i in self.seq]
    self.trlist = {}
    self.wells = [[] for i in range(5)]
    self.rmsd_c = {}
    self.rmsd_cw = {}
    self.rmsd_ds = {}
    self.rmsd_dsw = {}
    self.feal_list = None
    self.checked_rmsd = []
    self.cent_ds = np.load('../bpti-alpha-dist-centroid.npy')
    # self.cent_c = np.load('../data/gen-alpha-cartesian-centroid.npy')
    self.cent_c = np.load('../data/init-centroids.npy')
    self.cw = [.92, .92, .96, .99, .99]
    if adaptive_cent:
      pass

  def loadtraj(self, tr, first=None):
    if isinstance(tr, list):
      trlist = tr
    else:
      trlist = [tr]
    for t in trlist:
      traj = md.load(self.conf[t]['dcd'], top=self.conf[t]['pdb'])
      # traj.center_coordinates()
      if first is not None:
        traj = traj.slice(np.arange(first))
      alpha = datareduce.filter_alpha(traj)
      # alpha.superpose(deshaw.topo_alpha)
      self.trlist[t] = alpha

  def ld_wells(self):
    for x, i in enumerate(self.conf):
      if i['origin'] == 'deshaw':
        A, B = eval(i['src_bin'])
        if A == B:
          traj = md.load(self.conf[A]['dcd'], top=self.conf[A]['pdb'])
          traj.center_coordinates()
          alpha = dr.filter_alpha(traj)
          maxf = min(1000, alpha.n_frames)
          for frame in alpha.xyz[:maxf]:
            self.wells[A].append(frame)

  def load(self, num, first=None):
    for i in range(num):
      _ = self.rms(i, first=first)

  def rms(self, trnum, noise=False, force=False, first=None):
    if trnum not in self.trlist.keys():
      self.loadtraj(trnum, first)
    if trnum not in self.rmsd_c.keys() or force:
      if noise:
        # With Noise Filter
        noise = int(self.r.get('obs_noise'))
        dcdfreq = int(self.r.get('dcdfreq'))
        stepsize = int(self.r.get('sim_step_size'))
        nwidth = noise // (2 * stepsize)
        noisefilt = lambda x, i: np.mean(x[max(0, i - nwidth):min(i + nwidth, len(x))], axis=0)
        source_pts = np.array([noisefilt(self.trlist[trnum].xyz, i)
                               for i in range(self.trlist[trnum].n_frames)])
      else:
        source_pts = self.trlist[trnum].xyz
      # self.rmsd_c[trnum] = [[LA.norm(self.cent_c[i]-pt) for i in range(5)] for pt in source_pts.xyz]
      self.rmsd_cw[trnum] = [[self.cw[i] * LA.norm(self.cent_c[i] - pt) for i in range(5)]
                             for pt in source_pts]
      # ds = dr.distance_space(source_pts)
      # self.rmsd_ds[trnum] = [[LA.norm(self.cent_ds[i]-pt) for i in range(5)] for pt in ds]
      # self.rmsd_dsw[trnum] = [[self.cw[i]*LA.norm(self.cent_ds[i]-pt) for i in range(5)] for pt in ds]
    return self.rmsd_cw[trnum]

  def feal(self, trnum, winsize=None, var=False):
    feal_list = []
    var_list = []
    if trnum not in self.rmsd.keys():
      _ = self.rms(trnum)
    N = len(self.rmsd[trnum])
    wsize = N if winsize is None else winsize
    for idx in range(0, N, wsize):
      window = self.rmsd[trnum][idx:min(N, idx + wsize)]
      if var:
        f, v = self.feature_temporal(window, var=True)
        var_list.append(v)
      else:
        f = self.feature_temporal(window)
      feal_list.append(f)
    if var:
      return np.array(feal_list), np.array(var_list)
    return np.array(feal_list)

  @classmethod
  def feal_atemp(cls, rms, scaleto=10):
    """Atemporal (individual frame) feature landscape"""
    log_reld = op.makeLogisticFunc(scaleto, -3, 0)

    # Counts (feature 0..4)
    fealand = [0 for i in range(5)]
    fealand[np.argmin(rms)] = scaleto
    tup = []

    # Proximity (feature 5..9)
    # Normalized and adjusted to smooth implicit water
    A, B = np.argsort(rms)[:2]
    prox = op.makeLogisticFunc(scaleto, scaleto, (rms[B] + rms[A]) / 2)
    for d in rms:
      tup.append(prox(d))

    # Relative Distance (akin to LLE) (feature 10..19)
    for a in range(4):
      for b in range(a + 1, 5):
        rel_dist = rms[a] - rms[b]
        tup.append(log_reld(rel_dist))
    fealand.extend(tup)

    # Additional Feature Spaces Would go here
    return np.array(fealand)   # Tuple or NDArray?

  def feature_temporal(self, window, var=False, scaleto=10):
    """FEATURE LANDSCAPE Calculation for traj of data pts"""
    landscape = [self.feal_atemp(rms) for rms in window]
    meanfeal = np.mean(landscape, axis=0)
    if var:
      variance = [0 for i in range(5)]
      variance.extend(np.std([tup[5:] for tup in landscape], axis=0))
      return np.array(meanfeal), np.array(variance)
    else:
      return np.array(meanfeal)

  def all_feal(self, force=False, method='cw'):
    # if method == 'c':
    #   rmsd = self.rmsd_c
    # elif method == 'cw':
    #   rmsd = self.rmsd_cw
    # elif method == 'ds':
    #   rmsd = self.rmsd_ds
    # else:
    #   rmsd = self.rmsd_dsw
    rmsd = self.rmsd_cw
    if self.feal_list is None or force:
      self.feal_list = op.flatten([[self.feal_atemp(i) for i in rmsd[tr]] for tr in rmsd.keys()])
    return self.feal_list

  def feal_global(self):
    flist = self.all_feal()
    return np.mean(flist, axis=0)

  def bootstrap(self, size):
    feal = self.all_feal()
    i = 0
    boot = []
    while i + size < len(feal):
      print(i)
      boot.append(op.bootstrap_block(feal[:i + size], size))
      i += size
    return boot

  def draw_feal(self, trnum=None, norm=10):
    if trnum is None:
      flist = self.all_feal()
      agg = np.mean(flist, axis=0)
      P.feadist(agg, 'feal_global_%s' % self.r.get('name'), norm=norm)
    else:
      flist = [self.feal_atemp(i, scaleto=norm) for i in self.rmsd[trnum]]
      agg = np.mean(flist, axis=0)
      P.feadist(agg, 'feal_global_%s_%d' % (self.r.get('name'), trnum), norm=norm)

  def kdtree(self, leafsize, depth, method):
    self.index = []
    allpts = []
    # Recalc for indexing
    flist = [[self.feal_atemp(i) for i in self.rmsd[tr]] for tr in self.rmsd.keys()]
    for trnum, f in enumerate(flist):
      for i, tup in enumerate(f):
        allpts.append(tup[5:])
        self.index.append((trnum, i))
    self.kd = KDTree(leafsize, depth, np.array(allpts), method)
    self.hc = self.kd.getleaves()

  def hcmean(self, hckey=None):
    if hckey is None:
      flist = [[self.feal_atemp(i) for i in self.rmsd[tr]] for tr in self.rmsd.keys()]
      result = {}
      for k, v in self.hc.items():
        hc_feal = []
        for idx in v['elm']:
          trnum, frame = self.index[int(idx)]
          hc_feal.append(flist[trnum][frame])
        result[k] = np.mean(hc_feal, axis=0)
      return result
    else:
      flist = []
      for idx in self.hc[hckey]['elm']:
        trnum, frame = self.index[idx]
        flist.append(self.feal_list[trnum][frame])
      return np.mean(flist, axis=0)

  #### FEATURE LANDSCAPE Temporal Data Analysis/Study
  def make_covar(self, win, slide):
    covar = []
    WIN_SIZE_NS = win
    SLIDE_AMT_NS = slide
    for i, tr in self.trlist.items():
      if i % 100 == 0:
        print(i)
      cov = datareduce.calc_covar(tr.xyz, WIN_SIZE_NS, 1, slide=SLIDE_AMT_NS)
      covar.extend(cov)
    return covar
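# A minimal, self-contained sketch of the atemporal feature landscape laid out
# in feal_atemp above: 5 "count" features (winner-take-all on the nearest
# centroid), 5 proximity features, and 10 pairwise relative-distance features,
# 20 values in total. The logistic() helper and its steepness values are
# stand-in assumptions for op.makeLogisticFunc; only the feature layout is
# taken from the method above.
import numpy as np

def logistic(scaleto, steepness, midpoint):
  return lambda x: scaleto / (1.0 + np.exp(-steepness * (x - midpoint)))

def feal_sketch(rms, scaleto=10):
  counts = np.zeros(5)
  counts[np.argmin(rms)] = scaleto            # features 0..4: nearest state wins
  A, B = np.argsort(rms)[:2]
  prox = logistic(scaleto, -1.0, (rms[A] + rms[B]) / 2)
  proximity = [prox(d) for d in rms]          # features 5..9: closeness to each state
  reld = logistic(scaleto, -3.0, 0.0)
  relative = [reld(rms[a] - rms[b])           # features 10..19: pairwise A-vs-B distance
              for a in range(4) for b in range(a + 1, 5)]
  return np.concatenate([counts, proximity, relative])

print(feal_sketch(np.array([0.4, 0.9, 1.3, 1.1, 0.7])))   # 20-element vector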
  for o in obs:
    pd[o] += 1
  for s, v in pd.items():
    hc_pdf[hc][s].append(v)

  mn04 = {hc: {s: np.mean(i) for s, i in x.items()} for hc, x in hc_pdf.items()}
  er04 = {hc: {s: np.std(i) for s, i in x.items()} for hc, x in hc_pdf.items()}
  mn04_n = {e: {k: v / np.sum(list(l.values())) for k, v in l.items()} for e, l in mn04.items()}
  er04_n = {e: {k: v / np.sum(list(l.values())) for k, v in l.items()} for e, l in er04.items()}

  print('KDTree2')
  Y = alpha.xyz.reshape(alpha.n_frames, 174)
  kdtree2 = KDTree(50, maxdepth=4, data=Y, method='median')
  hc2 = kdtree2.getleaves()

  hc1k = sorted(hc1.keys())
  hc2k = sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]
  dd = np.zeros(shape=(len(s1), len(s2)))
  print('    ', '  '.join(hc1k))
  for i, a in enumerate(s1):
    print(' ' + hc1k[i], end=' ')
    for j, b in enumerate(s2):
      n = len(a & b)
      print('%4d' % n, end=' ')
      dd[i][j] = n
    print('\n', end=' ')
  return dd
def execute(self):
  """Special execute function for the reweight operator -- check/validate."""
  # PRE-PROCESSING ---------------------------------------------------------
  logging.debug("============================  <PRE-PROCESS>  =============================")
  self.cacheclient = CacheClient(self.name)
  numLabels = 5
  binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
  labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
  num_pts = len(labeled_pts_rms)
  logging.debug('##NUM_OBS: %d', num_pts)

  # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
  TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4), (0, 2), (0, 1)]
  MAX_SAMPLE_SIZE   = 100   # Max # of cov traj to back project per HCube
  MAX_PT_PER_MATRIX = 100   # Num points to sample from each cov traj
  COVAR_SIZE        = 200   # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
  MAX_HCUBE         = 6     # Max Num HCubes to process

  # IMPLEMENT USER QUERY with REWEIGHTING:
  logging.debug("=======================  <QUERY PROCESSING>  =========================")
  #  1. RUN KPCA on <<?????>> (sample set) and project all pts
  #  2. Calculate K-D Tree on above
  #  3. Score each point with distance to centroid
  #  4. B = Select the smallest half of clusters
  #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
  #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
  #       ALT-> use HCube size as its weight
  #  7. A = HCubes for states 3 (and 4)
  #  8. Reweight A into both state 3 and state 4 (B) HCubes
  #  9. ID Overlap
  # 10. Apply Gamma Function

  logging.info("===== Covariance Matrix PCA-KMeans Calculation (B)")
  logging.info("Retrieving All Covariance Vectors")
  home = os.getenv('HOME')
  cfile = home + '/work/DEBUG_COVAR_PTS'
  DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
  if DO_COVAR:
    if os.path.exists(cfile + '.npy'):
      covar_pts = np.load(cfile + '.npy')
      logging.debug('Loaded From File')
    else:
      covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
      covar_pts = np.array([np.fromstring(x) for x in covar_raw])
      np.save(cfile, covar_pts)
      logging.debug('Loaded From Catalog & Saved')

  covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
  logging.debug('Indices Loaded. Retrieving File Indices')
  covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

  if DO_COVAR:
    logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
    logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

    # FOR incremental PCA:
    NUM_PC = 6
    ipca_key = 'subspace:covar:ipca'
    ipca = PCAnalyzer.load(self.catalog, ipca_key)
    if ipca is None:
      logging.info('Creating a NEW IPCA')
      ipca = PCAIncremental(NUM_PC)
      lastindex = 0
    else:
      lastindex = ipca.trainsize
      logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                   ipca.trainsize, len(covar_pts) - ipca.trainsize)

    # For incremental, partial solve using only newer pts (from the last "trainsize")
    if len(covar_pts) - lastindex > 0:
      ipca.solve(covar_pts[lastindex:])
      logging.info("Incremental PCA Updated. Storing Now...")

      ####  BARRIER
      self.wait_catalog()
      ipca.store(self.catalog, ipca_key)

    logging.info("IPCA Saved. Projecting Covariance to PC")

  cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
  if os.path.exists(cfile + '.npy'):
    subspace_covar_pts = np.load(cfile + '.npy')
  else:
    subspace_covar_pts = ipca.project(covar_pts)
    np.save(cfile, subspace_covar_pts)

  # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
  logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
  global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')

  if MAX_HCUBE <= 0:
    hcube_global = global_kdtree.getleaves()
  else:
    # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
    hcube_global_ALL = global_kdtree.getleaves()
    hcube_global = {}
    num = 0
    for k, v in hcube_global_ALL.items():
      hcube_global[k] = v
      num += 1
      if num == MAX_HCUBE:
        break

  # hcube_global = global_kdtree.getleaves()
  logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)')
  for k in sorted(hcube_global.keys()):
    v = hcube_global[k]
    logging.info('%-10s  %6d  %8.1f  %6.1f', k, v['count'], v['volume'], v['density'])

  if self.filelog:
    keys = hcube_global.keys()
    self.filelog.info('global,keys,%s', ','.join(keys))
    self.filelog.info('global,count,%s', ','.join([str(hcube_global[k]['count']) for k in keys]))
    self.filelog.info('global,volume,%s', ','.join([str(hcube_global[k]['volume']) for k in keys]))
    self.filelog.info('global,density,%s', ','.join([str(hcube_global[k]['density']) for k in keys]))

  logging.info("===== SELECT Sampling of points from each Global HCube (B)")
  s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
  hcube_global = {x[0]: x[1] for x in s}

  counter = 0
  for key in hcube_global.keys():
    counter += 1
    if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
      cov_index = hcube_global[key]['elm']
      hcube_global[key]['samplefactor'] = 1
    else:
      cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
      hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
    hcube_global[key]['idxlist'] = []
    for cov in cov_index:
      selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
      hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
    logging.info('Back Projecting Global HCube `%s` (%d out of %d)',
                 key, counter, len(hcube_global.keys()))
    source_cov = self.backProjection(hcube_global[key]['idxlist'])
    hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
    logging.debug('Back Projected %d points to HD space: %s',
                  len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))

  # logging.info('Calculating all HD Distances')
  # dist_hd = {}
  # dist_ld = {}
  # for key in hcube_global.keys():
  #   T = hcube_global[key]['alpha'].xyz
  #   N = len(T)
  #   dist_hd[key] = np.zeros(shape=(N, N))
  #   dist_ld[key] = {}
  #   for A in range(0, N):
  #     dist_hd[key][A][A] = 0
  #     for B in range(A+1, N):
  #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

  # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
  reservoir = ReservoirSample('rms', self.catalog)

  logging.info("===== BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Observations) ")
  hcube_list = {}

  logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
  hcube_local = {}

  logging.info("=======================================================")
  logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
  logging.info("=======================================================\n")

  overlap_hcube = {k: {} for k in hcube_global.keys()}

  projection_map = {}
  pt_projection_list = []
  for key in sorted(hcube_global.keys()):
    for i in range(len(hcube_global[key]['alpha'].xyz)):
      pt_projection_list.append([])

  for bin_idx, tbin in enumerate(TEST_TBIN):
    logging.info("Project Global HCubes into local subspace for %s", str(tbin))
    # Load Vectors
    logging.info('Loading subspace and kernel for bin %s', str(tbin))

    # LOAD KPCA Kernel matrix
    kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
    kpca = PCAnalyzer.load(self.catalog, kpca_key)

    data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
    data = np.array([np.fromstring(x) for x in data_raw])
    if len(data) == 0:
      logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
      continue

    logging.info('Building KDtree over local %s bin from observations matrix of size: %s',
                 str(tbin), str(data.shape))
    kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
    hcube_local[tbin] = kdtree.getleaves()
    logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
    for k in sorted(hcube_local[tbin].keys()):
      logging.info('    `%-9s`   #pts:%6d   density:%9.1f',
                   k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

    if self.filelog:
      keys = hcube_local[tbin].keys()
      A, B = tbin
      self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
      self.filelog.info('local,%d_%d,count,%s', A, B,
                        ','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
      self.filelog.info('local,%d_%d,volume,%s', A, B,
                        ','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
      self.filelog.info('local,%d_%d,density,%s', A, B,
                        ','.join([str(hcube_local[tbin][k]['density']) for k in keys]))

    n_total = 0
    logging.debug('Global Hcubes to Project (%d): %s',
                  len(hcube_global.keys()), str(hcube_global.keys()))
    projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}

    pnum = 0
    for key in sorted(hcube_global.keys()):
      overlap_hcube[key][tbin] = {}
      cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

      logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s',
                    key, len(cov_proj_pca), str(tbin))
      for i, pt in enumerate(cov_proj_pca):
        hcube = kdtree.probe(pt, probedepth=9)
        # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
        if hcube not in overlap_hcube[key][tbin]:
          overlap_hcube[key][tbin][hcube] = {
              'idxlist': hcube_local[tbin][hcube]['elm'],
              'wgt': hcube_local[tbin][hcube]['density'],
              'num_projected': 0}
        overlap_hcube[key][tbin][hcube]['num_projected'] += 1

        # Index this point in corresponding local HCube projection view
        projection_map[bin_idx][hcube].add(pnum)

        pt_projection_list[pnum].append(hcube)
        pnum += 1

      for k, v in sorted(overlap_hcube[key][tbin].items()):
        logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])

      # logging.info('Calculating Lower Dimensional Distances')
      # N = len(cov_proj_pca)
      # dist_ld[key][tbin] = np.zeros(shape=(N, N))
      # for A in range(0, N):
      #   for B in range(A+1, N):
      #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

  # Re-Index projected points -- could make this a list too
  next_index = 0
  view_list = []
  for bin_idx, hcube_map in projection_map.items():
    hcube_list = []
    for hcube_key, pt_list in hcube_map.items():
      hcube_list.append((set((hcube_key,)), set(pt_list)))
    view_list.append((set((bin_idx,)), hcube_list))

  print("CALLING: Collapse Join")
  joined_subspaces = collapse_join(projection_map.keys(), view_list)
  for subspace_list, correlated_hcubes in joined_subspaces:
    tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
    for hcube_list, pt_list in correlated_hcubes:
      print(tbin_list, hcube_list, pt_list)
      # TODO: Correlate Back to Global
  print('Visualize HERE')

  # for idx, tbin in enumerate(TEST_TBIN):
  #   # Only process substates with data
  #   if tbin not in hcube_local:
  #     logging.warning('Local KD Tree not created for %s', str(tbin))
  #     continue
  #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
  # for n, proj in enumerate(pt_projection_list):
  #   for i, tbin in enumerate(proj_bin_list):
  #     sets[tbin][proj[i]].add(n)
  #   if self.filelog:
  #     self.filelog.info('%d,%s', n, ','.join(proj))
  #   logging.info('%d,%s', n, ','.join(proj))

  # sets = {}
  # proj_bin_list = []
  # for tbin in TEST_TBIN:
  #   if tbin not in hcube_local:
  #     continue
  #   proj_bin_list.append(tbin)
  #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
  # for n, proj in enumerate(pt_projection_list):
  #   for i, tbin in enumerate(proj_bin_list):
  #     sets[tbin][proj[i]].add(n)
  #   if self.filelog:
  #     self.filelog.info('%d,%s', n, ','.join(proj))
  #   logging.info('%d,%s', n, ','.join(proj))

  # set_list = {}
  # for tbin, view in sets.items():
  #   set_list[(tbin,)] = []
  #   for hcube, idxlist in view.items():
  #     print(tbin, hcube, idxlist)
  #     set_list[(tbin,)].append((set((hcube,)), idxlist))

  # def collapse(C):
  #   a = 0
  #   b = 0
  #   N = []
  #   while a < len(C) and b < len(C):
  #     A = sorted(C[a])
  #     B = sorted(C[b])
  #     if A == B:
  #       b += 1
  #     elif A[0] == B[0]:
  #       N.append(set(A)|set(B))
  #       b += 1
  #     else:
  #       a += 1
  #   if len(N) <= 1:
  #     return []
  #   else:
  #     return N + collapse(N)

  # q = collapse(t1)
  # for i in q: print(sorted(i))

  # print('Checking all 2-Way Joins')
  # join2 = {}
  # for a in range(0, len(proj_bin_list)-1):
  #   tA = proj_bin_list[a]
  #   for b in range(a+1, len(proj_bin_list)):
  #     tB = proj_bin_list[b]
  #     join_ss = tuple(set((tA, tB)))
  #     set_list = []
  #     for kA, vA in sets[tA].items():
  #       for kB, vB in sets[tB].items():
  #         join_hc = set((kA, kB))
  #         inter = vA & vB
  #         if len(inter) > 0:
  #           set_list.append((join_hc, inter))
  #     if len(set_list) > 0:
  #       join2[join_ss] = set_list
  # print('2-Way Join Results:')
  # for ss, set_list in join2.items():
  #   for hc, idxlist in set_list:
  #     print(ss, hc, idxlist)

  # print('Checking all 3-Way Joins')
  # join3 = []
  # checked = []
  # for a in range(0, len(join2)-1):
  #   sA, hA, vA = join2[a]
  #   for b in range(a+1, len(join2)):
  #     sB, hB, vB = join2[b]
  #     if sA == sB:
  #       continue
  #     ss, hc = sA | sB, hA | hB
  #     if (ss, hc) in checked[-10:]:
  #       continue
  #     checked.append((ss, hc))
  #     inter = vA & vB
  #     if len(inter) > 0:
  #       join3.append((ss, hc, inter))

  # print('Checking all 4-Way Joins')
  # join4 = []
  # checked = []
  # for a in range(0, len(join3)-1):
  #   sA, hA, vA = join3[a]
  #   for b in range(a+1, len(join3)):
  #     sB, hB, vB = join3[b]
  #     if sA == sB:
  #       continue
  #     ss, hc = sA | sB, hA | hB
  #     if (ss, hc) in checked[-10:]:
  #       continue
  #     checked.append((ss, hc))
  #     inter = vA & vB
  #     if len(inter) > 0:
  #       join4.append((ss, hc, inter))

  # if self.filelog:
  #   for i in join2:
  #     self.filelog.info('%s', str(i))
  #   for i in join3:
  #     self.filelog.info('%s', str(i))
  #   for i in join4:
  #     self.filelog.info('%s', str(i))

  DO_MIN_CHECK = False
  if DO_MIN_CHECK:
    def maxcount(x):
      y = {}
      for i in x:
        y[i] = 1 if i not in y else y[i] + 1
      return max(y.values())

    print('%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces')
    argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf) for i in x])
    for key in hcube_global.keys():
      # logging.info('Showing MIN / MAX for points from HCube %s:', key)
      minA = {}
      maxA = {}
      for n in range(len(dist_hd[key])):
        minA[n] = []
        maxA[n] = []
        for tbin in TEST_TBIN:
          if tbin not in dist_ld[key].keys():
            continue
            minA[n].append(0)
            maxA[n].append(0)
          else:
            minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
      numsame = np.zeros(len(dist_ld[key].keys()) + 1)
      for n in range(len(dist_hd[key][n])):
        minH = argmin_nonzero(dist_hd[key][n])
        maxH = np.argmax(dist_hd[key][n])
        minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
        numsamepair = maxcount(minA[n])
        numsame[numsamepair] += 1
        # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), ' [%d]'%numsamepair)
      print(' '.join(['%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))]))
    print('Stopping HERE!')
    sys.exit(0)

  # GAMMA FUNCTION EXPR # 8
  gamma1 = lambda a, b: (a * b)
  gamma2 = lambda a, b: (a + b) / 2

  # TODO: Factor in RMS weight
  for tbin in TEST_TBIN:  # for tbin in sorted(bin_list):
    logging.info('')
    logging.info('BIPARTITE GRAPH for %s', str(tbin))
    bipart = {}
    edgelist = []
    for hcB in hcube_global.keys():
      num_B = hcube_global[hcB]['count']
      wgt1_B = hcube_global[hcB]['density']
      if tbin not in overlap_hcube[hcB]:
        continue
      for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
        edge = {}
        if hcA not in bipart:
          bipart[hcA] = []
        num_proj = hcA_data['num_projected']
        wgt_A = hcA_data['wgt']
        wgt2_B = wgt1_B * num_proj
        edge['combW1'] = gamma1(wgt_A, wgt1_B)
        edge['combW2'] = gamma1(wgt_A, wgt2_B)
        edge['combW3'] = gamma2(wgt_A, wgt1_B)
        edge['combW4'] = gamma2(wgt_A, wgt2_B)
        edge['num_A'] = len(hcA_data['idxlist'])
        edge['num_B'] = num_B
        edge['num_proj'] = num_proj
        edge['wgt_A'] = wgt_A
        edge['wgt1_B'] = wgt1_B
        edge['wgt2_B'] = wgt2_B
        edge['hcA'] = hcA
        edge['hcB'] = hcB
        bipart[hcA].append(edge)
        edgelist.append((hcA, hcB, num_proj))
    if len(bipart) == 0:
      logging.info("NO DATA FOR %s", str(tbin))
      continue
    logging.info('')
    logging.info('A (# Pts) H-Cube <--- B H-Cube (# proj/total Pts)  wgt_A  wB1:density  wB2:Mass  A*B1  A*B2  AVG(A,B1)  AVG(A,B2)')
    for k, v in bipart.items():
      for edge in v:
        logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s` (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
        if self.filelog:
          A, B = tbin
          self.filelog.info('edge,%d_%d,%s,%s,%d', A, B, edge['hcA'], edge['hcB'], edge['num_proj'])

    # Prepare nodes for graph
    nA = set()
    nB = set()
    elist = []
    for e in edgelist:
      a, b, z = e
      if z <= 5:
        continue
      nA.add(a)
      nB.add(b)
      elist.append((a, b, z))
    nAKeys = sorted(nA)[::-1]
    nBKeys = sorted(nB)[::-1]
    sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
    sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
    idxA = {key: i for i, key in enumerate(nAKeys)}
    idxB = {key: i for i, key in enumerate(nBKeys)}
    edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
    G.bipartite(sizesA, sizesB, edges, sizesA, sizesB, 'bipartite_%d_%d' % tbin)

  logging.info('STOPPING HERE!!!!')
  sys.exit(0)
  return []
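# A minimal sketch of the two edge-weight combiners used for the bipartite
# reweighting above: gamma1 multiplies the local HCube weight by the global
# HCube weight (optionally scaled by projection mass), gamma2 averages them.
# The density and projection numbers below are made up for illustration only.
gamma1 = lambda a, b: a * b
gamma2 = lambda a, b: (a + b) / 2

wgt_A, wgt1_B, num_proj = 12.5, 3.0, 40   # local density, global density, projected pts
wgt2_B = wgt1_B * num_proj                # "mass" variant of the global weight
print(gamma1(wgt_A, wgt1_B), gamma1(wgt_A, wgt2_B),
      gamma2(wgt_A, wgt1_B), gamma2(wgt_A, wgt2_B))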