def reproj_distro():
  local = get_local()
  data = get_edges()
  r2 = redis.StrictRedis(port=6385, decode_responses=True)
  for tbin in [(2, 4), (2, 2), (4, 4), (4, 2), (4, 1), (3, 1)]:
    print('Processing:', tbin)
    tkey = '%d_%d' % tbin

    # Get Kernel
    kpca_key = 'subspace:pca:kernel:' + tkey
    kpca = PCAnalyzer.load(r2, kpca_key)

    # Get Training Data
    data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
    pca_pts = np.array([np.fromstring(x) for x in data_raw])
    kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')

    # Project the biased points and probe each into the local KD-Tree
    # NOTE: `alpha` (the alpha-carbon trajectory) is assumed to be defined elsewhere in this module
    proj_pts = kpca.project(alpha.xyz)
    biased_hcubes = []
    for i, pt in enumerate(proj_pts):
      biased_hcubes.append(kdtree.probe(pt, probedepth=9))

    if len(data) == 0:
      print('No Raw PCA data points for bin %s.... Going to next bin' % str(tbin))
      continue

    # Histogram the projected points by the HCube (KD-Tree leaf) they land in
    counts = {}
    for i in biased_hcubes:
      if i not in counts:
        counts[i] = 0
      counts[i] += 1
    for i in local[tkey]['keys']:
      if i not in counts:
        counts[i] = 0
    print('check')
    cvect = [counts[i] for i in local[tkey]['keys']]
    d = np.array(cvect) / sum(cvect)

    c = np.array(data[tkey])
    lcnt = np.sum(c, axis=0)
    gcnt = np.sum(c, axis=1)
    norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])

    # Add biased data as a col
    kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
    kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
    arr = np.vstack((norm, kpca_cnt_norm, d)).T
    rowlist = tuple(gcnt) + ('localKPCA', 'biased',)
    P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
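# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the counting
# pattern used above in reproj_distro(). Given the HCube keys returned by
# kdtree.probe() for each projected point, build a normalized distribution
# over a fixed, ordered key list. `probe_results` and `hcube_keys` are
# hypothetical stand-ins for biased_hcubes and local[tkey]['keys'].
import numpy as np

def hcube_distribution(probe_results, hcube_keys):
  """Return the fraction of probed points landing in each HCube key."""
  counts = {k: 0 for k in hcube_keys}
  for k in probe_results:
    counts[k] = counts.get(k, 0) + 1
  cvect = np.array([counts[k] for k in hcube_keys], dtype=float)
  total = cvect.sum()
  return cvect / total if total > 0 else cvect

# Example: three probed points falling into two of three known leaves
#   hcube_distribution(['0101', '0101', '0110'], ['0101', '0110', '0111'])
#   -> array([0.667, 0.333, 0.])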
def execute(self, job):
  # PRE-PROCESS ----------------------------------------------------------
  settings = systemsettings()
  bench = microbench('sim_%s' % settings.name, self.seqNumFromID())
  bench.start()
  stat = StatCollector('sim_%s' % settings.name, self.seqNumFromID())
  mylogical_seqnum = str(self.seqNumFromID())

  # Prepare working directory, input/output files
  conFile = os.path.join(job['workdir'], job['name'] + '.conf')
  logFile = conFile.replace('conf', 'log')    # log in same place as config file
  dcdFile = conFile.replace('conf', 'dcd')    # dcd in same place as config file
  USE_SHM = True
  ADAPTIVE_CENTROID = False

  SIMULATE_RATIO = settings.SIMULATE_RATIO
  if SIMULATE_RATIO > 1:
    logging.warning(" USING SIMULATION RATIO OF %d -- This is ONLY for debugging", SIMULATE_RATIO)
  frame_size = (SIMULATE_RATIO * int(job['interval'])) / (1000)
  logging.info('Frame Size is %f Using Sim Ratio of 1:%d', frame_size, SIMULATE_RATIO)

  EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
  logging.info('Running Experiment Configuration #%d', EXPERIMENT_NUMBER)

  # TODO: FOR LINEAGE
  # srcA, srcB = eval(job['src_bin'])
  # stat.collect('src_bin', [str(srcA), str(srcB)])

  traj = None

  # EXECUTE SIMULATION ---------------------------------------------------------
  if self.skip_simulation:
    logging.info('1. SKIPPING SIMULATION.....')
    USE_SHM = False
    job['dcd'] = dcdFile
    key = wrapKey('jc', job['name'])
    self.data[key]['dcd'] = dcdFile

  else:
    logging.info('1. Run Simulation')

    # Prepare & source to config file
    with open(self.data['sim_conf_template'], 'r') as template:
      source = template.read()

    # >>>> Storing DCD into shared memory on this node
    if USE_SHM:
      # ramdisk = '/dev/shm/out/'
      ramdisk = '/tmp/ddc/'
      if not os.path.exists(ramdisk):
        os.mkdir(ramdisk)
      job['outputloc'] = ramdisk
      dcd_ramfile = os.path.join(ramdisk, job['name'] + '.dcd')
    else:
      job['outputloc'] = ''

    with open(conFile, 'w') as sysconfig:
      sysconfig.write(source % job)
      logging.info("Config written to: " + conFile)

    # # Run simulation in parallel
    # if 'parallel' in job:
    #   numnodes = job['parallel']
    #   total_tasks = numnodes * 24
    #   cmd = 'mpiexec -n %d namd2 %s > %s' % (total_tasks, conFile, logFile)
    # # Run simulation single threaded
    # else:
    #   cmd = 'namd2 %s > %s' % (conFile, logFile)
    # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
    check = executecmd('module list')
    logging.debug('%s', check)
    cmd = 'namd2 +p%d %s > %s' % (PARALLELISM, conFile, logFile)

    # MICROBENCH #1 (file to Lustre)
    # logging.debug("Executing Simulation:\n   %s\n", cmd)
    # bench = microbench()
    # bench.start()
    # stdout = executecmd(cmd)
    # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
    # bench.mark('SimExec:%s' % job['name'])
    # shm_contents = os.listdir('/dev/shm/out')
    # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
    # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
    # logging.info("Copy Complete to Lustre.")
    # bench.mark('CopyLustre:%s' % job['name'])
    # shutil.rmtree(ramdisk)
    # shm_contents = os.listdir('/dev/shm')
    # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
    # bench.show()

    max_expected_obs = int(job['runtime']) // int(job['dcdfreq'])
    # Retry up to 3 attempts if the sim fails
    MAX_TRY = 3
    for i in range(MAX_TRY, 0, -1):
      min_required_obs = int(max_expected_obs * ((i-1)/(MAX_TRY)))
      logging.debug("Executing Simulation:\n   %s\n", cmd)
      logging.debug('# Obs Expected to see: %d', max_expected_obs)
      stdout = executecmd(cmd)
      logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
      # Check file for expected data
      if USE_SHM:
        traj = md.load(dcd_ramfile, top=job['pdb'])
      else:
        traj = md.load(dcdFile, top=job['pdb'])
      logging.info("Obs Threshold  = %4d", min_required_obs)
      logging.info("#Obs This Traj = %4d", traj.n_frames)
      if traj.n_frames >= min_required_obs:
        logging.info('Full (enough) Sim Completed')
        break
      logging.info('Detected a failed Simulation. Retrying the same sim.')

    bench.mark('SimExec:%s' % job['name'])

    # Internal stats
    sim_length = self.data['sim_step_size'] * int(job['runtime'])
    sim_realtime = bench.delta_last()
    sim_run_ratio = (sim_realtime/60) / (sim_length/1000000)
    logging.info('##SIM_RATIO %6.3f min-per-ns-sim', sim_run_ratio)
    stat.collect('sim_ratio', sim_run_ratio)

    if USE_SHM:
      shm_contents = os.listdir(ramdisk)
      logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
      if not os.path.exists(dcd_ramfile):
        logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
        time.sleep(10)
      if not os.path.exists(dcd_ramfile):
        logging.warning("DCD FILE STILL NOT FOUND!!!!")
      else:
        logging.info("DCD File was found")

    # # MICROBENCH #2 (file to Alluxio)
    # allux = AlluxioClient()
    # # copy to Alluxio FS
    # allux.put(ramdisk + job['name'] + '.dcd', '/')
    # logging.info("Copy Complete to Alluxio.")
    # bench.mark('CopyAllux:%s' % job['name'])

    # And copy to Lustre
    # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
    # And copy to Lustre (using zero-copy):
    if USE_SHM:
      src = open(dcd_ramfile, 'rb')
      dest = open(dcdFile, 'w+b')
      offset = 0
      dcdfilesize = os.path.getsize(dcd_ramfile)
      while True:
        sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
        if sent == 0:
          break
        offset += sent
      logging.info("Copy Complete to Lustre.")
      bench.mark('CopyLustre:%s' % job['name'])

    # TODO: Update job's metadata
    key = wrapKey('jc', job['name'])
    self.data[key]['dcd'] = dcdFile

  # ANALYSIS ------------------------------------------------------------------
  #  ANALYSIS ALGORITHM
  # 1. With combined Sim-analysis: file is loaded locally from shared mem
  logging.debug("2. Load DCD")

  # Load full higher dim trajectory
  # traj = datareduce.filter_heavy(dcd_ramfile, job['pdb'])
  if traj is None:
    if USE_SHM:
      traj = md.load(dcd_ramfile, top=job['pdb'])
    else:
      traj = md.load(dcdFile, top=job['pdb'])

  # Center Coordinates
  traj.center_coordinates()
  bench.mark('File_Load')
  logging.debug('Trajectory Loaded: %s (%s)', job['name'], str(traj))

  # DIMENSIONALITY REDUCTION --------------------------------------------------
  # 4-A. Subspace Calculation: RMS using Alpha-Filter
  #------ A:  RMSD-ALPHA  ------------------
  #     S_A = rmslist
  logging.info('---- RMSD Calculation against pre-defined centroids ----')
  #  RMSD is calculated on the Ca ('alpha') atoms in distance space,
  #  whereby all pairwise distances are calculated for each frame.
  #  Pairwise distances are plotted in euclidean space.
  #  Distance to each of the 5 pre-calculated centroids is calculated.

  # 1. Filter to Alpha atoms
  alpha = traj.atom_slice(deshaw.FILTER['alpha'])

  # 2. (IF USED) Convert to distance space: pairwise dist for all atom combinations
  # alpha_dist = dr.distance_space(alpha)

  # 3. Calc RMS for each conform to all centroids
  #    Heuristic centroid weight (TODO: make this trained)

  # 4. For adaptive Centroids:
  #    Centroids will be pulled & updated.
  logging.info('CENTROID Retrieval & Updating')
  self.wait_catalog()

  #  If they were mutable....
  # logging.info('Acquiring a Lock on the Centroids')
  # centroids = self.catalog.loadNPArray('centroid')
  # thetas = self.catalog.loadNPArray('thetas')
  # lock = self.catalog.lock_acquire('centroid')
  # if lock is None:
  #   logging.info('Could not lock the Centroids. Will use current cached (possibly stale) data.')
  #   bench.mark('ConcurrLockCentroid'%(A,B))

  # Implemented as a Transactional Data Structure....
  # NOTE: numLabels is defined below; this branch relies on ADAPTIVE_CENTROID defaulting to False
  if ADAPTIVE_CENTROID:
    centroids = []
    for state in range(numLabels):
      cent_raw  = self.catalog.lrange('centroid:xyz:%d' % state, 0, -1)
      cent_xyz  = [pickle.loads(i) for i in cent_raw]
      cent_npts = [int(i) for i in self.catalog.lrange('centroid:npts:%d' % state, 0, -1)]
      c_sum = np.zeros(shape=cent_xyz[0].shape)
      c_tot = 0
      for x, n in zip(cent_xyz, cent_npts):
        c = x * n
        c_sum += c
        c_tot += n
      centroids.append(c_sum / c_tot)
  else:
    centroids = self.catalog.loadNPArray('centroid')

  # if EXPERIMENT_NUMBER < 10:
  # 5. Calculate the RMSD for each filtered point to 5 pre-determined centroids
  # cw = [.92, .94, .96, .99, .99]
  cw = [.94, .95, .97, .99, .99]

  numLabels = len(self.data['centroid'])
  numConf = len(traj.xyz)
  stat.collect('numpts', numConf)

  # 4. Account for noise: simple spatial mean filter over a small window
  #    where the size of the window captures the extent of the noise
  #    (e.g. 10000fs window => motions under 10ps are considered "noisy")
  noise = self.data['obs_noise']
  stepsize = 500 if 'interval' not in job else int(job['interval'])
  nwidth = noise // (2 * stepsize)
  noisefilt = lambda x, i: np.mean(x[max(0, i-nwidth):min(i+nwidth, len(x))], axis=0)
  rms_filtered = np.array([noisefilt(alpha.xyz, i) for i in range(numConf)])
  # Notes: Delta_S == rmslist
  rmslist_sv = calc_rmsd(rms_filtered, centroids, weights=cw)
  # rmslist = adaptive_rmsd(rms_filtered, centroids, theta)

  # else:
  rmslist = calc_rmsd(alpha, centroids)

  numConf = traj.n_frames
  numLabels = len(centroids)

  # rmslist = calc_rmsd(alpha.xyz, self.data['centroid'], weights=cw)
  logging.debug('  RMS:  %d points projected to %d centroid-distances', numConf, numLabels)

  # 6. Apply Heuristics Labeling -- Single Variate
  rmslabel = []
  binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
  label_count = {ab: 0 for ab in binlist}
  groupbystate = [[] for i in range(numLabels)]
  groupbybin = {ab: [] for ab in binlist}

  for i, rms in enumerate(rmslist_sv):
    # Sort RMSD by proximity & set state A as nearest state's centroid
    A, B = np.argsort(rms)[:2]

    # Calc absolute proximity between nearest 2 states' centroids.
    # THETA calc derived from a static run: it is based on the average std dev of all RMSDs
    # from a static run of BPTI without solvent. It could be dynamically calculated, but is
    # hard coded here. The theta is divided by four based on the analysis of DEShaw:
    # est. based on ~3% of DEShaw data in transition (hence)
    # avg_stddev = 0.34119404492089034
    # theta = settings.RMSD_THETA
    ## FOR ADAPTIVE Centroids, theta is now updated dynamically.

    # NOTE: Original formula was relative. Retained here for reference:
    # Rel vs Abs: Calc relative proximity for top 2 nearest centroids
    # relproximity = rms[A] / (rms[A] + rms[rs[1]])
    # B = rs[1] if relproximity > (.5 - theta) else A
    # proximity = abs(rms[prox[1]] - rms[A]) / (rms[prox[1]] + rms[A])  # relative
    # proximity = abs(rms[prox[1]] - rms[A])                            # abs

    # Update for Adaptive Centroid
    delta = np.abs(rms[B] - rms[A])

    # (TODO: Factor in more than top 2, better noise)
    # Label secondary sub-state
    # sub_state = B prox[1] if proximity < theta else A
    # For ADAPTIVE Centroids
    if delta < 0.33:
      sub_state = B
    else:
      sub_state = A
    rmslabel.append((A, sub_state))

    # Add this index to the set of indices for this respective label
    #  TODO: Should we evict if binsize is too big???
    # logging.debug('Label for observation #%3d: %s', i, str((A, B)))
    label_count[(A, sub_state)] += 1

    # Group high-dim point by state
    # TODO: Consider grouping by state only or well/transitions (5 vs 10 bins)
    groupbystate[A].append(i)
    groupbybin[(A, sub_state)].append(i)

  # stat.collect('observe', label_count)
  bench.mark('RMS')
  logging.info('Labeled the following:')
  for A in range(numLabels):
    if len(groupbystate[A]) > 0:
      logging.info('label,state,%d,num,%d', A, len(groupbystate[A]))
  for ab in binlist:
    if len(groupbybin[ab]) > 0:
      A, B = ab
      logging.info('label,bin,%d,%d,num,%d', A, B, len(groupbybin[ab]))

  # FEATURE LANDSCAPE -- Multi-Variate
  # Calc Feature landscape for each frame's RMSD
  feal_list = [feal.atemporal(rms) for rms in rmslist]
  logging.info('Calculated Feature Landscape. Aggregate for this traj')

  # For logging purposes
  agg_feal = np.mean(feal_list, axis=0)
  logging.info('CountsMax [C]:  %s', str(agg_feal[:5]))
  logging.info('StateDist [S]:  %s', str(agg_feal[5:10]))
  logging.info('RelDist [A-B]:  %s', str(agg_feal[10:]))

  # ADAPTIVE CENTROID & THETA CALCULATION
  # if lock is None:
  #   logging.info('Never acquired a lock. Skipping adaptive update (TODO: Mark pts as stale)')
  # else:
  #   logging.info('Updating Adaptive Centroid')
  if ADAPTIVE_CENTROID:
    pipe = self.catalog.pipeline()
    for state in range(numLabels):
      n_pts = len(groupbybin[(state, state)])
      if n_pts == 0:
        logging.info('Skipping State %d Centroid -- Well not visited on this trajectory', state)
        continue
      cent_xyz = [alpha.xyz[i] for i in groupbybin[(state, state)]]
      cent_npts = len(groupbybin[(state, state)])
      c_sum = np.zeros(shape=alpha.xyz[0].shape)
      for pt in cent_xyz:
        c_sum += pt
      centroid_local = c_sum / n_pts
      centroid_delta = LA.norm(centroids[state] - centroid_local)
      pipe.rpush('centroid:xyz:%d' % state, pickle.dumps(centroid_local))
      pipe.rpush('centroid:npts:%d' % state, n_pts)
      pipe.rpush('centroid:delta:%d' % state, centroid_delta)
    pipe.execute()

  # 4-B. Subspace Calculation: COVARIANCE Matrix, 200ns windows, Full Protein
  #------ B:  Covariance Matrix  -----------------
  if EXPERIMENT_NUMBER > 5:
    # 1. Project Pt to PC's for each conform (top 3 PC's)
    logging.info('---- Covariance Calculation on 200ns windows (Full Protein, cartesian Space) ----')

    # Calculate Covariance over 200 ps windows sliding every 100ps
    #  These could be user influenced...
    WIN_SIZE_NS = .2
    SLIDE_AMT_NS = .1
    logging.debug("Calculating Covariance over trajectory. frame_size = %.1f, WINSIZE = %dps, Slide = %dps",
      frame_size, WIN_SIZE_NS*1000, SLIDE_AMT_NS*1000)
    covar = dr.calc_covar(alpha.xyz, WIN_SIZE_NS, frame_size, slide=SLIDE_AMT_NS)
    bench.mark('CalcCovar')
    stat.collect('numcovar', len(covar))
    logging.debug("Calculated %d covariance matrices. Storing variances", len(covar))

  # BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
  # try:
  self.wait_catalog()
  # except OverlayNotAvailable as e:
  #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

  # Update Catalog with 1 Long Atomic Transaction
  global_index = []
  with self.catalog.pipeline() as pipe:
    while True:
      try:
        logging.debug('Update Filelist')
        pipe.watch(wrapKey('jc', job['name']))
        file_idx = pipe.rpush('xid:filelist', job['dcd']) - 1

        # HD Points
        logging.debug('Update HD Points')
        for x in range(traj.n_frames):
          # Note: Pipelined insertions "should" return contiguous set of index points
          index = pipe.rpush('xid:reference', (file_idx, x)) - 1
          global_index.append(index - 1)

        pipe.multi()
        logging.debug('Update RMS Subspace')
        for x in range(traj.n_frames):
          A, B = rmslabel[x]
          index = global_index[x]
          # Labeled Observation (from RMSD)
          pipe.rpush('label:rms', rmslabel[x])
          pipe.rpush('varbin:rms:%d_%d' % (A, B), index)
          # pipe.rpush('lineage:rms:%d_%d:%d_%d' % (srcA, srcB, A, B), index)
          # pipe.rpush('lineage:pca:%s:%d_%d' % (job['src_hcube'], A, B), index)
          pipe.rpush('subspace:rms', bytes(rmslist_sv[x]))
          pipe.rpush('subspace:feal', bytes(feal_list[x]))

        logging.debug('Update OBS Counts')
        for b in binlist:
          pipe.rpush('observe:rms:%d_%d' % b, label_count[b])
        pipe.incr('observe:count')
        pipe.hset('anl_sequence', job['name'], mylogical_seqnum)

        if EXPERIMENT_NUMBER > 5:
          logging.debug('Update Covar Subspace')
          for i, si in enumerate(covar):
            logging.debug('Update COVAR Pt #%d', i)
            local_index = int(i * frame_size * SLIDE_AMT_NS)
            pipe.rpush('subspace:covar:pts', bytes(si))
            pipe.rpush('subspace:covar:xid', global_index[local_index])
            pipe.rpush('subspace:covar:fidx', (file_idx, local_index))

        logging.debug('Executing')
        pipe.execute()
        break
      except redis.WatchError as e:
        logging.debug('WATCH ERROR')
        continue

  self.data[key]['xid:start'] = global_index[0]
  self.data[key]['xid:end'] = global_index[-1]
  bench.mark('Indx_Update')

  # (Should we Checkpoint here?)

  # 4-C. Subspace Calculation: PCA BY Strata (PER STATE) using Alpha Filter
  #------ C:  GLOBAL PCA by state  -----------------
  #  Update PCA Vectors for each state with new data
  if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
    logging.info('---- PCA per BIN over Alpha Filter in cartesian Space ----')

    # TODO: This will eventually get moved into a User process macrothread
    #   which will sit in between analysis and controller.
    # For now, we're recalculating using a lock.

    # Check if vectors need to be recalculated
    # Connect to reservoir samples
    # TODO: Catalog or Cache?
    reservoir = ReservoirSample('rms', self.catalog)
    # STALENESS_FACTOR = .25   # Recent updates account for 25% of the sample (Just a guess)

    num_inserted = {ab: 0 for ab in binlist}
    num_params = np.prod(alpha.xyz.shape[1:])
    for A, B in binlist:
      num_observations = len(groupbybin[(A, B)])
      if num_observations == 0:
        logging.info('No data received for bin (%d,%d). Not processing this bin here.', A, B)
        continue

      res_label = '%d_%d' % (A, B)
      updateVectors = False
      kpca_key = 'subspace:pca:kernel:%d_%d' % (A, B)
      kpca = PCAnalyzer.load(self.catalog, kpca_key)
      newkpca = False
      if kpca is None:
        # kpca = PCAKernel(None, 'sigmoid')
        kpca = PCAKernel(6, 'rbf')
        newkpca = True

      logging.info('PCA:  Checking if current vectors for state %d are out of date', A)
      rsize = reservoir.getsize(res_label)
      tsize = kpca.trainsize

      #  KPCA is out of date if the sample size is 20% larger than previously used set
      #  Heuristics --- this could be a different "staleness" factor or we can check it some other way
      if newkpca or rsize > (tsize * 1.5):
        #  Should we only use a sample here??? (not now -- perhaps with larger reservoirs or if KPCA is slow)
        traindata = reservoir.get(res_label)
        if newkpca:
          logging.info('New PCA Kernel. Trained on data set of size %d. Current reservoir is %d pts.', tsize, rsize)
          logging.info('Projecting %d points on Kernel PCA for bin (%d,%d)', num_observations, A, B)
          traindata = np.zeros(shape=((num_observations,) + alpha.xyz.shape[1:]), dtype=np.float32)
          for i, index in enumerate(groupbybin[(A, B)]):
            np.copyto(traindata[i], alpha.xyz[index])
        else:
          logging.info('PCA Kernel is old (Updating it). Trained on data set of size %d. Current reservoir is %d pts.', tsize, rsize)

        if len(traindata) <= num_params:
          logging.info("Not enough data to calculate PC's (Need at least %d observations). Skipping PCA for Bin (%d,%d)", num_params, A, B)
          hd_pts = np.zeros(shape=((num_observations,) + alpha.xyz.shape[1:]), dtype=np.float32)
          for i, index in enumerate(groupbybin[(A, B)]):
            np.copyto(hd_pts[i], alpha.xyz[index])
          num_inserted[(A, B)] = reservoir.insert(res_label, hd_pts)
          logging.debug('Updating reservoir Sample for Bin (%d, %d)', A, B)
          continue

        logging.info('   Performing Kernel PCA (Gaussian) for bin (%d,%d) using traindata of size %d',
          A, B, len(traindata))
        kpca.solve(traindata)

        # NOTE: Pick PCA Algorithm HERE
        # pca = calc_kpca(np.array(traindata), kerneltype='sigmoid')
        # pca = calc_pca(np.array(traindata))
        bench.mark('CalcKPCA_%d_%d' % (A, B))

        # new_vect = pca.alphas_.T
        lock = self.catalog.lock_acquire(kpca_key)
        if lock is None:
          logging.info('Could not lock the PC Kernel for Bin (%d,%d). Not updating', A, B)
        else:
          kpca.store(self.catalog, kpca_key)
          lock = self.catalog.lock_release(kpca_key, lock)
        bench.mark('ConcurrPCAWrite_%d_%d' % (A, B))

        # Project Reservoir Sample to the Kernel and overwrite current set of points.
        #  This should only happen up until the reservoir is filled.
        #  If we are about to train, be sure to project all reservoir points.
        if not newkpca:
          logging.info('Clearing and Re-Projecting the entire reservoir of %d points for Bin (%d,%d).',
            rsize, A, B)
          rsamp_lowdim = kpca.project(traindata)
          pipe = self.catalog.pipeline()
          pipe.delete('subspace:pca:%d_%d' % (A, B))
          for si in rsamp_lowdim:
            pipe.rpush('subspace:pca:%d_%d' % (A, B), bytes(si))
          pipe.execute()

      else:
        logging.info('PCA Kernel is good -- no need to change them')

      bench.mark('start_ProjPCA')
      logging.info('Projecting %d points on Kernel PCA for Bin (%d,%d)', num_observations, A, B)
      hd_pts = np.zeros(shape=((num_observations,) + alpha.xyz.shape[1:]), dtype=np.float32)
      for i, index in enumerate(groupbybin[(A, B)]):
        np.copyto(hd_pts[i], alpha.xyz[index])
      pc_proj = kpca.project(hd_pts)
      bench.mark('ProjPCA_%d_%d' % (A, B))

      # 2. Append subspace in catalog
      pipe = self.catalog.pipeline()
      for si in pc_proj:
        pipe.rpush('subspace:pca:%d_%d' % (A, B), bytes(si))
      pipe.execute()

      logging.debug('Updating reservoir Sample')
      num_inserted[(A, B)] = reservoir.insert(res_label, hd_pts)

    bench.mark('PCA')
    pipe = self.catalog.pipeline()
    for ab, num in num_inserted.items():
      if num > 0:
        pipe.rpush('subspace:pca:updates:%d_%d' % ab, num)
    pipe.execute()

  # ---- POST PROCESSING
  if USE_SHM:
    shutil.rmtree(ramdisk)
    # shm_contents = os.listdir('/dev/shm')
    shm_contents = os.listdir('/tmp')
    logging.debug('Ramdisk contents (should be empty of DDC) : %s', str(shm_contents))

  # For benchmarking:
  # print('##', job['name'], dcdfilesize/(1024*1024*1024), traj.n_frames)
  bench.show()
  stat.show()

  # Return # of observations (frames) processed
  return [numConf]
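# --------------------------------------------------------------------------
# Illustrative sketch (not the production code path): the single-variate
# labeling heuristic used in execute() above. Each frame is assigned
# (A, sub_state) where A is the nearest centroid and sub_state is the
# second-nearest centroid only when the two distances are within `theta`
# (0.33 above). `rms_matrix` is a hypothetical (n_frames x n_centroids)
# array of per-frame distances to each centroid.
import numpy as np

def label_frames(rms_matrix, theta=0.33):
  labels = []
  for rms in rms_matrix:
    A, B = np.argsort(rms)[:2]        # two nearest centroids
    delta = abs(rms[B] - rms[A])      # absolute proximity between them
    sub_state = B if delta < theta else A
    labels.append((int(A), int(sub_state)))
  return labels

# Example: a frame nearly equidistant from centroids 1 and 2 is labeled (1, 2) or (2, 1),
# i.e. "in transition"; a frame clearly closest to centroid 0 is labeled (0, 0), i.e. "in the well".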
def execute(self):
  """Special execute function for the reweight operator -- check/validate.
  """
  # PRE-PROCESSING ---------------------------------------------------------------------------------
  logging.debug("============================  <PRE-PROCESS>  =============================")

  self.cacheclient = CacheClient(self.name)
  numLabels = 5
  binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
  labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
  num_pts = len(labeled_pts_rms)
  logging.debug('##NUM_OBS: %d', num_pts)

  # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
  TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4), (0, 2), (0, 1)]
  MAX_SAMPLE_SIZE = 100     # Max # of cov traj to back project per HCube
  MAX_PT_PER_MATRIX = 100   # Num points to sample from each cov traj
  COVAR_SIZE = 200          # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
  MAX_HCUBE = 6             # Max Num HCubes to process

  # IMPLEMENT USER QUERY with REWEIGHTING:
  logging.debug("=======================  <QUERY PROCESSING>  =========================")

  #  1. RUN KPCA on <<?????>> (sample set) and project all pts
  #  2. Calculate K-D Tree on above
  #  3. Score each point with distance to centroid
  #  4. B = Select the smallest half of clusters
  #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
  #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
  #       ALT-> use HCube size as its weight
  #  7. A = HCubes for states 3 (and 4)
  #  8. Reweight A into both state 3 and state 4 (B) HCubes
  #  9. ID Overlap
  # 10. Apply Gamma Function

  logging.info("===== Covariance Matrix PCA-KMeans Calculation (B)")
  logging.info("Retrieving All Covariance Vectors")
  home = os.getenv('HOME')
  cfile = home + '/work/DEBUG_COVAR_PTS'
  DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
  if DO_COVAR:
    if os.path.exists(cfile + '.npy'):
      covar_pts = np.load(cfile + '.npy')
      logging.debug('Loaded From File')
    else:
      covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
      covar_pts = np.array([np.fromstring(x) for x in covar_raw])
      np.save(cfile, covar_pts)
      logging.debug('Loaded From Catalog & Saved')

  covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
  logging.debug('Indices Loaded. Retrieving File Indices')
  covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

  if DO_COVAR:
    logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
    logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

    # FOR incremental PCA:
    NUM_PC = 6
    ipca_key = 'subspace:covar:ipca'
    ipca = PCAnalyzer.load(self.catalog, ipca_key)
    if ipca is None:
      logging.info('Creating a NEW IPCA')
      ipca = PCAIncremental(NUM_PC)
      lastindex = 0
    else:
      lastindex = ipca.trainsize
      logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
        ipca.trainsize, len(covar_pts) - ipca.trainsize)

    # For incremental, partial solve using only newer pts (from the last "trainsize")
    if len(covar_pts) - lastindex > 0:
      ipca.solve(covar_pts[lastindex:])
      logging.info("Incremental PCA Updated. Storing Now...")

      ####  BARRIER
      self.wait_catalog()
      ipca.store(self.catalog, ipca_key)

    logging.info("IPCA Saved. Projecting Covariance to PC")

  cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
  if os.path.exists(cfile + '.npy'):
    subspace_covar_pts = np.load(cfile + '.npy')
  else:
    subspace_covar_pts = ipca.project(covar_pts)
    np.save(cfile, subspace_covar_pts)

  # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
  logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
  global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')

  if MAX_HCUBE <= 0:
    hcube_global = global_kdtree.getleaves()
  else:
    # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
    hcube_global_ALL = global_kdtree.getleaves()
    hcube_global = {}
    num = 0
    for k, v in hcube_global_ALL.items():
      hcube_global[k] = v
      num += 1
      if num == MAX_HCUBE:
        break

  # hcube_global = global_kdtree.getleaves()
  logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)')
  for k in sorted(hcube_global.keys()):
    v = hcube_global[k]
    logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'], v['volume'], v['density'])

  if self.filelog:
    keys = hcube_global.keys()
    self.filelog.info('global,keys,%s', ','.join(keys))
    self.filelog.info('global,count,%s', ','.join([str(hcube_global[k]['count']) for k in keys]))
    self.filelog.info('global,volume,%s', ','.join([str(hcube_global[k]['volume']) for k in keys]))
    self.filelog.info('global,density,%s', ','.join([str(hcube_global[k]['density']) for k in keys]))

  logging.info("===== SELECT Sampling of points from each Global HCube  (B)")
  s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
  hcube_global = {x[0]: x[1] for x in s}

  counter = 0
  for key in hcube_global.keys():
    counter += 1
    if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
      cov_index = hcube_global[key]['elm']
      hcube_global[key]['samplefactor'] = 1
    else:
      cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
      hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
    hcube_global[key]['idxlist'] = []
    for cov in cov_index:
      selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
      hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
    logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, len(hcube_global.keys()))
    source_cov = self.backProjection(hcube_global[key]['idxlist'])
    hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
    logging.debug('Back Projected %d points to HD space: %s',
      len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))

  # logging.info('Calculating all HD Distances')
  # dist_hd = {}
  # dist_ld = {}
  # for key in hcube_global.keys():
  #   T = hcube_global[key]['alpha'].xyz
  #   N = len(T)
  #   dist_hd[key] = np.zeros(shape=(N, N))
  #   dist_ld[key] = {}
  #   for A in range(0, N):
  #     dist_hd[key][A][A] = 0
  #     for B in range(A+1, N):
  #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

  # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
  reservoir = ReservoirSample('rms', self.catalog)

  logging.info("===== BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Observations) ")
  hcube_list = {}

  logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
  hcube_local = {}

  logging.info("=======================================================")
  logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
  logging.info("=======================================================\n")

  overlap_hcube = {k: {} for k in hcube_global.keys()}

  projection_map = {}

  pt_projection_list = []
  for key in sorted(hcube_global.keys()):
    for i in range(len(hcube_global[key]['alpha'].xyz)):
      pt_projection_list.append([])

  for bin_idx, tbin in enumerate(TEST_TBIN):
    logging.info("Project Global HCubes into local subspace for %s", str(tbin))
    # Load Vectors
    logging.info('Loading subspace and kernel for bin %s', str(tbin))

    # LOAD KPCA Kernel matrix
    kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
    kpca = PCAnalyzer.load(self.catalog, kpca_key)

    data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
    data = np.array([np.fromstring(x) for x in data_raw])
    if len(data) == 0:
      logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
      continue

    logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
    kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
    hcube_local[tbin] = kdtree.getleaves()
    logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
    for k in sorted(hcube_local[tbin].keys()):
      logging.info('    `%-9s`   #pts:%6d   density:%9.1f',
        k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

    if self.filelog:
      keys = hcube_local[tbin].keys()
      A, B = tbin
      self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
      self.filelog.info('local,%d_%d,count,%s', A, B, ','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
      self.filelog.info('local,%d_%d,volume,%s', A, B, ','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
      self.filelog.info('local,%d_%d,density,%s', A, B, ','.join([str(hcube_local[tbin][k]['density']) for k in keys]))

    n_total = 0
    logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
    projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}

    pnum = 0
    for key in sorted(hcube_global.keys()):
      overlap_hcube[key][tbin] = {}
      cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

      logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s',
        key, len(cov_proj_pca), str(tbin))
      for i, pt in enumerate(cov_proj_pca):
        hcube = kdtree.probe(pt, probedepth=9)
        # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
        if hcube not in overlap_hcube[key][tbin]:
          overlap_hcube[key][tbin][hcube] = {
            'idxlist': hcube_local[tbin][hcube]['elm'],
            'wgt': hcube_local[tbin][hcube]['density'],
            'num_projected': 0}
        overlap_hcube[key][tbin][hcube]['num_projected'] += 1

        # Index this point in corresponding local HCube projection view
        projection_map[bin_idx][hcube].add(pnum)

        pt_projection_list[pnum].append(hcube)
        pnum += 1

      for k, v in sorted(overlap_hcube[key][tbin].items()):
        logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])

      # logging.info('Calculating Lower Dimensional Distances')
      # N = len(cov_proj_pca)
      # dist_ld[key][tbin] = np.zeros(shape=(N, N))
      # for A in range(0, N):
      #   for B in range(A+1, N):
      #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

  # Re-Index projected points -- could make this a list too
  next_index = 0
  view_list = []
  for bin_idx, hcube_map in projection_map.items():
    hcube_list = []
    for hcube_key, pt_list in hcube_map.items():
      hcube_list.append((set((hcube_key,)), set(pt_list)))
    view_list.append((set((bin_idx,)), hcube_list))

  print("CALLING: Collapse Join")
  joined_subspaces = collapse_join(projection_map.keys(), view_list)
  for subspace_list, correlated_hcubes in joined_subspaces:
    tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
    for hcube_list, pt_list in correlated_hcubes:
      print(tbin_list, hcube_list, pt_list)
      # TODO: Correlate Back to Global
  print('Visualize HERE')

  # for idx, tbin in enumerate(TEST_TBIN):
  #   # Only process substates with data
  #   if tbin not in hcube_local:
  #     logging.warning('Local KD Tree not created for %s', str(tbin))
  #     continue
  #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
  # for n, proj in enumerate(pt_projection_list):
  #   for i, tbin in enumerate(proj_bin_list):
  #     sets[tbin][proj[i]].add(n)
  #   if self.filelog:
  #     self.filelog.info('%d,%s', n, ','.join(proj))
  #   logging.info('%d,%s', n, ','.join(proj))

  # sets = {}
  # proj_bin_list = []
  # for tbin in TEST_TBIN:
  #   if tbin not in hcube_local:
  #     continue
  #   proj_bin_list.append(tbin)
  #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
  # for n, proj in enumerate(pt_projection_list):
  #   for i, tbin in enumerate(proj_bin_list):
  #     sets[tbin][proj[i]].add(n)
  #   if self.filelog:
  #     self.filelog.info('%d,%s', n, ','.join(proj))
  #   logging.info('%d,%s', n, ','.join(proj))

  # set_list = {}
  # for tbin, view in sets.items():
  #   set_list[(tbin,)] = []
  #   for hcube, idxlist in view.items():
  #     print(tbin, hcube, idxlist)
  #     set_list[(tbin,)].append((set((hcube,)), idxlist))

  # def collapse(C):
  #   a = 0
  #   b = 0
  #   N = []
  #   while a < len(C) and b < len(C):
  #     A = sorted(C[a])
  #     B = sorted(C[b])
  #     if A == B:
  #       b += 1
  #     elif A[0] == B[0]:
  #       N.append(set(A) | set(B))
  #       b += 1
  #     else:
  #       a += 1
  #   if len(N) <= 1:
  #     return []
  #   else:
  #     return N + collapse(N)

  # q = collapse(t1)
  # for i in q: print(sorted(i))

  # print('Checking all 2-Way Joins')
  # join2 = {}
  # for a in range(0, len(proj_bin_list)-1):
  #   tA = proj_bin_list[a]
  #   for b in range(a+1, len(proj_bin_list)):
  #     tB = proj_bin_list[b]
  #     join_ss = tuple(set((tA, tB)))
  #     set_list = []
  #     for kA, vA in sets[tA].items():
  #       for kB, vB in sets[tB].items():
  #         join_hc = set((kA, kB))
  #         inter = vA & vB
  #         if len(inter) > 0:
  #           set_list.append((join_hc, inter))
  #     if len(set_list) > 0:
  #       join2[join_ss] = set_list
  # print('2-Way Join Results:')
  # for ss, set_list in join2.items():
  #   for hc, idxlist in set_list:
  #     print(ss, hc, idxlist)

  # print('Checking all 3-Way Joins')
  # join3 = []
  # checked = []
  # for a in range(0, len(join2)-1):
  #   sA, hA, vA = join2[a]
  #   for b in range(a+1, len(join2)):
  #     sB, hB, vB = join2[b]
  #     if sA == sB:
  #       continue
  #     ss, hc = sA | sB, hA | hB
  #     if (ss, hc) in checked[-10:]:
  #       continue
  #     checked.append((ss, hc))
  #     inter = vA & vB
  #     if len(inter) > 0:
  #       join3.append((ss, hc, inter))

  # print('Checking all 4-Way Joins')
  # join4 = []
  # checked = []
  # for a in range(0, len(join3)-1):
  #   sA, hA, vA = join3[a]
  #   for b in range(a+1, len(join3)):
  #     sB, hB, vB = join3[b]
  #     if sA == sB:
  #       continue
  #     ss, hc = sA | sB, hA | hB
  #     if (ss, hc) in checked[-10:]:
  #       continue
  #     checked.append((ss, hc))
  #     inter = vA & vB
  #     if len(inter) > 0:
  #       join4.append((ss, hc, inter))

  # if self.filelog:
  #   for i in join2:
  #     self.filelog.info('%s', str(i))
  #   for i in join3:
  #     self.filelog.info('%s', str(i))
  #   for i in join4:
  #     self.filelog.info('%s', str(i))

  DO_MIN_CHECK = False
  if DO_MIN_CHECK:
    def maxcount(x):
      y = {}
      for i in x:
        y[i] = 1 if i not in y else y[i] + 1
      return max(y.values())

    print('%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces')
    argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf) for i in x])
    for key in hcube_global.keys():
      # logging.info('Showing MIN / MAX for points from HCube %s:', key)
      minA = {}
      maxA = {}
      for n in range(len(dist_hd[key])):
        minA[n] = []
        maxA[n] = []
        for tbin in TEST_TBIN:
          if tbin not in dist_ld[key].keys():
            continue
            # (unreachable after continue in the original)
            minA[n].append(0)
            maxA[n].append(0)
          else:
            minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
      numsame = np.zeros(len(dist_ld[key].keys()) + 1)
      for n in range(len(dist_hd[key][n])):
        minH = argmin_nonzero(dist_hd[key][n])
        maxH = np.argmax(dist_hd[key][n])
        minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
        numsamepair = maxcount(minA[n])
        numsame[numsamepair] += 1
        # print('%3d' % n, '%2d/%-2d  ' % (minH, maxH), '%s' % ' '.join(minmax), '  [%d]' % numsamepair)
      print(' '.join(['%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))]))

    print('Stopping HERE!')
    sys.exit(0)

  # GAMMA FUNCTION EXPR # 8
  gamma1 = lambda a, b: (a * b)
  gamma2 = lambda a, b: (a + b) / 2

  # TODO: Factor in RMS weight
  for tbin in TEST_TBIN:
  # for tbin in sorted(bin_list):
    logging.info('')
    logging.info('BIPARTITE GRAPH for %s', str(tbin))
    bipart = {}
    edgelist = []
    for hcB in hcube_global.keys():
      num_B = hcube_global[hcB]['count']
      wgt1_B = hcube_global[hcB]['density']
      if tbin not in overlap_hcube[hcB]:
        continue
      for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
        edge = {}
        if hcA not in bipart:
          bipart[hcA] = []
        num_proj = hcA_data['num_projected']
        wgt_A = hcA_data['wgt']
        wgt2_B = wgt1_B * num_proj
        edge['combW1'] = gamma1(wgt_A, wgt1_B)
        edge['combW2'] = gamma1(wgt_A, wgt2_B)
        edge['combW3'] = gamma2(wgt_A, wgt1_B)
        edge['combW4'] = gamma2(wgt_A, wgt2_B)
        edge['num_A'] = len(hcA_data['idxlist'])
        edge['num_B'] = num_B
        edge['num_proj'] = num_proj
        edge['wgt_A'] = wgt_A
        edge['wgt1_B'] = wgt1_B
        edge['wgt2_B'] = wgt2_B
        edge['hcA'] = hcA
        edge['hcB'] = hcB
        bipart[hcA].append(edge)
        edgelist.append((hcA, hcB, num_proj))
    if len(bipart) == 0:
      logging.info("NO DATA FOR %s", str(tbin))
      continue
    logging.info('')
    logging.info('A (# Pts) H-Cube  <---  B H-Cube (# proj/total Pts)   wgt_A  wB1:density  wB2:Mass  A*B1  A*B2  AVG(A,B1)  AVG(A,B2)')
    for k, v in bipart.items():
      for edge in v:
        logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s` (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
        if self.filelog:
          A, B = tbin
          self.filelog.info('edge,%d_%d,%s,%s,%d', A, B, edge['hcA'], edge['hcB'], edge['num_proj'])

    # Prepare nodes for graph
    nA = set()
    nB = set()
    elist = []
    for e in edgelist:
      a, b, z = e
      if z <= 5:
        continue
      nA.add(a)
      nB.add(b)
      elist.append((a, b, z))
    nAKeys = sorted(nA)[::-1]
    nBKeys = sorted(nB)[::-1]
    sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
    sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
    idxA = {key: i for i, key in enumerate(nAKeys)}
    idxB = {key: i for i, key in enumerate(nBKeys)}
    edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
    G.bipartite(sizesA, sizesB, edges, sizesA, sizesB, 'bipartite_%d_%d' % tbin)

  logging.info('STOPPING HERE!!!!')
  sys.exit(0)
  return []
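# --------------------------------------------------------------------------
# Illustrative sketch (assumptions noted): the two candidate "gamma"
# combination functions used above when weighting a bipartite edge between a
# local HCube A and a global HCube B. wgt_a is the local HCube density;
# wgt_b is the global HCube density, optionally scaled by the number of
# projected points ("mass"). This helper and its argument names are
# hypothetical; the production code builds the same four values inline.
def edge_weights(wgt_a, wgt_b, num_projected):
  wgt_b_mass = wgt_b * num_projected
  return {
    'combW1': wgt_a * wgt_b,              # gamma1 on density
    'combW2': wgt_a * wgt_b_mass,         # gamma1 on projected mass
    'combW3': (wgt_a + wgt_b) / 2,        # gamma2 on density
    'combW4': (wgt_a + wgt_b_mass) / 2,   # gamma2 on projected mass
  }

# Example: edge_weights(4.0, 2.0, 10)
#   -> combW1=8.0, combW2=80.0, combW3=3.0, combW4=12.0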
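# --------------------------------------------------------------------------
# Illustrative sketch only: the external collapse_join() called in execute()
# is not reproduced here. This hypothetical helper shows the underlying idea
# for the 2-way case -- finding point indices that land in the same pair of
# local HCubes across two projection views (each a dict of hcube_key -> set
# of point indices), as in the commented-out join2 exploration above.
def pairwise_join(view_a, view_b):
  """Return [((hcube_a, hcube_b), common_point_indices), ...] for non-empty overlaps."""
  joined = []
  for key_a, pts_a in view_a.items():
    for key_b, pts_b in view_b.items():
      common = pts_a & pts_b
      if common:
        joined.append(((key_a, key_b), common))
  return joined

# Example:
#   pairwise_join({'01': {1, 2, 3}}, {'10': {2, 3, 4}})
#   -> [(('01', '10'), {2, 3})]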