Example #1
0
def reproject_oldata():
  """Collect one source frame per job in `anl_sequence` and alpha-filter them.

  Jobs are visited in the order given by sorting the `anl_sequence` hash by
  value. For each job the `src_index` field is dereferenced through the
  `xid:reference` list; the frame is then taken from the cache list
  `sim:<name>` when present, otherwise loaded from the job's DCD file.

  Returns:
    The result of `datareduce.filter_alpha` applied to a trajectory built
    from all collected frames.
  """
  catalog = redis.StrictRedis(port=6390, decode_responses=True)
  cache = redis.StrictRedis(host='bigmem0006', port=6380, decode_responses=True)

  sequence = catalog.hgetall('anl_sequence')
  ordered = sorted(sequence.items(), key=lambda item: item[1])
  job_keys = ['jc_' + name for name, _ in ordered]

  frames = []
  bad_ref = 0   # jobs whose source index has no reference entry
  miss = 0      # jobs whose frame could not be loaded from disk
  for job_key in job_keys:
    conf = catalog.hgetall(job_key)
    ref = catalog.lindex('xid:reference', int(conf['src_index']))
    if ref is None:
      bad_ref += 1
      continue

    # NOTE(review): eval() runs whatever string the catalog holds; only safe
    # while the catalog contents are trusted.
    _, frame = eval(ref)
    cached = cache.lindex('sim:%s' % conf['name'], frame)
    if cached is not None:
      # NOTE(review): the cache client decodes responses to str, while
      # pickle.loads expects bytes -- confirm this path works as intended.
      frames.append(pickle.loads(cached))
      continue

    loaded = md.load_frame(conf['dcd'], frame, top=conf['pdb'])
    if len(loaded.xyz) == 0:
      miss += 1
    else:
      frames.append(loaded.xyz[0])

  full_traj = md.Trajectory(frames, deshaw.topo_prot.top)
  return datareduce.filter_alpha(full_traj)
Example #2
0
def high_low_check(r, tbin='(0, 4)'):
  """Compare KD-tree leaves built in KPCA space against leaves built in raw space.

  All observations labeled `tbin` are back-projected and alpha-filtered, then
  partitioned twice: once over a 6-component RBF kernel-PCA projection and
  once over the flattened coordinates. The overlap (number of shared
  elements) between every pair of leaves is printed as a table and returned.

  Args:
    r: catalog handle providing `lrange`; also passed to `backProjection`.
    tbin: label string to select from the `label:rms` list.

  Returns:
    2-D numpy array where entry [i][j] is the number of elements shared by
    the i-th KPCA-space leaf and the j-th raw-space leaf (leaves ordered by
    sorted key).
  """
  print('Pulling data...')
  # Read labels from the same handle that backProjection dereferences below;
  # the indices are only meaningful against that catalog. (Previously this
  # read from `rb`, a name that is not defined in this function.)
  obslist = r.lrange('label:rms', 0, -1)
  ob04 = [i for i, o in enumerate(obslist) if o == tbin]
  traj = backProjection(r, ob04)
  alpha = datareduce.filter_alpha(traj)

  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)

  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()

  print('KDTree2')
  # 174 is the flattened per-frame coordinate count expected here
  Y = alpha.xyz.reshape(alpha.n_frames, 174)
  kdtree2 = KDTree(50, maxdepth=4, data=Y, method='median')
  hc2 = kdtree2.getleaves()

  hc1k = sorted(hc1.keys())
  hc2k = sorted(hc2.keys())
  s1 = [set(hc1[k]['elm']) for k in hc1k]
  s2 = [set(hc2[k]['elm']) for k in hc2k]

  dd = np.zeros(shape=(len(s1), len(s2)))
  # NOTE(review): the header row prints the KPCA-tree keys although the
  # columns iterate the raw-space leaves -- confirm which is intended.
  print('     ', ' '.join(hc1k))
  for i, a in enumerate(s1):
    print('  ' +hc1k[i], end=' ')
    for j, b in enumerate(s2):
      n = len(a & b)
      print('%4d'%n, end=' ')
      dd[i][j] = n
    print('\n', end=' ')
  return dd
Example #3
0
def reproject_oldata():
    """Gather the source frame of every sequenced job and alpha-filter the set.

    The `anl_sequence` hash is sorted by value to get the job order. Each
    job's `src_index` is looked up in `xid:reference` to obtain a
    (file, frame) pair; the frame comes from the `sim:<name>` cache list if
    it is there, and from the job's DCD file otherwise.

    Returns:
        Output of `datareduce.filter_alpha` for a trajectory made of all
        frames that could be retrieved.
    """
    r1 = redis.StrictRedis(port=6390, decode_responses=True)
    cache = redis.StrictRedis(
        host='bigmem0006', port=6380, decode_responses=True)

    by_sequence = sorted(r1.hgetall('anl_sequence').items(),
                         key=lambda pair: pair[1])
    keyorder = ['jc_' + pair[0] for pair in by_sequence]

    pts = []
    bad_ref = 0  # count of source indices with no reference entry
    miss = 0  # count of frames missing from both cache and DCD file
    for key in keyorder:
        conf = r1.hgetall(key)
        src = int(conf['src_index'])
        ref = r1.lindex('xid:reference', src)
        if ref is None:
            bad_ref += 1
            continue

        # NOTE(review): eval() on catalog data -- acceptable only while the
        # catalog is trusted.
        _fileno, frame = eval(ref)
        xyz = cache.lindex('sim:%s' % conf['name'], frame)
        if xyz is None:
            # Cache miss: fall back to reading the single frame from disk
            tr = md.load_frame(conf['dcd'], frame, top=conf['pdb'])
            if len(tr.xyz) == 0:
                miss += 1
            else:
                pts.append(tr.xyz[0])
        else:
            # NOTE(review): this client decodes responses to str while
            # pickle.loads expects bytes -- confirm this path is exercised.
            pts.append(pickle.loads(xyz))

    return datareduce.filter_alpha(md.Trajectory(pts, deshaw.topo_prot.top))
Example #4
0
def kpca_check(red_db, tbin='(0, 4)'):
  """Fit KPCA + KD-tree on labeled points and probe source points against it.

  For every catalog handle, observations labeled `tbin` are back-projected
  and alpha-filtered. A 6-component RBF kernel PCA and a KD-tree are then
  built, source indices for experiments 32..35 are fetched from the `jc`
  table, projected through the tree, and grouped by the resulting leaf.

  Args:
    red_db: a single catalog handle or a list of handles.
    tbin: label string to select from each handle's `label:rms` list.

  The function has no return statement; all results stay in locals.
  """
  rlist = red_db if isinstance(red_db, list) else [red_db]

  trajlist = []
  for r in rlist:
    print('Pulling data...')
    labels = r.lrange('label:rms', 0, -1)
    matches = [pos for pos, label in enumerate(labels) if label == tbin]
    projected = dh.backProjection(r, matches)
    alpha = datareduce.filter_alpha(projected)
    trajlist.append(alpha)

  def deidx(i):
    return deidx_cutlist(i, [t.n_frames for t in trajlist])

  print('Kpca')
  kpca1 = PCAKernel(6, 'rbf')
  # NOTE(review): `alpha` is whatever the last loop iteration produced, so
  # only the final handle's data is used here -- confirm this is intended.
  kpca1.solve(alpha.xyz)
  X = kpca1.project(alpha.xyz)
  print('KDTree1')
  kdtree1 = KDTree(50, maxdepth=4, data=X, method='median')
  hc1 = kdtree1.getleaves()

  # Source indices, one list per experiment id
  srcidx = []
  for e in range(32, 36):
    rows = db.runquery("select idx from jc where bin='0_4' and expid=%d"%e)
    srcidx.append([row[0] for row in rows])

  src_traj = [dh.backProjection(handle, idxs) for handle, idxs in zip(rlist, srcidx)]
  src_xyz = [datareduce.filter_alpha(t).xyz for t in src_traj]
  probe_res = []
  for xyz in src_xyz:
    probe_res.append([kdtree1.project(frame.reshape(174,)) for frame in xyz])

  # Group source indices by the KD-tree leaf they project into
  grp_src = []
  for leaves, indices in zip(probe_res, srcidx):
    grp = {}
    for leaf, idx in zip(leaves, indices):
      grp.setdefault(leaf, []).append(idx)
    grp_src.append(grp)

  # Per experiment: index -> (start, end)
  idx_se_map = []
  for eid in range(32, 36):
    rows = db.runquery("select idx, start, end from jc where bin='0_4' and expid=%d"%eid)
    idx_se_map.append({idx: (start, end) for idx, start, end in rows})
Example #5
0
 def ld_wells(self):
   """Append well-state frames from DEShaw-origin entries to `self.wells`.

   Only entries whose `origin` is 'deshaw' and whose `src_bin` evaluates to a
   pair of equal values (A, A) are used. For those, the trajectory described
   by `self.conf[A]` is loaded, centered and alpha-filtered, and at most the
   first 1000 frames are appended to `self.wells[A]`.
   """
   for entry in self.conf:
     if entry['origin'] != 'deshaw':
       continue
     # NOTE(review): eval() on configuration text; safe only for trusted conf
     A, B = eval(entry['src_bin'])
     if A != B:
       continue
     source = self.conf[A]
     traj = md.load(source['dcd'], top=source['pdb'])
     traj.center_coordinates()
     alpha = dr.filter_alpha(traj)
     limit = min(1000, alpha.n_frames)
     self.wells[A].extend(alpha.xyz[:limit])
Example #6
0
 def loadtraj(self, tr, first=None):
   """Load, optionally truncate, and alpha-filter one or more trajectories.

   Args:
     tr: a key into `self.conf`, or a list of such keys.
     first: when given, only frames `0 .. first-1` are kept before filtering.

   Each filtered trajectory is stored in `self.trlist` under its key.
   """
   keys = tr if isinstance(tr, list) else [tr]
   for key in keys:
     traj = md.load(self.conf[key]['dcd'], top=self.conf[key]['pdb'])
     if first is not None:
       traj = traj.slice(np.arange(first))
     self.trlist[key] = datareduce.filter_alpha(traj)
Example #7
0
def backProjection(db, index_list):
    """Perform OFFLINE back projection for a list of indices using the given DB.

    Each index is dereferenced through the `xid:reference` list to a
    (file_index, frame) pair, file indices are resolved to file names through
    `xid:filelist`, and the requested frames are read from each file after
    alpha filtering. Assumes NO CACHE or DESHAW.

    Args:
        db: catalog handle providing `pipeline()` and `lindex()`.
        index_list: iterable of values convertible to int.

    Returns:
        numpy array of the selected frames' coordinates. Points are grouped
        by source file, so they are not guaranteed to follow the order of
        `index_list`; indices with a bad reference, an uncataloged file or a
        missing file on disk are skipped with a message.
    """
    logging.debug('--------  BACK PROJECTION:  %d POINTS ---', len(index_list))

    # Dereference indices to (file, frame) tuples in a single round trip
    pipe = db.pipeline()
    for idx in index_list:
        pipe.lindex('xid:reference', int(idx))
    generated_framelist = pipe.execute()

    # Group all requested frames by file index
    groupbyFileIdx = {}
    for ref in generated_framelist:
        try:
            # NOTE(review): eval() runs whatever string the catalog holds;
            # a missing entry (None) raises TypeError and is skipped here.
            file_index, frame = eval(ref)
        except TypeError:
            print('Bad Index:', str(ref))
            continue
        groupbyFileIdx.setdefault(file_index, []).append(frame)

    # Dereference file indices to file names
    generated_frameMask = {}
    generated_filemap = {}
    for file_index, frames in groupbyFileIdx.items():
        filename = db.lindex('xid:filelist', file_index)
        if filename is None:
            # Skip instead of falling through to os.path.exists(None),
            # which raises TypeError; report the index that was missing.
            logging.warning('Error file not found in catalog: %s', file_index)
            continue
        if not os.path.exists(filename):
            logging.warning('DCD File not found: %s', filename)
            continue
        # NOTE(review): keyed by base name, so two files sharing a base name
        # would overwrite each other -- confirm names are unique.
        key = os.path.splitext(os.path.basename(filename))[0]
        generated_frameMask[key] = frames
        generated_filemap[key] = filename

    # Load each file once and pull out the requested frames
    logging.debug('Sequentially Loading all trajectories')
    source_points = []
    for key, framelist in generated_frameMask.items():
        traj = datareduce.load_trajectory(generated_filemap[key])
        traj = datareduce.filter_alpha(traj)
        selected_frames = traj.slice(framelist)
        source_points.extend(selected_frames.xyz)
    return np.array(source_points)
Example #8
0
def backProjection(db, index_list):
      """Perform OFFLINE back projection for a list of indices using the given DB.

      Indices are resolved through `xid:reference` to (file_index, frame)
      pairs and file indices through `xid:filelist` to file names; the
      requested frames are then read from every file after alpha filtering.
      Assumes NO CACHE or DESHAW.

      Args:
        db: catalog handle providing `pipeline()` and `lindex()`.
        index_list: iterable of values convertible to int.

      Returns:
        numpy array with the coordinates of the selected frames, grouped by
        source file (not necessarily in `index_list` order). Entries with a
        bad reference, an uncataloged file index or a file missing on disk
        are reported and skipped.
      """
      logging.debug('--------  BACK PROJECTION:  %d POINTS ---', len(index_list))

      # Dereference indices to (file, frame) tuples
      pipe = db.pipeline()
      for idx in index_list:
        pipe.lindex('xid:reference', int(idx))
      generated_framelist = pipe.execute()

      # Group all requested frames by file index
      groupbyFileIdx = {}
      for entry in generated_framelist:
        try:
          # NOTE(review): eval() on catalog data; a None entry raises
          # TypeError and is reported as a bad index.
          file_index, frame = eval(entry)
        except TypeError:
          print('Bad Index:', str(entry))
          continue
        groupbyFileIdx.setdefault(file_index, []).append(frame)

      # Dereference file indices to file names
      generated_frameMask = {}
      generated_filemap = {}
      for file_index, frames in groupbyFileIdx.items():
        filename = db.lindex('xid:filelist', file_index)
        if filename is None:
          # Previously execution fell through to os.path.exists(None), which
          # raises TypeError; skip the entry and log the missing index.
          logging.warning('Error file not found in catalog: %s', file_index)
          continue
        if not os.path.exists(filename):
          logging.warning('DCD File not found: %s', filename)
          continue
        key = os.path.splitext(os.path.basename(filename))[0]
        generated_frameMask[key] = frames
        generated_filemap[key] = filename

      # Load each trajectory once and slice out the requested frames
      logging.debug('Sequentially Loading all trajectories')
      source_points = []
      for key, framelist in generated_frameMask.items():
        traj = datareduce.load_trajectory(generated_filemap[key])
        traj = datareduce.filter_alpha(traj)
        selected_frames = traj.slice(framelist)
        source_points.extend(selected_frames.xyz)
      return np.array(source_points)
Example #9
0
def centroid_bootstrap(catalog):
  """Iteratively refine state centroids and transition thresholds (thetas).

  Starting centroids are loaded from `settings.RMSD_CENTROID_FILE`. Jobs
  listed in the catalog's `anl_sequence` hash are processed in fixed-size
  batches. For every frame the two nearest centroids (A, B) are found via
  `calc_rmsd`; the frame is labeled (A, B) when the RMSD gap is below
  `thetas[A][B]` and (A, A) otherwise. After each batch:

    * centroids are updated as a running weighted average of the frames
      labeled as well states (A, A);
    * thetas are re-derived from the sorted RMSD differences accumulated so
      far for each state pair.

  Finally the per-batch centroid / theta changes, the label histogram and a
  per-pair scatter of RMSD differences are handed to the plotting module `P`.

  Args:
    catalog: catalog handle providing `get`, `exists`, `loadNPArray` and
      `hgetall`.

  Returns:
    None. Returns early (after logging) when the catalog has no `name`.
  """
  centfile = settings.RMSD_CENTROID_FILE
  centroid = np.load(centfile)
  numLabels = len(centroid)
  # One prior "point" per centroid; sized from the file rather than a
  # hard-coded 5 so other state counts do not index out of range.
  cent_npts = [1] * numLabels  # TBD
  binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
  logging.info("Loaded Starting Centroids from %s", centfile)

  name = catalog.get('name')
  if name is None:
    logging.info('Name not configured in this catalog. Set it and try again')
    return

  # Load/Set initial (current) Configs from Catalog
  if catalog.exists('thetas'):
    thetas = catalog.loadNPArray('thetas')
  else:
    thetas = np.zeros(shape=(numLabels, numLabels))
    thetas[:] = 0.25

  if catalog.exists('transition_sensitivity'):
    trans_factor = catalog.loadNPArray('transition_sensitivity')
  else:
    trans_factor = 0.2

  use_gradient = True
  obs_count = {ab: 0 for ab in binlist}
  C_delta = []   # mean centroid displacement per batch
  T_delta = []   # mean theta change per batch

  # Configure Noise Filter (currently not applied to the data below)
  noise = int(catalog.get('obs_noise'))
  dcdfreq = int(catalog.get('dcdfreq'))
  stepsize = int(catalog.get('sim_step_size'))
  nwidth = noise//(2*stepsize)
  noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)

  # Get previously Labeled data (or label data IAW current settings)
  eid = db.get_expid(name)
  obslist = [i[0] for i in db.runquery('SELECT obs FROM obs WHERE expid=%d' % eid)]
  jobs = [i[0] for i in sorted(catalog.hgetall('anl_sequence').items(), key=lambda x: x[1])]
  shape = None

  # Initialize lists for pair-wise distances (top 2 nearest centroids)
  diffList  = {}
  transList = {}
  scatPlot  = {}
  for A in range(0, numLabels-1):
    for B in range(A+1, numLabels):
      diffList[(A, B)]  = []
      transList[(A, B)] = []
      scatPlot[(A, B)]  = []
  obs_global = []

  # Process learning in batches (static batch size to start)
  batch_size = 25
  max_obs = 150
  batch = 0
  while batch <= max_obs:
    logging.info("Procssing Jobs %d - %d", batch, batch+batch_size)
    exec_sim = []
    obs_list = []
    # Slice by batch_size so the processed range matches the logged range
    # and the loop increment (was a hard-coded 25).
    for job in jobs[batch:batch+batch_size]:
      conf = catalog.hgetall('jc_' + job)
      traj = md.load(conf['dcd'], top=conf['pdb'])
      alpha = datareduce.filter_alpha(traj)
      conf['alpha'] = alpha.xyz
      exec_sim.append(conf)
      if shape is None:
        shape = conf['alpha'].shape[1:]

      rmslist = calc_rmsd(alpha, centroid)
      labels = []
      for rms in rmslist:
        # A = nearest centroid, B = second nearest
        A, B = np.argsort(rms)[:2]
        gap = np.abs(rms[B] - rms[A])
        sub_state = B if gap < thetas[A][B] else A
        classify = (A, sub_state)
        labels.append(classify)
        obs_count[classify] += 1

        # For globally updating Thetas: signed difference keyed by the
        # ordered pair, so negative values mean the lower-numbered state of
        # the pair is the nearer one.
        obs_global.append(classify)
        if A < B:
          diffList[(A, B)].append(rms[A] - rms[B])
        else:
          diffList[(B, A)].append(rms[B] - rms[A])

        for a in range(0, numLabels-1):
          for b in range(a+1, numLabels):
            transList[(a, b)].append(rms[a] - rms[b])
            if (a, a) == classify or (b, b) == classify:
              c = 'b'
            elif (a, b) == classify or (b, a) == classify:
              c = 'g'
            elif a == A or b == A:
              c = 'r'
            else:
              c = 'black'
            scatPlot[(a, b)].append((rms[a] - rms[b], c))
      obs_list.append(labels)

    if not exec_sim:
      # Job list exhausted: nothing new to learn from, and `shape` may still
      # be unset, so stop instead of running empty update rounds.
      logging.info('No jobs left to process at offset %d', batch)
      break

    logging.info('Bin Distribution:')
    grpby = {}
    for llist in obs_list:
      for l in llist:
        grpby[l] = grpby.get(l, 0) + 1
    for k in sorted(grpby.keys()):
      logging.info('%s:  %5d', k, grpby[k])
    for A in range(0, numLabels-1):
      for B in range(A+1, numLabels):
        d = diffList[(A, B)]
        if not d:
          # min()/max() raise ValueError on an empty list
          logging.info('Diff list for %d,%d:  no observations', A, B)
          continue
        logging.info('Diff list for %d,%d:  %d, %5.2f, %5.2f', A, B, len(d), min(d), max(d))

    # For each frame in each traj: ID labeled well pts & build avg op
    logging.info('Selecting observed Well States')
    coor_sum = {i: np.zeros(shape=shape) for i in range(numLabels)}
    coor_tot = {i: 0 for i in range(numLabels)}
    for sim, sim_labels in zip(exec_sim, obs_list):
      for frame, label in zip(sim['alpha'], sim_labels):
        A, B = label
        if A != B:
          continue
        coor_sum[A] += frame
        coor_tot[A] += 1

    logging.info('Calculating Avg from following stats:')
    logging.info('   Total Frames: %d', sum([len(sim['alpha']) for sim in exec_sim]))

    # Calculate New Centroids (w/deltas)
    delta = []
    for S in range(numLabels):
      if coor_tot[S] == 0:
        logging.info("   State: %d --- NO OBSERVATIONS IN THIS WELL STATE", S)
        continue
      cent_local = coor_sum[S] / coor_tot[S]
      diff_local = LA.norm(centroid[S] - cent_local)
      # Weighted average of the running centroid and this batch's mean
      update = ((centroid[S] * cent_npts[S]) + (cent_local * coor_tot[S])) / (cent_npts[S] + coor_tot[S])
      delta.append(LA.norm(update - centroid[S]))
      logging.info('   State %d:  NewPts=%5d   Delta=%5.2f   LocalDiff=%5.2f',
        S, coor_tot[S], delta[-1], diff_local)
      centroid[S] = update
      cent_npts[S] += coor_tot[S]
    # np.mean([]) is nan plus a RuntimeWarning; produce the nan directly
    centroid_change = np.mean(delta) if delta else np.nan
    if len(C_delta) > 1:
      rel_change = np.abs((centroid_change - C_delta[-1]) / C_delta[-1])
      logging.info('Centroid Change:  %5.2f   (%5.2f%%)', centroid_change, 100*rel_change)
    C_delta.append(centroid_change)

    # Update Thetas (usig global data ?????)
    delta = []
    for A in range(0, numLabels-1):
      for B in range(A+1, numLabels):
        X = sorted(diffList[(A, B)])
        if len(X) < 100:
          logging.info('Lacking data on %d, %d', A, B)
          continue
        # Index of the first positive difference (0 when there is none)
        crossover = 0
        for i, x in enumerate(X):
          if x > 0:
            crossover = i
            break
        if crossover < 50 or (len(X)-crossover) < 50:
          logging.info('  Lacking local data skipping.')
          continue

        # Transition zone boundaries on either side of the crossover
        zoneA = int((1-trans_factor) * crossover)
        zoneB = crossover + int(trans_factor * (len(X) - crossover))
        if use_gradient:
          # Pick the point of steepest change inside each zone
          idxA = zoneA + np.argmax(np.gradient(X[zoneA:crossover]))
          idxB = crossover + np.argmax(np.gradient(X[crossover:zoneB]))
        else:
          # Classify a fixed percent of observations as transitional
          idxA = zoneA
          idxB = zoneB
        newAB = np.abs(X[idxA])
        newBA = np.abs(X[idxB])
        tdeltA = np.abs(newAB - thetas[A][B])
        tdeltB = np.abs(newBA - thetas[B][A])
        delta.append(tdeltA)
        delta.append(tdeltB)
        logging.info('  Theta Change (%d,%d):  %4.2f  (%4.1f)', A, B, tdeltA, (100*tdeltA/thetas[A][B]))
        logging.info('  Theta Change (%d,%d):  %4.2f  (%4.1f)', B, A, tdeltB, (100*tdeltB/thetas[B][A]))
        thetas[A][B] = newAB
        thetas[B][A] = newBA

    T_delta.append(np.mean(delta) if delta else np.nan)
    batch += batch_size

  P.line(np.array(C_delta), 'Avg_CHANGE_Centroid_Pos_%s' % name)
  P.line(np.array(T_delta), 'Avg_CHANGE_Theta_Val_%s' % name)
  P.bargraph_simple(obs_count, 'Final_Histogram_%s' % name)
  for pair, X in scatPlot.items():
    collab = {'b': 'Well', 'g': 'Trans', 'r': 'Primary', 'brown': 'Secondary', 'black': 'None'}
    ptmap = {color: [] for color in collab.keys()}
    # Rank-order the differences; x is the rank, y the difference
    ordpts = sorted(X, key = lambda x : x[0])
    for i, tup in enumerate(ordpts):
      y, c = tup
      ptmap[c].append((i, y))
    A, B = pair
    P.scat_Transtions(ptmap, title='-%d_%d'%(A,B), size=1, labels=collab)
Example #10
0
    def execute(self):
        """Special execute function for the reweight operator -- check/validate.
      """
        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        self.cacheclient = CacheClient(self.name)
        numLabels = 5
        binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
        labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
        num_pts = len(labeled_pts_rms)
        logging.debug('##NUM_OBS: %d', num_pts)

        # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
        TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4),
                     (0, 2), (0, 1)]
        MAX_SAMPLE_SIZE = 100  # Max # of cov traj to back project per HCube
        MAX_PT_PER_MATRIX = 100  # Num points to sample from each cov traj
        COVAR_SIZE = 200  # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
        MAX_HCUBE = 6  # Max Num HCubes to process

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamme Function

        logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
        logging.info("Retrieving All Covariance Vectors")
        home = os.getenv('HOME')
        cfile = home + '/work/DEBUG_COVAR_PTS'
        DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
        if DO_COVAR:
            if os.path.exists(cfile + '.npy'):
                covar_pts = np.load(cfile + '.npy')
                logging.debug('Loaded From File')
            else:
                covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
                covar_pts = np.array([np.fromstring(x) for x in covar_raw])
                np.save(cfile, covar_pts)
                logging.debug('Loaded From Catalog & Saved')
        covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
        logging.debug('Indiced Loaded. Retrieving File Indices')
        covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

        if DO_COVAR:
            logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
            logging.info(
                "Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)"
            )

            # FOR incrementatl PCA:
            NUM_PC = 6
            ipca_key = 'subspace:covar:ipca'
            ipca = PCAnalyzer.load(self.catalog, ipca_key)
            if ipca is None:
                logging.info('Creating a NEW IPCA')
                ipca = PCAIncremental(NUM_PC)
                lastindex = 0
            else:
                lastindex = ipca.trainsize
                logging.info(
                    'IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                    ipca.trainsize,
                    len(covar_pts) - ipca.trainsize)

            # For incremental, partial solve using only newer pts (from the last "trainsize")
            if len(covar_pts) - lastindex > 0:
                ipca.solve(covar_pts[lastindex:])
                logging.info("Incrementatl PCA Updated. Storing Now...")

                ####  BARRIER
                self.wait_catalog()
                ipca.store(self.catalog, ipca_key)

            logging.info("IPCA Saved. Projecting Covariance to PC")

        cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
        if os.path.exists(cfile + '.npy'):
            subspace_covar_pts = np.load(cfile + '.npy')
        else:
            subspace_covar_pts = ipca.project(covar_pts)
            np.save(cfile, subspace_covar_pts)

        # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
        logging.info(
            'Building Global KD Tree over Covar Subspace with %d data pts',
            len(subspace_covar_pts))
        global_kdtree = KDTree(250,
                               maxdepth=8,
                               data=subspace_covar_pts,
                               method='middle')

        if MAX_HCUBE <= 0:
            hcube_global = global_kdtree.getleaves()
        else:
            # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
            hcube_global_ALL = global_kdtree.getleaves()
            hcube_global = {}
            num = 0
            for k, v in hcube_global_ALL.items():
                hcube_global[k] = v
                num += 1
                if num == MAX_HCUBE:
                    break

        # hcube_global = global_kdtree.getleaves()
        logging.info(
            'Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)'
        )
        for k in sorted(hcube_global.keys()):
            v = hcube_global[k]
            logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'],
                         v['volume'], v['density'])

        if self.filelog:
            keys = hcube_global.keys()
            self.filelog.info('global,keys,%s', ','.join(keys))
            self.filelog.info(
                'global,count,%s',
                ','.join([str(hcube_global[k]['count']) for k in keys]))
            self.filelog.info(
                'global,volume,%s',
                ','.join([str(hcube_global[k]['volume']) for k in keys]))
            self.filelog.info(
                'global,density,%s',
                ','.join([str(hcube_global[k]['density']) for k in keys]))

        logging.info(
            "=====  SELECT Sampling of points from each Global HCube  (B)")
        s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
        hcube_global = {x[0]: x[1] for x in s}

        counter = 0
        for key in hcube_global.keys():
            counter += 1
            if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
                cov_index = hcube_global[key]['elm']
                hcube_global[key]['samplefactor'] = 1
            else:
                cov_index = np.random.choice(hcube_global[key]['elm'],
                                             MAX_SAMPLE_SIZE)
                hcube_global[key]['samplefactor'] = len(
                    hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
            hcube_global[key]['idxlist'] = []
            for cov in cov_index:
                selected_hd_idx = np.random.choice(COVAR_SIZE,
                                                   MAX_PT_PER_MATRIX).tolist()
                hcube_global[key]['idxlist'].extend(
                    [int(covar_index[cov]) + i for i in selected_hd_idx])
            logging.info('Back Projecting Global HCube `%s`  (%d out of %d)',
                         key, counter, len(hcube_global.keys()))
            source_cov = self.backProjection(hcube_global[key]['idxlist'])
            hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
            logging.debug('Back Projected %d points to HD space: %s',
                          len(hcube_global[key]['idxlist']),
                          str(hcube_global[key]['alpha']))

        # logging.info('Calculating all HD Distances')
        # dist_hd = {}
        # dist_ld = {}
        # for key in hcube_global.keys():
        #   T = hcube_global[key]['alpha'].xyz
        #   N = len(T)
        #   dist_hd[key] = np.zeros(shape=(N, N))
        #   dist_ld[key] = {}
        #   for A in range(0, N):
        #     dist_hd[key][A][A] = 0
        #     for B in range(A+1, N):
        #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
        reservoir = ReservoirSample('rms', self.catalog)

        logging.info(
            "=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) "
        )
        hcube_list = {}

        logging.info(
            "Scanning current set of observed bins and finding all smallest with data (excluding largest 2)"
        )
        hcube_local = {}

        logging.info("=======================================================")
        logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
        logging.info(
            "=======================================================\n")

        overlap_hcube = {k: {} for k in hcube_global.keys()}

        projection_map = {}

        pt_projection_list = []
        for key in sorted(hcube_global.keys()):
            for i in range(len(hcube_global[key]['alpha'].xyz)):
                pt_projection_list.append([])
        for bin_idx, tbin in enumerate(TEST_TBIN):
            logging.info("Project Global HCubes into local subspace for %s",
                         str(tbin))
            # Load Vectors
            logging.info('Loading subspace and kernel for bin %s', str(tbin))

            # LOAD KPCA Kernel matrix
            kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
            kpca = PCAnalyzer.load(self.catalog, kpca_key)

            data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
            data = np.array([np.fromstring(x) for x in data_raw])
            if len(data) == 0:
                logging.error(
                    'No Raw PCA data points for bin %s.... Going to next bin',
                    str(tbin))
                continue

            logging.info(
                'Building KDtree over local %s bin from observations matrix of size: %s',
                str(tbin), str(data.shape))
            kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
            hcube_local[tbin] = kdtree.getleaves()
            logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
            for k in sorted(hcube_local[tbin].keys()):
                logging.info('    `%-9s`   #pts:%6d   density:%9.1f', k,
                             len(hcube_local[tbin][k]['elm']),
                             hcube_local[tbin][k]['density'])

            if self.filelog:
                keys = hcube_local[tbin].keys()
                A, B = tbin
                self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
                self.filelog.info(
                    'local,%d_%d,count,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['count']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,volume,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['volume']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,density,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['density']) for k in keys]))

            n_total = 0
            logging.debug('Global Hcubes to Project (%d):  %s',
                          len(hcube_global.keys()), str(hcube_global.keys()))
            projection_map[bin_idx] = {
                k: set()
                for k in hcube_local[tbin].keys()
            }

            pnum = 0
            for key in sorted(hcube_global.keys()):
                overlap_hcube[key][tbin] = {}
                cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

                logging.debug(
                    'PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ',
                    key, len(cov_proj_pca), str(tbin))
                for i, pt in enumerate(cov_proj_pca):
                    hcube = kdtree.probe(pt, probedepth=9)
                    # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
                    if hcube not in overlap_hcube[key][tbin]:
                        overlap_hcube[key][tbin][hcube] = {
                            'idxlist': hcube_local[tbin][hcube]['elm'],
                            'wgt': hcube_local[tbin][hcube]['density'],
                            'num_projected': 0
                        }
                    overlap_hcube[key][tbin][hcube]['num_projected'] += 1

                    # Index this point in corresponding local HCube projection view
                    projection_map[bin_idx][hcube].add(pnum)

                    pt_projection_list[pnum].append(hcube)
                    pnum += 1

                for k, v in sorted(overlap_hcube[key][tbin].items()):
                    logging.debug(
                        '   Project ==> Local HCube `%-9s`: %5d points', k,
                        v['num_projected'])
                # logging.info('Calculating Lower Dimensional Distances')
                # N = len(cov_proj_pca)
                # dist_ld[key][tbin] = np.zeros(shape=(N, N))
                # for A in range(0, N):
                #   for B in range(A+1, N):
                #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

    # Re-Index projected points -- could make this a list too

        next_index = 0
        view_list = []
        for bin_idx, hcube_map in projection_map.items():
            hcube_list = []
            for hcube_key, pt_list in hcube_map.items():
                hcube_list.append((set((hcube_key, )), set(pt_list)))
            view_list.append((set((bin_idx, )), hcube_list))

        print("CALLING: Collapse Join")
        joined_subspaces = collapse_join(projection_map.keys(), view_list)
        for subspace_list, correlated_hcubes in joined_subspaces:
            tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
            for hcube_list, pt_list in correlated_hcubes:
                print(tbin_list, hcube_list, pt_list)
                # TODO: Corrlate Back to Global
        print('Visualize HERE')

        # for idx, tbin in enumerate(TEST_TBIN):
        #   # Only process substates with data
        #   if tbin not in hcube_local:
        #     logging.warning('Local KD Tree not created for %s', str(tbin))
        #     continue
        #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # sets = {}
        # proj_bin_list = []
        # for tbin in TEST_TBIN:
        #   if tbin not in hcube_local:
        #     continue
        #   proj_bin_list.append(tbin)
        #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # set_list = {}
        # for tbin, view in sets.items():
        #   set_list[(tbin,)] = []
        #   for hcube, idxlist in view.items():
        #     print(tbin, hcube, idxlist)
        #     set_list[(tbin,)].append((set((hcube,)), idxlist))

        # def collapse(C):
        #   a = 0
        #   b = 0
        #   N = []
        #   while a < len(C) and b < len(C):
        #     A = sorted(C[a])
        #     B = sorted(C[b])
        #     if A == B:
        #       b += 1
        #     elif A[0] == B[0]:
        #       N.append(set(A)|set(B))
        #       b += 1
        #     else:
        #       a += 1
        #   if len(N) <= 1:
        #     return []
        #   else:
        #     return N + collapse(N)

        # q=collapse(t1)
        # for i in q: print(sorted(i))

        # print('Checking all 2-Way Joins')
        # join2 = {}
        # for a in range(0, len(proj_bin_list)-1):
        #   tA = proj_bin_list[a]
        #   for b in range(a+1, len(proj_bin_list)):
        #     tB = proj_bin_list[b]
        #     join_ss = tuple(set((tA, tB)))
        #     set_list = []
        #     for kA, vA in sets[tA].items():
        #       for kB, vB in sets[tB].items():
        #         join_hc = set((kA, kB))
        #         inter = vA & vB
        #         if len(inter) > 0:
        #           set_list.append((join_hc, inter))
        #     if len(set_list) > 0:
        #       join2[join_ss] = set_list
        # print('2-Way Join Results:')
        # for ss, set_list in join2.items():
        #   for hc, idxlist in set_list:
        #     print(ss, hc, idxlist)

        # print('Checking all 3-Way Joins')
        # join3 = []
        # checked = []
        # for a in range(0, len(join2)-1):
        #   sA, hA, vA = join2[a]
        #   for b in range(a+1, len(join2)):
        #     sB, hB, vB = join2[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join3.append((ss, hc, inter))

        # print('Checking all 4-Way Joins')
        # join4 = []
        # checked = []
        # for a in range(0, len(join3)-1):
        #   sA, hA, vA = join3[a]
        #   for b in range(a+1, len(join3)):
        #     sB, hB, vB = join3[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join4.append((ss, hc, inter))

        # if self.filelog:
        #   for i in join2:
        #     self.filelog.info('%s', str(i))
        #   for i in join3:
        #     self.filelog.info('%s', str(i))
        #   for i in join4:
        #     self.filelog.info('%s', str(i))

        DO_MIN_CHECK = False
        if DO_MIN_CHECK:

            def maxcount(x):
                y = {}
                for i in x:
                    y[i] = 1 if i not in y else y[i] + 1
                return max(y.values())

            print(
                '%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces'
            )
            argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf)
                                                  for i in x])
            for key in hcube_global.keys():
                # logging.info('Showing MIN / MAX for points from HCube %s:', key)
                minA = {}
                maxA = {}
                for n in range(len(dist_hd[key])):
                    minA[n] = []
                    maxA[n] = []
                    for tbin in TEST_TBIN:
                        if tbin not in dist_ld[key].keys():
                            continue
                            minA[n].append(0)
                            maxA[n].append(0)
                        else:
                            minA[n].append(
                                argmin_nonzero(dist_ld[key][tbin][n]))
                            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
                numsame = np.zeros(len(dist_ld[key].keys()) + 1)
                for n in range(len(dist_hd[key][n])):
                    minH = argmin_nonzero(dist_hd[key][n])
                    maxH = np.argmax(dist_hd[key][n])
                    minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
                    numsamepair = maxcount(minA[n])
                    numsame[numsamepair] += 1
                    # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
                print(' '.join([
                    '%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))
                ]))

        print('Stopping HERE!')
        sys.exit(0)
        #  GAMMA FUNCTION EXPR # 8
        gamma1 = lambda a, b: (a * b)
        gamma2 = lambda a, b: (a + b) / 2

        # TODO: Factor in RMS weight
        for tbin in TEST_TBIN:
            # for tbin in sorted(bin_list):
            logging.info('')
            logging.info('BIPARTITE GRAPH for %s', str(tbin))
            bipart = {}
            edgelist = []
            for hcB in hcube_global.keys():
                num_B = hcube_global[hcB]['count']
                wgt1_B = hcube_global[hcB]['density']
                if tbin not in overlap_hcube[hcB]:
                    continue
                for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
                    edge = {}
                    if hcA not in bipart:
                        bipart[hcA] = []
                    num_proj = hcA_data['num_projected']
                    wgt_A = hcA_data['wgt']
                    wgt2_B = wgt1_B * num_proj
                    edge['combW1'] = gamma1(wgt_A, wgt1_B)
                    edge['combW2'] = gamma1(wgt_A, wgt2_B)
                    edge['combW3'] = gamma2(wgt_A, wgt1_B)
                    edge['combW4'] = gamma2(wgt_A, wgt2_B)
                    edge['num_A'] = len(hcA_data['idxlist'])
                    edge['num_B'] = num_B
                    edge['num_proj'] = num_proj
                    edge['wgt_A'] = wgt_A
                    edge['wgt1_B'] = wgt1_B
                    edge['wgt2_B'] = wgt2_B
                    edge['hcA'] = hcA
                    edge['hcB'] = hcB
                    bipart[hcA].append(edge)
                    edgelist.append((hcA, hcB, num_proj))
            if len(bipart) == 0:
                logging.info("NO DATA FOR %s", str(tbin))
                continue
            logging.info('')
            logging.info(
                'A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)'
            )
            for k, v in bipart.items():
                for edge in v:
                    logging.info(
                        'A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW3)9.1f'
                        % edge)
                    if self.filelog:
                        A, B = tbin
                        self.filelog.info('edge,%d_%d,%s,%s,%d', A, B,
                                          edge['hcA'], edge['hcB'],
                                          edge['num_proj'])

            # Prepare nodes for graph
            nA = set()
            nB = set()
            elist = []
            for e in edgelist:
                a, b, z = e
                if z <= 5:
                    continue
                nA.add(a)
                nB.add(b)
                elist.append((a, b, z))
            nAKeys = sorted(nA)[::-1]
            nBKeys = sorted(nB)[::-1]
            sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
            sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
            idxA = {key: i for i, key in enumerate(nAKeys)}
            idxB = {key: i for i, key in enumerate(nBKeys)}
            edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
            G.bipartite(sizesA, sizesB, edges, sizesA, sizesB,
                        'bipartite_%d_%d' % tbin)

        logging.info('STOPPING HERE!!!!')
        sys.exit(0)
        return []
Example #11
0
    def execute(self):
      """Special execute function for the reweight operator -- check/validate.

      Debug/validation pipeline, as implemented below:

        1. Load the covariance vectors (catalog list ``subspace:covar:pts`` or
           the local ``.npy`` debug cache) and, when ``self.calc_covar`` is
           set, create/update the incremental PCA stored in the catalog under
           ``subspace:covar:ipca``.
        2. Project the covariance vectors with that PCA (or load the cached
           projection) and build a global KD-Tree over them, keeping at most
           ``MAX_HCUBE`` leaves (all leaves when ``MAX_HCUBE <= 0``).
        3. Sample covariance points from every retained global HCube, back
           project them with ``self.backProjection`` and filter the result
           through ``datareduce.filter_alpha``.
        4. For every bin in ``TEST_TBIN`` build a local KD-Tree over the bin's
           stored PCA points and probe it with the kernel-PCA projection of
           each global HCube's points, recording which local HCube each
           projected point lands in.
        5. Feed the per-bin views to ``collapse_join`` and print the result.

      The method then terminates the process with ``sys.exit(0)``; the
      bipartite-graph report after that call is currently unreachable and is
      retained only so it can be re-enabled by removing the exit.

      Returns:
        Nominally an empty list, but in practice control never returns to the
        caller.

      Raises:
        SystemExit: always, once the collapse-join results have been printed.
        RuntimeError: if the covariance projection has to be computed but no
          IPCA model is stored in the catalog.
      """
      # PRE-PROCESSING -------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")
      self.cacheclient = CacheClient(self.name)
      labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
      num_pts = len(labeled_pts_rms)
      logging.debug('##NUM_OBS: %d', num_pts)

      # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
      TEST_TBIN = [(2,0), (4,2), (2,2), (4,1), (3,1), (4,4), (0,4), (0,2), (0,1)]
      MAX_SAMPLE_SIZE   = 100   # Max # of cov traj to back project per HCube
      MAX_PT_PER_MATRIX = 100   # Num points to sample from each cov traj
      COVAR_SIZE        = 200   # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
      MAX_HCUBE         = 6     # Max Num HCubes to process (<= 0 means "use all")
      NUM_PC            = 6     # Number of components for a newly created incremental PCA

      # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

      #  1. RUN KPCA on <<?????>> (sample set) and project all pts
      #  2. Calculate K-D Tree on above
      #  3. Score each point with distance to centroid
      #  4. B = Select the smallest half of clusters
      #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
      #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
      #       ALT-> use HCube size as its weight
      #  7. A = HCubes for states 3 (and 4)
      #  8. Reweight A into both state 3 and state 4 (B) HCubes
      #  9. ID Overlap
      # 10. Apply Gamma Function

      logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
      logging.info("Retrieving All Covariance Vectors")
      home = os.getenv('HOME')
      covar_cache = home + '/work/DEBUG_COVAR_PTS'
      subcovar_cache = home + '/work/DEBUG_SUBCOVAR_PTS'
      ipca_key = 'subspace:covar:ipca'

      def load_covar_pts():
        """Return all covariance vectors, preferring the local .npy debug cache."""
        if os.path.exists(covar_cache + '.npy'):
          pts = np.load(covar_cache + '.npy')
          logging.debug('Loaded From File')
          return pts
        covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
        pts = np.array([np.fromstring(x) for x in covar_raw])
        np.save(covar_cache, pts)
        logging.debug('Loaded From Catalog & Saved')
        return pts

      # For recalculating covariance matrices (if not pre-calc/stored)
      DO_COVAR = self.calc_covar
      # Both names are bound up front so the projection fallback further down
      # can tell "not loaded yet" apart from "loaded" instead of hitting a
      # NameError when DO_COVAR is off.
      covar_pts = load_covar_pts() if DO_COVAR else None
      ipca = None

      covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
      logging.debug('Indiced Loaded. Retrieving File Indices')
      # Not read anywhere below; retained so the catalog access pattern is unchanged.
      covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

      if DO_COVAR:
        logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
        logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

        # FOR incremental PCA:
        ipca = PCAnalyzer.load(self.catalog, ipca_key)
        if ipca is None:
          logging.info('Creating a NEW IPCA')
          ipca = PCAIncremental(NUM_PC)
          lastindex = 0
        else:
          lastindex = ipca.trainsize
          logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
            ipca.trainsize, len(covar_pts)-ipca.trainsize)

        # For incremental, partial solve using only newer pts (from the last "trainsize")
        if len(covar_pts)-lastindex > 0:
          ipca.solve(covar_pts[lastindex:])
          logging.info("Incrementatl PCA Updated. Storing Now...")

          ####  BARRIER
          self.wait_catalog()
          ipca.store(self.catalog, ipca_key)

        logging.info("IPCA Saved. Projecting Covariance to PC")

      if os.path.exists(subcovar_cache + '.npy'):
        subspace_covar_pts = np.load(subcovar_cache + '.npy')
      else:
        # No cached projection: the raw vectors and a PCA model are both
        # required here, even if the DO_COVAR branch above was skipped.
        if covar_pts is None:
          covar_pts = load_covar_pts()
        if ipca is None:
          ipca = PCAnalyzer.load(self.catalog, ipca_key)
          if ipca is None:
            raise RuntimeError(
                'No IPCA model stored under `%s`; enable calc_covar to build one' % ipca_key)
        subspace_covar_pts = ipca.project(covar_pts)
        np.save(subcovar_cache, subspace_covar_pts)

      # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
      logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
      global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')

      all_global_leaves = global_kdtree.getleaves()
      if MAX_HCUBE <= 0:
        hcube_global = all_global_leaves
      else:
        # FOR DEBUGGING -- keep only the first MAX_HCUBE global HCubes
        hcube_global = dict(list(all_global_leaves.items())[:MAX_HCUBE])

      # The count is reported from the data so the message cannot go stale
      # when MAX_HCUBE is changed.
      logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY %d USED)',
        len(hcube_global))
      for k in sorted(hcube_global.keys()):
        v = hcube_global[k]
        logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'], v['volume'], v['density'])

      if self.filelog:
        keys = hcube_global.keys()
        self.filelog.info('global,keys,%s',','.join(keys))
        self.filelog.info('global,count,%s',','.join([str(hcube_global[k]['count']) for k in keys]))
        self.filelog.info('global,volume,%s',','.join([str(hcube_global[k]['volume']) for k in keys]))
        self.filelog.info('global,density,%s',','.join([str(hcube_global[k]['density']) for k in keys]))

      logging.info("=====  SELECT Sampling of points from each Global HCube  (B)")
      # Re-key the dict so iteration goes from the least to the most populated HCube.
      hcube_global = dict(sorted(hcube_global.items(), key=lambda item: item[1]['count']))

      num_global = len(hcube_global)
      for counter, (key, hc) in enumerate(hcube_global.items(), start=1):
        if hc['count'] <= MAX_SAMPLE_SIZE:
          cov_index = hc['elm']
          hc['samplefactor'] = 1
        else:
          # NOTE(review): np.random.choice draws WITH replacement by default, so
          # the sample may contain duplicates -- confirm this is intended.
          cov_index = np.random.choice(hc['elm'], MAX_SAMPLE_SIZE)
          hc['samplefactor'] = len(hc['elm']) / MAX_SAMPLE_SIZE
        hc['idxlist'] = []
        for cov in cov_index:
          # Pick MAX_PT_PER_MATRIX offsets inside the COVAR_SIZE-wide window
          # that starts at this covariance point's index.
          selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
          hc['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
        logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, num_global)
        source_cov = self.backProjection(hc['idxlist'])
        hc['alpha'] = datareduce.filter_alpha(source_cov)
        logging.debug('Back Projected %d points to HD space: %s',
          len(hc['idxlist']), str(hc['alpha']))

      # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
      # (the sample object is not read below; construction is retained as-is)
      reservoir = ReservoirSample('rms', self.catalog)

      logging.info("=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) ")
      logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
      hcube_local = {}

      logging.info("=======================================================")
      logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
      logging.info("=======================================================\n")

      # overlap_hcube[global key][tbin][local hcube] -> projection statistics
      overlap_hcube = {k: {} for k in hcube_global.keys()}
      # projection_map[bin index][local hcube] -> set of projected point numbers
      projection_map = {}

      # One (initially empty) list per back-projected point; each processed
      # bin appends the local HCube that point was projected into.
      total_projected = sum(len(hcube_global[key]['alpha'].xyz) for key in hcube_global)
      pt_projection_list = [[] for _ in range(total_projected)]

      for bin_idx, tbin in enumerate(TEST_TBIN):
        logging.info("Project Global HCubes into local subspace for %s", str(tbin))
        logging.info('Loading subspace and kernel for bin %s', str(tbin))

        # LOAD KPCA Kernel matrix
        kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
        kpca = PCAnalyzer.load(self.catalog, kpca_key)

        data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
        data = np.array([np.fromstring(x) for x in data_raw])
        if len(data) == 0:
          logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
          continue
        if kpca is None:
          # PCAnalyzer.load yields None for a missing model (see the IPCA
          # handling above); without the kernel nothing can be projected.
          logging.error('No KPCA kernel stored for bin %s.... Going to next bin', str(tbin))
          continue

        logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
        kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
        local_leaves = kdtree.getleaves()
        hcube_local[tbin] = local_leaves
        logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
        for k in sorted(local_leaves.keys()):
          logging.info('    `%-9s`   #pts:%6d   density:%9.1f',
            k, len(local_leaves[k]['elm']), local_leaves[k]['density'])

        if self.filelog:
          keys = local_leaves.keys()
          A,B = tbin
          self.filelog.info('local,%d_%d,keys,%s',A,B,','.join(keys))
          self.filelog.info('local,%d_%d,count,%s',A,B,','.join([str(local_leaves[k]['count']) for k in keys]))
          self.filelog.info('local,%d_%d,volume,%s',A,B,','.join([str(local_leaves[k]['volume']) for k in keys]))
          self.filelog.info('local,%d_%d,density,%s',A,B,','.join([str(local_leaves[k]['density']) for k in keys]))

        logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
        bin_view = {k: set() for k in local_leaves.keys()}
        projection_map[bin_idx] = bin_view

        # pnum numbers the projected points consecutively across all global
        # HCubes (visited in sorted key order), restarting at 0 for every bin
        # so the same point gets the same number in every bin.
        pnum = 0
        for key in sorted(hcube_global.keys()):
          overlap = {}
          overlap_hcube[key][tbin] = overlap
          cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

          logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ',
            key, len(cov_proj_pca), str(tbin))
          for pt in cov_proj_pca:
            hcube = kdtree.probe(pt, probedepth=9)
            # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
            if hcube not in overlap:
              overlap[hcube] = {
                  'idxlist': local_leaves[hcube]['elm'],
                  'wgt': local_leaves[hcube]['density'],
                  'num_projected': 0}
            overlap[hcube]['num_projected'] += 1

            # Index this point in corresponding local HCube projection view
            bin_view[hcube].add(pnum)

            pt_projection_list[pnum].append(hcube)
            pnum += 1

          for k, v in sorted(overlap.items()):
            logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])

      # Re-Index projected points -- could make this a list too
      # Each view is ({bin index}, [({local hcube key}, {point numbers}), ...]).
      view_list = [
          ({bin_idx}, [({hcube_key}, set(pts)) for hcube_key, pts in hcube_map.items()])
          for bin_idx, hcube_map in projection_map.items()]

      print("CALLING: Collapse Join")
      joined_subspaces = collapse_join(projection_map.keys(), view_list)
      for subspace_list, correlated_hcubes in joined_subspaces:
        tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
        for joined_hcubes, pt_list in correlated_hcubes:
          print(tbin_list, joined_hcubes, pt_list)
          # TODO: Correlate Back to Global
      print('Visualize HERE')

      # Optional nearest-neighbour consistency report.
      # NOTE: this block reads `dist_hd[key]` (N x N high-dim distances between
      # the back-projected points of a global HCube) and `dist_ld[key][tbin]`
      # (the same distances in each bin's projected space). Neither is computed
      # in this method any more, so turning the flag on without restoring them
      # raises NameError.
      DO_MIN_CHECK = False
      if DO_MIN_CHECK:
        def maxcount(x):
          """Return the multiplicity of the most frequent item in x (0 if empty)."""
          tally = {}
          for i in x:
            tally[i] = tally.get(i, 0) + 1
          return max(tally.values(), default=0)

        print('%% of Points Per HCube with same NN subspaces (e.g. 20%% of points have same NN in 5 sub-spaces')
        argmin_nonzero = lambda x: np.argmin([(i if i>0 else np.inf) for i in x])
        for key in hcube_global.keys():
          num_hd_pts = len(dist_hd[key])
          # minA[n]: index of point n's nearest neighbour in every bin that has distances
          minA = {}
          for n in range(num_hd_pts):
            minA[n] = [argmin_nonzero(dist_ld[key][tbin][n])
                       for tbin in TEST_TBIN if tbin in dist_ld[key]]
          numsame = np.zeros(len(dist_ld[key].keys())+1)
          for n in range(num_hd_pts):
            numsame[maxcount(minA[n])] += 1
          print(' '.join(['%4.1f%%'%i for i in (100* (numsame/np.sum(numsame)))]))

      print('Stopping HERE!')
      sys.exit(0)

      # ---------------------------------------------------------------------
      # NOTE: everything below is unreachable while the sys.exit(0) above is
      # in place; it is kept so the bipartite report can be re-enabled.
      # ---------------------------------------------------------------------
      #  GAMMA FUNCTION EXPR # 8
      gamma1 = lambda a, b : (a * b)
      gamma2 = lambda a, b : (a + b) / 2

      # TODO: Factor in RMS weight
      for tbin in TEST_TBIN:
        logging.info('')
        logging.info('BIPARTITE GRAPH for %s', str(tbin))
        bipart = {}
        edgelist = []
        for hcB in hcube_global.keys():
          num_B  = hcube_global[hcB]['count']
          wgt1_B = hcube_global[hcB]['density']
          if tbin not in overlap_hcube[hcB]:
            continue
          for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
            num_proj = hcA_data['num_projected']
            wgt_A  = hcA_data['wgt']
            wgt2_B = wgt1_B*num_proj
            edge = {
                'combW1': gamma1(wgt_A, wgt1_B),
                'combW2': gamma1(wgt_A, wgt2_B),
                'combW3': gamma2(wgt_A, wgt1_B),
                'combW4': gamma2(wgt_A, wgt2_B),
                'num_A': len(hcA_data['idxlist']),
                'num_B': num_B,
                'num_proj': num_proj,
                'wgt_A': wgt_A,
                'wgt1_B': wgt1_B,
                'wgt2_B': wgt2_B,
                'hcA': hcA,
                'hcB': hcB}
            bipart.setdefault(hcA, []).append(edge)
            edgelist.append((hcA, hcB, num_proj))
        if not bipart:
          logging.info("NO DATA FOR %s", str(tbin))
          continue
        logging.info('')
        logging.info('A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)')
        for edges_of_A in bipart.values():
          for edge in edges_of_A:
            # Columns follow the header: projected/total counts, then the last
            # two columns are AVG(A,B1)=combW3 and AVG(A,B2)=combW4.
            logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_proj)4d / %(num_B)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
            if self.filelog:
              A,B = tbin
              self.filelog.info('edge,%d_%d,%s,%s,%d',A,B,edge['hcA'],edge['hcB'],edge['num_proj'])

        # Prepare nodes for graph: only edges carrying more than 5 projected points
        elist = [(a, b, z) for a, b, z in edgelist if z > 5]
        nAKeys = sorted({a for a, _, _ in elist}, reverse=True)
        nBKeys = sorted({b for _, b, _ in elist}, reverse=True)
        sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
        sizesB = [hcube_global[n]['count']*3 for n in nBKeys]
        idxA = {key: i for i, key in enumerate(nAKeys)}
        idxB = {key: i for i, key in enumerate(nBKeys)}
        edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
        G.bipartite(sizesA,sizesB,edges,sizesA,sizesB,'bipartite_%d_%d' % tbin)

      logging.info('STOPPING HERE!!!!')
      sys.exit(0)
      return []