Example #1
# NOTE: assumes module-level objects from the DaMSL/ddc project are in scope
# (numpy as np, redis, PCAnalyzer, KDTree, the plotting helper P, the loaders
#  get_local()/get_edges(), and the Ca-filtered trajectory `alpha` projected below).
def reproj_distro():
    local = get_local()
    data = get_edges()
    r2 = redis.StrictRedis(port=6385, decode_responses=True)
    for tbin in [(2, 4), (2, 2), (4, 4), (4, 2), (4, 1), (3, 1)]:
        print('Processing:', tbin)
        tkey = '%d_%d' % tbin
        # Get Kernel
        kpca_key = 'subspace:pca:kernel:' + tkey
        kpca = PCAnalyzer.load(r2, kpca_key)
        # Get Training Data
        data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
        pca_pts = np.array([np.fromstring(x) for x in data_raw])
        kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')
        proj_pts = kpca.project(alpha.xyz)
        biased_hcubes = []
        for i, pt in enumerate(proj_pts):
            biased_hcubes.append(kdtree.probe(pt, probedepth=9))
        if len(data) == 0:
            print('No Raw PCA data points for bin %s.... Going to next bin'
                  % str(tbin))
            continue
        counts = {}
        for i in biased_hcubes:
            if i not in counts:
                counts[i] = 0
            counts[i] += 1
        for i in local[tkey]['keys']:
            if i not in counts:
                counts[i] = 0
        print('check')
        cvect = [counts[i] for i in local[tkey]['keys']]
        d = np.array(cvect) / sum(cvect)
        c = np.array(data[tkey])
        lcnt = np.sum(c, axis=0)
        gcnt = np.sum(c, axis=1)
        norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])
        # Add biased data as a col
        kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
        kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
        arr = np.vstack((norm, kpca_cnt_norm, d)).T
        rowlist = tuple(gcnt) + (
            'localKPCA',
            'biased',
        )
        P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
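
The core of reproj_distro() is a count-and-normalize step: every projected point is probed into
the bin's local KD-tree, hits are tallied per hypercube key, and the tallies are normalized into
a distribution over the bin's canonical key order. Below is a minimal, self-contained sketch of
just that step; toy string keys stand in for the project's KD-tree hypercube keys, and the
function name is purely illustrative.

import numpy as np
from collections import Counter

def biased_distribution(probed_keys, canonical_keys):
    """Tally probed hypercube keys and normalize over a fixed key order."""
    counts = Counter(probed_keys)
    cvect = np.array([counts.get(k, 0) for k in canonical_keys], dtype=float)
    total = cvect.sum()
    return cvect / total if total > 0 else cvect

# Toy usage:
probed = ['0101', '0101', '0110', '1000']
keys   = ['0101', '0110', '0111', '1000']
print(biased_distribution(probed, keys))   # -> [0.5  0.25 0.   0.25]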
Example #2
File: simanl.py Project: DaMSL/ddc
  def execute(self, job):

  # PRE-PROCESS ----------------------------------------------------------
    settings = systemsettings()
    bench = microbench('sim_%s' % settings.name, self.seqNumFromID())
    bench.start()
    stat  = StatCollector('sim_%s' % settings.name, self.seqNumFromID())
    mylogical_seqnum = str(self.seqNumFromID())

    # Prepare working directory, input/output files
    conFile = os.path.join(job['workdir'], job['name'] + '.conf')
    logFile = conFile.replace('conf', 'log')      # log in same place as config file
    dcdFile = conFile.replace('conf', 'dcd')      # dcd in same place as config file
    USE_SHM = True

    ADAPTIVE_CENTROID = False

    SIMULATE_RATIO = settings.SIMULATE_RATIO
    if SIMULATE_RATIO > 1:
      logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
    frame_size = (SIMULATE_RATIO * int(job['interval'])) / (1000)
    logging.info('Frame Size is %f  Using Sim Ratio of 1:%d', \
      frame_size, SIMULATE_RATIO)

    EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
    logging.info('Running Experiment Configuration #%d', EXPERIMENT_NUMBER)

    # TODO: FOR LINEAGE
    # srcA, srcB = eval(job['src_bin'])
    # stat.collect('src_bin', [str(srcA), str(srcB)])

    traj = None

  # EXECUTE SIMULATION ---------------------------------------------------------
    if self.skip_simulation:

      logging.info('1. SKIPPING SIMULATION.....')
      USE_SHM = False

      job['dcd'] = dcdFile
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

    else:
      logging.info('1. Run Simulation')

      # Prepare & source to config file
      with open(self.data['sim_conf_template'], 'r') as template:
        source = template.read()

      # >>>>Storing DCD into shared memory on this node

      if USE_SHM:
        # ramdisk = '/dev/shm/out/'
        ramdisk = '/tmp/ddc/'
        if not os.path.exists(ramdisk):
          os.mkdir(ramdisk)
        job['outputloc'] = ramdisk
        dcd_ramfile = os.path.join(ramdisk, job['name'] + '.dcd')
      else:
        job['outputloc'] = ''

      with open(conFile, 'w') as sysconfig:
        sysconfig.write(source % job)
        logging.info("Config written to: " + conFile)

      # # Run simulation in parallel
      # if 'parallel' in job:
      #   numnodes = job['parallel']
      #   total_tasks = numnodes * 24
      #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

      # # Run simulation single threaded
      # else:
      #   cmd = 'namd2 %s > %s' % (conFile, logFile)

      # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
      check = executecmd('module list')
      logging.debug('%s', check)

      cmd = 'namd2 +p%d %s > %s' % (PARALLELISM, conFile, logFile)

      #  MICROBENCH #1 (file to Lustre)
      # logging.debug("Executing Simulation:\n   %s\n", cmd)
      # bench = microbench()
      # bench.start()
      # stdout = executecmd(cmd)
      # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
      # bench.mark('SimExec:%s' % job['name'])
      # shm_contents = os.listdir('/dev/shm/out')
      # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # logging.info("Copy Complete to Lustre.")
      # bench.mark('CopyLustre:%s' % job['name'])
      # shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
      # bench.show()

      max_expected_obs = int(job['runtime']) // int(job['dcdfreq'])
      # Retry up to 3 attempts if the sim fails
      MAX_TRY = 3
      for i in range(MAX_TRY, 0, -1):
        min_required_obs = int(max_expected_obs * ((i-1)/(MAX_TRY)))
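        # Illustrative arithmetic for this threshold: with MAX_TRY = 3, the first
        # attempt (i=3) requires 2/3 of max_expected_obs, the second (i=2) requires
        # 1/3, and the final attempt (i=1) accepts any number of frames.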
        logging.debug("Executing Simulation:\n   %s\n", cmd)
        logging.debug('# Obs Expected to see: %d', max_expected_obs)
        stdout = executecmd(cmd)
        logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
        # Check file for expected data
        if USE_SHM:
          traj = md.load(dcd_ramfile, top=job['pdb'])
        else:
          traj = md.load(dcdFile, top=job['pdb'])
        logging.info("Obs Threshold  = %4d", min_required_obs)
        logging.info("#Obs This Traj = %4d", traj.n_frames)
        if traj.n_frames >= min_required_obs:
          logging.info('Full (enough) Sim Completed')
          break
        logging.info('Detected a failed Simulation. Retrying the same sim.')
      
      bench.mark('SimExec:%s' % job['name'])

      # Internal stats
      sim_length = self.data['sim_step_size'] * int(job['runtime'])
      sim_realtime = bench.delta_last()
      sim_run_ratio =  (sim_realtime/60) / (sim_length/1000000)
      logging.info('##SIM_RATIO %6.3f  min-per-ns-sim', sim_run_ratio)
      stat.collect('sim_ratio', sim_run_ratio)

      if USE_SHM:
        shm_contents = os.listdir(ramdisk)
        logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
          time.sleep(10)

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD STIILL FILE NOT FOUND!!!!")
        else:
          logging.info("DCD File was found")

      # #  MICROBENCH #2 (file to Alluxio)
      # allux = AlluxioClient()
      # # copy to Aluxio FS
      # allux.put(ramdisk + job['name'] + '.dcd', '/')
      # logging.info("Copy Complete to Alluxio.")
      # bench.mark('CopyAllux:%s' % job['name'])

      # And copy to Lustre
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # And copy to Lustre (using zero-copy):
      if USE_SHM:
        src  = open(dcd_ramfile, 'rb')
        dest = open(dcdFile, 'w+b')
        offset = 0
        dcdfilesize = os.path.getsize(dcd_ramfile)
        while True:
          sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
          if sent == 0:
            break
          offset += sent
        logging.info("Copy Complete to Lustre.")
        bench.mark('CopyLustre:%s' % job['name'])
      
      # TODO: Update job's metadata
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

  # ANALYSIS  ------------------------------------------------------------------
    #  ANALYSIS ALGORITHM
  # 1. With combined Sim-analysis: file is loaded locally from shared mem
    logging.debug("2. Load DCD")

    # Load full higher dim trajectory
    # traj = datareduce.filter_heavy(dcd_ramfile, job['pdb'])
    if traj is None:
      if USE_SHM:
        traj = md.load(dcd_ramfile, top=job['pdb'])
      else:
        traj = md.load(dcdFile, top=job['pdb'])

    # Center Coordinates
    traj.center_coordinates()

    bench.mark('File_Load')
    logging.debug('Trajectory Loaded: %s (%s)', job['name'], str(traj))


  #  DIMENSIONALITY REDUCTION --------------------------------------------------
  # 4-A. Subspace Calculation: RMS using Alpha-Filter
    #------ A:  RMSD-ALPHA  ------------------
      #     S_A = rmslist
    logging.info('---- RMSD Calculation against pre-defined centroids ----')
      #  RMSD is calculated on the Ca ('alpha') atoms in distance space
      #   whereby all pairwise distances are calculated for each frame.
      #   Pairwise distances are plotted in euclidean space
      #   Distance to each of the 5 pre-calculated centroids is calculated

    # 1. Filter to Alpha atoms
    alpha = traj.atom_slice(deshaw.FILTER['alpha'])



    # 2. (IF USED) Convert to distance space: pairwise dist for all atom combinations
    # alpha_dist = dr.distance_space(alpha)

    # 3. Calc RMS for each conform to all centroids
    # Heuristic centroid weight (TODO: make this trained)

    # 4. For adaptive Centroids
    #  Centroids will be pulled & updated.
    logging.info('CENTROID Retrieval & Updating')
    self.wait_catalog()

    #  If they were mutable....
    # logging.info('Acquiring a Lock on the Centroids')
    # centroids = self.catalog.loadNPArray('centroid')
    # thetas = self.catalog.loadNPArray('thetas')
    # lock = self.catalog.lock_acquire('centroid')
    # if lock is None:
    #   logging.info('Could not lock the Centroids. Will use current cached (possibly stale) data.')
    # bench.mark('ConcurrLockCentroid'%(A,B))

    #  Implemented as a Transactional Data Structure....
    if ADAPTIVE_CENTROID:
      centroids = []
      for state in range(numLabels):
        cent_raw  = self.catalog.lrange('centroid:xyz:%d'%state, 0, -1)
        cent_xyz  = [pickle.loads(i) for i in cent_raw]
        cent_npts = [int(i) for i in self.catalog.lrange('centroid:npts:%d'%state, 0, -1)]
        c_sum = np.zeros(shape=cent_xyz[0].shape)
        c_tot = 0
        for x, n in zip(cent_xyz, cent_npts):
          c = x * n
          c_sum += c
          c_tot += n
        centroids.append(c_sum / c_tot)
    else:
      centroids = self.catalog.loadNPArray('centroid')      

    # if EXPERIMENT_NUMBER < 10:
    # 5. Calculate the RMSD for each filtered point to 5 pre-determined centroids
    # cw = [.92, .94, .96, .99, .99]
    cw = [.94, .95, .97, .99, .99]

    numLabels = len(self.data['centroid'])
    numConf = len(traj.xyz)
    stat.collect('numpts',numConf)

    # 4. Account for noise : Simple spatial mean filter over a small window
    #    Where size of window captures extent of noise 
    #    (e.g. 10000fs window => motions under 10ps are considered "noisy")
    noise = self.data['obs_noise']
    stepsize = 500 if 'interval' not in job else int(job['interval'])
    nwidth = noise//(2*stepsize)
    noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)
    rms_filtered = np.array([noisefilt(alpha.xyz, i) for i in range(numConf)])
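    # Illustrative numbers (assuming obs_noise = 10000 fs and the default 500 fs
    # interval): nwidth = 10, so each frame is replaced by the mean of a window of
    # up to 2*nwidth = 20 neighbouring frames, smoothing motions under ~10 ps.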
    # Notes: Delta_S == rmslist
    rmslist_sv = calc_rmsd(rms_filtered, centroids, weights=cw)
      # rmslist = adaptive_rmsd(rms_filtered, centroids, theta)

    # else:
    rmslist = calc_rmsd(alpha, centroids)

    numConf = traj.n_frames
    numLabels = len(centroids)

    # rmslist = calc_rmsd(alpha.xyz, self.data['centroid'], weights=cw)
    logging.debug('  RMS:  %d points projected to %d centroid-distances', \
      numConf, numLabels)


    # 6. Apply Heuristics Labeling  -- Single Variate
    rmslabel = []
    binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
    label_count = {ab: 0 for ab in binlist}
    groupbystate = [[] for i in range(numLabels)]
    groupbybin = {ab: [] for ab in binlist}
    for i, rms in enumerate(rmslist_sv):
      #  Sort RMSD by proximity & set state A as nearest state's centroid
      A, B = np.argsort(rms)[:2]

      #  Calc Absolute proximity between nearest 2 states' centroids
      # THETA calc derived from a static run: it is based on the average std dev of all RMSDs from a static run
      #   of BPTI without solvent. It could be dynamically calculated, but is hard coded here.
      #  The theta is divided by four based on the analysis of DEShaw:
      #   est. based on ~3% of DEShaw data being in transition
      # avg_stddev = 0.34119404492089034
      # theta = settings.RMSD_THETA
      ## FOR ADAPTIVE Centroids, theta is now updated dynamically

      # NOTE: Original formulation was relative. Retained here for reference:
      # Rel vs Abs: Calc relative proximity for top 2 nearest centroids   
      # relproximity = rms[A] / (rms[A] + rms[rs[1]])
      # B = rs[1] if relproximity > (.5 - theta) else A
      # proximity = abs(rms[prox[1]] - rms[A]) / (rms[prox[1]] + rms[A])  #relative
      #proximity = abs(rms[prox[1]] - rms[A])    #abs
      # Update for Adaptive Centroid.
      delta = np.abs(rms[B] - rms[A])

      #  (TODO:  Factor in more than top 2, better noise)
      #  Label secondary sub-state
      # sub_state = B prox[1] if proximity < theta else A
      # For ADAPTIVE Centroids
      if delta < 0.33:
        sub_state = B
      else:
        sub_state = A
      rmslabel.append((A, sub_state))
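      # Illustrative example: rms = [2.1, 3.4, 1.8, 5.0, 4.2] sorts to A=2, B=0 and
      # delta = |2.1 - 1.8| ~= 0.3 < 0.33, so the frame is labeled (2, 0) -- in state 2
      # but near state 0; with delta >= 0.33 it would be labeled (2, 2).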

      # Add this index to the set of indices for this respective label
      #  TODO: Should we evict if binsize is too big???
      # logging.debug('Label for observation #%3d: %s', i, str((A, B)))
      label_count[(A, sub_state)] += 1

      # Group high-dim point by state
      # TODO: Consider grouping by state only or well/transitions (5 vs 10 bins)
      groupbystate[A].append(i)
      groupbybin[(A, sub_state)].append(i)

    # stat.collect('observe', label_count)
    bench.mark('RMS')
    logging.info('Labeled the following:')
    for A in range(numLabels):
      if len(groupbystate[A]) > 0:
        logging.info('label,state,%d,num,%d', A, len(groupbystate[A]))
    for ab in binlist:
      if len(groupbybin[ab]) > 0:
        A, B = ab
        logging.info('label,bin,%d,%d,num,%d', A, B, len(groupbybin[ab]))

    # FEATURE LANDSCAPE -- Multi-Variate

    # Calc Feature landscape for each frame's RMSD
    feal_list = [feal.atemporal(rms) for rms in rmslist]
    logging.info('Calculated Feature Landscape. Aggregate for this traj')
    # For logging purposes
    agg_feal = np.mean(feal_list, axis=0)
    logging.info('CountsMax [C]:  %s', str(agg_feal[:5]))
    logging.info('StateDist [S]:  %s', str(agg_feal[5:10]))
    logging.info('RelDist [A-B]:  %s', str(agg_feal[10:]))

    #  ADAPTIVE CENTROID & THETA CALCULATION
    # if lock is None:
    #   logging.info('Never acquired a lock. Skipping adaptive update (TODO: Mark pts as stale)')
    # else:  
    #   logging.info('Updating Adaptive Centroid')
    
    if ADAPTIVE_CENTROID:
      pipe = self.catalog.pipeline()
      for state in range(numLabels):
        n_pts = len(groupbybin[(state, state)])
        if n_pts == 0:
          logging.info('Skipping State %d Centroid -- Well not visited on this trajectory', state)
          continue

        cent_xyz  = [alpha.xyz[i] for i in groupbybin[(state, state)]]
        cent_npts = len(groupbybin[(state, state)])
        c_sum = np.zeros(shape=alpha.xyz[0].shape)
        for pt in cent_xyz:
          c_sum += pt
        centroid_local = c_sum / n_pts
        centroid_delta = LA.norm(centroids[state] - centroid_local)
        pipe.rpush('centroid:xyz:%d' % state, pickle.dumps(centroid_local))
        pipe.rpush('centroid:npts:%d' % state, n_pts)
        pipe.rpush('centroid:delta:%d' % state, centroid_delta)
      pipe.execute()

  # 4-B. Subspace Calculation: COVARIANCE Matrix, 200 ps windows, Full Protein
  #------ B:  Covariance Matrix  -----------------
    if EXPERIMENT_NUMBER > 5:
      # 1. Project Pt to PC's for each conform (top 3 PC's)
      logging.info('---- Covariance Calculation on 200 ps windows (Full Protein, cartesian Space) ----')

      # Calculate Covariance over 200 ps Windows sliding every 100ps
      #  These could be user influenced...
      WIN_SIZE_NS = .2
      SLIDE_AMT_NS = .1
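      # Illustrative arithmetic: 1 ns of trajectory covered by 0.2 ns windows slid
      # every 0.1 ns yields roughly 9 overlapping covariance matrices (window starts
      # at 0.0, 0.1, ..., 0.8 ns), depending on how the final partial window is handled.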
      logging.debug("Calculating Covariance over trajectory. frame_size = %.1f, WINSIZE = %dps, Slide = %dps", 
        frame_size, WIN_SIZE_NS*1000, SLIDE_AMT_NS*1000)
      covar = dr.calc_covar(alpha.xyz, WIN_SIZE_NS, frame_size, slide=SLIDE_AMT_NS)
      bench.mark('CalcCovar')
      stat.collect('numcovar', len(covar))
      logging.debug("Calcualted %d covariance matrices. Storing variances", len(covar)) 


  #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
    # try:
    self.wait_catalog()
    # except OverlayNotAvailable as e:
    #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")


  # Update Catalog with 1 Long Atomic Transaction  
    global_index = []
    with self.catalog.pipeline() as pipe:
      while True:
        try:
          logging.debug('Update Filelist')
          pipe.watch(wrapKey('jc', job['name']))
          file_idx = pipe.rpush('xid:filelist', job['dcd']) - 1
          # HD Points
          logging.debug('Update HD Points')
          for x in range(traj.n_frames):
            # Note: Pipelined insertions "should" return contiguous set of index points
            index = pipe.rpush('xid:reference', (file_idx, x)) - 1
            global_index.append(index - 1) 

          pipe.multi()
          logging.debug('Update RMS Subspace')
          for x in range(traj.n_frames):
            A, B = rmslabel[x]
            index = global_index[x]
            # Labeled Observation (from RMSD)
            pipe.rpush('label:rms', rmslabel[x])
            pipe.rpush('varbin:rms:%d_%d' % (A, B), index)
            # pipe.rpush('lineage:rms:%d_%d:%d_%d' % (srcA, srcB, A, B), index)
            # pipe.rpush('lineage:pca:%s:%d_%d' % (job['src_hcube'], A, B), index)
            pipe.rpush('subspace:rms', bytes(rmslist_sv[x]))
            pipe.rpush('subspace:feal', bytes(feal_list[x]))            

          logging.debug('Update OBS Counts')
          for b in binlist:
            pipe.rpush('observe:rms:%d_%d' % b, label_count[b])
          pipe.incr('observe:count')
          pipe.hset('anl_sequence', job['name'], mylogical_seqnum)

          if EXPERIMENT_NUMBER > 5:
            logging.debug('Update Covar Subspace')
            for i, si in enumerate(covar):
              logging.debug('Update COVAR Pt #%d', i)
              local_index = int(i * frame_size * SLIDE_AMT_NS)
              pipe.rpush('subspace:covar:pts', bytes(si))
              pipe.rpush('subspace:covar:xid', global_index[local_index])
              pipe.rpush('subspace:covar:fidx', (file_idx, local_index))

          logging.debug('Executing')
          pipe.execute()
          break
        except redis.WatchError as e:
          logging.debug('WATCH ERROR')
          continue

    self.data[key]['xid:start'] = global_index[0]
    self.data[key]['xid:end'] = global_index[-1]
    bench.mark('Indx_Update')

  # (Should we Checkpoint here?)

  # 4-C. Subspace Calculation: PCA BY Strata (PER STATE) using Alpha Filter
  #------ C:  GLOBAL PCA by state  -----------------
  #  Update PCA Vectors for each state with new data
    if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
      logging.info('---- PCA per BIN over Alpha Filter in cartesian Space ----')
      # TODO:  This will eventually get moved into a User process macrothread
      #   which will sit in between analysis and controller.
      # For now, we're recalculating using a lock

      # Check if vectors need to be recalculated
      # Connect to reservoir samples
      # TODO: Catalog or Cache?
      reservoir = ReservoirSample('rms', self.catalog)
      # STALENESS_FACTOR = .25   # Recent updates account for 25% of the sample (Just a guess)

      num_inserted = {ab: 0 for ab in binlist}
      num_params = np.prod(alpha.xyz.shape[1:])

      for A, B in binlist:
        num_observations = len(groupbybin[(A,B)])

        if num_observations == 0:
          logging.info('No data received for bin (%d,%d).  Not processing this bin here.', A, B)
          continue

        res_label = '%d_%d' % (A,B)
        updateVectors = False
        kpca_key = 'subspace:pca:kernel:%d_%d' % (A, B)
        kpca = PCAnalyzer.load(self.catalog, kpca_key)
        newkpca = False
        if kpca is None:
          # kpca = PCAKernel(None, 'sigmoid')
          kpca = PCAKernel(6, 'rbf')
          newkpca = True


        logging.info('PCA:  Checking if current vectors for state %d are out of date', A)
        rsize = reservoir.getsize(res_label)
        tsize = kpca.trainsize

        #  KPCA is out of date if the sample size is 50% larger than the previously used training set
        #  Heuristic --- this could be a different "staleness" factor or we can check it some other way
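        #  e.g. (illustrative) a kernel trained on a 1000-point reservoir is rebuilt
        #  once the reservoir grows past 1500 points (rsize > tsize * 1.5)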
        if newkpca or rsize > (tsize * 1.5):

          #  Should we only use a sample here??? (not now -- perhaps with larger reservoirs or if KPCA is slow)
          traindata = reservoir.get(res_label)
          if newkpca:
            logging.info('New PCA Kernel. Trained on data set of size %d. '
                         'Current reservoir is %d pts.', tsize, rsize)
            logging.info('Projecting %d points on Kernel PCA for bin (%d,%d)',
              num_observations, A, B)
            traindata = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]),
              dtype=np.float32)
            for i, index in enumerate(groupbybin[(A,B)]):
              np.copyto(traindata[i], alpha.xyz[index])
          else:
            logging.info('PCA Kernel is old (updating it). Trained on data set of '
                         'size %d. Current reservoir is %d pts.', tsize, rsize)


          if len(traindata) <= num_params:
            logging.info("Not enough data to calculate PC's (Need at least %d \
              observations). Skipping PCA for Bin (%d,%d)", num_params, A, B)
            hd_pts = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]), dtype=np.float32)
            for i, index in enumerate(groupbybin[(A,B)]):
              np.copyto(hd_pts[i], alpha.xyz[index])
            num_inserted[(A,B)] = reservoir.insert(res_label, hd_pts)
            logging.debug('Updating reservoir Sample for Bin (%d, %d)', A, B)
            continue

          logging.info('   Performing Kernel PCA (Gaussian) for bin (%d,%d) using traindata of size %d', \
            A, B, len(traindata))

          kpca.solve(traindata)

          # NOTE: Pick PCA Algorithm HERE
          # pca = calc_kpca(np.array(traindata), kerneltype='sigmoid')
          # pca = calc_pca(np.array(traindata))
          bench.mark('CalcKPCA_%d_%d'%(A,B))

          # new_vect = pca.alphas_.T
          lock = self.catalog.lock_acquire(kpca_key)
          if lock is None:
            logging.info('Could not lock the PC Kernel for Bin (%d,%d). Not updating', A, B)
          else:
            kpca.store(self.catalog, kpca_key)
            lock = self.catalog.lock_release(kpca_key, lock)
          bench.mark('ConcurrPCAWrite_%d_%d'%(A,B))

          # Project Reservoir Sample to the Kernel and overwrite current set of points
          #  This should only happen up until the reservoir is filled
          # If we are about to retrain, be sure to project all reservoir points
          if not newkpca:
            logging.info('Clearing and Re-Projecting the entire reservoir of %d points for Bin (%d,%d).', \
              rsize, A, B)
            rsamp_lowdim = kpca.project(traindata)
            pipe = self.catalog.pipeline()
            pipe.delete('subspace:pca:%d_%d'%(A,B))
            for si in rsamp_lowdim:
              pipe.rpush('subspace:pca:%d_%d'%(A,B), bytes(si))
            pipe.execute()


        else:
          logging.info('PCA Kernel is good -- no need to change them')

        bench.mark('start_ProjPCA')
        logging.info('Projecting %d points on Kernel PCA for Bin (%d,%d)', num_observations, A, B)
        hd_pts = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]), dtype=np.float32)
        for i, index in enumerate(groupbybin[(A,B)]):
          np.copyto(hd_pts[i], alpha.xyz[index])
        pc_proj = kpca.project(hd_pts)
        bench.mark('ProjPCA_%d_%d'%(A,B))

        # 2. Append subspace in catalog
        pipe = self.catalog.pipeline()
        for si in pc_proj:
          pipe.rpush('subspace:pca:%d_%d' % (A,B), bytes(si))
        pipe.execute()

        logging.debug('Updating reservoir Sample')
        num_inserted[(A,B)] = reservoir.insert(res_label, hd_pts)

      bench.mark('PCA')
      pipe = self.catalog.pipeline()
      for ab, num in num_inserted.items():
        if num > 0:
          pipe.rpush('subspace:pca:updates:%d_%d' % ab, num)
      pipe.execute()

  # ---- POST PROCESSING
    if USE_SHM:
      shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      shm_contents = os.listdir('/tmp')
      logging.debug('Ramdisk contents (should be empty of DDC) : %s', str(shm_contents))
    
    # For benchmarching:
    # print('##', job['name'], dcdfilesize/(1024*1024*1024), traj.n_frames)
    bench.show()
    stat.show()

    # Return # of observations (frames) processed
    return [numConf]
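
The simulation step above moves the finished DCD from the node-local ramdisk to Lustre with
sendfile(), so the kernel copies the bytes without staging them through user space. A standalone
sketch of that copy loop, assuming Python 3 on Linux where os.sendfile is available (the code
above relies on a sendfile binding imported elsewhere in simanl.py); the helper name and paths
are illustrative only.

import os

def zero_copy(src_path, dest_path):
    """Copy src to dest with os.sendfile, advancing the offset until EOF."""
    size = os.path.getsize(src_path)
    with open(src_path, 'rb') as src, open(dest_path, 'w+b') as dest:
        offset = 0
        while offset < size:
            sent = os.sendfile(dest.fileno(), src.fileno(), offset, size - offset)
            if sent == 0:   # nothing left to send
                break
            offset += sent
    return offset

# zero_copy('/tmp/ddc/some_job.dcd', '/path/to/workdir/some_job.dcd')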
Example #3
    def execute(self):
        """Special execute function for the reweight operator -- check/validate.
      """
        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        self.cacheclient = CacheClient(self.name)
        numLabels = 5
        binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
        labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
        num_pts = len(labeled_pts_rms)
        logging.debug('##NUM_OBS: %d', num_pts)

        # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
        TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4),
                     (0, 2), (0, 1)]
        MAX_SAMPLE_SIZE = 100  # Max # of cov traj to back project per HCube
        MAX_PT_PER_MATRIX = 100  # Num points to sample from each cov traj
        COVAR_SIZE = 200  # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
        MAX_HCUBE = 6  # Max Num HCubes to process

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCube size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamma Function

        logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
        logging.info("Retrieving All Covariance Vectors")
        home = os.getenv('HOME')
        cfile = home + '/work/DEBUG_COVAR_PTS'
        DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
        if DO_COVAR:
            if os.path.exists(cfile + '.npy'):
                covar_pts = np.load(cfile + '.npy')
                logging.debug('Loaded From File')
            else:
                covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
                covar_pts = np.array([np.fromstring(x) for x in covar_raw])
                np.save(cfile, covar_pts)
                logging.debug('Loaded From Catalog & Saved')
        covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
        logging.debug('Indices Loaded. Retrieving File Indices')
        covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

        if DO_COVAR:
            logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
            logging.info(
                "Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)"
            )

            # FOR incremental PCA:
            NUM_PC = 6
            ipca_key = 'subspace:covar:ipca'
            ipca = PCAnalyzer.load(self.catalog, ipca_key)
            if ipca is None:
                logging.info('Creating a NEW IPCA')
                ipca = PCAIncremental(NUM_PC)
                lastindex = 0
            else:
                lastindex = ipca.trainsize
                logging.info(
                    'IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                    ipca.trainsize,
                    len(covar_pts) - ipca.trainsize)

            # For incremental, partial solve using only newer pts (from the last "trainsize")
            if len(covar_pts) - lastindex > 0:
                ipca.solve(covar_pts[lastindex:])
                logging.info("Incrementatl PCA Updated. Storing Now...")

                ####  BARRIER
                self.wait_catalog()
                ipca.store(self.catalog, ipca_key)

            logging.info("IPCA Saved. Projecting Covariance to PC")

        cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
        if os.path.exists(cfile + '.npy'):
            subspace_covar_pts = np.load(cfile + '.npy')
        else:
            subspace_covar_pts = ipca.project(covar_pts)
            np.save(cfile, subspace_covar_pts)

        # Otherwise: PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
        logging.info(
            'Building Global KD Tree over Covar Subspace with %d data pts',
            len(subspace_covar_pts))
        global_kdtree = KDTree(250,
                               maxdepth=8,
                               data=subspace_covar_pts,
                               method='middle')

        if MAX_HCUBE <= 0:
            hcube_global = global_kdtree.getleaves()
        else:
            # FOR DEBUGGING -- USE ONLY MAX_HCUBE GLOBAL HCUBES
            hcube_global_ALL = global_kdtree.getleaves()
            hcube_global = {}
            num = 0
            for k, v in hcube_global_ALL.items():
                hcube_global[k] = v
                num += 1
                if num == MAX_HCUBE:
                    break

        # hcube_global = global_kdtree.getleaves()
        logging.info(
            'Global HCubes: Key  Count  Volume  Density  (NOTE: DEBUGGING -- only MAX_HCUBE used)'
        )
        for k in sorted(hcube_global.keys()):
            v = hcube_global[k]
            logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'],
                         v['volume'], v['density'])

        if self.filelog:
            keys = hcube_global.keys()
            self.filelog.info('global,keys,%s', ','.join(keys))
            self.filelog.info(
                'global,count,%s',
                ','.join([str(hcube_global[k]['count']) for k in keys]))
            self.filelog.info(
                'global,volume,%s',
                ','.join([str(hcube_global[k]['volume']) for k in keys]))
            self.filelog.info(
                'global,density,%s',
                ','.join([str(hcube_global[k]['density']) for k in keys]))

        logging.info(
            "=====  SELECT Sampling of points from each Global HCube  (B)")
        s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
        hcube_global = {x[0]: x[1] for x in s}

        counter = 0
        for key in hcube_global.keys():
            counter += 1
            if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
                cov_index = hcube_global[key]['elm']
                hcube_global[key]['samplefactor'] = 1
            else:
                cov_index = np.random.choice(hcube_global[key]['elm'],
                                             MAX_SAMPLE_SIZE)
                hcube_global[key]['samplefactor'] = len(
                    hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
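            # Illustrative numbers: a global HCube holding 500 covariance trajectories
            # is down-sampled to MAX_SAMPLE_SIZE = 100, so each sampled trajectory
            # carries a samplefactor of 5 when it is later reweighted.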
            hcube_global[key]['idxlist'] = []
            for cov in cov_index:
                selected_hd_idx = np.random.choice(COVAR_SIZE,
                                                   MAX_PT_PER_MATRIX).tolist()
                hcube_global[key]['idxlist'].extend(
                    [int(covar_index[cov]) + i for i in selected_hd_idx])
            logging.info('Back Projecting Global HCube `%s`  (%d out of %d)',
                         key, counter, len(hcube_global.keys()))
            source_cov = self.backProjection(hcube_global[key]['idxlist'])
            hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
            logging.debug('Back Projected %d points to HD space: %s',
                          len(hcube_global[key]['idxlist']),
                          str(hcube_global[key]['alpha']))

        # logging.info('Calculating all HD Distances')
        # dist_hd = {}
        # dist_ld = {}
        # for key in hcube_global.keys():
        #   T = hcube_global[key]['alpha'].xyz
        #   N = len(T)
        #   dist_hd[key] = np.zeros(shape=(N, N))
        #   dist_ld[key] = {}
        #   for A in range(0, N):
        #     dist_hd[key][A][A] = 0
        #     for B in range(A+1, N):
        #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
        reservoir = ReservoirSample('rms', self.catalog)

        logging.info(
            "=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) "
        )
        hcube_list = {}

        logging.info(
            "Scanning current set of observed bins and finding all smallest with data (excluding largest 2)"
        )
        hcube_local = {}

        logging.info("=======================================================")
        logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
        logging.info(
            "=======================================================\n")

        overlap_hcube = {k: {} for k in hcube_global.keys()}

        projection_map = {}

        pt_projection_list = []
        for key in sorted(hcube_global.keys()):
            for i in range(len(hcube_global[key]['alpha'].xyz)):
                pt_projection_list.append([])
        for bin_idx, tbin in enumerate(TEST_TBIN):
            logging.info("Project Global HCubes into local subspace for %s",
                         str(tbin))
            # Load Vectors
            logging.info('Loading subspace and kernel for bin %s', str(tbin))

            # LOAD KPCA Kernel matrix
            kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
            kpca = PCAnalyzer.load(self.catalog, kpca_key)

            data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
            data = np.array([np.fromstring(x) for x in data_raw])
            if len(data) == 0:
                logging.error(
                    'No Raw PCA data points for bin %s.... Going to next bin',
                    str(tbin))
                continue

            logging.info(
                'Building KDtree over local %s bin from observations matrix of size: %s',
                str(tbin), str(data.shape))
            kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
            hcube_local[tbin] = kdtree.getleaves()
            logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
            for k in sorted(hcube_local[tbin].keys()):
                logging.info('    `%-9s`   #pts:%6d   density:%9.1f', k,
                             len(hcube_local[tbin][k]['elm']),
                             hcube_local[tbin][k]['density'])

            if self.filelog:
                keys = hcube_local[tbin].keys()
                A, B = tbin
                self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
                self.filelog.info(
                    'local,%d_%d,count,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['count']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,volume,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['volume']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,density,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['density']) for k in keys]))

            n_total = 0
            logging.debug('Global Hcubes to Project (%d):  %s',
                          len(hcube_global.keys()), str(hcube_global.keys()))
            projection_map[bin_idx] = {
                k: set()
                for k in hcube_local[tbin].keys()
            }

            pnum = 0
            for key in sorted(hcube_global.keys()):
                overlap_hcube[key][tbin] = {}
                cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

                logging.debug(
                    'PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ',
                    key, len(cov_proj_pca), str(tbin))
                for i, pt in enumerate(cov_proj_pca):
                    hcube = kdtree.probe(pt, probedepth=9)
                    # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
                    if hcube not in overlap_hcube[key][tbin]:
                        overlap_hcube[key][tbin][hcube] = {
                            'idxlist': hcube_local[tbin][hcube]['elm'],
                            'wgt': hcube_local[tbin][hcube]['density'],
                            'num_projected': 0
                        }
                    overlap_hcube[key][tbin][hcube]['num_projected'] += 1

                    # Index this point in corresponding local HCube projection view
                    projection_map[bin_idx][hcube].add(pnum)

                    pt_projection_list[pnum].append(hcube)
                    pnum += 1

                for k, v in sorted(overlap_hcube[key][tbin].items()):
                    logging.debug(
                        '   Project ==> Local HCube `%-9s`: %5d points', k,
                        v['num_projected'])
                # logging.info('Calculating Lower Dimensional Distances')
                # N = len(cov_proj_pca)
                # dist_ld[key][tbin] = np.zeros(shape=(N, N))
                # for A in range(0, N):
                #   for B in range(A+1, N):
                #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

    # Re-Index projected points -- could make this a list too

        next_index = 0
        view_list = []
        for bin_idx, hcube_map in projection_map.items():
            hcube_list = []
            for hcube_key, pt_list in hcube_map.items():
                hcube_list.append((set((hcube_key, )), set(pt_list)))
            view_list.append((set((bin_idx, )), hcube_list))

        print("CALLING: Collapse Join")
        joined_subspaces = collapse_join(projection_map.keys(), view_list)
        for subspace_list, correlated_hcubes in joined_subspaces:
            tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
            for hcube_list, pt_list in correlated_hcubes:
                print(tbin_list, hcube_list, pt_list)
                # TODO: Correlate Back to Global
        print('Visualize HERE')

        # for idx, tbin in enumerate(TEST_TBIN):
        #   # Only process substates with data
        #   if tbin not in hcube_local:
        #     logging.warning('Local KD Tree not created for %s', str(tbin))
        #     continue
        #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # sets = {}
        # proj_bin_list = []
        # for tbin in TEST_TBIN:
        #   if tbin not in hcube_local:
        #     continue
        #   proj_bin_list.append(tbin)
        #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # set_list = {}
        # for tbin, view in sets.items():
        #   set_list[(tbin,)] = []
        #   for hcube, idxlist in view.items():
        #     print(tbin, hcube, idxlist)
        #     set_list[(tbin,)].append((set((hcube,)), idxlist))

        # def collapse(C):
        #   a = 0
        #   b = 0
        #   N = []
        #   while a < len(C) and b < len(C):
        #     A = sorted(C[a])
        #     B = sorted(C[b])
        #     if A == B:
        #       b += 1
        #     elif A[0] == B[0]:
        #       N.append(set(A)|set(B))
        #       b += 1
        #     else:
        #       a += 1
        #   if len(N) <= 1:
        #     return []
        #   else:
        #     return N + collapse(N)

        # q=collapse(t1)
        # for i in q: print(sorted(i))

        # print('Checking all 2-Way Joins')
        # join2 = {}
        # for a in range(0, len(proj_bin_list)-1):
        #   tA = proj_bin_list[a]
        #   for b in range(a+1, len(proj_bin_list)):
        #     tB = proj_bin_list[b]
        #     join_ss = tuple(set((tA, tB)))
        #     set_list = []
        #     for kA, vA in sets[tA].items():
        #       for kB, vB in sets[tB].items():
        #         join_hc = set((kA, kB))
        #         inter = vA & vB
        #         if len(inter) > 0:
        #           set_list.append((join_hc, inter))
        #     if len(set_list) > 0:
        #       join2[join_ss] = set_list
        # print('2-Way Join Results:')
        # for ss, set_list in join2.items():
        #   for hc, idxlist in set_list:
        #     print(ss, hc, idxlist)

        # print('Checking all 3-Way Joins')
        # join3 = []
        # checked = []
        # for a in range(0, len(join2)-1):
        #   sA, hA, vA = join2[a]
        #   for b in range(a+1, len(join2)):
        #     sB, hB, vB = join2[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join3.append((ss, hc, inter))

        # print('Checking all 4-Way Joins')
        # join4 = []
        # checked = []
        # for a in range(0, len(join3)-1):
        #   sA, hA, vA = join3[a]
        #   for b in range(a+1, len(join3)):
        #     sB, hB, vB = join3[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join4.append((ss, hc, inter))

        # if self.filelog:
        #   for i in join2:
        #     self.filelog.info('%s', str(i))
        #   for i in join3:
        #     self.filelog.info('%s', str(i))
        #   for i in join4:
        #     self.filelog.info('%s', str(i))

        # NOTE: the min/max check below relies on dist_hd / dist_ld from the
        # commented-out distance calculations above, so it stays disabled by default.
        DO_MIN_CHECK = False
        if DO_MIN_CHECK:

            def maxcount(x):
                y = {}
                for i in x:
                    y[i] = 1 if i not in y else y[i] + 1
                return max(y.values())

            print(
                '% of Points Per HCube with same NN subspaces (e.g. 20% of points have same NN in 5 sub-spaces)'
            )
            argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf)
                                                  for i in x])
            for key in hcube_global.keys():
                # logging.info('Showing MIN / MAX for points from HCube %s:', key)
                minA = {}
                maxA = {}
                for n in range(len(dist_hd[key])):
                    minA[n] = []
                    maxA[n] = []
                    for tbin in TEST_TBIN:
                        if tbin not in dist_ld[key].keys():
                            # append placeholders so the per-tbin lists stay aligned
                            minA[n].append(0)
                            maxA[n].append(0)
                        else:
                            minA[n].append(
                                argmin_nonzero(dist_ld[key][tbin][n]))
                            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
                numsame = np.zeros(len(dist_ld[key].keys()) + 1)
                for n in range(len(dist_hd[key])):
                    minH = argmin_nonzero(dist_hd[key][n])
                    maxH = np.argmax(dist_hd[key][n])
                    minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
                    numsamepair = maxcount(minA[n])
                    numsame[numsamepair] += 1
                    # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
                print(' '.join([
                    '%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))
                ]))

        print('Stopping HERE!')
        sys.exit(0)
        #  GAMMA FUNCTION EXPR # 8
        gamma1 = lambda a, b: (a * b)
        gamma2 = lambda a, b: (a + b) / 2
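        # Illustrative values: gamma1(0.4, 0.5) = 0.20 (product emphasizes joint density)
        # while gamma2(0.4, 0.5) = 0.45 (average keeps either weight from dominating).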

        # TODO: Factor in RMS weight
        for tbin in TEST_TBIN:
            # for tbin in sorted(bin_list):
            logging.info('')
            logging.info('BIPARTITE GRAPH for %s', str(tbin))
            bipart = {}
            edgelist = []
            for hcB in hcube_global.keys():
                num_B = hcube_global[hcB]['count']
                wgt1_B = hcube_global[hcB]['density']
                if tbin not in overlap_hcube[hcB]:
                    continue
                for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
                    edge = {}
                    if hcA not in bipart:
                        bipart[hcA] = []
                    num_proj = hcA_data['num_projected']
                    wgt_A = hcA_data['wgt']
                    wgt2_B = wgt1_B * num_proj
                    edge['combW1'] = gamma1(wgt_A, wgt1_B)
                    edge['combW2'] = gamma1(wgt_A, wgt2_B)
                    edge['combW3'] = gamma2(wgt_A, wgt1_B)
                    edge['combW4'] = gamma2(wgt_A, wgt2_B)
                    edge['num_A'] = len(hcA_data['idxlist'])
                    edge['num_B'] = num_B
                    edge['num_proj'] = num_proj
                    edge['wgt_A'] = wgt_A
                    edge['wgt1_B'] = wgt1_B
                    edge['wgt2_B'] = wgt2_B
                    edge['hcA'] = hcA
                    edge['hcB'] = hcB
                    bipart[hcA].append(edge)
                    edgelist.append((hcA, hcB, num_proj))
            if len(bipart) == 0:
                logging.info("NO DATA FOR %s", str(tbin))
                continue
            logging.info('')
            logging.info(
                'A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)'
            )
            for k, v in bipart.items():
                for edge in v:
                    logging.info(
                        'A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f'
                        % edge)
                    if self.filelog:
                        A, B = tbin
                        self.filelog.info('edge,%d_%d,%s,%s,%d', A, B,
                                          edge['hcA'], edge['hcB'],
                                          edge['num_proj'])

            # Prepare nodes for graph
            nA = set()
            nB = set()
            elist = []
            for e in edgelist:
                a, b, z = e
                if z <= 5:
                    continue
                nA.add(a)
                nB.add(b)
                elist.append((a, b, z))
            nAKeys = sorted(nA)[::-1]
            nBKeys = sorted(nB)[::-1]
            sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
            sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
            idxA = {key: i for i, key in enumerate(nAKeys)}
            idxB = {key: i for i, key in enumerate(nBKeys)}
            edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
            G.bipartite(sizesA, sizesB, edges, sizesA, sizesB,
                        'bipartite_%d_%d' % tbin)

        logging.info('STOPPING HERE!!!!')
        sys.exit(0)
        return []
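
The bipartite reweighting above combines a local HCube's density (wgt_A) with a global HCube's
density (wgt1_B) and its projected mass (wgt2_B = wgt1_B * num_projected) through the two gamma
functions. Below is a small self-contained sketch of how one edge's combined weights are derived,
with made-up numbers; the helper name is illustrative only.

def edge_weights(wgt_A, wgt1_B, num_projected):
    """Combine local and global HCube weights as in the bipartite edges above."""
    gamma1 = lambda a, b: a * b          # product
    gamma2 = lambda a, b: (a + b) / 2    # average
    wgt2_B = wgt1_B * num_projected      # global density scaled by projected mass
    return {'combW1': gamma1(wgt_A, wgt1_B),
            'combW2': gamma1(wgt_A, wgt2_B),
            'combW3': gamma2(wgt_A, wgt1_B),
            'combW4': gamma2(wgt_A, wgt2_B)}

# Toy numbers: local density 4.0, global density 2.5, 10 projected points
print(edge_weights(4.0, 2.5, 10))
# -> {'combW1': 10.0, 'combW2': 100.0, 'combW3': 3.25, 'combW4': 14.5}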
Example #5
0
    def execute(self):
      """Special execute function for the reweight operator -- check/validate.
      """
    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")
      self.cacheclient = CacheClient(self.name)
      numLabels = 5
      binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
      labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
      num_pts = len(labeled_pts_rms)
      logging.debug('##NUM_OBS: %d', num_pts)

      # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
      TEST_TBIN = [(2,0), (4,2), (2,2), (4,1), (3,1), (4,4), (0,4), (0,2), (0,1)]
      MAX_SAMPLE_SIZE   =  100   # Max # of cov traj to back project per HCube
      MAX_PT_PER_MATRIX =  100   # Num points to sample from each cov traj
      COVAR_SIZE        = 200   # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
      MAX_HCUBE         = 6      # Max Num HCubes to process


    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamme Function

      logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
      logging.info("Retrieving All Covariance Vectors")
      home = os.getenv('HOME')
      cfile = home + '/work/DEBUG_COVAR_PTS'
      DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
      if DO_COVAR: 
        if os.path.exists(cfile + '.npy'):
          covar_pts = np.load(cfile + '.npy')
          logging.debug('Loaded From File')
        else: 
          covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
          covar_pts = np.array([np.fromstring(x) for x in covar_raw])
          np.save(cfile, covar_pts)
          logging.debug('Loaded From Catalog & Saved')
      covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
      logging.debug('Indiced Loaded. Retrieving File Indices')
      covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

      if DO_COVAR: 
        logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
        logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

        # FOR incrementatl PCA:
        NUM_PC = 6
        ipca_key = 'subspace:covar:ipca'
        ipca = PCAnalyzer.load(self.catalog, ipca_key)
        if ipca is None:
          logging.info('Creating a NEW IPCA')
          ipca = PCAIncremental(NUM_PC)
          lastindex = 0
        else:
          lastindex = ipca.trainsize
          logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts', 
            ipca.trainsize, len(covar_pts)-ipca.trainsize)

        # For incremental, partial solve using only newer pts (from the last "trainsize")
        if len(covar_pts)-lastindex > 0:
          ipca.solve(covar_pts[lastindex:])
          logging.info("Incrementatl PCA Updated. Storing Now...")

          ####  BARRIER 
          self.wait_catalog()
          ipca.store(self.catalog, ipca_key)

        logging.info("IPCA Saved. Projecting Covariance to PC")

      cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
      if os.path.exists(cfile + '.npy'):
        subspace_covar_pts = np.load(cfile + '.npy')
      else: 
        subspace_covar_pts = ipca.project(covar_pts)
        np.save(cfile, subspace_covar_pts)

      # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
      logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
      global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')
  

      if MAX_HCUBE <= 0:
        hcube_global = global_kdtree.getleaves()
      else:
      # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
        hcube_global_ALL = global_kdtree.getleaves()
        hcube_global = {}
        num = 0
        for k, v in hcube_global_ALL.items():
          hcube_global[k] = v
          num += 1
          if num == MAX_HCUBE:
            break

      # hcube_global = global_kdtree.getleaves()
      logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)')
      for k in sorted(hcube_global.keys()):
        v = hcube_global[k]
        logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'], v['volume'], v['density'])

      if self.filelog:
        keys = hcube_global.keys()
        self.filelog.info('global,keys,%s',','.join(keys))
        self.filelog.info('global,count,%s',','.join([str(hcube_global[k]['count']) for k in keys]))
        self.filelog.info('global,volume,%s',','.join([str(hcube_global[k]['volume']) for k in keys]))
        self.filelog.info('global,density,%s',','.join([str(hcube_global[k]['density']) for k in keys]))

      logging.info("=====  SELECT Sampling of points from each Global HCube  (B)")
      # Order global HCubes by count (smallest first) before sampling
      s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
      hcube_global = {x[0]: x[1] for x in s}


      counter = 0
      for key in hcube_global.keys():
        counter += 1
        if hcube_global[key]['count']  <= MAX_SAMPLE_SIZE:
          cov_index = hcube_global[key]['elm']
          hcube_global[key]['samplefactor'] = 1
        else:
          cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
          hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
        hcube_global[key]['idxlist'] = []
        for cov in cov_index:
          selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
          hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
        logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, len(hcube_global.keys()))
        source_cov = self.backProjection(hcube_global[key]['idxlist'])
        hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
        logging.debug('Back Projected %d points to HD space: %s', 
          len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))
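      # Note on 'samplefactor' above (assumed values for illustration): a global
      # HCube holding 1000 covariance vectors sampled down to MAX_SAMPLE_SIZE=250
      # gets samplefactor = 1000/250 = 4, so statistics computed from the sample
      # can be rescaled by that factor to estimate the full HCube.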

      # logging.info('Calculating all HD Distances')
      # dist_hd = {}
      # dist_ld = {}
      # for key in hcube_global.keys():
      #   T = hcube_global[key]['alpha'].xyz
      #   N = len(T)
      #   dist_hd[key] = np.zeros(shape=(N, N))
      #   dist_ld[key] = {}
      #   for A in range(0, N):
      #     dist_hd[key][A][A] = 0
      #     for B in range(A+1, N):
      #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])
        

      # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
      reservoir = ReservoirSample('rms', self.catalog)

      logging.info("=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) ")
      hcube_list = {}

      logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
      hcube_local = {}

      logging.info("=======================================================")
      logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
      logging.info("=======================================================\n")

      overlap_hcube = {k: {} for k in hcube_global.keys()}
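      # overlap_hcube is filled in the projection loop below as:
      #   overlap_hcube[global_hcube][tbin][local_hcube] =
      #     {'idxlist': local member indices, 'wgt': local density, 'num_projected': # of projected pts}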

      projection_map = {}


      pt_projection_list = []
      for key in sorted(hcube_global.keys()):
        for i in range(len(hcube_global[key]['alpha'].xyz)):
          pt_projection_list.append([])
      for bin_idx, tbin in enumerate(TEST_TBIN):
        logging.info("Project Global HCubes into local subspace for %s", str(tbin))
        # Load Vectors
        logging.info('Loading subspace and kernel for bin %s', str(tbin))

        # LOAD KPCA Kernel matrix
        kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
        kpca = PCAnalyzer.load(self.catalog, kpca_key)

        data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
        data = np.array([np.fromstring(x) for x in data_raw])
        if len(data) == 0:
          logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
          continue


        logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
        kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
        hcube_local[tbin] = kdtree.getleaves()
        logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
        for k in sorted(hcube_local[tbin].keys()):
          logging.info('    `%-9s`   #pts:%6d   density:%9.1f', 
            k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

        if self.filelog:
          keys = hcube_local[tbin].keys()
          A,B = tbin
          self.filelog.info('local,%d_%d,keys,%s',A,B,','.join(keys))
          self.filelog.info('local,%d_%d,count,%s',A,B,','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
          self.filelog.info('local,%d_%d,volume,%s',A,B,','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
          self.filelog.info('local,%d_%d,density,%s',A,B,','.join([str(hcube_local[tbin][k]['density']) for k in keys]))          

        n_total = 0
        logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
        projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}
        
        pnum = 0
        for key in sorted(hcube_global.keys()):
          overlap_hcube[key][tbin] = {}
          cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

          logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ', 
            key, len(cov_proj_pca), str(tbin))
          for i, pt in enumerate(cov_proj_pca):
            hcube = kdtree.probe(pt, probedepth=9)
            # NOTE: Retaining the count of projected pts; individual pts are also tracked via projection_map below
            if hcube not in overlap_hcube[key][tbin]:
              overlap_hcube[key][tbin][hcube] = {
                  'idxlist': hcube_local[tbin][hcube]['elm'],
                  'wgt': hcube_local[tbin][hcube]['density'], 
                  'num_projected': 0}
            overlap_hcube[key][tbin][hcube]['num_projected'] += 1

            # Index this point in corresponding local HCube projection view
            projection_map[bin_idx][hcube].add(pnum)

            pt_projection_list[pnum].append(hcube)
            pnum += 1

          for k, v in sorted(overlap_hcube[key][tbin].items()):
            logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])
          # logging.info('Calculating Lower Dimensional Distances')
          # N = len(cov_proj_pca)
          # dist_ld[key][tbin] = np.zeros(shape=(N, N))
          # for A in range(0, N):
          #   for B in range(A+1, N):
          #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])


      # Re-index the projected points -- could make this a list too

      next_index = 0
      view_list = []
      for bin_idx, hcube_map in projection_map.items():
        hcube_list = []
        for hcube_key, pt_list in hcube_map.items():
          hcube_list.append((set((hcube_key,)), set(pt_list)))
        view_list.append((set((bin_idx,)), hcube_list))

      print("CALLING: Collapse Join")
      joined_subspaces = collapse_join(projection_map.keys(), view_list)
      for subspace_list, correlated_hcubes in joined_subspaces:
        tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
        for hcube_list, pt_list in correlated_hcubes:
          print(tbin_list, hcube_list, pt_list)
          # TODO: Correlate back to Global
      print('Visualize HERE')
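      # collapse_join is defined elsewhere in the project; a minimal sketch of
      # the pairwise case it generalizes (names here are illustrative only):
      #
      #   def join_two_views(view_a, view_b):
      #     """Intersect the point sets of every HCube pair across two views.
      #     Each view is a list of (hcube_key_set, point_id_set) tuples."""
      #     joined = []
      #     for hc_a, pts_a in view_a:
      #       for hc_b, pts_b in view_b:
      #         common = pts_a & pts_b
      #         if common:
      #           joined.append((hc_a | hc_b, common))
      #     return joined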



      # for idx, tbin in enumerate(TEST_TBIN):
      #   # Only process substates with data
      #   if tbin not in hcube_local:
      #     logging.warning('Local KD Tree not created for %s', str(tbin))
      #     continue
      #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}


      # sets = {}
      # proj_bin_list = []
      # for tbin in TEST_TBIN:
      #   if tbin not in hcube_local:
      #     continue
      #   proj_bin_list.append(tbin)
      #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
      # for n, proj in enumerate(pt_projection_list):
      #   for i, tbin in enumerate(proj_bin_list):
      #     sets[tbin][proj[i]].add(n)
      #   if self.filelog:
      #     self.filelog.info('%d,%s', n, ','.join(proj))
      #   logging.info('%d,%s', n, ','.join(proj))

      # set_list = {}
      # for tbin, view in sets.items():
      #   set_list[(tbin,)] = []
      #   for hcube, idxlist in view.items():
      #     print(tbin, hcube, idxlist)
      #     set_list[(tbin,)].append((set((hcube,)), idxlist))


      # def collapse(C):
      #   a = 0
      #   b = 0
      #   N = []
      #   while a < len(C) and b < len(C):
      #     A = sorted(C[a])
      #     B = sorted(C[b])
      #     if A == B:
      #       b += 1
      #     elif A[0] == B[0]:
      #       N.append(set(A)|set(B))
      #       b += 1
      #     else:
      #       a += 1
      #   if len(N) <= 1:
      #     return []
      #   else:
      #     return N + collapse(N)

      # q=collapse(t1)
      # for i in q: print(sorted(i))


      # print('Checking all 2-Way Joins')
      # join2 = {}
      # for a in range(0, len(proj_bin_list)-1):
      #   tA = proj_bin_list[a]
      #   for b in range(a+1, len(proj_bin_list)):
      #     tB = proj_bin_list[b]
      #     join_ss = tuple(set((tA, tB)))
      #     set_list = []
      #     for kA, vA in sets[tA].items():
      #       for kB, vB in sets[tB].items():
      #         join_hc = set((kA, kB))
      #         inter = vA & vB
      #         if len(inter) > 0:
      #           set_list.append((join_hc, inter))
      #     if len(set_list) > 0:
      #       join2[join_ss] = set_list
      # print('2-Way Join Results:')
      # for ss, set_list in join2.items():
      #   for hc, idxlist in set_list:
      #     print(ss, hc, idxlist)


      # print('Checking all 3-Way Joins')
      # join3 = []
      # checked = []
      # for a in range(0, len(join2)-1):
      #   sA, hA, vA = join2[a]
      #   for b in range(a+1, len(join2)):
      #     sB, hB, vB = join2[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join3.append((ss, hc, inter))


      # print('Checking all 4-Way Joins')
      # join4 = []
      # checked = []
      # for a in range(0, len(join3)-1):
      #   sA, hA, vA = join3[a]
      #   for b in range(a+1, len(join3)):
      #     sB, hB, vB = join3[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join4.append((ss, hc, inter))

      # if self.filelog:
      #   for i in join2:
      #     self.filelog.info('%s', str(i))
      #   for i in join3:
      #     self.filelog.info('%s', str(i))
      #   for i in join4:
      #     self.filelog.info('%s', str(i))

      # NOTE: this check depends on dist_hd / dist_ld from the distance
      # calculations commented out above; re-enable those before setting
      # DO_MIN_CHECK to True.
      DO_MIN_CHECK = False
      if DO_MIN_CHECK:
        def maxcount(x):
          y={}
          for i in x:
            y[i] = 1 if i not in y else y[i]+1
          return max(y.values())

        print('% of points per HCube with the same NN subspaces (e.g. 20% of points have the same NN in 5 sub-spaces)')
        argmin_nonzero = lambda x: np.argmin([(i if i>0 else np.inf) for i in x])
        for key in hcube_global.keys():
          # logging.info('Showing MIN / MAX for points from HCube %s:', key)
          minA = {}; maxA = {}
          for n in range(len(dist_hd[key])):
            minA[n] = []; maxA[n] = []
            for tbin in TEST_TBIN:
              if tbin not in dist_ld[key].keys():
                # Keep the lists aligned with TEST_TBIN by padding with zeros
                minA[n].append(0)
                maxA[n].append(0)
              else:
                minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
                maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
          numsame = np.zeros(len(dist_ld[key].keys())+1)
          for n in range(len(dist_hd[key])):
            minH = argmin_nonzero(dist_hd[key][n])
            maxH = np.argmax(dist_hd[key][n])
            minmax = ['%2d/%-2d'%i for i in zip(minA[n], maxA[n])]
            numsamepair = maxcount(minA[n])
            numsame[numsamepair] += 1
            # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
          print(' '.join(['%4.1f%%'%i for i in (100* (numsame/np.sum(numsame)))]))

      print('Stopping HERE!')
      sys.exit(0)
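      # NOTE: everything below this exit is currently unreachable; remove the
      # sys.exit(0) above (and the one near the end) to run the bipartite
      # graph construction.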
      #  GAMMA FUNCTION EXPR # 8
      gamma1 = lambda a, b: (a * b)       # combine weights by product
      gamma2 = lambda a, b: (a + b) / 2   # combine weights by average
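      # Worked example with assumed values: wgt_A = 4.0 (local density),
      # wgt1_B = 2.5 (global density), num_proj = 10 so wgt2_B = 25.0:
      #   combW1 = gamma1(4.0, 2.5)  = 10.0      combW2 = gamma1(4.0, 25.0) = 100.0
      #   combW3 = gamma2(4.0, 2.5)  = 3.25      combW4 = gamma2(4.0, 25.0) = 14.5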

      # TODO: Factor in RMS weight
      for tbin in TEST_TBIN:
      # for tbin in sorted(bin_list):
        logging.info('')
        logging.info('BIPARTITE GRAPH for %s', str(tbin))
        bipart = {}
        edgelist = []
        for hcB in hcube_global.keys():
          num_B  = hcube_global[hcB]['count']
          wgt1_B = hcube_global[hcB]['density']
          if tbin not in overlap_hcube[hcB]:
            continue
          for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
            edge = {}
            if hcA not in bipart:
              bipart[hcA] = []  
            num_proj  = hcA_data['num_projected']
            wgt_A  = hcA_data['wgt']
            wgt2_B = wgt1_B*num_proj
            edge['combW1'] = gamma1(wgt_A, wgt1_B)
            edge['combW2'] = gamma1(wgt_A, wgt2_B)
            edge['combW3'] = gamma2(wgt_A, wgt1_B)
            edge['combW4'] = gamma2(wgt_A, wgt2_B)
            edge['num_A']  = len(hcA_data['idxlist'])
            edge['num_B']  = num_B
            edge['num_proj']  = num_proj
            edge['wgt_A']  = wgt_A
            edge['wgt1_B'] = wgt1_B
            edge['wgt2_B'] = wgt2_B
            edge['hcA'] = hcA
            edge['hcB'] = hcB
            bipart[hcA].append(edge)
            edgelist.append((hcA, hcB, num_proj))
        if len(bipart) == 0:
          logging.info("NO DATA FOR %s", str(tbin))
          continue
        logging.info('')
        logging.info('A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)')
        for k, v in bipart.items():
          for edge in v:
            logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
            if self.filelog:
              A,B = tbin
              self.filelog.info('edge,%d_%d,%s,%s,%d',A,B,edge['hcA'],edge['hcB'],edge['num_proj'])

        # Prepare nodes for graph
        nA = set()
        nB = set()
        elist = []
        for e in edgelist:
          a, b, z = e
          if z <= 5:
            continue
          nA.add(a)
          nB.add(b)
          elist.append((a,b,z))
        nAKeys = sorted(nA)[::-1]
        nBKeys = sorted(nB)[::-1]
        sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
        sizesB = [hcube_global[n]['count']*3 for n in nBKeys]
        idxA = {key: i for i, key in enumerate(nAKeys)}
        idxB = {key: i for i, key in enumerate(nBKeys)}
        edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
        G.bipartite(sizesA,sizesB,edges,sizesA,sizesB,'bipartite_%d_%d' % tbin)

      logging.info('STOPPING HERE!!!!')
      sys.exit(0)
      return []