Example #1
# NOTE: assumes module-level objects from the DaMSL/ddc project are in scope
# (numpy as np, redis, PCAnalyzer, KDTree, the plotting helper P, the loaders
#  get_local()/get_edges(), and the Ca-filtered trajectory `alpha` projected below).
def reproj_distro():
    local = get_local()
    data = get_edges()
    r2 = redis.StrictRedis(port=6385, decode_responses=True)
    for tbin in [(2, 4), (2, 2), (4, 4), (4, 2), (4, 1), (3, 1)]:
        print('Processing:', tbin)
        tkey = '%d_%d' % tbin
        # Get Kernel
        kpca_key = 'subspace:pca:kernel:' + tkey
        kpca = PCAnalyzer.load(r2, kpca_key)
        # Get Training Data
        data_raw = r2.lrange('subspace:pca:' + tkey, 0, -1)
        pca_pts = np.array([np.fromstring(x) for x in data_raw])
        kdtree = KDTree(200, maxdepth=8, data=pca_pts, method='middle')
        proj_pts = kpca.project(alpha.xyz)
        biased_hcubes = []
        for i, pt in enumerate(proj_pts):
            biased_hcubes.append(kdtree.probe(pt, probedepth=9))
        if len(data) == 0:
            print('No Raw PCA data points for bin %s.... Going to next bin'
                  % str(tbin))
            continue
        counts = {}
        for i in biased_hcubes:
            if i not in counts:
                counts[i] = 0
            counts[i] += 1
        for i in local[tkey]['keys']:
            if i not in counts:
                counts[i] = 0
        print('check')
        cvect = [counts[i] for i in local[tkey]['keys']]
        d = np.array(cvect) / sum(cvect)
        c = np.array(data[tkey])
        lcnt = np.sum(c, axis=0)
        gcnt = np.sum(c, axis=1)
        norm = np.nan_to_num(c / np.linalg.norm(c, axis=-1)[:, np.newaxis])
        # Add biased data as a col
        kpca_cnt = np.array([int(i) for i in local[tkey]['count']])
        kpca_cnt_norm = kpca_cnt / np.sum(kpca_cnt)
        arr = np.vstack((norm, kpca_cnt_norm, d)).T
        rowlist = tuple(gcnt) + (
            'localKPCA',
            'biased',
        )
        P.bargraph((np.mean(norm, axis=0), d), tkey, ['Reweight', 'Biased'])
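
The core of reproj_distro() is a count-and-normalize step: every projected point is probed into
the bin's local KD-tree, hits are tallied per hypercube key, and the tallies are normalized into
a distribution over the bin's canonical key order. Below is a minimal, self-contained sketch of
just that step; toy string keys stand in for the project's KD-tree hypercube keys, and the
function name is purely illustrative.

import numpy as np
from collections import Counter

def biased_distribution(probed_keys, canonical_keys):
    """Tally probed hypercube keys and normalize over a fixed key order."""
    counts = Counter(probed_keys)
    cvect = np.array([counts.get(k, 0) for k in canonical_keys], dtype=float)
    total = cvect.sum()
    return cvect / total if total > 0 else cvect

# Toy usage:
probed = ['0101', '0101', '0110', '1000']
keys   = ['0101', '0110', '0111', '1000']
print(biased_distribution(probed, keys))   # -> [0.5  0.25 0.   0.25]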
Example #2
File: simanl.py Project: DaMSL/ddc
  def execute(self, job):

  # PRE-PROCESS ----------------------------------------------------------
    settings = systemsettings()
    bench = microbench('sim_%s' % settings.name, self.seqNumFromID())
    bench.start()
    stat  = StatCollector('sim_%s' % settings.name, self.seqNumFromID())
    mylogical_seqnum = str(self.seqNumFromID())

    # Prepare working directory, input/output files
    conFile = os.path.join(job['workdir'], job['name'] + '.conf')
    logFile = conFile.replace('conf', 'log')      # log in same place as config file
    dcdFile = conFile.replace('conf', 'dcd')      # dcd in same place as config file
    USE_SHM = True

    ADAPTIVE_CENTROID = False

    SIMULATE_RATIO = settings.SIMULATE_RATIO
    if SIMULATE_RATIO > 1:
      logging.warning(" USING SIMULATION RATIO OF %d -- THis is ONLY for debugging", SIMULATE_RATIO)
    frame_size = (SIMULATE_RATIO * int(job['interval'])) / (1000)
    logging.info('Frame Size is %f  Using Sim Ratio of 1:%d', \
      frame_size, SIMULATE_RATIO)

    EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
    logging.info('Running Experiment Configuration #%d', EXPERIMENT_NUMBER)

    # TODO: FOR LINEAGE
    # srcA, srcB = eval(job['src_bin'])
    # stat.collect('src_bin', [str(srcA), str(srcB)])

    traj = None

  # EXECUTE SIMULATION ---------------------------------------------------------
    if self.skip_simulation:

      logging.info('1. SKIPPING SIMULATION.....')
      USE_SHM = False

      job['dcd'] = dcdFile
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

    else:
      logging.info('1. Run Simulation')

      # Prepare & source to config file
      with open(self.data['sim_conf_template'], 'r') as template:
        source = template.read()

      # >>>>Storing DCD into shared memory on this node

      if USE_SHM:
        # ramdisk = '/dev/shm/out/'
        ramdisk = '/tmp/ddc/'
        if not os.path.exists(ramdisk):
          os.mkdir(ramdisk)
        job['outputloc'] = ramdisk
        dcd_ramfile = os.path.join(ramdisk, job['name'] + '.dcd')
      else:
        job['outputloc'] = ''

      with open(conFile, 'w') as sysconfig:
        sysconfig.write(source % job)
        logging.info("Config written to: " + conFile)

      # # Run simulation in parallel
      # if 'parallel' in job:
      #   numnodes = job['parallel']
      #   total_tasks = numnodes * 24
      #   cmd = 'mpiexec -n %d namd2 %s > %s'  % (total_tasks, conFile, logFile)

      # # Run simulation single threaded
      # else:
      #   cmd = 'namd2 %s > %s' % (conFile, logFile)

      # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
      check = executecmd('module list')
      logging.debug('%s', check)

      cmd = 'namd2 +p%d %s > %s' % (PARALLELISM, conFile, logFile)

      #  MICROBENCH #1 (file to Lustre)
      # logging.debug("Executing Simulation:\n   %s\n", cmd)
      # bench = microbench()
      # bench.start()
      # stdout = executecmd(cmd)
      # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
      # bench.mark('SimExec:%s' % job['name'])
      # shm_contents = os.listdir('/dev/shm/out')
      # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # logging.info("Copy Complete to Lustre.")
      # bench.mark('CopyLustre:%s' % job['name'])
      # shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
      # bench.show()

      max_expected_obs = int(job['runtime']) // int(job['dcdfreq'])
      # Retry up to 3 attempts if the sim fails
      MAX_TRY = 3
      for i in range(MAX_TRY, 0, -1):
        min_required_obs = int(max_expected_obs * ((i-1)/(MAX_TRY)))
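        # Illustrative arithmetic for this threshold: with MAX_TRY = 3, the first
        # attempt (i=3) requires 2/3 of max_expected_obs, the second (i=2) requires
        # 1/3, and the final attempt (i=1) accepts any number of frames.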
        logging.debug("Executing Simulation:\n   %s\n", cmd)
        logging.debug('# Obs Expected to see: %d', max_expected_obs)
        stdout = executecmd(cmd)
        logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
        # Check file for expected data
        if USE_SHM:
          traj = md.load(dcd_ramfile, top=job['pdb'])
        else:
          traj = md.load(dcdFile, top=job['pdb'])
        logging.info("Obs Threshold  = %4d", min_required_obs)
        logging.info("#Obs This Traj = %4d", traj.n_frames)
        if traj.n_frames >= min_required_obs:
          logging.info('Full (enough) Sim Completed')
          break
        logging.info('Detected a failed Simulation. Retrying the same sim.')
      
      bench.mark('SimExec:%s' % job['name'])

      # Internal stats
      sim_length = self.data['sim_step_size'] * int(job['runtime'])
      sim_realtime = bench.delta_last()
      sim_run_ratio =  (sim_realtime/60) / (sim_length/1000000)
      logging.info('##SIM_RATIO %6.3f  min-per-ns-sim', sim_run_ratio)
      stat.collect('sim_ratio', sim_run_ratio)

      if USE_SHM:
        shm_contents = os.listdir(ramdisk)
        logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
          time.sleep(10)

        if not os.path.exists(dcd_ramfile):
          logging.warning("DCD STIILL FILE NOT FOUND!!!!")
        else:
          logging.info("DCD File was found")

      # #  MICROBENCH #2 (file to Alluxio)
      # allux = AlluxioClient()
      # # copy to Aluxio FS
      # allux.put(ramdisk + job['name'] + '.dcd', '/')
      # logging.info("Copy Complete to Alluxio.")
      # bench.mark('CopyAllux:%s' % job['name'])

      # And copy to Lustre
      # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
      # And copy to Lustre (using zero-copy):
      if USE_SHM:
        src  = open(dcd_ramfile, 'rb')
        dest = open(dcdFile, 'w+b')
        offset = 0
        dcdfilesize = os.path.getsize(dcd_ramfile)
        while True:
          sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
          if sent == 0:
            break
          offset += sent
        logging.info("Copy Complete to Lustre.")
        bench.mark('CopyLustre:%s' % job['name'])
      
      # TODO: Update job's metadata
      key = wrapKey('jc', job['name'])
      self.data[key]['dcd'] = dcdFile

  # ANALYSIS  ------------------------------------------------------------------
    #  ANALYSIS ALGORITHM
  # 1. With combined Sim-analysis: file is loaded locally from shared mem
    logging.debug("2. Load DCD")

    # Load full higher dim trajectory
    # traj = datareduce.filter_heavy(dcd_ramfile, job['pdb'])
    if traj is None:
      if USE_SHM:
        traj = md.load(dcd_ramfile, top=job['pdb'])
      else:
        traj = md.load(dcdFile, top=job['pdb'])

    # Center Coordinates
    traj.center_coordinates()

    bench.mark('File_Load')
    logging.debug('Trajectory Loaded: %s (%s)', job['name'], str(traj))


  #  DIMENSIONALITY REDUCTION --------------------------------------------------
  # 4-A. Subspace Calculation: RMS using Alpha-Filter
    #------ A:  RMSD-ALPHA  ------------------
      #     S_A = rmslist
    logging.info('---- RMSD Calculation against pre-defined centroids ----')
      #  RMSD is calculated on the Ca ('alpha') atoms in distance space
      #   whereby all pairwise distances are calculated for each frame.
      #   Pairwise distances are plotted in euclidean space
      #   Distance to each of the 5 pre-calculated centroids is calculated

    # 1. Filter to Alpha atoms
    alpha = traj.atom_slice(deshaw.FILTER['alpha'])



    # 2. (IF USED) Convert to distance space: pairwise dist for all atom combinations
    # alpha_dist = dr.distance_space(alpha)

    # 3. Calc RMS for each conform to all centroids
    # Heuristic centroid weight (TODO: make this trained)

    # 4. For adaptive Centroids
    #  Centroids will be pulled & updated.
    logging.info('CENTROID Retrieval & Updating')
    self.wait_catalog()

    #  If they were mutable....
    # logging.info('Acquiring a Lock on the Centroids')
    # centroids = self.catalog.loadNPArray('centroid')
    # thetas = self.catalog.loadNPArray('thetas')
    # lock = self.catalog.lock_acquire('centroid')
    # if lock is None:
    #   logging.info('Could not lock the Centroids. Will use current cached (possibly stale) data.')
    # bench.mark('ConcurrLockCentroid'%(A,B))

    #  Implemented as a Transactional Data Structure....
    if ADAPTIVE_CENTROID:
      centroids = []
      for state in range(numLabels):
        cent_raw  = self.catalog.lrange('centroid:xyz:%d'%state, 0, -1)
        cent_xyz  = [pickle.loads(i) for i in cent_raw]
        cent_npts = [int(i) for i in self.catalog.lrange('centroid:npts:%d'%state, 0, -1)]
        c_sum = np.zeros(shape=cent_xyz[0].shape)
        c_tot = 0
        for x, n in zip(cent_xyz, cent_npts):
          c = x * n
          c_sum += c
          c_tot += n
        centroids.append(c_sum / c_tot)
    else:
      centroids = self.catalog.loadNPArray('centroid')      

    # if EXPERIMENT_NUMBER < 10:
    # 5. Calculate the RMSD for each filtered point to 5 pre-determined centroids
    # cw = [.92, .94, .96, .99, .99]
    cw = [.94, .95, .97, .99, .99]

    numLabels = len(self.data['centroid'])
    numConf = len(traj.xyz)
    stat.collect('numpts',numConf)

    # 4. Account for noise : Simple spatial mean filter over a small window
    #    Where size of window captures extent of noise 
    #    (e.g. 10000fs window => motions under 10ps are considered "noisy")
    noise = self.data['obs_noise']
    stepsize = 500 if 'interval' not in job else int(job['interval'])
    nwidth = noise//(2*stepsize)
    noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)
    rms_filtered = np.array([noisefilt(alpha.xyz, i) for i in range(numConf)])
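    # Illustrative numbers (assuming obs_noise = 10000 fs and the default 500 fs
    # interval): nwidth = 10, so each frame is replaced by the mean of a window of
    # up to 2*nwidth = 20 neighbouring frames, smoothing motions under ~10 ps.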
    # Notes: Delta_S == rmslist
    rmslist_sv = calc_rmsd(rms_filtered, centroids, weights=cw)
      # rmslist = adaptive_rmsd(rms_filtered, centroids, theta)

    # else:
    rmslist = calc_rmsd(alpha, centroids)

    numConf = traj.n_frames
    numLabels = len(centroids)

    # rmslist = calc_rmsd(alpha.xyz, self.data['centroid'], weights=cw)
    logging.debug('  RMS:  %d points projected to %d centroid-distances', \
      numConf, numLabels)


    # 6. Apply Heuristics Labeling  -- Single Variate
    rmslabel = []
    binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
    label_count = {ab: 0 for ab in binlist}
    groupbystate = [[] for i in range(numLabels)]
    groupbybin = {ab: [] for ab in binlist}
    for i, rms in enumerate(rmslist_sv):
      #  Sort RMSD by proximity & set state A as nearest state's centroid
      A, B = np.argsort(rms)[:2]

      #  Calc Absolute proximity between nearest 2 states' centroids
      # THETA calc derived from a static run: it is based on the average std dev of all RMSDs from a static run
      #   of BPTI without solvent. It could be dynamically calculated, but is hard coded here.
      #  The theta is divided by four based on the analysis of DEShaw:
      #   est. based on ~3% of DEShaw data being in transition
      # avg_stddev = 0.34119404492089034
      # theta = settings.RMSD_THETA
      ## FOR ADAPTIVE Centroids, theta is now updated dynamically

      # NOTE: Original formulation was relative. Retained here for reference:
      # Rel vs Abs: Calc relative proximity for top 2 nearest centroids   
      # relproximity = rms[A] / (rms[A] + rms[rs[1]])
      # B = rs[1] if relproximity > (.5 - theta) else A
      # proximity = abs(rms[prox[1]] - rms[A]) / (rms[prox[1]] + rms[A])  #relative
      #proximity = abs(rms[prox[1]] - rms[A])    #abs
      # Update for Adaptive Centroid.
      delta = np.abs(rms[B] - rms[A])

      #  (TODO:  Factor in more than top 2, better noise)
      #  Label secondary sub-state
      # sub_state = B prox[1] if proximity < theta else A
      # For ADAPTIVE Centroids
      if delta < 0.33:
        sub_state = B
      else:
        sub_state = A
      rmslabel.append((A, sub_state))
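      # Illustrative example: rms = [2.1, 3.4, 1.8, 5.0, 4.2] sorts to A=2, B=0 and
      # delta = |2.1 - 1.8| ~= 0.3 < 0.33, so the frame is labeled (2, 0) -- in state 2
      # but near state 0; with delta >= 0.33 it would be labeled (2, 2).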

      # Add this index to the set of indices for this respective label
      #  TODO: Should we evict if binsize is too big???
      # logging.debug('Label for observation #%3d: %s', i, str((A, B)))
      label_count[(A, sub_state)] += 1

      # Group high-dim point by state
      # TODO: Consider grouping by state only or well/transitions (5 vs 10 bins)
      groupbystate[A].append(i)
      groupbybin[(A, sub_state)].append(i)

    # stat.collect('observe', label_count)
    bench.mark('RMS')
    logging.info('Labeled the following:')
    for A in range(numLabels):
      if len(groupbystate[A]) > 0:
        logging.info('label,state,%d,num,%d', A, len(groupbystate[A]))
    for ab in binlist:
      if len(groupbybin[ab]) > 0:
        A, B = ab
        logging.info('label,bin,%d,%d,num,%d', A, B, len(groupbybin[ab]))

    # FEATURE LANDSCAPE -- Multi-Variate

    # Calc Feature landscape for each frame's RMSD
    feal_list = [feal.atemporal(rms) for rms in rmslist]
    logging.info('Calculated Feature Landscape. Aggregate for this traj')
    # For logging purposes
    agg_feal = np.mean(feal_list, axis=0)
    logging.info('CountsMax [C]:  %s', str(agg_feal[:5]))
    logging.info('StateDist [S]:  %s', str(agg_feal[5:10]))
    logging.info('RelDist [A-B]:  %s', str(agg_feal[10:]))

    #  ADAPTIVE CENTROID & THETA CALCULATION
    # if lock is None:
    #   logging.info('Never acquired a lock. Skipping adaptive update (TODO: Mark pts as stale)')
    # else:  
    #   logging.info('Updating Adaptive Centroid')
    
    if ADAPTIVE_CENTROID:
      pipe = self.catalog.pipeline()
      for state in range(numLabels):
        n_pts = len(groupbybin[(state, state)])
        if n_pts == 0:
          logging.info('Skipping State %d Centroid -- Well not visited on this trajectory', state)
          continue

        cent_xyz  = [alpha.xyz[i] for i in groupbybin[(state, state)]]
        cent_npts = len(groupbybin[(state, state)])
        c_sum = np.zeros(shape=alpha.xyz[0].shape)
        for pt in cent_xyz:
          c_sum += pt
        centroid_local = c_sum / n_pts
        centroid_delta = LA.norm(centroids[state] - centroid_local)
        pipe.rpush('centroid:xyz:%d' % state, pickle.dumps(centroid_local))
        pipe.rpush('centroid:npts:%d' % state, n_pts)
        pipe.rpush('centroid:delta:%d' % state, centroid_delta)
      pipe.execute()

  # 4-B. Subspace Calculation: COVARIANCE Matrix, 200 ps windows, Full Protein
  #------ B:  Covariance Matrix  -----------------
    if EXPERIMENT_NUMBER > 5:
      # 1. Project Pt to PC's for each conform (top 3 PC's)
      logging.info('---- Covariance Calculation on 200 ps windows (Full Protein, cartesian Space) ----')

      # Calculate Covariance over 200 ps Windows sliding every 100ps
      #  These could be user influenced...
      WIN_SIZE_NS = .2
      SLIDE_AMT_NS = .1
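      # Illustrative arithmetic: 1 ns of trajectory covered by 0.2 ns windows slid
      # every 0.1 ns yields roughly 9 overlapping covariance matrices (window starts
      # at 0.0, 0.1, ..., 0.8 ns), depending on how the final partial window is handled.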
      logging.debug("Calculating Covariance over trajectory. frame_size = %.1f, WINSIZE = %dps, Slide = %dps", 
        frame_size, WIN_SIZE_NS*1000, SLIDE_AMT_NS*1000)
      covar = dr.calc_covar(alpha.xyz, WIN_SIZE_NS, frame_size, slide=SLIDE_AMT_NS)
      bench.mark('CalcCovar')
      stat.collect('numcovar', len(covar))
      logging.debug("Calcualted %d covariance matrices. Storing variances", len(covar)) 


  #  BARRIER: WRITE TO CATALOG HERE -- Ensure Catalog is available
    # try:
    self.wait_catalog()
    # except OverlayNotAvailable as e:
    #   logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")


  # Update Catalog with 1 Long Atomic Transaction  
    global_index = []
    with self.catalog.pipeline() as pipe:
      while True:
        try:
          logging.debug('Update Filelist')
          pipe.watch(wrapKey('jc', job['name']))
          file_idx = pipe.rpush('xid:filelist', job['dcd']) - 1
          # HD Points
          logging.debug('Update HD Points')
          for x in range(traj.n_frames):
            # Note: Pipelined insertions "should" return contiguous set of index points
            index = pipe.rpush('xid:reference', (file_idx, x)) - 1
            global_index.append(index - 1) 

          pipe.multi()
          logging.debug('Update RMS Subspace')
          for x in range(traj.n_frames):
            A, B = rmslabel[x]
            index = global_index[x]
            # Labeled Observation (from RMSD)
            pipe.rpush('label:rms', rmslabel[x])
            pipe.rpush('varbin:rms:%d_%d' % (A, B), index)
            # pipe.rpush('lineage:rms:%d_%d:%d_%d' % (srcA, srcB, A, B), index)
            # pipe.rpush('lineage:pca:%s:%d_%d' % (job['src_hcube'], A, B), index)
            pipe.rpush('subspace:rms', bytes(rmslist_sv[x]))
            pipe.rpush('subspace:feal', bytes(feal_list[x]))            

          logging.debug('Update OBS Counts')
          for b in binlist:
            pipe.rpush('observe:rms:%d_%d' % b, label_count[b])
          pipe.incr('observe:count')
          pipe.hset('anl_sequence', job['name'], mylogical_seqnum)

          if EXPERIMENT_NUMBER > 5:
            logging.debug('Update Covar Subspace')
            for i, si in enumerate(covar):
              logging.debug('Update COVAR Pt #%d', i)
              local_index = int(i * frame_size * SLIDE_AMT_NS)
              pipe.rpush('subspace:covar:pts', bytes(si))
              pipe.rpush('subspace:covar:xid', global_index[local_index])
              pipe.rpush('subspace:covar:fidx', (file_idx, local_index))

          logging.debug('Executing')
          pipe.execute()
          break
        except redis.WatchError as e:
          logging.debug('WATCH ERROR')
          continue

    self.data[key]['xid:start'] = global_index[0]
    self.data[key]['xid:end'] = global_index[-1]
    bench.mark('Indx_Update')

  # (Should we Checkpoint here?)

  # 4-C. Subspace Calculation: PCA BY Strata (PER STATE) using Alpha Filter
  #------ C:  GLOBAL PCA by state  -----------------
  #  Update PCA Vectors for each state with new data
    if EXPERIMENT_NUMBER > 5 and EXPERIMENT_NUMBER < 10:
      logging.info('---- PCA per BIN over Alpha Filter in cartesian Space ----')
      # TODO:  This will eventually get moved into a User process macrothread
      #   which will sit in between analysis and controller.
      # For now, we're recalculating using a lock

      # Check if vectors need to be recalculated
      # Connect to reservoir samples
      # TODO: Catalog or Cache?
      reservoir = ReservoirSample('rms', self.catalog)
      # STALENESS_FACTOR = .25   # Recent updates account for 25% of the sample (Just a guess)

      num_inserted = {ab: 0 for ab in binlist}
      num_params = np.prod(alpha.xyz.shape[1:])

      for A, B in binlist:
        num_observations = len(groupbybin[(A,B)])

        if num_observations == 0:
          logging.info('No data received for bin (%d,%d).  Not processing this bin here.', A, B)
          continue

        res_label = '%d_%d' % (A,B)
        updateVectors = False
        kpca_key = 'subspace:pca:kernel:%d_%d' % (A, B)
        kpca = PCAnalyzer.load(self.catalog, kpca_key)
        newkpca = False
        if kpca is None:
          # kpca = PCAKernel(None, 'sigmoid')
          kpca = PCAKernel(6, 'rbf')
          newkpca = True


        logging.info('PCA:  Checking if current vectors for state %d are out of date', A)
        rsize = reservoir.getsize(res_label)
        tsize = kpca.trainsize

        #  KPCA is out of date if the sample size is 50% larger than the previously used training set
        #  Heuristic --- this could be a different "staleness" factor or we can check it some other way
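        #  e.g. (illustrative) a kernel trained on a 1000-point reservoir is rebuilt
        #  once the reservoir grows past 1500 points (rsize > tsize * 1.5)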
        if newkpca or rsize > (tsize * 1.5):

          #  Should we only use a sample here??? (not now -- perhaps with larger reservoirs or if KPCA is slow)
          traindata = reservoir.get(res_label)
          if newkpca:
            logging.info('New PCA Kernel. Trained on data set of size %d. '
                         'Current reservoir is %d pts.', tsize, rsize)
            logging.info('Projecting %d points on Kernel PCA for bin (%d,%d)',
              num_observations, A, B)
            traindata = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]),
              dtype=np.float32)
            for i, index in enumerate(groupbybin[(A,B)]):
              np.copyto(traindata[i], alpha.xyz[index])
          else:
            logging.info('PCA Kernel is old (updating it). Trained on data set of '
                         'size %d. Current reservoir is %d pts.', tsize, rsize)


          if len(traindata) <= num_params:
            logging.info("Not enough data to calculate PC's (Need at least %d \
              observations). Skipping PCA for Bin (%d,%d)", num_params, A, B)
            hd_pts = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]), dtype=np.float32)
            for i, index in enumerate(groupbybin[(A,B)]):
              np.copyto(hd_pts[i], alpha.xyz[index])
            num_inserted[(A,B)] = reservoir.insert(res_label, hd_pts)
            logging.debug('Updating reservoir Sample for Bin (%d, %d)', A, B)
            continue

          logging.info('   Performing Kernel PCA (Gaussian) for bin (%d,%d) using traindata of size %d', \
            A, B, len(traindata))

          kpca.solve(traindata)

          # NOTE: Pick PCA Algorithm HERE
          # pca = calc_kpca(np.array(traindata), kerneltype='sigmoid')
          # pca = calc_pca(np.array(traindata))
          bench.mark('CalcKPCA_%d_%d'%(A,B))

          # new_vect = pca.alphas_.T
          lock = self.catalog.lock_acquire(kpca_key)
          if lock is None:
            logging.info('Could not lock the PC Kernel for Bin (%d,%d). Not updating', A, B)
          else:
            kpca.store(self.catalog, kpca_key)
            lock = self.catalog.lock_release(kpca_key, lock)
          bench.mark('ConcurrPCAWrite_%d_%d'%(A,B))

          # Project Reservoir Sample to the Kernel and overwrite current set of points
          #  This should only happen up until the reservoir is filled
          # If we are about to retrain, be sure to project all reservoir points
          if not newkpca:
            logging.info('Clearing and Re-Projecting the entire reservoir of %d points for Bin (%d,%d).', \
              rsize, A, B)
            rsamp_lowdim = kpca.project(traindata)
            pipe = self.catalog.pipeline()
            pipe.delete('subspace:pca:%d_%d'%(A,B))
            for si in rsamp_lowdim:
              pipe.rpush('subspace:pca:%d_%d'%(A,B), bytes(si))
            pipe.execute()


        else:
          logging.info('PCA Kernel is good -- no need to change them')

        bench.mark('start_ProjPCA')
        logging.info('Projecting %d points on Kernel PCA for Bin (%d,%d)', num_observations, A, B)
        hd_pts = np.zeros(shape=((num_observations,)+alpha.xyz.shape[1:]), dtype=np.float32)
        for i, index in enumerate(groupbybin[(A,B)]):
          np.copyto(hd_pts[i], alpha.xyz[index])
        pc_proj = kpca.project(hd_pts)
        bench.mark('ProjPCA_%d_%d'%(A,B))

        # 2. Append subspace in catalog
        pipe = self.catalog.pipeline()
        for si in pc_proj:
          pipe.rpush('subspace:pca:%d_%d' % (A,B), bytes(si))
        pipe.execute()

        logging.debug('Updating reservoir Sample')
        num_inserted[(A,B)] = reservoir.insert(res_label, hd_pts)

      bench.mark('PCA')
      pipe = self.catalog.pipeline()
      for ab, num in num_inserted.items():
        if num > 0:
          pipe.rpush('subspace:pca:updates:%d_%d' % ab, num)
      pipe.execute()

  # ---- POST PROCESSING
    if USE_SHM:
      shutil.rmtree(ramdisk)
      # shm_contents = os.listdir('/dev/shm')
      shm_contents = os.listdir('/tmp')
      logging.debug('Ramdisk contents (should be empty of DDC) : %s', str(shm_contents))
    
    # For benchmarching:
    # print('##', job['name'], dcdfilesize/(1024*1024*1024), traj.n_frames)
    bench.show()
    stat.show()

    # Return # of observations (frames) processed
    return [numConf]
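
The simulation step above moves the finished DCD from the node-local ramdisk to Lustre with
sendfile(), so the kernel copies the bytes without staging them through user space. A standalone
sketch of that copy loop, assuming Python 3 on Linux where os.sendfile is available (the code
above relies on a sendfile binding imported elsewhere in simanl.py); the helper name and paths
are illustrative only.

import os

def zero_copy(src_path, dest_path):
    """Copy src to dest with os.sendfile, advancing the offset until EOF."""
    size = os.path.getsize(src_path)
    with open(src_path, 'rb') as src, open(dest_path, 'w+b') as dest:
        offset = 0
        while offset < size:
            sent = os.sendfile(dest.fileno(), src.fileno(), offset, size - offset)
            if sent == 0:   # nothing left to send
                break
            offset += sent
    return offset

# zero_copy('/tmp/ddc/some_job.dcd', '/path/to/workdir/some_job.dcd')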
Example #3
    def execute(self):
        """Special execute function for the reweight operator -- check/validate.
      """
        # PRE-PROCESSING ---------------------------------------------------------------------------------
        logging.debug(
            "============================  <PRE-PROCESS>  ============================="
        )
        self.cacheclient = CacheClient(self.name)
        numLabels = 5
        binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
        labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
        num_pts = len(labeled_pts_rms)
        logging.debug('##NUM_OBS: %d', num_pts)

        # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
        TEST_TBIN = [(2, 0), (4, 2), (2, 2), (4, 1), (3, 1), (4, 4), (0, 4),
                     (0, 2), (0, 1)]
        MAX_SAMPLE_SIZE = 100  # Max # of cov traj to back project per HCube
        MAX_PT_PER_MATRIX = 100  # Num points to sample from each cov traj
        COVAR_SIZE = 200  # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
        MAX_HCUBE = 6  # Max Num HCubes to process

        # IMPLEMENT USER QUERY with REWEIGHTING:
        logging.debug(
            "=======================  <QUERY PROCESSING>  ========================="
        )

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCube size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamma Function

        logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
        logging.info("Retrieving All Covariance Vectors")
        home = os.getenv('HOME')
        cfile = home + '/work/DEBUG_COVAR_PTS'
        DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
        if DO_COVAR:
            if os.path.exists(cfile + '.npy'):
                covar_pts = np.load(cfile + '.npy')
                logging.debug('Loaded From File')
            else:
                covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
                covar_pts = np.array([np.fromstring(x) for x in covar_raw])
                np.save(cfile, covar_pts)
                logging.debug('Loaded From Catalog & Saved')
        covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
        logging.debug('Indices Loaded. Retrieving File Indices')
        covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

        if DO_COVAR:
            logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
            logging.info(
                "Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)"
            )

            # FOR incremental PCA:
            NUM_PC = 6
            ipca_key = 'subspace:covar:ipca'
            ipca = PCAnalyzer.load(self.catalog, ipca_key)
            if ipca is None:
                logging.info('Creating a NEW IPCA')
                ipca = PCAIncremental(NUM_PC)
                lastindex = 0
            else:
                lastindex = ipca.trainsize
                logging.info(
                    'IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts',
                    ipca.trainsize,
                    len(covar_pts) - ipca.trainsize)

            # For incremental, partial solve using only newer pts (from the last "trainsize")
            if len(covar_pts) - lastindex > 0:
                ipca.solve(covar_pts[lastindex:])
                logging.info("Incrementatl PCA Updated. Storing Now...")

                ####  BARRIER
                self.wait_catalog()
                ipca.store(self.catalog, ipca_key)

            logging.info("IPCA Saved. Projecting Covariance to PC")

        cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
        if os.path.exists(cfile + '.npy'):
            subspace_covar_pts = np.load(cfile + '.npy')
        else:
            subspace_covar_pts = ipca.project(covar_pts)
            np.save(cfile, subspace_covar_pts)

        # Otherwise: PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
        logging.info(
            'Building Global KD Tree over Covar Subspace with %d data pts',
            len(subspace_covar_pts))
        global_kdtree = KDTree(250,
                               maxdepth=8,
                               data=subspace_covar_pts,
                               method='middle')

        if MAX_HCUBE <= 0:
            hcube_global = global_kdtree.getleaves()
        else:
            # FOR DEBUGGING -- USE ONLY MAX_HCUBE GLOBAL HCUBES
            hcube_global_ALL = global_kdtree.getleaves()
            hcube_global = {}
            num = 0
            for k, v in hcube_global_ALL.items():
                hcube_global[k] = v
                num += 1
                if num == MAX_HCUBE:
                    break

        # hcube_global = global_kdtree.getleaves()
        logging.info(
            'Global HCubes: Key  Count  Volume  Density  (NOTE: DEBUGGING -- only MAX_HCUBE used)'
        )
        for k in sorted(hcube_global.keys()):
            v = hcube_global[k]
            logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'],
                         v['volume'], v['density'])

        if self.filelog:
            keys = hcube_global.keys()
            self.filelog.info('global,keys,%s', ','.join(keys))
            self.filelog.info(
                'global,count,%s',
                ','.join([str(hcube_global[k]['count']) for k in keys]))
            self.filelog.info(
                'global,volume,%s',
                ','.join([str(hcube_global[k]['volume']) for k in keys]))
            self.filelog.info(
                'global,density,%s',
                ','.join([str(hcube_global[k]['density']) for k in keys]))

        logging.info(
            "=====  SELECT Sampling of points from each Global HCube  (B)")
        s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
        hcube_global = {x[0]: x[1] for x in s}

        counter = 0
        for key in hcube_global.keys():
            counter += 1
            if hcube_global[key]['count'] <= MAX_SAMPLE_SIZE:
                cov_index = hcube_global[key]['elm']
                hcube_global[key]['samplefactor'] = 1
            else:
                cov_index = np.random.choice(hcube_global[key]['elm'],
                                             MAX_SAMPLE_SIZE)
                hcube_global[key]['samplefactor'] = len(
                    hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
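            # Illustrative numbers: a global HCube holding 500 covariance trajectories
            # is down-sampled to MAX_SAMPLE_SIZE = 100, so each sampled trajectory
            # carries a samplefactor of 5 when it is later reweighted.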
            hcube_global[key]['idxlist'] = []
            for cov in cov_index:
                selected_hd_idx = np.random.choice(COVAR_SIZE,
                                                   MAX_PT_PER_MATRIX).tolist()
                hcube_global[key]['idxlist'].extend(
                    [int(covar_index[cov]) + i for i in selected_hd_idx])
            logging.info('Back Projecting Global HCube `%s`  (%d out of %d)',
                         key, counter, len(hcube_global.keys()))
            source_cov = self.backProjection(hcube_global[key]['idxlist'])
            hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
            logging.debug('Back Projected %d points to HD space: %s',
                          len(hcube_global[key]['idxlist']),
                          str(hcube_global[key]['alpha']))

        # logging.info('Calculating all HD Distances')
        # dist_hd = {}
        # dist_ld = {}
        # for key in hcube_global.keys():
        #   T = hcube_global[key]['alpha'].xyz
        #   N = len(T)
        #   dist_hd[key] = np.zeros(shape=(N, N))
        #   dist_ld[key] = {}
        #   for A in range(0, N):
        #     dist_hd[key][A][A] = 0
        #     for B in range(A+1, N):
        #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])

    # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
        reservoir = ReservoirSample('rms', self.catalog)

        logging.info(
            "=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) "
        )
        hcube_list = {}

        logging.info(
            "Scanning current set of observed bins and finding all smallest with data (excluding largest 2)"
        )
        hcube_local = {}

        logging.info("=======================================================")
        logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
        logging.info(
            "=======================================================\n")

        overlap_hcube = {k: {} for k in hcube_global.keys()}

        projection_map = {}

        pt_projection_list = []
        for key in sorted(hcube_global.keys()):
            for i in range(len(hcube_global[key]['alpha'].xyz)):
                pt_projection_list.append([])
        for bin_idx, tbin in enumerate(TEST_TBIN):
            logging.info("Project Global HCubes into local subspace for %s",
                         str(tbin))
            # Load Vectors
            logging.info('Loading subspace and kernel for bin %s', str(tbin))

            # LOAD KPCA Kernel matrix
            kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
            kpca = PCAnalyzer.load(self.catalog, kpca_key)

            data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
            data = np.array([np.fromstring(x) for x in data_raw])
            if len(data) == 0:
                logging.error(
                    'No Raw PCA data points for bin %s.... Going to next bin',
                    str(tbin))
                continue

            logging.info(
                'Building KDtree over local %s bin from observations matrix of size: %s',
                str(tbin), str(data.shape))
            kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
            hcube_local[tbin] = kdtree.getleaves()
            logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
            for k in sorted(hcube_local[tbin].keys()):
                logging.info('    `%-9s`   #pts:%6d   density:%9.1f', k,
                             len(hcube_local[tbin][k]['elm']),
                             hcube_local[tbin][k]['density'])

            if self.filelog:
                keys = hcube_local[tbin].keys()
                A, B = tbin
                self.filelog.info('local,%d_%d,keys,%s', A, B, ','.join(keys))
                self.filelog.info(
                    'local,%d_%d,count,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['count']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,volume,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['volume']) for k in keys]))
                self.filelog.info(
                    'local,%d_%d,density,%s', A, B, ','.join(
                        [str(hcube_local[tbin][k]['density']) for k in keys]))

            n_total = 0
            logging.debug('Global Hcubes to Project (%d):  %s',
                          len(hcube_global.keys()), str(hcube_global.keys()))
            projection_map[bin_idx] = {
                k: set()
                for k in hcube_local[tbin].keys()
            }

            pnum = 0
            for key in sorted(hcube_global.keys()):
                overlap_hcube[key][tbin] = {}
                cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

                logging.debug(
                    'PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ',
                    key, len(cov_proj_pca), str(tbin))
                for i, pt in enumerate(cov_proj_pca):
                    hcube = kdtree.probe(pt, probedepth=9)
                    # NOTE: Retaining count of projected pts. Should we track individual pts -- YES (trying)
                    if hcube not in overlap_hcube[key][tbin]:
                        overlap_hcube[key][tbin][hcube] = {
                            'idxlist': hcube_local[tbin][hcube]['elm'],
                            'wgt': hcube_local[tbin][hcube]['density'],
                            'num_projected': 0
                        }
                    overlap_hcube[key][tbin][hcube]['num_projected'] += 1

                    # Index this point in corresponding local HCube projection view
                    projection_map[bin_idx][hcube].add(pnum)

                    pt_projection_list[pnum].append(hcube)
                    pnum += 1

                for k, v in sorted(overlap_hcube[key][tbin].items()):
                    logging.debug(
                        '   Project ==> Local HCube `%-9s`: %5d points', k,
                        v['num_projected'])
                # logging.info('Calculating Lower Dimensional Distances')
                # N = len(cov_proj_pca)
                # dist_ld[key][tbin] = np.zeros(shape=(N, N))
                # for A in range(0, N):
                #   for B in range(A+1, N):
                #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])

    # Re-Index projected points -- could make this a list too

        next_index = 0
        view_list = []
        for bin_idx, hcube_map in projection_map.items():
            hcube_list = []
            for hcube_key, pt_list in hcube_map.items():
                hcube_list.append((set((hcube_key, )), set(pt_list)))
            view_list.append((set((bin_idx, )), hcube_list))

        print("CALLING: Collapse Join")
        joined_subspaces = collapse_join(projection_map.keys(), view_list)
        for subspace_list, correlated_hcubes in joined_subspaces:
            tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
            for hcube_list, pt_list in correlated_hcubes:
                print(tbin_list, hcube_list, pt_list)
                # TODO: Correlate Back to Global
        print('Visualize HERE')

        # for idx, tbin in enumerate(TEST_TBIN):
        #   # Only process substates with data
        #   if tbin not in hcube_local:
        #     logging.warning('Local KD Tree not created for %s', str(tbin))
        #     continue
        #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # sets = {}
        # proj_bin_list = []
        # for tbin in TEST_TBIN:
        #   if tbin not in hcube_local:
        #     continue
        #   proj_bin_list.append(tbin)
        #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
        # for n, proj in enumerate(pt_projection_list):
        #   for i, tbin in enumerate(proj_bin_list):
        #     sets[tbin][proj[i]].add(n)
        #   if self.filelog:
        #     self.filelog.info('%d,%s', n, ','.join(proj))
        #   logging.info('%d,%s', n, ','.join(proj))

        # set_list = {}
        # for tbin, view in sets.items():
        #   set_list[(tbin,)] = []
        #   for hcube, idxlist in view.items():
        #     print(tbin, hcube, idxlist)
        #     set_list[(tbin,)].append((set((hcube,)), idxlist))

        # def collapse(C):
        #   a = 0
        #   b = 0
        #   N = []
        #   while a < len(C) and b < len(C):
        #     A = sorted(C[a])
        #     B = sorted(C[b])
        #     if A == B:
        #       b += 1
        #     elif A[0] == B[0]:
        #       N.append(set(A)|set(B))
        #       b += 1
        #     else:
        #       a += 1
        #   if len(N) <= 1:
        #     return []
        #   else:
        #     return N + collapse(N)

        # q=collapse(t1)
        # for i in q: print(sorted(i))

        # print('Checking all 2-Way Joins')
        # join2 = {}
        # for a in range(0, len(proj_bin_list)-1):
        #   tA = proj_bin_list[a]
        #   for b in range(a+1, len(proj_bin_list)):
        #     tB = proj_bin_list[b]
        #     join_ss = tuple(set((tA, tB)))
        #     set_list = []
        #     for kA, vA in sets[tA].items():
        #       for kB, vB in sets[tB].items():
        #         join_hc = set((kA, kB))
        #         inter = vA & vB
        #         if len(inter) > 0:
        #           set_list.append((join_hc, inter))
        #     if len(set_list) > 0:
        #       join2[join_ss] = set_list
        # print('2-Way Join Results:')
        # for ss, set_list in join2.items():
        #   for hc, idxlist in set_list:
        #     print(ss, hc, idxlist)

        # print('Checking all 3-Way Joins')
        # join3 = []
        # checked = []
        # for a in range(0, len(join2)-1):
        #   sA, hA, vA = join2[a]
        #   for b in range(a+1, len(join2)):
        #     sB, hB, vB = join2[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join3.append((ss, hc, inter))

        # print('Checking all 4-Way Joins')
        # join4 = []
        # checked = []
        # for a in range(0, len(join3)-1):
        #   sA, hA, vA = join3[a]
        #   for b in range(a+1, len(join3)):
        #     sB, hB, vB = join3[b]
        #     if sA == sB:
        #       continue
        #     ss, hc = sA | sB, hA | hB
        #     if (ss, hc) in checked[-10:]:
        #       continue
        #     checked.append((ss, hc))
        #     inter = vA & vB
        #     if len(inter) > 0:
        #       join4.append((ss, hc, inter))

        # if self.filelog:
        #   for i in join2:
        #     self.filelog.info('%s', str(i))
        #   for i in join3:
        #     self.filelog.info('%s', str(i))
        #   for i in join4:
        #     self.filelog.info('%s', str(i))

        # NOTE: the min/max check below relies on dist_hd / dist_ld from the
        # commented-out distance calculations above, so it stays disabled by default.
        DO_MIN_CHECK = False
        if DO_MIN_CHECK:

            def maxcount(x):
                y = {}
                for i in x:
                    y[i] = 1 if i not in y else y[i] + 1
                return max(y.values())

            print(
                '% of Points Per HCube with same NN subspaces (e.g. 20% of points have same NN in 5 sub-spaces)'
            )
            argmin_nonzero = lambda x: np.argmin([(i if i > 0 else np.inf)
                                                  for i in x])
            for key in hcube_global.keys():
                # logging.info('Showing MIN / MAX for points from HCube %s:', key)
                minA = {}
                maxA = {}
                for n in range(len(dist_hd[key])):
                    minA[n] = []
                    maxA[n] = []
                    for tbin in TEST_TBIN:
                        if tbin not in dist_ld[key].keys():
                            # append placeholders so the per-tbin lists stay aligned
                            minA[n].append(0)
                            maxA[n].append(0)
                        else:
                            minA[n].append(
                                argmin_nonzero(dist_ld[key][tbin][n]))
                            maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
                numsame = np.zeros(len(dist_ld[key].keys()) + 1)
                for n in range(len(dist_hd[key])):
                    minH = argmin_nonzero(dist_hd[key][n])
                    maxH = np.argmax(dist_hd[key][n])
                    minmax = ['%2d/%-2d' % i for i in zip(minA[n], maxA[n])]
                    numsamepair = maxcount(minA[n])
                    numsame[numsamepair] += 1
                    # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
                print(' '.join([
                    '%4.1f%%' % i for i in (100 * (numsame / np.sum(numsame)))
                ]))

        print('Stopping HERE!')
        sys.exit(0)
        #  GAMMA FUNCTION EXPR # 8
        gamma1 = lambda a, b: (a * b)
        gamma2 = lambda a, b: (a + b) / 2
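        # Illustrative values: gamma1(0.4, 0.5) = 0.20 (product emphasizes joint density)
        # while gamma2(0.4, 0.5) = 0.45 (average keeps either weight from dominating).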

        # TODO: Factor in RMS weight
        for tbin in TEST_TBIN:
            # for tbin in sorted(bin_list):
            logging.info('')
            logging.info('BIPARTITE GRAPH for %s', str(tbin))
            bipart = {}
            edgelist = []
            for hcB in hcube_global.keys():
                num_B = hcube_global[hcB]['count']
                wgt1_B = hcube_global[hcB]['density']
                if tbin not in overlap_hcube[hcB]:
                    continue
                for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
                    edge = {}
                    if hcA not in bipart:
                        bipart[hcA] = []
                    num_proj = hcA_data['num_projected']
                    wgt_A = hcA_data['wgt']
                    wgt2_B = wgt1_B * num_proj
                    edge['combW1'] = gamma1(wgt_A, wgt1_B)
                    edge['combW2'] = gamma1(wgt_A, wgt2_B)
                    edge['combW3'] = gamma2(wgt_A, wgt1_B)
                    edge['combW4'] = gamma2(wgt_A, wgt2_B)
                    edge['num_A'] = len(hcA_data['idxlist'])
                    edge['num_B'] = num_B
                    edge['num_proj'] = num_proj
                    edge['wgt_A'] = wgt_A
                    edge['wgt1_B'] = wgt1_B
                    edge['wgt2_B'] = wgt2_B
                    edge['hcA'] = hcA
                    edge['hcB'] = hcB
                    bipart[hcA].append(edge)
                    edgelist.append((hcA, hcB, num_proj))
            if len(bipart) == 0:
                logging.info("NO DATA FOR %s", str(tbin))
                continue
            logging.info('')
            logging.info(
                'A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)'
            )
            for k, v in bipart.items():
                for edge in v:
                    logging.info(
                        'A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f'
                        % edge)
                    if self.filelog:
                        A, B = tbin
                        self.filelog.info('edge,%d_%d,%s,%s,%d', A, B,
                                          edge['hcA'], edge['hcB'],
                                          edge['num_proj'])

            # Prepare nodes for graph
            nA = set()
            nB = set()
            elist = []
            for e in edgelist:
                a, b, z = e
                if z <= 5:
                    continue
                nA.add(a)
                nB.add(b)
                elist.append((a, b, z))
            nAKeys = sorted(nA)[::-1]
            nBKeys = sorted(nB)[::-1]
            sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
            sizesB = [hcube_global[n]['count'] * 3 for n in nBKeys]
            idxA = {key: i for i, key in enumerate(nAKeys)}
            idxB = {key: i for i, key in enumerate(nBKeys)}
            edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
            G.bipartite(sizesA, sizesB, edges, sizesA, sizesB,
                        'bipartite_%d_%d' % tbin)

        logging.info('STOPPING HERE!!!!')
        sys.exit(0)
        return []
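
The bipartite reweighting above combines a local HCube's density (wgt_A) with a global HCube's
density (wgt1_B) and its projected mass (wgt2_B = wgt1_B * num_projected) through the two gamma
functions. Below is a small self-contained sketch of how one edge's combined weights are derived,
with made-up numbers; the helper name is illustrative only.

def edge_weights(wgt_A, wgt1_B, num_projected):
    """Combine local and global HCube weights as in the bipartite edges above."""
    gamma1 = lambda a, b: a * b          # product
    gamma2 = lambda a, b: (a + b) / 2    # average
    wgt2_B = wgt1_B * num_projected      # global density scaled by projected mass
    return {'combW1': gamma1(wgt_A, wgt1_B),
            'combW2': gamma1(wgt_A, wgt2_B),
            'combW3': gamma2(wgt_A, wgt1_B),
            'combW4': gamma2(wgt_A, wgt2_B)}

# Toy numbers: local density 4.0, global density 2.5, 10 projected points
print(edge_weights(4.0, 2.5, 10))
# -> {'combW1': 10.0, 'combW2': 100.0, 'combW3': 3.25, 'combW4': 14.5}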
Example #5
0
    def execute(self):
      """Special execute function for the reweight operator -- check/validate.
      """
    # PRE-PROCESSING ---------------------------------------------------------------------------------
      logging.debug("============================  <PRE-PROCESS>  =============================")
      self.cacheclient = CacheClient(self.name)
      numLabels = 5
      binlist = [(A, B) for A in range(numLabels) for B in range(numLabels)]
      labeled_pts_rms = self.catalog.lrange('label:rms', 0, -1)
      num_pts = len(labeled_pts_rms)
      logging.debug('##NUM_OBS: %d', num_pts)

      # TEST_TBIN = [(i,j) for i in range(2,5) for j in range(5)]
      TEST_TBIN = [(2,0), (4,2), (2,2), (4,1), (3,1), (4,4), (0,4), (0,2), (0,1)]
      MAX_SAMPLE_SIZE   =  100   # Max # of cov traj to back project per HCube
      MAX_PT_PER_MATRIX =  100   # Num points to sample from each cov traj
      COVAR_SIZE        = 200   # Ea Cov "pt" is 200 HD pts. -- should be static based on user query
      MAX_HCUBE         = 6      # Max Num HCubes to process


    # IMPLEMENT USER QUERY with REWEIGHTING:
      logging.debug("=======================  <QUERY PROCESSING>  =========================")

        #  1. RUN KPCA on <<?????>> (sample set) and project all pts
        #  2. Calculate K-D Tree on above
        #  3. Score each point with distance to centroid
        #  4. B = Select the smallest half of clusters
        #  5. Build state 3 and 4 KD-Tree using top N-PC for each (from sampled PCA)
        #  6. Run KMeans on each (???) for label/weight of H-Cubes in KD Tree (????)
        #       ALT-> use HCUbe size as its weight
        #  7. A = HCubes for states 3 (and 4)
        #  8. Reweight A into both state 3 and state 4 (B) HCubes
        #  9. ID Overlap
        # 10. Apply Gamme Function

      logging.info("=====  Covariance Matrix PCA-KMeans Calculation (B)")
      logging.info("Retrieving All Covariance Vectors")
      home = os.getenv('HOME')
      cfile = home + '/work/DEBUG_COVAR_PTS'
      DO_COVAR = self.calc_covar  # For recalculating covariance matrices (if not pre-calc/stored)
      if DO_COVAR: 
        if os.path.exists(cfile + '.npy'):
          covar_pts = np.load(cfile + '.npy')
          logging.debug('Loaded From File')
        else: 
          covar_raw = self.catalog.lrange('subspace:covar:pts', 0, -1)
          covar_pts = np.array([np.fromstring(x) for x in covar_raw])
          np.save(cfile, covar_pts)
          logging.debug('Loaded From Catalog & Saved')
      covar_index = self.catalog.lrange('subspace:covar:xid', 0, -1)
      logging.debug('Indiced Loaded. Retrieving File Indices')
      covar_fidx = self.catalog.lrange('subspace:covar:fidx', 0, -1)

      if DO_COVAR: 
        logging.info("    Pulled %d Covariance Vectors", len(covar_pts))
        logging.info("Calculating Incremental PCA on Covariance (or Pick your PCA Algorithm here)")

        # FOR incrementatl PCA:
        NUM_PC = 6
        ipca_key = 'subspace:covar:ipca'
        ipca = PCAnalyzer.load(self.catalog, ipca_key)
        if ipca is None:
          logging.info('Creating a NEW IPCA')
          ipca = PCAIncremental(NUM_PC)
          lastindex = 0
        else:
          lastindex = ipca.trainsize
          logging.info('IPCA Exists. Trained on %d pts. Will update with incremental batch of %d NEW pts', 
            ipca.trainsize, len(covar_pts)-ipca.trainsize)

        # For incremental, partial solve using only newer pts (from the last "trainsize")
        if len(covar_pts)-lastindex > 0:
          ipca.solve(covar_pts[lastindex:])
          logging.info("Incrementatl PCA Updated. Storing Now...")

          ####  BARRIER 
          self.wait_catalog()
          ipca.store(self.catalog, ipca_key)

        logging.info("IPCA Saved. Projecting Covariance to PC")

      cfile = home + '/work/DEBUG_SUBCOVAR_PTS'
      if os.path.exists(cfile + '.npy'):
        subspace_covar_pts = np.load(cfile + '.npy')
      else: 
        subspace_covar_pts = ipca.project(covar_pts)
        np.save(cfile, subspace_covar_pts)

      # OW/ PROJECT NEW PTS ONLY -- BUT RETAIN grouped index of all points
      logging.info('Building Global KD Tree over Covar Subspace with %d data pts', len(subspace_covar_pts))
      global_kdtree = KDTree(250, maxdepth=8, data=subspace_covar_pts, method='middle')
  

      if MAX_HCUBE <= 0:
        hcube_global = global_kdtree.getleaves()
      else:
      # FOR DEBUGGING -- USE ONLY 3 GLOBAL HCUBES
        hcube_global_ALL = global_kdtree.getleaves()
        hcube_global = {}
        num = 0
        for k, v in hcube_global_ALL.items():
          hcube_global[k] = v
          num += 1
          if num == MAX_HCUBE:
            break

      # hcube_global = global_kdtree.getleaves()
      logging.info('Global HCubes: Key  Count  Volume  Density  (NOTE DEBUGGING ONLY 3 USED)')
      for k in sorted(hcube_global.keys()):
        v = hcube_global[k]
        logging.info('%-10s        %6d %8.1f %6.1f', k, v['count'], v['volume'], v['density'])

      if self.filelog:
        keys = hcube_global.keys()
        self.filelog.info('global,keys,%s',','.join(keys))
        self.filelog.info('global,count,%s',','.join([str(hcube_global[k]['count']) for k in keys]))
        self.filelog.info('global,volume,%s',','.join([str(hcube_global[k]['volume']) for k in keys]))
        self.filelog.info('global,density,%s',','.join([str(hcube_global[k]['density']) for k in keys]))

      logging.info("=====  SELECT Sampling of points from each Global HCube  (B)")
      # Order global HCubes by count (smallest first) before sampling
      s = sorted(hcube_global.items(), key=lambda x: x[1]['count'])
      hcube_global = {x[0]: x[1] for x in s}


      counter = 0
      for key in hcube_global.keys():
        counter += 1
        if hcube_global[key]['count']  <= MAX_SAMPLE_SIZE:
          cov_index = hcube_global[key]['elm']
          hcube_global[key]['samplefactor'] = 1
        else:
          cov_index = np.random.choice(hcube_global[key]['elm'], MAX_SAMPLE_SIZE)
          hcube_global[key]['samplefactor'] = len(hcube_global[key]['elm']) / MAX_SAMPLE_SIZE
        hcube_global[key]['idxlist'] = []
        for cov in cov_index:
          selected_hd_idx = np.random.choice(COVAR_SIZE, MAX_PT_PER_MATRIX).tolist()
          hcube_global[key]['idxlist'].extend([int(covar_index[cov]) + i for i in selected_hd_idx])
        logging.info('Back Projecting Global HCube `%s`  (%d out of %d)', key, counter, len(hcube_global.keys()))
        source_cov = self.backProjection(hcube_global[key]['idxlist'])
        hcube_global[key]['alpha'] = datareduce.filter_alpha(source_cov)
        logging.debug('Back Projected %d points to HD space: %s', 
          len(hcube_global[key]['idxlist']), str(hcube_global[key]['alpha']))
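      # Note on 'samplefactor' above (assumed values for illustration): a global
      # HCube holding 1000 covariance vectors sampled down to MAX_SAMPLE_SIZE=250
      # gets samplefactor = 1000/250 = 4, so statistics computed from the sample
      # can be rescaled by that factor to estimate the full HCube.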

      # logging.info('Calculating all HD Distances')
      # dist_hd = {}
      # dist_ld = {}
      # for key in hcube_global.keys():
      #   T = hcube_global[key]['alpha'].xyz
      #   N = len(T)
      #   dist_hd[key] = np.zeros(shape=(N, N))
      #   dist_ld[key] = {}
      #   for A in range(0, N):
      #     dist_hd[key][A][A] = 0
      #     for B in range(A+1, N):
      #       dist_hd[key][A][B] = dist_hd[key][B][A] = LA.norm(T[A] - T[B])
        

      # KD Tree for states from Reservoir Sample of RMSD labeled HighDim
      reservoir = ReservoirSample('rms', self.catalog)

      logging.info("=====  BUILD HCube Tree(s) Using Smallest State(s) (FROM RMSD Obsevations) ")
      hcube_list = {}

      logging.info("Scanning current set of observed bins and finding all smallest with data (excluding largest 2)")
      hcube_local = {}

      logging.info("=======================================================")
      logging.info("   PROJECT Global HCubes into Per-Bin HCube KD Tree(s)")
      logging.info("=======================================================\n")

      overlap_hcube = {k: {} for k in hcube_global.keys()}
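      # overlap_hcube is filled in the projection loop below as:
      #   overlap_hcube[global_hcube][tbin][local_hcube] =
      #     {'idxlist': local member indices, 'wgt': local density, 'num_projected': # of projected pts}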

      projection_map = {}


      pt_projection_list = []
      for key in sorted(hcube_global.keys()):
        for i in range(len(hcube_global[key]['alpha'].xyz)):
          pt_projection_list.append([])
      for bin_idx, tbin in enumerate(TEST_TBIN):
        logging.info("Project Global HCubes into local subspace for %s", str(tbin))
        # Load Vectors
        logging.info('Loading subspace and kernel for bin %s', str(tbin))

        # LOAD KPCA Kernel matrix
        kpca_key = 'subspace:pca:kernel:%d_%d' % tbin
        kpca = PCAnalyzer.load(self.catalog, kpca_key)

        data_raw = self.catalog.lrange('subspace:pca:%d_%d' % tbin, 0, -1)
        data = np.array([np.fromstring(x) for x in data_raw])
        if len(data) == 0:
          logging.error('No Raw PCA data points for bin %s.... Going to next bin', str(tbin))
          continue


        logging.info('Building KDtree over local %s bin from observations matrix of size: %s', str(tbin), str(data.shape))
        kdtree = KDTree(200, maxdepth=8, data=data, method='middle')
        hcube_local[tbin] = kdtree.getleaves()
        logging.info('LOCAL KD-Tree Completed for %s:', str(tbin))
        for k in sorted(hcube_local[tbin].keys()):
          logging.info('    `%-9s`   #pts:%6d   density:%9.1f', 
            k, len(hcube_local[tbin][k]['elm']), hcube_local[tbin][k]['density'])

        if self.filelog:
          keys = hcube_local[tbin].keys()
          A,B = tbin
          self.filelog.info('local,%d_%d,keys,%s',A,B,','.join(keys))
          self.filelog.info('local,%d_%d,count,%s',A,B,','.join([str(hcube_local[tbin][k]['count']) for k in keys]))
          self.filelog.info('local,%d_%d,volume,%s',A,B,','.join([str(hcube_local[tbin][k]['volume']) for k in keys]))
          self.filelog.info('local,%d_%d,density,%s',A,B,','.join([str(hcube_local[tbin][k]['density']) for k in keys]))          

        n_total = 0
        logging.debug('Global Hcubes to Project (%d):  %s', len(hcube_global.keys()), str(hcube_global.keys()))
        projection_map[bin_idx] = {k: set() for k in hcube_local[tbin].keys()}
        
        pnum = 0
        for key in sorted(hcube_global.keys()):
          overlap_hcube[key][tbin] = {}
          cov_proj_pca = kpca.project(hcube_global[key]['alpha'].xyz)

          logging.debug('PROJECT: Global HCube `%-9s` (%d pts) ==> Local KDTree %s  ', 
            key, len(cov_proj_pca), str(tbin))
          for i, pt in enumerate(cov_proj_pca):
            hcube = kdtree.probe(pt, probedepth=9)
            # NOTE: Retaining the count of projected pts; individual pts are also tracked via projection_map below
            if hcube not in overlap_hcube[key][tbin]:
              overlap_hcube[key][tbin][hcube] = {
                  'idxlist': hcube_local[tbin][hcube]['elm'],
                  'wgt': hcube_local[tbin][hcube]['density'], 
                  'num_projected': 0}
            overlap_hcube[key][tbin][hcube]['num_projected'] += 1

            # Index this point in corresponding local HCube projection view
            projection_map[bin_idx][hcube].add(pnum)

            pt_projection_list[pnum].append(hcube)
            pnum += 1

          for k, v in sorted(overlap_hcube[key][tbin].items()):
            logging.debug('   Project ==> Local HCube `%-9s`: %5d points', k, v['num_projected'])
          # logging.info('Calculating Lower Dimensional Distances')
          # N = len(cov_proj_pca)
          # dist_ld[key][tbin] = np.zeros(shape=(N, N))
          # for A in range(0, N):
          #   for B in range(A+1, N):
          #     dist_ld[key][tbin][A][B] = dist_ld[key][tbin][B][A] = LA.norm(cov_proj_pca[A] - cov_proj_pca[B])


      # Re-index the projected points -- could make this a list too

      next_index = 0
      view_list = []
      for bin_idx, hcube_map in projection_map.items():
        hcube_list = []
        for hcube_key, pt_list in hcube_map.items():
          hcube_list.append((set((hcube_key,)), set(pt_list)))
        view_list.append((set((bin_idx,)), hcube_list))

      print("CALLING: Collapse Join")
      joined_subspaces = collapse_join(projection_map.keys(), view_list)
      for subspace_list, correlated_hcubes in joined_subspaces:
        tbin_list = [TEST_TBIN[bin_idx] for bin_idx in subspace_list]
        for hcube_list, pt_list in correlated_hcubes:
          print(tbin_list, hcube_list, pt_list)
          # TODO: Correlate back to Global
      print('Visualize HERE')
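      # collapse_join is defined elsewhere in the project; a minimal sketch of
      # the pairwise case it generalizes (names here are illustrative only):
      #
      #   def join_two_views(view_a, view_b):
      #     """Intersect the point sets of every HCube pair across two views.
      #     Each view is a list of (hcube_key_set, point_id_set) tuples."""
      #     joined = []
      #     for hc_a, pts_a in view_a:
      #       for hc_b, pts_b in view_b:
      #         common = pts_a & pts_b
      #         if common:
      #           joined.append((hc_a | hc_b, common))
      #     return joined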



      # for idx, tbin in enumerate(TEST_TBIN):
      #   # Only process substates with data
      #   if tbin not in hcube_local:
      #     logging.warning('Local KD Tree not created for %s', str(tbin))
      #     continue
      #   projection_map[(idx,)] = {k: set() for k in hcube_local[tbin].keys()}


      # sets = {}
      # proj_bin_list = []
      # for tbin in TEST_TBIN:
      #   if tbin not in hcube_local:
      #     continue
      #   proj_bin_list.append(tbin)
      #   sets[tbin] = {k: set() for k in hcube_local[tbin].keys()}
      # for n, proj in enumerate(pt_projection_list):
      #   for i, tbin in enumerate(proj_bin_list):
      #     sets[tbin][proj[i]].add(n)
      #   if self.filelog:
      #     self.filelog.info('%d,%s', n, ','.join(proj))
      #   logging.info('%d,%s', n, ','.join(proj))

      # set_list = {}
      # for tbin, view in sets.items():
      #   set_list[(tbin,)] = []
      #   for hcube, idxlist in view.items():
      #     print(tbin, hcube, idxlist)
      #     set_list[(tbin,)].append((set((hcube,)), idxlist))


      # def collapse(C):
      #   a = 0
      #   b = 0
      #   N = []
      #   while a < len(C) and b < len(C):
      #     A = sorted(C[a])
      #     B = sorted(C[b])
      #     if A == B:
      #       b += 1
      #     elif A[0] == B[0]:
      #       N.append(set(A)|set(B))
      #       b += 1
      #     else:
      #       a += 1
      #   if len(N) <= 1:
      #     return []
      #   else:
      #     return N + collapse(N)

      # q=collapse(t1)
      # for i in q: print(sorted(i))


      # print('Checking all 2-Way Joins')
      # join2 = {}
      # for a in range(0, len(proj_bin_list)-1):
      #   tA = proj_bin_list[a]
      #   for b in range(a+1, len(proj_bin_list)):
      #     tB = proj_bin_list[b]
      #     join_ss = tuple(set((tA, tB)))
      #     set_list = []
      #     for kA, vA in sets[tA].items():
      #       for kB, vB in sets[tB].items():
      #         join_hc = set((kA, kB))
      #         inter = vA & vB
      #         if len(inter) > 0:
      #           set_list.append((join_hc, inter))
      #     if len(set_list) > 0:
      #       join2[join_ss] = set_list
      # print('2-Way Join Results:')
      # for ss, set_list in join2.items():
      #   for hc, idxlist in set_list:
      #     print(ss, hc, idxlist)


      # print('Checking all 3-Way Joins')
      # join3 = []
      # checked = []
      # for a in range(0, len(join2)-1):
      #   sA, hA, vA = join2[a]
      #   for b in range(a+1, len(join2)):
      #     sB, hB, vB = join2[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join3.append((ss, hc, inter))


      # print('Checking all 4-Way Joins')
      # join4 = []
      # checked = []
      # for a in range(0, len(join3)-1):
      #   sA, hA, vA = join3[a]
      #   for b in range(a+1, len(join3)):
      #     sB, hB, vB = join3[b]
      #     if sA == sB:
      #       continue
      #     ss, hc = sA | sB, hA | hB
      #     if (ss, hc) in checked[-10:]:
      #       continue
      #     checked.append((ss, hc))
      #     inter = vA & vB
      #     if len(inter) > 0:
      #       join4.append((ss, hc, inter))

      # if self.filelog:
      #   for i in join2:
      #     self.filelog.info('%s', str(i))
      #   for i in join3:
      #     self.filelog.info('%s', str(i))
      #   for i in join4:
      #     self.filelog.info('%s', str(i))

      # NOTE: this check depends on dist_hd / dist_ld from the distance
      # calculations commented out above; re-enable those before setting
      # DO_MIN_CHECK to True.
      DO_MIN_CHECK = False
      if DO_MIN_CHECK:
        def maxcount(x):
          y={}
          for i in x:
            y[i] = 1 if i not in y else y[i]+1
          return max(y.values())

        print('% of points per HCube with the same NN subspaces (e.g. 20% of points have the same NN in 5 sub-spaces)')
        argmin_nonzero = lambda x: np.argmin([(i if i>0 else np.inf) for i in x])
        for key in hcube_global.keys():
          # logging.info('Showing MIN / MAX for points from HCube %s:', key)
          minA = {}; maxA = {}
          for n in range(len(dist_hd[key])):
            minA[n] = []; maxA[n] = []
            for tbin in TEST_TBIN:
              if tbin not in dist_ld[key].keys():
                # Keep the lists aligned with TEST_TBIN by padding with zeros
                minA[n].append(0)
                maxA[n].append(0)
              else:
                minA[n].append(argmin_nonzero(dist_ld[key][tbin][n]))
                maxA[n].append(np.argmax(dist_ld[key][tbin][n]))
          numsame = np.zeros(len(dist_ld[key].keys())+1)
          for n in range(len(dist_hd[key])):
            minH = argmin_nonzero(dist_hd[key][n])
            maxH = np.argmax(dist_hd[key][n])
            minmax = ['%2d/%-2d'%i for i in zip(minA[n], maxA[n])]
            numsamepair = maxcount(minA[n])
            numsame[numsamepair] += 1
            # print('%3d'%n, '%2d/%-2d  '%(minH, maxH), '%s' % ' '.join(minmax), '   [%d]'%numsamepair)
          print(' '.join(['%4.1f%%'%i for i in (100* (numsame/np.sum(numsame)))]))

      print('Stopping HERE!')
      sys.exit(0)
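      # NOTE: everything below this exit is currently unreachable; remove the
      # sys.exit(0) above (and the one near the end) to run the bipartite
      # graph construction.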
      #  GAMMA FUNCTION EXPR # 8
      gamma1 = lambda a, b: (a * b)       # combine weights by product
      gamma2 = lambda a, b: (a + b) / 2   # combine weights by average
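      # Worked example with assumed values: wgt_A = 4.0 (local density),
      # wgt1_B = 2.5 (global density), num_proj = 10 so wgt2_B = 25.0:
      #   combW1 = gamma1(4.0, 2.5)  = 10.0      combW2 = gamma1(4.0, 25.0) = 100.0
      #   combW3 = gamma2(4.0, 2.5)  = 3.25      combW4 = gamma2(4.0, 25.0) = 14.5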

      # TODO: Factor in RMS weight
      for tbin in TEST_TBIN:
      # for tbin in sorted(bin_list):
        logging.info('')
        logging.info('BIPARTITE GRAPH for %s', str(tbin))
        bipart = {}
        edgelist = []
        for hcB in hcube_global.keys():
          num_B  = hcube_global[hcB]['count']
          wgt1_B = hcube_global[hcB]['density']
          if tbin not in overlap_hcube[hcB]:
            continue
          for hcA, hcA_data in overlap_hcube[hcB][tbin].items():
            edge = {}
            if hcA not in bipart:
              bipart[hcA] = []  
            num_proj  = hcA_data['num_projected']
            wgt_A  = hcA_data['wgt']
            wgt2_B = wgt1_B*num_proj
            edge['combW1'] = gamma1(wgt_A, wgt1_B)
            edge['combW2'] = gamma1(wgt_A, wgt2_B)
            edge['combW3'] = gamma2(wgt_A, wgt1_B)
            edge['combW4'] = gamma2(wgt_A, wgt2_B)
            edge['num_A']  = len(hcA_data['idxlist'])
            edge['num_B']  = num_B
            edge['num_proj']  = num_proj
            edge['wgt_A']  = wgt_A
            edge['wgt1_B'] = wgt1_B
            edge['wgt2_B'] = wgt2_B
            edge['hcA'] = hcA
            edge['hcB'] = hcB
            bipart[hcA].append(edge)
            edgelist.append((hcA, hcB, num_proj))
        if len(bipart) == 0:
          logging.info("NO DATA FOR %s", str(tbin))
          continue
        logging.info('')
        logging.info('A (# Pts) H-Cube        <--- B H-Cube (# proj/total Pts)      wgt_A  wB1:density wB2:Mass     A*B1     A*B2     AVG(A,B1)     AVG(A,B2)')
        for k, v in bipart.items():
          for edge in v:
            logging.info('A (%(num_A)4d pts) `%(hcA)-8s` <--- `%(hcB)9s`  (%(num_B)4d / %(num_proj)4d pts) B %(wgt_A)9.1f %(wgt1_B)9.1f %(wgt2_B)9.1f %(combW1)9.1f %(combW2)9.1f %(combW3)9.1f %(combW4)9.1f' % edge)
            if self.filelog:
              A,B = tbin
              self.filelog.info('edge,%d_%d,%s,%s,%d',A,B,edge['hcA'],edge['hcB'],edge['num_proj'])

        # Prepare nodes for graph
        nA = set()
        nB = set()
        elist = []
        for e in edgelist:
          a, b, z = e
          if z <= 5:
            continue
          nA.add(a)
          nB.add(b)
          elist.append((a,b,z))
        nAKeys = sorted(nA)[::-1]
        nBKeys = sorted(nB)[::-1]
        sizesA = [hcube_local[tbin][n]['count'] for n in nAKeys]
        sizesB = [hcube_global[n]['count']*3 for n in nBKeys]
        idxA = {key: i for i, key in enumerate(nAKeys)}
        idxB = {key: i for i, key in enumerate(nBKeys)}
        edges = [(idxA[a], idxB[b], z) for a, b, z in elist]
        G.bipartite(sizesA,sizesB,edges,sizesA,sizesB,'bipartite_%d_%d' % tbin)

      logging.info('STOPPING HERE!!!!')
      sys.exit(0)
      return []