def load_PCA_Subspace(catalog):
    """Load pre-computed PCA vectors and projections into the catalog.

    Steps performed:
      1. Load the PCA vectors from ``settings.PCA_VECTOR_FILE``, keep at most
         ``settings.PCA_NUMPC`` of them and store them under ``'pcaVectors'``.
      2. Load the pre-calculated projections from ``data/pca_applied.npy``,
         keep the first ``num_pc`` columns and append each point (as raw
         bytes) to the ``'subspace:pca'`` list.
      3. Build a KD tree over those points, JSON-encode it and store it
         under ``'hcube:pca'`` (replacing any previous value).

    Args:
        catalog: catalog handle providing ``storeNPArray``, ``pipeline``,
            ``delete`` and ``set``.
    """
    settings = systemsettings()
    vectfile = settings.PCA_VECTOR_FILE

    logging.info("Loading PCA Vectors from %s", vectfile)
    pc_vect = np.load(vectfile)

    # The vectors are sliced along axis 0 below, so the upper bound on the
    # number of usable components must come from that same axis. (Reading
    # shape[1] here let num_pc exceed the number of rows actually kept.)
    max_pc = pc_vect.shape[0]
    num_pc = min(settings.PCA_NUMPC, max_pc)
    pc = pc_vect[:num_pc]

    logging.info("Storing PCA Vectors to key: %s", 'pcaVectors')
    catalog.storeNPArray(pc, 'pcaVectors')

    logging.info("Loading Pre-Calculated PCA projections from Historical BPTI Trajectory")
    pre_calc_deshaw = np.load('data/pca_applied.npy')

    # Extract only nec'y PC's
    pts = pre_calc_deshaw.T[:num_pc].T

    # NOTE(review): 'subspace:pca' is appended to without being cleared first,
    # so calling this twice would duplicate the stored points -- confirm
    # whether a delete is expected here.
    pipe = catalog.pipeline()
    for si in pts:
        pipe.rpush('subspace:pca', si.tobytes())
    pipe.execute()
    logging.debug("PCA Subspace stored in Catalog")

    # HCube leaf size of 500 points
    logging.info('Creating KD Tree')
    kd = KDTree(500, maxdepth=8, data=pts)

    logging.info('Encoding KD Tree')
    packaged = kd.encode()
    encoded = json.dumps(packaged)

    logging.info('Storing in catalog')
    catalog.delete('hcube:pca')
    catalog.set('hcube:pca', encoded)
    logging.info('PCA Complete')
def calcDEShaw_PCA(catalog, force=False):
    """Project the DE Shaw points onto the stored PCA vectors and index them.

    When the ``'subspace:pca'`` list is empty (or ``force`` is true) the
    source points in ``DESHAW_PTS_FILE`` are projected onto the first three
    vectors stored under ``'pcaVectors'``; each projected point is appended
    to ``'subspace:pca'`` and a matching ``'(-1, <raw index>)'`` entry is
    appended to ``'xid:reference'``. Otherwise the previously stored
    projections are read back from ``'subspace:pca'``.

    In both cases a KD tree is then built over the projected points,
    JSON-encoded and stored under ``'hcube:pca'``.

    Args:
        catalog: catalog handle providing ``llen``, ``lrange``, ``delete``,
            ``set``, ``pipeline`` and ``loadNPArray``.
        force: recompute the projection even if ``'subspace:pca'`` already
            holds points.
    """
    numPC = 3

    numpts = catalog.llen('subspace:pca')
    if numpts == 0 or force:
        catalog.delete('subspace:pca')
        logging.debug("Projecting DEshaw PCA Vectors (assuming PC's are pre-calc'd")

        pcavect = catalog.loadNPArray('pcaVectors')
        logging.debug("Loaded PCA Vectors: %s, %s", str(pcavect.shape), str(pcavect.dtype))

        src = np.load(DESHAW_PTS_FILE)
        logging.debug("Loaded source points: %s, %s", str(src.shape), str(src.dtype))

        pcalist = np.zeros(shape=(len(src), numPC))
        start = dt.datetime.now()

        # Only the heavy atoms of each conformation take part in the projection.
        pdbfile, dcdfile = deshaw.getHistoricalTrajectory(0)
        traj = md.load(dcdfile, top=pdbfile, frame=0)
        filt = traj.top.select_atom_indices(selection='heavy')

        # Loop-invariant: the leading components used for every projection.
        components = pcavect[:numPC]

        # NOTE(review): 'xid:reference' is not cleared when force=True, so a
        # forced re-run appends a second set of references -- confirm intent.
        pipe = catalog.pipeline()
        for i, conform in enumerate(src):
            if i % 10000 == 0:
                logging.debug("Projecting: %d", i)
            heavy = conform[filt]
            projected = np.array(
                [np.dot(heavy.reshape(pc.shape), pc) for pc in components])
            np.copyto(pcalist[i], projected)
            raw_index = i * DESHAW_SAMPLE_FACTOR
            pipe.rpush('xid:reference', '(-1, %d)' % raw_index)

        end = dt.datetime.now()
        # total_seconds() includes whole days; .seconds alone would wrap.
        logging.debug("Projection time = %d", (end - start).total_seconds())

        for si in pcalist:
            pipe.rpush('subspace:pca', si.tobytes())
        pipe.execute()
        logging.debug("R_Index Created (pca)")
    else:
        logging.info('PCA Already created. Retrieving existing lower dim pts')
        X = catalog.lrange('subspace:pca', 0, -1)
        # frombuffer replaces the deprecated binary mode of np.fromstring;
        # np.array copies the read-only views into a fresh writable array.
        pcalist = np.array([np.frombuffer(si) for si in X])

    # HCube leaf size of 500 points
    logging.info('Creating KD Tree')
    kd = KDTree(500, data=pcalist)

    logging.info('Encoding KD Tree')
    encoded = json.dumps(kd.encode())

    logging.info('Storing in catalog')
    catalog.delete('hcube:pca')
    catalog.set('hcube:pca', encoded)
    logging.info('PCA Complete')