def doCovar(self, winsize=.1, slide=.05):
    """Calculate covariance matrices over all temporal data in accordance
    with the given window parameters (winsize/slide in ns)."""
    self.covar = []
    self.fealcov = []
    for i, tr in self.E.trlist.items():
        if i % 100 == 0:
            print(i)
        cov = dr.calc_covar(tr.xyz, winsize, 1, slide=slide)
        # Select the feature space and the matching centroids
        if self.space in ['ds', 'dsw']:
            X, C = dr.distance_space(tr), self.E.cent_ds
        else:
            X, C = tr.xyz, self.E.cent_c
        wgt = self.E.cw if self.space in ['cw', 'dsw'] else [1, 1, 1, 1, 1]
        rms = calc_rmsd(X, C, weights=wgt)
        # Window and slide lengths in frames (1000 frames per ns)
        W = int(winsize * 1000)
        S = int(slide * 1000)
        # Mean atemporal feature landscape over each sliding window
        # (loop variable renamed from `i`, which shadowed the outer index)
        feal = [np.mean([FL.feal.atemporal(r) for r in rms[st:st + W]], axis=0)
                for st in range(0, len(tr.xyz), S)]
        # Keep covariance/feal pairs only where both windows exist
        for n in range(min(len(cov), len(feal))):
            self.covar.append(cov[n])
            self.fealcov.append(feal[n])
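# --- Illustrative sketch (not part of the original module) ----------------
# doCovar() delegates the sliding-window covariance to dr.calc_covar(),
# whose implementation is not shown in this section. Below is a minimal,
# hypothetical stand-in: the signature and the assumption of 1000 frames
# per ns (the same scale implied by W = int(winsize * 1000) above) are
# guesses, not the repo's actual definition.
import numpy as np

def calc_covar_sketch(xyz, winsize, frame_per_ns=1000, slide=None):
    """Covariance of flattened (n_frames, n_atoms, 3) coords per window."""
    W = int(winsize * frame_per_ns)                        # window length in frames
    S = int((slide if slide else winsize) * frame_per_ns)  # slide in frames
    flat = xyz.reshape(len(xyz), -1)                       # (n_frames, n_atoms*3)
    return [np.cov(flat[st:st + W].T)
            for st in range(0, len(flat) - W + 1, S)]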
@classmethod
def calc_feal_AB(cls, traj):
    """Atemporal (per-frame) feature landscape."""
    maxd = cls.max_rms_dist
    mind = cls.min_rms_dist
    ds = DR.distance_space(traj)
    rms = rmsd.calc_rmsd(ds, cls.cent_ds)
    feal_list = []
    for i in range(traj.n_frames):
        # Proximity to state: flag the nearest of the 5 centroids
        fealand = [0 for _ in range(5)]
        fealand[np.argmin(rms[i])] = cls.scaleto
        # Proximity: scale each centroid distance into [0, scaleto]
        for dist in rms[i]:
            fealand.append(cls.scaleto * max(maxd - max(dist, mind), 0) / (maxd - mind))
        # Additional feature spaces: pairwise relative centroid distances
        tup = []
        for a in range(4):
            for b in range(a + 1, 5):
                rel_dist = rms[i][a] - rms[i][b]
                tup.append(log_reld(rel_dist))
        fealand.extend(tup)
        feal_list.append(fealand)
    return np.array(feal_list)
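# --- Illustrative sketch (not part of the original module) ----------------
# log_reld() is called above but not defined in this section. One plausible
# reading is a sign-preserving log compression of the gap between two
# centroid RMSDs; the functional form and scale factor here are assumptions.
import numpy as np

def log_reld_sketch(rel_dist, scaleto=10.0):
    """Hypothetical stand-in for log_reld: compress a large RMSD gap while
    preserving which of the two centroids is closer."""
    return np.sign(rel_dist) * scaleto * np.log1p(abs(rel_dist))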
# Load the 42 historical DEShaw protein trajectories
traj, rmsd = [], []
for tnum in range(42):
    p, d = DE.getHistoricalTrajectory_prot(tnum)
    traj.append(md.load(d, top=p).atom_slice(afilt))

ccent = np.load('../data/bpti-alpha-cartesian-centroid.npy')
resrms = []
for tnum, tr in enumerate(detraj):
    # Slice out the energy-minima frames identified by TimeScape
    minlist = TS.TimeScape.read_log(home + '/work/timescape/desh_%02d_minima.log' % tnum)
    minima = tr.slice(minlist)
    minima.superpose(topoa)
    # Per-residue RMS to the reference structure
    for m in minima.xyz:
        resrms.append(LA.norm(ref - m, axis=1))
    # Assign each minimum to its nearest centroid in distance space
    ds = DR.distance_space(minima)
    state = np.array([np.argmin(LA.norm(cent - i, axis=1)) for i in ds])
    # Alternative pass: per-residue RMS between successive minima
    # (the source had `for in range(...)`, a syntax error, and compared a
    # coordinate array against a Trajectory; both fixed here)
    resrms = [np.zeros(58)]
    for i in range(1, len(minima)):
        resrms.append(LA.norm(minima[i].xyz[0] - minima[i - 1].xyz[0], axis=1))
    # Alternative pass: per-residue RMS to the assigned state's centroid
    for m, s in zip(minima, state):
        resrms.append(LA.norm(m.xyz[0] - ccent[s], axis=1))

# Flag residues whose mean RMS shifts by more than theta between the
# 5-entry windows before and after each index
resrms = np.array(resrms)
CM = np.zeros(shape=(91116, 58), dtype=bool)
for i in range(1, 91116):
    CM[i] = np.abs(resrms[max(0, i - 5):i].mean(0) - resrms[i:min(91116, i + 5)].mean(0)) > theta

a, b, Z = 0, 30000, 300
for sc in range(8):
    data = {'State': bL[a:b]}
    for rd in range(7):
        ...  # fragment truncated in source
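# --- Illustrative sketch (not part of the original script) ----------------
# The CM loop above is a change-point test: for each index it compares the
# mean per-residue RMS over the 5 preceding and 5 following entries and
# flags residues whose shift exceeds theta. The same logic on toy data:
import numpy as np

rng = np.random.default_rng(0)
toy_rms = rng.normal(size=(200, 58))        # 200 minima x 58 residues (synthetic)
theta_demo = 0.5
CM_demo = np.zeros(toy_rms.shape, dtype=bool)
for k in range(1, len(toy_rms)):
    before = toy_rms[max(0, k - 5):k].mean(0)
    after = toy_rms[k:min(len(toy_rms), k + 5)].mean(0)
    CM_demo[k] = np.abs(before - after) > theta_demo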
def bootstrap_lattice(catalog, num=10, build_new=False):
    ''' Bootstrap after TimeScape has run on the source trajectory '''
    home = os.getenv("HOME")
    support = 1
    cutoff = 8
    start_coord = ['de2586_315', 'de531_20', 'de3765_63', 'de3305_668', 'de1732_139']
    dcdfile = lambda x: home + '/work/data/{0}.dcd'.format(x)
    outloc = lambda x: home + '/work/jc/denovouniform1/{0}/{0}'.format(x)

    traj_list = {}
    basin_list = catalog.lrange('basin:list', 0, -1)
    if len(basin_list) == 134:
        logging.info('Basin Data already loaded!')
        rms_delta_list = [(i, np.sum(pickle.loads(catalog.get('basin:rmsdelta:' + b))))
                          for i, b in enumerate(basin_list)]
    else:
        logging.info('Loading all bootstrap data to initialize...')
        basin_list = []
        rms_delta_list = []
        pdb_file = home + '/work/data/alpha.pdb'
        topo = md.load(pdb_file)
        ref_alpha = md.load(home + '/work/' + catalog.get('pdb:ref:0'))
        ref_alpha.atom_slice(ref_alpha.top.select_atom_indices('alpha'), inplace=True)
        res_rms_Kr = FEATURE_SET

        for sc in start_coord:
            dist_space = []
            srcfile = outloc(sc) + '.dcd'
            pdbfile = srcfile.replace('dcd', 'pdb')
            logging.debug('LOADING TRAJ:  %s', srcfile)
            traj = md.load(srcfile, top=pdbfile)
            traj_list[sc] = traj
            alpha = traj.atom_slice(traj.top.select_atom_indices('alpha'))

            logging.info('Grabbing TS data...')
            W = TS.TimeScape.windows(outloc(sc) + '_transitions.log')
            ts_traj = TS.TimeScapeParser(pdbfile, outloc(sc), sc, dcd=srcfile, traj=traj)
            basins = ts_traj.load_basins()

            logging.info("Processing distance space and residue RMS")
            dsa = DR.distance_space(alpha)
            resrmsd = 10 * np.array([LA.norm(i - ref_alpha.xyz[0], axis=1) for i in alpha.xyz])
            basin_res_rms = np.zeros(shape=(len(ts_traj.basins), alpha.n_atoms))
            for i, (a, b) in enumerate(W):
                dist_space.append(dsa[a:b].mean(0))
                basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)
            basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
            logging.debug('RMS LEN CHECK:  %d =?= %d    -- Updating RMS Delta',
                          len(basins), len(basin_res_rms_delta))

            for i, basin in enumerate(basins):
                pipe = catalog.pipeline()
                bid = basin.id

                # Store on disk and in redis
                jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
                logging.info('MIN for %s:  Idx# %d  to %s', bid, basin.mindex, jc_filename)
                minima_frame = traj.slice(basin.mindex)  # md.load_frame(src_traj, basin.mindex, top=src_traj.replace('dcd', 'pdb'))
                minima_frame.save_pdb(jc_filename)

                basin_hash = basin.kv()
                basin_hash['pdbfile'] = jc_filename
                logging.info('  Basin: %(id)s  %(start)d - %(end)d   Minima: %(mindex)d    size=%(len)d' % basin_hash)

                pipe.rpush('basin:list', bid)
                pipe.hmset('basin:%s' % bid, basin_hash)
                pipe.set('basin:dmu:' + bid, pickle.dumps(dist_space[i]))
                pipe.set('minima:%s' % bid, pickle.dumps(minima_frame))

                # FOR RESIDUE RMSD
                resrms_d = np.sum(basin_res_rms_delta[i][res_rms_Kr])
                basin_hash['resrms_delta'] = resrms_d
                rms_delta_list.append((len(basin_list), resrms_d))
                basin_list.append(basin_hash)
                pipe.set('basin:rmsdelta:' + bid, pickle.dumps(basin_res_rms_delta[i]))

                pipe.execute()

    # FOR SEED SAMPLING USING RMS_DELTA
    # Note: skip the first basin

    # Re-construct the lattice
    if build_new:
        dist_space = 10 * np.array(dist_space)
        cm = dist_space < cutoff   # was `ds < cutoff`; `ds` was never defined here
        fs = lat.reduced_feature_set(cm, .115)
        dr, cr = dist_space[:, fs], cm[:, fs]
        mfis, lfis = lat.maxminer(cr, 1)
        dlat, ik = lat.derived_lattice(mfis, dr, cr)
        pickle.dump(mfis, open(home + '/work/data/denovo_mfis.p', 'wb'))
        pickle.dump(lfis, open(home + '/work/data/denovo_lfis.p', 'wb'))
        pickle.dump(ik, open(home + '/work/data/denovo_iset.p', 'wb'))
        pickle.dump(dlat, open(home + '/work/data/denovo_dlat.p', 'wb'))
    else:
        logging.info('Loading Pre-Constructed Lattice Data')
        dlat = pickle.load(open(home + '/work/data/denovo_dlat.p', 'rb'))
        mfis = pickle.load(open(home + '/work/data/denovo_mfis.p', 'rb'))
        lfis = pickle.load(open(home + '/work/data/denovo_lfis.p', 'rb'))
        ik = pickle.load(open(home + '/work/data/denovo_iset.p', 'rb'))

    with catalog.pipeline() as pipe:
        pipe.set('lattice:max_fis', pickle.dumps(mfis))
        pipe.set('lattice:low_fis', pickle.dumps(lfis))
        pipe.set('lattice:dlat', pickle.dumps(dlat))
        pipe.set('lattice:iset', pickle.dumps(ik))
        pipe.execute()

    # logging.info('Building Existing lattice object')
    # lattice = lat.Lattice(ds, fs, cutoff, support)
    # lattice.set_fis(max_fis, low_fis)
    # lattice.set_dlat(dlat, Ik)
    # sampler = LatticeSampler(lattice)

    # Sample -- FOR USING LATTICE TO BOOTSTRAP
    # cl, sc, el = lat.clusterlattice(dlat, cr, dr, ik, num_k=8, invert=True)
    # cl_list = sorted(el, key=lambda x: len(x))
    # TODO: Check if fan out > single item clusters
    # start_indices = [clu[0][0] for clu in cl_list[:num]]

    rms_delta_ranked = [x[0] for x in sorted(rms_delta_list, key=lambda i: i[1], reverse=True)]
    start_indices = rms_delta_ranked[:num]
    seedlist = [catalog.lindex('basin:list', i) for i in start_indices]

    sim_init = {key: catalog.get(key) for key in settings.sim_params.keys()}
    global_params = getSimParameters(sim_init, 'seed')
    global_params['psf'] = home + '/work/jc/serial2/de0_0/de0_0.psf'

    for seed in seedlist:
        logging.debug('\nSeeding Job: %s ', seed)
        basin = catalog.hgetall('basin:%s' % seed)
        catalog.rpush('executed', seed)

        # Generate new set of params/coords
        jcID, config = generateFromBasin(basin)

        # Update additional JC params and decision history, as needed
        config.update(global_params)

        # Push to catalog
        logging.info("New Simulation Job Created: %s", jcID)
        for k, v in config.items():
            logging.debug("   %s:  %s", k, str(v))
        catalog.rpush('jcqueue', jcID)
        catalog.hmset(wrapKey('jc', jcID), config)
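# --- Illustrative sketch (not part of the original module) ----------------
# rms_delta() is applied above to each residue's per-basin median RMS series
# (basin_res_rms.T) but is not defined in this section. One plausible form,
# consistent with the transposed input/output shapes, is the absolute
# basin-to-basin change; this is an assumption, not the repo's definition.
import numpy as np

def rms_delta_sketch(series):
    """Absolute change of one residue's RMS from each basin to the next;
    the first basin gets 0 so the output matches the input length."""
    return np.concatenate(([0.0], np.abs(np.diff(series))))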
def execute(self, job):
    # PRE-PROCESS -----------------------------------------------------------
    settings = systemsettings()
    bench = microbench("sim_%s" % settings.name, self.seqNumFromID())
    bench.start()
    stat = StatCollector("sim_%s" % settings.name, self.seqNumFromID())
    mylogical_seqnum = str(self.seqNumFromID())

    # Prepare working directory, input/output files
    conFile = os.path.join(job["workdir"], job["name"] + ".conf")
    logFile = conFile.replace("conf", "log")  # log in same place as config file
    dcdFile = conFile.replace("conf", "dcd")  # dcd in same place as config file
    USE_SHM = True

    SIMULATE_RATIO = settings.SIMULATE_RATIO
    if SIMULATE_RATIO > 1:
        logging.warning(" USING SIMULATION RATIO OF %d -- This is ONLY for debugging", SIMULATE_RATIO)
    frame_size = (SIMULATE_RATIO * int(job["interval"])) / (1000)
    logging.info("Frame Size is %f  Using Sim Ratio of 1:%d", frame_size, SIMULATE_RATIO)

    EXPERIMENT_NUMBER = settings.EXPERIMENT_NUMBER
    logging.info("Running Experiment Configuration #%d", EXPERIMENT_NUMBER)

    # Grab historical basin data's relative time to start (for lineage)
    traj = None

    # EXECUTE SIMULATION -----------------------------------------------------
    if self.skip_simulation:
        logging.info("1. SKIPPING SIMULATION.....")
        USE_SHM = False
        job["dcd"] = dcdFile
        key = wrapKey("jc", job["name"])
        self.data[key]["dcd"] = dcdFile
    else:
        logging.info("1. Run Simulation")

        # Prepare & source to config file
        with open(self.data["sim_conf_template"], "r") as template:
            source = template.read()

        # >>>> Storing DCD into shared memory on this node
        if USE_SHM:
            ramdisk = gettempdir()
            job["outputloc"] = ramdisk
            dcd_ramfile = os.path.join(ramdisk, job["name"] + ".dcd")
        else:
            job["outputloc"] = ""

        with open(conFile, "w") as sysconfig:
            sysconfig.write(source % job)
            logging.info("Config written to: " + conFile)

        # # Run simulation in parallel
        # if 'parallel' in job:
        #     numnodes = job['parallel']
        #     total_tasks = numnodes * 24
        #     cmd = 'mpiexec -n %d namd2 %s > %s' % (total_tasks, conFile, logFile)
        # # Run simulation single threaded
        # else:
        #     cmd = 'namd2 %s > %s' % (conFile, logFile)
        # cmd = 'mpirun -n %d namd2 %s > %s' % (PARALLELISM, conFile, logFile)
        check = executecmd("module list")
        logging.debug("%s", check)
        cmd = "namd2 +p%d %s > %s" % (PARALLELISM, conFile, logFile)

        # MICROBENCH #1 (file to Lustre)
        # logging.debug("Executing Simulation:\n   %s\n", cmd)
        # bench = microbench()
        # bench.start()
        # stdout = executecmd(cmd)
        # logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
        # bench.mark('SimExec:%s' % job['name'])
        # shm_contents = os.listdir('/dev/shm/out')
        # logging.debug('Ramdisk contents (should have files) : %s', str(shm_contents))
        # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
        # logging.info("Copy Complete to Lustre.")
        # bench.mark('CopyLustre:%s' % job['name'])
        # shutil.rmtree(ramdisk)
        # shm_contents = os.listdir('/dev/shm')
        # logging.debug('Ramdisk contents (should be empty) : %s', str(shm_contents))
        # bench.show()

        max_expected_obs = int(job["runtime"]) // int(job["dcdfreq"])
        # Retry up to 3 attempts if the sim fails
        MAX_TRY = 3
        for i in range(MAX_TRY, 0, -1):
            min_required_obs = int(max_expected_obs * ((i - 1) / (MAX_TRY)))
            logging.debug("Executing Simulation:\n   %s\n", cmd)
            logging.debug("# Obs Expected to see: %d", max_expected_obs)
            stdout = executecmd(cmd)
            logging.info("SIMULATION Complete! STDOUT/ERR Follows:")
            # Check file for expected data
            if USE_SHM:
                traj = md.load(dcd_ramfile, top=job["pdb"])
            else:
                traj = md.load(dcdFile, top=job["pdb"])
            logging.info("Obs Threshold  = %4d", min_required_obs)
            logging.info("#Obs This Traj = %4d", traj.n_frames)
            if traj.n_frames >= min_required_obs:
                logging.info("Full (enough) Sim Completed")
                break
            logging.info("Detected a failed Simulation. Retrying the same sim.")

        bench.mark("simulation")
        # bench.mark('SimExec:%s' % job['name'])

        # Internal stats
        sim_length = self.data["sim_step_size"] * int(job["runtime"])
        sim_realtime = bench.delta_last()
        sim_run_ratio = (sim_realtime / 60) / (sim_length / 1000000)
        logging.info("##SIM_RATIO %6.3f  min-per-ns-sim", sim_run_ratio)
        stat.collect("sim_ratio", sim_run_ratio)

        if USE_SHM:
            shm_contents = os.listdir(ramdisk)
            logging.debug("Ramdisk contents (should have files) : %s", str(shm_contents))
            if not os.path.exists(dcd_ramfile):
                logging.warning("DCD FILE NOT FOUND!!!! Wait 10 seconds for sim to close it (???)")
                time.sleep(10)
            if not os.path.exists(dcd_ramfile):
                logging.warning("DCD FILE STILL NOT FOUND!!!!")
            else:
                logging.info("DCD File was found")

        # # MICROBENCH #2 (file to Alluxio)
        # allux = AlluxioClient()
        # # copy to Alluxio FS
        # allux.put(ramdisk + job['name'] + '.dcd', '/')
        # logging.info("Copy Complete to Alluxio.")
        # bench.mark('CopyAllux:%s' % job['name'])

        # And copy to Lustre
        # shutil.copy(ramdisk + job['name'] + '.dcd', job['workdir'])
        # And copy to Lustre (using zero-copy):
        if USE_SHM:
            # ALT: X-Mit to Cache Service and let cache write to disk lazily
            src = open(dcd_ramfile, "rb")
            dest = open(dcdFile, "w+b")
            offset = 0
            dcdfilesize = os.path.getsize(dcd_ramfile)
            while True:
                sent = sendfile(dest.fileno(), src.fileno(), offset, dcdfilesize)
                if sent == 0:
                    break
                offset += sent
            logging.info("Copy Complete to Lustre.")
            bench.mark("CopyLustre:%s" % job["name"])

        # TODO: Update job's metadata
        key = wrapKey("jc", job["name"])
        self.data[key]["dcd"] = dcdFile

    # ANALYSIS ---------------------------------------------------------------
    # ANALYSIS ALGORITHM
    # 1. With combined sim-analysis: file is loaded locally from shared mem
    logging.debug("2. Load DCD")

    # Load topology and define filters
    topo = self.protein.pdb
    pdb = md.load(job["pdb"])
    hfilt = pdb.top.select_atom_indices("heavy")
    pfilt = pdb.top.select("protein")
    afilt = pdb.top.select_atom_indices("alpha")

    # Load full higher-dim trajectory
    if traj is None:
        if USE_SHM:
            traj = md.load(dcd_ramfile, top=pdb)
        else:
            traj = md.load(dcdFile, top=pdb)
    logging.debug("Trajectory Loaded: %s (%s)", job["name"], str(traj))

    traj_prot = traj.atom_slice(pfilt)
    traj_heavy = traj.atom_slice(hfilt)
    traj_alpha = traj.atom_slice(afilt)

    # Superpose coordinates to common reference
    traj_prot.superpose(topo)

    # Calculate output distance space
    # Use of the second side-chain atom is discussed in the TimeScapes ref paper
    # The atom pairs and selected atoms are in the timescape module
    sc_pairs = side_chain_pairs(traj_prot)
    n_features = len(sc_pairs)

    # Use the CA atoms to calculate distance space
    # NOTE THE CONVERSION FROM NM TO ANGSTROMS
    dist_space = 10 * DR.distance_space(traj_alpha)

    # Get the frame rate for conversion between source and TimeScape
    # NOTE: TimeScape is run at a coarser level
    logging.debug("Preprocessing output for TimeScapes: terrain")
    traj_frame_per_ps = SIMULATE_RATIO * int(job["interval"]) / 1000.0  # jc interval is in fs
    ts_frame_per_ps = int(self.data["timescape:rate"])  # this value is in ps
    frame_rate = int(ts_frame_per_ps / traj_frame_per_ps)
    logging.debug(
        "%5.2f fr/ps (Traj)     %5.2f fr/ps (TS)    FrameRate= %4.1f",
        traj_frame_per_ps,
        ts_frame_per_ps,
        frame_rate,
    )

    # Execute TimeScapes agility program to detect spatio-temporal basins
    output_prefix = os.path.join(job["workdir"], job["name"])
    if not self.skip_timescape:
        # Prep file and save locally in shm
        tmploc = gettempdir()
        ts_out = os.path.join(tmploc, "traj_ts")  # was `tmploc + "traj_ts"` (missing path separator)
        ts_dcd = ts_out + ".dcd"
        ts_pdb = ts_out + ".pdb"
        heavy = traj_heavy.slice(range(0, traj.n_frames, frame_rate))
        heavy.slice(0).save_pdb(ts_pdb)
        heavy.save_dcd(ts_dcd)

        # Gaussian Full Width at Half-Max value affects sliding window size
        # ref: http://timescapes.biomachina.org/guide.pdf
        gmd_cut1 = int(self.data["timescape:gmd:low"])
        gmd_cut2 = int(self.data["timescape:gmd:hi"])
        gauss_wght_delta = int(self.data["timescape:delta"])

        # Execute timescapes' terrain.py on the pre-processed trajectory
        cmd = "terrain.py %s %s %d %d %d GMD %s" % (
            ts_pdb,
            ts_dcd,
            gmd_cut1,
            gmd_cut2,
            gauss_wght_delta,
            output_prefix,
        )
        logging.info("Running Timescapes:\n  %s", cmd)
        stdout = executecmd(cmd)
        logging.info("TimeScapes COMPLETE:\n%s", stdout)

    # Collect and parse TimeScapes output
    logging.debug("Parsing TimeScapes output")
    ts_parse = TimeScapeParser(job["pdb"], output_prefix, job["name"], dcd=dcdFile, traj=traj, uniqueid=False)
    basin_list = ts_parse.load_basins(frame_ratio=frame_rate)
    n_basins = len(basin_list)

    minima_coords = {}
    basin_rms = {}
    basins = {}
    new_dmu = {}
    new_dsig = {}
    resid_rms_delta = {}
    stat.collect("num_basin", n_basins)
    downstream_list = []

    # FOR BOOTSTRAPPING USING RMSD
    ref_file = os.path.join(settings.workdir, self.data["pdb:ref:0"])
    logging.info("Loading RMSD reference frame from %s", self.data["pdb:ref:0"])
    refframe = md.load(ref_file)
    ref_alpha = refframe.atom_slice(refframe.top.select_atom_indices("alpha"))
    rmsd = 10 * md.rmsd(traj_alpha, ref_alpha)

    # FOR RESIDUE RMSD
    res_rms_Kr = FEATURE_SET_RESID
    resrmsd = 10 * np.array([LA.norm(i - ref_alpha.xyz[0], axis=1) for i in traj_alpha.xyz])
    basin_res_rms = np.zeros(shape=(len(basin_list), traj_alpha.n_atoms))

    # LOAD CENTROIDS -- TODO: move to immut
    centroid = pickle.loads(self.catalog.get("centroid:ds"))
    basin_label_list = {}

    # Process each basin
    for i, basin in enumerate(basin_list):
        logging.info("  Processing basin #%2d", i)
        bid = basin.id
        downstream_list.append(bid)

        # Slice out minima coord & save to disk (for now)
        # TODO: Store in memory in cache
        minima_coords[bid] = traj.slice(basin.mindex)
        jc_filename = os.path.join(settings.datadir, "basin_%s.pdb" % bid)
        minima_coords[bid].save_pdb(jc_filename)

        # METRIC CALCULATION
        a, b = basin.start, basin.end
        new_dmu[bid] = np.mean(dist_space[a:b], axis=0)
        new_dsig[bid] = np.std(dist_space[a:b], axis=0)

        # Collect basin metadata
        basin_hash = basin.kv()
        basin_hash["pdbfile"] = jc_filename
        basin_hash["rmsd"] = np.median(rmsd[a:b])
        basin_res_rms[i] = np.median(resrmsd[a:b], axis=0)  # FOR EXP #16 -- Residue RMSD

        # Set relative time (in ns)
        # basin_hash['time'] = reltime_start + (a * frame_rate) / 1000
        basins[bid] = basin_hash

        # LABEL ATEMPORAL DATA
        # Take every 4th frame
        stride = [dist_space[x] for x in range(a, b, 4)]
        rms = [LA.norm(centroid - x, axis=1) for x in stride]
        label_seq = [np.argmin(x) for x in rms]
        basin_label_list[bid] = label_seq
        logging.info("  Basin Processed: #%s, %d - %d", basin_hash["traj"], basin_hash["start"], basin_hash["end"])

    # RMSD DELTA (RESIDUE)
    basin_res_rms_delta = np.array([rms_delta(i) for i in basin_res_rms.T]).T
    basin_rms_delta_bykey = {basin.id: basin_res_rms_delta[i] for i, basin in enumerate(basin_list)}
    for k in basins.keys():
        basins[k]["resrms_delta"] = np.sum(basin_rms_delta_bykey[k][res_rms_Kr])

    # TODO: Use min index as snapshot, median (or mean) DistSpace vals for each basin?
    bench.mark("analysis")

    # BARRIER: WRITE TO CATALOG HERE -- Ensure catalog is available
    # try:
    self.wait_catalog()
    # except OverlayNotAvailable as e:
    #     logging.warning("Catalog Overlay Service is not available. Scheduling ASYNC Analysis")

    # FOR EXPERIMENT METRICS
    src_basin = self.catalog.hgetall("basin:" + job["src_basin"])
    with open("/home-1/[email protected]/ddc/results/{0}_prov.log".format(settings.name), "a") as metric_out:
        for i, basin in enumerate(basin_list):
            bid = basin.id
            label_seq = basin_label_list[bid]
            basin_metric_label = LABEL10(label_seq)
            metric_out.write(
                "BASIN,%s,%s,%s,%s\n"
                % (bid, src_basin["label:10"], basin_metric_label, "".join([str(s) for s in label_seq]))
            )

    # Update index-synchronized data lists
    basin_list_sorted = sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1])))
    for bid in basin_list_sorted:
        with self.catalog.pipeline() as pipe:
            while True:
                try:
                    logging.debug("Updating %s basin indices and distance space", len(basins))
                    pipe.rpush("basin:list", bid)
                    pipe.rpush("dspace", pickle.dumps(new_dmu[bid]))
                    basin_index, _ = pipe.execute()
                    break
                except redis.WatchError as e:
                    logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                    continue
        basins[bid]["dsidx"] = basin_index - 1

    # Update catalog with one long atomic transaction
    with self.catalog.pipeline() as pipe:
        while True:
            try:
                logging.debug("Update Filelist")
                pipe.watch(wrapKey("jc", job["name"]))
                file_idx = pipe.rpush("xid:filelist", job["dcd"]) - 1

                # HD Points
                logging.debug("Update HD Points")
                start_index = pipe.llen("xid:reference")
                pipe.multi()
                pipe.rpush("xid:reference", *[(file_idx, x) for x in range(traj.n_frames)])
                pipe.set("resrms:" + job["name"], resrmsd)

                # Store all basin data
                logging.debug("Updating %s basins", len(basins))
                for bid in sorted(basins.keys(), key=lambda x: (x.split("_")[0], int(x.split("_")[1]))):
                    pipe.hmset("basin:" + bid, basins[bid])
                    pipe.set("minima:%s" % bid, pickle.dumps(minima_coords[bid]))
                    for lbl in basin_label_list[bid]:
                        pipe.rpush("basin:labelseq:" + bid, lbl)

                pipe.hset("anl_sequence", job["name"], mylogical_seqnum)
                logging.debug("Executing")
                pipe.execute()
                break
            except redis.WatchError as e:
                logging.debug("WATCH ERROR. Someone else is writing to the catalog. Retrying...")
                continue

    self.data[key]["xid:start"] = start_index
    self.data[key]["xid:end"] = start_index + traj.n_frames
    bench.mark("catalog")

    # ---- POST PROCESSING
    # Check for total data in downstream queue & notify control manager to run
    # 1. Determine what's still running
    job_queue = slurm.currentqueue()
    n_simjobs = -1  # -1 to exclude self
    for j in job_queue:
        if j["name"].startswith("sw"):
            n_simjobs += 1

    logging.info("SIM WORKER has detected %d other simulations running/pending.", n_simjobs)
    remain_percent = n_simjobs / self.data["numresources"]

    # Fault tolerance: ensure the pipeline persists
    if n_simjobs == 0:
        logging.info("I am the last simulation. Ensuring the controller executes.")
        self.catalog.set("ctl:force", 1)

    # Send downstream notification when less than 10% of jobs are still running
    if not self.skip_notify and remain_percent < 0.1:
        # Notify the control manager 'CM'
        self.notify("ctl")

    if USE_SHM:
        shutil.rmtree(ramdisk)
        shm_contents = os.listdir("/dev/shm")
        logging.debug("Ramdisk contents (should be empty of DDC) : %s", str(shm_contents))

    # For benchmarking:
    bench.show()
    stat.show()

    # Return the list of new basin IDs for downstream processing
    return downstream_list
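# --- Illustrative sketch (not part of the original module) ----------------
# LABEL10() condenses a basin's per-frame state sequence into the single
# metric label written to the provenance log; its definition is not in this
# section. A simple majority-vote stand-in is sketched below (the real
# function's 10-way labeling scheme may differ).
from collections import Counter

def label10_sketch(label_seq):
    """Majority-vote stand-in: return the most common state as a string."""
    top_state, _ = Counter(label_seq).most_common(1)[0]
    return str(top_state)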
all_label = np.zeros(shape=(len(jclist), 4500))
sumlog = open(home + '/work/results/trajout_{0}'.format(expname), 'w')
for i, jc in enumerate(jclist[startnum:]):
    if 'dcd' not in jc:
        jc['dcd'] = jc['pdb'].replace('pdb', 'dcd')  # was replace('dcd', 'pdb'): derive the dcd path from the pdb path
    if not os.path.exists(jc['dcd']):
        logging.info("%d %s NO_DCD", i, jc['name'])
        continue
    tr = md.load(jc['dcd'], top=topo, stride=4)
    if tr.n_frames < 4503:
        logging.info("%d %s Frames: %d", i, jc['name'], tr.n_frames)
        continue
    prot = tr.atom_slice(pfilt)
    prot.save_dcd(wkdir + '/%s_%03d.dcd' % (expname, (i + startnum)))
    alpha = prot.atom_slice(afilt)
    # Label every frame (after the first 3) by its nearest centroid in distance space
    ds = DR.distance_space(alpha)
    rms = [LA.norm(cent - d, axis=1) for d in ds]
    label = np.array([np.argmin(r) for r in rms[3:]])
    all_label[i + startnum] = label  # index by absolute position, matching the filenames above
    label_str = ''.join([str(l) for l in label])
    red.hset('basin:' + jc['name'], 'label', label_str)
    bincnt = ','.join([str(c) for c in np.bincount(label, minlength=5)])
    src_basin = jc['src_basin'] if 'src_basin' in jc else 'NONE'
    logging.info("%d,%s,%s,%s", i, jc['name'], src_basin, bincnt)
    sumlog.write('%d,%s,%s,%s\n' % (i, jc['name'], src_basin, bincnt))
sumlog.close()
np.save(home + '/work/results/label_{0}'.format(expname), all_label)
logging.info('ALL Done!')
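# --- Illustrative usage (not part of the original script) -----------------
# Reading the saved label matrix back and summarizing global state
# occupancy; `home` and `expname` come from the script above, and np.save
# appends the '.npy' extension used here. Note rows for skipped
# trajectories remain all-zero, which biases the counts toward state 0.
import numpy as np

all_label = np.load(home + '/work/results/label_{0}.npy'.format(expname))
counts = np.array([np.bincount(row.astype(int), minlength=5) for row in all_label])
print(counts.sum(0) / counts.sum())  # fraction of frames in each of the 5 states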
def load_historical_Expl(catalog):
    """Load all DEShaw data into basins for processing"""
    settings = systemsettings()

    # idx_list = [0,1,2,3,4,20,23,24,30,32,34,40,41,42]
    idx_list = [0, 34]
    tlist = {k: 'tr%d.dcd' % k for k in idx_list}
    seed_dir = os.path.join(settings.WORKDIR, 'seed')
    seed_ts_ratio = 16  # TimeScape ran on 4ps frame rate (16x source)

    # Load topology and ancillary data
    # bpti = Protein(bpti, catalog, load=True)
    pdb_file = os.path.join(seed_dir, 'coord.pdb')
    topo = md.load(pdb_file)
    pfilt = topo.top.select('protein')
    logging.info('Topology loaded %s', topo)

    # ID side-chain pair atoms for each distance-space calc
    sc_pairs = side_chain_pairs(topo.atom_slice(pfilt))
    logging.info('Identified side chains: %d', len(sc_pairs))

    # Process all source trajectories
    basin_list = []
    C_T, mu_T, sigma_T = [], [], []
    for idx in idx_list:
        logging.info('Processing Seed index: %d', idx)

        # Load SRC seed trajectory & calc distance space -- TODO: make this optional
        tfile = os.path.join(seed_dir, tlist[idx])
        traj = md.load(tfile, top=topo)
        traj.superpose(topo)
        ds = datareduce.distance_space(traj, pairs=sc_pairs)

        # Push to catalog
        file_idx = catalog.rpush('xid:filelist', tfile) - 1
        start_index = catalog.llen('xid:reference')
        # TODO: Do I still need to index every frame?
        catalog.rpush('xid:reference', *[(file_idx, x) for x in range(traj.n_frames)])

        # Process trajectory as basins
        logging.info("  Seed Loaded. Loading TimeScape Data...")
        seed_name = 'tr%d' % idx
        ts_data_path = os.path.join(seed_dir, 'TEST', seed_name)
        ts_traj = TimeScapeParser(pdb_file, ts_data_path, seed_name, dcd=tfile, traj=traj)
        basin_list = ts_traj.load_basins(frame_ratio=seed_ts_ratio)
        corr_mat = ts_traj.correlation_matrix()

        for i, basin in enumerate(ts_traj.basins):
            a, b = basin.start, basin.end
            bid = basin.id
            if a > traj.n_frames:
                logging.info('Finished processing all basins for this trajectory!')
                break

            # Store on disk and in redis
            jc_filename = os.path.join(settings.datadir, 'basin_%s.pdb' % bid)
            minima_frame = md.load_frame(tfile, basin.mindex, top=topo) if traj is None else traj.slice(basin.mindex)
            minima_frame.save_pdb(jc_filename)

            C_T.append(np.mean(corr_mat[a:b], axis=0))
            mu_T.append(np.mean(ds[a:b], axis=0))
            sigma_T.append(np.std(ds[a:b], axis=0))

            basin_hash = basin.kv()
            basin_hash['pdbfile'] = jc_filename
            basin_hash['dsidx'] = i
            logging.info('  Basin: %(id)s  %(start)d - %(end)d   Minima: %(mindex)d    size=%(len)d' % basin_hash)

            pipe = catalog.pipeline()
            pipe.rpush('basin:list', bid)
            pipe.hmset('basin:%s' % bid, basin_hash)
            pipe.set('minima:%s' % bid, pickle.dumps(minima_frame))
            pipe.execute()

    catalog.set('basin:processed', len(ts_traj.basins))
    catalog.storeNPArray(np.array(C_T), 'corr_vector')
    catalog.storeNPArray(np.array(mu_T), 'dspace_mu')
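# --- Illustrative sketch (not part of the original module) ----------------
# side_chain_pairs() supplies the atom pairs for the distance-space calc;
# per the comments elsewhere in this section, the real selection (the
# second side-chain atom) lives in the timescape module. The sketch below
# assumes one representative side-chain atom per residue (CB, falling back
# to CA for glycine), which is an approximation of that scheme.
import itertools

def side_chain_pairs_sketch(traj):
    """Hypothetical pair selection for distance_space(..., pairs=...)."""
    atoms = []
    for res in traj.top.residues:
        cb = [a.index for a in res.atoms if a.name == 'CB']
        ca = [a.index for a in res.atoms if a.name == 'CA']
        if cb or ca:
            atoms.append((cb or ca)[0])
    return list(itertools.combinations(atoms, 2))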