def total_time(name):
    """Return the overall time span covered by the runs of an experiment.

    Reads every ``(start, time)`` row of the ``sw`` table for the experiment
    called *name* and returns the largest ``start + time`` value, expressed
    as an offset from the start of the first row returned (rows are queried
    in ``start`` order).  The unit is whatever ``.timestamp()`` and the
    ``time`` column use (presumably seconds -- confirm against the schema).

    Raises ``IndexError`` when the experiment has no ``sw`` rows.
    """
    expid = db.get_expid(name)
    rows = db.runquery(
        'select start,time from sw where expid=%d order by start' % expid)

    def finish(row):
        # Absolute end of one run: parsed start timestamp plus its run time.
        return du.parse(row[0]).timestamp() + row[1]

    origin = du.parse(rows[0][0]).timestamp()
    # The latest relative end time is the total span.
    return max(finish(row) - origin for row in rows)
def conv_over_time(name, step=10000, tw=False):
    """Track a per-label bootstrap ratio as observations accumulate over time.

    The observations of experiment *name* are loaded with ``getobs`` (and
    passed through ``obs2tw`` when *tw* is true).  The experiment's ``sw``
    rows are then walked in order of their end time; each row contributes
    ``numobs`` observations to a running count.  Whenever the running count
    passes the next multiple of *step*, ``bootstrap_iter`` is run on the
    observations seen so far and one point is recorded for every distinct
    label.

    Returns a dict mapping each label (as produced by ``getdistinct``) to a
    list of ``(t, value)`` tuples.  ``t`` is the row's end offset divided by
    3600 (hours, if timestamps are in seconds).  ``value`` is
    ``min(c[label][1] / c[label][0], 1.0)`` for labels present in the
    bootstrap result and ``1.0`` otherwise.

    Raises ``IndexError`` when the experiment has no ``sw`` rows.
    """
    expid = db.get_expid(name)
    obs = getobs(name)
    if tw:
        obs = obs2tw(obs)
    labels = getdistinct(obs)
    total = len(obs)
    plotlists = {label: [] for label in labels}

    rows = db.runquery(
        'select start,time,numobs from sw where expid=%d order by start'
        % expid)
    origin = du.parse(rows[0][0]).timestamp()
    # One (relative end time, numobs) pair per run, ordered by completion.
    sims = sorted(
        ((du.parse(begin).timestamp() + runtime - origin, numobs)
         for begin, runtime, numobs in rows),
        key=lambda sim: sim[0])

    seen = 0
    threshold = step
    for finished, numobs in sims:
        if seen >= total:
            break
        seen += numobs
        if seen <= threshold:
            continue

        hours = finished / 3600.
        boot = bootstrap_iter(obs[:min(seen, total)], size=step)
        for label in labels:
            if label in boot:
                value = min(boot[label][1] / boot[label][0], 1.)
            else:
                value = 1.
            plotlists[label].append((hours, value))
        threshold += step
    return plotlists
def traj_results(name):
    """Split an experiment's observations into one slice per ``jc`` row.

    Returns a list of ``(bin, observations)`` tuples, one per row of the
    ``jc`` table for experiment *name* (in ``start`` order), where
    ``observations`` is the ``[start:end]`` slice of the list returned by
    ``getobs(name)``.
    """
    expid = db.get_expid(name)
    observations = getobs(name)
    rows = db.runquery(
        'select bin, start, end from jc where expid=%d order by start'
        % expid)
    return [(src_bin, observations[lo:hi]) for src_bin, lo, hi in rows]
def all_elas():
    """Load a fixed set of experiments, bootstrap each and plot the results.

    One ``F.ExprAnl`` is created per rate in a fixed list, on consecutive
    ports starting at 6391.  Each is loaded (at most 750 items), its ``sw``
    rows are dumped through ``db.adhoc``, and ``C.elas_boot`` is run on it.
    The first five entries of every bootstrap result are then plotted with
    ``P.scats``, one plot per entry index.  Returns ``None``.
    """
    rates = [5, 10, 25, 50, 75, 100, 200]
    analyses = {}
    for offset, rate in enumerate(rates):
        analyses[rate] = F.ExprAnl(port=6391 + offset)

    # NOTE(review): the source indentation was lost; the per-experiment query
    # is assumed to belong inside this loop because the experiment name is
    # built from the loop key.
    for rate, anl in analyses.items():
        print('Loading', rate)
        anl.load(min(750, len(anl.conf)))
        expid = db.get_expid('rtime250_%d' % rate)
        db.adhoc(
            'select swname, start, time, cpu from sw where expid=%d' % expid)

    boots = {}
    for rate, anl in analyses.items():
        boots[rate] = C.elas_boot(anl, 1000, 'dsw', limit=151000)

    # Build every series before plotting any of them.
    series = []
    for idx in range(5):
        series.append({rate: result[idx] for rate, result in boots.items()})
    for idx, data in enumerate(series):
        P.scats(data, 'Elas_%d' % idx)
def by_src_bin(name):
    """Histogram an experiment's observations, grouped by source bin.

    The ``jc`` rows of experiment *name* give, for each source bin, one or
    more ``[start:end)`` ranges into the observation list returned by
    ``getobs(name)``.  All observations belonging to the same source bin are
    pooled and counted.

    Returns a dict ``{source_bin: {label: count}}``.  Every inner dict is
    pre-populated with a zero count for ``str((a, b))`` of each pair in the
    module-level ``binlist``.

    Raises ``KeyError`` if an observation is not one of those labels.
    """
    expid = db.get_expid(name)
    obslist = getobs(name)
    rows = db.runquery(
        'select bin, start, end from jc where expid=%d' % expid)

    # Pool the observation slices of every row that shares a source bin.
    grouped = {}
    for src_bin, lo, hi in rows:
        grouped.setdefault(src_bin, []).extend(obslist[lo:hi])

    counts = {
        src_bin: {str((a, b)): 0 for a, b in binlist}
        for src_bin in grouped
    }
    for src_bin, observed in grouped.items():
        hist = counts[src_bin]
        for label in observed:
            hist[label] += 1
    return counts
def by_src_hcube(name):
    """Histogram an experiment's observations per source bin and hcube.

    The ``jc`` rows of experiment *name* give a source bin, an hcube and a
    ``[start:end)`` range into the observation list returned by
    ``getobs(name)``.  Observations are pooled per ``(bin, hcube)`` and
    counted; the label ``'0-D'`` is skipped.

    Returns a nested dict ``{bin: {hcube: {label: count}}}`` whose innermost
    dicts start with a zero count for every entry of the module-level
    ``sbinlist``.

    Raises ``KeyError`` for an observation (other than ``'0-D'``) that is not
    in ``sbinlist``.
    """
    expid = db.get_expid(name)
    obslist = getobs(name)
    rows = db.runquery(
        'select bin, hcube, start, end from jc where expid=%d' % expid)

    # grouped[bin][hcube] -> pooled observations
    grouped = {}
    for src_bin, hcube, lo, hi in rows:
        per_hcube = grouped.setdefault(src_bin, {})
        per_hcube.setdefault(hcube, []).extend(obslist[lo:hi])

    counts = {}
    for src_bin, per_hcube in grouped.items():
        counts[src_bin] = {
            hcube: {label: 0 for label in sbinlist} for hcube in per_hcube
        }
    for src_bin, per_hcube in grouped.items():
        for hcube, observed in per_hcube.items():
            hist = counts[src_bin][hcube]
            for label in observed:
                if label != '0-D':
                    hist[label] += 1
    return counts
def elas_feal(name, feal_list, max_obs, step=2000):
    """Track a block-bootstrap statistic of *feal_list* over elapsed time.

    The ``sw`` rows of experiment *name* are walked in order of their end
    time, accumulating ``numobs`` into a running count until it reaches
    *max_obs*.  Each time the count passes the next multiple of *step*,
    ``op.bootstrap_block`` is run on the first ``count`` entries of
    *feal_list* and one point is recorded.

    Returns a list of ``(t, value)`` tuples where ``t`` is the row's end
    offset divided by 3600 and ``value`` is ``min(max(c[1]), 1.0)`` for the
    bootstrap result ``c``.

    Raises ``IndexError`` when the experiment has no ``sw`` rows.
    """
    expid = db.get_expid(name)
    rows = db.runquery(
        'select start,time,numobs from sw where expid=%d order by start'
        % expid)
    origin = du.parse(rows[0][0]).timestamp()
    # One (relative end time, numobs) pair per run, ordered by completion.
    sims = sorted(
        ((du.parse(begin).timestamp() + runtime - origin, numobs)
         for begin, runtime, numobs in rows),
        key=lambda sim: sim[0])

    points = []
    seen = 0
    threshold = step
    for finished, numobs in sims:
        if seen >= max_obs:
            break
        seen += numobs
        if seen > threshold:
            boot = op.bootstrap_block(feal_list[:seen], step)
            points.append((finished / 3600., min(np.max(boot[1]), 1.)))
            threshold += step
    return points
def centroid_bootstrap(catalog):
    """Iteratively refine RMSD centroids and theta thresholds from job data.

    Starting from the centroids stored in ``settings.RMSD_CENTROID_FILE`` and
    the ``thetas`` / ``transition_sensitivity`` values found in *catalog*
    (defaults: 0.25 everywhere and 0.2), the jobs listed in the catalog's
    ``anl_sequence`` hash are processed in batches.  For every batch the
    function:

    1. loads each job's trajectory, computes per-frame RMSD to every centroid
       with ``calc_rmsd`` and labels each frame ``(A, A)`` or ``(A, B)`` from
       its two nearest centroids and ``thetas[A][B]``;
    2. averages the frames labelled ``(A, A)`` and folds that mean into the
       running centroid ``A`` as a weighted average;
    3. re-derives ``thetas`` for every centroid pair from the sorted RMSD
       differences collected so far.

    After the last batch it plots the per-batch centroid and theta changes,
    a histogram of all labels, and one scatter plot per centroid pair.

    The function returns ``None``; in the code visible here the updated
    centroids and thetas are only held in local variables and are not
    written back to the catalog.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source.  Nesting was inferred from data flow; the spots where that
    inference matters are marked ``NOTE(review)`` below.
    """
    centfile = settings.RMSD_CENTROID_FILE
    centroid = np.load(centfile)
    # Running point count per centroid, used as the weight of the existing
    # centroid in the weighted update below.
    # NOTE(review): hard-coded to five entries; indexing would fail if the
    # centroid file holds more than five states.
    cent_npts = [1, 1, 1, 1, 1]  # TBD
    numLabels = len(centroid)
    binlist = [(a, b) for a in range(numLabels) for b in range(numLabels)]
    logging.info("Loaded Starting Centroids from %s", centfile)

    name = catalog.get('name')
    if name is None:
        logging.info('Name not configured in this catalog. Set it and try again')
        return

    # Load/Set initial (current) Configs from Catalog
    if catalog.exists('thetas'):
        thetas = catalog.loadNPArray('thetas')
    else:
        thetas = np.zeros(shape=(numLabels, numLabels))
        thetas[:] = 0.25

    if catalog.exists('transition_sensitivity'):
        trans_factor = catalog.loadNPArray('transition_sensitivity')
    else:
        trans_factor = 0.2

    use_gradient = True
    obs_count = {ab: 0 for ab in binlist}
    C_delta = []    # mean centroid displacement, one entry per batch
    T_delta = []    # mean theta change, one entry per batch

    # Configure Noise Filter
    # NOTE(review): `noisefilt` is only referenced from commented-out code
    # below and `dcdfreq` is never read in this function.
    noise = int(catalog.get('obs_noise'))
    dcdfreq = int(catalog.get('dcdfreq'))
    stepsize = int(catalog.get('sim_step_size'))
    nwidth = noise//(2*stepsize)
    noisefilt = lambda x, i: np.mean(x[max(0,i-nwidth):min(i+nwidth, len(x))], axis=0)

    # Get previously Labeled data (or label data IAW current settings)
    # NOTE(review): this `obslist` is never read; the name is rebound by the
    # `for job, obslist in zip(...)` loop further down.
    eid = db.get_expid(name)
    obslist = [i[0] for i in db.runquery('SELECT obs FROM obs WHERE expid=%d' % eid)]
    # Job ids ordered by their value in the 'anl_sequence' hash.
    jobs = [i[0] for i in sorted(catalog.hgetall('anl_sequence').items(), key=lambda x: x[1])]
    shape = None

    # Initialize lists for pair-wise distances (top 2 nearest centroids)
    # All three dicts are keyed by the ordered pair (A, B) with A < B.
    diffList = {}
    transList = {}
    scatPlot = {}
    for A in range(0, numLabels-1):
        for B in range(A+1, numLabels):
            diffList[(A, B)] = []
            transList[(A, B)] = []
            scatPlot[(A, B)] = []
    allScat = []    # never used below

    # Load trajectories & filter
    obs_global = []     # appended to below but never read in this function

    # Process learning in batches (static batch size to start)
    batch_size = 25
    max_obs = 150
    batch = 0
    while batch <= max_obs:
        logging.info("Procssing Jobs %d - %d", batch, batch+batch_size)
        exec_sim = []   # job configs of this batch, each with conf['alpha']
        obs_list = []   # per-job list of frame labels for this batch
        # NOTE(review): the slice width is the literal 25, not `batch_size`;
        # the two agree only while batch_size stays 25.
        for job in jobs[batch:batch+25]:
            conf = catalog.hgetall('jc_' + job)
            traj = md.load(conf['dcd'], top=conf['pdb'])
            alpha = datareduce.filter_alpha(traj)
            conf['alpha'] = alpha.xyz
            exec_sim.append(conf)
            if shape is None:
                # Per-frame coordinate shape, taken from the first job loaded.
                shape = conf['alpha'].shape[1:]

            # xyz_filtered = np.array([noisefilt(alpha.xyz, i) for i in range(alpha.n_frames)])
            rmslist = calc_rmsd(alpha, centroid)
            labels = []
            for rms in rmslist:
                # [cw[i]*LA.norm(pt - centroid[i]) for i in range(5)]
                # A = nearest centroid, B = second nearest.
                A, B = np.argsort(rms)[:2]
                delta = np.abs(rms[B] - rms[A])
                # A gap below the pair's theta labels the frame (A, B);
                # otherwise it is labelled (A, A).
                if delta < thetas[A][B]:
                    sub_state = B
                else:
                    sub_state = A
                classify = (A, sub_state)
                labels.append(classify)
                obs_count[classify] += 1

                # For globally updating Thetas
                obs_global.append(classify)
                # Store the signed difference under the ordered pair key so
                # that negative values mean "closer to the lower index".
                if A < B:
                    diffList[(A, B)].append(rms[A] - rms[B])
                else:
                    diffList[(B, A)].append(rms[B] - rms[A])

                # Record this frame against every centroid pair, with a colour
                # code describing how the pair relates to the frame's label.
                for a in range(0, numLabels-1):
                    for b in range(a+1, numLabels):
                        transList[(a, b)].append(rms[a] - rms[b])
                        if (a, a) == classify or (b, b) == classify:
                            c = 'b'
                        elif (a, b) == classify or (b, a) == classify:
                            c = 'g'
                        elif a == A or b == A:
                            c = 'r'
                        else:
                            c = 'black'
                        scatPlot[(a, b)].append((rms[a] - rms[b], c))
            obs_list.append(labels)

        logging.info('Bin Distribution:')
        grpby = {}
        for llist in obs_list:
            for l in llist:
                if l not in grpby:
                    grpby[l] = 0
                grpby[l] += 1
        for k in sorted(grpby.keys()):
            logging.info('%s: %5d', k, grpby[k])
        for A in range(0, numLabels-1):
            for B in range(A+1, numLabels):
                d = diffList[(A, B)]
                # NOTE(review): min()/max() raise ValueError while a pair has
                # no recorded differences yet.
                logging.info('Diff list for %d,%d: %d, %5.2f, %5.2f', A, B, len(d), min(d), max(d))

        # # 6. Apply Heuristics Labeling
        # # logging.debug('Applying Labeling Heuristic. Origin: %d, %d', srcA, srcB)
        # rmslabel = []
        # # label_count = {ab: 0 for ab in binlist}
        # groupbystate = [[] for i in range(numLabels)]
        # groupbybin = {ab: [] for ab in binlist}

        # For each frame in each traj: ID labeled well pts & build avg op
        logging.info('Selecting observed Well States')
        coor_sum = {i: np.zeros(shape=shape) for i in range(numLabels)}
        coor_tot = {i: 0 for i in range(numLabels)}
        for job, obslist in zip(exec_sim, obs_list):
            # offset = int(job['xid:start'])
            # for i, frame in enumerate(job['alpha']):
            for frame, label in zip(job['alpha'], obslist):
                # A, B = eval(obslist[offset+i])
                A, B = label
                # Only frames labelled (A, A) contribute to centroid A.
                if A != B:
                    continue
                coor_sum[A] += frame
                coor_tot[A] += 1

        logging.info('Calculating Avg from following stats:')
        logging.info(' Total Frames: %d', sum([len(sim['alpha']) for sim in exec_sim]))

        # Calculate New Centroids (w/deltas)
        # `delta` is rebound here: from now on it is the list of per-state
        # centroid displacements for this batch.
        delta = []
        for S in range(numLabels):
            if coor_tot[S] == 0:
                logging.info(" State: %d --- NO OBSERVATIONS IN THIS WELL STATE", S)
                continue
            cent_local = coor_sum[S] / coor_tot[S]
            diff_local = LA.norm(centroid[S] - cent_local)
            # Weighted average of the running centroid and this batch's mean.
            update = ((centroid[S] * cent_npts[S]) + (cent_local * coor_tot[S])) / (cent_npts[S] + coor_tot[S])
            delta.append(LA.norm(update - centroid[S]))
            logging.info(' State %d: NewPts=%5d Delta=%5.2f LocalDiff=%5.2f', S, coor_tot[S], delta[-1], diff_local)
            centroid[S] = update
            cent_npts[S] += coor_tot[S]
        centroid_change = np.mean(delta)
        # NOTE(review): the log call is assumed to sit inside this `if`;
        # outside it, `rel_change` would be unbound on the first batches.
        if len(C_delta) > 1:
            rel_change = np.abs((centroid_change - C_delta[-1]) / C_delta[-1])
            logging.info('Centroid Change: %5.2f (%5.2f%%)', centroid_change, 100*rel_change)
        C_delta.append(centroid_change)
        batch += batch_size

        # Update Thetas (using global data ?????)
        # NOTE(review): assumed to run once per batch (inside the while loop)
        # because T_delta is later plotted as a series alongside C_delta.
        # `delta` is rebound again: now the list of theta changes.
        delta = []
        for A in range(0, numLabels-1):
            for B in range(A+1, numLabels):
                X = sorted(diffList[(A, B)])
                if len(X) < 100:
                    logging.info('Lacking data on %d, %d', A, B)
                    continue
                # logging.info(' Total # Obs: %d', len(X))
                # Index of the first positive difference in the sorted list.
                crossover = 0
                for i, x in enumerate(X):
                    if x > 0:
                        crossover = i
                        break
                # logging.info(' Crossover at Index: %d', crossover)
                if crossover < 50 or (len(X)-crossover) < 50:
                    logging.info(' Lacking local data skipping.')
                    continue

                # Find local max gradient (among 50% of points)
                if use_gradient:
                    thetas_updated = np.copy(thetas)
                    # Search windows on either side of the crossover, sized by
                    # trans_factor.
                    zoneA = int((1-trans_factor) * crossover)
                    zoneB = crossover + int(trans_factor * (len(X) - crossover))
                    gradA = zoneA + np.argmax(np.gradient(X[zoneA:crossover]))
                    gradB = crossover + np.argmax(np.gradient(X[crossover:zoneB]))
                    thetaA = X[gradA]
                    thetaB = X[gradB]
                    thetas_updated[A][B] = np.abs(thetaA)
                    thetas_updated[B][A] = np.abs(thetaB)
                    tdeltA = np.abs(thetas_updated[A][B] - thetas[A][B])
                    tdeltB = np.abs(thetas_updated[B][A] - thetas[B][A])
                    delta.append(tdeltA)
                    delta.append(tdeltB)
                    logging.info(' Theta Change (%d,%d): %4.2f (%4.1f)', A, B, tdeltA, (100*tdeltA/thetas[A][B]))
                    logging.info(' Theta Change (%d,%d): %4.2f (%4.1f)', B, A, tdeltB, (100*tdeltB/thetas[B][A]))
                    thetas[A][B] = thetas_updated[A][B]
                    thetas[B][A] = thetas_updated[B][A]
                else:
                    # Classify Fixed Percent of observations as Transitional
                    thetas_updated = np.copy(thetas)
                    transitionPtA = int((1-trans_factor) * crossover)
                    transitionPtB = crossover + int(trans_factor * (len(X) - crossover))
                    thetaA = X[transitionPtA]
                    thetaB = X[transitionPtB]
                    thetas_updated[A][B] = np.abs(thetaA)
                    thetas_updated[B][A] = np.abs(thetaB)
                    tdeltA = np.abs(thetas_updated[A][B] - thetas[A][B])
                    tdeltB = np.abs(thetas_updated[B][A] - thetas[B][A])
                    delta.append(tdeltA)
                    delta.append(tdeltB)
                    logging.info(' Theta Change (%d,%d): %4.2f (%4.1f)', A, B, tdeltA, (100*tdeltA/thetas[A][B]))
                    logging.info(' Theta Change (%d,%d): %4.2f (%4.1f)', B, A, tdeltB, (100*tdeltB/thetas[B][A]))
                    thetas[A][B] = thetas_updated[A][B]
                    thetas[B][A] = thetas_updated[B][A]
        # NOTE(review): `delta` is empty when every pair was skipped above.
        T_delta.append(np.mean(delta))

    P.line(np.array(C_delta), 'Avg_CHANGE_Centroid_Pos_%s' % name)
    P.line(np.array(T_delta), 'Avg_CHANGE_Theta_Val_%s' % name)
    P.bargraph_simple(obs_count, 'Final_Histogram_%s' % name)
    # for k, X in diffList.items():
    #   A, B = k
    #   P.transition_line(sorted(X), A, B, title='-X', trans_factor=.5)
    # for k, X in transList.items():
    #   A, B = k
    #   P.transition_line(sorted(X), A, B, title='-ALL', trans_factor=.5)
    for k, X in scatPlot.items():
        # Colour code -> legend label.  'brown' is declared here but never
        # assigned to any point in the labelling loop above.
        collab = {'b': 'Well', 'g': 'Trans', 'r': 'Primary', 'brown': 'Secondary', 'black': 'None'}
        ptmap = {k: [] for k in collab.keys()}
        # Plot points by rank (index after sorting on the RMSD difference).
        ordpts = sorted(X, key = lambda x : x[0])
        for i, tup in enumerate(ordpts):
            y, c = tup
            ptmap[c].append((i, y))
            # if c == 'b' or c == 'g':
            #   ptmap[c].append((i, y))
            # else:
            #   ptmap[c].append((i, 0))
        A, B = k
        P.scat_Transtions(ptmap, title='-%d_%d'%(A,B), size=1, labels=collab)
def getobs(name):
    """Return every observation of experiment *name*, ordered by ``idx``.

    NOTE: a second ``getobs`` defined further down in this file rebinds the
    name, so this version is replaced once the module has finished loading.
    """
    expid = db.get_expid(name)
    rows = db.runquery(
        'select idx, obs from obs where expid=%d order by idx' % expid)
    return [obs for _, obs in rows]
def elas_boot(ex, size, method=None, limit=375000, state=None):
    """Compute a bootstrap-based convergence series for one experiment.

    The experiment name is read from ``ex.r``, its feature data comes from
    ``ex.all_feal`` (truncated to *limit* entries) and its ``sw`` rows give
    the start time, run time and observation count of every simulation.
    Simulations are walked in order of (gap-adjusted) end time; every time
    the running observation count passes the next multiple of *size*, a
    convergence value is computed from ``bootstrap_std`` and recorded.

    Parameters:
        ex: experiment object providing ``r.get('name')`` and ``all_feal``.
        size: number of observations between successive calculations.
        method: when not ``None``, forwarded as ``ex.all_feal(True, method)``.
        limit: cap on the number of feature entries / observations used.
        state: not used by this function.

    Returns:
        A list of ``(t, conv)`` tuples, ``t`` being the simulation's adjusted
        end offset divided by 3600.

    Raises ``IndexError`` when the experiment has no ``sw`` rows.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; nesting was inferred from data flow.
    """
    name = ex.r.get('name')
    print(name, '-', end=' ')
    eid = db.get_expid(name)

    # Get feature landscape (for convergence)
    if method is None:
        feal = ex.all_feal()[:limit]
    else:
        feal = ex.all_feal(True, method)[:limit]
    plotlist = []

    # Get list of all simulations
    sw_list=db.runquery('select start,time,numobs from sw where expid=%d order by start'%eid)
    print('Simulations read: ', len(sw_list))
    end_ts = lambda x: du.parse(x[0]).timestamp() + x[1]    # never called below

    # Account for any gaps in execution & adjust (ensure real time is logically grouped)
    ts_0 = du.parse(sw_list[0][0]).timestamp()
    # NOTE(review): `last` starts as the absolute timestamp ts_0 while
    # `sim_start` below is relative to ts_0, so the first comparison mixes
    # the two; from the second row on both are relative.
    last = ts_0
    sw_seq = []
    gap = 0     # accumulated adjustment subtracted from later start times
    cutoff = 30*60   # 30 min gap is bad
    swbystart = sorted(sw_list, key=lambda i: i[0])
    for s, t, n in swbystart:
        sim_start = du.parse(s).timestamp() - ts_0
        if sim_start - last > cutoff:
            print('FOUND GAP:', int(sim_start - last))
            # NOTE(review): the cutoff is added to, not subtracted from, the
            # detected gap -- confirm this is the intended adjustment.
            gap += sim_start - last + cutoff
        new_start = sim_start - gap
        end = new_start + t
        sw_seq.append({'start': new_start, 'end':end, 'numobs':n})
        last = sim_start
    maxobs = sum([i['numobs'] for i in sw_seq])

    # Sort by end time (for convergence)
    sw = sorted(sw_seq, key=lambda i: i['end'])
    N = min(limit, maxobs, len(feal))
    dnum = 0      # Data item # (as in stream)
    snum = 0      # Sim #
    lastcalc = 0  # observation index where the previous calculation ended
    nextcalc = size
    i = 0         # never used below
    boot = []     # never used below
    ci = []       # one list of ten ratios per calculation so far

    # Process each simulation's observations, batch into step-sizes and calc bootstrap
    last_conv = 1.
    while dnum < N and snum < len(sw):
        dnum += sw[snum]['numobs']
        if dnum > nextcalc:
            t = sw[snum]['end'] / 3600.   # get time
            # Transpose so that arr[feat] is one feature over the new window.
            arr = np.array(feal[lastcalc:dnum]).T
            feal_ci = []
            # straps[j] holds the bootstrap_std result for arr[j + 5].
            straps = np.array([bootstrap_std(arr[feat]) for feat in range(5, 20)])
            # NOTE(review): indexing straps with feat in 5..14 selects the
            # results for arr rows 10..19, not 5..14; the commented-out line
            # below uses feat-5 -- confirm which offset is intended.
            for feat in range(5, 15):
                feal_ci.append(straps[feat][1]/straps[feat][0])
                # feal_ci.append(straps[feat-5][1])
            ci.append(feal_ci)
            # Second pass: bootstrap each of the ten ratios across every
            # calculation recorded so far.
            feal_ci = []
            for feat in range(10):
                calc = bootstrap_std([x[feat] for x in ci])
                feal_ci.append(calc[1]/calc[0])
                # feal_ci.append(calc[1])
            # conv = min(1., np.mean(feal_ci[1:]))
            conv = np.mean(np.nan_to_num(feal_ci))
            # A mean of exactly zero is replaced by the previous value.
            if conv == 0:
                conv = last_conv
            plotlist.append((t, conv))
            lastcalc = dnum
            nextcalc += size
            nextcalc = min(nextcalc, N)
            last_conv = conv
        snum += 1
    return plotlist
def getobs(name):
    """Return the observations of experiment *name* after a burn-in offset.

    Observations are read in ``idx`` order.  A fixed per-experiment burn-in
    count (0 for names not listed) is skipped from the front, and at most
    510000 observations are returned after it.
    """
    burnin = {
        'serial': 0,
        'parallel': 0,
        'uniform3': 25000,
        'biased4': 50000,
        'feal1': 25000,
        'reweight4': 50000,
    }
    expid = db.get_expid(name)
    rows = db.runquery(
        'select idx, obs from obs where expid=%d order by idx' % expid)
    skip = burnin.get(name, 0)
    window = rows[skip:skip + 510000]
    return [row[1] for row in window]