def __init__(self, rooms):
    """Compute the minimap layout metrics for ``rooms``.

    Stores the drawable map size, the sorted distinct grid coordinates per
    axis, their extremes, the room / spacing sizes that fit the map and the
    shift derived from the median grid coordinate of each axis.

    :param rooms: iterable of rooms; each one exposes ``gridCoord``, indexed
        with the module-level ``x`` / ``y`` axis indices.
    :raises ValueError: if ``rooms`` is empty (``min()`` of an empty list).
    """
    self.rooms = rooms
    self.mapSize = (MiniMap.mapImage.width - 2 * MiniMap.mapBorder[x],
                    MiniMap.mapImage.height - 2 * MiniMap.mapBorder[y])

    # Collect the distinct coordinates with sets instead of list.count(),
    # which rescanned the whole list for every room.
    xCoords = set()
    yCoords = set()
    for room in self.rooms:
        xCoords.add(room.gridCoord[x])
        yCoords.add(room.gridCoord[y])
    self.xGridCoords = sorted(xCoords)
    self.yGridCoords = sorted(yCoords)

    self.minimums = [min(self.xGridCoords), min(self.yGridCoords)]
    self.maximums = [max(self.xGridCoords), max(self.yGridCoords)]

    # NOTE(review): abs(max) + abs(min) + 1 equals the grid span only when
    # min <= 0 <= max on that axis -- confirm the origin is always occupied.
    self.roomCount = (abs(self.maximums[x]) + abs(self.minimums[x]) + 1,
                      abs(self.maximums[y]) + abs(self.minimums[y]) + 1)

    # Largest square room size that fits both axes: N rooms plus N - 1 gaps,
    # each gap being roomSize / spaceRatio wide.
    ratio = MiniMap.spaceRatio
    self.roomSize = min(
        ratio * self.mapSize[x] / ((ratio + 1) * self.roomCount[x] - 1),
        ratio * self.mapSize[y] / ((ratio + 1) * self.roomCount[y] - 1))
    self.spaceSize = self.roomSize / ratio

    step = self.roomSize + self.spaceSize
    self.shift = [median(self.xGridCoords) * step,
                  median(self.yGridCoords) * step]
def __init__(self, rooms):
    """Compute the minimap layout metrics for ``rooms``.

    Stores the drawable map size, the sorted distinct grid coordinates per
    axis, their extremes, the room / spacing sizes that fit the map and the
    shift derived from the median grid coordinate of each axis.

    :param rooms: iterable of rooms; each one exposes ``gridCoord``, indexed
        with the module-level ``x`` / ``y`` axis indices.
    :raises ValueError: if ``rooms`` is empty (``min()`` of an empty list).
    """
    self.rooms = rooms
    self.mapSize = (MiniMap.mapImage.width - 2 * MiniMap.mapBorder[x],
                    MiniMap.mapImage.height - 2 * MiniMap.mapBorder[y])

    # Collect the distinct coordinates with sets instead of list.count(),
    # which rescanned the whole list for every room.
    xCoords = set()
    yCoords = set()
    for room in self.rooms:
        xCoords.add(room.gridCoord[x])
        yCoords.add(room.gridCoord[y])
    self.xGridCoords = sorted(xCoords)
    self.yGridCoords = sorted(yCoords)

    self.minimums = [min(self.xGridCoords), min(self.yGridCoords)]
    self.maximums = [max(self.xGridCoords), max(self.yGridCoords)]

    # NOTE(review): abs(max) + abs(min) + 1 equals the grid span only when
    # min <= 0 <= max on that axis -- confirm the origin is always occupied.
    self.roomCount = (abs(self.maximums[x]) + abs(self.minimums[x]) + 1,
                      abs(self.maximums[y]) + abs(self.minimums[y]) + 1)

    # Largest square room size that fits both axes: N rooms plus N - 1 gaps,
    # each gap being roomSize / spaceRatio wide.
    ratio = MiniMap.spaceRatio
    self.roomSize = min(
        ratio * self.mapSize[x] / ((ratio + 1) * self.roomCount[x] - 1),
        ratio * self.mapSize[y] / ((ratio + 1) * self.roomCount[y] - 1))
    self.spaceSize = self.roomSize / ratio

    step = self.roomSize + self.spaceSize
    self.shift = [median(self.xGridCoords) * step,
                  median(self.yGridCoords) * step]
def test_median():
    ''' Check UT.median with uniform weights against np.median

    Both medians of 0..99 are printed; a ValueError is raised when they
    differ.
    '''
    uniform = np.repeat(1., 100)
    values = np.arange(100)

    expected = np.median(values)
    print(expected)
    weighted = UT.median(values, weights=uniform)
    print(weighted)

    if expected != weighted:
        raise ValueError
    return None
def test_SFMS_highz(run, T, nsnap=15, lit='lee', nsnap0=15, downsampled='14'):
    ''' Compare the best-fit SFMS parameters from ABC to literature at high z

    The weighted median of the ABC pool (run, T) is fed to abcee.model and
    the resulting SFMS is drawn as a +/-0.3 band; the literature relation is
    drawn as two dashed lines at +/-0.3. The figure is written under
    UT.fig_dir().

    :param run: ABC run name (also used in the output file name)
    :param T: index of the ABC pool to read
    :param nsnap: snapshot whose redshift (UT.z_nsnap) is compared
    :param lit: literature key handed to Obvs.SSFR_SFMS_obvs
    :param nsnap0: forwarded to abcee.model
    :param downsampled: forwarded to abcee.model
    '''
    if lit == 'lee':
        lit_label = 'Lee et al. (2015)'
    else:
        # previously unbound for any other key (NameError while plotting);
        # fall back to the raw key as the legend label
        lit_label = lit

    # median ABC theta
    abcout = abcee.readABC(run, T)
    theta_med = [UT.median(abcout['theta'][:, i], weights=abcout['w'][:])
                 for i in range(len(abcout['theta'][0]))]

    subcat_sim = abcee.model(run, theta_med, nsnap0=nsnap0, downsampled=downsampled)

    z_snap = UT.z_nsnap(nsnap)
    m_arr = np.arange(8., 12.1, 0.1)
    # SSFR + log M* = log SFR
    sfr_abc = Obvs.SSFR_SFMS(m_arr, z_snap, theta_SFMS=subcat_sim['theta_sfms']) + m_arr
    sfr_obv = Obvs.SSFR_SFMS_obvs(m_arr, z_snap, lit=lit) + m_arr

    fig = plt.figure()
    sub = fig.add_subplot(111)
    sub.fill_between(m_arr, sfr_abc - 0.3, sfr_abc + 0.3, color='b', alpha=0.25,
                     linewidth=0, edgecolor=None, label=r'ABC $\theta_{median}$')
    sub.plot(m_arr, sfr_obv + 0.3, ls='--', c='k', label=lit_label)
    sub.plot(m_arr, sfr_obv - 0.3, ls='--', c='k')

    # raw strings: same text, without invalid-escape warnings for \m and \;
    sub.set_xlim([8., 12.])
    sub.set_xlabel(r'$\mathtt{log\;M_*}$', fontsize=25)
    sub.set_ylim([-4., 3.])
    sub.set_ylabel(r'$\mathtt{log\;SFR}$', fontsize=25)
    sub.legend(loc='best')

    fig.savefig(UT.fig_dir() + 'SFMS.z' + str(round(z_snap, 2)) + '.' + run + '.' + lit + '.png',
                bbox_inches='tight')
    plt.close()
    return None
def test_ABC_SMHMR(run, T):
    ''' Compare the SMHMR of the median T-th ABC particle pool with 'data'

    Hardcoded for smf only.

    :param run: run name handed to abcee.model
    :param T: index of the ABC pool to read
    '''
    # data summary statistic
    subcat_dat = abcee.Data(nsnap0=15)

    # median theta
    # NOTE(review): the pool is always read from 'test0' while the model is
    # built with `run` -- confirm this is intended.
    abcout = abcee.readABC('test0', T)
    theta_med = [UT.median(abcout['theta'][:, i], weights=abcout['w'][:])
                 for i in range(len(abcout['theta'][0]))]

    # F(median theta)
    subcat_sim = abcee.model(run, theta_med, nsnap0=15, downsampled='14')

    fig = plt.figure()
    sub = fig.add_subplot(111)
    smhmr = Obvs.Smhmr()

    # simulation
    m_mid, mu_mhalo, sig_mhalo, cnts = smhmr.Calculate(
        subcat_sim['m.max'], subcat_sim['m.star'],
        dmhalo=0.2, weights=subcat_sim['weights'])
    sub.fill_between(m_mid, mu_mhalo - sig_mhalo, mu_mhalo + sig_mhalo,
                     color='b', alpha=0.25, linewidth=0, edgecolor=None,
                     label='Sim.')

    # data
    m_mid, mu_mhalo, sig_mhalo, cnts = smhmr.Calculate(
        subcat_dat['m.max'], subcat_dat['m.star'],
        weights=subcat_dat['weights'])
    sub.errorbar(m_mid, mu_mhalo, yerr=sig_mhalo, color='k', label='Data')

    # raw strings: same text, without the invalid-escape warning for \m
    sub.set_xlim([10., 15.])
    sub.set_xlabel(r'Halo Mass $(\mathcal{M}_{halo})$', fontsize=25)
    sub.set_ylim([8., 12.])
    sub.set_ylabel(r'Stellar Mass $(\mathcal{M}_*)$', fontsize=25)
    sub.legend(loc='upper right')
    plt.show()
    return None
def test_ABCsumstat(run, T):
    ''' Compare the summary statistics of the median T-th ABC particle pool
    with data.

    Hardcoded for smf only.

    :param run: run name handed to abcee.model
    :param T: index of the ABC pool to read
    '''
    # data summary statistic
    sumdata = abcee.SumData(['smf'], info=True, nsnap0=15)

    # median theta
    # NOTE(review): the pool is always read from 'test0' while the model is
    # built with `run` -- confirm this is intended.
    abcout = abcee.readABC('test0', T)
    theta_med = [UT.median(abcout['theta'][:, i], weights=abcout['w'][:])
                 for i in range(len(abcout['theta'][0]))]

    subcat = abcee.model(run, theta_med, nsnap0=15, downsampled='14')
    sumsim = abcee.SumSim(['smf'], subcat, info=True)

    fig = plt.figure()
    sub = fig.add_subplot(111)
    sub.plot(sumdata[0][0], sumdata[0][1], c='k', ls='--', label='Data')
    sub.plot(sumsim[0][0], sumsim[0][1], c='b', label='Sim.')

    # raw strings: same text, without invalid-escape warnings for \m and \P
    sub.set_xlim([9., 12.])
    sub.set_xlabel(r'Stellar Masses $(\mathcal{M}_*)$', fontsize=25)
    sub.set_ylim([1e-6, 10**-1.75])
    sub.set_yscale('log')
    sub.set_ylabel(r'$\Phi$', fontsize=25)
    sub.legend(loc='upper right')
    plt.show()
    return None
def select_rewrite_expression(name, exprs):
    """
    Pick the expression to use for AST re-writing.

    `name[1]` selects the strategy:
      - "equality": the first expression whose right-hand value is the
        mode of all right-hand values
      - "order": same, using the median when `name[3][1]` is "static" and
        the mode otherwise
      - anything else: the first expression

    Returns None when no expression carries the chosen value.
    """
    kind = name[1]
    if kind not in ("equality", "order"):
        # No value-based heuristic applies, use any
        return exprs[0]

    if kind == "equality":
        # Equality checks (=, !=, is): select the mode
        right_values = [e.right.value for e in exprs]
        target = util.mode(right_values)
    else:
        # Static (numeric) compares use the median to eliminate as many
        # as possible; non-numeric compares use the mode
        use_median = name[3][1] == "static"
        right_values = [e.right.value for e in exprs]
        pick = util.median if use_median else util.mode
        target = pick(right_values)

    return next((e for e in exprs if e.right.value == target), None)
def uclus_distance(self, x, y):
    """
    Distance between a cluster and another item/cluster, taken as the
    *median* of the distances between every member of one side and every
    member of the other side.

    :param x: first cluster/item.
    :param y: second cluster/item.
    """
    # a bare item counts as a one-member group; clusters are fully flattened
    members_x = x.fullyflatten() if isinstance(x, Cluster) else [x]
    members_y = y.fullyflatten() if isinstance(y, Cluster) else [y]

    pairwise = [self.distance(a, b) for a in members_x for b in members_y]
    return median(pairwise)
def func_avg(kwargs, f=None, name=None):
    """Run the configured function ``n_runs`` times and report statistics.

    ``kwargs`` must contain ``"pool"`` (an object with ``map``) and
    ``"n_runs"``; the remaining entries are handed to ``FunctionCaller``.
    Every run yields an ``(evals, size)`` pair. The report (raw values,
    median / averages, success rate) is written to ``f`` and to stdout.

    :param kwargs: mapping of settings; it is copied, not modified.
    :param f: writable text file. When None, ``tuning.txt`` is opened for
        writing and closed again before returning.
    :param name: optional label placed on the first report line.
    """
    kwargs = dict(kwargs)
    pool = kwargs.pop("pool")
    n_runs = kwargs.pop("n_runs")
    result = pool.map(FunctionCaller(kwargs), range(n_runs))
    xs, sizes = list(zip(*result))

    # Only close the file when it was opened here; a caller-supplied file
    # stays open for the caller.
    owns_file = f is None
    if owns_file:
        f = open("tuning.txt", "w")
    try:
        finite_xs = [x for x in xs if util.is_finite(x)]
        finite_sizes = [s for s in sizes if util.is_finite(s)]

        report_lines = [
            "Evals: " + str(xs),
            "Sizes: " + str(sizes),
            "Median: %.1f Average: %.1f ~ %.1f" % (
                util.median(xs), util.avg(xs), util.avg(finite_xs)),
            "AverageSize: %.1f ~ %.1f" % (
                util.avg(sizes), util.avg(finite_sizes)),
            "Success_rate: %.2f" % (len(finite_xs) / len(xs)),
        ]
        if name is not None:
            report_lines.insert(0, "* %s" % name)
        report_str = "\n".join(report_lines)

        print(report_str, file=f)
        f.flush()
    finally:
        if owns_file:
            f.close()
    print(report_str)
def to_dict(self):
    """Return a dict with the name, the articles, their count and the
    median of the articles' "scopus" values."""
    scopus_values = [article["scopus"] for article in self.articles]
    summary = {
        "name": self.name,
        "num_articles": len(self.articles),
        "articles": self.articles,
        # "scopus_bins" (one dict per histogram bin) is currently disabled
        "scopus_median": median(scopus_values),
    }
    return summary
def test_readABC(T):
    ''' Read the T-th ABC output of run 'test0' and print the weighted
    median of every theta column
    '''
    abcout = abcee.readABC('test0', T)

    # median theta
    n_theta = len(abcout['theta'][0])
    theta_med = []
    for i in range(n_theta):
        theta_med.append(UT.median(abcout['theta'][:, i], weights=abcout['w'][:]))
    print(theta_med)
def addVehicle(self, veHistory):
    """Classify a tracked vehicle from its bounding-box history and record it.

    The vehicle is a "Moto" when the median height / width ratio of its
    boxes exceeds ``self.ratioRef`` and an "Automobile" otherwise. The
    result is appended to ``self._data`` together with the time of the
    last frame, and forwarded to ``self.segmenter``.

    :param veHistory: mapping of frame index -> bounding box; index 2 is
        used as the width and index 3 as the height.
    """
    timestamp = max(veHistory.keys()) * self.timePerFrame

    # height / width, both eroded by ratioErode; the +1 keeps the
    # denominator away from zero.
    # A real list (not map()) so the median helper can sort / index it on
    # Python 3 as well.
    ratios = [
        (mvBbox[3] - self.ratioErode) / (1 + abs(mvBbox[2] - self.ratioErode))
        for mvBbox in veHistory.values()
    ]
    medianRatio = util.median(ratios)

    vehicle = "Moto" if medianRatio > self.ratioRef else "Automobile"
    self._data.append((timestamp, vehicle))
    self.segmenter.addVehicle(vehicle)
def xor_investigate_average():
    """Call xor_investigate for run indices 0..99, printing each result,
    then print the average and the median of all results."""
    n_runs = 100
    outcomes = []
    run = 0
    while run < n_runs:
        outcome = xor_investigate(run)
        print("xor(%d) = %d" % (run, outcome))
        outcomes.append(outcome)
        run += 1

    average = util.avg(outcomes)
    print("Average=%.2f" % average)
    middle = util.median(outcomes)
    print("Median=%.2f" % middle)
def time_task(self):
    """Endlessly re-estimate ``self.clock_offset`` from the peers' clocks.

    Generator: every round asks each peer for its time, pairs the answer
    with the midpoint of the local send / receive times, and stores the
    median of ``mine - theirs`` (the local clock itself counts as one
    sample with offset 0). It then yields a sleep with an exponentially
    distributed duration (rate 1/100).
    """
    while True:
        t_send = time.time()
        clock_deltas = {None: (t_send, t_send)}

        # Fire all requests first; each result is stamped with its own
        # receive time when it arrives.
        requests = [
            (peer, peer.get_time().addCallback(lambda res: (time.time(), res)))
            for peer in self.peers
        ]
        for peer, request in requests:
            try:
                t_recv, response = yield request
                t = .5 * (t_send + t_recv)
                clock_deltas[(peer.id, peer.address, peer.port)] = (t, float(response))
            except Exception:
                # A failing peer is skipped. GeneratorExit / KeyboardInterrupt
                # are no longer swallowed, so the generator can be closed
                # while it waits on a request.
                traceback.print_exc()
                continue

        self.clock_offset = util.median(
            mine - theirs for mine, theirs in clock_deltas.values())

        # 1. / 100: with integer division 1/100 is 0 and expovariate(0)
        # raises ZeroDivisionError.
        yield sleep(random.expovariate(1. / 100))
def bondStats(self, ni, nj, ij=0):
    """Return (median, min, max) of the distances collected over residues.

    For every residue index i between the smallest and largest 'resSeq' of
    the first model, atom ni of residue i is paired with atom nj of residue
    i + ij; pairs with a missing selection are skipped.
    """
    # residue index extents
    seqs = [atom['resSeq'] for atom in self.models[0]]
    first, last = min(seqs), max(seqs)

    # collect the distances
    d = []
    for i in range(first, last + 1):
        ai = self.select(i, ni)
        aj = self.select(i + ij, nj)
        if not (ai and aj):
            continue
        d = d + dist(ai, aj)

    # statistics on the collected list
    return (median(d), min(d), max(d))
def sigMstar_tduty_fid(Mhalo=12, dMhalo=0.5):
    ''' Figure plotting sigmaMstar at M_halo = Mhalo for different
    duty cycle time (t_duty) with fiducial SFMS parameter values rather
    than ABC values.

    :param Mhalo: halo mass handed to Smhmr.sigma_logMstar and shown as the
        exponent in the y-axis label
    :param dMhalo: forwarded to Smhmr.sigma_logMstar as dmhalo
    '''
    # read in parameter values for randomSFH_1gyr
    abcout = ABC.readABC('randomSFH_1gyr', 13)
    # the median theta will be designated the fiducial parameter values
    theta_fid = [UT.median(abcout['theta'][:, i], weights=abcout['w'][:])
                 for i in range(len(abcout['theta'][0]))]

    # runs[i] is the run for tduties[i]
    runs = ['randomSFH_0.5gyr', 'randomSFH_1gyr', 'randomSFH_2gyr',
            'randomSFH_5gyr', 'randomSFH_10gyr']
    tduties = [0.5, 1., 2., 5., 10.]  # hardcoded

    smhmr = Obvs.Smhmr()
    sigMstar_fid = []
    for run in runs:
        subcat_sim = ABC.model(run, theta_fid,
                               nsnap0=15, sigma_smhm=0.2, downsampled='14')
        isSF = np.where(subcat_sim['gclass'] == 'sf')  # only SF galaxies

        sig_mstar_fid = smhmr.sigma_logMstar(
            subcat_sim['halo.m'][isSF],
            subcat_sim['m.star'][isSF],
            weights=subcat_sim['weights'][isSF],
            Mhalo=Mhalo, dmhalo=dMhalo)
        sigMstar_fid.append(sig_mstar_fid)
    sigMstar_fid = np.array(sigMstar_fid)

    # make figure
    fig = plt.figure(figsize=(5, 5))
    sub = fig.add_subplot(111)
    sub.scatter(tduties, sigMstar_fid)

    # raw strings below: same text, without invalid-escape warnings
    # x-axis
    sub.set_xlabel(r'$t_\mathrm{duty}$ [Gyr]', fontsize=20)
    sub.set_xlim([0., 10.])
    # y-axis
    sub.set_ylabel(r'$\sigma_{M_*}(M_\mathrm{halo} = 10^{' + str(Mhalo) + r'} M_\odot)$',
                   fontsize=20)
    sub.set_ylim([0., 0.5])

    fig.savefig(''.join([UT.tex_dir(), 'figs/sigMstar_tduty_fid.pdf']),
                bbox_inches='tight', dpi=150)
    plt.close()
    return None
def angleStats(self, ni, nj, nk, ij=0, ik=0):
    """Return (median, min, max) of the angles collected over residues.

    For every residue index i between the smallest and largest 'resSeq' of
    the first model, atoms ni, nj and nk are taken from residues i, i + ij
    and i + ik; triples with a missing selection are skipped.
    """
    # residue index extents
    seqs = [atom['resSeq'] for atom in self.models[0]]
    first, last = min(seqs), max(seqs)

    # collect the angles
    theta = []
    for i in range(first, last + 1):
        ai = self.select(i, ni)
        aj = self.select(i + ij, nj)
        ak = self.select(i + ik, nk)
        if not (ai and aj and ak):
            continue
        theta = theta + angle(ai, aj, ak)

    # statistics on the collected list
    return (median(theta), min(theta), max(theta))
def dihedStats(self, ni, nj, nk, nl, ij=0, ik=0, il=0):
    """Return (median, min, max) of the dihedrals collected over residues.

    For every residue index i between the smallest and largest 'resSeq' of
    the first model, atoms ni, nj, nk and nl are taken from residues i,
    i + ij, i + ik and i + il; groups with a missing selection are skipped.
    """
    # residue index extents
    seqs = [atom['resSeq'] for atom in self.models[0]]
    first, last = min(seqs), max(seqs)

    # collect the dihedrals
    omega = []
    for i in range(first, last + 1):
        ai = self.select(i, ni)
        aj = self.select(i + ij, nj)
        ak = self.select(i + ik, nk)
        al = self.select(i + il, nl)
        if not (ai and aj and ak and al):
            continue
        omega = omega + dihed(ai, aj, ak, al)

    # statistics on the collected list
    return (median(omega), min(omega), max(omega))
def time_task(self):
    """Endlessly re-estimate ``self.clock_offset`` from the peers' clocks.

    Generator: every round asks each peer for its time, pairs the answer
    with the midpoint of the local send / receive times, and stores the
    median of ``mine - theirs`` (the local clock itself counts as one
    sample with offset 0). It then yields a sleep with an exponentially
    distributed duration (rate 1/100).
    """
    while True:
        t_send = time.time()
        clock_deltas = {None: (t_send, t_send)}

        # Fire all requests first; each result is stamped with its own
        # receive time when it arrives.
        requests = [
            (peer, peer.get_time().addCallback(lambda res: (time.time(), res)))
            for peer in self.peers
        ]
        for peer, request in requests:
            try:
                t_recv, response = yield request
                t = .5 * (t_send + t_recv)
                clock_deltas[(peer.id, peer.address, peer.port)] = (t, float(response))
            except Exception:
                # A failing peer is skipped. GeneratorExit / KeyboardInterrupt
                # are no longer swallowed, so the generator can be closed
                # while it waits on a request.
                traceback.print_exc()
                continue

        self.clock_offset = util.median(
            mine - theirs for mine, theirs in clock_deltas.values())

        # 1. / 100: with integer division 1/100 is 0 and expovariate(0)
        # raises ZeroDivisionError.
        yield sleep(random.expovariate(1. / 100))
def find_info_gain(self, X, y, attr):
    ''' Find the split value with the highest information gain for `attr`

    Candidate split values are the distinct values for a string-valued
    attribute (a single one when there are exactly two) and the mean and
    median of the distinct values otherwise.

    Returns None when the attribute has a single distinct value, else the
    tuple (isCategorical, val, max_info_gain, X_left, X_right, Y_left,
    Y_right); val and the partitions stay None when no candidate has a
    gain above 0.
    '''
    # the second row decides whether the attribute is categorical
    isCategorical = type(X[1][attr]) == str

    distinct = set([row[attr] for row in X])
    if len(distinct) == 1:
        # only one value for the entire list: nothing to split on
        return None

    if isCategorical:
        candidates = list(distinct)
        if len(distinct) == 2:
            # splitting on one of two values also separates the other
            candidates = [candidates[0]]
    else:
        numbers = list(distinct)
        candidates = [mean(numbers), median(numbers)]

    max_info_gain = 0
    val = X_left = X_right = Y_left = Y_right = None
    for candidate in candidates:
        parts = partition_classes(X, y, attr, candidate)
        gain = information_gain(y, [parts[2], parts[3]])
        if gain > max_info_gain:
            max_info_gain, val = gain, candidate
            X_left, X_right, Y_left, Y_right = parts

    return isCategorical, val, max_info_gain, X_left, X_right, Y_left, Y_right
def unbalanced_countries(self):
    """Return the countries with unusually few / many territories.

    After ``self.sort_countries()``, a country counts as small (large) when
    its number of territories lies more than 1.5 * ``mid50`` below (above)
    the median. ``mid50`` is the difference in territory count between the
    last and the first country of the second quarter of the sorted list.
    When the last country has more than 1.5 times the territories of the
    first one, both are reported regardless.

    :returns: ``(sorted(small), sorted(large))``
    """
    self.sort_countries()

    quarter = self.num_countries / 4.0
    second_quarter = self.countries[int(quarter):int(quarter * 2)]

    # NOTE(review): util.median is applied to the country objects and the
    # result is used as a number below -- confirm what it returns for them.
    median_terrs = util.median(self.countries)
    mid50 = (len(second_quarter[-1].territories)
             - len(second_quarter[0].territories))
    min_terrs = median_terrs - mid50 * 1.5
    max_terrs = median_terrs + mid50 * 1.5

    small = [c for c in self.countries if len(c.territories) < min_terrs]
    large = [c for c in self.countries if len(c.territories) > max_terrs]

    # Always report the two extremes when they are far apart.
    first, last = self.countries[0], self.countries[-1]
    if len(first.territories) * 1.5 < len(last.territories):
        if first not in small:
            small.append(first)
        if last not in large:
            large.append(last)
    return sorted(small), sorted(large)
def combine(result_matrices, score_scalings, membership, iteration, config_params):
    """This is the combining function, taking n result matrices and scalings

    Every matrix is fixed for extreme values and shifted by its 0.99
    quantile (in place). The matrices are then either quantile-normalized,
    or matrix 0 is scaled by the MAD (falling back to the standard
    deviation) of its cluster members' scores and the remaining matrices
    are scaled to its 0.01 quantile. The result is the sum of all matrices
    weighted by ``score_scalings``.

    :param result_matrices: list of score matrices; matrix 0 is assumed to
        be the gene expression score
    :param score_scalings: one weight per matrix
    :param membership: provides num_clusters() and rows_for_cluster()
    :param iteration: current iteration, used for the debug dump
    :param config_params: configuration; 'quantile_normalize' is always
        read, the debug / output keys only while dumping scores
    :returns: the combined DataMatrix, or None without input matrices
    """
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug'] and
                (iteration == 1 or (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(config_params['output_dir'],
                                          'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                             compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # logging.warning: logging.warn is a deprecated alias
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.warning('SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.warning('SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.warning('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        # range instead of the Python-2-only xrange
        for i in range(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
def centralize(self):
    """Set each coordinate of self.position to the median of a copy of the
    matching self.axes entry ('x', 'y', then 'z')."""
    for axis in ('x', 'y', 'z'):
        samples = self.axes[axis][:]
        setattr(self.position, axis, utl.median(samples))
def median(self):
    """returns the median value, computed by util.median over self.values"""
    data = self.values
    return util.median(data)
def test_bench_disk_paxos(metasync, opts):
    """test disk paxos

    Benchmark the latency of disk paxos with different backends. For every
    total client count, every number of proposing clients and every backend
    set, the workers are run `repeat` times; each cell of the printed table
    holds "min,max,median,master" latencies.

    :param metasync: not used by this benchmark
    :param opts: not used by this benchmark
    """
    from disk_paxos import DiskPaxosWorker

    repeat = 5
    client_num = [1, 2, 3, 4, 5]
    backend_list = [["google"], ["dropbox"], ["onedrive"], ["box"],
                    ["google", "dropbox", "onedrive"]]
    results = [['Clients'] + [','.join(x) for x in backend_list]]

    # start to test
    for num in client_num:
        for num_prop in range(1, num + 1):
            for _ in range(repeat):
                row = ['%d/%d clients' % (num_prop, num)]
                for backend in backend_list:
                    # a real list: srvs is walked once per block below, and
                    # map() would be exhausted after the first pass on
                    # Python 3
                    srvs = [services.factory(name) for name in backend]
                    dbg.info('Test paxos for %d/%d clients and %s'
                             % (num_prop, num, ','.join(backend)))

                    # initialize all disk blocks
                    blockList = []
                    for i in range(num):
                        path = '/diskpaxos/client%d' % i
                        for srv in srvs:
                            if not srv.exists(path):
                                srv.put(path, '')
                            else:
                                srv.update(path, '')
                        blockList.append(path)

                    # every proposer gets its own service instances
                    clients = []
                    for i in range(num_prop):
                        storages = [services.factory(name) for name in backend]
                        worker = DiskPaxosWorker(storages, blockList[i], blockList)
                        clients.append(worker)
                        # dbg.dbg('client %d %s' % (i, worker.clientid))
                    for worker in clients:
                        worker.start()

                    latency = []
                    master_latency = None
                    for worker in clients:
                        worker.join()
                        latency.append(worker.latency)
                        if worker.master:
                            # at most one worker may report itself as master
                            assert master_latency is None
                            master_latency = worker.latency
                    for worker in clients:
                        worker.join()

                    summary = ",".join(map(str, [min(latency), max(latency),
                                                 util.median(latency),
                                                 master_latency]))
                    dbg.info("Result: %s" % summary)
                    row.append(summary)
                results.append(row)

    # tabularize: one output line per row, every cell followed by " \t"
    print("Item Format: min,max,median,master")
    for row in results:
        print("".join("%s \t" % e for e in row))
def test_bench_paxos2(metasync, opts):
    """benchmark latency of paxos with backends

    For every client count and backend set, a fresh log file is initialised
    and the workers are run `repeat` times; each cell of the printed table
    holds "min,max,median,master" latencies.

    :param metasync: not used by this benchmark
    :param opts: not used by this benchmark
    """

    def new_index(srv, folder, prefix):
        # number of entries in `folder` that already start with `prefix`
        if not srv.exists(folder):
            return 0
        return sum(1 for fn in srv.listdir(folder) if fn.startswith(prefix))

    from paxos import PPaxosWorker2

    repeat = 5
    client_num = [1, 2, 3, 4, 5]
    backend_list = [["dropbox"], ["onedrive"]]
    results = [['Clients'] + [','.join(x) for x in backend_list]]

    # start to test
    for num in client_num:
        for _ in range(repeat):
            row = ['%d clients' % (num)]
            for backend in backend_list:
                dbg.info('Test paxos for %d clients and %s'
                         % (num, ','.join(backend)))
                # a real list: srvs is indexed and iterated below, which a
                # Python 3 map() object does not support
                srvs = [services.factory(name) for name in backend]

                # init log file
                prefix = 'test2-%d-%d' % (num, len(backend))
                index = new_index(srvs[0], '/ppaxos', prefix)
                path = '/ppaxos/%s.%d' % (prefix, index)
                dbg.info(path)
                for srv in srvs:
                    srv.init_log2(path)

                # every client gets its own service instances
                clients = []
                for i in range(num):
                    storages = [services.factory(name) for name in backend]
                    worker = PPaxosWorker2(storages, path)
                    clients.append(worker)
                for worker in clients:
                    worker.start()
                for worker in clients:
                    worker.join()

                latency = []
                master_latency = None
                for worker in clients:
                    latency.append(worker.latency)
                    if worker.master:
                        # exactly one worker must report itself as master
                        assert master_latency is None
                        master_latency = worker.latency
                assert master_latency is not None

                summary = ",".join(map(str, [min(latency), max(latency),
                                             util.median(latency),
                                             master_latency]))
                dbg.info("Result: %s" % summary)
                row.append(summary)
            results.append(row)

    # tabularize: one output line per row, every cell followed by " \t"
    print("Item Format: min,max,median,master")
    for row in results:
        print("".join("%s \t" % e for e in row))
def combine(result_matrices, score_scalings, membership, quantile_normalize):
    """This is the combining function, taking n result matrices and scalings

    Every matrix is fixed for extreme values (in place). The matrices are
    then either quantile-normalized, or matrix 0 is scaled by the MAD
    (falling back to the standard deviation) of its cluster members' scores
    and the remaining matrices are scaled to its 0.01 quantile. The result
    is the sum of all matrices weighted by ``score_scalings``.

    :param result_matrices: list of score matrices; matrix 0 is assumed to
        be the gene expression score
    :param score_scalings: one weight per matrix
    :param membership: provides num_clusters() and rows_for_cluster()
    :param quantile_normalize: selects quantile normalization over the
        manual scaling
    :returns: the combined DataMatrix, or None without input matrices
    """
    for m in result_matrices:
        m.fix_extreme_values()

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.info("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row]][cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # logging.warning: logging.warn is a deprecated alias
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.info("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    # Dividing by 0 would fill the scores with inf / nan and
                    # poison the combined matrix; leave this matrix as it is.
                    logging.error("very sparse score !!!")
                else:
                    values = values / qqq * abs(rs_quant)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        # range instead of the Python-2-only xrange
        for i in range(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.info("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
# analysis(): log summary statistics and draw matplotlib figures for the
# similarity columns named in value_keys ("top_gene_jaccard" and
# "top_parents_jaccard").
#   - logs the row count, mean / sstdev / median per key, and the number and
#     share of genes found vs. missed
#   - draws normalized histograms of the actual values against flat_sim_matrix,
#     and value-vs-age scatter plots with a degree-1 np.polyfit line for both
#     the actual rows and the flat_age_matrix / flat_sim_matrix data
#   - blocks on raw_input() (Python 2 only) and closes all figures
# NOTE(review): `genes_found / total` is an int / int division -- it floors
#   under Python 2 unless true division is enabled, and fails when no genes
#   are listed at all (total == 0); confirm.
# NOTE(review): the two list defaults are shared between calls; the visible
#   code only reads them.
# NOTE(review): `settings` is not referenced in the visible body.
def analysis(data, settings, flat_age_matrix=[], flat_sim_matrix=[], alpha=0.03): # value_keys = ["complete_term_jaccard", "top_term_jaccard", "top_gene_jaccard", "top_parents_jaccard"] value_keys = ["top_gene_jaccard", "top_parents_jaccard"] data_values = [[row._asdict()[k] for k in value_keys] for row in data] means = [mean([x[i] for x in data_values]) for i in range(len(value_keys))] stds = [sstdev([x[i] for x in data_values]) for i in range(len(value_keys))] medians = [median([x[i] for x in data_values]) for i in range(len(value_keys))] log.info("N: %s", len(data_values)) genes_found, genes_missed = len([genes[1] for row in data for genes in row.genes_found]),len([genes for row in data for genes in row.genes_missed]) log.info("genes_found: %s, genes_missed: %s", genes_found, genes_missed) total = genes_found + genes_missed log.info("genes_found: %s, genes_missed: %s", round(genes_found / total, 2), round(genes_missed / total, 2)) #log.info("distinct unknown genes: %s", len(unknown)) fig_num = 0 for i, value_key in enumerate(value_keys): log.info("%s: mean=%s, std=%s, median=%s", value_key, means[i], stds[i], medians[i]) f = plt.figure(fig_num) fig_num +=1 d = [x[i] for x in data_values] weights = np.ones_like(d)/float(len(d)) plt.hist(d, 100, weights=weights, alpha=0.5, label='Actual') weights = np.ones_like(flat_sim_matrix)/float(len(flat_sim_matrix)) plt.hist(flat_sim_matrix, 100, weights=weights, alpha=0.5, label='Randomized') plt.xlabel('Similarity') plt.ylabel(value_keys[i]) plt.title('Histogram of ' + value_key + " | " + r'$\mu=' + str(round(means[i], 2)) + r',\ \sigma=' + str(round(stds[i], 2)) + r'$') plt.legend(loc='upper right') f.show() f = plt.figure(fig_num) fig_num +=1 x = [r.age for r in data] y = [r._asdict()[value_key] for r in data] fit = np.polyfit(x,y,1) fit_fn = np.poly1d(fit) plt.plot(x,y, 'k.', [min(x), max(x)], fit_fn([min(x), max(x)]), '--g') plt.xlabel('Age') plt.ylabel("Similarity Index") plt.title("Ancestors of " + value_key + " vs 
Age" + " | " + r'$\mu=' + str(round(means[i], 2)) + r',\ \sigma=' + str(round(stds[i], 2)) + r'$' ) f.show() f = plt.figure(fig_num) fig_num +=1 x = flat_age_matrix y = flat_sim_matrix fit = np.polyfit(x,y,1) fit_fn = np.poly1d(fit) plt.plot(x,y, 'k.', [min(x), max(x)], fit_fn([min(x), max(x)]), '--g', alpha=alpha) plt.xlabel('Age') plt.ylabel("Similarity Index") plt.title("Randomized Ancestors of " + value_key + " vs Age" + " | " + r'$\mu=' + str(round(means[i], 2)) + r',\ \sigma=' + str(round(stds[i], 2)) + r'$' ) f.show() f = plt.figure(fig_num) fig_num +=1 d = [x[4] for x in data] weights = np.ones_like(d)/float(len(d)) plt.hist(d, 100, weights=weights, alpha=0.5, label='Actual') weights = np.ones_like(flat_age_matrix)/float(len(flat_age_matrix)) plt.hist(flat_age_matrix, 100, weights=weights, alpha=0.5, label='Randomized') plt.xlabel('Age') f.show() raw_input() plt.close("all")
def test_median_with_nans(self):
    """tests the median() function on an array containing a NaN"""
    samples = np.array([2.0, 3.0, np.nan, 1.0])
    # 2.0 is the median of the three non-NaN entries
    self.assertAlmostEqual(2.0, util.median(samples))
def test_bench_disk_paxos(metasync, opts):
    """test disk paxos

    Benchmark the latency of disk paxos with different backends. For every
    total client count, every number of proposing clients and every backend
    set, the workers are run `repeat` times; each cell of the printed table
    holds "min,max,median,master" latencies.

    :param metasync: not used by this benchmark
    :param opts: not used by this benchmark
    """
    from disk_paxos import DiskPaxosWorker

    repeat = 5
    client_num = [1, 2, 3, 4, 5]
    backend_list = [["google"], ["dropbox"], ["onedrive"], ["box"],
                    ["google", "dropbox", "onedrive"]]
    results = [['Clients'] + [','.join(x) for x in backend_list]]

    # start to test
    for num in client_num:
        for num_prop in range(1, num + 1):
            for _ in range(repeat):
                row = ['%d/%d clients' % (num_prop, num)]
                for backend in backend_list:
                    # a real list: srvs is walked once per block below, and
                    # map() would be exhausted after the first pass on
                    # Python 3
                    srvs = [services.factory(name) for name in backend]
                    dbg.info('Test paxos for %d/%d clients and %s'
                             % (num_prop, num, ','.join(backend)))

                    # initialize all disk blocks
                    blockList = []
                    for i in range(num):
                        path = '/diskpaxos/client%d' % i
                        for srv in srvs:
                            if not srv.exists(path):
                                srv.put(path, '')
                            else:
                                srv.update(path, '')
                        blockList.append(path)

                    # every proposer gets its own service instances
                    clients = []
                    for i in range(num_prop):
                        storages = [services.factory(name) for name in backend]
                        worker = DiskPaxosWorker(storages, blockList[i], blockList)
                        clients.append(worker)
                        # dbg.dbg('client %d %s' % (i, worker.clientid))
                    for worker in clients:
                        worker.start()

                    latency = []
                    master_latency = None
                    for worker in clients:
                        worker.join()
                        latency.append(worker.latency)
                        if worker.master:
                            # at most one worker may report itself as master
                            assert master_latency is None
                            master_latency = worker.latency
                    for worker in clients:
                        worker.join()

                    summary = ",".join(map(str, [min(latency), max(latency),
                                                 util.median(latency),
                                                 master_latency]))
                    dbg.info("Result: %s" % summary)
                    row.append(summary)
                results.append(row)

    # tabularize: one output line per row, every cell followed by " \t"
    print("Item Format: min,max,median,master")
    for row in results:
        print("".join("%s \t" % e for e in row))
def calcAverage(self):
    """Store the median of the players' integer levels in
    self.averageLevel.

    NOTE: despite the method and attribute names, the value comes from
    util.median, not from a mean.
    """
    self.averageLevel = util.median(
        [int(player.getLevel()) for player in self.players])
def combine(result_matrices, score_scalings, membership, iteration, config_params):
    """This is the combining function, taking n result matrices and scalings

    Every matrix is fixed for extreme values and shifted by its 0.99
    quantile (in place). The matrices are then either quantile-normalized,
    or matrix 0 is scaled by the MAD (falling back to the standard
    deviation) of its cluster members' scores and the remaining matrices
    are scaled to its 0.01 quantile. The result is the sum of all matrices
    weighted by ``score_scalings``.

    :param result_matrices: list of score matrices; matrix 0 is assumed to
        be the gene expression score
    :param score_scalings: one weight per matrix
    :param membership: provides num_clusters() and rows_for_cluster()
    :param iteration: current iteration, used for the debug dump
    :param config_params: configuration; 'quantile_normalize' is always
        read, the debug / output keys only while dumping scores
    :returns: the combined DataMatrix, or None without input matrices
    """
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug'] and
                (iteration == 1 or (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(config_params['output_dir'],
                                          'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                             compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # logging.warning: logging.warn is a deprecated alias
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        # range instead of the Python-2-only xrange
        for i in range(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
def _print_fields(*fields):
    """Print `fields` on one line, separated by single spaces."""
    # A print with one parenthesized argument produces the same output
    # whether print is a statement (Python 2) or a function (Python 3).
    print(" ".join(str(field) for field in fields))


def _drain_samples(samples):
    """Remove the entries currently queued in the deque; return them sorted."""
    # Pop exactly as many entries as are present right now. Taking a snapshot
    # and calling clear() afterwards would silently drop every sample that a
    # worker thread appended between the two calls.
    # NOTE(review): assumes the worker threads only ever append to the deque
    # and never remove from it - confirm against the *_loop functions.
    drained = [samples.popleft() for _ in range(len(samples))]
    drained.sort()
    return drained


def _latency_fields(samples, missing=None):
    """Return (median, p95, max) of the sorted `samples` as '%0.5f' strings.

    A statistic that is None is replaced by `missing`.
    """
    raw = (util.median(samples),
           util.p95(samples),
           max(samples) if samples else None)
    return tuple(("%0.5f" % value) if value is not None else missing
                 for value in raw)


def main():
    """Start the worker threads and print per-second statistics until stopped.

    Parses the command line, reports the record count of the collection,
    drops the index on "timestamp", creates the indexes unless --no-index is
    given, starts the inserting, reading and removing daemon threads and then
    prints statistics of the collected insert and read samples once per
    second. The loop has no exit condition of its own; the totals are printed
    when it is left through an exception.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', default='mongodb://localhost:27017/')
    parser.add_argument('action', choices=['insert'])
    parser.add_argument('-it', type=int, default=2, help="inserting thread num")
    parser.add_argument('-rt', type=int, default=1, help="reading thread num")
    parser.add_argument('-rmt', type=int, default=1, help="removing thread num")
    parser.add_argument('-db_name', default="test_database")
    parser.add_argument('-collection_name', default="test_collection")
    parser.add_argument('-noi', '--no-index', action='store_true',
                        help="dont create index")
    args = parser.parse_args()

    collection = connect(args)
    _print_fields(collection.count(),
                  "records in collection %s.%s" % (args.db_name,
                                                   args.collection_name))

    _print_fields("dropping old indexes...")
    try:
        collection.drop_index([("timestamp", pymongo.ASCENDING)])
    except pymongo.errors.OperationFailure as e:
        # a missing index is fine, anything else is a real failure
        if "index not found" not in str(e):
            raise

    if not args.no_index:
        _print_fields("trying to create indexes...")
        # pymongo.ASCENDING == 1
        collection.create_index([("ev_id", pymongo.ASCENDING)])
        collection.create_index([("ev_id", pymongo.ASCENDING),
                                 ("timestamp", pymongo.ASCENDING)])
        collection.create_index([("ev_id", pymongo.ASCENDING),
                                 ("s_id", pymongo.ASCENDING),
                                 ("timestamp", pymongo.ASCENDING)])
        collection.create_index([("s_id", pymongo.ASCENDING)])

    # insert()
    if args.action != 'insert':
        return

    accum = collections.deque()
    r_accum = collections.deque()
    # NOTE(review): rm_accum is handed to the removing threads but never read
    # or emptied here; if they append to it, it grows for the lifetime of the
    # process - confirm against remove_loop.
    rm_accum = collections.deque()

    _print_fields("starting threads:", args.it, "inserting",
                  args.rt, "reading", args.rmt, "removing")
    worker_groups = (
        (args.it, insert_loop, accum),
        (args.rt, read_loop, r_accum),
        (args.rmt, remove_loop, rm_accum),
    )
    for thread_count, target, samples in worker_groups:
        for _ in range(thread_count):
            # every thread gets a collection object from its own connect() call
            thread = threading.Thread(target=target,
                                      args=[samples, connect(args)])
            thread.daemon = True
            thread.start()

    _print_fields("starting stat loop...")
    t_start = time.time()
    total_inserts = 0
    total_reads = 0
    try:
        sleepd = 0
        while True:
            # Sleep for the rest of the one second period. A report that took
            # longer than a second would make the delay negative, which
            # time.sleep() rejects with an exception, so clamp it at zero.
            time.sleep(max(0, 1 - sleepd))
            t1 = time.time()

            v = _drain_samples(accum)
            ra = _drain_samples(r_accum)
            num = len(v)
            rnum = len(ra)
            total_inserts += num
            total_reads += rnum

            m, p95, mx = _latency_fields(v)
            _print_fields(args.it, "ithreads", num, "insert/s, med", m,
                          "p95", p95, "max", mx, "total num", total_inserts)

            m, p95, mx = _latency_fields(ra, missing='-')
            _print_fields(" ", args.rt, "rthreads", rnum, "reads/s, med", m,
                          "p95", p95, "max", mx, "total num", total_reads)

            # time spent on this report, subtracted from the next sleep
            sleepd = time.time() - t1
    finally:
        _print_fields("total inserts", total_inserts,
                      "total reads", total_reads,
                      "took", (time.time() - t_start), "s")
def test_bench_paxos(metasync, opts):
    """Benchmark the latency of paxos rounds for several backend sets.

    For every client count from 1 to 5, five times in a row, and for every
    backend set, a fresh log is initialized on the backends and one
    PPaxosWorker per client is started on it. Once all workers have been
    joined, the minimum, maximum and median worker latency and the latency
    of the master worker are recorded. The collected rows are printed as a
    tab separated table at the end.
    """

    def new_index(srv, folder, prefix):
        # number of entries in `folder` whose name starts with `prefix`;
        # 0 if the folder does not exist
        if services.slug(srv) == 'onedrive':
            # for the onedrive backend the folder is looked up under /Public
            folder = '/Public' + folder
        if not srv.exists(folder):
            return 0
        return sum(1 for fn in srv.listdir(folder) if fn.startswith(prefix))

    from paxos import PPaxosWorker

    rounds = 5
    client_counts = [1, 2, 3, 4, 5]
    backend_sets = [["google"], ["dropbox"], ["onedrive"], ["box"],
                    ["google", "dropbox", "onedrive"]]
    table = [['Clients'] + [','.join(names) for names in backend_sets]]

    # start to test
    for count in client_counts:
        for _ in range(rounds):
            line = ['%d clients' % (count)]
            for names in backend_sets:
                dbg.info('Test paxos for %d clients and %s'
                         % (count, ','.join(names)))

                # init log file
                log_srvs = [services.factory(name) for name in names]
                prefix = 'test-%d-%d' % (count, len(names))
                path = '/ppaxos/%s.%d' % (
                    prefix, new_index(log_srvs[0], '/ppaxos', prefix))
                dbg.info(path)
                for srv in log_srvs:
                    srv.init_log(path)

                # every worker gets its own set of storage objects
                workers = []
                for _worker_no in range(count):
                    storages = [services.factory(name) for name in names]
                    workers.append(PPaxosWorker(storages, path))
                for worker in workers:
                    worker.start()

                latencies = []
                master_latency = None
                for worker in workers:
                    worker.join()
                    latencies.append(worker.latency)
                    if worker.master:
                        # at most one worker may report itself as master
                        assert master_latency is None
                        master_latency = worker.latency
                for worker in workers:
                    worker.join()

                figures = (min(latencies), max(latencies),
                           util.median(latencies), master_latency)
                summary = ",".join(str(figure) for figure in figures)
                dbg.info("Result: %s" % summary)
                line.append(summary)
            table.append(line)

    # tabularize
    print("Item Format: min,max,median,master")
    for line in table:
        # one table row per output line, every cell followed by " \t"
        print("".join("%s \t" % cell for cell in line))