def anchoredDistanceFromFrqSmallPolytomies(quartTable, method, met):
    frq = dict()
    for key, val in quartTable.iteritems():
        for key2, val2 in val.iteritems():
            if met == "log":
                if method == "gmean":
                    vtt = np.exp(-stats.gmean(val2))
                elif method == "mean":
                    vtt = np.exp(-mean(val2))
                else:
                    vtt = np.exp(-sqrt(mean(square(val2))))
            elif met == "freq":
                if method == "gmean":
                    vtt = stats.gmean(val2)
                elif method == "mean":
                    vtt = mean(val2)
                else:
                    vtt = sqrt(mean(square(val2)))
            if key in frq:
                v = frq[key]
                v[key2] = vtt
                frq[key] = v
            else:
                v = dict()
                v[key2] = vtt
                frq[key] = v
    return frq
def _calculate(compound):
    num = DRP.models.NumMolDescriptorValue
    if any(element in inorgElements for element in compound.elements):
        delete_descriptors([compound])
        vals_to_create = []
        inorgElementNormalisationFactor = sum(
            info['stoichiometry'] for element, info in compound.elements.items()
            if element in inorgElements)
        for prop in inorgAtomicProperties:
            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_geom_unw'.format(prop.title().replace('_', ''))],
                value=gmean([inorgElements[element][prop] for element in compound.elements
                             if element in inorgElements])))
            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_geom_stoich'.format(prop.title().replace('_', ''))],
                value=gmean([inorgElements[element][prop] * (info['stoichiometry'] / inorgElementNormalisationFactor)
                             for element, info in compound.elements.items() if element in inorgElements])))
            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_max'.format(prop.title().replace('_', ''))],
                value=max(inorgElements[element][prop] for element in compound.elements
                          if element in inorgElements)))
            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_range'.format(prop.title().replace('_', ''))],
                value=max(inorgElements[element][prop] for element in compound.elements
                          if element in inorgElements)
                      - min(inorgElements[element][prop] for element in compound.elements
                            if element in inorgElements)))
        num.objects.bulk_create(vals_to_create)
def get_extreme_bounding_box_multithreaded(G, choice_set_config, time_dependent_relation, trip_times, trip_data, n_threads=2):
    config = choice_set_config
    inverse_relation = get_inverse_time_dependent_relation(time_dependent_relation)
    id_sample = trip_data.keys()
    if config['bounding_box_sample_size'] < len(trip_data):
        id_sample = random.sample(id_sample, config['bounding_box_sample_size'])

    work_queue = Queue()
    done_queue_range = Queue()
    done_queue_vars = Queue()
    processes = []

    for trip_id in id_sample:
        work_queue.put(trip_id)

    for w in xrange(n_threads):
        p = Process(target=bounding_box_worker,
                    args=(work_queue, done_queue_range, done_queue_vars, G,
                          choice_set_config, time_dependent_relation, trip_data, trip_times))
        p.start()
        processes.append(p)
        work_queue.put('STOP')

    all_bounds = []
    vars = {}
    for i in range(n_threads):
        all_bounds = all_bounds + list(iter(done_queue_range.get, 'STOP'))
        this_vars = list(iter(done_queue_vars.get, 'STOP'))
        vars = dict(vars, **this_vars[0])

    for p in processes:
        p.join()

    ranges = {}
    for var in vars:
        low = []
        high = []
        for bound in all_bounds:
            if var in bound:
                low.append(bound[var][0])
                high.append(bound[var][1])
        low.sort()
        high.sort()
        if low:
            low = gmean(low)
            high = gmean(high)
            if low <= high:
                ranges[var] = (low, high)
            else:
                print 'WARNING: coefficient for ' + var + ' had inconsistent high/low. Try increasing sample size for bounding box determination.'
        else:
            print 'WARNING: coefficient for ' + var + ' had no range. Try increasing sample size for bounding box determination.'

    if choice_set_config['verbose']:
        print 'EXT_BOUND: ', ranges
    return ranges
def test_1D_list(self):
    a = (1, 2, 3, 4)
    actual = stats.gmean(a)
    desired = power(1 * 2 * 3 * 4, 1. / 4.)
    assert_almost_equal(actual, desired, decimal=14)

    desired1 = stats.gmean(a, axis=-1)
    assert_almost_equal(actual, desired1, decimal=14)
def test_1D_array(self):
    a = array((1, 2, 3, 4), float32)
    actual = stats.gmean(a)
    desired = power(1 * 2 * 3 * 4, 1. / 4.)
    assert_almost_equal(actual, desired, decimal=7)

    desired1 = stats.gmean(a, axis=-1)
    assert_almost_equal(actual, desired1, decimal=7)
def test_2D_array_default(self):
    a = array(((1, 2, 3, 4), (1, 2, 3, 4), (1, 2, 3, 4)))
    actual = stats.gmean(a)
    desired = array((1, 2, 3, 4))
    assert_array_almost_equal(actual, desired, decimal=14)

    desired1 = stats.gmean(a, axis=0)
    assert_array_almost_equal(actual, desired1, decimal=14)
def test_gmean(self):
    for n in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(n)
        r = stats.gmean(abs(x))
        rm = stats.mstats.gmean(abs(xm))
        assert_equal(r, rm)
        r = stats.gmean(abs(y))
        rm = stats.mstats.gmean(abs(ym))
        assert_equal(r, rm)
def test_gmean(self):
    for n in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(n)
        r = stats.gmean(abs(x))
        rm = stats.mstats.gmean(abs(xm))
        assert_allclose(r, rm, rtol=1e-13)
        r = stats.gmean(abs(y))
        rm = stats.mstats.gmean(abs(ym))
        assert_allclose(r, rm, rtol=1e-13)
def groupCovForPosRep(files, totalCounts, up, down, calType='total'):
    """
    Groups the replicates of samples' coverage around TSS and TSE sites.

    * files: list. A list of replicate files.
    * totalCounts: list. A list of ints with the same length as files; stores the total mapped reads in each file.
    * calType: str. Calculation type: 'total', 'mean', 'median' or 'geoMean'.

    Returns a dataframe with two columns: ['mean', 'std'], i.e. the chosen summary
    value averaged across replicates and its standard deviation.
    """
    res_df = pd.DataFrame()
    mean = pd.DataFrame()
    std = pd.DataFrame()
    for f, total in zip(files, totalCounts):
        df = pd.read_csv(f, sep='\t', header=0, index_col=0, low_memory=False)
        # remove the antibody; signal peptide and non signal peptide are separate
        try:
            #df = df.drop('heavychain');df=df.drop('lightchain');df=df.drop('NeoRKanR')
            #df = df.loc[['heavychain','lightchain']]
            df = df.loc[['lightchain'], :]
        except:
            pass
        # filter by median of gene
        df['median'] = df.median(axis=1)
        #df = df[df['median']>=0.5]
        del df['median']
        up_df = df.iloc[:, 0:up]
        down_df = df.iloc[:, up:up + down]
        try:
            up_df = up_df.replace('-', np.nan).dropna().astype('int').T
            down_df = down_df.replace('-', np.nan).dropna().astype('int').T
        except:
            pass
        up_df = up_df / float(total) * (10 ** 6)  # row: position. col: gene
        down_df = down_df / float(total) * (10 ** 6)
        if calType == 'total':
            up_df[calType] = up_df.sum(axis=1); down_df[calType] = down_df.sum(axis=1)
        if calType == 'mean':
            up_df[calType] = up_df.mean(axis=1); down_df[calType] = down_df.mean(axis=1)
        if calType == 'median':
            up_df[calType] = up_df.median(axis=1); down_df[calType] = down_df.median(axis=1)
        if calType == 'geoMean':
            up_df = up_df.replace([0], [1]); down_df = down_df.replace([0], [1])
            up_df[calType] = sp_stats.gmean(up_df.values, axis=1)
            down_df[calType] = sp_stats.gmean(down_df.values, axis=1)
        df = df.T
        df[calType] = pd.concat([up_df[calType], down_df[calType]])  # add calType column
        up_df['std'] = up_df.std(axis=1); down_df['std'] = down_df.std(axis=1)
        df['std'] = pd.concat([up_df['std'], down_df['std']])
        mean[f + calType] = df[calType]  # stores the results
        std[f + 'std'] = df['std']
    res_df['mean'] = mean.mean(axis=1)
    res_df['std'] = ((std ** 2).sum(axis=1) / std.shape[1]).apply(np.sqrt)
    return res_df
def findTrueAverageTableAnchoringOnDifferentSidesOverall(frq, anch, list_taxa, N1, N2, method, met):
    anch = sorted(list(anch))
    lst_taxa = list(list_taxa.keys())
    TotalKey = dict()
    n = len(lst_taxa)
    N = {N1, N2}
    for i in range(0, n):
        if lst_taxa[i] in N:
            continue
        for j in range(i + 1, n):
            if lst_taxa[j] in N:
                continue
            p = sorted([lst_taxa[i], lst_taxa[j]])
            key_orig = genKey(p, anch)
            l = sorted([lst_taxa[i], lst_taxa[j], anch[0], anch[1]])
            key_inv = "/".join(l)
            v = frq[key_orig]
            v_inv = float(v[0]) / v[1]
            if key_inv in TotalKey:
                if met == "freq":
                    vt = TotalKey[key_inv]
                    vt.append(v_inv)
                elif met == "log":
                    vt = TotalKey[key_inv]
                    vt.append(-np.log(1. * v_inv))
            else:
                if met == "freq":
                    vt = list()
                    vt.append(v_inv)
                elif met == "log":
                    vt = list()
                    vt.append(-np.log(1. * v_inv))
                TotalKey[key_inv] = vt
    TotalKeyf = dict()
    for q, v2 in TotalKey.iteritems():
        l = set(q.split("/"))
        l = list(l - set(anch))
        if met == "log":
            if method == "gmean":
                vtt = np.exp(-stats.gmean(v2))
            elif method == "mean":
                vtt = np.exp(-mean(v2))
            else:
                vtt = np.exp(-sqrt(mean(square(v2))))
        if met == "freq":
            if method == "gmean":
                vtt = stats.gmean(v2)
            elif method == "mean":
                vtt = mean(v2)
            else:
                vtt = sqrt(mean(square(v2)))
        TotalKeyf[q] = vtt
    return TotalKeyf
def geom_mean(a):
    """
    Compute the geometric mean for an "arbitrary" data set, ie one that
    contains zeros and negative numbers.

    Parameters
    ----------
    a : array-like
        A numpy.ndarray, or something that can be converted to an ndarray

    Returns
    -------
    The geometric mean of the input array

    Notes
    -----
    The traditional geometric mean can not be computed on a mixture of
    positive and negative numbers.  The approach here, validated rigorously
    in the cited paper [1], is to compute the geometric mean of the absolute
    value of the negative numbers separately, and then take a weighted
    arithmetic mean of that and the geometric mean of the positive numbers.
    We're going to discard 0 values, operating under the assumption that in
    this context there are going to be few or no observations with a value
    of exactly 0.

    References
    ----------
    [1] Geometric mean for negative and zero values
        Elsayed A. E. Habib
        International Journal of Research and Reviews in Applied Sciences 11:419 (2012)
        http://www.arpapress.com/Volumes/Vol11Issue3/IJRRAS_11_3_08.pdf

        A new "Logicle" display method avoids deceptive effects of logarithmic
        scaling for low signals and compensated data.
        Parks DR, Roederer M, Moore WA.
        Cytometry A. 2006 Jun;69(6):541-51. PMID: 16604519
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.20258/full
    """
    a = np.array(a)
    pos = a[a > 0]
    pos_mean = stats.gmean(pos)
    pos_prop = pos.size / a.size

    neg = a[a < 0]
    neg = np.abs(neg)
    neg_mean = stats.gmean(neg) if neg.size > 0 else 0
    neg_prop = neg.size / a.size

    return (pos_mean * pos_prop) - (neg_mean * neg_prop)
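# A minimal usage sketch (not from the source) for the mixed-sign geometric mean above,
# assuming Python 3 with numpy and scipy available; the data values are illustrative only.
import numpy as np
from scipy import stats

a = np.array([4.0, 9.0, -2.0, -8.0])  # illustrative mixed-sign data
pos, neg = a[a > 0], np.abs(a[a < 0])
# Weighted combination described in the docstring: gmean of positives minus gmean of
# absolute negatives, each weighted by its share of the observations.
expected = stats.gmean(pos) * pos.size / a.size - stats.gmean(neg) * neg.size / a.size
print(expected)  # 6.0 * 0.5 - 4.0 * 0.5 = 1.0, which geom_mean(a) should reproduce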
def get_extreme_bounding_box_random_sample(G, choice_set_config, time_dependent_relation, trip_times, trip_data):
    config = choice_set_config
    inverse_relation = get_inverse_time_dependent_relation(time_dependent_relation)
    id_sample = trip_data.keys()
    if config['bounding_box_sample_size'] < len(trip_data):
        id_sample = random.sample(id_sample, config['bounding_box_sample_size'])

    all_bounds = []
    cur_bound = {}
    vars = {}
    for id in id_sample:
        cur_bound = find_coef_bounding_box(G, trip_data[id][0], trip_data[id][-1],
                                           choice_set_config, time_dependent_relation, trip_times[id[0]])
        if choice_set_config['verbose']:
            print 'CUR_BOUND: ', cur_bound
        this_bound = {}
        for key in cur_bound:
            orig_key = key
            if key in inverse_relation:
                orig_key = inverse_relation[key]
            this_bound[orig_key] = cur_bound[key]
            vars[orig_key] = 1
        all_bounds.append(this_bound)

    ranges = {}
    for var in vars:
        low = []
        high = []
        for bound in all_bounds:
            if var in bound:
                low.append(bound[var][0])
                high.append(bound[var][1])
        low.sort()
        high.sort()
        if low:
            low = gmean(low)
            high = gmean(high)
            if low <= high:
                ranges[var] = (low, high)
            else:
                print 'WARNING: coefficient for ' + var + ' had inconsistent high/low. Try increasing sample size or percentile for bounding box determination.'
        else:
            print 'WARNING: coefficient for ' + var + ' had no range. Try increasing sample size or percentile for bounding box determination.'

    if choice_set_config['verbose']:
        print 'EXT_BOUND: ', ranges
    return ranges
def get_stats_on_dataset(dataset, prediction):
    # Get mean, median, stddev of each dataset
    last_tp = dataset.experiments[-1]

    # Mean of abs of entire dataset
    sum = 0
    count = 0
    for exp in dataset.experiments:
        for gene in dataset.gene_list:
            sum += abs(exp.ratios[gene])
            count += 1
    mean_dataset = sum / float(count)

    # Mean of prediction
    sum = 0
    count = 0
    for gene in dataset.gene_list:
        sum += abs(prediction[gene])
        count += 1
    mean_prediction = sum / float(count)

    # Mean of last tp
    sum = 0
    count = 0
    for gene in dataset.gene_list:
        sum += abs(last_tp.ratios[gene])
        count += 1
    mean_last_tp = sum / float(count)

    # Stddev of dataset
    dataset_list = []
    for exp in dataset.experiments:
        for gene in dataset.gene_list:
            dataset_list.append(exp.ratios[gene])
    std_dataset = numpy.std(dataset_list)
    std_prediction = numpy.std(prediction.values())
    std_last_tp = numpy.std(last_tp.ratios.values())

    median_dataset = numpy.median(dataset_list)
    median_prediction = numpy.median(prediction.values())
    median_last_tp = numpy.median(last_tp.ratios.values())

    gmean_dataset = stats.gmean(map(abs, dataset_list))
    gmean_prediction = stats.gmean(map(abs, prediction.values()))
    gmean_last_tp = stats.gmean(map(abs, last_tp.ratios.values()))
def compute_doc_state(self, doc):
    # fetch entity embedding for the set of candidates being considered
    candidates = set()
    for chain in doc.chains:
        for c in sorted(chain.candidates, key=lambda c: c.features[self.ranking_feature], reverse=True)[:self.rerank_depth]:
            candidates.add(c.id)
    candidate_embeddings = self.em.get_embeddings(candidates)

    distance_cache = {}

    def candidate_distance(a, b):
        if a < b:
            a, b = b, a
        key = (a, b)
        if key not in distance_cache:
            distance_cache[key] = self.distance(candidate_embeddings[a], candidate_embeddings[b])
        return distance_cache[key]

    # precompute the top candidates for each chain
    rc_by_chain = {}
    for chain in doc.chains:
        rc_by_chain[chain] = [c.id for c in sorted(chain.candidates, key=lambda c: c.features[self.ranking_feature], reverse=True)][:self.coherence_depth + 1]

    state = {}
    for c in candidates:
        dists = []
        for chain in doc.chains:
            top = [ci for ci in rc_by_chain[chain] if ci != c][:self.coherence_depth]
            if not top:
                continue
            dists.append(min(candidate_distance(c, tc) for tc in top if tc != c))
        if dists:
            state[c] = gmean(dists)
    return state
def combine_fr(frbc, fr1, fr2):  # combines all filter responses to find possible stair areas
    """with gmean, a 0 in one fr makes it all 0 -> raises RuntimeWarning"""
    import warnings
    warnings.filterwarnings("ignore")

    builder = []
    for row in xrange(frbc.shape[0]):  # all filter responses have the same shape
        # row = 0
        temprow = []
        for column in xrange(frbc.shape[1]):
            # take abs of the corresponding value of each fr and calculate the geometric mean
            temp = stats.gmean([abs(frbc[row, column]), abs(fr1[row, column]), abs(fr2[row, column])])  # this would raise a warning
            if temp > 100:  # threshold for binary
                temp = 0
            if temp > 0:
                temp = 1
            temprow.append(temp)
        builder.append(temprow)
    combi_fr = np.asanyarray(builder)  # returns binary array

    """show combined filter responses"""
    # plt.imshow(combi_fr.T,cmap='spectral',interpolation='none', origin='lower') # <-- this would show the actual orientation of the data
    # plt.imshow(combi_fr,interpolation='none')
    # plt.title('combined filter responses')
    # # plt.colorbar()
    # plt.show()
    return combi_fr
def create_simple_record(sequence):
    features = np.zeros(1, SimpleRecordType)
    features["mean"] = sequence.mean()
    features["var"] = sequence.var()
    features["skewness"] = stats.skew(sequence)
    features["kurtosis"] = stats.kurtosis(sequence)
    features["first"] = sequence[0]
    features["sign"] = np.sign(sequence).mean()
    features["zeros"] = (sequence == 0).mean()
    if features["zeros"] == 0.0:
        features["harmonic_mean"] = stats.hmean(abs(sequence))
        features["geometric_mean"] = stats.gmean(abs(sequence))
    else:
        features["harmonic_mean"] = np.nan
        features["geometric_mean"] = np.nan
    for m in [2, 3, 5]:
        seqm = sequence % m
        for v in range(m):
            features["val_%dmod%d" % (v, m)] = (seqm == v).mean()
    return features
def sfm(data):
    """ Spectral Flatness Measure """
    g = stats.gmean(data, dtype=numpy.float64) + 0.00001
    a = numpy.mean(data, dtype=numpy.float64)
    return 10 * log10(g / a)
def test_SED_error(I=1., e1=1, e2=10):
    """Compute the error one makes by using the simple formulas:
    e = sqrt(e1 * e2)
    f = I / (e2 - e1)
    e2f = e ** 2 * f
    to compute a differential flux f or e2f from an integral flux
    measurement I in an energy bin [e1, e2].
    Note that e is the log bin center and e2f is typically plotted
    in spectral energy distributions (SEDs).

    Index  SED-Error  Flux-Error
    1.5    1.28       0.85
    2.0    1.00       1.00
    2.5    0.85       1.28
    3.0    0.81       1.75
    """
    from scipy.stats import gmean
    e = gmean([e1, e2])
    f = I / (e2 - e1)
    e2f = e ** 2 * f  # @note: e ** 2 = e1 * e2 here.
    for Index in np.arange(1.5, 3.5, 0.5):
        f_correct = powerlaw.power_law_flux(I, Index, e, e1, e2)
        e2f_correct = e ** 2 * f_correct
        # We compute ratios, which corresponds to differences
        # on a log scale
        SED = e2f / e2f_correct
        Flux = f / f_correct
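# Quick check (not from the source) of the identity the docstring above relies on:
# the log bin center is the geometric mean of the bin edges, so e**2 == e1 * e2.
# Values are illustrative only.
import numpy as np
from scipy.stats import gmean

e1, e2 = 1.0, 10.0
e = gmean([e1, e2])                  # equals sqrt(e1 * e2)
assert np.isclose(e, np.sqrt(e1 * e2))
assert np.isclose(e ** 2, e1 * e2)   # hence e**2 * f uses e1*e2 directly
print(e)                             # ~3.1623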
def centralize(mat):
    r"""Center data around its geometric average.

    Parameters
    ----------
    mat : array_like, float
        a matrix of proportions where
        rows = compositions and
        columns = components

    Returns
    -------
    numpy.ndarray
        centered composition matrix

    Examples
    --------
    >>> import numpy as np
    >>> from skbio.stats.composition import centralize
    >>> X = np.array([[.1, .3, .4, .2], [.2, .2, .2, .4]])
    >>> centralize(X)
    array([[ 0.17445763,  0.30216948,  0.34891526,  0.17445763],
           [ 0.32495488,  0.18761279,  0.16247744,  0.32495488]])
    """
    mat = closure(mat)
    cen = ss.gmean(mat, axis=0)
    return perturb_inv(mat, cen)
def bind(self, *nodes, composition=None):
    if self.HIERARCHICAL:
        children = nodes
    else:
        children = self._concatenate_children(nodes)
    if composition is None:
        composition = self.COMPOSITION

    row_vecs = {}
    if composition:
        # gen_vec is the weighted average of all other blobs with
        # the same number of children.
        gen_vecs = {row: self.vector_model.zeros() for row in self.rows}
        comparable = (n for n in self.nodes if len(n.children) == len(children))
        for node in comparable:
            child_sims = [my_child.similarity(other_child)
                          for my_child, other_child in zip(children, node.children)]
            total_sim = stats.gmean(child_sims)
            for row, vec in gen_vecs.items():
                vec += vectors.normalize(node.row_vecs[row]) * total_sim
        row_vecs = {row: vec * composition for row, vec in gen_vecs.items()}
        assert not np.isnan(np.sum(list(row_vecs.values())))

    id_string = self._id_string(children)
    return VectorNode(self, id_string, children=children, row_vecs=row_vecs)
def calcMoneyReturn(list_rebound, n_rebound, n_output, moneyInit, commisionFee):
    n_list = len(list_rebound)
    if n_rebound == 0:  # when n_rebound is empty
        print('empty...')
        n_rebound = n_list
        n_output = 1
    data_output = np.zeros((n_output, 3))
    for k in range(0, n_output):
        # randomly choose rebounds
        # idx_reb = np.random.choice(range(n_list),n_rebound)
        if n_rebound < n_list:
            idx_reb = np.random.choice(range(n_list - n_rebound), 1)
        elif n_rebound == n_list:
            idx_reb = 0
        list_sel = list_rebound[idx_reb:idx_reb + n_rebound - 1]

        # Calc Geometric Mean and Stdev
        reb_gmean = (gmean(list_sel / 100 + 1, axis=0) - 1) * 100
        reb_stdev = np.std(list_sel, 0)

        # Calc Money Return
        t_price = moneyInit
        for k_sel in range(0, len(list_sel)):
            t_price = (1 + list_sel[k_sel] / 100) * t_price - commisionFee
        moneyReturn = t_price

        # gmean, stdev, money return
        data_output[k, :] = [reb_gmean, reb_stdev, moneyReturn]
    return data_output
def build(self):
    selection = self.periodicTable()
    row = self.ZMatModel.rowCount()
    self.addRow()
    self.ZMatModel.dataChanged.disconnect(self.clearUpdateView)

    newSymbol = selection[1]
    newData = [newSymbol]
    if len(self.highList) >= 1:
        newBond = round(2.1 * gmean([elements[e].covalent_radius
                                     for e in [selection[0], elems[self.highList[0][0]]]]), 4)
        newData.append(self.highList[0][0] + 1)
        newData.append(newBond)
    if len(self.highList) >= 2:
        newAngle = 109.4712
        newData.append(self.highList[1][0] + 1)
        newData.append(newAngle)
    if len(self.highList) == 3:
        newDihedral = 120.
        newData.append(self.highList[2][0] + 1)
        newData.append(newDihedral)

    for j, cell in enumerate(newData):
        item = QStandardItem(str(cell))
        self.ZMatModel.setItem(row, j, item)
    self.highList = []
    self.ZMatModel.dataChanged.connect(self.clearUpdateView)
    self.updateView()
def _compute_neighborhood_graph_weight(self, root, graph):
    # list all nodes at increasing distances;
    # at each distance:
    #   compute the arithmetic mean weight on nodes
    #   compute the geometric mean weight on edges
    #   compute the product of the two
    # make a list of the neighborhood_graph_weight at every distance
    neighborhood_graph_weight_list = []
    w = graph.node[root][self.key_weight]
    node_weight_list = np.array([w], dtype=np.float64)
    node_average = node_weight_list[0]
    edge_weight_list = np.array([1], dtype=np.float64)
    edge_average = edge_weight_list[0]
    # for all distances
    root_dist_dict = graph.node[root]['remote_neighbours']
    for distance, node_set in root_dist_dict.iteritems():
        # extract array of weights at given distance
        weight_array_at_d = np.array([graph.node[v][self.key_weight]
                                      for v in node_set], dtype=np.float64)
        if distance % 2 == 0:  # nodes
            node_weight_list = np.concatenate((node_weight_list, weight_array_at_d))
            node_average = np.mean(node_weight_list)
        else:  # edges
            edge_weight_list = np.concatenate((edge_weight_list, weight_array_at_d))
            edge_average = stats.gmean(edge_weight_list)
        weight = node_average * edge_average
        neighborhood_graph_weight_list.append(weight)
    graph.node[root]['neigh_graph_weight'] = neighborhood_graph_weight_list
def findTrueAverageTableAnchoring(frq, anch, list_taxa, method):
    n = len(set(list_taxa) - set(anch))
    anch = sorted(list(anch))
    lst_taxa = list(list_taxa.keys())
    TotalKey = dict()
    s = {1, 2, 3}
    for i in range(0, n):
        for j in range(i + 1, n):
            for k in range(j + 1, n):
                for z in range(k + 1, n):
                    for taxon_i in list_taxa[lst_taxa[i]]:
                        for taxon_j in list_taxa[lst_taxa[j]]:
                            for taxon_k in list_taxa[lst_taxa[k]]:
                                for taxon_z in list_taxa[lst_taxa[z]]:
                                    keyt = "/".join(sorted([taxon_i, taxon_j, taxon_k, taxon_z]))
                                    lab_taxon_i = taxon_i
                                    lab_taxon_j = taxon_j
                                    lab_taxon_k = taxon_k
                                    lab_taxon_z = taxon_z
                                    tmp_dict = dict()
                                    tmp_dict[lst_taxa[i]] = lab_taxon_i
                                    tmp_dict[lst_taxa[j]] = lab_taxon_j
                                    tmp_dict[lst_taxa[k]] = lab_taxon_k
                                    tmp_dict[lst_taxa[z]] = lab_taxon_z
                                    key_orig = "/".join(sorted([lab_taxon_i, lab_taxon_j, lab_taxon_k, lab_taxon_z]))
                                    l = sorted([lst_taxa[i], lst_taxa[j], lst_taxa[k], lst_taxa[z]])
                                    key_inv = "/".join(l)
                                    v = frq[key_orig]
                                    v_inv = dict()
                                    for q in range(1, 4):
                                        q1 = sorted([tmp_dict[l[0]], tmp_dict[l[q]]])
                                        stmp = list(s - {q})
                                        q2 = sorted([tmp_dict[l[stmp[0]]], tmp_dict[l[stmp[1]]]])
                                        if q1[0] < q2[0]:
                                            v_inv[l[q]] = v[q1[1]]
                                        else:
                                            v_inv[l[q]] = v[q2[1]]
                                    if key_inv in TotalKey:
                                        vt = TotalKey[key_inv]
                                        for keyt in vt.keys():
                                            vt[keyt].append(v_inv[keyt])
                                    else:
                                        vt = dict()
                                        for q in v_inv:
                                            vt[q] = list()
                                            vt[q].append(v_inv[q])
                                        TotalKey[key_inv] = vt
    TotalKeyf = dict()
    for q, v in TotalKey.iteritems():
        vtt = dict()
        for q2, v2 in v.iteritems():
            if method == "gmean":
                vtt[q2] = stats.gmean(v2)
            elif method == "mean":
                vtt[q2] = mean(v2)
            else:
                vtt[q2] = sqrt(mean(square(v2)))
        TotalKeyf[q] = vtt
    return TotalKeyf
def spectral_flatness(spectrum):
    """The spectral flatness is calculated by dividing the geometric mean
    of the power spectrum by the arithmetic mean of the power spectrum.

    I'm not sure if the spectrum should be squared first...
    """
    return gmean(spectrum) / mean(spectrum)
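# Illustrative sketch (not from the source) of how the spectral flatness above behaves:
# a flat spectrum gives 1.0, a peaky (tonal) spectrum gives a value close to 0.
import numpy as np
from scipy.stats import gmean

flat = np.ones(8)                     # white-noise-like power spectrum
peaky = np.array([1e-3] * 7 + [1.0])  # energy concentrated in one bin
print(gmean(flat) / np.mean(flat))    # 1.0 (maximally flat)
print(gmean(peaky) / np.mean(peaky))  # ~0.02 (far from flat)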
def gmean(self):
    """Returns the gmean of the models predictions.

    Returns
    -------
    `PipeApply`
    """
    return self.apply(lambda x: gmean(x, axis=0))
def test_2D_array_dim1(self):
    a = array(((1, 2, 3, 4), (1, 2, 3, 4), (1, 2, 3, 4)))
    actual = stats.gmean(a, axis=1)
    v = power(1 * 2 * 3 * 4, 1. / 4.)
    desired = array((v, v, v))
    assert_array_almost_equal(actual, desired, decimal=14)
def _compute_coverage(self, contig_ids):
    '''
    Computes the coverage of the transcript made up from contig_ids
    as a geometric mean of the coverage for each contig.
    '''
    from scipy.stats import gmean
    filter = {'assembly': self.asm, 'node_id__in': contig_ids}
    values = Stat.objects.filter(**filter).values_list('coverage', flat=True)
    return gmean(values)
def geom_std(values: t.List[float]) -> float:
    """
    Calculates the geometric standard deviation for the passed values.
    Source: https://en.wikipedia.org/wiki/Geometric_standard_deviation
    """
    import scipy.stats as stats
    import scipy as sp
    gmean = stats.gmean(values)
    return sp.exp(sp.sqrt(sp.sum([sp.log(x / gmean) ** 2 for x in values]) / len(values)))
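# Sketch (not from the source) contrasting the population-style geometric standard deviation
# computed by geom_std above with scipy.stats.gstd, which uses the sample form (ddof=1).
# Values are illustrative only.
import numpy as np
from scipy import stats

values = [1.0, 2.0, 4.0, 8.0]
gm = stats.gmean(values)
population_gsd = np.exp(np.sqrt(np.mean([np.log(x / gm) ** 2 for x in values])))
print(population_gsd)      # what geom_std(values) returns (~2.17)
print(stats.gstd(values))  # slightly larger (~2.45) because of the ddof=1 denominator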
def Fitness(Population):
    fitness_vec = np.zeros(len(Population))
    for i in range(len(Population)):
        Compute = Population[i]
        No_omega = Compute[1:]
        fitness_vec[i] = sps.gmean(No_omega)
    return fitness_vec
def calculate_features_for_VAD(sound_frames, frequencies_axis, spectrogram):
    features = numpy.empty((spectrogram.shape[0], 3))
    # smooted_spectrogram, smoothed_frequencies_axis = smooth_spectrogram(spectrogram, frequencies_axis, 24)
    for time_ind in range(spectrogram.shape[0]):
        mean_spectrum = spectrogram[time_ind].mean()
        if mean_spectrum > 0.0:
            sfm = -10.0 * math.log10(stats.gmean(spectrogram[time_ind]) / mean_spectrum)
        else:
            sfm = 0.0
        # max_freq = smoothed_frequencies_axis[smooted_spectrogram[time_ind].argmax()]
        max_freq = frequencies_axis[spectrogram[time_ind].argmax()]
        features[time_ind][0] = numpy.square(sound_frames[time_ind]).mean()
        features[time_ind][1] = sfm
        features[time_ind][2] = max_freq
    """medfilt_order = 3
    for feature_ind in range(features.shape[0]):
        features[feature_ind] = signal.medfilt(features[feature_ind], medfilt_order)"""
    return features
def _calc_overall_qual(label_qual, spatial_qual):
    """
    Calculate the overall quality for all detections on all ground truth objects for a given image.
    :param label_qual: g x d label quality score between zero and one for each possible
        combination of g ground truth objects and d detections.
    :param spatial_qual: g x d spatial quality score between zero and one for each possible
        combination of g ground truth objects and d detections.
    :return: overall_qual_mat: g x d overall quality score between zero and one for each possible
        combination of g ground truth objects and d detections.
    """
    combined_mat = np.dstack((label_qual, spatial_qual))

    # Calculate the geometric mean between label quality and spatial quality.
    # Note we ignore divide by zero warnings here for log(0) calculations internally.
    with np.errstate(divide='ignore'):
        overall_qual_mat = gmean(combined_mat, axis=2)
    return overall_qual_mat
def make_submite():
    clfs = train_xgboost()
    df = pd.read_csv('/home/kshitij/Desktop/Dataset/stage2_sample_submission.csv')
    x = get_tst()

    preds = []
    for clf in clfs:
        preds.append(np.clip(clf.predict(x), 0.001, 1))
    pred = gmean(np.array(preds), axis=0)
    print pred

    for i in range(len(pred)):
        df['cancer'] = pred
    df.to_csv('subm_xgb.csv', index=False)
    print(df.head())
def run_gemm(runs=3):
    results = []
    for usecase in ["train", "inference"]:
        for precision in ["float", "half", "int8"]:
            if usecase == "train" and precision == "int8":
                continue
            run_results = []
            for i in range(runs):
                print("RUNNING: gemm_bench {} {}. This is iteration {} of {}".format(
                    usecase, precision, i + 1, runs))
                prc = subprocess.Popen(
                    ["../DeepBench/code/bin/gemm_bench", usecase, precision],
                    stdout=subprocess.PIPE)
                out = prc.communicate()[0]
                run_results.append(extract_timings(out))
            results.append("gemm_bench {} {}: {}".format(
                usecase, precision, gmean(np.array(run_results).min(axis=0))))
    return results
def get_simulation(data):
    log_returns = np.log(1 + data.pct_change())
    u = log_returns.mean()
    var = log_returns.var()
    drift = u - (0.5 * var)
    stdev = log_returns.std()

    t_intervals = DAYS_IN_YEAR * YEARS
    iterations = 10
    daily_returns = np.exp(drift + stdev * norm.ppf(np.random.rand(t_intervals, iterations)))

    S0 = data.iloc[-1]
    price_list = np.zeros_like(daily_returns)
    price_list[0] = S0
    for t in range(1, t_intervals):
        price_list[t] = price_list[t - 1] * daily_returns[t]

    asset_returns = price_list[-1] / price_list[0]
    return gmean(asset_returns)
def mixed_latent_kernel_density_estimate(self, kernel='gaussian', **kwargs):
    d_real = self.hypercube_distance_real
    h_real = gmean(d_real.ravel())
    d_simp = self.sphere_distance_latent
    h_simp = d_simp.mean()
    if kernel == 'gaussian':
        s1 = np.exp(-(d_real / h_real) ** 2).mean(axis=(1, 2))
        s2 = np.exp(-(d_simp / h_simp) ** 2).mean(axis=(1, 2))
        return 1 / (s1 * s2)
    elif kernel == 'laplace':
        s1 = np.exp(-np.abs(d_real / h_real)).mean(axis=(1, 2))
        s2 = np.exp(-np.abs(d_simp / h_simp)).mean(axis=(1, 2))  # was assigned to s1 twice, leaving s2 undefined
        return 1 / (s1 * s2)
    else:
        raise ValueError('requested kernel not available')
def print_iris_statistics(data):
    # --------------------------------------------------- Create data frame <
    # Column labels (Polish): sepal length, sepal width, petal length, petal width
    df = pandas.DataFrame(
        columns=["Dł. d. k.", "Sz. d. k.", "Dł. pł.", "Sz. pł."])

    # ------------------------- Calculate different statistical information <
    # Row labels (Polish): minimum, maximum, range, first quartile, median, third quartile,
    # harmonic/geometric/arithmetic mean, power means of order 2 and 3, variance, std, kurtosis
    df.loc["Minimum [cm]", :] = [i for i in data.iloc[:, 0:4].min()]
    df.loc["Maksimum [cm]", :] = [i for i in data.iloc[:, 0:4].max()]
    df.loc["Rozstęp [cm]", :] = [i for i in df.loc["Maksimum [cm]"] - df.loc["Minimum [cm]"]]
    df.loc["Pierwszy kwartyl [cm]", :] = [
        quantile(data.iloc[:, i], 0.25) for i in range(4)]
    df.loc["Mediana [cm]", :] = [
        median(data.iloc[:, i]) for i in range(4)]
    df.loc["Trzeci kwartyl [cm]", :] = [
        quantile(data.iloc[:, i], 0.75) for i in range(4)]
    df.loc["Średnia harmoniczna [cm]", :] = stats.hmean(data.iloc[:, 0:4])
    df.loc["Średnia geometryczna [cm]", :] = stats.gmean(data.iloc[:, 0:4])
    df.loc["Średnia arytmetyczna [cm]", :] = [i for i in data.mean()]

    # Operator ** means power() method
    # The shape attribute for numpy arrays returns the dimensions of the array
    # If Y has n rows and m columns, then Y.shape is (n,m). So Y.shape[0] is n
    df.loc["Średnia potęgowa 2 rzędu [cm]", :] = [i for i in (
        ((data.iloc[:, 0:4] ** 2).sum() / data.shape[0]) ** (1 / 2))]
    df.loc["Średnia potęgowa 3 rzędu [cm]", :] = [i for i in (
        ((data.iloc[:, 0:4] ** 3).sum() / data.shape[0]) ** (1 / 3))]

    df.loc["Wariancja [cm^2]", :] = [i for i in data.var()]
    df.loc["Odchylenie standardowe [cm]", :] = [i for i in data.std()]

    # If True, Fisher's definition is used (normal ==> 0.0)
    # If False, Pearson's definition is used (normal ==> 3.0)
    df.loc["Kurtoza", :] = stats.kurtosis(data.iloc[:, 0:4], fisher=False)

    pandas.set_option('display.max_rows', 1000)
    pandas.set_option('display.max_columns', 1000)
    pandas.set_option('display.width', 1000)
    print(df.astype(float).round(1))
def _asianGeoFloat(S, m, r, T):
    '''
    Use simulated underlying to determine the price of an Asian Geometric
    Average Call / Put option with a floating strike price.

    Payoffs are of the form:
        C = max(S - m*AVG_geo, 0)
        P = max(m*AVG_geo - S, 0)

    Parameters
    ----------
    S : numpy.array
        Simulated stock price; the first row is the initial stock price, each
        subsequent row represents an additional time step, and each column is
        a simulated path of the asset
    m : number of any type (int, float8, float64 etc.)
        Strike value scaler of option, determined at initiation
    r : number of any type (int, float8, float64 etc.)
        Risk free interest rate, implied constant till expiration
    T : number of any type (int, float8, float64 etc.)
        Time till expiration for option

    Returns
    -------
    [[call, put], [callMotion, putMotion]] : list of a pair of lists, the first
    of floats, the second of one-dimensional numpy.array's
        First list is the call and put price, determined by the average
        of the simulated stock payoffs
        Second list is the call and put simulated paths payoffs at
        expiration, NOT discounted

    Notes
    -----
    The accuracy of pricing is dependent on the number of time steps and
    simulated paths chosen for the underlying stochastic motion
    '''
    avg = sctats.gmean(S, axis=0)
    callMotion = (S[-1] - m * avg).clip(0)
    putMotion = (m * avg - S[-1]).clip(0)

    call = np.exp(-r * T) * np.average(callMotion)
    put = np.exp(-r * T) * np.average(putMotion)
    return [[call, put], [callMotion, putMotion]]
def plot_counter_stat(csv, plot_format, stat_name, counter_numerator,
                      counter_denominator, scale):
    """
    Process the returned csv file into a time-series statistic to plot and
    also calculate some useful aggregate stats.
    """
    df = pd.read_csv(csv, sep='|', header=0,
                     names=['time', 'count', 'rsrvd1', 'event', 'rsrvd2',
                            'frac', 'rsrvd3', 'rsrvd4'],
                     dtype={'time': np.float64, 'count': np.float64,
                            'rsrvd1': str, 'event': str, 'rsrvd2': str,
                            'frac': np.float64, 'rsrvd3': str, 'rsrvd4': str})

    df_processed = pd.DataFrame()
    df_processed[stat_name] = (
        df[df['event'] == counter_numerator]['count'].reset_index(drop=True)
    ) / (df[df['event'] == counter_denominator]['count'].reset_index(drop=True)) * scale
    df_processed.dropna(inplace=True)

    # Calculate some meaningful aggregate stats for comparing time-series plots
    geomean = stats.gmean(df_processed[stat_name])
    p50 = stats.scoreatpercentile(df_processed[stat_name], 50)
    p90 = stats.scoreatpercentile(df_processed[stat_name], 90)
    p99 = stats.scoreatpercentile(df_processed[stat_name], 99)
    xtitle = f"gmean:{geomean:>6.2f} p50:{p50:>6.2f} p90:{p90:>6.2f} p99:{p99:>6.2f}"

    if plot_format == "terminal":
        plot_terminal(df_processed, stat_name, xtitle)
    elif plot_format == "matplotlib":
        plot_matplotlib(df_processed, stat_name, xtitle)
    else:
        print(f"Do not know how to plot {plot_format}")
def run_FLOCK(input_file, method, bins, density, output_file, mfi_file, mfi_calc, profile):
    # This version of the tool assumes FLOCK is installed.
    # Install FLOCK with:
    #     conda install flock
    run_command = [method, input_file]
    if bins:
        run_command.append(bins)
    if density:
        run_command.append(density)
    try:
        subprocess.call(" ".join(run_command), env=os.environ.copy(), shell=True)
        subprocess.call(" ".join(['mv', 'flock_results.txt', output_file]),
                        env=os.environ.copy(), shell=True)
    except:
        sys.stderr.write("Could not run FLOCK\n")
        sys.exit(2)

    # Here add some way to calculate the count and tack it on to the profile file.
    flockdf = pd.read_table(output_file)
    if mfi_calc == "mfi":
        MFIs = flockdf.groupby('Population').mean().round(decimals=2)
    elif mfi_calc == "gmfi":
        MFIs = flockdf.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2)
    else:
        MFIs = flockdf.groupby('Population').median().round(decimals=2)
    with open(mfi_file, "w") as outf:
        MFIs.to_csv(outf, sep="\t", float_format='%.0f')

    (events, columns) = flockdf.shape
    fstats = {}
    fstats['population'] = flockdf.iloc[:, -1:].iloc[:, 0]
    fstats['population_freq'] = fstats['population'].value_counts()
    fstats['population_freq_sort'] = fstats['population_freq'].sort_index()
    fstats['population_per'] = (fstats['population'].value_counts(normalize=True) * 100).round(decimals=2)
    fstats['population_per_sort'] = fstats['population_per'].sort_index()
    fstats['population_all'] = pd.concat([fstats['population_freq_sort'],
                                          fstats['population_per_sort']], axis=1)
    fstats['population_all'].columns = ['Count', 'Percentage']
    fstats['population_all']['Population_ID'] = fstats['population_all'].index

    flock_profile = pd.read_table('profile.txt')
    profile_pop = flock_profile.merge(fstats['population_all'], on='Population_ID')
    profile_pop.to_csv(profile, sep="\t", float_format='%.2f', index=False)
    return
def multioutput_fscore(y_true, y_pred, beta=1):
    """
    Geometric mean of the fbeta_score, computed on each label.
    The aim is to avoid issues when dealing with imbalanced cases.

    Can be used as scorer for GridSearchCV:
        scorer = make_scorer(multioutput_fscore, beta=1)

    Parameters
    ----------
    y_true: lst
        List of labels
    y_pred: lst
        List of predictions
    beta: float
        Beta value to be used to calculate the fscore metric

    Returns
    -------
    f1score: float
        Geometric mean of the fscore
    """
    # If the provided y predictions is a dataframe then extract the values
    if isinstance(y_pred, pd.DataFrame) == True:
        y_pred = y_pred.values
    # If the provided y actuals is a dataframe then extract the values
    if isinstance(y_true, pd.DataFrame) == True:
        y_true = y_true.values

    f1score_list = []
    for column in range(0, y_true.shape[1]):
        score = fbeta_score(y_true[:, column], y_pred[:, column], beta, average='weighted')
        f1score_list.append(score)

    f1score = np.asarray(f1score_list)
    f1score = f1score[f1score < 1]

    # Get the geometric mean of f1score
    f1score = gmean(f1score)
    return f1score
def __call__(self, data):
    conds = self.condition_layout
    blocks = self.block_layout

    # Build two new layouts. c0 is a list of lists of indexes into
    # the data that represent condition 0 for each block. c1 is
    # the same for data that represent condition 1 for each block.
    c0_blocks = intersect_layouts(blocks, [conds[0]])
    c1_blocks = intersect_layouts(blocks, [conds[1]])

    # Get the mean for each block for both conditions.
    means0 = group_means(data, c0_blocks)
    means1 = group_means(data, c1_blocks)

    # If we have tuning params, add another dimension to the front
    # of each ndarray to vary the tuning param.
    if self.alphas is not None:
        shape = (len(self.alphas),) + np.shape(means0)
        old0 = means0
        old1 = means1
        means0 = np.zeros(shape)
        means1 = np.zeros(shape)
        for i, a in enumerate(self.alphas):
            means0[i] = old0 + a
            means1[i] = old1 + a

    means0 /= means1
    ratio = means0

    # If we have more than one block, we combine their ratios
    # using the geometric mean.
    ratio = gmean(ratio, axis=-1)

    # 'Symmetric' means that the order of the conditions does not
    # matter, so we should always return a ratio >= 1. So for any
    # ratios that are < 1, use the inverse.
    if self.symmetric:
        # Add another dimension to the front holding the ratio and its
        # inverse, then select the max across that dimension.
        ratio_and_inverse = np.array([ratio, 1.0 / ratio])
        ratio = np.max(ratio_and_inverse, axis=0)

    return ratio
def get_score(keywords, title):
    title = nlp(title)
    if len(keywords) == 0 or len(title) == 0:
        return 0
    scores = []
    for word1 in keywords:
        indexes = []
        for word2 in title:
            indexes.append(nlp(word1).similarity(word2))
        scores.append(max(indexes))
    finallist = []
    for max_score in scores:
        if max_score <= 0:
            finallist.append(0.01)
        else:
            finallist.append(max_score)
    mean = stats.gmean(finallist)
    return mean
def autometa_clr(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize k-mers by Centered Log Ratio transformation

    Steps
    -----
    * Drop any k-mers not present for all contigs
    * Drop any contigs not containing any kmer counts
    * Fill any remaining na values with 0
    * Normalize the k-mer count by the total count of all k-mers for a given contig
    * Add 1 as 0 can not be utilized for CLR
    * Perform CLR transformation log(norm. value / geometric mean norm. value)

    Parameters
    ----------
    df : pd.DataFrame
        K-mers Dataframe where index_col='contig' and column values are k-mer frequencies.

    References
    ----------
    * Aitchison, J. The Statistical Analysis of Compositional Data (1986)
    * Pawlowsky-Glahn, Egozcue, Tolosana-Delgado. Lecture Notes on Compositional Data Analysis (2011)
    * Why ILR is preferred `stats stackexchange discussion <https://stats.stackexchange.com/questions/242445/why-is-isometric-log-ratio-transformation-preferred-over-the-additivealr-or-ce>`_
    * Use of CLR transformation prior to PCA `stats stackexchange discussion <https://stats.stackexchange.com/questions/305965/can-i-use-the-clr-centered-log-ratio-transformation-to-prepare-data-for-pca>`_
    * Lecture notes on Compositional Data Analysis (CoDa) `PDF <http://www.sediment.uni-goettingen.de/staff/tolosana/extra/CoDa.pdf>`_

    Returns
    -------
    pd.DataFrame
        index='contig', cols=[kmer, kmer, ...]
        Columns have been transformed by CLR normalization.
    """
    # step 1: data cleaning
    df = df.dropna(axis="columns", how="all").dropna(axis="index", how="all").fillna(0)
    # steps 2 and 3: normalization and CLR transformation
    step_2a = lambda x: (x + 1) / x.sum()
    step_2b = lambda x: np.log(x / gmean(x))
    return df.transform(step_2a, axis="columns").transform(step_2b, axis="columns")
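# Toy usage sketch (not from the source) for the CLR transform above, assuming autometa_clr
# is in scope with numpy, pandas and scipy's gmean imported at module level.
# The contig/k-mer names and counts are made up for illustration.
import numpy as np
import pandas as pd

counts = pd.DataFrame(
    {"AAAA": [10, 0], "AAAT": [5, 3], "AATT": [0, 7]},
    index=["contig_1", "contig_2"],
)
clr = autometa_clr(counts)
# A defining property of CLR-transformed compositions: each row sums to (approximately) zero.
print(np.allclose(clr.sum(axis=1), 0))  # True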
def get_cgm_stats(self, start_date, end_date):
    """
    Compute cgm stats with dates

    Args:
        start_date (dt.DateTime): start date
        end_date (dt.DateTime): end date

    Returns:
        (float, float): geo mean and std
    """
    cgm_values = []
    for time, cgm_event in self.glucose_timeline.items():
        if start_date <= time <= end_date:
            cgm_value = cgm_event.get_value()
            cgm_values.append(cgm_value)
    return gmean(cgm_values), gstd(cgm_values)
def get_fft_stats(z):
    avg = np.average(z)
    std = np.std(z)
    median = np.median(z)
    var = np.var(z)
    kurt = stats.kurtosis(z)
    hmean = stats.hmean(z)
    gmean = stats.gmean(z)
    skew = stats.skew(z)
    median_dev_abs = np.sum(np.abs(z - median))
    std_dev_abs = np.sum(np.abs(z - std))
    stats_array = [avg, std, median, var, kurt, hmean, gmean, skew,
                   median_dev_abs, std_dev_abs]
    return stats_array
def multioutput_fscore(y_true, y_pred, beta=1):
    """
    MultiOutput Fscore

    This is a performance metric of my own creation. It is a sort of geometric
    mean of the fbeta_score, computed on each label.

    It is compatible with multi-label and multi-class problems. It features
    some peculiarities (geometric mean, 100% removal...) to exclude trivial
    solutions and deliberately under-estimate a standard fbeta_score average.
    The aim is to avoid issues when dealing with multi-class/multi-label
    imbalanced cases.

    Arguments:
        y_true -> List of labels
        y_pred -> List of predictions
        beta -> Beta value to be used to calculate the fscore metric

    Output:
        f1score -> Geometric mean of the fscore
    """
    # If the provided y predictions is a dataframe then extract the values
    if isinstance(y_pred, pd.DataFrame) == True:
        y_pred = y_pred.values
    # If the provided y actuals is a dataframe then extract the values
    if isinstance(y_true, pd.DataFrame) == True:
        y_true = y_true.values

    f1score_list = []
    for column in range(0, y_true.shape[1]):
        score = fbeta_score(y_true[:, column], y_pred[:, column], beta, average='weighted')
        f1score_list.append(score)

    f1score = np.asarray(f1score_list)
    f1score = f1score[f1score < 1]

    # Get the geometric mean of f1score
    f1score = gmean(f1score)
    return f1score
def update_site(site):
    infoDf = pd.DataFrame()

    # Filter by site
    siteFulldf = dfFull[dfFull['Site'] == site]
    siteDf = df[df['Site'] == site]

    # Calculate correlation
    siteCorDf = siteDf.corr()
    eCorDf = siteCorDf[siteCorDf['EnteroCount'] != 1]
    eCorVal = eCorDf['EnteroCount']
    eCorVal = float(eCorVal)

    # Get geometric mean
    gMean = stats.gmean(siteDf.loc[:, "EnteroCount"])

    # Avg rainfall at the site
    avgRain = siteDf['FourDayRainTotal'].mean()

    # Last time a sample was collected
    lastSample = max(pd.to_datetime(siteFulldf.Date))

    data = {
        'Site': site,
        'Correlation Coefficient': round(eCorVal, 3),
        'Geo.Mean Enterococcus Count': round(gMean, 3),
        'Avg. Rainfall': round(avgRain, 3),
        'Last Sample Date': lastSample.strftime('%m/%d/%Y')
    }
    infoDf = infoDf.append(data, ignore_index=True)

    if len(infoDf) == 0:
        data = {
            'Site': '',
            'Correlation Coefficient': '',
            'Geo.Mean Enterococcus Count': '',
            'Avg. Rainfall': '',
            'Last Sample Date': ''
        }
        infoDf = infoDf.append(data, ignore_index=True)

    cols = ['Site', 'Correlation Coefficient', 'Geo.Mean Enterococcus Count',
            'Avg. Rainfall', 'Last Sample Date']
    infoDf = infoDf[cols]
    return infoDf.to_dict('records')
def make_submission():
    clfs = train_xgboost()
    df = pd.read_csv('/home/kshitij/Desktop/Dataset/stage2_sample_submission.csv')
    x = np.array([
        np.mean(np.load('/home/kshitij/Desktop/Dataset/stage2_features/%s.npy' % str(did)).reshape(-1, 2048), axis=0)
        for did in df['id'].tolist()
    ])

    preds = []
    for clf in clfs:
        preds.append(np.clip(clf.predict(x), 0.001, 1))
    pred = gmean(np.array(preds), axis=0)

    df['cancer'] = pred
    df.to_csv('submGBR2.csv', index=False)
def agregate():
    y_tru = pd.read_csv('data/test.lst', sep='\t', header=None, names=['0', 'y', 'fns'])['y']
    all = []
    for i in range(0, len(y_tru), 5000):
        fn = 'tmp/320_66_%d.npy' % i
        print(fn)
        prd = gmean(np.load(fn), axis=0)  # uncommented: prd must be assigned before it is used below
        print(prd.shape)
        all.append(prd)
    all = np.vstack(all)
    print(all.shape)
    np.save('tmp/f_avg', all)
def looking_for_optimal_f(self):
    x = np.linspace(0.01, 1.0, 199)
    f_geomeans = list()
    for f in x:
        df = self.generate_rollouts(f=f, verbose=0, plot=False)
        gm_df = df.pct_change() + 1
        gm_df = gm_df.fillna(1)
        gmean = stats.gmean(gm_df, axis=0)
        gmean = np.nan_to_num(gmean)
        geomean = np.mean(gmean)
        f_geomeans.append((f, geomean))

    df = pd.DataFrame(f_geomeans)
    df = df.set_index([0])
    print(df)
    # df.plot(x=df[0], y=df[1], kind="scatter", grid=True, legend=True)
    df.plot(grid=True, legend=True)
    opt_f = df.idxmax(axis=0)
    print(f"Optimal f is {float(opt_f)}")
    plt.show()
def _geometric(vals, weights=None):
    """
    Compute the geometric average of the elements of ``vals``.

    Parameters
    ----------
    vals: np.ndarray
        An array of values, typically representing link ratios from a single
        development period.

    weights: np.ndarray
        Not yet implemented.

    Returns
    -------
    float
    """
    arr = np.asarray(vals, dtype=np.float)
    return np.NaN if arr.size == 0 else stats.gmean(arr)
def check():
    y_tru = pd.read_csv('data/test.lst', sep='\t', header=None, names=['0', 'y', 'fns'])['y']
    all = []
    # for fn in glob.glob('tmp/r*npy'):
    for fn in ['tmp/rnx101_val_33_0.npy', 'tmp/rnx101t_val_44_0.npy',
               'tmp/rn152k_val_26_0.npy', 'tmp/rnx101t_r_val_44_0.npy']:
        prd = np.load(fn)
        all.append(prd)
        print(accuracy_score(y_tru, np.argmax(prd, axis=1)), fn)
    all = gmean(np.array(all), axis=0)
    print(accuracy_score(y_tru, np.argmax(all, axis=1)), fn)
def easy_step(self, action):
    self.counter += 1
    self.update_env_r_from_r(action)

    # write to file input for OMNeT: Routing
    vector_to_file(matrix_to_omnet_v(self.env_r), self.folder + OM_ROUTING, 'w')
    # verify file position and format (separator, matrix/vector)
    np.savetxt('tmp.txt', routing, fmt='%d')

    # execute OMNeT
    omnet_wrapper(self)

    # read OMNeT's output: Delay and Lost packets
    om_output = file_to_csv(self.folder + OM_DELAY)
    self.update_env_d(csv_to_matrix(om_output, self.active_nodes))
    self.update_env_l(csv_to_lost(om_output))

    reward = rl_reward(self)

    # log everything to file
    vector_to_file([-reward], self.folder + REWARD_LOG, 'a')
    # s = rl_state(self)
    log = np.concatenate(([self.counter], [self.env_l],
                          [np.mean(matrix_to_rl(self.env_d))],
                          [np.max(matrix_to_rl(self.env_d))],
                          [(np.mean(matrix_to_rl(self.env_d)) + np.max(matrix_to_rl(self.env_d))) / 2],
                          [stats.gmean(matrix_to_rl(self.env_d))]))
    vector_to_file(log, self.folder + WHOLE_LOG, 'a')

    # generate traffic for next iteration
    self.update_env_t(self.t_gen.generate())
    # write to file input for OMNeT: Traffic, or do nothing if static
    if self.traffic.split(':')[0] not in ('stat', 'stat_eq', 'file', 'dir'):
        vector_to_file(matrix_to_omnet_v(self.env_t), self.folder + OM_TRAFFIC, 'w')

    new_state = rl_state(self)
    return new_state, reward, 0
def blockreduce_pyramid(input_arr, block_size=(2, 2, 2), func=np.max, max_iters=12):
    """
    Parameters
    ----------
    input_arr: np.array
        Input array to iteratively downsample
    block_size: Tuple(int)
        Block size for iterative array reduction.  All voxels in this block
        are merged via func into one voxel during the downsample.
        Default: (2, 2, 2)
    func: Callable[[np.array], float]
        Function to apply to block_size voxels to merge them into one new voxel.
        Default: np.max
    max_iters: int
        Maximum number of downsampling rounds before ending at a one voxel cell.
        Default: 12

    Returns
    -------
    result: Dict[float, np.array]
        Dictionary of reduced arrays.
        Keys are reduction fold, values the reduced array.
    """
    # how much are we downsampling per round
    fold = gmean(block_size)

    # original image
    i = 0
    pyramid = {fold ** i: input_arr.copy()}

    # downsample and save to dict
    i = 1
    while (i <= max_iters) and (np.max(pyramid[fold ** (i - 1)].shape) > 1):
        pyramid[fold ** i] = block_reduce(pyramid[fold ** (i - 1)], block_size, func)
        i += 1

    return pyramid
def read(self):
    """Reads the borehole information data from an Excel file"""
    if self.filename:
        if Path(self.filename).suffix not in (".xlsx", ".xls"):
            logger.error(f"{self.filename} is not an Excel file")
            raise AssertionError(f"{self.filename} is not an Excel file")
        if config.borehole_processing.anisotropic_borehole_data:
            logger.info('Processing the file as containing anisotropic permeability values')
            self.data = pd.read_excel(self.filename, header=3, sheet_name=None)
            for borehole in self.data:
                self.data[borehole] = self.data[borehole].iloc[1:]
                self.data[borehole].rename(columns={
                    'Permeability (m^2)': 'kx',
                    'Unnamed: 3': 'ky',
                    'Unnamed: 4': 'kz',
                }, inplace=True)
                self.data[borehole]['kmean'] = stats.gmean(
                    np.array([
                        self.data[borehole]['kx'],
                        self.data[borehole]['ky'],
                        self.data[borehole]['kz'],
                    ]).astype(np.float))
        else:
            self.data = pd.read_excel(self.filename, header=3, sheet_name=None)

        _excel_file: Dict[str, pd.DataFrame] = pd.read_excel(self.filename, sheet_name=None)
        for borehole in _excel_file:
            self.boreholes_info[borehole] = {
                "x": _excel_file[borehole].iloc[0, 1],
                "y": _excel_file[borehole].iloc[0, 2],
                "z": _excel_file[borehole].iloc[0, 3],
            }
def cbf_qei(gm, wm, csf, img, thresh=0.7):
    def fun1(x, xdata):
        d1 = np.exp(-(x[0]) * np.power(xdata, x[1]))
        return d1

    def fun2(x, xdata):
        d1 = 1 - np.exp(-(x[0]) * np.power(xdata, x[1]))
        return d1

    x1 = [0.054, 0.9272]
    x2 = [2.8478, 0.5196]
    x4 = [3.0126, 2.4419]

    scbf = smooth_image(nb.load(img), fwhm=5).get_fdata()
    if len(scbf.shape) > 3:
        scbf = scbf[:, :, :, 0]

    # load prob maps
    gmm = nb.load(gm).get_fdata()
    wmm = nb.load(wm).get_fdata()
    ccf = nb.load(csf).get_fdata()
    if len(gmm.shape) > 3:
        gmm = gmm[:, :, :, 0]
        wmm = wmm[:, :, :, 0]
        ccf = ccf[:, :, :, 0]

    pbcf = 2.5 * gmm + wmm  # gmm is 2.5 times wm
    msk = np.array((scbf != 0) & (scbf != np.nan) & (pbcf != np.nan)).astype(int)

    gm1 = np.array(gmm > thresh)
    wm1 = np.array(wmm > thresh)
    cc1 = np.array(ccf > thresh)
    r1 = np.array([0, np.corrcoef(scbf[msk == 1], pbcf[msk == 1])[1, 0]]).max()

    V = ((np.sum(gm1) - 1) * np.var(scbf[gm1 > 0])
         + (np.sum(wm1) - 1) * np.var(scbf[wm1 > 0])
         + (np.sum(cc1) - 1) * np.var(scbf[cc1 > 0])) / (
            np.sum(gm1 > 0) + np.sum(wm1 > 0) + np.sum(cc1 > 0) - 3)

    negGM = np.sum(scbf[gm1] < 0) / (np.sum(gm1))
    GMCBF = np.mean(scbf[gm1])
    CV = V / np.abs(GMCBF)
    Q = [fun1(x1, CV), fun1(x2, negGM), fun2(x4, r1)]
    return gmean(Q)
def summary_angular_errors(errors):
    errors = sorted(errors)

    def g(f):
        return np.percentile(errors, f * 100)

    median = g(0.5)
    mean = np.mean(errors)
    gm = gmean(errors)
    trimean = 0.25 * (g(0.25) + 2 * g(0.5) + g(0.75))
    results = {
        '25': np.mean(errors[:int(0.25 * len(errors))]),
        '75': np.mean(errors[int(0.75 * len(errors)):]),
        '95': g(0.95),
        'tri': trimean,
        'med': median,
        'mean': mean,
        'gm': gm
    }
    return results
def main() -> None:
    throughputs_hase = []
    throughputs_original = []
    for i in range(args.n):
        with open(f"{args.outdir}/{args.name}_{i}.out") as file:
            benchmarks, throughput = parse(file)
            throughputs_original.append(throughput)
        with open(f"{args.outdir}/{args.name}_hase_{i}.out") as file:
            benchmarks, throughput = parse(file)
            throughputs_hase.append(throughput)
    throughputs_hase = np.array(throughputs_hase)
    throughputs_original = np.array(throughputs_original)
    ratios = aggregate(throughputs_hase) / aggregate(throughputs_original)
    for i in range(len(benchmarks)):
        print(f"{benchmarks[i]}\t{ratios[i]:.4f}")
    print("GeoMean\t" + str(gmean(ratios)))
def geometric_mean_group_auc(y_true, p_pred):
    A = y_true.index.values
    unique_A = np.unique(A)
    if type(y_true) == pd.DataFrame:
        _y_true = y_true.values.flatten()
    y_true_, y_pred_ = [], []
    for a in unique_A:
        y_true_.append(_y_true[A == a])
        y_pred_.append(p_pred[A == a])
    aucs = []
    for yt, yp in zip(y_true_, y_pred_):
        aucs.append(roc_auc_score(yt, yp))
    try:
        val = gmean(np.array(aucs))
    except:
        val = -1
    return val