Example no. 1
def anchoredDistanceFromFrqSmallPolytomies(quartTable,method,met):
    # collapse each list of quartet values with gmean / mean / RMS,
    # either on the -log scale (met == "log") or directly (met == "freq")
    frq = dict()
    for key,val in quartTable.iteritems():
        for key2,val2 in val.iteritems():
            if met == "log":
                if method == "gmean":
                    vtt = np.exp(-stats.gmean(val2))
                elif method == "mean":
                    vtt = np.exp(-mean(val2))
                else:
                    vtt = np.exp(-sqrt(mean(square(val2))))
            elif met == "freq":
                if method == "gmean":
                    vtt = (stats.gmean(val2))
                elif method == "mean":
                    vtt = (mean(val2))
                else:
                    vtt = (sqrt(mean(square(val2))))
            frq.setdefault(key, dict())[key2] = vtt
    return frq
Example no. 2
File: drp.py Project: nihaoCC/DRP
def _calculate(compound):

    num = DRP.models.NumMolDescriptorValue

    if any(element in inorgElements for element in compound.elements):
        delete_descriptors([compound])
        vals_to_create = []
        inorgElementNormalisationFactor = sum(info['stoichiometry'] for element, info in compound.elements.items() if element in inorgElements)
        for prop in inorgAtomicProperties:
            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_geom_unw'.format(prop.title().replace('_', ''))],
                value=gmean([inorgElements[element][prop] for element in compound.elements if element in inorgElements])))

            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_geom_stoich'.format(prop.title().replace('_', ''))],
                value=gmean([inorgElements[element][prop]*(info['stoichiometry']/inorgElementNormalisationFactor) for element, info in compound.elements.items() if element in inorgElements])))

            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_max'.format(prop.title().replace('_', ''))],
                value=max(inorgElements[element][prop] for element in compound.elements if element in inorgElements)))

            vals_to_create.append(num(
                compound=compound,
                descriptor=descriptorDict['drpInorgAtom{}_range'.format(prop.title().replace('_', ''))],
                value=max(inorgElements[element][prop] for element in compound.elements if element in inorgElements) - min(inorgElements[element][prop] for element in compound.elements if element in inorgElements)))
        num.objects.bulk_create(vals_to_create)
Example no. 3
def get_extreme_bounding_box_multithreaded(G,choice_set_config,time_dependent_relation,trip_times,trip_data,n_threads=2):
	
	config=choice_set_config
	inverse_relation=get_inverse_time_dependent_relation(time_dependent_relation)
	
	id_sample=trip_data.keys()
	if config['bounding_box_sample_size']<len(trip_data):
		id_sample=random.sample(id_sample,config['bounding_box_sample_size'])
		
	work_queue = Queue()
	done_queue_range = Queue()
	done_queue_vars=Queue()
	processes = []
		
	for trip_id in id_sample:
		work_queue.put(trip_id)

	for w in xrange(n_threads):
		p = Process(target=bounding_box_worker, args=(work_queue, done_queue_range,done_queue_vars,G,choice_set_config,time_dependent_relation,trip_data,trip_times))
		p.start()
		processes.append(p)
		work_queue.put('STOP')

	all_bounds=[]
	vars={}
	for i in range(n_threads):
		all_bounds = all_bounds + list(iter(done_queue_range.get,'STOP'))
		this_vars= list(iter(done_queue_vars.get,'STOP'))
		vars=dict(vars,**this_vars[0])
		
	for p in processes:
		p.join()
	
	ranges={}
	for var in vars:
		low=[]
		high=[]
		
		for bound in all_bounds:
			if var in bound:
				low.append(bound[var][0])
				high.append(bound[var][1])
		low.sort()
		high.sort()
			
		if low:
			low=gmean(low)
			high=gmean(high)
			if low<=high:
				ranges[var]=(low,high)
			else:
				print 'WARNING: coefficient for ' + var + ' had inconsistent high/low.  Try increasing sample size for bounding box determination.'
		else:
			print 'WARNING: coefficient for ' + var + ' had no range.  Try increasing sample size for bounding box determination.'
				
	if choice_set_config['verbose']:
		print 'EXT_BOUND: ', ranges
				
	return ranges
Example no. 4
    def test_1D_list(self):
        a = (1,2,3,4)
        actual= stats.gmean(a)
        desired = power(1*2*3*4,1./4.)
        assert_almost_equal(actual, desired,decimal=14)

        desired1 = stats.gmean(a,axis=-1)
        assert_almost_equal(actual, desired1, decimal=14)
Example no. 5
    def test_1D_array(self):
        a = array((1,2,3,4), float32)
        actual= stats.gmean(a)
        desired = power(1*2*3*4,1./4.)
        assert_almost_equal(actual, desired, decimal=7)

        desired1 = stats.gmean(a,axis=-1)
        assert_almost_equal(actual, desired1, decimal=7)
Example no. 6
    def test_2D_array_default(self):
        a = array(((1,2,3,4),
                   (1,2,3,4),
                   (1,2,3,4)))
        actual= stats.gmean(a)
        desired = array((1,2,3,4))
        assert_array_almost_equal(actual, desired, decimal=14)

        desired1 = stats.gmean(a,axis=0)
        assert_array_almost_equal(actual, desired1, decimal=14)
Example no. 7
    def test_gmean(self):
        for n in self.get_n():
            x, y, xm, ym = self.generate_xy_sample(n)
            r = stats.gmean(abs(x))
            rm = stats.mstats.gmean(abs(xm))
            assert_equal(r, rm)

            r = stats.gmean(abs(y))
            rm = stats.mstats.gmean(abs(ym))
            assert_equal(r, rm)
Example no. 8
    def test_gmean(self):
        for n in self.get_n():
            x, y, xm, ym = self.generate_xy_sample(n)
            r = stats.gmean(abs(x))
            rm = stats.mstats.gmean(abs(xm))
            assert_allclose(r, rm, rtol=1e-13)

            r = stats.gmean(abs(y))
            rm = stats.mstats.gmean(abs(ym))
            assert_allclose(r, rm, rtol=1e-13)
Example no. 9
def groupCovForPosRep(files,totalCounts,up,down,calType='total'):
    """
    This function groups the replicates of samples' coverage around TSS and TSE sites.

    * files: list. A list of replicate files.
    * totalCounts: list. A list of ints with the same length as files; stores the total mapped reads of each file.
    * up: int. Number of positions upstream of the site.
    * down: int. Number of positions downstream of the site.
    * calType: str. Calculation type: 'total', 'mean', 'median' or 'geoMean'.

    Returns a dataframe with two columns: ['mean_value','standard deviation'].
    """
    res_df = pd.DataFrame()
    mean = pd.DataFrame()
    std = pd.DataFrame()
    for f,total in zip(files,totalCounts):
        df = pd.read_csv(f,sep='\t',header=0,index_col=0,low_memory=False)
        # remove the antibody, signal peptide and non signal peptide are separate
        try:
            #df = df.drop('heavychain');df=df.drop('lightchain');df=df.drop('NeoRKanR')
            #df = df.loc[['heavychain','lightchain']]
            df = df.loc[['lightchain'],:]
        except:
            pass
        # filter by median of gene
        df['median'] = df.median(axis=1)
        #df = df[df['median']>=0.5]
        del df['median']
        up_df = df.iloc[:,0:up]
        down_df = df.iloc[:,up:up+down]
        try:
            up_df = up_df.replace('-',np.nan).dropna().astype('int').T
            down_df = down_df.replace('-',np.nan).dropna().astype('int').T
        except:
            pass
        up_df = up_df/float(total)*(10**6)   # row: position. col: gene
        down_df = down_df/float(total)*(10**6)
        if calType == 'total':
            up_df[calType] = up_df.sum(axis=1);down_df[calType] = down_df.sum(axis=1)
        if calType == 'mean':
            up_df[calType] = up_df.mean(axis=1);down_df[calType] = down_df.mean(axis=1)
        if calType == 'median':
            up_df[calType] = up_df.median(axis=1);down_df[calType] = down_df.median(axis=1)
        if calType == 'geoMean':
            up_df = up_df.replace([0],[1]);down_df=down_df.replace([0],[1])
            up_df[calType] = sp_stats.gmean(up_df.values,axis=1)
            down_df[calType] = sp_stats.gmean(down_df.values,axis=1)
        df = df.T
        df[calType]= pd.concat([up_df[calType],down_df[calType]])  # add caltype column
        up_df['std']=up_df.std(axis=1);down_df['std']=down_df.std(axis=1)
        df['std'] = pd.concat([up_df['std'],down_df['std']])
        mean[f+calType] = df[calType]  # stores the results
        std[f+'std'] = df['std']
    res_df['mean'] = mean.mean(axis=1)
    res_df['std'] = ((std**2).sum(axis=1)/std.shape[1]).apply(np.sqrt)
    return res_df
Example no. 10
def findTrueAverageTableAnchoringOnDifferentSidesOverall(frq,anch,list_taxa,N1,N2,method, met):
    anch = sorted(list(anch))
    lst_taxa = list(list_taxa.keys())
    TotalKey = dict()
    n = len(lst_taxa)
    N = {N1,N2}
    for i in range(0,n):
        if lst_taxa[i] in N:
            continue 
        for j in range(i+1,n):
            if lst_taxa[j] in N:
                continue
            p = sorted([lst_taxa[i],lst_taxa[j]])
            key_orig = genKey(p,anch)

            l = sorted([lst_taxa[i],lst_taxa[j],anch[0],anch[1]])
            key_inv = "/".join(l)
            v = frq[key_orig]
            v_inv = float(v[0])/v[1]
            if key_inv in TotalKey:
                if (met=="freq"):
                    vt = TotalKey[key_inv]
                    vt.append(v_inv)
                elif met == "log":
                    vt = TotalKey[key_inv]
                    vt.append(-np.log(1.*v_inv))
            else:
                if (met == "freq"):
                    vt = list()
                    vt.append(v_inv)
                elif met == "log":
                    vt = list()
                    vt.append(-np.log(1.*v_inv))
            TotalKey[key_inv] = vt
    TotalKeyf = dict()
    for q,v2 in TotalKey.iteritems():
        l = set(q.split("/"))
        l = list(l - set(anch))
        if met == "log":
            if method == "gmean":
                vtt = np.exp(-stats.gmean(v2))
            elif method == "mean":
                vtt = np.exp(-mean(v2))
        else:
            vtt = np.exp(-sqrt(mean(square(v2))))
        if met == "freq":
            if method == "gmean":
                vtt = (stats.gmean(v2))
            elif method == "mean":
                vtt = (mean(v2))
            else:
                vtt = (sqrt(mean(square(v2))))
        TotalKeyf[q] = vtt
    return TotalKeyf
Example no. 11
def geom_mean(a):
    """
    Compute the geometric mean for an "arbitrary" data set, ie one that
    contains zeros and negative numbers.
    
    Parameters
    ----------
    
    a : array-like
        A numpy.ndarray, or something that can be converted to an ndarray
        
    Returns
    -------
    The geometric mean of the input array
    
    Notes
    -----
    The traditional geometric mean can not be computed on a mixture of positive
    and negative numbers.  The approach here, validated rigorously in the
    cited paper[1], is to compute the geometric mean of the absolute value of
    the negative numbers separately, and then take a weighted arithmetic mean
    of that and the geometric mean of the positive numbers.  We're going to 
    discard 0 values, operating under the assumption that in this context
    there are going to be few or no observations with a value of exactly 0.
    
    References
    ----------
    [1] Geometric mean for negative and zero values
        Elsayed A. E. Habib
        International Journal of Research and Reviews in Applied Sciences
        11:419 (2012)
        http://www.arpapress.com/Volumes/Vol11Issue3/IJRRAS_11_3_08.pdf
        
        A new "Logicle" display method avoids deceptive effects of logarithmic 
        scaling for low signals and compensated data.
        Parks DR, Roederer M, Moore WA.
        Cytometry A. 2006 Jun;69(6):541-51.
        PMID: 16604519
        http://onlinelibrary.wiley.com/doi/10.1002/cyto.a.20258/full
    """
    
    a = np.array(a)
    pos = a[a > 0]
    pos_mean = stats.gmean(pos)
    pos_prop = pos.size / a.size
    
    neg = a[a < 0]
    neg = np.abs(neg)
    neg_mean = stats.gmean(neg) if neg.size > 0 else 0
    neg_prop = neg.size / a.size
    
    return (pos_mean * pos_prop) - (neg_mean * neg_prop)
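A quick sanity check of the weighted scheme above (the sample values are made up for illustration):

import numpy as np
from scipy import stats

a = np.array([10.0, 100.0, 1000.0, -10.0, -100.0])  # mixed-sign sample
pos, neg = a[a > 0], np.abs(a[a < 0])
# weighted arithmetic mean of the two one-sided geometric means
approx = stats.gmean(pos) * pos.size / a.size - stats.gmean(neg) * neg.size / a.size
print(approx)  # same value geom_mean(a) returns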
    
Example no. 12
def get_extreme_bounding_box_random_sample(G,choice_set_config,time_dependent_relation,trip_times,trip_data):
	
	config=choice_set_config
	inverse_relation=get_inverse_time_dependent_relation(time_dependent_relation)
	
	id_sample=trip_data.keys()
	if config['bounding_box_sample_size']<len(trip_data):
		id_sample=random.sample(id_sample,config['bounding_box_sample_size'])
		
	all_bounds=[]
	cur_bound={}
	vars={}
	for id in id_sample:
		cur_bound=find_coef_bounding_box(G,trip_data[id][0],trip_data[id][-1],choice_set_config,time_dependent_relation,trip_times[id[0]])
		if choice_set_config['verbose']:
			print 'CUR_BOUND: ', cur_bound
		this_bound={}
		for key in cur_bound:
			orig_key=key
			if key in inverse_relation:
				orig_key=inverse_relation[key]
			this_bound[orig_key]=cur_bound[key]
			vars[orig_key]=1
		all_bounds.append(this_bound)
	
	ranges={}
	for var in vars:
		low=[]
		high=[]
		for bound in all_bounds:
			if var in bound:
				low.append(bound[var][0])
				high.append(bound[var][1])
		low.sort()
		high.sort()
			
		if low:
			low=gmean(low)
			high=gmean(high)
			if low<=high:
				ranges[var]=(low,high)
			else:
				print 'WARNING: coefficient for ' + var + ' had inconsistent high/low.  Try increasing sample size or percentile for bounding box determination.'
		else:
			print 'WARNING: coefficient for ' + var + ' had no range.  Try increasing sample size or percentile for bounding box determination.'
				
	if choice_set_config['verbose']:
		print 'EXT_BOUND: ', ranges
				
	return ranges
Example no. 13
def get_stats_on_dataset(dataset, prediction):
    # Get mean, median, stddev of each dataset
    last_tp = dataset.experiments[-1]

    # Mean of abs of entire dataset
    sum = 0
    count = 0
    for exp in dataset.experiments:
        for gene in dataset.gene_list:
            sum += abs(exp.ratios[gene])
            count += 1

    mean_dataset = sum / float(count)

    # Mean of prediction
    sum = 0
    count = 0
    for gene in dataset.gene_list:
        sum += abs(prediction[gene])
        count += 1

    mean_prediction = sum / float(count)

    # Mean of last tp
    sum = 0
    count = 0
    for gene in dataset.gene_list:
        sum += abs(last_tp.ratios[gene])
        count += 1

    mean_last_tp = sum / float(count)

    # Stddev of dataset
    dataset_list = []
    for exp in dataset.experiments:
        for gene in dataset.gene_list:
            dataset_list.append(exp.ratios[gene])

    std_dataset = numpy.std(dataset_list)
    std_prediction = numpy.std(prediction.values())
    std_last_tp = numpy.std(last_tp.ratios.values())

    median_dataset = numpy.median(dataset_list)
    median_prediction = numpy.median(prediction.values())
    median_last_tp = numpy.median(last_tp.ratios.values())

    gmean_dataset = stats.gmean(map(abs,dataset_list))
    gmean_prediction = stats.gmean(map(abs,prediction.values()))
    gmean_last_tp = stats.gmean(map(abs,last_tp.ratios.values()))
Example no. 14
    def compute_doc_state(self, doc):
        # fetch entity embedding for the set of candidates being considered
        candidates = set()
        for chain in doc.chains:
            for c in sorted(chain.candidates, key=lambda c: c.features[self.ranking_feature], reverse=True)[:self.rerank_depth]:
                candidates.add(c.id)
        candidate_embeddings = self.em.get_embeddings(candidates)

        distance_cache = {}
        def candidate_distance(a, b):
            if a < b:
                a, b = b, a
            key = (a,b)
            if key not in distance_cache:
                distance_cache[key] = self.distance(candidate_embeddings[a], candidate_embeddings[b])
            return distance_cache[key]

        # precompute the top candidates for each chain
        rc_by_chain = {}
        for chain in doc.chains:
            rc_by_chain[chain] = [c.id for c in sorted(chain.candidates, key=lambda c: c.features[self.ranking_feature], reverse=True)][:self.coherence_depth+1]

        state = {}
        for c in candidates:
            dists = []
            for chain in doc.chains:
                top = [ci for ci in rc_by_chain[chain] if ci != c][:self.coherence_depth]
                if not top:
                    continue
                dists.append(min(candidate_distance(c, tc) for tc in top if tc != c))
            if dists:
                state[c] = gmean(dists)

        return state
Example no. 15
def combine_fr(frbc, fr1, fr2):
	# combines all filter responses to find possible stair areas
	"""with gmean 0 in one fr makes it all 0 > raises RuntimeWarning""" 
	import warnings
	warnings.filterwarnings("ignore")

	builder = []
	for row in xrange(frbc.shape[0]): # all filter responses have the same shape
	# row = 0
		temprow = []
		for column in xrange(frbc.shape[1]):
			temp = stats.gmean([abs(frbc[row,column]),abs(fr1[row,column]),abs(fr2[row,column])]) # gmean of all arrays, this would raise a warning
			if temp > 100: # threshold for binary
				temp = 0
			if temp > 0:
				temp = 1
			temprow.append(temp) # take abs of corresponding value of each fr and calculate geometric mean
		builder.append(temprow)
	combi_fr = np.asanyarray(builder) # returns binary array

	"""show combined filter responses"""
	# plt.imshow(combi_fr.T,cmap='spectral',interpolation='none', origin='lower') # <-- this would show the actual orientation of the data

	# plt.imshow(combi_fr,interpolation='none')
	# plt.title('combined filter responses')
	# # plt.colorbar()
	# plt.show()
	return combi_fr
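The zero problem the docstring mentions is easy to reproduce: one zero response drives the whole geometric mean to zero, and scipy emits a RuntimeWarning from the internal log, which is why the function silences warnings. A minimal sketch:

import warnings
import numpy as np
from scipy import stats

with warnings.catch_warnings():
    warnings.simplefilter("ignore", RuntimeWarning)
    print(stats.gmean(np.array([0.0, 5.0, 9.0])))  # 0.0 -- a single zero zeroes the gmean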
Example no. 16
def create_simple_record(sequence):
    features = np.zeros(1, SimpleRecordType)
    
    features["mean"] = sequence.mean()
    features["var"] = sequence.var()
    features["skewness"] = stats.skew(sequence)
    features["kurtosis"] = stats.kurtosis(sequence)
    
    features["first"] = sequence[0]
    features["sign"] = np.sign(sequence).mean()
    features["zeros"] = (sequence == 0).mean()
    
    if (features["zeros"] == 0.0):
        features["harmonic_mean"] = stats.hmean(abs(sequence))
        features["geometric_mean"] = stats.gmean(abs(sequence))
    else:
        features["harmonic_mean"] = np.nan
        features["geometric_mean"] = np.nan
    
    for m in [2, 3, 5]:
        seqm = sequence % m
        for v in range(m):
            features["val_%dmod%d" % (v, m)] = (seqm == v).mean()
    
    return features
Example no. 17
def sfm(data):
    """
    Spectral Flatness Measure
    """
    g = stats.gmean(data, dtype=numpy.float64) + 0.00001
    a = numpy.mean(data, dtype=numpy.float64)
    return 10 * log10(g / a)
Example no. 18
def test_SED_error(I=1., e1=1, e2=10):
    """Compute the error one makes by using the simple formulas:
    e = sqrt(e1 * e2)
    f = I / (e2 - e1)
    e2f = e ** 2 * f
    to compute a differential flux f or e2f from an integral flux
    measurement I in an energy bin [e1, e2].
    Note that e is the log bin center and e2f is typically plotted
    in spectral energy distributions (SEDs).

    Index   SED-Error   Flux-Error
    1.5     1.28        0.85
    2.0     1.00        1.00
    2.5     0.85        1.28
    3.0     0.81        1.75
    """
    from scipy.stats import gmean
    e = gmean([e1, e2])
    f = I / (e2 - e1)
    e2f = e ** 2 * f  # @note: e ** 2 = e1 * e2 here.
    for Index in np.arange(1.5, 3.5, 0.5):
        f_correct = powerlaw.power_law_flux(I, Index, e, e1, e2)
        e2f_correct = e ** 2 * f_correct
        # We compute ratios, which corresponds to differences
        # on a log scale
        SED = e2f / e2f_correct
        Flux = f / f_correct
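A worked instance of the docstring's formulas with the default bin (values chosen only to illustrate):

from scipy.stats import gmean

I, e1, e2 = 1.0, 1.0, 10.0
e = gmean([e1, e2])  # log bin centre: sqrt(1 * 10) ~= 3.162
f = I / (e2 - e1)    # differential flux: 1/9 ~= 0.111
print(e ** 2 * f)    # e2f ~= 1.111, since e ** 2 == e1 * e2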
Example no. 19
def centralize(mat):
    r"""Center data around its geometric average.

    Parameters
    ----------
    mat : array_like, float
       a matrix of proportions where
       rows = compositions and
       columns = components

    Returns
    -------
    numpy.ndarray
         centered composition matrix

    Examples
    --------
    >>> import numpy as np
    >>> from skbio.stats.composition import centralize
    >>> X = np.array([[.1,.3,.4, .2],[.2,.2,.2,.4]])
    >>> centralize(X)
    array([[ 0.17445763,  0.30216948,  0.34891526,  0.17445763],
           [ 0.32495488,  0.18761279,  0.16247744,  0.32495488]])

    """
    mat = closure(mat)
    cen = ss.gmean(mat, axis=0)
    return perturb_inv(mat, cen)
Example no. 20
    def bind(self, *nodes, composition=None):
        if self.HIERARCHICAL:
            children = nodes
        else:
            children = self._concatenate_children(nodes)
        if composition is None:
            composition = self.COMPOSITION

        row_vecs = {}
        if composition:
            # gen_vec is the weighted average of all other blobs with
            # the same number of children.
            gen_vecs = {row: self.vector_model.zeros() for row in self.rows}
            comparable = (n for n in self.nodes if len(n.children) == len(children))
            for node in comparable:
                child_sims = [my_child.similarity(other_child)
                              for my_child, other_child in zip(children, node.children)]
                total_sim = stats.gmean(child_sims)
                for row, vec in gen_vecs.items():
                    vec += vectors.normalize(node.row_vecs[row]) * total_sim

            row_vecs = {row: vec * composition for row, vec in gen_vecs.items()}
            
            assert not np.isnan(np.sum(list(row_vecs.values())))

        id_string = self._id_string(children)
        return VectorNode(self, id_string, children=children, row_vecs=row_vecs)
Example no. 21
def calcMoneyReturn(list_rebound, n_rebound, n_output, moneyInit, commisionFee):
    
    n_list = len(list_rebound)
    
    if n_rebound==0: # when n_rebound is empty
        print('empty...')
        n_rebound = n_list
        n_output = 1
    
    data_output = np.zeros((n_output,3))
    for k in range(0,n_output):
        
        # randomly choose rebounds
        # idx_reb = np.random.choice(range(n_list),n_rebound)
        if n_rebound<n_list:
            idx_reb = int(np.random.choice(range(n_list-n_rebound),1)) # cast to a plain int so the slice below works
        elif n_rebound==n_list:
            idx_reb = 0
        list_sel = list_rebound[idx_reb:idx_reb+n_rebound-1]
        
        # Calc Geometric Mean and Stdev
        reb_gmean = (gmean( list_sel/100+1, axis=0 )-1)*100
        reb_stdev = np.std(list_sel,0)
        
        # Calc Money Return
        t_price = moneyInit
        for k_sel in range(0,len(list_sel)):
            t_price = (1+list_sel[k_sel]/100) * t_price - commisionFee
        moneyReturn = t_price
        
        # gmean, stdev, money return
        data_output[k,:] = [reb_gmean, reb_stdev, moneyReturn]
        
    return data_output
Example no. 22
File: moldy.py Project: shrx/moldy
 def build(self):
     selection = self.periodicTable()
     row = self.ZMatModel.rowCount()
     self.addRow()
     self.ZMatModel.dataChanged.disconnect(self.clearUpdateView)
     newSymbol = selection[1]
     newData = [newSymbol]
     if len(self.highList) >= 1:
         newBond = round(2.1*gmean([ elements[e].covalent_radius for e in [selection[0], elems[self.highList[0][0]]] ]), 4)
         newData.append(self.highList[0][0]+1)
         newData.append(newBond)
         if len(self.highList) >= 2:
             newAngle = 109.4712
             newData.append(self.highList[1][0]+1)
             newData.append(newAngle)
             if len(self.highList) == 3:
                 newDihedral = 120.
                 newData.append(self.highList[2][0]+1)
                 newData.append(newDihedral)
     for j, cell in enumerate(newData):
         item = QStandardItem(str(cell))
         self.ZMatModel.setItem(row, j, item)
     self.highList = []
     self.ZMatModel.dataChanged.connect(self.clearUpdateView)
     self.updateView()
Example no. 23
 def _compute_neighborhood_graph_weight(self, root, graph):
     # list all nodes at increasing distances
     # at each distance
     # compute the arithmetic mean weight on nodes
     # compute the geometric mean weight on edges
     # compute the product of the two
     # make a list of the neighborhood_graph_weight at every distance
     neighborhood_graph_weight_list = []
     w = graph.node[root][self.key_weight]
     node_weight_list = np.array([w], dtype=np.float64)
     node_average = node_weight_list[0]
     edge_weight_list = np.array([1], dtype=np.float64)
     edge_average = edge_weight_list[0]
     # for all distances
     root_dist_dict = graph.node[root]['remote_neighbours']
     for distance, node_set in root_dist_dict.iteritems():
         # extract array of weights at given distance
         weight_array_at_d = np.array([graph.node[v][self.key_weight]
                                       for v in node_set], dtype=np.float64)
         if distance % 2 == 0:  # nodes
             node_weight_list = np.concatenate(
                 (node_weight_list, weight_array_at_d))
             node_average = np.mean(node_weight_list)
         else:  # edges
             edge_weight_list = np.concatenate(
                 (edge_weight_list, weight_array_at_d))
             edge_average = stats.gmean(edge_weight_list)
         weight = node_average * edge_average
         neighborhood_graph_weight_list.append(weight)
     graph.node[root]['neigh_graph_weight'] = \
         neighborhood_graph_weight_list
Example no. 24
def findTrueAverageTableAnchoring(frq,anch,list_taxa,method):
	n = len(set(list_taxa)-set(anch))	
	anch = sorted(list(anch))
	lst_taxa = list(list_taxa.keys())
	TotalKey = dict()
	s = {1,2,3}
	for i in range(0,n):
		for j in range(i+1,n):
			for k in range(j+1,n):
				for z in range(k+1,n):
					for taxon_i in list_taxa[lst_taxa[i]]:
						for taxon_j in list_taxa[lst_taxa[j]]:
							for taxon_k in list_taxa[lst_taxa[k]]:
								for taxon_z in list_taxa[lst_taxa[z]]:
									keyt = "/".join(sorted([taxon_i,taxon_j,taxon_k,taxon_z]))
									lab_taxon_i = taxon_i
									lab_taxon_j = taxon_j
									lab_taxon_k = taxon_k
									lab_taxon_z = taxon_z
									tmp_dict = dict()
									tmp_dict[lst_taxa[i]] = lab_taxon_i
									tmp_dict[lst_taxa[j]] = lab_taxon_j
									tmp_dict[lst_taxa[k]] = lab_taxon_k
									tmp_dict[lst_taxa[z]] = lab_taxon_z
									key_orig = "/".join(sorted([lab_taxon_i,lab_taxon_j,lab_taxon_k,lab_taxon_z]))
									l = sorted([lst_taxa[i],lst_taxa[j],lst_taxa[k],lst_taxa[z]])
									key_inv = "/".join(l)
									v = frq[key_orig]
									v_inv = dict()
									for q in range(1,4):
										q1 = sorted([tmp_dict[l[0]],tmp_dict[l[q]]])
										stmp = list(s-{q})
										q2 = sorted([tmp_dict[l[stmp[0]]],tmp_dict[l[stmp[1]]]])
										if q1[0]<q2[0]:
											v_inv[l[q]] = v[q1[1]]	
										else:
											v_inv[l[q]] = v[q2[1]]
									if key_inv in TotalKey:
										vt = TotalKey[key_inv] 
										for keyt in vt.keys():
											vt[keyt].append(v_inv[keyt])
									else:
										vt = dict()
										for q in v_inv:
											vt[q] = list()
											vt[q].append(v_inv[q])
									TotalKey[key_inv] = vt
									
	TotalKeyf = dict()
	for q,v in TotalKey.iteritems():
		vtt = dict()
		for q2,v2 in v.iteritems():
			if method == "gmean":
				vtt[q2] = stats.gmean(v2)
			elif method == "mean":
				vtt[q2] = mean(v2)
			else:
				vtt[q2] = sqrt(mean(square(v2)))
		TotalKeyf[q] = vtt
	return TotalKeyf
Example no. 25
def spectral_flatness(spectrum):
    """The spectral flatness is calculated by dividing the geometric mean of 
    the power spectrum by the arithmetic mean of the power spectrum

    I'm not sure if the spectrum should be squared first...
    
    """
    return gmean(spectrum)/mean(spectrum)
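A small check of how the ratio behaves, using made-up spectra: a flat spectrum scores near 1, a single-peak spectrum near 0:

import numpy as np
from scipy.stats import gmean

flat = np.ones(512)                 # white-noise-like power spectrum
tonal = np.full(512, 1e-6)
tonal[42] = 1.0                     # one dominant bin
print(gmean(flat) / flat.mean())    # 1.0
print(gmean(tonal) / tonal.mean())  # ~0, heavily tonal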
Example no. 26
    def gmean(self):
        """Returns the gmean of the models predictions.

        Returns
        -------
        `PipeApply`
        """
        return self.apply(lambda x: gmean(x, axis=0))
Example no. 27
 def test_2D_array_dim1(self):
     a = array(((1,2,3,4),
                (1,2,3,4),
                (1,2,3,4)))
     actual= stats.gmean(a, axis=1)
     v = power(1*2*3*4,1./4.)
     desired = array((v,v,v))
     assert_array_almost_equal(actual, desired, decimal=14)
Example no. 28
 def _compute_coverage(self, contig_ids):
     '''
     Computes the coverage of the transcript made up from contig_ids
     as a geometric mean of the coverage for each contig.
     '''
     from scipy.stats import gmean
     filter = {'assembly': self.asm, 'node_id__in': contig_ids,}
     values = Stat.objects.filter(**filter).values_list('coverage', flat=True)
     return gmean(values)
Example no. 29
def geom_std(values: t.List[float]) -> float:
    """
    Calculates the geometric standard deviation for the passed values.
    Source: https://en.wikipedia.org/wiki/Geometric_standard_deviation
    """
    import scipy.stats as stats
    import numpy as np  # scipy no longer re-exports numpy's exp/sqrt/sum/log
    gmean = stats.gmean(values)
    return np.exp(np.sqrt(np.sum([np.log(x / gmean) ** 2 for x in values]) / len(values)))
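A quick consistency check with assumed values: for the sample [2, 8] the geometric mean is 4 and each point lies exactly one geometric standard deviation from it, so the result should be 2:

import numpy as np
from scipy import stats

values = [2.0, 8.0]
gm = stats.gmean(values)  # 4.0
gsd = np.exp(np.sqrt(np.mean([np.log(x / gm) ** 2 for x in values])))
print(gm, gsd)            # 4.0 2.0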
Example no. 30
def Fitness(Population):

    fitness_vec=np.zeros(len(Population))

    for i in range(len(Population)):
        Compute=Population[i]
        No_omega=Compute[1:]
        fitness_vec[i]=sps.gmean(No_omega)
    return fitness_vec
Example no. 31
File: vad.py Project: zhucq/vad
def calculate_features_for_VAD(sound_frames, frequencies_axis, spectrogram):
    features = numpy.empty((spectrogram.shape[0], 3))
    # smooted_spectrogram, smoothed_frequencies_axis = smooth_spectrogram(spectrogram, frequencies_axis, 24)
    for time_ind in range(spectrogram.shape[0]):
        mean_spectrum = spectrogram[time_ind].mean()
        if mean_spectrum > 0.0:
            sfm = -10.0 * math.log10(stats.gmean(spectrogram[time_ind]) / mean_spectrum)
        else:
            sfm = 0.0
        # max_freq = smoothed_frequencies_axis[smooted_spectrogram[time_ind].argmax()]
        max_freq = frequencies_axis[spectrogram[time_ind].argmax()]
        features[time_ind][0] = numpy.square(sound_frames[time_ind]).mean()
        features[time_ind][1] = sfm
        features[time_ind][2] = max_freq
    """medfilt_order = 3
    for feature_ind in range(features.shape[0]):
        features[feature_ind] = signal.medfilt(features[feature_ind], medfilt_order)"""
    return features
Example no. 32
def _calc_overall_qual(label_qual, spatial_qual):
    """
    Calculate the overall quality for all detections on all ground truth objects for a given image
    :param label_qual: g x d label quality score between zero and one for each possible combination of
    g ground truth objects and d detections.
    :param spatial_qual: g x d spatial quality score between zero and one for each possible combination of
    g ground truth objects and d detections.
    :return: overall_qual_mat: g x d overall label quality between zero and one for each possible combination of
    g ground truth objects and d detections.
    """
    combined_mat = np.dstack((label_qual, spatial_qual))

    # Calculate the geometric mean between label quality and spatial quality.
    # Note we ignore divide by zero warnings here for log(0) calculations internally.
    with np.errstate(divide='ignore'):
        overall_qual_mat = gmean(combined_mat, axis=2)

    return overall_qual_mat
Example no. 33
def make_submite():
    clfs = train_xgboost()

    df = pd.read_csv(
        '/home/kshitij/Desktop/Dataset/stage2_sample_submission.csv')

    x = get_tst()

    preds = []
    for clf in clfs:
        preds.append(np.clip(clf.predict(x), 0.001, 1))

    pred = gmean(np.array(preds), axis=0)
    print pred
    df['cancer'] = pred
    df.to_csv('subm_xgb.csv', index=False)
    print(df.head())
Example no. 34
def run_gemm(runs=3):
    results = []
    for usecase in ["train", "inference"]:
        for precision in ["float", "half", "int8"]:
            if usecase == "train" and precision == "int8":
                continue
            run_results = []
            for i in range(runs):
                print("RUNNING: gemm_bench {} {}. This is iteration {} of {}".
                      format(usecase, precision, i + 1, runs))
                prc = subprocess.Popen(
                    ["../DeepBench/code/bin/gemm_bench", usecase, precision],
                    stdout=subprocess.PIPE)
                out = prc.communicate()[0]
                run_results.append(extract_timings(out))
            results.append("gemm_bench {} {}: {}".format(
                usecase, precision, gmean(np.array(run_results).min(axis=0))))
    return results
Example no. 35
def get_simulation(data):
    log_returns = np.log(1 + data.pct_change())
    u = log_returns.mean()
    var = log_returns.var()
    drift = u - (0.5 * var)
    stdev = log_returns.std()
    t_intervals = DAYS_IN_YEAR * YEARS
    iterations = 10
    daily_returns = np.exp(drift + stdev *
                           norm.ppf(np.random.rand(t_intervals, iterations)))
    S0 = data.iloc[-1]
    price_list = np.zeros_like(daily_returns)
    price_list[0] = S0
    for t in range(1, t_intervals):
        price_list[t] = price_list[t - 1] * daily_returns[t]

    asset_returns = price_list[-1] / (price_list[0])
    return gmean(asset_returns)
Example no. 36
 def mixed_latent_kernel_density_estimate(self,
                                          kernel='gaussian',
                                          **kwargs):
     d_real = self.hypercube_distance_real
     h_real = gmean(d_real.ravel())
     d_simp = self.sphere_distance_latent
     h_simp = d_simp.mean()
     if kernel == 'gaussian':
         s1 = np.exp(-(d_real / h_real)**2).mean(axis=(1, 2))
         s2 = np.exp(-(d_simp / h_simp)**2).mean(axis=(1, 2))
         return 1 / (s1 * s2)
     elif kernel == 'laplace':
         s1 = np.exp(-np.abs(d_real / h_real)).mean(axis=(1, 2))
         s2 = np.exp(-np.abs(d_simp / h_simp)).mean(axis=(1, 2))
         return 1 / (s1 * s2)
     else:
         raise ValueError('requested kernel not available')
Example no. 37
def print_iris_statistics(data):
    # --------------------------------------------------- Create data frame < #
    df = pandas.DataFrame(
            columns = ["Dł. d. k.",
                       "Sz. d. k.",
                       "Dł. pł.",
                       "Sz. pł."])

    # ------------------------- Calculate different statistical information < #
    df.loc["Minimum [cm]", :] = [i for i in data.iloc[:, 0:4].min()]
    df.loc["Maksimum [cm]", :] = [i for i in data.iloc[:, 0:4].max()]
    df.loc["Rozstęp [cm]", :] = [i for i in df.loc["Maksimum [cm]"]
                                 - df.loc["Minimum [cm]"]]

    df.loc["Pierwszy kwartyl [cm]", :] = [
        quantile(data.iloc[:, i], 0.25) for i in range(4)]
    df.loc["Mediana [cm]", :] = [
        median(data.iloc[:, i]) for i in range(4)]
    df.loc["Trzeci kwartyl [cm]", :] = [
        quantile(data.iloc[:, i], 0.75) for i in range(4)]

    df.loc["Średnia harmoniczna [cm]", :] = stats.hmean(data.iloc[:, 0:4])
    df.loc["Średnia geometryczna [cm]", :] = stats.gmean(data.iloc[:, 0:4])
    df.loc["Średnia arytmetyczna [cm]", :] = [i for i in data.mean()]

    # Operator ** means power() method
    # The shape attribute for numpy arrays returns the dimensions of the array
    # If Y has n rows and m columns, then Y.shape is (n,m). So Y.shape[0] is n
    df.loc["Średnia potęgowa 2 rzędu [cm]", :] = [i for i in (
            ((data.iloc[:, 0:4] ** 2).sum() / data.shape[0]) ** (1 / 2))]
    df.loc["Średnia potęgowa 3 rzędu [cm]", :] = [i for i in (
            ((data.iloc[:, 0:4] ** 3).sum() / data.shape[0]) ** (1 / 3))]

    df.loc["Wariancja [cm^2]", :] = [i for i in data.var()]
    df.loc["Odchylenie standardowe [cm]", :] = [i for i in data.std()]

    # If True, Fisher’s definition is used (normal ==> 0.0)
    # If False, Pearson’s definition is used (normal ==> 3.0)
    df.loc["Kurtoza", :] = stats.kurtosis(data.iloc[:, 0:4], fisher = False)

    pandas.set_option('display.max_rows', 1000)
    pandas.set_option('display.max_columns', 1000)
    pandas.set_option('display.width', 1000)
    print(df.astype(float).round(1))
Example no. 38
    def _asianGeoFloat(S, m, r, T):
        '''
        Use simulated underlying to determine the price of an Asian Geometric
        Average Call / Put option with a floating strike price
        Payoffs are of the form :
        C = max(S - m*AVG_geo, 0)
        P = max(m*AVG_geo - S, 0)

        Parameters
        ----------
        S : numpy.array
            Simulated stock price, want to be of the form such that the first row
            is the initial stock price, with subsequent rows representing an
            additional time step increase, and each column is a simulated path of
            the asset
        m : number of any type (int, float8, float64 etc.)
            Strike value scaler of option, determined at initiation
        r : number of any type (int, float8, float64 etc.)
            Risk free interest rate, implied constant till expiration
        T : number of any type (int, float8, float64 etc.)
            Time till expiration for option

        Returns
        -------
        [[call,put],[callMotion,putMotion]] : list of pair of lists, first of
            floats, second of one-dimensional numpy.array's
            First list is the call and put price, determined by the average
            of the simulated stock payoffs
            Second list is the call and put simulated paths payoffs at expiration,
            NOT discounted

        Notes
        -----
        The accuracy of pricing is dependent on the number of time steps and
        simulated paths chosen for the underlying stochastic motion

        '''
        avg = stats.gmean(S, axis=0)
        callMotion = (S[-1] - m * avg).clip(0)
        putMotion = (m * avg - S[-1]).clip(0)

        call = np.exp(-r * T) * np.average(callMotion)
        put = np.exp(-r * T) * np.average(putMotion)
        return ([[call, put], [callMotion, putMotion]])
Example no. 39
def plot_counter_stat(csv, plot_format, stat_name, counter_numerator,
                      counter_denominator, scale):
    """
    Process the returned csv file into a time-series statistic to plot and
    also calculate some useful aggregate stats.
    """
    df = pd.read_csv(csv,
                     sep='|',
                     header=0,
                     names=[
                         'time', 'count', 'rsrvd1', 'event', 'rsrvd2', 'frac',
                         'rsrvd3', 'rsrvd4'
                     ],
                     dtype={
                         'time': np.float64,
                         'count': np.float64,
                         'rsrvd1': str,
                         'event': str,
                         'rsrvd2': str,
                         'frac': np.float64,
                         'rsrvd3': str,
                         'rsrvd4': str
                     })
    df_processed = pd.DataFrame()

    df_processed[stat_name] = (
        df[df['event'] == counter_numerator]['count'].reset_index(drop=True)
    ) / (df[df['event'] == counter_denominator]['count'].reset_index(
        drop=True)) * scale
    df_processed.dropna(inplace=True)

    # Calculate some meaningful aggregate stats for comparing time-series plots
    geomean = stats.gmean(df_processed[stat_name])
    p50 = stats.scoreatpercentile(df_processed[stat_name], 50)
    p90 = stats.scoreatpercentile(df_processed[stat_name], 90)
    p99 = stats.scoreatpercentile(df_processed[stat_name], 99)
    xtitle = f"gmean:{geomean:>6.2f} p50:{p50:>6.2f} p90:{p90:>6.2f} p99:{p99:>6.2f}"

    if plot_format == "terminal":
        plot_terminal(df_processed, stat_name, xtitle)
    elif plot_format == "matplotlib":
        plot_matplotlib(df_processed, stat_name, xtitle)
    else:
        print(f"Do not know how to plot {plot_format}")
Example no. 40
def run_FLOCK(input_file, method, bins, density, output_file, mfi_file,
              mfi_calc, profile):
    # This version of the tool assumes FLOCK is installed.
    # install FLOCK with:
    # conda install flock
    run_command = [method, input_file]
    if bins:
        run_command.append(bins)
    if density:
        run_command.append(density)
    try:
        subprocess.call(" ".join(run_command), env=os.environ.copy(), shell=True)
        subprocess.call(" ".join(['mv', 'flock_results.txt', output_file]), env=os.environ.copy(), shell=True)
    except:
        sys.stderr.write("Could not run FLOCK\n")
        sys.exit(2)
    # Here add some way to calculate the count and tack it on to profile file.
    flockdf = pd.read_table(output_file)
    if mfi_calc == "mfi":
        MFIs = flockdf.groupby('Population').mean().round(decimals=2)
    elif mfi_calc == "gmfi":
        MFIs = flockdf.groupby('Population').agg(lambda x: gmean(list(x))).round(decimals=2)
    else:
        MFIs = flockdf.groupby('Population').median().round(decimals=2)

    with open(mfi_file, "w") as outf:
        MFIs.to_csv(outf, sep="\t", float_format='%.0f')

    (events, columns) = flockdf.shape
    fstats = {}
    fstats['population'] = flockdf.iloc[:, -1:].iloc[:, 0]
    fstats['population_freq'] = fstats['population'].value_counts()
    fstats['population_freq_sort'] = fstats['population_freq'].sort_index()
    fstats['population_per'] = (fstats['population'].value_counts(normalize=True) * 100).round(decimals=2)
    fstats['population_per_sort'] = fstats['population_per'].sort_index()
    fstats['population_all'] = pd.concat([fstats['population_freq_sort'], fstats['population_per_sort']], axis=1)
    fstats['population_all'].columns = ['Count', 'Percentage']
    fstats['population_all']['Population_ID'] = fstats['population_all'].index

    flock_profile = pd.read_table('profile.txt')
    profile_pop = flock_profile.merge(fstats['population_all'], on='Population_ID')
    profile_pop.to_csv(profile, sep="\t", float_format='%.2f', index=False)

    return
Example no. 41
def multioutput_fscore(y_true, y_pred, beta=1):
    """
    Geometric mean of the fbeta_score, computed on each label.
    The aim is to avoid issues when dealing with imbalanced cases.
    Can be used as scorer for GridSearchCV:
        scorer = make_scorer(multioutput_fscore,beta=1)
        
    Parameters
    ---------
    y_true: lst
        List of labels
    y_pred: lst
        List of predictions
    beta: float
        Beta value to be used to calculate fscore metric
    
    Returns
    -------
    f1score: float
        Geometric mean of the fscore
    """

    # If provided y predictions is a dataframe then extract the values from that
    if isinstance(y_pred, pd.DataFrame) == True:
        y_pred = y_pred.values

    # If provided y actuals is a dataframe then extract the values from that
    if isinstance(y_true, pd.DataFrame) == True:
        y_true = y_true.values

    f1score_list = []
    for column in range(0, y_true.shape[1]):
        score = fbeta_score(y_true[:, column],
                            y_pred[:, column],
                            beta,
                            average='weighted')
        f1score_list.append(score)

    f1score = np.asarray(f1score_list)
    f1score = f1score[f1score < 1]

    # Get the geometric mean of f1score
    f1score = gmean(f1score)
    return f1score
Example no. 42
    def __call__(self, data):

        conds = self.condition_layout
        blocks = self.block_layout

        # Build two new layouts. c0 is a list of lists of indexes into
        # the data that represent condition 0 for each block. c1 is
        # the same for data that represent condition 1 for each block.
        c0_blocks = intersect_layouts(blocks, [conds[0]])
        c1_blocks = intersect_layouts(blocks, [conds[1]])

        # Get the mean for each block for both conditions.
        means0 = group_means(data, c0_blocks)
        means1 = group_means(data, c1_blocks)

        # If we have tuning params, add another dimension to the front
        # of each ndarray to vary the tuning param.
        if self.alphas is not None:
            shape = (len(self.alphas), ) + np.shape(means0)
            old0 = means0
            old1 = means1
            means0 = np.zeros(shape)
            means1 = np.zeros(shape)
            for i, a in enumerate(self.alphas):
                means0[i] = old0 + a
                means1[i] = old1 + a

        means0 /= means1
        ratio = means0

        # If we have more than one block, we combine their ratios
        # using the geometric mean.
        ratio = gmean(ratio, axis=-1)

        # 'Symmetric' means that the order of the conditions does not
        # matter, so we should always return a ratio >= 1. So for any
        # ratios that are < 1, use the inverse.
        if self.symmetric:
            # Add another dimension to the front where and 1 is its
            # inverse, then select the max across that dimension
            ratio_and_inverse = np.array([ratio, 1.0 / ratio])
            ratio = np.max(ratio_and_inverse, axis=0)

        return ratio
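The two gmean-related steps above are easy to see in isolation; a minimal sketch with made-up per-block ratios:

import numpy as np
from scipy.stats import gmean

ratios = np.array([2.0, 0.5, 4.0])  # one condition ratio per block
combined = gmean(ratios)            # ~1.587
symmetric = np.max([combined, 1.0 / combined])  # order of conditions no longer matters
print(combined, symmetric)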
Example no. 43
def get_score(keywords, title):
    title = nlp(title)
    if len(keywords) == 0 or len(title) == 0:
        return 0
    scores = []
    for word1 in keywords:
        indexes = []
        for word2 in title:
            indexes.append(nlp(word1).similarity(word2))
        scores.append(max(indexes))

    finallist = []
    for max_score in scores:
        if max_score <= 0:
            finallist.append(0.01)
        else:
            finallist.append(max_score)
    mean = stats.gmean(finallist)
    return mean
Example no. 44
def autometa_clr(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize k-mers by Centered Log Ratio transformation

    Steps
    -----

        * Drop any k-mers not present for all contigs
        * Drop any contigs not containing any kmer counts
        * Fill any remaining na values with 0
        * Normalize the k-mer count by the total count of all k-mers for a given contig
        * Add 1 as 0 can not be utilized for CLR
        * Perform CLR transformation log(norm. value / geometric mean norm. value)

    Parameters
    ----------
    df : pd.DataFrame
        K-mers Dataframe where index_col='contig' and column values are k-mer
        frequencies.

    References
    ----------

        * Aitchison, J. The Statistical Analysis of Compositional Data (1986)
        * Pawlowsky-Glahn, Egozcue, Tolosana-Delgado. Lecture Notes on Compositional Data Analysis (2011)
        * Why ILR is preferred `stats stackexchange discussion <https://stats.stackexchange.com/questions/242445/why-is-isometric-log-ratio-transformation-preferred-over-the-additivealr-or-ce>`_
        * Use of CLR transformation prior to PCA `stats stackexchange discussion <https://stats.stackexchange.com/questions/305965/can-i-use-the-clr-centered-log-ratio-transformation-to-prepare-data-for-pca>`_
        * Lecture notes on Compositional Data Analysis (CoDa) `PDF <http://www.sediment.uni-goettingen.de/staff/tolosana/extra/CoDa.pdf>`_

    Returns
    -------
    pd.DataFrame
        index='contig', cols=[kmer, kmer, ...]
        Columns have been transformed by CLR normalization.

    """
    # steps in 1: data cleaning
    df = df.dropna(axis="columns", how="all").dropna(axis="index",
                                                     how="all").fillna(0)
    # steps in 2 and 3: normalization and CLR transformation
    step_2a = lambda x: (x + 1) / x.sum()
    step_2b = lambda x: np.log(x / gmean(x))
    return df.transform(step_2a, axis="columns").transform(step_2b,
                                                           axis="columns")
Example no. 45
    def get_cgm_stats(self, start_date, end_date):
        """
        Compute cgm stats with dates

        Args:
            start_date (dt.DateTime): start date
            end_date (dt.DateTime): end date

        Returns:
            (float, float): geo mean and std
        """

        cgm_values = []
        for time, cgm_event in self.glucose_timeline.items():
            if start_date <= time <= end_date:
                cgm_value = cgm_event.get_value()
                cgm_values.append(cgm_value)

        return gmean(cgm_values), gstd(cgm_values)
Example no. 46
def get_fft_stats(z):

    avg = np.average(z)
    std = np.std(z)
    median = np.median(z)
    var = np.var(z)
    kurt = stats.kurtosis(z)
    hmean = stats.hmean(z)
    gmean = stats.gmean(z)
    skew = stats.skew(z)
    median_dev_abs = np.sum(np.abs(z - median))
    std_dev_abs = np.sum(np.abs(z - std))

    stats_array = [
        avg, std, median, var, kurt, hmean, gmean, skew, median_dev_abs,
        std_dev_abs
    ]

    return stats_array
Example no. 47
def multioutput_fscore(y_true, y_pred, beta=1):
    """
    MultiOutput Fscore
    
    This is a performance metric of my own creation.
    It is a sort of geometric mean of the fbeta_score, computed on each label.
    
    It is compatible with multi-label and multi-class problems.
    It features some peculiarities (geometric mean, 100% removal...) to exclude
    trivial solutions and deliberately under-estimate a standard fbeta_score average.
    The aim is to avoid issues when dealing with multi-class/multi-label imbalanced cases.

    Arguments:
        y_true -> List of labels
        y_pred -> List of predictions
        beta -> Beta value to be used to calculate fscore metric

    Output:
        f1score -> Geometric mean of the fscore
    """

    # If provided y predictions is a dataframe then extract the values from that
    if isinstance(y_pred, pd.DataFrame) == True:
        y_pred = y_pred.values

    # If provided y actuals is a dataframe then extract the values from that
    if isinstance(y_true, pd.DataFrame) == True:
        y_true = y_true.values

    f1score_list = []
    for column in range(0, y_true.shape[1]):
        score = fbeta_score(y_true[:, column],
                            y_pred[:, column],
                            beta,
                            average='weighted')
        f1score_list.append(score)

    f1score = np.asarray(f1score_list)
    f1score = f1score[f1score < 1]

    # Get the geometric mean of f1score
    f1score = gmean(f1score)
    return f1score
Example no. 48
def update_site(site):
    infoDf = pd.DataFrame()

    #Filter by site
    siteFulldf = dfFull[dfFull['Site'] == site]
    siteDf = df[df['Site'] == site]
    #Calculate correlation
    siteCorDf = siteDf.corr()
    eCorDf = siteCorDf[siteCorDf['EnteroCount'] != 1]
    eCorVal = eCorDf['EnteroCount']
    eCorVal = float(eCorVal)
    #Get geometric mean
    gMean = stats.gmean(siteDf.loc[:, "EnteroCount"])
    #Avg rainfall at the site
    avgRain = siteDf['FourDayRainTotal'].mean()
    #Last time sample was collected
    lastSample = max(pd.to_datetime(siteFulldf.Date))

    data = {
        'Site': site,
        'Correlation Coefficient': round(eCorVal, 3),
        'Geo.Mean Enterococcus Count': round(gMean, 3),
        'Avg. Rainfall': round(avgRain, 3),
        'Last Sample Date': lastSample.strftime('%m/%d/%Y')
    }
    infoDf = infoDf.append(data, ignore_index=True)

    if (len(infoDf) == 0):
        data = {
            'Site': '',
            'Correlation Coefficient': '',
            'Geo.Mean Enterococcus Count': '',
            'Avg. Rainfall': '',
            'Last Sample Date': ''
        }
        infoDf = infoDf.append(data, ignore_index=True)

    cols = [
        'Site', 'Correlation Coefficient', 'Geo.Mean Enterococcus Count',
        'Avg. Rainfall', 'Last Sample Date'
    ]
    infoDf = infoDf[cols]
    return (infoDf.to_dict('records'))
Example no. 49
def make_submission():
    clfs = train_xgboost()
    df = pd.read_csv(
        '/home/kshitij/Desktop/Dataset/stage2_sample_submission.csv')
    x = np.array([
        np.mean(
            np.load('/home/kshitij/Desktop/Dataset/stage2_features/%s.npy' %
                    str(did)).reshape(-1, 2048),
            axis=0) for did in df['id'].tolist()
    ])
    preds = []

    for clf in clfs:
        preds.append(np.clip(clf.predict(x), 0.001, 1))

    pred = gmean(np.array(preds), axis=0)

    df['cancer'] = pred
    df.to_csv('submGBR2.csv', index=False)
Example no. 50
def agregate():
    y_tru = pd.read_csv('data/test.lst',
                        sep='\t',
                        header=None,
                        names=['0', 'y', 'fns'])['y']

    all = []
    for i in range(0, len(y_tru), 5000):
        fn = 'tmp/320_66_%d.npy' % i
        print(fn)
        #
        prd = gmean(np.load(fn), axis=0)
        print(prd.shape)

        all.append(prd)

    all = np.vstack(all)
    print(all.shape)
    np.save('tmp/f_avg', all)
Example no. 51
 def looking_for_optimal_f(self):
     x = np.linspace(0.01, 1.0, 199)
     f_geomeans = list()
     for f in x:
         df = self.generate_rollouts(f=f, verbose=0, plot=False)
         gm_df = df.pct_change() + 1
         gm_df = gm_df.fillna(1)
         gmean = stats.gmean(gm_df, axis=0)
         gmean = np.nan_to_num(gmean)
         geomean = np.mean(gmean)
         f_geomeans.append((f, geomean))
     df = pd.DataFrame(f_geomeans)
     df = df.set_index([0])
     print(df)
     # df.plot(x=df[0], y=df[1], kind="scatter", grid=True, legend=True)
     df.plot(grid=True, legend=True)
     opt_f = df.idxmax(axis=0)
     print(f"Optimal f is {float(opt_f)}")
     plt.show()
Example no. 52
    def _geometric(vals, weights=None):
        """
        Compute the geometric average of the elements of ``vals``.

        Parameters
        ----------
        vals: np.ndarray
            An array of values, typically representing link ratios from a
            single development period.

        weights: np.ndarray
            Not yet implemented.

        Returns
        -------
        float
        """
        arr = np.asarray(vals, dtype=float)
        return (np.nan if arr.size == 0 else stats.gmean(arr))
Example no. 53
def check():
    y_tru = pd.read_csv('data/test.lst',
                        sep='\t',
                        header=None,
                        names=['0', 'y', 'fns'])['y']

    all = []
    # for fn in glob.glob('tmp/r*npy'):
    for fn in [
            'tmp/rnx101_val_33_0.npy', 'tmp/rnx101t_val_44_0.npy',
            'tmp/rn152k_val_26_0.npy', 'tmp/rnx101t_r_val_44_0.npy'
    ]:
        prd = np.load(fn)
        all.append(prd)

        print(accuracy_score(y_tru, np.argmax(prd, axis=1)), fn)

    all = gmean(np.array(all), axis=0)
    print(accuracy_score(y_tru, np.argmax(all, axis=1)), 'gmean ensemble')
Example no. 54
    def easy_step(self, action):
        self.counter += 1
        self.update_env_r_from_r(action)

        # write to file input for OMNeT: Routing
        vector_to_file(matrix_to_omnet_v(self.env_r), self.folder + OM_ROUTING,
                       'w')
        # verify file position and format (separator, matrix/vector) np.savetxt('tmp.txt', routing, fmt='%d')

        # execute OMNeT
        omnet_wrapper(self)

        # read OMNeT's output: Delay and Lost packets
        om_output = file_to_csv(self.folder + OM_DELAY)
        self.update_env_d(csv_to_matrix(om_output, self.active_nodes))
        self.update_env_l(csv_to_lost(om_output))

        reward = rl_reward(self)

        # log everything to file
        vector_to_file([-reward], self.folder + REWARD_LOG, 'a')
        # s = rl_state(self)
        log = np.concatenate(
            ([self.counter], [self.env_l], [np.mean(matrix_to_rl(self.env_d))],
             [np.max(matrix_to_rl(self.env_d))], [
                 (np.mean(matrix_to_rl(self.env_d)) +
                  np.max(matrix_to_rl(self.env_d))) / 2
             ], [stats.gmean(matrix_to_rl(self.env_d))]))
        vector_to_file(log, self.folder + WHOLE_LOG, 'a')

        # generate traffic for next iteration
        self.update_env_t(self.t_gen.generate())

        # write to file input for OMNeT: Traffic or do nothing if static
        if self.traffic.split(':')[0] not in ('stat', 'stat_eq', 'file',
                                              'dir'):
            vector_to_file(matrix_to_omnet_v(self.env_t),
                           self.folder + OM_TRAFFIC, 'w')

        new_state = rl_state(self)

        return new_state, reward, 0
Example no. 55
def blockreduce_pyramid(input_arr,
                        block_size=(2, 2, 2),
                        func=np.max,
                        max_iters=12):
    """
    Parameters
        ----------
        input_arr: np.array
            Input array to iteratively downsample
            Default: Path("local_staging/singlecellimages/manifest.csv")
        block_size: Tuple(int)
            Block size for iterative array reduction.  All voxels in this block
            are merged via func into one voxel during the downsample.
            Default: (2, 2, 2)
        func: Callable[[np.array], float]
            Function to apply to block_size voxels to merge them into one new voxel.
            Default: np.max
        max_iters: int
            Maximum number of downsampling rounds before ending at a one voxel cell.
            Default: 12
        Returns
        -------
        result: Dict[float, np.array]
            Dictionary of reduced arrays.
            Keys are reduction fold, values the reduced array.
    """

    # how much are we downsampling per round
    fold = gmean(block_size)

    # original image
    i = 0
    pyramid = {fold**i: input_arr.copy()}

    # downsample and save to dict
    i = 1
    while (i <= max_iters) and (np.max(pyramid[fold**(i - 1)].shape) > 1):
        pyramid[fold**i] = block_reduce(pyramid[fold**(i - 1)], block_size,
                                        func)
        i += 1

    return pyramid
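A usage sketch for the function above (assuming block_reduce comes from skimage.measure and gmean from scipy.stats, as the body expects): with a (2, 2, 2) block each round halves every axis, so the keys advance by gmean((2, 2, 2)) == 2:

import numpy as np

arr = np.random.rand(16, 16, 16)
pyramid = blockreduce_pyramid(arr, block_size=(2, 2, 2), func=np.max)
print(sorted(pyramid.keys()))  # approx. [1.0, 2.0, 4.0, 8.0, 16.0]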
Example no. 56
 def read(self):
     """Reads the borehole information data from a excel file"""
     if self.filename:
         if Path(self.filename).suffix not in (".xlsx", ".xls"):
             logger.error(f"{self.filename} is not an Excel file")
             raise AssertionError(f"{self.filename} is not an Excel file")
         if config.borehole_processing.anisotropic_borehole_data:
             logger.info(
                 'Processing the file as containing anisotropic permeability values'
             )
             self.data = pd.read_excel(self.filename,
                                       header=3,
                                       sheet_name=None)
             for borehole in self.data:
                 self.data[borehole] = self.data[borehole].iloc[1:]
                 self.data[borehole].rename(columns={
                     'Permeability (m^2)': 'kx',
                     'Unnamed: 3': 'ky',
                     'Unnamed: 4': 'kz',
                 },
                                            inplace=True)
                 self.data[borehole]['kmean'] = stats.gmean(
                     np.array([
                         self.data[borehole]['kx'],
                         self.data[borehole]['ky'],
                         self.data[borehole]['kz'],
                     ]).astype(float))
         else:
             self.data = pd.read_excel(self.filename,
                                       header=3,
                                       sheet_name=None)
         _excel_file: Dict[str,
                           pd.DataFrame] = pd.read_excel(self.filename,
                                                         sheet_name=None)
     for borehole in _excel_file:
         self.boreholes_info[borehole] = {
             "x": _excel_file[borehole].iloc[0, 1],
             "y": _excel_file[borehole].iloc[0, 2],
             "z": _excel_file[borehole].iloc[0, 3],
         }
Example no. 57
def cbf_qei(gm, wm, csf, img, thresh=0.7):
    def fun1(x, xdata):
        d1 = np.exp(-(x[0]) * np.power(xdata, x[1]))
        return (d1)

    def fun2(x, xdata):
        d1 = 1 - np.exp(-(x[0]) * np.power(xdata, x[1]))
        return (d1)

    x1 = [0.054, 0.9272]
    x2 = [2.8478, 0.5196]
    x4 = [3.0126, 2.4419]
    scbf = smooth_image(nb.load(img), fwhm=5).get_fdata()
    if len(scbf.shape) > 3:
        scbf = scbf[:, :, :, 0]
    # load prob maps
    gmm = nb.load(gm).get_fdata()
    wmm = nb.load(wm).get_fdata()
    ccf = nb.load(csf).get_fdata()
    if len(gmm.shape) > 3:
        gmm = gmm[:, :, :, 0]
        wmm = wmm[:, :, :, 0]
        ccf = ccf[:, :, :, 0]
    pbcf = 2.5 * gmm + wmm  # gmm is 2.5 times wm
    msk = np.array((scbf != 0) & ~np.isnan(scbf)
                   & ~np.isnan(pbcf)).astype(int)

    gm1 = np.array(gmm > thresh)
    wm1 = np.array(wmm > thresh)
    cc1 = np.array(ccf > thresh)
    r1 = np.array([0, np.corrcoef(scbf[msk == 1], pbcf[msk == 1])[1, 0]]).max()

    V = ((np.sum(gm1) - 1) * np.var(scbf[gm1 > 0]) +
         (np.sum(wm1) - 1) * np.var(scbf[wm1 > 0]) +
         (np.sum(cc1) - 1) * np.var(scbf[cc1 > 0])) / (
             np.sum(gm1 > 0) + np.sum(wm1 > 0) + np.sum(cc1 > 0) - 3)

    negGM = np.sum(scbf[gm1] < 0) / (np.sum(gm1))
    GMCBF = np.mean(scbf[gm1])
    CV = V / np.abs(GMCBF)
    Q = [fun1(x1, CV), fun1(x2, negGM), fun2(x4, r1)]
    return gmean(Q)
Example no. 58
def summary_angular_errors(errors):
    errors = sorted(errors)

    def g(f):
        return np.percentile(errors, f * 100)

    median = g(0.5)
    mean = np.mean(errors)
    gm = gmean(errors)
    trimean = 0.25 * (g(0.25) + 2 * g(0.5) + g(0.75))
    results = {
        '25': np.mean(errors[:int(0.25 * len(errors))]),
        '75': np.mean(errors[int(0.75 * len(errors)):]),
        '95': g(0.95),
        'tri': trimean,
        'med': median,
        'mean': mean,
        'gm': gm
    }
    return results
Example no. 59
def main() -> None:
    throughputs_hase = []
    throughputs_original = []
    for i in range(args.n):
        with open(f"{args.outdir}/{args.name}_{i}.out") as file:
            benchmarks, throughput = parse(file)
            throughputs_original.append(throughput)
        with open(f"{args.outdir}/{args.name}_hase_{i}.out") as file:
            benchmarks, throughput = parse(file)
            throughputs_hase.append(throughput)

    throughputs_hase = np.array(throughputs_hase)
    throughputs_original = np.array(throughputs_original)

    ratios = aggregate(throughputs_hase) / aggregate(throughputs_original)

    for i in range(len(benchmarks)):
        print(f"{benchmarks[i]}\t{ratios[i]:.4f}")

    print("GeoMean\t" + str(gmean(ratios)))
Example no. 60
def geometric_mean_group_auc(y_true, p_pred):
    A = y_true.index.values
    unique_A = np.unique(A)

    if type(y_true) == pd.DataFrame:
        _y_true = y_true.values.flatten()
    else:
        _y_true = np.asarray(y_true).flatten()

    y_true_, y_pred_ = [], []
    for a in unique_A:
        y_true_.append(_y_true[A == a])
        y_pred_.append(p_pred[A == a])

    aucs = []
    for yt, yp in zip(y_true_, y_pred_):
        aucs.append(roc_auc_score(yt, yp))
    try:
        val = gmean(np.array(aucs))
    except:
        val = -1
    return val