Example #1
    def test_basic2(self):
        a1 = [3, 4, 5, 10, -3, -5, 6]
        a2 = [3, -6, -2, 8, 7, 4, 2, 1]
        a3 = [3., 4, 5, 10, -3, -5, -6, 7.0]
        assert_equal(stats.median(a1), 4)
        assert_equal(stats.median(a2), 2.5)
        assert_equal(stats.median(a3), 3.5)
Example #2
def descStats(data):
    """
        Compute descriptive statistics of data
    """
    dataList = list(data)
    logDataList = list(N.log10(dataList))
    desc = dict()
    if len(dataList) == 0:
        desc['mean']       = 0
        desc['median']     = 0
        desc['logMean']    = 0
        desc['logMedian']  = 0
    elif len(dataList) < 2:
        desc['mean']       = dataList[0]
        desc['median']     = dataList[0]
        desc['logMean']    = logDataList[0]
        desc['logMedian']  = logDataList[0]
    else:
        desc['mean']       = mean(dataList)
        desc['median']     = median(dataList)
        desc['logMean']    = mean(logDataList)
        desc['logMedian']  = median(logDataList)

    if len(dataList) < 3:
        desc['stdev']      = 0
        desc['sterr']      = 0
        desc['logStdev']   = 0
        desc['logSterr']   = 0
    else:
        desc['stdev']      = std(dataList)
        desc['sterr']      = stderr(dataList)
        desc['logStdev']   = std(logDataList)
        desc['logSterr']   = stderr(logDataList)
    return desc
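A minimal usage sketch, assuming N is NumPy and that mean, median, std, and stderr are the long-removed scipy.stats helpers; the stand-ins below are assumptions that make the function runnable today:

import numpy as np

# Stand-ins for the removed scipy.stats helpers (assumptions, not the originals):
mean, median = np.mean, np.median

def std(xs):
    return np.std(xs, ddof=1)  # the old scipy.stats.std was the sample (ddof=1) form

def stderr(xs):
    return std(xs) / np.sqrt(len(xs))

N = np  # descStats uses N.log10

data = [1.0, 10.0, 100.0, 1000.0]
# descStats(data) would then return, among other keys:
#   mean=277.75, median=55.0, logMean=1.5, logMedian=1.5 (log10 scale)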
Example #4
def bin2m(a, factor):
    '''
    Median instead of mean for bin2
    '''
    oldshape = a.shape
    newshape = numpy.asarray(oldshape) // factor  # integer division, so reshape gets ints
    tmpshape = (newshape[0], factor, newshape[1], factor)
    f = factor * factor
    binned = stats.median(stats.median(numpy.reshape(a, tmpshape), 1), 2)
    return binned
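A quick sketch of what bin2m computes, with numpy.median standing in for the removed scipy.stats.median (both take an axis as second argument): each factor x factor block collapses to its median.

import numpy

a = numpy.arange(16).reshape(4, 4)
tmp = a.reshape(2, 2, 2, 2)  # (newshape[0], factor, newshape[1], factor)
binned = numpy.median(numpy.median(tmp, 1), 2)
# binned == [[2.5, 4.5], [10.5, 12.5]]; e.g. the median of the block {0, 1, 4, 5} is 2.5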
Example #6
def calcola_ad(AD):
    alt_c = []
    ref_c = []
    for ad in AD:
        if ad != '.':
            ref_c += [int(ad.split(',')[0])]
            alt_c += [int(ad.split(',')[1])]

    try:
        ref = stats.median(ref_c)
        alt = stats.median(alt_c)
    except:
        ref = '.'
        alt = '.'

    return ','.join([str(ref), str(alt)])
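A usage sketch for the VCF-style AD strings, assuming stats.median here behaves like statistics.median (any median function would do):

# calcola_ad takes "ref,alt" count strings and skips '.' placeholders:
calcola_ad(['10,5', '20,7', '30,9', '.'])   # -> '20,7' (medians of [10, 20, 30] and [5, 7, 9])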
Example #7
    def gStats(self, missingValue=0.0):
        """dict of {geneID: (min,max,mean,median,std,stderr,
        Shapiro-Wilk(w,p),normaltest_chisq (D'Agostino and Pearson),...}
        """
        import scipy as S
        import scipy.stats as SS

        rv = {}
        for k, v in self.items():
            # print k,v
            va = S.array(self.gValues(k, missingValue))

            try:
                normaltest = SS.normaltest(va)
            except:
                normaltest = None
            try:
                shapiro = SS.shapiro(va)
            except:
                shapiro = None

            try:
                rv[k] = (va.min(), va.max(), va.mean(), SS.median(va), SS.std(va), SS.stderr(va), normaltest, shapiro)
            except:
                print k, va
                raise
        return rv
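Several calls above (SS.median, SS.std, SS.stderr) were removed from SciPy long ago; a hedged sketch of modern equivalents, assuming the originals were the sample (ddof=1) forms:

import numpy as np
from scipy import stats as SS

va = np.linspace(1.0, 20.0, 20)  # normaltest wants a reasonably sized sample
summary = (va.min(), va.max(), va.mean(),
           np.median(va),        # replaces SS.median(va)
           np.std(va, ddof=1),   # replaces SS.std(va)
           SS.sem(va),           # replaces SS.stderr(va)
           SS.normaltest(va), SS.shapiro(va))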
Example #8
def time_diff_analysis(all_items, test_type, item_type, log_scale=False):
    # For each series: split it into 24-sample chunks, transpose the chunks,
    # and plot the median across chunks at each position.
    for label in ['CR', 'CR+', 'RTW', 'SR', 'TI']:
        chunks = [
            all_items[item_type][test_type][label][i:i + 24]
            for i in range(0, len(all_items['Ratio']['r'][label]), 24)
        ]
        plt.plot([
            st.median([float(p) for p in item if p is not None])
            for item in itertools.zip_longest(*chunks)
        ],
                 label=label)
    plt.legend()
    plt.title(f"{test_type}-{item_type}")
    if log_scale:
        plt.yscale("log")
    plt.show()
Example #9
    def nanmedian(x):
        """Find the median over the given axis ignoring nans.

            fixme: should be fixed to work along an axis.
        """
        x = _asarray1d(x).copy()
        y = compress(isfinite(x), x)
        return median(y)
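The fixme has long since been addressed upstream: numpy.nanmedian does essentially the same thing (it ignores NaNs, though not infinities) and works along an axis:

import numpy as np

x = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, np.nan]])
np.nanmedian(x)          # 3.5 -- median of the non-nan values
np.nanmedian(x, axis=1)  # array([2. , 4.5]) -- per-row medians ignoring nans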
Example #11
def get_statistics_from_diffs(diffs):
    the_mean = st.mean(diffs)
    return {
        'min': min(diffs),
        'max': max(diffs),
        'mean': the_mean,
        'median': st.median(diffs),
        'stdev': st.stdev(diffs, the_mean),
        'q1': np.percentile(diffs, 25),
        'q3': np.percentile(diffs, 75)
    }
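Here st is Python's statistics module and np is NumPy; passing the precomputed mean as the xbar argument of st.stdev avoids recomputing it. A usage sketch:

diffs = [1.0, 2.0, 2.0, 3.0, 10.0]
get_statistics_from_diffs(diffs)
# -> {'min': 1.0, 'max': 10.0, 'mean': 3.6, 'median': 2.0,
#     'stdev': 3.647..., 'q1': 2.0, 'q3': 3.0}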
Example #12
    def _calc_basic_statistics(self):
        """This function determines the mean and the standard deviation
           of the data sample.
           Furthermore, several other simple properties are determined.
        """
        self.mean = stats.mean(self._data_samples)
        self.geom_mean = stats.geomean(self._data_samples)
        self.median = stats.median(self._data_samples)
        self.std_dev = stats.stddev(self._data_samples)

        self.min = min(self._data_samples)
        self.max = max(self._data_samples)
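The stats module used here looks like an older helper; with today's standard library the same quantities are available directly (a sketch, not the original module):

import statistics

samples = [1.0, 2.0, 4.0, 8.0]
statistics.mean(samples)            # 3.75
statistics.geometric_mean(samples)  # ~2.828, i.e. (1*2*4*8) ** 0.25 (Python 3.8+)
statistics.median(samples)          # 3.0
statistics.stdev(samples)           # sample standard deviation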
Example #13
def calcola_dp(DP):

    try:
        DP.remove('.')
    except:
        print(DP)
        return '.'
    try:
        return stats.median(DP)
    except:
        print(DP)
        return '.'
Example #14
    def __init__(self, samples):
        self.samples = numpy.asarray(samples)
        self.N = len(samples)
        self.median = stats.median(samples)
        self.min = numpy.amin(samples)
        self.max = numpy.amax(samples)
        self.mean = stats.mean(samples)
        self.std = stats.std(samples)
        self.var = self.std**2.
        self.skew = stats.skew(samples)
        self.kurtosis = stats.kurtosis(samples)
        self.range = self.max - self.min
Example #16
    def test_median_simple(self):
        self.assertEqual(2.5, stats.median([1, 2, 3, 4]))
        self.assertAlmostEqual(2.5, stats.median([1.0, 2.0, 3.0, 4.0]))

        self.assertAlmostEqual(25, stats.median(self._integers))
        self.assertAlmostEqual(25, stats.median(self._floats))
        self.assertAlmostEqual(25 + 2.31, stats.median(self._floats2))
        self.assertAlmostEqual(27.5, stats.median(self._mixed))
Example #18
def HLdistance(X1, X2):
    """
    The Hodges–Lehmann estimator is a statistical method for robust estimation.
    The principal form of this estimator is used to give an estimate of the
    difference between the values in two sets of data. If the two sets of data
    contain m and n data points respectively, m × n pairs of points (one from each set)
    can be formed and each pair gives a difference of values. The Hodges–Lehmann estimator
    for the difference is defined as the median of the m × n differences.
    """
    diffList = list()
    for x1 in X1:
        for x2 in X2:
            diffList.append(x1-x2)
    return median(diffList)
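For instance, assuming median here is statistics.median (any median function would do), the m x n = 8 pairwise differences below are [-2, -3, 2, 1, -1, -2, 5, 4], whose median is 0.0:

from statistics import median

X1 = [1, 5, 2, 8]
X2 = [3, 4]
HLdistance(X1, X2)   # -> 0.0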
Example #19
    def __init__(self, samples, name=None):
        samples = np.asarray(samples)
        self.samples = samples
        self.name = name
        self.npts = len(samples)
        self.median = stats.median(samples)
        self.min = samples.min()
        self.max = samples.max()
        self.mean = samples.mean()
        self.std = samples.std()
        self.var = samples.var()
        self.skew = stats.skew(samples)
        self.kurtosis = stats.kurtosis(samples)
        self.range = self.max - self.min
Example #21
    def test_descriptive(self):
        from econpy.pytrix.stat import Dstat1
        x = numpy.array(self.data)
        d = Dstat1(x)
        self.assertEqual(d.nobs, x.size)
        self.assertAlmostEqual(d.sum, x.sum())
        self.assertEqual(d.min, x.min())
        self.assertEqual(d.max, x.max())
        self.assertAlmostEqual(d.mean, x.mean())
        # var: measure of the spread of the data set about the mean: unbiased
        self.assertAlmostEqual(d.m2, numpy.var(x))
        self.assertAlmostEqual(d.std, numpy.std(x))
        # assertEqual(d.zscores, Sstats.zs(x))
        self.assertAlmostEqual(d.median, Sstats.median(x))
Example #23
    def test_median_vs_numpy(self):
        self.assertEqual(numpy.median([1, 2, 3, 4]), stats.median([1, 2, 3, 4]))
        self.assertAlmostEqual(numpy.median([1.0, 2.0, 3.0, 4.0]), stats.median([1.0, 2.0, 3.0, 4.0]))

        self.assertAlmostEqual(numpy.median(self._integers), stats.median(self._integers))

        self.assertAlmostEqual(numpy.median(self._floats), stats.median(self._floats))

        self.assertAlmostEqual(numpy.median(self._floats2), stats.median(self._floats2))

        self.assertAlmostEqual(numpy.median(self._mixed), stats.median(self._mixed))
Example #24
    def flux_err(self, filt, zp=None):
        """assumes unit = day for now
			not a correct calculation for large errors
			"""

        if not zp:
            zp = 3000e3  ## just choose something to get us close to mJy

        if isinstance(filt, str):
            if filt not in self.filts:
                return array([])
            else:
                ret = array([])
                for c in self.data['ts'][filt]:
                    #if c.has_key("name"):
                    #	if c['name'] == "f_err":
                    #		ret = c['val']
                    #		break
                    #	if c['name'] == "m_err":
                    #			ret = c["val"]*self.flux(filt,zp)
                    #			break
                    if c.has_key("ucd"):
                        if c['ucd'].find("flux") != -1 and c['ucd'].find(
                                "err") != -1:
                            ret = c["val"]
                            break
                        if c['ucd'].find("phot.mag") != -1 and c['ucd'].find(
                                "err") != -1:
                            ret = c["val"] * self.flux(filt, zp)
                            break

            if (ret == 0).sum() == ret.shape[0]:
                ## all zeros! figure out the scatter
                tmp = self.flux(filt, zp)
                for i in range(5):
                    med = median(tmp)
                    sigma = sqrt(((tmp - med)**2).sum() / ret.shape[0])
                    tmpi = (abs(tmp - med) < 2.5 * sigma).nonzero()[0]
                    tmp = tmp[tmpi]
                    #print med, sigma
                ret = sigma * ones(ret.shape[0])

            return ret
        return array([])
Example #27
    fin.close()
    
    return data
    
class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

if __name__ == "__main__":
    src_file = None
    try:
        try:
            opts, args = getopt.getopt(sys.argv[1:], "hs:", ["help"])
            
            for o, a in opts:
                if o in ("-h", "--help"):
                    print __doc__
                    raise Usage("-s <input file>")
                elif o == "-s":
                    src_file = a
        except getopt.error, msg:
            raise Usage(msg)
            # more code, unchanged
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, "for help use --help"
    
    data = read_data(src_file)
    mean = stats.mean(data)
    median = stats.median(data)
    
Example #28
    def test_nanmedian_none(self):
        """Check nanmedian when no values are nan."""
        m = stats.nanmedian(self.X)
        assert_approx_equal(m, stats.median(self.X))
Example #29
    def test_axis(self):
        """Regression test for #760."""
        a1 = np.array([[3, 4, 5], [10, -3, -5]])
        assert_equal(stats.median(a1), np.array([6.5, 0.5, 0.]))
        assert_equal(stats.median(a1, axis=-1), np.array([4., -3]))
Example #30
    def signif(self, files, bucktype='none'):
        """Compute signification of 3 input sets: test, system_output1, system_output2
        """
        if len(files) != 3:
            raise ValueError(
                "You must supply 3 input files for `signif` command")

        if bucktype not in ['none', 'dialog']:
            raise ValueError("Unknown `bucktype`: %r" % bucktype)

        self.logger.debug("Importing scipy")
        from scipy.stats import median, mean, tvar, tstd
        from scipy.stats.morestats import wilcoxon
        from scipy.stats.distributions import norm, t as t
        from scipy import sqrt

        forest1, forest2, forest3 = self.loadForestFiles(files)

        self.logger.info("Processing forests 1 and 2")

        diff1 = {}
        for fn, tree1, tree2, dist, script in self.forestProcessor(
                forest1, forest2):
            H, D, I, S = script.HDIS
            n_errors = D + I + S
            fn = self.filenameKey(fn, bucktype)
            diff1.setdefault(fn, 0.)
            diff1[fn] += n_errors

        self.logger.info("Processing forests 1 and 3")

        diff2 = {}
        for fn, tree1, tree2, dist, script in self.forestProcessor(
                forest1, forest3):
            H, D, I, S = script.HDIS
            n_errors = D + I + S
            fn = self.filenameKey(fn, bucktype)
            diff2.setdefault(fn, 0.)
            diff2[fn] += n_errors

        def mapsswe(x, y):
            xm = mean(x)
            ym = mean(y)
            s = 0.
            n = 0.
            for xi, yi in izip(x, y):
                s += ((xi - yi) - (xm - ym))**2
                n += 1

            t_stat = sqrt(n) * abs(xm - ym) / sqrt(s / (n - 1.))
            p_value = t.sf(t_stat, n - 1) * 2
            return t_stat, p_value

        Z_values = []
        w1 = []
        w2 = []
        for key in sorted(diff1.keys()):
            if key not in diff2:
                self.logger.error("Unmatched utterance: %r", key)
                continue
            Na = diff1.pop(key)
            Nb = diff2.pop(key)
            w1.append(Na)
            w2.append(Nb)
            Z_values.append(Na - Nb)

        Z_mean = mean(Z_values)
        Z_median = median(Z_values)
        Z_tvar = tvar(Z_values)
        Z_tstd = tstd(Z_values)

        wilcoxon_t_stat, wilcoxon_p_value = wilcoxon(w1, w2)

        mapsswe_w_stat, mapsswe_p_value = mapsswe(w1, w2)

        fw = sys.stdout
        fw.write("Z stats:\n")
        fw.write("========\n")
        fw.write("  - mean:     %9.3f\n" % Z_mean)
        fw.write("  - median:   %9.3f\n" % Z_median)
        fw.write("  - tvar:     %9.3f\n" % Z_tvar)
        fw.write("  - tstd:     %9.3f\n\n" % Z_tstd)
        fw.write("Wilcoxon test:\n")
        fw.write("==============\n")
        fw.write(
            "  - p-value:  %9.3f (two-tailed) [significant if <= 0.05]\n" %
            wilcoxon_p_value)
        fw.write("  - t-stat:   %9.3f\n\n" % wilcoxon_t_stat)
        fw.write("MAPSSWE test:\n")
        fw.write("=============\n")
        fw.write(
            "  - p-value:  %9.3f (two-tailed) [significant if <= 0.05]\n" %
            mapsswe_p_value)
        fw.write("  - t-stat:   %9.3f\n\n" % mapsswe_w_stat)
Example #31
    def _filt_run(self, dat, filt, do_sim=False, vplot=True, nrange=1):

        if self.doplot and vplot:
            errorbar(dat[0], dat[1], dat[2], fmt="o")

        new = True
        if new:
            mymodel = Model(self.fitfunc_small_te,
                            extra_args=[dat[1], dat[2], False])
        else:
            mymodel = Model(
                self.fitfunc_te)  #,extra_args=[dat[1],dat[2],False])

        # get some good guesses
        try:
            scale = trim_mean(dat[1], 0.3)
        except:
            scale = mean(dat[1])
        offset = 1.0  #trim_mean(dat[1],0.3)
        t0 = median(dat[0])
        umin = 1.0
        b = 0.0  ## trending slope
        mydata = RealData(dat[0], dat[1], sx=1.0 / (60 * 24), sy=dat[2])

        trange = list(linspace(min(dat[0]), max(dat[0]), nrange))
        maxi = (dat[1] == max(dat[1])).nonzero()[0]
        trange.extend(list(dat[0][maxi]))
        trange.extend([t0, max(dat[0]) + 10, max(dat[0]) + 100])

        final_output = None
        for t0i in trange:
            for te in 10**linspace(log10(2), log10(200), nrange):
                if new:
                    pinit = [te, umin, t0i]  # ,scale,offset,b]
                else:
                    pinit = [te, umin, t0i, scale, offset, b]

                myodr = ODR(mydata, mymodel, beta0=pinit)
                myoutput = myodr.run()
                if final_output is None:
                    final_output = myoutput
                    old_sd_beta = final_output.sd_beta
                    continue

                if trim_mean(log10(myoutput.sd_beta / final_output.sd_beta),0.0) < 0.0 and \
                 myoutput.res_var <= final_output.res_var and (myoutput.sd_beta == 0.0).sum() <= (final_output.sd_beta == 0.0).sum():
                    final_output = myoutput

        if 1:
            t = linspace(
                min(dat[0]),
                max([
                    max(dat[0]),
                    final_output.beta[2] + 6 * final_output.beta[0]
                ]), 1500)
            if new:
                tmp = self.fitfunc_small_te(final_output.beta, dat[0], dat[1],
                                            dat[2], True)
                #print tmp, "***"
                p = list(final_output.beta)
                p.extend([tmp[0], tmp[1], tmp[2]])
                y = array(self.modelfunc_small_te(p, t))
            else:
                p = final_output.beta
                y = self.fitfunc_te(final_output.beta, t)
                #print final_output.beta
            if self.doplot:
                plot(t, y)
                xlabel('Time [days]')
                ylabel('Relative Flux Density')

            if do_sim:
                for i in range(10):
                    tmp = r.multivariate_normal(myoutput.beta,
                                                myoutput.cov_beta)
                    if self.doplot:
                        plot(
                            t,
                            self.a_te(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4],
                                      tmp[5], t), "-")

        return (final_output, p, new)
Example #32
                    results[strat][lookahead][type_cov].keys()):
                covs = map(str,
                           results[strat][lookahead][type_cov][effort_frac])
                o.write('%f %s\n' % (effort_frac, ' '.join(covs)))
            o.close()

            ofile = '%s%s.%s.%s-post-stats-%d-%s-%s' % (
                folder, outname, type, coverage, lookahead, strat, type_cov)
            o = open(ofile, 'w')
            o.write(
                '# effort_fract mean stdev median min max quartile1 quartile2 quartile3\n'
            )
            for effort_frac in sorted(
                    results[strat][lookahead][type_cov].keys()):
                covs = results[strat][lookahead][type_cov][effort_frac]
                mean, stdev, median = numpy.mean(covs), numpy.std(
                    covs), stats.median(
                        covs)  #stats.stdev(covs), numpy.median(covs)
                cov_min, cov_max = min(covs), max(covs)
                #quartile1, quartile2, quartile3 = stats.lscoreatpercentile(covs,.25),stats.lscoreatpercentile(covs,.50),stats.lscoreatpercentile(covs,.75)
                quartile1, quartile2, quartile3 = stats.scoreatpercentile(
                    covs, 25), stats.scoreatpercentile(
                        covs, 50), stats.scoreatpercentile(covs, 75)
                #scoreatpercentile
                o.write('%f %f %f %f %f %f %f %f %f\n' %
                        (effort_frac, mean, stdev, median, cov_min, cov_max,
                         quartile1, quartile2, quartile3))
            o.close()

if __name__ == '__main__':
    pass
Example #35
    def test_nanmedian_some(self):
        """Check nanmedian when some values only are nan."""
        m = stats.nanmedian(self.Xsome)
        assert_approx_equal(m, stats.median(self.Xsomet))
Example #36
    def test_median(self):
        assert_equal(stats.median(self.a1), 4)
        assert_equal(stats.median(self.a2), 2.5)
        assert_equal(stats.median(self.a3), 3.5)
Example #39
    def reindex(self):
        values = self.series[-self.periods:]
        m = median(values).toscalar()
        self.append(m)
Example #40
def BaseTreeAgeStatisticTest(Data, filename):
    """
    Try for statistical test of randomness nonhomohenious tree distribution
    with account of its ages. Save the resulting statistcs and boundaries in
    txt-file, named <filename>
    
    Old. version. have to be changed
    Depricated function.
    Last revision:
         20.11.2011
            
    """
    ############################Base statistical test#######################
    import numpy as np

    RadiusI = np.linspace(30, 200, 100)
    MinAge = np.nanmin(Data[:, 2])
    MaxAge = np.nanmax(Data[:, 2])
    M = 1000
    N = np.shape(Data)[0]
    MaxMonte = 400
    alpha = 0.05
    indbound = int(np.round(MaxMonte * alpha))  # used as an array index below
    Z_mean = np.array([])
    Z_max = np.array([])
    Z_ang = np.array([])
    Z_min = np.array([])
    Z_kur = np.array([])
    Z_sk = np.array([])
    Z_med = np.array([])
    Iang = []
    Imax = []
    Imin = []
    Imean = []
    Ikur = []
    Isk = []
    Imed = []
    for r in RadiusI:
        ##########call monte-carlo function##########
        cnts, angs, corrs, maxs, mins, meds, kurs, sks = EvaluateFunct(
            MaxIteration=MaxMonte,
            LowIteration=M,
            RightBound=RightBound,
            TopBound=TopBound,
            Density=N,
            Radius=r,
            MinAge=MinAge,
            MaxAge=MaxAge,
        )
        angss = np.sort(angs)
        maxss = np.sort(maxs)
        minss = np.sort(mins)
        corrss = np.sort(corrs)
        kurss = np.sort(kurs)
        skss = np.sort(sks)
        medss = np.sort(meds)
        Iang.append([angss[-indbound], angss[indbound]])
        Imax.append([maxss[-indbound], maxss[indbound]])
        Imin.append([minss[-indbound], minss[indbound]])
        Imean.append([corrss[-indbound], corrss[indbound]])
        Ikur.append([kurss[-indbound], kurss[indbound]])
        Isk.append([skss[-indbound], skss[indbound]])
        Imed.append([medss[-indbound], medss[indbound]])

        corr = []
        phi = []
        indf = []
        index_false = 1
        for k in xrange(M):
            area = {
                "XCenter": r + np.random.rand() * (RightBound - r),
                "YCenter": r + np.random.rand() * (TopBound - r),
                "Radius": r,
            }
            [phi0, flag0, opt_val] = GetOptimalDirection(area, Data, allowed=4)
            if flag0 == True:
                corr.append(opt_val)
                phi.append(phi0)
            else:
                index_false = index_false + 1.0
                indf.append(index_false)
        Z_mean = np.append(Z_mean, np.mean(corr))
        Z_ang = np.append(Z_ang, np.mean(phi))
        Z_max = np.append(Z_max, np.nanmax(corr))
        Z_min = np.append(Z_min, np.nanmin(corr))
        Z_kur = np.append(Z_kur, st.kurtosis(corr))
        Z_sk = np.append(Z_sk, st.skew(corr))
        Z_med = np.append(Z_med, st.median(corr))
        print "Текущий радиус", r
        print "Средний угол:", Iang[-1], "значение", Z_ang[-1]
        print "Максимальная корреляция:", Imax[-1], "значение", Z_max[-1]
        print "Минимальная корреляция:", Imin[-1], "значение", Z_min[-1]
        print "Средняя корреляция:", Imean[-1], "значение", Z_mean[-1]
        print "Эксцесс:", Ikur[-1], "значение", Z_kur[-1]
        print "Асимметрия:", Isk[-1], "значение", Z_sk[-1]
        print "Mедиана:", Imed[-1], "значение", Z_med[-1]
    np.save(
        filename,
        np.array(
            [Z_mean, Z_ang, Z_max, Z_min, Z_kur, Z_sk, Z_med, Iang, Imax, Imin, Imean, Ikur, Isk, Imed], dtype=np.object
        ),
    )
Example #41
    def test_basic(self):
        data1 = [1, 3, 5, 2, 3, 1, 19, -10, 2, 4.0]
        data2 = [3, 5, 1, 10, 23, -10, 3, -2, 6, 8, 15]
        assert_almost_equal(stats.median(data1), 2.5)
        assert_almost_equal(stats.median(data2), 5)
Example #42
def clust_main():
    parent = "/home/ethan/hiv/papers/jidletter/"

    outmi = open(parent + 'sumary.mi', 'w')
    outmi.write('freq,cut,p.clu,mean.clu,med.clu,std.clu,act.pri\n')
    out3 = open(parent + 'sumary.30y', 'w')
    out3.write('freq,cut,p.clu,mean.clu,med.clu,std.clu,act.pri\n')

    inf_mi, inf_3 = {}, {}
    clu_mi, clu_3 = {}, {}

    cuts = [6]
    cuts.extend([(x + 1) * 12 for x in range(19)])

    for freq in np.linspace(0.05, 1.0, 20):
        inf_mi[freq], inf_3[freq] = [], []

    infile = open(parent + "pkl/full/" + "lin0.pkl.full", 'r')
    data = cPickle.load(infile)
    infile.close()

    c_mi = data['clu_mi']
    c_3 = data['clu_30y']

    for inst in c_mi:
        freq = inst[0]
        cut = inst[1]

        if not clu_mi.has_key(freq):
            clu_mi[freq] = {}

        if not clu_mi[freq].has_key(cut):
            clu_mi[freq][cut] = []

    for inst in c_3:
        freq = inst[0]
        cut = inst[1]

        if not clu_3.has_key(freq):
            clu_3[freq] = {}

        if not clu_3[freq].has_key(cut):
            clu_3[freq][cut] = []

    for file in os.listdir(parent + "pkl/full/"):
        print file

        infile = open(parent + "pkl/full/" + file, 'r')
        data = cPickle.load(infile)
        infile.close()

        history = data['history']
        smp_mi = data['samples_maxinc']
        smp_3 = data['samples_30y']
        c_mi = data['clu_mi']
        c_3 = data['clu_30y']

        for freq in np.linspace(0.05, 1.0, 20):
            for mi in smp_mi[freq]:
                inf_mi[freq].append(infectors_stage(history, mi))
            for th in smp_3[freq]:
                inf_3[freq].append(infectors_stage(history, th))

        for inst in c_mi:
            freq = inst[0]
            cut = inst[1]

            for i, tok in enumerate(inst):
                if i > 1:
                    clu_mi[freq][cut].append(int(tok))

        for inst in c_3:
            freq = inst[0]
            cut = inst[1]

            for i, tok in enumerate(inst):
                if i > 1:
                    clu_3[freq][cut].append(int(tok))

    for k, v in inf_mi.items():
        pcount = 0
        for tok in v:
            if tok == 'p': pcount += 1
        sk = str(k)
        sk = sk + '00000000000000'
        inf_mi[sk[0:8]] = float(pcount) / len(v)

    for k, v in inf_3.items():
        pcount = 0
        for tok in v:
            if tok == 'p': pcount += 1
        sk = str(k)
        sk = sk + '00000000000000'
        inf_3[sk[0:8]] = float(pcount) / len(v)

    for k, v in clu_mi.items():
        for k2, v2 in v.items():
            prclust = pr_clustering(v2)
            mean = stats.mean(v2)
            median = stats.median(v2)
            std = stats.tstd(v2)

            outmi.write('%s,%s,%f,%f,%f,%f,%f\n' %
                        (k, k2, prclust, mean, median, std, inf_mi[k]))

    for k, v in clu_3.items():
        for k2, v2 in v.items():
            prclust = pr_clustering(v2)
            mean = stats.mean(v2)
            median = stats.median(v2)
            std = stats.tstd(v2)

            out3.write('%s,%s,%f,%f,%f,%f,%f\n' %
                       (k, k2, prclust, mean, median, std, inf_3[k]))
Example #44
def clean_data(data):
    data.taxdelinquencyflag = data.taxdelinquencyflag.fillna('N').replace(
        ['Y', 'N'], [1, 0])

    # Drop all rows where parcelvalue is null
    data = data[data['parcelvalue'].notnull()]

    # Replace nulls in "is" columns (ones that should be either 1 or 0)
    data[['fireplace', 'tubflag']] = data[['fireplace', 'tubflag']].fillna(0)

    # Drop columns that have a very high number of null values
    data = high_null_count(data, 0.9)

    # Aircon
    # Assume all NA are non-aircon and change to 0/1. Drop the ordinal column
    data['is_aircond'] = [
        0 if (x == 5 or math.isnan(x)) else 1 for x in data['aircond']
    ]
    data = data.drop(columns='aircond', axis=1)

    # Heating
    # Assume all NA are non-heating and change to 0/1. Drop the ordinal column
    data['is_heating'] = [
        0 if (x == 13 or math.isnan(x)) else 1 for x in data['heatingtype']
    ]
    data = data.drop(columns='heatingtype', axis=1)

    # Set numbath, numfullbath and num34bath to 0 where null
    data[['numfullbath', 'num34bath',
          'numbath']] = data[['numfullbath', 'num34bath', 'numbath']].fillna(0)
    data = data.drop(columns=['num34bath', 'numfullbath'], axis=1)

    # Number of stories/pools/garage, if null then 0
    data['numstories'] = data['numstories'].fillna(1)
    data[['poolnum', 'garagenum']] = data[['poolnum', 'garagenum']].fillna(0)

    # Drop rows that have more than 75% of the data missing
    data = data.dropna(axis=0, thresh=len(data.columns) * 0.75)

    # One-hot encode the county code and drop countycode2.
    # data.countycode = data.countycode.replace([6037, 6059, 6111], ['A', 'B', 'C'])
    # Dummy county columns
    dummy_country = pd.get_dummies(data['countycode'],
                                   drop_first=True,
                                   prefix='countycode')
    data = data.merge(dummy_country, left_index=True, right_index=True)
    # Drop the original column and countycode2, which is very similar
    data = data.drop(columns=['countycode', 'countycode2'], axis=1)

    data = data[data['regioncode'].notnull()]
    data = data[data['citycode'].notnull()]

    # Drop columns where can't extrapolate data
    data = data.drop(columns='neighborhoodcode', axis=1)

    # If unitnum is null, assume only one building
    data.unitnum = data.unitnum.fillna(1)

    # Set garage area to 0 if null
    data.garagearea = data.garagearea.fillna(0)

    # No way to get the year of the building, so drop rows with null year
    data = data[data['year'].notnull()]

    # Fill lot area and finished area with median
    data['lotarea'] = data.lotarea.fillna(stats.median(data['lotarea']))
    data['finishedarea'] = data.finishedarea.fillna(
        stats.median(data['finishedarea']))

    # filling quality build column
    # print(data.corr().loc['taxyear', 'qualitybuild'])
    # data.boxplot('qualitybuild', 'taxyear')
    # plt.show()

    # Fill qualitybuild based on tax year (highest correlation)
    mask_1 = data['taxyear'] == 2016.00000
    mask_2 = data['taxyear'] == 2015.00000
    data.loc[mask_1, 'qualitybuild'] = data.loc[mask_1,
                                                'qualitybuild'].fillna(7)
    data.loc[mask_2, 'qualitybuild'] = data.loc[mask_2,
                                                'qualitybuild'].fillna(5)

    # drop string columns
    data_clean = data.drop(columns=['transactiondate', 'tubflag', 'fireplace'],
                           axis=1)

    return data_clean
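A note on the median fill above: if stats is Python's statistics module, NaNs in the column silently corrupt the result (statistics.median does not skip them); pandas' NaN-aware method is the safer idiom:

# NaN-aware equivalent of the two fill lines above (Series.median skips NaN by default):
data['lotarea'] = data['lotarea'].fillna(data['lotarea'].median())
data['finishedarea'] = data['finishedarea'].fillna(data['finishedarea'].median())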