Example no. 1
def calculateP(variables, k, data, WINDOW_LEN):
    
    freq_old = np.zeros(len(variables))
    freq = np.zeros(len(variables))

    for i in range(len(variables)):
        sample = data[k:k+WINDOW_LEN]
        freq_old[i] = sample.count(variables[i])

        sample = data[k+WINDOW_LEN : k+2*WINDOW_LEN]
        freq[i] = sample.count(variables[i])

    if (len(variables)==2):
        chi = chisquare(freq, freq_old)
        p = chi[1]
        # Tried the exact binomial goodness of fit method:
        # p = binom_test(freq, n=None, p=freq_old[0]/sum(freq_old))
        # The results were the same as Chi-square
    else:    
        if (sum(freq==0)>0 or sum(freq_old==0)>0):
            chi = chisquare(freq, freq_old)
        else:
            chi = chi2_contingency([freq,freq_old], correction=True)
        p = chi[1]
        
    return p
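A minimal usage sketch for the snippet above, assuming `data` is a plain Python list of symbols and that the SciPy functions it references are imported (the names `variables`, `k`, and `WINDOW_LEN` are the function's own parameters):

import numpy as np
from scipy.stats import chisquare, chi2_contingency

# Hypothetical two-symbol stream: the first window is mostly 'a', the second mostly 'b'.
first_window = ['a', 'b', 'a', 'a', 'b', 'a']
second_window = ['b', 'b', 'b', 'b', 'a', 'b']
data = first_window + second_window

p = calculateP(variables=['a', 'b'], k=0, data=data, WINDOW_LEN=6)
print(p)  # small p-value: the symbol distribution shifted between the two windows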
Example no. 2
def min_rotation(x):
    print x
    RotateInstrumentComponent(offset,b+"/sixteenpack",X=x[1],Y=x[2],Z=x[3],Angle=x[0],RelativeRotation=False)
    difc=CalculateDIFC(offset)
    dm_new=ma.masked_array(difc.extractY().flatten()[firstDet:lastDet+1],mask=mask_array)
    print chisquare(f_obs=odm,f_exp=dm_new)
    return chisquare(f_obs=odm,f_exp=dm_new)[0]
def min_position(x):
    print x
    MoveInstrumentComponent(offset,b,X=x[0],Y=x[1],Z=x[2],RelativePosition=False)
    difc=CalculateDIFC(offset)
    dm_new=ma.masked_array(difc.extractY().flatten()[firstDet:lastDet+1],mask=mask_array)
    print chisquare(f_obs=odm,f_exp=dm_new)
    return chisquare(f_obs=odm,f_exp=dm_new)[0]
Example no. 4
def two_c(N, nbins, mass1, mass2):
    """ 2c) make a historgram averaged over last N timesteps to show convergence
        with MB for both masses. Input is N. Returns None. Works best with ~(30,5)."""
    if twomasses:
        vend_ave = v_all[-N:]
        h, bins = np.histogram(vend_ave, bins=nbins, normed=True)
        center = (bins[:-1] + bins[1:]) / 2
        lab1 = "ave over last {} tsteps".format(N)
        pl.plot(center, h, "black", label=lab1)

        # plot MB on top
        pl.plot(v, MB_v(v, mass1), "green", label="MB(v,mass1)")
        pl.plot(v, MB_v(v, mass2), "red", label="MB(v,mass2)")

        # calculate own Chi^2
        # chi21 = (((MB_v(center, mass1) - h)**2)/MB_v(center,mass1)).mean()
        # chi22 = (((MB_v(center, mass2) - h)**2)/MB_v(center,mass2)).mean()

        # built in chisquare
        a = chisquare(h, MB_v(center, mass1))
        b = chisquare(h, MB_v(center, mass2))
        pl.text(1.5, 0.6, "chi2_m1={:0.2}\nchi2_m2={:0.2}".format(a[0], b[0]))

        # makeplot
        pl.legend()

        pl.title("2c: two masses data v MB afer {0} steps".format(numTsteps))
        fname = "2c_two_masses_{}_steps".format(numTsteps)
        pl.savefig(fname)
        return None
    else:
        print("must run with 2 masses")
  def EMR_Chi2_Test(self, data):
    xz_array = [[],[]]; yz_array = [[],[]]
    x_chi = -10; y_chi = -10; tot_chi = -10
    for track in data:
      if len(track["space_points"]) > 0:
        for sp in range(len(track["space_points"])):
          space = track["space_points"][sp]
          if not space["x_pos"] in xz_array[0] and not space["z_pos"] in xz_array[1]:
            xz_array[0].append(space["x_pos"])
            xz_array[1].append(space["z_pos"])
          if not space["y_pos"] in yz_array[0] and not space["z_pos"] in yz_array[1]:
            yz_array[0].append(space["y_pos"])
            yz_array[1].append(space["z_pos"])

    if len(xz_array[1]) > 5 and len(yz_array[1]) > 5:
      x_expect = []; y_expect = []
      x_fit = np.polyfit(xz_array[1], xz_array[0], 1, full=True)
      y_fit = np.polyfit(yz_array[1], yz_array[0], 1, full=True)
      for i in range(len(xz_array[1])):
        x_expect.append(xz_array[1][i] * x_fit[0][0] + x_fit[0][1])
      for i in range(len(yz_array[1])):
        y_expect.append(yz_array[1][i] * y_fit[0][0] + y_fit[0][1])
        # print "Expected Y Values"
        # print "y = ", y_fit[0][0], "*", yz_array[1][i], " + ", y_fit[0][1]
        # print yz_array[0][i], " / ", y_expect[i], "\n\n"
      x_chi = chisquare(xz_array[0], x_expect)
      y_chi = chisquare(yz_array[0], y_expect)
      tot_chi = math.sqrt(x_chi[0] ** 2 + y_chi[0] ** 2)
      if tot_chi < 10:
        return [True, abs(x_chi[0]), abs(y_chi[0]), tot_chi]
      else:
        return [False, abs(x_chi[0]), abs(y_chi[0]), tot_chi]
    return [False, x_chi, y_chi, tot_chi]
def check_chisquare(f_obs, f_exp, ddof, axis, expected_chi2):
    # Use this only for arrays that have no masked values.
    f_obs = np.asarray(f_obs)
    if axis is None:
        num_obs = f_obs.size
    else:
        if axis == 'no':
            use_axis = 0
        else:
            use_axis = axis
        b = np.broadcast(f_obs, f_exp)
        num_obs = b.shape[use_axis]

    if axis == 'no':
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof, axis=axis)
    assert_array_equal(chi2, expected_chi2)

    ddof = np.asarray(ddof)
    expected_p = stats.chisqprob(expected_chi2, num_obs - 1 - ddof)
    assert_array_equal(p, expected_p)

    # Also compare to stats.chisquare
    if axis == 'no':
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof)
    else:
        stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof,
                                               axis=axis)
    assert_array_almost_equal(chi2, stats_chisq)
    assert_array_almost_equal(p, stats_p)
Example no. 7
	def scoreCHISQ(self, pos_query_set, neg_query_set):
		"""
		Use the chi-square approximation to Fisher's exact test
		to calculate p-values for each query set
		"""

		# compute probability of random distribution in each
		# category by simple combinatorics
		s1 = len(self.pos_de_set)
		s2 = len(self.neg_de_set)
		norm = float(s1+s2)
		s1 = s1/norm
		s2 = s2/norm
		# expected frequencies for each set
		expected = np.array([s1, s2])

		up_AGREE = float(len(pos_query_set.intersection(self.pos_de_set)))
		up_DISAGREE = float(len(pos_query_set.intersection(self.neg_de_set)))
		observed = np.array([up_AGREE, up_DISAGREE])
		UP_chisq, UP_pval = stats.chisquare(observed, expected)
		
		down_AGREE = float(len(neg_query_set.intersection(self.neg_de_set)))
		down_DISAGREE = float(len(neg_query_set.intersection(self.pos_de_set)))
		observed = np.array([down_DISAGREE, down_AGREE])
		DOWN_chisq, DOWN_pval = stats.chisquare(observed, expected)

		combined_p = UP_pval*DOWN_pval
		return combined_p	
Example no. 8
def dm_sigma_shape(sps):
    """ Chi-squares for Gaussian and Lorentzian profiles """
    env = get_envelope(sps)
    i_xs, i_ys = interp_envelope(env)
    G_fit = fit_gauss(i_xs, i_ys)
    L_fit = fit_lorentz(i_xs, i_ys)
    e_xs, e_ys = env.dm, env.sigma
    g_ys, l_ys = G_fit(e_xs), L_fit(e_xs)
    return (chisquare(e_ys, g_ys, 2)[0], chisquare(e_ys, l_ys, 2)[0])
    def Responses(self,PlotStartTime,PlotEndTime,DoSkip=[]):
        NotSkippedGroups = [G for G in self.final_data['SortedGroupsList'] if not any(ext in G for ext in DoSkip)]
        # spio.savemat(os.path.join(self.CXOutputPath,'SpikesToPlot.mat'),GroupsSpikes)
        # else:
        #     GroupsSpikes = self.loadmat(os.path.join(self.CXOutputPath,'SpikesToPlot.mat'))

        f, axarr = plt.subplots(int(round(len(NotSkippedGroups)/2.)), 2, sharex=True, figsize=(8, 17))
        axarr_twins = [col.twinx() for row in axarr for col in row]
        axarr_twins = np.reshape(axarr_twins, (int(round(len(NotSkippedGroups)/2.)), 2))
        PSTH_bin = 0.005
        statistics = []
        for idx, Group in enumerate(NotSkippedGroups):
            plt_y_idx = idx % (int(round(len(NotSkippedGroups)/2.)))
            plt_x_idx = int(idx / (int(round(len(NotSkippedGroups)/2.))))
            Scatter_Xs = np.concatenate(self.final_data['GroupsSpikes'][Group])
            Scatter_Ys = np.concatenate(
                [(np.ones(len(TimePoint)) * (int(TimePoint_idx) + 1)).astype(int) for TimePoint_idx, TimePoint in
                 enumerate(self.final_data['GroupsSpikes'][Group])])
            axarr[plt_y_idx, plt_x_idx].scatter(Scatter_Xs, Scatter_Ys, s=1, color='0.5')
            Step_Xs = np.arange(0, self.final_data['runtime'], PSTH_bin)
            Step_Ys = np.zeros_like(Step_Xs)
            for X in np.unique(Scatter_Xs):
                bin_idx = np.where(Step_Xs == min(Step_Xs, key=lambda x: abs(x - X)))
                Step_Ys[bin_idx] += len(np.where(Scatter_Xs == X)[0])
            axarr_twins[plt_y_idx, plt_x_idx].step(Step_Xs, Step_Ys, where='mid', c='b')
            axarr[plt_y_idx, plt_x_idx].plot([self.final_data['InputTime'], self.final_data['InputTime']], [0, len(self.final_data['FileList'])], color='r', linestyle='dotted',
                                             linewidth=2)
            axarr[plt_y_idx, plt_x_idx].spines['top'].set_color('none')
            axarr_twins[plt_y_idx, plt_x_idx].spines['top'].set_color('none')
            axarr[plt_y_idx, plt_x_idx].xaxis.set_ticks_position('bottom')
            axarr_twins[plt_y_idx, plt_x_idx].xaxis.set_ticks_position('bottom')
            axarr[plt_y_idx, plt_x_idx].set_ylim(0, len(self.final_data['FileList']))
            CurrentTitle = Group[Group.index('_') + 1:].replace('_L',' Layer ').replace('toL',' to Layer ')
            axarr[plt_y_idx, plt_x_idx].set_title(CurrentTitle)
            axarr[plt_y_idx, plt_x_idx].set_xlim(PlotStartTime, PlotEndTime)
            axarr_twins[plt_y_idx, plt_x_idx].set_xlim(PlotStartTime, PlotEndTime)
            if max(Step_Ys[int(np.where(np.array(Step_Xs)>=PlotStartTime)[0][0]):int(np.where(np.array(Step_Xs)<=PlotEndTime)[0][-1])]) == 0:
                axarr_twins[plt_y_idx, plt_x_idx].set_ylim(0, 1)
            else:
                axarr_twins[plt_y_idx, plt_x_idx].set_ylim(0, max(Step_Ys[int(np.where(np.array(Step_Xs)>=PlotStartTime)[0][0]):int(np.where(np.array(Step_Xs)<=PlotEndTime)[0][-1])]) )
            y_lower,y_upper = axarr_twins[plt_y_idx, plt_x_idx].get_ylim()
            if y_upper > 5 :
                axarr_twins[plt_y_idx, plt_x_idx].yaxis.set_ticks(np.arange(int(y_lower), int(y_upper)+1, (int(y_upper) - int(y_lower)) / 5))
            else:
                axarr_twins[plt_y_idx, plt_x_idx].yaxis.set_ticks(np.arange(int(y_lower), int(y_upper)+1))
            if plt_x_idx == 0 :
                axarr[plt_y_idx, plt_x_idx].set_ylabel('Trials')
            cropped_steps = Step_Ys[int(np.where(np.array(Step_Xs) >= PlotStartTime)[0][0]):int(np.where(np.array(Step_Xs) <= PlotEndTime)[0][-1])]
            statistics.append([CurrentTitle,st.chisquare(cropped_steps)[0],st.chisquare(cropped_steps)[1]])
        axarr[-1,0].set_xlabel('time (s)')
        axarr[-1,1].set_xlabel('time (s)')
        plt.locator_params(axis='x', nbins=5)
        plt.tight_layout()
        plt.show()
        f.savefig(os.path.join(self.IllustratorOutputFolder,'cell_type_response.eps'))
        with open (os.path.join(self.IllustratorOutputFolder,'table.txt'),'w') as table_file:
            table_file.write(tabulate(statistics,headers=['Group Name','Chi-Square','p-Value']))
def chi2_from_sig_m(sig_m, err_t, sig_w):
    sig_t = np.sqrt(sig_m**2 + sig_w**2)
    mean_err_t = np.mean(err_t)
    err_t_normalized = (err_t-mean_err_t)/sig_t
    vals, bins = np.histogram(err_t_normalized, bins='sturges')
    std_norm = stats.norm(loc=0, scale=1)
    
    normal_vals = [len(err_t)*integrate.quad(std_norm.pdf, bins[i], bins[i+1])[0] for i in range(len(vals))]
    print stats.chisquare(vals, normal_vals)[0]
    print sig_t.mean()
    print sum(vals), sum(normal_vals)
    return stats.chisquare(vals, normal_vals)[0]
Example no. 11
def GainvsVolt(folderpath):
    filename = folderpath+'/pyspes.log'
    fftplt=1;
    if 'pyspes.log' in filename:
        if os.path.isfile(filename):
            gain,errgain,volt,temp = np.genfromtxt(filename,usecols=(6,7,12,13),unpack=True,dtype='float')
            volt = -1.*volt
            mygr = ROOT.TGraphErrors(len(volt),volt.flatten(),gain.flatten(),np.zeros(len(volt),dtype=float),errgain.flatten())
            mygr.Draw("AP")
            for i in range(len(errgain)):
                if errgain[i]<1:
                    errgain[i] = 1
            popt,perr = scop.curve_fit(linearfunc,volt,gain,sigma=errgain)
            dof = len(gain)-1-2
            chi,pval = scist.chisquare(gain,linearfunc(volt,popt[0],popt[1]),dof)
            plt.errorbar(volt,gain,yerr=errgain,fmt='.')
            vbd = -1.*popt[0]/popt[1]
            errvbd = np.sqrt((perr[0][0]/(popt[0]**2)+perr[1][1]/(popt[1]**2))*(vbd**2))
            normg = popt[1]*20e-15/50./1.6e-19
            errnormg = np.sqrt(perr[1][1])*20e-15/50./1.6e-19
            plt.plot(volt,linearfunc(volt,popt[0],popt[1]),'r--')
            plt.grid(True)
            plt.xlabel('Bias Voltage [V]',fontsize=16)
            plt.ylabel('Gain [adu.]',fontsize=16)
            plt.annotate("DSF\n$U_{bd}$ : "+"{0:.2f} $\pm$ {1:.2f} V\nGain: {2:.2e} $e_0$/V\nTemp: {3:.1f}$^\circ$C\n$\chi^2$/DOF : {4:.1f}/{5}".format(vbd,errvbd,normg,np.mean(temp),chi,dof),xy=(0.6,0.3),xycoords='axes fraction',fontsize=16)
            plt.xlim(np.min(volt)-0.1,np.max(volt)+0.1)
            plt.show()
        else:
            print 'pyspes.log does not exist, using FFT fit data ...'
            fftplt=1
    if (fftplt):
        filename = folderpath+'/spes.log'
        if os.path.isfile(filename):
            voltage,temperature,GainFFT,errGainFFT = np.genfromtxt(filename,usecols=(0,1,7,8),unpack=True)
            voltage = -1.*voltage
            popt,perr = scop.curve_fit(linearfunc,voltage,GainFFT,sigma=errGainFFT)
            dof = len(GainFFT)-1-2
            chi,pval = scist.chisquare(GainFFT,linearfunc(voltage,popt[0],popt[1]),dof)
            plt.errorbar(voltage,GainFFT,yerr=errGainFFT,fmt='.')
            vbd = -1.*popt[0]/popt[1]
            errvbd = np.sqrt((perr[0][0]/(popt[0]**2)+perr[1][1]/(popt[1]**2))*(vbd**2))
            normg = popt[1]*20e-15/50./1.6e-19
            errnormg = np.sqrt(perr[1][1])*20e-15/50./1.6e-19
            plt.plot(voltage,linearfunc(voltage,popt[0],popt[1]),'r--')
            plt.grid(True)
            plt.xlabel('Bias Voltage [V]',fontsize=16)
            plt.ylabel('Gain [adu.]',fontsize=16)
            plt.annotate("FFT\n$U_{bd}$ : "+"{0:.2f} $\pm$ {1:.2f} V\nGain: {2:.2e} $e_0$/V\nTemp: {3:.1f}$^\circ$C\n$\chi^2$/DOF : {4:.1f}/{5}".format(vbd,errvbd,normg,np.mean(temperature),chi,dof),xy=(0.6,0.3),xycoords='axes fraction',fontsize=16)
            plt.xlim(voltage[0]-0.1,voltage[-1]+0.1)
            plt.show()
        else:
            print "no log file found!"
Example no. 12
def statistic_test(tagging, feature_values):
    '''Compare the two sides of the split (how many of each label fall on each side).'''
    if len(frozenset(feature_values))>2:
        return 0.0,0.0 #only works for 2 values
    locs= find(feature_values==1)
    locs2= find(feature_values!=1)
    observed= array([len(find(tagging[locs]==1)),len(find(tagging[locs]!=1))])
    expected= array([len(find(tagging[locs2]==1)),len(find(tagging[locs2]!=1))])
    if any(expected==0):
        if any(observed==0):
            return inf, 0.0 #this is good for us
        return chisquare(expected, observed)
    return chisquare(observed, expected) #high stat+low p->good
Example no. 13
def test_4_14_add_poisson_noise():
	from scipy.stats import chisquare
	rows = 75
	mew = []
	dev = []
	sig = []
	for i in range(rows):
		fish = starCam.images[0].addPoissonNoise(img[i,:]) - img[0,:]
		muFish 	= np.mean(fish)
		varFish = np.var(fish)
		stdFish = np.std(fish)
		muImg   = np.mean(img)
		varImg  = np.var(img)
		mew.append(muFish)
		dev.append(stdFish)
		sig.append(varFish)
		chi = chisquare(np.array(fish))[1]

	# Chi-Squared Test (p-value must be >= 0.05 for a good Poisson fit)
	assert ( chi >= 0.05 )
	# Mean of Samples == Mean of Normal Image
	assert ( abs(np.mean(mew) - muImg) <= 0.01 )
	# Avg of Sample Std Dev == sqrt [ Avg of Sample Variances ]
	assert ( abs(np.mean(dev) - np.sqrt(np.mean(sig))) <= 1e-3 )
	# Avg of Sample Variance == Avg of Sample Mean
	assert ( abs(np.mean(sig) - np.mean(mew)) <= 0.1 )
Example no. 14
def get_patient_ChiP(patient, geneToCases, patientToGenes):
    """
    :param patient: 
    :param numGenes: 
    :param numCases: 
    :param geneToCases: 
    :param patientToGenes: 
    :return: The chi-square p-value for the sample, given the expected probabilities of the genes
    """
    patient_genes = patientToGenes[patient]
    numCases = len(patientToGenes)

    f_obs = [1. if gene in patient_genes else 0. for gene in geneToCases]


    # The expected value is the marginal probability of the gene's occurrence
    f_exp = [len(geneToCases[gene]) * 1.0 / numCases for gene in geneToCases]



    chisq, p = stats.chisquare(f_obs, f_exp)

    if p < 0.05:
        print patient
        # print "Observed: "
        # print f_obs[0:50]
        # print "Expected: "
        # print f_exp[0:50]

    return p
Example no. 15
def check_counts( obj2counts, expected, threshold=0.001, verbose=False):
    """Check some counts according to a chi-squared statistic.

    We can use this to see if sampling counts, etc. are what they should be.

    Here, obj2counts is a dictionary mapping each thing to a count. expected is a *function* that takes an
    object and hands back its expected counts (unnormalized), or a dictionary doing the same (unnormalized).

    TODO: We may want a normalized version?

    """
    objects = obj2counts.keys()
    actual_counts   = map(lambda o: float(obj2counts[o]), objects)
    N = sum(actual_counts)

    if isinstance(expected, dict):
        e = map(lambda o: expected.get(o,0.0), objects)
    else:
        assert callable(expected)
        e = map(lambda o: expected(o), objects)

    Z = float(sum(e))
    expected_counts = map(lambda o: float(o*N)/Z, e)
    chi, p = chisquare(f_obs=actual_counts, f_exp=expected_counts)

    if verbose:
        print "# Chi squared gives chi=%f, p=%f" % (chi,p)
        if p < threshold:
            assert "# *** SIGNIFICANT DEVIATION FOUND IN P"

    assert p > threshold, "*** Chi squared test fail with chi=%f, p=%f" % (chi,p)

    return True
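A hedged usage sketch (Python 2, matching the snippet's print statements), checking the counts from a hypothetical fair-coin sampler against a uniform expectation:

from scipy.stats import chisquare

counts = {'heads': 52, 'tails': 48}          # hypothetical sampled counts
assert check_counts(counts, expected={'heads': 1.0, 'tails': 1.0})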
def get_chisquare(obs_data, obs_model, nbins=3):
    '''
    Takes the actual MEP amplitude data at each time point, the predicted
    amplitudes from simulated facilitation curves, and the number of percentile
    bins to use. Calculates histograms for the experimental and predicted
    data, compares the frequencies in each bin, and runs a one-way chi-square test.
    
    Parameters
    --------------
    obs_data : array of MEP amplitudes from experimental data
    obs_model : predicted MEP amplitudes from simulated trials (i.e. facilitation curves)
    
    Returns
    ---------------
    chi-square statistic for how well the predicted data match the experimental data
    
    '''
    percentile_bins = np.linspace(0, 100, nbins + 1)    
    bin_edges = np.percentile(obs_data, list(percentile_bins))
    hist_data, bin_edges  = np.histogram(obs_data,  bins=bin_edges)
    hist_data = hist_data / float(obs_data.size) # still presents frequencies proportional to number of observations? - check with Angus 
    # put in density so that value for each bin is expressed as proportion of total number of observations
    hist_model, bin_edges = np.histogram(obs_model, bins=bin_edges)
    hist_model = hist_model / float(obs_model.size)
    return stats.chisquare(hist_data, hist_model)
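A minimal sketch of calling the function above on synthetic data; the normal draws are purely hypothetical, and the model values are clipped into the data range so that both normalized histograms sum to one:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
obs_data = rng.normal(1.0, 0.2, 200)                   # hypothetical measured MEP amplitudes
obs_model = np.clip(rng.normal(1.0, 0.2, 200),         # hypothetical simulated amplitudes,
                    obs_data.min(), obs_data.max())    # kept inside the data range
print(get_chisquare(obs_data, obs_model, nbins=3))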
def test_adjust_bad_positions():
    pages_positions = {
        0: [8, 28, 33, 38],
        1: [10, 30, 35, 40],
        2: [10, 30, 35, 40],
        3: [0, 20, 25, 32],
        4: [3, 21, 25, 31],
        5: [3, 21, 25, 31],
    }

    mean_widths = np.diff([np.mean(pos) for pos in zip(*pages_positions.values())])

    pages_positions.update({
        6: [3, 21, 20, 31],       # bad: neg. width
        7: [3, 21, 25, 28, 31],   # bad: too many positions
        8: [3, 21, 25, 70],       # bad: invalid last position
    })

    alpha = 0.05
    adj_positions = adjust_bad_positions(pages_positions, pos_check_signif_level=alpha)

    assert pages_positions.keys() == adj_positions.keys()

    for p_num in pages_positions.keys():
        orig = pages_positions[p_num]
        adj = adj_positions[p_num]

        assert len(adj) == 4
        assert adj[0] == orig[0]

        adj_widths = np.diff(adj)
        _, p_val = chisquare(adj_widths, mean_widths)
        assert p_val >= alpha
Example no. 18
def transnational_distribution(node):
    """Check the graph distribution of the node in question to make sure
    that it qualifies as transantional, we don't want it too heavily
    skewed towards one other time_zone/country
    
    :param node: The node to test to see if it's graph is transnational
    :returns: If the node has a transnationally distributed graph
    :rtype: boolean
    
    """
    time_zone_list = []
    for time_zone in [n.time_zone for n in node.friends]:
        if (time_zone is not None):
            time_zone_list.append(time_zone)
    
    # if they don't have any friends with time zones, they cannot be quantified
    if(len(time_zone_list) < 1):
        return False
    
    # collect the top 3 time_zones in their network
    counts = [c[1] for c in Counter(time_zone_list).most_common(3)]
    # chisquare returns a (statistic, pvalue) tuple (a Power_divergenceResult)
    cs = chisquare(counts)
    if (cs[0] < 5 and cs[1] > 0.25):
        return True
    else:
        return False
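A small sketch of exercising the function above with stand-in objects; `Friend` and `Node` are hypothetical namedtuples that mimic the expected `node.friends[i].time_zone` interface, and `Counter`/`chisquare` are assumed to be imported where the function is defined:

from collections import Counter, namedtuple
from scipy.stats import chisquare

Friend = namedtuple('Friend', 'time_zone')
Node = namedtuple('Node', 'friends')

node = Node(friends=[Friend('UTC'), Friend('CET'), Friend('EST'),
                     Friend('CET'), Friend('EST'), Friend(None)])
print(transnational_distribution(node))  # True: top time zones are roughly balanced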
Example no. 19
def ejercicio5b():
    val1 = [6,1,1,2,6]
    valores = [6,7,3,4,7,3,7,2,6,3,7,8,2,1,3,5,8,7]
    p = estimarPbinomial(valores,8)
    val2 = [st.binom.cdf(x,8,p)*18 for x in val1]
    print " chi (python)",st.chisquare(val1, f_exp=[2.37,3.49,4.5,3.62,2.01],ddof=1)
    print " chi2", 1 - st.chi2.cdf(tabla2(),3)
Example no. 20
    def check_initializer_statistics(self, xp, n):
        from scipy import stats

        ws = xp.empty((n,) + self.shape, dtype=self.dtype)
        for i in range(n):
            initializer = self.target(**self.target_kwargs)
            initializer(xp.squeeze(ws[i:i+1], axis=0))

        expected_scale = self.scale or 1.1
        sampless = cuda.to_cpu(ws.reshape(n, -1).T)
        alpha = 0.01 / len(sampless)

        ab = 0.5 * (self.dim_in - 1)

        for samples in sampless:
            if self.dim_in == 1:
                numpy.testing.assert_allclose(abs(samples), expected_scale)
                _, p = stats.chisquare((numpy.sign(samples) + 1) // 2)
            else:
                _, p = stats.kstest(
                    samples,
                    stats.beta(
                        ab, ab,
                        loc=-expected_scale,
                        scale=2*expected_scale
                    ).cdf
                )
            assert p >= alpha
Example no. 21
def pymc3_random_discrete(dist, paramdomains,
                          valuedomain=Domain([0]), ref_rand=None,
                          size=100000, alpha=0.05, fails=20):
    model = build_model(dist, valuedomain, paramdomains)
    domains = paramdomains.copy()
    for pt in product(domains, n_samples=100):
        pt = pm.Point(pt, model=model)
        p = alpha
        # Allow the Chisq test to fail (i.e., the samples to differ)
        # a certain number of times.
        f = fails
        while p <= alpha and f > 0:
            o = model.named_vars['value'].random(size=size, point=pt)
            e = ref_rand(size=size, **pt)
            o = np.atleast_1d(o).flatten()
            e = np.atleast_1d(e).flatten()
            observed = dict(zip(*np.unique(o, return_counts=True)))
            expected = dict(zip(*np.unique(e, return_counts=True)))
            for e in expected.keys():
                expected[e] = (observed.get(e, 0), expected[e])
            k = np.array([v for v in expected.values()])
            if np.all(k[:, 0] == k[:, 1]):
                p = 1.
            else:
                _chi, p = st.chisquare(k[:, 0], k[:, 1])
            f -= 1
        assert p > alpha, str(pt)
Example no. 22
    def test_RegenerationProposal(self):
        from LOTlib.Inference.Proposals.RegenerationProposal import RegenerationProposal
        rp = RegenerationProposal(self.grammar)

        for tree in self.trees:
            cnt = Counter()
            for _ in xrange(NSAMPLES):
                p, fb = rp.propose_tree(tree)
                cnt[p] += 1

                # Check the proposal
                self.check_tree(p)

            ## check that the proposals are what they should be -- rp.lp_propose is correct!
            obsc = [cnt[t] for t in self.trees]
            expc = [exp(self.grammar.log_probability(t))*sum(obsc) for t in self.trees]
            csq, pv = chisquare([cnt[t] for t in self.trees],
                                [exp(rp.lp_propose(tree, x))*NSAMPLES for x in self.trees])

            # Look at some
            # print ">>>>>>>>>>>", tree
            # for p in self.trees:
            #     print "||||||||||", p
            #     v = rp.lp_propose(tree,p)
            #     print "V=",v

            for c, e, tt in zip([cnt[t] for t in self.trees],
                               [exp(rp.lp_propose(tree, x))*NSAMPLES for x in self.trees],
                               self.trees):
                print c, e, tt, rp.lp_propose(tree,tt)

            self.assertGreater(pv, 0.001, msg="Sampler failed chi squared!")
Example no. 23
def serial_test(sequence):
    """
    The serial test checks for randomness by looking at transitions from one digit to the next.
    A low p-value in the returned dict indicates the strength with which the null hypothesis of a
    random sequence may be rejected.

    http://books.google.com/books?id=EIbxfCGfzgcC&lpg=PA141&ots=o-8ymmqbs9&pg=PA142#v=onepage&q=&f=false

    :param sequence: any iterable with at most 2 values that can be turned
                     into an integer via int() . e.g.
                     '1001001'
                     [1, 0, 1, 0, 1]
    :rtype: returns dict of {'chi': <chisquare value>, 'p': <p-value of said chisquare>}

    >>> serial_test('101010101111000')
    {'chi': 1.4285714285714286, 'p': 0.69885130769248427}

    >>> serial_test('110000000000000111111111111')
    {'chi': 18.615384615384617, 'p': 0.00032831021826061683}

    """
    #if isinstance(sequence, basestring): sequence = map(int, sequence)
    pairwise = izip(sequence[1:], sequence[:-1])
    d = collections.defaultdict(int)
    for k in pairwise: d[k] += 1
    # order doesn't matter because the expected values are all the same.
    obs = np.array(d.values())
    exp = np.ones_like(obs) * obs.mean()

    chi, pval =  chisquare(obs, exp)
    return {'chi': chi, 'p': pval}
Example no. 24
    def do_chi_square(self, catalog_file, alpha, seconds):
        """
        Receives a catalog file, a significance level and a number of seconds.
        Does a hypothesis test to detect whether the catalog is Poissonian or not, under the significance level:
            H0 -> catalog is Poissonian
            H1 -> catalog is not Poissonian
        Prints the p-value and the significance level.
        """
        # get observed frequencies
        catalog = Catalog()
        observed_frequencies = catalog.get_observed_frequencies(catalog_file, seconds)
        
        # print if the observed frequencies are too low
        for x in observed_frequencies:
            if x < 5:
                print("Warning: the number of occurrences appear to be too low!")
                break

        # get the number of restrictions 
        restrictions = 1 # 1 restriction, since lambda - rate of occurrence - is estimated from the parameters

        # perform chi square test
        result = chisquare(observed_frequencies, ddof=restrictions) 

        # get the p_value 
        p_value = result[1]

        # print results showing the p value and the significance level 
        print("the p_value was: ", p_value)
        print("the significance level was: ", alpha)
def run(args):
    col_num = get_col_num(args.c)
    file_iter =  (l.rstrip("\r\n").split("\t")
                  for l in open(args.file) if l[0] != "#")

    pvals = np.array([float(b[col_num]) for b in file_iter])
    kwargs = {"bins": args.n} if args.n else {}
    hist, bins = np.histogram(pvals, normed=True, **kwargs)
    xlabels = "|".join("%.2f-%.2f" % b for b in pairwise(bins))
    print "#", chart(hist, xlabels)
    hist, bins = np.histogram(pvals, normed=False, **kwargs)

    print "# median: %.3f mean:%.3f; std: %.3f min:%.3f; max:%.3f" % (
        np.median(pvals), pvals.mean(), pvals.std(), pvals.min(), pvals.max())

    try:
        from scipy.stats import chisquare
        chisq, p = chisquare(hist)
        print "#chi-square test of uniformity. p: %.3g " \
              "(low value means reject null of uniformity)" % p
    except ImportError:
        pass
    print "#bin_start\tbin_end\tn"
    for bin, val in zip(pairwise(bins), hist):
        print "%.2f\t%.2f\t%i" % (bin[0], bin[1], val)
Example no. 26
File: util.py Project: cauyrd/fluff
def mirror_clusters(data, labels, cutoff=0.01):
    """
    Merge mirrored profiles based on a chi2 test of the mean profiles,
    but only if the profile is mirrored over all data tracks.
    Returns the labels of the two matched mirrored profiles if there is at least one match with a p-value
    greater than the cutoff.
    If not, returns (None, None).
    """
    n = len(set(labels))
    if n == 1:
        return (None, None)
    mirror = dict([(i, {}) for i in range(n)])
    for track in data.keys():
        profiles = []
        for i in range(n):
            profiles.append(numpy.mean(data[track][labels == i], 0) + 1e-10)
        for i in range(n - 1):
            for j in range(i + 1, n):
                p = chisquare(profiles[i], profiles[j][::-1])[1]
                mirror[i].setdefault(j, []).append(p)
    result = []
    for i in mirror.keys():
        for j in mirror[i].keys():
            result.append([(i, j), mirror[i][j]])
    for (i, j), ps in sorted(result, cmp=lambda a, b: cmp(numpy.mean(a[1]), numpy.mean(b[1])))[::-1]:
        # print (i,j), ps, numpy.array(ps), cutoff
        if (numpy.array(ps) >= cutoff).all():
            return (i, j)
    return (None, None)
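A toy call of the function above (Python 2, since it sorts with `cmp`), assuming `numpy` and `scipy.stats.chisquare` are imported as in the original module; cluster 1 is an exact mirror of cluster 0, so the pair is returned:

import numpy
from scipy.stats import chisquare

data = {'track1': numpy.array([[1., 2., 3., 4.],
                               [1., 2., 3., 4.],
                               [4., 3., 2., 1.],
                               [4., 3., 2., 1.]])}
labels = numpy.array([0, 0, 1, 1])
print(mirror_clusters(data, labels))  # -> (0, 1)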
Example no. 27
def fe2_after_hbeta(wave, flux, error):
    fig = plt.figure()
    plt.plot(wave, flux)
    # (FeII, FeII), (FeII, FeII), FeII
    hbeta_complex_fit_func = models.Gaussian1D(5.0, 5169.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5150, 5180]}) + \
            models.Gaussian1D(5.0, 5197.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5180, 5210]}) + \
            models.Gaussian1D(2.0, 5234.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5220, 5250]}) + \
            models.Gaussian1D(2.0, 5276.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5260, 5300]}) + \
            models.Gaussian1D(5.0, 5316.0, 2.0, bounds = {"amplitude": [0, 10.0], "mean": [5300, 5325]}) + \
            models.Linear1D((flux[0] - flux[-1])/(wave[0]-wave[-1]), (-flux[0] * wave[-1] + flux[-1] * wave[0])/(wave[0]-wave[-1]))
    fitter = fitting.LevMarLSQFitter()
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        try:
            fit = fitter(hbeta_complex_fit_func, wave, flux, weights= error, maxiter = 10000)
        except Warning:
            expected = np.array(fit(wave))
            plt.plot(wave, expected)
            cont = models.Linear1D(fit.parameters[15], fit.parameters[16])
            plt.plot(wave, cont(wave))
            fig.savefig("aft-failed.jpg")
            plt.close()
            raise SpectraException("Line Fe2 after Hbeta fit failed")
    expected = np.array(fit(wave))
    plt.plot(wave, expected)
    cont = models.Linear1D(fit.parameters[15], fit.parameters[16])
    plt.plot(wave, cont(wave))
    fig.savefig("aft.jpg")
    plt.close()
    rcs = chisquare(flux, expected)[0] / np.abs(len(flux) - 17)
    if rcs > 10.0:
        plt.close()
        raise SpectraException("Line Fe2 after Hbeta reduced chi-square too large" + str(rcs))
    return fit.parameters
Example no. 28
def chi_squared_test(obs, exp):
    """
    :param obs: observation sequences
    :param exp: expected sequences
    :return: P-value
    """
    return chisquare(obs, exp)[1]
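A quick usage sketch with hypothetical observed and expected counts (equal totals, as `chisquare` expects):

from scipy.stats import chisquare

obs = [18, 22, 21, 19]
exp = [20, 20, 20, 20]
print(chi_squared_test(obs, exp))  # large p-value: no evidence of deviation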
Example no. 29
def calDMcurve(data2d, ddms, freqs, period):
    chisqs = []
    for i,ddm in enumerate(ddms):
        deltaphases = ddm * 4.15e3 * 1. / freqs**2 / period
        data = np.array([rotate(data2d[j,:], dp) for j,dp in enumerate(deltaphases)])
        chisqs.append(stats.chisquare(data.sum(0))[0])
    return np.array(chisqs)
Example no. 30
 def compute_score(attr):
     if attr is group_var:
         return 3
     if attr.is_continuous:
         # One-way ANOVA
         col = data.get_column_view(attr)[0].astype(float)
         groups = (col[group_col == i] for i in range(n_groups))
         groups = (col[~np.isnan(col)] for col in groups)
         groups = [group for group in groups if len(group)]
         p = f_oneway(*groups)[1] if len(groups) > 1 else 2
     else:
         # Chi-square with the given distribution into groups
         # (see degrees of freedom in computation of the p-value)
         if not attr.values or not group_var.values:
             return 2
         observed = np.array(
             contingency.get_contingency(data, group_var, attr))
         observed = observed[observed.sum(axis=1) != 0, :]
         observed = observed[:, observed.sum(axis=0) != 0]
         if min(observed.shape) < 2:
             return 2
         expected = \
             np.outer(observed.sum(axis=1), observed.sum(axis=0)) / \
             np.sum(observed)
         p = chisquare(observed.ravel(), f_exp=expected.ravel(),
                       ddof=n_groups - 1)[1]
     if math.isnan(p):
         return 2
     return p
    male_count = len(sequence[sequence == 0])
    female_count = len(sequence[sequence == 1])
    male_diff = (male_count - 150)**2 / 150
    female_diff = (female_count - 150)**2 / 150
    chi_squared = male_diff + female_diff
    chi_squared_values.append(chi_squared)

plt.hist(chi_squared_values)

## 9. Increasing degrees of freedom ##

diffs = []
observed = [27816, 3124, 1039, 311, 271]
expected = [26146.5, 3939.9, 944.3, 260.5, 1269.8]

for i, obs in enumerate(observed):
    exp = expected[i]
    diff = (obs - exp)**2 / exp
    diffs.append(diff)

race_chisq = sum(diffs)

## 10. Using SciPy ##

from scipy.stats import chisquare
import numpy as np
observed = np.array([27816, 3124, 1039, 311, 271])
expected = np.array([26146.5, 3939.9, 944.3, 260.5, 1269.8])

chisquare_value, race_pvalue = chisquare(observed, expected)
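As a quick cross-check (not part of the original lesson), the hand-summed statistic from step 9 should agree with SciPy's value:

print(race_chisq, chisquare_value)  # both are approximately 1080.5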
def plot_bkg_templates(fnames_to_run):
    """
    Runs LOWESS smoothing algorithm ntoys times and finds 1 and 2 sigma bands for interpolation
    """

    for bkg_file in fnames_to_run:
        hdict = load(bkg_file)
        jmult = "3Jets" if "3Jets" in os.path.basename(bkg_file) else "4PJets"
        for tname, orig_template in hdict[args.lepton].items():

            proc = tname.split(
                "_")[0] if not "data_obs" in tname else "data_obs"
            sys = sorted(filter(None, tname.split(f"{proc}_")))[0]

            if proc == "BKG": continue
            #if sys not in ["hdampUP", "hdampDOWN", "mtop1665", "mtop1695", "mtop1715", "mtop1735", "mtop1755", "mtop1785", "ueUP", "ueDOWN"]: continue
            if sys == "nosys": continue
            print(args.lepton, jmult, sys, proc)

            nosys_hist = hdict[args.lepton][f"{proc}_nosys"].copy()
            orig_smooth_hist = Plotter.smoothing_mttbins(
                nosys=nosys_hist,
                systematic=orig_template,
                mtt_centers=mtt_centers,
                nbinsx=nbinsx,
                nbinsy=nbinsy)

            x_lims = (0, nosys_hist.dense_axes()[0].centers().size)

            # get vals and errors of systematic variation
            sys_histo_vals, sys_histo_sumw2 = orig_template.values(
                sumw2=True)[()]
            sys_histo_errs = np.sqrt(sys_histo_sumw2)

            # make toys based on Gaussian distribution of mu=bin_val, sigma=bin_error
            toy_arrays = np.zeros((nbins, ntoys))
            for idx in range(nbins):
                toy_arrays[idx] = np.random.normal(sys_histo_vals[idx],
                                                   sys_histo_errs[idx],
                                                   size=ntoys)

                # get smoothed relative deviation distributions from toys
            smoothed_rel_dev_arrays = np.zeros((ntoys, nbins))
            chi2_pvals = np.zeros((ntoys, 2))
            for idx in range(ntoys):
                smoothed_array = Plotter.smoothing_mttbins(
                    nosys=nosys_hist,
                    systematic=(toy_arrays.T)[idx],
                    mtt_centers=mtt_centers,
                    nbinsx=nbinsx,
                    nbinsy=nbinsy)
                chi2_pval = chisquare(
                    f_obs=smoothed_array, f_exp=orig_smooth_hist.values()[()]
                )  # convert to expected yields so inputs are greater than 5
                chi2_pvals[idx] = np.array(
                    [chi2_pval.statistic, chi2_pval.pvalue])
                smoothed_rel_dev_arrays[idx] = (
                    smoothed_array -
                    nosys_hist.values()[()]) / nosys_hist.values()[()]

                ## find 68% and 95% intervals
            plus_one_sigma_smooth_vals, minus_one_sigma_smooth_vals = np.zeros(
                nbins), np.zeros(nbins)
            plus_two_sigma_smooth_vals, minus_two_sigma_smooth_vals = np.zeros(
                nbins), np.zeros(nbins)
            for bin in range(nbins):
                plus_one_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[plus_one_sigma_ind]
                minus_one_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[minus_one_sigma_ind]
                plus_two_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[plus_two_sigma_ind]
                minus_two_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[minus_two_sigma_ind]

            # plot relative deviation
            fig, ax = plt.subplots()
            fig.subplots_adjust(hspace=.07)

            # original relative deviations
            orig_masked_vals, orig_masked_bins = Plotter.get_ratio_arrays(
                num_vals=orig_template.values()[()] - nosys_hist.values()[()],
                denom_vals=nosys_hist.values()[()],
                input_bins=nosys_hist.dense_axes()[0].edges())
            ax.step(orig_masked_bins,
                    orig_masked_vals,
                    where="post",
                    **{
                        "color": "k",
                        "linestyle": "-",
                        "label": "Original"
                    })
            # original smoothing relative deviations
            orig_smoothed_masked_vals, orig_smoothed_masked_bins = Plotter.get_ratio_arrays(
                num_vals=orig_smooth_hist.values()[()] -
                nosys_hist.values()[()],
                denom_vals=nosys_hist.values()[()],
                input_bins=nosys_hist.dense_axes()[0].edges())
            ax.step(orig_smoothed_masked_bins,
                    orig_smoothed_masked_vals,
                    where="post",
                    **{
                        "color": "r",
                        "linestyle": "-",
                        "label": "Original Smoothing"
                    })
            # plot 68 and 95% intervals for yields
            ax.fill_between(nosys_hist.dense_axes()[0].edges(),
                            np.r_[minus_one_sigma_smooth_vals,
                                  minus_one_sigma_smooth_vals[-1]],
                            np.r_[plus_one_sigma_smooth_vals,
                                  plus_one_sigma_smooth_vals[-1]],
                            where=np.r_[plus_one_sigma_smooth_vals,
                                        plus_one_sigma_smooth_vals[-1]] >
                            np.r_[minus_one_sigma_smooth_vals,
                                  minus_one_sigma_smooth_vals[-1]],
                            step="post",
                            **{
                                "label": "68%",
                                "facecolor": "#00cc00",
                                "alpha": 0.5
                            })
            ax.fill_between(nosys_hist.dense_axes()[0].edges(),
                            np.r_[minus_two_sigma_smooth_vals,
                                  minus_two_sigma_smooth_vals[-1]],
                            np.r_[plus_two_sigma_smooth_vals,
                                  plus_two_sigma_smooth_vals[-1]],
                            where=np.r_[plus_two_sigma_smooth_vals,
                                        plus_two_sigma_smooth_vals[-1]] >
                            np.r_[minus_two_sigma_smooth_vals,
                                  minus_two_sigma_smooth_vals[-1]],
                            step="post",
                            **{
                                "label": "95%",
                                "facecolor": "#ffcc00",
                                "alpha": 0.5
                            })

            ax.legend(loc="upper right", title=f"{sys}, {proc}")
            ax.axhline(
                0, **{
                    "linestyle": "--",
                    "color": (0, 0, 0, 0.5),
                    "linewidth": 1
                })
            ax.autoscale()
            ax.set_ylim(ax.get_ylim()[0], ax.get_ylim()[1] * 1.15)
            ax.set_xlim(x_lims)
            ax.set_xlabel(
                "$m_{t\\bar{t}}$ $\otimes$ |cos($\\theta^{*}_{t_{l}}$)|")
            ax.set_ylabel("Rel. Deviaton from Nominal")

            # add lepton/jet multiplicity label
            ax.text(0.02,
                    0.94,
                    f"{leptypes[args.lepton]}, {jet_mults[jmult]}",
                    fontsize=rcParams["font.size"] * 0.9,
                    horizontalalignment="left",
                    verticalalignment="bottom",
                    transform=ax.transAxes)
            ## draw vertical lines for distinguishing different ctstar bins
            vlines = [x_lims[1] * ybin / 5 for ybin in range(1, 5)]
            for vline in vlines:
                ax.axvline(vline, color="k", linestyle="--")
            hep.cms.label(ax=ax,
                          data=False,
                          paper=False,
                          year=args.year,
                          lumi=round(data_lumi_year[f"{args.lepton}s"] / 1000.,
                                     1))

            #set_trace()
            pltdir = os.path.join(outdir, args.lepton, jmult, sys)
            if not os.path.isdir(pltdir):
                os.makedirs(pltdir)

            figname = os.path.join(
                pltdir, "_".join([
                    jmult, args.lepton, sys, proc,
                    "SmoothingConfidenceIntervals"
                ]))
            fig.savefig(figname)
            print(f"{figname} written")
            plt.close()
Example no. 33
def pearson_chisquare(dist, N):
    """ 对于 8 (9-1) 个自由度,5% significant level,critical value of chi-square is  15.507
        这里返回 critical value - chi_square statistics
        如果返回值大于 0,则通过测试,95% 满足 benford's law,且越大越好;否则,未通过测试
    """
    return 15.507 - chisquare(dist, ideal_distribution)[0] * N
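A hedged usage sketch: `ideal_distribution` is assumed here to hold the nine Benford first-digit probabilities (its definition is not shown in the snippet), and `dist` is a hypothetical observed first-digit distribution:

import numpy as np
from scipy.stats import chisquare

ideal_distribution = np.log10(1 + 1.0 / np.arange(1, 10))  # assumed Benford probabilities
dist = np.array([0.29, 0.18, 0.13, 0.10, 0.08, 0.07, 0.06, 0.05, 0.04])
print(pearson_chisquare(dist, N=1000))  # > 0 means the Benford test passes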
Example no. 34
def run_socnet_model(x, y, ct, g_id, cur, tag, ylabel, data_consolidated,
                     model_consolidated, curdate, text):
    import fitrs3
    from scipy.stats import chisquare
    rname = ct.replace('_', ' ')
    fdata = f'gpdata/dat/{ct}-{g_id}.dat'
    fgplot = f'gpdata/{ct}-{g_id}.gp'
    fsvg = f'svg/{ct}-{g_id}.svg'
    freport = f'report/{ct}-{g_id}.html'

    file1 = f'scnlog/{ct}-p1.dat'
    file2 = f'scnlog/{ct}-p2.dat'

    partition = len(y) // 4

    forecast = fitrs3.previsaoredeslp(y, 7, 200, 400, 100, file1, file2,
                                      partition, y[-1] + 50, y[-1] * 20, 4, 6,
                                      0.2, 0.7, 0, 101)

    if forecast is None:
        data_consolidated.append('n.a.')
        data_consolidated.append('n.a.')

        dump_xy_dat(fdata, x, y)
        dump_svg(fgplot,
                 fsvg,
                 f'{text} for {rname} on {curdate}',
                 'Days from the first infected',
                 f'{ylabel}',
                 fdata,
                 2,
                 f"{rname} data",
                 opt='colorsequence podo',
                 txt1='NO FIT AVAILABLE FOR THE CURRENT DATA',
                 point=True)
    else:
        chisqr = chisquare(y, f_exp=forecast[:len(y)])[0]
        data_consolidated.append(chisqr)
        model_consolidated.append('socnet-fitrs3')
        nx = x if forecast is None else np.arange(len(forecast))
        dump_xyz_dat(fdata, nx, y, forecast)
        dump_svg2D(fgplot,
                   fsvg,
                   f'{text} for {rname} on {curdate}',
                   'Days from the first infected',
                   f'{ylabel}',
                   fdata,
                   2,
                   3,
                   f"{rname} data",
                   f'{text}',
                   opt='yrange [0<*:]',
                   txt1=f'SOCNET',
                   txt2=f'χ² = {chisqr:9.2}')

    if forecast is not None:
        with open(freport, 'w') as f:
            table_info = f'<tr><td>Success status</td><td>Forecast calculated with socnet-fitrs3</td></tr>'
            table_info += f'<tr><td>Abort status</td><td>n.a.</td></tr>'
            table_info += f'<tr><td>Fit message</td><td>n.a.</td></tr>'
            table_stat = '<tr> <td>n.a</td></tr>'
            table_obs = f'<tr><th>Days from the first infected</th><th>{ylabel}</th><th>Model {ylabel}</th></tr>'
            if forecast != None:
                for i, j, k in itertools.zip_longest(nx,
                                                     y,
                                                     forecast,
                                                     fillvalue='nan'):
                    table_obs += f'<tr><td>{i}</td><td>{j}</td><td>{k:.0f}</td></tr>'
            else:
                for i, j in itertools.zip_longest(x, y, fillvalue='nan'):
                    table_obs += f'<tr><td>{i}</td><td>{j}</td><td>n.a.</td></tr>'
            f.write(param_page(rname, table_info, table_stat, table_obs, fsvg))
    return
Example no. 35
def get_top_labels():
    gender_dict = _get_genders()
    print(gender_dict)

    f_counts = {}
    m_counts = {}
    with open('mc_data_replicated.tsv', 'rt') as in_file:
        tsv_reader = csv.reader(in_file, delimiter='\t')
        next(tsv_reader)  # skip the first row, which has headings
        for row in tsv_reader:
            image = unidecode.unidecode(row[0])
            labels = row[1].split(",")
            gender = gender_dict[image]
            for label in labels:
                label = label.strip()
                if gender == "Male":
                    if label in m_counts:
                        m_counts[label] += 1
                    else:
                        m_counts[label] = 1
                else:
                    if label in f_counts:
                        f_counts[label] += 1
                    else:
                        f_counts[label] = 1

    ordered_f = [f for f in f_counts.items()
                 if f[1] >= 5]  # Only consider labels used at least 5 times
    ordered_m = [m for m in m_counts.items()
                 if m[1] >= 5]  # (as done in the paper)

    total_f = sum(value == "Female" for value in gender_dict.values())
    total_m = len(gender_dict) - total_f
    top_f = [(label, i / total_f * 100) for label, i in ordered_f]
    top_m = [(label, i / total_m * 100) for label, i in ordered_m]

    # Get occurrences of gender A's top labels in gender B
    for index, label in enumerate(top_f):
        m_prob = 0
        if label[0] in m_counts:
            m_prob = m_counts[label[0]] / total_m * 100

        chi2, p = chisquare([label[1], m_prob])
        top_f[index] = [label[0], label[1], m_prob, chi2]

    for index, label in enumerate(top_m):
        f_prob = 0
        if label[0] in f_counts:
            f_prob = f_counts[label[0]] / total_f * 100

        chi2, p = chisquare([label[1], f_prob])
        top_m[index] = [label[0], label[1], f_prob, chi2]

    # Get the top 25 labels by chi2 where occurrence is higher than expected for that gender
    top_f = [
        f for f in sorted(top_f, key=lambda item: item[3]) if f[1] > f[2]
    ][-25:]
    top_m = [
        m for m in sorted(top_m, key=lambda item: item[3]) if m[1] > m[2]
    ][-25:]

    top_f = sorted(top_f, key=lambda item: item[1])
    top_m = sorted(top_m, key=lambda item: item[1])

    fig, ax = plt.subplots()
    x = np.arange(len(top_f))
    width = 0.35
    rects1 = ax.barh(x - width / 2, [r[1] for r in top_f],
                     width,
                     label='Women')
    rects2 = ax.barh(x + width / 2, [r[2] for r in top_f], width, label='Men')
    ax.set_xlabel('% receiving each label')
    ax.set_title('Top labels for images of women')
    ax.set_yticks(x)
    ax.set_yticklabels([r[0] for r in top_f])
    ax.legend()
    plt.show()

    fig, ax = plt.subplots()
    x = np.arange(len(top_m))
    width = 0.35
    rects1 = ax.barh(x - width / 2, [r[2] for r in top_m],
                     width,
                     label='Women')
    rects2 = ax.barh(x + width / 2, [r[1] for r in top_m], width, label='Men')
    ax.set_xlabel('% receiving each label')
    ax.set_title('Top labels for images of men')
    ax.set_yticks(x)
    ax.set_yticklabels([r[0] for r in top_m])
    ax.legend()
    plt.show()
Example no. 36
df = pd.read_csv(infname, sep='\t')

DLQ_COLS = [f'DLQ01_resp-{i}' for i in range(5)]
freqs = df[DLQ_COLS].sum(axis=0).values

##########  stats on the frequencies  ###########

comparisons = ['across_DLQ01', 'across_nonzeroDLQ01', 'zeroVSnonzero_DLQ01']
index = pd.Index(comparisons, name='comparison')
stats_df = pd.DataFrame(columns=['test', 'chisq', 'pval'], index=index)

# Use chi2 to test the difference among a group
# of proportions, and then pairwise with binomial test.

# is there a difference among the whole DLQ score?
chisq, p = stats.chisquare(freqs)
stats_df.loc['across_DLQ01',
             ['test', 'chisq', 'pval']] = ['chisquare', chisq, p]

# is there a difference among the lucidity options (non-zero)?
nonzero_opts = freqs[1:]
chisq, p = stats.chisquare(nonzero_opts)
stats_df.loc['across_nonzeroDLQ01',
             ['test', 'chisq', 'pval']] = ['chisquare', chisq, p]

# compare if half the nights had LDs or not**
# but note that we don't really care about this,
# since we also highlight how it depends on how
# you measure success. But run this just to be able
# to say there were about half LDs
nonlucid = freqs[0]
    # dx = bin_size/100
    # f_exp2 = []
    # f_exp4 = []

    #
    # for i in range(len(bins5)-1):
    #     x = np.arange(bins5[i], bins5[i+1], dx)
    #     y2 = fit_function(x, *fitparams2)
    #     y4 = bi_gaussian(x, *fitparams4)
    #     area2 = np.trapz(y2, x)
    #     area4 = np.trapz(y4, x)
    #     f_exp2.append(area2)
    #     f_exp4.append(area4)

    chisq2_bincenter, p2_bincenter = chisquare(n5,
                                               fit_function(
                                                   bins5_mid, *fitparams2),
                                               ddof=len(guesses2) - 1)
    chisq4_bincenter, p4_bincenter = chisquare(n5,
                                               bi_gaussian(
                                                   bins5_mid, *fitparams4),
                                               ddof=len(guesses4) - 1)

    # chisq2_area, p2_area = chisquare(n5, f_exp2, ddof=len(guesses2)-1)
    # chisq4_area, p4_area = chisquare(n5, f_exp4, ddof=len(guesses4)-1)

    chisqdof2 = chisq2_bincenter / dof2
    chisqdof4 = chisq4_bincenter / dof4

    pos2a, pos2b, wid2a, wid2b, amp2a, amp2b, r2a, r2b = fitparams2

    f = open(
Example no. 38
    print(f"Skewness: {skew(bio[symbol])}")
    print(f"Kurtosis: {kurtosis(bio[symbol])}")
    print()

for symbol in trio.columns:
    print(f"{symbol}:")
    print(f"Mean: {trio[symbol].mean()}")
    print(f"STD: {trio[symbol].std()}")
    print(f"Variance: {trio[symbol].var()}")
    print(f"Skewness: {skew(trio[symbol])}")
    print(f"Kurtosis: {kurtosis(trio[symbol])}")
    print()

for symbol in uni.columns:
    print(f"{symbol}: ")
    print(kstest(uni[symbol], "norm"))
    print(chisquare(uni[symbol]))
    print()

for symbols in bio.columns:
    print(f"{symbols}: ")
    print(kstest(bio[symbols], "norm"))
    print(chisquare(bio[symbols]))
    print()

for symbols in trio.columns:
    print(f"{symbols}: ")
    print(kstest(trio[symbols], "norm"))
    print(chisquare(trio[symbols]))
    print()
Example no. 39
    #print(fil_max)
    #print(fil_max[0])
    #print(fil_max[1])
    pdays = []
    for k in range(0, len(arr)):
        d = arr.iloc[k]['epoch'] - fil_max[0]
        #print(d)
        pdays.append(d)
    plt.plot(pdays, p, 'o')
plt.gca().invert_yaxis()
plt.show()

obs = np.array([-2, -4, -9.0, -6])
pred = np.array([-3, -4, -8, -8])
a = ((obs - pred)**2)
chi = chisquare(obs.astype(np.float64), pred.astype(np.float64))
print(chi)
a = 4
print(a**2)

#Working with Takashi's models
list = os.listdir("/Users/bhagyasubrayan/Desktop/Plastic/public_data/")
lines = open('/Users/bhagyasubrayan/Desktop/Plastic/public_data/' +
             list[0]).readlines()
a = open('modelnames.txt', 'w').writelines(lines[2:])
with open('modelnames.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split() for line in stripped if line)
    with open('model.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        #writer.writerow(('name', 'mass'))
Example no. 40
def pearson_chisquare_pval(counts, N):
    """ 注意,计算 p-value 时,一定要传入次数,而不是概率
    """
    return chisquare(counts, ideal_distribution * N)[1]
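A matching sketch for the count-based variant, again assuming `ideal_distribution` holds the Benford probabilities; note that raw counts are passed, per the docstring:

import numpy as np
from scipy.stats import chisquare

ideal_distribution = np.log10(1 + 1.0 / np.arange(1, 10))   # assumed Benford probabilities
counts = np.array([301, 176, 125, 97, 79, 67, 58, 51, 46])  # hypothetical first-digit counts, N = 1000
print(pearson_chisquare_pval(counts, N=1000))  # p-value near 1 for Benford-like counts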
Example no. 41
                                                   1] + 1
        for i in range(24, 28, 1):
            if datalist[row][i] != "Null":
                Multi_Flag_2017 = Multi_Flag_2017 + 1
        if Multi_Flag_2017 > 2:
            if datalist[row][0][0] == "0":
                Multi_Count_2017[
                    int(datalist[row][0][1]) -
                    1] = Multi_Count_2017[int(datalist[row][0][1]) - 1] + 1
            elif datalist[row][0][0] == "1":
                Multi_Count_2017[
                    int(datalist[row][0][0:2]) -
                    1] = Multi_Count_2017[int(datalist[row][0][0:2]) - 1] + 1
        Multi_Flag_2017 = 0
Corr_Matrix = np.vstack((Total_Count_2017, Multi_Count_2017))
[chistatistics, ptest] = chisquare(Corr_Matrix)

print(Total_Count_2017, Multi_Count_2017, chistatistics[0])

# We can use collision locations to estimate the areas of the zip code regions.

All_Zip_Code = list('')
All_Zip_Collision = list('')
for row in range(1, row_count, 1):
    if datalist[row][0][-1:] == "7":
        if datalist[row][3] != 'Null':
            a = datalist[row][3]
            if a in All_Zip_Code and datalist[row][4] != 'Null' and datalist[
                    row][5] != 'Null':
                zipcode = All_Zip_Code.index(a)
                All_Zip_Collision[zipcode] = All_Zip_Collision[zipcode] + 1
Example no. 42
            else:
                cur_pixel = pixels[x, y]
                histogram[cur_pixel] += 1
    obs = []
    exp = []
    X = 0
    for y in range(1, len(histogram), 2):
        x = histogram[y - 1]
        z = (histogram[y - 1] + histogram[y]) / 2
        if x > 0 and z > 0:
            obs.append(x)
            exp.append(z)
    obs = numpy.array(obs)
    exp = numpy.array(exp)

    chi, pval = stats.chisquare(obs, exp)

    chis.append(chi)
    pvals.append(pval)
    # print(pval)
    # if pval<=0.01:
    # 	if last_pval < 0.01:
    # 		print("Cover")
    # 		exit()
    # 	else:
    # 		break
    # last_pval=pval

# print(sum(pvals), sum(chis))
# print("Stego, length: %d" % sz)
if abs(sum(pvals) - 0.1) <= 0.1:
Example no. 43
    def enrichment_and_fold_change(self, seg_dict, min_valid=15):

        seg_lens = {k: len(V) for k, V in seg_dict.items()}
        sum_lens = float(sum(seg_lens.values()))
        seg_cnts = sorted([
            a for b in [[(c, k) for c in seg_dict[k]] for k in seg_dict.keys()]
            for a in b
        ])
        seg_means = sorted([(k, np.mean(V)) for k, V in seg_dict.items()],
                           key=lambda x: x[1])
        seg_obs = sorted([(k, len([v for v in V if v > 0]) / float(len(V)))
                          for k, V in seg_dict.items()])
        seg_min, seg_max = seg_means[0][0], seg_means[-1][0]

        seg_valid = len([x[1] for x in seg_cnts if x[0] > 0])

        if seg_valid < min_valid or seg_valid < (seg_lens[seg_max] / 5.0):
            return {a: b
                    for a, b in seg_means}, {a: b
                                             for a, b in seg_obs
                                             }, (seg_min, seg_max), (1.0, 1.0)

        seg_means = sorted([(k, np.mean(V)) for k, V in seg_dict.items()],
                           key=lambda x: x[1])
        seg_obs = sorted([(k, len([v for v in V if v > 0]) / float(len(V)))
                          for k, V in seg_dict.items()])
        seg_min, seg_max = seg_means[0][0], seg_means[-1][0]
        min_len, max_len = seg_lens[seg_min], seg_lens[seg_max]
        min_seg = seg_cnts[0:min_len]
        i = len(min_seg)
        while min_seg[-1][0] == seg_cnts[i][0]:
            min_seg.append(seg_cnts[i])
            i += 1
            if i == len(seg_cnts):
                return {a: b
                        for a, b in seg_means}, {a: b
                                                 for a, b in seg_obs
                                                 }, (seg_min, seg_max), (1.0,
                                                                         1.0)
        if max_len > len(seg_cnts) - i: max_seg = seg_cnts[i::]
        else:
            seg_rev = seg_cnts[-1::-1]
            max_seg = seg_rev[0:max_len]
            i = len(max_seg)
            while max_seg[-1][0] == seg_rev[i][0]:
                max_seg.append(seg_rev[i])
                i += 1
                if i == len(seg_cnts):
                    return {a: b
                            for a, b in seg_means
                            }, {a: b
                                for a, b in seg_obs}, (seg_min, seg_max), (1.0,
                                                                           1.0)
        min_len, max_len = len(min_seg), len(max_seg)
        AAexp, ABexp = min_len * (seg_lens[seg_min] / sum_lens), min_len * (
            seg_lens[seg_max] / sum_lens)
        BAexp, BBexp = max_len * (seg_lens[seg_min] / sum_lens), max_len * (
            seg_lens[seg_max] / sum_lens)
        AAobs, ABobs = len([x for x in min_seg if x[1] == seg_min
                            ]), len([x for x in min_seg if x[1] == seg_max])
        BAobs, BBobs = len([x for x in max_seg if x[1] == seg_min
                            ]), len([x for x in max_seg if x[1] == seg_max])
        chi_low = chisquare([AAobs, ABobs], f_exp=[AAexp, ABexp])[1]
        chi_hi = chisquare([BAobs, BBobs], f_exp=[BAexp, BBexp])[1]

        return {a: b
                for a, b in seg_means
                }, {a: b
                    for a, b in seg_obs}, (seg_min, seg_max), (chi_low, chi_hi)
Esempio n. 44
0
    # plt.plot(np.linspace(int(.05*EPOCH), EPOCH, int(.95*EPOCH)), np.asarray(losses)[int(.05*EPOCH):], 'bo', label='Loss')
    # plt.plot(np.linspace(int(.05*EPOCH), EPOCH, int(.95*EPOCH)), np.zeros(int(0.95*EPOCH))+float(loss.data.float()), 'g--',             label='Final Loss = %.3e' % (float(loss.data.float())))
    # plt.legend()
    # plt.show()

    ReHfit = torch.mean(torch.transpose(p, 0, 1)[0]).data.numpy()
    ReEfit = torch.mean(torch.transpose(p, 0, 1)[1]).data.numpy()
    ReHTfit = torch.mean(torch.transpose(p, 0, 1)[2]).data.numpy()
    fit_cffs = [ReHfit, ReEfit, ReHTfit]

    # plt.plot(phi[a:b], ydat[a:b], 'bo', label='data')
    # plt.plot(phi[a:b], f(xdat,fit_cffs), 'g--', label='fit')
    # plt.legend()
    # plt.show()

    err_H.append(abs(100 * (abs(fit_cffs[0] - ReH_target[a])) / ReH_target[a]))
    err_E.append(abs(100 * (abs(fit_cffs[1] - ReE_target[a])) / ReE_target[a]))
    err_HT.append(
        abs(100 * (abs(fit_cffs[2] - ReHT_target[a])) / ReHT_target[a]))

    print('Chi-Squared Value for this fit: %.3e' %
          (chisquare(f(xdat, fit_cffs), ydat[a:b])[0]))
    print('MSE Loss Value for this fit: %.3e' % (float(loss.data.float())))
    print('Average Error for set #%d using ANN = %.2f%%' %
          ((datset), ((err_H[-1] + err_E[-1] + err_HT[-1]) / 3)))
    #dvcsfit.fit_scipy(datset)

print('\n\033[1m%s%.2f%%' % ('Avg. Error of ReH = ', sum(err_H) / len(err_H)))
print('\033[1m%s%.2f%%' % ('Avg. Error of ReE = ', sum(err_E) / len(err_E)))
print('\033[1m%s%.2f%%' % ('Avg. Error of ReHT = ', sum(err_HT) / len(err_HT)))
Esempio n. 45
0
    def fitmodelnewx(model, x, y, dy):

        #    p0=np.array([k,B,omi,E0,alpha])
        p0 = np.array([k, B, omi])
        #    popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=([0.001,0.0001,0.001,0.00001,0.0], [4.0, 100.,50.0,100.0,1.0]))
        popt, pcov = curve_fit(model,
                               x,
                               y,
                               p0,
                               sigma=dy,
                               bounds=([0.001, 0.0001,
                                        0.0001], [2.0, 100., 9.0]))
        #    popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=(0., [1.8, 10.,10.,100.]))
        #popt, pcov = curve_fit(model, x, y, p0, sigma=dy)
        print "------ "
        print " k [", k, "] =", "%.5f" % popt[0], "+/-", "%.5f" % pcov[0,
                                                                       0]**0.5
        print " B [", B, "(10^14 G)]  =", "%.5f" % popt[
            1], "+/-", "%.5f" % pcov[1, 1]**0.5
        print " omi [2pi/spin_i=", omi, "(10^3 Hz)] =", "%.5f" % popt[
            2], "+/-", "%.5f" % pcov[2, 2]**0.5
        print " Spin Period [ms]=", 2.0 * np.pi / popt[
            2], "+/-", 2.0 * np.pi / popt[2] * (pcov[2, 2]**0.5) / popt[2]
        print " E0 [fixed (10^51 erg)] =", E0newa
        print " alpha (fixed) =", alphax
        print " E051=(L(Ttstart))*Tstart/k=", 10**(model(
            np.log10(startTxrt), popt[0], popt[1],
            popt[2])) * startTxrt / popt[0]
        print "------  "

        E051 = 10**(model(np.log10(startTxrt), popt[0], popt[1],
                          popt[2])) * startTxrt / popt[0]
        Pms = 2.0 * np.pi / popt[2]
        dPms = Pms * (pcov[2, 2]**0.5) / popt[2]

        ym = model(x, popt[0], popt[1], popt[2])
        print stats.chisquare(f_obs=y, f_exp=ym)
        mychi = sum(((y - ym)**2) / dy**2)
        #mychi=sum(((y-ym)**2)/ym)
        dof = len(x) - len(popt)
        print "my chisquare=", mychi
        print "dof=", dof
        p_value = 1 - stats.chi2.cdf(x=mychi, df=dof)
        print "P value", p_value

        bfmodel = model(np.log10(t), popt[0], popt[1], popt[2])

        out_file = open(outfilenewx, "a")
        out_file.write(fi + "," + str(startTxrt) + "," + str(E051) + "," +
                       str(alphax) + "," + str("%.5f" % popt[0]) + "," +
                       str("%.5f" % pcov[0, 0]**0.5) + "," +
                       str("%.5f" % popt[1]) + "," +
                       str("%.5f" % pcov[1, 1]**0.5) + "," +
                       str("%.5f" % Pms) + "," + str("%.5f" % dPms) + "," +
                       str("%.5f" % mychi) + "," + str("%.5f" % dof) + "," +
                       str("%.5f" % p_value) + "\n")
        out_file.close()

        return plt.plot(np.log10(t),
                        bfmodel,
                        'c',
                        label='CS06 alpha = 0.1 (fit)')
Esempio n. 46
0
        def __call__(self, parameters):
            allModelData = np.zeros(4)
            for stakes in self.allStakes:
                gainValue, lossValue = stakes
                allSimulations = [
                    lcaWrapper(gainValue, lossValue, *parameters)
                    for _ in range(self.numSimulationsPerCondition)
                ]
                allValidResponseSimulations = list(
                    filter(filterFunction, allSimulations))
                numValidResponses = len(allValidResponseSimulations)
                if numValidResponses < self.numSimulationsPerCondition / 3:
                    return (-1, parameters[3])
                _, allModelRTs, allModelResponses = zip(
                    *allValidResponseSimulations)
                modelStakes = np.hstack((np.full(
                    (numValidResponses, 1),
                    gainValue), np.full((numValidResponses, 1), lossValue)))
                modelDataForStakes = np.hstack(
                    (np.array(allModelResponses).reshape(-1, 1),
                     np.array(allModelRTs).reshape(-1, 1), modelStakes))
                allModelData = np.vstack((allModelData, modelDataForStakes))

            allModelData = allModelData[1:, :]

            actualDataMeanRT = np.mean(data[:, 1])
            simDataMeanRT = np.mean(allModelData[:, 1])
            delta = simDataMeanRT - actualDataMeanRT
            if delta > parameters[3]:
                delta = parameters[3]

            allModelData[:, 1] = allModelData[:, 1] - delta

            totalCost = 0
            quantiles = np.array([0.1, 0.3, 0.5, 0.7,
                                  0.9])  # quantiles of the chi^2 function
            observedProportionsChoiceWise = np.array(
                [0.1, 0.2, 0.2, 0.2, 0.2,
                 0.1])  # this is to cover some edge cases
            for stakes in self.allStakes:  # loop over all combinations of possible gain and loss
                gain, loss = stakes
                observedTrials = selectConditionTrials(data, gain, loss)
                numObservedTrials = np.shape(observedTrials)[0]
                modelTrials = selectConditionTrials(allModelData, gain, loss)
                numModelTrials = np.shape(modelTrials)[0]
                for choice in range(
                        2):  # loop over choice = 0 (reject) and 1 (accept)
                    observedTrialsForChoice = observedTrials[
                        observedTrials[:, 0] == choice]
                    observedRTsForChoice = observedTrialsForChoice[:, 1]
                    numObservedRTsForChoice = np.size(observedRTsForChoice)
                    observedPOfThisChoice = numObservedRTsForChoice / numObservedTrials

                    if numObservedRTsForChoice < 5:  # less than 5 trials --> can't compute quantile boundaries
                        continue  # skip this combination of gain, loss, choice

                    quantilesBoundaries = np.quantile(observedRTsForChoice,
                                                      quantiles)

                    observedProportions = \
                        np.histogram(observedRTsForChoice, bins=np.concatenate(([0], quantilesBoundaries, [100])))[
                            0] / numObservedTrials # proportions of experimental RTs in all quantiles

                    if numObservedRTsForChoice == 5 or 0 in observedProportions:  # some edge cases
                        observedProportions = observedProportionsChoiceWise * observedPOfThisChoice

                    observedFrequencies = numObservedTrials * observedProportions

                    modelTrialsForChoice = modelTrials[modelTrials[:, 0] ==
                                                       choice]
                    modelRTsForChoice = modelTrialsForChoice[:, 1]
                    numModelRTsForChoice = np.size(modelRTsForChoice)
                    modelProportions = \
                        np.histogram(modelRTsForChoice, bins=np.concatenate(([0], quantilesBoundaries, [100])))[
                            0] / numModelTrials

                    modelFrequencies = numObservedTrials * modelProportions

                    totalCost += chisquare(modelFrequencies,
                                           observedFrequencies)[0]
            return (totalCost, parameters[3] - delta)
Esempio n. 47
0
    def time_chisquare(self):
        stats.chisquare(self.chisq)
Esempio n. 48
0
def compare_histograms(hist, hist_vocab):
    return chisquare(hist, f_exp=hist_vocab)[0]
def generate_page():
    st.markdown(
        """
    ## What is a sample?

    If we want to measure a characteristic of a large population, we would have to measure every individual in the population,
    but it is often impossible to measure the entire population. In practice, we pick a random sample of the
    population, that is, several individuals chosen at random that we measure to get an approximation.
    In a population of 3 million individuals with two alleles `A` and `a` for a gene and its three associated genotypes (`AA`), (`Aa`), and (`aa`),
    it is much easier to count the genotypes of 100 individuals to get the genotype frequencies than to count all 3 million individuals.

    We distinguish the **measurement on the sample**, obtained after counting the 100 individuals, from the **theoretical measurement**
    that we would have obtained if we had counted the whole population.

    A sample is an imperfect representation of the population. By chance, it may contain more (`AA`) individuals,
    or, on the contrary, more (`aa`) individuals. As a result, the measurement obtained on the sample is almost never exactly equal to
    the theoretical measurement.
    Besides, another sample, containing 100 different individuals, would have genotype frequencies noticeably different from those of our first sample.

    In general, the larger a sample is, the greater the chance that the estimated value is
    close to the theoretical value. Conversely, the smaller the sample, the further, on average, the estimated values will be from the theoretical value.

    ## Random sampling of a known population

    To estimate the genotype frequencies of the population, we take several samples at random from the population.
    By randomly drawing the individuals that form our samples, we observe that each sample has different genotype proportions.
    """
    )

    x = st.slider(
        "Fréquence théorique des génotypes (AA), (Aa), (aa)",
        0.0,
        1.0,
        (0.25, 0.75),
        0.01,
    )
    population_ratio = np.array([x[0], x[1] - x[0], 1 - x[1]])
    sample_size = st.number_input(
        "Nombre d'individus dans l'échantillon", 0, 1000000, 100, 10
    )
    nb_echantillons = st.number_input("Number of samples", 0, 30, 10, 1)

    # Draw the three genotype counts of each sample jointly so that they always
    # sum to sample_size (independent binomial draws could otherwise produce
    # negative (aa) counts).
    multiple_echantillon = np.random.multinomial(
        sample_size, population_ratio, size=nb_echantillons
    ).astype(float)
    fig = src.plots.display_echantillons(
        multiple_echantillon, truth=population_ratio * sample_size
    )
    st.pyplot(fig)

    st.markdown(
        """

    ## Inferring the genotype frequencies of the whole population

    Once we have measured the frequencies of the 3 genotypes, we can ask what the genotype frequencies are in the whole population.
    The best estimate we can have is the one from the sample. For example, if we count 30% of (`AA`) in our sample,
    we can say "the genotype frequency of (`AA`) in the whole population is about 30%". We may of course be wrong,
    but we are less likely to be wrong than if we had said "the genotype frequency of (`AA`) in the whole population is about 60%".

    We can measure the chance of being wrong with statistical tests.

    To do so, we take the problem the other way around. We make a hypothesis about the population (for example, the hypothesis
    that it contains 33% of (`AA`), 33% of (`Aa`), and 34% of (`aa`))
    and we measure the chance of randomly drawing the sample we have just measured if that hypothesis is true.

    So in this example, if our sample of 100 individuals contains 60% of (`AA`), 20% of (`Aa`), and 20% of (`aa`), there is less than a 1% chance that
    our sample comes from a population that follows our hypothesis.
    We can therefore say:
     - the sample comes from a population with 33% of (`AA`), 33% of (`Aa`), and 34% of (`aa`), but we have less than a 0.1% chance of being right
     - the sample does not come from a population with 33% of (`AA`), 33% of (`Aa`), and 34% of (`aa`), and we have more than a 99.9% chance of being right

    So we conclude that the population most likely does not contain 33% of (`AA`), 33% of (`Aa`), and 34% of (`aa`).

    On the other hand, if our sample of 100 individuals contains 32% of (`AA`), 35% of (`Aa`), and 33% of (`aa`), there is a good chance that
    our sample comes from a population that follows our hypothesis. The difference from the genotype frequencies of our hypothesis
    is most likely due to sampling chance.
    We would conclude that our sample was very probably drawn from a population with genotype frequencies of 33% (`AA`), 33% (`Aa`), and 34% (`aa`).

    There is an equation that gives us the probability that our sample was drawn from a given population, as a function of the hypothetical genotype frequencies of the population
    and of those measured in our sample. This equation is called the Chi2 equation.

    For each genotype $i$, we compare the number of individuals we observed ($Obs_i$) with the number of individuals we should have obtained if the population followed the hypothesis ($Theo_i$).

    $$
    Chi2 = \\sum{\\frac{(Obs_i - Theo_i)^2}{Theo_i} }
    $$

    For example, in the case of our first sample with 60% of (`AA`), 20% of (`Aa`), and 20% of (`aa`), we counted 60 (`AA`), 20 (`Aa`), and 20 (`aa`) in our sample.
    If the population follows the hypothesis described in our example, we would ideally expect 33 (`AA`), 33 (`Aa`), and 34 (`aa`). Our Chi2 is therefore:

    $$
    Chi2 = \\frac{(60-33)^2}{33} + \\frac{(20-33)^2}{33} + \\frac{(20-34)^2}{34} = 32.977
    $$

    We can read the probability of obtaining such a sample off the Chi2 curve.

    """
    )
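
    # A quick numerical check of the worked Chi2 above (a minimal sketch, not part
    # of the original app; chisquare is the same scipy import used later in this
    # example). scipy reproduces the hand computation.
    chi2_check, p_check = chisquare([60, 20, 20], f_exp=[33, 33, 34])
    # chi2_check is about 32.977 and p_check is far below 0.001, matching the text.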

    fig = src.plots.chi2_curve()
    st.pyplot(fig)

    st.markdown(
        """
    We can see that this sample has a very low probability of coming from the population of our hypothesis.
    In other words, if we observe such a sample, we can be almost certain that the population
    is not distributed as 33% of (`AA`), 33% of (`Aa`), and 34% of (`aa`).

    ## A practical example with the Hardy-Weinberg equilibrium

    Thanks to the Chi2 test, we can now determine whether a sample whose genotype frequencies we have just measured
    comes from a population that is at Hardy-Weinberg equilibrium. To do so, we make the hypothesis that our population is at Hardy-Weinberg
    equilibrium with $p^2 %$ of (`AA`), $2pq %$ of (`Aa`), and $q^2 %$ of (`aa`), where $p$ and $q$ are the allele frequencies of the alleles `A` and `a` respectively.

    Enter the values you observe in your sample for each genotype:
    """
    )

    col1, col2, col3 = st.beta_columns(3)
    with col1:
        AA = st.number_input(
            "nombre de (AA) dans l'échantillon", 0, 1000000, 0, 1
        )
    with col2:
        Aa = st.number_input(
            "nombre de (Aa) dans l'échantillon", 0, 1000000, 0, 1
        )
    with col3:
        aa = st.number_input(
            "nombre de (aa) dans l'échantillon", 0, 1000000, 0, 1
        )

    if AA == Aa == aa == 0:
        # nothing is inputted
        pass
    else:
        N = AA + Aa + aa
        fA = (AA + 0.5 * Aa) / N
        fa = 1 - fA

        fAA = fA ** 2
        fAa = 2 * fA * fa
        faa = fa ** 2

        chi2_value, pvalue = chisquare(
            [AA, Aa, aa], f_exp=[fAA * N, fAa * N, faa * N]
        )

        st.markdown(
            f"""
        We therefore have a sample of size {N}.

        The allele frequencies in the sample are {np.around(fA, 2)} for `A` and {np.around(fa, 2)} for `a`.

        Consequently, if the population is at Hardy-Weinberg equilibrium, we expect:
         - {np.around(100*fAA, 2)} % of (`AA`)
         - {np.around(100*fAa, 2)} % of (`Aa`)
         - {np.around(100*faa, 2)} % of (`aa`)

        We observe:
        - {np.around(100*AA/N, 2)} % of (`AA`)
        - {np.around(100*Aa/N, 2)} % of (`Aa`)
        - {np.around(100*aa/N, 2)} % of (`aa`)

        The Chi2 test gives a value of {np.around(chi2_value, 4)}, so there is a {np.around(pvalue*100, 7)} % chance that our sample comes from a population at Hardy-Weinberg equilibrium.
        """
        )

        if pvalue < 0.05:
            st.markdown(
                """
            **Since this value is below 5%, we conclude, with less than a 5% chance of being wrong, that the population does not follow the Hardy-Weinberg equilibrium.**
            """
            )
        else:
            st.markdown(
                """
            **Since this value is above 5%, we conclude that there is a strong chance that the population follows the Hardy-Weinberg equilibrium.**
            """
            )
Esempio n. 50
0
    def fitmodelold(model, x, y, dy):

        #    p0=np.array([k,B,omi,E0])
        p0 = np.array([k, B, omi])
        popt, pcov = curve_fit(model,
                               x,
                               y,
                               p0,
                               sigma=dy,
                               bounds=([0.0001, 0.0001,
                                        0.0001], [2.0, 100., 10.0]))
        #    popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=(0., [1.8, 10.,10.,100.]))
        #popt, pcov = curve_fit(model, x, y, p0, sigma=dy)

        #Ein = 0.5*Ine*popt[2]**2                            # initial spin energy 27.7 10^51 erg
        #tsdi = 3*Ine*c**3/(popt[1]**2*(r0)**6*popt[2]**2)*10**5   # Initial spin down time for the standard magnetic dipole formula 3.799*10^6/B2*omi**2 s
        #Li=Ein/tsdi

        print "------ "
        print " k [", k, "] =", "%.5f" % popt[0], "+/-", "%.5f" % pcov[0,
                                                                       0]**0.5
        print " B [", B, "(10^14 G)]  =", "%.5f" % popt[
            1], "+/-", "%.5f" % pcov[1, 1]**0.5
        print " omi [2pi/spin_i=", omi, "(kHz)] =", "%.5f" % popt[
            2], "+/-", "%.5f" % pcov[2, 2]**0.5
        print " Spin Period [ms]=", 2.0 * np.pi / popt[
            2], "+/-", 2.0 * np.pi / popt[2] * (pcov[2, 2]**0.5) / popt[2]

        #    print "E0 [",E0,"(10^51 erg)] =", "%.5f" %popt[3], "+/-", "%.5f" %pcov[3,3]**0.5
        print " E0 (fixed) [10^51 erg) =", E0old
        print " L(Tt)=", model(np.log10(startTxrt), popt[0], popt[1], popt[2])
        print " E051=(L(Ttstart))*Tstart/k=", 10**(model(
            np.log10(startTxrt), popt[0], popt[1],
            popt[2])) * startTxrt / popt[0]
        print "------  "

        E051 = (10**(model(np.log10(startTxrt), popt[0], popt[1],
                           popt[2]))) * startTxrt / popt[0]
        Pms = 2.0 * np.pi / popt[2]
        dPms = Pms * (pcov[2, 2]**0.5) / popt[2]
        print 'Pms, dPms=', Pms, dPms

        #    ym=model(x,popt[0],popt[1],popt[2],popt[3])
        ym = model(x, popt[0], popt[1], popt[2])
        print stats.chisquare(f_obs=y, f_exp=ym)
        mychi = sum(((y - ym)**2) / dy**2)
        #mychi=sum(((y-ym)**2)/ym)
        dof = len(x) - len(popt)
        print "my chisquare=", mychi
        print "dof=", dof
        p_value = 1 - stats.chi2.cdf(x=mychi, df=dof)
        print "P value", p_value

        bfmodel = model(np.log10(t), popt[0], popt[1], popt[2])

        out_file = open(outfileold, "a")
        #out_file.write(fi+","+str(startTxrt)+","+str(E051)+","+str("%.5f" %popt[0])+","+str("%.5f" %pcov[0,0]**0.5)+","+str("%.5f" %popt[1])+","+str("%.5f" %pcov[1,1]**0.5)+","+str("%.5f" %popt[2])+","+str("%.5f" %pcov[2,2]**0.5)+","+str("%.5f" %mychi)+","+str("%.5f" %dof)+","+str("%.5f" %p_value)+"\n")
        out_file.write(fi + "," + str(startTxrt) + "," + str(E051) + "," +
                       str("%.5f" % popt[0]) + "," +
                       str("%.5f" % pcov[0, 0]**0.5) + "," +
                       str("%.5f" % popt[1]) + "," +
                       str("%.5f" % pcov[1, 1]**0.5) + "," +
                       str("%.5f" % Pms) + "," + str("%.5f" % dPms) + "," +
                       str("%.5f" % mychi) + "," + str("%.5f" % dof) + "," +
                       str("%.5f" % p_value) + "\n")
        out_file.close()

        return plt.plot(np.log10(t), bfmodel, 'r', label='D11 (fit)')
Esempio n. 51
0
observed = [6662, 1179, 15128, 9592]
expected = [5249.8, 2597.4, 16533.5, 8180.3]

chisq_gender_income = 0
for i in range(len(observed)):
    chisq_gender_income += (observed[i] - expected[i])**2 / expected[i]

## 4. Finding statistical significance ##

from scipy.stats import chisquare

observed = [6662, 1179, 15128, 9592]
expected = [5249.8, 2597.4, 16533.5, 8180.3]

chisq_value, pvalue_gender_income = chisquare(observed, expected)

## 5. Cross tables ##

import pandas

table = pandas.crosstab(income["sex"], income["race"])
print(table)

## 6. Finding expected values ##

import pandas
from scipy.stats import chi2_contingency

table = pandas.crosstab(income["sex"], income["race"])
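
# The snippet ends here; a plausible completion (an assumption, not part of the
# original exercise) passes the cross table to chi2_contingency, which returns the
# statistic, the p-value, the degrees of freedom, and the table of expected counts.
chisq_value, pvalue, df, expected = chi2_contingency(table)
print(expected)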
Esempio n. 52
0
def benfords(numbers):
    '''
    Examine the distribution of the first digits in a given corpus of numbers to see
    if they correspond to Benford's Law using a chi square test.

    Benford's Law, also known as the "first digit law" or the "law of anomalous numbers"
    states that there is a specific distribution pattern of the first digits of certain 
    groups of numbers.  See https://en.wikipedia.org/wiki/Benford%27s_law for more 
    info.

    :param numbers: The set of numbers to check against Benford's Law
    :type numbers: A list-like object (list, tuple, set, Pandas DataFrame or Series) 
                  containing floats or integers

    :Return Value:

    The function returns three values in a tuple (chi2, p, counts):

      * The 'chi2' value is the chi-square test statistic describing how well the observed
        distribution of first digits matches the predictions of Benford's Law. It is
        non-negative and lower is better (0 means a perfect match).
      * The 'p' value is the p-value of the test: the probability of seeing a deviation at
        least this large if the numbers really followed Benford's Law. Its range is 0..1 and
        higher is better. Generally speaking, if the p-value is below 0.05 the observed
        distribution is considered significantly different from Benford's Law.
      * 'counts' is a Pandas series where the indices are the possible first digits 1-9 and
        the values are the observed distributions of those digits. If the observed distributions
        didn't match up with Benford's law, the counts may help you identify the anomalous values.

    '''
    def _first_digit(i: float):
        while i >= 10:
            i //= 10
        return trunc(i)

    _BENFORDS = [
        0.301,  # 1
        0.176,  # 2
        0.125,  # 3
        0.097,  # 4
        0.079,  # 5
        0.067,  # 6
        0.058,  # 7
        0.051,  # 8
        0.046  # 9
    ]

    if not is_list_like(numbers):
        raise TypeError(
            f'The argument must be a list or list-like of numbers, not type {type(numbers)}.'
        )
    if isinstance(numbers, pd.core.series.Series):
        numbers = numbers.values

    numbers = pd.DataFrame(numbers, columns=['numbers'])
    numbers['digits'] = numbers['numbers'].apply(_first_digit)

    counts = numbers['digits'].value_counts()

    # No leading zeroes!
    if 0 in counts.index:
        counts = counts.drop(0)

    # Ensure every digit 1-9 has a count, even if it's 0
    for i in range(1, 10):
        if i not in counts:
            counts[i] = 0

    # Sort by index just to be extra sure they are all in the correct
    # order
    counts = counts.sort_index()

    # Compute the actual distribution of first digits in the input
    # as a proportion of that count to the entire number of samples
    num_samples = counts.sum()
    counts = counts.apply(lambda x: x / num_samples)

    # Compare the actual distribution to Benford's Law
    chi2, p = chisquare(counts.values, _BENFORDS)

    # Return the results of the comparison, plus the observed counts
    return chi2, p, counts
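
# A minimal usage sketch (an illustration with made-up numbers, not part of the
# original; it relies on the same module-level imports used by benfords above).
# Note that benfords() compares proportions rather than raw counts, so the reported
# p-value does not grow with the sample size.
if __name__ == '__main__':
    sample = [1, 12, 19, 134, 17, 22, 31, 110, 45, 58, 67, 71, 89, 93, 104, 2, 3, 15]
    chi2, p, counts = benfords(sample)
    print('chi2 = %.4f, p = %.4f' % (chi2, p))
    print(counts)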
Esempio n. 53
0
def return_chisquare(observed, expected):
    # null hypothesis: Diagnosed patients and control patients use feature with the same frequency
    count_chisquare = stats.chisquare(observed, f_exp=expected)

    return count_chisquare
Esempio n. 54
0
def distribution_test(dist):
    chisquare_val, p_val = ss.chisquare(dist)
    print('Chisquare:', chisquare_val)
    print('P:', p_val)
Esempio n. 55
0
def significance_testing(default_value, values, significance=0.05):
    value, p = chisquare(values)
    value = (value / p) * significance
    return [v - default_value > value for v in values]
Esempio n. 56
0
    def return_partition_DV(self, data, borders, r=2, alpha=0.05):
        # extract the bin boundaries
        Xmin = borders['nodes'][0]
        Xmax = borders['nodes'][1]
        Ymin = borders['nodes'][2]
        Ymax = borders['nodes'][3]

        # find the number of bins
        #    numBins = r ** 2
        idx = np.where((data[:, 0] >= Xmin)
                       & (data[:, 0] <= Xmax)
                       & (data[:, 1] >= Ymin)
                       & (data[:, 1] <= Ymax))

        # extract the points in the bin
        Xsub = data[idx, 0]
        Ysub = data[idx, 1]

        #    print(Xsub.shape, '\t', Ysub.shape)

        # find the indices of the points in the x- and y-patches
        idx_x = np.where((data[:, 0] >= Xmin) & (data[:, 0] <= Xmax))
        idx_y = np.where((data[:, 1] >= Ymin) & (data[:, 1] <= Ymax))

        # get the subpartitions
        ai = np.floor(
            np.percentile(data[idx_x, 0], 1 / r * np.arange(1, r) * 100))
        bj = np.floor(
            np.percentile(data[idx_y, 1], 1 / r * np.arange(1, r) * 100))

        # get the bin edges
        edges1 = np.concatenate(([Xmin], ai, [Xmax]))
        edges2 = np.concatenate(([Ymin], bj, [Ymax]))

        # first exit criteria: we cannot split into unique boundaries any more
        # preallocate the partition list
        partitions = []
        if (len(np.unique(edges1, return_counts=True)[1]) < r + 1
                or len(np.unique(edges2, return_counts=True)[1]) < r + 1):

            # reject further partitions, and return original bin
            partitions.insert(0, {
                'nodes': np.array([Xmin, Xmax, Ymin, Ymax]),
                'npts': len(idx[0])
            })
            return partitions

        # figure out the shift in the edges so that boundaries do not overlap
        xShift = np.zeros((2 * r, 2 * r))
        yShift = xShift
        xShift[:, 1:-1] = np.tile(np.array([[-1, 0]]), (2 * r, r - 1))
        yShift = xShift.T

        # find the boundaries for each bin
        # duplicate inner nodes for x mesh
        dupMidNodesX = np.append(
            np.insert(np.repeat((edges1[1:-1]), 2, axis=0), 0, edges1[0]),
            edges1[-1])

        # duplicate inner nodes for y mesh
        dupMidNodesY = np.append(
            np.insert(np.repeat((edges2[1:-1]), 2, axis=0), 0, edges2[0]),
            edges2[-1])
        # reshape
        dupMidNodesY = np.reshape(dupMidNodesY, (-1, 1))

        # now find the nodes for each bin
        xBinBound = dupMidNodesX + xShift
        yBinBound = dupMidNodesY + yShift

        # find the number of points in each bin, and put this info into array
        binned_data = binned_statistic_2d(Xsub.flatten(),
                                          Ysub.flatten(),
                                          None,
                                          'count',
                                          bins=[edges1, edges2])
        # get the counts. Flatten columnwise to match the bin definition in the
        # loop that creates the dictionaries below
        binCounts = binned_data.statistic.flatten('F')

        # define an empty list to hold the dictionaries of the fresh partitions
        bins = []
        # create dictionaries for each bin
        # start with the loop over y
        # note how the loop counts were obtained above to match the convention
        # here
        for yInd in np.arange(r):
            # this is the loop over x
            for xInd in np.arange(r):
                # get the bin number
                binNo = yInd * r + xInd
                xLow, xHigh = xBinBound[yInd, 2 * xInd + np.arange(2)]
                yLow, yHigh = yBinBound[2 * yInd + np.arange(2), xInd]
                bins.append({
                    'nodes': np.array([xLow, xHigh, yLow, yHigh]),
                    'npts': binCounts[binNo]
                })

        # calculate the chi square statistic
        chi2 = chisquare(binCounts)

        # check for independence and start recursion
        # if the chi2 test fails, do further partitioning:
        if (chi2.pvalue < alpha and Xmax != Xmin and Ymax != Ymin).all():
            for binInfo in bins:
                if binInfo['npts'] != 0:  # if the bin is not empty:
                    # append entries to the tuple
                    partitions.extend(
                        self.return_partition_DV(data=data,
                                                 borders=binInfo,
                                                 r=r,
                                                 alpha=alpha))

        # Second exit criteria:
        # if the partitions are independent, reject further partitioning and
        # save the original, unpartitioned bin
        elif len(idx[0]) != 0:
            partitions.insert(0, {
                'nodes': np.array([Xmin, Xmax, Ymin, Ymax]),
                'npts': len(idx[0])
            })

        return partitions
Esempio n. 57
0
def main():
    n_bins = 20

    a = mkPlot('1MHz')
    b = mkPlot('3MHz')
    c = mkPlot('5MHz')
    d = mkPlot('10MHz')
    e = mkPlot('12MHz')
    f = mkPlot('450kHz')
    g = mkPlot('500kHz')
    h = mkPlot('700kHz')
    i = mkPlot('800kHz')
    l = mkPlot('990kHz')

    y = np.array(
        [f['Frequenza'], g['Frequenza'], h['Frequenza'], i['Frequenza'], l['Frequenza'], a['Frequenza'], b['Frequenza'],
         c['Frequenza'], d['Frequenza'], e['Frequenza']])
    y_err = np.array([f['Std'], g['Std'], h['Std'], i['Std'], l['Std'], a['Std'], b['Std'],
                      c['Std'], d['Std'], e['Std']])
    x = np.array([0.45, 0.5, 0.7, 0.8, 0.99, 1, 3, 5, 10, 12])

    data_hist = np.array(
        [f['hist'], g['hist'], h['hist'], i['hist'], l['hist'], a['hist'], b['hist'], c['hist'], d['hist'],
         e['hist']])

    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot()

    fig1 = plt.figure(figsize=(10, 5))
    spec = gridspec.GridSpec(ncols=5, nrows=2, figure=fig1)
    ax1 = fig1.add_subplot(spec[0, 0])
    ax2 = fig1.add_subplot(spec[0, 1])
    ax3 = fig1.add_subplot(spec[0, 2])
    ax4 = fig1.add_subplot(spec[0, 3])
    ax5 = fig1.add_subplot(spec[0, 4])
    ax6 = fig1.add_subplot(spec[1, 0])
    ax7 = fig1.add_subplot(spec[1, 1])
    ax8 = fig1.add_subplot(spec[1, 2])
    ax9 = fig1.add_subplot(spec[1, 3])
    ax10 = fig1.add_subplot(spec[1, 4])

    h1 = ax1.hist(f['hist'], bins=n_bins)
    ax1.set_title('450kHz')

    h2 = ax2.hist(g['hist'], bins=n_bins)
    ax2.set_title('500kHz')

    h3 = ax3.hist(h['hist'], bins=n_bins)
    ax3.set_title('700kHz')

    h4 = ax4.hist(i['hist'], bins=n_bins)
    ax4.set_title('800kHz')

    h5 = ax5.hist(l['hist'], bins=n_bins)
    ax5.set_title('990kHz')

    h6 = ax6.hist(a['hist'], bins=n_bins)
    ax6.set_title('1MHz')

    h7 = ax7.hist(b['hist'], bins=n_bins)
    ax7.set_title('3MHz')

    h8 = ax8.hist(c['hist'], bins=n_bins)
    ax8.set_title('5MHz')

    h9 = ax9.hist(d['hist'], bins=n_bins)
    ax9.set_title('10MHz')

    h10 = ax10.hist(e['hist'], bins=n_bins)
    ax10.set_title('12MHz')

    linearity_plot = ax.errorbar(x, y, yerr=y_err * 10,
                                 label='Linearity data', ls='none', ecolor='r')
    coef = np.polyfit(x, y, 1)
    poly1d_fn = np.poly1d(coef)

    fit_label_1 = 'm = {}\n'.format(round(coef[0], 5))
    fit_label_2 = 'q = {}'.format(round(coef[1], 5))
    fit_label = fit_label_1 + fit_label_2

    fit_plot = ax.plot(x, poly1d_fn(x), '--k', linewidth=0.5, label=fit_label)
    legend = ax.legend(loc='upper left', shadow=True, fontsize='medium', prop={"size": 15})

    ax.set_xlabel('Nominal frequencies (MHz)', color='black', fontsize=15)
    ax.set_ylabel('Digitizer frequencies (MHz)', color='black', fontsize=15)

    """ax1.set_xlabel('', color='black')
    ax1.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax2.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax2.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax3.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax3.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax4.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax4.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax5.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax5.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax6.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax6.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax7.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax7.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax8.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax8.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax9.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax9.set_ylabel('Digitizer frequencies (MHz)', color='black')
    ax10.set_xlabel('Nominal frequencies (MHz)', color='black')
    ax10.set_ylabel('Digitizer frequencies (MHz)', color='black')"""

    print(' ----------------------- LEGEND AND UNITS ----------------------\n'
          '|                The frequencies are reported in [MHz]           |\n'
          ' ---------------------------------------------------------------\n\n'
          '                         *** RESULTS ***\n\n'
          'Nominal Frequencies | Derived Frequencies +/- Dev. Std                     \n'
          '---------------------------------------------------------------------------\n'
          '             450kHz | ' + str(f['Frequenza']) + ' +/- ' + str(f['Std']) + '\n'
          '             500kHz | ' + str(g['Frequenza']) + ' +/- ' + str(g['Std']) + '\n'
          '             700kHz | ' + str(h['Frequenza']) + ' +/- ' + str(h['Std']) + '\n'
          '             800kHz | ' + str(i['Frequenza']) + ' +/- ' + str(i['Std']) + '\n'
          '             990kHz | ' + str(l['Frequenza']) + ' +/- ' + str(l['Std']) + '\n'
          '               1MHz | ' + str(a['Frequenza']) + ' +/- ' + str(a['Std']) + '\n'
          '               3MHz | ' + str(b['Frequenza']) + ' +/- ' + str(b['Std']) + '\n'
          '               5MHz | ' + str(c['Frequenza']) + ' +/- ' + str(c['Std']) + '\n'
          '              10MHz | ' + str(d['Frequenza']) + ' +/- ' + str(d['Std']) + '\n'
          '              12MHz | ' + str(e['Frequenza']) + ' +/- ' + str(e['Std']) + '\n'
          '---------------------------------------------------------------------------\n\n'
          '                         *** FIT RESULTS ***\n\n'
          'Slope (m): ' + str(coef[0]) + '\n'
          'Intercept (q): ' + str(coef[1]) + '\n'
          'Chi-square: ' + str(chisquare(f_obs=y, f_exp=x)) + '\n'
          'Delta (%): ' + str(abs(1 - coef[0]) * 100),
          file=open("./Characterization/Linearity_output.txt", "w"))

    plt.tight_layout()
    plt.show()
Esempio n. 58
0
def fitgaussian(x,
                y,
                weights=None,
                guess=None,
                return_fit=True,
                return_uncertainties=False):
    """
    Fit a single gaussian to the data "y" at positions "x", points can be
    weighted by "weights" and an initial guess for the gaussian parameters

    :param x: numpy array (1D), the x values for the gaussian
    :param y: numpy array (1D), the y values for the gaussian
    :param weights: numpy array (1D), the weights for each y value
    :param guess: list of floats, the initial guess for the guassian fit
                  parameters in the following order:

                  [amplitude, center, fwhm, offset from 0 (in y-direction)]

    :param return_fit: bool, if True also calculates the fit values for x
                       i.e. yfit = gauss_function(x, *pfit)

    :param return_uncertainties: bool, if True also calculates the uncertainties
                                 based on the covariance matrix (pcov)
                                 uncertainties = np.sqrt(np.diag(pcov))

    :return pfit: numpy array (1D), the fit parameters in the
                  following order:

                [amplitude, center, fwhm, offset from 0 (in y-direction)]

    :return yfit: numpy array (1D), the fit y values, i.e. the gaussian values
                  for the fit parameters, only returned if return_fit = True

    """

    # if we don't have weights set them to be all equally weighted
    if weights is None:
        weights = np.ones(len(x))
    weights = 1.0 / weights
    # if we aren't provided a guess, make one
    if guess is None:
        guess = [np.nanmax(y), np.nanmean(y), np.nanstd(y), 0]
    # calculate the fit using curve_fit to the function "gauss_function"
    with warnings.catch_warnings(record=True) as _:
        pfit, pcov = curve_fit(gauss_function,
                               x,
                               y,
                               p0=guess,
                               sigma=weights,
                               absolute_sigma=True)
    if return_fit and return_uncertainties:
        # calculate the fit parameters
        yfit = gauss_function(x, *pfit)
        # work out the normalisation constant
        chis, _ = chisquare(y, f_exp=yfit)
        norm = chis / (len(y) - len(guess))
        # calculate the fit uncertainties based on pcov
        efit = np.sqrt(np.diag(pcov)) * np.sqrt(norm)
        # return pfit, yfit and efit
        return pfit, yfit, efit
    # if just return fit
    elif return_fit:
        # calculate the fit parameters
        yfit = gauss_function(x, *pfit)
        # return pfit and yfit
        return pfit, yfit
    # if return uncertainties
    elif return_uncertainties:
        # calculate the fit parameters
        yfit = gauss_function(x, *pfit)
        # work out the normalisation constant
        chis, _ = chisquare(y, f_exp=yfit)
        norm = chis / (len(y) - len(guess))
        # calculate the fit uncertainties based on pcov
        efit = np.sqrt(np.diag(pcov)) * np.sqrt(norm)
        # return pfit and efit
        return pfit, efit
    # else just return the pfit
    else:
        # return pfit
        return pfit
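
# A minimal usage sketch (an illustration, not part of the original module). It
# assumes numpy is imported as np and that gauss_function(x, amplitude, center,
# width, offset) is the model used by fitgaussian above, following the parameter
# order of the initial guess.
xx = np.linspace(-5.0, 5.0, 201)
yy = gauss_function(xx, 3.0, 1.0, 0.8, 0.1) + np.random.normal(0.0, 0.05, xx.size)
pfit, yfit, efit = fitgaussian(xx, yy, return_fit=True, return_uncertainties=True)
print(pfit, efit)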
Esempio n. 59
0
y_int = np.interp(x1, x, yb)

# All the models!!
y1 = model_pickle['forest_clear_atmosphere.dat']
y2 = model_pickle['forest_t10_atmosphere.dat']
y3 = model_pickle['grass_clear_atmosphere.dat']
y4 = model_pickle['grass_t10_atmosphere.dat']
y5 = model_pickle['ice_clear_atmosphere.dat']
y6 = model_pickle['ice_t10_atmosphere.dat']
y7 = model_pickle['ocean_clear_atmosphere.dat']
y8 = model_pickle['ocean_t30_atmosphere.dat']
y9 = model_pickle['sand_clear_atmosphere.dat']
y10 = model_pickle['sand_t10_atmosphere.dat']

chisquaresb = [
    chisquare(y_int, f_exp=y1),
    chisquare(y_int, f_exp=y2),
    chisquare(y_int, f_exp=y3),
    chisquare(y_int, f_exp=y4),
    chisquare(y_int, f_exp=y5),
    chisquare(y_int, f_exp=y6),
    chisquare(y_int, f_exp=y7),
    chisquare(y_int, f_exp=y8),
    chisquare(y_int, f_exp=y9),
    chisquare(y_int, f_exp=y10)
]
# best fit for b is y10, sand_t10_atmosphere

y_intc = np.interp(x1, x, yc)
chisquaresc = [
    chisquare(y_intc, f_exp=y1),
Esempio n. 60
0
noise = (noise - np.mean(noise)) / np.std(noise)
ndata = len(noise)

# bin data
nbin = 20
minb = -4.5
maxb = 4.5
bins, freq = freq_hist(minb, maxb, nbin, noise)

# get idealized values
f_ideal = np.empty([nbin])
for i in range(nbin):
    f_ideal[i] = st.norm.cdf(bins[i + 1]) - st.norm.cdf(bins[i])

# do chi^2 test
chi2, p = st.chisquare(freq, f_ideal)
print p, chi2

# do the same with uniform transformation
u = 0.5 * erfc(-noise / np.sqrt(2.))
nbin = 20
minb = 0.
maxb = 1.
bins, freq = freq_hist(minb, maxb, nbin, u)
print freq
chi2, p = st.chisquare(freq, np.full([nbin], 1. / nbin))
print p, chi2

# plot
plt.plot(noise, 'k-', lw=1.)
plt.savefig('rx.png')