Ejemplo n.º 1
0
    def goodness_of_fit(self, mode='G-test'):
        '''
        Perform a goodness-of-fit test of the derived count rate.

        The expected counts of each interval (``self.mean_electron_rate``
        times the interval durations ``self.dt[1:]``, divided by
        ``self.RM.gain``) are compared with the "observed" counts, i.e. the
        differences between consecutive entries of ``self.RM.noisy_counts``.
        Only the intervals selected by ``self.good_intervals`` enter the test.

        The resulting statistic and p-value are stored in ``self.gof_stat``
        and ``self.gof_pval``; nothing is returned.

        :mode: the type of test performed.

               Possible values are 'G-test', 'Pearson-chi-sq' and
               'Squared-deviations'.

               G-test: (https://en.wikipedia.org/wiki/G-test)
                   This is the default value.
                   The G-test statistic, based on a likelihood ratio, is a
                   better approximation to the chi-squared distribution than
                   Pearson's chi-square, which fails for small number counts.

               Pearson-chi-sq: (https://en.wikipedia.org/wiki/Pearson's_chi-squared_test)
                   Pearson's chi-square; should give similar results for
                   moderately large observed count rates.

               Squared-deviations: (https://en.wikipedia.org/wiki/Reduced_chi-squared_statistic)
                   Sum of squared deviations divided by a per-interval
                   variance of ``(f_exp + 2 * RON_e**2) / gain**2``. The
                   p-value comes from a chi-squared distribution with
                   ``(number of good intervals) - 1 - ddof`` degrees of
                   freedom.

        :raises ValueError: if ``mode`` is not one of the supported values.
        '''

        f_obs = (self.RM.noisy_counts[1:] -
                 self.RM.noisy_counts[:-1])[self.good_intervals]
        f_exp = (self.mean_electron_rate * self.dt[1:] /
                 self.RM.gain)[self.good_intervals]
        # One degree of freedom is removed on top of the default k - 1.
        ddof = 1

        # Modes that are plain Cressie-Read power-divergence statistics.
        power_divergence_lambdas = {
            'G-test': 'log-likelihood',
            'Pearson-chi-sq': 'pearson',
        }

        if mode in power_divergence_lambdas:
            g, p = power_divergence(f_obs,
                                    f_exp=f_exp,
                                    ddof=ddof,
                                    lambda_=power_divergence_lambdas[mode])

        elif mode == 'Squared-deviations':
            dof = np.sum(self.good_intervals) - 1 - ddof
            variance = (f_exp + 2 * np.square(self.RM.RON_e)) / np.square(
                self.RM.gain)
            g = np.sum(np.square(f_obs - f_exp) / variance)
            p = chi2.sf(g, dof)

        else:
            # An explicit exception instead of `assert False`: asserts are
            # stripped under `python -O`, which would let execution fall
            # through to the assignments below with `g` and `p` undefined.
            raise ValueError(
                'Goodness of fit test type not supported: {!r}'.format(mode))

        self.gof_stat = g
        self.gof_pval = p
    def compute(cls, observation, prediction):
        """
        Build a normalized Neyman chi-squared score for a prediction.

        NaN entries are removed from `observation` and `prediction`
        independently, the remaining probabilities (which must not exceed
        1.0) are rescaled to percentages, and the Neyman statistic is
        computed with the prediction as observed frequencies and the
        observation as expected frequencies. The statistic is then expressed
        as its absolute distance from the chi-squared mean in units of the
        chi-squared standard deviation.

        Returns a `NeymanScore` wrapping a `NeymanResult` holding the
        normalized statistic and the p-value reported by `power_divergence`.
        """
        observed = observation[~np.isnan(observation)]
        predicted = prediction[~np.isnan(prediction)]

        within_range = (all(x <= 1.00 for x in observed)
                        and all(x <= 1.00 for x in predicted))
        assert within_range, \
            "Probabiltity values should not be larger than 1.0"

        # Probabilities -> percentages.
        observed *= 100
        predicted *= 100

        # Strip units so that scipy receives plain arrays.
        if type(observed) is pq.quantity.Quantity:
            observed = observed.magnitude
        if type(predicted) is pq.quantity.Quantity:
            predicted = predicted.magnitude

        raw = power_divergence(f_obs=predicted,
                               f_exp=observed,
                               lambda_='neyman')

        utils.assert_dimensionless(raw.statistic)
        utils.assert_dimensionless(raw.pvalue)

        # A chi-squared distribution with `dof` degrees of freedom has mean
        # `dof` and standard deviation sqrt(2 * dof).
        dof = len(observed) - 1
        normalized_stat = abs(raw.statistic - dof) / np.sqrt(2 * dof)

        return NeymanScore(NeymanResult(normalized_stat, raw.pvalue))
Ejemplo n.º 3
0
def dp(h1, h2):
    """Return the privacy distance between histograms `h1` and `h2`.

    Both histograms hold counts, not probabilities. The distance is the
    Pearson chi-squared statistic of `h1` against the expected counts `h2`;
    the accompanying p-value is discarded.
    """
    statistic, _p_value = power_divergence(h1, f_exp=h2, lambda_="pearson")
    return statistic
 def chi_goodness_yats(self):
     """Run a chi-squared goodness-of-fit test with Yates' correction.

     Every observed count is moved 0.5 towards its expected count before the
     statistic is computed. The statistic and p-value, rounded to five
     decimals, are recorded through `add_result` under the keys
     "chi2_yats" and "p_yats".
     """
     expected = np.asarray(self.expected)
     raw_observed = np.asarray(self.observed)
     corrected = raw_observed + 0.5 * np.sign(expected - raw_observed)
     outcome = power_divergence(corrected, expected, ddof=0)
     statistic, p_value = outcome
     add_result(self, "chi2_yats", round(statistic, 5))
     add_result(self, "p_yats", round(p_value, 5))
    def compute(cls, observation, prediction):
        """
        Computes a Log-Likelihood Ratio score from an observation and a prediction.

        NaN entries are removed from `observation` and `prediction`
        independently before the G-test statistic is computed. The statistic
        is then normalized as its absolute distance from the mean of the
        chi-squared distribution, in units of that distribution's standard
        deviation.

        Returns a `Log_LikelihoodRatioScore` wrapping a
        `Log_LikelihoodRatioResult` holding the normalized statistic and the
        p-value reported by `power_divergence`.
        """

        obs_values = observation[~np.isnan(observation)]
        pred_values = prediction[~np.isnan(prediction)]

        # degrees of freedom for the Chi-squared distribution
        dof = len(obs_values) - 1

        # `ddof` is an *adjustment* subtracted from the default k - 1 degrees
        # of freedom, so it is left at its default of 0. Passing `dof` here
        # would leave k - 1 - (k - 1) = 0 degrees of freedom and a NaN
        # p-value.
        Log_LikelihoodRatio_Result = power_divergence(obs_values,
                                                      pred_values,
                                                      lambda_='log-likelihood')

        utils.assert_dimensionless(Log_LikelihoodRatio_Result.statistic)
        utils.assert_dimensionless(Log_LikelihoodRatio_Result.pvalue)

        # Obtaining a score value normalized respect to the mean and std of the Chi-squared distribution
        stat = Log_LikelihoodRatio_Result.statistic
        chisq_mean = dof
        chisq_std = np.sqrt(2 * dof)
        stat_n = abs(stat - chisq_mean) / chisq_std

        Log_LikelihoodRatio_result = Log_LikelihoodRatioResult(
            stat_n, Log_LikelihoodRatio_Result.pvalue)
        return Log_LikelihoodRatioScore(Log_LikelihoodRatio_result)
Ejemplo n.º 6
0
    def calcualte_yates_correction(self):
        """Run a Yates-corrected Pearson chi-squared test on three counts.

        The first six values returned by `self.calcuate_expected_observed()`
        are used as three observed counts followed by their three expected
        counts; the last two values are ignored. Each observed count is moved
        0.5 towards its expected count before the test.

        Returns the `power_divergence` result (statistic, p-value).
        """
        (obs_a, obs_b, obs_c,
         exp_a, exp_b, exp_c,
         _unused_p, _unused_q) = self.calcuate_expected_observed()

        expected = np.asarray([exp_a, exp_b, exp_c])
        raw_observed = np.asarray([obs_a, obs_b, obs_c])
        corrected = raw_observed + 0.5 * np.sign(expected - raw_observed)

        # The test is meant to run with 2 degrees of freedom; scipy wants the
        # difference to its default of (number of cells - 1).
        target_dof = 2
        delta_dof = corrected.size - 1 - target_dof

        return power_divergence(corrected,
                                expected,
                                ddof=delta_dof,
                                axis=None,
                                lambda_=None)
Ejemplo n.º 7
0
    def goodness_of_fit(self, mode='G-test'):
        '''
        Perform a goodness-of-fit test of the derived count rate.

        The expected counts of each interval (``self.mean_countrate`` times
        the interval durations ``self.dt[1:]``, scaled by ``self.RM.gain``)
        are compared with the "observed" counts, i.e. the gain-scaled
        differences between consecutive entries of ``self.x_new``
        (per the original author's description, the Poisson latent variable
        before read noise rather than the actual data -- not verifiable from
        this method alone). Only the intervals selected by
        ``self.good_intervals`` enter the test.

        The resulting statistic and p-value are stored in ``self.gof_stat``
        and ``self.gof_pval``; nothing is returned.

        :mode: the type of test performed.

               Possible values are 'G-test', 'Pearson-chi-sq'

               G-test: (https://en.wikipedia.org/wiki/G-test)
                   This is the default value.
                   The G-test statistic, based on a likelihood ratio, is a
                   better approximation to the chi-squared distribution than
                   Pearson's chi-square, which fails for small number counts.

               Pearson-chi-sq: (https://en.wikipedia.org/wiki/Pearson's_chi-squared_test)
                   Pearson's chi-square; should give similar results for
                   moderately large observed count rates.

        :raises ValueError: if ``mode`` is not one of the supported values.
        '''

        f_obs = self.RM.gain * (self.x_new[1:] -
                                self.x_new[:-1])[self.good_intervals]
        f_exp = self.RM.gain * (self.mean_countrate *
                                self.dt[1:])[self.good_intervals]
        # One degree of freedom is removed on top of the default k - 1.
        ddof = 1

        power_divergence_lambdas = {
            'G-test': 'log-likelihood',
            'Pearson-chi-sq': 'pearson',
        }

        if mode not in power_divergence_lambdas:
            # Previously this case only printed a message and then crashed
            # with UnboundLocalError when storing the (never computed)
            # results; fail explicitly instead.
            raise ValueError(
                'Goodness of fit test type not supported: {!r}'.format(mode))

        g, p = power_divergence(f_obs,
                                f_exp=f_exp,
                                ddof=ddof,
                                lambda_=power_divergence_lambdas[mode])

        self.gof_stat = g
        self.gof_pval = p
Ejemplo n.º 8
0
    def chi_square_yats(self):
        """Run a Yates-corrected chi-squared test on `self.get_observed()`.

        The observed table is wrapped in one extra leading dimension,
        expected frequencies are derived from its margins with
        `expected_freq`, and every observed cell is moved 0.5 towards its
        expected value before the Pearson statistic is computed over all
        cells.

        Returns the `power_divergence` result (statistic, p-value).
        """
        raw_observed = np.asarray([self.get_observed()])
        expected = expected_freq(raw_observed)

        # Degrees of freedom of the table under independence of its axes.
        table_dof = (expected.size - sum(expected.shape)
                     + expected.ndim - 1)

        corrected = raw_observed + 0.5 * np.sign(expected - raw_observed)
        # scipy wants the difference to its default of (number of cells - 1).
        delta_dof = corrected.size - 1 - table_dof

        return power_divergence(corrected, expected,
                                ddof=delta_dof, axis=None, lambda_=None)
Ejemplo n.º 9
0
def test_correct_sampling():
    """Check that every sampler draws states with the expected frequencies.

    For each entry of `samplers` a power (1 or 2) is drawn at random and the
    sampler is configured to sample from |psi|**power. Six independent
    histograms of visited states are then compared with the exact
    distribution through a Cressie-Read power-divergence test (lambda = 3/2).
    The test passes when the Fisher-combined p-value, or the largest
    individual p-value, exceeds 0.01.
    """
    for name, sa in samplers.items():
        print("Sampler test: %s" % name)

        machine = sa.machine
        hilbert = machine.hilbert
        n_states = hilbert.n_states
        n_samples = max(40 * n_states, 10000)

        # Length-1 array holding either 1 or 2.
        power = np.random.randint(1, 3, size=1)
        assert power == 1 or power == 2

        if power == 1:
            sa.machine_func = np.absolute
        if power == 2:
            def _squared_modulus(x, out):
                out[:] = np.absolute(x) ** 2
            sa.machine_func = _squared_modulus

        # Exact target distribution and the matching expected frequencies.
        probabilities = np.absolute(machine.to_array()) ** power
        probabilities /= probabilities.sum()
        f_exp = n_samples * probabilities

        n_rep = 6
        pvalues = np.zeros(n_rep)

        for rep in range(n_rep):
            histogram = np.zeros(n_states)

            # Burn-in phase: discard the first tenth of the sweeps.
            for _ in range(n_samples // 10):
                sa.sweep()

            # Fill in the histogram of visited states.
            for _ in range(n_samples):
                sa.sweep()
                for state in sa.visible:
                    histogram[hilbert.state_to_number(state)] += 1

            _statistic, pvalues[rep] = power_divergence(
                histogram, f_exp=f_exp, lambda_=3 / 2
            )

        _combined_stat, pval = combine_pvalues(pvalues, method="fisher")
        assert pval > 0.01 or np.max(pvalues) > 0.01
Ejemplo n.º 10
0
def discrete_p_p_plot(expectedRates, observed, ax=None, show=False):
    """Draw a P-P plot for discrete data and title it with a G-test result.

    The plot is drawn on `ax`, or on a new set of axes when `ax` is None.
    The occurrence counts returned by `count_occurrences` are compared with
    their expected counts through a log-likelihood-ratio test; half of the
    statistic and the p-value are written into the right-aligned title.
    When `show` is true the figure is displayed.

    Returns the axes that were drawn on.
    """
    ax = plt.axes() if ax is None else ax
    _p_p_plot(discrete_cdf(expectedRates), observed, ax)

    counts, expCounts = count_occurrences(expectedRates, observed)
    g_test = stats.power_divergence(counts,
                                    np.array(expCounts),
                                    lambda_="log-likelihood")
    chisq, pval = g_test

    title = "One-sided Psi stat: %s nats\np-value: %s" % (chisq / 2.0, pval)
    ax.set_title(title, loc='right')

    if show:
        plt.show()
    return ax
Ejemplo n.º 11
0
 def _compare_two_categorical_distributions(
     self,
     baseline: Union[list, np.ndarray, pd.Series],
     new: Union[list, np.ndarray, pd.Series],
     test: Optional[str] = "pearson",
 ) -> bool:
     """
     Compares two categorical distributions based on the chi-squared test
     """
     self._validate_test_name(test, "categorical")
     baseline = [cat for cat in baseline if cat in new]
     new = [cat for cat in new if cat in baseline]
     base_freq, new_freq = self._create_frequency_arrays(baseline, new)
     statistic, p_value = power_divergence(new_freq, base_freq, lambda_=test)
     return p_value < self._significance
Ejemplo n.º 12
0
    def test_simulated_fragment_size_distributions_match_that_to_simulate(self):
        """Check simulated fragment sizes against the requested distribution.

        For every gene the number of simulated fragments of each size in
        `f_size` is compared with the expected count
        `reads_per_gene * f_prob` through a power-divergence test. The check
        passes when the simulated proportions equal `f_prob` exactly, or
        when one minus the p-value does not exceed `alpha_distcomp`.
        """
        for gene in genes:
            n_simulated = np.array([
                sum(fragment_sizes_sim[gene] == size) for size in f_size
            ])
            expected_counts = reads_per_gene * f_prob

            # p-value for the two distributions being different.
            _statistic, p_divergence = power_divergence(
                n_simulated, expected_counts,
                lambda_=power_divergence_lambda)

            # The p-value is inverted to test whether the distributions are
            # identical. This is not statistically rigorous (failing to
            # reject the null hypothesis does not prove it), but it is the
            # pragmatic approach also common in normality testing.
            exact_match = all(f_prob == n_simulated / reads_per_gene)
            self.assertTrue(
                exact_match or ((1 - p_divergence) <= alpha_distcomp))
Ejemplo n.º 13
0
    def _power_divergence_test(self, f_obs, f_exp):
        """Tests the null hypothesis that the observed categorical data has the
        expected frequencies, using Cressie-Read power divergence statistic.

        Parameters
        ----------
        f_obs : array_like
            Observed frequencies in each category.
        f_exp : array_like, optional
            Expected frequencies in each category. By default the categories
            are assumed to be equally likely.
        """
        return power_divergence(
            f_obs=f_obs,
            f_exp=f_exp,
            ddof=self.ddof,
            axis=0,
            lambda_=self.power,
        )
def LLR_all_words(input_text, corpus):
    """Return a log-likelihood-ratio score for every word of `input_text`.

    The text is cleaned and tokenized, and its word counts are stored in the
    module-level `wcI`. Each word's count in the input is then compared with
    its count in the rest of the corpus through a G-test on a 2x2 table.

    NOTE(review): `corpusTokens` and `wcB` are read as module-level names and
    must be prepared by the caller beforehand; the `corpus` argument itself
    is not used here -- confirm against the calling code.

    Returns a dict mapping each word to its G statistic.
    """
    global wcI, wcB
    tokens = nltk.word_tokenize(cleanText(input_text))
    wcI = Counter(tokens)

    n1 = len(tokens)                  # tokens in the input text
    n2 = len(corpusTokens) - n1       # tokens in the rest of the corpus
    n = n1 + n2

    scores = {}
    for word, in_count in wcI.items():
        rest_count = wcB[word] - in_count
        # Pooled relative frequency of the word over the whole corpus.
        pooled = (in_count + rest_count) / n
        observed = np.array([[in_count, n1], [rest_count, n2]])
        expected = np.array([[n1 * pooled, n1], [n2 * pooled, n2]])
        g_test = power_divergence(f_obs=observed, f_exp=expected, lambda_=0)
        # The statistic is computed per column; the first column holds the
        # word counts.
        scores[word] = g_test.statistic[0]
    return scores
Ejemplo n.º 15
0
    def test_simulated_pAoffset_distribution_is_uniform(self):
        """Check that simulated poly(A) offsets are uniformly distributed.

        For every gene the number of simulated offsets at each position from
        `offset_min` to `pAlen_sim` (inclusive) is compared with a uniform
        expectation through a power-divergence test. The check passes when
        the simulated proportions are exactly uniform, or when one minus the
        p-value does not exceed `alpha_distcomp`.
        """
        for gene in genes:
            offsets = range(offset_min, pAlen_sim + 1)
            n_simulated = np.array([
                sum(pAoffsets_sim[gene] == offset) for offset in offsets
            ])
            n_positions = pAlen_sim - offset_min + 1

            # p-value for the two distributions being different.
            _statistic, p_divergence = power_divergence(
                n_simulated,
                reads_per_gene / n_positions,
                lambda_=power_divergence_lambda)

            # The p-value is inverted to test whether the distributions are
            # identical. This is not statistically rigorous (failing to
            # reject the null hypothesis does not prove it), but it is the
            # pragmatic approach also common in normality testing.
            exactly_uniform = all(
                n_simulated / reads_per_gene == 1 / n_positions)
            self.assertTrue(
                exactly_uniform or ((1 - p_divergence) <= alpha_distcomp))
    def compute(cls, observation, prediction):
        """
        Computes a Pearson's chi-squared score from an observation and a prediction.

        NaN entries are removed from `observation` and `prediction`
        independently before the Pearson statistic is computed. The statistic
        is then normalized as its absolute distance from the mean of the
        chi-squared distribution, in units of that distribution's standard
        deviation.

        Returns a `PearsonChiSquaredScore` wrapping a `PearsonResult` holding
        the normalized statistic and the p-value reported by
        `power_divergence`.
        """

        obs_values = observation[~np.isnan(observation)]
        pred_values = prediction[~np.isnan(prediction)]

        # degrees of freedom for the Chi-squared distribution
        dof = len(obs_values) - 1

        # `ddof` is an *adjustment* subtracted from the default k - 1 degrees
        # of freedom, so it is left at its default of 0. Passing `dof` here
        # would leave k - 1 - (k - 1) = 0 degrees of freedom and a NaN
        # p-value.
        Pearson_Result = power_divergence(obs_values, pred_values,
                                          lambda_='pearson')

        utils.assert_dimensionless(Pearson_Result.statistic)
        utils.assert_dimensionless(Pearson_Result.pvalue)

        # Obtaining a score value normalized respect to the mean and std of the Chi-squared distribution
        stat = Pearson_Result.statistic
        chisq_mean = dof
        chisq_std = np.sqrt(2 * dof)
        stat_n = abs(stat - chisq_mean) / chisq_std

        Pearson_result = PearsonResult(stat_n, Pearson_Result.pvalue)
        return PearsonChiSquaredScore(Pearson_result)
Ejemplo n.º 17
0
def power_div(fg_dist, bg_dist, lambda_="pearson"):
    """
    Compute the power divergence between two distributions.

    Categories whose `fg_dist` entry is zero are dropped from both inputs;
    `fg_dist` then provides the expected frequencies and `bg_dist` the
    observed ones. The test is only run when both pass the
    `above_threshold(..., 5)` check; otherwise `(None, -1)` is returned.
    The scipy documentation motivates that check:

        This test is invalid when the observed or expected frequencies in
        each category are too small. A typical rule is that all of the
        observed and expected frequencies should be at least 5.

    Returns the `scipy.stats.power_divergence` result (statistic, p-value),
    or `(None, -1)` when the test is skipped.
    """
    import numpy as np

    foreground = np.asanyarray(fg_dist)
    keep = foreground != 0
    f_obs = np.asanyarray(bg_dist)[keep]
    f_exp = foreground[keep]

    too_sparse = ((above_threshold(f_exp, 5) > 0.2) or
                  (above_threshold(f_obs, 5) > 0.2))
    if too_sparse:
        return None, -1

    from scipy.stats import power_divergence

    # scipy expects observed and expected counts to have the same total, so
    # the expected counts are rescaled to the number of observations.
    n_observations = sum(f_obs)
    expected_share = f_exp / sum(f_exp)
    f_exp = n_observations * expected_share
    return power_divergence(f_exp=f_exp, f_obs=f_obs, lambda_=lambda_)
def log_lik_ratio(N,Y,p0):
    """Return the G statistic of `Y` successes in `N` trials against rate `p0`.

    The expected number of successes is `p0 * N` rounded to the nearest
    integer; observed and expected (failures, successes) pairs are compared
    with the log-likelihood-ratio statistic.
    """
    expected_successes = int(round(p0 * N))
    observed = [N - Y, Y]
    expected = [N - expected_successes, expected_successes]
    statistic, _p_value = power_divergence(
        f_obs=observed, f_exp=expected, lambda_="log-likelihood")
    return statistic
Ejemplo n.º 19
0
def _check_triplet_dependency(triplets_for_estimation,
                              triplets_for_independence):
    n = triplets_for_estimation.shape[0]
    triplets = np.apply_along_axis(''.join, 1,
                                   triplets_for_estimation.astype(str))
    triplets, counts = np.unique(triplets, return_counts=True)
    triplet_probs = 1. * counts / counts.sum()

    bits, counts_0 = np.unique(triplets_for_independence[:, 0],
                               return_counts=True)
    if bits[0] != 0:
        bits = np.insert(bits, 0, 0)
        counts_0 = np.insert(counts_0, 0, 0)
    if bits[1] != 1:
        counts_0 = np.insert(counts_0, 1, 0)
    probs_0 = 1. * counts_0 / counts_0.sum()

    bits, counts_1 = np.unique(triplets_for_independence[:, 1],
                               return_counts=True)
    if bits[0] != 0:
        bits = np.insert(bits, 0, 0)
        counts_1 = np.insert(counts_1, 0, 0)
    if bits[1] != 1:
        counts_1 = np.insert(counts_1, 1, 0)
    probs_1 = 1. * counts_1 / counts_1.sum()

    bits, counts_2 = np.unique(triplets_for_independence[:, 2],
                               return_counts=True)
    if bits[0] != 0:
        bits = np.insert(bits, 0, 0)
        counts_2 = np.insert(counts_2, 0, 0)
    if bits[1] != 1:
        counts_2 = np.insert(counts_2, 1, 0)
    probs_2 = 1. * counts_2 / counts_2.sum()

    observed = []
    expected = []
    for t in ['000', '001', '010', '011', '100', '101', '110', '111']:
        if t not in triplets:
            triplets = np.append(triplets, t)
            triplet_probs = np.append(triplet_probs, 0.)
            counts = np.append(counts, 0.)

    # sns.barplot(triplets, counts, color='black')
    # plt.xlabel('Triplet')
    # plt.ylabel('Count')
    # plt.show()

    for i in range(8):
        triplet = np.array(list(triplets[i]), dtype=int)
        n_triplet = counts[i]
        # p_triplet = triplet_probs[i]
        p_ind = probs_0[triplet[0]] * probs_1[triplet[1]] * probs_2[triplet[2]]
        n_ind = n * p_ind
        # print('Triplet: {0}, p(i,j,k) = {1:5f}, p(i)p(j)p(k) = {2:5f}. diff = {3:5f}'.format(triplets[i], p_triplet, p_ind, p_triplet - p_ind))
        if p_ind == 0:
            continue
        observed.append(n_triplet)
        expected.append(n_ind)
        # expected.append(np.ceil(n_ind).astype(int))

    # df = pd.DataFrame.from_records(np.vstack((observed, expected)).T, index=triplets, columns=['Observed', 'Expected']).astype(int)
    # print(df)

    print('m = array(c{}, dim=c(2, 2, 2))'.format(
        tuple(np.array(observed)[np.argsort(triplets)].astype(int).tolist())))

    chi2, p_value = power_divergence(observed,
                                     expected,
                                     ddof=3,
                                     lambda_='pearson')
    # print('Chi-square p-value', p_value, '=>',
    #       ('Indepedent' if p_value > 0.05 else 'There is dependency'))
    # print()
    return p_value
Ejemplo n.º 20
0
def chi2_independence(data, x, y, correction=True):
    """
    Chi-squared independence tests between two categorical variables.

    The power-divergence statistic is evaluated for six values of
    :math:`\\lambda`: 1, 2/3, 0, -1/2, -1 and -2 (Cressie and Read, 1984).

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test.
    x, y : string
        Names of the two columns of ``data`` to cross-tabulate.
    correction : bool
        Whether to apply Yates' correction when the contingency table has
        one degree of freedom (Yates 1934).

    Returns
    -------
    expected : pd.DataFrame
        The expected contingency table of frequencies.
    observed : pd.DataFrame
        The (corrected or not) observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary, one row per statistic, with the columns:

        * ``'test'``: The statistic name
        * ``'lambda'``: The :math:`\\lambda` value used for the power
          divergence statistic
        * ``'chi2'``: The test statistic
        * ``'dof'``: The degrees of freedom of the contingency table
        * ``'p'``: The p-value of the test
        * ``'cramer'``: The Cramer's V effect size
        * ``'power'``: The statistical power of the test

    Notes
    -----
    From Wikipedia:

    *The chi-squared test is used to determine whether there is a significant
    difference between the expected frequencies and the observed frequencies
    in one or more categories.*

    Typical uses are *i*) judging how well a categorical variable predicts a
    class column (a good predictor gives a high :math:`\\chi^2` and a low
    p-value) and *ii*) checking how similar two categorical variables are
    (similar variables give a low :math:`\\chi^2` and a high p-value).

    This function is a wrapper around the
    :py:func:`scipy.stats.power_divergence` function.

    .. warning :: As a general guideline for the consistency of this test, the
        observed and the expected contingency tables should not have cells
        with frequencies lower than 5. A warning is issued otherwise.

    References
    ----------
    .. [1] Cressie, N., & Read, T. R. (1984). Multinomial goodness-of-fit
           tests. Journal of the Royal Statistical Society: Series B
           (Methodological), 46(3), 440-464.

    .. [2] Yates, F. (1934). Contingency Tables Involving Small Numbers and the
           :math:`\\chi^2` Test. Supplement to the Journal of the Royal
           Statistical Society, 1, 217-235.

    Examples
    --------
    Let's see if gender is a good categorical predictor for the presence of
    heart disease.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_independence')
    >>> data['sex'].value_counts(ascending=True)
    0     96
    1    207
    Name: sex, dtype: int64

    If gender is not a good predictor for heart disease, we should expect the
    same 96:207 ratio across the target classes.

    >>> expected, observed, stats = pg.chi2_independence(data, x='sex',
    ...                                                  y='target')
    >>> expected
    target          0           1
    sex
    0       43.722772   52.277228
    1       94.277228  112.722772

    Let's see what the data tells us.

    >>> observed
    target      0     1
    sex
    0        24.5  71.5
    1       113.5  93.5

    The proportion is lower on the class 0 and higher on the class 1. The
    tests should be sensitive to this difference.

    >>> stats.round(3)
                     test  lambda    chi2  dof    p  cramer  power
    0             pearson   1.000  22.717  1.0  0.0   0.274  0.997
    1        cressie-read   0.667  22.931  1.0  0.0   0.275  0.998
    2      log-likelihood   0.000  23.557  1.0  0.0   0.279  0.998
    3       freeman-tukey  -0.500  24.220  1.0  0.0   0.283  0.998
    4  mod-log-likelihood  -1.000  25.071  1.0  0.0   0.288  0.999
    5              neyman  -2.000  27.458  1.0  0.0   0.301  0.999

    Very low p-values indeed. The gender qualifies as a good predictor for the
    presence of heart disease on this dataset.
    """
    # Python code inspired by SciPy's chi2_contingency
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert all(col in data.columns for col in (x, y)),\
        'columns are not in dataframe.'
    assert isinstance(correction, bool), 'correction must be a boolean.'

    observed = pd.crosstab(data[x], data[y])
    if observed.size == 0:
        raise ValueError('No data; observed has size 0.')

    expected = pd.DataFrame(expected_freq(observed),
                            index=observed.index,
                            columns=observed.columns)

    # The test is unreliable when any cell count is below 5.
    for label, table in (('observed', observed), ('expected', expected)):
        if (table < 5).any(axis=None):
            warnings.warn('Low count on {} frequencies.'.format(label))

    dof = float(expected.size - sum(expected.shape) + expected.ndim - 1)

    if dof == 1 and correction:
        # Yates' correction for continuity: move every observed cell 0.5
        # towards its expected value.
        observed = observed + 0.5 * np.sign(expected - observed)

    # scipy wants the difference to its default of (number of cells - 1).
    ddof = observed.size - 1 - dof
    n = data.shape[0]

    lambda_by_test = (
        ("pearson", 1.0),
        ("cressie-read", 2 / 3),
        ("log-likelihood", 0.0),
        ("freeman-tukey", -1 / 2),
        ("mod-log-likelihood", -1.0),
        ("neyman", -2.0),
    )

    rows = []
    for test_name, lambda_ in lambda_by_test:
        if dof == 0:
            # Degenerate table: nothing to test.
            statistic, pvalue, cramer, power = 0.0, 1.0, np.nan, np.nan
        else:
            statistic, pvalue = power_divergence(observed,
                                                 expected,
                                                 ddof=ddof,
                                                 axis=None,
                                                 lambda_=lambda_)
            dof_cramer = min(expected.shape) - 1
            cramer = np.sqrt(statistic / (n * dof_cramer))
            power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05)

        rows.append({
            'test': test_name,
            'lambda': lambda_,
            'chi2': statistic,
            'dof': dof,
            'p': pvalue,
            'cramer': cramer,
            'power': power
        })

    column_order = ['test', 'lambda', 'chi2', 'dof', 'p', 'cramer', 'power']
    summary = pd.DataFrame(rows)[column_order]
    return expected, observed, summary
Ejemplo n.º 21
0
 def chi_square(self, c, n):
     """Print two equivalent chi-squared uniformity tests on random draws.

     `n` values are drawn as ``ceil(rand * c)`` and the frequency of each
     distinct value is tested against a uniform expectation, once with
     `chisquare` and once with `power_divergence` using ``lambda_=1``. Both
     results are printed; nothing is returned.
     """
     draws = np.ceil(np.random.rand(n) * c)
     frequencies = list(Counter(draws).values())
     via_chisquare = chisquare(frequencies)
     print(via_chisquare)
     via_power_divergence = power_divergence(frequencies, lambda_=1)
     print(via_power_divergence)
Ejemplo n.º 22
0
# Normalise the simulated weights into probabilities. The total is computed
# once instead of once per element.
total_simulated_prob = sum(simulated_probs)
simulated_probs = [prob / total_simulated_prob for prob in simulated_probs]

# Draw N simulated data sets, each with as many mutations as were observed,
# sampling positions (with replacement) according to `simulated_probs`.
background = np.random.choice(simulated_position, size=(N, N_observed_mutations), p=simulated_probs, replace=True)

# Compare observed and simulated mutation counts chunk by chunk.
list_results = []

idchunk = 0
for startp in tqdm(range(0, last_position, chunk_size)):
    observed_count_chunk = count_mutation_chunk(startp, observed_mutations)
    # Mutation count of this chunk in every simulated data set.
    chunk_simulation = [count_mutation_chunk(startp, simulation)
                        for simulation in background]
    # Mean number of simulated mutations in the chunk, to be compared with
    # the observed count.
    mean_count_simulated = np.nanmean(chunk_simulation)
    # 95% confidence interval of the simulated mean. `interval` returns
    # (lower, upper), so the lower bound goes to `cil` and the upper bound to
    # `cih`; they used to be unpacked the other way round, which swapped the
    # "cintervL" and "cintervH" output columns.
    (cil, cih) = st.t.interval(0.95, len(chunk_simulation) - 1, loc=np.mean(chunk_simulation), scale=st.sem(chunk_simulation))
    std = np.std(chunk_simulation)
    # 2x2 comparison: mutations inside vs. outside the chunk, observed (a, b)
    # vs. simulated mean (c, d).
    a = observed_count_chunk
    b = N_observed_mutations - observed_count_chunk
    c = mean_count_simulated
    d = N_observed_mutations - mean_count_simulated
    if a > 0:
        # NOTE(review): b == 0 (all observed mutations in this chunk) or
        # c == 0 (no simulated mutation in this chunk) is not guarded here.
        odds_ratio = (a / b) / (c / d)
        u, p_value = stats.power_divergence(f_obs=[a, b], f_exp=[c, d], lambda_="log-likelihood")  # perform g-test
        list_results.append([startp, startp + chunk_size, idchunk, observed_count_chunk, mean_count_simulated, odds_ratio, u, p_value, cil, cih, std])  # save results
        # The output file is rewritten after every reported chunk
        # (presumably so that partial results survive an interruption).
        df_results = pd.DataFrame(list_results, columns=["start_chunk", "end_chunk", "id_chunk", "n_observed", "mean_simulated", "odds_ratio", "u", "pvalue", "cintervL", "cintervH", "stdev"])
        df_results.to_csv(outfile, sep="\t", index=False)
    idchunk += 1
Ejemplo n.º 23
0
def custom(a, b):
    """Return the p-value of the default power-divergence (Pearson) test.

    `a` holds the observed frequencies and `b` the expected ones; the test
    statistic itself is discarded.
    """
    return stats.power_divergence(a, b)[1]
Ejemplo n.º 24
0
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Generates N (=250) points from the default data parameters of the
        model type
    2. Initializes a component model with that data (and drawn hyperparameters)
    3. Samples parameters from that component model and generates N new
        points from them
    4. Initializes a crosscat state with that data
    5. Gets one sample after 100 transitions
    6. Draws N predictive samples
    7. Generates the discrete support of the model
    8. Calculates the predictive probability given the sample for each point
        in the support
    9. Histograms the generated data and the predictive samples
    10. Calculates goodness of fit stats: two-sample KS test for continuous
        types, Pearson chi-square for discrete types
    11. (OPTIONAL) Plots the data, predictive samples, pdf, and predictive
        probabilities and saves the figure as "<model_type>_single.png"

    :param component_model_type: component model class under test
    :param show_plot: when True, produce and save the diagnostic figure
    :return: p value of the goodness-of-fit test
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # 'F' is the explicit spelling of the legacy truthy order argument
    # (flatten(1)), which current NumPy rejects.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c,
                                    X_L,
                                    X_D, [], [(N, 0)],
                                    get_next_seed,
                                    n=N)).flatten('F')

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # No conditioning values are passed ([] * len(Q) was always just []).
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete histograms are normalized by hand to sum to 1, continuous ones
    # are density-normalized by numpy.
    if discrete:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # density=True replaces the removed normed=True; the bins are
        # equal-width, so the values are the same.
        T_hist, edges = numpy.histogram(T,
                                        bins=min(20, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        # NOTE(review): recent SciPy raises ValueError when the totals of
        # freq_obs and freq_exp differ; freq_exp only sums to N if the
        # predictive probabilities over the support sum to 1 - confirm.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data')
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(
                          component_model_type.log_pdf(
                              numpy.array(discrete_support),
                              model_parameters)),
                      c="blue",
                      s=100,
                      label="true pdf",
                      alpha=1)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      s=100,
                      label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def pvalue(f_obs, p_exp, modifier=id):
    """Return the p-value of a G-test of observed counts against expected proportions.

    The expected counts are ``p_exp`` scaled by the total of ``f_obs``.  Both
    the observed and the expected counts are passed through ``modifier``
    before being handed to ``stats.power_divergence`` with
    ``lambda_='log-likelihood'``; the statistic is discarded.

    NOTE(review): the default ``modifier`` is whatever ``id`` is bound to when
    this function is defined.  If that is the builtin ``id`` rather than an
    identity helper defined elsewhere in the file, calling with the default
    is meaningless - confirm.
    """
    observed = modifier(f_obs)
    expected = modifier(p_exp * np.sum(f_obs))
    result = stats.power_divergence(f_obs=observed,
                                    f_exp=expected,
                                    lambda_='log-likelihood')
    return result[1]
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Generates N (=250) points from the default data parameters of the
        model type
    2. Initializes a component model with that data (and drawn hyperparameters)
    3. Samples parameters from that component model and generates N new
        points from them
    4. Initializes a crosscat state with that data
    5. Gets one sample after 100 transitions
    6. Draws N predictive samples
    7. Generates the discrete support of the model
    8. Calculates the predictive probability given the sample for each point
        in the support
    9. Histograms the generated data and the predictive samples
    10. Calculates goodness of fit stats: two-sample KS test for continuous
        types, Pearson chi-square for discrete types
    11. (OPTIONAL) Plots the data, predictive samples, pdf, and predictive
        probabilities and saves the figure as "<model_type>_single.png"

    :param component_model_type: component model class under test
    :param show_plot: when True, produce and save the diagnostic figure
    :return: p value of the goodness-of-fit test
    """
    N = 250

    get_next_seed = lambda : random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X, gen_seed=get_next_seed())[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # 'F' is the explicit spelling of the legacy truthy order argument
    # (flatten(1)), which current NumPy rejects.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c, X_L, X_D, [], [(N,0)], get_next_seed, n=N)).flatten('F')

    # get support
    discrete_support = component_model_type.generate_discrete_support(model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    # No conditioning values are passed ([] * len(Q) was always just []).
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete histograms are normalized by hand to sum to 1, continuous ones
    # are density-normalized by numpy.
    if discrete:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        # density=True replaces the removed normed=True; the bins are
        # equal-width, so the values are the same.
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        # NOTE(review): recent SciPy raises ValueError when the totals of
        # freq_obs and freq_exp differ; freq_exp only sums to N if the
        # predictive probabilities over the support sum to 1 - confirm.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples')

        # plot actual pdf of support given data params
        true_log_pdf = component_model_type.log_pdf(numpy.array(discrete_support),
            model_parameters)
        pylab.scatter(discrete_support,
            numpy.exp(true_log_pdf),
            c="blue",
            s=100,
            label="true pdf",
            alpha=1)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
            numpy.exp(probabilities),
            c="red",
            s=100,
            label="predictive probability",
            alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """
    Tests how well CrossCat recovers a single column generated from a mixture
    of num_clusters equally weighted components of component_model_type.
    1. Seeds the random module with seed and generates N (=1000) points
    2. Initializes a crosscat state with that data and runs 200 transitions
    3. Draws a predictive column from the resulting sample
    4. Calculates the predictive probability of every point of the mixture
        support
    5. Histograms the generated data and the predictive samples
    6. Calculates goodness of fit stats: two-sample KS test for continuous
        types, Pearson chi-square for discrete types
    7. (OPTIONAL) Plots data, predictive samples, true pdf and predictive
        probabilities and shows the figure

    :param component_model_type: component model class under test
    :param num_clusters: number of mixture components
    :param show_plot: when True, display the diagnostic figure
    :param seed: seed passed to random.seed
    :return: p value of the goodness-of-fit test
    """
    random.seed(seed)

    N = 1000
    separation = .9

    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                        [separation], seed=get_next_seed(),
                        distargs=[distargs[cctype]],
                        return_structure=True)

    T = numpy.array(T)
    T_list = T

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # 'F' is the explicit spelling of the legacy truthy order argument
    # (flatten(1)), which current NumPy rejects.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                            seed=get_next_seed()).flatten('F')

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                         structure['component_params'][0], nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    # No conditioning values are passed ([] * len(Q) was always just []).
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete counts are normalized by hand to sum to 1, continuous ones are
    # density-normalized by numpy.
    if discrete:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        # density=True replaces the removed normed=True; the bins are
        # equal-width, so the values are the same.
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        # NOTE(review): recent SciPy raises ValueError when the totals of
        # freq_obs and freq_exp differ; freq_exp only sums to N if the
        # predictive probabilities over the support sum to 1 - confirm.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                structure['component_params'][0], [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
            numpy.exp(lpdf),
            c="blue",
            edgecolor="none",
            s=100,
            label="true pdf",
            alpha=1,
            zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
            numpy.exp(probabilities),
            c="red",
            edgecolor="none",
            s=100,
            label="predictive probability",
            alpha=1,
            zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        pylab.show()

    return p
Ejemplo n.º 28
0
         len(controls) - len(segment[3])
     ]
 ]
 # Choose the significance test for contingency_table according to `method`.
 # The named branches leave the p-value in `p` and a display label in
 # `method_name`.
 if method == 'chi':
     # `yates` switches the continuity correction on or off and is reflected
     # in the label.
     p = chi2_contingency(contingency_table, correction=yates)[1]
     if yates:
         method_name = 'Yates chi-squared'
     else:
         method_name = 'Chi-squared'
 elif method == 'fisher':
     p = fisher_exact(contingency_table)[1]
     method_name = 'Fisher'
 elif method == 'g':
     # Observed counts: the two table rows joined end to end (assumes the
     # rows are plain lists, so `+` concatenates).  Expected counts: the
     # flattened expected-frequency table.  ddof=2 reduces the degrees of
     # freedom used for the p-value by two.
     p = power_divergence(
         contingency_table[0] + contingency_table[1],
         f_exp=expected_freq(contingency_table).ravel(),
         ddof=2,
         lambda_='log-likelihood')[1]
     method_name = 'G-test'
 else:
     # Any other `method`: tally the expected cell frequencies that are at
     # least 5 and those below 1.
     expected_frequency_table = expected_freq(contingency_table)
     num_large_cells = 0
     num_small_cells = 0
     for row in expected_frequency_table:
         for cell in row:
             if cell >= 5:
                 num_large_cells += 1
             elif cell < 1:
                 num_small_cells += 1
                 # Leaves only the inner loop: the rest of this row is
                 # skipped, later rows are still scanned.
                 break
     if num_large_cells >= 3 and num_small_cells == 0:
Ejemplo n.º 29
0
def result_one_sample_chi():
    """
    Run a one-sample chi-square test and return the result as a JSON response.

    Request parameters (read from ``init_route()``): {
        "table_name": ""  # str, database table name
        "X": ["x1", "x2"],  # list, variables to test
        "E": ["e1","e2"],  # list, expected-frequency variables
        "input_e": [2,3,4],  # expected frequencies typed in by the user
        "button_type": ["select","input","null"]  # type of the button; only element 0 is read
    }

    ``button_type[0]`` selects where the expected frequencies come from:
    'null' (none given), 'select' (columns ``E`` of the table) or 'input'
    (``input_e``).  Any other value leaves the result empty.

    :return: ``jsonify`` of {"code": "200", "msg": "ok!", "res": <rendered table or []>}
    :raises: any exception from the lookup or the analysis is logged and re-raised
    """
    log.info('result_one_sample_chi_get_results_init...')
    request_data = init_route()
    try:
        table_name = request_data['table_name']
        X = request_data['X']
        E = request_data['E']
        input_e = request_data['input_e']
        button_type = request_data['button_type']

    except Exception as e:
        log.info(e)
        raise e
    # NOTE(review): always true because X is wrapped in a list first;
    # probably meant isinstance(X, list) - confirm before tightening.
    assert isinstance([X], list)
    results = []
    try:
        mode = button_type[0]
        if mode == 'null':
            da = exec_sql(table_name, X)
            da = da.astype(float)
            data = [da[i] for i in X]
            log.info("输入数据大小:{}".format(len(data)))
            n_cols = da.shape[1]
            if n_cols == 1:
                statistic, pvalue = stats.power_divergence(da[X[0]], axis=0)
                results = _one_sample_chi_table(statistic, pvalue, X)
            elif n_cols > 1:
                statistic, pvalue = stats.power_divergence(data, axis=1)
                results = _one_sample_chi_table(statistic, pvalue, X)
            log.info("无期望频率情况分析完成")

        elif mode in ('select', 'input'):
            te = exec_sql(table_name, X)
            te = te.astype(float)
            test = [te[i] for i in X]
            if mode == 'select':
                # Expected frequencies come from columns E of the same table.
                ex = exec_sql(table_name, E)
                ex = ex.astype(float)
                expect = [ex[j] for j in E]
                done_message = "有期望频率情况分析完成"
            else:
                # Expected frequencies come from the request, converted to
                # floats via a DataFrame round trip.
                expect = pd.DataFrame(input_e)
                expect = expect.astype(float)
                expect = expect.values.tolist()
                done_message = "用户输入的期望频率情况分析完成"
            log.info("输入数据大小:{}".format(len(test)))

            n_cols = te.shape[1]
            if n_cols >= 1:
                # NOTE(review): with a single column `test` is still a
                # one-element list, so axis=0 runs across that length-1 axis,
                # unlike the 'null' branch which passes the column itself -
                # confirm this is intended.
                axis = 0 if n_cols == 1 else 1
                statistic, pvalue = stats.power_divergence(test, expect, axis=axis)
                results = _one_sample_chi_table(statistic, pvalue, X)
            log.info(done_message)
        response_data = {
                             "code": "200",
                             "msg": "ok!",
                             "res":results}
        return jsonify(response_data)
    except Exception as e:
        log.error(e)
        raise e


def _one_sample_chi_table(statistic, pvalue, row):
    """Format a statistic/p-value pair and render it as the one-sample chi-square HTML table."""
    d = pd.DataFrame([statistic, pvalue]).T
    d = d.astype(float)
    d = format_data_col(d)
    return transform_table_data_to_html({
        'title': '单样本卡方检验',
        'col': ['卡方', '显著性'],
        'row': row,
        'data': d.values.tolist()
    })
def test_one_feature_mixture(component_model_type,
                             num_clusters=3,
                             show_plot=False,
                             seed=None):
    """
    Tests how well CrossCat recovers a single column generated from a mixture
    of num_clusters equally weighted components of component_model_type.
    1. Seeds the random module with seed and generates N (=300) points
    2. Initializes a crosscat state with that data and runs 200 transitions
    3. Draws a predictive column from the resulting sample
    4. Calculates the predictive probability of every point of the mixture
        support
    5. Histograms the generated data and the predictive samples
    6. Calculates goodness of fit stats: two-sample KS test for continuous
        types, Pearson chi-square for discrete types
    7. (OPTIONAL) Plots data, predictive samples, true pdf and predictive
        probabilities and saves the figure as "<model_type>_mixtrue.png"

    :param component_model_type: component model class under test
    :param num_clusters: number of mixture components
    :param show_plot: when True, produce and save the diagnostic figure
    :param seed: seed passed to random.seed
    :return: p value of the goodness-of-fit test
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # 'F' is the explicit spelling of the legacy truthy order argument
    # (flatten(1)), which current NumPy rejects.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten('F')

    # No conditioning values are passed ([] * len(Q) was always just []).
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    discrete = is_discrete[component_model_type.model_type]

    # get histogram. Different behavior for discrete and continuous types:
    # discrete counts are normalized by hand to sum to 1, continuous ones are
    # density-normalized by numpy.
    if discrete:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # density=True replaces the removed normed=True; the bins are
        # equal-width, so the values are the same.
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not discrete:
        # do a KS test if the distribution is continuous
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        # NOTE(review): recent SciPy raises ValueError when the totals of
        # freq_obs and freq_exp differ; freq_exp only sums to N if the
        # predictive probabilities over the support sum to 1 - confirm.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
Ejemplo n.º 31
0
 def test_gGoodnessOfFit_xResult(self):
     """The statistic of g_goodness_of_fit_test agrees with SciPy's G-test.

     Observed and expected counts share the same total (100): current SciPy
     raises ValueError in power_divergence when the totals differ, which
     would abort the reference computation before anything is compared.
     """
     observed = [10, 20, 30, 40]
     expected = [25, 25, 25, 25]
     x1, _ = g_goodness_of_fit_test(observed, expected)
     x2, _ = power_divergence(observed, expected, lambda_='log-likelihood')
     assert pytest.approx(x2) == x1
def check_predictive_sample_improvement(component_model_type, seed=0, show_plot=True):
	""" Checks that the error of predictive samples shrinks over iterations.

	A single column of N (=150) points is generated from a two-component
	mixture of component_model_type.  For each of 10 freshly initialised
	states, 100 single transitions are run; after every transition a
	predictive column is drawn and two quantities are recorded: the error of
	that column (Pearson chi-square statistic against the expected counts for
	'multinomial', squared difference to the data otherwise) and the value
	returned by qtu.KL_divergence.

	:param component_model_type: component model class under test
	:param seed: seed passed to random.seed
	:param show_plot: when True, plot mean error and mean KL divergence with
		error bars and show the figure
	:return: truth value of "mean error and mean KL divergence are both lower
		after the last transition than after the first"
	"""

	num_transitions = 100
	num_samples = 10
	num_clusters = 2
	separation = .9	# cluster separation
	N = 150

	random.seed(seed)

	def get_next_seed():
		return random.randrange(2147483647)

	# generate a single column of data from the component_model
	cctype = component_model_type.cctype
	T, M_c, struc = sdg.gen_data(
		[cctype], N, [0], [[.5,.5]], [separation],
		seed=get_next_seed(),
		distargs=[distargs[cctype]],
		return_structure=True)

	T_array = numpy.array(T)

	# X accumulates the per-point error, KL the divergence per (state, transition)
	X = numpy.zeros((N,num_transitions))
	KL = numpy.zeros((num_samples, num_transitions))

	support = qtu.get_mixture_support(
		cctype, component_model_type,
		struc['component_params'][0], nbins=1000, support=.995)
	true_log_pdf = qtu.get_mixture_pdf(
		support, component_model_type,
		struc['component_params'][0],[.5,.5])

	for sample_idx in range(num_samples):
		# generate the state
		state = State.p_State(M_c, T, SEED=get_next_seed())

		for step in range(num_transitions):
			# transition
			state.transition()

			# get partitions and generate a predictive column
			X_L = state.get_X_L()
			X_D = state.get_X_D()

			T_inf = sdg.predictive_columns(
				M_c, X_L, X_D, [0], seed=get_next_seed())

			if cctype == 'multinomial':
				# expected counts: component weights scaled by 1/num_clusters, times N
				K = distargs[cctype]['K']
				weights = numpy.zeros(numpy.array(K))
				for params in struc['component_params'][0]:
					weights += numpy.array(params['weights'])*(1.0/num_clusters)
				weights *= float(N)
				inf_hist = qtu.bincount(T_inf, bins=list(range(K)))
				err, _ = stats.power_divergence(inf_hist, weights, lambda_='pearson')
				# the same statistic is used as the error of every data point
				err = numpy.ones(N)*err
			else:
				err = (T_array-T_inf)**2.0

			KL[sample_idx,step] = qtu.KL_divergence(
				component_model_type,
				struc['component_params'][0], [.5,.5], M_c, X_L, X_D,
				true_log_pdf=true_log_pdf, support=support)

			for point in range(N):
				X[point,step] += err[point]

	# average the accumulated error over the states
	X /= num_samples

	# mean and standard error
	root_samples = float(num_samples)**.5

	X_mean = numpy.mean(X,axis=0)
	X_err = numpy.std(X,axis=0)/root_samples

	KL_mean = numpy.mean(KL, axis=0)
	KL_err = numpy.std(KL, axis=0)/root_samples

	if show_plot:
		iterations = list(range(num_transitions))

		pylab.subplot(1,2,1)
		pylab.errorbar(iterations, X_mean, yerr=X_err)
		pylab.xlabel('iteration')
		pylab.ylabel('error across each data point')
		pylab.title('error of predictive sample over iterations, N=%i' % N)

		pylab.subplot(1,2,2)
		pylab.errorbar(list(range(num_transitions)), KL_mean, yerr=KL_err)
		pylab.xlabel('iteration')
		pylab.ylabel('KL divergence')
		pylab.title('KL divergence, N=%i' % N)

		pylab.show()

	# error should decrease over time
	error_decreased = X_mean[0] > X_mean[-1]
	kl_decreased = KL_mean[0] > KL_mean[-1]
	return error_decreased and kl_decreased
Ejemplo n.º 33
0
     #Cases without segment         Controls without segment
     [len(cases) - len(segment[2]), len(controls) - len(segment[3])]
 ]
 # Choose the significance test for contingency_table according to `method`.
 # The named branches leave the p-value in `p` and a display label in
 # `method_name`.
 if method == 'chi':
     # `yates` switches the continuity correction on or off and is reflected
     # in the label.
     p = chi2_contingency(contingency_table, correction=yates)[1]
     if yates:
         method_name = 'Yates chi-squared'
     else:
         method_name = 'Chi-squared'
 elif method == 'fisher':
     p = fisher_exact(contingency_table)[1]
     method_name = 'Fisher'
 elif method == 'g':
     # Observed counts: the two table rows joined end to end (the rows are
     # plain lists, so `+` concatenates).  Expected counts: the flattened
     # expected-frequency table.  ddof=2 reduces the degrees of freedom used
     # for the p-value by two.
     p = power_divergence(
         contingency_table[0] + contingency_table[1],
         f_exp=expected_freq(contingency_table).ravel(),
         ddof=2,
         lambda_='log-likelihood'
     )[1]
     method_name = 'G-test'
 else:
     # Any other `method`: tally the expected cell frequencies that are at
     # least 5 and those below 1.
     expected_frequency_table = expected_freq(contingency_table)
     num_large_cells = 0
     num_small_cells = 0
     for row in expected_frequency_table:
         for cell in row:
             if cell >= 5:
                 num_large_cells += 1
             elif cell < 1:
                 num_small_cells += 1
                 # Leaves only the inner loop: the rest of this row is
                 # skipped, later rows are still scanned.
                 break
     if num_large_cells >= 3 and num_small_cells == 0:
def lik_test(N, Y, p0):
    """Return the p-value of a likelihood-ratio (G) test for Y hits out of N.

    The observed split ``[N - Y, Y]`` is compared with the split expected
    under a hit probability ``p0``, where the expected number of hits is
    ``p0 * N`` rounded to the nearest integer.

    The result is not corrected for multiple comparisons; it is only meant
    as an aid to rough estimation.
    """
    expected_hits = int(round(p0 * N))
    observed = [N - Y, Y]
    expected = [N - expected_hits, expected_hits]
    outcome = power_divergence(f_obs=observed, f_exp=expected, lambda_="log-likelihood")
    # power_divergence yields (statistic, p-value); only the p-value is wanted.
    return outcome[1]
Ejemplo n.º 35
0
    def goodness_of_fit(self,mode='G-test'):

        '''
        Perform a goodness-of-fit test of the derived count rate.

        The expected counts (final rate times the interval times, divided by the gain) are
        compared with the "observed counts", taken as the differences of consecutive
        entries of ``self.RM.noisy_counts``. Only the intervals flagged in
        ``self.good_intervals`` take part in the test.

        The outcome is not returned; it is stored on the instance:

            self.gof_stat : the test statistic
            self.gof_pval : the associated p-value

        If no interval is flagged as good, ``gof_stat`` is set to ``-inf`` and ``gof_pval``
        to 0 without looking at ``mode``.

        :mode: the type of test performed.

               Possible values are 'G-test', 'Pearson-chi-sq', 'Squared-deviations-nocov',
               'Squared-deviations', 'poisson-likelihood', 'full-likelihood'

               G-test: (https://en.wikipedia.org/wiki/G-test)
                   This is the default value.
                   The G-test statistics, based on a likelihood ratio, is a better approximation to the chi-squared distribution
                   than Pearson's chi-square, which fails for small number counts

               Pearson-chi-sq: (https://en.wikipedia.org/wiki/Pearson's_chi-squared_test)
                   Pearson's chi square is implemented and should give similar results for moderately large observed count rates

               Squared-deviations-nocov: (https://en.wikipedia.org/wiki/Reduced_chi-squared_statistic)
                   Sum of squared residuals, each divided by the variance of the counts plus the
                   variance of the readnoise and of the quantization, ignoring covariances between intervals

               Squared-deviations:
                   Same variances as above, but the full covariance matrix between intervals is
                   built and inverted, and the statistic is the quadratic form residual^T C^-1 residual

               poisson-likelihood:
                   The statistic is the summed Poisson log-probability of the (rounded) increments of
                   ``self.z_new``; the p-value is the fraction of 10000 simulated log-likelihoods that fall below it

               full-likelihood:
                   As 'poisson-likelihood', with Gaussian log-density terms added for every group
                   adjacent to at least one good interval

        An unsupported mode prints a message and raises AssertionError.
        '''

        if np.sum(self.good_intervals) < 1:
            # Nothing to test against: report the worst possible fit.
            self.gof_stat = -np.inf
            self.gof_pval = 0.
            return

        f_obs = (self.RM.noisy_counts[1:]-self.RM.noisy_counts[:-1])[self.good_intervals]
        f_exp = (self.mean_electron_rate * self.dt_groups[1:]/self.RM.gain)[self.good_intervals]

        # Number of simulated data sets used by the two likelihood-based modes.
        ncompare = 10000

        if mode == 'G-test':
            # ddof=1 makes power_divergence use (number of intervals - 1 - 1) degrees of freedom.
            g,p = power_divergence(f_obs, f_exp=f_exp, ddof=1,  lambda_='log-likelihood')

        elif mode == 'Pearson-chi-sq':
            g,p = power_divergence(f_obs, f_exp=f_exp, ddof=1,  lambda_='pearson')

        elif mode == 'Squared-deviations-nocov':
            var_signal_per_diff = (self.RM.RTS.group_times[1:]/self.RM.RTS.nframes
                                   + self.RM.RTS.group_times[:-1]*(1./self.RM.RTS.nframes -2)
                                   + 2./np.square(self.RM.RTS.nframes)*self.triangle_sums[1:]
                                   ) * self.mean_electron_rate

            variance = var_signal_per_diff[self.good_intervals]+self.var_RON_per_diff+self.var_quant_per_diff
            # Variances are divided by gain^2 so that they match f_obs/f_exp, which carry a 1/gain factor.
            variance = variance / np.square(self.RM.gain)
            dof   = np.sum(self.good_intervals) - 1
            g = np.sum(np.square(f_obs-f_exp)/variance)
            p = chi2.sf(g,dof)

        elif mode == 'Squared-deviations':
            var_signal_per_diff = (self.RM.RTS.group_times[1:]/self.RM.RTS.nframes
                                   + self.RM.RTS.group_times[:-1]*(1./self.RM.RTS.nframes -2)
                                   + 2./np.square(self.RM.RTS.nframes)*self.triangle_sums[1:]
                                   ) * self.mean_electron_rate

            variance = var_signal_per_diff+self.var_RON_per_diff+self.var_quant_per_diff
            covmat = np.diag(variance)
            # Fill the symmetric off-diagonal terms; each one depends only on the lower index k.
            for k in range(self.stddev.size-1):
                for m in range(k+1,self.stddev.size):
                    covmat[k,m] = covmat[m,k] =  (self.mean_electron_rate * self.RM.RTS.group_times[1+k] * (1.-1./self.RM.RTS.nframes)
                                                  -2*self.mean_electron_rate/np.square(self.RM.RTS.nframes)*self.RM.RTS.lower_triangle_sum[1+k]
                                                  -np.square(self.RM.RON_e)/self.RM.RTS.nframes
                                                  -1./12.*np.square(self.RM.gain*self.RM.RTS.nframes)
                                                  )

            covmat    = covmat/np.square(self.RM.gain)
            # Keep only rows/columns of the good intervals before inverting.
            covmat    = covmat[np.ix_(self.good_intervals,self.good_intervals)]
            invcovmat = np.linalg.inv(covmat)
            # NOTE(review): an effective-dof estimate based on the trace of (I-H)^T(I-H)
            # was previously sketched here and left disabled; the simple count is used.
            dof       = np.sum(self.good_intervals) - 1
            resid     = f_obs-f_exp
            g = np.matmul(resid,np.matmul(invcovmat,resid))
            p = chi2.sf(g,dof)

        elif mode == 'poisson-likelihood':
            ngroups = self.RM.RTS.ngroups
            poisson_distr_groups = [poisson(mu=self.mean_electron_rate*self.dt_groups[i])
                                    for i in range(ngroups)]

            poisson_lpmf = np.empty(ngroups,dtype=np.float64)
            for i in range(ngroups):
                if i == 0:
                    poisson_lpmf[i] = 0.
                elif self.z_new[i] < self.z_new[i-1]:
                    # A negative increment is impossible for a Poisson process, so its
                    # log-probability is -inf (same convention as the 'full-likelihood' mode).
                    poisson_lpmf[i] = -np.inf
                else:
                    poisson_lpmf[i] = poisson_distr_groups[i].logpmf(np.round(self.z_new[i]-self.z_new[i-1]))

            g = np.sum(poisson_lpmf[1:][self.good_intervals])
            lpmfs = np.empty([ncompare,np.sum(self.good_intervals)])

            # Interval k links groups k and k+1, hence the k+1 distribution.
            for col, k in enumerate(np.nonzero(self.good_intervals)[0]):
                rv = poisson_distr_groups[k+1].rvs(size=ncompare)
                lpmfs[:,col] = poisson_distr_groups[k+1].logpmf(rv)

            loglik_compare = np.sum(lpmfs,axis=1)
            # p-value: fraction of simulated data sets less likely than the observed one.
            BM = g > loglik_compare
            p = np.sum(BM).astype(np.float64)/ncompare

        elif mode == 'full-likelihood':
            ngroups = self.RM.RTS.ngroups

            poisson_lpmf  = np.empty(ngroups,dtype=np.float64)
            gaussian_lpdf = np.empty(ngroups,dtype=np.float64)

            poisson_distr_groups = []
            gauss_distr_groups = []

            # NOTE(review): for i == 0 the index i-1 wraps around to the last group,
            # both here and in the logpdf evaluation below - confirm this is intended.
            for i in range(ngroups):
                poisson_distr_groups.append(poisson(mu=self.mean_electron_rate*self.dt_groups[i]))
                gauss_distr_groups.append(norm(loc=self.RM.gain*(self.RM.noisy_counts[i]-self.RM.noisy_counts[i-1]),
                                               scale=np.sqrt(2./self.RM.RTS.nframes)*self.RM.RON_e))

            for i in range(ngroups):
                if i == 0:
                    poisson_lpmf[i] = 0.
                elif self.z_new[i] < self.z_new[i-1]:
                    poisson_lpmf[i] = -np.inf
                else:
                    poisson_lpmf[i] = poisson_distr_groups[i].logpmf(np.round(self.z_new[i]-self.z_new[i-1]))

                gaussian_lpdf[i] = gauss_distr_groups[i].logpdf(self.z_new[i]-self.z_new[i-1])

            # A group is kept when at least one of its neighbouring intervals is good.
            keep_grps = np.empty(ngroups,dtype=np.bool_)
            for i in range(ngroups):
                if i == 0:
                    intdw = i
                    intup = i
                elif i == (ngroups-1):
                    intdw = i-1
                    intup = i-1
                else:
                    intdw = i-1
                    intup = i

                if ((self.good_intervals[intdw] == True) | (self.good_intervals[intup] == True)):
                    keep_grps[i] = True
                else:
                    keep_grps[i] = False

            g = np.sum(poisson_lpmf[1:][self.good_intervals])+np.sum(gaussian_lpdf[keep_grps])
            pllkls = np.empty([ncompare,np.sum(self.good_intervals)])
            gllkls = np.empty([ncompare,np.sum(keep_grps)])

            # Simulated Gaussian terms: zero-mean noise with the same scale for every kept group.
            nh = norm(loc=0.,scale=np.sqrt(2./self.RM.RTS.nframes)*self.RM.RON_e)
            for col in range(gllkls.shape[1]):
                noise = nh.rvs(size=ncompare)
                gllkls[:,col] = nh.logpdf(noise)

            for col, k in enumerate(np.nonzero(self.good_intervals)[0]):
                rv = poisson_distr_groups[k+1].rvs(size=ncompare)
                pllkls[:,col] = poisson_distr_groups[k+1].logpmf(rv)

            loglik_compare = np.sum(pllkls,axis=1) + np.sum(gllkls,axis=1)
            BM = g > loglik_compare
            p = np.sum(BM).astype(np.float64)/ncompare

        else:
            print('Goodness of fit test type not supported')
            # Explicit raise: a bare `assert False` disappears under `python -O`.
            raise AssertionError('Goodness of fit test type not supported')

        self.gof_stat = g
        self.gof_pval = p
Ejemplo n.º 36
0
    tagss.append(xtag.replace('\n',""))

# Time how long it takes to build and fill the tag/category matrix.
start=time.time()

test=atsMatrix(tagss,cats)
#print test.matrixAA.shape

#cat=Counter(categories)

# For every category, count how often each tag occurs in that category's file
# and store the count in the matrix cell addressed by (tag index, category index).
for cat in categories:
    col=test.getIndxByWordB(cat)
    # The context manager closes the category file as soon as it has been read;
    # previously the handle was left open for every category.
    with open(args.corpusloc+"categories/"+cat, 'r') as filcat:
        ctemp=[ct.replace('\n',"") for ct in filcat]
    cctemp=Counter(ctemp)
    for atag in cctemp:
        linez=test.getIndxByWordA(atag)
        test.setValueByIndx(cctemp[atag],linez,col)
        #print atag+"--"+str(cctemp[atag])
#print test.matrixAA.nonzero()
#print test.getRelation("update","RFXcom")
#f=open(args.corpusloc+args.datanam+"_contigency.mtx", 'wb')
#cPickle.dump(test, f)
end=time.time()
# Single-argument parenthesised print behaves the same on Python 2 and Python 3.
print(end-start)
print(test.matrixAA.shape)
#print stats.chi2_contingency(test.matrixAA,lambda_="log-likelihood")
# G-test (log-likelihood ratio) computed over the filled matrix.
print(stats.power_divergence(test.matrixAA,lambda_="log-likelihood"))
#print "Lambda: "+str(lamdaGK(test.matrixAA))
#print "Lambda: "+str(lamdaGKr(test.matrixAA))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi")
    parser.add_argument("--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;")
    parser.add_argument("--test_id", help="statistical test method")
    parser.add_argument(
        "--mwu_use_continuity",
        action="store_true",
        default=False,
        help="Whether a continuity correction (1/2.) should be taken into account.",
    )
    parser.add_argument(
        "--equal_var",
        action="store_true",
        default=False,
        help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
    )
    parser.add_argument(
        "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values."
    )
    parser.add_argument("--fisher", action="store_true", default=False, help="if true then Fisher definition is used")
    parser.add_argument(
        "--bias",
        action="store_true",
        default=False,
        help="if false,then the calculations are corrected for statistical bias",
    )
    parser.add_argument("--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored")
    parser.add_argument(
        "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored"
    )
    parser.add_argument("--inclusive", action="store_true", default=False, help="if false,limit will be ignored")
    parser.add_argument(
        "--printextras",
        action="store_true",
        default=False,
        help="If True, if there are extra points a warning is raised saying how many of those points there are",
    )
    parser.add_argument(
        "--initial_lexsort",
        action="store_true",
        default="False",
        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
    )
    parser.add_argument("--correction", action="store_true", default=False, help="continuity correction ")
    parser.add_argument(
        "--axis",
        type=int,
        default=0,
        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)",
    )
    parser.add_argument(
        "--n",
        type=int,
        default=0,
        help="the number of trials. This is ignored if x gives both the number of successes and failures",
    )
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram")
    parser.add_argument("--N", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="limits")
    parser.add_argument("--mf", type=float, default=2.0, help="lower limit")
    parser.add_argument("--nf", type=float, default=99.9, help="higher_limit")
    parser.add_argument(
        "--p",
        type=float,
        default=0.5,
        help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5",
    )
    parser.add_argument("--alpha", type=float, default=0.9, help="probability")
    parser.add_argument("--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds")
    parser.add_argument(
        "--proportiontocut",
        type=float,
        default=0.0,
        help="Proportion (in range 0-1) of total data set to trim of each end.",
    )
    parser.add_argument(
        "--lambda_",
        type=float,
        default=1.0,
        help="lambda_ gives the power in the Cressie-Read power divergence statistic",
    )
    parser.add_argument(
        "--imbda",
        type=float,
        default=0,
        help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
    )
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, "w+")
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols != None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(";"):
            barlett_samples.append(map(int, sample.split(",")))
    if args.sample_one_cols != None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(",")
    if args.sample_two_cols != None:
        sample_two_cols = args.sample_two_cols.split(",")
        sample2 = 1
    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split("\t")
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])
        if test_id.strip() == "describe":
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == "mode":
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == "nanmean":
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "kurtosistest":
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "itemfreq":
            freq = stats.itemfreq(map(float, sample_one))
            for list in freq:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "nanmedian":
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == "variation":
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == "boxcox_llf":
            IIf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(IIf)
        elif test_id.strip() == "tiecorrect":
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == "rankdata":
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == "nanstd":
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == "anderson":
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for list in critical:
                cols.append(list)
            cols.append(",")
            for list in sig:
                cols.append(list)
        elif test_id.strip() == "binom_test":
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == "gmean":
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == "hmean":
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == "kurtosis":
            k = stats.kurtosis(map(float, sample_one), axis=args.axis, fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == "moment":
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == "normaltest":
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == "skew":
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == "skewtest":
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == "sem":
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == "zscore":
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "signaltonoise":
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == "percentileofscore":
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == "bayes_mvs":
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == "sigmaclip":
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == "kstest":
            d, p_value = stats.kstest(
                map(float, sample_one), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode
            )
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "chi2_contingency":
            chi2, p, dof, ex = stats.chi2_contingency(
                map(float, sample_one), correction=args.correction, lambda_=args.lambda_
            )
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)
        elif test_id.strip() == "tmean":
            if nf is 0 and mf is 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == "tmin":
            if mf is 0:
                min = stats.tmin(map(float, sample_one))
            else:
                min = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min)
        elif test_id.strip() == "tmax":
            if nf is 0:
                max = stats.tmax(map(float, sample_one))
            else:
                max = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max)
        elif test_id.strip() == "tvar":
            if nf is 0 and mf is 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == "tstd":
            if nf is 0 and mf is 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == "tsem":
            if nf is 0 and mf is 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == "scoreatpercentile":
            if nf is 0 and mf is 0:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), interpolation_method=args.interpolation
                )
            else:
                s = stats.scoreatpercentile(
                    map(float, sample_one), map(float, sample_two), (mf, nf), interpolation_method=args.interpolation
                )
            for list in s:
                cols.append(list)
        elif test_id.strip() == "relfreq":
            if nf is 0 and mf is 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for list in rel:
                cols.append(list)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "binned_statistic":
            if nf is 0 and mf is 0:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one), map(float, sample_two), statistic=args.statistic, bins=args.b
                )
            else:
                st, b_edge, b_n = stats.binned_statistic(
                    map(float, sample_one),
                    map(float, sample_two),
                    statistic=args.statistic,
                    bins=args.b,
                    range=(mf, nf),
                )
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == "threshold":
            if nf is 0 and mf is 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trimboth":
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for list in o:
                cols.append(list)
        elif test_id.strip() == "trim1":
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for list in t1:
                cols.append(list)
        elif test_id.strip() == "histogram":
            if nf is 0 and mf is 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "cumfreq":
            if nf is 0 and mf is 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == "boxcox_normmax":
            if nf is 0 and mf is 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == "boxcox":
            if imbda is 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)
        elif test_id.strip() == "histogram2":
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for list in h2:
                cols.append(list)
        elif test_id.strip() == "ranksums":
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == "ttest_1samp":
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for list in t:
                cols.append(list)
            for list in prob:
                cols.append(list)
        elif test_id.strip() == "ansari":
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == "linregress":
            slope, intercept, r_value, p_value, stderr = stats.linregress(
                map(float, sample_one), map(float, sample_two)
            )
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == "pearsonr":
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == "pointbiserialr":
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == "ks_2samp":
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == "mannwhitneyu":
            mw_stats_u, p_value = stats.mannwhitneyu(
                map(float, sample_one), map(float, sample_two), use_continuity=args.mwu_use_continuity
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "zmap":
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for list in z:
                cols.append(list)
        elif test_id.strip() == "ttest_ind":
            mw_stats_u, p_value = stats.ttest_ind(
                map(float, sample_one), map(float, sample_two), equal_var=args.equal_var
            )
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == "ttest_rel":
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == "mood":
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == "shapiro":
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for list in a:
                cols.append(list)
        elif test_id.strip() == "kendalltau":
            k, p_value = stats.kendalltau(
                map(float, sample_one), map(float, sample_two), initial_lexsort=args.initial_lexsort
            )
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == "entropy":
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)
        elif test_id.strip() == "spearmanr":
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "wilcoxon":
            if sample2 == 1:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one),
                    map(float, sample_two),
                    zero_method=args.zero_method,
                    correction=args.correction,
                )
            else:
                T, p_value = stats.wilcoxon(
                    map(float, sample_one), zero_method=args.zero_method, correction=args.correction
                )
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "chisquare":
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == "power_divergence":
            if sample2 == 1:
                stat, p_value = stats.power_divergence(
                    map(float, sample_one), map(float, sample_two), ddof=args.ddof, lambda_=args.lambda_
                )
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one), ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "theilslopes":
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two), alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == "combine_pvalues":
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(
                    map(float, sample_one), method=args.med, weights=map(float, sample_two)
                )
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == "obrientransform":
            ob = stats.obrientransform(*b_samples)
            for list in ob:
                elements = ",".join(map(str, list))
                cols.append(elements)
        elif test_id.strip() == "f_oneway":
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == "kruskal":
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == "friedmanchisquare":
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == "fligner":
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == "bartlett":
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == "levene":
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut, *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == "median_test":
            stat, p_value, m, table = stats.median_test(
                ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples
            )
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            cols.append(table)
            for list in table:
                elements = ",".join(map(str, list))
                cols.append(elements)
        outfile.write("%s\n" % "\t".join(map(str, cols)))
    outfile.close()
Ejemplo n.º 38
0
def main():
    """Apply one ``scipy.stats`` routine to every row of a tabular file.

    Command-line arguments are read with ``argparse``.  Each line of
    ``--infile`` is split on tabs; the cells selected by
    ``--sample_one_cols`` / ``--sample_two_cols`` (1-based, comma separated)
    are converted to floats and handed to the routine named by ``--test_id``.
    For the multi-sample tests the column groups in ``--sample_cols`` are
    resolved per line by ``columns_to_values``.  The routine's results are
    appended to the row's cells and the row is written, tab separated, to
    ``--outfile``.  A ``--test_id`` that matches no branch leaves the row
    unchanged.

    Results are appended in the same order and shape as before; only the
    plumbing around the scipy calls was corrected:

    * samples are passed as real lists (a ``map`` object is a one-shot
      iterator on Python 3, not a sequence);
    * ``--initial_lexsort`` defaults to the boolean ``False`` instead of the
      always-truthy string ``"False"``;
    * both files are managed by ``with`` so they are closed on errors too;
    * the builtins ``list``, ``min`` and ``max`` are no longer shadowed, and
      the unreachable duplicate ``nanmedian`` / ``variation`` branches are
      gone.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True, help="Tabular file.")
    parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.")
    parser.add_argument(
        "--sample_one_cols",
        help="Comma-separated 1-based column numbers of the first sample",
    )
    parser.add_argument(
        "--sample_two_cols",
        help="Comma-separated 1-based column numbers of the second sample",
    )
    parser.add_argument(
        "--sample_cols",
        help="Comma-separated column numbers per sample, separate samples using ;",
    )
    parser.add_argument("--test_id", help="statistical test method")

    # Boolean switches: all are off unless the flag is given.
    for flag, help_text in (
        ("--mwu_use_continuity", "Whether a continuity correction (1/2.) should be taken into account."),
        (
            "--equal_var",
            "If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.",
        ),
        ("--reta", "Whether or not to return the internally computed a values."),
        ("--fisher", "if true then Fisher definition is used"),
        ("--bias", "if false,then the calculations are corrected for statistical bias"),
        ("--inclusive1", "if false,lower_limit will be ignored"),
        ("--inclusive2", "if false,higher_limit will be ignored"),
        ("--inclusive", "if false,limit will be ignored"),
        (
            "--printextras",
            "If True, if there are extra points a warning is raised saying how many of those points there are",
        ),
        (
            "--initial_lexsort",
            "Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.",
        ),
        ("--correction", "continuity correction "),
    ):
        parser.add_argument(flag, action="store_true", default=False, help=help_text)

    for flag, help_text in (
        ("--axis", "Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)"),
        ("--n", "the number of trials. This is ignored if x gives both the number of successes and failures"),
        ("--b", "The number of bins to use for the histogram"),
        ("--N", "Score that is compared to the elements in a."),
        ("--ddof", "Degrees of freedom correction"),
        ("--score", "Score that is compared to the elements in a."),
    ):
        parser.add_argument(flag, type=int, default=0, help=help_text)

    for flag, default, help_text in (
        ("--m", 0.0, "limits"),
        ("--mf", 2.0, "lower limit"),
        ("--nf", 99.9, "higher_limit"),
        ("--p", 0.5, "The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5"),
        ("--alpha", 0.9, "probability"),
        ("--new", 0.0, "Value to put in place of values in a outside of bounds"),
        ("--proportiontocut", 0.0, "Proportion (in range 0-1) of total data set to trim of each end."),
        ("--lambda_", 1.0, "lambda_ gives the power in the Cressie-Read power divergence statistic"),
        (
            "--imbda",
            0,
            "If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.",
        ),
        ("--base", 1.6, "The logarithmic base to use, defaults to e"),
    ):
        parser.add_argument(flag, type=float, default=default, help=help_text)

    for flag, help_text in (
        ("--dtype", "dtype"),
        ("--med", "med"),
        ("--cdf", "cdf"),
        ("--zero_method", "zero_method options"),
        ("--dist", "dist options"),
        ("--ties", "ties options"),
        ("--alternative", "alternative options"),
        ("--mode", "mode options"),
        ("--method", "method options"),
        ("--md", "md options"),
        ("--center", "center options"),
        ("--kind", "kind options"),
        ("--tail", "tail options"),
        ("--interpolation", "interpolation options"),
        ("--statistic", "statistic options"),
    ):
        parser.add_argument(flag, help=help_text)

    args = parser.parse_args()
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    limits = (mf, nf)
    inclusive = (args.inclusive1, args.inclusive2)
    # Several tests only pass explicit limits when at least one of them is non-zero.
    no_limits = nf == 0 and mf == 0

    def as_floats(values):
        # Materialise the conversion: scipy needs a sequence, not an iterator.
        return [float(value) for value in values]

    def joined(rows):
        # One comma-joined string per row of a 2-D result.
        return [",".join(map(str, row)) for row in rows]

    barlett_samples = None
    if args.sample_cols is not None:
        # Real lists, so the groups can be re-read for every input line.
        barlett_samples = [list(map(int, group.split(","))) for group in args.sample_cols.split(";")]

    # Column numbers on the command line are 1-based; convert once up front.
    sample_one_idx = []
    if args.sample_one_cols is not None:
        sample_one_idx = [int(index) - 1 for index in args.sample_one_cols.split(",")]
    sample_two_idx = []
    has_second = args.sample_two_cols is not None
    if has_second:
        sample_two_idx = [int(index) - 1 for index in args.sample_two_cols.split(",")]

    with open(args.outfile, "w+") as outfile, open(args.infile) as in_handle:
        for line in in_handle:
            cols = line.strip().split("\t")
            if barlett_samples is not None:
                b_samples = columns_to_values(barlett_samples, line)
            sample_one = [cols[i] for i in sample_one_idx]
            sample_two = [cols[i] for i in sample_two_idx]
            test_name = test_id.strip()

            if test_name == "describe":
                size, min_max, mean, uv, bs, bk = stats.describe(as_floats(sample_one))
                cols.extend((size, min_max, mean, uv, bs, bk))
            elif test_name == "mode":
                vals, counts = stats.mode(as_floats(sample_one))
                cols.extend((vals, counts))
            elif test_name == "nanmean":
                cols.append(stats.nanmean(as_floats(sample_one)))
            elif test_name == "nanmedian":
                cols.append(stats.nanmedian(as_floats(sample_one)))
            elif test_name == "kurtosistest":
                z_value, p_value = stats.kurtosistest(as_floats(sample_one))
                cols.extend((z_value, p_value))
            elif test_name == "variation":
                cols.append(stats.variation(as_floats(sample_one)))
            elif test_name == "itemfreq":
                cols.extend(joined(stats.itemfreq(as_floats(sample_one))))
            elif test_name == "boxcox_llf":
                cols.append(stats.boxcox_llf(imbda, as_floats(sample_one)))
            elif test_name == "tiecorrect":
                cols.append(stats.tiecorrect(as_floats(sample_one)))
            elif test_name == "rankdata":
                cols.append(stats.rankdata(as_floats(sample_one), method=args.md))
            elif test_name == "nanstd":
                cols.append(stats.nanstd(as_floats(sample_one), bias=args.bias))
            elif test_name == "anderson":
                A2, critical, sig = stats.anderson(as_floats(sample_one), dist=args.dist)
                cols.append(A2)
                cols.extend(critical)
                # A literal "," cell separates critical values from significance levels.
                cols.append(",")
                cols.extend(sig)
            elif test_name == "binom_test":
                cols.append(stats.binom_test(as_floats(sample_one), n=args.n, p=args.p))
            elif test_name == "gmean":
                cols.append(stats.gmean(as_floats(sample_one), dtype=args.dtype))
            elif test_name == "hmean":
                cols.append(stats.hmean(as_floats(sample_one), dtype=args.dtype))
            elif test_name == "kurtosis":
                cols.append(
                    stats.kurtosis(
                        as_floats(sample_one),
                        axis=args.axis,
                        fisher=args.fisher,
                        bias=args.bias,
                    )
                )
            elif test_name == "moment":
                cols.append(stats.moment(as_floats(sample_one), n=args.n))
            elif test_name == "normaltest":
                k2, p_value = stats.normaltest(as_floats(sample_one))
                cols.extend((k2, p_value))
            elif test_name == "skew":
                cols.append(stats.skew(as_floats(sample_one), bias=args.bias))
            elif test_name == "skewtest":
                z_value, p_value = stats.skewtest(as_floats(sample_one))
                cols.extend((z_value, p_value))
            elif test_name == "sem":
                cols.append(stats.sem(as_floats(sample_one), ddof=args.ddof))
            elif test_name == "zscore":
                cols.extend(stats.zscore(as_floats(sample_one), ddof=args.ddof))
            elif test_name == "signaltonoise":
                cols.append(stats.signaltonoise(as_floats(sample_one), ddof=args.ddof))
            elif test_name == "percentileofscore":
                cols.append(
                    stats.percentileofscore(as_floats(sample_one), score=args.score, kind=args.kind)
                )
            elif test_name == "bayes_mvs":
                c_mean, c_var, c_std = stats.bayes_mvs(as_floats(sample_one), alpha=args.alpha)
                cols.extend((c_mean, c_var, c_std))
            elif test_name == "sigmaclip":
                c, c_low, c_up = stats.sigmaclip(as_floats(sample_one), low=args.m, high=args.n)
                cols.extend((c, c_low, c_up))
            elif test_name == "kstest":
                d, p_value = stats.kstest(
                    as_floats(sample_one),
                    cdf=args.cdf,
                    N=args.N,
                    alternative=args.alternative,
                    mode=args.mode,
                )
                cols.extend((d, p_value))
            elif test_name == "chi2_contingency":
                chi2, p, dof, ex = stats.chi2_contingency(
                    as_floats(sample_one), correction=args.correction, lambda_=args.lambda_
                )
                cols.extend((chi2, p, dof, ex))
            elif test_name == "tmean":
                if no_limits:
                    mean = stats.tmean(as_floats(sample_one))
                else:
                    mean = stats.tmean(as_floats(sample_one), limits, inclusive)
                cols.append(mean)
            elif test_name == "tmin":
                if mf == 0:
                    minimum = stats.tmin(as_floats(sample_one))
                else:
                    minimum = stats.tmin(as_floats(sample_one), lowerlimit=mf, inclusive=args.inclusive)
                cols.append(minimum)
            elif test_name == "tmax":
                if nf == 0:
                    maximum = stats.tmax(as_floats(sample_one))
                else:
                    maximum = stats.tmax(as_floats(sample_one), upperlimit=nf, inclusive=args.inclusive)
                cols.append(maximum)
            elif test_name == "tvar":
                if no_limits:
                    var = stats.tvar(as_floats(sample_one))
                else:
                    var = stats.tvar(as_floats(sample_one), limits, inclusive)
                cols.append(var)
            elif test_name == "tstd":
                if no_limits:
                    std = stats.tstd(as_floats(sample_one))
                else:
                    std = stats.tstd(as_floats(sample_one), limits, inclusive)
                cols.append(std)
            elif test_name == "tsem":
                if no_limits:
                    s = stats.tsem(as_floats(sample_one))
                else:
                    s = stats.tsem(as_floats(sample_one), limits, inclusive)
                cols.append(s)
            elif test_name == "scoreatpercentile":
                if no_limits:
                    s = stats.scoreatpercentile(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        interpolation_method=args.interpolation,
                    )
                else:
                    s = stats.scoreatpercentile(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        limits,
                        interpolation_method=args.interpolation,
                    )
                cols.extend(s)
            elif test_name == "relfreq":
                if no_limits:
                    rel, low_range, binsize, ex = stats.relfreq(as_floats(sample_one), args.b)
                else:
                    rel, low_range, binsize, ex = stats.relfreq(as_floats(sample_one), args.b, limits)
                cols.extend(rel)
                cols.extend((low_range, binsize, ex))
            elif test_name == "binned_statistic":
                if no_limits:
                    st, b_edge, b_n = stats.binned_statistic(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                    )
                else:
                    st, b_edge, b_n = stats.binned_statistic(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        statistic=args.statistic,
                        bins=args.b,
                        range=limits,
                    )
                cols.extend((st, b_edge, b_n))
            elif test_name == "threshold":
                if no_limits:
                    o = stats.threshold(as_floats(sample_one), newval=args.new)
                else:
                    o = stats.threshold(as_floats(sample_one), mf, nf, newval=args.new)
                cols.extend(o)
            elif test_name == "trimboth":
                cols.extend(stats.trimboth(as_floats(sample_one), proportiontocut=args.proportiontocut))
            elif test_name == "trim1":
                cols.extend(
                    stats.trim1(
                        as_floats(sample_one),
                        proportiontocut=args.proportiontocut,
                        tail=args.tail,
                    )
                )
            elif test_name == "histogram":
                if no_limits:
                    hi, low_range, binsize, ex = stats.histogram(as_floats(sample_one), args.b)
                else:
                    hi, low_range, binsize, ex = stats.histogram(as_floats(sample_one), args.b, limits)
                cols.extend((hi, low_range, binsize, ex))
            elif test_name == "cumfreq":
                if no_limits:
                    cum, low_range, binsize, ex = stats.cumfreq(as_floats(sample_one), args.b)
                else:
                    cum, low_range, binsize, ex = stats.cumfreq(as_floats(sample_one), args.b, limits)
                cols.extend((cum, low_range, binsize, ex))
            elif test_name == "boxcox_normmax":
                if no_limits:
                    ma = stats.boxcox_normmax(as_floats(sample_one))
                else:
                    ma = stats.boxcox_normmax(as_floats(sample_one), limits, method=args.method)
                cols.append(ma)
            elif test_name == "boxcox":
                if imbda == 0:
                    box, ma, ci = stats.boxcox(as_floats(sample_one), alpha=args.alpha)
                    cols.extend((box, ma, ci))
                else:
                    cols.append(stats.boxcox(as_floats(sample_one), imbda, alpha=args.alpha))
            elif test_name == "histogram2":
                cols.extend(stats.histogram2(as_floats(sample_one), as_floats(sample_two)))
            elif test_name == "ranksums":
                z_statistic, p_value = stats.ranksums(as_floats(sample_one), as_floats(sample_two))
                cols.extend((z_statistic, p_value))
            elif test_name == "ttest_1samp":
                t, prob = stats.ttest_1samp(as_floats(sample_one), as_floats(sample_two))
                cols.extend(t)
                cols.extend(prob)
            elif test_name == "ansari":
                AB, p_value = stats.ansari(as_floats(sample_one), as_floats(sample_two))
                cols.extend((AB, p_value))
            elif test_name == "linregress":
                slope, intercept, r_value, p_value, stderr = stats.linregress(
                    as_floats(sample_one), as_floats(sample_two)
                )
                cols.extend((slope, intercept, r_value, p_value, stderr))
            elif test_name == "pearsonr":
                cor, p_value = stats.pearsonr(as_floats(sample_one), as_floats(sample_two))
                cols.extend((cor, p_value))
            elif test_name == "pointbiserialr":
                r, p_value = stats.pointbiserialr(as_floats(sample_one), as_floats(sample_two))
                cols.extend((r, p_value))
            elif test_name == "ks_2samp":
                d, p_value = stats.ks_2samp(as_floats(sample_one), as_floats(sample_two))
                cols.extend((d, p_value))
            elif test_name == "mannwhitneyu":
                mw_stats_u, p_value = stats.mannwhitneyu(
                    as_floats(sample_one),
                    as_floats(sample_two),
                    use_continuity=args.mwu_use_continuity,
                )
                cols.extend((mw_stats_u, p_value))
            elif test_name == "zmap":
                cols.extend(stats.zmap(as_floats(sample_one), as_floats(sample_two), ddof=args.ddof))
            elif test_name == "ttest_ind":
                mw_stats_u, p_value = stats.ttest_ind(
                    as_floats(sample_one), as_floats(sample_two), equal_var=args.equal_var
                )
                cols.extend((mw_stats_u, p_value))
            elif test_name == "ttest_rel":
                t, prob = stats.ttest_rel(as_floats(sample_one), as_floats(sample_two), axis=args.axis)
                cols.extend((t, prob))
            elif test_name == "mood":
                z, p_value = stats.mood(as_floats(sample_one), as_floats(sample_two), axis=args.axis)
                cols.extend((z, p_value))
            elif test_name == "shapiro":
                W, p_value, a = stats.shapiro(as_floats(sample_one), as_floats(sample_two), args.reta)
                cols.extend((W, p_value))
                cols.extend(a)
            elif test_name == "kendalltau":
                k, p_value = stats.kendalltau(
                    as_floats(sample_one),
                    as_floats(sample_two),
                    initial_lexsort=args.initial_lexsort,
                )
                cols.extend((k, p_value))
            elif test_name == "entropy":
                cols.append(stats.entropy(as_floats(sample_one), as_floats(sample_two), base=args.base))
            elif test_name == "spearmanr":
                if has_second:
                    rho, p_value = stats.spearmanr(as_floats(sample_one), as_floats(sample_two))
                else:
                    rho, p_value = stats.spearmanr(as_floats(sample_one))
                cols.extend((rho, p_value))
            elif test_name == "wilcoxon":
                if has_second:
                    T, p_value = stats.wilcoxon(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                else:
                    T, p_value = stats.wilcoxon(
                        as_floats(sample_one),
                        zero_method=args.zero_method,
                        correction=args.correction,
                    )
                cols.extend((T, p_value))
            elif test_name == "chisquare":
                if has_second:
                    rho, p_value = stats.chisquare(
                        as_floats(sample_one), as_floats(sample_two), ddof=args.ddof
                    )
                else:
                    rho, p_value = stats.chisquare(as_floats(sample_one), ddof=args.ddof)
                cols.extend((rho, p_value))
            elif test_name == "power_divergence":
                if has_second:
                    stat, p_value = stats.power_divergence(
                        as_floats(sample_one),
                        as_floats(sample_two),
                        ddof=args.ddof,
                        lambda_=args.lambda_,
                    )
                else:
                    stat, p_value = stats.power_divergence(
                        as_floats(sample_one), ddof=args.ddof, lambda_=args.lambda_
                    )
                cols.extend((stat, p_value))
            elif test_name == "theilslopes":
                if has_second:
                    mpe, met, lo, up = stats.theilslopes(
                        as_floats(sample_one), as_floats(sample_two), alpha=args.alpha
                    )
                else:
                    mpe, met, lo, up = stats.theilslopes(as_floats(sample_one), alpha=args.alpha)
                cols.extend((mpe, met, lo, up))
            elif test_name == "combine_pvalues":
                if has_second:
                    stat, p_value = stats.combine_pvalues(
                        as_floats(sample_one),
                        method=args.med,
                        weights=as_floats(sample_two),
                    )
                else:
                    stat, p_value = stats.combine_pvalues(as_floats(sample_one), method=args.med)
                cols.extend((stat, p_value))
            elif test_name == "obrientransform":
                cols.extend(joined(stats.obrientransform(*b_samples)))
            elif test_name == "f_oneway":
                f_value, p_value = stats.f_oneway(*b_samples)
                cols.extend((f_value, p_value))
            elif test_name == "kruskal":
                h, p_value = stats.kruskal(*b_samples)
                cols.extend((h, p_value))
            elif test_name == "friedmanchisquare":
                fr, p_value = stats.friedmanchisquare(*b_samples)
                cols.extend((fr, p_value))
            elif test_name == "fligner":
                xsq, p_value = stats.fligner(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend((xsq, p_value))
            elif test_name == "bartlett":
                T, p_value = stats.bartlett(*b_samples)
                cols.extend((T, p_value))
            elif test_name == "levene":
                w, p_value = stats.levene(
                    *b_samples, center=args.center, proportiontocut=args.proportiontocut
                )
                cols.extend((w, p_value))
            elif test_name == "median_test":
                stat, p_value, m, table = stats.median_test(
                    *b_samples,
                    ties=args.ties,
                    correction=args.correction,
                    lambda_=args.lambda_
                )
                # The table is emitted both whole and row by row, as before.
                cols.extend((stat, p_value, m, table))
                cols.extend(joined(table))

            outfile.write("%s\n" % "\t".join(map(str, cols)))
Ejemplo n.º 39
0
        print(fourth)
        print(fifth)
        dif_breit = [np.absolute(best_breit[0] - initials_breit[0]), np.absolute(best_breit[1] - initials_breit[1]), np.absolute(best_breit[2] - initials_breit[2]), np.absolute(best_breit[3] - initials_breit[3]), np.absolute(best_breit[4] - initials_breit[4])]
        i += 1
        print("Interação número: ", i)

    # Report the final value of the iteration counter `i`.
    print("Número de iterações: ", i)
    # Choose a user-facing verdict based only on the iteration count.
    if (i == 1):
        # Exactly one iteration: no convergence claim is made; the user is
        # asked to judge the fit and retry with other initial values if needed.
        print("O fit ficou bom? Legal! \nNão ficou? Tente outros valores iniciais! ")
    elif (i >= 1 and i <= 13):
        # i == 1 was already handled above, so this branch is effectively
        # reached for 2 <= i <= 13 and reports convergence.
        print("O fit convergiu!")
    else:
        # Any other count (above 13, or below 1) prints the divergence warning.
        print("O fit provavelmente está divergindo... Tente outros valores iniciais!")

    # Compare the observed values `y` with the model evaluated at the five
    # fitted parameters in `best_breit` and print the result.
    # (presumably scipy.stats.chisquare -- the import is not visible here)
    print(chisquare(y, breitwigner(x, best_breit[0], best_breit[1], best_breit[2], best_breit[3], best_breit[4])))
    # NOTE(review): this second statistic evaluates the model at the literal
    # parameters (4, 91, -2, 150, 13000), not at `best_breit` -- confirm that
    # using fixed reference values here is intentional.
    print(power_divergence(y, breitwigner(x, 4, 91, -2, 150, 13000), lambda_="neyman"))
    # Draw the fitted curve as a red line; the legend label shows the first two
    # fitted parameters (labelled gamma and M).
    plt.plot(x, breitwigner(x, *best_breit), 'r-', label='gamma = {}, M = {}'.format(best_breit[0], best_breit[1]))
    plt.xlabel('Massa Invariante [GeV]')
    plt.ylabel('Número de Eventos')
    plt.title('Bóson Z: Ajuste com Breit-Wigne')
    plt.legend()    
    plt.show()


elif escolha == 2:
    # Explain to the user what each of the five Breit-Wigner parameters means
    # before asking for them.
    print("gamma (a largura total do meio no máximo da distribuição),\nM(valor onde ocorre o máximo da distribuição),\na (inclinação que é usada para pereber o efeito de backgrund),\nb (intercepção em y, que é usada para perceber o efeito de background),\nA (amplitude da distribuição de Breit-Wigner)")
    # Read whitespace-separated numbers from the user and convert each to float.
    # NOTE(review): the comprehension variable is named `x`, the same name as
    # the data array passed to curve_fit below; on Python 3 the comprehension
    # variable is local to the comprehension -- confirm the target version.
    initials_breit =  [float(x) for x in input('Preencha com os parâmetros da Briet-Wigner (gamma, M, a, b, A) dando apenas espaços entre eles: ').split()] # For the Upsilon: [0.2 9.5 50 -370 180]
    # Run the optimization and compute the uncertainties of the optimized
    # parameters. (The original note also mentions importing the optimization
    # module; no import appears in these lines.)
    # NOTE(review): sigma is sqrt(y), so any entry with y == 0 yields a zero
    # uncertainty -- confirm the data contains no empty bins.
    best_breit, covariance = curve_fit(breitwigner, x, y, p0=initials_breit, sigma=np.sqrt(y))
    # Square roots of the covariance diagonal, i.e. one value per fitted parameter.
    error_breit = np.sqrt(np.diag(covariance))