def chi_square_test(input1, input2):
    truth1 = [0.5 for e in input1]
    truth2 = [0.5 for e in input2]
    print('---')
    print(stats.chisquare(input1, truth1))
    print(stats.chisquare(input2, truth2))
    print('---')
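A hedged usage sketch for chi_square_test: scipy.stats.chisquare expects the observed and expected frequencies to sum to (roughly) the same total, so the made-up inputs below each sum to 0.5 times their length to match the all-0.5 expected lists built inside the function.

from scipy import stats  # import the snippet above appears to assume

# Invented example inputs; each list sums to 0.5 * len(list), matching the
# uniform 0.5 expectation constructed inside chi_square_test.
chi_square_test([0.6, 0.4, 0.55, 0.45], [0.52, 0.48])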
def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a date series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    stats = {"min": series.min(), "max": series.max(), "histogram_data": series}

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    stats["range"] = stats["max"] - stats["min"]

    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram = np.histogram(
            series[series.notna()].astype("int64").values, bins="auto"
        )[0]
        stats["chi_squared"] = chisquare(histogram)

    return stats
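Most of the describe_* snippets in this collection share one pattern: bin the values with np.histogram(..., bins="auto") and pass the counts to scipy.stats.chisquare, which by default tests them against a uniform expectation. A minimal self-contained sketch of that pattern on made-up data:

import numpy as np
from scipy.stats import chisquare

values = np.random.default_rng(0).normal(size=500)  # toy data
counts, _ = np.histogram(values, bins="auto")        # observed bin counts
result = chisquare(counts)                           # expected = uniform by default
print(result.statistic, result.pvalue)               # small p-value: far from uniform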
Example #3
    def describe_categorical_1d(series: pd.Series,
                                series_description: dict) -> dict:
        """Describe a categorical series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.

        Returns:
            A dict containing calculated series description values.
        """
        # Make sure we deal with strings (Issue #100)
        series = series.astype(str)

        # Only run if at least 1 non-missing value
        value_counts = series_description["value_counts_without_nan"]

        stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

        redact = config["vars"]["cat"]["redact"].get(bool)
        if not redact:
            stats.update({"first_rows": series.head(5)})

        stats.update(
            histogram_compute(value_counts,
                              len(value_counts),
                              name="histogram_frequencies"))

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            stats["chi_squared"] = list(chisquare(value_counts.values))

        check_length = config["vars"]["cat"]["length"].get(bool)
        if check_length:
            stats.update(length_summary(series))
            stats.update(
                histogram_compute(stats["length"],
                                  stats["length"].nunique(),
                                  name="histogram_length"))

        check_unicode = config["vars"]["cat"]["characters"].get(bool)
        if check_unicode:
            stats.update(unicode_summary(series))
            stats["n_characters_distinct"] = stats["n_characters"]
            stats["n_characters"] = stats["character_counts"].values.sum()

            stats["category_alias_counts"].index = stats[
                "category_alias_counts"].index.str.replace("_", " ")

        words = config["vars"]["cat"]["words"].get(bool)
        if words:
            stats.update(word_summary(series))

        coerce_str_to_date = config["vars"]["cat"]["coerce_str_to_date"].get(
            bool)
        if coerce_str_to_date:
            stats["date_warning"] = warning_type_date(series)

        return stats
Example #4
def chi_square(dictionary, matrix, neg_num=0, pos_num=0):
    """
    计算各个特征的卡方值,a,b,c,d分别为观测值,A,B,C,D为预测值,这里因为
    采用的训练语料是非平衡的,比例为3:7,因此A=(a+b)*.7,B=(a+b)*.3,以此类推
            正    负
    包含x1   a     b
    不包含x1 c     d
    通常用计算出的卡方值筛选特征
    :param dictionary: 字典
    :param matrix: 文本的频率矩阵
    :param neg_num: 负文本的数量
    :param pos_num: 正文本的数量
    :return:一个一维数组,包含每个特征X的卡方值,p值越小说明越有区分力,应当选取
    """
    chi_squares = []
    As = []
    Ts = []
    for i in range(0, len(dictionary)):
        a = 0
        b = 0
        for j in range(0, len(matrix)):
            if matrix[j][i] > 0 and j < neg_num:
                b += 1
            if matrix[j][i] > 0 and j >= neg_num:
                a += 1
        c = pos_num - a + 0.01
        d = neg_num - b + 0.01
        A = [a, b, c, d]
        T = [(a + b) * 0.7, (a + b) * 0.3, (c + d) * 0.7, (c + d) * 0.3]
        As.append(A)
        Ts.append(T)
        chi_squares.append(stats.chisquare(A, f_exp=T)[1])
    print(As)
    print(Ts)
    return chi_squares
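A toy call to illustrate the expected shapes for the chi_square feature selector above; the dictionary and matrix are invented, with the two negative documents listed first (rows 0..neg_num-1) followed by three positive ones, and the `from scipy import stats` import the snippet relies on is assumed to be in place.

dictionary = ["good", "bad", "price"]   # hypothetical feature dictionary
matrix = [                              # rows are documents, columns are features
    [1, 0, 2],                          # negative documents first
    [0, 3, 0],
    [2, 0, 1],                          # positive documents after them
    [1, 1, 0],
    [3, 0, 0],
]
p_values = chi_square(dictionary, matrix, neg_num=2, pos_num=3)
# Features with the smallest p-values discriminate best between the two classes.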
Example #5
    def describe_date_1d(series: pd.Series, series_description: dict) -> dict:
        """Describe a date series.

        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.

        Returns:
            A dict containing calculated series description values.
        """
        stats = {
            "min": pd.Timestamp.to_pydatetime(series.min()),
            "max": pd.Timestamp.to_pydatetime(series.max()),
        }

        stats["range"] = stats["max"] - stats["min"]

        values = series[series.notnull()].values.astype(np.int64) // 10**9

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            histogram, _ = np.histogram(values, bins="auto")
            stats["chi_squared"] = chisquare(histogram)

        stats.update(
            histogram_compute(values, series_description["n_distinct"]))
        return stats
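The `// 10**9` step above turns pandas datetime64[ns] values into Unix timestamps in seconds before they are histogrammed; a small illustration with made-up dates:

import numpy as np
import pandas as pd

s = pd.Series(pd.to_datetime(["2020-01-01", "2020-06-15", None]))
seconds = s[s.notnull()].values.astype(np.int64) // 10**9
print(seconds)  # [1577836800 1592179200]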
Example #6
def describe_categorical_1d(series: pd.Series,
                            series_description: dict) -> dict:
    """Describe a categorical series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = series_description["value_counts_without_nan"]

    stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float)
    if chi_squared_threshold > 0.0:
        stats["chi_squared"] = list(chisquare(value_counts.values))

    check_composition = config["vars"]["cat"]["check_composition"].get(bool)
    if check_composition:

        from visions.application.summaries.series.text_summary import text_summary

        stats.update(text_summary(series))
        stats["length"] = series.str.len()

    stats["date_warning"] = warning_type_date(series)

    return stats
Example #7
    def find_not_link_loci(self, N, E, sigValue):
        '''
        Determines and returns a list of pairs of loci that are not in linkage equilibrium.
        '''
        not_link_loci = []
        k = 0
        s = 1
        ijk_count = 0
        rts_count = 0
        while k < (self.m//2 - 1):
            s = k + 1
            namek = self.loci[k * 2]
            ijk_obs = []
            ijk_exp = []
            for ival, jval in N[namek]:
                if E[namek][(ival, jval)] != 0:
                    #print P[k][i][j]
                    ijk_exp.append(E[namek][(ival,jval)])
                    ijk_obs.append(N[namek][(ival,jval)])
                    ijk_count = ijk_count + 1

                    while s < self.m//2:
                        names = self.loci[s * 2]
                        rts_obs = []
                        rts_exp = []

                        for rval, tval in N[names]:
                            if E[names][(rval, tval)] != 0:
                                rts_exp.append(E[names][(rval, tval)])
                                rts_obs.append(N[names][(rval, tval)])
                                rts_count = rts_count + 1
                        LK_ijk = len(self.alinlocus[namek])
                        LK_rts = len(self.alinlocus[names])
                        ddof_ijk = 0.5 * LK_ijk * (LK_ijk - 1)
                        ddof_rts = 0.5 * LK_rts * (LK_rts - 1)
                        ddof = (ddof_ijk - 1) * (ddof_rts - 1)
                        tmp = 0
                        obs = []
                        exp = []
                        # Stop at the shorter of the two observation lists (the bounds must be combined with `and`, not `or`).
                        while tmp < len(ijk_obs) and tmp < len(rts_obs):
                            obs.append(ijk_obs[tmp] * rts_obs[tmp])
                            exp.append(ijk_exp[tmp] * rts_exp[tmp])
                            tmp = tmp + 1
                        if tmp < len(ijk_obs):
                            obs.append(ijk_obs[tmp])
                            exp.append(ijk_exp[tmp])
                            tmp = tmp + 1
                        elif tmp < len(rts_obs):
                            obs.append(rts_obs[tmp])
                            exp.append(rts_exp[tmp])
                            tmp = tmp + 1
                        chisq, p = chisquare(obs, exp, ddof)
                        if p < sigValue:
                            not_link_loci.append((namek, names))
                        s = s + 1
            k = k + 1
        return not_link_loci
Example #8
    def optimiseP(self, loP=-0.005, hiP=0.005, Pstep=0.000005):
        periods = numpy.arange(loP, hiP, float(Pstep))
        pcurve = numpy.empty_like(periods)
        for jj, p in enumerate(periods):
            nTimePhase = numpy.empty_like(self.timePhase)
            delays = self.getPdelays(p)
            for ii in range(self.nints):
                nTimePhase[ii] = rollArray(self.timePhase[ii], delays[ii], 0)
            pcurve[jj] = chisquare(nTimePhase.sum(axis=0))[0]
        return pcurve
Example #9
    def optimiseDM(self, hidm=50, lodm=-50, dmstep=1):
        dms = numpy.arange(lodm, hidm, float(dmstep))
        dmcurve = numpy.empty_like(dms)
        for jj, dm in enumerate(dms):
            nFreqPhase = numpy.empty_like(self.freqPhase)
            delays = self.getDMdelays(dm)
            for ii in range(self.nbands):
                nFreqPhase[ii] = rollArray(self.freqPhase[ii], delays[ii], 0)
            dmcurve[jj] = chisquare(nFreqPhase.sum(axis=0))[0]
        return dmcurve
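Both optimise* methods above use the chi-square of the summed pulse profile as a figure of merit: the better a trial period or DM aligns the sub-integrations, the sharper the summed profile and the larger its deviation from flatness. A self-contained sketch of that idea on synthetic data; np.roll stands in for the rollArray helper, which is not part of this snippet.

import numpy as np
from scipy.stats import chisquare

rng = np.random.default_rng(1)
nints, nbins = 16, 64
pulse = np.exp(-0.5 * ((np.arange(nbins) - 32) / 2.0) ** 2)          # synthetic pulse shape
timePhase = np.array([np.roll(pulse, d) for d in range(nints)])       # pulse drifts by one bin per row
timePhase += rng.normal(0, 0.05, timePhase.shape) + 1.0               # noise plus a positive baseline

def merit(delays):
    # Undo an assumed drift, sum the sub-integrations, and score the result.
    aligned = np.array([np.roll(row, -d) for row, d in zip(timePhase, delays)])
    return chisquare(aligned.sum(axis=0))[0]

print(merit(np.zeros(nints, dtype=int)))  # wrong delays: flatter sum, lower chi-square
print(merit(np.arange(nints)))            # correct delays: sharper sum, higher chi-square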
Example #10
def describe_categorical_1d(series: pd.Series,
                            series_description: dict) -> dict:
    """Describe a categorical series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.
    """
    # Make sure we deal with strings (Issue #100)
    series = series.astype(str)

    # Only run if at least 1 non-missing value
    value_counts = series_description["value_counts_without_nan"]

    stats = {"top": value_counts.index[0], "freq": value_counts.iloc[0]}

    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
        float)
    if chi_squared_threshold > 0.0:
        stats["chi_squared"] = list(chisquare(value_counts.values))

    check_composition = config["vars"]["cat"]["check_composition"].get(bool)
    if check_composition:
        contains = {
            "chars": series.str.contains(r"[a-zA-Z]", case=False,
                                         regex=True).any(),
            "digits": series.str.contains(r"[0-9]", case=False,
                                          regex=True).any(),
            "spaces": series.str.contains(r"\s", case=False, regex=True).any(),
            "non-words": series.str.contains(r"\W", case=False,
                                             regex=True).any(),
        }

        stats["length"] = series.str.len()
        stats["max_length"] = series.str.len().max()
        stats["mean_length"] = series.str.len().mean()
        stats["min_length"] = series.str.len().min()
        stats["composition"] = contains

    stats["date_warning"] = warning_type_date(series)

    return stats
Example #11
    def find_not_hwe_loci(self, N, E, sigValue):
        '''
        Determines and returns a list of loci that are not in HWE.
        '''
        not_hwe_loci = []
        for k in range(0, self.m//2):
            obs = []
            exp = []
            namek = self.loci[k*2]
            for ival, jval in N[namek]:
                if (E[namek][(ival,jval)] != 0):
                    #print P[k][i][j]
                    exp.append(E[namek][(ival,jval)])
                    obs.append(N[namek][(ival,jval)])

            Lk = len(self.alinlocus[namek])
            ddof = 0.5 * Lk *(Lk-1)
            chisq, p = chisquare(obs, exp, ddof)
            if p < sigValue:
                not_hwe_loci.append(namek)
        return not_hwe_loci
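A toy illustration of the Hardy-Weinberg check above for a single biallelic locus; the genotype counts are invented, and the same 0.5 * L * (L - 1) delta-degrees-of-freedom correction is applied for L alleles.

from scipy.stats import chisquare

obs = [30, 50, 20]                                 # observed AA, Aa, aa counts (n = 100)
p = (2 * obs[0] + obs[1]) / 200.0                  # allele frequency of A
q = 1.0 - p
exp = [100 * p * p, 100 * 2 * p * q, 100 * q * q]  # Hardy-Weinberg expectations

L = 2                                              # two alleles at this locus
ddof = 0.5 * L * (L - 1)                           # one extra estimated parameter
chisq, pval = chisquare(obs, exp, ddof)
print(chisq, pval)                                 # a small p-value would flag departure from HWE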
Example #12
    def describe_numeric_1d(series: pd.Series,
                            series_description: dict) -> dict:
        """Describe a numeric series.
        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
        Returns:
            A dict containing calculated series description values.
        Notes:
            When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
            bins. Read the docs:
            https://docs.astropy.org/en/stable/visualization/histogram.html
            https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
            This method might print warnings, which we suppress.
            https://github.com/astropy/astropy/issues/4927
        """
        def mad(arr):
            """Median Absolute Deviation: a "Robust" version of standard deviation.
            Indicates variability of the sample.
            https://en.wikipedia.org/wiki/Median_absolute_deviation
            """
            return np.median(np.abs(arr - np.median(arr)))

        quantiles = config["vars"]["num"]["quantiles"].get(list)

        n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

        if isinstance(series.dtype, _IntegerDtype):
            stats = numeric_stats_pandas(series)
            present_values = series.loc[series.notnull()].astype(
                str(series.dtype).lower())
            stats["n_zeros"] = series_description["count"] - np.count_nonzero(
                present_values)
            stats["histogram_data"] = present_values
            finite_values = present_values
        else:
            values = series.values
            present_values = values[~np.isnan(values)]
            finite_values = values[np.isfinite(values)]
            stats = numeric_stats_numpy(present_values)
            stats["histogram_data"] = finite_values

        stats.update({
            "mad": mad(present_values),
            "scatter_data": series,  # For complex
            "p_infinite": n_infinite / series_description["n"],
            "n_infinite": n_infinite,
        })

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            histogram, _ = np.histogram(finite_values, bins="auto")
            stats["chi_squared"] = chisquare(histogram)

        stats["range"] = stats["max"] - stats["min"]
        stats.update({
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(
                quantiles).to_dict().items()
        })
        stats["iqr"] = stats["75%"] - stats["25%"]
        stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
        stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

        stats["monotonic_increase"] = series.is_monotonic_increasing
        stats["monotonic_decrease"] = series.is_monotonic_decreasing

        stats["monotonic_increase_strict"] = (stats["monotonic_increase"]
                                              and series.is_unique)
        stats["monotonic_decrease_strict"] = (stats["monotonic_decrease"]
                                              and series.is_unique)

        stats.update(
            histogram_compute(finite_values, series_description["n_distinct"]))

        return stats
Example #13
def chi_squared_test(actual, expected):
    chi2_stat, p = stats.chisquare(actual, expected)
    return chi2_stat, p
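Hedged usage for chi_squared_test: pass observed category counts and expected counts that sum to the same total (the numbers below are made up).

from scipy import stats  # import assumed by the snippet above

observed = [18, 22, 20, 40]
expected = [25, 25, 25, 25]
chi2_stat, p = chi_squared_test(observed, expected)
print(chi2_stat, p)  # p < 0.05 would suggest the observed counts deviate from expectation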
Example #14
def chi_square(values=None, histogram=None):
    if histogram is None:
        histogram, _ = np.histogram(values, bins="auto")
    return dict(chisquare(histogram)._asdict())
Example #15
def def_chisquare(f_obs1, f_exp1=None):
    res = chisquare(f_obs=f_obs1, f_exp=f_exp1, axis=None)
    return res
Example #16
def chi_square(
    values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None
) -> dict:
    if histogram is None:
        histogram, _ = np.histogram(values, bins="auto")
    return dict(chisquare(histogram)._asdict())
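The chi_square helper returns the chisquare result as a plain dict via _asdict(); a quick sketch on random data (the distribution is arbitrary):

import numpy as np

result = chi_square(values=np.random.default_rng(42).exponential(size=1000))
print(result)  # {'statistic': ..., 'pvalue': ...}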
Example #17
    def describe_numeric_1d(series: pd.Series,
                            series_description: dict) -> dict:
        """Describe a numeric series.
        Args:
            series: The Series to describe.
            series_description: The dict containing the series description so far.
        Returns:
            A dict containing calculated series description values.
        Notes:
            When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
            bins. Read the docs:
            https://docs.astropy.org/en/stable/visualization/histogram.html
            https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html
            This method might print warnings, which we suppress.
            https://github.com/astropy/astropy/issues/4927
        """
        def mad(arr):
            """ Median Absolute Deviation: a "Robust" version of standard deviation.
                Indices variability of the sample.
                https://en.wikipedia.org/wiki/Median_absolute_deviation
            """
            return np.median(np.abs(arr - np.median(arr)))

        quantiles = config["vars"]["num"]["quantiles"].get(list)

        n_infinite = ((series == np.inf) | (series == -np.inf)).sum()

        values = series.values
        present_values = values[~np.isnan(values)]
        finite_values = values[np.isfinite(values)]

        stats = {
            "mean": np.mean(present_values),
            "std": np.std(present_values, ddof=1),
            "variance": np.var(present_values, ddof=1),
            "min": np.min(present_values),
            "max": np.max(present_values),
            # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
            "kurtosis": series.kurt(),
            # Unbiased skew normalized by N-1
            "skewness": series.skew(),
            "sum": np.sum(present_values),
            "mad": mad(present_values),
            "n_zeros": (series_description["count"] - np.count_nonzero(present_values)),
            "histogram_data": finite_values,
            "scatter_data": series,  # For complex
            "p_infinite": n_infinite / series_description["n"],
            "n_infinite": n_infinite,
        }

        chi_squared_threshold = config["vars"]["num"][
            "chi_squared_threshold"].get(float)
        if chi_squared_threshold > 0.0:
            histogram, _ = np.histogram(finite_values, bins="auto")
            stats["chi_squared"] = chisquare(histogram)

        stats["range"] = stats["max"] - stats["min"]
        stats.update({
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(
                quantiles).to_dict().items()
        })
        stats["iqr"] = stats["75%"] - stats["25%"]
        stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
        stats["p_zeros"] = stats["n_zeros"] / series_description["n"]

        bins = config["plot"]["histogram"]["bins"].get(int)
        # Bins should never be larger than the number of distinct values
        bins = min(series_description["distinct_count_with_nan"], bins)
        stats["histogram_bins"] = bins

        bayesian_blocks_bins = config["plot"]["histogram"][
            "bayesian_blocks_bins"].get(bool)
        if bayesian_blocks_bins:
            from astropy.stats import bayesian_blocks

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                ret = bayesian_blocks(stats["histogram_data"])

                # Sanity check
                if not np.isnan(ret).any() and ret.size > 1:
                    stats["histogram_bins_bayesian_blocks"] = ret

        return stats
Example #18
def describe_numeric_1d(series: pd.Series, series_description: dict) -> dict:
    """Describe a numeric series.

    Args:
        series: The Series to describe.
        series_description: The dict containing the series description so far.

    Returns:
        A dict containing calculated series description values.

    Notes:
        When 'bins_type' is set to 'bayesian_blocks', astropy.stats.bayesian_blocks is used to determine the number of
        bins. Read the docs:
        https://docs.astropy.org/en/stable/visualization/histogram.html
        https://docs.astropy.org/en/stable/api/astropy.stats.bayesian_blocks.html

        This method might print warnings, which we suppress.
        https://github.com/astropy/astropy/issues/4927
    """
    quantiles = config["vars"]["num"]["quantiles"].get(list)

    stats = {
        "mean": series.mean(),
        "std": series.std(),
        "variance": series.var(),
        "min": series.min(),
        "max": series.max(),
        "kurtosis": series.kurt(),
        "skewness": series.skew(),
        "sum": series.sum(),
        "mad": series.mad(),
        "n_zeros": (len(series) - np.count_nonzero(series)),
        "histogram_data": series,
        "scatter_data": series,  # For complex
    }

    chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(float)
    if chi_squared_threshold > 0.0:
        histogram = np.histogram(series[series.notna()].values, bins="auto")[0]
        stats["chi_squared"] = chisquare(histogram)

    stats["range"] = stats["max"] - stats["min"]
    stats.update(
        {
            f"{percentile:.0%}": value
            for percentile, value in series.quantile(quantiles).to_dict().items()
        }
    )
    stats["iqr"] = stats["75%"] - stats["25%"]
    stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
    stats["p_zeros"] = float(stats["n_zeros"]) / len(series)

    bins = config["plot"]["histogram"]["bins"].get(int)
    # Bins should never be larger than the number of distinct values
    bins = min(series_description["distinct_count_with_nan"], bins)
    stats["histogram_bins"] = bins

    bayesian_blocks_bins = config["plot"]["histogram"]["bayesian_blocks_bins"].get(bool)
    if bayesian_blocks_bins:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ret = bayesian_blocks(stats["histogram_data"])

            # Sanity check
            if not np.isnan(ret).any() and ret.size > 1:
                stats["histogram_bins_bayesian_blocks"] = ret

    return stats
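For the bayesian_blocks branch above, astropy.stats.bayesian_blocks returns an array of adaptive bin edges rather than a bin count; a small hedged illustration on made-up data (astropy must be installed):

import warnings
import numpy as np
from astropy.stats import bayesian_blocks

data = np.random.default_rng(7).normal(size=200)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # bayesian_blocks can emit warnings (astropy issue #4927)
    edges = bayesian_blocks(data)
print(len(edges) - 1, "adaptive-width bins")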
Example #19
    significant_comparisons = []
    for systemA_num, resultsA in enumerate(results[:-1]):
        systemA = "%s/%s" % (model_types[systemA_num],
                             model_names[systemA_num])
        for systemB_num, resultsB in enumerate(results[systemA_num + 1:],
                                               start=systemA_num + 1):
            systemB = "%s/%s" % (model_types[systemB_num],
                                 model_names[systemB_num])
            print("Comparing %s to system %s" % (systemA, systemB))
            # Output accuracy scores
            accuracyA = 100. * resultsA[1] / resultsA.sum()
            accuracyB = 100. * resultsB[1] / resultsB.sum()
            print("  %s: %.2f%%" % (systemA, accuracyA))
            print("  %s: %.2f%%" % (systemB, accuracyB))
            # Compute significance with chi-squared test
            chi2_stat, p = chisquare(resultsA, resultsB)
            print("  p=%g %s" %
                  (p, "***" if p < 0.01 else "**" if p < 0.05 else ""))

            if p < 0.05:
                significant_comparisons.append({
                    "A": systemA,
                    "B": systemB,
                    "p": p,
                    "A higher": (accuracyA > accuracyB),
                })

    if significant_comparisons: