Example #1
def get_clevel(self, alpha: float = 0.05):
    """Determine the confidence threshold value."""
    # `np` (numpy) and `self.obj` are provided by the surrounding module/class
    # in the original source.
    from scipy.stats.distributions import chi2

    etol = chi2.isf(alpha, df=1)
    clevel = etol / 2 + np.log(self.obj)
    self.clevel = clevel
    return clevel
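As a quick check on what chi2.isf returns here (not part of the original class), the df=1 inverse survival function reproduces the familiar critical values:

from scipy.stats.distributions import chi2

print(chi2.isf(0.05, df=1))  # 3.841..., the 5% critical value for one dof
print(chi2.isf(0.01, df=1))  # 6.634...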
Example #2
# Presumably, in the original module:
from matplotlib import pyplot as plt
from scipy.stats import chi2 as chi_squared


def chi2stats(data):
    # `chi2_old` (computes the chi-squared statistic) and `model` (evaluates
    # the fitted model) are helpers defined elsewhere in the original module.
    xs, stdevs, x, k, l, df, name = data

    sample_chi2 = chi2_old(x, k, l, xs, stdevs)
    print(name)
    print(sample_chi2)                           # observed chi-squared statistic
    print(chi_squared.isf(.05, df))              # critical value at alpha = 0.05
    print(1 - chi_squared.cdf(sample_chi2, df))  # upper-tail p-value

    plt.plot(xs, "o")
    plt.plot([model(x, k, l, i) for i in range(len(xs))])
    plt.title(name)
    plt.show()
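A numerical aside (not from the original snippet): for large statistics, computing the upper-tail p-value as 1 - cdf underflows to zero, while scipy's survival function computes the same tail directly:

from scipy.stats import chi2

stat, df = 100.0, 3
print(1 - chi2.cdf(stat, df))  # 0.0 -- the upper tail underflows
print(chi2.sf(stat, df))       # ~1.5e-21, computed directly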
Example #3
# `chi2`, `chisquare` (scipy.stats), `MINIMAL_EXPECTED_FREQUENCY`, `print_header`,
# and `pretty_graph` are module-level names in the original source.
def hypothesis_test(observed_no_treatment_data, observed_treatment_data, treatment_name, alpha=0.05, graph=True):
    """
    Performs a hypothesis test to check if a given treatment changes the distribution.
    :param observed_no_treatment_data: observed value counts for the distribution when there is no treatment.
    :param observed_treatment_data: observed value counts for the distribution when there is a treatment.
    :param treatment_name: the name of the treatment.
    :param alpha: the significance level of the test.
    :param graph: whether to draw the supporting graphs.
    """
    # We've chosen chi-squared as the hypothesis test
    print_header("Hypothesis testing for the treatment: %r\n"
                 "We will perform the chi-squared test with alpha = %r" % (treatment_name, alpha))

    no_treatment_percent = observed_no_treatment_data / sum(observed_no_treatment_data)
    expected_treatment_data = sum(observed_treatment_data) * no_treatment_percent

    # check if the expected frequencies are large enough for the chi-squared test
    # (MINIMAL_EXPECTED_FREQUENCY is a module-level constant, conventionally 5)
    can_use_chi_squared = (expected_treatment_data > MINIMAL_EXPECTED_FREQUENCY).all()
    print("Are the expected frequencies large enough for the chi-squared test? %r" % can_use_chi_squared)
    if not can_use_chi_squared:
        print("Frequencies not high enough for this treatment.")
        return

    # the decision rule
    critical_value = chi2.isf(q=alpha, df=len(expected_treatment_data) - 1)
    print("The critical value is: %r" % critical_value)

    # perform the test
    chisq, p_value = chisquare(f_obs=observed_treatment_data, f_exp=expected_treatment_data)
    rejection = (chisq > critical_value) and (p_value < alpha)
    print("The chi-squared value is %r, the p-value is %r.\nShould we reject H0? %r" % (chisq, p_value, rejection))
    print("Note: H0 is the hypothesis that the treatment makes no change to the observed "
          "distribution of the untreated data.")

    # Supporting graphs and data
    if graph:
        # Value counts for patients who had no treatment
        pretty_graph(observed_no_treatment_data, None, 'Value counts, without treatment: ' + treatment_name,
                     'Readmission CHANGE THIS', 'Crime count')

        # Value counts for patients who had the treatment
        pretty_graph(observed_treatment_data, None, 'Value counts, with treatment: ' + treatment_name,
                     'Readmission CHANGE THIS', 'Crime count')

    # Various relevant values
    # Note that, by construction, the first two percentages should be the same
    print("Some supporting statistics:")
    print("Observed percentages without treatment:\n%r" % no_treatment_percent)
    print("Expected percentages with treatment (should be the same as above):\n%r" %
          (expected_treatment_data / sum(expected_treatment_data)))
    print("Observed percentages with treatment:\n%r" % (observed_treatment_data / sum(observed_treatment_data)))
Example #4
import numpy as np
from scipy.stats import chi2


def prosac(data,
           quality,
           model_type,
           tolerance,
           beta,
           eta0,
           psi,
           max_outlier_proportion,
           p_good_sample,
           max_number_of_draws,
           enable_n_star_optimization=True):
    """
    Progressive random sampling algorithm (PROSAC)
    Adapted from: http://devernay.free.fr/vision/src/prosac.c
    :param data: Data to fit
    :param quality: Point quality
    :param model_type: Model subclass
    :param tolerance: Tolerance on the error to consider a point inlier to a model
    :param beta: Probability that a match is declared inlier by mistake, i.e. the ratio of the "inlier" surface to the total surface
    :param eta0: Maximum probability that a solution with more than In_star inliers in Un_star exists and was not found after k samples (typically set to 5%, see Sec. 2.2 of [Chum-Matas-05]).
    :param psi: Probability that In_star out of n_star data points are by chance inliers to an arbitrary incorrect model (typically set to 5%)
    :param max_outlier_proportion: Maximum allowed outliers proportion in the input data, used to compute T_N (can be as high as 0.95)
    :param p_good_sample: Probability that at least one of the random samples picked up by RANSAC is free of outliers
    :param max_number_of_draws: Max number of draws
    :param enable_n_star_optimization: Enable early stopping if the probability of finding a better match falls below eta0
    :return: A model of type model_type, fitted to the inliers
    """
    indexes = np.argsort(quality)
    data = data[indexes[::-1]]

    num_points = data.shape[0]
    num_points_to_sample = model_type.get_complexity()
    chi2_value = chi2.isf(2 * psi, 1)

    def niter_ransac(p, epsilon, s, n_max):
        """
        Compute the maximum number of iterations for RANSAC
        :param p: Probability that at least one of the random samples picked up by RANSAC is free of outliers
        :param epsilon: Proportion of outliers
        :param s: Sample size
        :param n_max: Upper bound on the number of iterations (-1 means INT_MAX)
        :return: maximum number of iterations for RANSAC
        """
        if n_max == -1:
            n_max = np.iinfo(np.int32).max
        if not (n_max >= 1):
            raise ValueError('n_max must be positive')
        if epsilon <= 0:
            return 1
        logarg = -np.exp(s * np.log(1 - epsilon))
        logval = np.log(1 + logarg)
        n = np.log(1 - p) / logval
        if logval < 0 and n < n_max:
            return np.ceil(n)
        return n_max

    def i_min(m, n, beta):
        """
        Non-randomness, prevent from choosing a model supported by outliers
        :param m: Model complexity
        :param n: Number of considered points
        :param beta: Beta parameter
        :return: Minimum number of inlier to avoid model only supported by outliers
        """
        mu = n * beta
        sigma = np.sqrt(n * beta * (1 - beta))
        return np.ceil(m + mu + sigma * np.sqrt(chi2_value))

    N = num_points
    m = num_points_to_sample
    T_N = niter_ransac(p_good_sample, max_outlier_proportion,
                       num_points_to_sample, -1)
    I_N_min = (1 - max_outlier_proportion) * N

    n_star = N
    I_n_star = 0
    I_N_best = 0
    t = 0
    n = m
    T_n = T_N

    for i in range(m):
        T_n = T_n * (n - i) / (N - i)

    T_n_prime = 1
    k_n_star = T_N

    while ((I_N_best < I_N_min)
           or t <= k_n_star) and t < T_N and t <= max_number_of_draws:
        t = t + 1

        if (t > T_n_prime) and (n < n_star):
            T_nplus1 = (T_n * (n + 1)) / (n + 1 - m)
            n = n + 1
            T_n_prime = T_n_prime + np.ceil(T_nplus1 - T_n)
            T_n = T_nplus1

        if t > T_n_prime:
            # standard RANSAC sample among the n top-quality points
            pts_idx = np.random.choice(n, m, replace=False)
        else:
            # semi-random PROSAC sample: m - 1 points from the n - 1 top-quality
            # points, plus the n-th best point itself (index n - 1)
            pts_idx = np.append(np.random.choice(n - 1, m - 1, replace=False),
                                n - 1)

        sample = data[pts_idx]

        # 3. Model parameter estimation
        model = model_type()
        model.fit(sample)

        # 4. Model verification
        error = model.error(data)
        is_inlier = (error < tolerance)
        I_N = is_inlier.sum()

        if I_N > I_N_best:
            I_N_best = I_N
            n_best = N
            I_n_best = I_N
            best_model = model

            if enable_n_star_optimization:
                epsilon_n_best = I_n_best / n_best
                I_n_test = I_N
                for n_test in range(N, m, -1):
                    if not (n_test >= I_n_test):
                        raise RuntimeError(
                            'Loop invariant broken: n_test >= I_n_test')
                    if ((I_n_test * n_best > I_n_best * n_test)
                            and (I_n_test > epsilon_n_best * n_test +
                                 np.sqrt(n_test * epsilon_n_best *
                                         (1 - epsilon_n_best) * chi2_value))):
                        if I_n_test < i_min(m, n_test, beta):
                            break
                        n_best = n_test
                        I_n_best = I_n_test
                        epsilon_n_best = I_n_best / n_best
                    I_n_test = I_n_test - is_inlier[n_test - 1]

            if I_n_best * n_star > I_n_star * n_best:
                if not (n_best >= I_n_best):
                    raise RuntimeError(
                        'Assertion not respected: n_best >= I_n_best')
                n_star = n_best
                I_n_star = I_n_best
                k_n_star = niter_ransac(1 - eta0, 1 - I_n_star / n_star, m,
                                        T_N)

    return best_model
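The model_type interface is implied rather than shown; a minimal hypothetical model satisfying it (a get_complexity() accessor plus fit() and error()) could look like this:

import numpy as np

class LineModel:
    """Hypothetical 2D line model matching the interface prosac() expects."""

    @staticmethod
    def get_complexity():
        return 2  # two points determine a line

    def fit(self, sample):
        (x0, y0), (x1, y1) = sample
        self.slope = (y1 - y0) / (x1 - x0 + 1e-12)  # guard vertical lines
        self.intercept = y0 - self.slope * x0

    def error(self, data):
        x, y = data[:, 0], data[:, 1]
        return np.abs(y - (self.slope * x + self.intercept))

# best = prosac(points, quality, LineModel, tolerance=0.1, beta=0.01,
#               eta0=0.05, psi=0.05, max_outlier_proportion=0.8,
#               p_good_sample=0.99, max_number_of_draws=10000)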
Example #5
print("CHR" +"\t"+ "BP" +"\t"+ "SNP" +"\t"+ "A1" +"\t"+ "A2"+"\t"+ "P" +"\t"+ "OR" +"\t"+ "BETA" +"\t"+ "SE" +"\t"+ "N" +"\t"+ "CHISQ" +"\t"+ "Z" +"\t"+ "SOURCE" +"\t"+ "FRQ_A_A2" +"\t"+ "FRQ_U_A2")# +"\t"+ "RefPanel_A1" +"\t"+ "RefPanelAF_A1" +"\t"+ "INFO" +"\t"+ "Genotyped" +"\t"+ "RSID")
line = fh1.readline().replace("\n", "")
while line:

    list = re.split("\s+",line)
    chromosome = int(list[0])
    SNPID = list[2]
    pos = int(list[1])
    INFO = list[7]

    A1   = list[3]
    A2   = list[4]
    FRQ_A_A1 = list[17] # IMPORTANT: here this is FREQ from affected samples
    FRQ_U_A1 = list[18] # IMPORTANT: here this is FREQ from unaffected samples
    pval = list[12]
    if pval == "0":
        pval = "1e-324"
    chisq = chi2.isf(float(pval), 1) # https://www.biostars.org/p/261698/
    OR = float(list[19])
    source = "SAIGE"
    #    BETA = round(math.log(OR),4)
    BETA = list[9]
    SE = list[10]
    zscore = float(BETA)/float(SE)
    print(str(chromosome) +"\t"+ str(pos) +"\t"+ SNPID +"\t"+ A1 +"\t"+ A2 +"\t"+ pval +"\t"+ str(OR) +"\t"+ str(BETA) +"\t"+ str(SE) +"\t"+ num_samples + "\t" + str(chisq) + "\t" + str(zscore) + "\t" + source + "\t" + str(FRQ_A_A1) + "\t" + str(FRQ_U_A1))
    line = fh1.readline().replace("\n", "")

fh1.close()

sys.exit(0)
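The Biostars thread linked above recovers a chi-squared statistic from a p-value; for one degree of freedom this is just the squared two-sided z-score, which is easy to verify:

import math
from scipy.stats import chi2, norm

p = 1e-8
z = norm.isf(p / 2)            # two-sided z-score for this p-value
print(chi2.isf(p, 1), z ** 2)  # both ~32.84
assert math.isclose(chi2.isf(p, 1), z ** 2, rel_tol=1e-9)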
Example #6
# In the original module: `from scipy.stats import chi2`, plus the local
# `apriori` helpers, a `TablePrinter` utility, and the `SIGNIFICANCE_LEVEL`
# constant.
def get_chi2_stats(**kwargs):
    """Get Chi-Square statistics for the features present in the training dataset.

    Calculates and returns chi-square stats for the features present in the
    files 'Itemset_train.txt' and 'Classes_train.txt'.

    Parameters
    ----------
    **kwargs
        Keyword arguments, listed below.

    verbose : bool, optional
        Specifies whether to print logs of the operations performed by the
        method to the console. Default is False.

    Returns
    -------
    feature_list : array_like
        Feature names ("Feature 1", "Feature 2", ...).
    chi2_critical_list : array_like
        Critical chi-square value for each feature at SIGNIFICANCE_LEVEL.
    chi2_stats_list : array_like
        Computed chi-square statistic for each feature.
    """
    
    dataset = apriori.get_dataset_from_file('Itemset_train.txt')
    classes = apriori.get_classes_from_file('Classes_train.txt')
    assert len(dataset) == len(classes)

    feature_options_list = list(map(set, zip(*dataset)))
    classes_list = list(set(classes))
    
    verbose = kwargs.get('verbose', False)

    if verbose:
        print("Chi Square Feature Selection at {}% Confidence Level\n".format((1 - SIGNIFICANCE_LEVEL) * 100))
        table_printer = TablePrinter(4, 1)
        table_printer.set_column_headers("Feature", "Chi2_stats", "Chi2_critical_value", "Importance")
        table_printer.set_column_alignments('^', '^', '^', '^')
        table_printer.set_column_widths(20, 20, 20, 20)
        table_printer.begin()

    feature_list = []
    chi2_critical_list = []
    chi2_stats_list = []

    for i, feature_options in enumerate(feature_options_list):
        degree = (len(feature_options) - 1) * (len(classes_list) - 1)
        chi2_stats = 0
        for feature_option in feature_options:
            classwise_count = apriori.get_classwise_count(dataset, classes, [feature_option])
            classwise_count_list = list(classwise_count.values())
            
            feature_option_count = apriori.get_itemcount_from_classwise_count(classwise_count)

            # item_count is the count of this feature option within the class;
            # class_count is the total count of the class

            for item_count, class_count in classwise_count_list:
                expected = class_count * feature_option_count / len(dataset)
                observed = item_count
                chi2_stats += (observed - expected)**2 / expected
                
        chi2_critical = round(chi2.isf(SIGNIFICANCE_LEVEL, degree), 3)
        importance = "Important" if chi2_stats > chi2_critical else "Not Important"
        feature = "Feature {}".format(i+1)
        
        feature_list.append(feature)
        chi2_critical_list.append(chi2_critical)
        chi2_stats_list.append(chi2_stats)

        if verbose:
            table_printer.append_row(feature, round(chi2_stats, 3), chi2_critical, importance)

    if verbose:
        table_printer.end()
    return feature_list, chi2_critical_list, chi2_stats_list
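To see what the loop above computes, here is a hand-rolled version of the same statistic on a tiny made-up feature with options {"A", "B"} over two classes (no apriori helpers or input files needed):

from scipy.stats import chi2

# classwise (item_count, class_count) per feature option, 200 samples total
counts = {
    "A": {"spam": (30, 100), "ham": (10, 100)},
    "B": {"spam": (70, 100), "ham": (90, 100)},
}
n = 200

stat = 0.0
for per_class in counts.values():
    option_count = sum(item for item, _ in per_class.values())
    for item_count, class_count in per_class.values():
        expected = class_count * option_count / n
        stat += (item_count - expected) ** 2 / expected

df = (2 - 1) * (2 - 1)  # (options - 1) * (classes - 1)
print(round(stat, 3), chi2.isf(0.05, df))  # 12.5 vs 3.841 -> "Important"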