def get_clevel(self, alpha: float = 0.05):
    """Compute, store and return the confidence threshold value.

    The threshold is half of the upper-tail chi-square quantile (one degree
    of freedom) at level ``alpha``, shifted by the natural log of
    ``self.obj``.

    :param alpha: upper-tail probability passed to ``chi2.isf``.
    :return: the threshold, which is also stored on ``self.clevel``.
    """
    from scipy.stats.distributions import chi2

    # Upper-tail quantile of chi-square(1) at the requested level.
    quantile = chi2.isf(alpha, df=1)
    threshold = quantile / 2 + np.log(self.obj)
    self.clevel = threshold
    return threshold
def chi2stats(data):
    """Print chi-square diagnostics for one data set and plot it against the model.

    ``data`` is unpacked into seven items, in order: the observed values,
    their standard deviations, the three model arguments forwarded to
    ``chi2_old``/``model``, the degrees of freedom, and a display name.

    Printed, one per line: the name, the sample statistic returned by
    ``chi2_old``, the upper-tail critical value at 0.05, and one minus the
    CDF evaluated at the sample statistic.
    """
    observed, sigmas, x, k, ell, dof, title = data

    statistic = chi2_old(x, k, ell, observed, sigmas)
    print(title)
    print(statistic)

    critical = chi_squared.isf(.05, dof)
    print(critical)

    tail_probability = 1 - chi_squared.cdf(statistic, dof)
    print(tail_probability)

    # Observations as markers, model evaluated at every index as a line.
    plt.plot(observed, "o")
    fitted = []
    for index in range(len(observed)):
        fitted.append(model(x, k, ell, index))
    plt.plot(fitted)
    plt.title(title)
    plt.show()
def hypothesis_test(observed_no_treatment_data, observed_treatment_data, treatment_name,
                    alpha=0.05, graph=True):
    """
    Run a chi-squared goodness-of-fit test of the treated counts against the
    distribution observed without treatment, and print the outcome.

    :param observed_no_treatment_data: observed value counts for the distribution when there is
        no treatment.
    :param observed_treatment_data: observed value counts for the distribution when there is a
        treatment.
    :param treatment_name: the name of the treatment.
    :param alpha: the alpha of the test.
    :param graph: when true, also draw both count distributions and print supporting
        percentages.
    :return: None. Returns early, after printing a notice, when the expected frequencies
        are not all above MINIMAL_EXPECTED_FREQUENCY.
    """
    # Chi-squared is the chosen hypothesis test.
    print_header("Hypothesis testing for the treatment: %r\n"
                 "We will perform the chi-squared test with alpha = %r" % (treatment_name, alpha))

    # Scale the untreated proportions up to the treated sample size.
    baseline_share = observed_no_treatment_data / sum(observed_no_treatment_data)
    expected_counts = sum(observed_treatment_data) * baseline_share

    # Every expected frequency has to be large enough for the test to apply.
    large_enough = sum(expected_counts > MINIMAL_EXPECTED_FREQUENCY)
    can_use_chi_squared = large_enough == len(expected_counts)
    print("Are the expected frequencies enough for the chi-squared test? %r" % can_use_chi_squared)
    if not can_use_chi_squared:
        print("Frequencies not high enough for this treatment.")
        return

    # Decision rule.
    degrees_of_freedom = len(expected_counts) - 1
    critical_value = chi2.isf(q=alpha, df=degrees_of_freedom)
    print("The critical value is: %r" % critical_value)

    # Run the test itself.
    chisq, p_value = chisquare(f_obs=observed_treatment_data, f_exp=expected_counts)
    above_critical = chisq > critical_value
    below_alpha = p_value < alpha
    rejection = above_critical and below_alpha
    print("The chi-squared value is %r, the p-value is %r.\nShould we reject H0? %r"
          % (chisq, p_value, rejection))
    print("Note: H0 is the hypothesis that the treatment makes no change to the observed "
          "distribution of the untreated data.")

    if not graph:
        return

    # Supporting graphs: untreated counts first, then treated counts.
    panels = (
        (observed_no_treatment_data, 'Value counts, without treatment: ' + treatment_name),
        (observed_treatment_data, 'Value counts, with treatment: ' + treatment_name),
    )
    for counts, caption in panels:
        pretty_graph(counts, None, caption, 'Readmission CHANGE THIS', 'Crime count')

    # Supporting numbers; the first two percentage vectors should coincide.
    expected_share = expected_counts / sum(expected_counts)
    treated_share = observed_treatment_data / sum(observed_treatment_data)
    print("Some supporting statistics:")
    print("Observed percentages without treatment:\n%r" % baseline_share)
    print("Expected percentages with treatment (should be the same as above):\n%r"
          % expected_share)
    print("Observed percentages with treatment:\n%r" % treated_share)
def prosac(data, quality, model_type, tolerance, beta, eta0, psi,
           max_outlier_proportion, p_good_sample, max_number_of_draws,
           enable_n_star_optimization=True):
    """
    Progressive random sampling algorithm (PROSAC)

    Adapted from: http://devernay.free.fr/vision/src/prosac.c

    The data is ordered by decreasing quality and hypotheses are drawn from a
    progressively growing prefix of that ordering.

    :param data: Data to fit; indexed along its first axis, one row per point
    :param quality: Point quality, one value per row of ``data`` (higher is sampled earlier)
    :param model_type: Model subclass; must provide ``get_complexity()`` callable on the class,
        a no-argument constructor, ``fit(sample)`` and ``error(data)``
    :param tolerance: Tolerance on the error to consider a point inlier to a model
    :param beta: Probability that a match is declared inlier by mistake, i.e. the ratio of the
        "inlier" surface to the total surface (wording of the reference implementation)
    :param eta0: Maximum probability that a solution with more than In_star inliers in Un_star
        exists and was not found after k samples (typically set to 5%, see Sec. 2.2 of
        [Chum-Matas-05]).
    :param psi: Probability that In_star out of n_star data points are by chance inliers to an
        arbitrary incorrect model (typically set to 5%)
    :param max_outlier_proportion: Maximum allowed outliers proportion in the input data, used
        to compute T_N (can be as high as 0.95)
    :param p_good_sample: Probability that at least one of the random samples picked up by
        RANSAC is free of outliers
    :param max_number_of_draws: Max number of draws
    :param enable_n_star_optimization: Enable early stopping if the probability of finding a
        better match fall below eta0
    :return: A model of type model_type fitted to the sample with the largest inlier count, or
        None when no drawn sample produced a single inlier
    :raises RuntimeError: if an internal invariant of the n* search is violated
    """
    # Best quality first.
    indexes = np.argsort(quality)
    data = data[indexes[::-1]]

    num_points = data.shape[0]
    num_points_to_sample = model_type.get_complexity()
    chi2_value = chi2.isf(2 * psi, 1)

    def niter_ransac(p, epsilon, s, n_max):
        """
        Compute the maximum number of iterations for RANSAC

        :param p: Probability that at least one of the random samples picked up by RANSAC is
            free of outliers
        :param epsilon: Proportion of outliers
        :param s: Sample size
        :param n_max: Upper bound on the number of iterations (-1 means INT_MAX)
        :return: maximum number of iterations for RANSAC
        """
        if n_max == -1:
            n_max = np.iinfo(np.int32).max
        if not (n_max >= 1):
            raise ValueError('n_max must be positive')
        if epsilon <= 0:
            return 1
        # n = log(1 - p) / log(1 - (1 - epsilon)**s), evaluated through logs.
        logarg = -np.exp(s * np.log(1 - epsilon))
        logval = np.log(1 + logarg)
        n = np.log(1 - p) / logval
        if logval < 0 and n < n_max:
            return np.ceil(n)
        return n_max

    def i_min(m, n, beta):
        """
        Non-randomness, prevent from choosing a model supported by outliers

        :param m: Model complexity
        :param n: Number of considered points
        :param beta: Beta parameter
        :return: Minimum number of inlier to avoid model only supported by outliers
        """
        mu = n * beta
        sigma = np.sqrt(n * beta * (1 - beta))
        return np.ceil(m + mu + sigma * np.sqrt(chi2_value))

    N = num_points
    m = num_points_to_sample
    T_N = niter_ransac(p_good_sample, max_outlier_proportion, num_points_to_sample, -1)
    I_N_min = (1 - max_outlier_proportion) * N

    n_star = N        # prefix length used by the termination criterion
    I_n_star = 0      # inliers counted inside that prefix
    I_N_best = 0      # largest inlier count over all points seen so far
    best_model = None  # stays None if no draw ever yields an inlier
    t = 0             # number of draws so far
    n = m             # current prefix length hypotheses are drawn from

    T_n = T_N
    for i in range(m):
        T_n = T_n * (n - i) / (N - i)
    T_n_prime = 1
    k_n_star = T_N

    while (((I_N_best < I_N_min) or t <= k_n_star)
           and t < T_N and t <= max_number_of_draws):
        t = t + 1

        # 1. Grow the sampling prefix once enough draws were taken from it.
        if (t > T_n_prime) and (n < n_star):
            T_nplus1 = (T_n * (n + 1)) / (n + 1 - m)
            n = n + 1
            T_n_prime = T_n_prime + np.ceil(T_nplus1 - T_n)
            T_n = T_nplus1

        # 2. Draw the sample.
        if t > T_n_prime:
            # Plain RANSAC draw of m points among the first n.
            pts_idx = np.random.choice(n, m, replace=False)
        else:
            # m - 1 points among the first n - 1, plus the n-th point. With
            # zero-based rows the n-th point is row n - 1; using row n skipped
            # it and ran past the end of the data once n reached N.
            pts_idx = np.append(np.random.choice(n - 1, m - 1, replace=False), n - 1)
        sample = data[pts_idx]

        # 3. Model parameter estimation
        model = model_type()
        model.fit(sample)

        # 4. Model verification
        error = model.error(data)
        is_inlier = (error < tolerance)
        I_N = is_inlier.sum()

        if I_N > I_N_best:
            I_N_best = I_N
            n_best = N
            I_n_best = I_N
            best_model = model

            if enable_n_star_optimization:
                # Search for the prefix length with the best inlier ratio,
                # scanning from the full set downwards.
                epsilon_n_best = I_n_best / n_best
                I_n_test = I_N
                for n_test in range(N, m, -1):
                    if not (n_test >= I_n_test):
                        raise RuntimeError(
                            'Loop invariant broken: n_test >= I_n_test')
                    # Cheap ratio comparison first, then the chi-square
                    # significance check on the improvement.
                    if ((I_n_test * n_best > I_n_best * n_test) and
                            (I_n_test > epsilon_n_best * n_test
                             + np.sqrt(n_test * epsilon_n_best
                                       * (1 - epsilon_n_best) * chi2_value))):
                        if I_n_test < i_min(m, n_test, beta):
                            # Non-randomness not satisfied: stop scanning.
                            break
                        n_best = n_test
                        I_n_best = I_n_test
                        epsilon_n_best = I_n_best / n_best
                    # Inlier count of the next shorter prefix.
                    I_n_test = I_n_test - is_inlier[n_test - 1]

            # Adopt the new prefix if its inlier ratio beats the current n*.
            if I_n_best * n_star > I_n_star * n_best:
                if not (n_best >= I_n_best):
                    raise RuntimeError(
                        'Assertion not respected: n_best >= I_n_best')
                n_star = n_best
                I_n_star = I_n_best
                k_n_star = niter_ransac(1 - eta0, 1 - I_n_star / n_star, m, T_N)

    return best_model
# Header of the tab-separated summary-statistics table written to stdout.
print("\t".join([
    "CHR", "BP", "SNP", "A1", "A2", "P", "OR", "BETA", "SE", "N",
    "CHISQ", "Z", "SOURCE", "FRQ_A_A2", "FRQ_U_A2",
]))

# Convert every remaining line of fh1 into one output row.
for raw_line in fh1:
    line = raw_line.replace("\n", "")
    if not line:
        # A blank line used to end the loop and silently drop every row after
        # it; skip it and keep going instead.
        continue

    fields = re.split(r"\s+", line)
    chromosome = int(fields[0])
    pos = int(fields[1])
    snp_id = fields[2]
    allele1 = fields[3]
    allele2 = fields[4]
    beta = fields[9]   # taken from the input as-is, not derived from the odds ratio
    se = fields[10]
    pval = fields[12]
    freq_affected = fields[17]     # IMPORTANT: frequency in affected samples
    freq_unaffected = fields[18]   # IMPORTANT: frequency in unaffected samples
    odds_ratio = float(fields[19])
    source = "SAIGE"

    if pval == "0":
        # NOTE(review): "1e-324" is below the smallest positive double, so
        # float(pval) is still 0.0 and chisq below comes out as inf. Only the
        # printed P column changes. Confirm that this is what is wanted.
        pval = "1e-324"
    # One-degree-of-freedom chi-square statistic recovered from the p-value.
    # https://www.biostars.org/p/261698/
    chisq = chi2.isf(float(pval), 1)
    zscore = float(beta) / float(se)

    print("\t".join([
        str(chromosome), str(pos), snp_id, allele1, allele2, pval,
        str(odds_ratio), str(beta), str(se), num_samples,
        str(chisq), str(zscore), source,
        str(freq_affected), str(freq_unaffected),
    ]))

fh1.close()
sys.exit(0)
def get_chi2_stats(**kwargs):
    """Compute a chi-square statistic for every feature of the training data.

    The itemsets are read from 'Itemset_train.txt' and the class labels from
    'Classes_train.txt'. For each feature (column of the dataset) the
    statistic is accumulated over every (feature value, class) cell and
    compared with the critical value at ``SIGNIFICANCE_LEVEL``.

    Parameters
    ----------
    **kwargs
        verbose : bool, optional
            When true, a table with one row per feature is printed to the
            console. Default is False.

    Returns
    -------
    feature_list : array_like
        Labels of the form "Feature <n>", numbered from 1.
    chi2_critical_list : array_like
        Critical value per feature, rounded to three decimals.
    chi2_stats_list : array_like
        Unrounded chi-square statistic per feature.
    """
    dataset = apriori.get_dataset_from_file('Itemset_train.txt')
    classes = apriori.get_classes_from_file('Classes_train.txt')
    assert len(dataset) == len(classes)

    # One set of distinct values per dataset column.
    options_per_feature = [set(column) for column in zip(*dataset)]
    class_count_total = len(list(set(classes)))
    row_count = len(dataset)

    verbose = kwargs.get('verbose', False)
    if verbose:
        print("Chi Square Feature Selection at {}% Confidence Interval\n".format((1-SIGNIFICANCE_LEVEL)*100))
        table_printer = TablePrinter(4, 1)
        table_printer.set_column_headers("Feature", "Chi2_stats", "Chi2_critical_value", "Importance")
        table_printer.set_column_alignments('^', '^', '^', '^')
        table_printer.set_column_widths(20, 20, 20, 20)
        table_printer.begin()

    feature_list, chi2_critical_list, chi2_stats_list = [], [], []

    for number, options in enumerate(options_per_feature, start=1):
        degree = (len(options) - 1) * (class_count_total - 1)

        statistic = 0
        for option in options:
            counts_by_class = apriori.get_classwise_count(dataset, classes, [option])
            cells = list(counts_by_class.values())
            option_total = apriori.get_itemcount_from_classwise_count(counts_by_class)
            # Each cell pairs the count of this value inside a class with the
            # size of that class.
            for observed, class_size in cells:
                expected = class_size * option_total / row_count
                statistic += (observed - expected)**2 / expected

        critical = round(chi2.isf(SIGNIFICANCE_LEVEL, degree), 3)
        if statistic > critical:
            importance = "Important"
        else:
            importance = "Not Important"

        label = "Feature {}".format(number)
        feature_list.append(label)
        chi2_critical_list.append(critical)
        chi2_stats_list.append(statistic)

        if verbose:
            table_printer.append_row(label, round(statistic, 3), critical, importance)

    if verbose:
        table_printer.end()

    return feature_list, chi2_critical_list, chi2_stats_list