def randomization_test_old(num_sequences, mean, std, score, chr_len, bin_size,
                           num=1000, verbose=False):
    """
    Deprecated randomization test (superseded by the method version that
    interpolates TAD lengths from an empirical distribution).

    Estimate how likely the observed alignment score is under the null
    hypothesis, by scoring `num` alignments of randomly generated TAD sets.

    :argument num_sequences: number of random TAD sequences to generate and
       align per iteration
    :argument mean: mean TAD length used by generate_random_tads
    :argument std: standard deviation of TAD lengths used by
       generate_random_tads
    :argument score: observed alignment score to compare random scores against
    :argument chr_len: chromosome length (same unit as expected by align)
    :argument bin_size: resolution passed to generate_random_tads and align
    :argument 1000 num: number of random alignments to generate
    :argument False verbose: print progress, the best random alignment and a
       summary

    :returns: empirical p-value (fraction of random alignments scoring higher
       than `score`)
    """
    rand_distr = []  # alignment score of each randomization
    rand_len = []    # mean number of TADs per randomized sequence set
    # best random (score, tads) seen so far; any float > None in Python 2,
    # so the first iteration always replaces the (None, None) seed
    best = (None, None)
    for n in xrange(num):
        if verbose:
            # float conversion so n / num below is a true-division progress ratio
            n = float(n)
            # report progress every 5%
            if not n / num * 100 % 5:
                stdout.write("\r" + " " * 10 +
                             " randomizing: {:.2%} completed".format(n / num))
                stdout.flush()
        random_tads = [generate_random_tads(chr_len, mean, std, bin_size)
                       for _ in xrange(num_sequences)]
        rand_len.append(float(sum([len(r) for r in random_tads]))
                        / len(random_tads))
        # align(...) returns (alignments, score); keep only the score
        rand_distr.append(align(random_tads, bin_size=bin_size,
                                chr_len=chr_len, verbose=False)[1])
        if rand_distr[-1] > best[0]:
            best = rand_distr[-1], random_tads
    # p-value: fraction of randomizations strictly beating the observed score
    p_value = float(len([n for n in rand_distr if n > score])) / len(rand_distr)
    if verbose:
        stdout.write("\n {} randomizations finished.".format(num))
        stdout.flush()
        # NOTE(review): source was recovered from a collapsed line — assuming
        # the re-alignment display and the summary prints below belong to this
        # verbose block, mirroring the newer randomization_test; confirm
        # against version history.
        align(best[-1], bin_size=bin_size, chr_len=chr_len, verbose=True)
        print "Observed alignment score: {}".format(score)
        print " Randomized scores between {} and {}".format(min(rand_distr),
                                                            max(rand_distr))
        # a p-value of exactly 0.0 is reported as an upper bound "<1/num"
        print "p-value: {}".format(p_value if p_value else "<{}".format(1.0 / num))
        print sum(rand_len) / len(rand_len)
    return p_value
def randomization_test(self, num_sequences, distr, score=None, num=1000, verbose=False): """ Return the probability that original alignment is better than an alignment of randomized boundaries. :argument num_sequences: number of sequences aligned :argument distr: the function to interpolate TAD lengths from\ probability :argument None score: just to print it when verbose :argument 1000 num: number of random alignment to generate for\ comparison :argument False verbose: to print something nice """ rand_distr = [] rand_len = [] for n in xrange(num): if verbose: n = float(n) if not n / num * 100 % 5: stdout.write("\r" + " " * 10 + " randomizing: {:.2%} completed".format(n / num)) stdout.flush() random_tads = [generate_random_tads(self.r_size, distr, self.resolution) for _ in xrange(num_sequences)] rand_len.append(float(sum([len(r) for r in random_tads])) / len(random_tads)) rand_distr.append(align(random_tads, bin_size=self.resolution, chr_len=self.r_size, verbose=False)[1]) p_value = float(len([n for n in rand_distr if n > score])) / len(rand_distr) if verbose: stdout.write("\n {} randomizations finished.".format(num)) stdout.flush() print " Observed alignment score: {}".format(score) print " Mean number of boundaries: {}; observed: {}".format( sum(rand_len) / len(rand_len), str([len(self.experiments[e]["brks"]) for e in self.experiments]) ) print "Randomized scores between {} and {}; observed: {}".format(min(rand_distr), max(rand_distr), score) print "p-value: {}".format(p_value if p_value else "<{}".format(1.0 / num)) return p_value
def align_experiments(self, names=None, verbose=False, randomize=False,
                      **kwargs):
    """
    Align prediction of boundaries of two different experiments.

    :argument None names: list of names of experiments to align. If None,
       align all.
    :argument -0.1 penalty: penalty of inserting a gap in the alignment
       (forwarded to align through **kwargs)
    :argument 500000 max_dist: maximum distance between 2 boundaries
       allowing match (forwarded to align through **kwargs)
    :argument False verbose: print somethings
    :argument False randomize: check alignment quality by comparing
       randomization of boundaries over chromosomes of same size. This will
       return an extra value, the p-value of accepting that the observed
       alignment is not better than a random alignment.

    :returns: (alignment, score) by default, or (score, p_value) when
       randomize is True

    :raises Exception: if an experiment has no TADs defined yet
    """
    experiments = names or self.experiments.keys()
    tads = []
    for name in experiments:
        if not self.experiments[name]["tads"]:
            raise Exception("No TADs defined, use find_TAD function.\n")
        tads.append(self.experiments[name]["brks"])
    aligneds, score = align(tads, bin_size=self.resolution,
                            chr_len=self.r_size, **kwargs)
    for name, ali in zip(experiments, aligneds):
        # fixed: this assignment was duplicated in the original
        self.experiments[name]["align"] = ali
    if verbose:
        self.print_alignment(experiments)
    if not randomize:
        return self.get_alignment(names), score
    # build the TAD-length interpolator from the observed experiments and
    # test the alignment score against random boundary placements
    distr = self.interpolation(experiments)
    p_value = self.randomization_test(len(experiments), distr, score,
                                      verbose=verbose, **kwargs)
    return score, p_value