Beispiel #1
0
    def align_experiments(self, names=None, verbose=False, randomize=False,
                          rnd_method='interpolate', rnd_num=1000, **kwargs):
        """
        Align the TAD boundaries of a set of experiments. The resulting
        alignment is stored in self.alignment, under a key made of the sorted
        tuple of the names of the aligned experiments.

        :param None names: list of names of the experiments to align. If None
            (or empty), align all the experiments in self.experiments
        :param False verbose: if True, print the resulting alignment; the flag
            is also passed to the aligner and to the randomization test
        :param False randomize: check the alignment quality by comparing
            randomized boundaries over Chromosomes of the same size. When
            True, the returned value changes (see below)
        :param interpolate rnd_method: by default uses the interpolation of TAD
           distribution. The alternative method is 'shuffle', where TADs are
           simply shuffled. Only used if randomize is True
        :param 1000 rnd_num: number of randomizations to do. Only used if
           randomize is True
        :param kwargs: extra options forwarded to the aligner, for instance:

            * penalty (-0.1): penalty for inserting a gap in the alignment
            * max_dist (100000): maximum distance between two boundaries
              allowing match (100Kb seems fair with HUMAN chromosomes)
            * method (reciprocal): if global, Needleman-Wunsch is used to
              align (see
              :func:`pytadbit.boundary_aligner.globally.needleman_wunsch`);
              if reciprocal, a method based on reciprocal closest boundaries
              is used (see
              :func:`pytadbit.boundary_aligner.reciprocally.reciprocal`)

        :returns: the Alignment object if randomize is False; otherwise a
           tuple (score, p_value) with the score of the observed alignment and
           the p-value computed by the randomization test

        :raises Exception: if one of the experiments has no TADs defined
        """
        if names:
            xpers = ExperimentList([self.get_experiment(n) for n in names],
                                   self)
        else:
            xpers = self.experiments
        tads = []
        for xpr in xpers:
            if not xpr.tads:
                raise Exception('No TADs defined, use find_tad function.\n')
            # boundary ('brk') of each TAD scaled by the experiment resolution
            # (presumably bin index -> genomic position; confirm with find_tad)
            tads.append([xpr.tads[x]['brk'] * xpr.resolution for x in xpr.tads])
        aligneds, score = align(tads, verbose=verbose, **kwargs)
        name = tuple(sorted(x.name for x in xpers))
        ali = Alignment(name, aligneds, xpers, score=score)
        self.alignment[name] = ali
        if verbose:
            # parenthesized single-argument form prints the same on Python 2
            # and is valid syntax on Python 3
            print(self.alignment[name])
        if not randomize:
            return ali
        # NOTE(review): every extra keyword is forwarded to both `align` and
        # `randomization_test`; confirm both accept the same options (e.g.
        # `penalty`), otherwise this call raises a TypeError.
        p_value = randomization_test(xpers, score=score, rnd_method=rnd_method,
                                     verbose=verbose, r_size=self.r_size,
                                     num=rnd_num, **kwargs)
        return score, p_value
Beispiel #2
0
def randomization_test(xpers, score=None, num=1000, verbose=False, max_dist=100000,
                       rnd_method='interpolate', r_size=None, method='reciprocal'):
    """
    Compare the score of an observed alignment with the scores obtained when
    aligning randomized TADs, and return the resulting p-value.

    :param xpers: experiments that were aligned; each one should have TADs
       defined (`tads`, with 'start' and 'end' values) and a `resolution`
    :param None score: score of the observed alignment. The randomized scores
       are compared against it to compute the p-value, so it should be
       provided
    :param 1000 num: number of random alignment to generate for comparison
    :param False verbose: print the progress of the randomization and a
       summary of the result
    :param 100000 max_dist: maximum distance between two boundaries allowing
       match, passed to the aligner
    :param interpolate rnd_method: how to generate random tads (alternative is
       'shuffle'). 'interpolate' will calculate the distribution of TAD lengths,
       and generate a random set of TADs according to this distribution (see
       :func:`pytadbit.alignment.generate_rnd_tads`). In contrast, the 'shuffle'
       method uses directly the set of observed TADs and shuffle them (see
       :func:`pytadbit.alignment.generate_shuffle_tads`).
    :param None r_size: size of the Chromosome, mandatory when rnd_method is
       'interpolate'
    :param reciprocal method: alignment method, passed to the aligner

    :returns: the proportion of randomized alignments with a score strictly
       higher than the observed one

    :raises Exception: if rnd_method is not valid, if r_size is missing with
       the 'interpolate' method, or if one experiment has no TADs defined
    """
    if rnd_method not in ('interpolate', 'shuffle'):
        raise Exception('method should be either "interpolate" or ' +
                        '"shuffle"\n')
    # compare strings by value: an identity test (`is`) only succeeds when
    # both strings happen to be the same interned object, and silently picks
    # the wrong branch for an equal string built at runtime
    interpolate = rnd_method == 'interpolate'
    if interpolate and not r_size:
        raise Exception('should provide Chromosome r_size if interpolate\n')
    tads = []
    for xpr in xpers:
        if not xpr.tads:
            raise Exception('No TADs defined, use find_tad function.\n')
        # length of each TAD scaled by the resolution of its experiment
        tads.append([(t['end'] - t['start']) * xpr.resolution
                     for t in xpr.tads.values()])
    distr = _interpolation(xpers) if interpolate else None
    rnd_distr = []
    for val in range(num):
        # progress is reported every 5%: 100 * val / num is a multiple of 5
        # exactly when 20 * val is a multiple of num. Integer arithmetic is
        # used because the float version misses steps (0.55 * 100 != 55.0)
        if verbose and not (20 * val) % num:
            stdout.write('\r' + ' ' * 10 +
                         ' randomizing: '
                         '%.2f completed' % (100.0 * val / num))
            stdout.flush()
        if interpolate:
            rnd_tads = [generate_rnd_tads(r_size, distr) for _ in tads]
        else:
            # each random set is a shuffle of the TADs of one experiment
            # picked at random among the input ones
            rnd_tads = [generate_shuffle_tads(tads[int(random() * len(tads))])
                        for _ in tads]
        # only the score (second item returned by align) is kept
        rnd_distr.append(align(rnd_tads, verbose=False, method=method,
                               max_dist=max_dist)[1])
    pval = float(sum(1 for n in rnd_distr if n > score)) / len(rnd_distr)
    if verbose:
        stdout.write('\n %s randomizations finished.' % (num))
        stdout.flush()
        # single-argument print calls behave the same on Python 2 and 3
        print('  Observed alignment score: %s' % (score))
        print('Randomized scores between %s and %s; observed: %s' % (
            min(rnd_distr), max(rnd_distr), score))
        # a null p-value only means it is below the resolution of the test
        print('p-value: %s' % (pval if pval else '<%s' % (1./num)))
    return pval
Beispiel #3
0
def randomization_test(xpers, score=None, num=1000, verbose=False, max_dist=100000,
                       rnd_method='interpolate', r_size=None, method='reciprocal'):
    """
    Compare the score of an observed alignment with the scores obtained when
    aligning randomized TADs, and return the resulting p-value.

    :param xpers: experiments that were aligned; each one should have TADs
       defined (`tads`, with 'start' and 'end' values) and a `resolution`
    :param None score: score of the observed alignment. The randomized scores
       are compared against it to compute the p-value, so it should be
       provided
    :param 1000 num: number of random alignment to generate for comparison
    :param False verbose: print the progress of the randomization and a
       summary of the result
    :param 100000 max_dist: maximum distance between two boundaries allowing
       match, passed to the aligner
    :param interpolate rnd_method: how to generate random tads (alternative is
       'shuffle'). 'interpolate' will calculate the distribution of TAD lengths,
       and generate a random set of TADs according to this distribution (see
       :func:`pytadbit.alignment.generate_rnd_tads`). In contrast, the 'shuffle'
       method uses directly the set of observed TADs and shuffle them (see
       :func:`pytadbit.alignment.generate_shuffle_tads`).
    :param None r_size: size of the Chromosome, mandatory when rnd_method is
       'interpolate'
    :param reciprocal method: alignment method, passed to the aligner

    :returns: the proportion of randomized alignments with a score strictly
       higher than the observed one

    :raises Exception: if rnd_method is not valid, if r_size is missing with
       the 'interpolate' method, or if one experiment has no TADs defined
    """
    if rnd_method not in ('interpolate', 'shuffle'):
        raise Exception('method should be either "interpolate" or ' +
                        '"shuffle"\n')
    # compare strings by value: an identity test (`is`) only succeeds when
    # both strings happen to be the same interned object, and silently picks
    # the wrong branch for an equal string built at runtime
    interpolate = rnd_method == 'interpolate'
    if interpolate and not r_size:
        raise Exception('should provide Chromosome r_size if interpolate\n')
    tads = []
    for xpr in xpers:
        if not xpr.tads:
            raise Exception('No TADs defined, use find_tad function.\n')
        # length of each TAD scaled by the resolution of its experiment
        tads.append([(t['end'] - t['start']) * xpr.resolution
                     for t in xpr.tads.values()])
    distr = _interpolation(xpers) if interpolate else None
    rnd_distr = []
    for val in range(num):
        # progress is reported every 5%: 100 * val / num is a multiple of 5
        # exactly when 20 * val is a multiple of num. Integer arithmetic is
        # used because the float version misses steps (0.55 * 100 != 55.0)
        if verbose and not (20 * val) % num:
            stdout.write('\r' + ' ' * 10 +
                         ' randomizing: '
                         '%.2f completed' % (100.0 * val / num))
            stdout.flush()
        if interpolate:
            rnd_tads = [generate_rnd_tads(r_size, distr) for _ in tads]
        else:
            # each random set is a shuffle of the TADs of one experiment
            # picked at random among the input ones
            rnd_tads = [generate_shuffle_tads(tads[int(random() * len(tads))])
                        for _ in tads]
        # align returns a pair here; the score is the second item of its
        # first element
        rnd_distr.append(align(rnd_tads, verbose=False, method=method,
                               max_dist=max_dist)[0][1])
    pval = float(sum(1 for n in rnd_distr if n > score)) / len(rnd_distr)
    if verbose:
        stdout.write('\n %s randomizations finished.' % (num))
        stdout.flush()
        # single-argument print calls behave the same on Python 2 and 3
        print('  Observed alignment score: %s' % (score))
        print('Randomized scores between %s and %s; observed: %s' % (
            min(rnd_distr), max(rnd_distr), score))
        # a null p-value only means it is below the resolution of the test
        print('p-value: %s' % (pval if pval else '<%s' % (1./num)))
    return pval
Beispiel #4
0
    def align_experiments(self, names=None, verbose=False, randomize=False,
                          rnd_method='interpolate', rnd_num=1000,
                          get_score=False, **kwargs):
        """
        Align the TAD boundaries of a set of experiments. The alignment
        obtained is kept in self.alignment, under a key made of the sorted
        tuple of the names of the aligned experiments.

        Only the boundaries of TADs with a score higher or equal to zero are
        used for the alignment.

        :param None names: names of the experiments to align. If None (or
            empty), all the experiments in self.experiments are aligned
        :param False verbose: if True, print the alignment obtained; the flag
            is also passed to the aligner and to the randomization test
        :param False randomize: assess the quality of the alignment with a
            randomization test. This adds a p-value to the returned
            statistics (see below)
        :param interpolate rnd_method: how TADs are randomized: by default
           they are generated from the interpolation of the TAD distribution.
           The alternative method is 'shuffle', where TADs are simply
           shuffled
        :param 1000 rnd_num: number of randomizations to do
        :param False get_score: if True, and no randomization is asked, also
           return the alignment score and the two proportions of aligned
           borders
        :param kwargs: extra options forwarded to the aligner (and to the
           randomization test if randomize is True), for instance:

            * penalty (-0.1): penalty for inserting a gap in the alignment
            * max_dist (100000): maximum distance between two boundaries
              allowing match (100Kb seems fair with HUMAN chromosomes)
            * method (reciprocal): if global, Needleman-Wunsch is used to
              align (see
              :func:`pytadbit.boundary_aligner.globally.needleman_wunsch`);
              if reciprocal, a method based on reciprocal closest boundaries
              is used (see
              :func:`pytadbit.boundary_aligner.reciprocally.reciprocal`)

        :returns: depending on the options:

            * by default, the alignment object
            * with get_score, a flat tuple: alignment object, alignment
              score, proportion of borders from the first experiment found
              aligned in the second one, and proportion of borders from the
              second experiment found aligned in the first one
            * with randomize, the alignment object and a tuple of statistics:
              score, p-value of the randomization test and the two
              proportions described above

           Returned values can be caught like this:

               ali = crm.align_experiments()

           or, with randomization test:

               ali, (score, pval, prop1, prop2) = crm.align_experiments(randomize=True)

        """
        if not names:
            experiments = self.experiments
        else:
            selected = [self.get_experiment(label) for label in names]
            experiments = ExperimentList(selected, self)

        # one list of boundary positions per experiment
        boundaries = []
        for experiment in experiments:
            if not experiment.tads:
                raise Exception('No TADs defined, use find_tad function.\n')
            positions = []
            for key in experiment.tads:
                # TADs with a negative score are left out of the alignment
                if experiment.tads[key]['score'] >= 0:
                    positions.append(experiment.tads[key]['brk'] *
                                     experiment.resolution)
            boundaries.append(positions)

        result, consensus = align(boundaries, verbose=verbose, **kwargs)
        aligneds, score, perc1, perc2 = result

        key = tuple(sorted(experiment.name for experiment in experiments))
        alignment = Alignment(key, aligneds, experiments, consensus,
                              score=score)
        self.alignment[key] = alignment
        if verbose:
            print(self.alignment[key])

        if randomize:
            p_value = randomization_test(experiments, score=score,
                                         rnd_method=rnd_method,
                                         verbose=verbose, r_size=self.r_size,
                                         num=rnd_num, **kwargs)
            return alignment, (score, p_value, perc1, perc2)
        if get_score:
            return alignment, score, perc1, perc2
        return alignment
Beispiel #5
0
    def align_experiments(self, names=None, verbose=False, randomize=False,
                          rnd_method='interpolate', rnd_num=1000,
                          get_score=False, **kwargs):
        """
        Align the TAD boundaries of a set of experiments. The resulting
        alignment is stored in self.alignment, under a key made of the sorted
        tuple of the names of the aligned experiments.

        Only boundaries of TADs with a score higher or equal to zero are
        aligned.

        :param None names: list of names of the experiments to align. If None
            (or empty), align all the experiments in self.experiments
        :param False verbose: if True, print the resulting alignment; the flag
            is also passed to the aligner and to the randomization test
        :param False randomize: check the alignment quality by comparing
            randomized boundaries over Chromosomes of the same size. When
            True, a p-value is added to the returned statistics (see below)
        :param False get_score: if True, and randomize is False, returns the
           alignment object, the alignment score and the proportion of
           aligned borders from one side and from the other
        :param interpolate rnd_method: by default uses the interpolation of TAD
           distribution. The alternative method is 'shuffle', where TADs are
           simply shuffled
        :param 1000 rnd_num: number of randomizations to do
        :param kwargs: extra options forwarded to the aligner, for instance:

            * penalty (-0.1): penalty for inserting a gap in the alignment
            * max_dist (100000): maximum distance between two boundaries
              allowing match (100Kb seems fair with HUMAN chromosomes)
            * method (reciprocal): if global, Needleman-Wunsch is used to
              align (see
              :func:`pytadbit.boundary_aligner.globally.needleman_wunsch`);
              if reciprocal, a method based on reciprocal closest boundaries
              is used (see
              :func:`pytadbit.boundary_aligner.reciprocally.reciprocal`)

        :returns: an alignment object or, if get_score is True, a flat tuple
           with the alignment object, the alignment score, the proportion of
           borders from the first experiment found aligned in the second
           experiment and the proportion of borders from the second
           experiment found aligned in the first experiment. If the
           randomization was invoked, an alignment object and a tuple of
           statistics that are: the alignment score, the p-value of the
           randomization test and the two proportions described above.
           Returned values can be caught like this:

               ali = crm.align_experiments()

           or, with randomization test:

               ali, (score, pval, prop1, prop2) = crm.align_experiments(randomize=True)

        :raises Exception: if one of the experiments has no TADs defined
        """
        if names:
            xpers = ExperimentList([self.get_experiment(n) for n in names],
                                   self)
        else:
            xpers = self.experiments
        tads = []
        for xpr in xpers:
            if not xpr.tads:
                raise Exception('No TADs defined, use find_tad function.\n')
            # boundary ('brk') of each TAD scaled by the experiment
            # resolution; TADs with a negative score are left out
            tads.append([xpr.tads[x]['brk'] * xpr.resolution for x in xpr.tads
                         if xpr.tads[x]['score'] >= 0])
        (aligneds, score, perc1, perc2), consensus = align(tads, verbose=verbose, **kwargs)
        name = tuple(sorted(x.name for x in xpers))
        ali = Alignment(name, aligneds, xpers, consensus, score=score)
        self.alignment[name] = ali
        if verbose:
            # parenthesized single-argument form prints the same on Python 2
            # and is valid syntax on Python 3
            print(self.alignment[name])
        if not randomize:
            if get_score:
                return ali, score, perc1, perc2
            return ali
        # NOTE(review): every extra keyword is forwarded to both `align` and
        # `randomization_test`; confirm both accept the same options (e.g.
        # `penalty`), otherwise this call raises a TypeError.
        p_value = randomization_test(xpers, score=score, rnd_method=rnd_method,
                                     verbose=verbose, r_size=self.r_size,
                                     num=rnd_num, **kwargs)
        return ali, (score, p_value, perc1, perc2)