Ejemplo n.º 1
0
    def find_compartments(self, crms=None, savefig=None, savedata=None,
                          show=False, **kwargs):
        """
        Search for A/B copartments in each chromsome of the Hi-C matrix.
        Hi-C matrix is normalized by the number interaction expected at a given
        distance, and by visibility (one iteration of ICE). A correlation matrix
        is then calculated from this normalized matrix, and its first
        eigenvector is used to identify compartments. Changes in sign marking
        boundaries between compartments.
        Result is stored as a dictionary of compartment boundaries, keys being
        chromsome names.
        
        :param 99 perc_zero: to filter bad columns
        :param 0.05 signal_to_noise: to calculate expected interaction counts,
           if not enough reads are observed at a given distance the observations
           of the distance+1 are summed. a signal to noise ratio of < 0.05
           corresponds to > 400 reads.
        :param None crms: only runs these given list of chromosomes
        :param None savefig: path to a directory to store matrices with
           compartment predictions, one image per chromosome, stored under
           'chromosome-name.png'.
        :param False show: show the plot
        :param None savedata: path to a new file to store compartment
           predictions, one file only.
        :param -1 vmin: for the color scale of the plotted map
        :param 1 vmax: for the color scale of the plotted map

        TODO: this is really slow...

        Notes: building the distance matrix using the amount of interactions
               instead of the mean correlation, gives generally worse results.
        
        """
        if not self.bads:
            if kwargs.get('verbose', True):
                print 'Filtering bad columns %d' % 99
            self.filter_columns(perc_zero=kwargs.get('perc_zero', 99),
                                by_mean=False, silent=True)
        if not self.expected:
            if kwargs.get('verbose', True):
                print 'Normalizing by expected values'
            self.expected = expected(self, bads=self.bads, **kwargs)
        if not self.bias:
            if kwargs.get('verbose', True):
                print 'Normalizing by ICE (1 round)'
            self.normalize_hic(iterations=0)
        if savefig:
            mkdir(savefig)

        cmprts = {}
        for sec in self.section_pos:
            if crms and sec not in crms:
                continue
            if kwargs.get('verbose', False):
                print 'Processing chromosome', sec
                warn('Processing chromosome %s' % (sec))
            matrix = [[(float(self[i,j]) / self.expected[abs(j-i)]
                       / self.bias[i] / self.bias[j])
                      for i in xrange(*self.section_pos[sec])
                       if not i in self.bads]
                     for j in xrange(*self.section_pos[sec])
                      if not j in self.bads]
            if not matrix: # MT chromosome will fall there
                warn('Chromosome %s is probably MT :)' % (sec))
                cmprts[sec] = []
                continue
            for i in xrange(len(matrix)):
                for j in xrange(i+1, len(matrix)):
                    matrix[i][j] = matrix[j][i]
            matrix = [list(m) for m in corrcoef(matrix)]
            try:
                # This eighs is very very fast, only ask for one eigvector
                _, evect = eigsh(array(matrix), k=1)
            except LinAlgError:
                warn('Chromosome %s too small to compute PC1' % (sec))
                cmprts[sec] = [] # Y chromosome, or so...
                continue
            first = list(evect[:, -1])
            beg, end = self.section_pos[sec]
            bads = [k - beg for k in self.bads if beg <= k <= end]
            _ = [first.insert(b, 0) for b in bads]
            _ = [matrix.insert(b, [float('nan')] * len(matrix[0]))
                 for b in bads]
            _ = [matrix[i].insert(b, float('nan'))
                 for b in bads for i in xrange(len(first))]
            breaks = [0] + [i for i, (a, b) in
                            enumerate(zip(first[1:], first[:-1]))
                            if a * b < 0] + [len(first)]
            breaks = [{'start': b, 'end': breaks[i+1]}
                      for i, b in enumerate(breaks[: -1])]
            cmprts[sec] = breaks
            
            # calculate compartment internal density
            for k, cmprt in enumerate(cmprts[sec]):
                beg = self.section_pos[sec][0]
                beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
                sec_matrix = [(self[i,j] / self.expected[abs(j-i)]
                               / self.bias[i] / self.bias[j])
                              for i in xrange(beg1, end1) if not i in self.bads
                              for j in xrange(i, end1) if not j in self.bads]
                try:
                    cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
                except ZeroDivisionError:
                    cmprt['dens'] = 0.
            try:
                meanh = sum([cmprt['dens'] for cmprt in cmprts[sec]]) / len(cmprts[sec])
            except ZeroDivisionError:
                meanh = 1.
            for cmprt in cmprts[sec]:
                try:
                    cmprt['dens'] /= meanh
                except ZeroDivisionError:
                    cmprt['dens'] = 1.
            gammas = {}
            for gamma in range(101):
                gammas[gamma] = _find_ab_compartments(float(gamma)/100, matrix,
                                                      breaks, cmprts[sec],
                                                      save=False)
                # print gamma, gammas[gamma]
            gamma = min(gammas.keys(), key=lambda k: gammas[k][0])
            _ = _find_ab_compartments(float(gamma)/100, matrix, breaks,
                                      cmprts[sec], save=True)
            if savefig or show:
                vmin = kwargs.get('vmin', -1)
                vmax = kwargs.get('vmax',  1)
                if vmin == 'auto' == vmax:
                    vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                    vmin = -vmax
                plot_compartments(sec, first, cmprts, matrix, show,
                                  savefig + '/chr' + sec + '.pdf',
                                  vmin=vmin, vmax=vmax)
                plot_compartments_summary(sec, cmprts, show,
                                          savefig + '/chr' + sec + '_summ.pdf')
            
        self.compartments = cmprts
        if savedata:
            self.write_compartments(savedata)
Ejemplo n.º 2
0
    def find_compartments(self,
                          crms=None,
                          savefig=None,
                          savedata=None,
                          show=False,
                          **kwargs):
        """
        Search for A/B copartments in each chromsome of the Hi-C matrix.
        Hi-C matrix is normalized by the number interaction expected at a given
        distance, and by visibility (one iteration of ICE). A correlation matrix
        is then calculated from this normalized matrix, and its first
        eigenvector is used to identify compartments. Changes in sign marking
        boundaries between compartments.
        Result is stored as a dictionary of compartment boundaries, keys being
        chromsome names.
        
        :param 99 perc_zero: to filter bad columns
        :param 0.05 signal_to_noise: to calculate expected interaction counts,
           if not enough reads are observed at a given distance the observations
           of the distance+1 are summed. a signal to noise ratio of < 0.05
           corresponds to > 400 reads.
        :param None crms: only runs these given list of chromosomes
        :param None savefig: path to a directory to store matrices with
           compartment predictions, one image per chromosome, stored under
           'chromosome-name.png'.
        :param False show: show the plot
        :param None savedata: path to a new file to store compartment
           predictions, one file only.
        :param -1 vmin: for the color scale of the plotted map
        :param 1 vmax: for the color scale of the plotted map

        TODO: this is really slow...

        Notes: building the distance matrix using the amount of interactions
               instead of the mean correlation, gives generally worse results.
        
        """
        if not self.bads:
            if kwargs.get('verbose', True):
                print 'Filtering bad columns %d' % 99
            self.filter_columns(perc_zero=kwargs.get('perc_zero', 99),
                                by_mean=False,
                                silent=True)
        if not self.expected:
            if kwargs.get('verbose', True):
                print 'Normalizing by expected values'
            self.expected = expected(self, bads=self.bads, **kwargs)
        if not self.bias:
            if kwargs.get('verbose', True):
                print 'Normalizing by ICE (1 round)'
            self.normalize_hic(iterations=0)
        if savefig:
            mkdir(savefig)

        cmprts = {}
        for sec in self.section_pos:
            if crms and sec not in crms:
                continue
            if kwargs.get('verbose', False):
                print 'Processing chromosome', sec
                warn('Processing chromosome %s' % (sec))
            matrix = [[(float(self[i, j]) / self.expected[abs(j - i)] /
                        self.bias[i] / self.bias[j])
                       for i in xrange(*self.section_pos[sec])
                       if not i in self.bads]
                      for j in xrange(*self.section_pos[sec])
                      if not j in self.bads]
            if not matrix:  # MT chromosome will fall there
                warn('Chromosome %s is probably MT :)' % (sec))
                cmprts[sec] = []
                continue
            for i in xrange(len(matrix)):
                for j in xrange(i + 1, len(matrix)):
                    matrix[i][j] = matrix[j][i]
            matrix = [list(m) for m in corrcoef(matrix)]
            try:
                # This eighs is very very fast, only ask for one eigvector
                _, evect = eigsh(array(matrix), k=1)
            except LinAlgError:
                warn('Chromosome %s too small to compute PC1' % (sec))
                cmprts[sec] = []  # Y chromosome, or so...
                continue
            first = list(evect[:, -1])
            beg, end = self.section_pos[sec]
            bads = [k - beg for k in self.bads if beg <= k <= end]
            _ = [first.insert(b, 0) for b in bads]
            _ = [
                matrix.insert(b, [float('nan')] * len(matrix[0])) for b in bads
            ]
            _ = [
                matrix[i].insert(b, float('nan')) for b in bads
                for i in xrange(len(first))
            ]
            breaks = [0] + [
                i for i, (a, b) in enumerate(zip(first[1:], first[:-1]))
                if a * b < 0
            ] + [len(first)]
            breaks = [{
                'start': b,
                'end': breaks[i + 1]
            } for i, b in enumerate(breaks[:-1])]
            cmprts[sec] = breaks

            # calculate compartment internal density
            for k, cmprt in enumerate(cmprts[sec]):
                beg = self.section_pos[sec][0]
                beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
                sec_matrix = [(self[i, j] / self.expected[abs(j - i)] /
                               self.bias[i] / self.bias[j])
                              for i in xrange(beg1, end1) if not i in self.bads
                              for j in xrange(i, end1) if not j in self.bads]
                try:
                    cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
                except ZeroDivisionError:
                    cmprt['dens'] = 0.
            try:
                meanh = sum([cmprt['dens']
                             for cmprt in cmprts[sec]]) / len(cmprts[sec])
            except ZeroDivisionError:
                meanh = 1.
            for cmprt in cmprts[sec]:
                try:
                    cmprt['dens'] /= meanh
                except ZeroDivisionError:
                    cmprt['dens'] = 1.
            gammas = {}
            for gamma in range(101):
                gammas[gamma] = _find_ab_compartments(float(gamma) / 100,
                                                      matrix,
                                                      breaks,
                                                      cmprts[sec],
                                                      save=False)
                # print gamma, gammas[gamma]
            gamma = min(gammas.keys(), key=lambda k: gammas[k][0])
            _ = _find_ab_compartments(float(gamma) / 100,
                                      matrix,
                                      breaks,
                                      cmprts[sec],
                                      save=True)
            if savefig or show:
                vmin = kwargs.get('vmin', -1)
                vmax = kwargs.get('vmax', 1)
                if vmin == 'auto' == vmax:
                    vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                    vmin = -vmax
                plot_compartments(sec,
                                  first,
                                  cmprts,
                                  matrix,
                                  show,
                                  savefig + '/chr' + sec + '.pdf',
                                  vmin=vmin,
                                  vmax=vmax)
                plot_compartments_summary(sec, cmprts, show,
                                          savefig + '/chr' + sec + '_summ.pdf')

        self.compartments = cmprts
        if savedata:
            self.write_compartments(savedata)