Example #1
    def test_19_matrix_manip(self):
        if ONLY and "19" not in ONLY:
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads("lala-map~", resolution=10000)
        hic_map(hic_data1, savedata="lala-map.tsv~", savefig="lala.pdf")
        hic_map(hic_data1,
                by_chrom="intra",
                savedata="lala-maps~",
                savefig="lalalo~")
        hic_map(hic_data1,
                by_chrom="inter",
                savedata="lala-maps~",
                savefig="lalala~")
        # slowest part of the whole test:
        hic_data2 = read_matrix("lala-map.tsv~", resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        # vals = plot_distance_vs_interactions(hic_data1)

        # self.assertEqual([round(i, 2) if str(i)!="nan" else 0.0 for i in
        #                   reduce(lambda x, y: x + y, vals)],
        #                  [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes("lala-map~")
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix(PATH + "/20Kb/chrT/chrT_A.tsv",
                                resolution=20000)
        hic_data2 = read_matrix(PATH + "/20Kb/chrT/chrT_B.tsv",
                                resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1,
                                       hic_data2,
                                       savefig='lala3.pdf')

        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system("rm -rf lala*")
        if CHKTIME:
            self.assertEqual(True, True)
            print "19", time() - t0
Example #2
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1,
                by_chrom='intra',
                savedata='lala-maps~',
                savefig='lalalo~')
        hic_map(hic_data1,
                by_chrom='inter',
                savedata='lala-maps~',
                savefig='lalala~')
        # slowest part of the whole test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)

        self.assertEqual([
            round(i, 2) if str(i) != 'nan' else 0.0
            for i in reduce(lambda x, y: x + y, vals)
        ], [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
Example #3
def tadbit(x, n_cpus=None, verbose=True, max_tad_size="auto",
           no_heuristic=False, get_weights=False):
    """
    The tadbit algorithm works on raw chromosome interaction count data.
    Not only is normalization not necessary, it is also not recommended
    since the data is assumed to be discrete counts.
    
    Tadbit is a breakpoint detection algorithm that returns the optimal
    segmentation of the chromosome under BIC-penalized likelihood. The
    model assumes that counts have a Poisson distribution and that the
    expected value of the counts decreases like a power-law with the
    linear distance on the chromosome. This expected value of the counts
    at position (i,j) is corrected by the counts at diagonal positions
    (i,i) and (j,j). This normalizes for different restriction enzyme
    site densities and 'mappability' of the reads in case a bin contains
    repeated regions.

    :param x: A square matrix of interaction counts from Hi-C data, or a list of
        such matrices for replicated experiments. The counts must be evenly
        sampled and not normalized. x may be either a list of lists, a path to
        a file or a file handler
    :param None n_cpus: The number of CPUs to allocate to tadbit. The default
        value is the total number of CPUs minus 1.
    :param auto max_tad_size: an integer defining the maximum size of a TAD.
        The default ('auto') sets it to the number of rows/columns.
    :param False no_heuristic: whether or not to use heuristics
    :param False get_weights: whether to return the weights corresponding to
       the Hi-C counts (the weights are a normalization that depends on the
       counts of each column).

    :returns: the :py:func:`list` of topologically associated domains'
       boundaries, and the corresponding list of associated log likelihoods.
       Depending on the value of get_weights, the weights may also be
       returned.
    """
    nums, size = read_matrix(x)
    max_tad_size = size if max_tad_size == "auto" else max_tad_size
    _, nbks, passages, _, _, bkpts, weights = \
       _tadbit_wrapper(nums,             # list of lists representing matrices
                       size,             # size of one row/column
                       len(nums),        # number of matrices
                       n_cpus or 0,      # number of threads
                       int(verbose),     # verbose 0/1
                       max_tad_size,     # max_tad_size
                       int(no_heuristic) # heuristic 0/1
                       )

    breaks = [i for i in xrange(size) if bkpts[i + nbks * size] == 1]
    scores = [p for p in passages if p > 0]

    result = {'start': [], 'end'  : [], 'score': []}
    for brk in xrange(len(breaks)+1):
        result['start'].append((breaks[brk-1] + 1) if brk > 0 else 0)
        result['end'  ].append(breaks[brk] if brk < len(breaks) else size - 1)
        result['score'].append(scores[brk] if brk < len(breaks) else None)

    if get_weights:
        return result, weights
    return result
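
A usage sketch for the tadbit() function above, showing the shape of the dictionary it builds ('start', 'end' and 'score' lists, with None as the score of the last segment). The input path is a placeholder; any raw, unnormalized count matrix accepted by read_matrix would do:

# Hedged usage sketch of the tadbit() function defined above.
# 'chrT_A.tsv' is a placeholder path to a raw interaction-count matrix.
result = tadbit('chrT_A.tsv', n_cpus=4, verbose=False)

# Each TAD is described by a start bin, an end bin and a score
# (None for the last segment, as constructed in the loop above).
for start, end, score in zip(result['start'], result['end'], result['score']):
    print('TAD %d-%d score=%s' % (start, end, score))
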
Example #4
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1, by_chrom='intra', savedata='lala-maps~', savefig='lalalo~')
        hic_map(hic_data1, by_chrom='inter', savedata='lala-maps~', savefig='lalala~')
        # slowest part of the whole test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)
        
        self.assertEqual([round(i, 2) if str(i)!='nan' else 0.0 for i in
                          reduce(lambda x, y: x + y, vals)],
                         [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0])
        
        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a),int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)
        
        corr = correlate_matrices(hic_data1, hic_data2)
        corr =  [round(i,3) for i in corr[0]]
        self.assertEqual(corr, [0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828,
                                0.757, 0.797, 0.832])
        
        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i,3) for i in reduce(lambda x, y:x+y, ecorr)]
        self.assertEqual(ecorr, [0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
                                 0.321, 0.999, 0.01, 0.006, 0.0, 0.007, 0.451,
                                 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
                                 0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013,
                                 0.031, 0.08, 0.974, 0.018, 0.028, 0.004, 0.0,
                                 0.028, 0.034, 0.89])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
    def load_hic_matrix_data(self, norm=True):
        """
        Load the interactions from the Hi-C adjacency matrix into the HiC-Data
        data type.
        """
        if norm:
            # Dump the data pre-normalized
            adj_list = self.parsed_reads_dir + '/adjlist_map.tsv'
        else:
            adj_list = self.parsed_reads_dir + '/adjlist_map_norm.tsv'

        self.hic_data = read_matrix(adj_list, resolution=self.resolution)
Example #6
    def load_experiment(self,
                        hic_data,
                        parser=None,
                        resolution=None,
                        filter_columns=True):
        """
        Add a Hi-C experiment to the Chromosome object.
        
        :param None hic_data: either a file or a list of lists corresponding to
           the Hi-C data
        :param None parser: a parser function that returns a tuple with the
           data matrix as a flat list and the length of a row/column.
           With the file example.tsv:

           ::
           
             chrT_001	chrT_002	chrT_003	chrT_004
             chrT_001	629	164	88	105
             chrT_002	86	612	175	110
             chrT_003	159	216	437	105
             chrT_004	100	111	146	278
           
           the output of parser('example.tsv') would be:
           ``[([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
           110, 105, 278]), 4]``
        :param None resolution: resolution of the experiment in the file; it
           will be adjusted to the resolution of the experiment. By default the
           file is expected to contain a Hi-C experiment with the same resolution
           as the :class:`pytadbit.Experiment` created, and no change is made
        :param True filter_columns: filter the columns with unexpectedly high content
           of low values
        
        """
        nums, size = read_matrix(hic_data, parser=parser)
        self.hic_data = nums
        self.size = size
        resolution = resolution or self.resolution
        self.set_resolution(resolution, keep_original=False)
        # self._zeros   = [int(pos) for pos, raw in enumerate(
        #     xrange(0, self.size**2, self.size))
        #                  if sum(self.hic_data[0][raw:raw + self.size]) <= 100]
        if filter_columns:
            self._zeros = hic_filtering_for_modelling(self.get_hic_matrix())
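
The docstring above only specifies the parser contract by example: parser(f) must return the matrix flattened column by column plus the number of rows/columns. A hedged sketch of one parser satisfying that contract for the example.tsv layout shown above; the function name and the whitespace splitting are assumptions:

def example_parser(file_name):
    # Hypothetical parser for the example.tsv layout in the docstring above:
    # a header row of bin names, then one labelled row of counts per bin.
    rows = []
    with open(file_name) as handler:
        next(handler)                                  # skip the header row
        for line in handler:
            fields = line.split()
            rows.append([int(v) for v in fields[1:]])  # drop the row label
    size = len(rows)
    # The documented output lists the counts column by column.
    flat = [rows[i][j] for j in range(size) for i in range(size)]
    return flat, size

# Assumed usage: chromosome.load_experiment('example.tsv', parser=example_parser)
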
Example #7
    def add_experiment(self, f_name, name, force=False):
        """
        Add a Hi-C experiment to the Chromosome.
        """
        nums, size = read_matrix(f_name)
        if name in self.experiments:
            if "hi-c" in self.experiments[name] and not force:
                raise Exception(
                    "Hi-C data already loaded under the name: {}.\n"
                    "Force loading or use another name.\n".format(name)
                )
            self.experiments[name]["hi-c"] = nums
            self.experiments[name]["size"] = size
        else:
            self.experiments[name] = {"hi-c": nums, "size": size,
                                      "tads": None, "brks": None,
                                      "wght": None}
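
A short sketch of how the force flag of this add_experiment() is meant to be used; 'chrom' stands for an instance of the class defining the method, and the paths and name are placeholders:

# Hypothetical usage of the add_experiment() defined above.
chrom.add_experiment('chrT_A.tsv', 'exp1')               # first load under 'exp1'
# Loading again under the same name raises unless force=True:
chrom.add_experiment('chrT_B.tsv', 'exp1', force=True)   # replaces the Hi-C data
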
Example #8
    def load_hic_data(self, hic_data, parser=None, wanted_resolution=None,
                      data_resolution=None, filter_columns=True):
        """
        Add a Hi-C experiment to the Chromosome object.
        
        :param None hic_data: either a file or a list of lists corresponding to
           the Hi-C data
        :param None parser: a parser function that returns a tuple with the
           data matrix as a flat list and the length of a row/column.
           With the file example.tsv:

           ::
           
             chrT_001	chrT_002	chrT_003	chrT_004
             chrT_001	629	164	88	105
             chrT_002	86	612	175	110
             chrT_003	159	216	437	105
             chrT_004	100	111	146	278
           
           the output of parser('example.tsv') would be:
           ``[([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
           110, 105, 278]), 4]``
        :param None data_resolution: resolution of the data in the file; by
           default the file is expected to contain a Hi-C experiment at the
           same resolution as the :class:`pytadbit.Experiment` created
        :param None wanted_resolution: resolution at which to work; by default
           the data resolution is kept and no change is made
        :param True filter_columns: filter the columns with unexpectedly high content
           of low values
        
        """
        nums, size = read_matrix(hic_data, parser=parser)
        self.hic_data = nums
        self.size     = size
        self._ori_resolution = self.resolution = data_resolution or self._ori_resolution
        wanted_resolution = wanted_resolution or self.resolution
        self.set_resolution(wanted_resolution, keep_original=False)
        # self._zeros   = [int(pos) for pos, raw in enumerate(
        #     xrange(0, self.size**2, self.size))
        #                  if sum(self.hic_data[0][raw:raw + self.size]) <= 100]
        if filter_columns:
            self._zeros = hic_filtering_for_modelling(self.get_hic_matrix())
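
Unlike the load_experiment variant above, this method separates the resolution of the file (data_resolution) from the resolution the experiment should end up at (wanted_resolution) and rebins via set_resolution. A hedged sketch, assuming the method lives on pytadbit's Experiment class as the docstring suggests; the import path, constructor arguments and file path are assumptions:

from pytadbit.experiment import Experiment    # assumed import path

# Hypothetical: load a 10 kb matrix and rebin it to 100 kb on the fly.
exp = Experiment('rep1', resolution=100000)   # assumed constructor arguments
exp.load_hic_data('chrT_10kb.tsv',
                  data_resolution=10000,      # resolution of the file on disk
                  wanted_resolution=100000)   # resolution to rebin to
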
Example #9
    def load_experiment(self, handler, parser=None, resolution=None):
        """
        Add Hi-C experiment to Chromosome
        
        :param handler: path to a TSV file, or an open file handler
        :param None parser: a parser function that returns a tuple with the
           data matrix as a flat list and the length of a row/column, with
           this file example.tsv:

           ::
           
             chrT_001	chrT_002	chrT_003	chrT_004
             chrT_001	629	164	88	105
             chrT_002	86	612	175	110
             chrT_003	159	216	437	105
             chrT_004	100	111	146	278
           
           the output of parser('example.tsv') might be:
           ``[([629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105,
           110, 105, 278]), 4]``
        :param None resolution: resolution of the data in the file; it will be
           adjusted to the resolution of the experiment. By default the file is
           expected to contain a Hi-C experiment at the same resolution as the
           :class:`pytadbit.Experiment` created, and no change is made.
        
        """
        nums, size = read_matrix(handler, parser=parser)
        self.hic_data = nums
        self.size     = size
        resolution = resolution or self.resolution
        self.set_resolution(resolution, keep_original=False)
        # self._zeros   = [int(pos) for pos, raw in enumerate(
        #     xrange(0, self.size**2, self.size))
        #                  if sum(self.hic_data[0][raw:raw + self.size]) <= 100]
        self._zeros = hic_filtering_for_modelling(self.get_hic_matrix())
    def generate_tads(self, chrom):
        """
        Uses TADbit to generate the TAD borders based on the computed hic_data
        """
        from pytadbit import Chromosome

        exptName = self.library + "_" + str(
            self.resolution) + "_" + str(chrom) + "-" + str(chrom)
        fname = self.parsed_reads_dir + '/adjlist_map_' + str(
            chrom) + '-' + str(chrom) + '_' + str(self.resolution) + '.tsv'
        chr_hic_data = read_matrix(fname, resolution=int(self.resolution))

        my_chrom = Chromosome(name=exptName, centromere_search=True)
        my_chrom.add_experiment(exptName,
                                hic_data=chr_hic_data,
                                resolution=int(self.resolution))

        # Run the core TADbit function to find TADs for each experiment.
        # For the current dataset this required 61 GB of RAM.
        my_chrom.find_tad(exptName, n_cpus=15)

        exp = my_chrom.experiments[exptName]
        tad_file = self.library_dir + exptName + '_tads.tsv'
        exp.write_tad_borders(savedata=tad_file)
Example #11
def tadbit(x,
           remove=None,
           n_cpus=1,
           verbose=True,
           max_tad_size="max",
           no_heuristic=0,
           use_topdom=False,
           topdom_window=5,
           **kwargs):
    """
    The TADbit algorithm works on raw chromosome interaction count data.
    The normalization is neither necessary nor recommended,
    since the data is assumed to be discrete counts.

    TADbit is a breakpoint detection algorithm that returns the optimal
    segmentation of the chromosome under BIC-penalized likelihood. The
    model assumes that counts have a Poisson distribution and that the
    expected value of the counts decreases like a power-law with the
    linear distance on the chromosome. This expected value of the counts
    at position (i,j) is corrected by the counts at diagonal positions
    (i,i) and (j,j). This normalizes for different restriction enzyme
    site densities and 'mappability' of the reads in case a bin contains
    repeated regions.

    :param x: a square matrix of interaction counts from Hi-C data, or a list
       of such matrices for replicated experiments. The counts must be evenly
       sampled and not normalized. x may be either a list of lists, a path to
       a file or a file handler
    :param 'visibility' norm: kind of normalization to use. Choose between
       'visibility' or 'Imakaev'
    :param None remove: a list of booleans, one per column, where True marks a
       column to remove (if None, only columns with a 0 on the diagonal will
       be removed)
    :param 1 n_cpus: the number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD.
       The default ('max' or 'auto') sets it to the number of rows/columns
    :param False no_heuristic: whether or not to use heuristics
    :param False use_topdom: whether to use the TopDom algorithm to find TADs
       (http://www.ncbi.nlm.nih.gov/pubmed/26704975, http://zhoulab.usc.edu/TopDom/)
    :param 5 topdom_window: the window size for the TopDom algorithm
    :param False get_weights: whether to return the weights corresponding to
       the Hi-C counts (the weights are a normalization that depends on the
       counts of each column)

    :returns: the :py:func:`list` of topologically associated domains'
       boundaries, and the corresponding list of associated log likelihoods.
       If no weights are given, it may also return the calculated weights.
    """
    nums = [hic_data for hic_data in read_matrix(x, one=False)]

    if not use_topdom:
        size = len(nums[0])
        nums = [num.get_as_tuple() for num in nums]
        if not remove:
            # if not given just remove columns with zero in diagonal
            remove = tuple(
                [0 if nums[0][i * size + i] else 1 for i in xrange(size)])
        n_cpus = n_cpus if n_cpus != 'max' else 0
        max_tad_size = size if max_tad_size in ["max", "auto"
                                                ] else max_tad_size
        _, nbks, passages, _, _, bkpts = \
           _tadbit_wrapper(nums,             # list of lists of Hi-C data
                           remove,           # list of columns marking filtered
                           size,             # size of one row/column
                           len(nums),        # number of matrices
                           n_cpus,           # number of threads
                           int(verbose),     # verbose 0/1
                           max_tad_size,     # max_tad_size
                           kwargs.get('ntads', -1) + 1,
                           int(no_heuristic),# heuristic 0/1
                           )

        breaks = [i for i in xrange(size) if bkpts[i + nbks * size] == 1]
        scores = [p for p in passages if p > 0]

        result = {'start': [], 'end': [], 'score': []}
        for brk in xrange(len(breaks) + 1):
            result['start'].append((breaks[brk - 1] + 1) if brk > 0 else 0)
            result['end'].append(breaks[brk] if brk < len(breaks) else size -
                                 1)
            result['score'].append(scores[brk] if brk < len(breaks) else None)
    else:
        result = {'start': [], 'end': [], 'score': [], 'tag': []}

        ret = TopDom(nums[0], window_size=topdom_window)

        for key in sorted(ret):
            result['tag'].append(ret[key]['tag'])
            result['start'].append(ret[key]['start'])
            result['end'].append(ret[key]['end'])
            if ret[key]['tag'] == 'domain':
                result['score'].append(ret[key]['score'])
            else:
                result['score'].append(0)

        max_score = max(result['score'])
        for i in xrange(len(result['score'])):
            result['score'][i] = 1 - int((result['score'][i] / max_score) * 10)

    return result
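
A usage sketch for the TopDom branch above: with use_topdom=True the result also carries a 'tag' per segment, and only 'domain' segments receive a non-zero score before the rescaling at the end. The input path is a placeholder:

# Hedged sketch of the use_topdom code path of the tadbit() above.
result = tadbit('chrT_A.tsv', use_topdom=True, topdom_window=5)

for tag, start, end, score in zip(result['tag'], result['start'],
                                  result['end'], result['score']):
    print('%s: bins %s-%s (score %s)' % (tag, start, end, score))
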
Example #12
def tadbit(x, remove=None, n_cpus=1, verbose=True,
           max_tad_size="max", no_heuristic=0, use_topdom=False, topdom_window=5, **kwargs):
    """
    The TADbit algorithm works on raw chromosome interaction count data.
    The normalization is neither necessary nor recommended,
    since the data is assumed to be discrete counts.

    TADbit is a breakpoint detection algorithm that returns the optimal
    segmentation of the chromosome under BIC-penalized likelihood. The
    model assumes that counts have a Poisson distribution and that the
    expected value of the counts decreases like a power-law with the
    linear distance on the chromosome. This expected value of the counts
    at position (i,j) is corrected by the counts at diagonal positions
    (i,i) and (j,j). This normalizes for different restriction enzyme
    site densities and 'mappability' of the reads in case a bin contains
    repeated regions.

    :param x: a square matrix of interaction counts from Hi-C data, or a list
       of such matrices for replicated experiments. The counts must be evenly
       sampled and not normalized. x may be either a list of lists, a path to
       a file or a file handler
    :param 'visibility' norm: kind of normalization to use. Choose between
       'visibility' or 'Imakaev'
    :param None remove: a list of booleans, one per column, where True marks a
       column to remove (if None, only columns with a 0 on the diagonal will
       be removed)
    :param 1 n_cpus: the number of CPUs to allocate to TADbit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD.
       The default ('max' or 'auto') sets it to the number of rows/columns
    :param False no_heuristic: whether or not to use heuristics
    :param False use_topdom: whether to use the TopDom algorithm to find TADs
       (http://www.ncbi.nlm.nih.gov/pubmed/26704975, http://zhoulab.usc.edu/TopDom/)
    :param 5 topdom_window: the window size for the TopDom algorithm
    :param False get_weights: whether to return the weights corresponding to
       the Hi-C counts (the weights are a normalization that depends on the
       counts of each column)

    :returns: the :py:func:`list` of topologically associated domains'
       boundaries, and the corresponding list of associated log likelihoods.
       If no weights are given, it may also return the calculated weights.
    """
    nums = [hic_data for hic_data in read_matrix(x, one=False)]

    if not use_topdom:
        size = len(nums[0])
        nums = [num.get_as_tuple() for num in nums]
        if not remove:
            # if not given just remove columns with zero in diagonal
            remove = tuple([0 if nums[0][i*size+i] else 1 for i in xrange(size)])
        n_cpus = n_cpus if n_cpus != 'max' else 0
        max_tad_size = size if max_tad_size in ["max", "auto"] else max_tad_size
        _, nbks, passages, _, _, bkpts = \
           _tadbit_wrapper(nums,             # list of lists of Hi-C data
                           remove,           # list of columns marking filtered
                           size,             # size of one row/column
                           len(nums),        # number of matrices
                           n_cpus,           # number of threads
                           int(verbose),     # verbose 0/1
                           max_tad_size,     # max_tad_size
                           kwargs.get('ntads', -1) + 1,
                           int(no_heuristic),# heuristic 0/1
                           )

        breaks = [i for i in xrange(size) if bkpts[i + nbks * size] == 1]
        scores = [p for p in passages if p > 0]

        result = {'start': [], 'end'  : [], 'score': []}
        for brk in xrange(len(breaks)+1):
            result['start'].append((breaks[brk-1] + 1) if brk > 0 else 0)
            result['end'  ].append(breaks[brk] if brk < len(breaks) else size - 1)
            result['score'].append(scores[brk] if brk < len(breaks) else None)
    else:
        result = {'start': [], 'end'  : [], 'score': [], 'tag': []}

        ret = TopDom(nums[0],window_size=topdom_window)


        for key in sorted(ret):
            result['tag'].append(ret[key]['tag'])
            result['start'].append(ret[key]['start'])
            result['end'].append(ret[key]['end'])
            if ret[key]['tag'] == 'domain':
                result['score'].append(ret[key]['score'])
            else:
                result['score'].append(0)

        max_score = max(result['score'])
        for i in xrange(len(result['score'])):
            result['score'][i] = 1-int((result['score'][i]/max_score)*10)

    return result
Example #13
def tadbit(x, n_cpus=1, verbose=True, max_tad_size="max",
           no_heuristic=False, get_weights=False, use_visibility=False):
    """
    The TADBit algorithm works on raw chromosome interaction count data.
    The normalization is neither necessary nor recommended,
    since the data is assumed to be discrete counts.
    
    TADBit is a breakpoint detection algorithm that returns the optimal
    segmentation of the chromosome under BIC-penalized likelihood. The
    model assumes that counts have a Poisson distribution and that the
    expected value of the counts decreases like a power-law with the
    linear distance on the chromosome. This expected value of the counts
    at position (i,j) is corrected by the counts at diagonal positions
    (i,i) and (j,j). This normalizes for different restriction enzyme
    site densities and 'mappability' of the reads in case a bin contains
    repeated regions.

    :param x: a square matrix of interaction counts from Hi-C data, or a list
       of such matrices for replicated experiments. The counts must be evenly
       sampled and not normalized. x may be either a list of lists, a path to
       a file or a file handler
    :param 1 n_cpus: The number of CPUs to allocate to TADBit. If
       n_cpus='max' the total number of CPUs will be used
    :param max max_tad_size: an integer defining the maximum size of a TAD.
       The default ('max' or 'auto') sets it to the number of rows/columns
    :param False no_heuristic: whether or not to use heuristics
    :param False get_weights: whether to return the weights corresponding to
       the Hi-C counts (the weights are a normalization that depends on the
       counts of each column)

    :returns: the :py:func:`list` of topologically associated domains'
       boundaries, and the corresponding list of associated log likelihoods.
       Depending on the value of get_weights, the weights may also be
       returned.
    """
    nums, size = read_matrix(x)
    n_cpus = n_cpus if n_cpus != 'max' else 0
    max_tad_size = size if max_tad_size in ("max", "auto") else max_tad_size
    _, nbks, passages, _, _, bkpts, weights = \
       _tadbit_wrapper(nums,             # list of lists representing matrices
                       size,             # size of one row/column
                       len(nums),        # number of matrices
                       n_cpus,           # number of threads
                       int(verbose),     # verbose 0/1
                       max_tad_size,     # max_tad_size
                       int(no_heuristic), # heuristic 0/1
                       int(use_visibility) # TODO: remove this
                       )

    breaks = [i for i in xrange(size) if bkpts[i + nbks * size] == 1]
    scores = [p for p in passages if p > 0]

    result = {'start': [], 'end'  : [], 'score': []}
    for brk in xrange(len(breaks)+1):
        result['start'].append((breaks[brk-1] + 1) if brk > 0 else 0)
        result['end'  ].append(breaks[brk] if brk < len(breaks) else size - 1)
        result['score'].append(scores[brk] if brk < len(breaks) else None)

    if get_weights:
        # do not return the raw weights themselves, but each Hi-C count
        # divided by its corresponding weight
        tadbit_weights = [[i/j if j else 0.0 for i, j in
                           zip(nums[k], weights[k])] for k in xrange(len(nums))]
        return result, tadbit_weights
    return result
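
In this early variant, get_weights=True does not return the raw weights from the wrapper but each count divided by its weight, as computed in the final block. A short hedged sketch, with a placeholder path:

# Hedged sketch of the get_weights branch of the tadbit() above.
result, tadbit_weights = tadbit('chrT_A.tsv', n_cpus='max', get_weights=True)

# One list of weighted counts per input matrix, flattened like the input.
print('%d matrices of %d weighted counts'
      % (len(tadbit_weights), len(tadbit_weights[0])))
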