Exemple #1
0
def read_matrix(things, parser=None, hic=True):
    """
    Read and check one or several matrices and wrap each of them into a
    HiC_data object.

    :param things: a single input or a list of inputs. Each input may be a
       HiC_data object (kept as is), an open file handler, a string (path to
       a file opened through gzopen or, if it can not be opened, the
       multi-line content of a matrix file), a list of lists (square
       matrix), a tuple of values whose length is a perfect square, or an
       object whose type name contains 'matrix'.
       As a top-level list is always interpreted as a list of inputs, a
       matrix given as a list of lists has to be wrapped in a list.
    :param None parser: a function called with a file handler (or a list of
       lines) and returning a tuple ``(matrix, size)``. By default
       :func:`pytadbit.parser.hic_parser.autoreader` is used.
       With this file example.tsv:
       ::

         chrT_001    chrT_002    chrT_003    chrT_004
         chrT_001    629    164    88    105
         chrT_002    86    612    175    110
         chrT_003    159    216    437    105
         chrT_004    100    111    146    278

       the matrix returned by the parser might be:
       ``[629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105, 110,
       105, 278]``
    :param True hic: value stored in the module-level HIC_DATA variable (if
       False, TADbit assumes that files contains normalized data)

    :returns: a list of HiC_data objects, one per input

    :raises IOError: if a string is neither a readable file nor a
       multi-line content
    :raises AttributeError: if the length of a tuple is not a perfect square
    :raises Exception: if a list of lists or a matrix object is not square,
       or if the type of an input is not supported
    """
    global HIC_DATA
    HIC_DATA = hic
    parser = parser or autoreader
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            # already in the final format, nothing to convert
            matrices.append(thing)
            continue
        if isinstance(thing, file):
            matrix, size = parser(thing)
            thing.close()
        elif isinstance(thing, str):
            try:
                handler = gzopen(thing)
                try:
                    matrix, size = parser(handler)
                finally:
                    # do not leak the handler, even if the parser fails
                    handler.close()
            except IOError:
                # not a readable path: perhaps the content of a file
                lines = thing.split('\n')
                if len(lines) > 1:
                    matrix, size = parser(lines)
                else:
                    raise IOError('\n   ERROR: file %s not found\n' % thing)
        elif isinstance(thing, list):
            if not all(len(thing) == len(row) for row in thing):
                raise Exception('must be list of lists, all with same length.')
            # flatten row after row
            matrix = [val for row in thing for val in row]
            size = len(thing)
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly the values
            matrix = thing
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            size = int(siz)
        elif 'matrix' in str(type(thing)):
            # presumably a numpy matrix -- any failure here is raised instead
            # of being printed, otherwise the code below would work on
            # undefined values (or on the values of the previous input)
            row, col = thing.shape
            if row != col:
                raise Exception('matrix needs to be square.')
            matrix = thing.reshape(-1).tolist()[0]
            size = row
        else:
            raise Exception('Unable to read this file or whatever it is :)')
        # sparse representation: only cells with a true value are kept
        matrices.append(HiC_data([(i, matrix[i]) for i in xrange(size**2)
                                  if matrix[i]], size))
    return matrices
Exemple #2
0
def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs):
    """
    Read and check one or several matrices and wrap each of them into a
    HiC_data object.

    :param things: a single input or a list of inputs. Each input may be a
       HiC_data object (kept as is), an open file handler, a string (path to
       a cooler file, path to a file opened through gzopen or, if it can not
       be opened, the multi-line content of a matrix file), a list of lists
       (square matrix), a tuple whose length is a perfect square, or an
       object whose type name contains 'matrix'.
       As a top-level list is always interpreted as a list of inputs, a
       matrix given as a list of lists has to be wrapped in a list.
    :param None parser: a function called with a file handler (or a list of
       lines) and returning a tuple
       ``(matrix, size, header, masked, symmetricized)``. If None, the
       parser is chosen for each input between abc_reader and
       :func:`pytadbit.parser.hic_parser.autoreader`.
       Example of a matrix file, example.tsv:
       ::

         chrT_001    chrT_002    chrT_003    chrT_004
         chrT_001    629    164    88    105
         chrT_002    86    612    175    110
         chrT_003    159    216    437    105
         chrT_004    100    111    146    278

    :param True hic: value stored in the module-level HIC_DATA variable (if
       False, TADbit assumes that files contains normalized data)
    :param 1 resolution: resolution of the matrix. It is replaced by the
       value returned by _header_to_section each time a file is parsed
    :param True one: (keyword only) if True only the first HiC_data object
       is returned, otherwise the list of all of them

    :returns: a HiC_data object, or a list of them if *one* is False

    :raises IOError: if a string is neither a readable file nor a
       multi-line content
    :raises AttributeError: if the length of a tuple is not a perfect square
    :raises Exception: if a list of lists or a matrix object is not square,
       or if the type of an input is not supported
    """
    one = kwargs.get('one', True)
    global HIC_DATA
    HIC_DATA = hic
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            matrices.append(thing)
        elif isinstance(thing, file):
            # the format is guessed for each input: the parser argument is
            # not overwritten, otherwise the first guess would be applied to
            # all the following inputs
            reader = parser or (abc_reader if __is_abc(thing) else autoreader)
            matrix, size, header, masked, sym = reader(thing)
            thing.close()
            chromosomes, sections, resolution = _header_to_section(
                header, resolution)
            matrices.append(
                HiC_data(matrix,
                         size,
                         dict_sec=sections,
                         chromosomes=chromosomes,
                         resolution=resolution,
                         symmetricized=sym,
                         masked=masked))
        elif isinstance(thing, str):
            cooler_reso = resolution if resolution > 1 else None
            if is_cooler(thing, cooler_reso):
                matrix, size, header, masked, sym = parse_cooler(
                    thing, cooler_reso, not hic)
            else:
                try:
                    if parser:
                        reader = parser
                    else:
                        # a dedicated handler is used to guess the format
                        probe = gzopen(thing)
                        try:
                            reader = (abc_reader if __is_abc(probe)
                                      else autoreader)
                        finally:
                            probe.close()
                    handler = gzopen(thing)
                    try:
                        matrix, size, header, masked, sym = reader(handler)
                    finally:
                        # do not leak the handler, even if the parser fails
                        handler.close()
                except IOError:
                    # not a readable path: perhaps the content of a file
                    if len(thing.split('\n')) > 1:
                        reader = parser or (abc_reader if __is_abc(
                            thing.split('\n')) else autoreader)
                        matrix, size, header, masked, sym = reader(
                            thing.split('\n'))
                    else:
                        raise IOError('\n   ERROR: file %s not found\n' %
                                      thing)
            chromosomes, sections, resolution = _header_to_section(
                header, resolution)
            matrices.append(
                HiC_data(matrix,
                         size,
                         dict_sec=sections,
                         chromosomes=chromosomes,
                         masked=masked,
                         resolution=resolution,
                         symmetricized=sym))
        elif isinstance(thing, list):
            if not all(len(thing) == len(row) for row in thing):
                raise Exception('must be list of lists, all with same length.')
            size = len(thing)
            # sparse (index, value) pairs, cells without value are skipped
            matrix = [(i + j * size, val) for i, row in enumerate(thing)
                      for j, val in enumerate(row) if val]
            matrices.append(HiC_data(matrix, size))
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly list of tuples
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            matrices.append(HiC_data(thing, int(siz)))
        elif 'matrix' in str(type(thing)):
            # presumably a numpy matrix -- any failure here is raised instead
            # of being printed, otherwise HiC_data would be built from
            # undefined values (or from the values of the previous input)
            row, col = thing.shape
            if row != col:
                raise Exception('matrix needs to be square.')
            # NOTE(review): flat list of values, while the list branch above
            # builds (index, value) pairs -- confirm HiC_data accepts both
            matrix = thing.reshape(-1).tolist()[0]
            matrices.append(HiC_data(matrix, row))
        else:
            raise Exception('Unable to read this file or whatever it is :)')
    if one:
        return matrices[0]
    return matrices
Exemple #3
0
def run(opts):
    """
    Import a Hi-C matrix ('matrix', 'text' or 'cooler' format), build a
    HiC_data object from it, convert it to a BAM file through create_BAMhic
    and record the job with save_to_db.

    Steps:
      - parse ``opts.coord1`` (expected as 'chromosome:start-end', or a
        chromosome name alone) into a region, a start and an end
      - read the chromosome sizes from the header of the input
      - compute the number of bins per chromosome at ``opts.reso`` and the
        list of (chromosome, bin) sections
      - load the matrix with the reader corresponding to ``opts.format``
      - for cooler inputs with more than one distinct weight, dump the
        weights as biases in a pickle file under '04_normalization'
      - create the BAM file under '03_filtered_reads' and save the job

    :param opts: options of the job; the attributes read here are coord1,
       format, input, reso, workdir, cpus and samtools

    :raises Exception: if chromosome sizes are missing from the input, if
       the requested chromosome is not among them, if the input is not a
       cooler while the format is 'cooler', or if the size of a 'matrix'
       input differs from the expected one
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    coord1 = opts.coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            # no valid 'start-end' part: the whole string is the region name
            region1 = coord1
            start1 = None
            end1 = None

    printime('Importing hic in %s format' % opts.format)
    if opts.format in ('matrix', 'text'):
        with gzopen(opts.input) as f_thing:
            masked, chroms_gen, crm, beg, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception(
                '''ERROR: Chromosome size not included in import file.
                             Please include the chromosome sizes of the data that
                             you want to import in the header of the file. Example:
                             # CRM chr1    249250621''')
    elif opts.format == 'cooler':
        if is_cooler(opts.input, opts.reso if opts.reso > 1 else None):
            chroms_gen = parse_header(opts.input,
                                      opts.reso if opts.reso > 1 else None)
            if not chroms_gen or (region1 and region1 not in chroms_gen):
                raise Exception(
                    '''ERROR: Chromosome size not included in import file.
                                ''')
        else:
            raise Exception('''ERROR: The input file is not a cooler''')

    # number of bins of each chromosome at the working resolution
    chroms = OrderedDict(
        (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen)
    sections = []
    if not region1:
        # whole genome
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend([(crm, i) for i in range(chroms[crm])])
    elif not start1:
        # whole chromosome (a start equal to 0 also ends up here, as it is
        # falsy)
        size = chroms[region1]
        sections.extend([(region1, i) for i in range(size)])
    else:
        # the size stays the one of the whole chromosome, only the sections
        # are restricted to the bins of the requested range
        size = chroms[region1]
        sections.extend([
            (region1, i)
            for i in range(start1 // opts.reso, (end1 // opts.reso))
        ])
    dict_sec = {section: pos for pos, section in enumerate(sections)}
    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('''ERROR: The size of the specified region is
                            different from the data in the matrix''')
    elif opts.format == 'cooler':
        matrix, weights, size, header = parse_cooler(
            opts.input,
            opts.reso if opts.reso > 1 else None,
            normalized=True,
            raw_values=True)
        masked = {}
        size_mat = size
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)

            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            # columns with a weight of zero are flagged as bad columns
            badcol.update((i, True) for i, m in enumerate(weights) if m == 0)
            # weights that are not positive are stored as NaN biases
            biases = {
                k: b if b > 0 else float('nan')
                for k, b in enumerate(weights)
            }
            # the context manager closes the file even if the dump fails
            with open(bias_file, 'wb') as out:
                dump(
                    {
                        'biases': biases,
                        'decay': {},
                        'badcol': badcol,
                        'resolution': opts.reso
                    }, out, HIGHEST_PROTOCOL)

    hic = HiC_data(matrix,
                   size_mat,
                   dict_sec=dict_sec,
                   chromosomes=chroms,
                   masked=masked,
                   resolution=opts.reso)

    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    total_counts = create_BAMhic(hic,
                                 opts.cpus,
                                 outbam,
                                 chroms_gen,
                                 opts.reso,
                                 samtools=opts.samtools)

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
Exemple #4
0
def read_matrix(things, parser=None):
    """
    Read and check one or several matrices, each of them being returned as
    a flat sequence of values.

    :param things: a single input or a list of inputs. Each input may be an
        open file handler, a string (path to a file opened through gzopen
        or, if it can not be opened, the multi-line content of a matrix
        file), a list of lists (square matrix), a tuple of values whose
        length is a perfect square, or an object whose type name contains
        'matrix'.
        As a top-level list is always interpreted as a list of inputs, a
        matrix given as a list of lists has to be wrapped in a list.
    :param None parser: a function called with a file handler (or a list of
        lines) and returning a tuple ``(matrix, size)``. By default
        :func:`pytadbit.parser.hic_parser.autoreader` is used.
        With this file example.tsv:
        ::

          chrT_001    chrT_002    chrT_003    chrT_004
          chrT_001    629    164    88    105
          chrT_002    86    612    175    110
          chrT_003    159    216    437    105
          chrT_004    100    111    146    278

        the matrix returned by the parser might be:
        ``[629, 86, 159, 100, 164, 612, 216, 111, 88, 175, 437, 146, 105, 110,
        105, 278]``

    :returns: a tuple with the list of matrices (one flat sequence of values
        per input) and the number of rows, common to all of them

    :raises AttributeError: if the length of a tuple is not a perfect square
    :raises Exception: if a file is not found, if a list of lists or a
        matrix object is not square, if the type of an input is not
        supported, or if the matrices do not all have the same size
    """
    parser = parser or autoreader
    if not isinstance(things, list):
        things = [things]
    matrices = []
    sizes = []
    for thing in things:
        if isinstance(thing, file):
            matrix, size = parser(thing)
            thing.close()
        elif isinstance(thing, str):
            try:
                handler = gzopen(thing)
                try:
                    matrix, size = parser(handler)
                finally:
                    # do not leak the handler, even if the parser fails
                    handler.close()
            except IOError:
                # not a readable path: perhaps the content of a file
                lines = thing.split('\n')
                if len(lines) > 1:
                    matrix, size = parser(lines)
                else:
                    raise Exception('\n   ERROR: file %s not found\n' % thing)
        elif isinstance(thing, list):
            if not all(len(thing) == len(row) for row in thing):
                raise Exception('must be list of lists, all with same length.')
            # flatten row after row
            matrix = [val for row in thing for val in row]
            size = len(thing)
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly the values
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            matrix = thing
            size = int(siz)
        elif 'matrix' in str(type(thing)):
            # presumably a numpy matrix -- any failure here is raised instead
            # of being printed: skipping the input silently would return
            # fewer matrices than requested
            row, col = thing.shape
            if row != col:
                raise Exception('matrix needs to be square.')
            matrix = thing.reshape(-1).tolist()[0]
            size = row
        else:
            raise Exception('Unable to read this file or whatever it is :)')
        matrices.append(matrix)
        sizes.append(size)
    if any(size != sizes[0] for size in sizes):
        raise Exception('All matrices must have the same size ' +
                        '(same chromosome and same bins).')
    return matrices, sizes[0]
Exemple #5
0
def read_matrix(things, parser=None, hic=True, resolution=1, **kwargs):
    """
    Read and check one or several matrices and wrap each of them into a
    HiC_data object.

    :param things: a single input or a list of inputs. Each input may be a
       HiC_data object (kept as is), an open file handler, a string (path to
       a file opened through gzopen or, if it can not be opened, the
       multi-line content of a matrix file), a list of lists (square
       matrix), a tuple whose length is a perfect square, or an object whose
       type name contains 'matrix'.
       As a top-level list is always interpreted as a list of inputs, a
       matrix given as a list of lists has to be wrapped in a list.
    :param None parser: a function called with a file handler (or a list of
       lines) and returning a tuple
       ``(matrix, size, header, masked, symmetricized)``. If None, the
       parser is chosen for each input between abc_reader and
       :func:`pytadbit.parser.hic_parser.autoreader`.
       Example of a matrix file, example.tsv:
       ::

         chrT_001    chrT_002    chrT_003    chrT_004
         chrT_001    629    164    88    105
         chrT_002    86    612    175    110
         chrT_003    159    216    437    105
         chrT_004    100    111    146    278

    :param True hic: value stored in the module-level HIC_DATA variable (if
       False, TADbit assumes that files contains normalized data)
    :param 1 resolution: resolution of the matrix. It is replaced by the
       value returned by _header_to_section each time a file is parsed
    :param True one: (keyword only) if True only the first HiC_data object
       is returned, otherwise the list of all of them

    :returns: a HiC_data object, or a list of them if *one* is False

    :raises IOError: if a string is neither a readable file nor a
       multi-line content
    :raises AttributeError: if the length of a tuple is not a perfect square
    :raises Exception: if a list of lists or a matrix object is not square,
       or if the type of an input is not supported
    """
    one = kwargs.get('one', True)
    global HIC_DATA
    HIC_DATA = hic
    if not isinstance(things, list):
        things = [things]
    matrices = []
    for thing in things:
        if isinstance(thing, HiC_data):
            matrices.append(thing)
        elif isinstance(thing, file):
            # the format is guessed for each input: the parser argument is
            # not overwritten, otherwise the first guess would be applied to
            # all the following inputs
            reader = parser or (abc_reader if __is_abc(thing) else autoreader)
            matrix, size, header, masked, sym = reader(thing)
            thing.close()
            chromosomes, sections, resolution = _header_to_section(header,
                                                                   resolution)
            matrices.append(HiC_data(matrix, size, dict_sec=sections,
                                     chromosomes=chromosomes,
                                     resolution=resolution,
                                     symmetricized=sym, masked=masked))
        elif isinstance(thing, str):
            try:
                if parser:
                    reader = parser
                else:
                    # a dedicated handler is used to guess the format
                    probe = gzopen(thing)
                    try:
                        reader = abc_reader if __is_abc(probe) else autoreader
                    finally:
                        probe.close()
                handler = gzopen(thing)
                try:
                    matrix, size, header, masked, sym = reader(handler)
                finally:
                    # do not leak the handler, even if the parser fails
                    handler.close()
            except IOError:
                # not a readable path: perhaps the content of a file
                if len(thing.split('\n')) > 1:
                    reader = parser or (abc_reader
                                        if __is_abc(thing.split('\n'))
                                        else autoreader)
                    matrix, size, header, masked, sym = reader(
                        thing.split('\n'))
                else:
                    raise IOError('\n   ERROR: file %s not found\n' % thing)
            chromosomes, sections, resolution = _header_to_section(header,
                                                                   resolution)
            matrices.append(HiC_data(matrix, size, dict_sec=sections,
                                     chromosomes=chromosomes, masked=masked,
                                     resolution=resolution,
                                     symmetricized=sym))
        elif isinstance(thing, list):
            if not all(len(thing) == len(row) for row in thing):
                raise Exception('must be list of lists, all with same length.')
            size = len(thing)
            # sparse (index, value) pairs, cells without value are skipped
            matrix = [(i + j * size, val) for i, row in enumerate(thing)
                      for j, val in enumerate(row) if val]
            matrices.append(HiC_data(matrix, size))
        elif isinstance(thing, tuple):
            # case we know what we are doing and passing directly list of tuples
            siz = sqrt(len(thing))
            if int(siz) != siz:
                raise AttributeError('ERROR: matrix should be square.\n')
            matrices.append(HiC_data(thing, int(siz)))
        elif 'matrix' in str(type(thing)):
            # presumably a numpy matrix -- any failure here is raised instead
            # of being printed, otherwise HiC_data would be built from
            # undefined values (or from the values of the previous input)
            row, col = thing.shape
            if row != col:
                raise Exception('matrix needs to be square.')
            # NOTE(review): flat list of values, while the list branch above
            # builds (index, value) pairs -- confirm HiC_data accepts both
            matrix = thing.reshape(-1).tolist()[0]
            matrices.append(HiC_data(matrix, row))
        else:
            raise Exception('Unable to read this file or whatever it is :)')
    if one:
        return matrices[0]
    return matrices