Python filters_to_binの例、pytadbit.parsers.hic_bam_parser.filters_to_bin Pythonの例

コード例 #1

0

ファイルを表示

def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    The working directory is created, the list of read filters is replaced
    by the value returned by ``filters_to_bin``, the requirements of the
    "hic" export are checked, the trace database is optionally copied to a
    temporary location and the number of CPUs is bounded.

    :param opts: options namespace; ``workdir``, ``filter``, ``format``,
       ``juicerjar``, ``tmpdb`` and ``cpus`` are read, ``filter``, ``tmpdb``
       and ``cpus`` may be rewritten

    :raises IOError: if the working directory does not exist after the call
       to ``mkdir``, or if the "hic" format is requested without a juicer jar
    """
    mkdir(opts.workdir)

    # the filter option is stored back in its converted form
    opts.filter = filters_to_bin(opts.filter)

    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    if opts.format == 'hic' and not opts.juicerjar:
        raise IOError('ERROR: juicer jar file needed for "hic" export.')

    # for LUSTRE file system: use a randomly named copy of the trace
    # database placed in the directory given by the tmpdb option
    if 'tmpdb' in opts and opts.tmpdb:
        suffix = ''.join(ascii_letters[int(random() * 52)] for _ in range(10))
        opts.tmpdb = path.join(opts.tmpdb, 'trace_%s' % suffix)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # number of cpus: 0 means "all of them", otherwise never exceed the
    # number reported by cpu_count
    available = cpu_count()
    opts.cpus = available if opts.cpus == 0 else min(opts.cpus, available)

コード例 #2

0

ファイルを表示

ファイル: tadbit_bin.py プロジェクト: aescrdni/TADbit

def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    Besides the checks, this function stops the program when the exact same
    job was already computed and ``opts.force`` is not set.

    :param opts: options namespace; ``workdir``, ``filter``, ``only_plot``,
       ``interactive``, ``nox``, ``triangular``, ``coord2``, ``tmpdb``,
       ``cpus`` and ``force`` are read, ``filter``, ``plot``, ``only_plot``,
       ``tmpdb`` and ``cpus`` may be rewritten

    :raises Exception: if an interactive plot is requested together with the
       noX option
    :raises IOError: if the working directory does not exist after the call
       to ``mkdir``
    :raises NotImplementedError: if a triangular matrix is requested together
       with a second coordinate
    """
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # enlighten plotting parameter writing
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        if opts.nox:
            raise Exception('ERROR: no screen no fun.\n'
                            'Interactive plot incompatible with noX option.')
        opts.plot = True
        opts.only_plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # triangular matrices cannot be combined with a second region
    if opts.triangular and opts.coord2:
        raise NotImplementedError('ERROR: triangular is only available for '
                                  'symmetric matrices.')

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if not opts.force:
                # the temporary copy of the database (if any) is useless now
                if 'tmpdb' in opts and opts.tmpdb:
                    remove(path.join(dbdir, dbfile))
                # stop whether or not a temporary database was in use
                exit(
                    'WARNING: exact same job already computed, see JOBs table above'
                )
            else:
                warn(
                    'WARNING: exact same job already computed, overwriting...')
    except IOError:
        warn((""
              "\nWARNING:\n  new working directory created. It's ok... "
              "but next time use TADbit since the beginning!! :)"))

コード例 #3

0

ファイルを表示

ファイル: tadbit_bin.py プロジェクト: 3DGenomes/TADbit

def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    Besides the checks, this function stops the program when the exact same
    job was already computed and ``opts.force`` is not set.

    :param opts: options namespace; ``workdir``, ``filter``, ``only_plot``,
       ``interactive``, ``nox``, ``triangular``, ``coord2``, ``tmpdb``,
       ``cpus`` and ``force`` are read, ``filter``, ``plot``, ``only_plot``,
       ``tmpdb`` and ``cpus`` may be rewritten

    :raises Exception: if an interactive plot is requested together with the
       noX option
    :raises IOError: if the working directory does not exist after the call
       to ``mkdir``
    :raises NotImplementedError: if a triangular matrix is requested together
       with a second coordinate
    """
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # enlighten plotting parameter writing
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        if opts.nox:
            raise Exception('ERROR: no screen no fun.\n'
                            'Interactive plot incompatible with noX option.')
        opts.plot = True
        opts.only_plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # triangular matrices cannot be combined with a second region
    if opts.triangular and opts.coord2:
        raise NotImplementedError('ERROR: triangular is only available for '
                                  'symmetric matrices.')

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if not opts.force:
                # the temporary copy of the database (if any) is useless now
                if 'tmpdb' in opts and opts.tmpdb:
                    remove(path.join(dbdir, dbfile))
                # stop whether or not a temporary database was in use
                exit('WARNING: exact same job already computed, see JOBs table above')
            else:
                warn('WARNING: exact same job already computed, overwriting...')
    except IOError:
        warn((""
              "\nWARNING:\n  new working directory created. It's ok... "
              "but next time use TADbit from the beginning!! :)"))

コード例 #4

0

ファイルを表示

ファイル: bam2count.py プロジェクト: 3DGenomes/metawaffle

def write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None, start1=None, end1=None, clean=True,
                 region2=None, start2=None, end2=None,
                 tmpdir='.', ncpus=8, verbose=True):
    """
    Write the upper half of the interaction matrix of a BAM file as a
    tab-separated file with four columns: first bin, second bin, raw value
    and normalized value.

    The file is named ``<region1>_mat_<resolution / 1000>kb.tsv`` and is
    created inside ``outdir``.

    :param inbam: path to the input BAM file
    :param resolution: resolution passed to ``read_bam`` and used to compute
       the number of bins of each reference sequence
    :param biases: passed to ``get_biases_region``; when false no
       normalization is applied and the last column repeats the raw value
    :param outdir: directory where the matrix is written
    :param filter_exclude: filters to apply; anything that is not an integer
       is first converted with ``filters_to_bin``
    :param region1: name of the reference sequence, used in the file name
       and to offset the written bin positions
    :param start1, end1, region2, start2, end2: forwarded to ``read_bam``
    :param clean: forwarded to ``_iter_matrix_frags``
    :param tmpdir: directory holding the temporary ``_tmp_<hash>`` folder,
       which is removed at the end
    :param ncpus: forwarded to ``read_bam``
    :param verbose: print progress information
    """
    # function-scope import: only needed for the final cleanup
    import shutil

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    _, rand_hash, bin_coords, chunks = read_bam(
        inbam, filter_exclude, resolution, ncpus=ncpus,
        region1=region1, start1=start1, end1=end1,
        region2=region2, start2=start2, end2=end2,
        tmpdir=tmpdir, verbose=verbose)

    # number of bins per reference sequence, in the order of the BAM header
    bamfile = pysam.AlignmentFile(inbam, 'rb')
    try:
        sections = OrderedDict(zip(bamfile.references,
                                   [x / resolution + 1 for x in bamfile.lengths]))
    finally:
        bamfile.close()

    # (first bin, last bin + 1) of each reference in the concatenated genome
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(biases, bin_coords)
    else:
        # no normalization requested: nothing to divide by, nothing to mask
        bias1 = bias2 = decay = None
        bads1 = bads2 = {}

    if verbose:
        printime('  - Writing matrices')

    # the directory is added only once, by os.path.join below
    fnam = '{}_mat_{}kb.tsv'.format(region1, resolution / 1000)
    mkdir(outdir)

    # pull all sub-matrices and write full matrix
    with open(os.path.join(outdir, fnam), 'w') as out:
        for c, j, k, v in _iter_matrix_frags(chunks, tmpdir, rand_hash,
                                             verbose=verbose, clean=clean):
            if k < j:  # we are only going to keep half of the matrix
                continue
            if j in bads1 or k in bads2:
                continue
            if decay is None:
                n = v
            elif abs(j - k) in decay[c]:
                n = v / bias1[j] / bias2[k] / decay[c][abs(j - k)]
            else:
                # no decay value for this distance: the cell is skipped
                continue
            pos1 = j + section_pos[region1][0]
            pos2 = k + section_pos[region1][0]
            out.write('{}\t{}\t{}\t{}\n'.format(pos1, pos2, v, n))

    # this is the last thing we do in case something goes wrong
    shutil.rmtree(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)),
                  ignore_errors=True)

    if verbose:
        printime('\nDone.')

コード例 #5

0

ファイルを表示

def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    The program is stopped when the exact same job was already computed.

    :param opts: options namespace; ``workdir``, ``filter``,
       ``normalization``, ``biases_path``, ``tmpdb`` and ``cpus`` are read,
       ``filter``, ``tmpdb`` and ``cpus`` may be rewritten

    :raises IOError: if the "custom" normalization is requested without an
       existing biases file, or if the working directory does not exist
       after the call to ``mkdir``
    """
    mkdir(opts.workdir)

    # the filter option is stored back in its converted form
    opts.filter = filters_to_bin(opts.filter)

    # a "custom" normalization needs a biases file that exists
    if opts.normalization == 'custom':
        if not opts.biases_path:
            raise IOError(
                'ERROR: biases file required for "custom" normalization.')
        if not path.exists(opts.biases_path):
            raise IOError('ERROR: biases not found at path: %s' %
                          opts.biases_path)

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file system: use a randomly named copy of the trace
    # database placed in the directory given by the tmpdb option
    tmp_trace = None
    if 'tmpdb' in opts and opts.tmpdb:
        letters = (ascii_letters[int(random() * 52)] for _ in range(10))
        tmp_trace = path.join(opts.tmpdb, 'trace_%s' % ''.join(letters))
        opts.tmpdb = tmp_trace
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # number of cpus: 0 means "all of them", otherwise never exceed the
    # number reported by cpu_count
    opts.cpus = cpu_count() if opts.cpus == 0 else min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(tmp_trace)
            exit(
                'WARNING: exact same job already computed, see JOBs table above'
            )
    except IOError:  # new working directory
        pass

コード例 #6

0

ファイルを表示

def check_options(opts):
    """
    Validate the parsed options and normalise some of them in place.

    The program is stopped when the exact same job was already computed and
    ``opts.force`` is not set; with ``opts.force`` only a warning is issued.

    :param opts: options namespace; ``workdir``, ``filter``, ``only_plot``,
       ``interactive``, ``tmpdb``, ``cpus`` and ``force`` are read,
       ``filter``, ``plot``, ``only_plot``, ``tmpdb`` and ``cpus`` may be
       rewritten

    :raises IOError: if the working directory does not exist after the call
       to ``mkdir``
    """
    mkdir(opts.workdir)

    # the filter option is stored back in its converted form
    opts.filter = filters_to_bin(opts.filter)

    # plotting parameters: interactive implies only_plot, which implies plot
    if opts.interactive:
        opts.only_plot = True
    if opts.only_plot:
        opts.plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file system: use a randomly named copy of the trace
    # database placed in the directory given by the tmpdb option
    tmp_trace = None
    if 'tmpdb' in opts and opts.tmpdb:
        random_tag = ''.join(
            ascii_letters[int(random() * 52)] for _ in range(10))
        tmp_trace = path.join(opts.tmpdb, 'trace_%s' % random_tag)
        opts.tmpdb = tmp_trace
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # number of cpus: 0 means "all of them", otherwise never exceed the
    # number reported by cpu_count
    available = cpu_count()
    opts.cpus = available if opts.cpus == 0 else min(opts.cpus, available)

    # check if job already run using md5 digestion of parameters
    if not already_run(opts):
        return
    if opts.force:
        warn('WARNING: exact same job already computed, overwritting...')
        return
    if 'tmpdb' in opts and opts.tmpdb:
        remove(tmp_trace)
    exit('WARNING: exact same job already computed, see JOBs table above')

コード例 #7

0

ファイルを表示

ファイル: tadbit_normalize.py プロジェクト: 3DGenomes/TADbit

def check_options(opts):
    """
    Check the options of the normalization tool and adjust them in place.

    The program is stopped when the exact same job was already computed.

    :param opts: options namespace; ``workdir``, ``filter``,
       ``normalization``, ``biases_path``, ``tmpdb`` and ``cpus`` are read,
       ``filter``, ``tmpdb`` and ``cpus`` may be rewritten

    :raises IOError: if the "custom" normalization is requested without an
       existing biases file, or if the working directory does not exist
       after the call to ``mkdir``
    """
    mkdir(opts.workdir)

    # the filter option is stored back in its converted form
    opts.filter = filters_to_bin(opts.filter)

    # a "custom" normalization needs a biases file that exists
    custom = opts.normalization == 'custom'
    if custom and not opts.biases_path:
        raise IOError('ERROR: biases file required for "custom" normalization.')
    if custom and not path.exists(opts.biases_path):
        raise IOError('ERROR: biases not found at path: %s' % opts.biases_path)

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file system: use a randomly named copy of the trace
    # database placed in the directory given by the tmpdb option
    trace_copy = None
    if 'tmpdb' in opts and opts.tmpdb:
        name = 'trace_%s' % ''.join(
            ascii_letters[int(random() * 52)] for _ in range(10))
        trace_copy = path.join(opts.tmpdb, name)
        opts.tmpdb = trace_copy
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # best effort: a failed copy is deliberately ignored
            pass

    # number of cpus: 0 means "all of them", otherwise never exceed the
    # number reported by cpu_count
    ncpus = cpu_count()
    if opts.cpus == 0:
        opts.cpus = ncpus
    else:
        opts.cpus = min(opts.cpus, ncpus)

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(trace_copy)
            exit('WARNING: exact same job already computed, see JOBs table above')
    except IOError:  # new working directory
        pass

コード例 #8

0

ファイルを表示

ファイル: waffle-bam2count.py プロジェクト: fransua/prognosTF

def write_matrix(inbam,
                 resolution,
                 biases,
                 outfile,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None,
                 start1=None,
                 end1=None,
                 clean=True,
                 region2=None,
                 start2=None,
                 end2=None,
                 nchunks=100,
                 tmpdir='.',
                 ncpus=8,
                 verbose=True,
                 window=None):
    """
    Write the upper half of the interaction matrix of a BAM file as a
    tab-separated file preceded by a commented header.

    The header holds one ``# CHROM`` line per reference sequence, one
    ``# RESOLUTION`` line and one ``# BADCOLS`` line. Each following line
    holds: first bin, second bin, raw value and normalized value.

    :param inbam: path to the input BAM file
    :param resolution: resolution passed to ``read_bam`` and written in the
       header
    :param biases: passed to ``get_biases_region``; when false no
       normalization is applied and the last column repeats the raw value
    :param outfile: path of the file to write, its directory is created
    :param filter_exclude: filters to apply; anything that is not an integer
       is first converted with ``filters_to_bin``
    :param region1, start1, end1, region2, start2, end2, nchunks, ncpus:
       forwarded to ``read_bam``
    :param clean: forwarded to ``_iter_matrix_frags``; when true the
       temporary ``_tmp_<hash>`` folder inside ``tmpdir`` is removed
    :param tmpdir: directory holding the temporary files
    :param verbose: print progress information
    :param None window: which cells to keep. ``'all'`` (or None) keeps
       everything, ``'intra'`` skips cells whose first field is ``''``,
       ``'inter'`` skips cells whose first field is not ``''``, anything
       else is unpacked as a ``(min, max)`` pair bounding ``k - j``

    :returns: the number of header lines written
    """
    # function-scope import: only needed for the final cleanup
    import shutil

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    _, rand_hash, bin_coords, chunks = read_bam(inbam,
                                                filter_exclude,
                                                resolution,
                                                ncpus=ncpus,
                                                region1=region1,
                                                start1=start1,
                                                end1=end1,
                                                region2=region2,
                                                start2=start2,
                                                end2=end2,
                                                tmpdir=tmpdir,
                                                nchunks=nchunks,
                                                verbose=verbose)

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))

    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
        transform = lambda x, c, j, k: x / bias1[j] / bias2[k] / decay[c][abs(
            k - j)]
        transform2 = lambda x, j, k: x / bias1[j] / bias2[k]
    else:
        bads1 = bads2 = {}
        # identity transforms, with the same signatures as the ones above
        transform = lambda x, c, j, k: x
        transform2 = lambda x, j, k: x

    if bads1 is bads2:
        badcols = bads1
    else:  # should never happen
        badcols = bads1
        badcols.update(bads2)

    if verbose:
        printime('  - Writing matrices')

    # None (the default) behaves like 'all': no cell is filtered out
    if window is None or window == 'all':
        outside = lambda c_, j_, k_: False
    elif window == 'intra':
        outside = lambda c_, j_, k_: c_ == ''
    elif window == 'inter':
        outside = lambda c_, j_, k_: c_ != ''
    else:
        min_, max_ = window
        outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_

    mkdir(os.path.split(os.path.abspath(outfile))[0])
    # write the rest of the file to be sorted
    nheader = 0
    with open(outfile, 'w') as out:
        for i, c in enumerate(bamfile.references):
            out.write('# CHROM\t{}\t{}\n'.format(c, bamfile.lengths[i]))
            nheader += 1
        # the BAM header is not needed past this point
        bamfile.close()
        out.write('# RESOLUTION\t{}\n'.format(resolution))
        nheader += 1
        out.write('# BADCOLS\t{}\n'.format(','.join(map(str, badcols.keys()))))
        nheader += 1

        # pull all sub-matrices and write full matrix
        for c, j, k, v in _iter_matrix_frags(chunks,
                                             tmpdir,
                                             rand_hash,
                                             verbose=verbose,
                                             clean=clean):
            # we keep only half matrix
            if k < j or j in badcols or k in badcols:
                continue
            if outside(c, j, k):
                continue
            try:
                n = transform(v, c, j, k)  # normalize
            except KeyError:
                n = transform2(v, j, k)  # normalize no decay
            out.write('{}\t{}\t{}\t{}\n'.format(j, k, v, n))

    # this is the last thing we do in case something goes wrong
    if clean:
        shutil.rmtree(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)),
                      ignore_errors=True)
    return nheader

コード例 #9

0

ファイルを表示

def get_options():
    """
    Build the command line parser, parse the arguments and post-process them.

    After parsing, ``filter`` is replaced by the value returned by
    ``filters_to_bin`` and ``tmpdir`` falls back to ``outdir`` when it was
    not given.

    :returns: the namespace produced by ``ArgumentParser.parse_args``

    :raises Exception: if the 'norm' or 'decay' matrices are requested
       without a biases file
    """
    parser = ArgumentParser(usage="%(prog)s -i PATH -r INT [options]")

    parser.add_argument('-i',
                        '--infile',
                        dest='inbam',
                        metavar='',
                        required=True,
                        default=False,
                        help='input HiC-BAM file.')
    # NOTE(review): the default of outdir is the boolean True, which is
    # presumably not usable as a path downstream -- confirm whether the
    # option should be required instead
    parser.add_argument('-o',
                        '--outdir',
                        dest='outdir',
                        metavar='',
                        default=True,
                        help='output directory.')
    parser.add_argument('-t',
                        '--tarfile',
                        dest='tarfile',
                        metavar='',
                        default=False,
                        help='''skip the generation of files, directly
                        append them to a tar file
                        (does not need to be created).''')
    parser.add_argument('--tmp',
                        dest='tmpdir',
                        metavar='',
                        default=False,
                        help='''path where to store temporary
                        files (by default outdir is used).''')
    parser.add_argument('-r',
                        '--resolution',
                        dest='reso',
                        type=int,
                        metavar='',
                        required=True,
                        help='''wanted resolution form the
                        generated matrix''')
    parser.add_argument('-b',
                        '--biases',
                        dest='biases',
                        metavar='',
                        help='''path to pickle file with array of biases''')
    parser.add_argument('-c',
                        '--coord',
                        dest='coord1',
                        metavar='',
                        default=None,
                        help='''Coordinate of the region to
                        retrieve. By default all genome, arguments can be
                        either one chromosome name, or the coordinate in
                        the form: "-c chr3:110000000-120000000"''')
    parser.add_argument('-c2',
                        '--coord2',
                        dest='coord2',
                        metavar='',
                        default=None,
                        help='''Coordinate of a second region to
                        retrieve the matrix in the intersection with the first
                        region.''')
    parser.add_argument('-C',
                        '--cpus',
                        dest='cpus',
                        metavar='',
                        type=int,
                        default=8,
                        help='''[%(default)s] number of cpus to be
                        used for parsing the HiC-BAM file''')
    parser.add_argument('--matrices',
                        dest='matrices',
                        metavar='',
                        type=str,
                        nargs='+',
                        default=['norm', 'raw', 'decay'],
                        help='''[%(default)s] which matrix to generate''')
    parser.add_argument('-f',
                        '--format',
                        dest='format',
                        default='abc',
                        choices=['abc', 'mat'],
                        required=False,
                        help='''[%(default)s]
                        format in which to write the output matrix (choose from %(choices)s)'''
                        )
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true',
                        help='display no running information')
    # the example in the help uses the real flag name and the two parts of
    # the message are separated by a space
    parser.add_argument(
        '-F',
        '--filter',
        dest='filter',
        nargs='+',
        type=int,
        metavar='INT',
        default=[1, 2, 3, 4, 6, 7, 9, 10],
        choices=range(1, 11),
        help=(
            """[%(default)s] Use filters to define a set of
                        valid pair of reads e.g.:
                        '--filter 1 2 3 4 8 9 10'. Where these numbers """ +
            "correspond to: %s" %
            (', '.join(['%2d: %15s' % (k, MASKED[k]['name'])
                        for k in MASKED]))))
    parser.add_argument('--nchunks',
                        dest='nchunks',
                        action='store',
                        default=None,
                        type=int,
                        help='''maximum number of chunks into which to
                        cut the BAM''')

    opts = parser.parse_args()
    # convert filters to binary for samtools
    opts.filter = filters_to_bin(opts.filter)
    # normalized and decay matrices cannot be computed without biases
    if not opts.biases and ('norm' in opts.matrices
                            or 'decay' in opts.matrices):
        raise Exception('ERROR: should provide path to bias file.')
    if not opts.tmpdir:
        opts.tmpdir = opts.outdir

    return opts

コード例 #10

0

ファイルを表示

ファイル: matrix_from_BAM.py プロジェクト: 3DGenomes/TADbit

def get_options():
    """
    Build the command line parser, parse the arguments and post-process them.

    After parsing, ``filter`` is replaced by the value returned by
    ``filters_to_bin`` and ``tmpdir`` falls back to ``outdir`` when it was
    not given.

    :returns: the namespace produced by ``ArgumentParser.parse_args``

    :raises Exception: if the 'norm' or 'decay' matrices are requested
       without a biases file
    """
    parser = ArgumentParser(usage="%(prog)s -i PATH -r INT [options]")

    parser.add_argument('-i', '--infile', dest='inbam', metavar='',
                        required=True, default=False, help='input HiC-BAM file.')
    # NOTE(review): the default of outdir is the boolean True, which is
    # presumably not usable as a path downstream -- confirm whether the
    # option should be required instead
    parser.add_argument('-o', '--outdir', dest='outdir', metavar='',
                        default=True, help='output directory.')
    parser.add_argument('-t', '--tarfile', dest='tarfile', metavar='',
                        default=False, help='''skip the generation of files, directly
                        append them to a tar file
                        (does not need to be created).''')
    parser.add_argument('--tmp', dest='tmpdir', metavar='',
                        default=False, help='''path where to store temporary
                        files (by default outdir is used).''')
    parser.add_argument('-r', '--resolution', dest='reso', type=int, metavar='',
                        required=True, help='''wanted resolution form the
                        generated matrix''')
    parser.add_argument('-b', '--biases', dest='biases', metavar='',
                        help='''path to pickle file with array of biases''')
    parser.add_argument('-c', '--coord', dest='coord1',  metavar='',
                        default=None, help='''Coordinate of the region to
                        retrieve. By default all genome, arguments can be
                        either one chromosome name, or the coordinate in
                        the form: "-c chr3:110000000-120000000"''')
    parser.add_argument('-c2', '--coord2', dest='coord2',  metavar='',
                        default=None, help='''Coordinate of a second region to
                        retrieve the matrix in the intersection with the first
                        region.''')
    parser.add_argument('-C', '--cpus', dest='cpus', metavar='', type=int,
                        default=8, help='''[%(default)s] number of cpus to be
                        used for parsing the HiC-BAM file''')
    parser.add_argument('--matrices', dest='matrices', metavar='', type=str,
                        nargs='+', default=['norm', 'raw', 'decay'],
                        help='''[%(default)s] which matrix to generate''')
    parser.add_argument('-f', '--format', dest='format', default='abc',
                        choices=['abc', 'mat'], required=False, help='''[%(default)s]
                        format in which to write the output matrix (choose from %(choices)s)''')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true',
                        help='display no running information')
    # the example in the help uses the real flag name and the two parts of
    # the message are separated by a space
    parser.add_argument('-F', '--filter', dest='filter', nargs='+',
                        type=int, metavar='INT', default=[1, 2, 3, 4, 6, 7, 9, 10],
                        choices = range(1, 11),
                        help=("""[%(default)s] Use filters to define a set of
                        valid pair of reads e.g.:
                        '--filter 1 2 3 4 8 9 10'. Where these numbers """ +
                              "correspond to: %s" % (', '.join(
                                  ['%2d: %15s' % (k, MASKED[k]['name'])
                                   for k in MASKED]))))
    parser.add_argument('--nchunks', dest='nchunks', action='store', default=None,
                        type=int,
                        help='''maximum number of chunks into which to
                        cut the BAM''')

    opts = parser.parse_args()
    # convert filters to binary for samtools
    opts.filter = filters_to_bin(opts.filter)
    # normalized and decay matrices cannot be computed without biases
    if not opts.biases and ('norm' in opts.matrices or
                            'decay' in opts.matrices):
        raise Exception('ERROR: should provide path to bias file.')
    if not opts.tmpdir:
        opts.tmpdir = opts.outdir

    return opts

コード例 #11

0

ファイルを表示

ファイル: TADbit_to_HICUP_bam.py プロジェクト: 3DGenomes/TADbit

def main():
    """
    Write one tab-separated text line per kept read-end of the TADbit BAM.

    The mapped reads are processed in chunks yielded by ``get_mapped_chunk``
    (chunk size: ``opts.nreads`` millions). For each chunk the TADbit BAM is
    read again from the start, read-ends matching the exclusion filters or
    absent from the chunk are skipped, and the remaining ones are written to
    ``opts.hicup_bam``.
    """
    opts = get_options()
    filter_exclude = filters_to_bin(opts.filter)
    tadbit_bam = opts.tadbit_bam
    hicup_bam = opts.hicup_bam
    map_folder = opts.map_folder
    nreads = opts.nreads * 1_000_000

    # flags indexed by the (S1, S2) tags of the read-ends; only the first
    # element of each pair is used below
    tag_dict = {
        (1, 1): (67, 131),
        (0, 0): (115, 179),
        (1, 0): (99, 147),
        (0, 1): (83, 163),
    }

    # context manager: the output is flushed and closed even on error
    with open(hicup_bam, 'w') as out:
        for seqs in get_mapped_chunk(map_folder, nreads):
            bamfile = AlignmentFile(tadbit_bam, 'rb')
            try:
                refs = bamfile.references
                printime(f' - processing BAM (for {len(seqs) / 1_000_000}M reads)')
                for r in bamfile.fetch(multiple_iterators=False):
                    if r.flag & filter_exclude:
                        continue
                    rid = r.qname
                    ridname = rid.split('#')[0]
                    pos1 = r.reference_start + 1
                    which, len1 = r.cigar[0]
                    tags = dict(r.tags)
                    if which == 6:  # first read-end
                        s1, s2 = tags['S1'], tags['S2']
                    else:
                        s2, s1 = tags['S1'], tags['S2']
                    if s1 == 0:
                        pos1 = pos1 - len1 + 1
                    # read-ends that are not part of this chunk are skipped
                    try:
                        seq, qal = seqs[ridname, pos1]
                    except KeyError:
                        continue
                    crm1 = r.reference_name
                    crm2 = refs[r.mrnm]
                    pos2 = r.mpos + 1
                    len2 = r.tlen

                    # distance is only reported within a same reference
                    dist = 0 if crm1 != crm2 else abs(pos2 - pos1)

                    if s2 == 0:
                        pos2 = pos2 - len2 + 1

                    flag = tag_dict[s1, s2][0]

                    out.write((f'{r.qname}\t{flag}\t{crm1}\t{pos1}\t{len1}\t'
                               f'{len(seq)}M\t{crm2}\t{pos2}\t{dist}\t{seq}\t'
                               f'{qal}\tMD:Z:{len1}\tPG:Z:MarkDuplicates\tNM:i:0\t'
                               f'AS:i:{len1}\tXS:i:1\n'))
            finally:
                # the BAM is reopened for every chunk, never leave it open
                bamfile.close()
            seqs.clear()