Example #1
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    # prepare output folders
    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       'chr%s_%s-%s' % (opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # load data
    crm = load_hic_data(opts)
    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # if jobs are only to be written to a job list rather than run now
    if opts.job_list:
        job_file_handler = open(path.join(outdir, 'job_list.q'), 'w')
    else:
        job_file_handler = None

    # optimization
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        return

    # correlate all optimizations and get the best set of parameters
    optpar, dcutoff = correlate_models(opts, outdir, exp)

    # run good models
    big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()
Example #2
def check_options(opts):
    # convert genomic coordinates to bin numbers
    try:
        opts.beg = int(float(opts.beg) / opts.reso)
        opts.end = int(float(opts.end) / opts.reso)
        if opts.end - opts.beg <= 2:
            raise Exception('"beg" and "end" parameters should be given in '
                            'genomic coordinates, not bins')
    except TypeError:
        pass

    # turn options into lists
    opts.scale   = (tuple(arange(*[float(s) for s in opts.scale.split(':')  ]))
                    if ':' in opts.scale   else [float(opts.scale  )])
    
    opts.maxdist = (tuple(range (*[int  (i) for i in opts.maxdist.split(':')]))
                    if ':' in opts.maxdist else [int  (opts.maxdist)])

    opts.upfreq  = (tuple(arange(*[float(i) for i in opts.upfreq.split(':') ]))
                    if ':' in opts.upfreq  else [float(opts.upfreq )])

    opts.lowfreq = (tuple(arange(*[float(i) for i in opts.lowfreq.split(':')]))
                    if ':' in opts.lowfreq else [float(opts.lowfreq)])

    opts.dcutoff = (tuple(arange(*[float(i) for i in opts.dcutoff.split(':')]))
                    if ':' in opts.dcutoff else [float(opts.dcutoff)])

    opts.nmodels_run = opts.nmodels_run or opts.nmodels

    opts.matrix  = path.abspath(opts.matrix)
    opts.workdir = path.abspath(opts.workdir)

    mkdir(opts.workdir)
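
A quick sketch of the colon-range parsing used above, with made-up option strings (assumes numpy, which provides the arange used by check_options):

from numpy import arange

def parse_range(value, cast=float):
    # 'start:stop:step' expands to a tuple of candidates; a single
    # value becomes a one-element list, mirroring check_options above
    if ':' in value:
        return tuple(arange(*[cast(v) for v in value.split(':')]))
    return [cast(value)]

print(parse_range('0.005:0.02:0.005'))   # scales 0.005, 0.01, 0.015
print(parse_range('400:1000:200', int))  # maxdist candidates 400, 600, 800
print(parse_range('2'))                  # [2.0]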
Example #3
def optimization(exp, opts, job_file_handler, outdir):
    models = compile_models(opts, outdir)
    print 'Optimizing parameters...'
    print ('# %3s %6s %7s %7s %6s\n' % (
        "num", "upfrq", "lowfrq", "maxdist",
        "scale"))
    for m, u, l, s in product(opts.maxdist, opts.upfreq, opts.lowfreq, opts.scale):
        muls = tuple(map(my_round, (m, u, l, s)))
        if muls in models:
            print('%5s %6s %7s %7s %6s  ' % ('x', u, l, m, s))
            continue
        elif opts.job_list:
            print('%5s %6s %7s %7s %6s  ' % ('o', u, l, m, s))
        else:
            print('%5s %6s %7s %7s %6s  ' % ('-', u, l, m, s))
        mkdir(path.join(outdir, 'cfg_%s_%s_%s_%s' % muls))

        # write list of jobs to be run separately
        if opts.job_list:
            for rand in xrange(1, opts.nmodels + 1, opts.nmodels_run):
                write_one_job(opts, rand, m, u, l, s, job_file_handler)
            continue

        # compute models
        try:
            run_batch_job(exp, opts, m, u, l, s, outdir)
        except TADbitModelingOutOfBound:
            warn('WARNING: scale (here %s) times resolution (here %d) should '
                 'be lower than maxdist (here %d, should be at least %d)' % (
                     s, opts.reso, m, s * opts.reso))
            continue

    if opts.job_list:
        job_file_handler.close()
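
The grid search above is an exhaustive product over the four parameter lists built by check_options. A minimal sketch with hypothetical values (not TADbit defaults):

from itertools import product

maxdist = [200, 400]
upfreq  = [0.0, 0.2]
lowfreq = [-0.4]
scale   = [0.01]
done    = set([(200, 0.0, -0.4, 0.01)])  # combinations already computed

for m, u, l, s in product(maxdist, upfreq, lowfreq, scale):
    status = 'x' if (m, u, l, s) in done else '-'  # 'x' means skip
    print('%s maxdist=%s upfreq=%s lowfreq=%s scale=%s' % (status, m, u, l, s))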
Example #4
def main():
    opts          = get_options()

    inbam          = opts.inbam
    resolution     = opts.reso
    filter_exclude = opts.filter
    min_count      = opts.min_count
    ncpus          = opts.cpus
    factor         = 1
    outdir         = opts.outdir
    sigma          = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam, filter_exclude, resolution,
                                     min_count=min_count, ncpus=ncpus, sigma=sigma,
                                     factor=factor, outdir=outdir, check_sum=opts.check_sum)

    printime('  - Saving biases and badcol columns')
    # biases
    out = open(os.path.join(outdir, 'biases_%s.pickle' % (
        nicer(resolution).replace(' ', ''))), 'w')

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': resolution}, out)
    out.close()

    # hic_data.write_matrix('chr_names%s_%d-%d.mat' % (region, start, end), focus=())
    printime('\nDone.')
Example #5
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Getting insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
        
        print '  - median insert size =', median
        print '  - median absolute deviation of insert size =', mad
        print '  - max insert size (when a gap of > 10 bp is found in the continuity of fragment lengths) =', max_f
    
        max_mole = max_f # pseudo DEs
        min_dist = max_f + mad # random breaks
        print ('   Using the maximum continuous fragment size '
               '(%d bp) to check '
               'for pseudo-dangling ends') % max_mole
        print ('   Using maximum continuous fragment size plus the MAD '
               '(%d bp) to check for random breaks') % min_dist
    
        print "identify pairs to filter..."
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked,
                                 filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
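
The two filtering thresholds derived above are simple functions of the insert-size statistics; with made-up numbers (not real insert_sizes output):

median, max_f, mad = 350, 800, 60
max_mole = max_f        # pseudo dangling-ends: longest continuous fragment
min_dist = max_f + mad  # random breaks: fragment size plus a noise margin
print('%d %d' % (max_mole, min_dist))  # 800 860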
Example #6
def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # set plotting parameters
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        if opts.nox:
            raise Exception('ERROR: no screen no fun.\n'
                            'Interactive plot incompatible with noX option.')
        opts.plot = True
        opts.only_plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # check triangular option compatibility
    if opts.triangular and opts.coord2:
        raise NotImplementedError('ERROR: triangular is only available for '
                                  'symmetric matrices.')

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if not opts.force:
                if 'tmpdb' in opts and opts.tmpdb:
                    remove(path.join(dbdir, dbfile))
                exit('WARNING: exact same job already computed, see JOBs table above')
            else:
                warn('WARNING: exact same job already computed, overwriting...')
    except IOError:
        warn((""
              "\nWARNING:\n  new working directory created. It's ok... "
              "but next time use TADbit from the beginning!! :)"))
Example #7
def check_options(opts):
    mkdir(opts.workdir)

    # create an empty DB if it doesn't exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
        if opts.workdir1:
            # tmp file
            dbfile1 = 'trace1_%s' % (''.join([ascii_letters[int(random() * 52)]
                                              for _ in range(10)]))
            opts.tmpdb1 = path.join(dbdir, dbfile1)
            try:
                copyfile(path.join(opts.workdir1, 'trace.db'), opts.tmpdb1)
            except IOError:
                pass
        if opts.workdir2:
            # tmp file
            dbfile2 = 'trace2_%s' % (''.join([ascii_letters[int(random() * 52)]
                                              for _ in range(10)]))
            opts.tmpdb2 = path.join(dbdir, dbfile2)
            try:
                copyfile(path.join(opts.workdir2, 'trace.db'), opts.tmpdb2)
            except IOError:
                pass
    else:
        if opts.workdir1:
            opts.tmpdb1 = path.join(opts.workdir1, 'trace.db')
        if opts.workdir2:
            opts.tmpdb2 = path.join(opts.workdir2, 'trace.db')

    # resolution needed to compare
    if not opts.skip_comparison and not opts.reso:
        raise Exception('ERROR: need to define resolution at which to compare')

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
            if opts.workdir1:
                remove(path.join(dbdir, dbfile1))
            if opts.workdir2:
                remove(path.join(dbdir, dbfile2))
        exit('WARNING: exact same job already computed, see JOBs table above')
Example #8
def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # set plotting parameters
    if opts.only_plot:
        opts.plot = True
    if opts.interactive:
        opts.plot = True
        opts.only_plot = True

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if not opts.force:
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit(
                'WARNING: exact same job already computed, see JOBs table above'
            )
        else:
            warn('WARNING: exact same job already computed, overwriting...')
Example #9
def check_options(opts):
    mkdir(opts.workdir)

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
        if opts.workdir1:
            # tmp file
            dbfile1 = 'trace1_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb1 = path.join(dbdir, dbfile1)
            try:
                copyfile(path.join(opts.workdir1, 'trace.db'), opts.tmpdb1)
            except IOError:
                pass
        if opts.workdir2:
            # tmp file
            dbfile2 = 'trace2_%s' % (''.join(
                [ascii_letters[int(random() * 52)] for _ in range(10)]))
            opts.tmpdb2 = path.join(dbdir, dbfile2)
            try:
                copyfile(path.join(opts.workdir2, 'trace.db'), opts.tmpdb2)
            except IOError:
                pass
    else:
        opts.tmpdb1 = path.join(opts.workdir1, 'trace.db')
        opts.tmpdb2 = path.join(opts.workdir2, 'trace.db')

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
            if opts.workdir1:
                remove(path.join(dbdir, dbfile1))
            if opts.workdir2:
                remove(path.join(dbdir, dbfile2))
        exit('WARNING: exact same job already computed, see JOBs table above')
Example #10
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    # prepare output folders
    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       'chr%s_%s-%s' % (opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))
        
    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # if jobs are only to be written to a job list rather than run now
    if opts.job_list:
        job_file_handler = open(path.join(outdir, 'job_list.q'), 'w')
    else:
        job_file_handler = None

    # optimization
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        return

    # correlate all optimizations and get the best set of parameters
    optpar, dcutoff = correlate_models(opts, outdir, exp)

    # run good models
    big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()
Example #11
def main():
    opts = get_options()

    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    min_count = opts.min_count
    ncpus = opts.cpus
    factor = 1
    outdir = opts.outdir
    sigma = 2

    mkdir(outdir)

    sys.stdout.write('\nNormalization of full genome\n')

    biases, decay, badcol = read_bam(inbam,
                                     filter_exclude,
                                     resolution,
                                     min_count=min_count,
                                     ncpus=ncpus,
                                     sigma=sigma,
                                     factor=factor,
                                     outdir=outdir,
                                     check_sum=opts.check_sum)

    printime('  - Saving biases and badcol columns')
    # biases
    out = open(
        os.path.join(outdir, 'biases_%s.pickle' %
                     (nicer(resolution).replace(' ', ''))), 'w')

    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': resolution
        }, out)
    out.close()

    # hic_data.write_matrix('chr_names%s_%d-%d.mat' % (region, start, end), focus=())
    printime('\nDone.')
Example #12
def check_options(opts):
    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')
    # convert genomic coordinates to bin numbers
    try:
        opts.beg = int(float(opts.beg) / opts.reso)
        opts.end = int(float(opts.end) / opts.reso)
        if opts.end - opts.beg <= 2:
            raise Exception('"beg" and "end" parameters should be given in '
                            'genomic coordinates, not bins')
    except TypeError:
        pass

    # turn options into lists
    opts.scale   = (tuple(arange(*[float(s) for s in opts.scale.split(':')  ]))
                    if ':' in opts.scale   else [float(opts.scale  )])
    
    opts.maxdist = (tuple(range (*[int  (i) for i in opts.maxdist.split(':')]))
                    if ':' in opts.maxdist else [int  (opts.maxdist)])

    opts.upfreq  = (tuple(arange(*[float(i) for i in opts.upfreq.split(':') ]))
                    if ':' in opts.upfreq  else [float(opts.upfreq )])

    opts.lowfreq = (tuple(arange(*[float(i) for i in opts.lowfreq.split(':')]))
                    if ':' in opts.lowfreq else [float(opts.lowfreq)])

    opts.dcutoff = (tuple(arange(*[float(i) for i in opts.dcutoff.split(':')]))
                    if ':' in opts.dcutoff else [float(opts.dcutoff)])

    opts.nmodels_run = opts.nmodels_run or opts.nmodels

    if opts.matrix:
        opts.matrix  = path.abspath(opts.matrix)
    opts.workdir = path.abspath(opts.workdir)

    mkdir(opts.workdir)
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
Example #13
def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # check custom normalization
    if opts.normalization == 'custom':
        if not opts.biases_path:
            raise IOError('ERROR: biases file required for "custom" normalization.')
        elif not path.exists(opts.biases_path):
            raise IOError('ERROR: biases not found at path: %s' % opts.biases_path)

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit('WARNING: exact same job already computed, see JOBs table above')
    except IOError:  # new working directory
        pass
Example #14
def check_options(opts):
    mkdir(opts.workdir)

    # transform filtering reads option
    opts.filter = filters_to_bin(opts.filter)

    # check custom normalization
    if opts.normalization == 'custom':
        if not opts.biases_path:
            raise IOError('ERROR: biases file required for "custom" normalization.')
        elif not path.exists(opts.biases_path):
            raise IOError('ERROR: biases not found at path: %s' % opts.biases_path)

    # check resume
    if not path.exists(opts.workdir):
        raise IOError('ERROR: workdir not found.')

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit('WARNING: exact same job already computed, see JOBs table above')
    except IOError:  # new working directory
        pass
Example #15
def check_options(opts):
    mkdir(opts.workdir)

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join([ascii_letters[int(random() * 52)]
                                        for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass
        if opts.workdir1:
            # tmp file
            dbfile1 = 'trace1_%s' % (''.join([ascii_letters[int(random() * 52)]
                                              for _ in range(10)]))
            opts.tmpdb1 = path.join(dbdir, dbfile1)
            try:
                copyfile(path.join(opts.workdir1, 'trace.db'), opts.tmpdb1)
            except IOError:
                pass
        if opts.workdir2:
            # tmp file
            dbfile2 = 'trace2_%s' % (''.join([ascii_letters[int(random() * 52)]
                                              for _ in range(10)]))
            opts.tmpdb2 = path.join(dbdir, dbfile2)
            try:
                copyfile(path.join(opts.workdir2, 'trace.db'), opts.tmpdb2)
            except IOError:
                pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
            if opts.workdir1:
                remove(path.join(dbdir, dbfile1))
            if opts.workdir2:
                remove(path.join(dbdir, dbfile2))
        exit('WARNING: exact same job already computed, see JOBs table above')
Example #16
def check_options(opts):
    if not path.exists(opts.workdir):
        mkdir(opts.workdir)
        # write version log
        vlog_path = path.join(opts.workdir,
                              'TADbit_and_dependencies_versions.log')
        dependencies = get_dependencies_version()
        if not path.exists(
                vlog_path) or open(vlog_path).read() != dependencies:
            logging.info('Writing versions of TADbit and dependencies')
            vlog = open(vlog_path, 'w')
            vlog.write(dependencies)
            vlog.close()

    mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # create an empty DB if it doesn't exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())
Example #17
def write_matrix(inbam,
                 resolution,
                 biases,
                 outfile,
                 filter_exclude=(1, 2, 3, 4, 6, 7, 8, 9, 10),
                 region1=None,
                 start1=None,
                 end1=None,
                 clean=True,
                 region2=None,
                 start2=None,
                 end2=None,
                 nchunks=100,
                 tmpdir='.',
                 ncpus=8,
                 verbose=True,
                 window=None):

    if not isinstance(filter_exclude, int):
        filter_exclude = filters_to_bin(filter_exclude)

    _, rand_hash, bin_coords, chunks = read_bam(inbam,
                                                filter_exclude,
                                                resolution,
                                                ncpus=ncpus,
                                                region1=region1,
                                                start1=start1,
                                                end1=end1,
                                                region2=region2,
                                                start2=start2,
                                                end2=end2,
                                                tmpdir=tmpdir,
                                                nchunks=nchunks,
                                                verbose=verbose)

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))

    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    if biases:
        bias1, bias2, decay, bads1, bads2 = get_biases_region(
            biases, bin_coords)
        transform = lambda x, c, j, k: x / bias1[j] / bias2[k] / decay[c][abs(
            k - j)]
        transform2 = lambda x, j, k: x / bias1[j] / bias2[k]
    else:
        bads1 = bads2 = {}
        transform = lambda x, c, j, k: x  # identity: no biases provided
        transform2 = lambda x, j, k: x

    if bads1 is bads2:
        badcols = bads1
    else:  # should never happen
        badcols = bads1
        badcols.update(bads2)

    if verbose:
        printime('  - Writing matrices')

    mkdir(os.path.split(os.path.abspath(outfile))[0])
    # open the output file and write its header
    out = open(outfile, 'w')
    nheader = 0
    for i, c in enumerate(bamfile.references):
        out.write('# CHROM\t{}\t{}\n'.format(c, bamfile.lengths[i]))
        nheader += 1
    out.write('# RESOLUTION\t{}\n'.format(resolution))
    nheader += 1
    out.write('# BADCOLS\t{}\n'.format(','.join(map(str, badcols.keys()))))
    nheader += 1

    if window is None or window == 'all':
        outside = lambda c_, j_, k_: False
    elif window == 'intra':
        outside = lambda c_, j_, k_: c_ == ''
    elif window == 'inter':
        outside = lambda c_, j_, k_: c_ != ''
    else:
        min_, max_ = window
        outside = lambda c_, j_, k_: (k_ - j_) < min_ or (k_ - j_) > max_

    # pull all sub-matrices and write full matrix
    for c, j, k, v in _iter_matrix_frags(chunks,
                                         tmpdir,
                                         rand_hash,
                                         verbose=verbose,
                                         clean=clean):
        if k < j or j in badcols or k in badcols:  # we keep only half matrix
            continue
        if outside(c, j, k):
            continue
        try:
            n = transform(v, c, j, k)  # normalize
        except KeyError:
            n = transform2(v, j, k)  # normalize no decay
        out.write('{}\t{}\t{}\t{}\n'.format(j, k, v, n))
    out.close()

    # this is the last thing we do in case something goes wrong
    if clean:
        os.system('rm -rf %s' % (os.path.join(tmpdir, '_tmp_%s' %
                                              (rand_hash))))
    return nheader
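
The transform lambdas in write_matrix divide each raw count by the two column biases and by the decay expected at that genomic distance. A toy illustration with made-up values:

bias1 = {0: 1.2, 1: 0.8}
bias2 = {0: 1.2, 1: 0.8}
decay = {'chr1': {0: 5.0, 1: 2.0}}

def transform(x, c, j, k):
    # raw count corrected by column biases and distance decay
    return x / bias1[j] / bias2[k] / decay[c][abs(k - j)]

print(transform(10.0, 'chr1', 0, 1))  # 10 / 1.2 / 0.8 / 2.0 ~ 5.21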
Example #18
def read_bam(inbam, filter_exclude, resolution, min_count=2500, biases_path='',
             normalization='Vanilla', mappability=None, n_rsites=None,
             cg_content=None, sigma=2, ncpus=8, factor=1, outdir='.', seed=1,
             extra_out='', only_valid=False, normalize_only=False, p_fit=None,
             max_njobs=100, min_perc=None, max_perc=None, extra_bads=None):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin   = len(bins)
    total     = len(bins)

    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(pool.apply_async(
            read_bam_frag, args=(inbam, filter_exclude, bins, bins_dict,
                                 resolution, outdir, extra_out,
                                 region, start, end,)))
    pool.close()
    print_progress(procs)
    pool.join()
    ## COLLECT RESULTS
    cisprc = {}
    printime('  - Collecting cis and total interactions per bin (%d chunks)' % (len(regs)))
    stdout.write('     ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n     ' % ('%s/%s' % (countbin , len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(outdir,
                          'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime('  - Removing columns with too few or too many interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: with a single chromosome, can't filter by "
                        "cis-percentage; set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc, sigma=sigma, verbose=True, min_perc=min_perc, max_perc=max_perc,
            size=total, savefig=None)
    else:
        print ('      -> too few interactions defined as less than %9d '
               'interactions') % (min_count)
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d null / %d low counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total, float(len(badcol)) / total * 100)

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime('  - Removed %d columns manually.' % removed_manually)
    raw_cisprc = sum(float(cisprc[k][0]) / cisprc[k][1]
                     for k in cisprc if not k in badcol) / (len(cisprc) - len(badcol))

    printime('  - Rescaling sum of interactions per bins')
    size = len(bins)
    biases = [float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
              for k in xrange(size)]

    if normalization == 'ICE':
        printime('  - ICE normalization')
        hic_data = load_hic_data_from_bam(
            inbam, resolution, filter_exclude=filter_exclude,
            tmpdir=outdir, ncpus=ncpus)
        hic_data.bads = badcol
        hic_data.normalize_hic(iterations=100, max_dev=0.000001)
        biases = hic_data.bias.copy()
        del(hic_data)
    elif normalization == 'Vanilla':
        printime('  - Vanilla normalization')
        mean_col = nanmean(biases)
        biases   = dict((k, b / mean_col * mean_col**0.5)
                        for k, b in enumerate(biases))
    elif normalization == 'SQRT':
        printime('  - Vanilla-SQRT normalization')
        biases = [b**0.5 for b in biases]
        mean_col = nanmean(biases)
        biases   = dict((k, b / mean_col * mean_col**0.5)
                        for k, b in enumerate(biases))
    elif normalization == 'oneD':
        printime('  - oneD normalization')
        if len(set([len(biases), len(mappability), len(n_rsites), len(cg_content)])) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir,'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD, p_fit=p_fit, tot=biases, map=mappability,
                      res=n_rsites, cg=cg_content, seed=seed)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    elif normalization == 'custom':
        n_pos = 0
        biases = {}
        print 'Using provided biases...'
        with open(biases_path, 'r') as r:
            r.next()
            for line in r:
                if line[0] == 'N':
                    #b = float('nan')
                    badcol[n_pos] = 0
                    biases[n_pos] = float('nan')
                else:
                    b = float(line)
                    if b == 0:
                        badcol[n_pos] = 0
                        biases[n_pos] = float('nan')
                    else:
                        biases[n_pos] = b
                n_pos += 1
        for add in range(max(biases.keys()), total + 1):
            biases[add] = float('nan')
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime('  - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir,
                          'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix,
                                      args=(fname, biases,)))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])

    if not normalize_only:
        printime('  - Computing Cis percentage')
        # Calculate Cis percentage

        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(outdir,
                              'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
            procs.append(pool.apply_async(get_cis_perc,
                                          args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()

        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print '    * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.

    printime('  - Rescaling decay')
    # normalize decay by size of the diagonal, and by Vanilla correction
    # (all cells must still be equal to 1 on average)

    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(outdir,
                          'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_dec_matrix,
                                      args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:
                    try:
                        nrmdec[c][k]  = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}
    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict((c, section_pos[c][1] - section_pos[c][0]) for c in section_pos)
    # initialize dictionary
    ndiags = dict((c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove bad columns
            # bad columns will only affect if they are at least as distant from
            # a border as the distance between the longest diagonal and the
            # current diagonal.
            bad_diag = set()  # 2 bad rows can point to the same bad cell in diagonal
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr for b in thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise ** -2. # equals 400 when default
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # store count by diagonal
        tmpsum = 0  # store count by diagonal
        ndiag  = 0
        val    = 0
        previous = [] # store diagonals to be summed in case not reaching the minimum
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for l in previous:
                        nrmdec[crm][l] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
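
The decay-rescaling loop at the end of read_bam pools consecutive diagonals until their summed raw count passes min_n = signal_to_noise ** -2 (400 with the 0.05 default), so every reported decay value is backed by enough observations. A simplified version of that rule with hypothetical per-diagonal sums:

signal_to_noise = 0.05
min_n = signal_to_noise ** -2.   # 400
raw = {0: 350, 1: 80, 2: 500}    # hypothetical raw counts per diagonal
pooled, previous, tmpsum = {}, [], 0
for k in sorted(raw):
    tmpsum += raw[k]
    previous.append(k)
    if tmpsum > min_n:           # enough signal: share it over the pool
        for l in previous:
            pooled[l] = tmpsum / float(len(previous))
        previous, tmpsum = [], 0
print(pooled)  # {0: 215.0, 1: 215.0, 2: 500.0}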
Example #19
def check_options(opts):

    if not opts.workdir: raise Exception('ERROR: output option required.')
    if opts.type != 'map':
        raise NotImplementedError('ERROR: not yet there')

    if not opts.genome: raise Exception('ERROR: genome parameter required.')

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: no output files found, nothing to skip...')
        opts.skip = False

    if opts.workdir.endswith('/'):
        opts.workdir = opts.workdir[:-1]

    # write log
    newbie = False
    if not path.exists(opts.workdir):
        newbie = True
        mkdir(opts.workdir)
    log_format = '[PARSING]   %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(
            vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit(
                'WARNING: exact same job already computed, see JOBs table above'
            )
    except OSError:
        pass
Example #20
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Getting insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(reads,
                                          nreads=1000000,
                                          stats=('median', 'first_decay',
                                                 'MAD'),
                                          savefig=hist_path)

        print '  - median insert size =', median
        print '  - median absolute deviation of insert size =', mad
        print '  - max insert size (when a gap of > 10 bp is found in the continuity of fragment lengths) =', max_f

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print(
            '   Using the maximum continuous fragment size '
            '(%d bp) to check '
            'for pseudo-dangling ends') % max_mole
        print(
            '   Using maximum continuous fragment size plus the MAD '
            '(%d bp) to check for random breaks') % min_dist

        print "identify pairs to filter..."
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=0.001,
                              max_frag_size=100000,
                              min_frag_size=50,
                              re_proximity=5,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
Example #21
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                         for l in open(biases))

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs, 
                   launch_time, finish_time)
Example #22
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None, frag_map=True,
                 min_seq_len=15, windows=None, add_site=True, clean=False,
                 get_nread=False, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False get_nread: returns a list of lists where each element contains
       a path and the number of reads processed

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=(   input_reads.endswith('.fastq'   )
                   or input_reads.endswith('.fastq.gz')
                   or input_reads.endswith('.fq.gz'   )
                   or input_reads.endswith('.dsrc'    )),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix)
        if end:
            print 'Mapping reads in window %s-%s%s...' % (beg, end, suffix)
        else:
            print 'Mapping full reads...', curr_map

        if not skip:
            _gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
            # parse map file to extract not uniquely mapped reads
            print 'Parsing result...'
            _gem_filter(out_map_path,
                        curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix),
                        os.path.join(out_map_dir,
                                     base_name + '_full_%s-%s%s.map' % (
                                         beg, end, suffix)))
            # clean
            if clean:
                print '   x removing GEM input %s' % curr_map
                os.system('rm -f %s' % (curr_map))
                print '   x removing map %s' % out_map_path
                os.system('rm -f %s' % (out_map_path))
            # for next round, we will use remaining unmapped reads
            input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix)
        outfiles.append(
            (os.path.join(out_map_dir,
                          base_name + '_full_%s-%s%s.map' % (beg, end, suffix)),
             counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print '   x removing pre-GEM input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix)
        if not skip:
            print 'Mapping fragments of remaining reads...'
            _gem_mapping(gem_index_path, frag_map, out_map_path, **kwargs)
            print 'Parsing result...'
            _gem_filter(out_map_path, curr_map + '_fail%s.map' % (suffix),
                        os.path.join(out_map_dir,
                                     base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)))
        # clean
        if clean:
            print '   x removing GEM input %s' % frag_map
            os.system('rm -f %s' % (frag_map))
            print '   x removing failed to map ' + curr_map + '_fail%s.map' % (suffix)
            os.system('rm -f %s' % (curr_map + '_fail%s.map' % (suffix)))
            print '   x removing tmp mapped %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        outfiles.append((os.path.join(out_map_dir,
                                      base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)),
                         counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
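A minimal usage sketch for the function above; every path below is
hypothetical and only illustrates the call signature (this is not part of
the original example):

# hypothetical paths, shown only to illustrate the call above
outfiles = full_mapping(
    '/data/index/hg38.gem',               # gem_index_path
    '/data/reads/sample_r1.fastq.gz',     # fastq_path
    '/data/project/01_mapped_r1',         # out_map_dir
    r_enz='HindIII',                      # required because frag_map is True
    windows=((1, 25), (1, 50), (1, 75)),  # classical iterative mapping
    temp_dir='/scratch/tmp',
    nthreads=8,
    get_nread=True)
for out_path, nreads in outfiles:
    print('%s: %d reads processed' % (out_path, nreads))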
Exemple #23
0
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # a cPickled genome can be loaded instead of a FASTA to make it faster
        genome = load(open(opts.genome[0]))
    except UnpicklingError:
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
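The skip branch above recovers read counts from '# MAPPED' header lines at
the top of the parsed TSV. A self-contained sketch of that scan on a
hand-made header (the header content is an assumption inferred from the
parsing code, not taken from parse_map's documentation):

example_lines = [
    '# MAPPED 25 1234567\n',        # assumed format: '# MAPPED <item> <count>'
    '# MAPPED 50 234567\n',
    '# some other comment\n',       # non-MAPPED comment lines are ignored
    'read1\tchr1\t3000254\t...\n',  # first data line ends the header scan
]
counts = {}
for line in example_lines:
    if line.startswith('# MAPPED '):
        _, _, item, value = line.split()
        counts[item] = int(value)
    elif not line.startswith('#'):
        break
print(counts)  # {'25': 1234567, '50': 234567}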
Exemple #24
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        if opts.fast_fragment:
            reads = fname1
            counts_multis = [
                '#' in line.split('\t')[0] for line in open(reads)
            ]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(
                [count_mult for count_mult in counts_multis if count_mult])
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1,
                                                fname2,
                                                reads,
                                                compress=opts.compress_input)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        try:
            median, max_f, mad = fragment_size(reads,
                                               nreads=1000000,
                                               stats=('median', 'first_decay',
                                                      'MAD'),
                                               savefig=hist_path)
        except ZeroDivisionError:
            warn('WARNING: cannot compute fragment length, too few '
                 'dangling-ends. Setting median length to 400 nt.')
            median = max_f = mad = 0
        if median < 50:
            warn('WARNING: fragment length too short ({}). '
                 'Setting median length to 400 nt.'.format(median))
            median, max_f, mad = 400, 100, 40
        if opts.median:
            median = opts.median
        if opts.max_f:
            max_f = opts.max_f
        if opts.mad:
            mad = opts.mad

        print('  - median insert size =', median)
        print('  - median absolute deviation of insert size =', mad)
        print(
            '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =',
            max_f)

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check for pseudo-dangling ends' % max_mole)
        print('   Using the maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              strict_duplicates=opts.strict_duplicates,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)
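The two filtering thresholds used above are direct functions of the
fragment-size statistics; a short sketch with invented numbers:

# invented statistics, standing in for what fragment_size returns above
median, max_f, mad = 400, 750, 40

max_mole = max_f        # below this, a read pair can sit on one molecule:
                        # used to flag pseudo dangling-ends
min_dist = max_f + mad  # reads mapping farther than this from any RE site
                        # are flagged as random breaks
print('max molecule length (pseudo dangling-ends): %d bp' % max_mole)
print('min distance to RE site (random breaks):    %d bp' % min_dist)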
Exemple #25
0
def read_bam(inbam, filter_exclude, resolution, ncpus=8,
             region1=None, start1=None, end1=None,
             region2=None, start2=None, end2=None, nchunks=None,
             tmpdir='.', verbose=True, normalize=False, max_size=None):

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(zip(bamfile.references,
                               [x / resolution + 1 for x in bamfile.lengths]))
    # get chromosomes and genome sizes
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    # define genomic bins
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])
    if not bins:
        raise Exception('ERROR: Chromosome %s smaller than bin size\n' % (crm))

    # define start, end position of region to grab
    start_bin1 = 0
    end_bin1   = len(bins) + 1
    regions = bamfile.references
    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)
    else:
        total = len(bins)
        if start1 is not None or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1 is not None:
        start_bin1 = section_pos[region1][0] + start1 / resolution
    else:
        if region1:
            start_bin1 = section_pos[region1][0]
        else:
            start_bin1 = 0
        start1 = 0
    if end1 is not None:
        end_bin1 = section_pos[region1][0] + end1 / resolution
    else:
        if region1:
            end_bin1 = section_pos[region1][1]
            end1 = sections[region1] * resolution
        else:
            end_bin1 = total
            end1 = total * resolution

    # define chunks, using at most 100 sub-divisions of region1
    total = end_bin1 - start_bin1
    regs  = []
    begs  = []
    ends  = []
    if nchunks is None:
        njobs = min(total, 100) + 1
    else:
        njobs = max(nchunks, 1)  # use the requested number of chunks, at least one
    nbins = total / njobs + 1
    for i in xrange(start_bin1, end_bin1, nbins):
        if i + nbins > end_bin1:  # make sure that we stop at the right place
            nbins = end_bin1 - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            ends.append(fin2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    # reduce dictionaries
    all_bins = []
    seenbins = set()
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if region1:
            start = start_bin1 - beg_crm
            end   = end_bin1   - beg_crm
        else:
            start = 0
            end   = section_pos[crm][1] - section_pos[crm][0]
        all_bins.extend([(crm, i) for i in xrange(start, end)
                          if not (crm, i) in seenbins])
        seenbins = set(all_bins)
    del(seenbins)

    bins_dict1 = dict((j, i) for i, j in enumerate(all_bins))
    if region2:
        if not region2 in section_pos:
            raise Exception('ERROR: chromosome %s not found' % region2)
        bins = []
        beg_crm = section_pos[region2][0]
        if start2 is not None:
            start_bin2 = section_pos[region2][0] + start2 / resolution
        else:
            start_bin2 = section_pos[region2][0]
            start2 = 0
        if end2 is not None:
            end_bin2   = section_pos[region2][0] + end2   / resolution
        else:
            end_bin2   = section_pos[region2][1]
            end2       = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end   = end_bin2   - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
    else:
        start_bin2 = start_bin1
        end_bin2 = end_bin1
        bins_dict2 = bins_dict1

    size1 = end_bin1 - start_bin1
    size2 = end_bin2 - start_bin2
    if verbose:
        printime('\n  (Matrix size %dx%d)' % (size1, size2))
    if max_size and max_size < size1 * size2:
        raise Exception(('ERROR: matrix too large ({0}x{1}) should be at most '
                         '{2}x{2}').format(size1, size2, int(max_size**0.5)))

    pool = mu.Pool(ncpus)
    # create random hash associated to the run:
    rand_hash = "%016x" % getrandbits(64)

    ## RUN!
    if verbose:
        printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    mkdir(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)))
    # empty all_bins array if we are not going to normalize
    if not normalize:
        all_bins = []
    procs = []
    for i, (region, b, e) in enumerate(zip(regs, begs, ends)):
        if ncpus == 1:
            _read_bam_frag(inbam, filter_exclude, all_bins,
                           bins_dict1, bins_dict2, rand_hash,
                           resolution, tmpdir, region, b, e,)
        else:
            procs.append(pool.apply_async(
                _read_bam_frag, args=(inbam, filter_exclude, all_bins,
                                      bins_dict1, bins_dict2, rand_hash,
                                      resolution, tmpdir, region, b, e,)))
    pool.close()
    if verbose:
        print_progress(procs)
    pool.join()
    bin_coords = start_bin1, end_bin1, start_bin2, end_bin2
    chunks = regs, begs, ends
    return regions, rand_hash, bin_coords, chunks
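The first loop of read_bam flattens per-chromosome bins into one genome-wide
index; the same arithmetic in isolation, on a toy genome (chromosome names
and lengths are invented):

from collections import OrderedDict

resolution = 100000
lengths = OrderedDict([('chr1', 1500000), ('chr2', 800000)])  # invented

# same binning as in read_bam: integer division plus one extra bin
sections = OrderedDict((crm, l // resolution + 1)
                       for crm, l in lengths.items())
total = 0
section_pos = {}
for crm in sections:
    section_pos[crm] = (total, total + sections[crm])
    total += sections[crm]
bins = [(crm, i) for crm in sections for i in range(sections[crm])]

print(section_pos)  # {'chr1': (0, 16), 'chr2': (16, 25)}
print(len(bins))    # 25 genome-wide bins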
Exemple #26
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros, min_count=opts.min_count,
                                draw_hist=True,
                                by_mean=not opts.fast_filter, savefig=path.join(
                                    opts.workdir, '04_normalization',
                                    'bad_columns_%s_%d_%d_%s.pdf' % (
                                        opts.reso, opts.perc_zeros, opts.min_count,
                                        param_hash)) if
                                not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')
    # bad columns
    bad_columns_file = path.join(opts.workdir, '04_normalization',
                                 'bad_columns_%s_%d_%d_%s.tsv' % (
                                     opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    if not opts.filter_only:
        print 'Get biases using ICE...'
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
        
    if not opts.filter_only:
        print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
        print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.pdf' % (
                                    opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
        savefig=inter_vs_gcoord)
    
    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias])
                       + '\n')
        out_bias.close()


    # pickle the HiC-data object
    print 'Saving genomic matrix'
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso), param_hash))
    out = open(pickle_path, 'w')
    dump(hic_data, out)
    out.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db (opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
                a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
                len(hic_data.bads.keys()), len(hic_data),
                intra_dir_nrm_fig, intra_dir_nrm_txt,
                inter_dir_nrm_fig, inter_dir_nrm_txt,
                genom_map_nrm_fig, genom_map_nrm_txt,
                intra_dir_raw_fig, intra_dir_raw_txt,
                inter_dir_raw_fig, inter_dir_raw_txt,
                genom_map_raw_fig, genom_map_raw_txt,
                pickle_path, launch_time, finish_time)
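The four cis/trans values computed above differ only in whether the matrix
is normalized and whether the diagonal is counted. A toy illustration of the
ratio itself (plain lists, not the HiCData API):

# invented 4-bin genome: bins 0-1 on chrA, bins 2-3 on chrB
sections = {0: 'chrA', 1: 'chrA', 2: 'chrB', 3: 'chrB'}
matrix = [[10, 5, 1, 0],
          [5, 8, 0, 2],
          [1, 0, 9, 4],
          [0, 2, 4, 7]]

def cis_trans_ratio(matrix, sections, diagonal=True):
    cis = trans = 0.
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j and not diagonal:
                continue
            if sections[i] == sections[j]:
                cis += matrix[i][j]
            else:
                trans += matrix[i][j]
    return cis / (cis + trans)

print(cis_trans_ratio(matrix, sections, diagonal=True))   # ~0.897
print(cis_trans_ratio(matrix, sections, diagonal=False))  # 0.75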
Exemple #27
0
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None, frag_map=True,
                 min_seq_len=15, windows=None, add_site=True, clean=False,
                 **kwargs):
    """
    Do the mapping

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed or not.
    :param out_map_dir: path to a directory where mapped reads are stored in
       MAP format.
    :param None r_enz: name of the restriction enzyme used in the experiment,
       e.g. HindIII. Optional if the frag_map option is False
    :param True frag_map: two-step mapping; full-length reads are mapped
       first, then the remaining unmapped reads are divided into
       restriction-enzyme fragments and each fragment is mapped.
    :param True add_site: when splitting the sequence at the ligation sites
       found, removes the ligation site and puts back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for the beginning and end of the
       mapping. This parameter allows classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: the maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: the maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that,
       regardless of how other options are chosen, all the matches up to the
       specified number of substitutions will be found by the program.
    :param /tmp temp_dir: directory where intermediate FASTQ files will be
       written; it is important to set it to a location with enough free
       space.

    :returns: a list of paths to generated outfiles. To be passed to 
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None:
        windows = (None, )
    for win in windows:
        # prepare the FASTQ file for the current window
        curr_map = transform_fastq(input_reads, 
                                   mkstemp(prefix=base_name + '_',
                                           dir=temp_dir)[1],
                                   fastq=(input_reads.endswith('.fastq')
                                          or input_reads.endswith('.fastq.gz')),
                                   min_seq_len=min_seq_len, trim=win)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s.map' % (beg, end)
        if end:
            print 'Mapping reads in window %s-%s...' % (beg, end)
        else:
            print 'Mapping full reads...', curr_map
        map_file = gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
        map_file.close()

        # parse map file to extract not uniquely mapped reads
        print 'Parsing result...'
        _gem_filter(out_map_path, curr_map + '_filt_%s-%s.map' % (beg, end),
                    os.path.join(out_map_dir,
                                 base_name + '_full_%s-%s.map' % (beg, end)))
        # clean
        if clean:
            print '   x removing GEM input %s' % curr_map
            os.system('rm -f %s' % (curr_map))
            print '   x removing map %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        # for next round, we will use remaining unmapped reads
        input_reads = curr_map + '_filt_%s-%s.map' % (beg, end)
        outfiles.append(os.path.join(out_map_dir,
                                     base_name + '_full_%s-%s.map' % (beg, end)))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map = transform_fastq(input_reads,
                                   mkstemp(prefix=base_name + '_',
                                           dir=temp_dir)[1],
                                   min_seq_len=min_seq_len, trim=win,
                                   fastq=False, r_enz=r_enz, add_site=add_site)
        out_map_path = frag_map + '_frag.map'
        print 'Mapping fragments of remaining reads...'
        map_file = gem_mapping(gem_index_path, frag_map, out_map_path,
                               **kwargs)
        map_file.close()
        print 'Parsing result...'
        _gem_filter(out_map_path, curr_map + '_fail.map',
                    os.path.join(out_map_dir, base_name + '_frag.map'))
        outfiles.append(os.path.join(out_map_dir, base_name + '_frag.map'))
    return outfiles
Exemple #28
0
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)
        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print '         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std)
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print '         - reproducibility score: %.4f' % (reprod)
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime('  - Merging experiments')
    system(samtools  + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
    printime('  - Indexing new BAM file')
    # check samtools version number and modify command line
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools  + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools  + ' index %s' % (outbam))

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
Exemple #29
0
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None, frag_map=True,
                 min_seq_len=15, windows=None, add_site=True, clean=False,
                 get_nread=False, **kwargs):
    """
    Do the mapping

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed or not.
    :param out_map_dir: path to a directory where mapped reads are stored in
       MAP format.
    :param None r_enz: name of the restriction enzyme used in the experiment,
       e.g. HindIII. Optional if the frag_map option is False
    :param True frag_map: two-step mapping; full-length reads are mapped
       first, then the remaining unmapped reads are divided into
       restriction-enzyme fragments and each fragment is mapped.
    :param True add_site: when splitting the sequence at the ligation sites
       found, removes the ligation site and puts back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for the beginning and end of the
       mapping. This parameter allows classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: the maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: the maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that,
       regardless of how other options are chosen, all the matches up to the
       specified number of substitutions will be found by the program.
    :param /tmp temp_dir: directory where intermediate FASTQ files will be
       written; it is important to set it to a location with enough free
       space.
    :param False get_nread: if True, returns for each outfile a tuple of its
       path and the number of reads processed

    :returns: a list of paths to generated outfiles. To be passed to 
       :func:`pytadbit.parsers.map_parser.parse_map`
    """

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None:
        windows = (None, )
    elif isinstance(windows[0], int):
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
    for win in windows:
        # prepare the FASTQ file for the current window
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=(   input_reads.endswith('.fastq'   )
                   or input_reads.endswith('.fastq.gz')
                   or input_reads.endswith('.fq.gz'   )
                   or input_reads.endswith('.dsrc'    )),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix)
        if end:
            print 'Mapping reads in window %s-%s%s...' % (beg, end, suffix)
        else:
            print 'Mapping full reads...', curr_map

        if not skip:
            gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
            # parse map file to extract not uniquely mapped reads
            print 'Parsing result...'
            _gem_filter(out_map_path, curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix),
                        os.path.join(out_map_dir,
                                     base_name + '_full_%s-%s%s.map' % (beg, end, suffix)))
            # clean
            if clean:
                print '   x removing GEM input %s' % curr_map
                os.system('rm -f %s' % (curr_map))
                print '   x removing map %s' % out_map_path
                os.system('rm -f %s' % (out_map_path))
            # for next round, we will use remaining unmapped reads
            input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix)
        outfiles.append(
            (os.path.join(out_map_dir,
                          base_name + '_full_%s-%s%s.map' % (beg, end, suffix)),
             counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads)
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix)
        if not skip:
            print 'Mapping fragments of remaining reads...'
            gem_mapping(gem_index_path, frag_map, out_map_path, **kwargs)
            print 'Parsing result...'
            _gem_filter(out_map_path, curr_map + '_fail%s.map' % (suffix),
                        os.path.join(out_map_dir,
                                     base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)))
        outfiles.append((os.path.join(out_map_dir,
                                      base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)),
                         counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Exemple #30
0
def check_options(opts):

    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if you '
            'have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'],
                           stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            opts.gem_version = int(out[1])
        except ValueError as e:
            opts.gem_version = 2
            print('Falling back to GEM v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception(
                'ERROR: Fast fragment mapping needs both fastq files. '
                'Please specify --fastq2')
        if opts.read != 0:
            raise Exception(
                'ERROR: Fast fragment mapping needs to be specified with --read 0'
            )
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')
    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for most probable restriction enzyme in file: %s' %
              (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found. ' % (renz) +
                  'Use one of:\n\n' + ' '.join(sorted(RESTRICTION_ENZYMES)) +
                  '\n\n')
            raise KeyError()
        except AttributeError:
            pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: no previous output files found, nothing to skip...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    if not is_fastq(opts.fastq):
        raise IOError(
            'ERROR: FASTQ file %s is in the wrong format, '
            'please check it' % (opts.fastq))

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)
    # write log
    # if opts.mapping_only:
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(
        opts.fastq, opts.read)
    # else:
    #     log_format = '[DEFAULT]   %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(
            vlog_path) or open(vlog_path).readlines() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # check mapper extra options
    if opts.mapper_param:
        if (len(opts.mapper_param) == 1 and
            ('-' in opts.mapper_param[0] or '--' in opts.mapper_param[0])):
            # Single string surrounded by quotes
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            opts.mapper_param = dict([o.split(':') for o in opts.mapper_param])
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = set([
            "granularity", "q", "quality-format", "gem-quality-threshold",
            "mismatch-alphabet", "m", "e", "min-matched-bases",
            "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
            "unique-mapping", "d", "D", "allow-incomplete-strata",
            "max-decoded-matches", "min-decoded-strata", "p",
            "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
            "max-insert-size", "E", "max-extendable-matches",
            "max-extensions-per-match", "unique-pairing"
        ])
        for k in opts.mapper_param:
            if not k in gem_valid_option:
                raise NotImplementedError(
                    ('ERROR: option "%s" is not a valid GEM option '
                     'or not supported by this tool.') % k)

    # create an empty DB if it does not exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for Lustre file systems, work on a local copy of the trace DB
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
        exit('WARNING: exact same job already computed, see JOBs table above')
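The mapper_param handling above accepts either one quoted string of raw
mapper flags or a list of key:value pairs. A sketch of the two outcomes
(option names and values are invented):

def parse_mapper_param(mapper_param):
    # same branching as in check_options above
    if (len(mapper_param) == 1 and
        ('-' in mapper_param[0] or '--' in mapper_param[0])):
        # single string surrounded by quotes: keep the raw flags as a list
        return mapper_param[0].split()
    return dict(o.split(':') for o in mapper_param)

print(parse_mapper_param(['-q offset-33 --max-big-indel-length 10']))
# ['-q', 'offset-33', '--max-big-indel-length', '10']
print(parse_mapper_param(['e:0.04', 'm:0.04']))
# {'e': '0.04', 'm': '0.04'}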
Exemple #31
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]
        # store path ids to be saved in database
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso   = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print 'loading %s \n    at resolution %s' % (mreads, nice(reso))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print '  - Computing GC content to label compartments'
            rich_in_A = get_gc_content(parse_fasta(opts.fasta, chr_filter=opts.crms), reso,
                                       chromosomes=opts.crms,
                                       by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True, suffix=param_hash,
            rich_in_A=rich_in_A, show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs,
            ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if not crm in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash)), 'w')
            ev_file.write('# %s\n' % ('\t'.join(
                'EV_%d (%.4f)' % (i, v)
                for i, v in enumerate(firsts[crm][0], 1))))
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm][1])]))
            ev_file.close()

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                       (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '06_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # translate bad columns into the chromosome's coordinate system
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = (size - 1) if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=opts.verbose,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)

            # use normalization to compute height on TADs called
            if opts.all_bins:
                if opts.nosql:
                    biases = load(open(biases))
                else:
                    biases = load(open(path.join(opts.workdir, biases)))
                hic_data.bads = biases['badcol']
                hic_data.bias = biases['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
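The TAD tables written above are plain TSV files with columns '#', start,
end, score and density. A small sketch of reading one back (field meanings
inferred from the writing loop above; the helper name is hypothetical):

def read_tad_table(path):
    # parse a per-chromosome TAD table as written by the loop above
    tads = []
    with open(path) as handler:
        next(handler)  # skip the header line
        for line in handler:
            num, start, end, score, density = line.rstrip('\n').split('\t')
            tads.append({'num': num, 'start': int(start), 'end': int(end),
                         'score': float(score), 'density': float(density)})
    return tads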
Exemple #32
0
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    # create tmp directory
    if not opts.tmp:
        temp_dir = opts.workdir + '_tmp_r%d_%s' % (opts.read, param_hash)
    else:
        temp_dir = path.join(opts.tmp,
                             'TADbit_tmp_r%d_%s' % (opts.read, param_hash))

    # QC plot
    fig_path = path.join(
        opts.workdir, '%s_%s_%s.png' % (path.split(opts.fastq)[-1], '-'.join(
            map(str, opts.renz)), param_hash))
    logging.info('Generating Hi-C QC plot')

    dangling_ends, ligated = quality_plot(opts.fastq,
                                          r_enz=opts.renz,
                                          nreads=100000,
                                          paired=False,
                                          savefig=fig_path)
    for renz in dangling_ends:
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%',
                     dangling_ends[renz])
    for renz in ligated:
        logging.info('  - Ligation sites: %.3f%%', ligated[renz])
    if opts.skip_mapping:
        save_to_db(opts, dangling_ends, ligated, fig_path, [], launch_time,
                   time.localtime())
        return

    # Mapping
    if opts.fast_fragment:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))
        logging.info('parsing genomic sequence')
        try:
            # a pickled genome can be loaded here to speed things up
            genome_seq = load(open(opts.genome[0], 'rb'))
        except (UnpicklingError, KeyError):
            genome_seq = parse_fasta(opts.genome)

        logging.info('mapping %s and %s to %s', opts.fastq, opts.fastq2,
                     opts.workdir)
        outfiles = fast_fragment_mapping(
            opts.index,
            opts.fastq,
            opts.fastq2,
            opts.renz,
            genome_seq,
            path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash),
            clean=not opts.keep_tmp,
            get_nread=True,
            mapper_binary=opts.mapper_binary,
            mapper_params=opts.mapper_param,
            suffix=param_hash,
            temp_dir=temp_dir,
            nthreads=opts.cpus)
    else:
        logging.info('mapping %s read %s to %s', opts.fastq, opts.read,
                     opts.workdir)
        outfiles = full_mapping(opts.index,
                                opts.fastq,
                                path.join(opts.workdir,
                                          '01_mapped_r%d' % (opts.read)),
                                mapper=opts.mapper,
                                r_enz=opts.renz,
                                temp_dir=temp_dir,
                                nthreads=opts.cpus,
                                frag_map=not opts.iterative,
                                clean=not opts.keep_tmp,
                                windows=opts.windows,
                                get_nread=True,
                                skip=opts.skip,
                                suffix=param_hash,
                                mapper_binary=opts.mapper_binary,
                                mapper_params=opts.mapper_param)

    # adjust line count
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i - 1][1] - sum(
                1 for _ in open(outfiles[i - 1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    try:
        save_to_db(opts, dangling_ends, ligated, fig_path, outfiles,
                   launch_time, finish_time)
    except Exception:
        # release lock anyway
        remove(path.join(opts.workdir, '__lock_db'))
        print_exc()
        exit(1)

    # write machine log
    try:
        # wait for any concurrent writer, then acquire the lock
        while path.exists(path.join(opts.workdir, '__lock_log')):
            time.sleep(0.5)
        open(path.join(opts.workdir, '__lock_log'), 'a').close()
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            mlog.write('\n'.join([('# MAPPED READ%s\t%d\t%s' %
                                   (opts.read, num, out))
                                  for out, num in outfiles]) + '\n')
        # release lock
        try:
            remove(path.join(opts.workdir, '__lock_log'))
        except OSError:
            pass
    except Exception:
        # release lock anyway
        try:
            remove(path.join(opts.workdir, '__lock_log'))
        except OSError:
            pass
        print_exc()
        exit(1)

    # clean
    if not opts.keep_tmp:
        logging.info('cleaning temporary files')
        system('rm -rf ' + temp_dir)
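The '__lock_log' dance above is a simple spin-wait file lock; the same pattern in isolation (acquire_lock and release_lock are illustrative helper names, not part of TADbit):

import os
import time

def acquire_lock(lock_path, delay=0.5):
    # spin until no other process holds the lock, then create the lock file
    while os.path.exists(lock_path):
        time.sleep(delay)
    open(lock_path, 'a').close()

def release_lock(lock_path):
    try:
        os.remove(lock_path)
    except OSError:
        pass  # lock already released

Note that the check-then-create sequence is not atomic, so this only guards against well-behaved cooperating processes, not arbitrary races.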
Exemple #33
0
def iterative_mapping(gem_index_path, fastq_path, out_sam_path, range_start,
                      range_stop, **kwargs):
    """
    Iteratively map a given FASTQ file to a reference genome.
    
    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: path to the FASTQ file, either compressed or not.
    :param out_sam_path: path to a directory where to store mapped reads in SAM/
       BAM format (see option output_is_bam).
    :param range_start: list of integers representing the start position of each
       read fragment to be mapped (starting at 1 includes the first nucleotide
       of the read).
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped.
    :param True single_end: set to False when the FASTQ file contains
       paired-end flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a time.
       If -1, all reads will be processed in one run (requires more RAM).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.

    :returns: a list of paths to generated outfiles. To be passed to 
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    for kw in kwargs:
        if not kw in [
                'single_end', 'nthreads', 'max_edit_distance', 'mismatches',
                'max_reads_per_chunk', 'out_files', 'output_is_bam', 'temp_dir'
        ]:
            warn('WARNING: %s is not a usual keyword, misspelled?' % kw)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple)
                or not isinstance(range_stop, tuple)):
            raise Exception(
                'ERROR: range_start and range_stop should be lists')
        range_start = list(range_start)
        range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start)
            or not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = map(int, range_start)
            range_stop = map(int, range_stop)
            warn('WARNING: range_start and range_stop converted to integers')
        except ValueError:
            raise Exception(
                'ERROR: range_start and range_stop should contain' +
                ' integers only')
    if (len(zip(range_start, range_stop)) < len(range_start)
            or len(range_start) != len(range_stop)):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same size and windows should be unique.')
    if any([i >= j for i, j in zip(range_start, range_stop)]):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any([i <= 0 for i in range_start]):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    #get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        _ = fastqh.next()
        raw_seq_len = len(fastqh.next().strip())
        fastqh.close()
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)

    global N_WINDOWS
    if not N_WINDOWS:
        N_WINDOWS = len(range_start)
    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        kwargs['max_reads_per_chunk'] = -1
        print 'Split input file %s into chunks' % fastq_path
        chunked_files = _chunk_file(
            fastq_path, os.path.join(temp_dir,
                                     os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print '%d chunks obtained' % len(chunked_files)
        for i, fastq_chunk_path in enumerate(chunked_files):
            N_WINDOWS = 0
            print 'Run iterative_mapping recursively on %s' % fastq_chunk_path
            print 'Run iterative_mapping recursively on %s' % fastq_chunk_path
            out_files.extend(
                iterative_mapping(gem_index_path, fastq_chunk_path,
                                  out_sam_path + '.%d' % (i + 1),
                                  range_start[:], range_stop[:], **kwargs))

        for i, fastq_chunk_path in enumerate(chunked_files):
            # Delete chunks only if the file was really chunked.
            if len(chunked_files) > 1:
                print 'Remove the chunks: %s' % ' '.join(chunked_files)
                os.remove(fastq_chunk_path)
        return out_files

    # end position according to sequence in the file
    # removes 1 in order to start at 1 instead of 0
    try:
        seq_end = range_stop.pop(0)
        seq_beg = range_start.pop(0)
    except IndexError:
        return out_files

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # output
    local_out_sam = out_sam_path + '.%d:%d-%d' % (N_WINDOWS - len(range_stop),
                                                  seq_beg, seq_end)
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads,
        paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed,
                        gem_index_path,
                        min_decoded_strata=0,
                        max_decoded_matches=2,
                        unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped,
                          index=gem_index_path,
                          threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped,
                          index=gem_index_path,
                          output=local_out_sam,
                          threads=nthreads,
                          single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(
        temp_dir, unmapped_fastq_path + '.%d:%d-%d' %
        (N_WINDOWS - len(range_stop), seq_beg, seq_end))
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    out_files.extend(
        iterative_mapping(gem_index_path, unmapped_fastq_path, out_sam_path,
                          range_start, range_stop, **kwargs))
    os.remove(unmapped_fastq_path)
    return out_files
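A usage sketch following the docstring above (all paths are hypothetical; the three windows map the first 25, 50 and 75 nucleotides of each read, re-trying unmapped reads with each longer window):

out_files = iterative_mapping(
    '/data/genome/genome.gem',            # hypothetical GEM index
    '/data/reads/sample_r1.fastq.gz',
    '/data/mapped/sample_r1.sam',
    range_start=[1, 1, 1],
    range_stop=[25, 50, 75],
    nthreads=8,
    temp_dir='/scratch/tmp_mapping')
# the returned SAM paths can then be handed to
# pytadbit.parsers.sam_parser.parse_sam, as the docstring notes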
Exemple #34
0
def main():
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    ncpus = opts.cpus
    if opts.biases:
        biases = load(open(opts.biases))
    else:
        biases = {}
    outdir = opts.outdir
    tmpdir = opts.tmpdir
    coord1 = opts.coord1
    coord2 = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception(
            'ERROR: different resolution in bias file (you want %d,'
            ' there is %d).\n' % (resolution, biases['resolution']))
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    mkdir(outdir)
    mkdir(tmpdir)
    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    write_matrix(inbam,
                 resolution,
                 biases,
                 outdir,
                 filter_exclude=filter_exclude,
                 normalizations=opts.matrices,
                 region1=region1,
                 start1=start1,
                 end1=end1,
                 region2=region2,
                 start2=start2,
                 end2=end2,
                 nchunks=opts.nchunks,
                 append_to_tar=opts.tarfile,
                 ncpus=ncpus,
                 tmpdir=tmpdir,
                 verbose=not opts.quiet)
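The coord1/coord2 options accept either 'chrN' or 'chrN:start-end'; the parsing contract used above, in isolation (parse_coord is an illustrative name):

def parse_coord(coord):
    # 'chr3:1000000-2000000' -> ('chr3', 1000000, 2000000)
    # 'chr3'                 -> ('chr3', None, None)
    try:
        crm, pos = coord.split(':')
        start, end = pos.split('-')
        return crm, int(start), int(end)
    except ValueError:
        return coord, None, None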
Exemple #35
0
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Iteratively map a given FASTQ file to a reference genome.
    
    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: path to the FASTQ file, either compressed or not.
    :param out_sam_path: path to a directory where to store mapped reads in SAM/
       BAM format (see option output_is_bam).
    :param range_start: list of integers representing the start position of each
       read fragment to be mapped (starting at 1 includes the first nucleotide
       of the read).
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped.
    :param True single_end: set to False when the FASTQ file contains
       paired-end flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a time.
       If -1, all reads will be processed in one run (requires more RAM).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.

    :returns: a list of paths to generated outfiles. To be passed to 
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    gem_index_path      = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path          = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path        = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end          = kwargs.get('single_end'          , True)
    max_edit_distance   = kwargs.get('max_edit_distance'   , 0.04)
    mismatches          = kwargs.get('mismatches'          , 0.04)
    nthreads            = kwargs.get('nthreads'            , 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk' , -1)
    out_files           = kwargs.get('out_files'           , [])
    output_is_bam       = kwargs.get('output_is_bam'       , False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    for kw in kwargs:
        if not kw in ['single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk',
                      'out_files', 'output_is_bam', 'temp_dir']:
            warn('WARNING: %s is not a usual keyword, misspelled?' % kw)
    
    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple) or
            not isinstance(range_stop, tuple)):
            raise Exception('ERROR: range_start and range_stop should be lists')
        range_start = list(range_start)
        range_stop  = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start) or
        not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = map(int, range_start)
            range_stop  = map(int, range_stop)            
            warn('WARNING: range_start and range_stop converted to integers')
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
    if (len(zip(range_start, range_stop)) < len(range_start) or
        len(range_start) != len(range_stop)):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same size and windows should be unique.')
    if any([i >= j for i, j in zip(range_start, range_stop)]):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any([i <= 0 for i in range_start]):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    #get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        _ = fastqh.next()
        raw_seq_len = len(fastqh.next().strip())
        fastqh.close()
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)

    global N_WINDOWS
    if not N_WINDOWS:
        N_WINDOWS = len(range_start)
    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        kwargs['max_reads_per_chunk'] = -1
        print 'Split input file %s into chunks' % fastq_path
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print '%d chunks obtained' % len(chunked_files)
        for i, fastq_chunk_path in enumerate(chunked_files):
            N_WINDOWS = 0
            print 'Run iterative_mapping recursively on %s' % fastq_chunk_path
            print 'Run iterative_mapping recursively on %s' % fastq_chunk_path
            out_files.extend(iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1), range_start[:], range_stop[:],
                **kwargs))

        for i, fastq_chunk_path in enumerate(chunked_files):
            # Delete chunks only if the file was really chunked.
            if len(chunked_files) > 1:
                print 'Remove the chunks: %s' % ' '.join(chunked_files)
                os.remove(fastq_chunk_path)
        return out_files

    # end position according to sequence in the file
    # removes 1 in order to start at 1 instead of 0
    try:
        seq_end = range_stop.pop(0)
        seq_beg = range_start.pop(0)
    except IndexError:
        return out_files

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # output
    local_out_sam = out_sam_path + '.%d:%d-%d' % (
        N_WINDOWS - len(range_stop), seq_beg, seq_end)
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(
        temp_dir, unmapped_fastq_path + '.%d:%d-%d' % (
            N_WINDOWS - len(range_stop), seq_beg, seq_end))
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
                                       out_sam_path,
                                       range_start, range_stop, **kwargs))
    os.remove(unmapped_fastq_path)
    return out_files
Exemple #36
0
def hic_map(data, resolution=None, normalized=False, masked=None,
            by_chrom=False, savefig=None, show=False, savedata=None,
            focus=None, clim=None, cmap='jet', pdf=False, decay=True,
            perc=10, name=None, decay_resolution=None, **kwargs):
    """
    function to retrieve data from HiC-data object. Data can be stored as
    a square matrix, or drawn using matplotlib

    :param data: can be either a path to a file with pre-processed reads
       (filtered or not), or a Hi-C-data object
    :param None resolution: at which to bin the data (try having a dense matrix
       with < 10% of cells with zero interaction counts). Note: not necessary
       if a hic_data object is passed as 'data'.
    :param False normalized: use normalized data, based on precalculated biases
    :param masked: a list of columns to be removed, usually because of too few
       interactions
    :param False by_chrom: data can be stored in a partitioned way. This
       parameter can take the values of:
        * 'intra': one output per each chromosome will be created
        * 'inter': one output per each possible pair of chromosome will be
           created
        * 'all'  : both of the above outputs
    :param None savefig: path where to store the output images. Note that, if
       the by_chrom option is used, then savefig will be the name of the
       directory containing the output files.
    :param None savedata: path where to store the output matrices. Note that, if
       the by_chrom option is used, then savedata will be the name of the
       directory containing the output files.
    :param None focus: can be either two numbers (e.g. (1, 100)) specifying the
       start and end position of the sub-matrix to display (start and end, along
       the diagonal of the original matrix); or directly a chromosome name; or
       two chromosome names (e.g. focus=('chr2', 'chrX')), in order to store the
       data corresponding to inter-chromosomal interactions between these two
       chromosomes
    :param True decay: plot the correlation between genomic distance and
       interactions (usually a decay).
    :param False force_image: force to generate an image even if resolution is
       crazy...
    :param None clim: cutoff for the upper and lower bound in the coloring scale
       of the heatmap
    :param False pdf: when using the by_chrom option, to specify the format of
       the stored images
    :param jet cmap: color map to be used for the heatmap
    :param None decay_resolution: chromatin fragment size to consider when
       calculating decay of the number of interactions with genomic distance.
       Default is equal to resolution of the matrix.
    """
    if isinstance(data, str):
        data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get('get_sections', True) and decay:
            warn('WARNING: decay not available when get_sections is off.')
            decay = False
    hic_data = data
    resolution = data.resolution
    if not decay_resolution:
        decay_resolution = resolution
    if hic_data.bads and not masked:
        masked = hic_data.bads
    # save and draw the data
    if by_chrom:
        if focus:
            raise Exception('Incompatible options focus and by_chrom\n')
        if savedata:
            mkdir(savedata)
        if savefig:
            mkdir(savefig)
        for i, crm1 in enumerate(hic_data.chromosomes):
            for crm2 in hic_data.chromosomes.keys()[i:]:
                if by_chrom == 'intra' and crm1 != crm2:
                    continue
                if by_chrom == 'inter' and crm1 == crm2:
                    continue
                try:
                    subdata = hic_data.get_matrix(focus=(crm1, crm2), normalized=normalized)
                    start1, _ = hic_data.section_pos[crm1]
                    start2, _ = hic_data.section_pos[crm2]
                    masked1 = {}
                    masked2 = {}
                    if focus and hic_data.bads:
                        # rescale masked
                        masked1 = dict([(m - start1, hic_data.bads[m])
                                        for m in hic_data.bads])
                        masked2 = dict([(m - start2, hic_data.bads[m])
                                        for m in hic_data.bads])
                    if masked1 or masked2:
                        for i in xrange(len(subdata)):
                            if i in masked1:
                                subdata[i] = [float('nan')
                                              for j in xrange(len(subdata))]
                            for j in xrange(len(subdata)):
                                if j in masked2:
                                    subdata[i][j] = float('nan')
                    if savedata:
                        hic_data.write_matrix('%s/%s.mat' % (
                            savedata, '_'.join(set((crm1, crm2)))),
                                              focus=(crm1, crm2),
                                              normalized=normalized)
                    if show or savefig:
                        if (len(subdata) > 10000
                            and not kwargs.get('force_image', False)):
                            warn('WARNING: Matrix image not created, more than '
                                 '10000 rows, use a lower resolution to create images')
                            continue
                        draw_map(subdata, 
                                 OrderedDict([(k, hic_data.chromosomes[k])
                                              for k in hic_data.chromosomes.keys()
                                              if k in [crm1, crm2]]),
                                 hic_data.section_pos,
                                 '%s/%s.%s' % (savefig,
                                               '_'.join(set((crm1, crm2))),
                                               'pdf' if pdf else 'png'),
                                 show, one=True, clim=clim, cmap=cmap,
                                 decay_resolution=decay_resolution, perc=perc,
                                 name=name, cistrans=float('NaN'))
                except ValueError as e:
                    print 'Value ERROR: problem with chromosome %s' % crm1
                    print str(e)
                except IndexError as e:
                    print 'Index ERROR: problem with chromosome %s' % crm1
                    print str(e)
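A usage sketch for hic_map per the docstring above (the input path and output directories are hypothetical):

# one matrix and one PDF image per intra-chromosomal map, written under
# 'matrices/' and 'images/' respectively
hic_map('/data/reads/filtered_reads.tsv', resolution=100000,
        normalized=True, by_chrom='intra',
        savedata='matrices', savefig='images', pdf=True)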
Exemple #37
0
def main():
    opts          = get_options()
    inbam          = opts.inbam
    resolution     = opts.reso
    filter_exclude = opts.filter
    ncpus          = opts.cpus
    if opts.biases:
        biases     = load(open(opts.biases))
    else:
        biases     = {}
    outdir         = opts.outdir
    tmpdir         = opts.tmpdir
    coord1         = opts.coord1
    coord2         = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception('ERROR: different resolution in bias file (you want %d,'
                        ' there is %d).\n' % (resolution, biases['resolution']))
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    mkdir(outdir)
    mkdir(tmpdir)
    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    write_matrix(inbam, resolution, biases, outdir,
                 filter_exclude=filter_exclude,
                 normalizations=opts.matrices,
                 region1=region1, start1=start1, end1=end1,
                 region2=region2, start2=start2, end2=end2,
                 nchunks=opts.nchunks, append_to_tar=opts.tarfile,
                 ncpus=ncpus, tmpdir=tmpdir, verbose=not opts.quiet)
Exemple #38
0
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    if not opts.mapped1 and not opts.mapped2:
        f_names1, f_names2, renz = load_parameters_fromdb(
            opts, reads, opts.jobids)
    else:
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # a pickled genome can be loaded here to speed things up
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1,
                                       f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz,
                                       verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1,
                                       f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz,
                                       verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = {}
            for line in fhandler:
                if '|||' in line:
                    try:
                        multis[1][line.count('|||')] += 1
                    except KeyError:
                        multis[1][line.count('|||')] = 1

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' %
                           (read, counts[read][item],
                            out_file2 if read else out_file1))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
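When --skip is used, the counts are recovered from '# MAPPED' header lines of a previously parsed TSV, one per mapping window, e.g. '# MAPPED 1:1-25 1234567' (values made up). A minimal reader equivalent to the skip branch above:

counts = {}
with open('sample_r1_parsed.tsv') as handler:  # hypothetical file
    for line in handler:
        if line.startswith('# MAPPED '):
            _, _, item, value = line.split()
            counts[item] = int(value)
        elif not line.startswith('#'):
            break  # header ends at the first data line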
Exemple #39
0
    def find_compartments(self, crms=None, savefig=None, savedata=None,
                          show=False, **kwargs):
        """
        Search for A/B copartments in each chromsome of the Hi-C matrix.
        Hi-C matrix is normalized by the number interaction expected at a given
        distance, and by visibility (one iteration of ICE). A correlation matrix
        is then calculated from this normalized matrix, and its first
        eigenvector is used to identify compartments. Changes in sign marking
        boundaries between compartments.
        Result is stored as a dictionary of compartment boundaries, keys being
        chromsome names.
        
        :param 99 perc_zero: to filter bad columns
        :param 0.05 signal_to_noise: to calculate expected interaction counts,
           if not enough reads are observed at a given distance, the
           observations at distance+1 are summed. A signal-to-noise ratio
           below 0.05 corresponds to more than 400 reads.
        :param None crms: only runs these given list of chromosomes
        :param None savefig: path to a directory to store matrices with
           compartment predictions, one image per chromosome, stored under
           'chromosome-name.png'.
        :param False show: show the plot
        :param None savedata: path to a new file to store compartment
           predictions, one file only.
        :param -1 vmin: for the color scale of the plotted map
        :param 1 vmax: for the color scale of the plotted map

        TODO: this is really slow...

        Note: building the distance matrix from the amount of interactions
              instead of the mean correlation generally gives worse results.
        
        """
        if not self.bads:
            if kwargs.get('verbose', True):
                print 'Filtering bad columns (perc_zero: %d)' % kwargs.get('perc_zero', 99)
            self.filter_columns(perc_zero=kwargs.get('perc_zero', 99),
                                by_mean=False, silent=True)
        if not self.expected:
            if kwargs.get('verbose', True):
                print 'Normalizing by expected values'
            self.expected = expected(self, bads=self.bads, **kwargs)
        if not self.bias:
            if kwargs.get('verbose', True):
                print 'Normalizing by ICE (1 round)'
            self.normalize_hic(iterations=0)
        if savefig:
            mkdir(savefig)

        cmprts = {}
        for sec in self.section_pos:
            if crms and sec not in crms:
                continue
            if kwargs.get('verbose', False):
                print 'Processing chromosome', sec
                warn('Processing chromosome %s' % (sec))
            matrix = [[(float(self[i,j]) / self.expected[abs(j-i)]
                       / self.bias[i] / self.bias[j])
                      for i in xrange(*self.section_pos[sec])
                       if not i in self.bads]
                     for j in xrange(*self.section_pos[sec])
                      if not j in self.bads]
            if not matrix: # MT chromosome will fall there
                warn('Chromosome %s is probably MT :)' % (sec))
                cmprts[sec] = []
                continue
            for i in xrange(len(matrix)):
                for j in xrange(i+1, len(matrix)):
                    matrix[i][j] = matrix[j][i]
            matrix = [list(m) for m in corrcoef(matrix)]
            try:
                # eigsh is very fast here, as only one eigenvector is requested
                _, evect = eigsh(array(matrix), k=1)
            except LinAlgError:
                warn('Chromosome %s too small to compute PC1' % (sec))
                cmprts[sec] = [] # Y chromosome, or so...
                continue
            first = list(evect[:, -1])
            beg, end = self.section_pos[sec]
            bads = [k - beg for k in self.bads if beg <= k <= end]
            _ = [first.insert(b, 0) for b in bads]
            _ = [matrix.insert(b, [float('nan')] * len(matrix[0]))
                 for b in bads]
            _ = [matrix[i].insert(b, float('nan'))
                 for b in bads for i in xrange(len(first))]
            breaks = [0] + [i for i, (a, b) in
                            enumerate(zip(first[1:], first[:-1]))
                            if a * b < 0] + [len(first)]
            breaks = [{'start': b, 'end': breaks[i+1]}
                      for i, b in enumerate(breaks[: -1])]
            cmprts[sec] = breaks
            
            # calculate compartment internal density
            for k, cmprt in enumerate(cmprts[sec]):
                beg = self.section_pos[sec][0]
                beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
                sec_matrix = [(self[i,j] / self.expected[abs(j-i)]
                               / self.bias[i] / self.bias[j])
                              for i in xrange(beg1, end1) if not i in self.bads
                              for j in xrange(i, end1) if not j in self.bads]
                try:
                    cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
                except ZeroDivisionError:
                    cmprt['dens'] = 0.
            try:
                meanh = sum([cmprt['dens'] for cmprt in cmprts[sec]]) / len(cmprts[sec])
            except ZeroDivisionError:
                meanh = 1.
            for cmprt in cmprts[sec]:
                try:
                    cmprt['dens'] /= meanh
                except ZeroDivisionError:
                    cmprt['dens'] = 1.
            gammas = {}
            for gamma in range(101):
                gammas[gamma] = _find_ab_compartments(float(gamma)/100, matrix,
                                                      breaks, cmprts[sec],
                                                      save=False)
                # print gamma, gammas[gamma]
            gamma = min(gammas.keys(), key=lambda k: gammas[k][0])
            _ = _find_ab_compartments(float(gamma)/100, matrix, breaks,
                                      cmprts[sec], save=True)
            if savefig or show:
                vmin = kwargs.get('vmin', -1)
                vmax = kwargs.get('vmax',  1)
                if vmin == 'auto' == vmax:
                    vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                    vmin = -vmax
                plot_compartments(sec, first, cmprts, matrix, show,
                                  savefig + '/chr' + sec + '.pdf',
                                  vmin=vmin, vmax=vmax)
                plot_compartments_summary(sec, cmprts, show,
                                          savefig + '/chr' + sec + '_summ.pdf')
            
        self.compartments = cmprts
        if savedata:
            self.write_compartments(savedata)
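A usage sketch (assumes hic_data is an already loaded HiC_data object; chromosome names and paths are hypothetical):

# compute A/B compartments on two chromosomes, saving one plot per
# chromosome plus a single table of predictions
hic_data.find_compartments(crms=['chr1', 'chr2'],
                           savefig='compartment_plots',
                           savedata='compartments.tsv',
                           perc_zero=99, vmin='auto', vmax='auto')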
Exemple #40
0
def get_intersection(fname1, fname2, out_path, verbose=False):
    """
    Merges the two files corresponding to each read's side. Reads found in both
       files are merged and written to an output file.

    Dealing with multiple contacts:
       - a pairwise contact is created for each possible combination of the
         multicontacts. The name of the read is extended by '# 1/3' in case
         the reported pairwise contact corresponds to the first of 3 possible
         ones
       - it may happen that different contacts are mapped on a single RE
         fragment (if each is on a different end), in which case:
          - if no other fragment from this read is mapped, then both are kept
          - otherwise, they are merged into one longer fragment (as if they
            were mapped on the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will be written in a format similar
       to the inputs

    :returns: final number of pairs of interacting fragments, and a dictionary
       with the number of multiple contacts (the keys of the dictionary being
       the number of fragments caught together: 3, 4, 5...)
    """

    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = reads1.next()
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = reads1.next()
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = reads2.next()
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = reads2.next()
    read2 = line2.split('\t', 1)[0]
    if header1 != header2:
        raise Exception('reads seem to be mapped on different chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos / nchunks
    buf = dict([(i, []) for i in xrange(nchunks + 1)])
    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in xrange(nchunks / int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary, then into temporary files;
    # the dictionary is emptied every 1 million entries
    if verbose:
        print ('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        ('  %4d million reads' % (count_dots)) if
                        count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                    stdout.flush()
                count_dots += 1
            for _ in xrange(1000000): # iterate 1 million times, write to files
                # same read id in both lines: put the more upstream one
                # first and store them
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = reads2.next()
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = reads1.next()
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
    write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print '\nFound %d pairs of reads mapping uniquely' % count

    # sort each tmp file according to first element (idx) and write them
    # to output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    #      and also according to strand
    if verbose:
        print 'Sorting each temporary file by genomic coordinate'

    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        out.write(''.join(['\t'.join(l[1:]) for l in sorted(
            [l.split('\t') for l in open(
                path.join(tmp_dir, 'rep_%03d' % (b / int(nchunks**0.5)),
                          'tmp_%05d.tsv' % b))],
            key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()

    if verbose:
        print '\nRemoving temporary files...'
    system('rm -rf ' + tmp_dir)
    return count, multiples
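A call sketch following the docstring above (file names are hypothetical):

# intersect the two per-end read files produced by parse_sam
count, multiples = get_intersection(
    'sample_r1_parsed.tsv', 'sample_r2_parsed.tsv',
    'sample_r1-r2_intersection.tsv', verbose=True)
print 'kept %d read pairs' % count
for nfrag in multiples:
    print '  %d reads involved %d fragments' % (multiples[nfrag], nfrag)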
Exemple #41
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             min_count=2500,
             normalization='Vanilla',
             mappability=None,
             n_rsites=None,
             cg_content=None,
             sigma=2,
             ncpus=8,
             factor=1,
             outdir='.',
             extra_out='',
             only_valid=False,
             normalize_only=False,
             max_njobs=100,
             min_perc=None,
             max_perc=None,
             extra_bads=None):
    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])

    start_bin = 0
    end_bin = len(bins)
    total = len(bins)

    regs = []
    begs = []
    ends = []
    njobs = min(total, max_njobs) + 1
    nbins = total / njobs + 1
    for i in range(start_bin, end_bin, nbins):
        if i + nbins > end_bin:  # make sure that we stop
            nbins = end_bin - i
        try:
            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            try:
                (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
            except IndexError:
                break
        if crm1 != crm2:
            end1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(end1 * resolution + resolution)  # last nt included
            ends.append(end2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(end2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included

    # print '\n'.join(['%s %d %d' % (a, b, c) for a, b, c in zip(regs, begs, ends)])
    printime('  - Parsing BAM (%d chunks)' % (len(regs)))
    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
    pool = mu.Pool(ncpus)
    procs = []
    read_bam_frag = read_bam_frag_valid if only_valid else read_bam_frag_filter
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        procs.append(
            pool.apply_async(read_bam_frag,
                             args=(
                                 inbam,
                                 filter_exclude,
                                 bins,
                                 bins_dict,
                                 resolution,
                                 outdir,
                                 extra_out,
                                 region,
                                 start,
                                 end,
                             )))
    pool.close()
    print_progress(procs)
    pool.join()
    ## COLLECT RESULTS
    cisprc = {}
    printime('  - Collecting cis and total interactions per bin (%d chunks)' %
             (len(regs)))
    stdout.write('     ')
    for countbin, (region, start, end) in enumerate(zip(regs, begs, ends)):
        if not countbin % 10 and countbin:
            stdout.write(' ')
        if not countbin % 50 and countbin:
            stdout.write(' %9s\n     ' % ('%s/%s' % (countbin, len(regs))))
        stdout.write('.')
        stdout.flush()

        fname = path.join(
            outdir,
            'tmp_bins_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        tmp_cisprc = load(open(fname))
        system('rm -f %s' % fname)
        cisprc.update(tmp_cisprc)
    stdout.write('\n')

    printime('  - Removing columns with too few or too many interactions')
    if len(bamfile.references) == 1 and min_count is None:
        raise Exception("ERROR: only one chromosome can't filter by "
                        "cis-percentage, set min_count instead")
    elif min_count is None and len(bamfile.references) > 1:
        badcol = filter_by_cis_percentage(
            cisprc,
            sigma=sigma,
            verbose=True,
            min_perc=min_perc,
            max_perc=max_perc,
            size=total,
            savefig=path.join(
                outdir, 'filtered_bins_%s_%s.png' %
                (nicer(resolution).replace(' ', ''), extra_out)))
    else:
        print('      -> too few interactions defined as less than %9d '
              'interactions' % min_count)
        badcol = {}
        countL = 0
        countZ = 0
        for c in xrange(total):
            if cisprc.get(c, [0, 0])[1] < min_count:
                badcol[c] = cisprc.get(c, [0, 0])[1]
                countL += 1
                if not c in cisprc:
                    countZ += 1
        print '      -> removed %d columns (%d/%d null/high counts) of %d (%.1f%%)' % (
            len(badcol), countZ, countL, total,
            float(len(badcol)) / total * 100)

    # no mappability will result in NaNs, better to filter out these columns
    if mappability:
        badcol.update((i, True) for i, m in enumerate(mappability) if not m)

    # add manually columns to bad columns
    if extra_bads:
        removed_manually = 0
        for ebc in extra_bads:
            c, ebc = ebc.split(':')
            b, e = map(int, ebc.split('-'))
            b = b / resolution + section_pos[c][0]
            e = e / resolution + section_pos[c][0]
            removed_manually += (e - b)
            badcol.update(dict((p, 'manual') for p in xrange(b, e)))
        printime('  - Removed %d columns manually.' % removed_manually)
    raw_cisprc = sum(
        float(cisprc[k][0]) / cisprc[k][1]
        for k in cisprc if not k in badcol) / (len(cisprc) - len(badcol))

    printime('  - Rescaling sum of interactions per bin')
    size = len(bins)
    biases = [
        float('nan') if k in badcol else cisprc.get(k, [0, 1.])[1]
        for k in xrange(size)
    ]

    if normalization == 'Vanilla':
        printime('  - Vanilla normalization')
        mean_col = nanmean(biases)
        biases = dict(
            (k, b / mean_col * mean_col**0.5) for k, b in enumerate(biases))
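        # note: b / mean_col * mean_col**0.5 == b / mean_col**0.5, i.e. each
        # bias is the bin total divided by the square root of the genome-wide mean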
    elif normalization == 'oneD':
        printime('  - oneD normalization')
        if len(set((len(biases), len(mappability),
                    len(n_rsites), len(cg_content)))) > 1:
            print "biases", "mappability", "n_rsites", "cg_content"
            print len(biases), len(mappability), len(n_rsites), len(cg_content)
            raise Exception('Error: not all arrays have the same size')
        tmp_oneD = path.join(outdir, 'tmp_oneD_%s' % (extra_out))
        mkdir(tmp_oneD)
        biases = oneD(tmp_dir=tmp_oneD,
                      tot=biases,
                      map=mappability,
                      res=n_rsites,
                      cg=cg_content)
        biases = dict((k, b) for k, b in enumerate(biases))
        rmtree(tmp_oneD)
    else:
        raise NotImplementedError('ERROR: method %s not implemented' %
                                  normalization)

    # collect subset-matrices and write genomic one
    # out = open(os.path.join(outdir,
    #                         'hicdata_%s.abc' % (nicer(resolution).replace(' ', ''))), 'w')
    printime('  - Getting sum of normalized bins')
    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(
            outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(pool.apply_async(sum_nrm_matrix, args=(
            fname,
            biases,
        )))
    pool.close()
    print_progress(procs)
    pool.join()

    # to correct biases
    sumnrm = sum(p.get() for p in procs)

    target = (sumnrm / float(size * size * factor))**0.5
    biases = dict([(b, biases[b] * target) for b in biases])
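    # with this rescaling, the matrix normalized by the final biases sums to
    # size * size * factor, i.e. the average normalized cell equals factor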

    if not normalize_only:
        printime('  - Computing Cis percentage')
        # Calculate Cis percentage

        pool = mu.Pool(ncpus)
        procs = []
        for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
            fname = path.join(
                outdir,
                'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
            procs.append(
                pool.apply_async(get_cis_perc,
                                 args=(fname, biases, badcol, bins)))
        pool.close()
        print_progress(procs)
        pool.join()

        # collect results
        cis = total = 0
        for proc in procs:
            c, t = proc.get()
            cis += c
            total += t
        norm_cisprc = float(cis) / total
        print '    * Cis-percentage: %.1f%%' % (norm_cisprc * 100)
    else:
        norm_cisprc = 0.

    printime('  - Rescaling decay')
    # normalize decay by the size of each diagonal, and by the Vanilla
    # correction (all cells must still be equal to 1 on average)

    pool = mu.Pool(ncpus)
    procs = []
    for i, (region, start, end) in enumerate(zip(regs, begs, ends)):
        fname = path.join(
            outdir, 'tmp_%s:%d-%d_%s.pickle' % (region, start, end, extra_out))
        procs.append(
            pool.apply_async(sum_dec_matrix,
                             args=(fname, biases, badcol, bins)))
    pool.close()
    print_progress(procs)
    pool.join()

    # collect results
    nrmdec = {}
    rawdec = {}
    for proc in procs:
        tmpnrm, tmpraw = proc.get()
        for c, d in tmpnrm.iteritems():
            for k, v in d.iteritems():
                try:
                    nrmdec[c][k] += v
                    rawdec[c][k] += tmpraw[c][k]
                except KeyError:
                    try:
                        nrmdec[c][k] = v
                        rawdec[c][k] = tmpraw[c][k]
                    except KeyError:
                        nrmdec[c] = {k: v}
                        rawdec[c] = {k: tmpraw[c][k]}
    # count the number of cells per diagonal
    # TODO: parallelize
    # find largest chromosome
    len_crms = dict(
        (c, section_pos[c][1] - section_pos[c][0]) for c in section_pos)
    # initialize dictionary
    ndiags = dict(
        (c, dict((k, 0) for k in xrange(len_crms[c]))) for c in sections)
    for crm in section_pos:
        beg_chr, end_chr = section_pos[crm][0], section_pos[crm][1]
        chr_size = end_chr - beg_chr
        thesebads = [b for b in badcol if beg_chr <= b <= end_chr]
        for dist in xrange(1, chr_size):
            ndiags[crm][dist] += chr_size - dist
            # from this we remove cells hit by bad columns: a bad column at
            # position b masks cell (b, b + dist) when b + dist is still inside
            # the chromosome, and cell (b - dist, b) when b - dist is
            bad_diag = set()  # a set, as 2 bad rows can point to the same cell
            maxp = end_chr - dist
            minp = beg_chr + dist
            for b in thesebads:
                if b < maxp:  # not inclusive!!
                    bad_diag.add(b)
                if b >= minp:
                    bad_diag.add(b - dist)
            ndiags[crm][dist] -= len(bad_diag)
        # different behavior for longest diagonal:
        ndiags[crm][0] += chr_size - sum(beg_chr <= b < end_chr
                                         for b in thesebads)

    # normalize sum per diagonal by total number of cells in diagonal
    signal_to_noise = 0.05
    min_n = signal_to_noise**-2.  # equals 400 when default
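    # pooling diagonals until they hold min_n raw counts keeps the relative
    # (Poisson) error of each decay estimate around signal_to_noise:
    # 1 / sqrt(400) = 0.05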
    for crm in sections:
        if not crm in nrmdec:
            nrmdec[crm] = {}
            rawdec[crm] = {}
        tmpdec = 0  # cumulated normalized counts over the pooled diagonals
        tmpsum = 0  # cumulated raw counts over the pooled diagonals
        ndiag = 0
        val = 0
        previous = []  # diagonals pooled so far, until min_n raw counts are reached
        for k in ndiags[crm]:
            tmpdec += nrmdec[crm].get(k, 0.)
            tmpsum += rawdec[crm].get(k, 0.)
            previous.append(k)
            if tmpsum > min_n:
                ndiag = sum(ndiags[crm][k] for k in previous)
                val = tmpdec  # backup of tmpdec kept for last ones outside the loop
                try:
                    ratio = val / ndiag
                    for k in previous:
                        nrmdec[crm][k] = ratio
                except ZeroDivisionError:  # all columns at this distance are "bad"
                    pass
                previous = []
                tmpdec = 0
                tmpsum = 0
        # last ones we average with previous result
        if len(previous) == len(ndiags[crm]):
            nrmdec[crm] = {}
        elif tmpsum < min_n:
            ndiag += sum(ndiags[crm][k] for k in previous)
            val += tmpdec
            try:
                ratio = val / ndiag
                for k in previous:
                    nrmdec[crm][k] = ratio
            except ZeroDivisionError:  # all columns at this distance are "bad"
                pass
    return biases, nrmdec, badcol, raw_cisprc, norm_cisprc
Exemple #42
0
def read_bam(inbam,
             filter_exclude,
             resolution,
             ncpus=8,
             region1=None,
             start1=None,
             end1=None,
             region2=None,
             start2=None,
             end2=None,
             nchunks=100,
             tmpdir='.',
             verbose=True,
             normalize=False,
             max_size=None):

    bamfile = AlignmentFile(inbam, 'rb')
    sections = OrderedDict(
        zip(bamfile.references, [x / resolution + 1 for x in bamfile.lengths]))
    # get chromosomes and genome sizes
    total = 0
    section_pos = dict()
    for crm in sections:
        section_pos[crm] = (total, total + sections[crm])
        total += sections[crm]

    # define genomic bins
    bins = []
    for crm in sections:
        len_crm = sections[crm]
        bins.extend([(crm, i) for i in xrange(len_crm)])
    if not bins:
        raise Exception('ERROR: Chromosome %s smaller than bin size\n' % (crm))

    # define start, end position of region to grab
    start_bin1 = 0
    end_bin1 = len(bins) + 1
    regions = bamfile.references
    if region1:
        regions = [region1]
        if region2:
            regions.append(region2)
    else:
        total = len(bins)
        if start1 is not None or end1:
            raise Exception('ERROR: Cannot use start/end1 without region')

    if start1 is not None:
        start_bin1 = section_pos[region1][0] + start1 / resolution
    else:
        if region1:
            start_bin1 = section_pos[region1][0]
        else:
            start_bin1 = 0
        start1 = 0
    if end1 is not None:
        end_bin1 = section_pos[region1][0] + end1 / resolution
    else:
        if region1:
            end_bin1 = section_pos[region1][1]
            end1 = sections[region1] * resolution
        else:
            end_bin1 = total
            end1 = total * resolution

    # define chunks, using at most 100 sub-divisions of region1
    total = end_bin1 - start_bin1
    regs = []
    begs = []
    ends = []
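    # regs / begs / ends are parallel lists: one (chromosome, start, end)
    # genomic slice per job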

    njobs = min(total, nchunks) + 1

    nbins = total / njobs + 1
    for i in xrange(start_bin1, end_bin1, nbins):
        if i + nbins > end_bin1:  # make sure that we stop at the right place
            nbins = end_bin1 - i
        try:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
        except IndexError:
            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
        if crm1 != crm2:
            fin1 = sections[crm1]
            beg2 = 0
            regs.append(crm1)
            regs.append(crm2)
            begs.append(beg1 * resolution)
            begs.append(beg2 * resolution)
            ends.append(fin1 * resolution + resolution)  # last nt included
            ends.append(fin2 * resolution + resolution -
                        1)  # last nt not included (overlap with next window)
        else:
            regs.append(crm1)
            begs.append(beg1 * resolution)
            ends.append(fin2 * resolution + resolution - 1)
    ends[-1] += 1  # last nucleotide included
    # reduce dictionaries
    all_bins = []
    seenbins = set()
    for crm in regions:
        beg_crm = section_pos[crm][0]
        if region1:
            start = start_bin1 - beg_crm
            end = end_bin1 - beg_crm
        else:
            start = 0
            end = section_pos[crm][1] - section_pos[crm][0]
        all_bins.extend([(crm, i) for i in xrange(start, end)
                         if not (crm, i) in seenbins])
        seenbins = set(all_bins)
    del (seenbins)

    bins_dict1 = dict((j, i) for i, j in enumerate(all_bins))
    if region2:
        if not region2 in section_pos:
            raise Exception('ERROR: chromosome %s not found' % region2)
        bins = []
        beg_crm = section_pos[region2][0]
        if start2 is not None:
            start_bin2 = section_pos[region2][0] + start2 / resolution
        else:
            start_bin2 = section_pos[region2][0]
            start2 = 0
        if end2 is not None:
            end_bin2 = section_pos[region2][0] + end2 / resolution
        else:
            end_bin2 = section_pos[region2][1]
            end2 = sections[region2] * resolution
        start = start_bin2 - beg_crm
        end = end_bin2 - beg_crm
        bins = [(region2, i) for i in xrange(start, end)]
        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
    else:
        start_bin2 = start_bin1
        end_bin2 = end_bin1
        bins_dict2 = bins_dict1

    size1 = end_bin1 - start_bin1
    size2 = end_bin2 - start_bin2
    if verbose:
        printime('\n  (Matrix size %dx%d)' % (size1, size2))
    if max_size and max_size < size1 * size2:
        raise Exception(('ERROR: matrix too large ({0}x{1}) should be at most '
                         '{2}x{2}').format(size1, size2, int(max_size**0.5)))

    pool = mu.Pool(ncpus)
    # create random hash associated to the run:
    rand_hash = "%016x" % getrandbits(64)

    ## RUN!
    if verbose:
        printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
    mkdir(os.path.join(tmpdir, '_tmp_%s' % (rand_hash)))
    # empty all_bins array if we are not going to normalize
    if not normalize:
        all_bins = []
    procs = []
    for i, (region, b, e) in enumerate(zip(regs, begs, ends)):
        if ncpus == 1:
            _read_bam_frag(
                inbam,
                filter_exclude,
                all_bins,
                bins_dict1,
                bins_dict2,
                rand_hash,
                resolution,
                tmpdir,
                region,
                b,
                e,
            )
        else:
            procs.append(
                pool.apply_async(_read_bam_frag,
                                 args=(
                                     inbam,
                                     filter_exclude,
                                     all_bins,
                                     bins_dict1,
                                     bins_dict2,
                                     rand_hash,
                                     resolution,
                                     tmpdir,
                                     region,
                                     b,
                                     e,
                                 )))
    pool.close()
    if verbose:
        print_progress(procs)
    pool.join()
    bin_coords = start_bin1, end_bin1, start_bin2, end_bin2
    chunks = regs, begs, ends
    return regions, rand_hash, bin_coords, chunks
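
A minimal usage sketch for the read_bam above (not from the original source: the
BAM path, the filter_exclude mask and the region are hypothetical values):

regions, rand_hash, bin_coords, chunks = read_bam(
    '/data/sample.bam',   # coordinate-sorted and indexed BAM file
    filter_exclude=786,   # bit-mask of read filters to skip (hypothetical value)
    resolution=100000,    # bin size in nucleotides
    ncpus=4,
    region1='chr3', start1=30000000, end1=35000000,
    tmpdir='/tmp')
start_bin1, end_bin1, start_bin2, end_bin2 = bin_coords
regs, begs, ends = chunks  # chunk descriptors, as consumed by the workers above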
Exemple #43
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
                opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
                opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    print 'loading first sample', mreads1
    hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

    print 'loading second sample', mreads2
    hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

    if opts.norm and biases1:
        bad_co1 = path.join(opts.workdir1, bad_co1)
        print 'loading bad columns from first sample', bad_co1
        hic_data1.bads = dict((int(l.strip()), True) for l in open(bad_co1))
        biases1 = path.join(opts.workdir1, biases1)
        print 'loading biases from first sample', biases1
        hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases1))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')
    if opts.norm and biases2:
        bad_co2 = path.join(opts.workdir2, bad_co2)
        print 'loading bad columns from second sample', bad_co2
        hic_data2.bads = dict((int(l.strip()), True) for l in open(bad_co2))
        biases2 = path.join(opts.workdir2, biases2)
        print 'loading biases from second sample', biases2
        hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases2))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        
    # if opts.norm:
        # has bias file

    if not opts.skip_comparison:
        print 'correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat, get_bads=True)
        print 'correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = None
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads', 'valid_r1-r2_intersection_%s.tsv' % (
        param_hash))

    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), len(hic_data1), nreads,
                eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
                biases1, bad_co1, biases2, bad_co2, launch_time, finish_time)
Exemple #44
0
    def test_07_big_matrix(self):
        inbam = os_join(TEST_PATH, 'data', 'fake.bam')
        biases = os_join(TEST_PATH, 'data', 'biases3.pickle')
        outfile = os_join(TEST_PATH, 'lele', 'lala.tsv')
        tmppath = os_join(TEST_PATH, 'lele')
        mkdir(tmppath)

        nheader = write_big_matrix(inbam,
                                   RESOLUTION,
                                   biases,
                                   outfile,
                                   nchunks=100,
                                   wanted_chrom=None,
                                   wanted_pos1=None,
                                   wanted_pos2=None,
                                   dry_run=False,
                                   ncpus=8,
                                   tmpdir=tmppath,
                                   clean=True,
                                   verbose=False,
                                   square_size=100,
                                   waffle_radii=WINDOWS_SPAN,
                                   metric='loop')

        rand_hash = "%016x" % getrandbits(64)
        tmpdir = os_join(tmppath, '_tmp_%s' % (rand_hash))
        mkdir(tmpdir)

        # sort all files so that each pair of peaks to extract is read only once
        sort_BAMtsv(nheader, outfile, tmpdir)

        system('rm -rf {}'.format(tmpdir))
        fh = open(outfile)
        self.assertEqual(187515, sum(1 for l in fh))
        fh.close()
        with open(outfile) as fh:
            for line in fh:
                if line.startswith('525\t723\t'):
                    break
        b, e, r, p, c, vals = line.split()
        self.assertEqual(0.139, float(r))
        self.assertEqual(0.216, float(p))
        self.assertEqual(0.981, float(c))
        self.assertEqual([
            0.903, 0.889, 1.401, 0.411, 0.814, 0.417, 0.856, 0.454, 0.8, 2.171,
            0.892, 4.214, 0.433, 0, 0.402, 1.288, 0.455, 0.869, 0.852, 0.42,
            0.919, 0, 0.842, 1.579, 0.405, 1.788, 1.706, 1.164, 1.265, 1.328,
            1.281, 1.267, 1.249, 0, 0.431, 0.428, 1.116, 1.832, 1.698, 1.179,
            0.405, 1.996, 1.639, 0.828, 0, 0.749, 0.365, 0.383, 0.391, 1.161,
            0.795, 1.224, 0.866, 0.786, 1.932, 1.142, 1.186, 0.732, 0.798,
            0.393, 0.421, 1.786, 0.852, 1.366, 0.39, 0.819, 2.621, 0.741,
            1.611, 0.413, 1.371, 0.436, 1.051, 0.345, 0, 1.165, 1.14, 0.749,
            1.272, 0.45, 1.789
        ], [float(v) for v in vals.split(',')])
        with open(outfile) as fh:
            for line in fh:
                if line.startswith('854\t988\t'):
                    break
        b, e, r, p, c, vals = line.split()
        self.assertEqual(0.224, float(r))
        self.assertEqual(0.0448, float(p))
        self.assertEqual(1.394, float(c))
        self.assertEqual([
            2.123, 1.106, 0.585, 0.572, 1.636, 1.681, 0.517, 0.534, 0.556,
            1.521, 1.057, 1.059, 1.093, 0, 0, 1.04, 1.02, 1.062, 1.003, 2.09,
            2.093, 2.047, 1.03, 1.058, 1.028, 2.123, 0, 2.441, 1.03, 2.062,
            1.008, 1.441, 0.521, 1.013, 0, 1.088, 3.036, 1.055, 1.069, 1.045,
            0.996, 0.512, 3.148, 2.167, 2.256, 0.504, 2.103, 0, 0, 0.993, 1.02,
            1.486, 1.621, 0.562, 0.981, 2.044, 1.024, 0.5, 1.447, 1.983, 0.963,
            1.988, 1.639, 1.007, 1.59, 0, 0.519, 2.473, 2.057, 1.498, 0.516,
            0.537, 0.508, 1.059, 0, 0.524, 0.499, 0.513, 0.504, 0.521, 0
        ], [float(v) for v in vals.split(',')])
        system('rm -rf {}'.format(tmppath))
Exemple #45
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes to the expected number of bins
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'w')

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Exemple #46
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        print 'Comparison'
        print ' - loading first sample', mreads1
        hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

        print ' - loading second sample', mreads2
        hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

        if opts.norm and biases1:
            bad_co1 = path.join(opts.workdir1, bad_co1)
            print ' - loading bad columns from first sample', bad_co1
            hic_data1.bads = dict(
                (int(l.strip()), True) for l in open(bad_co1))
            biases1 = path.join(opts.workdir1, biases1)
            print ' - loading biases from first sample', biases1
            hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases1))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        if opts.norm and biases2:
            bad_co2 = path.join(opts.workdir2, bad_co2)
            print ' - loading bad columns from second sample', bad_co2
            hic_data2.bads = dict(
                (int(l.strip()), True) for l in open(bad_co2))
            biases2 = path.join(opts.workdir2, biases2)
            print ' - loading biases from second sample', biases2
            hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases2))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        decay_corr_dat = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        hic_data1 = {}
        hic_data2 = {}
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:
    # has bias file

    if not opts.skip_comparison:
        print '  => correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1,
                                           hic_data2,
                                           normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat,
                                           get_bads=True)
        print '  => correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1,
                                          hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True,
                                          nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (param_hash))

    print '\nMerging...'
    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), len(hic_data1), nreads, eigen_corr_dat,
               eigen_corr_fig, outbed, corr, eig_corr, biases1, bad_co1,
               biases2, bad_co2, launch_time, finish_time)
    print '\n\nDone.'
Exemple #47
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes to the expected number of bins
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) // opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) // opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) // opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # pad mappability at the end if its size is close to that of gc_content
        if (len(mappability) < len(gc_content)
                and len(mappability) / len(gc_content) > 0.95):
            mappability += [float('nan')] * (len(gc_content) - len(mappability))

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
        # compute GC content ~30 sec
        # TODO: read from DB
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path, 
        cis_limit=opts.cis_limit, trans_limit=opts.trans_limit, 
        min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Exemple #48
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        opts.figsize = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v !='raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1         = opts.coord1
    coord2         = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if region1:
        if not opts.quiet:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
    else:
        if not opts.quiet:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references,
                                   [x for x in bamfile.lengths]))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1 , p + opts.reso) for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                            ends[r] if r < len(ends) and ends[r] else sections[reg],
                                            opts.reso))
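                # lazy generator yielding one (chromosome, start, end) label per
                # matrix row; consumed below with row_names.next() as rows are written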
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) +
                                        '\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:,bad1] = 1
                for bad2 in bads2:
                    m[bad2,:] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Exemple #49
0
def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem',
                 r_enz=None, frag_map=True, min_seq_len=15, windows=None,
                 add_site=True, clean=False, get_nread=False,
                 mapper_binary=None, mapper_params=None, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used (iterative mapping, also
    suited to experiments performed without one, like Micro-C), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where mapped reads are stored, in
       MAP format.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two-step mapping: first full-length reads are mapped,
       then the remaining unmapped reads are divided into restriction-enzyme
       fragments and each fragment is mapped.
    :param True add_site: when splitting the sequence by the ligation sites
       found, removes the ligation site and puts back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for the beginning and end of the
       mapping. This parameter allows classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A single window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False get_nread: returns a list of tuples where each element
       contains a path and the number of reads processed
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
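    # prepend '_' to the suffix only when one was given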
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=is_fastq(input_reads),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print '   x removing original input %s' % input_reads
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = curr_map + '_full_%s-%s%s.map' % (beg, end, suffix)
        if end:
            print 'Mapping reads in window %s-%s%s...' % (beg, end, suffix)
        else:
            print 'Mapping full reads...', curr_map

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path, curr_map, out_map_path,
                             gem_binary=(mapper_binary if mapper_binary else 'gem-mapper'),
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print 'Parsing result...'
                _gem_filter(out_map_path,
                            curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix),
                            os.path.join(out_map_dir,
                                         base_name + '_full_%s-%s%s.map' % (
                                             beg, end, suffix)))
            elif mapper == 'bowtie2':
                _bowtie2_mapping(mapper_index_path, curr_map, out_map_path,
                                 bowtie2_binary=(mapper_binary if mapper_binary else 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print 'Parsing result...'
                _bowtie2_filter(out_map_path, curr_map,
                                curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix),
                                os.path.join(out_map_dir,
                                             base_name + '_full_%s-%s%s.map' % (beg, end, suffix)))
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print '   x removing %s input %s' % (mapper.upper(),curr_map)
                os.system('rm -f %s' % (curr_map))
                print '   x removing map %s' % out_map_path
                os.system('rm -f %s' % (out_map_path))
            # for next round, we will use remaining unmapped reads
            input_reads = curr_map + '_filt_%s-%s%s.map' % (beg, end, suffix)
        outfiles.append(
            (os.path.join(out_map_dir,
                          base_name + '_full_%s-%s%s.map' % (beg, end, suffix)),
             counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print '   x removing pre-%s input %s' % (mapper.upper(),input_reads)
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        out_map_path = frag_map + '_frag_%s-%s%s.map' % (beg, end, suffix)
        if not skip:
            if mapper == 'gem':
                print 'Mapping fragments of remaining reads...'
                _gem_mapping(mapper_index_path, frag_map, out_map_path,
                             gem_binary=(mapper_binary if mapper_binary else 'gem-mapper'),
                             **kwargs)
                print 'Parsing result...'
                _gem_filter(out_map_path, curr_map + '_fail%s.map' % (suffix),
                            os.path.join(out_map_dir,
                                         base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)))
            elif mapper == 'bowtie2':
                print 'Mapping fragments of remaining reads...'
                _bowtie2_mapping(mapper_index_path, frag_map, out_map_path,
                                 bowtie2_binary=(mapper_binary if mapper_binary else 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                print 'Parsing result...'
                _bowtie2_filter(out_map_path, frag_map,
                                curr_map + '_fail%s.map' % (suffix),
                                os.path.join(out_map_dir,
                                             base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)))
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print '   x removing %s input %s' % (mapper.upper(), frag_map)
            os.system('rm -f %s' % (frag_map))
            print '   x removing failed-to-map reads ' + curr_map + '_fail%s.map' % (suffix)
            os.system('rm -f %s' % (curr_map + '_fail%s.map' % (suffix)))
            print '   x removing tmp mapped %s' % out_map_path
            os.system('rm -f %s' % (out_map_path))
        outfiles.append((os.path.join(out_map_dir,
                                      base_name + '_frag_%s-%s%s.map' % (beg, end, suffix)),
                         counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
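
A usage sketch for the iterative mapping function above. It appears to be
TADbit's full_mapping (pytadbit.mapping.full_mapper), but the exact signature
is approximated here and every path, enzyme and option value is a placeholder:

# minimal sketch, assuming the function above is pytadbit's full_mapping;
# paths, enzyme and options below are illustrative only
from pytadbit.mapping.full_mapper import full_mapping

maps = full_mapping('genome_index.gem',          # GEM index of the genome
                    'reads_1.fastq',             # FASTQ for one read end
                    'maps_r1/',                  # output directory for .map files
                    mapper='gem',
                    r_enz='MboI',                # needed for the fragment pass
                    frag_map=True,               # remap unmapped reads as RE fragments
                    clean=True,
                    temp_dir='/tmp/r1')
# returns the list of .map files written for the full-read and fragment passes
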
Exemple #50
0
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None
        except TypeError:  # Py3
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None
        except TypeError:  # Py3
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}

        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print('         - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}

        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    if not opts.skip_merge:
        outbam = path.join(opts.workdir, '03_filtered_reads',
                           'intersection_%s.bam' % (param_hash))
        printime('  - Merging experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
        printime('  - Indexing new BAM file')
        # check samtools version number and modify command line
        version = LooseVersion([l.split()[1]
                                for l in Popen(samtools, stderr=PIPE,
                                               universal_newlines=True).communicate()[1].split('\n')
                                if 'Version' in l][0])
        if version >= LooseVersion('1.3.1'):
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(list(bads.keys())), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
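
The samtools version probe above can be lifted out as-is; a minimal sketch
(same Popen/LooseVersion approach, the helper name is ours):

# samtools run without arguments prints a "Version: x.y.z" line on stderr
from distutils.version import LooseVersion
from subprocess import Popen, PIPE

def samtools_version(samtools='samtools'):
    _, err = Popen(samtools, stderr=PIPE,
                   universal_newlines=True).communicate()
    line = [l for l in err.split('\n') if 'Version' in l][0]
    return LooseVersion(line.split()[1])

# threaded indexing (samtools index -@) is only available from 1.3.1 onwards,
# which is exactly what the merge code above checks
if samtools_version() >= LooseVersion('1.3.1'):
    print('threaded indexing available')
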
Exemple #51
0
def check_options(opts):
    if opts.cfg:
        get_options_from_cfg(opts.cfg, opts)

    opts.gem_binary = which(opts.gem_binary)
    if not opts.gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download GEM-binaries-Linux-x86_64-core_i3 if '
                        'you have a recent computer, '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # check RE name
    try:
        _ = RESTRICTION_ENZYMES[opts.renz]
    except KeyError:
        print ('\n\nERROR: restriction enzyme not found. Use one of:\n\n'
               + ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
        raise KeyError()
    except AttributeError:
        pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print ('WARNING: no previous output files found, nothing to skip...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)
    
    # create tmp directory
    if not opts.tmp:
        opts.tmp = opts.workdir + '_tmp_r%d' % opts.read

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass
        
    mkdir(opts.workdir)
    # write log
    # if opts.mapping_only:
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(opts.fastq, opts.read)
    # else:
    #     log_format = '[DEFAULT]   %(message)s'

    # reset logging
    logging.getLogger().handlers = []

    try:
        print 'Writing log to ' + path.join(opts.workdir, 'process.log')
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        vlog = open(vlog_path, 'w')
        vlog.write(dependencies)
        vlog.close()

    # check GEM mapper extra options
    if opts.gem_param:
        opts.gem_param = dict([o.split(':') for o in opts.gem_param])
    else:
        opts.gem_param = {}
    gem_valid_option = set(["granularity", "q", "quality-format",
                            "gem-quality-threshold", "mismatch-alphabet",
                            "m", "e", "min-matched-bases",
                            "max-big-indel-length", "s", "strata-after-best",
                            "fast-mapping", "unique-mapping", "d", "D",
                            "allow-incomplete-strata", "max-decoded-matches",
                            "min-decoded-strata", "p", "paired-end-alignment",
                            "b", "map-both-ends", "min-insert-size",
                            "max-insert-size", "E", "max-extendable-matches",
                            "max-extensions-per-match", "unique-pairing"])
    for k in opts.gem_param:
        if k not in gem_valid_option:
            raise NotImplementedError(('ERROR: option "%s" is not a valid GEM '
                                       'option or is not supported by this tool.') % k)
    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        exit('WARNING: exact same job already computed, see JOBs table above')
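
The gem_param handling above expects "key:value" strings; a small illustration
with made-up values (note that a value containing ':' would break the single
split used here):

raw = ['q:offset-33', 'm:0.04', 'unique-mapping:true']   # hypothetical CLI input
gem_param = dict(o.split(':') for o in raw)
# -> {'q': 'offset-33', 'm': '0.04', 'unique-mapping': 'true'}
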
Exemple #52
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c)
                              for c in (bam -
                                        fas)]) if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception(
                "ERROR: chromosomes in FASTA different the ones in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        fh = open(opts.mappability)
        mappability = dict((c, []) for c in refs)
        line = fh.next()
        crmM, begM, endM, val = line.split()
        crm = crmM
        if crmM not in mappability:
            print('     skipping %s' % crmM)
            while crmM not in mappability:
                line = fh.next()
                crmM, begM, endM, val = line.split()
                crm = crmM
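        # walk the (sorted) bedGraph once, accumulating an overlap-weighted
        # mappability value for each fixed-size bin of each chromosome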
        while any(not mappability[c] for c in mappability):
            for begB in xrange(0, len(genome[crmM]), opts.reso):
                endB = begB + opts.reso
                tmp = 0
                try:
                    while True:
                        crmM, begM, endM, val = line.split()
                        if crm != crmM:
                            try:
                                while crmM not in refs:
                                    line = fh.next()
                                    crmM, _ = line.split('\t', 1)
                            except StopIteration:
                                pass
                            break
                        begM = int(begM)
                        endM = int(endM)
                        if endM > endB:
                            weight = endB - begM
                            if weight >= 0:
                                tmp += weight * float(val)
                            break
                        weight = endM - (begM if begM > begB else begB)
                        if weight < 0:
                            break
                        tmp += weight * float(val)
                        line = fh.next()
                except StopIteration:
                    pass
                mappability[crm].append(tmp / opts.reso)
                crm = crmM
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' %
        (nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')

    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
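
The bedGraph-to-bin loop above reduces to a small overlap-weighted binning
kernel. A simplified sketch, assuming a single chromosome and sorted,
non-overlapping (beg, end, value) intervals (this is not TADbit's
implementation, just the idea in isolation):

def bin_track(intervals, chrom_len, reso):
    # average value per base over fixed-size bins of width `reso`
    bins = [0.0] * (chrom_len // reso + 1)
    for beg, end, val in intervals:
        while beg < end:
            b = beg // reso
            step = min(end, (b + 1) * reso)  # stop at bin border or interval end
            bins[b] += (step - beg) * val
            beg = step
    return [v / reso for v in bins]

# bin_track([(0, 150, 1.0), (150, 300, 0.5)], 300, 100) -> [1.0, 0.75, 0.5, 0.0]
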
Exemple #53
0
def main():
    opts = get_options()
    inbam = opts.inbam
    resolution = opts.reso
    filter_exclude = opts.filter
    ncpus = opts.cpus
    if opts.biases:
        biases = load(open(opts.biases, 'rb'))
    else:
        biases = {}
    outdir = opts.outdir
    coord1 = opts.coord1
    coord2 = opts.coord2

    if biases and biases['resolution'] != resolution:
        raise Exception(
            'ERROR: different resolution in bias file (you want %d,'
            ' there is %d).\n' % (resolution, biases['resolution']))
    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    mkdir(outdir)
    if region1:
        sys.stdout.write('\nExtraction of %s' % (region1))
        if start1:
            sys.stdout.write(':%s-%s' % (start1, end1))
        else:
            sys.stdout.write(' (full chromosome)')
        if region2:
            sys.stdout.write(' intersection with %s' % (region2))
            if start2:
                sys.stdout.write(':%s-%s\n' % (start2, end2))
            else:
                sys.stdout.write(' (full chromosome)\n')
        else:
            sys.stdout.write('\n')
    else:
        sys.stdout.write('\nExtraction of full genome\n')

    read_bam(inbam,
             filter_exclude,
             resolution,
             biases,
             region1=region1,
             start1=start1,
             end1=end1,
             region2=region2,
             start2=start2,
             end2=end2,
             ncpus=ncpus,
             outdir=outdir)

    printime('\nDone.')
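
The coordinate handling in main() can be stated as one helper; a compact
restatement (the helper name is ours, behaviour mirrors the try/except above):

def parse_coord(coord):
    # accepts 'chr3' or 'chr3:100000-200000'
    try:
        crm, pos = coord.split(':')
        beg, end = [int(p) for p in pos.split('-')]
        return crm, beg, end
    except ValueError:
        return coord, None, None

# parse_coord('chr3:1000-2000') -> ('chr3', 1000, 2000)
# parse_coord('chr3')           -> ('chr3', None, None)
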
Exemple #54
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)
    except ValueError:  # every column would be filtered out: redo keeping all
        hic_data.filter_columns(
            perc_zero=100,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    print 'Get biases using ICE...'
    hic_data.normalize_hic(silent=False,
                           max_dev=0.1,
                           iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.pdf' %
        (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data,
        max_diff=10000,
        resolution=opts.reso,
        normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write(
        '\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                   for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            genom_map_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        genom_map_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig,
                savedata=genom_map_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig,
               inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig,
               inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
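
normalize_hic above performs ICE-style matrix balancing. A toy dense-matrix
sketch of the idea (not TADbit's implementation, which works on its own data
structures and exposes max_dev/iterations/factor differently):

import numpy as np

def ice_toy(matrix, max_dev=0.1, max_iter=100):
    # iteratively rescale rows/columns of a symmetric matrix by their sums
    # until every column sum is within max_dev of the mean
    W = np.array(matrix, dtype=float)
    biases = np.ones(len(W))
    for _ in range(max_iter):
        s = W.sum(axis=0)
        s /= s[s > 0].mean()   # target: every sum equal to the mean
        s[s == 0] = 1.         # leave empty (bad) columns untouched
        biases *= s
        W /= np.outer(s, s)
        if np.abs(s - 1).max() < max_dev:
            break
    return W, biases
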
Exemple #55
0
def get_intersection(fname1, fname2, out_path, verbose=False, compress=False):
    """
    Merges the two files corresponding to each reads sides. Reads found in both
       files are merged and written in an output file.

    Dealing with multiple contacts:
       - a pairwise contact is created for each possible combnation of the
         multicontacts. The name of the read is extended by '# 1/3' in case
         the reported pairwise contact corresponds to the first of 3 possibles
       - it may happen that different contacts are mapped on a single RE fragment
         (if each are on different end), in which case:
          - if no other fragment from this read are mapped than, both are kept
          - otherwise, they are merged into one longer (as if they were mapped
            in the positive strand)

    :param fname1: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param fname2: path to a tab separated file generated by the function
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    :param out_path: path to an outfile. It will written in a similar format as
       the inputs
    :param False compress: compress (gzip) input files. This is done in the
       background while next input files are parsed.

    :returns: final number of pair of interacting fragments, and a dictionary with
       the number of multiple contacts (keys of the dictionary being the number of
       fragment cought together, can be 3, 4, 5..)
    """

    # Get the headers of the two files
    reads1 = magic_open(fname1)
    line1 = next(reads1)
    header1 = ''
    while line1.startswith('#'):
        if line1.startswith('# CRM'):
            header1 += line1
        line1 = next(reads1)
    read1 = line1.split('\t', 1)[0]

    reads2 = magic_open(fname2)
    line2 = next(reads2)
    header2 = ''
    while line2.startswith('#'):
        if line2.startswith('# CRM'):
            header2 += line2
        line2 = next(reads2)
    read2 = line2.split('\t', 1)[0]
    if header1 != header2:
        raise Exception('input files seem to be mapped onto different '
                        'chromosomes\n')

    # prepare to write read pairs into different files
    # depending on genomic position
    nchunks = 1024
    global CHROM_START
    CHROM_START = {}
    cum_pos = 0
    for line in header1.split('\n'):
        if line.startswith('# CRM'):
            _, _, crm, pos = line.split()
            CHROM_START[crm] = cum_pos
            cum_pos += int(pos)
    lchunk = cum_pos // nchunks
    buf = dict([(i, []) for i in range(nchunks + 1)])
    # prepare temporary directories
    tmp_dir = out_path + '_tmp_files'
    mkdir(tmp_dir)
    for i in range(nchunks // int(nchunks**0.5) + 1):
        mkdir(path.join(tmp_dir, 'rep_%03d' % i))

    # iterate over reads in each of the two input files
    # and store them into a dictionary, then into temporary files
    # (the dictionary is emptied every 1 million entries)
    if verbose:
        print ('Getting intersection of reads 1 and reads 2:')
    count = 0
    count_dots = -1
    multiples = {}
    try:
        while True:
            if verbose:
                if not count_dots % 10:
                    stdout.write(' ')
                if not count_dots % 50:
                    stdout.write('%s\n  ' % (
                        ('  %4d million reads' % (count_dots)) if
                        count_dots else ''))
                if count_dots >= 0:
                    stdout.write('.')
                    stdout.flush()
                count_dots += 1
            for _ in range(1000000): # iterate 1 million times, write to files
                # same read id in both lines: put the more upstream mate
                # first, then store the pair
                if eq_reads(read1, read2):
                    count += 1
                    _process_lines(line1, line2, buf, multiples, lchunk)
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                # if first element of line1 is greater than the one of line2:
                elif gt_reads(read1, read2):
                    line2 = next(reads2)
                    read2 = line2.split('\t', 1)[0]
                else:
                    line1 = next(reads1)
                    read1 = line1.split('\t', 1)[0]
            write_to_files(buf, tmp_dir, nchunks)
    except StopIteration:
        reads1.close()
        reads2.close()
    write_to_files(buf, tmp_dir, nchunks)
    if verbose:
        print('\nFound %d pairs of reads mapping uniquely' % count)

    # compression
    if compress:
        if verbose:
            print('compressing input files')
        procs = [Popen(['gzip', f]) for f in (fname1, fname2)]
    # sort each tmp file according to first element (idx) and write them
    # to output file (without the idx)
    # sort also according to read 2 (to filter duplicates)
    #      and also according to strand
    if verbose:
        print('Sorting each temporary file by genomic coordinate')

    out = open(out_path, 'w')
    out.write(header1)
    for b in buf:
        if verbose:
            stdout.write('\r    %4d/%d sorted files' % (b + 1, len(buf)))
            stdout.flush()
        with open(path.join(tmp_dir, 'rep_%03d' % (b // int(nchunks**0.5)),
                            'tmp_%05d.tsv' % b)) as f_tmp:
            out.write(''.join(['\t'.join(l[1:]) for l in sorted(
                [l.split('\t') for l in f_tmp],
                key=lambda x: (x[0], x[8], x[9], x[6]))]))
    out.close()

    if compress:
        for proc in procs:
            proc.communicate()
        system('rm -rf ' + fname1)
        system('rm -rf ' + fname2)
    if verbose:
        print('\nRemoving temporary files...')
    system('rm -rf ' + tmp_dir)
    return count, multiples
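
The heart of get_intersection is a two-pointer merge over read ids that are
sorted the same way in both inputs; the core idea in isolation (function name
is ours, plain '<' ordering assumed in place of eq_reads/gt_reads):

# minimal sketch of the sorted-merge used above: yields ids present in both
# streams, advancing whichever stream lags behind
def merge_sorted(ids1, ids2):
    it1, it2 = iter(ids1), iter(ids2)
    try:
        r1, r2 = next(it1), next(it2)
        while True:
            if r1 == r2:
                yield r1
                r1, r2 = next(it1), next(it2)
            elif r1 > r2:
                r2 = next(it2)
            else:
                r1 = next(it1)
    except StopIteration:
        return

# list(merge_sorted(['a', 'b', 'd'], ['b', 'c', 'd'])) -> ['b', 'd']

And the multi-contact expansion described in the docstring, as a toy: a read
hitting n fragments yields all n*(n-1)/2 pairwise contacts, each tagged with
its rank (fragment names below are made up):

from itertools import combinations

frags = ['frag_a', 'frag_b', 'frag_c']
pairs = list(combinations(frags, 2))
for i, (f1, f2) in enumerate(pairs, 1):
    print('read_1# %d/%d\t%s\t%s' % (i, len(pairs), f1, f2))
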