Beispiel #1
0
def check_options(opts):
    """
    Validate the parsed command line options of the mapping tool, normalise
    them in place and prepare the working directory (log files, trace
    database).

    The function mutates ``opts``: ``mapper_binary`` is resolved to a full
    path, ``gem_version`` is set, ``renz`` entries equal to ``'NONE'`` are
    replaced by ``None``, ``cpus`` is capped to the number of available CPUs,
    ``windows`` is converted to lists of integers, ``mapper_param`` is
    converted to a list or a dictionary, and ``tmpdb`` (when given) is
    replaced by the path of a temporary copy of the trace database.

    :param opts: options object (attribute access and ``in`` test are used,
       presumably an ``argparse.Namespace`` -- to be confirmed by the caller)

    :raises Exception: if the mapper binary is not found, or if the
       requirements of fast fragment mapping are not met
    :raises KeyError: if a restriction enzyme name is unknown
    :raises IOError: if the index or the FASTQ file is missing or malformed
    :raises NotImplementedError: if an extra mapper option is not in the list
       of accepted GEM (version < 3) options

    The process exits after searching for the most probable restriction
    enzyme (``--renz CHECK``), or when the exact same job was already run.
    """
    # resolve the mapper binary, defaulting to the name of the mapper
    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if you '
            'have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    # guess GEM major version from the second character of the output of
    # "--version"; anything that cannot be parsed is treated as GEM v2
    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'],
                           stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            opts.gem_version = int(out[1])
        except (ValueError, IndexError):
            # IndexError: output shorter than two characters
            opts.gem_version = 2
            print('Falling to gem v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception(
                'ERROR: Fast fragment mapping needs both fastq files. '
                'Please specify --fastq2')
        if opts.read != 0:
            raise Exception(
                'ERROR: Fast fragment mapping needs to be specified with --read 0'
            )
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')

    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for most probable restriction enzyme in file: %s' %
              (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found. ' % (renz) +
                  'Use one of:\n\n' + ' '.join(sorted(RESTRICTION_ENZYMES)) +
                  '\n\n')
            raise KeyError()
        except AttributeError:
            pass

    # check skip: nothing can be reused if the working directory is missing
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: cannot use output files, workdir not found, '
              'not skipping...')
        opts.skip = False

    # number of cpus (0 means "all", never more than available)
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    if not is_fastq(opts.fastq):
        raise IOError(
            ('ERROR: FASTQ file %s wrong format, check') % (opts.fastq))

    # windows are given as "beg:end" strings; left untouched when not iterable
    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)

    # write log
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(
        opts.fastq, opts.read)

    # reset logging
    logging.getLogger().handlers = []

    log_path = path.join(opts.workdir, 'process.log')
    try:
        print('Writing log to ' + log_path)
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=log_path,
                            filemode='a+')
    except IOError:
        # fall back to a second log file if the first one cannot be opened
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log, only when its content changed. The whole file is
    # read as a single string, since it is compared with (and written from)
    # the string returned by get_dependencies_version.
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    previous = None
    if path.exists(vlog_path):
        with open(vlog_path) as vlog:
            previous = vlog.read()
    if previous != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check mapper extra options
    if opts.mapper_param:
        if len(opts.mapper_param) == 1 and '-' in opts.mapper_param[0]:
            # Single string surrounded by quotes
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            # "key:value" items, split on the first colon only so that values
            # may themselves contain colons
            opts.mapper_param = dict(
                [o.split(':', 1) for o in opts.mapper_param])
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = set([
            "granularity", "q", "quality-format", "gem-quality-threshold",
            "mismatch-alphabet", "m", "e", "min-matched-bases",
            "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
            "unique-mapping", "d", "D", "allow-incomplete-strata",
            "max-decoded-matches", "min-decoded-strata", "p",
            "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
            "max-insert-size", "E", "max-extendable-matches",
            "max-extensions-per-match", "unique-pairing"
        ])
        for k in opts.mapper_param:
            if k not in gem_valid_option:
                raise NotImplementedError(
                    ('ERROR: option "%s" not a valid GEM option '
                     'or not supported by this tool.') % k)

    # create empty DB if it does not exist
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    use_tmpdb = 'tmpdb' in opts and opts.tmpdb
    if use_tmpdb:
        dbdir = opts.tmpdb
        # tmp file with a random 10-letter suffix
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(dbpath, opts.tmpdb)
        except IOError:
            # best effort: the copy is optional
            pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if use_tmpdb:
            try:
                remove(opts.tmpdb)
            except OSError:
                # the temporary copy may never have been created
                pass
        exit('WARNING: exact same job already computed, see JOBs table above')
Beispiel #2
0
def full_mapping(gem_index_path,
                 fastq_path,
                 out_map_dir,
                 r_enz=None,
                 frag_map=True,
                 min_seq_len=15,
                 windows=None,
                 add_site=True,
                 clean=False,
                 get_nread=False,
                 **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param False clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list of (path, number of reads
       processed) pairs instead of a list of paths
    :param False skip: (keyword) do not run the mapper nor the filtering step
    :param '' suffix: (keyword) string appended to the name of output files
    :param 8 nthreads: (keyword) number of threads handed to the FASTQ
       transformation; also forwarded to the mapper with the other keywords
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there (defaults to the system temporary directory).

    :raises Exception: if frag_map is requested without a restriction enzyme

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(input_reads,
                                            mkstemp(prefix=base_name + '_',
                                                    dir=temp_dir)[1],
                                            fastq=is_fastq(input_reads),
                                            min_seq_len=min_seq_len,
                                            trim=win,
                                            skip=skip,
                                            nthreads=nthreads,
                                            light_storage=light_storage)
        # clean (never the file given by the user)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_path = os.path.join(out_map_dir,
                                 base_name + '_full_%s.map' % window_tag)
        # "end" is always set at this point, the window itself tells whether
        # reads are trimmed or mapped in full
        if win:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            _gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
            # parse map file to extract not uniquely mapped reads
            print('Parsing result...')
            _gem_filter(out_map_path, filt_path, full_path)
            # clean
            if clean:
                print('   x removing GEM input %s' % curr_map)
                os.system('rm -f %s' % (curr_map))
                print('   x removing map %s' % out_map_path)
                os.system('rm -f %s' % (out_map_path))
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_path, counter))

    # map again splitting unmapped reads into RE fragments
    # (the window of the last iteration is still passed for trimming)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_input, counter = transform_fastq(input_reads,
                                              mkstemp(prefix=base_name + '_',
                                                      dir=temp_dir)[1],
                                              min_seq_len=min_seq_len,
                                              trim=win,
                                              fastq=False,
                                              r_enz=r_enz,
                                              add_site=add_site,
                                              skip=skip,
                                              nthreads=nthreads,
                                              light_storage=light_storage)
        # clean; when mapping was skipped input_reads is still the file given
        # by the user, which must never be deleted
        if clean and input_reads != fastq_path:
            print('   x removing pre-GEM input %s' % input_reads)
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_input + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_path = os.path.join(out_map_dir,
                                 base_name + '_frag_%s.map' % window_tag)
        if not skip:
            print('Mapping fragments of remaining reads...')
            _gem_mapping(gem_index_path, frag_input, out_map_path, **kwargs)
            print('Parsing result...')
            _gem_filter(out_map_path, fail_path, frag_path)
        # clean
        if clean:
            print('   x removing GEM input %s' % frag_input)
            os.system('rm -f %s' % (frag_input))
            print('   x removing failed to map ' + fail_path)
            os.system('rm -f %s' % (fail_path))
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        outfiles.append((frag_path, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Beispiel #3
0
def fast_fragment_mapping(mapper_index_path,
                          fastq_path1,
                          fastq_path2,
                          r_enz,
                          genome_seq,
                          out_map,
                          clean=True,
                          get_nread=False,
                          mapper_binary=None,
                          mapper_params=None,
                          samtools='samtools',
                          **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome with the knowledge of
    the restriction enzyme used (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool, bowtie2-build or hisat2-build
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or not.
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or not.
    :param r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII.
    :param genome_seq: a dictionary generated by :func:`pytadbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param out_map: path to outfile tab separated format containing mapped
       read information. Its directory must already exist.
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list with a single (path, number of
       reads processed) pair instead of the path
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper, merged into
       the keyword arguments forwarded to the mapper and to the parser
    :param samtools samtools: path to samtools binary.
    :param 8 nthreads: (keyword) number of threads to use for mapping,
       sorting and parsing
    :param '' suffix: (keyword) string appended to the name of temporary files
    :param 100000 frag_chunk: (keyword) chunk size passed to map_re_sites
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there (defaults to the system temporary directory).

    :raises Exception: if the output directory does not exist, if samtools or
       a GEM v3 binary is not found, or if parsing produced no result

    :returns: outfile with the intersected read pairs
    """

    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    # samtools is needed unconditionally (name-sorting of the mapper output)
    samtools = which(samtools)
    if not samtools:
        raise Exception('\n\nERROR: samtools binary not found, it is needed '
                        'to sort the output of the mapper.\n')
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    gem_version = None
    # check that we have the GEM binary:
    gem_binary = which(mapper_binary or 'gem-mapper')
    gem3_not_found = ('\n\nERROR: GEM v3 binary not found, install it from:'
                      '\nhttps://github.com/smarco/gem3-mapper'
                      '\nCopy the binary gem-mapper to /usr/local/bin/ for '
                      'example (somewhere in your PATH).\n')
    if not gem_binary:
        raise Exception(gem3_not_found)
    # major version is read from the second character of "--version" output;
    # anything that cannot be parsed is treated as GEM v2
    try:
        out, _ = Popen([gem_binary, '--version'],
                       stdout=PIPE,
                       stderr=STDOUT,
                       universal_newlines=True).communicate()
        gem_version = int(out[1])
    except (ValueError, IndexError):
        gem_version = 2
        print('Falling to gem v2')
    if gem_version < 3:
        raise Exception(gem3_not_found)
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    mkdir(temp_dir)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # prepare both input files
    base_name1 = os.path.split(fastq_path1)[-1].replace('.gz', '')
    base_name1 = '.'.join(base_name1.split('.')[:-1])

    curr_map1, _ = transform_fastq(fastq_path1,
                                   mkstemp(prefix=base_name1 + '_',
                                           dir=temp_dir)[1],
                                   fastq=is_fastq(fastq_path1),
                                   nthreads=nthreads,
                                   light_storage=True)

    base_name2 = os.path.split(fastq_path2)[-1].replace('.gz', '')
    base_name2 = '.'.join(base_name2.split('.')[:-1])

    # the format of the second file is detected on the second file itself
    curr_map2, count_fastq = transform_fastq(fastq_path2,
                                             mkstemp(prefix=base_name2 + '_',
                                                     dir=temp_dir)[1],
                                             fastq=is_fastq(fastq_path2),
                                             nthreads=nthreads,
                                             light_storage=True)

    out_map_path = curr_map1 + '_frag%s.map' % (suffix)

    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path,
                 curr_map1,
                 out_map_path,
                 fastq_path2=curr_map2,
                 r_enz=r_enz,
                 gem_binary=gem_binary,
                 gem_version=gem_version,
                 **kwargs)
    # clean
    if clean:
        print('   x removing GEM 3 input %s' % (curr_map1))
        os.system('rm -f %s' % (curr_map1))
        print('   x removing GEM 3 input %s' % (curr_map2))
        os.system('rm -f %s' % (curr_map2))

    # sort sam file by read name, in place
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' %
              (nthreads, out_map_path, out_map_path, out_map_path))
    genome_lengths = dict((crm, len(genome_seq[crm])) for crm in genome_seq)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)
    if nthreads > 1:
        print('Splitting sam file')
        # headers: each chunk file starts with a copy of the SAM header
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' %
                      (out_map_path, out_map_path, (i + 1)))
        chunk_lines = int(
            (count_fastq * 2.3) /
            nthreads)  # estimate lines in sam with reads and frags
        # a new chunk is only started on a change of read name, so that all
        # the lines of a read end up in the same file
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" '
              BEGIN { part=0; line=n }
              { if( line>=n && $1!=last_read ) {part++; line=1; print $0 >> "%s_"part }
                else { print $0 >> "%s_"part; line++; }
                last_read = $1;
              }'
        ''' % (out_map_path, chunk_lines, out_map_path, out_map_path))
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        print('Parsing results...')
        # each worker parses its own chunk with a single thread
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            procs.append(
                pool.apply_async(parse_gem_3c,
                                 args=('%s_%d' % (out_map_path, (i + 1)),
                                       '%s_parsed_%d' % (out_map_path,
                                                         (i + 1)),
                                       copy.deepcopy(genome_lengths),
                                       copy.deepcopy(frags), False, True),
                                 kwds=kwargs))
        pool.close()
        pool.join()
        results = [res for res in (proc.get() for proc in procs) if res]
        if clean:
            for i in range(nthreads):
                print('   x removing tmp mapped %s_%d' % (out_map_path,
                                                          (i + 1)))
                os.system('rm -f %s_%d' % (out_map_path, (i + 1)))
        if not results:
            raise Exception('\n\nERROR: parsing of the mapped reads did not '
                            'produce any result.\n')

        # Final sort and merge: files are merged by pairs until one is left
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, len(results) // 2)
            pool = mu.Pool(num_procs)
            procs = [
                pool.apply_async(merge_sort,
                                 (results.pop(0), results.pop(0),
                                  out_map_path + '_%d' % nround, i, True))
                for i in range(num_procs)
            ]
            pool.close()
            pool.join()
            merged = [res for res in (proc.get() for proc in procs) if res]
            # whatever was not paired in this round (odd number of files) is
            # kept for the next one instead of being dropped
            results = merged + results

        with open(out_map, 'w') as map_out:
            for crm in genome_seq:
                map_out.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            with open(results[0], 'r') as tmp_reads_fh:
                for read_line in tmp_reads_fh:
                    read = read_line.split('\t')
                    # columns 1 and 8 of the temporary format are dropped
                    map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        if clean:
            print('   x removing tmp mapped %s' % results[0])
            os.system('rm -f %s' % (results[0]))

    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path,
                     out_map,
                     genome_lengths,
                     frags,
                     verbose=False,
                     tmp_format=False,
                     **kwargs)

        # clean
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))

    if get_nread:
        return [(out_map, count_fastq)]
    return out_map
Beispiel #4
0
def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem',
                 r_enz=None, frag_map=True, min_seq_len=15, windows=None,
                 add_site=True, clean=False, get_nread=False,
                 mapper_binary=None, mapper_params=None, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param gem mapper: mapper to use, either 'gem' or 'bowtie2'
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param False clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list of (path, number of reads
       processed) pairs instead of a list of paths
    :param None mapper_binary: path to the binary mapper (defaults to
       'gem-mapper' or 'bowtie2' depending on the mapper)
    :param None mapper_params: extra parameters for the mapper, merged into
       the keyword arguments forwarded to the mapper
    :param False skip: (keyword) do not run the mapper nor the filtering step
    :param '' suffix: (keyword) string appended to the name of output files
    :param 8 nthreads: (keyword) number of threads handed to the FASTQ
       transformation; also forwarded to the mapper with the other keywords
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there (defaults to the system temporary directory).

    :raises Exception: if the mapper is unknown, or if frag_map is requested
       without a restriction enzyme

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=is_fastq(input_reads),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean (never the file given by the user)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_path = os.path.join(out_map_dir,
                                 base_name + '_full_%s.map' % window_tag)
        # "end" is always set at this point, the window itself tells whether
        # reads are trimmed or mapped in full
        if win:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path, curr_map, out_map_path,
                             gem_binary=(mapper_binary or 'gem-mapper'),
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _gem_filter(out_map_path, filt_path, full_path)
            elif mapper == 'bowtie2':
                _bowtie2_mapping(mapper_index_path, curr_map, out_map_path,
                                 bowtie2_binary=(mapper_binary or 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _bowtie2_filter(out_map_path, curr_map, filt_path, full_path)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print('   x removing %s input %s' % (mapper.upper(), curr_map))
                os.system('rm -f %s' % (curr_map))
                print('   x removing map %s' % out_map_path)
                os.system('rm -f %s' % (out_map_path))
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_path, counter))

    # map again splitting unmapped reads into RE fragments
    # (the window of the last iteration is still passed for trimming)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_input, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean; when mapping was skipped input_reads is still the file given
        # by the user, which must never be deleted
        if clean and input_reads != fastq_path:
            print('   x removing pre-%s input %s' % (mapper.upper(),
                                                     input_reads))
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_input + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_path = os.path.join(out_map_dir,
                                 base_name + '_frag_%s.map' % window_tag)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                _gem_mapping(mapper_index_path, frag_input, out_map_path,
                             gem_binary=(mapper_binary or 'gem-mapper'),
                             **kwargs)
                print('Parsing result...')
                _gem_filter(out_map_path, fail_path, frag_path)
            elif mapper == 'bowtie2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path, frag_input, out_map_path,
                                 bowtie2_binary=(mapper_binary or 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                print('Parsing result...')
                _bowtie2_filter(out_map_path, frag_input, fail_path,
                                frag_path)
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print('   x removing %s input %s' % (mapper.upper(), frag_input))
            os.system('rm -f %s' % (frag_input))
            print('   x removing failed to map ' + fail_path)
            os.system('rm -f %s' % (fail_path))
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        outfiles.append((frag_path, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Beispiel #5
0
def full_mapping(mapper_index_path,
                 fastq_path,
                 out_map_dir,
                 mapper='gem',
                 r_enz=None,
                 frag_map=True,
                 min_seq_len=15,
                 windows=None,
                 add_site=True,
                 clean=False,
                 get_nread=False,
                 mapper_binary=None,
                 mapper_params=None,
                 **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param gem mapper: mapper to use, one of 'gem', 'bowtie2' or 'hisat2'.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
       None (or an empty sequence) maps the reads without trimming.
    :param False clean: remove intermediate files created in temp_dir. The
       temp_dir itself is also removed at the end, unless it is the system
       temporary directory.
    :param False get_nread: returns a list of (path, number of reads
       processed) tuples instead of a list of paths
    :param None mapper_binary: path to the binary mapper. Defaults to
       'gem-mapper' for GEM, and to the mapper name otherwise.
    :param None mapper_params: extra parameters for the mapper. When it is a
       dictionary it is also merged into the keyword arguments forwarded to the
       mapping functions.
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False skip: passed to the FASTQ transformation; when True the
       mapping and filtering steps are not run.
    :param '' suffix: if not empty, appended (after an underscore) to the names
       of generated files.

    :raises Exception: if frag_map is requested without r_enz, if the GEM
       binary cannot be found, or if the mapper is unknown.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    # Fail before doing any (potentially very long) mapping instead of after
    # the full-length stage has completed.
    if frag_map and not r_enz:
        raise Exception('ERROR: need enzyme name to fragment.')

    # local import: keeps the function self-contained whatever the module
    # already imports
    from shutil import rmtree

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    if suffix != '':
        suffix = '_' + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    system_tmp = os.path.abspath(gettempdir())
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))

    gem_binary = gem_version = None
    if mapper == 'gem':
        # check that we have the GEM binary:
        gem_binary = which(mapper_binary or 'gem-mapper')
        if not gem_binary:
            raise Exception(
                '\n\nERROR: GEM binary not found, install it from:'
                '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
                'have a recent computer, the '
                'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                'Copy the binary gem-mapper to /usr/local/bin/ for '
                'example (somewhere in your PATH).\n\nNOTE: GEM does '
                'not provide any binary for MAC-OS.')
        try:
            out, _ = Popen([gem_binary, '--version'],
                           stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            # the major version is read from the second character of the
            # output; an empty or unexpected output falls back to v2
            gem_version = int(out[1])
        except (ValueError, IndexError):
            gem_version = 2
            print('Falling to gem v2')
    if mapper_params and isinstance(mapper_params, dict):
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None or len(windows) == 0:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False

    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads,
            _new_temp_path(base_name + '_', temp_dir),
            fastq=is_fastq(input_reads),
            min_seq_len=min_seq_len,
            trim=win,
            skip=skip,
            nthreads=nthreads,
            light_storage=light_storage)
        # clean (never the FASTQ file given by the user)
        if clean and input_reads != fastq_path:
            print('   x removing original input %s' % input_reads)
            _remove_quietly(input_reads)
        # First mapping, full length
        beg, end = win if win else (1, 'end')
        tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % tag
        filt_path = curr_map + '_filt_%s.map' % tag
        full_path = os.path.join(out_map_dir,
                                 base_name + '_full_%s.map' % tag)
        if win:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads...', curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path,
                             curr_map,
                             out_map_path,
                             gem_binary=gem_binary,
                             gem_version=gem_version,
                             gem_params=mapper_params,
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                if gem_version >= 3:
                    _sam_filter(out_map_path, curr_map, filt_path, full_path)
                else:
                    _gem_filter(out_map_path, filt_path, full_path)
            elif mapper in ('bowtie2', 'hisat2'):
                _bowtie2_mapping(mapper_index_path,
                                 curr_map,
                                 out_map_path,
                                 bowtie2_binary=mapper_binary or mapper,
                                 bowtie2_params=mapper_params,
                                 **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _sam_filter(out_map_path, curr_map, filt_path, full_path)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print('   x removing %s input %s' % (mapper.upper(), curr_map))
                _remove_quietly(curr_map)
                print('   x removing map %s' % out_map_path)
                _remove_quietly(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_path, counter))

    # map again splitting unmapped reads into RE fragments
    if frag_map:
        # the window of the last iteration is still passed as `trim`
        frag_path, counter = transform_fastq(
            input_reads,
            _new_temp_path(base_name + '_', temp_dir),
            min_seq_len=min_seq_len,
            trim=win,
            fastq=False,
            r_enz=r_enz,
            add_site=add_site,
            skip=skip,
            nthreads=nthreads,
            light_storage=light_storage)
        # clean: when the mapping was skipped `input_reads` is still the
        # FASTQ file given by the user, which must never be deleted
        if clean and input_reads != fastq_path:
            print('   x removing pre-%s input %s' %
                  (mapper.upper(), input_reads))
            _remove_quietly(input_reads)
        beg, end = win if win else (1, 'end')
        tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_path + '_frag_%s.map' % tag
        fail_path = curr_map + '_fail%s.map' % suffix
        frag_out_path = os.path.join(out_map_dir,
                                     base_name + '_frag_%s.map' % tag)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                # NOTE(review): unlike the full-length stage, `gem_params` is
                # not passed here -- kept as in the original, confirm intent
                _gem_mapping(mapper_index_path,
                             frag_path,
                             out_map_path,
                             gem_binary=gem_binary,
                             gem_version=gem_version,
                             **kwargs)
                print('Parsing result...')
                # check if output is sam format for gem3
                if gem_version >= 3:
                    _sam_filter(out_map_path, frag_path, fail_path,
                                frag_out_path)
                else:
                    _gem_filter(out_map_path, fail_path, frag_out_path)
            elif mapper in ('bowtie2', 'hisat2'):
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path,
                                 frag_path,
                                 out_map_path,
                                 bowtie2_binary=mapper_binary or mapper,
                                 bowtie2_params=mapper_params,
                                 **kwargs)
                print('Parsing result...')
                _sam_filter(out_map_path, frag_path, fail_path, frag_out_path)
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print('   x removing %s input %s' % (mapper.upper(), frag_path))
            _remove_quietly(frag_path)
            print('   x removing failed to map ' + fail_path)
            _remove_quietly(fail_path)
            print('   x removing tmp mapped %s' % out_map_path)
            _remove_quietly(out_map_path)
        outfiles.append((frag_out_path, counter))

    # The system temporary directory is shared with other programs: it is
    # never wiped, only a dedicated temp_dir is removed.
    if clean and temp_dir != system_tmp:
        rmtree(temp_dir, ignore_errors=True)
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]


def _new_temp_path(prefix, directory):
    """Create an empty temporary file and return its path, closing the
    descriptor opened by mkstemp so it is not leaked."""
    handle, tmp_path = mkstemp(prefix=prefix, dir=directory)
    os.close(handle)
    return tmp_path


def _remove_quietly(file_path):
    """Delete a file without going through a shell; like ``rm -f``, a file
    that is missing or cannot be removed is silently ignored."""
    try:
        os.remove(file_path)
    except OSError:
        # deliberate best-effort cleanup, same contract as `rm -f`
        pass