Esempio n. 1
0
    :param False clean: remove intermedite files created in temp_dir
    """
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        try:
            os.mkdir(rep)
        except OSError, error:
            if error.strerror != 'File exists':
                warn('ERROR: problem loading file, probable problem with the ' +
                     'use of relative path')
                raise error
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    for beg, end in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map = transform_fastq(input_reads, 
                                   mkstemp(prefix=base_name + '_',
                                           dir=temp_dir)[1],
                                   fastq=(input_reads.endswith('.fastq')
                                          or input_reads.endswith('.fastq.gz')),
                                   min_seq_len=min_seq_len, trim=(beg, end))
        # clean
Esempio n. 2
0
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None, frag_map=True,
                 min_seq_len=15, windows=None, add_site=True, clean=False,
                 get_nread=False, **kwargs):
    """
    Maps FASTQ reads to a GEM-indexed reference genome. Reads are first mapped
    full length (or trimmed, once per window); reads remaining after the last
    window can then be split into restriction-enzyme fragments and mapped
    again.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
       A single flat pair, e.g. windows=(1,101), is accepted as well. None (or
       an empty sequence) means a single round without trimming.
    :param False clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list of (path, count) tuples, where count
       is the second value returned by transform_fastq for that step, instead
       of a list of paths
    :param 8 nthreads: number of threads handed to transform_fastq. The mapper
       only receives it (through kwargs) when it is explicitly given.
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param gettempdir() temp_dir: important to change. Intermediate FASTQ files
       will be written there.
    :param False skip: if True neither the mapper nor the filtering of its
       output are run; transform_fastq is still called (with skip=True) and
       the expected output paths are still returned.
    :param '' suffix: if not empty, it is appended (preceded by '_') to the
       window tag of each generated file name.

    :raises Exception: if frag_map is True and no r_enz is given. This is
       checked before any mapping is started.

    :returns: a list of paths to generated outfiles (or of (path, count)
       tuples if get_nread is True). To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    # fail before spending time on the iterative mapping
    if frag_map and not r_enz:
        raise Exception('ERROR: need enzyme name to fragment.')

    def _new_tmp_path():
        # mkstemp returns an OPEN file descriptor together with the path:
        # only the path is needed here, so close the descriptor right away.
        fdesc, path = mkstemp(prefix=base_name + '_', dir=temp_dir)
        os.close(fdesc)
        return path

    def _remove(path):
        # same best-effort semantics as ``rm -f`` but without going through a
        # shell (paths containing spaces or special characters are safe).
        try:
            os.remove(path)
        except OSError as error:
            if os.path.exists(path):
                warn('WARNING: could not remove %s: %s\n' % (path, error))

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    if suffix != '':
        suffix = '_' + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None or len(windows) == 0:
        # one single round, no trimming
        windows = (None, )
    elif isinstance(windows[0], int):
        # a single flat (beg, end) pair was given
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(window) for window in windows]
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, _new_tmp_path(),
            fastq=input_reads.endswith(
                ('.fastq', '.fastq.gz', '.fq.gz', '.dsrc')),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads)
        # clean (never the file given by the user)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            _remove(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # "end" is always set when no window is given, test the window itself
        if win and end:
            print('Mapping reads in window %s...' % window_tag)
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
            # parse map file to extract not uniquely mapped reads
            print('Parsing result...')
            _gem_filter(out_map_path, filt_path, full_out)
            # clean
            if clean:
                print('   x removing GEM input %s' % curr_map)
                _remove(curr_map)
                print('   x removing map %s' % out_map_path)
                _remove(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    if frag_map:
        # "win" and "window_tag" are the ones of the last window processed
        frag_reads, counter = transform_fastq(
            input_reads, _new_tmp_path(),
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads)
        out_map_path = frag_reads + '_frag_%s.map' % window_tag
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            print('Mapping fragments of remaining reads...')
            gem_mapping(gem_index_path, frag_reads, out_map_path, **kwargs)
            print('Parsing result...')
            _gem_filter(out_map_path, curr_map + '_fail%s.map' % (suffix),
                        frag_out)
        outfiles.append((frag_out, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Esempio n. 3
0
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None, frag_map=True,
                 min_seq_len=15, windows=None, add_site=True, clean=False,
                 get_nread=False, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
       A single flat pair, e.g. windows=(1,101), is accepted as well. None (or
       an empty sequence) means a single round without trimming.
    :param False clean: remove intermediate files created in temp_dir. The
       file given in fastq_path is never removed.
    :param False get_nread: returns a list of (path, count) tuples, where count
       is the second value returned by transform_fastq for that step, instead
       of a list of paths
    :param 8 nthreads: number of threads handed to transform_fastq. The mapper
       only receives it (through kwargs) when it is explicitly given.
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param gettempdir() temp_dir: important to change. Intermediate FASTQ files
       will be written there.
    :param False skip: if True neither the mapper nor the filtering of its
       output are run; transform_fastq is still called (with skip=True) and
       the expected output paths are still returned.
    :param '' suffix: if not empty, it is appended (preceded by '_') to the
       window tag of each generated file name.

    :raises Exception: if frag_map is True and no r_enz is given. This is
       checked before any mapping is started.

    :returns: a list of paths to generated outfiles (or of (path, count)
       tuples if get_nread is True). To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    # fail before spending time on the iterative mapping
    if frag_map and not r_enz:
        raise Exception('ERROR: need enzyme name to fragment.')

    def _new_tmp_path():
        # mkstemp returns an OPEN file descriptor together with the path:
        # only the path is needed here, so close the descriptor right away.
        fdesc, path = mkstemp(prefix=base_name + '_', dir=temp_dir)
        os.close(fdesc)
        return path

    def _remove(path):
        # same best-effort semantics as ``rm -f`` but without going through a
        # shell (paths containing spaces or special characters are safe).
        try:
            os.remove(path)
        except OSError as error:
            if os.path.exists(path):
                warn('WARNING: could not remove %s: %s\n' % (path, error))

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    if suffix != '':
        suffix = '_' + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None or len(windows) == 0:
        # one single round, no trimming
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(window) for window in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, _new_tmp_path(),
            fastq=input_reads.endswith(
                ('.fastq', '.fastq.gz', '.fq.gz', '.dsrc')),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean (never the file given by the user)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            _remove(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # "end" is always set when no window is given, test the window itself
        if win and end:
            print('Mapping reads in window %s...' % window_tag)
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            _gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
            # parse map file to extract not uniquely mapped reads
            print('Parsing result...')
            _gem_filter(out_map_path, filt_path, full_out)
            # clean
            if clean:
                print('   x removing GEM input %s' % curr_map)
                _remove(curr_map)
                print('   x removing map %s' % out_map_path)
                _remove(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    if frag_map:
        # "win" and "window_tag" are the ones of the last window processed
        frag_reads, counter = transform_fastq(
            input_reads, _new_tmp_path(),
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean: when nothing was mapped above (skip), input_reads is still
        # the FASTQ file given by the user, which must never be deleted.
        if clean and input_reads != fastq_path:
            print('   x removing pre-GEM input %s' % input_reads)
            _remove(input_reads)
        out_map_path = frag_reads + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            print('Mapping fragments of remaining reads...')
            _gem_mapping(gem_index_path, frag_reads, out_map_path, **kwargs)
            print('Parsing result...')
            _gem_filter(out_map_path, fail_path, frag_out)
        # clean
        if clean:
            print('   x removing GEM input %s' % frag_reads)
            _remove(frag_reads)
            print('   x removing failed to map ' + fail_path)
            _remove(fail_path)
            print('   x removing tmp mapped %s' % out_map_path)
            _remove(out_map_path)
        outfiles.append((frag_out, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Esempio n. 4
0
def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None, frag_map=True,
                 min_seq_len=15, windows=None, add_site=True, clean=False,
                 **kwargs):
    """
    Maps FASTQ reads to a GEM-indexed reference genome. Reads are first mapped
    full length (or trimmed, once per window); reads remaining after the last
    window can then be split into restriction-enzyme fragments and mapped
    again.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
       A single flat pair, e.g. windows=(1,101), is accepted as well. None (or
       an empty sequence) means a single round without trimming.
    :param False clean: remove intermediate files created in temp_dir
    :param 4 nthreads: number of threads to use for mapping (number of CPUs);
       passed untouched to gem_mapping through kwargs
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param gettempdir() temp_dir: important to change. Intermediate FASTQ files
       will be written there.

    :raises Exception: if frag_map is True and no r_enz is given. This is
       checked before any mapping is started.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    # fail before spending time on the iterative mapping
    if frag_map and not r_enz:
        raise Exception('ERROR: need enzyme name to fragment.')

    def _new_tmp_path():
        # mkstemp returns an OPEN file descriptor together with the path:
        # only the path is needed here, so close the descriptor right away.
        fdesc, path = mkstemp(prefix=base_name + '_', dir=temp_dir)
        os.close(fdesc)
        return path

    def _remove(path):
        # same best-effort semantics as ``rm -f`` but without going through a
        # shell (paths containing spaces or special characters are safe).
        try:
            os.remove(path)
        except OSError as error:
            if os.path.exists(path):
                warn('WARNING: could not remove %s: %s\n' % (path, error))

    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    input_reads = fastq_path
    if windows is None or len(windows) == 0:
        # one single round, no trimming
        windows = (None, )
    elif isinstance(windows[0], int):
        # a single flat (beg, end) pair was given
        windows = [tuple(windows)]
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map = transform_fastq(
            input_reads, _new_tmp_path(),
            fastq=input_reads.endswith(('.fastq', '.fastq.gz')),
            min_seq_len=min_seq_len, trim=win)
        # clean (never the file given by the user)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            _remove(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s' % (beg, end)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # "end" is always set when no window is given, test the window itself
        if win and end:
            print('Mapping reads in window %s...' % window_tag)
        else:
            print('Mapping full reads... %s' % curr_map)
        map_file = gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
        map_file.close()

        # parse map file to extract not uniquely mapped reads
        print('Parsing result...')
        _gem_filter(out_map_path, filt_path, full_out)
        # clean
        if clean:
            print('   x removing GEM input %s' % curr_map)
            _remove(curr_map)
            print('   x removing map %s' % out_map_path)
            _remove(out_map_path)
        # for next round, we will use remaining unmapped reads
        input_reads = filt_path
        outfiles.append(full_out)

    # map again splitting unmapped reads into RE fragments
    if frag_map:
        # "win" is the last window processed in the loop above
        frag_reads = transform_fastq(
            input_reads, _new_tmp_path(),
            min_seq_len=min_seq_len, trim=win,
            fastq=False, r_enz=r_enz, add_site=add_site)
        out_map_path = frag_reads + '_frag.map'
        frag_out = os.path.join(out_map_dir, base_name + '_frag.map')
        print('Mapping fragments of remaining reads...')
        map_file = gem_mapping(gem_index_path, frag_reads, out_map_path,
                               **kwargs)
        map_file.close()
        print('Parsing result...')
        _gem_filter(out_map_path, curr_map + '_fail.map', frag_out)
        outfiles.append(frag_out)
    return outfiles
Esempio n. 5
0
    """
    Do the mapping
    """
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        try:
            os.mkdir(rep)
        except OSError, error:
            if error.strerror != 'File exists':
                warn('ERROR: problem loading file, probable problem with the ' +
                     'use of relative path')
                raise error
    # check space
    if get_free_space_mb(temp_dir, div=3) < 50:
        warn('WARNING: less than 50 Gb left on tmp_dir: %s\n' % temp_dir)
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = base_name.replace('.fastq', '')
    # Prepare the FASTQ file and iterate over them
    full_map = transform_fastq(fastq_path,
                               mkstemp(prefix='fastq_', dir=temp_dir)[1],
                               min_seq_len=min_seq_len, trim=trim)
    # First mapping, full length
    out_map_path = full_map + '_full.map'
    print 'Mapping full reads...'
    map_file = gem_mapping(gem_index_path, full_map, out_map_path, **kwargs)
    map_file.close()

    # parse map file to extract not uniquely mapped reads
    print 'Parsing result...'
Esempio n. 6
0
def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem',
                 r_enz=None, frag_map=True, min_seq_len=15, windows=None,
                 add_site=True, clean=False, get_nread=False,
                 mapper_binary=None, mapper_params=None, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param gem mapper: mapper to use, either 'gem' or 'bowtie2'
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
       A single flat pair, e.g. windows=(1,101), is accepted as well. None (or
       an empty sequence) means a single round without trimming.
    :param False clean: remove intermediate files created in temp_dir. The
       file given in fastq_path is never removed.
    :param False get_nread: returns a list of (path, count) tuples, where count
       is the second value returned by transform_fastq for that step, instead
       of a list of paths
    :param None mapper_binary: path to the binary mapper. If not given,
       'gem-mapper' or 'bowtie2' is used depending on the mapper chosen.
    :param None mapper_params: dictionary of extra parameters for the mapper;
       it is merged into kwargs before calling the mapper.
    :param 8 nthreads: number of threads handed to transform_fastq. The mapper
       only receives it (through kwargs) when it is explicitly given.
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param gettempdir() temp_dir: important to change. Intermediate FASTQ files
       will be written there.
    :param False skip: if True neither the mapper nor the filtering of its
       output are run; transform_fastq is still called (with skip=True) and
       the expected output paths are still returned.
    :param '' suffix: if not empty, it is appended (preceded by '_') to the
       window tag of each generated file name.

    :raises Exception: if frag_map is True and no r_enz is given (checked
       before any mapping is started), or, unless skip is True, if mapper is
       neither 'gem' nor 'bowtie2'.

    :returns: a list of paths to generated outfiles (or of (path, count)
       tuples if get_nread is True). To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    # fail before spending time on the iterative mapping
    if frag_map and not r_enz:
        raise Exception('ERROR: need enzyme name to fragment.')

    def _new_tmp_path():
        # mkstemp returns an OPEN file descriptor together with the path:
        # only the path is needed here, so close the descriptor right away.
        fdesc, path = mkstemp(prefix=base_name + '_', dir=temp_dir)
        os.close(fdesc)
        return path

    def _remove(path):
        # same best-effort semantics as ``rm -f`` but without going through a
        # shell (paths containing spaces or special characters are safe).
        try:
            os.remove(path)
        except OSError as error:
            if os.path.exists(path):
                warn('WARNING: could not remove %s: %s\n' % (path, error))

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    if suffix != '':
        suffix = '_' + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None or len(windows) == 0:
        # one single round, no trimming
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(window) for window in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, _new_tmp_path(),
            fastq=is_fastq(input_reads),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean (never the file given by the user)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            _remove(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # "end" is always set when no window is given, test the window itself
        if win and end:
            print('Mapping reads in window %s...' % window_tag)
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path, curr_map, out_map_path,
                             gem_binary=(mapper_binary or 'gem-mapper'),
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _gem_filter(out_map_path, filt_path, full_out)
            elif mapper == 'bowtie2':
                _bowtie2_mapping(mapper_index_path, curr_map, out_map_path,
                                 bowtie2_binary=(mapper_binary or 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _bowtie2_filter(out_map_path, curr_map, filt_path, full_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print('   x removing %s input %s' % (mapper.upper(), curr_map))
                _remove(curr_map)
                print('   x removing map %s' % out_map_path)
                _remove(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    if frag_map:
        # "win" and "window_tag" are the ones of the last window processed
        frag_reads, counter = transform_fastq(
            input_reads, _new_tmp_path(),
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean: when nothing was mapped above (skip), input_reads is still
        # the FASTQ file given by the user, which must never be deleted.
        if clean and input_reads != fastq_path:
            print('   x removing pre-%s input %s' % (mapper.upper(),
                                                     input_reads))
            _remove(input_reads)
        out_map_path = frag_reads + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                _gem_mapping(mapper_index_path, frag_reads, out_map_path,
                             gem_binary=(mapper_binary or 'gem-mapper'),
                             **kwargs)
                print('Parsing result...')
                _gem_filter(out_map_path, fail_path, frag_out)
            elif mapper == 'bowtie2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path, frag_reads, out_map_path,
                                 bowtie2_binary=(mapper_binary or 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                print('Parsing result...')
                _bowtie2_filter(out_map_path, frag_reads, fail_path, frag_out)
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print('   x removing %s input %s' % (mapper.upper(), frag_reads))
            _remove(frag_reads)
            print('   x removing failed to map ' + fail_path)
            _remove(fail_path)
            print('   x removing tmp mapped %s' % out_map_path)
            _remove(out_map_path)
        outfiles.append((frag_out, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Esempio n. 7
0
def fast_fragment_mapping(mapper_index_path,
                          fastq_path1,
                          fastq_path2,
                          r_enz,
                          genome_seq,
                          out_map,
                          clean=True,
                          get_nread=False,
                          mapper_binary=None,
                          mapper_params=None,
                          samtools='samtools',
                          **kwargs):
    """
    Maps paired FASTQ reads to an indexed reference genome with the knowledge
    of the restriction enzyme used (fragment-based mapping).

    Both read files are converted with ``transform_fastq``, mapped together in
    a single GEM v3 run, name-sorted with samtools and then parsed with
    ``parse_gem_3c``. When more than one thread is requested the sorted SAM
    file is split into chunks that are parsed in parallel and merged back
    pairwise with ``merge_sort``.

    :param mapper_index_path: path to the index file of the reference genome
       passed to the GEM v3 mapper
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or not.
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or not.
    :param r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII.
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parsers.genome_parser.parse_fasta`, containing the
       genomic sequence
    :param out_map: path to outfile, tab separated format, containing mapped
       read information. Its parent directory must already exist.
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list with a single tuple containing the
       output path and the number of reads processed
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: dictionary of extra parameters for the mapper
       (merged into the keyword arguments forwarded to the mapper)
    :param samtools samtools: path to samtools binary.
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param '' suffix: optional tag appended to intermediate file names
    :param 100000 frag_chunk: chunk size passed to ``map_re_sites``

    :raises Exception: if the output directory does not exist, or if the
       samtools or GEM v3 binaries cannot be found

    :returns: path to the outfile with the intersected read pairs (or
       ``[(out_map, number_of_reads)]`` if get_nread is True)
    """

    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    samtools = which(samtools)
    # samtools is needed unconditionally for the name-sort below: fail before
    # the (long) mapping step rather than with a TypeError after it
    if not samtools:
        raise Exception('\n\nERROR: samtools binary not found, it is needed '
                        'to sort the mapper output.\n')
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    gem_missing_msg = ('\n\nERROR: GEM v3 binary not found, install it from:'
                       '\nhttps://github.com/smarco/gem3-mapper\n'
                       'Copy the binary gem-mapper to /usr/local/bin/ for '
                       'example (somewhere in your PATH).\n')
    # check that we have the GEM binary:
    gem_binary = which(mapper_binary or 'gem-mapper')
    if not gem_binary:
        raise Exception(gem_missing_msg)
    try:
        out, _ = Popen([gem_binary, '--version'],
                       stdout=PIPE,
                       stderr=STDOUT,
                       universal_newlines=True).communicate()
        # the major version is read from the second character of the output
        gem_version = int(out[1])
    except (ValueError, IndexError):
        # unparsable or too short version string: assume an old binary
        gem_version = 2
        print('Falling to gem v2')
    if gem_version < 3:
        raise Exception(gem_missing_msg)
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # prepare the input of the mapper for each read-end
    base_name1 = os.path.split(fastq_path1)[-1].replace('.gz', '')
    base_name1 = '.'.join(base_name1.split('.')[:-1])

    curr_map1, _ = transform_fastq(fastq_path1,
                                   mkstemp(prefix=base_name1 + '_',
                                           dir=temp_dir)[1],
                                   fastq=is_fastq(fastq_path1),
                                   nthreads=nthreads,
                                   light_storage=True)

    base_name2 = os.path.split(fastq_path2)[-1].replace('.gz', '')
    base_name2 = '.'.join(base_name2.split('.')[:-1])

    # the format of the second file is checked on the second file itself
    curr_map2, count_fastq = transform_fastq(fastq_path2,
                                             mkstemp(prefix=base_name2 + '_',
                                                     dir=temp_dir)[1],
                                             fastq=is_fastq(fastq_path2),
                                             nthreads=nthreads,
                                             light_storage=True)

    out_map_path = curr_map1 + '_frag%s.map' % (suffix)

    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path,
                 curr_map1,
                 out_map_path,
                 fastq_path2=curr_map2,
                 r_enz=r_enz,
                 gem_binary=gem_binary,
                 gem_version=gem_version,
                 **kwargs)
    # clean
    if clean:
        print('   x removing GEM 3 input %s' % (curr_map1))
        os.system('rm -f %s' % (curr_map1))
        print('   x removing GEM 3 input %s' % (curr_map2))
        os.system('rm -f %s' % (curr_map2))

    # sort sam file by read name (in place)
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' %
              (nthreads, out_map_path, out_map_path, out_map_path))
    genome_lengths = {crm: len(genome_seq[crm]) for crm in genome_seq}
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)
    if nthreads > 1:
        print('Splitting sam file')
        # each chunk file starts with a copy of the SAM headers
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' %
                      (out_map_path, out_map_path, (i + 1)))
        chunk_lines = int(
            (count_fastq * 2.3) /
            nthreads)  # estimate lines in sam with reads and frags
        # append alignments to the chunk files, never splitting the lines of
        # a same read between two chunks
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" '
              BEGIN { part=0; line=n }       
              { if( line>=n && $1!=last_read ) {part++; line=1; print $0 >> "%s_"part } 
                else { print $0 >> "%s_"part; line++; } 
                last_read = $1;
              }'
        ''' % (out_map_path, chunk_lines, out_map_path, out_map_path))
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        print('Parsing results...')
        # each parser runs single-threaded inside its own worker process
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            frags_shared = copy.deepcopy(frags)
            procs.append(
                pool.apply_async(parse_gem_3c,
                                 args=('%s_%d' % (out_map_path, (i + 1)),
                                       '%s_parsed_%d' % (out_map_path,
                                                         (i + 1)),
                                       copy.deepcopy(genome_lengths),
                                       frags_shared, False, True),
                                 kwds=kwargs))
        pool.close()
        pool.join()
        results = [res for res in (proc.get() for proc in procs) if res]
        if clean:
            for i in range(nthreads):
                print('   x removing tmp mapped %s_%d' % (out_map_path,
                                                          (i + 1)))
                os.system('rm -f %s_%d' % (out_map_path, (i + 1)))

        # Final sort and merge: files are merged two by two until a single
        # one is left
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, int(len(results) / 2))
            pool = mu.Pool(num_procs)
            procs = [
                pool.apply_async(merge_sort,
                                 (results.pop(0), results.pop(0),
                                  out_map_path + '_%d' % nround, i, True))
                for i in range(num_procs)
            ]
            pool.close()
            pool.join()
            merged = [res for res in (proc.get() for proc in procs) if res]
            # with an odd number of files one is left unpaired in `results`:
            # keep it for the next round instead of dropping its reads
            results = results + merged

        with open(out_map, 'w') as map_out, \
                open(results[0], 'r') as tmp_reads_fh:
            for crm in genome_seq:
                map_out.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            for read_line in tmp_reads_fh:
                read = read_line.split('\t')
                # columns 1 and 8 of the temporary format are not written to
                # the final output
                map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        if clean:
            print('   x removing tmp mapped %s' % results[0])
            os.system('rm -f %s' % (results[0]))

    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path,
                     out_map,
                     genome_lengths,
                     frags,
                     verbose=False,
                     tmp_format=False,
                     **kwargs)

        # clean
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))

    if get_nread:
        return [(out_map, count_fastq)]
    return out_map
Esempio n. 8
0
def full_mapping(mapper_index_path,
                 fastq_path,
                 out_map_dir,
                 mapper='gem',
                 r_enz=None,
                 frag_map=True,
                 min_seq_len=15,
                 windows=None,
                 add_site=True,
                 clean=False,
                 get_nread=False,
                 mapper_binary=None,
                 mapper_params=None,
                 **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format .
    :param gem mapper: mapper to use, one of 'gem', 'bowtie2' or 'hisat2'
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir. The
       original input file is never removed, and temp_dir itself is only
       removed when it is not the system temporary directory.
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False get_nread: returns a list of tuples where each element
       contains a path and the number of reads processed
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper
    :param False skip: do not run the mapper nor the filtering steps, only
       prepare the inputs and report the expected output paths
    :param '' suffix: optional tag appended to generated file names

    :raises Exception: if the GEM binary is not found, if the mapper is
       unknown, or if frag_map is requested without r_enz

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    if mapper == 'gem':
        gem_version = None
        # check that we have the GEM binary:
        gem_binary = which(mapper_binary or 'gem-mapper')
        if not gem_binary:
            raise Exception(
                '\n\nERROR: GEM binary not found, install it from:'
                '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if you '
                'have a recent computer, the '
                'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                'Copy the binary gem-mapper to /usr/local/bin/ for '
                'example (somewhere in your PATH).\n\nNOTE: GEM does '
                'not provide any binary for MAC-OS.')
        try:
            out, _ = Popen([gem_binary, '--version'],
                           stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            # the major version is read from the second character of the
            # output
            gem_version = int(out[1])
        except (ValueError, IndexError):
            # unparsable or too short version string: assume GEM v2
            gem_version = 2
            print('Falling to gem v2')
    if mapper_params and isinstance(mapper_params, dict):
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(input_reads,
                                            mkstemp(prefix=base_name + '_',
                                                    dir=temp_dir)[1],
                                            fastq=is_fastq(input_reads),
                                            min_seq_len=min_seq_len,
                                            trim=win,
                                            skip=skip,
                                            nthreads=nthreads,
                                            light_storage=light_storage)
        # clean (never the original input file)
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            os.system('rm -f %s' % (input_reads))
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        # raw output of the mapper
        out_map_path = curr_map + '_full_%s.map' % window_tag
        # reads not uniquely mapped, input of the next round
        filt_path = curr_map + '_filt_%s.map' % window_tag
        # final result for this window
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        if end:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads...', curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path,
                             curr_map,
                             out_map_path,
                             gem_binary=gem_binary,
                             gem_version=gem_version,
                             gem_params=mapper_params,
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                if gem_version >= 3:
                    _sam_filter(out_map_path, curr_map, filt_path, full_out)
                else:
                    _gem_filter(out_map_path, filt_path, full_out)
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                _bowtie2_mapping(mapper_index_path,
                                 curr_map,
                                 out_map_path,
                                 bowtie2_binary=(mapper_binary
                                                 if mapper_binary else mapper),
                                 bowtie2_params=mapper_params,
                                 **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _sam_filter(out_map_path, curr_map, filt_path, full_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print('   x removing %s input %s' % (mapper.upper(), curr_map))
                os.system('rm -f %s' % (curr_map))
                print('   x removing map %s' % out_map_path)
                os.system('rm -f %s' % (out_map_path))
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    # (uses the trimming window of the last round)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        frag_map, counter = transform_fastq(input_reads,
                                            mkstemp(prefix=base_name + '_',
                                                    dir=temp_dir)[1],
                                            min_seq_len=min_seq_len,
                                            trim=win,
                                            fastq=False,
                                            r_enz=r_enz,
                                            add_site=add_site,
                                            skip=skip,
                                            nthreads=nthreads,
                                            light_storage=light_storage)
        # clean: when the mapping was skipped, input_reads is still the
        # original input file, which must never be removed
        if clean and input_reads != fastq_path:
            print('   x removing pre-%s input %s' %
                  (mapper.upper(), input_reads))
            os.system('rm -f %s' % (input_reads))
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        # raw output of the mapper
        out_map_path = frag_map + '_frag_%s.map' % window_tag
        # reads that failed to map even after fragmentation
        fail_path = curr_map + '_fail%s.map' % (suffix)
        # final result of the fragment-based mapping
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                _gem_mapping(mapper_index_path,
                             frag_map,
                             out_map_path,
                             gem_binary=gem_binary,
                             gem_version=gem_version,
                             **kwargs)
                print('Parsing result...')
                # check if output is sam format for gem3
                if gem_version >= 3:
                    _sam_filter(out_map_path, frag_map, fail_path, frag_out)
                else:
                    _gem_filter(out_map_path, fail_path, frag_out)
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path,
                                 frag_map,
                                 out_map_path,
                                 bowtie2_binary=(mapper_binary
                                                 if mapper_binary else mapper),
                                 bowtie2_params=mapper_params,
                                 **kwargs)
                print('Parsing result...')
                _sam_filter(out_map_path, frag_map, fail_path, frag_out)
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print('   x removing %s input %s' % (mapper.upper(), frag_map))
            os.system('rm -f %s' % (frag_map))
            print('   x removing failed to map ' + fail_path)
            os.system('rm -f %s' % (fail_path))
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        outfiles.append((frag_out, counter))
    # remove the whole temporary directory, unless it is the system one
    # (default value of temp_dir), which holds files of other programs
    if clean and temp_dir != os.path.abspath(gettempdir()):
        os.system('rm -rf %s' % (temp_dir))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]