Esempio n. 1
0
def test_gem2sam_execution_to_file():
    """Check that gem2sam writes its result to the requested SAM file.

    The sample reads are mapped against the test index and converted with
    an explicit ``output`` path; the returned object must expose a process,
    report that path as its filename, and the file must exist on disk.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped_reads = gem.mapper(reads, index)
    sam_path = results_dir + "/test_sam.sam"
    converted = gem.gem2sam(
        mapped_reads,
        index,
        output=sam_path,
        compact=True,
    )

    # the conversion handle itself
    assert converted is not None
    assert converted.process is not None
    # the handle points at the file we asked for, and the file is there
    assert converted.filename == sam_path
    assert os.path.exists(sam_path)
def test_gem2sam_execution_to_file():
    """Check that gem2sam writes its result to the requested SAM file.

    The sample reads are mapped against the test index and converted with
    an explicit ``output`` path; the returned object must expose a process,
    report that path as its filename, and the file must exist on disk.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped_reads = gem.mapper(reads, index)
    sam_path = results_dir + "/test_sam.sam"
    converted = gem.gem2sam(
        mapped_reads,
        index,
        output=sam_path,
        compact=True,
    )

    # the conversion handle itself
    assert converted is not None
    assert converted.process is not None
    # the handle points at the file we asked for, and the file is there
    assert converted.filename == sam_path
    assert os.path.exists(sam_path)
Esempio n. 3
0
def test_gem2sam_execution():
    """Check the streaming (no output file) mode of gem2sam.

    Without an ``output`` argument the returned object must have a process
    but no filename, and iterating over it must yield 10000 records.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    mapped_reads = gem.mapper(reads, index)
    converted = gem.gem2sam(mapped_reads, index, compact=True)

    assert converted is not None
    assert converted.process is not None
    assert converted.filename is None

    # consume the stream and count what comes out
    n_records = sum(1 for _ in converted)
    assert n_records == 10000
Esempio n. 4
0
def test_gem2sam_sam2bam():
    """Check the map -> SAM -> BAM chain.

    The BAM file must exist after ``gem.sam2bam`` returns and reading it
    back through ``gem.files.open`` must yield 10000 entries.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped_reads = gem.mapper(reads, index)
    sam_stream = gem.gem2sam(mapped_reads, index, compact=True)

    bam_path = results_dir + "/test_sam.bam"
    # keep the returned handle referenced for the rest of the test
    bam_handle = gem.sam2bam(sam_stream, output=bam_path)
    assert os.path.exists(bam_path)

    n_entries = sum(1 for _ in gem.files.open(bam_path))
    assert n_entries == 10000, "Count 10000!=%d" % n_entries
def test_gem2sam_sam2bam():
    """Check the map -> SAM -> BAM chain.

    The BAM file must exist after ``gem.sam2bam`` returns and reading it
    back through ``gem.files.open`` must yield 10000 entries.
    """
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped_reads = gem.mapper(reads, index)
    sam_stream = gem.gem2sam(mapped_reads, index, compact=True)

    bam_path = results_dir+"/test_sam.bam"
    # keep the returned handle referenced for the rest of the test
    bam_handle = gem.sam2bam(sam_stream, output=bam_path)
    assert os.path.exists(bam_path)

    n_entries = sum(1 for _ in gem.files.open(bam_path))
    assert n_entries == 10000, "Count 10000!=%d" % n_entries
Esempio n. 6
0
def test_gem2sam_execution():
    """Check the streaming (no output file) mode of gem2sam.

    Without an ``output`` argument the returned object must have a process
    but no filename, and iterating over it must yield 10000 records.
    """
    reads = files.open(testfiles["reads_1.fastq"])
    mapped_reads = gem.mapper(reads, index)
    converted = gem.gem2sam(mapped_reads, index, compact=True)

    assert converted is not None
    assert converted.process is not None
    assert converted.filename is None

    # consume the stream and count what comes out
    n_records = sum(1 for _ in converted)
    assert n_records == 10000
Esempio n. 7
0
 def run(self, args):
     """Convert a GEM mapping to BAM and optionally index the result.

     The mapping is read from ``args.input`` or, when that is ``None``,
     from standard input (in which case ``raw=True`` is passed to
     ``gem.gem2sam``). ``args.no_xs`` disables the splice consensus,
     ``args.no_sort`` disables sorting and ``args.no_index`` skips the
     final ``gem.bamIndex`` call.

     :param args: parsed command line options; the attributes read here
        are ``quality``, ``input``, ``no_xs``, ``index``, ``threads``,
        ``output``, ``no_sort``, ``sort_memory`` and ``no_index``.
     """
     quality = gem._prepare_quality_parameter(args.quality)

     # no explicit input file means the mapping arrives on stdin
     from_stdin = args.input is None
     source = sys.stdin if from_stdin else args.input
     mapping = gem.files.open(source, quality=quality)

     consensus = gem.extended_splice_consensus
     if args.no_xs:
         consensus = None

     conversion_options = dict(
         index=args.index,
         threads=args.threads,
         quality=args.quality,
         consensus=consensus,
         raw=from_stdin,
     )
     sam_stream = gem.gem2sam(mapping, **conversion_options)

     gem.sam2bam(
         sam_stream,
         output=args.output,
         sorted=not args.no_sort,
         threads=args.threads,
         sort_memory=str(args.sort_memory),
     )

     if args.no_index:
         return
     gem.bamIndex(args.output)
Esempio n. 8
0
def iterative_mapping(gem_index_path, fastq_path, out_sam_path, range_start,
                      range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    For each window (``range_start[i]``, ``range_stop[i]``) the reads are
    hard-trimmed to that window, mapped with GEM and written as SAM/BAM. The
    reads left unmapped are filtered into a temporary FASTQ file that is then
    mapped with the next window, until no window is left.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed (``.gz``) or not.
    :param out_sam_path: path prefix of the files where mapped reads are
       stored in SAM/BAM format (see option output_is_bam). A suffix
       ``.<window number>:<start>-<stop>`` is appended for each window, and
       the parent directory is created if needed.
    :param range_start: list (or tuple) of integers representing the start
       position of each read fragment to be mapped (starting at 1 includes
       the first nucleotide of the read).
    :param range_stop: list (or tuple) of integers representing the end
       position of each read fragment to be mapped.
    :param True single_end: forwarded to GEM as ``single_end`` (and as
       ``paired=not single_end`` to the trimming filter); set to False when
       FASTQ contains paired-ends flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a time.
       If -1, all reads will be processed in one run (more RAM memory needed).
    :param [] out_files: list to which the generated paths are appended (this
       same list is returned).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.

    :raises Exception: if the windows are not lists/tuples of integers of the
       same size, with strictly positive starts lower than their stops.
    :raises IOError: if the FASTQ file holds less than two lines.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # N_WINDOWS is module-level state shared by the recursive calls. The
    # declaration must come before the first use of the name in the function
    # (declaring it later is a SyntaxError on Python 3).
    # NOTE(review): it is only (re)initialised when falsy or by the chunked
    # branch below, so a value left by a previous top-level call looks like it
    # would be reused -- confirm how callers reset it.
    global N_WINDOWS

    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', tempfile.gettempdir())))

    # Keyword arguments forwarded to the recursive calls. 'out_files' is left
    # out on purpose: a nested call would otherwise append to, and return,
    # this very list, and extending a list with itself duplicates its content.
    sub_kwargs = dict((key, val) for key, val in kwargs.items()
                      if key != 'out_files')

    # check kwargs
    for kw in kwargs:
        if kw not in ('single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk', 'out_files',
                      'output_is_bam', 'temp_dir'):
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # check windows:
    if not (isinstance(range_start, (list, tuple))
            and isinstance(range_stop, (list, tuple))):
        raise Exception('ERROR: range_start and range_stop should be lists')
    # work on copies: windows are popped below and the sequences given by the
    # caller should not be emptied as a side effect
    range_start = list(range_start)
    range_stop = list(range_stop)
    if not all(isinstance(i, int) for i in range_start + range_stop):
        try:
            range_start = [int(i) for i in range_start]
            range_stop = [int(i) for i in range_stop]
        except ValueError:
            raise Exception(
                'ERROR: range_start and range_stop should contain' +
                ' integers only')
        warn('WARNING: range_start and range_stop converted to integers')
    # only the sizes are compared here; uniqueness of windows is not checked
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        next(fastqh)  # skip the header line
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)

    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # chunks must not be split again
        sub_kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        # presumably 4 lines per FASTQ record -- see _chunk_file
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # reset so that the nested call recomputes the number of windows
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            out_files.extend(
                iterative_mapping(gem_index_path, fastq_chunk_path,
                                  out_sam_path + '.%d' % (i + 1),
                                  range_start, range_stop, **sub_kwargs))

        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # no window left: end of the recursion
    if not range_start:
        return out_files
    seq_beg = range_start.pop(0)
    seq_end = range_stop.pop(0)

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # suffix identifying the current window: number, start and stop
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)

    # output
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads,
        paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed,
                        gem_index_path,
                        min_decoded_strata=0,
                        max_decoded_matches=2,
                        unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam (the returned handles are kept referenced until the
    # end of the function, as in the original implementation)
    if output_is_bam:
        sam = gem.gem2sam(mapped,
                          index=gem_index_path,
                          threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped,
                          index=gem_index_path,
                          output=local_out_sam,
                          threads=nthreads,
                          single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    # drop the window suffix added by a previous iteration, if any
    if unmapped_fastq_path[-1].isdigit():
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    out_files.extend(
        iterative_mapping(gem_index_path, unmapped_fastq_path, out_sam_path,
                          range_start, range_stop, **sub_kwargs))
    os.remove(unmapped_fastq_path)
    return out_files
Esempio n. 9
0
    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)
    
    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam
    sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                      threads=nthreads, single_end=single_end)
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.join(
        temp_dir, os.path.split(fastq_path)[1] + '.%d' % seq_len)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
                                       out_sam_path,
Esempio n. 10
0
    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(
        temp_dir, unmapped_fastq_path + '.%d:%d-%d' % (
            N_WINDOWS - len(range_stop), seq_beg, seq_end))
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
Esempio n. 11
0
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    For each window (``range_start[i]``, ``range_stop[i]``) the reads are
    hard-trimmed to that window, mapped with GEM and written as SAM/BAM. The
    reads left unmapped are filtered into a temporary FASTQ file that is then
    mapped with the next window, until no window is left.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed (``.gz``) or not.
    :param out_sam_path: path prefix of the files where mapped reads are
       stored in SAM/BAM format (see option output_is_bam). A suffix
       ``.<window number>:<start>-<stop>`` is appended for each window, and
       the parent directory is created if needed.
    :param range_start: list (or tuple) of integers representing the start
       position of each read fragment to be mapped (starting at 1 includes
       the first nucleotide of the read).
    :param range_stop: list (or tuple) of integers representing the end
       position of each read fragment to be mapped.
    :param True single_end: forwarded to GEM as ``single_end`` (and as
       ``paired=not single_end`` to the trimming filter); set to False when
       FASTQ contains paired-ends flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations allowed
       while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a time.
       If -1, all reads will be processed in one run (more RAM memory needed).
    :param [] out_files: list to which the generated paths are appended (this
       same list is returned).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.

    :raises Exception: if the windows are not lists/tuples of integers of the
       same size, with strictly positive starts lower than their stops.
    :raises IOError: if the FASTQ file holds less than two lines.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # N_WINDOWS is module-level state shared by the recursive calls. The
    # declaration must come before the first use of the name in the function
    # (declaring it later is a SyntaxError on Python 3).
    # NOTE(review): it is only (re)initialised when falsy or by the chunked
    # branch below, so a value left by a previous top-level call looks like it
    # would be reused -- confirm how callers reset it.
    global N_WINDOWS

    gem_index_path      = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path          = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path        = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end          = kwargs.get('single_end'          , True)
    max_edit_distance   = kwargs.get('max_edit_distance'   , 0.04)
    mismatches          = kwargs.get('mismatches'          , 0.04)
    nthreads            = kwargs.get('nthreads'            , 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk' , -1)
    out_files           = kwargs.get('out_files'           , [])
    output_is_bam       = kwargs.get('output_is_bam'       , False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # Keyword arguments forwarded to the recursive calls. 'out_files' is left
    # out on purpose: a nested call would otherwise append to, and return,
    # this very list, and extending a list with itself duplicates its content.
    sub_kwargs = dict((key, val) for key, val in kwargs.items()
                      if key != 'out_files')

    # check kwargs
    for kw in kwargs:
        if kw not in ('single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk',
                      'out_files', 'output_is_bam', 'temp_dir'):
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # check windows:
    if not (isinstance(range_start, (list, tuple)) and
            isinstance(range_stop, (list, tuple))):
        raise Exception('ERROR: range_start and range_stop should be lists')
    # work on copies: windows are popped below and the sequences given by the
    # caller should not be emptied as a side effect
    range_start = list(range_start)
    range_stop  = list(range_stop)
    if not all(isinstance(i, int) for i in range_start + range_stop):
        try:
            range_start = [int(i) for i in range_start]
            range_stop  = [int(i) for i in range_stop]
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
        warn('WARNING: range_start and range_stop converted to integers')
    # only the sizes are compared here; uniqueness of windows is not checked
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        next(fastqh)  # skip the header line
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)

    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # chunks must not be split again
        sub_kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        # presumably 4 lines per FASTQ record -- see _chunk_file
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # reset so that the nested call recomputes the number of windows
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            out_files.extend(iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1), range_start, range_stop,
                **sub_kwargs))

        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # no window left: end of the recursion
    if not range_start:
        return out_files
    seq_beg = range_start.pop(0)
    seq_end = range_stop.pop(0)

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # suffix identifying the current window: number, start and stop
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)

    # output
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)
    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam (the returned handles are kept referenced until the
    # end of the function, as in the original implementation)
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    # drop the window suffix added by a previous iteration, if any
    if unmapped_fastq_path[-1].isdigit():
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
                                       out_sam_path,
                                       range_start, range_stop, **sub_kwargs))
    os.remove(unmapped_fastq_path)
    return out_files
    ## remove files
    if REMOVE_FILES:
        print "Removing intermediate files"
        os.remove(initial_out)
        os.remove(initial_split_out)
        os.remove(denovo_out)
        os.remove(junctions_out)
        os.remove(trim_20_out)
        os.remove(trim_20_split_out)

    ## pair align the mappings
    print "Running pair aligner"
    paired_mapping = gem.pairalign(merged, index, paired_out, max_insert_size=100000, threads=THREADS)

    if REMOVE_FILES:
        os.remove(merge_out)

    ## validate and score the alignment
    print "Validating and scoring alignment"
    scored = gem.score(paired_mapping, index, final_out, threads=THREADS)
    if REMOVE_FILES:
        os.remove(paired_out)

    ## convert the result to sam and then to bam
    print "Converting to sam"
    sam = gem.gem2sam(scored, index, threads=4)
    bam = gem.sam2bam(sam, sam_out, sorted=True)

    print "Done :)"