def test_gem2sam_execution_to_file():
    """Map the test reads and convert them to a SAM file written to disk."""
    result = results_dir + "/test_sam.sam"
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)

    sam = gem.gem2sam(mapped, index, output=result, compact=True)

    # The converter must hand back a live process bound to the output file.
    assert sam is not None
    assert sam.process is not None
    assert sam.filename == result
    # ... and the SAM file must actually have been created.
    assert os.path.exists(result)
def test_gem2sam_execution():
    """Map the test reads and stream the SAM conversion without a file."""
    reads = files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)

    sam = gem.gem2sam(mapped, index, compact=True)

    assert sam is not None
    assert sam.process is not None
    # No ``output`` was given, so no file name is attached to the result.
    assert sam.filename is None

    # Drain the stream and count the records it yields.
    count = sum(1 for _ in sam)
    assert count == 10000
def test_gem2sam_sam2bam():
    """Map the test reads, convert to SAM, then to a BAM file on disk."""
    result = results_dir + "/test_sam.bam"
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)
    sam = gem.gem2sam(mapped, index, compact=True)

    bam = gem.sam2bam(sam, output=result)
    assert os.path.exists(result)

    # Re-open the BAM file and count the records it contains.
    count = sum(1 for _ in gem.files.open(result))
    assert count == 10000, "Count 10000!=%d" % count
def test_gem2sam_sam2bam():
    """Map the test reads, convert to SAM, then to a BAM file on disk."""
    result = results_dir + "/test_sam.bam"
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mapped = gem.mapper(reads, index)
    sam = gem.gem2sam(mapped, index, compact=True)

    bam = gem.sam2bam(sam, output=result)
    assert os.path.exists(result)

    # Re-open the BAM file and count the records it contains.
    count = sum(1 for _ in gem.files.open(result))
    assert count == 10000, "Count 10000!=%d" % count
def run(self, args):
    """Convert a GEM mapping to BAM and, unless disabled, index the result.

    The mapping is read from ``args.input`` when given, otherwise from
    standard input (in which case the conversion is flagged ``raw``).
    """
    quality = gem._prepare_quality_parameter(args.quality)

    # Without an explicit input file the mapping comes from stdin.
    raw = args.input is None
    source = sys.stdin if raw else args.input
    map_file = gem.files.open(source, quality=quality)

    # ``--no-xs`` disables the splice consensus passed to the converter.
    cons = None if args.no_xs else gem.extended_splice_consensus

    # NOTE(review): the raw ``args.quality`` (not the prepared ``quality``)
    # is forwarded here -- confirm this is what gem2sam expects.
    sam = gem.gem2sam(
        map_file,
        index=args.index,
        threads=args.threads,
        quality=args.quality,
        consensus=cons,
        raw=raw,
    )
    gem.sam2bam(
        sam,
        output=args.output,
        sorted=not args.no_sort,
        threads=args.threads,
        sort_memory=str(args.sort_memory),
    )

    if args.no_index:
        return
    gem.bamIndex(args.output)
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    Each (start, stop) window is mapped in turn: the reads are hard-trimmed
    to the window, mapped with GEM, converted to SAM/BAM, and the reads that
    stay unmapped are written to a temporary FASTQ file that is used as the
    input for the next window.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed (``.gz``) or not.
    :param out_sam_path: path prefix of the output files; each window is
       written to ``out_sam_path + '.<window index>:<start>-<stop>'`` and the
       parent directory is created if needed.
    :param range_start: list (or tuple) of integers representing the start
       position of each read fragment to be mapped (starting at 1 includes
       the first nucleotide of the read). The caller's sequence is not
       modified.
    :param range_stop: list (or tuple) of integers representing the end
       position of each read fragment to be mapped. Must be of the same type
       (list or tuple) as *range_start*.
    :param True single_end: forwarded to GEM as ``single_end`` for the SAM
       conversion and as ``paired=not single_end`` for the trimming filter.
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that,
       however other options are chosen, all the matches up to the specified
       number of substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a
       time. If -1, all reads will be processed in one run (more RAM memory
       needed).
    :param [] out_files: list to which the generated paths are appended
       (a new list is created when not given).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param tempfile.gettempdir() temp_dir: important to change. Intermediate
       FASTQ files will be written there.

    :raises Exception: if the windows are not lists/tuples of integers of
       equal length, with ``0 < start < stop`` for every window.
    :raises IOError: if the read length cannot be obtained from *fastq_path*.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # Must come first: the name is read and rebound below, and a ``global``
    # statement placed after the first use of the name is a SyntaxError on
    # Python 3 (and a SyntaxWarning on Python 2).
    global N_WINDOWS

    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    known_keywords = ('single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk', 'out_files',
                      'output_is_bam', 'temp_dir')
    for kw in kwargs:
        if kw not in known_keywords:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # Recursive calls must build their own result list: forwarding a
    # caller-supplied ``out_files`` would make them return this very list,
    # and extending a list with itself duplicates every entry.
    sub_kwargs = dict(kwargs)
    sub_kwargs.pop('out_files', None)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple) or
                not isinstance(range_stop, tuple)):
            raise Exception(
                'ERROR: range_start and range_stop should be lists')
    # Always work on copies: windows are consumed with ``pop(0)`` below and
    # the caller's sequences must not be emptied as a side effect.
    range_start = list(range_start)
    range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start) or
            not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = [int(i) for i in range_start]
            range_stop = [int(i) for i in range_stop]
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
        warn('WARNING: range_start and range_stop converted to integers')
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # No window left: this is the end of the recursion. Return before reading
    # the FASTQ file, which may legitimately be empty at this point (every
    # read was mapped), and reset the window counter so that a later run with
    # a different number of windows starts from a clean state.
    if not range_start:
        N_WINDOWS = 0
        return out_files

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        next(fastqh)
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)

    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # chunks themselves must not be split again
        sub_kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)  # 4 lines per FASTQ record
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # force the recursive call to recompute the number of windows
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            out_files.extend(iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1),
                range_start[:], range_stop[:], **sub_kwargs))

        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # current window (validated above to be non-empty)
    seq_end = range_stop.pop(0)
    seq_beg = range_start.pop(0)

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # suffix identifying this window: '.<window index>:<start>-<stop>'
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)

    # output
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)

    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        # drop the window suffix appended by the previous iteration
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    try:
        out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
                                           out_sam_path,
                                           range_start, range_stop,
                                           **sub_kwargs))
    finally:
        # the temporary FASTQ is removed even if a later window fails
        os.remove(unmapped_fastq_path)

    return out_files
# trimming trimmed = gem.filter.run_filter( inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)], threads=nthreads, paired=not single_end) # mapping mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0, max_decoded_matches=2, unique_mapping=False, max_edit_distance=max_edit_distance, mismatches=mismatches, output=temp_dir + '/test.map', threads=nthreads) # convert to sam sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam, threads=nthreads, single_end=single_end) if output_is_bam: sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads, single_end=single_end) _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads) else: sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam, threads=nthreads, single_end=single_end) # Recursively go to the next iteration. unmapped_fastq_path = os.path.join( temp_dir, os.path.split(fastq_path)[1] + '.%d' % seq_len) _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path) out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path, out_sam_path,
# trimming trimmed = gem.filter.run_filter( inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)], threads=nthreads, paired=not single_end) # mapping mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0, max_decoded_matches=2, unique_mapping=False, max_edit_distance=max_edit_distance, mismatches=mismatches, output=temp_dir + '/test.map', threads=nthreads) # convert to sam/bam if output_is_bam: sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads, single_end=single_end) _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads) else: sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam, threads=nthreads, single_end=single_end) # Recursively go to the next iteration. unmapped_fastq_path = os.path.split(fastq_path)[1] if unmapped_fastq_path[-1].isdigit(): unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0] unmapped_fastq_path = os.path.join( temp_dir, unmapped_fastq_path + '.%d:%d-%d' % ( N_WINDOWS - len(range_stop), seq_beg, seq_end)) _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path) out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    Each (start, stop) window is mapped in turn: the reads are hard-trimmed
    to the window, mapped with GEM, converted to SAM/BAM, and the reads that
    stay unmapped are written to a temporary FASTQ file that is used as the
    input for the next window.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed (``.gz``) or not.
    :param out_sam_path: path prefix of the output files; each window is
       written to ``out_sam_path + '.<window index>:<start>-<stop>'`` and the
       parent directory is created if needed.
    :param range_start: list (or tuple) of integers representing the start
       position of each read fragment to be mapped (starting at 1 includes
       the first nucleotide of the read). The caller's sequence is not
       modified.
    :param range_stop: list (or tuple) of integers representing the end
       position of each read fragment to be mapped. Must be of the same type
       (list or tuple) as *range_start*.
    :param True single_end: forwarded to GEM as ``single_end`` for the SAM
       conversion and as ``paired=not single_end`` for the trimming filter.
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that,
       however other options are chosen, all the matches up to the specified
       number of substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a
       time. If -1, all reads will be processed in one run (more RAM memory
       needed).
    :param [] out_files: list to which the generated paths are appended
       (a new list is created when not given).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param tempfile.gettempdir() temp_dir: important to change. Intermediate
       FASTQ files will be written there.

    :raises Exception: if the windows are not lists/tuples of integers of
       equal length, with ``0 < start < stop`` for every window.
    :raises IOError: if the read length cannot be obtained from *fastq_path*.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # Must come first: the name is read and rebound below, and a ``global``
    # statement placed after the first use of the name is a SyntaxError on
    # Python 3 (and a SyntaxWarning on Python 2).
    global N_WINDOWS

    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    known_keywords = ('single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk', 'out_files',
                      'output_is_bam', 'temp_dir')
    for kw in kwargs:
        if kw not in known_keywords:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # Recursive calls must build their own result list: forwarding a
    # caller-supplied ``out_files`` would make them return this very list,
    # and extending a list with itself duplicates every entry.
    sub_kwargs = dict(kwargs)
    sub_kwargs.pop('out_files', None)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple) or
                not isinstance(range_stop, tuple)):
            raise Exception(
                'ERROR: range_start and range_stop should be lists')
    # Always work on copies: windows are consumed with ``pop(0)`` below and
    # the caller's sequences must not be emptied as a side effect.
    range_start = list(range_start)
    range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start) or
            not all(isinstance(i, int) for i in range_stop)):
        try:
            range_start = [int(i) for i in range_start]
            range_stop = [int(i) for i in range_stop]
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
        warn('WARNING: range_start and range_stop converted to integers')
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # No window left: this is the end of the recursion. Return before reading
    # the FASTQ file, which may legitimately be empty at this point (every
    # read was mapped), and reset the window counter so that a later run with
    # a different number of windows starts from a clean state.
    if not range_start:
        N_WINDOWS = 0
        return out_files

    # get the length of a read
    if fastq_path.endswith('.gz'):
        fastqh = gzip.open(fastq_path)
    else:
        fastqh = open(fastq_path)
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    try:
        next(fastqh)
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)

    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # chunks themselves must not be split again
        sub_kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)  # 4 lines per FASTQ record
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # force the recursive call to recompute the number of windows
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            out_files.extend(iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1),
                range_start[:], range_stop[:], **sub_kwargs))

        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # current window (validated above to be non-empty)
    seq_end = range_stop.pop(0)
    seq_beg = range_start.pop(0)

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # suffix identifying this window: '.<window index>:<start>-<stop>'
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)

    # output
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)

    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        # drop the window suffix appended by the previous iteration
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    try:
        out_files.extend(iterative_mapping(gem_index_path, unmapped_fastq_path,
                                           out_sam_path,
                                           range_start, range_stop,
                                           **sub_kwargs))
    finally:
        # the temporary FASTQ is removed even if a later window fails
        os.remove(unmapped_fastq_path)

    return out_files
## remove files
if REMOVE_FILES:
    print("Removing intermediate files")
    for intermediate in (initial_out, initial_split_out, denovo_out,
                         junctions_out, trim_20_out, trim_20_split_out):
        os.remove(intermediate)

## pair align the mappings
print("Running pair aligner")
paired_mapping = gem.pairalign(merged, index, paired_out,
                               max_insert_size=100000, threads=THREADS)
if REMOVE_FILES:
    # the merged mapping is not read again below
    os.remove(merge_out)

## validate and score the alignment
print("Validating and scoring alignment")
scored = gem.score(paired_mapping, index, final_out, threads=THREADS)
if REMOVE_FILES:
    # the unscored paired mapping is not read again below
    os.remove(paired_out)

## convert the result to sam and then to bam
print("Converting to sam")
# use the configured thread count, like the pairing and scoring steps above,
# instead of a hard-coded 4
sam = gem.gem2sam(scored, index, threads=THREADS)
bam = gem.sam2bam(sam, sam_out, sorted=True)
print("Done :)")