def go(nucleotides_per_input=8000000, gzip_output=True, gzip_level=3,
        to_stdout=False, push='.', mover=filemover.FileMover(),
        verbose=False, scratch=None, bin_qualities=True, short_qnames=False,
        skip_bad_records=False, workspace_dir=None,
        fastq_dump_exe='fastq-dump', ignore_missing_sra_samples=False):
    """ Runs Rail-RNA-preprocess

        Input (read from stdin)
        ----------------------------
        Tab-separated fields (the first field of every input line is dropped
        before parsing; the remaining fields are):
        ---If URL is local:
        1. #!splitload
        2. \x1d-separated list of 0-based indexes of reads at which to start
            each new file
        3. \x1d-separated list of numbers of reads to include in gzipped files
        4. \x1d-separated list of manifest lines whose tabs are replaced by
            \x1es
        (the last field of a #!splitload line is passed to phred_converter()
        as the Phred format)

        ---Otherwise:
        manifest line

        A manifest line has the following format

        (for single-end reads)
        <URL>(tab)<Optional MD5>(tab)<Sample label>

        (for paired-end reads)
        <URL 1>(tab)<Optional MD5 1>(tab)<URL 2>(tab)<Optional MD5 2>(tab)
        <Sample label>

        Hadoop output (written to stdout)
        ----------------------------
        None.

        Other output (written to directory specified by command-line parameter
            --push)
        ____________________________
        Files containing input data in one of the following formats:
        Format 1 (single-end, 4-column):
          1. Nucleotide sequence or its reversed complement, whichever is
            first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 4 columns each)
        (so this is the same as single-end)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1

            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        For FASTA input, the quality string of a read is a run of 'h'
        characters as long as the read (before optional binning).

        nucleotides_per_input: maximum number of nucleotides to put in a given
            input file
        gzip_output: True iff preprocessed input should be gzipped
        gzip_level: level of gzip compression to use
        to_stdout: True iff output should be written to stdout rather than to
            files under push; incompatible with #!splitload lines
        push: where to send output
        mover: FileMover object used to download remote inputs and upload
            output files
        verbose: True iff extra debugging statements should be printed to
            stderr
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory
        bin_qualities: True iff quality string should be binned according to
            rules in _mismatch_penalties_to_quality_scores and
            round_quality_string() defined in go()
        short_qnames: True iff original qname should be killed and a new qname
            should be written in a short base64-encoded format
        skip_bad_records: True iff bad records should be skipped; otherwise,
            raises exception if bad record is encountered
        workspace_dir: where to use fastq-dump -- needed for working with
            dbGaP data. None if temporary dir should be used.
        fastq_dump_exe: path to fastq-dump executable
        ignore_missing_sra_samples: does not return error if fastq-dump doesn't
            find a sample; that sample alone is skipped

        No return value
    """
    if bin_qualities:
        import math
        def round_quality_string(qual):
            """ Bins phred+33 quality string to improve compression.

                Uses 5-bin scheme that does not affect Bowtie 2 alignments

                qual: quality string

                Return value: "binned" quality string.
            """
            return ''.join(
                [str(int(
                    _MN + math.floor((_MX - _MN) * min(
                                            ord(qual_char) - 33.0, 40.0
                                        ) / 40.0)
                    )) for qual_char in qual]).translate(
                            _mismatch_penalties_to_quality_scores
                        )
    else:
        def round_quality_string(qual):
            """ Leaves quality string unbinned and untouched.

                qual: quality string

                Return value: qual
            """
            return qual
    global _input_line_count, _output_line_count
    # NOTE(review): skip_stubs is never reset, so once one SRA sample is found
    # to carry a barcode FASTQ, the awk length filter is applied to every
    # later SRA sample as well -- confirm that this is intended.
    skip_stubs = False
    temp_dir = make_temp_dir(scratch)
    print >>sys.stderr, 'Created local destination directory "%s".' % temp_dir
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    if not to_stdout:
        push_url = Url(push)
        if push_url.is_local:
            destination = push
        elif push_url.is_s3 or push_url.is_hdfs or push_url.is_nfs:
            # Write locally first; files are uploaded with mover.put() later
            destination = temp_dir
        else:
            raise RuntimeError('Push destination must be '
                               'on S3, HDFS, NFS, or local.')
    fastq_cues = set(['@'])
    fasta_cues = set(['>', ';'])
    # Maps tuple of source Url(s) to (sample label,) or to
    # (sample label, index of first read, number of reads)
    source_dict = {}
    for line in sys.stdin:
        _input_line_count += 1
        if not line.strip(): continue
        # Kill offset from start of manifest file
        try:
            tokens = line.strip().split('\t')[1:]
            if tokens[0][0] == '#' and tokens[0] != '#!splitload':
                # Comment line
                continue
        except IndexError:
            # Be robust to bad lines
            continue
        token_count = len(tokens)
        # NOTE(review): qual_getter is shared by every source processed
        # below; presumably each task receives a single input line -- confirm.
        qual_getter = None
        if tokens[0] == '#!splitload':
            # Line specifies precisely how records from files should be
            # placed.
            assert not to_stdout, ('Split manifest line inconsistent with '
                                   'writing to stdout.')
            qual_getter = phred_converter(phred_format=tokens[-1])
            indexes = tokens[1].split('\x1d')
            read_counts = tokens[2].split('\x1d')
            manifest_lines = [token.split('\x1e')
                                for token in tokens[3].split('\x1d')]
            assert len(indexes) == len(read_counts) == len(manifest_lines)
            for i, manifest_line in enumerate(manifest_lines):
                manifest_line_field_count = len(manifest_line)
                if manifest_line_field_count == 3:
                    source_dict[(Url(manifest_line[0]),)] = (
                            manifest_line[-1],
                            int(indexes[i]),
                            int(read_counts[i])
                        )
                else:
                    assert manifest_line_field_count == 5
                    source_dict[(Url(manifest_line[0]),
                                 Url(manifest_line[2]))] = (
                            manifest_line[-1],
                            int(indexes[i]),
                            int(read_counts[i])
                        )
        elif token_count == 3:
            # SRA or single-end reads
            source_dict[(Url(tokens[0]),)] = (tokens[-1],)
        elif token_count == 5:
            # Paired-end reads
            source_dict[(Url(tokens[0]), Url(tokens[2]))] = (tokens[-1],)
        else:
            # Not a valid line, but continue for robustness
            continue
    file_number = 0
    for source_urls in source_dict:
        # Reset per sample so that one missing SRA sample does not cause all
        # subsequent samples to be skipped
        onward = False
        sample_label = source_dict[source_urls][0]
        downloaded = set()
        sources = []
        records_printed = 0
        if len(source_dict[source_urls]) == 3:
            skip_count = source_dict[source_urls][1]
            if len(source_urls) == 2:
                records_to_consume = source_dict[source_urls][2]
                # Keep mates together: round to even record boundaries
                if skip_count % 2:
                    skip_count -= 1
                    records_to_consume += 1
                if records_to_consume % 2:
                    records_to_consume -= 1
                # Index reads according to order in input to shorten read names
                read_index = skip_count / 2 # Index reads in pairs
            else:
                records_to_consume = source_dict[source_urls][2]
                read_index = skip_count
        else:
            skip_count = 0
            records_to_consume = None # Consume all records
            read_index = 0
        assert (records_to_consume >= 0 or records_to_consume is None), (
                'Negative value %d of records to consume encountered.'
            ) % records_to_consume
        if records_to_consume == 0: continue
        skipped = False
        for source_url in source_urls:
            if not source_url.is_local:
                # Download
                print >>sys.stderr, 'Retrieving URL "%s"...' \
                    % source_url.to_url()
                if source_url.is_dbgap:
                    download_dir = workspace_dir
                elif source_url.is_sra:
                    download_dir = temp_dir
                if source_url.is_sra:
                    sra_accession = source_url.to_url()
                    # Dump only a prefix of the sample (-X 10000) to learn
                    # its layout and quality encoding
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} -I -X 10000 --split-files '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    try:
                        subprocess.check_call(
                            fastq_dump_command, shell=True,
                            executable='/bin/bash',
                            stdout=sys.stderr
                        )
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 3 and ignore_missing_sra_samples:
                            onward = True
                            break
                        else:
                            # check_call() never populates e.output, so
                            # report the exception itself (it carries the
                            # exit status)
                            raise RuntimeError(
                                ('Error "%s" encountered executing '
                                 'command "%s".') % (e,
                                                        fastq_dump_command))
                    import glob
                    sra_fastq_files = sorted(
                                        glob.glob(os.path.join(download_dir,
                                            '%s[_.]*' % sra_accession))
                                    ) # ensure 1 before 2 if paired-end
                    # Schedule for deletion
                    def silent_remove(filename):
                        try:
                            os.remove(filename)
                        except OSError as e:
                            pass
                    for sra_fastq_file in sra_fastq_files:
                        register_cleanup(silent_remove, sra_fastq_file)
                    sra_file_count = len(sra_fastq_files)
                    check_for_paired = False
                    if sra_file_count == 1:
                        sra_paired_end = False
                        print >>sys.stderr, 'Detected single-end SRA sample.'
                    elif sra_file_count in [2, 3]:
                        print >>sys.stderr, ('2 or 3 FASTQ files detected. '
                                             'Checking for barcodes...')
                        check_for_paired = True
                    else:
                        raise RuntimeError(
                                ('Unexpected number of files "%d" output '
                                 'by fastq-dump command "%s".')
                                    % (sra_file_count, fastq_dump_command)
                            )
                    if check_for_paired:
                        # Get max/min read lengths from FASTQ
                        with open(
                                sra_fastq_files[sra_file_count - 2]
                            ) as fastq_stream:
                            max_len, min_len = (
                                    max_min_read_lengths_from_fastq_stream(
                                            fastq_stream
                                        )
                                )
                            print >>sys.stderr, (
                                    'Max/min read length found in candidate '
                                    'barcode FASTQ was {}/{}.'
                                ).format(max_len, min_len)
                            if max_len <= _max_stubby_read_length:
                                print >>sys.stderr, (
                                        'Assumed barcode FASTQ.'
                                    )
                                skip_stubs = True
                                if sra_file_count == 2:
                                    sra_paired_end = False
                                else:
                                    sra_paired_end = True
                            else:
                                if sra_file_count == 2:
                                    sra_paired_end = True
                                else:
                                    raise RuntimeError(
                                            '3 FASTQs detected, but one of them '
                                            'was not recognized as containing '
                                            'barcodes.'
                                        )
                    # Guess quality from first 10k lines
                    with xopen(None, sra_fastq_files[0]) as source_stream:
                        qual_getter = phred_converter(
                                            fastq_stream=source_stream
                                        )
                    for sra_fastq_file in sra_fastq_files:
                        os.remove(sra_fastq_file)
                    # Placeholder source; reads are streamed from fastq-dump
                    sources.append(os.devnull)
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} --split-spot -I --stdout '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    if skip_stubs:
                        # Drop records no longer than the stubby read length
                        fastq_dump_command += (
                                ' | awk \'BEGIN {{OFS = "\\n"}} '
                                '{{header = $0; '
                                'getline seq; getline qheader; getline qseq; '
                                'if (length(seq) > {min_len}) {{print header, '
                                'seq, qheader, qseq}}}}\''
                            ).format(min_len=_max_stubby_read_length)
                    print >>sys.stderr, fastq_dump_command
                    sra_process = subprocess.Popen(fastq_dump_command,
                                                    shell=True,
                                                    executable='/bin/bash',
                                                    stdout=subprocess.PIPE,
                                                    bufsize=-1)
                else:
                    mover.get(source_url, temp_dir)
                    # Identify the file that just appeared in temp_dir
                    downloaded = list(
                            set(os.listdir(temp_dir)).difference(downloaded)
                        )
                    sources.append(os.path.join(temp_dir,
                                                    list(downloaded)[0]))
            else:
                sources.append(source_url.to_url())
        if onward: continue
        # Use os.devnull so single- and paired-end data can be handled in one
        # loop.
        if len(sources) == 1:
            sources.append(os.devnull)
        if qual_getter is None:
            # Figure out Phred format
            with xopen(None, sources[0]) as source_stream:
                qual_getter = phred_converter(fastq_stream=source_stream)
        with xopen(None, sources[0]) as source_stream_1, xopen(
                None, sources[1]
            ) as source_stream_2:
            source_streams = [source_stream_1, source_stream_2]
            reorganize = all([source == os.devnull for source in sources])
            if reorganize:
                # SRA data is live
                if sra_paired_end:
                    # Mates are interleaved on the same stream
                    source_streams = [sra_process.stdout, sra_process.stdout]
                else:
                    source_streams = [sra_process.stdout, open(os.devnull)]
            break_outer_loop = False
            while True:
                if not to_stdout:
                    # Name files using Hadoop task environment property
                    # mapred.task.partition.
                    if gzip_output:
                        try:
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        except KeyError:
                            # Hadoop 2.x: mapreduce.task.partition; see
                            # http://hadoop.apache.org/docs/r2.0.3-alpha/
                            # hadoop-project-dist/hadoop-common/
                            # DeprecatedProperties.html.
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        open_args = [output_file, 'a', gzip_level]
                    else:
                        try:
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        except KeyError:
                            # Same naming scheme as the branches above; a
                            # stray reference to an undefined k used to
                            # raise NameError here
                            output_file = os.path.join(
                                    destination,
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        open_args = [output_file, 'a']
                    try:
                        os.makedirs(os.path.dirname(output_file))
                    except OSError:
                        pass
                else:
                    open_args = []
                # Use xopen to handle compressed streams and normal streams
                # generally.
                with xopen(gzip_output if not to_stdout else '-', *open_args) \
                    as output_stream:
                    perform_push = False
                    line_numbers = [0, 0]
                    read_next_line = True
                    nucs_read = 0
                    while True:
                        if read_next_line:
                            # Read next line only if FASTA mode didn't already
                            lines = []
                            for source_stream in source_streams:
                                lines.append(source_stream.readline())
                        read_next_line = True
                        if not lines[0]:
                            # End of input
                            break_outer_loop = True
                            break
                        line_numbers = [i + 1 for i in line_numbers]
                        lines = [line.strip() for line in lines]
                        bad_record_skip = False
                        if lines[0][0] in fastq_cues:
                            if records_to_consume and not skipped:
                                # Skip lines as necessary; for paired-end
                                # reads skip the largest even number of
                                # records less than records_to_consume.
                                if len(source_urls) == 1:
                                    # single-end
                                    line_skip_count = max(
                                            skip_count * 4 - 1, 0
                                        )
                                else:
                                    # paired-end
                                    line_skip_count = max(
                                            ((skip_count / 2) * 4 - 1), 0
                                        )
                                    for _ in xrange(line_skip_count):
                                        next(source_stream_2)
                                for _ in xrange(line_skip_count):
                                    next(source_stream_1)
                                if skip_count:
                                    lines = []
                                    for source_stream in source_streams:
                                        lines.append(source_stream.readline())
                                    if not lines[0]:
                                        break_outer_loop = True
                                        break
                                    lines = [line.strip() for line in lines]
                                skipped = True
                            seqs = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            plus_lines = [source_stream.readline().strip()
                                            for source_stream
                                            in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            quals = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            if reorganize and sra_paired_end:
                                # Fix order!
                                lines, seqs, plus_lines, quals = (
                                        [lines[0], plus_lines[0]],
                                        [lines[1], plus_lines[1]],
                                        [seqs[0], quals[0]],
                                        [seqs[1], quals[1]]
                                    )
                            try:
                                assert plus_lines[0][0] == '+', (
                                        'Malformed read "%s" at line %d of '
                                        'file "%s".'
                                    ) % (lines[0], line_numbers[0], sources[0])
                                if plus_lines[1]:
                                    assert plus_lines[1][0] == '+', (
                                            'Malformed read "%s" at line %d '
                                            'of file "%s".'
                                        ) % (
                                            lines[1], line_numbers[1],
                                            sources[1]
                                        )
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            qual_getter(qual) for qual in quals
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                            line_numbers = [i + 1 for i in line_numbers]
                            try:
                                for i in xrange(2):
                                    assert len(seqs[i]) == len(quals[i]), (
                                            'Length of read sequence does not '
                                            'match length of quality string '
                                            'at line %d of file "%s".'
                                        ) % (line_numbers[i], sources[i])
                            except (AssertionError, IndexError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, (
                                            'Error "%s" encountered; '
                                            'skipping bad record.'
                                        ) % e.message
                                    bad_record_skip = True
                                else:
                                    raise
                        elif lines[0][0] in fasta_cues:
                            seqs = [[], []]
                            next_lines = []
                            for p, source_stream in enumerate(source_streams):
                                # Accumulate sequence lines until the next
                                # header or an empty line/end of stream
                                while True:
                                    next_line \
                                        = source_stream.readline().strip()
                                    try:
                                        if next_line[0] in fasta_cues:
                                            break
                                        else:
                                            try:
                                                seqs[p].append(next_line)
                                            except IndexError:
                                                raise
                                    except IndexError:
                                        break
                                next_lines.append(next_line)
                            seqs = [''.join(seq) for seq in seqs]
                            line_numbers = [i + 1 for i in line_numbers]
                            try:
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at '
                                            'line %d of either %s or %s' % (
                                                        line_numbers[0],
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            'h'*len(seq) for seq in seqs
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                            line_numbers = [i + 1 for i in line_numbers]
                            # Header of next record was already consumed
                            lines = next_lines
                            read_next_line = False
                        if bad_record_skip:
                            seqs = []
                            # Fake record-printing to get to records_to_consume
                            if source_streams[-1].name == os.devnull:
                                records_printed += 1
                            else:
                                records_printed += 2
                        elif len(original_qnames) == 2 and original_qnames[1]:
                            # Paired-end write
                            if original_qnames[0] == original_qnames[1]:
                                # Add paired-end identifiers
                                original_qnames[0] += '/1'
                                original_qnames[1] += '/2'
                            assert seqs[1]
                            assert quals[1]
                            seqs = [seq.upper() for seq in seqs]
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    ),
                                    seqs[1][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            if seqs[0] < reversed_complement_seqs[0]:
                                left_seq = seqs[0]
                                left_qual = quals[0]
                                left_reversed = '0'
                            else:
                                left_seq = reversed_complement_seqs[0]
                                left_qual = quals[0][::-1]
                                left_reversed = '1'
                            if seqs[1] < reversed_complement_seqs[1]:
                                right_seq = seqs[1]
                                right_qual = quals[1]
                                right_reversed = '0'
                            else:
                                right_seq = reversed_complement_seqs[1]
                                right_qual = quals[1][::-1]
                                right_reversed = '1'
                            if short_qnames:
                                left_qname_to_write = encode(read_index) + '/1'
                                right_qname_to_write = encode(
                                                            read_index
                                                        ) + '/2'
                            else:
                                left_qname_to_write = original_qnames[0]
                                right_qname_to_write = original_qnames[1]
                            # The embedded newline splits the two mates
                            # across two output lines
                            print >>output_stream, '\t'.join(
                                        [
                                            left_seq,
                                            left_reversed,
                                            qname_from_read(
                                                    left_qname_to_write,
                                                    seqs[0] + quals[0],
                                                    sample_label,
                                                    mate=seqs[1]
                                                ),
                                            '\n'.join([
                                                round_quality_string(
                                                    left_qual
                                                ), right_seq
                                            ]),
                                            right_reversed,
                                            qname_from_read(
                                                    right_qname_to_write,
                                                    seqs[1] + quals[1],
                                                    sample_label,
                                                    mate=seqs[0]
                                                ),
                                            round_quality_string(right_qual)
                                        ]
                                    )
                            records_printed += 2
                            _output_line_count += 1
                        else:
                            seqs[0] = seqs[0].upper()
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            # Single-end write
                            if seqs[0] < reversed_complement_seqs[0]:
                                seq = seqs[0]
                                qual = quals[0]
                                is_reversed = '0'
                            else:
                                seq = reversed_complement_seqs[0]
                                qual = quals[0][::-1]
                                is_reversed = '1'
                            if short_qnames:
                                qname_to_write = encode(read_index)
                            else:
                                qname_to_write = original_qnames[0]
                            print >>output_stream, '\t'.join(
                                        [
                                            seq,
                                            is_reversed,
                                            qname_from_read(
                                                qname_to_write,
                                                seqs[0] + quals[0],
                                                sample_label
                                            ),
                                            round_quality_string(qual)
                                        ]
                                    )
                            records_printed += 1
                            _output_line_count += 1
                        read_index += 1
                        for seq in seqs:
                            nucs_read += len(seq)
                        if records_printed == records_to_consume:
                            break_outer_loop = True
                            perform_push = True
                            break
                        if not to_stdout and not records_to_consume and \
                            nucs_read > nucleotides_per_input:
                            # Start a new output file
                            file_number += 1
                            break
                if verbose:
                    print >>sys.stderr, (
                            'Exited with statement; line numbers are %s'
                            % line_numbers
                        )
                if (not to_stdout) and (push_url.is_nfs or
                    push_url.is_s3 or push_url.is_hdfs) \
                    and ((not records_to_consume) or
                         (records_to_consume and perform_push)):
                    print >>sys.stderr, 'Pushing "%s" to "%s" ...' % (
                                                output_file,
                                                push_url.to_url()
                                            )
                    print >>sys.stderr, 'reporter:status:alive'
                    mover.put(output_file, push_url.plus(os.path.basename(
                                                                output_file
                                                            )))
                    try:
                        os.remove(output_file)
                    except OSError:
                        pass
                if break_outer_loop: break
            if verbose:
                print >>sys.stderr, 'Exiting source streams...'
        if verbose:
            print >>sys.stderr, 'Exited source streams.'
        # Clear temporary directory
        for input_file in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, input_file))
            except OSError:
                pass
        if 'sra_process' in locals():
            sra_process.stdout.close()
            sra_return_code = sra_process.wait()
            if sra_return_code > 0:
                raise RuntimeError(('fastq-dump terminated with exit '
                                    'code %d. Command run was "%s".')
                                        % (sra_return_code,
                                            fastq_dump_command))
            del sra_process
help='Print out extra debugging statements') filemover.add_args(parser) tempdel.add_args(parser) args = parser.parse_args() import time start_time = time.time() input_line_count = 0 if args.out is not None: '''If --out is a local file, just write directly to that file. Otherwise, write to a temporary file that will later be uploaded to the destination.''' output_url = Url(args.out) if output_url.is_local: try: os.makedirs(output_url.to_url()) except: pass output_filename = os.path.join(args.out, args.junction_filename) else: temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) output_filename = args.junction_filename + '.temp' output_filename = os.path.join(temp_dir_path, output_filename) with xopen(True, output_filename, 'w', args.gzip_level) as output_stream: for line in sys.stdin: tokens = line.strip().split('\t') # Remove leading zeros from ints print >>output_stream, '\t'.join(
help='Basename for index to be written') parser.add_argument(\ '--keep-alive', action='store_const', const=True, default=False, help='Prints reporter:status:alive messages to stderr to keep EMR ' 'task alive') filemover.add_args(parser) bowtie.add_args(parser) tempdel.add_args(parser) args = parser.parse_args() import time start_time = time.time() output_filename, output_stream, output_url = [None] * 3 output_url = Url(args.out) if args.out is not None \ else Url(os.getcwd()) # Set up temporary destination import tempfile temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) # For deleting temporary directory, even on unexpected exit register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) # Set up temporary destination try: os.makedirs(os.path.join(temp_dir_path, 'index')) except: pass # Write to temporary directory, and later upload to URL index_basename = os.path.join(temp_dir_path, 'index/' + args.basename) fasta_file = os.path.join(temp_dir_path, 'temp.fa') print >> sys.stderr, 'Opened %s for writing....' % fasta_file
if args.keep_alive: from dooplicity.tools import KeepAlive keep_alive_thread = KeepAlive(sys.stderr) keep_alive_thread.start() import time start_time = time.time() reference_index = bowtie_index.BowtieIndexReference( os.path.expandvars(args.bowtie_idx) ) # For mapping sample indices back to original sample labels manifest_object = manifest.LabelsAndIndices( os.path.expandvars(args.manifest) ) output_url = Url(args.out) if args.out is not None \ else Url(os.getcwd()) input_line_count = 0 counter = Counter('bed') register_cleanup(counter.flush) if output_url.is_local: # Set up destination directory try: os.makedirs(output_url.to_url()) except: pass else: mover = filemover.FileMover(args=args) # Set up temporary destination import tempfile temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) for (line_type, sample_label), xpartition in xstream(sys.stdin, 2):
help='Basename for index to be written') parser.add_argument(\ '--keep-alive', action='store_const', const=True, default=False, help='Prints reporter:status:alive messages to stderr to keep EMR ' 'task alive') filemover.add_args(parser) bowtie.add_args(parser) tempdel.add_args(parser) args = parser.parse_args() import time start_time = time.time() output_filename, output_stream, output_url = [None]*3 output_url = Url(args.out) if args.out is not None \ else Url(os.getcwd()) # Set up temporary destination import tempfile temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) # For deleting temporary directory, even on unexpected exit register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) # Set up temporary destination try: os.makedirs(os.path.join(temp_dir_path, 'index')) except: pass # Write to temporary directory, and later upload to URL index_basename = os.path.join(temp_dir_path, 'index/' + args.basename) fasta_file = os.path.join(temp_dir_path, 'temp.fa') print >>sys.stderr, 'Opened %s for writing....' % fasta_file with open(fasta_file, 'w') as fasta_stream: input_line_count = 0 for line in sys.stdin:
print 'counts\t-\t%s\t%s\t%d\t%d' % (sample_index, rname_index, total_count, unique_count) else: # Grab stats _and_ output SAM/BAMs if not args.output_sam: # Only need subprocess to start samtools if outputting bam import subprocess # Get RNAMEs in order of descending length sorted_rnames = [reference_index.string_to_rname['%012d' % i] for i in xrange( len(reference_index.string_to_rname) - 1 )] total_count, unique_count = 0, 0 if args.out is not None: output_url = Url(args.out) if output_url.is_local: # Set up destination directory try: os.makedirs(output_url.to_url()) except: pass output_dir = args.out else: mover = filemover.FileMover(args=args) # Set up temporary destination import tempfile temp_dir_path = make_temp_dir( tempdel.silentexpandvars(args.scratch) ) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) output_dir = temp_dir_path
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
        bowtie2_index_base='genome', bowtie2_args='', verbose=False,
        report_multiplier=1.2, stranded=False, fudge=5, score_min=60,
        gzip_level=3, mover=filemover.FileMover(), intermediate_dir='.',
        scratch=None):
    """ Runs Rail-RNA-cointron_enum

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns. (Not read by
            this function: the Bowtie 2 pipeline it launches writes directly
            to sys.stdout.)
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
            to accommodate potential indels
        score_min: Bowtie2 CONSTANT minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        Raises RuntimeError if the Bowtie 2 pipeline exits with a nonzero
        code.

        No return value.
    """
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        # The marker files below cannot be created unless the directory
        # exists; tolerate its having been created already
        try:
            os.makedirs(index_directory)
        except OSError:
            if not os.path.isdir(index_directory):
                raise
        # _STARTED/_SUCCESS marker files let only the first caller to arrive
        # download the index while any others wait for it to finish.
        # NOTE(review): the exists-then-create check is not atomic -- confirm
        # whether concurrent tasks can race here.
        if not os.path.exists(os.path.join(index_directory, '_STARTED')):
            # Download index
            with open(os.path.join(index_directory, '_STARTED'), 'w') \
                as started_stream:
                print >>started_stream, 'STARTED'
            for extension in ['.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2',
                                '.rev.1.bt2', '.rev.2.bt2']:
                # Fetch each index file by its full name; the bare basename
                # URL alone does not name any of them
                mover.get(Url(bowtie2_index_base + extension),
                            index_directory)
            with open(os.path.join(index_directory, '_SUCCESS'), 'w') \
                as success_stream:
                print >>success_stream, 'SUCCESS'
        while not os.path.exists(os.path.join(index_directory, '_SUCCESS')):
            time.sleep(0.5)
        # From here on, use the local copy of the index
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    global _input_line_count
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    # Stage reads in the tab-delimited form consumed by Bowtie 2's --12
    # option: name, sequence, qualities. The stripped input line serves as
    # both name and sequence, and every base gets quality 'I'.
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            print >>reads_stream, '\t'.join([seq, seq, 'I' * len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([
        bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min,
        '-D 24 -R 3 -N 1 -L 20 -i L,4,0'
    ])
    # The delegate script sits next to this file: strip ".py" from this
    # file's path and append "_delegate.py"
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        '_delegate.py --report-multiplier %08f --fudge %d %s %s'
            % (report_multiplier, fudge,
                '--stranded' if stranded else '',
                '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
            [input_command, bowtie_command, delegate_command]
        )
    print >>sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    # pipefail makes a failure in any stage of the pipeline surface in the
    # exit code checked below
    bowtie_process = subprocess.Popen(
            ' '.join(['set -exo pipefail;', full_command]),
            bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
            shell=True, executable='/bin/bash'
        )
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
) # For mapping sample indices back to original sample labels manifest_object = manifest.LabelsAndIndices( os.path.expandvars(args.manifest) ) # Create file with chromosome sizes for bedTobigwig sizes_filename = os.path.join(temp_dir_path, 'chrom.sizes') if args.verbose: print >>sys.stderr, 'Sizes file: %s .' % sizes_filename with open(sizes_filename, 'w') as sizes_stream: for rname in reference_index.rname_lengths: print >>sizes_stream, '%s %d' % (rname, reference_index.rname_lengths[rname]) input_line_count, output_line_count = 0, 0 output_url = Url(args.out) if output_url.is_local: # Set up destination directory try: os.makedirs(output_url.to_url()) except: pass mover = filemover.FileMover(args=args) track_line = ('track type=bedGraph name="{name}" ' 'description="{description}" visibility=full ' 'color=227,29,118 altColor=0,179,220 priority=400') for (sample_index,), xpartition in xstream(sys.stdin, 1): counter.add('partitions') real_sample = True try: sample_label = manifest_object.index_to_label[sample_index] except KeyError: # It's a nonref track, a mean, or a median
type=str, required=False, default='split.manifest', help='Output manifest filename') # Add scratch command-line parameter tempdel.add_args(parser) args = parser.parse_args(sys.argv[1:]) start_time = time.time() input_line_count, output_line_count = 0, 0 counter = Counter('assign_splits') register_cleanup(counter.flush) output_url = Url(args.out) if args.out is not None else Url(os.getcwd()) if output_url.is_local: # Set up destination directory try: os.makedirs(output_url.to_url()) except: pass output_path = os.path.join(args.out, args.filename) else: mover = filemover.FileMover(args=args) print >> sys.stderr, 'Instantiated FileMover.' # Set up temporary destination import tempfile from dooplicity.tools import make_temp_dir temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
formed; reference_index.rname_lengths[RNAME] is the length of RNAME.''' reference_index = bowtie_index.BowtieIndexReference( os.path.expandvars(args.bowtie_idx)) # For mapping sample indices back to original sample labels manifest_object = manifest.LabelsAndIndices(os.path.expandvars(args.manifest)) # Create file with chromosome sizes for bedTobigwig sizes_filename = os.path.join(temp_dir_path, 'chrom.sizes') if args.verbose: print >> sys.stderr, 'Sizes file: %s .' % sizes_filename with open(sizes_filename, 'w') as sizes_stream: for rname in reference_index.rname_lengths: print >> sizes_stream, '%s %d' % (rname, reference_index.rname_lengths[rname]) input_line_count, output_line_count = 0, 0 output_url = Url(args.out) if output_url.is_local: # Set up destination directory try: os.makedirs(output_url.to_url()) except: pass mover = filemover.FileMover(args=args) track_line = ('track type=bedGraph name="{name}" ' 'description="{description}" visibility=full ' 'color=227,29,118 altColor=0,179,220 priority=400') for (sample_index, ), xpartition in xstream(sys.stdin, 1): try: sample_label = manifest_object.index_to_label[sample_index] except KeyError: # It's a mean or median
filemover.add_args(parser) tempdel.add_args(parser) args = parser.parse_args() import time start_time = time.time() input_line_count = 0 counter = Counter('junction_collect') register_cleanup(counter.flush) if args.out is not None: '''If --out is a local file, just write directly to that file. Otherwise, write to a temporary file that will later be uploaded to the destination.''' output_url = Url(args.out) if output_url.is_local: try: os.makedirs(output_url.to_url()) except: pass output_filename = os.path.join(args.out, args.junction_filename) else: temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) output_filename = args.junction_filename + '.temp' output_filename = os.path.join(temp_dir_path, output_filename) with xopen(True, output_filename, 'w', args.gzip_level) as output_stream: for line in sys.stdin: counter.add('inputs') tokens = line.strip().split('\t') # Remove leading zeros from ints
else: # Grab stats _and_ output SAM/BAMs if not args.output_sam: # Only need subprocess to start samtools if outputting bam import subprocess # Get RNAMEs in order of descending length sorted_rnames = [reference_index.string_to_rname['%012d' % i] for i in xrange( len(reference_index.string_to_rname) - 1 )] (output_path, output_filename, output_stream, output_url, last_rname, last_sample_label) = [None]*6 total_count, unique_count = 0, 0 if args.out is not None: output_url = Url(args.out) if output_url.is_local: # Set up destination directory try: os.makedirs(output_url.to_url()) except: pass else: mover = filemover.FileMover(args=args) # Set up temporary destination import tempfile temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch)) register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path]) move_temporary_file = False # True when temporary file should be uploaded while True: line = sys.stdin.readline() if not line:
for input_line_count, line in enumerate(sys.stdin): # Kill offset from start of manifest file tokens = line.strip().split('\t')[1:] try: stripped = tokens[0].strip() if stripped[0] == '#' or not line.strip(): continue except IndexError: continue token_count = len(tokens) assert token_count in [ 3, 5 ], ('Line {} of input has {} fields, but 3 or 5 are expected.').format( input_line_count + 1, token_count) file_to_count = tokens[0] if (not ((token_count == 3 and Url(tokens[0]).is_local) or (token_count == 5 and Url(tokens[0]).is_local and Url(tokens[2]).is_local))): sys.stdout.write(line) output_line_count += 1 continue with xopen(None, file_to_count) as input_stream: first_char = input_stream.readline()[0] if first_char in fastq_cues: # 4 lines per record line_divider = 4 elif first_char in fasta_cues: line_divider = 2 else: raise RuntimeError( 'File "{}" is neither a FASTA nor a FASTQ file.'.format(