counter.add('bowtie_build_processes')
bowtie_build_process = subprocess.Popen(
        [args.bowtie2_build_exe, fasta_file, index_basename],
        stderr=sys.stderr, stdout=sys.stderr)
bowtie_build_process.wait()
if bowtie_build_process.returncode:
    raise RuntimeError(
            'Bowtie index construction failed w/ exitlevel %d.'
            % bowtie_build_process.returncode)
# Compress index files
print >>sys.stderr, 'Compressing isofrag index...'
junction_index_filename = args.basename + '.tar.gz'
junction_index_path = os.path.join(temp_dir_path, junction_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(junction_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
counter.add('junction_index_archive_bytes',
            os.path.getsize(junction_index_path))
# Upload compressed index
print >>sys.stderr, 'Uploading or copying compressed index...'
counter.add('files_moved')
mover = filemover.FileMover(args=args)
mover.put(junction_index_path, output_url.plus(junction_index_filename))
print >>sys.stderr, 'DONE with junction_index.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
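# Note: tarfile.TarFile.gzopen(), used above, is not part of tarfile's
# documented interface. The same gzipped archive can be produced with the
# public tarfile.open() entry point. A minimal sketch; archive_index() is an
# illustrative helper name, and only the flat archive layout and gzip level
# of 3 are taken from the code above.

import os
import tarfile

def archive_index(index_dir, archive_path, gzip_level=3):
    """Packs every file in index_dir into a gzipped tar at archive_path.

    Equivalent in effect to the TarFile.gzopen() loop above, but via the
    documented tarfile.open() interface.
    """
    with tarfile.open(archive_path, mode='w:gz',
                      compresslevel=gzip_level) as tar:
        for index_file in os.listdir(index_dir):
            # arcname drops the directory prefix so the archive unpacks
            # flat, matching the arcname=index_file calls above
            tar.add(os.path.join(index_dir, index_file),
                    arcname=index_file)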
def go(nucleotides_per_input=8000000, gzip_output=True, gzip_level=3, to_stdout=False, push='.', mover=filemover.FileMover(), verbose=False, scratch=None, bin_qualities=True, short_qnames=False, skip_bad_records=False, workspace_dir=None, fastq_dump_exe='fastq-dump', ignore_missing_sra_samples=False): """ Runs Rail-RNA-preprocess Input (read from stdin) ---------------------------- Tab-separated fields: ---If URL is local: 1. #!splitload 2. \x1d-separated list of 0-based indexes of reads at which to start each new file 3. \x1d-separated list of numbers of reads to include in gzipped files 4. \x1d-separated list of manifest lines whose tabs are replaced by \x1es ---Otherwise: manifest line A manifest line has the following format (for single-end reads) <URL>(tab)<Optional MD5>(tab)<Sample label> (for paired-end reads) <URL 1>(tab)<Optional MD5 1>(tab)<URL 2>(tab)<Optional MD5 2>(tab) <Sample label> Hadoop output (written to stdout) ---------------------------- None. Other output (written to directory specified by command-line parameter --push) ____________________________ Files containing input data in one of the following formats: Format 1 (single-end, 3-column): 1. Nucleotide sequence or its reversed complement, whichever is first in alphabetical order 2. 1 if sequence was reverse-complemented else 0 3. Name 4. Quality sequence or its reverse, whichever corresponds to field 1 Format 2 (paired, 2 lines, 3 columns each) (so this is the same as single-end) 1. Nucleotide sequence for mate 1 or its reversed complement, whichever is first in alphabetical order 2. 1 if sequence was reverse-complemented else 0 3. Name for mate 1 4. Quality sequence for mate 1 or its reverse, whichever corresponds to field 1 (new line) 1. Nucleotide sequence for mate 2 or its reversed complement, whichever is first in alphabetical order 2. 1 if sequence was reverse complemented else 0 3. Name for mate 2 4. Quality sequence for mate 2 or its reverse, whichever corresponds to field 1 Quality sequences are strings of Is for FASTA input. nucleotides_per_input: maximum number of nucleotides to put in a given input file gzip_output: True iff preprocessed input should be gzipped gzip_level: level of gzip compression to use push: where to send output verbose: True iff extra debugging statements should be printed to stderr scratch: scratch directory for storing temporary files or None if securely created temporary directory bin_qualities: True iff quality string should be binned according to rules in _mismatch_penalties_to_quality_scores and round_quality_string() defined in go() short_qnames: True iff original qname should be killed and a new qname should be written in a short base64-encoded format skip_bad_records: True iff bad records should be skipped; otherwise, raises exception if bad record is encountered workspace_dir: where to use fastq-dump -- needed for working with dbGaP data. None if temporary dir should be used. fastq_dump_exe: path to fastq-dump executable ignore_missing_sra_samples: does not return error if fastq-dump doesn't find a sample No return value """ if bin_qualities: import math def round_quality_string(qual): """ Bins phred+33 quality string to improve compression. Uses 5-bin scheme that does not affect Bowtie 2 alignments qual: quality string Return value: "binned" quality string. 
""" return ''.join( [str(int( _MN + math.floor((_MX - _MN) * min( ord(qual_char) - 33.0, 40.0 ) / 40.0) )) for qual_char in qual]).translate( _mismatch_penalties_to_quality_scores ) else: def round_quality_string(qual): """ Leaves quality string unbinned and untouched. qual: quality string Return value: qual """ return qual global _input_line_count, _output_line_count skip_stubs = False temp_dir = make_temp_dir(scratch) print >>sys.stderr, 'Created local destination directory "%s".' % temp_dir register_cleanup(tempdel.remove_temporary_directories, [temp_dir]) input_line_count, output_line_count = 0, 0 if not to_stdout: push_url = Url(push) if push_url.is_local: destination = push elif push_url.is_s3 or push_url.is_hdfs or push_url.is_nfs: destination = temp_dir else: raise RuntimeError('Push destination must be ' 'on S3, HDFS, NFS, or local.') fastq_cues = set(['@']) fasta_cues = set(['>', ';']) source_dict = {} onward = False for line in sys.stdin: _input_line_count += 1 if not line.strip(): continue # Kill offset from start of manifest file try: tokens = line.strip().split('\t')[1:] if tokens[0][0] == '#' and tokens[0] != '#!splitload': # Comment line continue except IndexError: # Be robust to bad lines continue token_count = len(tokens) qual_getter = None if tokens[0] == '#!splitload': '''Line specifies precisely how records from files should be placed.''' assert not to_stdout, ('Split manifest line inconsistent with ' 'writing to stdout.') qual_getter = phred_converter(phred_format=tokens[-1]) indexes = tokens[1].split('\x1d') read_counts = tokens[2].split('\x1d') manifest_lines = [token.split('\x1e') for token in tokens[3].split('\x1d')] assert len(indexes) == len(read_counts) == len(manifest_lines) for i, manifest_line in enumerate(manifest_lines): manifest_line_field_count = len(manifest_line) if manifest_line_field_count == 3: source_dict[(Url(manifest_line[0]),)] = ( manifest_line[-1], int(indexes[i]), int(read_counts[i]) ) else: assert manifest_line_field_count == 5 source_dict[(Url(manifest_line[0]), Url(manifest_line[2]))] = ( manifest_line[-1], int(indexes[i]), int(read_counts[i]) ) elif token_count == 3: # SRA or single-end reads source_dict[(Url(tokens[0]),)] = (tokens[-1],) elif token_count == 5: # Paired-end reads source_dict[(Url(tokens[0]), Url(tokens[2]))] = (tokens[-1],) else: # Not a valid line, but continue for robustness continue file_number = 0 for source_urls in source_dict: sample_label = source_dict[source_urls][0] downloaded = set() sources = [] records_printed = 0 if len(source_dict[source_urls]) == 3: skip_count = source_dict[source_urls][1] if len(source_urls) == 2: records_to_consume = source_dict[source_urls][2] if skip_count % 2: skip_count -= 1 records_to_consume += 1 if records_to_consume % 2: records_to_consume -= 1 # Index reads according to order in input to shorten read names read_index = skip_count / 2 # Index reads in pairs else: records_to_consume = source_dict[source_urls][2] read_index = skip_count else: skip_count = 0 records_to_consume = None # Consume all records read_index = 0 assert (records_to_consume >= 0 or records_to_consume is None), ( 'Negative value %d of records to consume encountered.' ) % records_to_consume if records_to_consume == 0: continue skipped = False for source_url in source_urls: if not source_url.is_local: # Download print >>sys.stderr, 'Retrieving URL "%s"...' 
\ % source_url.to_url() if source_url.is_dbgap: download_dir = workspace_dir elif source_url.is_sra: download_dir = temp_dir if source_url.is_sra: sra_accession = source_url.to_url() fastq_dump_command = ( 'set -exo pipefail; cd {download_dir}; ' '{fastq_dump_exe} -I -X 10000 --split-files ' '{sra_accession}' ).format(download_dir=download_dir, fastq_dump_exe=fastq_dump_exe, sra_accession=sra_accession) try: subprocess.check_call( fastq_dump_command, shell=True, executable='/bin/bash', stdout=sys.stderr ) except subprocess.CalledProcessError as e: if e.returncode == 3 and ignore_missing_sra_samples: onward = True break else: raise RuntimeError( ('Error "%s" encountered executing ' 'command "%s".') % (e.output, fastq_dump_command)) import glob sra_fastq_files = sorted( glob.glob(os.path.join(download_dir, '%s[_.]*' % sra_accession)) ) # ensure 1 before 2 if paired-end # Schedule for deletion def silent_remove(filename): try: os.remove(filename) except OSError as e: pass for sra_fastq_file in sra_fastq_files: register_cleanup(silent_remove, sra_fastq_file) sra_file_count = len(sra_fastq_files) check_for_paired = False if sra_file_count == 1: sra_paired_end = False print >>sys.stderr, 'Detected single-end SRA sample.' elif sra_file_count in [2, 3]: print >>sys.stderr, ('2 or 3 FASTQ files detected. ' 'Checking for barcodes...') check_for_paired = True else: raise RuntimeError( ('Unexpected number of files "%d" output ' 'by fastq-dump command "%s".') % (sra_file_count, fastq_dump_command) ) if check_for_paired: # Get max/min read lengths from FASTQ with open( sra_fastq_files[sra_file_count - 2] ) as fastq_stream: max_len, min_len = ( max_min_read_lengths_from_fastq_stream( fastq_stream ) ) print >>sys.stderr, ( 'Max/min read length found in candidate ' 'barcode FASTQ was {}/{}.' ).format(max_len, min_len) if max_len <= _max_stubby_read_length: print >>sys.stderr, ( 'Assumed barcode FASTQ.' ) skip_stubs = True if sra_file_count == 2: sra_paired_end = False else: sra_paired_end = True else: if sra_file_count == 2: sra_paired_end = True else: raise RuntimeError( '3 FASTQs detected, but one of them ' 'was not recognized as containing ' 'barcodes.' 
) # Guess quality from first 10k lines with xopen(None, sra_fastq_files[0]) as source_stream: qual_getter = phred_converter( fastq_stream=source_stream ) for sra_fastq_file in sra_fastq_files: os.remove(sra_fastq_file) sources.append(os.devnull) fastq_dump_command = ( 'set -exo pipefail; cd {download_dir}; ' '{fastq_dump_exe} --split-spot -I --stdout ' '{sra_accession}' ).format(download_dir=download_dir, fastq_dump_exe=fastq_dump_exe, sra_accession=sra_accession) if skip_stubs: fastq_dump_command += ( ' | awk \'BEGIN {{OFS = "\\n"}} ' '{{header = $0; ' 'getline seq; getline qheader; getline qseq; ' 'if (length(seq) > {min_len}) {{print header, ' 'seq, qheader, qseq}}}}\'' ).format(min_len=_max_stubby_read_length) print >>sys.stderr, fastq_dump_command sra_process = subprocess.Popen(fastq_dump_command, shell=True, executable='/bin/bash', stdout=subprocess.PIPE, bufsize=-1) else: mover.get(source_url, temp_dir) downloaded = list( set(os.listdir(temp_dir)).difference(downloaded) ) sources.append(os.path.join(temp_dir, list(downloaded)[0])) else: sources.append(source_url.to_url()) if onward: continue '''Use os.devnull so single- and paired-end data can be handled in one loop.''' if len(sources) == 1: sources.append(os.devnull) if qual_getter is None: # Figure out Phred format with xopen(None, sources[0]) as source_stream: qual_getter = phred_converter(fastq_stream=source_stream) with xopen(None, sources[0]) as source_stream_1, xopen( None, sources[1] ) as source_stream_2: source_streams = [source_stream_1, source_stream_2] reorganize = all([source == os.devnull for source in sources]) if reorganize: # SRA data is live if sra_paired_end: source_streams = [sra_process.stdout, sra_process.stdout] else: source_streams = [sra_process.stdout, open(os.devnull)] break_outer_loop = False while True: if not to_stdout: '''Name files using Hadoop task environment property mapred.task.partition.''' if gzip_output: try: output_file = os.path.join( destination, '.'.join([ os.environ['mapred_task_partition'], str(file_number), 'gz' ]) ) except KeyError: '''Hadoop 2.x: mapreduce.task.partition; see http://hadoop.apache.org/docs/r2.0.3-alpha/ hadoop-project-dist/hadoop-common/ DeprecatedProperties.html.''' output_file = os.path.join( destination, '.'.join([ os.environ['mapreduce_task_partition'], str(file_number), 'gz' ]) ) open_args = [output_file, 'a', gzip_level] else: try: output_file = os.path.join( destination, '.'.join([ os.environ['mapred_task_partition'], str(file_number) ]) ) except KeyError: output_file = os.path.join( destination, '.'.join([ os.environ['mapreduce_task_partition'], str(k), str(file_number) ]) ) open_args = [output_file, 'a'] try: os.makedirs(os.path.dirname(output_file)) except OSError: pass else: open_args = [] '''Use xopen to handle compressed streams and normal streams generally.''' with xopen(gzip_output if not to_stdout else '-', *open_args) \ as output_stream: perform_push = False line_numbers = [0, 0] read_next_line = True nucs_read = 0 pairs_read = 0 while True: if read_next_line: # Read next line only if FASTA mode didn't already lines = [] for source_stream in source_streams: lines.append(source_stream.readline()) read_next_line = True if not lines[0]: break_outer_loop = True break line_numbers = [i + 1 for i in line_numbers] lines = [line.strip() for line in lines] bad_record_skip = False if lines[0][0] in fastq_cues: if records_to_consume and not skipped: '''Skip lines as necessary; for paired-end reads skip the largest even number of records less than 
records_to_consume.''' if len(source_urls) == 1: # single-end line_skip_count = max( skip_count * 4 - 1, 0 ) else: # paired-end line_skip_count = max( ((skip_count / 2) * 4 - 1), 0 ) for _ in xrange(line_skip_count): next(source_stream_2) for _ in xrange(line_skip_count): next(source_stream_1) if skip_count: lines = [] for source_stream in source_streams: lines.append(source_stream.readline()) if not lines[0]: break_outer_loop = True break lines = [line.strip() for line in lines] skipped = True seqs = [source_stream.readline().strip() for source_stream in source_streams] line_numbers = [i + 1 for i in line_numbers] plus_lines = [source_stream.readline().strip() for source_stream in source_streams] line_numbers = [i + 1 for i in line_numbers] quals = [source_stream.readline().strip() for source_stream in source_streams] if reorganize and sra_paired_end: # Fix order! lines, seqs, plus_lines, quals = ( [lines[0], plus_lines[0]], [lines[1], plus_lines[1]], [seqs[0], quals[0]], [seqs[1], quals[1]] ) try: assert plus_lines[0][0] == '+', ( 'Malformed read "%s" at line %d of ' 'file "%s".' ) % (lines[0], line_numbers[0], sources[0]) if plus_lines[1]: assert plus_lines[1][0] == '+', ( 'Malformed read "%s" at line %d ' 'of file "%s".' ) % ( lines[1], line_numbers[1], sources[1] ) try: # Kill spaces in name original_qnames = \ [line[1:].replace(' ', '_') for line in lines] except IndexError: raise RuntimeError( 'Error finding QNAME at ' 'line %d of either %s or %s' % ( sources[0], sources[1] ) ) except (AssertionError, IndexError, RuntimeError) as e: if skip_bad_records: print >>sys.stderr, ('Error "%s" ' 'encountered; skipping bad record.' ) % e.message for source_stream in source_streams: source_stream.readline() line_numbers = [ i + 1 for i in line_numbers ] bad_record_skip = True else: raise else: try: quals = [ qual_getter(qual) for qual in quals ] except Exception as e: if skip_bad_records: print >>sys.stderr, ( 'Error "%s" encountered ' 'trying to convert quality ' 'string to Sanger format; ' 'skipping bad record.' ) % e.message bad_record_skip = True else: raise line_numbers = [i + 1 for i in line_numbers] try: for i in xrange(2): assert len(seqs[i]) == len(quals[i]), ( 'Length of read sequence does not ' 'match length of quality string ' 'at line %d of file "%s".' ) % (line_numbers[i], sources[i]) except (AssertionError, IndexError) as e: if skip_bad_records: print >>sys.stderr, ( 'Error "%s" encountered; ' 'skipping bad record.' ) % e.message bad_record_skip = True else: raise elif lines[0][0] in fasta_cues: seqs = [[], []] next_lines = [] for p, source_stream in enumerate(source_streams): while True: next_line \ = source_stream.readline().strip() try: if next_line[0] in fasta_cues: break else: try: seqs[p].append(next_line) except IndexError: raise except IndexError: break next_lines.append(next_line) seqs = [''.join(seq) for seq in seqs] line_numbers = [i + 1 for i in line_numbers] try: try: # Kill spaces in name original_qnames = \ [line[1:].replace(' ', '_') for line in lines] except IndexError: raise RuntimeError( 'Error finding QNAME at ' 'line %d of either %s or %s' % ( sources[0], sources[1] ) ) except (AssertionError, IndexError, RuntimeError) as e: if skip_bad_records: print >>sys.stderr, ('Error "%s" ' 'encountered; skipping bad record.' 
) % e.message for source_stream in source_streams: source_stream.readline() line_numbers = [ i + 1 for i in line_numbers ] bad_record_skip = True else: raise else: try: quals = [ 'h'*len(seq) for seq in seqs ] except Exception as e: if skip_bad_records: print >>sys.stderr, ( 'Error "%s" encountered ' 'trying to convert quality ' 'string to Sanger format; ' 'skipping bad record.' ) % e.message bad_record_skip = True else: raise line_numbers = [i + 1 for i in line_numbers] lines = next_lines read_next_line = False if bad_record_skip: seqs = [] # Fake record-printing to get to records_to_consume if source_streams[-1].name == os.devnull: records_printed += 1 else: records_printed += 2 elif len(original_qnames) == 2 and original_qnames[1]: # Paired-end write if original_qnames[0] == original_qnames[1]: # Add paired-end identifiers original_qnames[0] += '/1' original_qnames[1] += '/2' assert seqs[1] assert quals[1] seqs = [seq.upper() for seq in seqs] reversed_complement_seqs = [ seqs[0][::-1].translate( _reversed_complement_translation_table ), seqs[1][::-1].translate( _reversed_complement_translation_table ) ] if seqs[0] < reversed_complement_seqs[0]: left_seq = seqs[0] left_qual = quals[0] left_reversed = '0' else: left_seq = reversed_complement_seqs[0] left_qual = quals[0][::-1] left_reversed = '1' if seqs[1] < reversed_complement_seqs[1]: right_seq = seqs[1] right_qual = quals[1] right_reversed = '0' else: right_seq = reversed_complement_seqs[1] right_qual = quals[1][::-1] right_reversed = '1' if short_qnames: left_qname_to_write = encode(read_index) + '/1' right_qname_to_write = encode( read_index ) + '/2' else: left_qname_to_write = original_qnames[0] right_qname_to_write = original_qnames[1] print >>output_stream, '\t'.join( [ left_seq, left_reversed, qname_from_read( left_qname_to_write, seqs[0] + quals[0], sample_label, mate=seqs[1] ), '\n'.join([ round_quality_string( left_qual ), right_seq ]), right_reversed, qname_from_read( right_qname_to_write, seqs[1] + quals[1], sample_label, mate=seqs[0] ), round_quality_string(right_qual) ] ) records_printed += 2 _output_line_count += 1 else: seqs[0] = seqs[0].upper() reversed_complement_seqs = [ seqs[0][::-1].translate( _reversed_complement_translation_table ) ] # Single-end write if seqs[0] < reversed_complement_seqs[0]: seq = seqs[0] qual = quals[0] is_reversed = '0' else: seq = reversed_complement_seqs[0] qual = quals[0][::-1] is_reversed = '1' if short_qnames: qname_to_write = encode(read_index) else: qname_to_write = original_qnames[0] print >>output_stream, '\t'.join( [ seq, is_reversed, qname_from_read( qname_to_write, seqs[0] + quals[0], sample_label ), round_quality_string(qual) ] ) records_printed += 1 _output_line_count += 1 read_index += 1 for seq in seqs: nucs_read += len(seq) if records_printed == records_to_consume: break_outer_loop = True perform_push = True break if not to_stdout and not records_to_consume and \ nucs_read > nucleotides_per_input: file_number += 1 break if verbose: print >>sys.stderr, ( 'Exited with statement; line numbers are %s' % line_numbers ) if (not to_stdout) and (push_url.is_nfs or push_url.is_s3 or push_url.is_hdfs) \ and ((not records_to_consume) or (records_to_consume and perform_push)): print >>sys.stderr, 'Pushing "%s" to "%s" ...' 
% ( output_file, push_url.to_url() ) print >>sys.stderr, 'reporter:status:alive' mover.put(output_file, push_url.plus(os.path.basename( output_file ))) try: os.remove(output_file) except OSError: pass if break_outer_loop: break if verbose: print >>sys.stderr, 'Exiting source streams...' if verbose: print >>sys.stderr, 'Exited source streams.' # Clear temporary directory for input_file in os.listdir(temp_dir): try: os.remove(os.path.join(temp_dir, input_file)) except OSError: pass if 'sra_process' in locals(): sra_process.stdout.close() sra_return_code = sra_process.wait() if sra_return_code > 0: raise RuntimeError(('fastq-dump terminated with exit ' 'code %d. Command run was "%s".') % (sra_return_code, fastq_dump_command)) del sra_process
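# Each record written by go() stores the read in a strand-canonical form:
# the sequence is compared with its reverse complement, the
# lexicographically smaller of the two is written, and a 0/1 field records
# whether reversal happened (the quality string is reversed to match).
# A minimal sketch of that logic in isolation; canonicalize() and _RC_TABLE
# are illustrative names, not taken from the codebase.

import string

_RC_TABLE = string.maketrans('ACGTacgtNn', 'TGCAtgcaNn')

def canonicalize(seq, qual):
    """Returns (canonical sequence, matching quality string, reversed flag).

    The canonical sequence is whichever of seq and its reverse complement
    sorts first; qual is reversed iff seq was.
    """
    seq = seq.upper()
    rc_seq = seq[::-1].translate(_RC_TABLE)
    if seq < rc_seq:
        return seq, qual, '0'
    return rc_seq, qual[::-1], '1'

# Example: canonicalize('TTACG', 'IIIHH') -> ('CGTAA', 'HHIII', '1')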
                % bowtie_build_thread.bowtie_build_process)
else:
    bowtie_build_process = subprocess.Popen(
            [args.bowtie2_build_exe, fasta_file, index_basename],
            stderr=sys.stderr, stdout=sys.stderr
        )
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError(
                'Bowtie index construction failed w/ exitlevel %d.'
                % bowtie_build_process.returncode)
# Compress index files
print >>sys.stderr, 'Compressing intron index...'
intron_index_filename = args.basename + '.tar.gz'
intron_index_path = os.path.join(temp_dir_path, intron_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(intron_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
# Upload compressed index
print >>sys.stderr, 'Uploading or copying compressed index...'
mover = filemover.FileMover(args=args)
mover.put(intron_index_path, output_url.plus(intron_index_filename))
print >>sys.stderr, 'DONE with intron_index.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print >>output_stream, '\t'.join(
                    [tokens[0], str(int(tokens[1])), str(int(tokens[2]) - 1),
                        tokens[3], tokens[4]]
                )
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        tokens = line.strip().split('\t')
        # Remove leading zeros from ints
        print '\t'.join([tokens[0], str(int(tokens[1])),
                            str(int(tokens[2]) - 1), tokens[3], tokens[4]])
        input_line_count += 1

if args.out is not None and not output_url.is_local:
    mover = filemover.FileMover(args=args)
    mover.put(output_filename, output_url.plus(args.junction_filename))

print >>sys.stderr, 'DONE with junction_collect.py; in = %d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
                                    sample_index, span))
        sample_index += span
        reads_assigned += span
        if not (reads_assigned % reads_per_file):
            lines_assigned.append([])
with open(output_path, 'w') as output_stream:
    for line_tuples in lines_assigned:
        if not line_tuples: continue
        print >>output_stream, '\t'.join(('#!splitload',
                '\x1d'.join(
                    str(line_tuple[-2]) for line_tuple in line_tuples
                ),
                '\x1d'.join(
                    str(line_tuple[-1]) for line_tuple in line_tuples
                ),
                '\x1d'.join(
                    '\x1e'.join(samples[line_tuple[0]][1:])
                    for line_tuple in line_tuples
                ),
                phred_format
            ))
    for line in saved:
        print >>output_stream, line.strip()
if not output_url.is_local:
    mover.put(output_path, output_url.plus(args.filename))
    os.remove(output_path)
sys.stdout.flush()
print >>sys.stderr, 'DONE with assign_splits.py; in/out=%d/%d; ' \
    'time=%0.3f s' % (input_line_count, output_line_count,
                        time.time() - start_time)
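# The '#!splitload' line written above packs three \x1d-separated lists
# (start indexes, read counts, and manifest lines whose tabs are re-encoded
# as \x1e) plus the Phred format into one tab-separated record; the
# preprocess step's go() parses it back with the inverse operations.
# A minimal decoder sketch; parse_splitload() is an illustrative name.

def parse_splitload(line):
    """Inverts the '#!splitload' encoding written above.

    Returns (phred_format, list of (start_index, read_count,
    manifest_fields)), where manifest_fields is the \x1e-split manifest
    line with its original tab-separated fields restored as a list.
    """
    tokens = line.rstrip('\n').split('\t')
    assert tokens[0] == '#!splitload'
    indexes = tokens[1].split('\x1d')
    read_counts = tokens[2].split('\x1d')
    manifest_lines = [token.split('\x1e')
                        for token in tokens[3].split('\x1d')]
    phred_format = tokens[-1]
    return phred_format, [(int(index), int(count), fields)
                            for index, count, fields
                            in zip(indexes, read_counts, manifest_lines)]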
                        ], start_position, end_position, i+1,
                        maximin_overhang, coverage, reverse_strand_string,
                        start_position, end_position,
                        max_left_overhang, max_right_overhang,
                        max_left_overhang + end_pos - pos
                    )
        input_line_count += i
    else:
        counter.add('insertion_line' if line_type == 'I'
                        else 'deletion_line')
        for i, (rname, pos, end_pos, seq, _, _, _, coverage) \
                in enumerate(xpartition):
            pos, end_pos = int(pos) - 1, int(end_pos) - 1
            print >>output_stream, '%s\t%d\t%d\t%s\t%s' \
                % (reference_index.string_to_rname[
                        rname
                    ], pos, end_pos, seq, coverage)
        input_line_count += i
counter.flush()
if not output_url.is_local:
    counter.add('files_uploaded')
    mover.put(output_path, output_url.plus(output_filename))
    os.remove(output_path)
print >>sys.stderr, 'DONE with bed.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
                    '%d,%d\t0,%d' % (
                        reference_index.string_to_rname[
                            rname
                        ], start_position, end_position, i+1,
                        maximin_overhang, coverage, reverse_strand_string,
                        start_position, end_position,
                        max_left_overhang, max_right_overhang,
                        max_left_overhang + end_pos - pos
                    )
        input_line_count += i
    else:
        for i, (rname, pos, end_pos, seq, _, _, _, coverage) \
                in enumerate(xpartition):
            pos, end_pos = int(pos) - 1, int(end_pos) - 1
            print >>output_stream, '%s\t%d\t%d\t%s\t%s' \
                % (reference_index.string_to_rname[
                        rname
                    ], pos, end_pos, seq, coverage)
        input_line_count += i
if not output_url.is_local:
    mover.put(output_path, output_url.plus(output_filename))
    os.remove(output_path)
print >>sys.stderr, 'DONE with bed.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
        if not span: continue
        lines_assigned[-1].append(
            (current_sample, samples[current_sample][-1],
                sample_index, span))
        sample_index += span
        reads_assigned += span
        if not (reads_assigned % reads_per_file):
            lines_assigned.append([])
print >>sys.stderr, 'Finished crit block'
with open(output_path, 'w') as output_stream:
    for line_tuples in lines_assigned:
        if not line_tuples: continue
        print >>output_stream, '\t'.join(
            ('#!splitload',
                '\x1d'.join(str(line_tuple[-2])
                            for line_tuple in line_tuples),
                '\x1d'.join(str(line_tuple[-1])
                            for line_tuple in line_tuples),
                '\x1d'.join('\x1e'.join(samples[line_tuple[0]][1:])
                            for line_tuple in line_tuples),
                phred_format))
    for line in saved:
        print >>output_stream, line.strip()
if not output_url.is_local:
    print >>sys.stderr, 'Uploading {} to {}....'.format(output_path, args.out)
    mover.put(output_path, output_url.plus(args.filename))
    os.remove(output_path)
sys.stdout.flush()
print >>sys.stderr, 'DONE with assign_splits.py; in/out=%d/%d; ' \
    'time=%0.3f s' % (input_line_count, output_line_count,
                        time.time() - start_time)
        args.bigwig_exe, bed_filename, sizes_filename, bigwig_file_paths[0]
    ], [
        args.bigwig_exe, unique_bed_filename, sizes_filename,
        bigwig_file_paths[1]
    ]]
for i, bigwig_command in enumerate(bigwig_commands):
    if args.verbose:
        print >>sys.stderr, 'Writing bigwig with command %s .' \
            % ' '.join(bigwig_command)
    bedtobigwig_process = subprocess.Popen(bigwig_command,
                                            stderr=sys.stderr,
                                            stdout=sys.stderr,
                                            bufsize=-1)
    bedtobigwig_process.wait()
    if bedtobigwig_process.returncode:
        raise RuntimeError('bedgraphtobigwig process failed w/ '
                           'exitlevel %d.'
                            % bedtobigwig_process.returncode)
    if args.verbose:
        print >>sys.stderr, ('bedTobigwig command %s succeeded .'
                                % ' '.join(bigwig_command))
    if not output_url.is_local:
        # bigwig must be uploaded to URL and deleted
        mover.put(bigwig_file_paths[i],
                    output_url.plus(bigwig_filenames[i]))
        os.remove(bigwig_file_paths[i])
print >>sys.stderr, 'DONE with coverage.py; in/out=%d/%d; time=%0.3f s' \
    % (input_line_count, output_line_count, time.time() - start_time)
        bigwig_file_paths[1]]]
for i, bigwig_command in enumerate(bigwig_commands):
    if args.verbose:
        print >>sys.stderr, 'Writing bigwig with command %s .' \
            % ' '.join(bigwig_command)
    counter.add('call_bedgraphtobigwig')
    bedtobigwig_process = subprocess.Popen(
            bigwig_command, stderr=sys.stderr, stdout=sys.stderr, bufsize=-1
        )
    bedtobigwig_process.wait()
    if bedtobigwig_process.returncode:
        raise RuntimeError('bedgraphtobigwig process failed w/ '
                           'exitlevel %d.'
                            % bedtobigwig_process.returncode)
    if args.verbose:
        print >>sys.stderr, ('bedTobigwig command %s succeeded .'
                                % ' '.join(bigwig_command))
    if not output_url.is_local:
        # bigwig must be uploaded to URL and deleted
        counter.add('files_moved')
        mover.put(bigwig_file_paths[i],
                    output_url.plus(bigwig_filenames[i]))
        os.remove(bigwig_file_paths[i])
print >>sys.stderr, 'DONE with coverage.py; in/out=%d/%d; time=%0.3f s' \
    % (input_line_count, output_line_count, time.time() - start_time)
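# The Popen()/wait()/returncode pattern above can also be written with
# subprocess.check_call(), which raises CalledProcessError on a nonzero
# exit and shortens the failure path. A sketch under the same assumptions
# (command list already built, output routed to the task's stderr);
# run_bedgraphtobigwig() is an illustrative name.

import subprocess
import sys

def run_bedgraphtobigwig(bigwig_command):
    """Runs one bedGraphToBigWig command list, raising on failure."""
    try:
        subprocess.check_call(bigwig_command, stderr=sys.stderr,
                                stdout=sys.stderr, bufsize=-1)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('bedgraphtobigwig process failed w/ '
                           'exitlevel %d.' % e.returncode)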
                % bowtie_build_thread.bowtie_build_process)
else:
    bowtie_build_process = subprocess.Popen(
            [args.bowtie2_build_exe, fasta_file, index_basename],
            stderr=sys.stderr, stdout=sys.stderr
        )
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError(
                'Bowtie index construction failed w/ exitlevel %d.'
                % bowtie_build_process.returncode)
# Compress index files
print >>sys.stderr, 'Compressing isofrag index...'
junction_index_filename = args.basename + '.tar.gz'
junction_index_path = os.path.join(temp_dir_path, junction_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(junction_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
# Upload compressed index
print >>sys.stderr, 'Uploading or copying compressed index...'
mover = filemover.FileMover(args=args)
mover.put(junction_index_path, output_url.plus(junction_index_filename))
print >>sys.stderr, 'DONE with junction_index.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            counter.add('inputs')
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print >>output_stream, '\t'.join(
                    [tokens[0][:-1], tokens[0][-1], str(int(tokens[1])),
                        str(int(tokens[2]) - 1), tokens[3], tokens[4]]
                )
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        counter.add('inputs')
        tokens = line.strip().split('\t')
        # Remove leading zeros from ints
        print '\t'.join([tokens[0], str(int(tokens[1])),
                            str(int(tokens[2]) - 1), tokens[3], tokens[4]])
        input_line_count += 1

if args.out is not None and not output_url.is_local:
    mover = filemover.FileMover(args=args)
    mover.put(output_filename, output_url.plus(args.junction_filename))

print >>sys.stderr, 'DONE with junction_collect.py; in = %d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
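# The only substantive difference from the earlier junction_collect
# fragment: tokens[0][:-1] and tokens[0][-1] split a trailing strand
# character off the first field before the coordinates are rewritten.
# A tiny illustration, assuming the field carries the reference name with
# the strand appended (the token value below is made up):

token = 'chr1+'                          # hypothetical input field
rname, strand = token[:-1], token[-1]
assert (rname, strand) == ('chr1', '+')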
            last_output_path = output_path
            move_temporary_file = True
        else:
            tokens = line.rstrip().split('\t')
            sample_index, rname_index, pos, qname, flag = tokens[:5]
            if args.output_by_chromosome:
                (sample_index, rname_index) \
                    = sample_and_rname_indexes.sample_and_rname_indexes(
                            sample_index
                        )
            sample_label = manifest_object.index_to_label[sample_index]
            rname = reference_index.string_to_rname[rname_index]
        if move_temporary_file and last_sample_label is not None \
            and not output_url.is_local:
            mover.put(last_output_path,
                        output_url.plus(last_output_filename))
            os.remove(last_output_path)
            if not last_output_path.endswith('.unmapped.bam'):
                mover.put(
                    ''.join([last_output_path, '.bai']),
                    output_url.plus(''.join([last_output_filename, '.bai']))
                )
                os.remove(''.join([last_output_path, '.bai']))
            move_temporary_file = False
        try:
            if (sample_label != last_sample_label or rname != last_rname
                    or not line):
                print 'counts\t-\t%s\t%s\t%d\t%d' % (last_sample_index,
                                                        last_rname_index,
                                                        total_count,
                                                        unique_count)
                total_count, unique_count = 0, 0
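# The '.bai' companion uploaded above must already exist next to the BAM;
# that indexing step is not part of this fragment. With a samtools binary
# available, it would typically be produced as sketched below. index_bam()
# and the samtools_exe default are assumptions, not taken from this code.

import subprocess

def index_bam(bam_path, samtools_exe='samtools'):
    """Creates bam_path + '.bai' so it can be uploaded alongside the BAM.

    Assumes bam_path is coordinate-sorted; 'samtools index' writes the
    .bai file next to the BAM by default.
    """
    subprocess.check_call([samtools_exe, 'index', bam_path])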
    output_url = Url(args.out)
    if output_url.is_local:
        try:
            os.makedirs(output_url.to_url())
        except OSError:
            pass
        output_filename = os.path.join(args.out, args.intron_filename)
    else:
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.intron_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            output_stream.write(line)
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        tokens = line.strip().split('\t')
        # Remove leading zeros from ints
        sys.stdout.write('\t'.join([tokens[0], str(int(tokens[1])),
                                    str(int(tokens[2])), tokens[3],
                                    tokens[4]]) + '\n')
        input_line_count += 1

if args.out is not None and not output_url.is_local:
    mover = filemover.FileMover(args=args)
    mover.put(output_filename, output_url.plus(args.intron_filename))

print >>sys.stderr, 'DONE with intron_collect.py; in = %d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
    raise RuntimeError(
            'Bowtie index construction failed w/ exitlevel %d.'
            % bowtie_build_thread.bowtie_build_process)
else:
    bowtie_build_process = subprocess.Popen(
            [args.bowtie2_build_exe, fasta_file, index_basename],
            stderr=sys.stderr, stdout=sys.stderr)
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError(
                'Bowtie index construction failed w/ exitlevel %d.'
                % bowtie_build_process.returncode)
# Compress index files
print >>sys.stderr, 'Compressing intron index...'
intron_index_filename = args.basename + '.tar.gz'
intron_index_path = os.path.join(temp_dir_path, intron_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(intron_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
# Upload compressed index
print >>sys.stderr, 'Uploading or copying compressed index...'
mover = filemover.FileMover(args=args)
mover.put(intron_index_path, output_url.plus(intron_index_filename))
print >>sys.stderr, 'DONE with intron_index.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
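# On the consuming side, the archive written above only needs to be
# unpacked into a working directory before bowtie2 can use the index.
# A minimal sketch of that inverse step; extract_index() is an
# illustrative name and the destination directory is arbitrary.

import tarfile

def extract_index(archive_path, dest_dir):
    """Unpacks a .tar.gz index archive like the one produced above."""
    with tarfile.open(archive_path, mode='r:gz') as tar:
        tar.extractall(dest_dir)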