# Build the Bowtie 2 index for the junction FASTA. Both output streams of
# the child are directed to this process's stderr.
counter.add('bowtie_build_processes')
bowtie_build_process = subprocess.Popen(
    [args.bowtie2_build_exe, fasta_file, index_basename],
    stderr=sys.stderr,
    stdout=sys.stderr
)
bowtie_build_process.wait()
if bowtie_build_process.returncode:
    raise RuntimeError(
        'Bowtie index construction failed w/ exitlevel %d.'
        % bowtie_build_process.returncode
    )

# Compress index files: every entry of <temp_dir_path>/index is stored at the
# top level of <basename>.tar.gz (arcname drops the directory prefix).
print >> sys.stderr, 'Compressing isofrag index...'
junction_index_filename = args.basename + '.tar.gz'
junction_index_path = os.path.join(temp_dir_path, junction_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(junction_index_path, mode='w', compresslevel=3)
try:
    for index_file in os.listdir(index_path):
        tar.add(os.path.join(index_path, index_file), arcname=index_file)
finally:
    # Close the archive even if adding a member fails so the file handle is
    # never leaked; the original exception still propagates.
    tar.close()
counter.add('junction_index_archive_bytes',
            os.path.getsize(junction_index_path))

# Upload compressed index
print >> sys.stderr, 'Uploading or copying compressed index...'
counter.add('files_moved')
mover = filemover.FileMover(args=args)
mover.put(junction_index_path, output_url.plus(junction_index_filename))

print >>sys.stderr, 'DONE with junction_index.py; in=%d; time=%0.3f s' \
    % (input_line_count, time.time() - start_time)
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
        bowtie2_index_base='genome', bowtie2_args='', verbose=False,
        report_multiplier=1.2, stranded=False, fudge=5, score_min=60,
        gzip_level=3, mover=filemover.FileMover(), intermediate_dir='.',
        scratch=None):
    """ Runs Rail-RNA-cointron_enum

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) + '+' or '-' indicating which
            strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns. NOTE: this
            argument is currently not used by the function body; the shell
            pipeline started below is attached to sys.stdout.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
            to accommodate potential indels
        score_min: Bowtie2 CONSTANT minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        Raises RuntimeError if the shell pipeline exits with a nonzero status.

        No return value.
    """
    global _input_line_count
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        # The marker files below live inside index_directory, so it must
        # exist before they can be created. Another process may create it
        # at the same moment; only re-raise if it is still missing.
        try:
            os.makedirs(index_directory)
        except OSError:
            if not os.path.isdir(index_directory):
                raise
        started_marker = os.path.join(index_directory, '_STARTED')
        success_marker = os.path.join(index_directory, '_SUCCESS')
        if not os.path.exists(started_marker):
            # Download index
            with open(started_marker, 'w') as started_stream:
                print >> started_stream, 'STARTED'
            for extension in ['.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2',
                              '.rev.1.bt2', '.rev.2.bt2']:
                # Fetch each index file by its own URL; the basename alone
                # does not name any file of the index.
                mover.get(Url(bowtie2_index_base + extension),
                          index_directory)
            with open(success_marker, 'w') as success_stream:
                print >> success_stream, 'SUCCESS'
        # Whoever did not write _STARTED polls here until the download that
        # did start reports success. There is no timeout.
        while not os.path.exists(success_marker):
            time.sleep(0.5)
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            # One record per line for Bowtie 2's --12 input: the sequence
            # doubles as the read name and every base gets quality 'I'.
            print >> reads_stream, '\t'.join([seq, seq, 'I' * len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([
        bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x',
        bowtie2_index_base,
        '--12 -',
        '--score-min L,%d,0' % score_min,
        '-D 24 -R 3 -N 1 -L 20 -i L,4,0'
    ])
    # The delegate script sits next to this file and shares its name with
    # "_delegate" appended. splitext strips whatever extension __file__
    # carries (.py or .pyc) instead of assuming exactly three characters.
    script_base = os.path.splitext(os.path.realpath(__file__))[0]
    delegate_command = ''.join([
        sys.executable, ' ', script_base,
        '_delegate.py --report-multiplier %08f --fudge %d %s %s'
        % (report_multiplier, fudge,
           '--stranded' if stranded else '',
           '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    # NOTE(review): paths and arguments are interpolated into the shell
    # string unquoted; values containing spaces or shell metacharacters
    # will be interpreted by bash.
    # pipefail makes the pipeline's exit status nonzero if any stage fails.
    bowtie_process = subprocess.Popen(
        ' '.join(['set -exo pipefail;', full_command]),
        bufsize=-1, stdout=sys.stdout, stderr=sys.stderr, shell=True,
        executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)