Example #1
    counter.add('bowtie_build_processes')
    bowtie_build_process = subprocess.Popen(
        [args.bowtie2_build_exe, fasta_file, index_basename],
        stderr=sys.stderr,
        stdout=sys.stderr)
    bowtie_build_process.wait()
    if bowtie_build_process.returncode:
        raise RuntimeError(
            'Bowtie index construction failed w/ exitlevel %d.' %
            bowtie_build_process.returncode)

# Compress index files
print >> sys.stderr, 'Compressing isofrag index...'
junction_index_filename = args.basename + '.tar.gz'
junction_index_path = os.path.join(temp_dir_path, junction_index_filename)
index_path = os.path.join(temp_dir_path, 'index')
tar = tarfile.TarFile.gzopen(junction_index_path, mode='w', compresslevel=3)
for index_file in os.listdir(index_path):
    tar.add(os.path.join(index_path, index_file), arcname=index_file)
tar.close()
counter.add('junction_index_archive_bytes',
            os.path.getsize(junction_index_path))
# Upload compressed index
print >> sys.stderr, 'Uploading or copying compressed index...'
counter.add('files_moved')
mover = filemover.FileMover(args=args)
mover.put(junction_index_path, output_url.plus(junction_index_filename))

print >>sys.stderr, 'DONE with junction_index.py; in=%d; time=%0.3f s' \
                        % (input_line_count, time.time() - start_time)
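
Below is a reduced sketch of the same pattern, independent of Rail-RNA's helpers (no counter, filemover, or Url objects) and using hypothetical paths: run an external index builder, route both of its streams to stderr as the excerpt does, fail on a nonzero exit code, and pack the resulting files into a gzipped tar with tarfile.TarFile.gzopen.

import os
import subprocess
import sys
import tarfile

def build_and_pack_index(builder_exe, fasta_file, index_dir, archive_path):
    """Hypothetical helper mirroring the excerpt above; paths are examples."""
    index_basename = os.path.join(index_dir, 'index')
    build_process = subprocess.Popen(
        [builder_exe, fasta_file, index_basename],
        stdout=sys.stderr, stderr=sys.stderr)
    build_process.wait()
    if build_process.returncode:
        raise RuntimeError('Index construction failed w/ exitlevel %d.'
                           % build_process.returncode)
    # Add each index file at the archive root by flattening paths via arcname
    tar = tarfile.TarFile.gzopen(archive_path, mode='w', compresslevel=3)
    for index_file in os.listdir(index_dir):
        tar.add(os.path.join(index_dir, index_file), arcname=index_file)
    tar.close()
    return os.path.getsize(archive_path)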
Example #2
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie2_index_base='genome',
       bowtie2_args='',
       verbose=False,
       report_multiplier=1.2,
       stranded=False,
       fudge=5,
       score_min=60,
       gzip_level=3,
       mover=filemover.FileMover(),
       intermediate_dir='.',
       scratch=None):
    """ Runs Rail-RNA-cointron_enum 

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that co-occur on reads via Bowtie 2 local alignments to
        transcriptome elements.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) + 
            '+' or '-' indicating which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing exactly the extra command-line
            arguments to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
                to accommodate potential indels
        score_min: constant minimum alignment score for Bowtie 2 (passed as
            --score-min L,score_min,0)
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case the Bowtie 2 index needs to
            be pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files, or None to
            use a securely created temporary directory

        No return value.
    """
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        if not os.path.exists(os.path.join(index_directory, '_STARTED')):
            # Download index
            with open(os.path.join(index_directory, '_STARTED'), 'w') \
                as started_stream:
                print >> started_stream, 'STARTED'
            for extension in [
                    '.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2', '.rev.1.bt2',
                    '.rev.2.bt2'
            ]:
                mover.get(bowtie2_index_base_url, index_directory)
            with open(os.path.join(index_directory, '_SUCCESS'), 'w') \
                as success_stream:
                print >> success_stream, 'SUCCESS'
        while not os.path.exists(os.path.join(index_directory, '_SUCCESS')):
            time.sleep(0.5)
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    global _input_line_count
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            print >> reads_stream, '\t'.join([seq, seq, 'I' * len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([
        bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min, '-D 24 -R 3 -N 1 -L 20 -i L,4,0'
    ])
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        '_delegate.py --report-multiplier %08f --fudge %d %s %s' %
        (report_multiplier, fudge, '--stranded' if stranded else '',
         '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    bowtie_process = subprocess.Popen(' '.join(
        ['set -exo pipefail;', full_command]),
                                      bufsize=-1,
                                      stdout=sys.stdout,
                                      stderr=sys.stderr,
                                      shell=True,
                                      executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
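
The core of go() is the shell pipeline it assembles: decompress the temporary reads file, stream it through Bowtie 2, and hand the alignments to the delegate script, all under set -exo pipefail so a failure anywhere in the chain surfaces as a nonzero exit code. A minimal sketch of that pattern, with placeholder commands rather than the exact Rail-RNA invocation:

import subprocess
import sys

def run_bash_pipeline(stages):
    # stages: list of shell command strings to join with pipes, e.g.
    # ['gzip -cd reads.temp.gz', 'bowtie2 ...', 'python delegate.py ...']
    # (placeholders, not the exact Rail-RNA command line).
    full_command = ' | '.join(stages)
    process = subprocess.Popen(
        'set -exo pipefail; ' + full_command,
        bufsize=-1,
        stdout=sys.stdout,
        stderr=sys.stderr,
        shell=True,
        executable='/bin/bash')  # pipefail requires bash
    return_code = process.wait()
    if return_code:
        raise RuntimeError('Pipeline failed; exitlevel was %d.' % return_code)

# Example usage: run_bash_pipeline(['gzip -cd reads.temp.gz', 'wc -l'])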