Example #1
0
 def __init__(self, zip_name, curl_exe=None, install_dir=None,
                 no_dependencies=False, prep_dependencies=False,
                 yes=False, me=False, add_symlinks=False):
     """ Prints the installer banner and stores installation options.

         Selects the dependency table from dependency_urls according to
         sys.platform, exits with status 1 on an unsupported OS, and opens
         a log file in a freshly created temporary directory that is handed
         to register_cleanup.

         zip_name: stored as an absolute path in self.zip_name; presumably
             the Rail-RNA archive being installed -- confirm against caller
         curl_exe: stored as is in self.curl_exe
         install_dir: stored as is in self.install_dir
         no_dependencies: stored as is in self.no_dependencies
         prep_dependencies: stored as is in self.prep_dependencies
         yes: stored in self.yes; forced to a true value when me is true
         me: stored in self.me; a true value also turns on self.yes
         add_symlinks: stored as is in self.add_symlinks

         No return value.
     """
     print_to_screen(u"""{0} Rail-RNA v{1} Installer""".format(
                                     u'\u2200', version_number)
                                 )
     # startswith() covers 'linux' (Python 3), 'linux2' (Python 2) and
     # the 'linux3' reported by some older Python 2 builds
     if sys.platform.startswith('linux'):
         if os.path.isfile('/mnt/var/lib/info/instance.json'):
             # Assume an EMR cluster and prepend S3 download URLs
             self.depends = dependency_urls.ec2_dependencies
         else:
             self.depends = dependency_urls.linux_dependencies
     elif sys.platform == 'darwin':
         self.depends = dependency_urls.mac_dependencies
     else:
         print_to_screen(
                 'Rail-RNA cannot be installed because it is not supported '
                 'by your OS. Currently supported OSes are Mac OS X and '
                 'Linux.'
             )
         sys.exit(1)
     self.install_dir = install_dir
     self.no_dependencies = no_dependencies
     self.zip_name = os.path.abspath(zip_name)
     self.curl_exe = curl_exe
     log_dir = tempfile.mkdtemp()
     # Hand log_dir to register_cleanup before opening the log file so a
     # failing open() does not leave the directory unregistered
     register_cleanup(remove_temporary_directories, [log_dir])
     self.log_file = os.path.join(log_dir, 'rail-rna_install.log')
     self.log_stream = open(self.log_file, 'w')
     self.finished = False
     # me implies yes
     self.yes = yes or me
     self.me = me
     self.prep_dependencies = prep_dependencies
     self.add_symlinks = add_symlinks
Example #2
0
 def __init__(self, zip_name, curl_exe=None, install_dir=None,
                 no_dependencies=False, prep_dependencies=False,
                 yes=False, me=False, add_symlinks=False):
     """ Prints the installer banner and stores installation options.

         Selects the dependency table from dependency_urls according to
         sys.platform, exits with status 1 on an unsupported OS, and opens
         a log file in a freshly created temporary directory that is handed
         to register_cleanup.

         zip_name: stored as an absolute path in self.zip_name; presumably
             the Rail-RNA archive being installed -- confirm against caller
         curl_exe: stored as is in self.curl_exe
         install_dir: stored as is in self.install_dir
         no_dependencies: stored as is in self.no_dependencies
         prep_dependencies: stored as is in self.prep_dependencies
         yes: stored in self.yes; forced to a true value when me is true
         me: stored in self.me; a true value also turns on self.yes
         add_symlinks: stored as is in self.add_symlinks

         No return value.
     """
     print_to_screen(u"""{0} Rail-RNA v{1} Installer""".format(
                                     u'\u2200', version_number)
                                 )
     # startswith() covers 'linux' (Python 3), 'linux2' (Python 2) and
     # the 'linux3' reported by some older Python 2 builds
     if sys.platform.startswith('linux'):
         if os.path.isfile('/mnt/var/lib/info/instance.json'):
             # Assume an EMR cluster and prepend S3 download URLs
             self.depends = dependency_urls.ec2_dependencies
         else:
             self.depends = dependency_urls.linux_dependencies
     elif sys.platform == 'darwin':
         self.depends = dependency_urls.mac_dependencies
     else:
         print_to_screen(
                 'Rail-RNA cannot be installed because it is not supported '
                 'by your OS. Currently supported OSes are Mac OS X and '
                 'Linux.'
             )
         sys.exit(1)
     self.install_dir = install_dir
     self.no_dependencies = no_dependencies
     self.zip_name = os.path.abspath(zip_name)
     self.curl_exe = curl_exe
     log_dir = tempfile.mkdtemp()
     # Hand log_dir to register_cleanup before opening the log file so a
     # failing open() does not leave the directory unregistered
     register_cleanup(remove_temporary_directories, [log_dir])
     self.log_file = os.path.join(log_dir, 'rail-rna_install.log')
     self.log_stream = open(self.log_file, 'w')
     self.finished = False
     # me implies yes
     self.yes = yes or me
     self.me = me
     self.prep_dependencies = prep_dependencies
     self.add_symlinks = add_symlinks
Example #3
0
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
    bowtie2_index_base='genome', bowtie2_args='', verbose=False,
    report_multiplier=1.2, stranded=False, fudge=5, score_min=60,
    gzip_level=3, mover=filemover.FileMover(), intermediate_dir='.',
    scratch=None):
    """ Runs Rail-RNA-cointron_enum

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns. NOTE: this
            function never reads the parameter; the Bowtie 2 pipeline it
            launches is attached directly to sys.stdout.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference. If it is an S3 URL, the six .bt2 files are
            first fetched into intermediate_dir/transcript_index.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
                to accommodate potential indels
        score_min: Bowtie2 CONSTANT minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        Raises RuntimeError if the shell pipeline exits with a nonzero code.

        No return value.
    """
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        started_flag = os.path.join(index_directory, '_STARTED')
        success_flag = os.path.join(index_directory, '_SUCCESS')
        if not os.path.exists(started_flag):
            # The flag file lives inside index_directory, so make sure the
            # directory is there before trying to create the flag
            try:
                os.makedirs(index_directory)
            except OSError:
                if not os.path.isdir(index_directory):
                    raise
            with open(started_flag, 'w') as started_stream:
                print >>started_stream, 'STARTED'
            # Download index: one object per Bowtie 2 index file, so the
            # extension must be appended to the base URL on every request
            for extension in ['.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2',
                                '.rev.1.bt2', '.rev.2.bt2']:
                mover.get(Url(bowtie2_index_base + extension),
                            index_directory)
            with open(success_flag, 'w') as success_stream:
                print >>success_stream, 'SUCCESS'
        '''If _STARTED was already present, presumably another task sharing
        intermediate_dir is downloading the index; poll until _SUCCESS shows
        up. NOTE(review): this never times out if that download fails.'''
        while not os.path.exists(success_flag):
            time.sleep(0.5)
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    global _input_line_count
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            # --12 record: the sequence doubles as the read name, and the
            # quality string is all Is
            print >>reads_stream, '\t'.join([seq, seq, 'I'*len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min,
        '-D 24 -R 3 -N 1 -L 20 -i L,4,0'])
    # The delegate script sits next to this file: strip the trailing three
    # characters of this file's real path and append '_delegate.py'
    delegate_command = ''.join(
            [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                '_delegate.py --report-multiplier %08f --fudge %d %s %s'
                    % (report_multiplier, fudge,
                        '--stranded' if stranded else '',
                        '--verbose' if verbose else '')]
        )
    full_command = ' | '.join([input_command,
                                bowtie_command, delegate_command])
    print >>sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    # pipefail makes a failure in any stage of the pipe surface in the
    # return code checked below
    bowtie_process = subprocess.Popen(' '.join(
                ['set -exo pipefail;', full_command]
            ), bufsize=-1, stdout=sys.stdout, stderr=sys.stderr,
        shell=True, executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
Example #4
0
def go(nucleotides_per_input=8000000, gzip_output=True, gzip_level=3,
        to_stdout=False, push='.', mover=filemover.FileMover(),
        verbose=False, scratch=None, bin_qualities=True, short_qnames=False,
        skip_bad_records=False, workspace_dir=None,
        fastq_dump_exe='fastq-dump', ignore_missing_sra_samples=False):
    """ Runs Rail-RNA-preprocess

        Input (read from stdin)
        ----------------------------
        Tab-separated fields:
        ---If URL is local:
        1. #!splitload
        2. \x1d-separated list of 0-based indexes of reads at which to start
            each new file
        3. \x1d-separated list of numbers of reads to include in gzipped files
        4. \x1d-separated list of manifest lines whose tabs are replaced by
            \x1es

        ---Otherwise:
        manifest line

        A manifest line has the following format

        (for single-end reads)
        <URL>(tab)<Optional MD5>(tab)<Sample label>

        (for paired-end reads)
        <URL 1>(tab)<Optional MD5 1>(tab)<URL 2>(tab)<Optional MD5 2>(tab)
        <Sample label>

        Hadoop output (written to stdout)
        ----------------------------
        None.

        Other output (written to directory specified by command-line parameter
            --push)
        ____________________________
        Files containing input data in one of the following formats:

        Format 1 (single-end, 3-column):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 3 columns each)
        (so this is the same as single-end)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Quality sequences are strings of Is for FASTA input.

        nucleotides_per_input: maximum number of nucleotides to put in a given
            input file
        gzip_output: True iff preprocessed input should be gzipped
        gzip_level: level of gzip compression to use
        push: where to send output
        verbose: True iff extra debugging statements should be printed to
            stderr
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory
        bin_qualities: True iff quality string should be binned according to
            rules in _mismatch_penalties_to_quality_scores
            and round_quality_string() defined in go()
        short_qnames: True iff original qname should be killed and a new qname
            should be written in a short base64-encoded format
        skip_bad_records: True iff bad records should be skipped; otherwise,
            raises exception if bad record is encountered
        workspace_dir: where to use fastq-dump -- needed for working with
            dbGaP data. None if temporary dir should be used.
        fastq_dump_exe: path to fastq-dump executable
        ignore_missing_sra_samples: does not return error if fastq-dump doesn't
            find a sample

        No return value
    """
    if bin_qualities:
        import math
        def round_quality_string(qual):
            """ Bins phred+33 quality string to improve compression.

                Uses 5-bin scheme that does not affect Bowtie 2 alignments

                qual: quality string

                Return value: "binned" quality string.
            """
            return ''.join(
                [str(int(
                    _MN + math.floor((_MX - _MN) * min(
                                                    ord(qual_char) - 33.0, 40.0
                                                ) / 40.0)
                        )) for qual_char in qual]).translate(
                                _mismatch_penalties_to_quality_scores
                            )
    else:
        def round_quality_string(qual):
            """ Leaves quality string unbinned and untouched.

                qual: quality string

                Return value: qual
            """
            return qual
    global _input_line_count, _output_line_count
    skip_stubs = False
    temp_dir = make_temp_dir(scratch)
    print >>sys.stderr, 'Created local destination directory "%s".' % temp_dir
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    input_line_count, output_line_count = 0, 0
    if not to_stdout:
        push_url = Url(push)
        if push_url.is_local:
            destination = push
        elif push_url.is_s3 or push_url.is_hdfs or push_url.is_nfs:
            destination = temp_dir
        else:
            raise RuntimeError('Push destination must be '
                               'on S3, HDFS, NFS, or local.')
    fastq_cues = set(['@'])
    fasta_cues = set(['>', ';'])
    source_dict = {}
    onward = False
    for line in sys.stdin:
        _input_line_count += 1
        if not line.strip(): continue
        # Kill offset from start of manifest file
        try:
            tokens = line.strip().split('\t')[1:]
            if tokens[0][0] == '#' and tokens[0] != '#!splitload':
                # Comment line
                continue
        except IndexError:
            # Be robust to bad lines
            continue
        token_count = len(tokens)
        qual_getter = None
        if tokens[0] == '#!splitload':
            '''Line specifies precisely how records from files should be
            placed.'''
            assert not to_stdout, ('Split manifest line inconsistent with '
                                   'writing to stdout.')
            qual_getter = phred_converter(phred_format=tokens[-1])
            indexes = tokens[1].split('\x1d')
            read_counts = tokens[2].split('\x1d')
            manifest_lines = [token.split('\x1e')
                                for token in tokens[3].split('\x1d')]
            assert len(indexes) == len(read_counts) == len(manifest_lines)
            for i, manifest_line in enumerate(manifest_lines):
                manifest_line_field_count = len(manifest_line)
                if manifest_line_field_count == 3:
                    source_dict[(Url(manifest_line[0]),)] = (
                            manifest_line[-1],
                            int(indexes[i]),
                            int(read_counts[i])
                        )
                else:
                    assert manifest_line_field_count == 5
                    source_dict[(Url(manifest_line[0]),
                                 Url(manifest_line[2]))] = (
                                                        manifest_line[-1],
                                                        int(indexes[i]),
                                                        int(read_counts[i])
                                                    )
        elif token_count == 3:
            # SRA or single-end reads
            source_dict[(Url(tokens[0]),)] = (tokens[-1],)
        elif token_count == 5:
            # Paired-end reads
            source_dict[(Url(tokens[0]), Url(tokens[2]))] = (tokens[-1],)
        else:
            # Not a valid line, but continue for robustness
            continue
    file_number = 0
    for source_urls in source_dict:
        sample_label = source_dict[source_urls][0]
        downloaded = set()
        sources = []
        records_printed = 0
        if len(source_dict[source_urls]) == 3:
            skip_count = source_dict[source_urls][1]
            if len(source_urls) == 2:
                records_to_consume = source_dict[source_urls][2]
                if skip_count % 2:
                    skip_count -= 1
                    records_to_consume += 1
                if records_to_consume % 2:
                    records_to_consume -= 1
                # Index reads according to order in input to shorten read names
                read_index = skip_count / 2 # Index reads in pairs
            else:
                records_to_consume = source_dict[source_urls][2]
                read_index = skip_count
        else:
            skip_count = 0
            records_to_consume = None # Consume all records
            read_index = 0
        assert (records_to_consume >= 0 or records_to_consume is None), (
                'Negative value %d of records to consume encountered.'
            ) % records_to_consume
        if records_to_consume == 0: continue
        skipped = False
        for source_url in source_urls:
            if not source_url.is_local:
                # Download
                print >>sys.stderr, 'Retrieving URL "%s"...' \
                    % source_url.to_url()
                if source_url.is_dbgap:
                    download_dir = workspace_dir
                elif source_url.is_sra:
                    download_dir = temp_dir
                if source_url.is_sra:
                    sra_accession = source_url.to_url()
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} -I -X 10000 --split-files '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    try:
                        subprocess.check_call(
                            fastq_dump_command, shell=True, 
                            executable='/bin/bash',
                            stdout=sys.stderr
                        )
                    except subprocess.CalledProcessError as e:
                        if e.returncode == 3 and ignore_missing_sra_samples:
                            onward = True
                            break
                        else:
                            raise RuntimeError(
                                ('Error "%s" encountered executing '
                                 'command "%s".') % (e.output,
                                                        fastq_dump_command))
                    import glob
                    sra_fastq_files = sorted(
                                        glob.glob(os.path.join(download_dir,
                                            '%s[_.]*' % sra_accession))
                                        ) # ensure 1 before 2 if paired-end
                    # Schedule for deletion
                    def silent_remove(filename):
                        try:
                            os.remove(filename)
                        except OSError as e:
                            pass
                    for sra_fastq_file in sra_fastq_files:
                        register_cleanup(silent_remove, sra_fastq_file)
                    sra_file_count = len(sra_fastq_files)
                    check_for_paired = False
                    if sra_file_count == 1:
                        sra_paired_end = False
                        print >>sys.stderr, 'Detected single-end SRA sample.'
                    elif sra_file_count in [2, 3]:
                        print >>sys.stderr, ('2 or 3 FASTQ files detected. '
                                             'Checking for barcodes...')
                        check_for_paired = True
                    else:
                        raise RuntimeError(
                                ('Unexpected number of files "%d" output '
                                 'by fastq-dump command "%s".')
                                    % (sra_file_count, fastq_dump_command)
                            )
                    if check_for_paired:
                        # Get max/min read lengths from FASTQ
                        with open(
                                    sra_fastq_files[sra_file_count - 2]
                                ) as fastq_stream:
                            max_len, min_len = (
                                    max_min_read_lengths_from_fastq_stream(
                                        fastq_stream
                                    )
                                )
                            print >>sys.stderr, (
                                    'Max/min read length found in candidate '
                                    'barcode FASTQ was {}/{}.'
                                ).format(max_len, min_len)
                            if max_len <= _max_stubby_read_length:
                                print >>sys.stderr, (
                                        'Assumed barcode FASTQ.'
                                    )
                                skip_stubs = True
                                if sra_file_count == 2:
                                    sra_paired_end = False
                                else:
                                    sra_paired_end = True
                            else:
                                if sra_file_count == 2:
                                    sra_paired_end = True
                                else:
                                    raise RuntimeError(
                                        '3 FASTQs detected, but one of them '
                                        'was not recognized as containing '
                                        'barcodes.'
                                    )
                    # Guess quality from first 10k lines
                    with xopen(None, sra_fastq_files[0]) as source_stream:
                        qual_getter = phred_converter(
                                            fastq_stream=source_stream
                                        )
                    for sra_fastq_file in sra_fastq_files:
                        os.remove(sra_fastq_file)
                    sources.append(os.devnull)
                    fastq_dump_command = (
                            'set -exo pipefail; cd {download_dir}; '
                            '{fastq_dump_exe} --split-spot -I --stdout '
                            '{sra_accession}'
                        ).format(download_dir=download_dir,
                                    fastq_dump_exe=fastq_dump_exe,
                                    sra_accession=sra_accession)
                    if skip_stubs:
                        fastq_dump_command += (
                                ' | awk \'BEGIN {{OFS = "\\n"}} '
                                '{{header = $0; '
                                'getline seq; getline qheader; getline qseq; '
                                'if (length(seq) > {min_len}) {{print header, '
                                'seq, qheader, qseq}}}}\''
                            ).format(min_len=_max_stubby_read_length)
                    print >>sys.stderr, fastq_dump_command
                    sra_process = subprocess.Popen(fastq_dump_command,
                                                    shell=True,
                                                    executable='/bin/bash',
                                                    stdout=subprocess.PIPE,
                                                    bufsize=-1)
                else:
                    mover.get(source_url, temp_dir)
                    downloaded = list(
                            set(os.listdir(temp_dir)).difference(downloaded)
                        )
                    sources.append(os.path.join(temp_dir, list(downloaded)[0]))
            else:
                sources.append(source_url.to_url())
        if onward: continue
        '''Use os.devnull so single- and paired-end data can be handled in one
        loop.'''
        if len(sources) == 1:
            sources.append(os.devnull)
        if qual_getter is None:
            # Figure out Phred format
            with xopen(None, sources[0]) as source_stream:
                qual_getter = phred_converter(fastq_stream=source_stream)
        with xopen(None, sources[0]) as source_stream_1, xopen(
                None, sources[1]
            ) as source_stream_2:
            source_streams = [source_stream_1, source_stream_2]
            reorganize = all([source == os.devnull for source in sources])
            if reorganize:
                # SRA data is live
                if sra_paired_end:
                    source_streams = [sra_process.stdout, sra_process.stdout]
                else:
                    source_streams = [sra_process.stdout, open(os.devnull)]
            break_outer_loop = False
            while True:
                if not to_stdout:
                    '''Name files using Hadoop task environment property
                    mapred.task.partition.'''
                    if gzip_output:
                        try:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        except KeyError:
                            '''Hadoop 2.x: mapreduce.task.partition; see 
                            http://hadoop.apache.org/docs/r2.0.3-alpha/
                            hadoop-project-dist/hadoop-common/
                            DeprecatedProperties.html.'''
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(file_number), 'gz'
                                    ])
                                )
                        open_args = [output_file, 'a', gzip_level]
                    else:
                        try:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapred_task_partition'],
                                        str(file_number)
                                    ])
                                )
                        except KeyError:
                            output_file = os.path.join(
                                    destination, 
                                    '.'.join([
                                        os.environ['mapreduce_task_partition'],
                                        str(k), str(file_number)
                                    ])
                                )
                        open_args = [output_file, 'a']
                    try:
                        os.makedirs(os.path.dirname(output_file))
                    except OSError:
                        pass
                else:
                    open_args = []
                '''Use xopen to handle compressed streams and normal streams
                generally.'''
                with xopen(gzip_output if not to_stdout else '-', *open_args) \
                    as output_stream:
                    perform_push = False
                    line_numbers = [0, 0]
                    read_next_line = True
                    nucs_read = 0
                    pairs_read = 0
                    while True:
                        if read_next_line:
                            # Read next line only if FASTA mode didn't already
                            lines = []
                            for source_stream in source_streams:
                                lines.append(source_stream.readline())
                        read_next_line = True
                        if not lines[0]:
                            break_outer_loop = True
                            break
                        line_numbers = [i + 1 for i in line_numbers]
                        lines = [line.strip() for line in lines]
                        bad_record_skip = False
                        if lines[0][0] in fastq_cues:
                            if records_to_consume and not skipped:
                                '''Skip lines as necessary; for paired-end
                                reads skip the largest even number of records 
                                less than records_to_consume.'''
                                if len(source_urls) == 1:
                                    # single-end
                                    line_skip_count = max(
                                            skip_count * 4 - 1, 0
                                        )
                                else:
                                    # paired-end
                                    line_skip_count = max(
                                            ((skip_count / 2) * 4 - 1), 0
                                        )
                                    for _ in xrange(line_skip_count):
                                        next(source_stream_2)
                                for _ in xrange(line_skip_count):
                                    next(source_stream_1)
                                if skip_count:
                                    lines = []
                                    for source_stream in source_streams:
                                        lines.append(source_stream.readline())
                                    if not lines[0]:
                                        break_outer_loop = True
                                        break
                                    lines = [line.strip() for line in lines]
                                skipped = True
                            seqs = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            plus_lines = [source_stream.readline().strip()
                                            for source_stream
                                            in source_streams]
                            line_numbers = [i + 1 for i in line_numbers]
                            quals = [source_stream.readline().strip()
                                        for source_stream in source_streams]
                            if reorganize and sra_paired_end:
                                # Fix order!
                                lines, seqs, plus_lines, quals = (
                                        [lines[0], plus_lines[0]],
                                        [lines[1], plus_lines[1]],
                                        [seqs[0], quals[0]],
                                        [seqs[1], quals[1]]
                                    )
                            try:
                                assert plus_lines[0][0] == '+', (
                                        'Malformed read "%s" at line %d of '
                                        'file "%s".'
                                    ) % (lines[0], line_numbers[0], sources[0])
                                if plus_lines[1]:
                                    assert plus_lines[1][0] == '+', (
                                            'Malformed read "%s" at line %d '
                                            'of file "%s".'
                                        ) % (
                                        lines[1], line_numbers[1], sources[1]
                                    )
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at ' 
                                            'line %d of either %s or %s' % (
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                            qual_getter(qual) for qual in quals
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                                try: 
                                    for i in xrange(2):
                                        assert len(seqs[i]) == len(quals[i]), (
                                            'Length of read sequence does not '
                                            'match length of quality string '
                                            'at line %d of file "%s".'
                                        ) % (line_numbers[i], sources[i])
                                except (AssertionError, IndexError) as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                        elif lines[0][0] in fasta_cues:
                            seqs = [[], []]
                            next_lines = []
                            for p, source_stream in enumerate(source_streams):
                                while True:
                                    next_line \
                                        = source_stream.readline().strip()
                                    try:
                                        if next_line[0] in fasta_cues:
                                            break
                                        else:
                                            try:
                                                seqs[p].append(next_line)
                                            except IndexError:
                                                raise
                                    except IndexError:
                                        break
                                next_lines.append(next_line)
                            seqs = [''.join(seq) for seq in seqs]
                            line_numbers = [i + 1 for i in line_numbers]
                            try:
                                try:
                                    # Kill spaces in name
                                    original_qnames = \
                                        [line[1:].replace(' ', '_')
                                            for line in lines]
                                except IndexError:
                                    raise RuntimeError(
                                            'Error finding QNAME at ' 
                                            'line %d of either %s or %s' % (
                                                        sources[0],
                                                        sources[1]
                                                    )
                                        )
                            except (AssertionError,
                                    IndexError, RuntimeError) as e:
                                if skip_bad_records:
                                    print >>sys.stderr, ('Error "%s" '
                                            'encountered; skipping bad record.'
                                        ) % e.message
                                    for source_stream in source_streams:
                                        source_stream.readline()
                                    line_numbers = [
                                            i + 1 for i in line_numbers
                                        ]
                                    bad_record_skip = True
                                else:
                                    raise
                            else:
                                try:
                                    quals = [
                                        'h'*len(seq) for seq in seqs
                                        ]
                                except Exception as e:
                                    if skip_bad_records:
                                        print >>sys.stderr, (
                                                'Error "%s" encountered '
                                                'trying to convert quality '
                                                'string to Sanger format; '
                                                'skipping bad record.'
                                            ) % e.message
                                        bad_record_skip = True
                                    else:
                                        raise
                                line_numbers = [i + 1 for i in line_numbers]
                            lines = next_lines
                            read_next_line = False
                        if bad_record_skip:
                            seqs = []
                            # Fake record-printing to get to records_to_consume
                            if source_streams[-1].name == os.devnull:
                                records_printed += 1
                            else:
                                records_printed += 2
                        elif len(original_qnames) == 2 and original_qnames[1]:
                            # Paired-end write
                            if original_qnames[0] == original_qnames[1]:
                                # Add paired-end identifiers
                                original_qnames[0] += '/1'
                                original_qnames[1] += '/2'
                            assert seqs[1]
                            assert quals[1]
                            seqs = [seq.upper() for seq in seqs]
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    ),
                                    seqs[1][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            if seqs[0] < reversed_complement_seqs[0]:
                                left_seq = seqs[0]
                                left_qual = quals[0]
                                left_reversed = '0'
                            else:
                                left_seq = reversed_complement_seqs[0]
                                left_qual = quals[0][::-1]
                                left_reversed = '1'
                            if seqs[1] < reversed_complement_seqs[1]:
                                right_seq = seqs[1]
                                right_qual = quals[1]
                                right_reversed = '0'
                            else:
                                right_seq = reversed_complement_seqs[1]
                                right_qual = quals[1][::-1]
                                right_reversed = '1'
                            if short_qnames:
                                left_qname_to_write = encode(read_index) + '/1'
                                right_qname_to_write = encode(
                                                            read_index
                                                        ) + '/2'
                            else:
                                left_qname_to_write = original_qnames[0]
                                right_qname_to_write = original_qnames[1]
                            print >>output_stream, '\t'.join(
                                        [
                                            left_seq,
                                            left_reversed,
                                            qname_from_read(
                                                    left_qname_to_write,
                                                    seqs[0] + quals[0], 
                                                    sample_label,
                                                    mate=seqs[1]
                                                ),
                                            '\n'.join([
                                                round_quality_string(
                                                    left_qual
                                                ), right_seq
                                            ]),
                                            right_reversed,
                                            qname_from_read(
                                                    right_qname_to_write,
                                                    seqs[1] + quals[1], 
                                                    sample_label,
                                                    mate=seqs[0]
                                                ),
                                            round_quality_string(right_qual)
                                        ]
                                    )
                            records_printed += 2
                            _output_line_count += 1
                        else:
                            seqs[0] = seqs[0].upper()
                            reversed_complement_seqs = [
                                    seqs[0][::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                ]
                            # Single-end write
                            if seqs[0] < reversed_complement_seqs[0]:
                                seq = seqs[0]
                                qual = quals[0]
                                is_reversed = '0'
                            else:
                                seq = reversed_complement_seqs[0]
                                qual = quals[0][::-1]
                                is_reversed = '1'
                            if short_qnames:
                                qname_to_write = encode(read_index)
                            else:
                                qname_to_write = original_qnames[0]
                            print >>output_stream, '\t'.join(
                                        [
                                            seq,
                                            is_reversed,
                                            qname_from_read(
                                                qname_to_write,
                                                seqs[0] + quals[0], 
                                                sample_label
                                            ),
                                            round_quality_string(qual)
                                        ]
                                    )
                            records_printed += 1
                            _output_line_count += 1
                        read_index += 1
                        for seq in seqs:
                            nucs_read += len(seq)
                        if records_printed == records_to_consume:
                            break_outer_loop = True
                            perform_push = True
                            break
                        if not to_stdout and not records_to_consume and \
                            nucs_read > nucleotides_per_input:
                            file_number += 1
                            break
                if verbose:
                    print >>sys.stderr, (
                            'Exited with statement; line numbers are %s' 
                            % line_numbers
                        )
                if (not to_stdout) and (push_url.is_nfs or
                    push_url.is_s3 or push_url.is_hdfs) \
                    and ((not records_to_consume) or
                         (records_to_consume and perform_push)):
                    print >>sys.stderr, 'Pushing "%s" to "%s" ...' % (
                                                            output_file,
                                                            push_url.to_url()
                                                        )
                    print >>sys.stderr, 'reporter:status:alive'
                    mover.put(output_file, push_url.plus(os.path.basename(
                                                                output_file
                                                            )))
                    try:
                        os.remove(output_file)
                    except OSError:
                        pass
                if break_outer_loop: break
            if verbose:
                print >>sys.stderr, 'Exiting source streams...'
        if verbose:
            print >>sys.stderr, 'Exited source streams.'
        # Clear temporary directory
        for input_file in os.listdir(temp_dir):
            try:
                os.remove(os.path.join(temp_dir, input_file))
            except OSError:
                pass
        if 'sra_process' in locals():
            sra_process.stdout.close()
            sra_return_code = sra_process.wait()
            if sra_return_code > 0:
                raise RuntimeError(('fastq-dump terminated with exit '
                                    'code %d. Command run was "%s".')
                                        % (sra_return_code,
                                            fastq_dump_command))
            del sra_process
Example #5
0
# Package root: two directories above the directory holding this file,
# measured from the file's real (symlink-resolved) location.
base_path = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            os.pardir,
            os.pardir
        )
    )
# Utility modules live in <root>/rna/utils.
utils_path = os.path.join(base_path, 'rna', 'utils')
# Register the utilities directory first, then the root itself.
site.addsitedir(utils_path)
site.addsitedir(base_path)

import bowtie
from dooplicity.ansibles import Url
from dooplicity.tools import register_cleanup, make_temp_dir
from dooplicity.counters import Counter
import filemover
import tempdel

# Counter instance labeled 'junction_index'. Its flush method is handed to
# register_cleanup (presumably so pending counts are written out when the
# script exits -- confirm in dooplicity.tools).
counter = Counter('junction_index')
register_cleanup(counter.flush)

# The module docstring serves as the -h/--help description; the raw
# formatter is used so that text is printed as written.
parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__
    )
# NOTE(review): the default below is the string 'None', not the None object.
parser.add_argument(
        '--out',
        metavar='URL',
        type=str,
        required=False,
        default='None',
        help=('Bowtie index files are written to this URL. '
              'DEFAULT IS CURRENT WORKING DIRECTORY.')
    )
parser.add_argument(
        '--basename',
        type=str,
        required=False,
        default='junction',
        help='Basename for index to be written'
    )
parser.add_argument(\
    '--keep-alive', action='store_const', const=True, default=False,
Example #6
0
start_time = time.time()

input_line_count = 0

if args.out is not None:
    '''If --out is a local file, just write directly to that file. Otherwise,
    write to a temporary file that will later be uploaded to the
    destination.'''
    output_url = Url(args.out)
    if output_url.is_local:
        try: os.makedirs(output_url.to_url())
        except: pass
        output_filename = os.path.join(args.out, args.junction_filename)
    else:
        temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
        register_cleanup(tempdel.remove_temporary_directories,
                            [temp_dir_path])
        output_filename = args.junction_filename + '.temp'
        output_filename = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_filename, 'w', args.gzip_level) as output_stream:
        for line in sys.stdin:
            tokens = line.strip().split('\t')
            # Remove leading zeros from ints
            print >>output_stream, '\t'.join(
                    [tokens[0], str(int(tokens[1])),
                        str(int(tokens[2]) - 1), tokens[3], tokens[4]]
                )
            input_line_count += 1
else:
    # Default --out is stdout
    for line in sys.stdin:
        tokens = line.strip().split('\t')
Example #7
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie2_index_base='genome',
       bowtie2_args='',
       verbose=False,
       report_multiplier=1.2,
       stranded=False,
       fudge=5,
       score_min=60,
       gzip_level=3,
       mover=filemover.FileMover(),
       intermediate_dir='.',
       scratch=None):
    """ Runs Rail-RNA-cointron_enum.

        Alignment script for MapReduce pipelines that wraps Bowtie 2. Finds
        introns that cooccur on reads by local alignments to transcriptome
        elements from Bowtie 2.

        Every input line is written to a gzipped temporary file in Bowtie 2's
        --12 format, and the shell pipeline

            gzip -cd <reads file> | bowtie2 ... | <this script>_delegate.py

        is then run under bash with pipefail. The pipeline writes directly to
        sys.stdout and sys.stderr.

        If bowtie2_index_base is an S3 URL, the six .bt2 index files are first
        copied into <intermediate_dir>/transcript_index. Two flag files in
        that directory coordinate the copy: a caller that finds no '_STARTED'
        file performs the download and writes '_SUCCESS' when done; every
        caller then polls until '_SUCCESS' exists.

        Input (read from stdin)
        ----------------------------
        Tab-delimited output tuple columns (readletize)
        1. SEQ or its reversed complement, whichever is first in alphabetical
            order
        2. Comma-separated list of sample labels if field 1 is the read
            sequence; '\x1c' if empty
        3. Comma-separated list of sample labels if field 1 is the reversed
            complement of the read sequence; '\x1c' if empty

        NOTE(review): the code below uses each stripped input line in its
        entirety as both read name and sequence -- confirm that upstream
        emits only the sequence column.

        Hadoop output (written to stdout)
        ----------------------------
        Emitted by the delegate script, which is not part of this function.
        Tab-delimited tuple columns:
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Comma-separated list of intron start positions in configuration
        3. Comma-separated list of intron end positions in configuration
        4. left_extend_size: by how many bases on the left side of an intron
            the reference should extend
        5. right_extend_size: by how many bases on the right side of an intron
            the reference should extend
        6. Read sequence

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns. NOTE: this
            parameter is not referenced in the body; the pipeline's stdout
            is always sys.stdout.
        bowtie2_exe: filename of Bowtie 2 executable; include path if not in
            $PATH.
        bowtie2_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to Bowtie 2, e.g., "--tryhard --best"; or None.
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
                to accommodate potential indels
        score_min: Bowtie2 CONSTANT minimum alignment score
        gzip_level: compression level to use for temporary files
        mover: FileMover object, for use in case Bowtie2 idx needs to be
            pulled from S3
        intermediate_dir: where intermediates are stored; for temporarily
            storing transcript index if it needs to be pulled from S3
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        Raises RuntimeError if the shell pipeline exits with a nonzero code,
        and OSError if the local index directory cannot be created.

        No return value.
    """
    bowtie2_index_base_url = Url(bowtie2_index_base)
    if bowtie2_index_base_url.is_s3:
        index_basename = os.path.basename(bowtie2_index_base)
        index_directory = os.path.join(intermediate_dir, 'transcript_index')
        started_flag = os.path.join(index_directory, '_STARTED')
        success_flag = os.path.join(index_directory, '_SUCCESS')
        if not os.path.exists(started_flag):
            # The flag files live inside index_directory, so it has to exist
            # before either of them can be written. Another process may
            # create it concurrently; that is not an error.
            try:
                os.makedirs(index_directory)
            except OSError:
                if not os.path.isdir(index_directory):
                    raise
            # Download index
            with open(started_flag, 'w') as started_stream:
                print >> started_stream, 'STARTED'
            for extension in [
                    '.1.bt2', '.2.bt2', '.3.bt2', '.4.bt2', '.rev.1.bt2',
                    '.rev.2.bt2'
            ]:
                # Fetch each index file by its own URL (basename + extension)
                # rather than requesting the bare basename repeatedly.
                mover.get(
                    Url(bowtie2_index_base + extension), index_directory
                )
            with open(success_flag, 'w') as success_stream:
                print >> success_stream, 'SUCCESS'
        # Wait for whichever caller is downloading to finish.
        while not os.path.exists(success_flag):
            time.sleep(0.5)
        # From here on, use the local copy of the index.
        bowtie2_index_base = os.path.join(index_directory, index_basename)
    global _input_line_count
    temp_dir_path = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
    reads_file = os.path.join(temp_dir_path, 'reads.temp.gz')
    with xopen(True, reads_file, 'w', gzip_level) as reads_stream:
        # _input_line_count ends up as the zero-based index of the last line
        # read, since it is bound directly by enumerate().
        for _input_line_count, line in enumerate(input_stream):
            seq = line.strip()
            # --12 record: name <TAB> sequence <TAB> qualities; the sequence
            # doubles as the read name and every quality is 'I'.
            print >> reads_stream, '\t'.join([seq, seq, 'I' * len(seq)])
    input_command = 'gzip -cd %s' % reads_file
    bowtie_command = ' '.join([
        bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
        ' --local -t --no-hd --mm -x', bowtie2_index_base, '--12 -',
        '--score-min L,%d,0' % score_min, '-D 24 -R 3 -N 1 -L 20 -i L,4,0'
    ])
    # The delegate script sits next to this file and shares its name, minus
    # the extension. splitext (rather than slicing off three characters)
    # keeps the path right when __file__ ends in '.pyc'.
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.splitext(os.path.realpath(__file__))[0],
        '_delegate.py --report-multiplier %08f --fudge %d %s %s' %
        (report_multiplier, fudge, '--stranded' if stranded else '',
         '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie2 with command: ' + full_command
    # pipefail makes a failure in any stage of the pipeline surface as a
    # nonzero exit code of the whole command.
    bowtie_process = subprocess.Popen(
        ' '.join(['set -exo pipefail;', full_command]),
        bufsize=-1,
        stdout=sys.stdout,
        stderr=sys.stderr,
        shell=True,
        executable='/bin/bash'
    )
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie 2 output; '
                           'exitlevel was %d.' % return_code)
Example #8
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie_exe='bowtie',
       bowtie_index_base='genome',
       bowtie_args='',
       gzip_level=3,
       verbose=False,
       report_multiplier=1.2,
       scratch=None):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line per
        readlet belonging to a distinct read sequence.

        Two gzipped temporary files are written first: one holding each
        distinct readlet sequence in Bowtie's --12 format, named by its
        zero-based index, and one holding the qnames of every partition, with
        a line containing only '+' after each partition. The shell pipeline

            gzip -cd <readlets file> | bowtie ... | <this script>_delegate.py

        is then run under bash with pipefail. The pipeline writes directly to
        sys.stdout and sys.stderr.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Input is partitioned by field 1, the readlet sequence or its reversed
        complement.

        Hadoop output (written to stdout)
        ----------------------------
        Emitted by the delegate script, which is not part of this function.
        Tab-delimited output tuple columns, where each line corresponds to a
        readlet from a distinct read rather than a unique readlet sequence:
        1. Read sequence ID
        2. Displacement of readlet's 5' end from read's 5' end + '\x1e' +
            displacement of readlet's 3' end from read's 3' end (+, for EXACTLY
            one readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            number of instances of read sequence + '\x1e' + number of instances
            of read sequence's reversed complement + '\x1e' (+, for EXACTLY one
            readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence))] + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective
            sample in list A) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence's reversed complement for
            each respective sample in list B)
            NOTE(review): the bracketing in this field description is
            unbalanced; verify it against the delegate script.
        3. '\x1f'-separated list of alignment RNAMEs or '\x1c' if no alignments
        4. '\x1f'-separated list of alignment FLAGs or '\x1c' if no alignments
        5. '\x1f'-separated list of alignment POSes or '\x1c' if no alignments

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns. NOTE: this
            parameter is not referenced in the body; the pipeline's stdout
            is always sys.stdout.
        bowtie_exe: filename of Bowtie executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie, e.g., "--tryhard --best"; or None.
        gzip_level: level of gzip compression to use for qname file
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        Raises RuntimeError if the shell pipeline exits with a nonzero code.

        No return value.
    """
    global _input_line_count
    # For storing long qnames
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    qnames_file = os.path.join(temp_dir, 'qnames.temp.gz')
    readlet_file = os.path.join(temp_dir, 'readlets.temp.gz')
    with xopen(True, qnames_file, 'w', gzip_level) as qname_stream:
        with xopen(True, readlet_file, 'w', gzip_level) as readlet_stream:
            for (seq_count, ((seq,), xpartition)) \
                in enumerate(xstream(input_stream, 1)):
                # --12 record: the readlet's index serves as its name and
                # every quality is 'I'.
                print >>readlet_stream, \
                    '\t'.join([str(seq_count), seq, 'I'*len(seq)])
                # The first qname is pulled off with next() and written
                # without touching _input_line_count. Assumes xpartition is
                # a single-pass iterator, so the loop below resumes at the
                # second record -- confirm against dooplicity.tools.xstream.
                print >> qname_stream, next(iter(xpartition))[0]
                for (qname, ) in xpartition:
                    _input_line_count += 1
                    print >> qname_stream, qname
                # Separate qnames with single + character
                print >> qname_stream, '+'
    input_command = 'gzip -cd %s' % readlet_file
    # bowtie_args is documented as possibly None; str.join rejects None, so
    # substitute an empty string in that case.
    bowtie_command = ' '.join([
        bowtie_exe, bowtie_args if bowtie_args is not None else '',
        '-S -t --sam-nohead --mm', bowtie_index_base,
        '--12 -'
    ])
    # The delegate script sits next to this file and shares its name, minus
    # the extension. splitext (rather than slicing off three characters)
    # keeps the path right when __file__ ends in '.pyc'.
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.splitext(os.path.realpath(__file__))[0],
        '_delegate.py --report-multiplier %08f --qnames-file %s %s' %
        (report_multiplier, qnames_file, '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie with command: ' + full_command
    # pipefail makes a failure in any stage of the pipeline surface as a
    # nonzero exit code of the whole command.
    bowtie_process = subprocess.Popen(
        ' '.join(['set -exo pipefail;', full_command]),
        bufsize=-1,
        stdout=sys.stdout,
        stderr=sys.stderr,
        shell=True,
        executable='/bin/bash'
    )
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie output; '
                           'exitlevel was %d.' % return_code)
Example #9
0
    global, properties of args are also arguments of the go() function so
    different command-line arguments can be passed to it for unit tests.'''
    args = parser.parse_args(argv[1:])

    # Start keep_alive thread immediately
    if args.keep_alive:
        from dooplicity.tools import KeepAlive
        keep_alive_thread = KeepAlive(sys.stderr)
        keep_alive_thread.start()

if __name__ == '__main__' and not args.test:
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    archive = os.path.join(args.archive,
        str(os.getpid())) if args.archive is not None else None
    # Handle temporary directory if CTRL+C'd
    register_cleanup(handle_temporary_directory, archive, temp_dir_path)
    if args.verbose:
        print >>sys.stderr, 'Creating temporary directory %s' \
            % temp_dir_path
    go(bowtie2_exe=os.path.expandvars(args.bowtie2_exe),
        bowtie2_build_exe=os.path.expandvars(args.bowtie2_build_exe),
        bowtie2_args=bowtie_args,
        temp_dir_path=temp_dir_path,
        verbose=args.verbose, 
        report_multiplier=args.report_multiplier,
        gzip_level=args.gzip_level,
        count_multiplier=args.count_multiplier,
        tie_margin=args.tie_margin)
elif __name__ == '__main__':
    # Test units
    del sys.argv[1:] # Don't choke on extra command-line parameters
Example #10
0
parser.add_argument(
        '--keep-alive', action='store_const', const=True,
        default=False,
        help='Periodically print Hadoop status messages to stderr to keep ' \
             'job alive'
    )

# Parse the command line once, at module level.
args = parser.parse_args()

# Build the keep-alive thread now, but do not start it yet (see below).
if args.keep_alive:
    from dooplicity.tools import KeepAlive
    keep_alive_thread = KeepAlive(sys.stderr)

# Line counters, both starting at zero.
input_line_count, output_line_count = 0, 0
# Counter instance labeled 'realign_reads_delegate'. Its flush method is
# handed to register_cleanup (presumably so pending counts are written out
# when the script exits -- confirm in dooplicity.tools).
counter = Counter('realign_reads_delegate')
register_cleanup(counter.flush)

# Must consume a line of stdin before outputting status messages
line = sys.stdin.readline()
# Only now, with the first input line read, is the keep-alive thread started.
if args.keep_alive: keep_alive_thread.start()

if args.type == 1:
    last_key, totals, write_line = None, [0] * args.value_count, False
    while True:
        counter.add('type1_inputs')
        if not line:
            if last_key is None:
                # Input is empty
                break
            else:
                # Write final line
Example #11
0
site.addsitedir(utils_path)
site.addsitedir(base_path)

from dooplicity.tools import xstream, register_cleanup, xopen, \
    make_temp_dir
from dooplicity.counters import Counter
import bowtie
import argparse
import tempdel
import itertools
from copy import copy

# Initialize global variable for tracking number of input lines
_input_line_count = 0
# Named counter for this step; its flush method is registered as a cleanup
# callback (presumably so counts are written out at exit -- confirm)
counter = Counter('realign_reads')
register_cleanup(counter.flush)

# Translation table swapping each base with its complement (A<->T, C<->G);
# presumably applied to a reversed sequence to obtain its reversed
# complement -- confirm against callers
_reversed_complement_translation_table = string.maketrans('ATCG', 'TAGC')


def input_files_from_input_stream(input_stream,
                                  output_stream,
                                  temp_dir_path=None,
                                  verbose=False,
                                  gzip_level=3):
    """ Generates FASTA reference to index and file with reads.

        Each line of the read file is in the following format:

        read number <TAB> SEQ <TAB> QUAL
Example #12
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie_index_base='genome',
       bowtie2_index_base='genome2',
       manifest_file='manifest',
       bowtie2_args=None,
       bin_size=10000,
       verbose=False,
       exon_differentials=True,
       exon_intervals=False,
       report_multiplier=1.2,
       min_exon_size=8,
       search_filter=1,
       min_readlet_size=15,
       max_readlet_size=25,
       readlet_interval=12,
       capping_multiplier=1.5,
       drop_deletions=False,
       gzip_level=3,
       scratch=None,
       index_count=1,
       output_bam_by_chr=False,
       tie_margin=0,
       no_realign=False,
       no_polyA=False):
    """ Runs Rail-RNA-align_reads.

        A single pass of Bowtie is run to find end-to-end alignments. Unmapped
        reads are saved for readletizing to determine introns in sucessive
        reduce steps as well as for realignment in a later map step.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns in a mix of any of the following
        three formats:
        Format 1 (single-end, 3-column):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 3 columns each)
        (so this is the same as single-end)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Input is partitioned and sorted by field 1, the read sequence.

        Hadoop output (written to stdout)
        ----------------------------
        A given RNAME sequence is partitioned into intervals ("bins") of some 
        user-specified length (see partition.py).

        Exonic chunks (aka ECs; three formats, any or all of which may be
        emitted):

        Format 1 (exon_ival); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. Sample index
        3. EC start (inclusive) on forward strand
        4. EC end (exclusive) on forward strand

        Format 2 (exon_diff); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. max(EC start, bin start) (inclusive) on forward strand IFF diff is
            positive and EC end (exclusive) on forward strand IFF diff is
            negative
        3. Sample index
        4. '1' if alignment from which diff originates is "unique" according to
            --tie-margin criterion; else '0'
        5. +1 or -1 * count, the number of instances of a read sequence for
            which to print exonic chunks

        Note that only unique alignments are currently output as ivals and/or
        diffs.

        Format 3 (sam); tab-delimited output tuple columns:
        Standard SAM output except fields are in different order, and the first
        field corresponds to sample label. (Fields are reordered to facilitate
        partitioning by sample name/RNAME and sorting by POS.) Each line
        corresponds to a spliced alignment. The order of the fields is as
        follows.
        1. Sample index if outputting BAMs by sample OR
                sample-rname index if outputting BAMs by chr
        2. (Number string representing RNAME; see BowtieIndexReference
            class in bowtie_index for conversion information) OR
            '0' if outputting BAMs by chr
        3. POS
        4. QNAME
        5. FLAG
        6. MAPQ
        7. CIGAR
        8. RNEXT
        9. PNEXT
        10. TLEN
        11. SEQ
        12. QUAL
        ... + optional fields

        Insertions/deletions (indel_bed)

        tab-delimited output tuple columns:
        1. 'I' or 'D' insertion or deletion line
        2. Number string representing RNAME
        3. Start position (Last base before insertion or 
            first base of deletion)
        4. End position (Last base before insertion or last base of deletion 
                            (exclusive))
        5. Inserted sequence for insertions or deleted sequence for deletions
        6. Sample index
        ----Next fields are for introns only; they are '\x1c' for indels----
        7. '\x1c'
        8. '\x1c'
        --------------------------------------------------------------------
        9. Number of instances of insertion or deletion in sample; this is
            always +1 * count before bed_pre combiner/reducer

        Read whose primary alignment is not end-to-end

        Tab-delimited output tuple columns (unmapped):
        1. Transcriptome Bowtie 2 index group number
        2. SEQ
        3. 1 if SEQ is reverse-complemented, else 0
        4. QNAME
        5. QUAL

        Tab-delimited output tuple columns (readletized):
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Tab-delimited tuple columns (postponed_sam):
        Standard 11+ -column raw SAM output

        Single column (unique):
        1. A unique read sequence

        Two columns, exactly one line (dummy); ensures creation of intron
            index:
        1. character "-"
        2. the word "dummy"

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie2 executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie1 index files associated
            with the reference.
        bowtie2_index_base: the basename of the Bowtie2 index files associated
            with the reference.
        manifest_file: filename of manifest
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie2.
        bin_size: genome is partitioned in units of bin_size for later load
            balancing.
        verbose: True iff more informative messages should be written to
            stderr.
        exon_differentials: True iff EC differentials are to be emitted.
        exon_intervals: True iff EC intervals are to be emitted.
        report_multiplier: if verbose is True, the line number of an alignment
            or read written to stderr increases exponentially with base
            report_multiplier.
        min_exon_size: minimum exon size searched for in intron_search.py later
            in pipeline; used to determine how large a soft clip on one side of
            a read is necessary to pass it on to intron search pipeline
        search_filter: how large a soft clip on one side of a read is necessary
            to pass it on to intron search pipeline
        min_readlet_size: "capping" readlets (that is, readlets that terminate
            at a given end of the read) are never smaller than this value
        max_readlet_size: size of every noncapping readlet
        readlet_interval: number of bases separating successive readlets along
            the read
        capping_multiplier: successive capping readlets on a given end of a
            read are increased in size exponentially with base
            capping_multiplier
        drop_deletions: True iff deletions should be dropped from coverage
            vector
        gzip_level: compression level to use for temporary files
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory
        index_count: number of transcriptome Bowtie 2 indexes to which to
            assign unmapped reads for later realignment
        output_bam_by_chr: True iff final output BAMs will be by chromosome
        tie_margin: allowed score difference per 100 bases among ties in
            max score. For example, 150 and 144 are tied alignment scores
            for a 100-bp read when --tie-margin is 6.
        no_realign: True iff job flow does not need more than readlets: this
            usually means only a transcript index is being constructed
        no_polyA: kill noncapping readlets that are all As and write as
            unmapped all reads with polyA prefixes whose suffixes are <
            min_exon_size

        No return value.
    """
    global _input_line_count
    # Required length of prefix after poly(A) is trimmed
    remaining_seq_size = max(min_exon_size - 1, 1)
    polyA_set = frozenset(['A' * i
                           for i in xrange(1, remaining_seq_size + 1)] +
                          ['T' * i
                           for i in xrange(1, remaining_seq_size + 1)] + [''])
    reference_index = bowtie_index.BowtieIndexReference(bowtie_index_base)
    manifest_object = manifest.LabelsAndIndices(manifest_file)
    alignment_printer = AlignmentPrinter(manifest_object,
                                         reference_index,
                                         bin_size=bin_size,
                                         output_stream=output_stream,
                                         exon_ivals=exon_intervals,
                                         exon_diffs=exon_differentials,
                                         drop_deletions=drop_deletions,
                                         output_bam_by_chr=output_bam_by_chr,
                                         tie_margin=tie_margin)
    # Get task partition to pass to align_reads_delegate.py
    try:
        task_partition = os.environ['mapred_task_partition']
    except KeyError:
        # Hadoop 2.x?
        try:
            task_partition = os.environ['mapreduce_task_partition']
        except KeyError:
            # A unit test is probably being run
            task_partition = '0'
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    align_file = os.path.join(temp_dir, 'first_pass_reads.temp.gz')
    other_reads_file = os.path.join(temp_dir, 'other_reads.temp.gz')
    second_pass_file = os.path.join(temp_dir, 'second_pass_reads.temp.gz')
    k_value, _, _ = bowtie.parsed_bowtie_args(bowtie2_args)
    nothing_doing = True
    with xopen(True, align_file, 'w', gzip_level) as align_stream, \
        xopen(True, other_reads_file, 'w', gzip_level) as other_stream:
        for seq_number, ((seq, ),
                         xpartition) in enumerate(xstream(sys.stdin, 1)):
            if no_polyA and (seq[:-remaining_seq_size] in polyA_set
                             or seq[remaining_seq_size:] in polyA_set):
                if not no_realign:
                    '''If a sequence is too short without its poly(A) tail,
                    make all reads with that sequence unmapped. Technically,
                    this also kills poly(A)s at 5' ends, but we probably
                    couldn't align those sequences anyway.'''
                    reversed_complement_seq = seq[::-1].translate(
                        _reversed_complement_translation_table)
                    for is_reversed, name, qual in xpartition:
                        if is_reversed == '0':
                            alignment_printer.print_unmapped_read(
                                name, seq, qual)
                        else:
                            alignment_printer.print_unmapped_read(
                                name, reversed_complement_seq, qual[::-1])
                continue
            nothing_doing = False
            '''Select highest-quality read with alphabetically last qname
            for first-pass alignment.'''
            best_name, best_mean_qual, best_qual_index, i = None, None, 0, 0
            others_to_print = dlist()
            for is_reversed, name, qual in xpartition:
                _input_line_count += 1
                others_to_print.append('\t'.join(
                    [str(seq_number), is_reversed, name, qual]))
                mean_qual = (float(sum([ord(score)
                                        for score in qual])) / len(qual))
                if (mean_qual > best_mean_qual
                        or mean_qual == best_mean_qual and name > best_name):
                    best_qual_index = i
                    best_mean_qual = mean_qual
                    best_name = name
                    to_align = '\t'.join(
                        ['%s\x1d%s' % (is_reversed, name), seq, qual])
                i += 1
            assert i >= 1
            if i == 1:
                print >> other_stream, str(seq_number)
            else:
                for j, other_to_print in enumerate(others_to_print):
                    if j != best_qual_index:
                        print >> other_stream, other_to_print
            print >> align_stream, to_align
    # Print dummy line
    print 'dummy\t-\tdummy'
    sys.stdout.flush(
    )  # this is REALLY important b/c called script will stdout
    if nothing_doing:
        # No input
        sys.exit(0)
    input_command = 'gzip -cd %s' % align_file
    bowtie_command = ' '.join([
        bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
        ' --sam-no-qname-trunc --local -t --no-hd --mm -x', bowtie2_index_base,
        '--12 -'
    ])
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        ('_delegate.py --task-partition {task_partition} '
         '--other-reads {other_reads} --second-pass-reads '
         '{second_pass_reads} --min-readlet-size '
         '{min_readlet_size} {drop_deletions} '
         '--max-readlet-size {max_readlet_size} '
         '--readlet-interval {readlet_interval} '
         '--capping-multiplier {capping_multiplier:1.12f} '
         '{verbose} --report-multiplier {report_multiplier:1.12f} '
         '--k-value {k_value} '
         '--bowtie-idx {bowtie_index_base} '
         '--partition-length {bin_size} '
         '--manifest {manifest_file} '
         '{exon_differentials} {exon_intervals} '
         '--gzip-level {gzip_level} '
         '--search-filter {search_filter} '
         '--index-count {index_count} '
         '--tie-margin {tie_margin} '
         '{no_realign} '
         '{no_polyA} '
         '{output_bam_by_chr}').format(
             task_partition=task_partition,
             other_reads=other_reads_file,
             second_pass_reads=second_pass_file,
             min_readlet_size=min_readlet_size,
             drop_deletions=('--drop-deletions' if drop_deletions else ''),
             max_readlet_size=max_readlet_size,
             readlet_interval=readlet_interval,
             capping_multiplier=capping_multiplier,
             verbose=('--verbose' if verbose else ''),
             report_multiplier=report_multiplier,
             k_value=k_value,
             bowtie_index_base=bowtie_index_base,
             bin_size=bin_size,
             manifest_file=manifest_file,
             exon_differentials=('--exon-differentials'
                                 if exon_differentials else ''),
             exon_intervals=('--exon-intervals' if exon_intervals else ''),
             gzip_level=gzip_level,
             search_filter=search_filter,
             index_count=index_count,
             tie_margin=tie_margin,
             no_realign=('--no-realign' if no_realign else ''),
             no_polyA=('--no-polyA' if no_polyA else ''),
             output_bam_by_chr=('--output-bam-by-chr'
                                if output_bam_by_chr else ''))
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >>sys.stderr, \
        'Starting first-pass Bowtie 2 with command: ' + full_command
    bowtie_process = subprocess.Popen(' '.join(
        ['set -exo pipefail;', full_command]),
                                      bufsize=-1,
                                      stdout=sys.stdout,
                                      stderr=sys.stderr,
                                      shell=True,
                                      executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading first-pass Bowtie 2 '
                           'output; exitlevel was %d.' % return_code)
    os.remove(align_file)
    os.remove(other_reads_file)
    if not no_realign:
        input_command = 'gzip -cd %s' % second_pass_file
        bowtie_command = ' '.join([
            bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
            ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
            bowtie2_index_base, '--12 -'
        ])
        delegate_command = ''.join([
            sys.executable, ' ',
            os.path.realpath(__file__)[:-3],
            ('_delegate.py --task-partition {task_partition} '
             '--min-readlet-size {min_readlet_size} '
             '{drop_deletions} '
             '--max-readlet-size {max_readlet_size} '
             '--readlet-interval {readlet_interval} '
             '--capping-multiplier {capping_multiplier:012f} '
             '{verbose} '
             '--report-multiplier {report_multiplier:012f} '
             '--k-value {k_value} '
             '--bowtie-idx {bowtie_index_base} '
             '--partition-length {bin_size} '
             '--manifest {manifest_file} '
             '{exon_differentials} {exon_intervals} '
             '--gzip-level {gzip_level} '
             '--search-filter {search_filter} '
             '--index-count {index_count} '
             '--tie-margin {tie_margin} '
             '{output_bam_by_chr}').format(
                 task_partition=task_partition,
                 min_readlet_size=min_readlet_size,
                 drop_deletions=('--drop-deletions' if drop_deletions else ''),
                 readlet_interval=readlet_interval,
                 max_readlet_size=max_readlet_size,
                 capping_multiplier=capping_multiplier,
                 verbose=('--verbose' if verbose else ''),
                 report_multiplier=report_multiplier,
                 k_value=k_value,
                 bowtie_index_base=bowtie_index_base,
                 bin_size=bin_size,
                 manifest_file=manifest_file,
                 exon_differentials=('--exon-differentials'
                                     if exon_differentials else ''),
                 exon_intervals=('--exon-intervals' if exon_intervals else ''),
                 gzip_level=gzip_level,
                 search_filter=search_filter,
                 index_count=index_count,
                 tie_margin=tie_margin,
                 output_bam_by_chr=('--output-bam-by-chr'
                                    if output_bam_by_chr else ''))
        ])
        full_command = ' | '.join(
            [input_command, bowtie_command, delegate_command])
        print >>sys.stderr, \
            'Starting second-pass Bowtie 2 with command: ' + full_command
        bowtie_process = subprocess.Popen(' '.join(
            ['set -exo pipefail;', full_command]),
                                          bufsize=-1,
                                          stdout=sys.stdout,
                                          stderr=sys.stderr,
                                          shell=True,
                                          executable='/bin/bash')
        return_code = bowtie_process.wait()
        if return_code:
            raise RuntimeError('Error occurred while reading second-pass '
                               'Bowtie 2 output; exitlevel was %d.' %
                               return_code)
    sys.stdout.flush()
Example #13
0
    def install(self):
        """ Installs Rail-RNA and all its dependencies. """
        if not self.no_dependencies and self.curl_exe is None:
            self.curl_exe = which('curl')
            if self.curl_exe is None:
                print_to_screen('Rail-RNA\'s installer requires Curl if '
                                'dependencies are to be installed. '
                                'Download it at '
                                'http://curl.haxx.se/download.html and use '
                                '--curl to specify its path, or '
                                'disable installing dependencies with '
                                '--no-dependencies.')
                sys.exit(1)
        if self._yes_no_query(
                'Rail-RNA can be installed for all users or just the '
                'current user.\n    * Install for all users?',
                answer=(None if not self.yes else (self.yes and not self.me))
            ):
            if os.getuid():
                print_to_screen('Rerun with sudo privileges to install '
                                'for all users.')
                sys.exit(0)
            install_dir = '/usr/local'
            self.local = False
        else:
            install_dir = os.path.abspath(os.path.expanduser('~/'))
            self.local = True
        bin_dir = os.path.join(install_dir, 'bin')
        rail_exe = os.path.join(bin_dir, 'rail-rna')
        if self.install_dir is None:
            self.final_install_dir = os.path.join(install_dir, 'raildotbio')
        else:
            # User specified an installation directory
            self.final_install_dir = self.install_dir
        # Install in a temporary directory first, then move to final dest
        temp_install_dir = tempfile.mkdtemp()
        register_cleanup(remove_temporary_directories, [temp_install_dir])
        if os.path.exists(self.final_install_dir):
            if self._yes_no_query(
                    ('The installation path {dir} already exists.\n    '
                    '* Overwrite {dir}?').format(dir=self.final_install_dir)
                ):
                try:
                    shutil.rmtree(self.final_install_dir)
                except OSError:
                    # Handle this later if directory creation fails
                    pass
                try:
                    os.remove(self.final_install_dir)
                except OSError:
                    pass
            else:
                print_to_screen(
                        'Specify a different installation directory with '
                        '--install-dir.'
                    )
                sys.exit(0)
        self._print_to_screen_and_log('[Installing] Extracting Rail-RNA...',
                                        newline=False,
                                        carriage_return=True)
        try:
            os.makedirs(self.final_install_dir)
        except OSError as e:
            self._print_to_screen_and_log(
                            ('Problem encountered trying to create '
                             'directory %s for installation. May need '
                             'sudo permissions.') % self.final_install_dir
                        )
            self._bail()
        else:
            # So it's possible to move temp installation dir there
            os.rmdir(self.final_install_dir)
            pass
        with cd(temp_install_dir):
            with zipfile.ZipFile(self.zip_name) as zip_object:
                zip_object.extractall('./rail-rna')
            if not self.no_dependencies:
                self._grab_and_explode(self.depends['pypy'], 'PyPy')
                self._grab_and_explode(self.depends['sra_tools'], 'SRA Tools')
                if not self.prep_dependencies:
                    self._grab_and_explode(self.depends['bowtie1'], 'Bowtie 1')
                    self._grab_and_explode(self.depends['bowtie2'], 'Bowtie 2')
                    self._grab_and_explode(self.depends['bedgraphtobigwig'],
                                            'BedGraphToBigWig')
                    self._grab_and_explode(self.depends['samtools'],
                                                                'SAMTools')
            if not self.prep_dependencies and not self.no_dependencies:
                # Have to make SAMTools (annoying; maybe change this)
                samtools_dir = os.path.join(temp_install_dir,
                        self.depends['samtools'][0].rpartition('/')[2][:-8]
                    )
                with cd(samtools_dir):
                    '''Make sure unistd.h is #included cram_io.c ... it's some
                    bug in some SAMTools that prevents compilation on
                    langmead-fs1, which may be a general problem with
                    portability. See https://github.com/samtools/htslib/commit/
                    0ec5202de5691b27917ce828a9d24c9c729a9b81'''
                    cram_io_file = os.path.join(glob.glob('./htslib-*')[0],
                                                    'cram', 'cram_io.c')
                    with open(cram_io_file) as cram_io_stream:
                        all_cram_io = cram_io_stream.read()
                    if '<unistd.h>' not in all_cram_io:
                        with open(cram_io_file, 'w') as cram_io_out_stream:
                            cram_io_out_stream.write(all_cram_io.replace(
                                    '#include <string.h>',
                                    '#include <string.h>\n#include <unistd.h>'
                                ))
                    makefile = 'Makefile'
                    with open(makefile) as makefile_stream:
                        all_makefile = makefile_stream.read()
                    with open(makefile, 'w') as makefile_stream:
                        makefile_stream.write(
                            all_makefile.replace(
                                    '-D_CURSES_LIB=1', '-D_CURSES_LIB=0'
                                ).replace('LIBCURSES=','#LIBCURSES=')
                        )
                    # Make on all but one cylinder
                    thread_count = max(1, multiprocessing.cpu_count() - 1)
                    samtools_command = ['make', '-j%d' % thread_count]
                    self._print_to_screen_and_log(
                                '[Installing] Making SAMTools...',
                                newline=False,
                                carriage_return=True
                            )
                    try:
                        subprocess.check_output(samtools_command,
                                                    stderr=self.log_stream)
                    except subprocess.CalledProcessError as e:
                        self._print_to_screen_and_log(
                                ('Error encountered making SAMTools; exit '
                                 'code was %d; command invoked was "%s".') %
                                    (e.returncode, ' '.join(samtools_command))
                            )
                        self._bail()
                samtools = os.path.join(self.final_install_dir,
                        self.depends['samtools'][0].rpartition('/')[2][:-8],
                        'samtools')
                bowtie1_base = '-'.join(
                    self.depends['bowtie1'][0].rpartition(
                            '/'
                        )[2].split('-')[:2]
                )
                bowtie1 = os.path.join(self.final_install_dir,
                                                bowtie1_base, 'bowtie')
                bowtie1_build = os.path.join(self.final_install_dir,
                                                bowtie1_base, 'bowtie-build')
                bowtie2_base = '-'.join(
                    self.depends['bowtie2'][0].rpartition(
                            '/'
                        )[2].split('-')[:2]
                )
                bowtie2 = os.path.join(self.final_install_dir,
                                                bowtie2_base, 'bowtie2')
                bowtie2_build = os.path.join(self.final_install_dir,
                                                bowtie2_base, 'bowtie2-build')
                bedgraphtobigwig = os.path.join(
                                                self.final_install_dir,
                                                'bedGraphToBigWig'
                                            )
            else:
                bowtie1 = bowtie1_build = bowtie2 = bowtie2_build \
                    = bedgraphtobigwig = samtools = 'None'
            if self.no_dependencies:
                pypy = 'None'
                fastq_dump = 'None'
                vdb_config = 'None'
            else:
                pypy = os.path.join(self.final_install_dir,
                        self.depends['pypy'][0].rpartition(
                                '/'
                            )[2][:-8], 'bin', 'pypy'
                    )
                fastq_dump = os.path.join(self.final_install_dir,
                                self.depends['sra_tools'][0].rpartition(
                                '/'
                            )[2][:-7], 'bin', 'fastq-dump'
                    )
                vdb_config = os.path.join(self.final_install_dir,
                                self.depends['sra_tools'][0].rpartition(
                                '/'
                            )[2][:-7], 'bin', 'vdb-config'
                    )
            # Write paths to exe_paths
            with open(
                            os.path.join(temp_install_dir, 'rail-rna',
                                            'exe_paths.py'), 'w'
                        ) as exe_paths_stream:
                print >>exe_paths_stream, (
"""\"""
exe_paths.py
Part of Rail-RNA

Defines default paths of Rail-RNA's executable dependencies. Set a given
variable equal to None if the default path should be in PATH.
\"""

pypy = {pypy}
aws = None
curl = None
sort = None
bowtie1 = {bowtie1}
bowtie1_build = {bowtie1_build}
bowtie2 = {bowtie2}
bowtie2_build = {bowtie2_build}
samtools = {samtools}
bedgraphtobigwig = {bedgraphtobigwig}
fastq_dump = {fastq_dump}
vdb_config = {vdb_config}
"""
                ).format(pypy=self._quote(pypy), bowtie1=self._quote(bowtie1),
                            bowtie1_build=self._quote(bowtie1_build),
                            bowtie2=self._quote(bowtie2),
                            bowtie2_build=self._quote(bowtie2_build),
                            samtools=self._quote(samtools),
                            bedgraphtobigwig=self._quote(bedgraphtobigwig),
                            fastq_dump=self._quote(fastq_dump),
                            vdb_config=self._quote(vdb_config))
        # Move to final directory
        try:
            shutil.move(temp_install_dir, self.final_install_dir)
        except Exception as e:
            self._print_to_screen_and_log(('Problem "%s" encountered moving '
                                           'temporary installation directory '
                                           '%s to final destination %s.') % (
                                                e,
                                                temp_install_dir,
                                                self.final_install_dir
                                            ))
            self._bail()
        # Create shell-script executable
        try:
            os.makedirs(bin_dir)
        except Exception as e:
            if not os.path.isdir(bin_dir):
                self._print_to_screen_and_log(('Problem "%s" encountered '
                                               'creating directory %s.') % (
                                                    e,
                                                    bin_dir
                                                )
                                            )
                self._bail()
        install_dir_replacement = os.path.join(
                                self.final_install_dir, 'rail-rna'
                            )
        with open(rail_exe, 'w') as rail_exe_stream:
            print >>rail_exe_stream, (
"""#!/usr/bin/env bash

{python_executable} {install_dir} $@
"""
                ).format(python_executable=sys.executable,
                            install_dir=install_dir_replacement)
        if self.local:
            '''Have to add Rail to PATH. Do this in bashrc and bash_profile
            contingent on whether it's present already because of
            inconsistent behavior across Mac OS and Linux distros.'''
            to_print = (
"""
## Rail-RNA additions
if [ -d "{bin_dir}" ] && [[ ":$PATH:" != *":{bin_dir}:"* ]]; then
    PATH="${{PATH:+"$PATH:"}}{bin_dir}"
fi
export RAILDOTBIO={install_dir}
## End Rail-RNA additions
"""
                ).format(bin_dir=bin_dir,
                            install_dir=install_dir_replacement)
        else:
            # Just define raildotbio directory
            to_print = (
"""
## Rail-RNA addition
export RAILDOTBIO={install_dir}
## End Rail-RNA addition
"""
                ).format(bin_dir=bin_dir,
                            install_dir=install_dir_replacement)
        import mmap
        bashrc = os.path.expanduser('~/.bashrc')
        bash_profile = os.path.expanduser('~/.bash_profile')
        try:
            with open(bashrc) as bashrc_stream:
                mmapped = mmap.mmap(bashrc_stream.fileno(), 0, 
                                        access=mmap.ACCESS_READ)
                if mmapped.find(to_print) == -1:
                    print_to_bashrc = True
                else:
                    print_to_bashrc = False
        except (IOError, ValueError):
            # No file
            print_to_bashrc = True
        try:
            with open(bash_profile) as bash_profile_stream:
                mmapped = mmap.mmap(bash_profile_stream.fileno(), 0, 
                                        access=mmap.ACCESS_READ)
                if mmapped.find(to_print) == -1:
                    print_to_bash_profile = True
                else:
                    print_to_bash_profile = False
        except (IOError, ValueError):
            # No file
            print_to_bash_profile = True
        if print_to_bashrc:
            with open(bashrc, 'a') as bashrc_stream:
                print >>bashrc_stream, to_print
        if print_to_bash_profile:
            with open(bash_profile, 'a') as bash_profile_stream:
                print >>bash_profile_stream, to_print
        # Set 755 permissions across Rail's dirs and 644 across files
        dir_command = ['find', self.final_install_dir, '-type', 'd',
                            '-exec', 'chmod', '755', '{}', ';']
        file_command = ['find', self.final_install_dir, '-type', 'f',
                            '-exec', 'chmod', '644', '{}', ';']
        try:
            subprocess.check_output(dir_command,
                                        stderr=self.log_stream)
        except subprocess.CalledProcessError as e:
            self._print_to_screen_and_log(
                        ('Error encountered changing directory '
                         'permissions; exit code was %d; command invoked '
                         'was "%s".') %
                            (e.returncode, ' '.join(dir_command))
                    )
            self._bail()
        try:
            subprocess.check_output(file_command,
                                        stderr=self.log_stream)
        except subprocess.CalledProcessError as e:
            self._print_to_screen_and_log(
                        ('Error encountered changing file '
                         'permissions; exit code was %d; command invoked '
                         'was "%s".') %
                            (e.returncode, ' '.join(file_command))
                    )
            self._bail()
        # Go back and set 755 permissions for executables
        os.chmod(rail_exe, 0755)
        if not self.no_dependencies:
            os.chmod(pypy, 0755)
            os.chmod(fastq_dump, 0755)
            os.chmod(vdb_config, 0755)
            if not self.prep_dependencies:
                for program in [bowtie1, bowtie1_build, bowtie2, bowtie2_build,
                                samtools, bedgraphtobigwig]:
                    os.chmod(program, 0755)
                    # Also for misc. Bowtie executables
                    for program in glob.glob(
                            os.path.join(os.path.dirname(bowtie1), 'bowtie-*')
                        ):
                        os.chmod(program, 0755)
                    for program in glob.glob(
                            os.path.join(os.path.dirname(bowtie2), 'bowtie2-*')
                        ):
                        os.chmod(program, 0755)
            if self.add_symlinks:
                # Write appropriate symlinks
                self._add_symlink_to_exe(pypy)
                self._add_symlink_to_exe(fastq_dump)
                self._add_symlink_to_exe(vdb_config)
                if not self.prep_dependencies:
                    for program in [bowtie1, bowtie1_build, bowtie2,
                                    bowtie2_build, samtools, bedgraphtobigwig]:
                        self._add_symlink_to_exe(program)
        self._print_to_screen_and_log('Installed Rail-RNA.')
        # IPython much?
        try:
            import IPython
        except ImportError:
            # Guess not
            if self._yes_no_query(
                    'IPython is not installed but required for Rail-RNA to '
                    'work in its "parallel" mode.\n'
                    '    * Install IPython now?'
                ):
                temp_ipython_install_dir = tempfile.mkdtemp()
                register_cleanup(remove_temporary_directories,
                                    [temp_ipython_install_dir])
                with cd(temp_ipython_install_dir):
                    self._grab_and_explode(self.depends['ipython'], 'IPython')
                    setup_dir = os.path.dirname(find('setup.py', './'))
                    with cd(setup_dir):
                        ipython_command = [
                                    sys.executable, 'setup.py', 'install',
                                ]
                        if self.local:
                            ipython_command.append('--user')
                        try:
                            subprocess.check_output(ipython_command,
                                                        stderr=self.log_stream)
                        except subprocess.CalledProcessError as e:
                            self._print_to_screen_and_log(
                                ('Error encountered installing IPython; exit '
                                 'code was %d; command invoked was "%s".') %
                                    (e.returncode, ' '.join(ipython_command))
                            )
                            self._bail()
        install_aws = (not self.no_dependencies and not which('aws'))
        self.installed_aws = False
        if install_aws and self._yes_no_query(
                'AWS CLI is not installed but required for Rail-RNA to work '
                'in its "elastic" mode, on Amazon Elastic MapReduce.\n'
                '    * Install AWS CLI now?'
            ):
            temp_aws_install_dir = tempfile.mkdtemp()
            register_cleanup(remove_temporary_directories,
                                [temp_aws_install_dir])
            with cd(temp_aws_install_dir):
                self._grab_and_explode(self.depends['aws_cli'], 'AWS CLI')
                os.chmod('./awscli-bundle/install', 0755)
                if self.local:
                    # Local install
                    aws_command = ['./awscli-bundle/install', '-b',
                                    os.path.join(bin_dir, 'aws'),
                                   '-i', os.path.abspath(
                                        os.path.expanduser('~/.local/lib/aws')
                                    )]
                else:
                    # All users
                    aws_command = ['./awscli-bundle/install',
                                    '-i', '/usr/local/aws',
                                    '-b', '/usr/local/bin/aws']
                try:
                    subprocess.check_output(aws_command,
                                                stderr=self.log_stream)
                except (OSError, subprocess.CalledProcessError) as e:
                    self._print_to_screen_and_log(
                            ('Error encountered installing AWS CLI; exit '
                             'code was %d; command invoked was "%s".') %
                                (e.returncode, ' '.join(aws_command))
                        )
                    self._bail()
            self.installed_aws = True
        elif install_aws:
            print_to_screen('Visit http://docs.aws.amazon.com/cli/latest/'
                            'userguide/installing.html to install the '
                            'AWS CLI later.')
        self.finished = True
Example #14
0
                        os.path.realpath(__file__)))
                    )
                )
# Make the project's own packages importable before the imports below run:
# both the 'rna/utils' subdirectory and base_path itself are registered as
# site directories (site.addsitedir appends them to sys.path).
utils_path = os.path.join(base_path, 'rna', 'utils')
site.addsitedir(utils_path)
site.addsitedir(base_path)

# These imports depend on the site directories added above, so they must stay
# below the addsitedir calls.
import bowtie
from dooplicity.tools import xstream, register_cleanup, xopen, make_temp_dir
from dooplicity.counters import Counter
import tempdel

# Initialize global variable for tracking number of input lines
_input_line_count = 0
# Counter labeled with this step's name; its flush method is handed to
# register_cleanup (presumably invoked when the script terminates -- confirm
# against dooplicity.tools).
counter = Counter('align_readlets')
register_cleanup(counter.flush)

def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie_exe='bowtie',
    bowtie_index_base='genome', bowtie_args='', gzip_level=3, verbose=False,
    report_multiplier=1.2, scratch=None):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line per
        readlet belonging to a distinct read sequence.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
Example #15
0
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
    bowtie_index_base='genome', bowtie2_index_base='genome2', 
    manifest_file='manifest', bowtie2_args=None, bin_size=10000, verbose=False,
    exon_differentials=True, exon_intervals=False, report_multiplier=1.2,
    min_exon_size=8, search_filter=1, min_readlet_size=15, max_readlet_size=25,
    readlet_interval=12, capping_multiplier=1.5, drop_deletions=False,
    gzip_level=3, scratch=None, index_count=1, output_bam_by_chr=False,
    tie_margin=0, no_realign=False, no_polyA=False):
    """ Runs Rail-RNA-align_reads.

        A single pass of Bowtie is run to find end-to-end alignments. Unmapped
        reads are saved for readletizing to determine junctions in sucessive
        reduce steps as well as for realignment in a later map step.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns in a mix of any of the following
        three formats:
        Format 1 (single-end, 3-column):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 3 columns each)
        (so this is the same as single-end)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Input is partitioned and sorted by field 1, the read sequence.

        Hadoop output (written to stdout)
        ----------------------------
        A given RNAME sequence is partitioned into intervals ("bins") of some 
        user-specified length (see partition.py).

        Exonic chunks (aka ECs; three formats, any or all of which may be
        emitted):

        Format 1 (exon_ival); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. Sample index
        3. EC start (inclusive) on forward strand
        4. EC end (exclusive) on forward strand

        Format 2 (exon_diff); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. max(EC start, bin start) (inclusive) on forward strand IFF diff is
            positive and EC end (exclusive) on forward strand IFF diff is
            negative
        3. Sample index
        4. '1' if alignment from which diff originates is "unique" according to
            --tie-margin criterion; else '0'
        5. +1 or -1 * count, the number of instances of a read sequence for
            which to print exonic chunks

        Note that only unique alignments are currently output as ivals and/or
        diffs.

        Format 3 (sam); tab-delimited output tuple columns:
        Standard SAM output except fields are in different order, and the first
        field corresponds to sample label. (Fields are reordered to facilitate
        partitioning by sample name/RNAME and sorting by POS.) Each line
        corresponds to a spliced alignment. The order of the fields is as
        follows.
        1. Sample index if outputting BAMs by sample OR
                sample-rname index if outputting BAMs by chr
        2. (Number string representing RNAME; see BowtieIndexReference
            class in bowtie_index for conversion information) OR
            '0' if outputting BAMs by chr
        3. POS
        4. QNAME
        5. FLAG
        6. MAPQ
        7. CIGAR
        8. RNEXT
        9. PNEXT
        10. TLEN
        11. SEQ
        12. QUAL
        ... + optional fields

        Insertions/deletions (indel_bed)

        tab-delimited output tuple columns:
        1. 'I' or 'D' insertion or deletion line
        2. Number string representing RNAME
        3. Start position (Last base before insertion or 
            first base of deletion)
        4. End position (Last base before insertion or last base of deletion 
                            (exclusive))
        5. Inserted sequence for insertions or deleted sequence for deletions
        6. Sample index
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. '\x1c'
        8. '\x1c'
        --------------------------------------------------------------------
        9. Number of instances of insertion or deletion in sample; this is
            always +1 * count before bed_pre combiner/reducer

        Read whose primary alignment is not end-to-end

        Tab-delimited output tuple columns (unmapped):
        1. Transcriptome Bowtie 2 index group number
        2. SEQ
        3. 1 if SEQ is reverse-complemented, else 0
        4. QNAME
        5. QUAL

        Tab-delimited output tuple columns (readletized):
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Tab-delimited tuple columns (postponed_sam):
        Standard 11+ -column raw SAM output

        Single column (unique):
        1. A unique read sequence

        Two columns, exactly one line (dummy); ensures creation of junction
            index:
        1. character "-"
        2. the word "dummy"

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and junctions.
        bowtie2_exe: filename of Bowtie2 executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie1 index files associated
            with the reference.
        bowtie2_index_base: the basename of the Bowtie2 index files associated
            with the reference.
        manifest_file: filename of manifest
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie2.
        bin_size: genome is partitioned in units of bin_size for later load
            balancing.
        verbose: True iff more informative messages should be written to
            stderr.
        exon_differentials: True iff EC differentials are to be emitted.
        exon_intervals: True iff EC intervals are to be emitted.
        report_multiplier: if verbose is True, the line number of an alignment
            or read written to stderr increases exponentially with base
            report_multiplier.
        min_exon_size: minimum exon size searched for in junction_search.py
            later in pipeline; used to determine how large a soft clip on one
            side of a read is necessary to pass it on to junction search
            pipeline
        search_filter: how large a soft clip on one side of a read is necessary
            to pass it on to junction search pipeline
        min_readlet_size: "capping" readlets (that is, readlets that terminate
            at a given end of the read) are never smaller than this value
        max_readlet_size: size of every noncapping readlet
        readlet_interval: number of bases separating successive readlets along
            the read
        capping_multiplier: successive capping readlets on a given end of a
            read are increased in size exponentially with base
            capping_multiplier
        drop_deletions: True iff deletions should be dropped from coverage
            vector
        gzip_level: compression level to use for temporary files
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory
        index_count: number of transcriptome Bowtie 2 indexes to which to
            assign unmapped reads for later realignment
        output_bam_by_chr: True iff final output BAMs will be by chromosome
        tie_margin: allowed score difference per 100 bases among ties in
            max score. For example, 150 and 144 are tied alignment scores
            for a 100-bp read when --tie-margin is 6.
        no_realign: True iff job flow does not need more than readlets: this
            usually means only a transcript index is being constructed
        no_polyA: kill noncapping readlets that are all As and write as
            unmapped all reads with polyA prefixes whose suffixes are <
            min_exon_size

        No return value.
    """
    global _input_line_count
    reference_index = bowtie_index.BowtieIndexReference(bowtie_index_base)
    manifest_object = manifest.LabelsAndIndices(manifest_file)
    alignment_printer = AlignmentPrinter(
            manifest_object,
            reference_index,
            bin_size=bin_size,
            output_stream=output_stream,
            exon_ivals=exon_intervals,
            exon_diffs=exon_differentials,
            drop_deletions=drop_deletions,
            output_bam_by_chr=output_bam_by_chr,
            tie_margin=tie_margin
        )
    # Get task partition to pass to align_reads_delegate.py
    try:
        task_partition = os.environ['mapred_task_partition']
    except KeyError:
        # Hadoop 2.x?
        try:
            task_partition = os.environ['mapreduce_task_partition']
        except KeyError:
            # A unit test is probably being run
            task_partition = '0'
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    align_file = os.path.join(temp_dir, 'first_pass_reads.temp.gz')
    other_reads_file = os.path.join(temp_dir, 'other_reads.temp.gz')
    second_pass_file = os.path.join(temp_dir, 'second_pass_reads.temp.gz')
    k_value, _, _ = bowtie.parsed_bowtie_args(bowtie2_args)
    nothing_doing = True
    # Required length of prefix after poly(A) is trimmed
    remaining_seq_size = max(min_exon_size - 1, 1)
    with xopen(True, align_file, 'w', gzip_level) as align_stream, \
        xopen(True, other_reads_file, 'w', gzip_level) as other_stream:
        for seq_number, ((seq,), xpartition) in enumerate(
                                                        xstream(sys.stdin, 1)
                                                    ):
            seq_length = len(seq)
            if no_polyA and (
                    all(seq[i] == 'A' 
                         for i in xrange(seq_length - remaining_seq_size))
                    or all(seq[i] == 'T' 
                         for i in xrange(remaining_seq_size, seq_length))
                    or all(seq[i] == 'A' 
                         for i in xrange(remaining_seq_size, seq_length))
                    or all(seq[i] == 'T' 
                         for i in xrange(seq_length - remaining_seq_size))
                ):
                if not no_realign:
                    '''If a sequence is too short without its poly(A) tail,
                    make all reads with that sequence unmapped. Technically,
                    this also kills poly(A)s at 5' ends, but we probably
                    couldn't align those sequences anyway.'''
                    reversed_complement_seq = seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                    for is_reversed, name, qual in xpartition:
                        if is_reversed == '0':
                            alignment_printer.print_unmapped_read(
                                                    name,
                                                    seq,
                                                    qual
                                                )
                        else:
                            alignment_printer.print_unmapped_read(
                                                    name,
                                                    reversed_complement_seq,
                                                    qual[::-1]
                                                )
                continue
            nothing_doing = False
            '''Select highest-quality read with alphabetically last qname
            for first-pass alignment.'''
            best_name, best_mean_qual, best_qual_index, i = None, None, 0, 0
            others_to_print = dlist()
            for is_reversed, name, qual in xpartition:
                _input_line_count += 1
                others_to_print.append(
                        '\t'.join([
                            str(seq_number), is_reversed, name, qual
                        ])
                    )
                mean_qual = (
                        float(sum([ord(score) for score in qual])) / len(qual)
                    )
                if (mean_qual > best_mean_qual
                        or mean_qual == best_mean_qual and name > best_name):
                    best_qual_index = i
                    best_mean_qual = mean_qual
                    best_name = name
                    to_align = '\t'.join([
                                        '%s\x1d%s' % (is_reversed, name),
                                        seq, qual
                                    ])
                i += 1
            assert i >= 1
            if i == 1:
                print >>other_stream, str(seq_number)
            else:
                for j, other_to_print in enumerate(others_to_print):
                    if j != best_qual_index:
                        print >>other_stream, other_to_print
            print >>align_stream, to_align
    # Print dummy line
    print 'dummy\t-\tdummy'
    sys.stdout.flush() # this is REALLY important b/c called script will stdout
    if nothing_doing:
        # No input
        sys.exit(0)
    input_command = 'gzip -cd %s' % align_file
    bowtie_command = ' '.join([bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
        bowtie2_index_base, '--12 -'])
    delegate_command = ''.join(
                [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                    ('_delegate.py --task-partition {task_partition} '
                     '--other-reads {other_reads} --second-pass-reads '
                     '{second_pass_reads} --min-readlet-size '
                     '{min_readlet_size} {drop_deletions} '
                     '--max-readlet-size {max_readlet_size} '
                     '--readlet-interval {readlet_interval} '
                     '--capping-multiplier {capping_multiplier:1.12f} '
                     '{verbose} --report-multiplier {report_multiplier:1.12f} '
                     '--k-value {k_value} '
                     '--bowtie-idx {bowtie_index_base} '
                     '--partition-length {bin_size} '
                     '--manifest {manifest_file} '
                     '{exon_differentials} {exon_intervals} '
                     '--gzip-level {gzip_level} '
                     '--search-filter {search_filter} '
                     '--index-count {index_count} '
                     '--tie-margin {tie_margin} '
                     '{no_realign} '
                     '{no_polyA} '
                     '{output_bam_by_chr}').format(
                        task_partition=task_partition,
                        other_reads=other_reads_file,
                        second_pass_reads=second_pass_file,
                        min_readlet_size=min_readlet_size,
                        drop_deletions=('--drop-deletions' if drop_deletions
                                            else ''),
                        max_readlet_size=max_readlet_size,
                        readlet_interval=readlet_interval,
                        capping_multiplier=capping_multiplier,
                        verbose=('--verbose' if verbose else ''),
                        report_multiplier=report_multiplier,
                        k_value=k_value,
                        bowtie_index_base=bowtie_index_base,
                        bin_size=bin_size,
                        manifest_file=manifest_file,
                        exon_differentials=('--exon-differentials'
                                            if exon_differentials else ''),
                        exon_intervals=('--exon-intervals'
                                        if exon_intervals else ''),
                        gzip_level=gzip_level,
                        search_filter=search_filter,
                        index_count=index_count,
                        tie_margin=tie_margin,
                        no_realign=('--no-realign' if no_realign else ''),
                        no_polyA=('--no-polyA' if no_polyA else ''),
                        output_bam_by_chr=('--output-bam-by-chr'
                                            if output_bam_by_chr
                                            else '')
                     )]
            )
    full_command = ' | '.join([input_command, 
                                bowtie_command, delegate_command])
    print >>sys.stderr, \
        'Starting first-pass Bowtie 2 with command: ' + full_command
    bowtie_process = subprocess.Popen(' '.join(
                    ['set -exo pipefail;', full_command]
                ),
            bufsize=-1, stdout=sys.stdout, stderr=sys.stderr, shell=True,
            executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading first-pass Bowtie 2 '
                           'output; exitlevel was %d.' % return_code)
    os.remove(align_file)
    os.remove(other_reads_file)
    if not no_realign:
        input_command = 'gzip -cd %s' % second_pass_file
        bowtie_command = ' '.join([bowtie2_exe,
            bowtie2_args if bowtie2_args is not None else '',
            ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
            bowtie2_index_base, '--12 -'])
        delegate_command = ''.join(
                    [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                        ('_delegate.py --task-partition {task_partition} '
                         '--min-readlet-size {min_readlet_size} '
                         '{drop_deletions} '
                         '--max-readlet-size {max_readlet_size} '
                         '--readlet-interval {readlet_interval} '
                         '--capping-multiplier {capping_multiplier:012f} '
                         '{verbose} '
                         '--report-multiplier {report_multiplier:012f} '
                         '--k-value {k_value} '
                         '--bowtie-idx {bowtie_index_base} '
                         '--partition-length {bin_size} '
                         '--manifest {manifest_file} '
                         '{exon_differentials} {exon_intervals} '
                         '--gzip-level {gzip_level} '
                         '--search-filter {search_filter} ' 
                         '--index-count {index_count} '
                         '--tie-margin {tie_margin} '
                         '{output_bam_by_chr}').format(
                            task_partition=task_partition,
                            min_readlet_size=min_readlet_size,
                            drop_deletions=('--drop-deletions'
                                                if drop_deletions else ''),
                            readlet_interval=readlet_interval,
                            max_readlet_size=max_readlet_size,
                            capping_multiplier=capping_multiplier,
                            verbose=('--verbose' if verbose else ''),
                            report_multiplier=report_multiplier,
                            k_value=k_value,
                            bowtie_index_base=bowtie_index_base,
                            bin_size=bin_size,
                            manifest_file=manifest_file,
                            exon_differentials=('--exon-differentials'
                                                if exon_differentials else ''),
                            exon_intervals=('--exon-intervals'
                                            if exon_intervals else ''),
                            gzip_level=gzip_level,
                            search_filter=search_filter,
                            index_count=index_count,
                            tie_margin=tie_margin,
                            output_bam_by_chr=('--output-bam-by-chr'
                                                if output_bam_by_chr
                                                else '')
                         )]
                )
        full_command = ' | '.join([input_command, 
                                    bowtie_command, delegate_command])
        print >>sys.stderr, \
            'Starting second-pass Bowtie 2 with command: ' + full_command
        bowtie_process = subprocess.Popen(' '.join(
                        ['set -exo pipefail;', full_command]
                    ),
                bufsize=-1, stdout=sys.stdout, stderr=sys.stderr, shell=True,
                executable='/bin/bash')
        return_code = bowtie_process.wait()
        if return_code:
            raise RuntimeError('Error occurred while reading second-pass '
                               'Bowtie 2 output; exitlevel was %d.'
                                % return_code)
    sys.stdout.flush()
Example #16
0
def go(
    input_stream=sys.stdin,
    output_stream=sys.stdout,
    bowtie_exe="bowtie",
    bowtie_index_base="genome",
    bowtie_args="",
    gzip_level=3,
    verbose=False,
    report_multiplier=1.2,
    scratch=None,
):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line per
        readlet belonging to a distinct read sequence.

        The function itself only (1) spools the input into two gzipped
        temporary files -- one with the readlet sequences for Bowtie, one with
        the qnames grouped per readlet sequence -- and (2) runs the shell
        pipeline "gzip -cd <readlets> | bowtie ... | <this module>_delegate.py".
        The output lines described below are emitted by the delegate script.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Input is partitioned by field 1, the readlet sequence or its reversed
        complement.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited output tuple columns, where each line corresponds to a
        readlet from a distinct read rather than a unique readlet sequence:
        1. Read sequence ID
        2. Displacement of readlet's 5' end from read's 5' end + '\x1e' +
            displacement of readlet's 3' end from read's 3' end (+, for EXACTLY
            one readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            number of instances of read sequence + '\x1e' + number of instances
            of read sequence's reversed complement + '\x1e' (+, for EXACTLY one
            readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence))] + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective
            sample in list A) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence's reversed complement for
            each respective sample in list B)
            NOTE(review): the bracketing of field 2 above is unbalanced and
            partly repeated; confirm the exact layout against the delegate
            script before relying on it.
        3. '\x1f'-separated list of alignment RNAMEs or '\x1c' if no alignments
        4. '\x1f'-separated list of alignment FLAGs or '\x1c' if no alignments
        5. '\x1f'-separated list of alignment POSes or '\x1c' if no alignments

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input readlets.
        output_stream: accepted for interface compatibility but not used by
            this function; the Bowtie/delegate pipeline is started with its
            stdout attached to sys.stdout.
        bowtie_exe: filename of Bowtie executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie, e.g., "--tryhard --best"; or None
            for no extra arguments.
        gzip_level: level of gzip compression passed to xopen for the
            temporary qname and readlet files
        verbose: if True, --verbose is passed to the delegate script.
        report_multiplier: forwarded to the delegate script as
            --report-multiplier; if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
        scratch: scratch directory for storing temporary files or None if
            securely created temporary directory

        Raises RuntimeError if the shell pipeline exits with a nonzero status.

        No return value.
    """
    global _input_line_count
    # The docstring allows None; str.join would raise TypeError on it
    if bowtie_args is None:
        bowtie_args = ""
    # For storing long qnames
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    qnames_file = os.path.join(temp_dir, "qnames.temp.gz")
    readlet_file = os.path.join(temp_dir, "readlets.temp.gz")
    with xopen(True, qnames_file, "w", gzip_level) as qname_stream:
        with xopen(True, readlet_file, "w", gzip_level) as readlet_stream:
            for (seq_count, ((seq,), xpartition)) in enumerate(xstream(input_stream, 1)):
                # One Bowtie input record per distinct readlet sequence: name
                # is the running index, qualities are all 'I'
                print >> readlet_stream, "\t".join([str(seq_count), seq, "I" * len(seq)])
                # NOTE(review): the first tuple is consumed here and so is not
                # counted in _input_line_count below -- this assumes xpartition
                # is a one-shot iterator; confirm against xstream
                print >> qname_stream, next(iter(xpartition))[0]
                for (qname,) in xpartition:
                    _input_line_count += 1
                    print >> qname_stream, qname
                # Separate qnames with single + character
                print >> qname_stream, "+"
    input_command = "gzip -cd %s" % readlet_file
    bowtie_command = " ".join([bowtie_exe, bowtie_args, "-S -t --sam-nohead --mm", bowtie_index_base, "--12 -"])
    # Delegate lives next to this module as <module>_delegate.py; splitext
    # copes with __file__ ending in ".pyc" as well as ".py"
    delegate_script = os.path.splitext(os.path.realpath(__file__))[0] + "_delegate.py"
    delegate_command = "%s %s --report-multiplier %08f --qnames-file %s %s" % (
        sys.executable,
        delegate_script,
        report_multiplier,
        qnames_file,
        "--verbose" if verbose else "",
    )
    full_command = " | ".join([input_command, bowtie_command, delegate_command])
    print >>sys.stderr, "Starting Bowtie with command: " + full_command
    # pipefail makes the pipeline's exit status nonzero if ANY stage fails
    bowtie_process = subprocess.Popen(
        " ".join(["set -exo pipefail;", full_command]),
        bufsize=-1,
        stdout=sys.stdout,
        stderr=sys.stderr,
        shell=True,
        executable="/bin/bash",
    )
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError("Error occurred while reading Bowtie output; " "exitlevel was %d." % return_code)
Example #17
0
    def install(self):
        """ Installs Rail-RNA and all its dependencies. """
        if not self.no_dependencies and self.curl_exe is None:
            self.curl_exe = which('curl')
            if self.curl_exe is None:
                print_to_screen('Rail-RNA\'s installer requires Curl if '
                                'dependencies are to be installed. '
                                'Download it at '
                                'http://curl.haxx.se/download.html and use '
                                '--curl to specify its path, or '
                                'disable installing dependencies with '
                                '--no-dependencies.')
                sys.exit(1)
        if self._yes_no_query(
                'Rail-RNA can be installed for all users or just the '
                'current user.\n    * Install for all users?',
                answer=(None if not self.yes else (self.yes and not self.me))
            ):
            if os.getuid():
                print_to_screen('Rerun with sudo privileges to install '
                                'for all users.')
                sys.exit(0)
            install_dir = '/usr/local'
            self.local = False
        else:
            install_dir = os.path.abspath(os.path.expanduser('~/'))
            self.local = True
        bin_dir = os.path.join(install_dir, 'bin')
        rail_exe = os.path.join(bin_dir, 'rail-rna')
        if self.install_dir is None:
            self.final_install_dir = os.path.join(install_dir, 'raildotbio')
        else:
            # User specified an installation directory
            self.final_install_dir = self.install_dir
        # Install in a temporary directory first, then move to final dest
        temp_install_dir = tempfile.mkdtemp()
        register_cleanup(remove_temporary_directories, [temp_install_dir])
        if os.path.exists(self.final_install_dir):
            if self._yes_no_query(
                    ('The installation path {dir} already exists.\n    '
                    '* Overwrite {dir}?').format(dir=self.final_install_dir)
                ):
                try:
                    shutil.rmtree(self.final_install_dir)
                except OSError:
                    # Handle this later if directory creation fails
                    pass
                try:
                    os.remove(self.final_install_dir)
                except OSError:
                    pass
            else:
                print_to_screen(
                        'Specify a different installation directory with '
                        '--install-dir.'
                    )
                sys.exit(0)
        self._print_to_screen_and_log('[Installing] Extracting Rail-RNA...',
                                        newline=False,
                                        carriage_return=True)
        try:
            os.makedirs(self.final_install_dir)
        except OSError as e:
            self._print_to_screen_and_log(
                            ('Problem encountered trying to create '
                             'directory %s for installation. May need '
                             'sudo permissions.') % self.final_install_dir
                        )
            self._bail()
        else:
            # So it's possible to move temp installation dir there
            os.rmdir(self.final_install_dir)
            pass
        with cd(temp_install_dir):
            with zipfile.ZipFile(self.zip_name) as zip_object:
                zip_object.extractall('./rail-rna')
            if not self.no_dependencies:
                self._grab_and_explode(self.depends['pypy'], 'PyPy')
                self._grab_and_explode(self.depends['sra_tools'], 'SRA Tools')
                if not self.prep_dependencies:
                    self._grab_and_explode(self.depends['bowtie1'], 'Bowtie 1')
                    self._grab_and_explode(self.depends['bowtie2'], 'Bowtie 2')
                    self._grab_and_explode(self.depends['bedgraphtobigwig'],
                                            'BedGraphToBigWig')
                    self._grab_and_explode(self.depends['samtools'],
                                                                'SAMTools')
            if not self.prep_dependencies and not self.no_dependencies:
                # Have to make SAMTools (annoying; maybe change this)
                samtools_dir = os.path.join(temp_install_dir,
                        self.depends['samtools'][0].rpartition('/')[2][:-8]
                    )
                with cd(samtools_dir):
                    '''Make sure unistd.h is #included cram_io.c ... it's some
                    bug in some SAMTools that prevents compilation on
                    langmead-fs1, which may be a general problem with
                    portability. See https://github.com/samtools/htslib/commit/
                    0ec5202de5691b27917ce828a9d24c9c729a9b81'''
                    cram_io_file = os.path.join(glob.glob('./htslib-*')[0],
                                                    'cram', 'cram_io.c')
                    with open(cram_io_file) as cram_io_stream:
                        all_cram_io = cram_io_stream.read()
                    if '<unistd.h>' not in all_cram_io:
                        with open(cram_io_file, 'w') as cram_io_out_stream:
                            cram_io_out_stream.write(all_cram_io.replace(
                                    '#include <string.h>',
                                    '#include <string.h>\n#include <unistd.h>'
                                ))
                    makefile = 'Makefile'
                    with open(makefile) as makefile_stream:
                        all_makefile = makefile_stream.read()
                    with open(makefile, 'w') as makefile_stream:
                        makefile_stream.write(
                            all_makefile.replace(
                                    '-D_CURSES_LIB=1', '-D_CURSES_LIB=0'
                                ).replace('LIBCURSES=','#LIBCURSES=')
                        )
                    # Make on all but one cylinder
                    thread_count = max(1, multiprocessing.cpu_count() - 1)
                    samtools_command = ['make', '-j%d' % thread_count]
                    self._print_to_screen_and_log(
                                '[Installing] Making SAMTools...',
                                newline=False,
                                carriage_return=True
                            )
                    try:
                        subprocess.check_output(samtools_command,
                                                    stderr=self.log_stream)
                    except subprocess.CalledProcessError as e:
                        self._print_to_screen_and_log(
                                ('Error encountered making SAMTools; exit '
                                 'code was %d; command invoked was "%s".') %
                                    (e.returncode, ' '.join(samtools_command))
                            )
                        self._bail()
                samtools = os.path.join(self.final_install_dir,
                        self.depends['samtools'][0].rpartition('/')[2][:-8],
                        'samtools')
                bowtie1_base = '-'.join(
                    self.depends['bowtie1'][0].rpartition(
                            '/'
                        )[2].split('-')[:2]
                )
                bowtie1 = os.path.join(self.final_install_dir,
                                                bowtie1_base, 'bowtie')
                bowtie1_build = os.path.join(self.final_install_dir,
                                                bowtie1_base, 'bowtie-build')
                bowtie2_base = '-'.join(
                    self.depends['bowtie2'][0].rpartition(
                            '/'
                        )[2].split('-')[:2]
                )
                bowtie2 = os.path.join(self.final_install_dir,
                                                bowtie2_base, 'bowtie2')
                bowtie2_build = os.path.join(self.final_install_dir,
                                                bowtie2_base, 'bowtie2-build')
                bedgraphtobigwig = os.path.join(
                                                self.final_install_dir,
                                                'bedGraphToBigWig'
                                            )
            else:
                bowtie1 = bowtie1_build = bowtie2 = bowtie2_build \
                    = bedgraphtobigwig = samtools = 'None'
            if self.no_dependencies:
                pypy = 'None'
                fastq_dump = 'None'
                vdb_config = 'None'
            else:
                pypy = os.path.join(self.final_install_dir,
                        self.depends['pypy'][0].rpartition(
                                '/'
                            )[2][:-8], 'bin', 'pypy'
                    )
                fastq_dump = os.path.join(self.final_install_dir,
                                self.depends['sra_tools'][0].rpartition(
                                '/'
                            )[2][:-7], 'bin', 'fastq-dump'
                    )
                vdb_config = os.path.join(self.final_install_dir,
                                self.depends['sra_tools'][0].rpartition(
                                '/'
                            )[2][:-7], 'bin', 'vdb-config'
                    )
            # Write paths to exe_paths
            with open(
                            os.path.join(temp_install_dir, 'rail-rna',
                                            'exe_paths.py'), 'w'
                        ) as exe_paths_stream:
                print >>exe_paths_stream, (
"""\"""
exe_paths.py
Part of Rail-RNA

Defines default paths of Rail-RNA's executable dependencies. Set a given
variable equal to None if the default path should be in PATH.
\"""

pypy = {pypy}
aws = None
curl = None
sort = None
bowtie1 = {bowtie1}
bowtie1_build = {bowtie1_build}
bowtie2 = {bowtie2}
bowtie2_build = {bowtie2_build}
samtools = {samtools}
bedgraphtobigwig = {bedgraphtobigwig}
fastq_dump = {fastq_dump}
vdb_config = {vdb_config}
"""
                ).format(pypy=self._quote(pypy), bowtie1=self._quote(bowtie1),
                            bowtie1_build=self._quote(bowtie1_build),
                            bowtie2=self._quote(bowtie2),
                            bowtie2_build=self._quote(bowtie2_build),
                            samtools=self._quote(samtools),
                            bedgraphtobigwig=self._quote(bedgraphtobigwig),
                            fastq_dump=self._quote(fastq_dump),
                            vdb_config=self._quote(vdb_config))
        # Move to final directory
        try:
            shutil.move(temp_install_dir, self.final_install_dir)
        except Exception as e:
            self._print_to_screen_and_log(('Problem "%s" encountered moving '
                                           'temporary installation directory '
                                           '%s to final destination %s.') % (
                                                e,
                                                temp_install_dir,
                                                self.final_install_dir
                                            ))
            self._bail()
        # Create shell-script executable
        try:
            os.makedirs(bin_dir)
        except Exception as e:
            if not os.path.isdir(bin_dir):
                self._print_to_screen_and_log(('Problem "%s" encountered '
                                               'creating directory %s.') % (
                                                    e,
                                                    bin_dir
                                                )
                                            )
                self._bail()
        with open(rail_exe, 'w') as rail_exe_stream:
            print >>rail_exe_stream, (
"""#!/usr/bin/env bash

{python_executable} {install_dir} $@
"""
                ).format(python_executable=sys.executable,
                            install_dir=os.path.join(
                                self.final_install_dir, 'rail-rna'
                            ))
        if self.local:
            '''Have to add Rail to PATH. Do this in bashrc and bash_profile
            contingent on whether it's present already because of
            inconsistent behavior across Mac OS and Linux distros.'''
            to_print = (
"""
## Rail-RNA additions
if [ -d "{bin_dir}" ] && [[ ":$PATH:" != *":{bin_dir}:"* ]]; then
    PATH="${{PATH:+"$PATH:"}}{bin_dir}"
fi
## End Rail-RNA additions
"""
                ).format(bin_dir=bin_dir)
            import mmap
            bashrc = os.path.expanduser('~/.bashrc')
            bash_profile = os.path.expanduser('~/.bash_profile')
            try:
                with open(bashrc) as bashrc_stream:
                    mmapped = mmap.mmap(bashrc_stream.fileno(), 0, 
                                            access=mmap.ACCESS_READ)
                    if mmapped.find(to_print) == -1:
                        print_to_bashrc = True
                    else:
                        print_to_bashrc = False
            except (IOError, ValueError):
                # No file
                print_to_bashrc = True
            try:
                with open(bash_profile) as bash_profile_stream:
                    mmapped = mmap.mmap(bash_profile_stream.fileno(), 0, 
                                            access=mmap.ACCESS_READ)
                    if mmapped.find(to_print) == -1:
                        print_to_bash_profile = True
                    else:
                        print_to_bash_profile = False
            except (IOError, ValueError):
                # No file
                print_to_bash_profile = True
            if print_to_bashrc:
                with open(bashrc, 'a') as bashrc_stream:
                    print >>bashrc_stream, to_print
            if print_to_bash_profile:
                with open(bash_profile, 'a') as bash_profile_stream:
                    print >>bash_profile_stream, to_print
        # Set 755 permissions across Rail's dirs and 644 across files
        dir_command = ['find', self.final_install_dir, '-type', 'd',
                            '-exec', 'chmod', '755', '{}', ';']
        file_command = ['find', self.final_install_dir, '-type', 'f',
                            '-exec', 'chmod', '644', '{}', ';']
        try:
            subprocess.check_output(dir_command,
                                        stderr=self.log_stream)
        except subprocess.CalledProcessError as e:
            self._print_to_screen_and_log(
                        ('Error encountered changing directory '
                         'permissions; exit code was %d; command invoked '
                         'was "%s".') %
                            (e.returncode, ' '.join(dir_command))
                    )
            self._bail()
        try:
            subprocess.check_output(file_command,
                                        stderr=self.log_stream)
        except subprocess.CalledProcessError as e:
            self._print_to_screen_and_log(
                        ('Error encountered changing file '
                         'permissions; exit code was %d; command invoked '
                         'was "%s".') %
                            (e.returncode, ' '.join(file_command))
                    )
            self._bail()
        # Go back and set 755 permissions for executables
        os.chmod(rail_exe, 0755)
        if not self.no_dependencies:
            os.chmod(pypy, 0755)
            os.chmod(fastq_dump, 0755)
            os.chmod(vdb_config, 0755)
            if not self.prep_dependencies:
                for program in [bowtie1, bowtie1_build, bowtie2, bowtie2_build,
                                samtools, bedgraphtobigwig]:
                    os.chmod(program, 0755)
                    # Also for misc. Bowtie executables
                    for program in glob.glob(
                            os.path.join(os.path.dirname(bowtie1), 'bowtie-*')
                        ):
                        os.chmod(program, 0755)
                    for program in glob.glob(
                            os.path.join(os.path.dirname(bowtie2), 'bowtie2-*')
                        ):
                        os.chmod(program, 0755)
            if self.add_symlinks:
                # Write appropriate symlinks
                self._add_symlink_to_exe(pypy)
                self._add_symlink_to_exe(fastq_dump)
                self._add_symlink_to_exe(vdb_config)
                if not self.prep_dependencies:
                    for program in [bowtie1, bowtie1_build, bowtie2,
                                    bowtie2_build, samtools, bedgraphtobigwig]:
                        self._add_symlink_to_exe(program)
        self._print_to_screen_and_log('Installed Rail-RNA.')
        # IPython much?
        try:
            import IPython
        except ImportError:
            # Guess not
            if self._yes_no_query(
                    'IPython is not installed but required for Rail-RNA to '
                    'work in its "parallel" mode.\n'
                    '    * Install IPython now?'
                ):
                temp_ipython_install_dir = tempfile.mkdtemp()
                register_cleanup(remove_temporary_directories,
                                    [temp_ipython_install_dir])
                with cd(temp_ipython_install_dir):
                    self._grab_and_explode(self.depends['ipython'], 'IPython')
                    setup_dir = os.path.dirname(find('setup.py', './'))
                    with cd(setup_dir):
                        ipython_command = [
                                    sys.executable, 'setup.py', 'install',
                                ]
                        if self.local:
                            ipython_command.append('--user')
                        try:
                            subprocess.check_output(ipython_command,
                                                        stderr=self.log_stream)
                        except subprocess.CalledProcessError as e:
                            self._print_to_screen_and_log(
                                ('Error encountered installing IPython; exit '
                                 'code was %d; command invoked was "%s".') %
                                    (e.returncode, ' '.join(ipython_command))
                            )
                            self._bail()
        install_aws = (not self.no_dependencies and not which('aws'))
        self.installed_aws = False
        if install_aws and self._yes_no_query(
                'AWS CLI is not installed but required for Rail-RNA to work '
                'in its "elastic" mode, on Amazon Elastic MapReduce.\n'
                '    * Install AWS CLI now?'
            ):
            temp_aws_install_dir = tempfile.mkdtemp()
            register_cleanup(remove_temporary_directories,
                                [temp_aws_install_dir])
            with cd(temp_aws_install_dir):
                self._grab_and_explode(self.depends['aws_cli'], 'AWS CLI')
                os.chmod('./awscli-bundle/install', 0755)
                if self.local:
                    # Local install
                    aws_command = ['./awscli-bundle/install', '-b',
                                    os.path.join(bin_dir, 'aws'),
                                   '-i', os.path.abspath(
                                        os.path.expanduser('~/.local/lib/aws')
                                    )]
                else:
                    # All users
                    aws_command = ['./awscli-bundle/install',
                                    '-i', '/usr/local/aws',
                                    '-b', '/usr/local/bin/aws']
                try:
                    subprocess.check_output(aws_command,
                                                stderr=self.log_stream)
                except (OSError, subprocess.CalledProcessError) as e:
                    self._print_to_screen_and_log(
                            ('Error encountered installing AWS CLI; exit '
                             'code was %d; command invoked was "%s".') %
                                (e.returncode, ' '.join(aws_command))
                        )
                    self._bail()
            self.installed_aws = True
        elif install_aws:
            print_to_screen('Visit http://docs.aws.amazon.com/cli/latest/'
                            'userguide/installing.html to install the '
                            'AWS CLI later.')
        self.finished = True