Beispiel #1
0
    )
# Optional switch: when given, status messages are printed periodically to
# stderr so the job is kept alive (see help text)
parser.add_argument(
    '--keep-alive',
    action='store_const',
    const=True,
    default=False,
    help=('Periodically print Hadoop status messages to stderr to keep '
          'job alive')
)

args = parser.parse_args()

# Construct the keep-alive thread now; it is started only after the first
# line of stdin has been read (below)
if args.keep_alive:
    from dooplicity.tools import KeepAlive
    keep_alive_thread = KeepAlive(sys.stderr)

# Running tallies of lines read and written
input_line_count = 0
output_line_count = 0
counter = Counter('realign_reads_delegate')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

# Must consume a line of stdin before outputting status messages
line = sys.stdin.readline()
if args.keep_alive:
    keep_alive_thread.start()

if args.type == 1:
    last_key, totals, write_line = None, [0] * args.value_count, False
    while True:
        counter.add('type1_inputs')
        if not line:
            if last_key is None:
                # Input is empty
                break
            else:
Beispiel #2
0
from collections import defaultdict
import random

# Base directory: resolve this file's real path, climb three directory
# levels, then make the result absolute
base_path = os.path.realpath(__file__)
base_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))
base_path = os.path.abspath(base_path)
utils_path = os.path.join(base_path, 'rna', 'utils')
# Register both directories as site directories before the imports below
site.addsitedir(utils_path)
site.addsitedir(base_path)

from dooplicity.tools import xstream, register_cleanup
from dooplicity.counters import Counter
from alignment_handlers import (multiread_with_junctions,
                                indels_junctions_exons_mismatches)

counter = Counter('cojunction_enum_delegate')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

import string
# Translation table swapping each base with its complement
# (A <-> T, C <-> G)
_reversed_complement_translation_table = string.maketrans('ATCG', 'TAGC')


def cojunction_length(cojunction):
    """ Computes number of exonic bases spanned by cojunction

        cojunction: list of junctions [(intron_start, intron_end, ...), ...]

        Return value: number of exonic bases spanned by cojunction
    """
    return sum([
        cojunction[i][0] - cojunction[i - 1][1]
Beispiel #3
0
    os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))
# Register <base_path>/rna/utils and base_path as site directories before
# the imports below
utils_path = os.path.join(base_path, 'rna', 'utils')
site.addsitedir(utils_path)
site.addsitedir(base_path)

from dooplicity.tools import xstream, register_cleanup
from dooplicity.counters import Counter
from alignment_handlers import (
    multiread_with_junctions, AlignmentPrinter, multiread_to_report
)
import partition
import manifest
import bowtie
import bowtie_index

# Counter for this script; its flush method is handed to register_cleanup
counter = Counter('count_inputs')
register_cleanup(counter.flush)

if __name__ == '__main__':
    # Print file's docstring if -h is invoked
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # Add command-line arguments
    parser.add_argument('--verbose',
                        action='store_const',
                        const=True,
                        default=False,
                        help='Print out extra debugging statements')
    parser.add_argument('--exon-differentials',
                        action='store_const',
Beispiel #4
0
# Register <base_path>/rna/utils and base_path as site directories before
# the imports below
utils_path = os.path.join(base_path, 'rna', 'utils')
site.addsitedir(utils_path)
site.addsitedir(base_path)

from dooplicity.tools import (xstream, register_cleanup, xopen,
                              make_temp_dir)
from dooplicity.counters import Counter
import bowtie
import argparse
import tempdel
import itertools
from copy import copy

# Module-level tally of input lines read so far
_input_line_count = 0
counter = Counter('realign_reads')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

# Translation table swapping each base with its complement
# (A <-> T, C <-> G)
_reversed_complement_translation_table = string.maketrans('ATCG', 'TAGC')


def input_files_from_input_stream(input_stream,
                                  output_stream,
                                  temp_dir_path=None,
                                  verbose=False,
                                  gzip_level=3):
    """ Generates FASTA reference to index and file with reads.

        Each line of the read file is in the following format:

        read number <TAB> SEQ <TAB> QUAL
Beispiel #5
0
# When requested, create and immediately start the keep-alive thread,
# which is given stderr as its stream
if args.keep_alive:
    from dooplicity.tools import KeepAlive
    keep_alive_thread = KeepAlive(sys.stderr)
    keep_alive_thread.start()

import time
start_time = time.time()

# Environment variables in the index path are expanded before use
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx)
)
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(
    os.path.expandvars(args.manifest)
)
# Output goes to --out when given, otherwise to the current working
# directory
if args.out is None:
    output_url = Url(os.getcwd())
else:
    output_url = Url(args.out)
input_line_count = 0
counter = Counter('bed')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

# Prepare the place output is written to: a local directory is created
# directly, while a non-local URL gets a FileMover plus a temporary
# directory that is registered for removal during cleanup.
if output_url.is_local:
    # Set up destination directory
    try:
        os.makedirs(output_url.to_url())
    except OSError:
        # Best effort: os.makedirs raises OSError when the directory already
        # exists. Only OSError is ignored so that KeyboardInterrupt,
        # SystemExit and programming errors are no longer swallowed.
        pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
for (line_type, sample_label), xpartition in xstream(sys.stdin, 2):
Beispiel #6
0
    'followed by ".[sample label].bw"; if basename is an empty string, '
    'a sample\'s bigwig filename is simply [sample label].bw')
# Boolean switches; both default to False and become True when present
parser.add_argument(
    '--keep-alive', action='store_true',
    help='Prints reporter:status:alive messages to stderr to keep EMR '
    'task alive'
)
parser.add_argument(
    '--verbose', action='store_true',
    help='Print out extra debugging statements'
)

counter = Counter('coverage')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

# Let the helper modules contribute their own command-line options
filemover.add_args(parser)
bowtie.add_args(parser)
tempdel.add_args(parser)
args = parser.parse_args()

# Start keep_alive thread immediately
if args.keep_alive:
    from dooplicity.tools import KeepAlive
    keep_alive_thread = KeepAlive(sys.stderr)
    keep_alive_thread.start()

if args.keep_alive:
Beispiel #7
0
    os.path.expandvars(args.bowtie_idx))
# Build the manifest helper from the manifest path, with environment
# variables expanded
manifest_object = manifest.LabelsAndIndices(os.path.expandvars(args.manifest))
# Unpack the three values parsed out of the Bowtie argument string
alignment_count_to_report, seed, non_deterministic \
    = bowtie.parsed_bowtie_args(bowtie_args)

# Printer that writes to stdout; the remaining options are forwarded
# unchanged from the parsed command-line arguments
alignment_printer = AlignmentPrinter(manifest_object,
                                     reference_index,
                                     output_stream=sys.stdout,
                                     bin_size=args.partition_length,
                                     exon_ivals=args.exon_intervals,
                                     exon_diffs=args.exon_differentials,
                                     drop_deletions=args.drop_deletions,
                                     output_bam_by_chr=args.output_bam_by_chr,
                                     tie_margin=args.tie_margin)
# Running tallies of lines read and written
input_line_count, output_line_count = 0, 0
counter = Counter('break_ties')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)
# Wall-clock reference point taken just before input processing begins
start_time = time.time()

for (qname, ), xpartition in xstream(sys.stdin, 1):
    alignments = [(qname, ) + alignment for alignment in xpartition]
    input_line_count += len(alignments)
    junction_counts = [alignment[5].count('N') for alignment in alignments]
    min_junction_count = min(junction_counts)
    if not min_junction_count:
        '''There is at least one alignment that overlaps no junctions; report 
        an alignment with the highest score at random. Separate into alignments
        that overlap the fewest junctions and alignments that don't.'''
        counter.add('no_junction_partitions')
        clipped_alignments = [
            alignments[i] for i in xrange(len(junction_counts))
Beispiel #8
0
# Output goes to --out when given, otherwise to the current working directory
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
# Prepare the place output is written to: a local directory is created
# directly, while a non-local URL gets a FileMover plus a temporary
# directory that is registered for removal during cleanup.
if output_url.is_local:
    # Set up destination directory
    try:
        os.makedirs(output_url.to_url())
    except OSError:
        # Best effort: os.makedirs raises OSError when the directory already
        # exists. Only OSError is ignored so that KeyboardInterrupt,
        # SystemExit and programming errors are no longer swallowed.
        pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])

# Tally of input lines read (previously initialized twice; once suffices)
input_line_count = 0
counter = Counter('tsv')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

for (line_type,), xpartition in xstream(sys.stdin, 1):
    type_string = ('insertions' if line_type == '0' else
                    ('deletions' if line_type == '1' else
                      ('junctions' if line_type == '2' else
                         ('coverages' + line_type[1:]
                            if line_type.startswith('3') else
                                'normalization'))))
    counter.add(type_string + '_partitions')
    output_filename = ((args.tsv_basename + '.'
                          if args.tsv_basename != '' else '')
                          + type_string + '.tsv.gz')
    if output_url.is_local:
        output_path = os.path.join(args.out, output_filename)
Beispiel #9
0
import time

# Base directory: resolve this file's real path, climb three directory
# levels, then make the result absolute
base_path = os.path.realpath(__file__)
base_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))
base_path = os.path.abspath(base_path)
utils_path = os.path.join(base_path, 'rna', 'utils')
# Register both directories as site directories before the imports below
site.addsitedir(utils_path)
site.addsitedir(base_path)

from dooplicity.tools import xstream, register_cleanup
from dooplicity.counters import Counter
import manifest

# Counter for this script; its flush method is handed to register_cleanup
counter = Counter('junction_filter')
register_cleanup(counter.flush)

def go(manifest_object, input_stream=sys.stdin, output_stream=sys.stdout,
        sample_fraction=0.05, coverage_threshold=5, collect_junctions=False,
        verbose=False):
    """ Runs Rail-RNA-junction_filter.

        Filters out every junction from input_stream that is not either:
          (1) in round(sample_fraction * (total number of samples)) samples OR
          (2) found in at least coverage_threshold reads in at least one
            sample.

        Input (read from stdin)
        ----------------------------
        Tab-delimited columns:
Beispiel #10
0
# Register <base_path>/rna/utils and base_path as site directories before
# the imports below
utils_path = os.path.join(base_path, 'rna', 'utils')
site.addsitedir(utils_path)
site.addsitedir(base_path)

import bowtie
import bowtie_index
from dooplicity.tools import xstream, register_cleanup
from dooplicity.counters import Counter

# Print file's docstring if -h is invoked
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter
)
# Let the bowtie module contribute its own command-line options
bowtie.add_args(parser)
args = parser.parse_args()
# Running tallies of lines read and written
input_line_count = 0
output_line_count = 0
counter = Counter('junction_collect')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

start_time = time.time()

# Environment variables in the index path are expanded before use
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx)
)
for (_, rname_string, intron_pos, intron_end_pos,
        sense, sample_index), xpartition in xstream(sys.stdin, 6):
    counter.add('partitions')
    coverage = 0
    for value in xpartition:
        counter.add('inputs')
        input_line_count += 1
        try:
Beispiel #11
0
    keep_alive_thread.start()

import time
start_time = time.time()

# Environment variables in the index path are expanded before use
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx)
)
# For mapping sample indices back to original sample labels
manifest_object = manifest.LabelsAndIndices(
    os.path.expandvars(args.manifest)
)
# Output goes to --out when given, otherwise to the current working
# directory
if args.out is None:
    output_url = Url(os.getcwd())
else:
    output_url = Url(args.out)
input_line_count = 0
counter = Counter('collect_read_stats')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)
# Prepare the place output is written to: a local directory is created
# directly, while a non-local URL gets a FileMover plus a temporary
# directory that is registered for removal during cleanup.
if output_url.is_local:
    # Set up destination directory
    try:
        os.makedirs(output_url.to_url())
    except OSError:
        # Best effort: os.makedirs raises OSError when the directory already
        # exists. Only OSError is ignored so that KeyboardInterrupt,
        # SystemExit and programming errors are no longer swallowed.
        pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])

# Prefix the fixed filename with "<tsv_basename>." only when a nonempty
# basename was supplied
if args.tsv_basename != '':
    output_filename = args.tsv_basename + '.' + 'counts.tsv.gz'
else:
    output_filename = '' + 'counts.tsv.gz'
Beispiel #12
0
# Base directory: resolve this file's real path, climb three directory
# levels, then make the result absolute
base_path = os.path.realpath(__file__)
base_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))
base_path = os.path.abspath(base_path)
utils_path = os.path.join(base_path, 'rna', 'utils')
# Register both directories as site directories before the imports below
site.addsitedir(utils_path)
site.addsitedir(base_path)

import bowtie
from dooplicity.tools import (xstream, register_cleanup, xopen,
                              make_temp_dir)
from dooplicity.counters import Counter
import tempdel

# Module-level tally of input lines read so far
_input_line_count = 0
counter = Counter('align_readlets')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)


def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie_exe='bowtie',
       bowtie_index_base='genome',
       bowtie_args='',
       gzip_level=3,
       verbose=False,
       report_multiplier=1.2,
       scratch=None):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line per
Beispiel #13
0
        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))
# Register <base_path>/rna/utils and base_path as site directories before
# the imports below
utils_path = os.path.join(base_path, 'rna', 'utils')
site.addsitedir(utils_path)
site.addsitedir(base_path)

import bowtie
from dooplicity.tools import (xstream, register_cleanup, xopen,
                              make_temp_dir)
from dooplicity.counters import Counter
from dooplicity.ansibles import Url
import tempdel
import filemover

# Module-level tally of input lines read so far
_input_line_count = 0
counter = Counter('cojunction_enum')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)


def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie2_index_base='genome',
       bowtie2_args='',
       verbose=False,
       report_multiplier=1.2,
       stranded=False,
       fudge=5,
       max_refs=300,
       score_min=60,
       gzip_level=3,
Beispiel #14
0
from dooplicity.tools import xstream, dlist, register_cleanup
from dooplicity.counters import Counter
import group_reads

# The module docstring doubles as the -h description, printed verbatim
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter
)
# Boolean switch; False unless the flag is present
parser.add_argument(
    '--verbose',
    action='store_true',
    help='Print out extra debugging statements'
)
# Let the helper modules contribute their own command-line options
bowtie.add_args(parser)
group_reads.add_args(parser)
args = parser.parse_args()

start_time = time.time()
input_line_count = 0
counter = Counter('cojunction_fasta')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)
# Environment variables in the index path are expanded before use
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx)
)
group_reads_object = group_reads.IndexGroup(args.index_count)
for (rname, poses, end_poses), xpartition in xstream(sys.stdin,
                                                     3,
                                                     skip_duplicates=True):
    counter.add('partitions')
    reverse_strand_string = rname[-1]
    rname = rname[:-1]
    read_seqs = dlist()
    poses = [int(pos) for pos in poses.split(',')]
    end_poses = [int(end_pos) for end_pos in end_poses.split(',')]
    max_left_extend_size, max_right_extend_size = None, None
    for left_extend_size, right_extend_size, read_seq in xpartition:
Beispiel #15
0
    help='URL to which output should be written. Default is current '
         'working directory')
# Name of the manifest file written under the output location
parser.add_argument(
    '--filename', type=str, required=False, default='split.manifest',
    help='Output manifest filename'
)

# Add scratch command-line parameter
tempdel.add_args(parser)

args = parser.parse_args(sys.argv[1:])

start_time = time.time()
# Running tallies of lines read and written
input_line_count = 0
output_line_count = 0
counter = Counter('assign_splits')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)

# Output goes to --out when given, otherwise to the current working
# directory
if args.out is None:
    output_url = Url(os.getcwd())
else:
    output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory
    try:
        os.makedirs(output_url.to_url())
    except:
        pass
    output_path = os.path.join(args.out, args.filename)
else:
    mover = filemover.FileMover(args=args)
    print >> sys.stderr, 'Instantiated FileMover.'
    # Set up temporary destination
    import tempfile
Beispiel #16
0
import argparse
import site
import time
import itertools

# Base directory: resolve this file's real path, climb three directory
# levels, then make the result absolute
base_path = os.path.realpath(__file__)
base_path = os.path.dirname(os.path.dirname(os.path.dirname(base_path)))
base_path = os.path.abspath(base_path)
utils_path = os.path.join(base_path, 'rna', 'utils')
# Register both directories as site directories before the imports below
site.addsitedir(utils_path)
site.addsitedir(base_path)
from dooplicity.tools import xstream, register_cleanup
from dooplicity.counters import Counter
import manifest

# Counter for this script; its flush method is handed to register_cleanup
counter = Counter('bed_pre')
register_cleanup(counter.flush)


def go(manifest_object,
       input_stream=sys.stdin,
       output_stream=sys.stdout,
       sample_fraction=0.05,
       coverage_threshold=5,
       verbose=False):
    """ Runs Rail-RNA-bed_pre

        Writes indels and junctions for outputting BEDs by sample and
        TSVs across samples.

        Input (read from stdin)
Beispiel #17
0
# Base directory: three directory levels above this file's resolved
# (symlink-free) location
base_path = os.path.abspath(
    os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.realpath(__file__)))))
utils_path = os.path.join(base_path, 'rna', 'utils')
# Register both directories as site directories before the imports below
site.addsitedir(utils_path)
site.addsitedir(base_path)

import bowtie
from dooplicity.ansibles import Url
from dooplicity.tools import register_cleanup, make_temp_dir
from dooplicity.counters import Counter
import filemover
import tempdel

# Counter for this script; its flush method is handed to register_cleanup
counter = Counter('junction_index')
register_cleanup(counter.flush)

# Print file's docstring if -h is invoked
parser = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
# Destination URL for the index files.
# NOTE(review): the default is the string 'None', not the None object --
# confirm that downstream code compares against the string
parser.add_argument(\
    '--out', metavar='URL', type=str, required=False,
    default='None',
    help='Bowtie index files are written to this URL. DEFAULT IS CURRENT '
         'WORKING DIRECTORY.')
# Basename of the index to be written; defaults to 'junction'
parser.add_argument(\
    '--basename', type=str, required=False,
    default='junction',
    help='Basename for index to be written')
parser.add_argument(\
Beispiel #18
0
        Return value: median
    """
    if not a_list:
        # Median's nothing for empty input in this case
        return 0
    sorted_list = sorted(a_list)
    list_size = len(a_list)
    index = (list_size - 1) // 2
    if (list_size % 2):
        return sorted_list[index]
    return (sorted_list[index] + sorted_list[index + 1]) / 2.0

# --library-size is scaled by one million here
library_size = args.library_size * 1000000
start_time = time.time()
# Running tallies of lines read and written
input_line_count = 0
output_line_count = 0
counter = Counter('coverage_pre')
# Hand the counter's flush method to register_cleanup
register_cleanup(counter.flush)
bin_count = 0
# For converting RNAMEs to number strings
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx)
)
manifest_object = manifest.LabelsAndIndices(
    os.path.expandvars(args.manifest)
)
# Grab read counts
mapped_read_counts = {}
unique_mapped_read_counts = {}
with xopen(None, args.read_counts) as read_count_stream:
    read_count_stream.readline()
    for line in read_count_stream:
        tokens = line.strip().split('\t')