Example #1
def rmdup_and_blacklist(input_files, genome, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('rmdup_blacklist')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=20)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        # This is extremely fast and has minimal memory usage. Yay!
        # TODO: Allow adjustable windowing (-w %d) to blacklist larger/surrounding regions?
        command = "%s rmdup %s - 2>%s | %s window -abam - -b %s -v -w 0 > %s"

        shell_job_runner.run(command % (CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam',  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam.log',
                                        CONFIG['binaries']['bedtools'],
                                        os.path.dirname(os.path.realpath(__file__)) + '/' + CONFIG['blacklists'][genome],  # TODO: CLEAN THIS
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.srt.rmdup.bam'))

    shell_job_runner.finish()

    primary_logger.info('Removing temporary files from stage 1 ...')
    for filename in input_files:
        os.unlink(output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + '.tmp.bam')

    primary_logger.info('Completed rmdup and blacklist')
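
# A minimal usage sketch for rmdup_and_blacklist (hypothetical names; assumes
# CONFIG['binaries'] and CONFIG['blacklists'] are configured as in the rest of
# this module). Note the function derives its real inputs from the basenames,
# reading <output_path>/<basename>.tmp.bam produced by the earlier stage.
def _example_rmdup_and_blacklist():
    stage1_files = ['data/sampleA.bam', 'data/sampleB.bam']  # hypothetical
    rmdup_and_blacklist(stage1_files, genome='hg19', output_path='output',
                        disable_parallel=True)
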
Example #2
def fastq_merge(merge_strategy, output_path, disable_parallel=False):
    """
    Concatenate multiple fastq files (from multiple lanes) into one.

    :param merge_strategy: dict mapping each merged output name to a list of input fastq files
    :param output_path: directory to write the merged .fastq.gz files into
    :return: True on completion
    """
    merge_log = _logshim.getLogger('fastq_merge')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(merge_log, delay_seconds=45)

    for merged_name, merge_inputs in merge_strategy.iteritems():
        merge_input_files = ' '.join(merge_inputs)
        merge_log.info('Spawning niced process to merge: %s' % (merged_name))
        for filename in merge_inputs:
            assert(" " not in filename)
            assert(";" not in filename)  # Vague sanity testing for input filenames
            merge_log.debug('    Input: %s' % (filename))

        # WARNING: Using shell has security implications! Don't work on untrusted input filenames.
        command = "zcat %s | gzip -1 > %s/%s.fastq.gz" % (merge_input_files, output_path, merged_name)

        shell_job_runner.run(command)

    shell_job_runner.finish()

    return True
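
# A minimal sketch of the merge_strategy layout fastq_merge expects: each key is
# a merged output name, each value the list of per-lane fastq files to
# concatenate (hypothetical names; elsewhere in this module the dict is built by
# fastq_map_predict).
def _example_fastq_merge():
    merge_strategy = {
        'sampleA.PE1': ['in/sampleA_L001_R1_001.fastq.gz', 'in/sampleA_L002_R1_001.fastq.gz'],
        'sampleA.PE2': ['in/sampleA_L001_R2_001.fastq.gz', 'in/sampleA_L002_R2_001.fastq.gz'],
    }
    fastq_merge(merge_strategy, output_path='merged', disable_parallel=True)
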
def generate_index(input_files, output_path, disable_parallel=False):
    """
    Many peak pickers want indexed .bams. Let's build indexes! (yay!)

    :param input_files:
    :param output_path:
    :param disable_parallel:
    :return:
    """
    primary_logger = _logshim.getLogger('index')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger,
                                                          delay_seconds=10)

    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        command = "%s index %s"

        shell_job_runner.run(command %
                             (CONFIG['binaries']['samtools'], filename))

    shell_job_runner.finish()
Example #4
def flatten_tsv(filename):
    """
    Flatten a TSV file -- merge rows with identical row names by summing their values.
    """
    flatlog = _logshim.getLogger('flatten_tsv')

    flatlog.debug('Flattening input file: %s' % (filename))

    data_dict = OrderedDict()

    with open(filename, 'r') as tsv_ro_fh:
        tsv_input = csv.reader(tsv_ro_fh, delimiter=str("\t"))

        header = next(tsv_input, None)

        for row in tsv_input:
            row_key = row[0]
            these_row_values_as_int = map(int, row[1:])
            if row_key in data_dict:
                # Add the current row values to the existing values
                data_dict[row_key] = map(operator.add, data_dict[row_key], these_row_values_as_int)
            else:
                data_dict[row_key] = these_row_values_as_int

    # Write back the parsed dict
    with open(filename, 'wb') as tsv_rw_fh:
        tsv_writer = csv.writer(tsv_rw_fh, delimiter=str("\t"))
        tsv_writer.writerow(header)

        for key, val in data_dict.iteritems():
            tsv_writer.writerow([key] + val)
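
# A small worked example of flatten_tsv (hypothetical scratch file): rows sharing
# a region name collapse into one row whose counts are summed element-wise.
def _example_flatten_tsv():
    with open('example.tsv', 'w') as fh:
        fh.write("region\tsample1\tsample2\n")
        fh.write("chr1:100-200\t3\t5\n")
        fh.write("chr1:100-200\t2\t1\n")
        fh.write("chr2:50-80\t7\t0\n")
    flatten_tsv('example.tsv')
    # example.tsv now holds (tab-separated):
    #   region          sample1  sample2
    #   chr1:100-200    5        6
    #   chr2:50-80      7        0
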
def run_macs2(input_files, output_path, genome, disable_parallel=False):
    macs2_log = _logshim.getLogger('run_macs2')

    macs2_log.info('Spawning MACS2 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs2_log, delay_seconds=0.1)

    for filename in input_files:
        macs2_log.debug('Working on: %s' % (filename))

        # --bdg: generate .bed graph output
        # --nomodel: We'll be shifting manually!
        # --extsize 200: See long discussion at: @@@
        # --shift -100: As per above.
        # --slocal: Look at a local window of 20kb to build peak models
        # --keep-dup: We already removed duplicates with samtools.
        # TODO: Consider allowing tweaks to these settings with flags?
        command = "%s callpeak -t %s -n %s --outdir %s -g %s --bdg --nomodel --extsize 200 --shift -100 --slocal 20000 --llocal 20000 --keep-dup all 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs2'

        shell_job_runner.run(command % ('macs2',  # This must be pre-installed by the user. It's a big, complex package.
                                        filename,  # input file
                                        os.path.basename(filename_without_extension),
                                        output_path,
                                        genome,  # for genome size, unclear if this actually matters with nolambda/nomodel
                                        output_path + "/" + os.path.basename(filename_without_extension) + '.log'))

    shell_job_runner.finish()

    macs2_log.info('MACS2 peak calling complete.')
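
# A minimal usage sketch for run_macs2 (hypothetical paths; 'hs' is passed
# straight through to macs2's -g option, and macs2 must already be on the PATH
# as noted above).
def _example_run_macs2():
    run_macs2(['output/sampleA.srt.rmdup.bam'], output_path='peaks', genome='hs',
              disable_parallel=True)
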
Example #6
def large_filter_fixmate_and_sort(input_files, genome, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('first_pass')

    output_suffix = ".tmp"

    if disable_parallel:  # Doesn't change parallelism in last samtools sort
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(primary_logger, delay_seconds=60)

    # We do a few things here:
    #  - View only mapping quality >= 10
    #  - Remove chrM
    #  - Sort by name for fixmate
    #     We don't parallelize here (-@ N) because fixmate blocks, and parallelism seems to help only with compressed output.
    #  - Fixmate (needed for rmdup)
    #  - Re-sort by position
    tempfiles = []
    for filename in input_files:
        primary_logger.debug('Working on: %s' % (filename))
        command = 'export LANG=C; %s view -h -q 10 %s | grep -vF "chrM" | %s view -u -b - | ' \
                  '%s sort -l 0 -n -m %s -T %s -O bam | %s fixmate -O bam - - | %s sort -@ 8 -m %s - %s'

        # A malicious user could modify TMPDIR to inject arbitrary strings into this command.
        temporary_file = tempfile.mkstemp('.tmp.bam')
        tempfiles.append(temporary_file)

        shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                        filename,
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        temporary_file[1],
                                        CONFIG['binaries']['samtools'],
                                        CONFIG['binaries']['samtools'],
                                        MAX_MEM,
                                        output_path + "/" + os.path.basename(os.path.splitext(filename)[0]) + output_suffix))

    shell_job_runner.finish()

    # Clean up our temporary files.
    primary_logger.info('Removing temporary files ...')
    for fd, fname in tempfiles:
        os.close(fd)
        os.unlink(fname)

    primary_logger.info('First large stage complete! Saved as .tmp.bam for next stage.')
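
# A minimal usage sketch for the first-pass filter/fixmate/sort above
# (hypothetical paths; assumes CONFIG['binaries']['samtools'] and MAX_MEM are
# defined as in the rest of this module).
def _example_first_pass():
    large_filter_fixmate_and_sort(['aligned/sampleA.bt2.bam'], genome='hg19',
                                  output_path='output', disable_parallel=True)
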
Example #7
def merge_and_rmdup(input_files, output_path, disable_parallel=False):
    primary_logger = _logshim.getLogger('merge_and_rmdup')

    # Sanity checks on the input files list
    assert(len(input_files) > 1)
    # Check files are readable
    for filename in input_files:
        if not os.access(filename, os.R_OK):
            primary_logger.fatal("Unable to read input files.")
            raise IOError

    output_file_name = '-AND-'.join([os.path.basename(os.path.splitext(filename)[0]) for filename in input_files])

    # Sanity check: maximum output filename length
    max_filename_length = os.statvfs(output_path).f_namemax
    if max_filename_length < 100:
        primary_logger.fatal("Cannot deal with short filename length limit. Maybe namemax is broken?")
        raise IOError

    if (len(output_file_name) + 10) > max_filename_length:  # roughly truncate filename for sanity.
        primary_logger.critical("Very long filename! Truncating!")
        output_file_name = output_file_name[:-20]  # Give us some extra room for downstream stuff?

    output_file_name += ".merged.bam"

    input_file_string = ' '.join(input_files)

    shell_job_runner = _script_helpers.ShellJobRunner(primary_logger)

    primary_logger.debug('Input file string: %s' % (input_file_string))
    primary_logger.debug('Working on merge as: %s' % (output_file_name))
    # This is pretty fast and has minimal memory usage. Yay!
    # We're probably re-rmduping some files if we're merging. That's ok since this is speedy.
    command = "%s merge -u - %s | %s rmdup - %s 2>%s"

    shell_job_runner.run(command % (CONFIG['binaries']['samtools'],
                                    input_file_string,
                                    CONFIG['binaries']['samtools_legacy'],  # TODO: Update this when samtools is fixed.
                                    output_path + "/" + output_file_name,
                                    output_path + "/" + os.path.basename(os.path.splitext(output_file_name)[0]) + '-rmdup.log'))

    shell_job_runner.finish()


    primary_logger.info('Merge and rmdup complete!')
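
# A small sketch of how the merged output name above is derived (hypothetical
# inputs): input basenames are joined with '-AND-' and suffixed with '.merged.bam'.
def _example_merged_output_name():
    inputs = ['out/a.srt.rmdup.bam', 'out/b.srt.rmdup.bam']  # hypothetical
    name = '-AND-'.join([os.path.basename(os.path.splitext(f)[0]) for f in inputs])
    assert name + '.merged.bam' == 'a.srt.rmdup-AND-b.srt.rmdup.merged.bam'
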
def run_bowtie2(paired_end_mapping,
                genome,
                output_path,
                disable_parallel=False):
    bowtie2_logger = _logshim.getLogger('run_bowtie2')

    # Import the config file to get genome locations
    config = _script_helpers.get_config()

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(bowtie2_logger,
                                                          delay_seconds=60)

    for output_prefix, paired_ends in paired_end_mapping.iteritems():
        bowtie2_logger.info('Spawning niced process for bowtie2 on: %s' %
                            (output_prefix))
        for filename in paired_ends:
            assert (" " not in filename)
            assert (";" not in filename
                    )  # Vague sanity testing for input filenames
            bowtie2_logger.debug('    Input: %s' % (filename))

        # bowtie2 options:
        # --end-to-end: this is the default, but let's explicitly specify it
        # --sensitive: again, the default (consider switching to --fast?)
        # --no-unal: Suppress unaligned reads from the output .sam
        # --no-discordant: These are paired-end reads. We expect them to be non-discordant.
        # --mm: mmap MAP_SHARED (other processes can use our genome, cool!)
        # --met-stderr: Write metrics to stderr
        # --time: output the time things took
        # -x: target genome
        command = "bowtie2 --end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time -x %s -1 %s -2 %s 2>%s | samtools view -bS - >%s"

        shell_job_runner.run(
            command %
            (config['bowtie2_genomes'][genome], paired_ends[0], paired_ends[1],
             output_path + "/" + output_prefix + ".bt2.log",
             output_path + "/" + output_prefix + ".bt2.bam"))

    shell_job_runner.finish()
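
# A minimal sketch of the paired_end_mapping layout run_bowtie2 consumes: each
# key is an output prefix, each value a [PE1, PE2] pair of fastq files
# (hypothetical names; elsewhere in this module the dict comes from
# find_paired_ends). Assumes the config file maps genome 'hg19' to a bowtie2 index.
def _example_run_bowtie2():
    paired_end_mapping = {
        'sampleA': ['merged/sampleA.PE1.fastq.gz', 'merged/sampleA.PE2.fastq.gz'],
    }
    run_bowtie2(paired_end_mapping, genome='hg19', output_path='aligned',
                disable_parallel=True)
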
def run_macs14(input_files, output_path, genome, disable_parallel=False):
    macs14_log = _logshim.getLogger('run_macs14')

    macs14_log.info('Spawning MACS14 jobs...')

    if disable_parallel:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log)
    else:
        shell_job_runner = _script_helpers.ShellJobRunner(macs14_log,
                                                          delay_seconds=20)

    for filename in input_files:
        macs14_log.debug('Working on: %s' % (filename))

        # macs14 is old, but we try it anyway, since it's sometimes useful.
        # -t: input
        # -n: output name
        # -f: format
        # -g: genome
        # -p: pvalue for peak cutoff
        # --wig: save .wig outputs
        # --single-profile: make one single wiggle
        # --space=50: wiggle resolution (default: 10)
        #
        # Note: This CD hack is because MACS1.4 can't specify an output path :(
        command = "cd %s && %s -t %s -n %s -f BAM -g %s -p 1e-9 --wig --single-profile --space=50 2>%s"

        filename_without_extension = os.path.splitext(filename)[0] + '.macs14'

        shell_job_runner.run(command % (output_path,  # for cd hack
                                        'macs14',  # This must be pre-installed by the user. It's a big, complex package.
                                        os.getcwd() + '/' + filename,  # input file # TODO: Fix this path hack. MACS14 cannot specify an output path :/
                                        os.path.basename(filename_without_extension),
                                        genome,  # for genome size
                                        os.path.basename(filename_without_extension) + '.macs14.log'))

    shell_job_runner.finish()

    macs14_log.info('MACS14 peak calling complete.')
Example #14
def parse_h5files(input_files, annotationBedTool, overwrite, flatten, density, normalized, sizescaled):
    h5logger = _logshim.getLogger('parse_h5files')

    assert(not (density and normalized))
    total_file_count = len(input_files)
    h5logger.info('Parsing a total of: %d file(s)' % (total_file_count))

    output_suffix_list = ['tsv']

    annotating_regions = False
    if annotationBedTool:
        annotating_regions = True
        output_suffix_list.append('annotated')

    if normalized:
        output_suffix_list.append('normalized')
    elif density:
        output_suffix_list.append('density')
    elif sizescaled:
        output_suffix_list.append('sizescaled')

    output_suffix = '.'.join(reversed(output_suffix_list))

    # Cache regions that we're annotating, maybe.
    region_annotation_cache = {}

    for this_file_count, file in enumerate(input_files):
        h5logger.info('\tParsing: %s (%d/%d)' % (file, this_file_count + 1, total_file_count))

        output_filename = file + '.' + output_suffix

        if not overwrite and os.path.isfile(output_filename):
            h5logger.warn('Skipping this .h5 as output .tsv already exists: %s' % (output_filename))
            continue

        # TODO: Modularize H5FD_CORE (the in-memory driver?)
        with tables.open_file(file, mode="r", driver="H5FD_CORE") as h5_object:
            assert(h5_object.title.startswith("bam liquidator genome read counts"))  # Some sanity checking
            assert(h5_object.root.file_names[0] == "*")

            bam_filename_header = h5_object.root.file_names[1:]
            bam_filename_header.insert(0, 'region')

            # Note: len(files) = len(file_names) - 1, since file_names has a 'wildcard' first entry.
            number_of_regions = int(len(h5_object.root.region_counts) / len(h5_object.root.files))

            # We expect this .h5 object's region_counts to contain:
            # /region_counts (Table(SIZE,)) 'region counts'
            #   description := {
            #   "file_key": UInt32Col(shape=(), dflt=0, pos=0),
            #   "chromosome": StringCol(itemsize=64, shape=(), dflt='', pos=1),
            #   "region_name": StringCol(itemsize=64, shape=(), dflt='', pos=2),
            #   "start": UInt64Col(shape=(), dflt=0, pos=3),
            #   "stop": UInt64Col(shape=(), dflt=0, pos=4),
            #   "strand": StringCol(itemsize=1, shape=(), dflt='', pos=5),
            #   "count": UInt64Col(shape=(), dflt=0, pos=6),
            #   "normalized_count": Float64Col(shape=(), dflt=0.0, pos=7)}
            #   byteorder := 'little'
            #   chunkshape := (NNN,)
            counts = h5_object.root.region_counts

            with open(output_filename, 'wb') as tsv_output:
                tsvwriter = csv.writer(tsv_output, delimiter=str("\t"))
                tsvwriter.writerow(bam_filename_header)

                if annotating_regions:
                    h5logger.debug('Generating .bed annotations from provided genome.')
                    region_to_gene = {}
                    # Perform one annotation rapidly for all regions in the .hdf5
                    hdf5_positions_only = []

                    for region_number in range(0, number_of_regions):
                        hdf5_positions_only.append(counts[region_number][1] + ' ' + str(counts[region_number][3]) + ' ' + str(counts[region_number][4]))

                    hdf5_positions_only_hashkey = ''.join(hdf5_positions_only)

                    if hdf5_positions_only_hashkey in region_annotation_cache:
                        # The genome doesn't change mid run, so we cache only on hdf5_positions
                        region_to_gene = region_annotation_cache[hdf5_positions_only_hashkey]
                        h5logger.debug('Annotation from cache.')
                    else:
                        hdf5_stub_bed = pybedtools.BedTool('\n'.join(hdf5_positions_only), from_string=True)

                        annotated_bed = hdf5_stub_bed.closest(annotationBedTool, t='first')

                        for locus in annotated_bed:
                            region_to_gene[locus.chrom + ':' + str(locus.start) + '-' + str(locus.end)] = locus.fields[11].split('"')[1]

                        region_annotation_cache[hdf5_positions_only_hashkey] = region_to_gene
                        h5logger.debug('Annotation completed.')


                # We're going to aggressively access the hdf5 at a bunch of fixed offsets.
                # rowarray = [counts[number_of_regions*0 + i], counts[number_of_regions*1 + i] + counts[number_of_regions*2 + i] ...]

                number_of_files = len(h5_object.root.files)
                working_deque = deque(maxlen=number_of_files + 1)

                # Here, we loop over every "region"/locus (every entry in the first column of the .tsv)
                # And then (within this loop) jump to each individual "file" (the hdf5 can contain multiple
                # separate samples) to build the data for every row.
                for region_number in range(0, number_of_regions):
                    # Prefix the row with chrN:bpSTART-bpEND e.g. chr4:100-2000
                    locus_name = counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4])

                    # Sanity checking, in case the input is nuts
                    feature_width = counts[region_number][4] - counts[region_number][3]
                    assert(feature_width > 0)

                    # DESeq2 requires each region have a unique name.
                    # You can either append a unique value, or aggregate identical loci.
                    # We address this later by re-opening and aggregating.
                    if annotating_regions:
                        working_deque.append(region_to_gene[locus_name])
                    else:
                        working_deque.append(locus_name)
                    #rowarray = [counts[region_number][1] + ':' + str(counts[region_number][3]) + '-' + str(counts[region_number][4])]

                    for file_number in range(0, number_of_files):
                        if normalized:
                            # Standard normalized (counts/mreads)
                            # bamliquidator gives us (counts/mreads)/width so we multiply by width
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * feature_width))
                        elif density:
                            # (counts/mreads)/width
                            # We upscale the fractional normalized count values by an arbitrary amount,
                            # because subsequent analyses like integers.
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][7] * 10000))
                        elif sizescaled:
                            # counts/width
                            # We upscale the fractional normalized count values by an arbitrary amount,
                            # because subsequent analyses like integers.
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][6] / feature_width * 100))
                        else:
                            working_deque.append(int(counts[number_of_regions * file_number + region_number][6]))

                    tsvwriter.writerow(working_deque)

            if flatten:
                flatten_tsv(output_filename)

    h5logger.info('Completed.')
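
# A minimal sketch of inspecting the bamliquidator .h5 layout that parse_h5files
# relies on (hypothetical file name; touches only the nodes accessed above).
def _example_inspect_h5(filename='counts.h5'):
    with tables.open_file(filename, mode='r') as h5:
        print(h5.root.file_names[:])    # ['*', 'sampleA.bam', ...]
        print(len(h5.root.files))       # number of real .bam files
        first = h5.root.region_counts[0]
        # positional fields: file_key, chromosome, region_name, start, stop, strand, count, normalized_count
        print('%s:%d-%d' % (first[1], first[3], first[4]))
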
def find_paired_ends(input_path, verbose=False):
    """
    Given an input path containing *.PE1/*.PE2 fastq files, return a merge
    strategy mapping each sample prefix to its paired-end files.

    :param input_path: directory containing *.PE1/*.PE2 .fastq(.gz) files
    :return: dict mapping sample prefix -> list of its paired-end files
    """
    find_pe_logger = _logshim.getLogger('find_paired_ends')

    # TODO: Modularize all this!

    if not os.path.isdir(input_path):
        raise ValueError("Input must be a directory. You gave: %s" % (input_path))

    all_files = glob.glob(input_path + "/*.PE1.fastq.gz")  # Must have .PEX. in title
    all_files.extend(glob.glob(input_path + "/*.PE2.fastq.gz"))
    all_files.extend(glob.glob(input_path + "/*.PE1.fastq"))
    all_files.extend(glob.glob(input_path + "/*.PE2.fastq"))

    if len(all_files) == 0:
        raise ValueError("Input directory is empty!")


    # Given paired ends, we must always have an even number of input files.
    if len(all_files) % 2 != 0:
        raise ValueError("Input directory contains an odd number of files.")

    re_pattern = re.compile(r'^(.*)\.PE(\d)(\.fastq|\.fastq\.gz)$')

    file_dict = OrderedDict()

    prefixes_seen = []
    pe_seen = []
    for file in sorted(all_files):
        if not os.access(file, os.R_OK):
            raise OSError("Cannot read file: %s" % (file))

        filename_only = file.rsplit('/', 1)[-1]
        result = re.match(re_pattern, filename_only)

        file_dict[file] = {'prefix': str(result.group(1)),
                           'PE': int(result.group(2))}

        prefixes_seen.append(file_dict[file]['prefix'])
        pe_seen.append(file_dict[file]['PE'])

    if len(set(pe_seen)) != 2:
        raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!" % (len(set(pe_seen))))

    if pe_seen.count(1) != pe_seen.count(2):
        raise ValueError("Uneven pairing of paired ends (are you missing a file)? PE1 count: %d, PE2 count: %d" %
                         (pe_seen.count(1), pe_seen.count(2)))

    find_pe_logger.info("Files seen: %d" % (len(all_files)))
    find_pe_logger.info("Samples seen: %d" % (len(set(prefixes_seen))))

    merge_strategy = {}

    find_pe_logger.info("Sample IDs:")
    for prefix in sorted(set(prefixes_seen)):
        find_pe_logger.info("     %s" % (prefix))

    for file in file_dict.iterkeys():
        merge_strategy.setdefault(file_dict[file]['prefix'], []).append(file)

    if verbose:
        find_pe_logger.debug("Merge strategy is:")
        find_pe_logger.debug(pprint.pformat(merge_strategy))

    return merge_strategy
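
# A small sketch of what the .PE filename pattern above captures (hypothetical
# file name).
def _example_pe_pattern():
    match = re.match(r'^(.*)\.PE(\d)(\.fastq|\.fastq\.gz)$', 'sampleA.PE1.fastq.gz')
    assert match.group(1) == 'sampleA'  # sample prefix
    assert match.group(2) == '1'        # paired-end number
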
Example #16
def fastq_map_predict(input_path, verbose=False):
    """
    Determine a sane .fastq multi-lane merge strategy.
    Fail if we can't merge correctly, if there are remaining files, etc.

    sample file name: Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R1_001.fastq.gz

    Args:
        input_path: An input path containing .fastq / .fastq.gz files
    Returns:
        A dict of mappings.
    """
    fastq_map_logger = _logshim.getLogger('fastq_map_predict')

    if not os.path.isdir(input_path):
        raise ValueError("Input must be a directory. You gave: %s" % (input_path))

    all_files = glob.glob(input_path + "/*_R*.fastq.gz")  # Ignore index files, must have _R in title
    all_files.extend(glob.glob(input_path + "/*_R*.fastq"))

    if len(all_files) == 0:
        raise ValueError("Input directory is empty!")

    # Given paired ends, we must always have an even number of input files.
    if len(all_files) % 2 != 0:
        raise ValueError("Input directory contains an odd number of files.")

    re_pattern = re.compile(r'^(.*)_L(\d+)_R(\d)_\d+(\.fastq|\.fastq\.gz)$')


    file_dict = OrderedDict()

    prefixes_seen = []
    lanes_seen = []
    pe_seen = []
    for file in sorted(all_files):
        if not os.access(file, os.R_OK):
            raise OSError("Cannot read file: %s" % (file))

        filename_only = file.rsplit('/', 1)[-1]
        result = re.match(re_pattern, filename_only)

        file_dict[file] = {'prefix': str(result.group(1)),
                           'L': int(result.group(2)),
                           'R': int(result.group(3))}

        prefixes_seen.append(file_dict[file]['prefix'])
        lanes_seen.append(file_dict[file]['L'])
        pe_seen.append(file_dict[file]['R'])


    # Sanity checking here. Missing files? Other oddities?
    if len(file_dict) % len(set(lanes_seen)) != 0:
        raise ValueError("Missing or extra file(s)? Saw %d lanes, and %d input files." %
                         (len(set(lanes_seen)), len(file_dict)))

    if len(set(pe_seen)) != 2:
        raise ValueError("Saw %d paired ends, expecting exactly two. That's confusing!" % (len(set(pe_seen))))

    if pe_seen.count(1) != pe_seen.count(2):
        raise ValueError("Uneven pairing of paired ends (are you missing a file)? R1 count: %d, R2 count: %d" %
                         (pe_seen.count(1), pe_seen.count(2)))

    fastq_map_logger.info("Files seen: %d" % (len(all_files)))
    fastq_map_logger.info("Samples seen: %d" % (len(set(prefixes_seen))))
    fastq_map_logger.info("Lanes seen: %d" % (len(set(lanes_seen))))

    merge_strategy = {}

    fastq_map_logger.info("Sample IDs:")
    for prefix in sorted(set(prefixes_seen)):
        fastq_map_logger.info("     %s" % (prefix))

    for file in file_dict.iterkeys():
        merge_strategy.setdefault(file_dict[file]['prefix'] + ".PE" + str(file_dict[file]['R']), []).append(file)

    if verbose:
        fastq_map_logger.debug("Merge strategy is:")
        fastq_map_logger.debug(pprint.pformat(merge_strategy))

    return merge_strategy
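
# A small sketch of what the lane/read pattern above extracts from the sample
# file name given in the docstring.
def _example_lane_pattern():
    match = re.match(r'^(.*)_L(\d+)_R(\d)_\d+(\.fastq|\.fastq\.gz)$',
                     'Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7_L001_R1_001.fastq.gz')
    assert match.group(1) == 'Gordon-Ad2-11-AAGAGGCA-AAGAGGCA_S7'  # sample prefix
    assert int(match.group(2)) == 1                                # lane
    assert int(match.group(3)) == 1                                # read (R1/R2)
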