Beispiel #1
0
def get_untrimmed_coverage_from_output(log, sample, assembly_pth, coverage,
                                       velvet):
    """Parse GATK per-base coverage output and record untrimmed per-contig
    coverage metrics for ``sample``.

    Writes ``{sample}-UNTRIMMED-per-contig-coverage.txt`` into
    ``assembly_pth`` and returns a dict mapping contig name to the metrics
    dict produced by ``compute_coverage_metrics``.

    :param log: logger used for progress messages
    :param sample: sample name used to build the output file name
    :param assembly_pth: directory that receives the output file
    :param coverage: path to the GATK per-base coverage table
    :param velvet: if True, match velvet-style headers, else trinity-style
    """
    log.info("Screening contigs for coverage")
    if not velvet:
        regex = re.compile("({}).*:(\d+)".format(
            get_user_param("headers", "trinity")))
    else:
        regex = re.compile("({}.*):(\d+)".format(
            get_user_param("headers", "velvet")))
    # setup starting values
    previous_match = None
    contig_depth = []
    overall_coverage = []
    overall_length = []
    overall_count = 0
    overall_contigs = {}
    upcc = os.path.join(assembly_pth,
                        '{}-UNTRIMMED-per-contig-coverage.txt'.format(sample))
    with open(coverage, 'rU') as infile:
        with open(upcc, 'w') as unt_per_contig_cov:
            # skip the GATK header line
            infile.readline()
            # write headers to outfile
            unt_per_contig_cov.write(
                "name\tbeginning-length\tbeginning-mean-cov\n")
            for line in infile:
                ls = line.split()
                search = regex.search(ls[0])
                match_name, pos = search.groups()
                if previous_match is None or match_name == previous_match:
                    # still inside the current contig
                    previous_match = match_name
                    contig_depth.append(int(ls[1]))
                else:
                    # finished a contig: compute and record its metrics
                    metadata = compute_coverage_metrics(contig_depth,
                                                        trim=False)
                    unt_per_contig_cov.write("{}\t{}\t{}\n".format(
                        previous_match, metadata["beginning-length"],
                        metadata["beginning-mean-cov"]))
                    overall_contigs[previous_match] = metadata
                    overall_count += 1
                    overall_coverage.append(metadata["beginning-mean-cov"])
                    overall_length.append(metadata["beginning-length"])
                    # start collecting depths for the new contig
                    previous_match = match_name
                    contig_depth = [int(ls[1])]
            # BUGFIX: the loop above only emits a contig when the *next*
            # contig starts, so the final contig was silently dropped from
            # the output file, totals, and return value.
            if previous_match is not None and contig_depth:
                metadata = compute_coverage_metrics(contig_depth, trim=False)
                unt_per_contig_cov.write("{}\t{}\t{}\n".format(
                    previous_match, metadata["beginning-length"],
                    metadata["beginning-mean-cov"]))
                overall_contigs[previous_match] = metadata
                overall_count += 1
                overall_coverage.append(metadata["beginning-mean-cov"])
                overall_length.append(metadata["beginning-length"])
    log.info(
        "\t{} contigs, mean coverage = {:.1f}, mean length = {:.1f}".format(
            overall_count, numpy.mean(overall_coverage),
            numpy.mean(overall_length)))
    return overall_contigs
Beispiel #2
0
def get_untrimmed_coverage_from_output(log, sample, assembly_pth, coverage, velvet):
    """Parse GATK per-base coverage output and record untrimmed per-contig coverage
    metrics for ``sample``.

    Writes ``{sample}-UNTRIMMED-per-contig-coverage.txt`` into ``assembly_pth`` and
    returns a dict mapping contig name to the metrics dict produced by
    ``compute_coverage_metrics``.
    """
    log.info("Screening contigs for coverage")
    if not velvet:
        regex = re.compile("({}).*:(\d+)".format(get_user_param("headers", "trinity")))
    else:
        regex = re.compile("({}.*):(\d+)".format(get_user_param("headers", "velvet")))
    # setup starting values
    previous_match = None
    contig_depth = []
    overall_coverage = []
    overall_length = []
    overall_count = 0
    overall_contigs = {}
    upcc = os.path.join(
        assembly_pth,
        '{}-UNTRIMMED-per-contig-coverage.txt'.format(sample)
    )

    def _record(name, depths, outfile):
        # Compute metrics for a finished contig, write its row, and fold it
        # into the running totals (count is incremented by the caller).
        metadata = compute_coverage_metrics(depths, trim=False)
        outfile.write("{}\t{}\t{}\n".format(
                name,
                metadata["beginning-length"],
                metadata["beginning-mean-cov"]
            ))
        overall_contigs[name] = metadata
        overall_coverage.append(metadata["beginning-mean-cov"])
        overall_length.append(metadata["beginning-length"])

    with open(coverage, 'rU') as infile:
        with open(upcc, 'w') as unt_per_contig_cov:
            # skip the GATK header line
            infile.readline()
            # write headers to outfile
            unt_per_contig_cov.write("name\tbeginning-length\tbeginning-mean-cov\n")
            for line in infile:
                ls = line.split()
                match_name, pos = regex.search(ls[0]).groups()
                if previous_match is None or match_name == previous_match:
                    # still inside the current contig
                    previous_match = match_name
                    contig_depth.append(int(ls[1]))
                else:
                    # finished a contig: record it, then reset for the new one
                    _record(previous_match, contig_depth, unt_per_contig_cov)
                    overall_count += 1
                    previous_match = match_name
                    contig_depth = [int(ls[1])]
            # BUGFIX: the loop only emits a contig when the *next* contig
            # starts, so the final contig was silently dropped.
            if previous_match is not None and contig_depth:
                _record(previous_match, contig_depth, unt_per_contig_cov)
                overall_count += 1
    log.info("\t{} contigs, mean coverage = {:.1f}, mean length = {:.1f}".format(
        overall_count,
        numpy.mean(overall_coverage),
        numpy.mean(overall_length)
    ))
    return overall_contigs
Beispiel #3
0
 def test_config_parameters(self):
     """Check that each configured parameter has its expected value."""
     for entry in self.parameters:
         actual = get_user_param(entry[0], entry[1])
         wanted = entry[2]
         msg = "Config entry {} != {} (expected)".format(actual, wanted)
         self.assertEqual(actual, wanted, msg)
 def test_config_parameters(self):
     """Verify every expected config entry via get_user_param."""
     for parameter in self.parameters:
         section, option, expected = parameter[0], parameter[1], parameter[2]
         value = get_user_param(section, option)
         self.assertEqual(
             value,
             expected,
             "Config entry {} != {} (expected)".format(value, expected)
         )
Beispiel #5
0
(c) 2014 Brant Faircloth || http://faircloth-lab.org/
All rights reserved.

This code is distributed under a 3-clause BSD license. Please see
LICENSE.txt for more information.

Created on 26 June 2014 17:13 PDT (-0700)
"""


import os
import subprocess
from phyluce.pth import get_user_path, get_user_param


# Java invocation settings pulled from the user's phyluce configuration:
# the java executable, a memory setting (presumably a JVM -Xmx style
# option -- confirm against the config file), and the jar directory.
JAVA = get_user_param("java", "executable")
JAVA_PARAMS = get_user_param("java", "mem")
JAR_PATH = get_user_path("java", "jar")


def new_bam_name(bam, append):
    """Return the path of ``bam`` with ``-<append>`` inserted before the
    ``.bam`` extension (e.g. ``/x/s.bam`` + ``"CL"`` -> ``/x/s-CL.bam``)."""
    directory, filename = os.path.split(bam)
    stem = os.path.splitext(filename)[0]
    return os.path.join(directory, "{}-{}.bam".format(stem, append))


def create_reference_dict(log, sample, sample_dir, reference):
    log.info("Creating FASTA dict for {}".format(sample))
    outf = os.path.splitext(reference)[0] + ".dict"
Beispiel #6
0
def get_trimmed_coverage_from_output(log, sample, assembly_pth, coverage,
                                     assembler):
    """Parse GATK per-base coverage output, trim low-coverage contig ends,
    and keep contigs whose trimmed mean coverage is >= 5x.

    Writes three files into ``assembly_pth``:
      * ``{sample}-TRIMMED-per-base-coverage.txt.gz`` -- per-base rows
        inside the trim window, for passing contigs only
      * ``{sample}-TRIMMED-per-contig-coverage.txt`` -- passing contigs
      * ``{sample}-UNTRIMMED-per-contig-coverage.txt`` -- all contigs

    Returns a dict mapping contig name to its metrics dict for contigs
    that pass the coverage filter.

    Raises ValueError when ``assembler`` is not one of trinity, velvet,
    abyss, or idba.
    """
    log.info("Screening and filtering contigs for coverage (3x ends, 5x avg.)")
    if assembler == "trinity":
        regex = re.compile("({}).*:(\d+)".format(
            get_user_param("headers", "trinity")))
    elif assembler == "velvet":
        regex = re.compile("({}.*):(\d+)".format(
            get_user_param("headers", "velvet")))
    elif assembler == "abyss":
        regex = re.compile("({}.*):(\d+)".format(
            get_user_param("headers", "abyss")))
    elif assembler == "idba":
        regex = re.compile("({}.*):(\d+)".format(
            get_user_param("headers", "idba")))
    else:
        # BUGFIX: an unknown assembler previously fell through and raised
        # a confusing NameError on `regex` at first use.
        raise ValueError("Unknown assembler: {}".format(assembler))
    # setup starting values
    previous_match = None
    contig_depth = []
    contig_data = OrderedDict()
    overall_coverage = []
    overall_length = []
    # BUGFIX: this started at 1, inflating the reported contig count;
    # the final contig is now flushed explicitly below instead.
    overall_count = 0
    overall_contigs = {}
    pbc = os.path.join(assembly_pth,
                       '{}-TRIMMED-per-base-coverage.txt.gz'.format(sample))
    pcc = os.path.join(assembly_pth,
                       '{}-TRIMMED-per-contig-coverage.txt'.format(sample))
    upcc = os.path.join(assembly_pth,
                        '{}-UNTRIMMED-per-contig-coverage.txt'.format(sample))
    with open(coverage, 'rU') as infile:
        with gzip.open(pbc, 'w') as per_base_cov:
            with open(pcc, 'w') as per_contig_cov:
                with open(upcc, 'w') as unt_per_contig_cov:

                    def _finish_contig(name, depths, data):
                        # Emit metrics for one finished contig; return 1
                        # if it passed the 5x filter (and was recorded),
                        # else 0.
                        metadata = compute_coverage_metrics(depths,
                                                            trim=True)
                        unt_per_contig_cov.write("{}\t{}\t{}\n".format(
                            name, metadata["beginning-length"],
                            metadata["beginning-mean-cov"]))
                        if metadata["ending-mean-cov"] < 5.0:
                            return 0
                        per_contig_cov.write(
                            "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                                name,
                                metadata["beginning-length"],
                                metadata["beginning-mean-cov"],
                                metadata["trim-start"],
                                metadata["trim-end"],
                                metadata["ending-length"],
                                metadata["ending-mean-cov"]))
                        # keep only per-base rows inside the trim window
                        for p, row in data.iteritems():
                            if (p - 1 >= metadata["trim-start"] and
                                    p - 1 < metadata["trim-end"]):
                                per_base_cov.write(row)
                        overall_contigs[name] = metadata
                        overall_coverage.append(
                            metadata["ending-mean-cov"])
                        overall_length.append(
                            metadata["ending-length"])
                        return 1

                    # read header line
                    gatk_header = infile.readline()
                    # write headers to outfiles
                    per_contig_cov.write(
                        "name\tbeginning-length\tbeginning-mean-cov\ttrim-start\ttrim-end\tend-length\tend-mean-cov\n"
                    )
                    unt_per_contig_cov.write(
                        "name\tbeginning-length\tbeginning-mean-cov\n")
                    per_base_cov.write(gatk_header)
                    for line in infile:
                        ls = line.split()
                        search = regex.search(ls[0])
                        match_name, pos = search.groups()
                        if previous_match is None or match_name == previous_match:
                            # still inside the current contig
                            previous_match = match_name
                            contig_data[int(pos)] = line
                            contig_depth.append(int(ls[1]))
                        else:
                            overall_count += _finish_contig(
                                previous_match, contig_depth, contig_data)
                            # reset containers for the new contig
                            previous_match = match_name
                            contig_depth = [int(ls[1])]
                            contig_data = OrderedDict()
                            contig_data[int(pos)] = line
                    # BUGFIX: the loop only emits a contig when the next
                    # one starts, so the final contig was silently dropped.
                    if previous_match is not None and contig_depth:
                        overall_count += _finish_contig(
                            previous_match, contig_depth, contig_data)
    log.info(
        "\t{} contigs, mean coverage = {:.1f}, mean length = {:.1f}".format(
            overall_count, numpy.mean(overall_coverage),
            numpy.mean(overall_length)))
    return overall_contigs
Beispiel #7
0
Created on 26 June 2014 17:17 PDT (-0700)
"""

import os
import re
import gzip
import glob
import numpy
import subprocess
from collections import OrderedDict

from phyluce.pth import get_user_param, get_user_path

from Bio import SeqIO

# Config-derived settings for running GATK through Java: the java
# executable, a memory setting (presumably a JVM -Xmx style option --
# confirm against the config file), the directory holding jar files,
# and the GATK jar name (joined as JAR_PATH/GATK when invoked).
JAVA = get_user_param("java", "executable")
JAVA_PARAMS = get_user_param("java", "mem")
JAR_PATH = get_user_path("java", "jar")
GATK = get_user_param("java", "gatk")


def coverage(log, sample, assembly_pth, assembly, cores, bam):
    log.info("Computing coverage with GATK for {}".format(sample))
    cwd = os.getcwd()
    # move into reference directory
    os.chdir(assembly_pth)
    cmd = [
        JAVA, JAVA_PARAMS, "-jar",
        os.path.join(JAR_PATH, GATK), "-T", "DepthOfCoverage", "-R", assembly,
        "-I", bam, "-o", "{}-coverage".format(sample), "-nt",
        str(cores), "--omitIntervalStatistics", "--omitLocusTable"
Beispiel #8
0
def get_trimmed_coverage_from_output(log, sample, assembly_pth, coverage, assembler):
    """Parse GATK per-base coverage output, trim low-coverage contig ends, and
    keep contigs whose trimmed mean coverage is >= 5x.

    Writes three files into ``assembly_pth`` (TRIMMED per-base, TRIMMED
    per-contig, UNTRIMMED per-contig) and returns a dict mapping contig name to
    its metrics dict for contigs that pass the filter.

    Raises ValueError when ``assembler`` is not one of trinity, velvet, abyss,
    or idba.
    """
    log.info("Screening and filtering contigs for coverage (3x ends, 5x avg.)")
    if assembler == "trinity":
        regex = re.compile("({}).*:(\d+)".format(get_user_param("headers", "trinity")))
    elif assembler == "velvet":
        regex = re.compile("({}.*):(\d+)".format(get_user_param("headers", "velvet")))
    elif assembler == "abyss":
        regex = re.compile("({}.*):(\d+)".format(get_user_param("headers", "abyss")))
    elif assembler == "idba":
        regex = re.compile("({}.*):(\d+)".format(get_user_param("headers", "idba")))
    else:
        # BUGFIX: an unknown assembler previously fell through and raised a
        # confusing NameError on `regex` at first use.
        raise ValueError("Unknown assembler: {}".format(assembler))
    # setup starting values
    previous_match = None
    contig_depth = []
    contig_data = OrderedDict()
    overall_coverage = []
    overall_length = []
    # BUGFIX: this started at 1, inflating the reported contig count; the
    # final contig is now flushed explicitly below instead.
    overall_count = 0
    overall_contigs = {}
    pbc = os.path.join(
        assembly_pth,
        '{}-TRIMMED-per-base-coverage.txt.gz'.format(sample)
    )
    pcc = os.path.join(
        assembly_pth,
        '{}-TRIMMED-per-contig-coverage.txt'.format(sample)
    )
    upcc = os.path.join(
        assembly_pth,
        '{}-UNTRIMMED-per-contig-coverage.txt'.format(sample)
    )
    with open(coverage, 'rU') as infile:
        with gzip.open(pbc, 'w') as per_base_cov:
            with open(pcc, 'w') as per_contig_cov:
                with open(upcc, 'w') as unt_per_contig_cov:

                    def _finish_contig(name, depths, data):
                        # Emit metrics for one finished contig; return 1 if it
                        # passed the 5x filter (and was recorded), else 0.
                        metadata = compute_coverage_metrics(depths, trim=True)
                        unt_per_contig_cov.write("{}\t{}\t{}\n".format(
                                name,
                                metadata["beginning-length"],
                                metadata["beginning-mean-cov"]
                            ))
                        if metadata["ending-mean-cov"] < 5.0:
                            return 0
                        per_contig_cov.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            name,
                            metadata["beginning-length"],
                            metadata["beginning-mean-cov"],
                            metadata["trim-start"],
                            metadata["trim-end"],
                            metadata["ending-length"],
                            metadata["ending-mean-cov"]
                        ))
                        # keep only per-base rows inside the trim window
                        for p, row in data.iteritems():
                            if p - 1 >= metadata["trim-start"] and p - 1 < metadata["trim-end"]:
                                per_base_cov.write(row)
                        overall_contigs[name] = metadata
                        overall_coverage.append(metadata["ending-mean-cov"])
                        overall_length.append(metadata["ending-length"])
                        return 1

                    # read header line
                    gatk_header = infile.readline()
                    # write headers to outfiles
                    per_contig_cov.write("name\tbeginning-length\tbeginning-mean-cov\ttrim-start\ttrim-end\tend-length\tend-mean-cov\n")
                    unt_per_contig_cov.write("name\tbeginning-length\tbeginning-mean-cov\n")
                    per_base_cov.write(gatk_header)
                    for line in infile:
                        ls = line.split()
                        match_name, pos = regex.search(ls[0]).groups()
                        if previous_match is None or match_name == previous_match:
                            # still inside the current contig
                            previous_match = match_name
                            contig_data[int(pos)] = line
                            contig_depth.append(int(ls[1]))
                        else:
                            overall_count += _finish_contig(previous_match, contig_depth, contig_data)
                            # reset containers for the new contig
                            previous_match = match_name
                            contig_depth = [int(ls[1])]
                            contig_data = OrderedDict()
                            contig_data[int(pos)] = line
                    # BUGFIX: the loop only emits a contig when the next one
                    # starts, so the final contig was silently dropped.
                    if previous_match is not None and contig_depth:
                        overall_count += _finish_contig(previous_match, contig_depth, contig_data)
    log.info("\t{} contigs, mean coverage = {:.1f}, mean length = {:.1f}".format(
        overall_count,
        numpy.mean(overall_coverage),
        numpy.mean(overall_length)
    ))
    return overall_contigs