Code example #1
query information. Makes use of ExAC, dbSNP, 1000 genomes, clinvar, cosmic and
effects annotations. The general idea is to prioritize deleterious variants
missing or present at a low frequency in the population, or secondarily identified
in external databases like COSMIC and ClinVar.
"""
import collections
import csv
import re

from bcbio import utils
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import population, vcfutils

geneimpacts = utils.LazyImport("geneimpacts")
cyvcf2 = utils.LazyImport("cyvcf2")


def handle_vcf_calls(vcf_file, data, orig_items):
    """Prioritize VCF calls based on external annotations supplied through GEMINI.
    """
    if not _do_prioritize(orig_items):
        return vcf_file
    else:
        ann_vcf = population.run_vcfanno(vcf_file, data)
        if ann_vcf:
            priority_file = _prep_priority_filter_vcfanno(ann_vcf, data)
            return _apply_priority_filter(ann_vcf, priority_file, data)
        # No data available for filtering, return original file
        else:
            return vcf_file
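
# --- Illustrative sketch, not part of the original module ---
# The "general idea" from the docstring above, written out with cyvcf2. The INFO
# keys used here ("max_aaf_all", "cosmic_ids", "clinvar_sig") are assumed
# stand-ins for whatever population/COSMIC/ClinVar fields vcfanno actually adds.
def _sketch_iter_priority_variants(ann_vcf, af_cutoff=0.01):
    for variant in cyvcf2.VCF(ann_vcf):
        af = variant.INFO.get("max_aaf_all")
        rare = af is None or float(af) < af_cutoff
        in_external_db = bool(variant.INFO.get("cosmic_ids") or
                              variant.INFO.get("clinvar_sig"))
        if rare or in_external_db:
            yield variant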
Code example #2
from distutils.version import LooseVersion
import os

import numpy as np

from bcbio import bam, broad, utils
from bcbio.bam import is_paired
from bcbio.log import logger
from bcbio.distributed.transaction import file_transaction
from bcbio.pipeline import config_utils
from bcbio.pipeline.shared import subset_variant_regions
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.variation import annotation, bamprep, bedutils, gatk, vcfutils, ploidy

cyvcf2 = utils.LazyImport("cyvcf2")


def _add_tumor_params(paired, items, gatk_type):
    """Add tumor/normal BAM input parameters to command line.
    """
    params = []
    if not paired:
        raise ValueError(
            "Specified MuTect2 calling but 'tumor' phenotype not present in batch\n"
            "https://bcbio-nextgen.readthedocs.org/en/latest/contents/"
            "pipelines.html#cancer-variant-calling\n"
            "for samples: %s" %
            ", ".join([dd.get_sample_name(x) for x in items]))
    if gatk_type == "gatk4":
        params += ["-I", paired.tumor_bam]
Code example #3
File: postalign.py  Project: skanwal/bcbio-nextgen
"""
import contextlib
import math
import os

import toolz as tz

from bcbio import bam, broad, utils
from bcbio.bam import ref
from bcbio.distributed.transaction import file_transaction, tx_tmpdir
from bcbio.log import logger
from bcbio.pipeline import config_utils
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do

pysam = utils.LazyImport("pysam")


@contextlib.contextmanager
def tobam_cl(data, out_file, is_paired=False):
    """Prepare command line for producing de-duplicated sorted output.

    - If no deduplication, sort and prepare a BAM file.
    - If paired, then use samblaster and prepare discordant outputs.
    - If unpaired, use biobambam's bammarkduplicates
    """
    do_dedup = _check_dedup(data)
    umi_consensus = dd.get_umi_consensus(data)
    with file_transaction(data, out_file) as tx_out_file:
        if not do_dedup:
            yield (sam_to_sortbam_cl(data, tx_out_file), tx_out_file)
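
# --- Usage sketch, not part of the original file ---
# tobam_cl yields a shell fragment plus the transactional output path, so a
# caller pipes aligner output into it. The bwa command here is a placeholder,
# not the pipeline's real alignment command line.
def _sketch_align_to_dedup_bam(data, fastq, ref_file, out_file):
    with tobam_cl(data, out_file, is_paired=True) as (tobam_cmd, tx_out_file):
        cmd = "bwa mem {ref} {fq} | {tobam}".format(ref=ref_file, fq=fastq,
                                                    tobam=tobam_cmd)
        do.run(cmd, "Align and convert to de-duplicated, sorted BAM")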
Code example #4
File: validate.py  Project: zhangj5/bcbio-nextgen
import csv
import os

import toolz as tz
import numpy as np
import pandas as pd
import pybedtools

from bcbio.log import logger
from bcbio import utils
from bcbio.pipeline import datadict as dd
from bcbio.provenance import do
from bcbio.structural import convert
from bcbio.distributed.transaction import file_transaction, tx_tmpdir
from bcbio.variation import bedutils, vcfutils, ploidy, validateplot

mpl = utils.LazyImport("matplotlib")
plt = utils.LazyImport("matplotlib.pyplot")
sns = utils.LazyImport("seaborn")

# -- VCF based validation


def _evaluate_vcf(calls, truth_vcf, work_dir, data):
    out_file = os.path.join(work_dir,
                            "%s-sv-validate.csv" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(
Code example #5
Handles data normalization and plotting, emphasizing comparisons on methodology
differences.
"""
import collections
import os

from distutils.version import LooseVersion
import numpy as np
import pandas as pd

from bcbio.log import logger
from bcbio import utils
from bcbio.variation import bamprep

mpl = utils.LazyImport("matplotlib")
plt = utils.LazyImport("matplotlib.pyplot")
mpl_ticker = utils.LazyImport("matplotlib.ticker")
sns = utils.LazyImport("seaborn")

def classifyplot_from_plotfiles(plot_files, out_csv, outtype="png", title=None, size=None):
    """Create a plot from individual summary csv files with classification metrics.
    """
    dfs = [pd.read_csv(x) for x in plot_files]
    samples = []
    for df in dfs:
        for sample in df["sample"].unique():
            if sample not in samples:
                samples.append(sample)
    df = pd.concat(dfs)
    df.to_csv(out_csv, index=False)
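
# --- Usage sketch, not part of the original module ---
# The per-sample CSV paths are hypothetical; each file is expected to provide a
# "sample" column plus the classification metric columns the plot summarizes.
def _sketch_combined_plot():
    plot_files = ["giab-NA12878-validate.csv", "giab-NA24385-validate.csv"]
    classifyplot_from_plotfiles(plot_files, "combined-validate.csv",
                                outtype="png", title="Validation summary")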
Code example #6
File: variation.py  Project: vamst/bcbio-nextgen
import os
from bcbio import utils
from bcbio.utils import file_exists, get_R_exports, safe_makedir
from bcbio.bam import ref
from bcbio.heterogeneity import chromhacks
import bcbio.pipeline.datadict as dd
from bcbio.pipeline import config_utils, shared
from bcbio.ngsalign.postalign import dedup_bam
from bcbio.distributed.transaction import file_transaction
from bcbio.provenance import do
from bcbio.variation import vardict
from bcbio import broad, bam
from bcbio.variation import gatk, vcfutils
from bcbio.rnaseq import gtf

pybedtools = utils.LazyImport("pybedtools")


def rnaseq_gatk_variant_calling(data):
    data = dd.set_deduped_bam(data, dedup_bam(dd.get_work_bam(data), data))
    data = gatk_splitreads(data)
    data = gatk_rnaseq_calling(data)
    return data


def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
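
# --- Illustrative sketch, not part of the original file ---
# A GATK4-style equivalent of what this step wraps; bcbio builds the real
# invocation through broad_runner, and the file paths below are placeholders.
def _sketch_splitncigar(ref_file, in_bam, out_bam):
    cmd = ["gatk", "SplitNCigarReads", "-R", ref_file, "-I", in_bam, "-O", out_bam]
    do.run(cmd, "Split reads spanning N CIGAR operators")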
Code example #7
"""Integration with the DNAnexus platform using the API.

Looks up and fills in sample locations from inputs folders in a DNAnexus project.
"""
import os

import toolz as tz

from bcbio import utils
from bcbiovm.shared import retriever as sret

dxpy = utils.LazyImport("dxpy")

# ## DNAnexus specific functionality

KEY = "dx"
CONFIG_KEY = "dnanexus"


def _authenticate():
    assert os.environ.get("DX_AUTH_TOKEN"), \
        "Need to set DX_AUTH_TOKEN for file retrieval from DNAnexus"
    dxpy.set_security_context({
        "auth_token_type": "bearer",
        "auth_token": os.environ["DX_AUTH_TOKEN"]
    })


def _is_remote(f):
    return f.startswith("%s:" % KEY)
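
# --- Usage sketch, not part of the original module ---
# Remote inputs are referenced with the "dx:" prefix; the project and path in
# the example string are made up.
def _sketch_remote_check():
    assert _is_remote("dx:MyProject:/inputs/sample_R1.fastq.gz")
    assert not _is_remote("/local/inputs/sample_R1.fastq.gz")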
Code example #8
from datetime import datetime
import collections
import functools
import os
import gzip
import pytz
import re
import socket

import pandas as pd
import pickle

from bcbio import utils
from bcbio.graph.collectl import load_collectl

mpl = utils.LazyImport("matplotlib")
plt = utils.LazyImport("matplotlib.pyplot")
pylab = utils.LazyImport("pylab")


def _setup_matplotlib():
    # plt.style.use('ggplot')
    mpl.use('Agg')
    pylab.rcParams['image.cmap'] = 'viridis'
    pylab.rcParams['figure.figsize'] = (35.0, 12.0)
    # pylab.rcParams['figure.figsize'] = (100, 100)
    pylab.rcParams['figure.dpi'] = 300
    pylab.rcParams['font.size'] = 25
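
# --- Usage sketch, not part of the original module ---
# _setup_matplotlib selects the non-interactive Agg backend, so it should run
# before any figure is created; the plot below is a placeholder.
def _sketch_headless_plot(out_file="resource-usage.png"):
    _setup_matplotlib()
    fig, ax = plt.subplots()
    ax.plot([0, 1, 2], [10.0, 20.0, 15.0])
    fig.savefig(out_file)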


def get_bcbio_nodes(path):
Code example #9
File: coverage.py  Project: druvus/bcbio-nextgen
"""
import os

import six
import pandas as pd
import pybedtools

from bcbio import utils
from bcbio.utils import rbind, file_exists
from bcbio.provenance import do
from bcbio.distributed.transaction import file_transaction
import bcbio.pipeline.datadict as dd
from collections import defaultdict
from itertools import repeat

mpl = utils.LazyImport("matplotlib")
plt = utils.LazyImport("matplotlib.pyplot")
pylab = utils.LazyImport("pylab")
backend_pdf = utils.LazyImport("matplotlib.backends.backend_pdf")
sns = utils.LazyImport("seaborn")

def _calc_regional_coverage(in_bam, chrom, start, end, samplename, work_dir):
    """
    given a BAM and a region, calculate the coverage for each base in that
    region. returns a pandas dataframe of the format:

    chrom position coverage name

    where the samplename column is the coverage at chrom:position
    """
    region_bt = pybedtools.BedTool("%s\t%s\t%s\n" % (chrom, start, end), from_string=True).saveas()
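
# --- Usage sketch, not part of the original module ---
# The BAM path, region and sample name are placeholders; per the docstring, the
# returned dataframe carries chrom/position/coverage/name rows for the region.
def _sketch_regional_coverage():
    return _calc_regional_coverage("sample1-ready.bam", "chr1", 1000000, 1001000,
                                   "sample1", work_dir="coverage-work")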
Code example #10
"""
from __future__ import print_function

import os

from bcbio.log import logger
from bcbio import utils
import bcbio.pipeline.datadict as dd
from bcbio.pipeline import config_utils
from bcbio.distributed.transaction import file_transaction
from bcbio.rnaseq import kallisto, sailfish, gtf
from bcbio.provenance import do
from bcbio.utils import file_exists, safe_makedir
from bcbio.bam import fasta

h5py = utils.LazyImport("h5py")
import numpy as np
import pandas as pd


def get_fragment_length(data):
    """
    lifted from
    https://github.com/pmelsted/pizzly/scripts/pizzly_get_fragment_length.py
    """
    h5 = kallisto.get_kallisto_h5(data)
    cutoff = 0.95
    with h5py.File(h5) as f:
        x = np.asarray(f['aux']['fld'], dtype='float64')
    y = np.cumsum(x) / np.sum(x)
    fraglen = np.argmax(y > cutoff)
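
# --- Illustrative sketch, not part of the original module ---
# The same cumulative-distribution cutoff applied to a made-up fragment length
# histogram, showing how the 95th percentile bin is picked without needing a
# kallisto HDF5 file.
def _sketch_fragment_cutoff(cutoff=0.95):
    counts = np.asarray([0, 5, 20, 50, 20, 5], dtype="float64")  # fake 'fld' histogram
    cdf = np.cumsum(counts) / np.sum(counts)
    return int(np.argmax(cdf > cutoff))  # index of the first bin past the cutoff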