Esempio n. 1
0
def run_callvar(bqsrbam, rawvcf, genome, disable_dup_filter=False):
    """
    Call germline SNPs and indels via local re-assembly of haplotypes.

    A BAM file recalibrated by BQSR is the recommended input, and this
    function only runs single-sample calling. For more details see
    HaplotypeCaller_.

    .. _HaplotypeCaller: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_haplotypecaller_HaplotypeCaller.php

    Usage:
    ::
      baseq-SNV run_callvar -q Test.marked.bqsr.bam -r Test.raw.indel.snp.vcf -g hg38

    Return:
    ::
      Test.raw.indel.snp.vcf

    The formatted command is also returned to the caller.
    """
    ref_section = "SNV_ref_" + genome
    # Values substituted into the command template; the config lookups are
    # done in the order GATK, reference index, interval.
    fields = {
        "gatk": get_config("SNV", "GATK"),
        "index": get_config(ref_section, "bwa_index"),
        "interval": get_config(ref_section, "interval"),
        "bqsrbam": bqsrbam,
        "rawvcf": rawvcf,
        # Extra GATK option that switches off the NotDuplicateReadFilter.
        "extrainfos": ("--disable-read-filter NotDuplicateReadFilter"
                       if disable_dup_filter else ""),
    }
    callvar_cmd = callvar_cmd_script.format(**fields)
    run_cmd("call variants", "".join(callvar_cmd))
    return callvar_cmd
Esempio n. 2
0
def selectvar(rawvcf, selectvcf, filtervcf, genome, run=True):
    """
    Select SNPs from a VCF file (usually the output of HaplotypeCaller) and
    filter them on INFO and/or FORMAT annotations.

    The criteria used here are "QD < 2.0 || FS > 60.0 || MQ < 40.0".
    More details about SelectVariants_ and VariantFiltration_

    .. _SelectVariants: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    .. _VariantFiltration: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php

    Usage:
    ::
      baseq-SNV run_selectvar -r Test.raw.indel.snp.vcf -s Test.raw.snp.vcf -f Test.filtered.snp.vcf -g hg38

    Return:
    ::
      Test.raw.snp.vcf
      Test.filtered.snp.vcf

    The formatted command is returned; it is only executed when ``run`` is
    true.
    """
    fields = {
        "gatk": get_config("SNV", "GATK"),
        "index": get_config("SNV_ref_" + genome, "bwa_index"),
        "rawvcf": rawvcf,
        "selectvcf": selectvcf,
        "filtervcf": filtervcf,
    }
    selectvar_cmd = selectvar_cmd_script.format(**fields)
    if not run:
        # Caller only wants the command text (e.g. to write it to a script).
        return selectvar_cmd
    run_cmd("SelectVariants", "".join(selectvar_cmd))
    return selectvar_cmd
Esempio n. 3
0
def deseq2(config, tpmfile, countfile, groupfile, comparefile, outpath):
    """Run the DESeq2.R script through the configured Rscript executable.

    Args:
        config: optional path to an Excel workbook with "sample" and
            "compare" sheets. When given, both sheets are loaded and printed;
            generating the group/compare files from it is not implemented.
        tpmfile: TPM file handed to the R script.
        countfile: count file handed to the R script.
        groupfile: group file giving the group name of each sample
            (samplename/groups/).
        comparefile: compare file listing which groups should be compared
            (compare_name/group1/group2).
        outpath: output directory; created when it does not exist.

    Exits the process with an error message if running the command raises.
    """
    if config:
        df_cfg = pd.read_excel(config, sheet_name=["sample", "compare"])
        print(df_cfg["sample"])
        print(df_cfg["compare"])
        # TODO: write the sample file and group compare file from the workbook.

    Rscript = get_config("RNA", "deseq")
    script = os.path.join(r_script_dir, "DESeq2.R")
    cmd = "{} {} {} {} {} {} {}".format(Rscript, script, tpmfile, countfile,
                                        groupfile, comparefile, outpath)

    if not os.path.exists(outpath):
        os.mkdir(outpath)
        print("[info] Create OutDir {}".format(outpath))
    try:
        run_cmd("DESeq2", cmd)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed and relabelled.
        sys.exit("[error] Failed to run the DESeq2 Rscript ...")
Esempio n. 4
0
def run_markdup(bamfile, markedbam):
    """
    Run MarkDuplicates of Picard, which tags duplicate reads in the BAM file.
    See also MarkDuplicates_ in GATK.

    .. _MarkDuplicates: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php

    Usage:
    ::
      baseq-SNV run_markdup -b Test.bam -m Test.marked.bam

    Return:
    The metrics file reports the number of duplicates for both single- and
    paired-end reads.
    ::
      Test.marked.bam
      Test.marked.bam.bai
      Test.marked.bam.metrics

    The formatted command is also returned to the caller.
    """
    # Tool paths come from the config; samtools is read from the "RNA"
    # section while java and picard come from "SNV".
    tools = {
        "java": get_config("SNV", "java"),
        "picard": get_config("SNV", "picard"),
        "samtools": get_config("RNA", "samtools"),
    }
    cmd = markdup_cmd_script.format(markedbam=markedbam,
                                    bamfile=bamfile,
                                    **tools)
    run_cmd("Mark duplicates", "".join(cmd))
    return cmd
Esempio n. 5
0
def run_createsomatic_pon(path_pon, path):
    """Build and run the panel-of-normals command.

    The command template is filled with the configured GATK path, the value
    returned by ``listofvcf(path_pon)`` and the output file
    ``<path>/pon.vcf.gz``, then executed through ``run_cmd``.
    """
    fields = {
        "gatk": get_config("SNV", "GATK"),
        "normalvcfs": listofvcf(path_pon),
        "ponvcf": os.path.join(path, "pon.vcf.gz"),
    }
    pon_cmd = pon_cmd_script.format(**fields)
    run_cmd("create panel of normals", "".join(pon_cmd))
Esempio n. 6
0
def filter_mutect_vcf(somaticvcf, calcontam_table, filter_call):
    """Build and run the command that filters Mutect calls.

    The command template is filled with the configured GATK path and the
    three arguments (somatic VCF, contamination table, filtered-call output),
    then executed through ``run_cmd``.
    """
    fields = dict(gatk=get_config("SNV", "GATK"),
                  somaticvcf=somaticvcf,
                  calcontam_table=calcontam_table,
                  filter_call=filter_call)
    filtercall_cmd = filtercall_cmd_script.format(**fields)
    run_cmd("filter mutect calls using contamination table",
            "".join(filtercall_cmd))
Esempio n. 7
0
def get_filter_table(tumorbam, resource, gps_table, calcontam_table):
    """Build and run the command that produces the Mutect filter table.

    The command template is filled with the configured GATK path and the
    four arguments, then executed through ``run_cmd``.
    """
    fields = dict(gatk=get_config("SNV", "GATK"),
                  tumorbam=tumorbam,
                  resource=resource,
                  gps_table=gps_table,
                  calcontam_table=calcontam_table)
    filtertable_cmd = filtertable_cmd_script.format(**fields)
    run_cmd("obatin filter table for mutect calls", "".join(filtertable_cmd))
Esempio n. 8
0
def bin_segmentation(infile, path_out):
    """Run the DNACopy.R script on ``infile`` with ``path_out`` as output.

    The command ``Rscript <r_script_dir>/DNACopy.R <infile> <path_out>`` is
    executed through ``run_cmd``. On success an info line naming ``path_out``
    is printed; if running the command raises, the process exits with an
    error message.
    """
    script = os.path.join(r_script_dir, "DNACopy.R")
    cmd = "Rscript {} {} {}".format(script, infile, path_out)
    try:
        run_cmd("Normalize ", cmd)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed and relabelled.
        sys.exit("[error] Failed to run the CBS Rscript ...")
    print("[info] Segment file write to {}".format(path_out))
Esempio n. 9
0
def plot_genome(bincount, cbs_path, path_out):
    """Run the Genome_Plot.R script to plot the genome.

    The command ``Rscript <r_script_dir>/Genome_Plot.R <bincount> <cbs_path>
    <path_out>`` is executed through ``run_cmd``. If running the command
    raises, the process exits with an error message.
    """
    script = os.path.join(r_script_dir, "Genome_Plot.R")
    cmd = "Rscript {} {} {} {}".format(script, bincount, cbs_path, path_out)
    try:
        run_cmd("Plot Genome", cmd)
        # TODO: build the json...
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit are not swallowed; the message now
        # names the script that actually failed.
        sys.exit("[error] Failed to run the Genome Plot Rscript ...")
Esempio n. 10
0
def bqsr(markedbam, bqsrbam, genome, disable_dup_filter=False):
    """
    Run BQSR_, the two-step process called base quality score recalibration.

    The first step generates a recalibration table based on various
    covariates; the second step uses that table to correct the systematic
    bias in the input BAM file. More details about BaseRecalibrator_ and
    ApplyBQSR_ .


    .. _BQSR: https://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr
    .. _BaseRecalibrator: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    .. _ApplyBQSR: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php

    Usage:

    * Default mode filters duplicate reads (reads with "markduplicate" tags) before applying BQSR
      ::
         baseq-SNV run_bqsr -m Test.marked.bam -g hg38 -q Test.marked.bqsr.bam

    * Disable the read filter before analysis.
      ::
        baseq-SNV run_bqsr -m Test.marked.bam -g hg38 -q Test.marked.bqsr.bam -f Yes

    Return:
    ::
      Test.marked.bam.table
      Test.marked.bqsr.bai
      Test.marked.bqsr.bam
    """
    ref_section = "SNV_ref_" + genome
    fields = {
        "gatk": get_config("SNV", "GATK"),
        "index": get_config(ref_section, "bwa_index"),
        "dbsnp": get_config(ref_section, "DBSNP"),
        "snp": get_config(ref_section, "SNP"),
        "indel": get_config(ref_section, "INDEL"),
        "interval": get_config(ref_section, "interval"),
        "markedbam": markedbam,
        "bqsrbam": bqsrbam,
    }
    # Both templates take the same fields; the "_DRF" variant is the one used
    # when the duplicate-read filter is disabled.
    template = bqsr_cmd_script_DRF if disable_dup_filter else bqsr_cmd_script
    bqsr_cmd = template.format(**fields)
    run_cmd("BaseRecalibrator", "".join(bqsr_cmd))
Esempio n. 11
0
def run_annovar(filtervcf, annovarfile, name, genome, run=True):
    """Build (and optionally run) the ANNOVAR command for a filtered VCF.

    Args:
        filtervcf: input VCF file substituted into the command template.
        annovarfile: ANNOVAR input/output file name substituted into the
            template.
        name: sample name substituted into the template.
        genome: "hg38", "hg19" or "hg37"; "hg37" is treated as "hg19".
        run: when true the command is executed through ``run_cmd``.

    Returns:
        The formatted command.

    Raises:
        ValueError: if ``genome`` is not one of the supported builds.
    """
    # genome -> (config key of the ANNOVAR database, build name used in the
    # command). Validated first so that an unsupported build fails with a
    # clear message instead of an unbound-variable error further down.
    supported = {
        "hg38": ("annovar_db_hg38", "hg38"),
        "hg37": ("annovar_db_hg19", "hg19"),
        "hg19": ("annovar_db_hg19", "hg19"),
    }
    if genome not in supported:
        raise ValueError(
            "[error] Unsupported genome '{}' for annovar; expected one of: "
            "{}".format(genome, ", ".join(sorted(supported))))
    db_key, genome = supported[genome]

    annovar = get_config("Annovar", "annovar")
    ref = get_config("Annovar", db_key)
    annovar_cmd = annovar_cmd_script.format(annovar=annovar,
                                            filtervcf=filtervcf,
                                            annovarfile=annovarfile,
                                            ref_annovar=ref,
                                            name=name,
                                            genome=genome)
    if run:
        run_cmd("convert vcf file to aninput format", "".join(annovar_cmd))
    return annovar_cmd
Esempio n. 12
0
def alignment(fq1, fq2, sample, genome, thread=8):
    """
    Map low-divergent sequences against the reference genome using BWA.
    A ReadGroup (more details about ReadGroup_ ) is added to the BAM file
    using the input sample name. The output is in BAM format and indexed for
    the downstream analysis.

    .. _ReadGroup: https://software.broadinstitute.org/gatk/documentation/article.php?id=6472

    Usage:
    ::
      baseq-SNV run_bwa -1 Reads.1.fq.gz -2 Read.2.fq.gz -g hg38 -n Test

    Return:
    ::
      Test.bam
      Test.bam.bai

    The two commands that were run are returned joined by a newline.

    Raises:
        FileNotFoundError: if ``fq1`` is not given or does not exist.
    """
    # Validate the inputs first: without an existing fq1 no command can be
    # built at all. When fq2 is given but missing, the single-end command is
    # used on fq1 alone.
    has_fq1 = bool(fq1) and os.path.exists(fq1)
    if not has_fq1:
        raise FileNotFoundError(
            "[error] Input fastq file not found: fq1={!r}, fq2={!r}".format(
                fq1, fq2))
    paired = bool(fq2) and os.path.exists(fq2)

    bwa = get_config("SNV", "bwa")
    samtools = get_config("SNV", "samtools")
    genome = get_config("SNV_ref_" + genome, "bwa_index")
    viewedbam = sample + ".view.bam"
    samfile = sample + ".sam"
    if paired:
        bwa_cmd = bwa_cmd_script_p.format(bwa=bwa,
                                          sample=sample,
                                          genome=genome,
                                          fq1=fq1,
                                          fq2=fq2,
                                          samfile=samfile,
                                          thread=thread)
    else:
        bwa_cmd = bwa_cmd_script_s.format(bwa=bwa,
                                          sample=sample,
                                          genome=genome,
                                          fq1=fq1,
                                          samfile=samfile,
                                          thread=thread)
    sort_index_cmd = sort_index_cmd_script.format(samtools=samtools,
                                                  sample=sample,
                                                  samfile=samfile,
                                                  viewedbam=viewedbam)
    run_cmd("bwa alignment", "".join(bwa_cmd))
    run_cmd("samtools sort", "".join(sort_index_cmd))
    return bwa_cmd + "\n" + sort_index_cmd
Esempio n. 13
0
def run_salmon(fq1, fq2, genome, outdir, thread=8):
    """Run ``salmon quant`` on single- or paired-end reads.

    Args:
        fq1: first (or only) fastq file; must exist.
        fq2: second fastq file for paired-end data. When it is not given or
            does not exist, fq1 is quantified as single-end.
        genome: genome name used to look up the "RNA_ref_<genome>" config
            section (salmon index and gene map).
        outdir: output directory passed to salmon via ``-o``.
        thread: value passed to salmon's ``-p`` option (default 8).

    Returns:
        The command as a list of tokens (it is run joined by spaces).

    Exits the process with an error message when fq1 is missing.
    """
    # Fail early, before any config lookup, with a message that says which
    # input is missing.
    if not (fq1 and os.path.exists(fq1)):
        sys.exit("[error] Input fastq file not found: fq1={!r}, "
                 "fq2={!r}".format(fq1, fq2))
    paired = bool(fq2) and os.path.exists(fq2)

    salmon = get_config("RNA", "salmon")
    salmon_ref = get_config("RNA_ref_" + genome, "salmon_index")
    gene_map = get_config("RNA_ref_" + genome, "gene_map")
    # Kept as a single token so the returned list has the same shape as
    # before when the default thread count is used.
    thread_opt = '-p {}'.format(thread)
    if paired:
        salmon_cmd = [
            salmon, 'quant', '-i', salmon_ref, '-l A', '-1', fq1, '-2', fq2,
            thread_opt, '-g', gene_map, '-o', outdir
        ]
    else:
        salmon_cmd = [
            salmon, 'quant', '-i', salmon_ref, '-l A', '-r', fq1, thread_opt,
            '-g', gene_map, '-o', outdir
        ]
    run_cmd("Salmon Quantification", " ".join(salmon_cmd))
    return salmon_cmd
Esempio n. 14
0
def run_star(fq1, fq2, genome, outdir, run=True):
    """Build (and optionally run) the STAR alignment and cufflinks commands.

    Args:
        fq1: first (or only) fastq file; must exist.
        fq2: second fastq file for paired-end data. When it is not given or
            does not exist, the single-end template is used on fq1.
        genome: genome name used to look up the "RNA_ref_<genome>" config
            section and passed on to ``run_cufflinks``.
        outdir: output directory; created when it does not exist.
        run: when true both commands are executed through ``run_cmd``.

    Returns:
        The STAR command and the cufflinks command joined by a newline.

    Raises:
        FileNotFoundError: if ``fq1`` is not given or does not exist.
    """
    # Validate before touching the filesystem or config: without an existing
    # fq1 no STAR command can be built.
    if not (fq1 and os.path.exists(fq1)):
        raise FileNotFoundError(
            "[error] Input fastq file not found: fq1={!r}, fq2={!r}".format(
                fq1, fq2))
    paired = bool(fq2) and os.path.exists(fq2)

    star = get_config("RNA", "star")
    star_index = get_config("RNA_ref_" + genome, "star_index")
    samtools = get_config("RNA", "samtools")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
        print("[info] Create outdir in: {}".format(outdir))
    if paired:
        star_cmd = script.format(outdir, star, star_index, fq1, fq2, samtools)
    else:
        star_cmd = script1.format(outdir, star, star_index, fq1, samtools)
    cufflinks_cmd = run_cufflinks(genome, method="star")
    if run:
        run_cmd("star analysis", "".join(star_cmd))
        run_cmd("cufflinks analysis", "".join(cufflinks_cmd))
    return star_cmd + "\n" + cufflinks_cmd
Esempio n. 15
0
def bowtie2_sort(fq1, fq2, bamfile, genome, reads=5*1000*1000, thread=8):
    """Align reads with bowtie2, then sort, index and flagstat the result.

    Three shell commands are executed through ``run_cmd``:

    1. bowtie2 writes ``<bamfile>.sam``; ``genome`` is passed as ``-x``,
       ``reads`` as ``-u`` and ``thread`` as ``-p``. Paired mode
       (``-1``/``-2``) is used when both fq1 and fq2 are given, otherwise
       fq1 is passed with ``-U``.
    2. samtools sorts the SAM into ``bamfile``, indexes it and removes the
       SAM file.
    3. ``samtools flagstat`` output is written to ``<bamfile>.stat``.

    Returns ``bamfile``.
    """
    bowtie2 = get_config("CNV", "bowtie2")
    samtools = get_config("CNV", "samtools")

    sam_path = bamfile + ".sam"
    stats_path = bamfile + ".stat"

    print("[info] Bamfile Path : {}".format(bamfile))

    # Alignment: only the read arguments differ between paired/single mode.
    read_args = ['-1', fq1, '-2', fq2] if fq1 and fq2 else ['-U', fq1]
    align_tokens = [bowtie2, '-p', str(thread), '-x', genome, '-u', str(reads)]
    align_tokens += read_args
    align_tokens += ['>', sam_path]
    run_cmd("bowtie alignment", " ".join(align_tokens))

    # Sort, index, then drop the intermediate SAM (one shell line).
    sort_tokens = [samtools, 'sort -@ ', str(thread), '-o', bamfile, sam_path]
    index_tokens = [";", samtools, "index", bamfile]
    cleanup_tokens = ["; rm", sam_path]
    run_cmd("samtools sort",
            " ".join(sort_tokens + index_tokens + cleanup_tokens))

    # Alignment statistics.
    run_cmd("samtools stats",
            " ".join([samtools, "flagstat", bamfile, ">", stats_path]))

    return bamfile
Esempio n. 16
0
def create_pon(genome, list, path, interval):
    """
    Create the PoN (panel of normals) file necessary for Mutect2 calling. The PoN captures
    common artifactual and germline variant sites. Mutect2 then uses the PoN to filter variants at the site level.

    Example of samples list (tab delimited):

    * The columns are normal sample name, normal BAM file, tumor sample name, tumor BAM file (the order cannot be changed).
      The absolute path of all BAM files should be given if the directory of the BAM files and the analysis directory are different.
      ::
        N504    N504_marked_bqsr.bam   T504    T504_marked_bqsr.bam
        N505    N505_marked_bqsr.bam   T505    T505_marked_bqsr.bam
        N506    N506_marked_bqsr.bam   T506    T506_marked_bqsr.bam
        N509    N509_marked_bqsr.bam   T509    T509_marked_bqsr.bam
        N510    N510_marked_bqsr.bam   T510    T510_marked_bqsr.bam

    Usage:

    * The interval list defines the genomic regions to which the analysis is restricted. For the interval list format and its function, please see here_.
      ::
        # designate an intervals.list
        baseq-SNV create_pon -g hg37 -l sample_list.txt -p ./ -L interval.list
        # use the default intervals.list
        baseq-SNV create_pon -g hg37 -l sample_list.txt -p ./

    .. _here: https://software.broadinstitute.org/gatk/documentation/article?id=11009

    Only the first two columns of the samples list (normal sample name and
    normal BAM file) are used here. One VCF per normal sample is written to
    ``<path>/pon`` by up to 6 worker processes, then the panel is written to
    ``<path>/pon.vcf.gz``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if a non-blank line of the samples list has fewer than
            two columns.
    """
    import multiprocessing as mp

    # Stop right away when the output directory is missing; previously only
    # a message was printed and the function crashed later on an unbound
    # variable.
    if not os.path.exists(path):
        raise FileNotFoundError(
            "[ERROR] No such file or directory: {}".format(path))

    index = get_config("SNV_ref_" + genome, "bwa_index")
    gatk = get_config("SNV", "GATK")
    path_pon = os.path.join(path, "pon")
    if not os.path.exists(path_pon):
        os.mkdir(path_pon)

    # Blank lines (e.g. a trailing newline) are skipped.
    with open(list, "r") as file:
        sample_info = [line.split() for line in file if line.strip()]
    for sample in sample_info:
        if len(sample) < 2:
            raise ValueError(
                "[ERROR] Sample list line needs at least a sample name and "
                "a BAM file: {!r}".format(" ".join(sample)))

    pool = mp.Pool(processes=6)
    results = []
    try:
        for sample in sample_info:
            normalvcf = os.path.join(
                path_pon, "{}_tumor-only.vcf.gz".format(sample[0]))
            if interval:
                normalvcf_cmd = normalvcf_cmd_script.format(
                    gatk=gatk,
                    index=index,
                    normalbam=sample[1],
                    samplename=sample[0],
                    normalvcf=normalvcf,
                    interval=interval)
            else:
                normalvcf_cmd = normalvcf_cmd_script1.format(
                    gatk=gatk,
                    index=index,
                    normalbam=sample[1],
                    samplename=sample[0],
                    normalvcf=normalvcf)
            results.append(
                pool.apply_async(
                    run_cmd, ("creat normal vcf file", "".join(normalvcf_cmd))))
        pool.close()
        pool.join()
    finally:
        # After a normal close()/join() the workers have already exited; on
        # an error during submission this stops the workers instead of
        # leaving the pool behind.
        pool.terminate()
    # Re-raise any exception that happened inside a worker.
    for result in results:
        result.get()

    normalargs = listofvcf(path_pon)
    ponvcf = os.path.join(path, "pon.vcf.gz")
    pon_cmd = pon_cmd_script.format(gatk=gatk,
                                    normalvcfs=normalargs,
                                    ponvcf=ponvcf)
    run_cmd("create panel of normals", "".join(pon_cmd))
Esempio n. 17
0
def mutect2(genome, normalname, normalbam, tumorname, tumorbam, vcffile, pon,
            germline):
    """
    Mutect2 calls somatic SNVs and indels via local assembly of haplotypes. This function requires both
    the tumor BAM file and its matched normal BAM file. tumorname and normalname should be consistent with the ReadGroup (ID) of the tumor
    BAM file and the normal BAM file respectively. PoN refers to the panel of normals callset (for more information about the PoN and how to
    create it, please see PoN_ ). The germline resource, also in VCF format, is used to annotate variant alleles. The default germline resource is
    downloaded from here_ .

    .. _here: https://software.broadinstitute.org/gatk/download/bundle
    .. _PoN: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php

    Usage:

    * Simplified Mutect2 command line
      ::
        # single sample
        baseq-SNV run_mutect2 -g hg37 -n normal -N normal_marked_bqsr.bam \\
                                      -t tumor -T tumor_marked_bqsr.bam -o ./
        # multiple samples
        baseq-SNV run_mutect2 -g hg37 -l sample_list.txt -o ./

    * Specify the PoN (panel of normals) VCF file and the germline VCF file.
      The default germline VCF file comes from the GATK resource bundle and is used if germline isn't designated.
      ::
        # single sample
        baseq-SNV run_mutect2 -g hg37 -n normal -N normal_marked_bqsr.bam \\
                                      -t tumor -T tumor_marked_bqsr.bam -o ./ \\
                                      -p pon.vcf.gz -G af-only-gnomad.raw.sites.b37.vcf.gz
        # multiple samples
        baseq-SNV run_mutect2 -g hg37 -l sample_list.txt -o ./ \\
                                      -p pon.vcf.gz -G af-only-gnomad.raw.sites.b37.vcf.gz

    """

    ref_section = "SNV_ref_" + genome
    # Fields shared by the simplified and the standard command templates.
    shared = dict(gatk=get_config("SNV", "GATK"),
                  index=get_config(ref_section, "bwa_index"),
                  normalbam=normalbam,
                  normalname=normalname,
                  tumorbam=tumorbam,
                  tumorname=tumorname,
                  vcffile=vcffile)

    if not pon:
        # Without a PoN the simplified template is used; germline is ignored.
        mutect2_cmd = mutect2_cmd_simplified_script.format(**shared)
    else:
        if not germline:
            # Fall back to the germline resource configured for this genome.
            germline = get_config(ref_section, "germline")
        mutect2_cmd = mutect2_cmd_stardand_script.format(germline=germline,
                                                         pon=pon,
                                                         **shared)

    run_cmd("mutect annlysis", "".join(mutect2_cmd))