Ejemplo n.º 1
0
def vcfanno(job, config, name, samples, input_vcf):
    """Take the specified VCF and use vcfanno to add additional annotations
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.vcfanno.snpEff.{}.vcf".format(name, config['snpeff']['reference'])
    logfile = "{}.vcfanno.log".format(name)

    command = ["{}".format(config['vcfanno']['bin']),
               "-p",
               "{}".format(config['vcfanno']['num_cores']),
               "--lua",
               "{}".format(config['vcfanno']['lua']),
               "{}".format(samples[name]['vcfanno_config']),
               "{}".format(input_vcf),
               ">",
               "{}".format(output_vcf)]

    job.fileStore.logToMaster("VCFAnno Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_vcf
Ejemplo n.º 2
0
def snpeff(job, config, name, input_vcf):
    """Annotate the specified VCF using snpEff
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.snpEff.{}.vcf".format(name, config['snpeff']['reference'])
    logfile = "{}.snpeff.log".format(name)

    snpeff_command = ["{}".format(config['snpeff']['bin']),
                      "-Xmx{}g".format(config['snpeff']['max_mem']),
                      "-v",
                      "{}".format(config['snpeff']['reference']),
                      "{}".format(input_vcf),
                      ">"
                      "{}".format(output_vcf)]

    job.fileStore.logToMaster("snpEff Command: {}\n".format(snpeff_command))
    pipeline.run_and_log_command(" ".join(snpeff_command), logfile)

    return output_vcf
Ejemplo n.º 3
0
def gemini(job, config, name, input_vcf):
    """Take the specified VCF and use GEMINI to add additional annotations and convert to database format
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns:  str -- The output GEMINI database name.
    """

    db = "{}.snpEff.{}.db".format(name, config['snpeff']['reference'])
    logfile = "{}.gemini.log".format(name)

    command = ["{}".format(config['gemini']['bin']),
               "load",
               "--cores",
               "{}".format(config['gemini']['num_cores']),
               "--save-info-string",
               "-v",
               "{}".format(input_vcf),
               "-t",
               "snpEff",
               "{}".format(db)]

    job.fileStore.logToMaster("GEMINI Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return db
Ejemplo n.º 4
0
def merge_sam(job, config, name, input_bams):
    """Run Picard MergeSamFiles
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bams: The list of input_bam files to merge.
    :type input_bams: str.
    :returns:  str -- The output bam file name.
    """

    output_sam = "{}.merged.sorted.bam".format(name)
    logfile = "{}.mergesam.log".format(name)

    bam_string = " I=".join(input_bams)

    command = ["{}".format(config['picard-merge']['bin']),
               "MergeSamFiles",
               "I={}".format(bam_string),
               "O={}".format(output_sam),
               "USE_THREADING=True"]

    job.fileStore.logToMaster("Picard MergeSam Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_sam
Ejemplo n.º 5
0
def realign_target_creator(job, config, name, input_bam):
    """Run GATK TargetCreator on the specified BAM to identify targets for realignment
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The file name of the targets file.
    """

    targets = "{}.targets.intervals".format(name)
    targets_log = "{}.targetcreation.log".format(name)

    command = ["{}".format(config['gatk-realign']['bin']),
               "-T",
               "RealignerTargetCreator",
               "-R",
               "{}".format(config['reference']),
               "-I",
               "{}".format(input_bam),
               "-o",
               "{}".format(targets),
               "-known",
               "{}".format(config['indel1']),
               "-known",
               "{}".format(config['indel2']),
               "-nt",
               "{}".format(config['gatk-realign']['num_cores'])
               ]

    job.fileStore.logToMaster("GATK RealignerTargetCreator Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), targets_log)

    return targets
Ejemplo n.º 6
0
def mark_duplicates(job, config, name, input_bam):
    """Run Picard MarkDuplicates
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output bam file name.
    """

    job.fileStore.logToMaster("Running MarkDuplicates for sample: {}".format(name))

    metrics_file = "{}.dedup.metrics".format(name)
    output_bam = "{}.dedup.sorted.bam".format(name)
    logfile = "{}.markduplicates.log".format(name)

    command = ["{}".format(config['picard-dedup']['bin']),
               "MarkDuplicates",
               "CREATE_INDEX=true",
               "METRICS_FILE={}".format(metrics_file),
               "INPUT={}".format(input_bam),
               "OUTPUT={}".format(output_bam)]

    job.fileStore.logToMaster("Picard MarkDuplicates Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return output_bam
Ejemplo n.º 7
0
def diagnose_pooled_targets(job, config, name, regions, samples, input_bam1, input_bam2):
    """Run GATK's DiagnoseTargets against the supplied region
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param regions: regions dictionary key name and tag.
    :type regions: str.
    :param samples: samples dictionary.
    :type samples: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The DiagnoseTargets output vcf file name.
    """

    diagnose_targets_vcf = "{}_{}.diagnosetargets.vcf".format(name, regions)
    missing_intervals = "{}_{}.missing.intervals".format(name, regions)
    logfile = "{}.{}.diagnose_targets.log".format(name, regions)

    command = ["{}".format(config['gatk']['bin']),
               "-T",
               "DiagnoseTargets",
               "-R",
               "{}".format(config['reference']),
               "-L",
               "{}".format(samples[name][regions]),
               "--coverage_status_threshold",
               "{}".format(config['coverage_loci_threshold']),
               "--bad_mate_status_threshold",
               "{}".format(config['bad_mate_threshold']),
               "--minimum_coverage",
               "{}".format(config['coverage_threshold']),
               "--quality_status_threshold",
               "{}".format(config['quality_loci_threshold']),
               "-I",
               "{}".format(input_bam1),
               "-I",
               "{}".format(input_bam2),
               "-o",
               "{}".format(diagnose_targets_vcf),
               "--missing_intervals",
               "{}".format(missing_intervals)]

    job.fileStore.logToMaster("GATK DiagnoseTargets Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    return diagnose_targets_vcf
Ejemplo n.º 8
0
def filter_variants(job, config, name, input_vcf):
    """Run GATK's VariantFilter on the specified VCF
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.filtered.vcf".format(name)
    filter_log = "{}.variantfiltration.log".format(name)

    filter_command = ["{}".format(config['gatk-filter']['bin']),
                      "-T",
                      "VariantFiltration",
                      "-R",
                      "{}".format(config['reference']),
                      "--filterExpression",
                      "'MQ0 > {}'".format(config['mq0_threshold']),
                      "--filterName",
                      "'HighMQ0'",
                      "--filterExpression",
                      "'DP < {}'".format(config['coverage_threshold']),
                      "--filterName",
                      "'LowDepth'",
                      "--filterExpression",
                      "'QUAL < {}'".format(config['var_qual_threshold']),
                      "--filterName",
                      "'LowQual'",
                      "--filterExpression",
                      "'MQ < {}'".format(config['map_qual_threshold']),
                      "--filterName",
                      "'LowMappingQual'",
                      "--variant",
                      "{}".format(input_vcf),
                      "-o",
                      "{}".format(output_vcf)]

    job.fileStore.logToMaster("GATK VariantFiltration Command: {}\n".format(filter_command))
    pipeline.run_and_log_command(" ".join(filter_command), filter_log)

    return output_vcf
Ejemplo n.º 9
0
def annotate_vcf(job, config, name, input_vcf, input_bam):
    """Run GATK's VariantAnnotation on the specified VCF
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_vcf: The input_vcf file name to process.
    :type input_vcf: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output vcf file name.
    """

    output_vcf = "{}.annotated.vcf".format(name)
    annotation_logfile = "{}.variantannotation.log".format(name)

    annotation_command = ["{}".format(config['gatk-annotate']['bin']),
                          "-T",
                          "VariantAnnotator",
                          "-R",
                          "{}".format(config['reference']),
                          "-nt",
                          "{}".format(config['gatk-annotate']['num_cores']),
                          "--group",
                          "StandardAnnotation",
                          "--dbsnp",
                          "{}".format(config['dbsnp']),
                          "-I",
                          "{}".format(input_bam),
                          "--variant",
                          "{}".format(input_vcf),
                          "-L",
                          "{}".format(input_vcf),
                          "-o",
                          "{}".format(output_vcf)]

    job.fileStore.logToMaster("GATK VariantAnnotator Command: {}\n".format(annotation_command))
    pipeline.run_and_log_command(" ".join(annotation_command), annotation_logfile)

    return output_vcf
Ejemplo n.º 10
0
def realign_indels(job, config, name, input_bam, targets):
    """Run GATK Indel Realignment on the specified BAM
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :param targets: The file name of targets to realign.
    :type targets: str.
    :returns:  str -- The output bam file name.
    """

    output_bam = "{}.realigned.sorted.bam".format(name)
    realign_log = "{}.realignindels.log".format(name)

    command = ["{}".format(config['gatk-realign']['bin']),
               "-T",
               "IndelRealigner",
               "-R",
               "{}".format(config['reference']),
               "-I",
               "{}".format(input_bam),
               "-known",
               "{}".format(config['indel1']),
               "-known",
               "{}".format(config['indel2']),
               "-targetIntervals",
               "{}".format(targets),
               "--read_filter",
               "NotPrimaryAlignment",
               "-o",
               "{}".format(output_bam)]

    job.fileStore.logToMaster("GATK IndelRealigner Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), realign_log)

    return output_bam
Ejemplo n.º 11
0
def add_or_replace_readgroups(job, config, name, input_bam):
    """Run Picard's AddOrReplaceReadGroups on the specified BAM
    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output bam file name.
    """

    job.fileStore.logToMaster("Running AddOrReplaceReadGroups in sample: {}".format(name))

    output_bam = "{}.rg.sorted.bam".format(name)
    logfile = "{}.addreadgroups.log".format(name)
    index_log = "{}.buildindex.log".format(name)

    command = ["{}".format(config['picard-add']['bin']),
               "AddOrReplaceReadGroups",
               "INPUT={}".format(input_bam),
               "OUTPUT={}".format(output_bam),
               "RGID={}".format(name),
               "RGSM={}".format(name),
               "RGLB={}".format(name),
               "RGPL=illumina",
               "RGPU=miseq"]

    command2 = ["{}".format(config['picard-add']['bin']),
                "BuildBamIndex",
                "INPUT={}".format(output_bam)]

    job.fileStore.logToMaster("GATK AddOrReplaceReadGroupsCommand Command: {}\n".format(command))
    pipeline.run_and_log_command(" ".join(command), logfile)

    job.fileStore.logToMaster("GATK BuildBamIndex Command: {}\n".format(command2))
    pipeline.run_and_log_command(" ".join(command2), index_log)

    return output_bam
Ejemplo n.º 12
0
def recalibrator(job, config, name, input_bam):
    """Run GATK Recalibrator on the specified BAM

    :param config: The configuration dictionary.
    :type config: dict.
    :param sample: sample name.
    :type sample: str.
    :param input_bam: The input_bam file name to process.
    :type input_bam: str.
    :returns:  str -- The output bam file name.

    """

    output_bam = "{}.recalibrated.sorted.bam".format(name)
    recal_config = "{}.recal".format(name)
    recal_log = "{}.recalibrate.log".format(name)
    print_log = "{}.printrecalibrated.log".format(name)
    cp_log = "{}.copy.log".format(name)

    # Calculate covariates
    recal_commands = ["{}".format(config['gatk-recal']['bin']),
                      "-T",
                      "BaseRecalibrator",
                      "-R",
                      "{}".format(config['reference']),
                      "-I",
                      "{}".format(input_bam),
                      "-o",
                      "{}".format(recal_config),
                      "--knownSites",
                      "{}".format(config['dbsnp']),
                      "-nct",
                      "{}".format(config['gatk-recal']['num_cores'])]

    # Print recalibrated BAM
    print_reads_command = ["{}".format(config['gatk-recal']['bin']),
                           "-T",
                           "PrintReads",
                           "-R",
                           "{}".format(config['reference']),
                           "-I",
                           "{}".format(input_bam),
                           "-o",
                           "{}".format(output_bam),
                           "-BQSR",
                           "{}".format(recal_config),
                           "-nct",
                           "{}".format(config['gatk-recal']['num_cores'])]

    # Copy index to alternative name
    cp_command = ["cp",
                  "{}.recalibrated.sorted.bai".format(name),
                  "{}.recalibrated.sorted.bam.bai".format(name)]

    job.fileStore.logToMaster("GATK BaseRecalibrator Command: {}\n".format(recal_commands))
    pipeline.run_and_log_command(" ".join(recal_commands), recal_log)

    job.fileStore.logToMaster("GATK PrintReads Command: {}\n".format(print_reads_command))
    pipeline.run_and_log_command(" ".join(print_reads_command), print_log)

    job.fileStore.logToMaster("GATK Copy Command: {}\n".format(cp_command))
    pipeline.run_and_log_command(" ".join(cp_command), cp_log)

    return output_bam