def format_pindel_vcf(input_vcf: str, output_vcf: str) -> None:
    """
    Formats Pindel VCFs to work better with GDC downstream workflows.

    Every input record is rebuilt as a new record against the header returned
    by ``get_header``. When the ``TUMOR`` sample genotype equals ``(0, 0)`` it
    is rewritten to ``(0, 1)``, and ``get_info`` is told whether that rewrite
    happened.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("format_pindel_vcf")
    logger.info("Formats Pindel VCFs.")

    # setup
    total = 0
    mode = get_pysam_outmode(output_vcf)

    # Context managers guarantee that the reader is closed even when building
    # the header or opening the writer fails.
    with pysam.VariantFile(input_vcf) as reader:
        header = get_header(reader.header)
        with pysam.VariantFile(output_vcf, mode=mode,
                               header=header) as writer:
            for record in reader.fetch():
                total += 1

                # Rewrite a (0, 0) tumor genotype to (0, 1) and remember
                # that we did so for get_info.
                tumor = record.samples["TUMOR"]
                flag = tumor["GT"] == (0, 0)
                if flag:
                    tumor["GT"] = (0, 1)

                # Info
                new_info = get_info(record, flag)

                # New record bound to the writer's header
                new_record = writer.new_record()
                new_record.contig = record.contig
                new_record.alleles = record.alleles
                new_record.start = record.start
                new_record.stop = record.stop
                new_record.id = record.id
                new_record.qual = record.qual

                for filter_name in record.filter:
                    new_record.filter.add(filter_name)

                # Each entry of new_info is indexed as (key, value).
                for entry in new_info:
                    new_record.info[entry[0]] = entry[1]

                # Samples are copied by position, FORMAT field by field.
                for idx, sample in enumerate(record.samples):
                    for key, value in record.samples[sample].items():
                        new_record.samples[idx][key] = value

                writer.write(new_record)

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records.".format(total))
def filter_nonstandard_variants(input_vcf: str, output_vcf: str) -> None:
    """
    Remove non-ACTG loci from a VCF.

    A record is dropped (and a warning logged) when any of its alleles,
    upper-cased, contains a character that is not in ``ALLOWED_BASES``.
    All other records are written unchanged.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("filter_nonstandard_variants")
    logger.info("Drops non-ACTG loci from a VCF.")

    # setup
    total = 0
    removed = 0
    written = 0
    mode = get_pysam_outmode(output_vcf)

    # Context managers guarantee that the reader is closed even when opening
    # the writer fails.
    with pysam.VariantFile(input_vcf) as reader:
        with pysam.VariantFile(output_vcf, mode=mode,
                               header=reader.header) as writer:
            for record in reader.fetch():
                total += 1
                alleles = [allele.upper() for allele in record.alleles]
                # Characters used by any allele that are not allowed.
                unexpected = set("".join(alleles)) - ALLOWED_BASES
                if unexpected:
                    # Report whole alleles (e.g. "A,NT"), not single
                    # characters separated by commas.
                    logger.warning("Removing {0}:{1}:{2}".format(
                        record.chrom, record.pos, ",".join(alleles)))
                    removed += 1
                else:
                    written += 1
                    writer.write(record)

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Removed {}; Wrote {} ".format(
        total, removed, written))
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str,
                      output_vcf: str) -> None:
    """
    Transforms dToxoG MAF to minimal VCF of only dtoxo failures.

    Only MAF rows whose ``oxoGCut`` column equals the string ``"1"`` are
    converted (via ``build_new_record``) and written.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    # setup
    total = 0
    written = 0
    tag = "oxog"

    # header
    header = generate_header(reference_fa, tag)
    mode = get_pysam_outmode(output_vcf)

    # Open the input first so that a missing/unreadable MAF does not leave an
    # empty output VCF behind; both handles are closed on any exit path.
    with open(input_maf, "rt") as fh, \
            VariantFile(output_vcf, mode=mode, header=header) as writer:
        for record in maf_generator(fh):
            total += 1
            if record["oxoGCut"] == "1":
                writer.write(build_new_record(record, writer, tag))
                written += 1

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(total, written))
def position_filter_dkfz(input_vcf: str, output_vcf: str) -> None:
    """
    Removes VCF records where the POS-2 is less than 0 which
    will cause an Exception to be thrown in DKFZBiasFilter. We
    assume that the input VCF only contains SNPs, but no assertions
    are made to validate this.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("position_filter_dkfz")
    logger.info("Position Filter for DKFZ.")

    # setup
    total = 0
    removed = 0
    written = 0
    mode = get_pysam_outmode(output_vcf)

    # Context managers guarantee that the reader is closed even when opening
    # the writer fails.
    with pysam.VariantFile(input_vcf) as reader:
        with pysam.VariantFile(output_vcf, mode=mode,
                               header=reader.header) as writer:
            for record in reader.fetch():
                total += 1
                # Drop records whose POS-2 would be negative.
                if record.pos - 2 < 0:
                    removed += 1
                    continue
                written += 1
                writer.write(record)

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Removed {}; Wrote {} ".format(
        total, removed, written))
def add_oxog_filters(input_vcf: str, input_dtoxog: str,
                     output_vcf: str) -> None:
    """
    Adds 'oxog' filter tag to VCFs.

    For each record of the input VCF the dtoxog VCF is queried at the
    record's position; the record is tagged when a dtoxog row has the same
    position and the same (case-insensitive) REF allele. Every input record
    is written, tagged or not.

    :param input_vcf: The full input VCF file to filter.
    :param input_dtoxog: The dtoxog VCF from dtoxog-maf-to-vcf used to annotate the full input VCF.
    :param output_vcf: The output filtered VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("add_oxog_filters")
    logger.info("Adds dtoxog filters to VCF.")

    # setup
    total = 0
    tagged = 0
    written = 0
    filter_tag = "oxog"
    mode = get_pysam_outmode(output_vcf)

    # Both inputs are opened before the output is created, and context
    # managers close every handle even when a later open fails.
    with pysam.VariantFile(input_vcf) as reader, \
            pysam.VariantFile(input_dtoxog) as dtoxog_reader:
        # The writer shares the reader's header, so the new FILTER
        # definition must be added before the writer is opened.
        reader.header.filters.add(filter_tag, None, None, "Failed dToxoG")

        with pysam.VariantFile(output_vcf, mode=mode,
                               header=reader.header) as writer:
            for record in reader.fetch():
                total += 1
                region = "{0}:{1}-{2}".format(record.contig, record.pos,
                                              record.pos)
                try:
                    for row in dtoxog_reader.fetch(region=region):
                        if (record.pos == row.pos
                                and record.ref.upper() == row.ref.upper()):
                            # Add filter if failed oxog
                            record.filter.add(filter_tag)
                            tagged += 1
                            break
                except ValueError:
                    # Deliberate best-effort: a failed lookup leaves the
                    # record untagged. NOTE(review): presumably raised for
                    # contigs absent from the dtoxog VCF - confirm that this
                    # does not also hide a missing index.
                    pass

                # handle case where the INFO column is '.'
                # Iterate over a snapshot: deleting while iterating the live
                # INFO mapping is unsafe.
                for key in list(record.info):
                    if key == ".":
                        del record.info[key]

                written += 1
                writer.write(record)

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Tagged {}; Wrote {} ".format(
        total, tagged, written))
# Example no. 6
# 0
def format_gdc_vcf(
    input_vcf: str,
    output_vcf: str,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    *,
    reference_name: str = "GRCh38.d1.vd1.fa",
) -> None:
    """
    Adds VCF header metadata specific to the GDC.

    The records themselves are copied unchanged; only the header, produced by
    ``build_header``, differs from the input.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    :param patient_barcode: The case submitter id.
    :param case_id: The case uuid.
    :param tumor_barcode: The tumor aliquot submitter id.
    :param tumor_aliquot_uuid: The tumor aliquot uuid.
    :param tumor_bam_uuid: The tumor bam uuid.
    :param normal_barcode: The normal aliquot submitter id.
    :param normal_aliquot_uuid: The normal aliquot uuid.
    :param normal_bam_uuid: The normal bam uuid.
    :param reference_name: Reference name to use in header.
    """
    logger = Logger.get_logger("format_gdc_vcf")
    logger.info("Format GDC tumor/normal paired VCFs.")

    # setup
    mode = get_pysam_outmode(output_vcf)

    # Context managers guarantee that the reader is closed even when building
    # the header or opening the writer fails.
    with pysam.VariantFile(input_vcf) as reader:
        # Load new header
        new_header = build_header(
            reader,
            patient_barcode,
            case_id,
            tumor_barcode,
            tumor_aliquot_uuid,
            tumor_bam_uuid,
            normal_barcode,
            normal_aliquot_uuid,
            normal_bam_uuid,
            reference_name,
        )

        with pysam.VariantFile(output_vcf, mode=mode,
                               header=new_header) as writer:
            for record in reader.fetch():
                writer.write(record)

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)
    def test_get_pysam_outmode(self):
        """Check the write mode returned for plain and '.gz' VCF paths."""
        # (output path, expected pysam write mode)
        expectations = (
            ("fake.vcf", "w"),
            ("fake.vcf.gz", "wz"),
        )
        for path, expected in expectations:
            self.assertEqual(get_pysam_outmode(path), expected)
def filter_somatic_score(
    input_vcf: str,
    output_vcf: str,
    *,
    tumor_sample_name: str = "TUMOR",
    drop_somatic_score: int = 25,
    min_somatic_score: int = 40,
) -> None:
    """
    Filters SomaticSniper VCF files based on the Somatic Score.

    The score is read from the ``SSC`` FORMAT field of the tumor sample.
    Records scoring below ``drop_somatic_score`` are removed; records scoring
    at least ``drop_somatic_score`` but below ``min_somatic_score`` are kept
    and tagged with the filter ``ssc<min_somatic_score>``; all other records
    are written unchanged.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    :param tumor_sample_name: The name of the tumor sample in the VCF.
    :param drop_somatic_score: If the somatic score is < this, remove it.
    :param min_somatic_score: If the somatic score is >= drop_somatic_score and < this value, add ssc filter tag.
    """
    logger = Logger.get_logger("filter_somatic_score")
    logger.info("Filters SomaticSniper VCF files based on Somatic Score.")

    # setup
    total = 0
    removed = 0
    tagged = 0
    written = 0

    filter_tag = "ssc{0}".format(min_somatic_score)
    logger.info("Filter tag: {}".format(filter_tag))
    mode = get_pysam_outmode(output_vcf)

    # Context managers guarantee that the reader is closed even when the
    # header edit or opening the writer fails.
    with pysam.VariantFile(input_vcf) as reader:
        # The writer shares the reader's header, so the new FILTER
        # definition must be added before the writer is opened.
        reader.header.filters.add(
            filter_tag, None, None,
            "Somatic Score < {0}".format(min_somatic_score))

        with pysam.VariantFile(output_vcf, mode=mode,
                               header=reader.header) as writer:
            for record in reader.fetch():
                total += 1
                ssc = record.samples[tumor_sample_name]["SSC"]

                # Too low to keep at all.
                if ssc < drop_somatic_score:
                    removed += 1
                    continue

                # Kept, but flagged as low-scoring.
                if ssc < min_somatic_score:
                    tagged += 1
                    record.filter.add(filter_tag)

                written += 1
                writer.write(record)

    # The index can only be built once the writer has been closed.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info(
        "Processed {} records - Removed {}; Tagged {}; Wrote {} ".format(
            total, removed, tagged, written))