def main(args: "Optional[List[str]]" = None) -> None:
    """
    Main entrypoint for the CLI.

    :param args: Optional list of CLI arguments; defaults to ``sys.argv[1:]``.
    """
    # NOTE: string annotation — the implicit-Optional form ``List[str] = None``
    # violates PEP 484; a string keeps this lazy so no ``Optional`` import is
    # required at module top (main itself is never passed to defopt).
    Logger.setup_root_logger()
    logger = Logger.get_logger("main")

    # Subcommands exposed by the CLI; defopt derives each argument parser
    # from the function's signature and docstring.
    funcs = [
        add_oxog_filters,
        create_dtoxog_maf,
        create_oxog_intervals,
        dtoxog_maf_to_vcf,
        extract_oxoq_from_sqlite,
        filter_contigs,
        filter_nonstandard_variants,
        filter_somatic_score,
        format_gdc_vcf,
        format_pindel_vcf,
        format_sanger_pindel_vcf,
        position_filter_dkfz,
    ]

    defopt.run(
        funcs,
        argv=args if args is not None else sys.argv[1:],
        version=True,
        argparse_kwargs={'prog': 'gdc_filtration_tools'},
    )
    logger.info("Finished!")
def captured_output():
    """
    Captures stderr and stdout and returns them.

    Yields ``(sys.stdout, sys.stderr)`` as in-memory ``StringIO`` buffers
    while the real streams are swapped out; the original streams are always
    restored on exit, even if the body raises.

    NOTE(review): this is a generator intended for use as a context manager
    (``with captured_output() as (out, err): ...``) — confirm it is decorated
    with ``@contextlib.contextmanager`` at the definition site.
    """
    new_out, new_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = new_out, new_err
        # Re-initialize logging so handlers attach to the captured streams
        # rather than the originals.
        Logger.setup_root_logger()
        yield sys.stdout, sys.stderr
    finally:
        # Restore the real streams no matter what happened in the body.
        sys.stdout, sys.stderr = old_out, old_err
def create_oxog_intervals(input_vcf: str, output_file: str) -> None:
    """
    Takes a SNP-only VCF file and creates an interval list for use by
    the Broad oxog metrics tool.

    :param input_vcf: The input SNP-only VCF file to extract intervals from.
    :param output_file: The output interval list to create.
    """
    logger = Logger.get_logger("create_oxog_intervals")
    logger.info("Extracts interval-file for Broad OxoG metrics from VCF.")
    logger.warning("Expects a SNP-Only VCF!!")

    # Count of VCF records seen.
    count = 0

    vcf_in = pysam.VariantFile(input_vcf)

    # Emit one "contig:pos" line per record; the reader is closed even if
    # writing fails partway through.
    try:
        with open(output_file, "wt") as out:
            for rec in vcf_in.fetch():
                count += 1
                out.write("{0}:{1}".format(rec.contig, rec.pos) + "\n")
    finally:
        vcf_in.close()

    logger.info("Processed {} records".format(count))
def format_pindel_vcf(input_vcf: str, output_vcf: str) -> None:
    """
    Formats Pindel VCFs to work better with GDC downstream workflows.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("format_pindel_vcf")
    logger.info("Formats Pindel VCFs.")

    # setup
    total = 0  # number of records processed

    reader = pysam.VariantFile(input_vcf)
    # Project helper builds the (possibly modified) output header from the
    # input header.
    header = get_header(reader.header)
    mode = get_pysam_outmode(output_vcf)
    writer = pysam.VariantFile(output_vcf, mode=mode, header=header)

    # Process
    try:
        for record in reader.fetch():
            total += 1
            # Pindel can emit a homozygous-reference tumor genotype; rewrite
            # (0, 0) to (0, 1) and remember that we did (flag feeds get_info).
            tgt = record.samples["TUMOR"]["GT"]
            flag = tgt == (0, 0)
            if flag:
                record.samples["TUMOR"]["GT"] = (0, 1)

            # Info
            # Project helper returns (key, value) pairs for the new record's
            # INFO column; behavior when flag is set is defined there.
            new_info = get_info(record, flag)

            # New record
            # Rebuild the record against the new header field-by-field;
            # the copy order (contig/alleles before start/stop) matters to
            # pysam's internal consistency checks.
            new_record = writer.new_record()
            new_record.contig = record.contig
            new_record.alleles = record.alleles
            new_record.start = record.start
            new_record.stop = record.stop
            new_record.id = record.id
            new_record.qual = record.qual
            for f in record.filter:
                new_record.filter.add(f)
            for i in new_info:
                new_record.info[i[0]] = i[1]
            # Copy every per-sample FORMAT value across by sample index.
            for i, sample in enumerate(record.samples):
                for k, v in record.samples[sample].items():
                    new_record.samples[i][k] = v
            writer.write(new_record)
    finally:
        reader.close()
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tbx = pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records.".format(total))
def create_dtoxog_maf(
    input_vcf: str,
    output_file: str,
    reference: str,
    oxog_file: str,
    oxoq_score: float,
) -> None:
    """
    Takes a SNP-only VCF file and converts it to the dToxoG MAF format
    which includes the OXOQ value.

    :param input_vcf: The input SNP-only VCF file to convert to dToxoG MAF.
    :param output_file: The output MAF file to create.
    :param reference: Faidx indexed reference fasta file.
    :param oxog_file: Metrics file output from GATK OxoGMetrics tool.
    :param oxoq_score: The oxoQ score.
    """
    logger = Logger.get_logger("create_dtoxog_maf")
    logger.info("Converts a SNP VCF to dToxoG MAF format.")
    logger.warning("Expects a SNP-Only VCF!!")

    # setup
    total = 0  # number of VCF records seen (not all produce a MAF row)

    # Load oxog
    # Project helper parses the GATK OxoGMetrics output into a lookup
    # structure consumed by generate_maf_record.
    oxog = load_oxog(oxog_file)

    # Pysam readers
    vcf_reader = pysam.VariantFile(input_vcf)
    fasta_reader = pysam.FastaFile(reference)

    # Process
    try:
        with open(output_file, "wt") as o:
            # MAF preamble: version pragma followed by the column header row.
            o.write("#version 2.4.1\n")
            o.write("\t".join(MAF_COLUMNS) + "\n")
            for record in vcf_reader.fetch():
                total += 1
                # Returns None for records that should be skipped.
                maf_record = generate_maf_record(record, fasta_reader, oxog,
                                                 oxoq_score, logger)
                if maf_record is not None:
                    # Emit values in the canonical MAF column order.
                    row = list([maf_record[i] for i in MAF_COLUMNS])
                    o.write("\t".join(row) + "\n")
    finally:
        vcf_reader.close()
        fasta_reader.close()

    logger.info("Processed {} records".format(total))
def filter_nonstandard_variants(input_vcf: str, output_vcf: str) -> None:
    """
    Remove non-ACTG loci from a VCF.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("filter_nonstandard_variants")
    logger.info("Drops non-ACTG loci from a VCF.")

    # Counters for the summary log line.
    total = 0
    removed = 0
    written = 0

    vcf_in = pysam.VariantFile(input_vcf)
    out_mode = get_pysam_outmode(output_vcf)
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=vcf_in.header)

    try:
        for rec in vcf_in.fetch():
            total += 1
            # Flatten all alleles into one uppercase base string; any base
            # outside the allowed set disqualifies the whole record.
            bases = ''.join(rec.alleles).upper()
            if set(bases) - ALLOWED_BASES:
                logger.warning("Removing {0}:{1}:{2}".format(
                    rec.chrom, rec.pos, ",".join(bases)))
                removed += 1
            else:
                written += 1
                vcf_out.write(rec)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Removed {}; Wrote {} ".format(
        total, removed, written))
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str, output_vcf: str) -> None:
    """
    Transforms dToxoG MAF to minimal VCF of only dtoxo failures.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records
        BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    # Counters and the FILTER tag applied to emitted records.
    total = 0
    written = 0
    tag = "oxog"

    # Build the minimal VCF header from the reference sequence dictionary.
    vcf_header = generate_header(reference_fa, tag)

    out_mode = get_pysam_outmode(output_vcf)
    vcf_writer = VariantFile(output_vcf, mode=out_mode, header=vcf_header)

    try:
        with open(input_maf, "rt") as maf_fh:
            for maf_row in maf_generator(maf_fh):
                total += 1
                # Only rows flagged as dToxoG failures are converted.
                if maf_row["oxoGCut"] != "1":
                    continue
                vcf_writer.write(build_new_record(maf_row, vcf_writer, tag))
                written += 1
    finally:
        vcf_writer.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(total, written))
def position_filter_dkfz(input_vcf: str, output_vcf: str) -> None:
    """
    Removes VCF records where the POS-2 is less than 0 which will cause
    an Exception to be thrown in DKFZBiasFilter. We assume that the input
    VCF only contains SNPs, but no assertions are made to validate this.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("position_filter_dkfz")
    logger.info("Position Filter for DKFZ.")

    # Counters for the summary log line.
    total = 0
    removed = 0
    written = 0

    vcf_in = pysam.VariantFile(input_vcf)
    out_mode = get_pysam_outmode(output_vcf)
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=vcf_in.header)

    try:
        for rec in vcf_in.fetch():
            total += 1
            # pos < 2 is exactly the pos - 2 < 0 condition that breaks
            # DKFZBiasFilter.
            if rec.pos < 2:
                removed += 1
            else:
                written += 1
                vcf_out.write(rec)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Removed {}; Wrote {} ".format(
        total, removed, written))
def test_generate_maf_record(self):
    """Each VCF record converts to the expected dToxoG MAF row."""
    from gdc_filtration_tools.logger import Logger

    metrics = load_oxog(get_test_data_path("test_oxog_metrics.txt"))
    vcf_path = get_test_data_path("test_input_for_dtoxog.vcf")
    ref_path = get_test_data_path("test_oxog_ref.fa")
    fa_reader = pysam.FastaFile(ref_path)
    vcf_reader = pysam.VariantFile(vcf_path)
    logger = Logger.get_logger("create_dtoxog_maf")
    try:
        # Expected rows in TestCreatedToxoGMaf.exp_maf line up 1:1 with the
        # records in the fixture VCF.
        for idx, rec in enumerate(vcf_reader):
            result = generate_maf_record(rec, fa_reader, metrics, 32.0, logger)
            self.assertEqual(result, TestCreatedToxoGMaf.exp_maf[idx])
    finally:
        fa_reader.close()
        vcf_reader.close()
def add_oxog_filters(input_vcf: str, input_dtoxog: str, output_vcf: str) -> None:
    """
    Adds 'oxog' filter tag to VCFs.

    :param input_vcf: The full input VCF file to filter.
    :param input_dtoxog: The dtoxog VCF from dtoxog-maf-to-vcf used to
        annotate the full input VCF.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("add_oxog_filters")
    logger.info("Adds dtoxog filters to VCF.")

    # setup
    total = 0
    tagged = 0
    written = 0

    # Full vcf reader; register the new FILTER in its header so tagged
    # records validate against the output header below.
    reader = pysam.VariantFile(input_vcf)
    filter_tag = "oxog"
    reader.header.filters.add(filter_tag, None, None, "Failed dToxoG")

    # Writer
    mode = get_pysam_outmode(output_vcf)
    writer = pysam.VariantFile(output_vcf, mode=mode, header=reader.header)

    # dtoxog reader (random-access lookups per input position)
    dtoxog_reader = pysam.VariantFile(input_dtoxog)

    # Process
    try:
        for record in reader.fetch():
            total += 1
            region = "{0}:{1}-{2}".format(record.contig, record.pos,
                                          record.pos)
            try:
                for row in dtoxog_reader.fetch(region=region):
                    # Same position and REF allele => record failed dToxoG.
                    if record.pos == row.pos and record.ref.upper(
                    ) == row.ref.upper():
                        # Add filter if failed oxog (use the registered tag
                        # rather than repeating the literal).
                        record.filter.add(filter_tag)
                        tagged += 1
                        break
            except ValueError:
                # fetch() raises ValueError for contigs missing from the
                # dtoxog VCF index; treat as "no failures at this locus".
                pass

            # handle case where the INFO column is '.'
            # Collect the keys first: deleting from record.info while
            # iterating it mutates the mapping mid-iteration, which can skip
            # entries or raise at runtime.
            for key in [k for k in record.info if k == "."]:
                del record.info[key]

            written += 1
            writer.write(record)
    finally:
        reader.close()
        writer.close()
        dtoxog_reader.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Tagged {}; Wrote {} ".format(
        total, tagged, written))
def format_gdc_vcf(
    input_vcf: str,
    output_vcf: str,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    *,
    reference_name: str = "GRCh38.d1.vd1.fa",
) -> None:
    """
    Adds VCF header metadata specific to the GDC.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    :param patient_barcode: The case submitter id.
    :param case_id: The case uuid.
    :param tumor_barcode: The tumor aliquot submitter id.
    :param tumor_aliquot_uuid: The tumor aliquot uuid.
    :param tumor_bam_uuid: The tumor bam uuid.
    :param normal_barcode: The normal aliquot submitter id.
    :param normal_aliquot_uuid: The normal aliquot uuid.
    :param normal_bam_uuid: The normal bam uuid.
    :param reference_name: Reference name to use in header.
    """
    logger = Logger.get_logger("format_gdc_vcf")
    logger.info("Format GDC tumor/normal paired VCFs.")

    # Reader and output mode
    vcf_in = pysam.VariantFile(input_vcf)
    out_mode = get_pysam_outmode(output_vcf)

    # Build the GDC-annotated header from the input header plus the
    # case/aliquot metadata.
    gdc_header = build_header(
        vcf_in,
        patient_barcode,
        case_id,
        tumor_barcode,
        tumor_aliquot_uuid,
        tumor_bam_uuid,
        normal_barcode,
        normal_aliquot_uuid,
        normal_bam_uuid,
        reference_name,
    )

    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=gdc_header)

    # Records pass through unchanged; only the header differs.
    try:
        for rec in vcf_in.fetch():
            vcf_out.write(rec)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)
def filter_somatic_score(
    input_vcf: str,
    output_vcf: str,
    *,
    tumor_sample_name: str = "TUMOR",
    drop_somatic_score: int = 25,
    min_somatic_score: int = 40,
) -> None:
    """
    Filters SomaticSniper VCF files based on the Somatic Score.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    :param tumor_sample_name: The name of the tumor sample in the VCF.
    :param drop_somatic_score: If the somatic score is < this, remove it.
    :param min_somatic_score: If the somatic score is > drop_somatic_score
        and < this value, add ssc filter tag.
    """
    logger = Logger.get_logger("filter_somatic_score")
    logger.info("Filters SomaticSniper VCF files based on Somatic Score.")

    # Counters for the summary log line.
    total = 0
    removed = 0
    tagged = 0
    written = 0

    vcf_in = pysam.VariantFile(input_vcf)

    # Register the soft-filter FILTER tag (e.g. "ssc40") in the header.
    filter_tag = "ssc{0}".format(min_somatic_score)
    logger.info("Filter tag: {}".format(filter_tag))
    vcf_in.header.filters.add(
        filter_tag, None, None,
        "Somatic Score < {0}".format(min_somatic_score))

    out_mode = get_pysam_outmode(output_vcf)
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode,
                                header=vcf_in.header)

    try:
        for rec in vcf_in.fetch():
            total += 1
            score = rec.samples[tumor_sample_name]["SSC"]
            # Hard drop below the lower threshold.
            if score < drop_somatic_score:
                removed += 1
                continue
            # Soft-tag records between the two thresholds.
            if score < min_somatic_score:
                tagged += 1
                rec.filter.add(filter_tag)
            written += 1
            vcf_out.write(rec)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info(
        "Processed {} records - Removed {}; Tagged {}; Wrote {} ".format(
            total, removed, tagged, written))