def testSNPsAndIndelStartAndEndPos(self):
    """
    Checks that the start and end positions of SNPs and indels are rendered as defined by the NCI's MAF
    specification (https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification).

    The VCF is annotated with no datasources and rendered to a TSV. Every rendered row whose "start"
    matches one of the known positions below must carry the corresponding "end" value.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.snps.indels.vcf")
    tsv_path = os.path.join("out", "example.snps.indels.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # start position -> end position expected on the same row
    expected_end_by_start = {
        "16890445": "16890445",
        "154524458": "154524459",
        "114189432": "114189433",
    }

    for row in GenericTsvReader(tsv_path):
        start = row['start']
        if start not in expected_end_by_start:
            # Rows at other positions are not checked.
            continue
        expected_end = expected_end_by_start[start]
        self.assertEqual(row["end"], expected_end,
                         "The value should be %s but it was %s." % (expected_end, row["end"]))
def testDuplicateAnnotation(self):
    """
    Tests that the duplicate annotations are parsed correctly.

    Renders example.duplicate_annotation.vcf to a TSV (no datasources) and checks that both
    "variant_status" and "sample_variant_status" are present in the header and in the first row, with
    the values "2" and "0" respectively.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
    outputFilename = os.path.join("out", "example.duplicate_annotation.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    tsvReader = GenericTsvReader(outputFilename)

    fieldnames = tsvReader.getFieldNames()
    self.assertIn("variant_status", fieldnames, "variant_status field is missing in the header.")
    self.assertIn("sample_variant_status", fieldnames, "sample_variant_status is missing in the header.")

    # Built-in next() rather than the Python 2-only .next() method; only the first row is inspected.
    row = next(tsvReader)
    self.assertIn("variant_status", row, "variant_status field is missing in the row.")
    self.assertIn("sample_variant_status", row, "sample_variant_status is missing in the row.")

    self.assertEqual("2", row["variant_status"], "Incorrect value of variant_status.")
    self.assertEqual("0", row["sample_variant_status"], "Incorrect value of sample_variant_status")
def testAnnotationRoundTripEmpty(self):
    """Annotate a VCF with no datasources, write it out as VCF, and confirm the result can be read again."""
    options = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}
    source_vcf = os.path.join("testdata", "m2_support", "NA12878.ob_filtered.vcf")
    target_vcf = os.path.join("out", "test_round_trip_empty_annotated.vcf")

    # VCF in, VCF out; the datasource directory is a deliberately bogus path.
    run_spec = RunSpecificationFactory.create_run_spec(
        "VCF", "VCF", source_vcf, target_vcf,
        datasource_dir="THIS_DIR_DOES_NOT_EXIST__",
        genomeBuild="hg19",
        other_opts=options)

    annotator = Annotator()
    annotator.initialize(run_spec)
    annotated_filename = annotator.annotate()

    # Read the freshly written file back in with the same options.
    reread = VcfInputMutationCreator(
        annotated_filename,
        MutationDataFactory(allow_overwriting=True),
        other_options=options)
    reread_mutations = list(reread.createMutations())
    self.assertTrue(len(reread_mutations) > 0)
def testSampleNameSelectorWithVCF(self):
    """
    Tests SampleNameSelector against mutations created from a VCF.

    The selector is built from the first mutation of example.1row.vcf. Every mutation of the file must
    then resolve to one of the expected sample names, and the selector must report "INPUT" as the
    annotation source and "sample_name" as the output annotation name.
    """
    # Named "creator" rather than "input" so the builtin input() is not shadowed.
    creator = VcfInputMutationCreator("testdata/vcf/example.1row.vcf")
    first_mut = next(creator.createMutations())
    s = SampleNameSelector(first_mut)

    expected = ["NA 00001", "NA 00002", "NA 00003"]
    for mut in creator.createMutations():
        self.assertIn(s.getSampleName(mut), expected)

    self.assertEqual(s.getAnnotationSource(), "INPUT")
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(s.getOutputAnnotationName(), "sample_name")
def testFailureWithSpanningDeletion(self):
    """Fail with a spanning deletion unless alternates are being ignored.

    Creates mutations from simple_vcf_spanning_deletion.vcf without the skip-alternates option and
    walks the whole generator.
    """
    # NOTE(review): nothing in this block asserts that a failure occurs; presumably the expectation is
    # enforced outside of it (e.g. by a decorator) -- confirm.
    vcf_path = os.path.join("testdata", "simple_vcf_spanning_deletion.vcf")
    creator = VcfInputMutationCreator(vcf_path, MutationDataFactory(allow_overwriting=True))

    # Exhaust the generator so every record in the file gets parsed.
    for _mutation in creator.createMutations():
        pass
def testNumberGRenderingOfRandomVcf(self):
    """
    Renders number_g.random.vcf to a TSV with no datasources.

    There are no explicit assertions; the run is expected to complete without raising.
    """
    vcf_path = os.path.join("testdata", "vcf", "number_g.random.vcf")
    tsv_path = os.path.join("out", "number_g.random.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()
def testDuplicateAnnotationMetaData(self):
    """
    Checks that the creator's metadata contains both annotation names when duplicate annotations are
    present in the input VCF file.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
    metadata = VcfInputMutationCreator(vcf_path).getMetadata()

    # (annotation name, failure message) pairs, checked in order
    required = (
        ("variant_status", "variant_status field is missing in metadata."),
        ("sample_variant_status", "sample_variant_status is missing in metadata."),
    )
    for annotation_name, failure_message in required:
        self.assertTrue(annotation_name in metadata, failure_message)
def testSwitchedFieldsWithExampleVcf(self):
    """
    Tests whether the switched tags are ignored.

    Wires a creator for example.bad.switched.fields.vcf and a TSV renderer into an Annotator.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.bad.switched.fields.vcf")
    tsv_path = os.path.join("out", "example.switched.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    # NOTE(review): annotate() is never called in this test, so nothing is rendered and nothing is
    # asserted -- confirm whether that is intentional.
def testSuccesseWithSpanningDeletion(self):
    """Succeed with a spanning deletion since alternates are being ignored.

    With IS_SKIP_ALTS enabled, exactly one mutation must be produced from the file.
    """
    vcf_path = os.path.join("testdata", "simple_vcf_spanning_deletion.vcf")
    skip_alts = {InputMutationCreatorOptions.IS_SKIP_ALTS: True}
    creator = VcfInputMutationCreator(
        vcf_path,
        MutationDataFactory(allow_overwriting=True),
        other_options=skip_alts)

    # Count by walking the generator returned by createMutations().
    seen = sum(1 for _mutation in creator.createMutations())
    self.assertTrue(seen == 1, "There should only have been one mutation seen, instead saw: " + str(seen))
def testSimpleAnnotationWithAComplexVcf(self):
    """
    Checks that a rather complex VCF file (random.vcf) can be parsed and rendered to a TSV without any
    errors. No datasources are added and there are no explicit assertions.
    """
    vcf_path = os.path.join("testdata", "vcf", "random.vcf")
    tsv_path = os.path.join("out", "random.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()
def testGetMetaDataWithNoSampleNameExampleVcf(self):
    """
    Verifies that the metadata can be retrieved even before createMutations has been called.

    The metadata keys of example.sampleName.removed.vcf must match the expected set exactly.
    """
    expected_keys = {'genotype', 'read_depth', 'genotype_quality', 'haplotype_quality', 'q10', 's50',
                     'samples_number', 'depth_across_samples', 'allele_frequency', 'ancestral_allele',
                     'dbSNP_membership', 'id', 'qual', 'hapmap2_membership'}

    vcf_path = os.path.join("testdata", "vcf", "example.sampleName.removed.vcf")
    creator = VcfInputMutationCreator(vcf_path)
    actual_keys = set(creator.getMetadata().keys())

    # Symmetric difference: keys that are either missing or unexpected.
    mismatched = expected_keys ^ actual_keys
    self.assertTrue(not mismatched,
                    "Missing keys that should have been seen in the metadata: " + str(mismatched))
def testAnnotationWithDuplicateValuesInVcf(self):
    """
    Checks that a VCF in which fields of different kinds (e.g. INFO and FILTER) share the same name can
    be parsed and rendered to a TSV. No datasources are added and there are no explicit assertions.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.duplicate_fields.vcf")
    tsv_path = os.path.join("out", "example.duplicate_fields2.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()
def testSimpleAnnotationWithExampleVcf(self):
    """
    Tests the ability to do a simple Gaf 3.0 annotation.

    example.vcf is annotated with the datasource returned by
    TestUtils.createTranscriptProviderDatasource(self.config) and rendered to a TSV. There are no
    explicit assertions.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.vcf")
    tsv_path = os.path.join("out", "simpleVCF.Gaf.annotated.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = SimpleOutputRenderer(tsv_path, [])

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    transcript_datasource = TestUtils.createTranscriptProviderDatasource(self.config)
    annotator.addDatasource(transcript_datasource)
    annotator.annotate()
def testGenotypeFieldIsHonored(self):
    """
    Checks that no issues arise with genotype values >1 when multiple variants appear on one line.

    Counts the mutations of example.severalGTs.vcf whose "alt_allele_seen" is true: there must be 7 of
    them and none may belong to sample "NA 00001".
    """
    vcf_path = os.path.join("testdata", "vcf", "example.severalGTs.vcf")
    creator = VcfInputMutationCreator(vcf_path)

    alt_seen_count = 0
    for mutation in creator.createMutations():
        if not MutUtils.str2bool(mutation["alt_allele_seen"]):
            continue
        self.assertTrue(mutation['sample_name'] != "NA 00001")
        alt_seen_count += 1

    self.assertTrue(alt_seen_count == 7,
                    str(alt_seen_count) + " mutations with alt seen, but expected 7. './.' should not show as a variant.")
def testAnnotationWithExampleVcf(self):
    """
    Tests whether parsed annotations match the actual annotations in a simple TSV.

    example.vcf is annotated with no datasources, rendered to a TSV, and compared column by column with
    example.expected.out.tsv. Two cells match when they are equal or when both are null.

    Missing format fields yield -->"" ".,." --> ","
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index is the number of comment lines reported by the reader, presumably so that
    # pandas skips the comment lines at the top of the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare with ==, not "is": identity of int objects is an implementation detail.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                    "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or when both sides are null.
        matches = ((current[colName] == expected[colName]) |
                   (pandas.isnull(current[colName]) & pandas.isnull(expected[colName])))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName + ": \n" +
                        str(current[colName]) + "\nvs\n" + str(expected[colName]))
def testMissingFilter(self):
    """
    Tests that the missing FILTER fields are parsed correctly.

    example.missing_filters.vcf is annotated with no datasources, rendered to a TSV, and compared column
    by column with example.expected.missing_filters.out.tsv. Two cells match when they are equal or when
    both are null.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index is the number of comment lines reported by the reader, presumably so that
    # pandas skips the comment lines at the top of the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare with ==, not "is": identity of int objects is an implementation detail.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                    "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or when both sides are null.
        matches = ((current[colName] == expected[colName]) |
                   (pandas.isnull(current[colName]) & pandas.isnull(expected[colName])))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName)
def testTCGAMAFRendering(self):
    """
    Checks the ability to render a germline VCF file as a TCGA MAF file.

    example.vcf is annotated with the manual overrides from self._createTCGAMAFOverridesForVCF() and
    every datasource from self._createDatasourceCorpus(); the rendered file is then handed to
    self._validateTcgaMafContents().
    """
    vcf_path = os.path.join("testdata", "vcf", "example.vcf")
    maf_path = os.path.join("out", "example.vcf.maf.annotated")

    creator = VcfInputMutationCreator(vcf_path)
    creator.createMutations()
    renderer = TcgaMafOutputRenderer(maf_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.setManualAnnotations(self._createTCGAMAFOverridesForVCF())
    for datasource in self._createDatasourceCorpus():
        annotator.addDatasource(datasource)

    rendered_filename = annotator.annotate()
    self._validateTcgaMafContents(rendered_filename)
def testSplitByNumberOfAltsWithFile(self):
    """
    Tests whether we properly determine that a field is split using an actual file.

    The mutations created from example.split.tags.vcf are walked alongside the raw records read with
    vcf.Reader. For each checked annotation whose VCF key is present in the record's INFO, the presence
    of TagConstants.SPLIT among the annotation's tags must equal the expected flag.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.split.tags.vcf")
    creator = VcfInputMutationCreator(inputFilename)

    # annotation name -> whether the annotation is expected to carry the SPLIT tag
    isSplit = {'read_depth': False, 'ESP_MAF': False, 'allele_frequency': True}
    # annotation name -> key looked up in the raw record's INFO
    mapVcfFields2Tsv = {'read_depth': 'DP', 'ESP_MAF': 'ESP_MAF', 'allele_frequency': 'AF'}

    muts = creator.createMutations()
    vcfReader = vcf.Reader(filename=inputFilename, strict_whitespace=True)

    chrom = None
    pos = None
    variant = None
    for m in muts:
        # Advance the raw reader whenever the mutation's chromosome or start position changes.
        if (chrom != m['chr']) or (pos != m['start']):
            chrom = m['chr']
            pos = m['start']
            # Built-in next() rather than the Python 2-only .next() method.
            variant = next(vcfReader)

        for annotationName in isSplit:
            if mapVcfFields2Tsv[annotationName] in variant.INFO:
                a = m.getAnnotation(annotationName)
                self.assertTrue((TagConstants.SPLIT in a.getTags()) == isSplit[annotationName],
                                "Is " + annotationName + " split for chrom " + chrom + ", pos " +
                                str(pos) + "? " + str(isSplit[annotationName]) + ", but saw: " +
                                str(TagConstants.SPLIT in a.getTags()))
def testBasicCreationWithExampleVcf(self):
    """
    Parses example.vcf twice and checks that 27 mutations are produced on each pass.

    After the first pass, the final mutation must be on chromosome "21" with start 1234569.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.vcf")
    creator = VcfInputMutationCreator(vcf_path)

    # createMutations() returns a generator, so the mutations are counted by walking it.
    seen = 0
    last = None
    for last in creator.createMutations():
        seen += 1
    self.assertTrue(seen == 27,
                    "Should have seen 27 (# REF alleles x # samples) mutations, but saw: " + str(seen))
    self.assertTrue((last.chr == "21") and (last.start == 1234569),
                    "Last mutation was not correct: " + str(last))

    # The creator is reset before asking for the mutations a second time.
    creator.reset()
    seen = sum(1 for _mutation in creator.createMutations())
    self.assertTrue(seen == 27, "Should have seen 27 called mutations, but saw: " + str(seen))
def testAnnotationWithNoSampleNameExampleVcf(self):
    """
    Annotates a VCF file that has no samples (example.sampleName.removed.vcf) with no datasources and
    renders it to a TSV. There are no explicit assertions.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.sampleName.removed.vcf")
    tsv_path = os.path.join("out", "example.sampleName.removed.out.tsv")

    creator = VcfInputMutationCreator(vcf_path)
    renderer = SimpleOutputRenderer(tsv_path)

    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()
def testAnnotationWithExampleVcf(self):
    """
    Tests whether parsed annotations match the actual annotations in a simple TSV.

    example.vcf is annotated with no datasources, rendered to a TSV, and compared column by column with
    example.expected.out.tsv. Two cells match when they are equal or when both are null.

    Missing format fields yield -->"" ".,." --> ","
    """
    inputFilename = os.path.join("testdata", "vcf", "example.vcf")
    outputFilename = os.path.join("out", "example.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index is the number of comment lines reported by the reader, presumably so that
    # pandas skips the comment lines at the top of the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare with ==, not "is": identity of int objects is an implementation detail.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                    "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or when both sides are null.
        matches = ((current[colName] == expected[colName]) |
                   (pandas.isnull(current[colName]) & pandas.isnull(expected[colName])))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName + ": \n" +
                        str(current[colName]) + "\nvs\n" + str(expected[colName]))
def testBasicCreationWithExampleVcf(self):
    """
    Parses example.vcf twice and checks that 27 mutations are produced on each pass.

    After the first pass, the final mutation must be on chromosome "21" with start 1234569.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.vcf")
    creator = VcfInputMutationCreator(vcf_path)

    # createMutations() returns a generator, so the mutations are counted by walking it.
    seen = 0
    last = None
    for last in creator.createMutations():
        seen += 1
    self.assertTrue(seen == 27,
                    "Should have seen 27 (# REF alleles x # samples) mutations, but saw: " + str(seen))
    self.assertTrue((last.chr == "21") and (last.start == 1234569),
                    "Last mutation was not correct: " + str(last))

    # The creator is reset before asking for the mutations a second time.
    creator.reset()
    seen = sum(1 for _mutation in creator.createMutations())
    self.assertTrue(seen == 27, "Should have seen 27 called mutations, but saw: " + str(seen))
def testMissingFilter(self):
    """
    Tests that the missing FILTER fields are parsed correctly.

    example.missing_filters.vcf is annotated with no datasources, rendered to a TSV, and compared column
    by column with example.expected.missing_filters.out.tsv. Two cells match when they are equal or when
    both are null.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.missing_filters.vcf")
    outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
    expectedOutputFilename = os.path.join("testdata", "vcf", "example.expected.missing_filters.out.tsv")

    creator = VcfInputMutationCreator(inputFilename)
    creator.createMutations()
    renderer = SimpleOutputRenderer(outputFilename)
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(renderer)
    annotator.annotate()

    # The header row index is the number of comment lines reported by the reader, presumably so that
    # pandas skips the comment lines at the top of the rendered file.
    tsvReader = GenericTsvReader(outputFilename)
    current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
    expected = pandas.read_csv(expectedOutputFilename, sep='\t')

    currentColNames = set(current.columns)
    expectedColNames = set(expected.columns)

    # Compare with ==, not "is": identity of int objects is an implementation detail.
    self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                    "Should have the same columns")
    self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

    for colName in currentColNames:
        # A cell matches when the values are equal or when both sides are null.
        matches = ((current[colName] == expected[colName]) |
                   (pandas.isnull(current[colName]) & pandas.isnull(expected[colName])))
        self.assertTrue(sum(matches) == len(current.index),
                        "Should have the same values in column " + colName)
def testSplitByNumberOfAltsWithFile(self):
    """
    Tests whether we properly determine that a field is split using an actual file.

    The mutations created from example.split.tags.vcf are walked alongside the raw records read with
    vcf.Reader. For each checked annotation whose VCF key is present in the record's INFO, the presence
    of TagConstants.SPLIT among the annotation's tags must equal the expected flag.
    """
    inputFilename = os.path.join("testdata", "vcf", "example.split.tags.vcf")
    creator = VcfInputMutationCreator(inputFilename)

    # annotation name -> whether the annotation is expected to carry the SPLIT tag
    isSplit = {'read_depth': False, 'ESP_MAF': False, 'allele_frequency': True}
    # annotation name -> key looked up in the raw record's INFO
    mapVcfFields2Tsv = {'read_depth': 'DP', 'ESP_MAF': 'ESP_MAF', 'allele_frequency': 'AF'}

    muts = creator.createMutations()
    vcfReader = vcf.Reader(filename=inputFilename, strict_whitespace=True)

    chrom = None
    pos = None
    variant = None
    for m in muts:
        # Advance the raw reader whenever the mutation's chromosome or start position changes.
        if (chrom != m['chr']) or (pos != m['start']):
            chrom = m['chr']
            pos = m['start']
            # Built-in next() rather than the Python 2-only .next() method.
            variant = next(vcfReader)

        for annotationName in isSplit:
            if mapVcfFields2Tsv[annotationName] in variant.INFO:
                a = m.getAnnotation(annotationName)
                self.assertTrue((TagConstants.SPLIT in a.getTags()) == isSplit[annotationName],
                                "Is " + annotationName + " split for chrom " + chrom + ", pos " +
                                str(pos) + "? " + str(isSplit[annotationName]) + ", but saw: " +
                                str(TagConstants.SPLIT in a.getTags()))
def testOverwriteAnnotationsSupported(self):
    """White-box check that mutations from VcfInputMutationCreator honour the factory's overwrite setting.

    Mutations built with a default MutationDataFactory must have a truthy _new_required; mutations built
    with allow_overwriting=True must have a falsy one.
    """
    vcf_path = os.path.join("testdata", "vcf", "example.trailing_whitespace_in_alleles.vcf")

    strict_creator = VcfInputMutationCreator(vcf_path, MutationDataFactory())
    permissive_creator = VcfInputMutationCreator(vcf_path, MutationDataFactory(allow_overwriting=True))

    for mutation in strict_creator.createMutations():
        self.assertTrue(mutation._new_required)

    for mutation in permissive_creator.createMutations():
        self.assertFalse(mutation._new_required)
def testSimpleRoundTripWithoutAnnotating(self):
    """Read a VCF, render it straight back out as VCF, and confirm the written file can be read again."""
    options = {OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS: True}

    source_vcf = os.path.join("testdata", "m2_support", "NA12878.ob_filtered.vcf")
    first_pass = VcfInputMutationCreator(
        source_vcf,
        MutationDataFactory(allow_overwriting=True),
        other_options=options)
    mutations = list(first_pass.createMutations())

    # Write the mutations out without running any annotation step.
    round_trip_vcf = os.path.join("out", "test_round_trip.vcf")
    writer = VcfOutputRenderer(round_trip_vcf, otherOptions=options)
    writer.renderMutations(mutations)

    # Read the written file back in with the same options.
    second_pass = VcfInputMutationCreator(
        round_trip_vcf,
        MutationDataFactory(allow_overwriting=True),
        other_options=options)
    reread_mutations = list(second_pass.createMutations())
    self.assertTrue(len(reread_mutations) > 0)