def testESPCoverageAnnotationWithMissingAnnotationValuesIndelAvgMatch(self):
    """Annotate a multi-base chrX range with the averaging ESP coverage datasource.

    The range X:100075350-100075356 is annotated and the resulting
    ESP_AvgSampleReadDepth annotation must equal a Float annotation whose
    value is "91.25".
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    config_path = os.path.join(ds_dir, "small_esp_coverage_avg_ds.config")
    esp_ds = DatasourceFactory.createDatasource(config_path, ds_dir)

    mutation = MutationData()
    mutation.chr = "X"
    mutation.start = "100075350"
    mutation.end = "100075356"

    annotated = esp_ds.annotate_mutation(mutation)
    observed = annotated.getAnnotation("ESP_AvgSampleReadDepth")
    expected = Annotation(
        value="91.25",
        datasourceName="ESP",
        dataType="Float",
        description="",
        tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
        number=None)
    self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def test_continuous_exons_in_segments(self):
    """Test that all exons are accounted when annotating adjacent segments that skip an exon.

    Two segments on chr22 are built; the exon affected by the start of the
    first and by the end of the second are checked against (10, '+') and
    (8, '-') respectively.
    """
    # SPECC1L 10+	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L 8-	22	16282318	POTEH	2-	24730543	SPECC1L	8-	433.0	-0.00781166374668759	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    # SPECC1L-ADORA2A	22	24734447	SPECC1L	10+	41783674	TEF	1-	1215.0	-0.04975556624325125	hg19	CESC.TCGA.BI.A0VR.Tumor.SM.1RACM
    seg1 = MutationData()
    seg1.chr = "22"
    seg1.start = "24734447"  # Just passed the exon 9 (0-based)
    seg1.end = "41783674"

    seg2 = MutationData()
    seg2.chr = "22"
    seg2.start = "16282318"
    seg2.end = "24730543"  # Just passed the exon 8 (0-based)

    # 'ENST00000314328.9' for GENCODE v19
    chosen_tx, transcript_ds = self._get_chosen_tx_and_transcript_ds(seg1.chr, seg1.start)

    # assertEqual (rather than assertTrue on ==) reports the actual tuple on failure.
    result_tuple = transcript_ds._determine_exons_affected_by_start(seg1.start, chosen_tx)
    self.assertEqual(result_tuple, (10, '+'))

    result_tuple = transcript_ds._determine_exons_affected_by_end(seg2.end, chosen_tx)
    self.assertEqual(result_tuple, (8, '-'))
def testESPCoverageAnnotationWithSNPAvgMatch(self):
    """Annotate a single chrX position with the averaging ESP coverage datasource.

    Position X:100075334 is annotated and three of the resulting annotations
    are compared against hand-built expected Annotation objects.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_avg_ds", "hg19")
    esp_ds = DatasourceFactory.createDatasource(
        os.path.join(ds_dir, "small_esp_coverage_avg_ds.config"), ds_dir)

    mutation = MutationData()
    mutation.chr = "X"
    mutation.start = "100075334"
    mutation.end = "100075334"
    annotated = esp_ds.annotate_mutation(mutation)

    # (annotation name, expected value, expected data type)
    expectations = [
        ("ESP_AvgAAsampleReadDepth", "75.0", "Float"),
        ("ESP_TotalAAsamplesCovered", "692.0", "Float"),
        ("ESP_Chromosome", "X", "String"),
    ]
    for annotation_name, expected_value, expected_type in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value=expected_value,
            datasourceName="ESP",
            dataType=expected_type,
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Annotate position 1:35140 with the exact-match dbNSFP datasource.

    The datasource used here is the "no_ref_alt" test set. Three resulting
    String annotations (codon position, reference codon, CDS strand) are
    compared against hand-built expected Annotation objects.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    config_path = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(config_path, ds_dir)

    mutation = MutationData()
    mutation.chr = "1"
    mutation.start = "35140"
    mutation.end = "35140"
    annotated = dbnsfp_ds.annotate_mutation(mutation)

    # (annotation name, expected value)
    expectations = [
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    ]
    for annotation_name, expected_value in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value=expected_value,
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def _combine_mutations(mutations):
    """Merge multiple adjacent mutations into a single new mutation.

    :param mutations: an ordered list of MutationData
    :returns: None for an empty list, the sole element itself (not a copy)
        for a one-element list, otherwise a new MutationData whose alleles
        are the concatenation of the inputs and whose other annotations are
        the "|"-joined, sorted, distinct non-empty values of the inputs.

    :warning: _combine_mutations does not make any attempt to sanity check
        input mutations; it will happily combine overlapping and
        non-adjacent mutations on disparate chromosomes
    """
    if len(mutations) == 0:
        return None
    if len(mutations) == 1:
        return mutations[0]

    # special logic for the attributes
    # NOTE(review): min/max compare the raw start/end values. If those are
    # stored as strings the comparison is lexicographic ("100" < "99") --
    # confirm whether callers always pass same-width or numeric positions.
    start = min(mut.start for mut in mutations)
    end = max(mut.end for mut in mutations)
    chrom = mutations[0].chr
    ref = "".join(mut.ref_allele for mut in mutations)
    alt = "".join(mut.alt_allele for mut in mutations)
    # Sorted so that the combined build string does not depend on set
    # iteration order (the values and tags below are sorted the same way).
    build = "|".join(sorted(set(mut.build for mut in mutations)))

    # create the new mutation
    newmut = MutationData(chr=chrom, start=start, end=end, ref_allele=ref,
                          alt_allele=alt, build=build)

    # add annotations to the mutation
    all_annotation_names = set(flatmap(lambda x: x.keys(), mutations))
    annotation_names = all_annotation_names - set(mutations[0].getAttributeNames())
    for annot_name in annotation_names:
        annotations = []
        for mut in mutations:
            try:
                annotations.append(mut.getAnnotation(annot_name))
            except KeyError:
                # Best effort: not every input mutation carries every annotation.
                pass

        values = sorted(set(x.getValue() for x in annotations if x.getValue()))
        value = "|".join(values)
        tags = sorted(set(flatmap(lambda x: x.getTags(), annotations)))

        # Metadata other than value/tags is taken from the first mutation
        # that has this annotation.
        first = annotations[0]
        newmut.createAnnotation(annotationName=annot_name,
                                annotationValue=value,
                                annotationSource=first.getDatasource(),
                                annotationDataType=first.getDataType(),
                                annotationDescription=first.getDescription(),
                                tags=tags,
                                number=first.getNumber())
    return newmut
def generateTranscriptMuts(gafDS, uniprotDS):
    """Yield one annotated mutation per transcript in gafDS's transcript dict.

    For every transcript id returned by gafDS.getTranscriptDict(), a fresh
    MutationData is given 'gene' and 'transcript_id' annotations and then
    passed through uniprotDS.annotate_mutation; the result is yielded.
    """
    transcripts = gafDS.getTranscriptDict()
    for tx_id in transcripts.keys():
        mutation = MutationData()
        mutation.createAnnotation('gene', transcripts[tx_id]['gene'])
        mutation.createAnnotation('transcript_id', tx_id)
        yield uniprotDS.annotate_mutation(mutation)
def testHeaderCreation(self):
    """Test that a tcga vcf header can be generated, even from a blank mutation.

    The only annotation on the mutation is 'center'; its value must appear
    somewhere in the generated header text.
    """
    vcfOR = TcgaVcfOutputRenderer("out/TCGAVCFHeader.out.txt")
    m = MutationData()
    m.createAnnotation('center', "broad.mit.edu")
    hdr = vcfOR.createVcfHeader(m)
    self.assertTrue(hdr is not None)
    # "!=" / "in" replace the obsolete "<>" spelling.
    self.assertTrue(hdr != "")
    self.assertTrue("broad.mit.edu" in hdr,
                    "Could not find string that should have been in header.")
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple tsv data source.

    A mutation carrying only gene=ABL1 is annotated and the resulting
    CGC_Abridged_Name annotation is checked.
    """
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config",
                                                "testdata/small_tsv_ds/")
    # Identity test against None instead of the obsolete "<>" comparison.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
                    "Test gene TSV datasource did not annotate properly.")
def testMissingAnnotations(self):
    '''Check that annotating without a required annotation raises.

    Only "gene" is set on the mutation; "protein_change" is deliberately
    left out, so annotate_mutation is expected to raise
    MissingAnnotationException with a message matching "protein_change".
    '''
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="SmallNatVar",
        version="test")
    mutation = MutationData()
    mutation.createAnnotation("gene", "TP53")
    # "protein_change" is intentionally not created on this mutation.
    self.assertRaisesRegexp(MissingAnnotationException, "protein_change",
                            natvar_ds.annotate_mutation, mutation)
def testSetValues(self):
    """Check reading and overwriting annotation values through m[...] access.

    Two annotations are created, read back via the dictionary interface,
    one is reassigned and read back again. The mutation is printed at the end.
    """
    mutation = MutationData()
    mutation.createAnnotation("fake1", "1")
    mutation.createAnnotation("fake2", "blah blah")

    retrieval_msg = "Could not properly retrieve annotation using the dictionary interface. "
    self.assertTrue(mutation["fake1"] == "1", retrieval_msg + str(mutation["fake1"]))
    self.assertTrue(mutation["fake2"] == "blah blah", retrieval_msg + str(mutation["fake2"]))

    mutation["fake2"] = "Whoa"
    self.assertTrue(
        mutation["fake2"] == "Whoa",
        "Could not properly retrieve annotation using the dictionary interface, after a value change.")
    print(str(mutation))
def testMissingAnnotations(self):
    '''Check that annotating without a required annotation raises.

    Only "gene" is set on the mutation; "protein_change" is deliberately
    left out, so annotate_mutation is expected to raise
    MissingAnnotationException with a message matching "protein_change".
    '''
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="SmallNatVar",
        version="test")
    mutation = MutationData()
    mutation.createAnnotation("gene", "TP53")
    # "protein_change" is intentionally not created on this mutation.
    self.assertRaisesRegexp(MissingAnnotationException, "protein_change",
                            natvar_ds.annotate_mutation, mutation)
def testDatasourceCreator(self):
    """Test that the datasource creator process will work for
    TranscriptToUniProtProteinPositionTransformingDatasource.

    A mutation with transcript uc009vvt.1 and protein change p.T1105A is
    annotated by a datasource built from a config file; the UniProt_aapos
    annotation is expected to be "969".

    NOTE: This test needs to be updated to use sqlite instead of filesystem file.
    """
    uniprot_ds = DatasourceFactory.createDatasource(
        "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config",
        "testdata/small_uniprot_prot_seq_ds/")
    output_name = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    mutation = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(mutation[output_name] == "969",
                    "Did not get proper value (969): " + mutation[output_name])
def test_cached_annots_dummy_cache(self):
    """Test dummy cache. Also, tests a simple store and retrieve, which should be None.

    The CacheManager is initialized with None as its first argument, then a
    mutation is stored and looked up again.
    """
    db_dir_key = "blah"
    cache_manager = CacheManager()
    cache_manager.initialize(None, db_dir_key, is_read_only=False)

    mutation = MutationData()
    mutation.createAnnotation("blah1", "val1", annotationSource="INPUT")
    mutation.createAnnotation("blah2", "val5", annotationSource="some_datasource")

    cache_manager.store_annotations_in_cache(mutation)
    cached = cache_manager.retrieve_cached_annotations(mutation)
    self.assertTrue(cached is None)
def testAnnotationSourceIsPopulated(self):
    '''Tests that the annotation source is not blank for the example tsv datasource.

    After annotating a gene=ABL1 mutation, the datasource name recorded on
    the CGC_Abridged_Name annotation must be neither "Unknown" nor blank.
    '''
    geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config",
                                                "testdata/small_tsv_ds/")
    # "is not None" / "!=" replace the obsolete "<>" spelling throughout.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
                    "Test gene TSV datasource did not annotate properly.")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() != "Unknown",
                    "Annotation source was unknown")
    self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() != "",
                    "Annotation source was blank")
def test_range_fetch(self):
    """Annotate the range 1:78978-79000 with the bigwig datasource and expect a score of 0.75."""
    mutation = MutationData()
    for name, value in (('chr', '1'), ('start', 78978), ('end', 79000)):
        mutation.createAnnotation(name, value)

    self.bigwig_datasource.annotate_mutation(mutation)

    score = mutation.get('TestBigWig_score')
    self.assertEqual(score, 0.75)
def testBasicAnnotationWithChange(self):
    """Test whether we can translate from one coordinate system to another.

    This tests a known change: transcript uc009vvt.1 with protein change
    p.T1105A is expected to yield a UniProt_aapos annotation of "969".
    """
    uniprot_ds = TranscriptToUniProtProteinPositionTransformingDatasource(
        title="UniProt",
        version="test",
        src_file="file://testdata/small_uniprot_prot_seq_ds/db")

    # Must correspond to what the datasource is going to generate.
    output_name = "UniProt_aapos"

    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc009vvt.1')
    mutation.createAnnotation('protein_change', 'p.T1105A')
    mutation = uniprot_ds.annotate_mutation(mutation)

    self.assertTrue(mutation[output_name] == "969",
                    "Did not get proper value (969): " + mutation[output_name])
def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
    """Annotate position 1:35140 with the exact-match dbNSFP datasource.

    The datasource used here is the "no_ref_alt" test set. Three resulting
    String annotations (codon position, reference codon, CDS strand) are
    compared against hand-built expected Annotation objects.
    """
    self.logger.info("Initializing dbNSFP")
    ds_dir = os.path.join("testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19")
    config_path = os.path.join(ds_dir, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config")
    dbnsfp_ds = DatasourceFactory.createDatasource(config_path, ds_dir)

    mutation = MutationData()
    mutation.chr = "1"
    mutation.start = "35140"
    mutation.end = "35140"
    annotated = dbnsfp_ds.annotate_mutation(mutation)

    # (annotation name, expected value)
    expectations = [
        ("dbNSFP_codonpos", "1|1|1"),
        ("dbNSFP_refcodon", "TAA|TAA|TAA"),
        ("dbNSFP_cds_strand", "-|-|-"),
    ]
    for annotation_name, expected_value in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value=expected_value,
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testESPCoverageAnnotationWithMissingIndelOverlapMatch(self):
    """Annotate a chrX range with the overlap-mode ESP coverage datasource.

    The range X:100075300-100075336 is annotated and three resulting
    annotations are expected to be "|"-joined String values.
    """
    self.logger.info("Initializing ESP6500SI-V2 Coverage")
    ds_dir = os.path.join("testdata", "small_esp_coverage_overlap_ds", "hg19")
    esp_ds = DatasourceFactory.createDatasource(
        os.path.join(ds_dir, "small_esp_coverage_overlap_ds.config"), ds_dir)

    mutation = MutationData()
    mutation.chr = "X"
    mutation.start = "100075300"
    mutation.end = "100075336"
    annotated = esp_ds.annotate_mutation(mutation)

    # (annotation name, expected value)
    expectations = [
        ("ESP_AvgAAsampleReadDepth", "75.0|81.0|81.0"),
        ("ESP_TotalAAsamplesCovered", "692|692|692"),
        ("ESP_Chromosome", "X|X|X"),
    ]
    for annotation_name, expected_value in expectations:
        observed = annotated.getAnnotation(annotation_name)
        expected = Annotation(
            value=expected_value,
            datasourceName="ESP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(observed.isEqual(expected), "Annotations do not match.")
def testBasicCosmicInit(self):
    """Very simple test that will create a datasource from a sample datasource directory.

    The directory conforms to the standard datasource structure, including
    placement of the config file. Position 19:58858921 is annotated and the
    overlapping-mutation amino-acid annotation is checked.
    """
    cosmic_ds = DatasourceFactory.createDatasource('testdata/small_cosmic/small_cosmic.config',
                                                   "testdata/small_cosmic")

    mutation = MutationData()
    mutation.chr = 19
    mutation.start = 58858921
    mutation.end = 58858921
    mutation = cosmic_ds.annotate_mutation(mutation)

    overlapping_aas = mutation['COSMIC_overlapping_mutation_AAs']
    self.assertTrue(overlapping_aas == 'p.P426P(1)',
                    "Did not properly annotate mutation: " + mutation['COSMIC_overlapping_mutation_AAs'])
def testEmptyAnswer(self):
    '''The Reference Datasource should return a blank result if the chromosome is not found.

    Note: A log entry should also be written, but this is not tested.
    '''
    self.logger.info("Please ignore the next logging warning: testdata/reference_ds/chrTHIS_DOES_NOT_EXIST.txt not found. Please add it.")
    reference_ds = ReferenceDatasource('testdata/reference_ds')

    mutation = MutationData()
    mutation.chr = "THIS_DOES_NOT_EXIST"
    mutation.start = "11"
    mutation.end = "11"

    expected_context = ""
    annotated = reference_ds.annotate_mutation(mutation)
    self.assertTrue(
        annotated['ref_context'] == expected_context,
        "ref_context was not populated properly -- should be blank: " + str(annotated['ref_context']))
def test_cached_annots(self):
    """Test to make sure that we are not storing annotations that should not be cached.
    Also, tests a simple store and retrieve.

    Of the two annotations stored, only the one whose source is not "INPUT"
    ("blah2") is expected back from the cache.
    """
    cache_url = "file://" + "out/shove.managertest.annots.cache"
    db_dir_key = "blah"
    cache_manager = CacheManager()
    cache_manager.initialize(cache_url, db_dir_key, is_read_only=False)

    mutation = MutationData()
    mutation.createAnnotation("blah1", "val1", annotationSource="INPUT")
    mutation.createAnnotation("blah2", "val5", annotationSource="some_datasource")

    cache_manager.store_annotations_in_cache(mutation)
    cached = cache_manager.retrieve_cached_annotations(mutation)

    self.assertTrue(len(cached.keys()) == 1)
    self.assertTrue(cached["blah2"].getValue() == "val5")
def createMutations(self):
    """Return a generator of mutations built from the specified maflite file.

    No inputs. For every row of self._tsvReader a MutationData (with build
    self._build) is created and each column is attached as an 'INPUT'
    annotation, after translating aliased column names through
    self._reverseAlternativeDict and converting chromosome values with
    MutUtils.convertChromosomeStringToMutationDataFormat.
    """
    alias_to_real = self._reverseAlternativeDict
    allColumns = self._tsvReader.getFieldNames()

    for line in self._tsvReader:

        # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
        mut = MutationData(build=self._build)

        for col in allColumns:
            # Three scenarios:
            #   1) col is name of mutation data field -- simple createAnnotation
            #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
            #   3) col name is not an alias for a mutation data field -- simple createAnnotation
            # Membership is tested on the dict itself rather than on a keys() list.
            if col in alias_to_real:
                realKey = alias_to_real[col]
                self.logger.debug(realKey + " found from " + col)
                val = line[col]
                if realKey == "chr":
                    val = MutUtils.convertChromosomeStringToMutationDataFormat(line[col])
                mut.createAnnotation(realKey, val, 'INPUT')
            else:
                # Scenario 1 and 3
                # Make sure to convert chromosome values.
                val = line[col]
                if col == "chr":
                    val = MutUtils.convertChromosomeStringToMutationDataFormat(line[col])
                mut.createAnnotation(col, val, 'INPUT')

        # remove any surrounding whitespace if present
        mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip()

        # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
        if mut.alt_allele == mut.ref_allele:
            mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

        # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
        # Equality, not identity: "is" against a string literal only works when
        # the interpreter happens to reuse the same "" object.
        if mut.start != "" and mut.end == "":
            mut.end = mut.start
        if mut.end != "" and mut.start == "":
            mut.start = mut.end

        yield mut
def test_no_data_fetch(self):
    """Test for value not found in bigwig.

    In this case, our test bigwig only has data for chr1 so None is the
    expected return value for a chr13 range.
    """
    mutation = MutationData()
    for name, value in (('chr', '13'), ('start', 78978), ('end', 79000)):
        mutation.createAnnotation(name, value)

    self.bigwig_datasource.annotate_mutation(mutation)

    score = mutation.get('TestBigWig_score')
    self.assertEqual(score, None)
def test_copy(self):
    """Test annotation copy.

    Annotation "foo" is copied to "bar" via createCopyAnnotation and the two
    Annotation objects are compared for equality.
    """
    mutation = MutationData()
    mutation.createAnnotation("foo", "3", "blah_source",
                              annotationDescription="testing",
                              tags=["superblah"],
                              number="A")
    source_annotation = mutation.getAnnotation("foo")
    mutation.createCopyAnnotation(source_annotation, "bar")

    # Note that getAnnotation returns an instance of Annotation, not simply the value
    self.assertEqual(mutation.getAnnotation("foo"), mutation.getAnnotation("bar"))
def testBasicGeneTSVInit(self):
    """Make sure that we can initialize a simple tsv data source.

    A mutation carrying only gene=ABL1 is annotated and the resulting
    CGC_Abridged_Name annotation is checked.
    """
    geneDS = DatasourceFactory.createDatasource(
        "testdata/small_tsv_ds/small_tsv_ds.config",
        "testdata/small_tsv_ds/")
    # Identity test against None instead of the obsolete "<>" comparison.
    self.assertTrue(geneDS is not None, "gene indexed datasource was None.")

    m = MutationData()
    m.createAnnotation('gene', "ABL1")
    m = geneDS.annotate_mutation(m)
    self.assertTrue(
        m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1",
        "Test gene TSV datasource did not annotate properly.")
def testSimpleGLAnnotate(self):
    '''Test a simple annotation case on contig GL000211.1.

    Make sure that the ref_context and gc_content annotations are correct.
    '''
    ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)
    m = MutationData()
    m.chr = "GL000211.1"
    m.start = "11"
    m.end = "11"

    groundTruth = "gaattctttttcaagtaagtc"

    guess = ds.annotate_mutation(m)

    # The failure messages are built eagerly, so they must read from the same
    # object the assertion checks (guess), not from the input mutation.
    self.assertTrue(guess['ref_context'] == groundTruth,
                    "ref_context was not populated properly: " + str(guess['ref_context']))

    # gc_content is rounded to 3 decimal places
    self.assertTrue(fabs(float(guess['gc_content']) - (float(3)/float(11))) < .001,
                    "gc_content was not populated properly: " + str(guess['gc_content']))
def _is_matching(self, mut, tsv_record):
    """Decide whether a tsv record matches the given mutation.

    :param mut: mutation providing chr, start, end (and ref_allele/alt_allele
        when the index has "ref" and "alt" columns)
    :param tsv_record: sequence of column values, addressed through self.tsv_index
    :returns: for match mode "exact", True only when chromosome and
        start/end (and alleles, if indexed) agree, else False; for any other
        match mode, the result of TranscriptProviderUtils.test_overlap on the
        two integer ranges.
    """
    index = self.tsv_index
    chrom = tsv_record[index["chrom"]]
    start_pos = tsv_record[index["start"]]
    end_pos = tsv_record[index["end"]]

    if self.match_mode != "exact":
        return TranscriptProviderUtils.test_overlap(
            int(mut.start), int(mut.end), int(start_pos), int(end_pos))

    if not ("ref" in index and "alt" in index):
        # do not use ref and alt information
        if mut.chr == chrom and int(mut.start) == int(start_pos) and int(mut.end) == int(end_pos):
            return True
        return False

    # ref and alt information is present
    build = "hg19"
    ref = tsv_record[index["ref"]]
    alt = tsv_record[index["alt"]]
    if ref == "-" or alt == "-":
        # addresses Mutation Annotation Format based tsv records
        # TODO: This looks risky to be calling the MutationData constructor directly
        ds_mut = MutationData(chrom, start_pos, end_pos, ref, alt, build)
    else:
        # addresses tsv records where the input isn't a Mutation Annotation Format file
        ds_mut = MutUtils.initializeMutFromAttributes(chrom, start_pos, end_pos, ref, alt, build)

    if mut.chr == ds_mut.chr and mut.ref_allele == ds_mut.ref_allele \
            and mut.alt_allele == ds_mut.alt_allele and int(mut.start) == int(ds_mut.start) \
            and int(mut.end) == int(ds_mut.end):
        return True
    return False
def testSimpleRendering(self):
    """Render one mutation to BED and check the first output line.

    A mutation at 1:1000000-1000000 is expected to be written as chromosome
    "chr1" with start "999999" and end "1000000".
    """
    m = MutationData()
    m.chr = '1'
    m.start = 1000000
    m.end = 1000000
    outputFilename = "out/simpleBEDTest.bed"
    outputRenderer = SimpleBedOutputRenderer(outputFilename)
    outputRenderer.renderMutations([m], Metadata())

    # open() in a with-block: the handle is closed even if an assertion fails,
    # and open() (unlike file()) exists on every Python version.
    with open(outputFilename, 'r') as fp:
        # NOTE(review): delimiter reproduced exactly as it appears in this
        # source; BED is conventionally tab-separated -- confirm.
        mOut = fp.readline().strip().split(' ')
    self.assertTrue(mOut[0] == "chr1")
    self.assertTrue(mOut[1] == "999999")
    self.assertTrue(mOut[2] == "1000000")
def testSimpleRendering(self):
    """Render one mutation to BED and check the first output line.

    A mutation at 1:1000000-1000000 is expected to be written as chromosome
    "chr1" with start "999999" and end "1000000".
    """
    m = MutationData()
    m.chr = '1'
    m.start = 1000000
    m.end = 1000000
    outputFilename = "out/simpleBEDTest.bed"
    outputRenderer = SimpleBedOutputRenderer(outputFilename)
    outputRenderer.renderMutations([m], Metadata())

    # open() in a with-block: the handle is closed even if an assertion fails,
    # and open() (unlike file()) exists on every Python version.
    with open(outputFilename, 'r') as fp:
        # NOTE(review): delimiter reproduced exactly as it appears in this
        # source; BED is conventionally tab-separated -- confirm.
        mOut = fp.readline().strip().split(' ')
    self.assertTrue(mOut[0] == "chr1")
    self.assertTrue(mOut[1] == "999999")
    self.assertTrue(mOut[2] == "1000000")
def testSimpleAnnotation(self):
    '''Create a dummy mutation and make sure it gets annotated properly.

    The mutation only carries transcript_id uc001hms.3; the transcript tsv
    datasource is expected to add the refseq mRNA and protein ids.
    '''
    mutation = MutationData()
    mutation.createAnnotation('transcript_id', 'uc001hms.3')

    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")
    mutation = transcript_ds.annotate_mutation(mutation)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(mutation['refseq_test_mRNA_Id'] == 'NM_022746',
                    failure_prefix + mutation['refseq_test_mRNA_Id'])
    self.assertTrue(mutation['refseq_test_prot_Id'] == 'NP_073583',
                    failure_prefix + mutation['refseq_test_prot_Id'])
def testSimpleAnnotation(self):
    """Create a dummy mutation and make sure it gets annotated properly.

    The mutation only carries transcript_id uc001hms.3; the transcript tsv
    datasource is expected to add the refseq mRNA and protein ids.
    """
    mutation = MutationData()
    mutation.createAnnotation("transcript_id", "uc001hms.3")

    transcript_ds = DatasourceFactory.createDatasource(
        "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
        "testdata/small_transcript_tsv_ds/")
    mutation = transcript_ds.annotate_mutation(mutation)

    failure_prefix = "Transcript-based annotation did not populate properly: "
    self.assertTrue(mutation["refseq_test_mRNA_Id"] == "NM_022746",
                    failure_prefix + mutation["refseq_test_mRNA_Id"])
    self.assertTrue(mutation["refseq_test_prot_Id"] == "NP_073583",
                    failure_prefix + mutation["refseq_test_prot_Id"])
def testBasicAnnotate(self):
    '''Test that the COSMIC datasource can be initialized with two index files
    (gp and gpp) and a simple annotation performed.

    An EGFR mutation at 7:55259560 with protein position 747 is expected to
    report two overlapping mutations.
    '''
    tabix_dir = "testdata/small_cosmic_with_gp_and_gpp/"
    gp_file = tabix_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz"
    gpp_file = tabix_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
    cosmic_ds = Cosmic(src_file=gp_file, title="Cosmic", version="test", gpp_tabix_file=gpp_file)

    # These values are not taken from a real world scenario, but are cooked for this test.
    mutation = MutationData()
    mutation.createAnnotation("gene", "EGFR")
    mutation.createAnnotation("transcript_protein_position_start", "747")
    mutation.createAnnotation("transcript_protein_position_end", "747")
    mutation.chr = '7'
    mutation.start = '55259560'
    mutation.end = '55259560'

    mutation = cosmic_ds.annotate_mutation(mutation)
    self.assertTrue(mutation['COSMIC_n_overlapping_mutations'] == '2')
def test_appris_selects_transcript(self):
    """Annotate a two-base deletion on chr2 and check which transcript was chosen.

    The transcript named by the 'annotation_transcript' annotation must
    exist in the datasource and have id ENST00000321356.4.
    """
    mutation = MutationData(chr="2", start="201722365", end="201722366",
                            ref_allele="AC", alt_allele="-", build="hg19")
    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    mutation = transcript_ds.annotate_mutation(mutation)

    chosen_tx = transcript_ds.get_transcript(mutation['annotation_transcript'])
    self.assertTrue(
        chosen_tx is not None,
        "Transcript was None when it should have been found. Does the ground truth transcript above need to be updated?")
    self.assertEqual(chosen_tx._transcript_id, 'ENST00000321356.4')
def testSimpleAnnotate(self):
    '''Perform a simple test of one of the aligned chromosomes (chr22) and
    make sure that we get a reasonable answer.

    Both the ref_context string and the gc_content fraction (expected 6/11,
    within .001) are checked for position 22:11.
    '''
    reference_ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)

    mutation = MutationData()
    mutation.chr = "22"
    mutation.start = "11"
    mutation.end = "11"

    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    annotated = reference_ds.annotate_mutation(mutation)

    self.assertTrue(annotated['ref_context'] == expected_context,
                    "ref_context was not populated properly: " + str(annotated['ref_context']))

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - (float(6)/float(11)))
    self.assertTrue(gc_error < .001,
                    "gc_content was not populated properly: " + str(annotated['gc_content']))
def testBasicRefInit(self):
    """Very simple test that will create a reference datasource from a sample
    datasource directory.

    The directory conforms to the standard datasource structure, including
    placement of the config file. The ref_context for position 22:11 is checked.
    """
    reference_ds = DatasourceFactory.createDatasource('testdata/reference_ds/reference_ds.config',
                                                      "testdata/reference_ds")

    mutation = MutationData()
    mutation.chr = "22"
    mutation.start = "11"
    mutation.end = "11"

    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    mutation = reference_ds.annotate_mutation(mutation)

    self.assertTrue(mutation['ref_context'] == expected_context,
                    "ref_context was not populated properly: " + str(mutation['ref_context']))
def testFlank(self):
    """Test that we can see a Flank mutation.

    Walks a 26-base window on chr1, annotates one single-base mutation per
    position and prints each variant classification with its position.

    NOTE(review): this test makes no assertions; it only fails if annotation
    raises. Also, the coordinate comment below does not agree with
    startWindow -- confirm which is intended.
    """
    #chr1:28,233,780-28,233,805 Junction is at chr1:28,233,793 & 94
    #
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    startWindow = 11042200
    for offset, (ref, alt) in enumerate(zip(refs, alts)):
        position = str(startWindow + offset)
        m = MutationData()
        m.start = position
        m.end = position
        m.chr = "1"
        m.ref_allele = ref
        m.alt_allele = alt
        m = gafDatasource.annotate_mutation(m)
        vc = m['variant_classification']
        # Parenthesised single argument: same output with the print statement
        # and with the print function.
        print(vc + " " + m.start)
def testAddTag(self):
    '''Test adding a tag to an annotation.

    After addTagToAnnotation, the tag must be present in the annotation's tags.
    '''
    mutation = MutationData()
    mutation.createAnnotation("fake1", "1")
    mutation.addTagToAnnotation("fake1", "fakeTag")

    tags = mutation.getAnnotation("fake1").getTags()
    self.assertTrue("fakeTag" in tags, "Tag was not added properly.")
def testAnnotateListOfMutations(self):
    """Test that we can initialize an Annotator, without an input or output
    and then feed mutations, one at a time... using a runspec.

    The first annotated mutation must carry a non-None "gene" annotation.
    """
    # Locate the datasource directory and create a runspec
    dbDir = self.config.get("DEFAULT", "dbDir")
    ds = DatasourceFactory.createDatasources(dbDir)
    runSpec = RunSpecification()
    runSpec.initialize(None, None, datasources=ds)

    # Initialize the annotator with the runspec
    annotator = Annotator()
    annotator.initialize(runSpec)

    m = MutationData()
    m.chr = "1"
    m.start = "12941796"
    m.end = "12941796"
    m.alt_allele = "G"
    m.ref_allele = "T"

    muts = [m]

    muts = annotator.annotate_mutations(muts)
    # Builtin next() instead of the Python-2-only .next() method.
    m2 = next(muts)
    self.assertTrue(m2.get("gene", None) is not None)
def testExtentOutOfRangeError(self):
    '''If a window is specified that extends beyond the beginning or end of a
    file, truncate the ref_context. Use what is left for gc_content as well.

    Position 22:4 with windowSizeRef=6 is expected to yield the truncated
    context "CCCAAGCTAA" and a gc_content of 5/9 (within .001).
    '''
    reference_ds = ReferenceDatasource('testdata/reference_ds',
                                       windowSizeRef=6,
                                       windowSizeGCContent=5)

    mutation = MutationData()
    mutation.chr = "22"
    mutation.start = "4"
    mutation.end = "4"

    # "CCCAAGCTAAACCCAGGCCAC"
    expected_context = "CCCAAGCTAA"
    annotated = reference_ds.annotate_mutation(mutation)

    self.assertTrue(annotated['ref_context'] == expected_context,
                    "ref_context was not populated properly: " + str(annotated['ref_context']))

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - (float(5)/float(9)))
    self.assertTrue(gc_error < .001,
                    "gc_content was not populated properly: " + str(annotated['gc_content']))
def testPopulatedButNullValuesInInitNLod(self):
    """Test that if init_n_lod is "." or "", there is no error.

    The same sequence of raw values is exercised for both "init_n_lod" and
    "t_lod_fstar": "" and "." are expected to give 50, while "6", "6.8" and
    "-12.8" are expected to give 6, 6 and -12.
    """
    mutation = MutationData()
    renderer = TcgaVcfOutputRenderer("out/blank.vcf")

    # (raw annotation value, expected extracted lod)
    later_cases = [('.', 50), ('6', 6), ('6.8', 6), ('-12.8', -12)]

    for field in ("init_n_lod", "t_lod_fstar"):
        # The first value is supplied when the annotation is created.
        mutation.createAnnotation(field, "")
        self.assertEqual(renderer._extract_lod(mutation, field), 50)

        for raw_value, expected_lod in later_cases:
            mutation[field] = raw_value
            self.assertEqual(renderer._extract_lod(mutation, field), expected_lod)
def testPickleable(self):
    """Test that a near-empty MutationData can be pickled.

    The mutation is dumped to out/testMDPickle.pkl; the test passes if
    pickling does not raise.
    """
    m = MutationData()
    m.chr = "2"
    m.createAnnotation("fake1", "1")
    m.addTagToAnnotation("fake1", "fakeTag")
    import cPickle
    # with-block so the file is flushed and closed even if dump() raises;
    # binary mode is valid for every pickle protocol.
    with open("out/testMDPickle.pkl", 'wb') as fp:
        cPickle.dump(m, fp)
def testBasicAnnotation(self):
    """Annotate a G>A change at 18:48604683 with the generic genomic mutation datasource.

    The '_cosmic_muts_disease_counts' annotation on the result must be truthy.
    """
    cosmic_ds = GenericGenomicMutationDatasource('testdata/small_cosmic_2/cosmic_v65_chr18.tsv')

    mutation = MutationData()
    mutation.chr = '18'
    mutation.start = '48604683'
    mutation.end = '48604683'
    mutation.ref_allele = 'G'
    mutation.alt_allele = 'A'
    mutation.createAnnotation('strand', '+')

    annotated = cosmic_ds.annotate_mutation(mutation)
    disease_counts = annotated['_cosmic_muts_disease_counts']
    self.assertTrue(disease_counts, 'Unable to annotate mutation correctly')
def testBasicCosmicInit(self):
    """Very simple test that will create a datasource from a sample datasource directory.

    The directory conforms to the standard datasource structure, including
    placement of the config file. Position 19:58858921 is annotated and the
    overlapping-mutation amino-acid annotation is checked.
    """
    cosmic_ds = DatasourceFactory.createDatasource('testdata/small_cosmic/small_cosmic.config',
                                                   "testdata/small_cosmic")

    mutation = MutationData()
    mutation.chr = 19
    mutation.start = 58858921
    mutation.end = 58858921
    mutation = cosmic_ds.annotate_mutation(mutation)

    overlapping_aas = mutation['COSMIC_overlapping_mutation_AAs']
    self.assertTrue(overlapping_aas == 'p.P426P(1)',
                    "Did not properly annotate mutation: " + mutation['COSMIC_overlapping_mutation_AAs'])
def testRetrieveMissingAnnotations(self):
    """Test simple case.

    With annotations a1..a4 present, asking for a subset must give an empty
    result; asking for a mix of present and absent names must give the absent
    ones, with "blah" first.
    """
    mutation = MutationData()
    for name in ("a1", "a2", "a3", "a4"):
        mutation.createAnnotation(name, "1")

    missing = MutUtils.retrieveMissingAnnotations(mutation, ["a3", "a2"])
    self.assertIsNotNone(missing)
    self.assertTrue(len(missing) == 0, "Result was not empty: " + str(missing))

    missing = MutUtils.retrieveMissingAnnotations(mutation, ["zztop", "a1", "blah", "dummy"])
    self.assertTrue(missing[0] == "blah", "Result was not sorted")
    has_all_absent = "blah" in missing and "dummy" in missing and "zztop" in missing
    self.assertTrue(has_all_absent,
                    "Incorrect elements (Truth: [zztop, blah, dummy]): " + str(missing))
def testBasicAnnotation(self):
    """Check that a chr18 G>A variant picks up COSMIC disease counts.

    The datasource is built directly from the small COSMIC TSV. The test
    passes when the '_cosmic_muts_disease_counts' annotation of the
    annotated mutation is truthy.
    """
    tsv_path = 'testdata/small_cosmic_2/cosmic_v65_chr18.tsv'
    datasource = GenericGenomicMutationDatasource(tsv_path)

    variant = MutationData()
    variant.chr = '18'
    variant.start = '48604683'
    variant.end = '48604683'
    variant.ref_allele = 'G'
    variant.alt_allele = 'A'
    variant.createAnnotation('strand', '+')

    result = datasource.annotate_mutation(variant)
    self.assertTrue(result['_cosmic_muts_disease_counts'],
                    'Unable to annotate mutation correctly')
def testMixedAnnotation(self):
    """Test that the COSMIC datasource can retrieve entries by both gp and gpp.

    The mutation carries both genomic coordinates and a gene plus protein
    position range, and three overlapping COSMIC entries are expected.
    """
    tabix_dir = "testdata/small_cosmic_with_gp_and_gpp/"
    genomic_index = tabix_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.gz"
    protein_index = tabix_dir + "small_cosmic_trimmed_for_sorting.txt.tbi.byAA.sorted.tsv.gz"
    cosmic = Cosmic(src_file=genomic_index, title="Cosmic", version="test",
                    gpp_tabix_file=protein_index)

    # The values below are contrived for this test rather than taken from a
    # real-world case:
    #   - line 9 of the test data should be matched by genomic coordinates
    #   - lines 7 and 8 should be matched by protein position
    mut = MutationData()
    mut.createAnnotation("gene", "A2M")
    mut.createAnnotation("transcript_protein_position_start", "1300")
    mut.createAnnotation("transcript_protein_position_end", "1400")
    mut.chr = '12'
    mut.start = '9227220'
    mut.end = '9227230'

    mut = cosmic.annotate_mutation(mut)

    self.assertTrue(mut['COSMIC_n_overlapping_mutations'] == '3')

    found_at = mut['COSMIC_overlapping_mutation_AAs'].find('1229')
    self.assertTrue(found_at != -1,
                    "Could not find the entry specified by genomic coords.")

    self.assertTrue(mut['COSMIC_overlapping_primary_sites'] == "lung(3)",
                    "Did not have the correct primary sites annotation (lung(3)): "
                    + mut['COSMIC_overlapping_primary_sites'])
def test_validation_correction(self):
    """Render an "Invalid" validation record to TCGA MAF and check the filled-in alleles.

    The four validation allele annotations are supplied blank. In every
    rendered row they are all expected to equal the reference allele ("G"),
    and Mutation_Status is expected to be "None".
    """
    mut = MutationData()
    mut.chr = "3"
    mut.start = "178948145"
    mut.end = "178948145"
    mut.alt_allele = "A"
    mut.ref_allele = "G"
    mut['validation_status'] = "Invalid"
    # Leave all validation alleles blank so the renderer has to fill them in.
    for blank_field in ('Match_Norm_Validation_Allele1',
                        'Match_Norm_Validation_Allele2',
                        'Tumor_Validation_Allele1',
                        'Tumor_Validation_Allele2'):
        mut[blank_field] = ""
    mut['Mutation_Status'] = "Somatic"

    maf_path = os.path.join("out", "test_validation_correction1.maf.tsv")
    maf_config = os.path.join("configs", "tcgaMAF2.4_output.config")
    renderer = TcgaMafOutputRenderer(maf_path, configFile=maf_config)
    renderer.renderMutations(iter([mut]))

    for row in GenericTsvReader(maf_path):
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Match_Norm_Validation_Allele2'],
            "Matched norm alleles did not match.")
        self.assertTrue(
            row['Tumor_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
            "Tumor alleles did not match for an invalid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
            "Tumor alleles did not match normal alleles for an invalid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (row['Match_Norm_Validation_Allele1'], row['Reference_Allele']))
        self.assertTrue(
            "G" == row['Reference_Allele'],
            "Reference allele should have been G, but was " + row['Reference_Allele'])
        self.assertTrue(
            "None" == row['Mutation_Status'],
            "Mutation Status must be None when Validation Status is Invalid: "
            + row['Mutation_Status'])
def testBasicRefInit(self):
    """Create the reference datasource from its config file and check ref_context.

    The datasource is built through DatasourceFactory from a sample
    datasource directory. A mutation at chr22:11 is annotated and its
    'ref_context' annotation is compared against a fixed expected sequence.
    """
    reference_ds = DatasourceFactory.createDatasource(
        'testdata/reference_ds/reference_ds.config',
        "testdata/reference_ds")

    mut = MutationData()
    mut.chr = "22"
    mut.start = "11"
    mut.end = "11"

    expected_context = "CCCAAGCTAAACCCAGGCCAC"

    annotated = reference_ds.annotate_mutation(mut)
    self.assertTrue(
        annotated['ref_context'] == expected_context,
        "ref_context was not populated properly: " + str(annotated['ref_context']))
def testSpliceSiteWithinNBases(self):
    """Test variant classification around a chr21 splice site.

    Every position in chr21:10,998,326-10,998,346 is annotated with a
    single-base substitution. 10,998,336 is a splice site (the junction lies
    between 10998335 and 10998336).

    Expectations:
      * no position is classified as 'Silent';
      * window offsets 8-11 are all 'Splice_Site';
      * window offsets 0-7 and 12-19 are never 'Splice_Site'.
    """
    # Reference window, shown split around the junction:
    # AGTTCTCCTT C TGGAAAAAAG
    refs = 'AGTTCTCCTTCTGGAAAAAAG'
    alts = 'TCAGACTGAAAATACCCCCCT'
    window_start = 10998326

    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
    vcs = []
    for s in range(window_start, window_start + len(refs)):
        m = MutationData()
        m.start = str(s)
        m.end = str(s)
        m.chr = "21"
        m.ref_allele = refs[s - window_start]
        m.alt_allele = alts[s - window_start]

        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        self.assertTrue(vc != 'Silent', 'Silent mutation found when it should be a splice site.')
        vcs.append(vc)
        # Single parenthesised expression: identical output under the
        # Python 2 print statement and the Python 3 print function.
        print(vc + " " + m.start)

    self.assertTrue(all(tmp == "Splice_Site" for tmp in vcs[8:12]),
                    "Not all vcs within 2 bases were splice site: " + str(vcs[8:12]))
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[0:8]),
                    "No splice sites should be seen: " + str(vcs[0:8]))
    # NOTE(review): the window has 21 positions, so this slice leaves the last
    # one (offset 20) unchecked -- confirm whether that is intentional.
    self.assertTrue(all(tmp != "Splice_Site" for tmp in vcs[12:20]),
                    "No splice sites should be seen: " + str(vcs[12:20]))
def testSilentMutationGoingToSpliceSite(self):
    """Test that a silent mutation within 10 bp of a splice junction should become a splice site.

    Every position in chr1:28,233,780-28,233,805 is annotated (the junction
    is at chr1:28,233,793 & 94). Exactly 4 'splice_site' and 11 'silent'
    classifications (compared case-insensitively) are expected.
    """
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCGCTGAAAAGAAAA"
    window_start = 28233780
    junction = 28233793

    transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
    classifications = []
    for pos in range(window_start, 28233806):
        offset = pos - window_start

        mut = MutationData()
        mut.start = str(pos)
        mut.end = str(pos)
        mut.chr = "1"
        mut.ref_allele = refs[offset]
        mut.alt_allele = alts[offset]

        mut = transcript_ds.annotate_mutation(mut)

        distance = abs(junction - int(mut.start))
        classification = mut['variant_classification']
        classifications.append(classification)
        print(classification + " " + mut.start + " " + str(distance))

    # Tally the two classifications of interest, ignoring case.
    lowered = [c.lower() for c in classifications]
    splice_site_count = lowered.count("splice_site")
    silent_count = lowered.count("silent")

    self.assertTrue(splice_site_count == 4,
                    "Should have seen 4 splice site mutations, but saw: " + str(splice_site_count))
    self.assertTrue(silent_count == 11,
                    "Should have seen 11 Silent mutations, but saw: " + str(silent_count))
def testFlank(self):
    """Test that we can see a Flank mutation.

    Annotates a single-base substitution at each position of a window
    starting at chr1:11042200 and prints the variant classification.

    NOTE(review): this test makes no assertion; it can only fail if
    annotation raises. Consider asserting on the expected classification
    once the intended value is confirmed.
    """
    # Allele strings only supply one ref/alt base per position of the window.
    refs = "TGGGCTCGGGCTCTCTGAAAAGAAAA"
    alts = "TGGGCTCAGGCTCTCTGAAAAGAAAA"
    gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)

    startWindow = 11042200
    for s in range(startWindow, startWindow + len(refs)):
        m = MutationData()
        m.start = str(s)
        m.end = str(s)
        m.chr = "1"
        m.ref_allele = refs[s - startWindow]
        m.alt_allele = alts[s - startWindow]

        m = gafDatasource.annotate_mutation(m)

        vc = m['variant_classification']
        # Single parenthesised expression: identical output under the
        # Python 2 print statement and the Python 3 print function.
        print(vc + " " + m.start)
def test_validation_correction_valid(self):
    """Render a "Valid" validation record to TCGA MAF and check the filled-in alleles.

    The four validation allele annotations are supplied blank. In every
    rendered row the normal validation alleles and the first tumor
    validation allele are expected to equal the reference ("G"), and the
    second tumor validation allele is expected to equal Tumor_Seq_Allele2
    ("A").
    """
    mut = MutationData()
    mut.chr = "3"
    mut.start = "178948145"
    mut.end = "178948145"
    mut.alt_allele = "A"
    mut.ref_allele = "G"
    mut['validation_status'] = "Valid"
    # Leave all validation alleles blank so the renderer has to fill them in.
    for blank_field in ('Match_Norm_Validation_Allele1',
                        'Match_Norm_Validation_Allele2',
                        'Tumor_Validation_Allele1',
                        'Tumor_Validation_Allele2'):
        mut[blank_field] = ""
    mut['Mutation_Status'] = "Somatic"

    maf_path = os.path.join("out", "test_validation_correction2.maf.tsv")
    maf_config = os.path.join("configs", "tcgaMAF2.4_output.config")
    renderer = TcgaMafOutputRenderer(maf_path, configFile=maf_config)
    renderer.renderMutations(iter([mut]))

    for row in GenericTsvReader(maf_path):
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Match_Norm_Validation_Allele2'],
            "Matched norm alleles did not match.")
        self.assertTrue(
            row['Tumor_Validation_Allele1'] == row['Reference_Allele'],
            "Tumor validation allele 1 did not match reference for a valid validation result.")
        self.assertTrue(
            row['Tumor_Validation_Allele2'] == row['Tumor_Seq_Allele2'],
            "Tumor validation allele 2 did not match Tumor_Seq_Allele2 for a valid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Tumor_Validation_Allele1'],
            "Tumor allele 1 did not match normal alleles for a valid validation result.")
        self.assertTrue(
            row['Match_Norm_Validation_Allele1'] == row['Reference_Allele'],
            "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
            % (row['Match_Norm_Validation_Allele1'], row['Reference_Allele']))
        self.assertTrue(
            "G" == row['Reference_Allele'],
            "Reference allele should have been G, but was " + row['Reference_Allele'])
        self.assertTrue(
            "A" == row['Tumor_Seq_Allele2'],
            "Alt allele should have been A, but was " + row['Tumor_Seq_Allele2'])
def generateTranscriptMuts(gafDS, uniprotDS):
    """Yield one annotated MutationData per transcript known to gafDS.

    For each transcript ID in gafDS.getTranscriptDict(), a fresh
    MutationData is given 'gene' and 'transcript_id' annotations and then
    passed through uniprotDS.annotate_mutation; the result is yielded.
    """
    transcripts = gafDS.getTranscriptDict()
    for tx_id in transcripts.keys():
        mutation = MutationData()
        mutation.createAnnotation('gene', transcripts[tx_id]['gene'])
        mutation.createAnnotation('transcript_id', tx_id)
        yield uniprotDS.annotate_mutation(mutation)
def testIter(self):
    """Check that iterating a MutationData yields only known keys.

    Each key must be one of the two annotations created here or a member
    of MutationData.attributes.
    """
    mut = MutationData()
    mut.createAnnotation("fake1", "1")
    mut.createAnnotation("fake2", "blah blah")

    created_names = ["fake1", "fake2"]
    for key in mut:
        is_known = (key in created_names) or (key in MutationData.attributes)
        self.assertTrue(is_known, "Key not present: " + key)
def testBasicAnnotation(self):
    """Test an extremely simple case: UniProt natural variations for TP53 p.S376C.

    The expected and observed annotation values are compared without regard
    to order: both are split on "|" and sorted before the comparison.
    """
    natvar_ds = GenericGeneProteinPositionDatasource(
        "testdata/simple_uniprot_natvar/simple_uniprot_natvar.tsv",
        title="UniProt_NatVar",
        version="2011_09")

    m = MutationData()
    m.createAnnotation("gene", "TP53")
    m.createAnnotation("protein_change", "p.S376C")
    m.createAnnotation("other_transcripts", "TP53_uc002gig.1_Intron|TP53_uc002gih.2_Intron|TP53_uc010cne.1_RNA|TP53_uc010cnf.1_3'UTR|TP53_uc010cng.1_3'UTR|TP53_uc002gii.1_Missense_Mutation_p.S244C|TP53_uc010cnh.1_3'UTR|TP53_uc010cni.1_3'UTR|TP53_uc002gij.2_Missense_Mutation_p.S376C")

    # The return value is not kept; the assertion below reads the annotation
    # from m itself.
    natvar_ds.annotate_mutation(m)

    annotation_name = "UniProt_NatVar_natural_variations"
    expected_value = "S -> T (in a sporadic cancer; somatic mutation).|S -> A (in a sporadic cancer; somatic mutation)."

    observed_parts = sorted(m[annotation_name].split("|"))
    expected_parts = sorted(expected_value.split("|"))
    self.assertTrue(observed_parts == expected_parts,
                    "Incorrect annotation value seen: " + m[annotation_name])