def test_overlapping_single_transcripts(self):
        base_config_location = "testdata/ensembl/saccer/"

        ensembl_ds = DatasourceFactory.createDatasource(base_config_location + "ensembl.config", base_config_location)
        recs = ensembl_ds.get_overlapping_transcripts("I", "500", "500")
        self.assertTrue(len(recs) == 1)
        self.assertTrue(recs[0].get_gene() == 'YAL069W')
    def testESPCoverageAnnotationWithSNPAvgMatch(self):
        """
        """
        self.logger.info("Initializing ESP6500SI-V2 Coverage")
        tabixIndexedTsvDirName = os.path.join(*["testdata", "small_esp_coverage_avg_ds", "hg19"])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedTsvDirName, "small_esp_coverage_avg_ds.config"), tabixIndexedTsvDirName)

        m1 = MutationData()
        m1.chr = "X"
        m1.start = "100075334"
        m1.end = "100075334"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("ESP_AvgAAsampleReadDepth")
        cur_annotation = Annotation(value="75.0", datasourceName="ESP", dataType="Float",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_TotalAAsamplesCovered")
        cur_annotation = Annotation(value="692.0", datasourceName="ESP", dataType="Float",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_Chromosome")
        cur_annotation = Annotation(value="X", datasourceName="ESP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")
    def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
        """

        """
        self.logger.info("Initializing dbNSFP")
        tabixIndexedTsvDirName = os.path.join(*["testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19"])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedTsvDirName, "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config"),
            tabixIndexedTsvDirName)

        m1 = MutationData()
        m1.chr = "1"
        m1.start = "35140"
        m1.end = "35140"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("dbNSFP_codonpos")
        cur_annotation = Annotation(value="1|1|1", datasourceName="dbNSFP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_refcodon")
        cur_annotation = Annotation(value="TAA|TAA|TAA", datasourceName="dbNSFP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_cds_strand")
        cur_annotation = Annotation(value="-|-|-", datasourceName="dbNSFP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")
Exemple #4
0
    def test_no_overwriting_muts(self):
        """Ensure that (given configuration that disallows) we cannot annotate from a datasource when a value was specified in the input."""
        # We will have an input with a "Who" annotation that this datasource will try to write.
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", "testdata/thaga_janakari_gene_ds/hg19/"
        )
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False, OptionConstants.NO_PREPEND: True}

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts,
        )
        annotator = Annotator()
        annotator.initialize(run_spec)

        self.assertRaises(DuplicateAnnotationException, annotator.annotate)
Exemple #5
0
    def test_overwriting_muts(self):
        """Ensure that (given correct configuration) we can annotate from a datasource, even if the datasource will overwrite an existing mutation."""
        # We will have an input with a "Who" annotation that this datasource will try to write.
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config", "testdata/thaga_janakari_gene_ds/hg19/"
        )
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True, OptionConstants.NO_PREPEND: True}

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts,
        )
        annotator = Annotator()
        annotator.initialize(run_spec)

        annotator.annotate()

        tsv_reader = GenericTsvReader(output_filename)

        for i, line_dict in enumerate(tsv_reader):
            self.assertTrue(line_dict.get("TJ_Data_Who", "") != "Tromokratis")
    def _create_test_ds(self, input_tsv, dir_name, index_cols):

        base_name = "test_snp_leveldb"

        full_name = dir_name + "/" + base_name

        if os.path.exists(full_name):
            shutil.rmtree(full_name)

        os.makedirs(full_name)

        tsv_reader = GenericTsvReader(input_tsv, commentPrepend="%")
        annotation_cols = copy.copy(tsv_reader.getFieldNames())
        for icol in index_cols:
            if icol in annotation_cols:
                annotation_cols.remove(icol)

        ds_creator = SnpOnlyLevelDbDatasourceCreator()
        ds_creator.createDatasource(full_name, input_tsv, ",".join(index_cols), full_name + "/" + base_name + ".config", "snp_leveldb", base_name, "TEST",
                         "exact", annotation_cols, [])

        config_filename = "out/test_simple_annotate_snp_only_leveldb/test_snp_leveldb/test_snp_leveldb.config"
        ds = DatasourceFactory.createDatasource(os.path.abspath(config_filename), os.path.dirname(config_filename))

        return ds
    def test_overlapping_single_transcripts(self):
        base_config_location = "testdata/ensembl/saccer/"

        ensembl_ds = DatasourceFactory.createDatasource(base_config_location + "ensembl.config", base_config_location)
        recs = ensembl_ds.get_overlapping_transcripts("I", "500", "500")
        self.assertTrue(len(recs) == 1)
        self.assertTrue(recs[0].get_gene() == 'YAL069W')
    def testdbNSFPAnnotationWithMissingOverlapMatch(self):  # SNPs only
        """

        """
        self.logger.info("Initializing dbNSFP")
        tabixIndexedTsvDirName = os.path.join(*["testdata", "dbNSFP_chr1_chr3_100vars_overlap_ds", "hg19"])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedTsvDirName, "dbNSFP_chr1_chr3_100vars_overlap_ds.config"), tabixIndexedTsvDirName)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "35136"
        m1.end = "35137"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("dbNSFP_codonpos")
        cur_annotation = Annotation(value="", datasourceName="dbNSFP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_refcodon")
        cur_annotation = Annotation(value="", datasourceName="dbNSFP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_cds_strand")
        cur_annotation = Annotation(value="", datasourceName="dbNSFP", dataType="String",
                                    description="", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")
    def testCreationAndAnnotation(self):
        """ Test the datasource creation and then do a simple annotation
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            colName = "UniProt_NatVar_natural_variations"
            self.assertTrue(sorted(lineDict[colName].split("|")) == sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|")), "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertTrue(ctr == 1, "Number of mutations incorrect (1): " + str(ctr) )
    def testCreationAndAnnotation(self):
        """ Test the datasource creation and then do a simple annotation
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            colName = "UniProt_NatVar_natural_variations"
            self.assertTrue(sorted(lineDict[colName].split("|")) == sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|")), "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertTrue(ctr == 1, "Number of mutations incorrect (1): " + str(ctr) )
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        tmp = MafliteInputMutationCreator(inputFilename, 'configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        outputRenderer = TcgaMafOutputRenderer(outputFilename, 'configs/tcgaMAF2.4_output.config')
        annotator = Annotator()
        
        annotator.setInputCreator(tmp)
        annotator.setOutputRenderer(outputRenderer)
        ds = DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config", "testdata/thaga_janakari_gene_ds/hg19/")
        annotator.addDatasource(ds)
        annotator.annotate()
        
        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + outputFilename + ") is empty.")
        tsvReaderIn = GenericTsvReader(inputFilename)
        tsvReader = GenericTsvReader(outputFilename)
        
        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")
        self.assertTrue("i_TJ_Data_Why" in tsvReader.getFieldNames(), "New field missing (i_TJ_Data_Why) from header")
        self.assertTrue("i_TJ_Data_Who" in tsvReader.getFieldNames(), "New field missing (i_TJ_Data_Who) from header")
        
        ctrOut = 0
        for lineDict in tsvReader:
            ctrOut += 1
        ctrIn = 0
        for lineDict in tsvReaderIn:
            ctrIn += 1
        ctrIn += len(tsvReaderIn.getCommentsAsList())
        ctrOut += len(tsvReader.getCommentsAsList())

        self.assertTrue(ctrOut == (ctrIn + 2), "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): " + str(ctrIn) + ", " + str(ctrOut))
 def testBasicAnnotation(self):
     ''' Annotate from a basic tsv of Genomic positions.  This tests both single- and multiple-nucleotide variants.  The tsv is already installed (i.e. proper config file created).
     '''
     outputFilename = 'out/genericGenomePositionTest.out.tsv'
     
     gpDS = DatasourceFactory.createDatasource("testdata/small_genome_position_tsv_ds/oreganno_trim.config", "testdata/small_genome_position_tsv_ds/")
     
     annotator = Annotator()
     annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite.maf.txt'))
     annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
     annotator.addDatasource(gpDS)
     testFilename = annotator.annotate()
     
     # Make sure that some values were populated
     self.assertTrue(os.path.exists(testFilename))
     tsvReader = GenericTsvReader(testFilename) 
     
     ctr = 1
     # Two overlap, one does not.  Repeat...
     for lineDict in tsvReader:
         if (ctr % 3 == 0):
             self.assertTrue(lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " + str(ctr) + " should have had blank value, but did not: " + lineDict["ORegAnno_hg19.oreganno.id"])
         else:
             self.assertFalse(lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " + str(ctr) + " should not have had blank value, but did.")
             self.assertTrue(lineDict["ORegAnno_hg19.oreganno.id"] == 'OREG0013034', "Line " + str(ctr) + " did not have correct value: " + lineDict["ORegAnno_hg19.oreganno.id"])
         ctr = ctr + 1
Exemple #13
0
    def test_no_overwriting_muts(self):
        """Ensure that (given configuration that disallows) we cannot annotate from a datasource when a value was specified in the input."""
        # We will have an input with a "Who" annotation that this datasource will try to write.
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: False,
            OptionConstants.NO_PREPEND: True
        }

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts)
        annotator = Annotator()
        annotator.initialize(run_spec)

        self.assertRaises(DuplicateAnnotationException, annotator.annotate)
Exemple #14
0
    def test_overwriting_muts(self):
        """Ensure that (given correct configuration) we can annotate from a datasource, even if the datasource will overwrite an existing mutation."""
        # We will have an input with a "Who" annotation that this datasource will try to write.
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
            OptionConstants.NO_PREPEND: True
        }

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts)
        annotator = Annotator()
        annotator.initialize(run_spec)

        annotator.annotate()

        tsv_reader = GenericTsvReader(output_filename)

        for i, line_dict in enumerate(tsv_reader):
            self.assertTrue(line_dict.get('TJ_Data_Who', "") != "Tromokratis")
Exemple #15
0
 def createCosmicDatasource(config):
     """ Creates a Cosmic datasource from a config file.
         """
     cosmic_dirname = config.get("COSMIC", "CosmicDir")
     cosmicDatasource = DatasourceFactory.createDatasource(
         cosmic_dirname + "/cosmic.config", cosmic_dirname)
     return cosmicDatasource
Exemple #16
0
    def testBasicAnnotation(self):
        ''' Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. '''
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/")
        outputFilename = 'out/genericTranscriptTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue(
            "refseq_test_mRNA_Id" in headers,
            "refseq_test_mRNA_Id not found in headers: " + str(headers))
        self.assertTrue(
            "refseq_test_prot_Id" in headers,
            "refseq_test_prot_Id not found in headers: " + str(headers))
Exemple #17
0
    def testESPCoverageAnnotationWithMissingAnnotationValuesIndelAvgMatch(
            self):
        """

        """
        self.logger.info("Initializing ESP6500SI-V2 Coverage")
        tabixIndexedTsvDirName = os.path.join(
            *["testdata", "small_esp_coverage_avg_ds", "hg19"])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedTsvDirName,
                         "small_esp_coverage_avg_ds.config"),
            tabixIndexedTsvDirName)

        m1 = MutationData()
        m1.chr = "X"
        m1.start = "100075350"
        m1.end = "100075356"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("ESP_AvgSampleReadDepth")
        cur_annotation = Annotation(
            value="91.25",
            datasourceName="ESP",
            dataType="Float",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")
    def testExampleVcfDBAnnotationWithSNPExactMatch(self):
        """

        """
        tabixIndexedVcfDirName = os.path.join(*["testdata", "vcf_db_exact", "hg19"])
        tabixIndexedVcfDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedVcfDirName, "vcf_db_exact.config"), tabixIndexedVcfDirName)

        chrom = "20"
        start = "1110696"
        end = "1110696"
        ref_allele = "A"
        alt_allele = "T"
        build = "hg19"
        m1 = MutUtils.initializeMutFromAttributes(chrom, start, end, ref_allele, alt_allele, build)

        m1_annotated = tabixIndexedVcfDatasource.annotate_mutation(m1)

        m1_annotation = m1_annotated.getAnnotation("ESP_AF")
        cur_annotation = Annotation(value="0.667", datasourceName="ESP", dataType="Float",
                                    description="Allele Frequency", tags=[TagConstants.INFO, TagConstants.SPLIT],
                                    number=-1)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_AC")
        cur_annotation = Annotation(value="2,4", datasourceName="ESP", dataType="Integer",
                                    description="Allele Count", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_H2")
        cur_annotation = Annotation(value="False", datasourceName="ESP", dataType="Flag",
                                    description="HapMap2 membership", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=0)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        chrom = "20"
        start = "1230237"
        end = "1230237"
        ref_allele = "T"
        alt_allele = "A"
        build = "hg19"
        m1 = MutUtils.initializeMutFromAttributes(chrom, start, end, ref_allele, alt_allele, build)

        m1_annotated = tabixIndexedVcfDatasource.annotate_mutation(m1)

        m1_annotation = m1_annotated.getAnnotation("ESP_NS")
        cur_annotation = Annotation(value="3", datasourceName="ESP", dataType="Integer",
                                    description="Number of Samples With Data",
                                    tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=1)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_AF")
        cur_annotation = Annotation(value="", datasourceName="ESP", dataType="Float",
                                    description="Allele Frequency", tags=[TagConstants.INFO, TagConstants.SPLIT],
                                    number=-1)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")
    def test_hashcode_generation(self):
        """Test that we can read a hashcode for a datasource, if available."""
        geneDS = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        self.assertTrue(geneDS is not None,
                        "gene indexed datasource was None.")

        self.assertTrue(
            geneDS.get_hashcode() == "7120edfdc7b29e45191c81c99894afd5")
    def testBasicDatasourceSorting(self):
        """Test that the GAF datasource is sorted before a gene-based datasource"""

        gafDatasource = TestUtils.createTranscriptProviderDatasource(self.config)
        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
        
        incorrectSortList = [geneDS, gafDatasource]
        guessSortList =  DatasourceFactory.sortDatasources(incorrectSortList)
        self.assertTrue(guessSortList[1] == geneDS, "Sorting is incorrect.")
        self.assertTrue(len(guessSortList) == 2, "Sorting altered number of datasources (gt: 2): " + str(len(guessSortList)))
    def testBasicGeneTSVInit(self):
        """ Make sure that we can initialize a simple tsv data source """

        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
        self.assertTrue(geneDS <> None, "gene indexed datasource was None.")
        
        m = MutationDataFactory.default_create()
        m.createAnnotation('gene',"ABL1")
        m = geneDS.annotate_mutation(m)
        self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1","Test gene TSV datasource did not annotate properly.")
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        '''

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        self.assertTrue(
            'CGC_Abridged_Other Syndrome/Disease' in fields,
            "'CGC_Other Syndrome/Disease' was not present in the header")
        self.assertTrue(
            'CGC_Abridged_Mutation Type' in fields,
            "'CGC_Abridged_Mutation Type' was not present in the header")

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(
                    lineDict['CGC_Abridged_GeneID'] != '',
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: "
                    + str(ctr))
                linesThatShouldBeAnnotated += 1
            ctr += 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0,
                        "Bad data -- cannot test missed detects.")
 def testAnnotationSourceIsPopulated(self):
     ''' Tests that the annotation source is not blank for the example tsv datasource. '''
     geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
     self.assertTrue(geneDS <> None, "gene indexed datasource was None.")
     
     m = MutationData()
     m.createAnnotation('gene',"ABL1")
     m = geneDS.annotate_mutation(m)
     self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1","Test gene TSV datasource did not annotate properly.")
     self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() <> "Unknown", "Annotation source was unknown")
     self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() <> "", "Annotation source was blank")
 def testAnnotationSourceIsPopulated(self):
     ''' Tests that the annotation source is not blank for the example tsv datasource. '''
     geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
     self.assertTrue(geneDS <> None, "gene indexed datasource was None.")
     
     m = MutationData()
     m.createAnnotation('gene',"ABL1")
     m = geneDS.annotate_mutation(m)
     self.assertTrue(m['CGC_Abridged_Name'] == "v-abl Abelson murine leukemia viral oncogene homolog 1","Test gene TSV datasource did not annotate properly.")
     self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource() <> "Unknown", "Annotation source was unknown")
     self.assertTrue(m.getAnnotation('CGC_Abridged_Name').getDatasource().strip() <> "", "Annotation source was blank")
    def testDatasourceCreator(self):
        """ Test that the datasource creator process will work for  TranscriptToUniProtProteinPositionTransformingDatasource.  NOTE: This test needs to be updated to use sqlite instead of filesystem file.
        """

        tDS = DatasourceFactory.createDatasource("testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config", "testdata/small_uniprot_prot_seq_ds/")
        outputAnnotation = "UniProt_aapos"
        m = MutationData()
        m.createAnnotation('transcript_id', 'uc009vvt.1')
        m.createAnnotation('protein_change', 'p.T1105A')
        m = tDS.annotate_mutation(m)
        self.assertTrue(m[outputAnnotation] == "969", "Did not get proper value (969): " + m[outputAnnotation])
    def test_overlapping_multiple_transcripts_snp(self):
        base_config_location = "testdata/ensembl/saccer/"

        ensembl_ds = DatasourceFactory.createDatasource(base_config_location + "ensembl.config", base_config_location)
        recs = ensembl_ds.get_overlapping_transcripts("I", "550", "550")
        self.assertTrue(len(recs) == 2)
        ids = set()
        for r in recs:
            ids.add(r.get_transcript_id())

        self.assertTrue(len(ids - set(['YAL069W', 'YAL068W-A'])) == 0)
    def test_overlapping_multiple_transcripts_snp(self):
        base_config_location = "testdata/ensembl/saccer/"

        ensembl_ds = DatasourceFactory.createDatasource(base_config_location + "ensembl.config", base_config_location)
        recs = ensembl_ds.get_overlapping_transcripts("I", "550", "550")
        self.assertTrue(len(recs) == 2)
        ids = set()
        for r in recs:
            ids.add(r.get_transcript_id())

        self.assertTrue(len(ids - {'YAL069W', 'YAL068W-A'}) == 0)
    def testdbNSFPNoRefAltAnnotationWithExactMatch(self):
        """

        """
        self.logger.info("Initializing dbNSFP")
        tabixIndexedTsvDirName = os.path.join(*[
            "testdata", "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds", "hg19"
        ])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(
                tabixIndexedTsvDirName,
                "dbNSFP_chr1_chr3_100vars_exact_no_ref_alt_ds.config"),
            tabixIndexedTsvDirName)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "35140"
        m1.end = "35140"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("dbNSFP_codonpos")
        cur_annotation = Annotation(
            value="1|1|1",
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_refcodon")
        cur_annotation = Annotation(
            value="TAA|TAA|TAA",
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_cds_strand")
        cur_annotation = Annotation(
            value="-|-|-",
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")
Exemple #29
0
    def testdbNSFPAnnotationWithMissingExactMatch(self):  # SNPs only
        """

        """
        self.logger.info("Initializing dbNSFP")
        tabixIndexedTsvDirName = os.path.join(
            *["testdata", "dbNSFP_chr1_6vars_exact_ds", "hg19"])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedTsvDirName,
                         "dbNSFP_chr1_6vars_exact_ds.config"),
            tabixIndexedTsvDirName)

        m1 = MutationData()
        m1.chr = "1"
        m1.start = "35138"
        m1.end = "35138"
        m1.ref_allele = "T"
        m1.alt_allele = "C"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("dbNSFP_codonpos")
        cur_annotation = Annotation(
            value="",
            datasourceName="dbNSFP",
            dataType="Integer",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_refcodon")
        cur_annotation = Annotation(
            value="",
            datasourceName="dbNSFP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("dbNSFP_cds_strand")
        cur_annotation = Annotation(
            value="",
            datasourceName="dbNSFP",
            dataType="Float",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")
Exemple #30
0
    def testESPCoverageAnnotationWithMissingIndelOverlapMatch(self):
        """


        """
        self.logger.info("Initializing ESP6500SI-V2 Coverage")
        tabixIndexedTsvDirName = os.path.join(
            *["testdata", "small_esp_coverage_overlap_ds", "hg19"])
        tabixIndexedTsvDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedTsvDirName,
                         "small_esp_coverage_overlap_ds.config"),
            tabixIndexedTsvDirName)

        m1 = MutationData()
        m1.chr = "X"
        m1.start = "100075300"
        m1.end = "100075336"

        m1_annotated = tabixIndexedTsvDatasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("ESP_AvgAAsampleReadDepth")
        cur_annotation = Annotation(
            value="75.0|81.0|81.0",
            datasourceName="ESP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_TotalAAsamplesCovered")
        cur_annotation = Annotation(
            value="692|692|692",
            datasourceName="ESP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_Chromosome")
        cur_annotation = Annotation(
            value="X|X|X",
            datasourceName="ESP",
            dataType="String",
            description="",
            tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
            number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation),
                        "Annotations do not match.")
 def testBasicCosmicInit(self):
     """ Very simple test that will create a datasource from a sample datasource directory.  
     The directory conforms to the standard datasource structure, including placement of the config file.
     """
     ds = DatasourceFactory.createDatasource('testdata/small_cosmic/small_cosmic.config', "testdata/small_cosmic")
     
     m = MutationDataFactory.default_create()
     m.chr = 19
     m.start = 58858921
     m.end = 58858921
     
     m = ds.annotate_mutation(m)
     
     self.assertTrue(m['COSMIC_overlapping_mutation_AAs'] == 'p.P426P(1)', "Did not properly annotate mutation: " + m['COSMIC_overlapping_mutation_AAs'])
    def testTags(self):
        """

        """
        self.logger.info("Initializing ESP6500SI-V2")
        tabixIndexedVcfDirName = os.path.join(*["testdata", "small_esp_ds"])
        tabixIndexedVcfDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedVcfDirName, "esp.config"), tabixIndexedVcfDirName)

        tagsDict = tabixIndexedVcfDatasource._determine_tags()
        for ID in tagsDict:
            tags = tagsDict[ID]
            self.assertTrue(len(tags) == 2, "The length of tags is not 2 but %s." % len(tags))
            self.assertTrue(TagConstants.INFO in tags, "INFO tag is missing for %s." % ID)
            self.assertTrue(TagConstants.NOT_SPLIT in tags, "NOT_SPLIT tag is missing for %s." % ID)
    def testBasicGeneTSVInit(self):
        """ Make sure that we can initialize a simple tsv data source """

        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        self.assertTrue(geneDS <> None, "gene indexed datasource was None.")

        m = MutationDataFactory.default_create()
        m.createAnnotation('gene', "ABL1")
        m = geneDS.annotate_mutation(m)
        self.assertTrue(
            m['CGC_Abridged_Name'] ==
            "v-abl Abelson murine leukemia viral oncogene homolog 1",
            "Test gene TSV datasource did not annotate properly.")
    def testDatasourceCreator(self):
        """ Test that the datasource creator process will work for v1 of TranscriptToUniProtProteinPositionTransformingDatasource.  NOTE: This test needs to be updated to use sqlite instead of filesystem file.
        """

        tDS = DatasourceFactory.createDatasource(
            "testdata/small_uniprot_prot_seq_ds/small_uniprot_prot_seq_ds.config",
            "testdata/small_uniprot_prot_seq_ds/")
        outputAnnotation = "UniProt_aapos"
        m = MutationData()
        m.createAnnotation('transcript_id', 'uc009vvt.1')
        m.createAnnotation('protein_change', 'p.T1105A')
        m = tDS.annotate_mutation(m)
        self.assertTrue(
            m[outputAnnotation] == "969",
            "Did not get proper value (969): " + m[outputAnnotation])
Exemple #35
0
 def test_simple_annotation_without_version_number_in_data_nor_query(self):
     ''' Create a dummy mutation and make sure it gets annotated properly when there is a version number in the query, but version number is not in the datasource.'''
     m = MutationDataFactory.default_create()
     m.createAnnotation('transcript_id', 'uc001hms')
     transcriptDS = DatasourceFactory.createDatasource(
         "testdata/small_transcript_tsv_ds_no_version_number/small_transcript_tsv_ds.config",
         "testdata/small_transcript_tsv_ds_no_version_number/")
     m = transcriptDS.annotate_mutation(m)
     self.assertTrue(
         m['refseq_test_mRNA_Id'] == 'NM_022746',
         "Transcript-based annotation did not populate properly: " +
         m['refseq_test_mRNA_Id'])
     self.assertTrue(
         m['refseq_test_prot_Id'] == 'NP_073583',
         "Transcript-based annotation did not populate properly: " +
         m['refseq_test_prot_Id'])
 def testSimpleAnnotation(self):
     """ Create a dummy mutation and make sure it gets annotated properly """
     m = MutationData()
     m.createAnnotation("transcript_id", "uc001hms.3")
     transcriptDS = DatasourceFactory.createDatasource(
         "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config", "testdata/small_transcript_tsv_ds/"
     )
     m = transcriptDS.annotate_mutation(m)
     self.assertTrue(
         m["refseq_test_mRNA_Id"] == "NM_022746",
         "Transcript-based annotation did not populate properly: " + m["refseq_test_mRNA_Id"],
     )
     self.assertTrue(
         m["refseq_test_prot_Id"] == "NP_073583",
         "Transcript-based annotation did not populate properly: " + m["refseq_test_prot_Id"],
     )
 def testDoubleAnnotationError(self):
     ''' Given a maf file that used to cause a duplicate annotation exception, do not throw that (or any) exception. '''
     outputFilename = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'
     
     gpDS = DatasourceFactory.createDatasource("testdata/small_genome_position_tsv_ds/oreganno_trim.config", "testdata/small_genome_position_tsv_ds/")
     
     
     annotator = Annotator()
     annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/testDoubleAnnotate.maf.tsv'))
     annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
     annotator.addDatasource(gpDS)
     testFilename = annotator.annotate()
     
     
     # Make sure that some values were populated
     self.assertTrue(os.path.exists(testFilename))
 def test_simple_annotation_with_version_number_in_data_but_not_query(self):
     """ Create a dummy mutation and make sure it gets annotated properly with version num in data, but not query """
     m = MutationDataFactory.default_create()
     m.createAnnotation("transcript_id", "uc001hms")
     transcriptDS = DatasourceFactory.createDatasource(
         "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config", "testdata/small_transcript_tsv_ds/"
     )
     m = transcriptDS.annotate_mutation(m)
     self.assertTrue(
         m["refseq_test_mRNA_Id"] == "NM_022746",
         "Transcript-based annotation did not populate properly: " + m["refseq_test_mRNA_Id"],
     )
     self.assertTrue(
         m["refseq_test_prot_Id"] == "NP_073583",
         "Transcript-based annotation did not populate properly: " + m["refseq_test_prot_Id"],
     )
    def testBasicDatasourceSorting(self):
        """Test that the GAF datasource is sorted before a gene-based datasource"""

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")

        incorrectSortList = [geneDS, gafDatasource]
        guessSortList = DatasourceFactory.sortDatasources(incorrectSortList)
        self.assertTrue(guessSortList[1] == geneDS, "Sorting is incorrect.")
        self.assertTrue(
            len(guessSortList) == 2,
            "Sorting altered number of datasources (gt: 2): " +
            str(len(guessSortList)))
Exemple #40
0
 def testSimpleAnnotation(self):
     ''' Create a dummy mutation and make sure it gets annotated properly '''
     m = MutationData()
     m.createAnnotation('transcript_id', 'uc001hms.3')
     transcriptDS = DatasourceFactory.createDatasource(
         "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
         "testdata/small_transcript_tsv_ds/")
     m = transcriptDS.annotate_mutation(m)
     self.assertTrue(
         m['refseq_test_mRNA_Id'] == 'NM_022746',
         "Transcript-based annotation did not populate properly: " +
         m['refseq_test_mRNA_Id'])
     self.assertTrue(
         m['refseq_test_prot_Id'] == 'NP_073583',
         "Transcript-based annotation did not populate properly: " +
         m['refseq_test_prot_Id'])
    def testExampleVcfDBAnnotationWithMissingIndelExactMatch(self):
        """

        """
        tabixIndexedVcfDirName = os.path.join(*["testdata", "vcf_db_exact", "hg19"])
        tabixIndexedVcfDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedVcfDirName, "vcf_db_exact.config"), tabixIndexedVcfDirName)

        chrom = "21"
        start = "1234567"
        end = "1234567"
        ref_allele = "AGTC"
        alt_allele = "A"
        build = "hg19"
        m1 = MutUtils.initializeMutFromAttributes(chrom, start, end, ref_allele, alt_allele, build)

        m1_annotated = tabixIndexedVcfDatasource.annotate_mutation(m1)

        m1_annotation = m1_annotated.getAnnotation("ESP_AF")
        cur_annotation = Annotation(value="", datasourceName="ESP", dataType="Float",
                                    description="Allele Frequency", tags=[TagConstants.INFO, TagConstants.SPLIT],
                                    number=-1)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_X")
        cur_annotation = Annotation(value="", datasourceName="ESP", dataType="String",
                                    description="A random variable, X", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=2)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_H2")
        cur_annotation = Annotation(value="", datasourceName="ESP", dataType="Flag",
                                    description="HapMap2 membership", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=0)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_Y")
        cur_annotation = Annotation(value="", datasourceName="ESP", dataType="String",
                                    description="A random variable, Y", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=-2)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_Z")
        cur_annotation = Annotation(value="", datasourceName="ESP", dataType="Float",
                                    description="A random variable, Z", tags=[TagConstants.INFO,
                                                                              TagConstants.NOT_SPLIT], number=3)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")
    def testExampleVcfDBAnnotationWithIndelAvgMatch(self):
        """

        """
        tabixIndexedVcfDirName = os.path.join(*["testdata", "vcf_db_avg", "hg19"])
        tabixIndexedVcfDatasource = DatasourceFactory.createDatasource(
            os.path.join(tabixIndexedVcfDirName, "vcf_db_avg.config"), tabixIndexedVcfDirName)

        chrom = "4"
        start = "1234567"
        end = "1234567"
        ref_allele = "GTC"
        alt_allele = "GTCTTA"
        build = "hg19"
        m1 = MutUtils.initializeMutFromAttributes(chrom, start, end, ref_allele, alt_allele, build)

        m1_annotated = tabixIndexedVcfDatasource.annotate_mutation(m1)

        m1_annotation = m1_annotated.getAnnotation("ESP_AF")
        cur_annotation = Annotation(value="0.5", datasourceName="ESP", dataType="Float",
                                    description="Allele Frequency", tags=[TagConstants.INFO, TagConstants.SPLIT],
                                    number=-1)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_AC")
        cur_annotation = Annotation(value="3.0", datasourceName="ESP", dataType="Float",
                                    description="Allele Count", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_H2")
        cur_annotation = Annotation(value="False|False|False", datasourceName="ESP", dataType="String",
                                    description="HapMap2 membership", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_AA")
        cur_annotation = Annotation(value="T", datasourceName="ESP", dataType="String",
                                    description="Ancestral Allele", tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                    number=1)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        m1_annotation = m1_annotated.getAnnotation("ESP_Z")
        cur_annotation = Annotation(value="2.0,3.0,3.0", datasourceName="ESP", dataType="Float",
                                    description="A random variable, Z", tags=[TagConstants.INFO,
                                                                              TagConstants.NOT_SPLIT], number=3)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")
    def test_convert_genomic_space_to_transcript_space(self):
        base_config_location = "testdata/ensembl/saccer/"
        ensembl_ds = DatasourceFactory.createDatasource(base_config_location + "ensembl.config", base_config_location)

        tx = ensembl_ds.get_overlapping_transcripts("I", "350", "350") # transcript starts at 335.
        start, end = TranscriptProviderUtils.convert_genomic_space_to_transcript_space("350", "350", tx[0])
        self.assertTrue(start == end)
        self.assertTrue(start == 16)

        tx = ensembl_ds.get_overlapping_transcripts("II", "764690", "764690") # transcript starts at 764697 (strand is '-').
        start, end = TranscriptProviderUtils.convert_genomic_space_to_transcript_space("764690", "764690", tx[0])
        self.assertTrue(start == end)
        self.assertTrue(start == 7)

        start, end = TranscriptProviderUtils.convert_genomic_space_to_transcript_space("764680", "764690", tx[0])
        self.assertTrue(start == (end - 10))
        self.assertTrue(start == 7)
 def test_simple_annotation_without_version_number_in_data(self):
     """ Create a dummy mutation and make sure it gets annotated properly when there is a version number in the query, but version number is not in the datasource."""
     m = MutationDataFactory.default_create()
     m.createAnnotation("transcript_id", "uc001hms.3")
     transcriptDS = DatasourceFactory.createDatasource(
         "testdata/small_transcript_tsv_ds_no_version_number/small_transcript_tsv_ds.config",
         "testdata/small_transcript_tsv_ds_no_version_number/",
     )
     m = transcriptDS.annotate_mutation(m)
     self.assertTrue(
         m["refseq_test_mRNA_Id"] == "NM_022746",
         "Transcript-based annotation did not populate properly: " + m["refseq_test_mRNA_Id"],
     )
     self.assertTrue(
         m["refseq_test_prot_Id"] == "NP_073583",
         "Transcript-based annotation did not populate properly: " + m["refseq_test_prot_Id"],
     )
 def testBasicRefInit(self):
     """ Very simple test that will create a reference datasource from a sample datasource directory.  
     The directory conforms to the standard datasource structure, including placement of the config file.
     """
     ds = DatasourceFactory.createDatasource('testdata/reference_ds/reference_ds.config', "testdata/reference_ds")
     
     m = MutationDataFactory.default_create()
     m.chr = "22"
     m.start = "11"
     m.end = "11"
     
     groundTruth = "CCCAAGCTAAACCCAGGCCAC"
     
     # remember that the annotate_mutation returns a generator, so we use an iterator
     m = ds.annotate_mutation(m)
     
     self.assertTrue(m['ref_context'] == groundTruth, "ref_context was not populated properly: " + str(m['ref_context']))
Exemple #46
0
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        tmp = MafliteInputMutationCreator(inputFilename,
                                          'configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        outputRenderer = TcgaMafOutputRenderer(
            outputFilename, 'configs/tcgaMAF2.4_output.config')
        annotator = Annotator()

        annotator.setInputCreator(tmp)
        annotator.setOutputRenderer(outputRenderer)
        ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        annotator.addDatasource(ds)
        annotator.annotate()

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")
        tsvReaderIn = GenericTsvReader(inputFilename)
        tsvReader = GenericTsvReader(outputFilename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")
        self.assertTrue("i_TJ_Data_Why" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Why) from header")
        self.assertTrue("i_TJ_Data_Who" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Who) from header")

        ctrOut = 0
        for lineDict in tsvReader:
            ctrOut += 1
        ctrIn = 0
        for lineDict in tsvReaderIn:
            ctrIn += 1
        ctrIn += len(tsvReaderIn.getCommentsAsList())
        ctrOut += len(tsvReader.getCommentsAsList())

        self.assertTrue(
            ctrOut == (ctrIn + 2),
            "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): "
            + str(ctrIn) + ", " + str(ctrOut))
    def testDoubleAnnotationError(self):
        ''' Given a maf file that used to cause a duplicate annotation exception, do not throw that (or any) exception. '''
        outputFilename = 'out/genericGenomePositionDoubleAnnotationTest.out.tsv'

        gpDS = DatasourceFactory.createDatasource(
            "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
            "testdata/small_genome_position_tsv_ds/")

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/testDoubleAnnotate.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gpDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
    def testBasicCosmicInit(self):
        """ Very simple test that will create a datasource from a sample datasource directory.  
        The directory conforms to the standard datasource structure, including placement of the config file.
        """
        ds = DatasourceFactory.createDatasource(
            'testdata/small_cosmic/small_cosmic.config',
            "testdata/small_cosmic")

        m = MutationDataFactory.default_create()
        m.chr = 19
        m.start = 58858921
        m.end = 58858921

        m = ds.annotate_mutation(m)

        self.assertTrue(
            m['COSMIC_overlapping_mutation_AAs'] == 'p.P426P(1)',
            "Did not properly annotate mutation: " +
            m['COSMIC_overlapping_mutation_AAs'])
Exemple #49
0
def main():
    args = parseOptions()
    input_gene_list_file = args.input_gene_list_file
    transcript_ds = args.transcript_ds_config
    outputFilename = args.outputFilename
    isNonCoding = args.includeNonCoding
    padding = args.padding
    ds = DatasourceFactory.createDatasource(transcript_ds, os.path.dirname(transcript_ds))
    input_gene_list_file_fp = file(input_gene_list_file, 'r')
    outputFileFP = file(outputFilename, 'w')
    errorFileFP = file(outputFilename + ".err", 'w')
    for line in input_gene_list_file_fp:
        gene = line.strip()
        exons = ds.retrieveExons(gene, isCodingOnly=(not isNonCoding), padding=int(padding))
        if len(exons) == 0:
            errorFileFP.write("Could not locate " + gene + "\n")
        for e in exons:
            outputFileFP.write('%s\t%s\t%s\t%s\n' % (e[0], e[1], e[2], e[3]))
    print("Done ... " + outputFilename)
    def testBasicRefInit(self):
        """ Very simple test that will create a reference datasource from a sample datasource directory.  
        The directory conforms to the standard datasource structure, including placement of the config file.
        """
        ds = DatasourceFactory.createDatasource(
            'testdata/reference_ds/reference_ds.config',
            "testdata/reference_ds")

        m = MutationDataFactory.default_create()
        m.chr = "22"
        m.start = "11"
        m.end = "11"

        groundTruth = "CCCAAGCTAAACCCAGGCCAC"

        # remember that the annotate_mutation returns a generator, so we use an iterator
        m = ds.annotate_mutation(m)

        self.assertTrue(
            m['ref_context'] == groundTruth,
            "ref_context was not populated properly: " + str(m['ref_context']))
    def testBasicAnnotation(self):
        """ Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. """
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config", "testdata/small_transcript_tsv_ds/"
        )
        outputFilename = "out/genericTranscriptTest.out.tsv"

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("refseq_test_mRNA_Id" in headers, "refseq_test_mRNA_Id not found in headers: " + str(headers))
        self.assertTrue("refseq_test_prot_Id" in headers, "refseq_test_prot_Id not found in headers: " + str(headers))
Exemple #52
0
def main():
    args = parseOptions()
    input_gene_list_file = args.input_gene_list_file
    transcript_ds = args.transcript_ds_config
    outputFilename = args.outputFilename
    isNonCoding = args.includeNonCoding
    padding = args.padding
    ds = DatasourceFactory.createDatasource(transcript_ds,
                                            os.path.dirname(transcript_ds))
    input_gene_list_file_fp = file(input_gene_list_file, 'r')
    outputFileFP = file(outputFilename, 'w')
    errorFileFP = file(outputFilename + ".err", 'w')
    for line in input_gene_list_file_fp:
        gene = line.strip()
        exons = ds.retrieveExons(gene,
                                 isCodingOnly=(not isNonCoding),
                                 padding=int(padding))
        if len(exons) == 0:
            errorFileFP.write("Could not locate " + gene + "\n")
        for e in exons:
            outputFileFP.write('%s\t%s\t%s\t%s\n' % (e[0], e[1], e[2], e[3]))
    print("Done ... " + outputFilename)
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv of Genomic positions.  This tests both single- and multiple-nucleotide variants.  The tsv is already installed (i.e. proper config file created).
        '''
        outputFilename = 'out/genericGenomePositionTest.out.tsv'

        gpDS = DatasourceFactory.createDatasource(
            "testdata/small_genome_position_tsv_ds/oreganno_trim.config",
            "testdata/small_genome_position_tsv_ds/")

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/tiny_maflite.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gpDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 1
        # Two overlap, one does not.  Repeat...
        for lineDict in tsvReader:
            if (ctr % 3 == 0):
                self.assertTrue(
                    lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " +
                    str(ctr) + " should have had blank value, but did not: " +
                    lineDict["ORegAnno_hg19.oreganno.id"])
            else:
                self.assertFalse(
                    lineDict["ORegAnno_hg19.oreganno.id"] == '', "Line " +
                    str(ctr) + " should not have had blank value, but did.")
                self.assertTrue(
                    lineDict["ORegAnno_hg19.oreganno.id"] == 'OREG0013034',
                    "Line " + str(ctr) + " did not have correct value: " +
                    lineDict["ORegAnno_hg19.oreganno.id"])
            ctr = ctr + 1
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        ''' 
        
        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = ['ABL1','ABL2','ACSL3','AF15Q14','AF1Q','AF3p21','AF5q31','AKAP9','AKT1','AKT2','ALDH2','ALK','ALO17','APC','ARHGEF12','ARHH','ARID1A','ARID2','ARNT','ASPSCR1','ASXL1','ATF1','ATIC','ATM','ATRX','BAP1','BCL10','BCL11A','BCL11B']
        
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'
        
        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()
        
        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)
        
        fields = tsvReader.getFieldNames()
        self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in fields, "'CGC_Other Syndrome/Disease' was not present in the header")
        self.assertTrue('CGC_Abridged_Mutation Type' in fields, "'CGC_Abridged_Mutation Type' was not present in the header")
        
        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(lineDict['CGC_Abridged_GeneID'] <> '', "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr))
                linesThatShouldBeAnnotated = linesThatShouldBeAnnotated + 1
            ctr = ctr + 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0, "Bad data -- cannot test missed detects.")
    def testCreateIndexedTsvDatasource(self):
        datasourceFilename = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
        datasourceFoldername = "1000Genomes"
        datasourceName = "1000Genomes"
        datasourceType = "indexed_tsv"
        datasourceVersion = "V4.1"
        genomeBuild = "hg19"
        indexColumnNames = "CHROM,POS,POS"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        annotationColumnNames = "DBSNP,EA_AC,AA_AC,TAC"

        tmpDir = tempfile.mkdtemp()
        destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir=destDir, ds_file=datasourceFilename,
                                                 ds_foldername=datasourceFoldername, ds_name=datasourceName,
                                                 ds_type=datasourceType, ds_version=datasourceVersion,
                                                 index_columns=indexColumnNames,
                                                 ds_annotation_columns=annotationColumnNames)

        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        configFilename = os.path.join(*[destDir, "1000Genomes.config"])
        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "column_names"),
                        "column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "annotation_column_names"),
                        "annotation_column_names option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), datasourceType,
                         "Expected data source type is %s but was %s."
                         % (datasourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), datasourceName,
                         "Expected data source title is %s but was %s."
                         % (datasourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), datasourceVersion,
                         "Expected data source version is %s but was %s."
                         % (datasourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "column_names"), columnNames,
                         "Expected data source column names is %s but was %s."
                         % (columnNames, configParser.get("general", "column_names")))
        self.assertEqual(configParser.get("general", "annotation_column_names"), annotationColumnNames,
                         "Expected data source annotation column names is %s but was %s."
                         % (annotationColumnNames, configParser.get("general", "annotation_column_names")))

        self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername, genomeBuild + ".md5"])),
                        "No md5 file was generated.")

        datasource = DatasourceFactory.createDatasource(configFilename, destDir)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "802177"
        m1.end = "802177"
        m1.ref_allele = "T"
        m1.alt_allele = "C"

        m1_annotated = datasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("1000Genomes_AA_AC")
        cur_annotation = Annotation(value="2,866", datasourceName="1000Genomes", dataType="String",
                                    description="",
                                    tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        annotationNames = ["1000Genomes_CHROM", "1000Genomes_POS", "1000Genomes_REF", "1000Genomes_ALT",
                           "1000Genomes_GWAS_PUBMED"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName not in m1_annotated, "m1_annotated was annotated with %s." % annotationName)

        annotationNames = ["1000Genomes_DBSNP", "1000Genomes_EA_AC", "1000Genomes_AA_AC", "1000Genomes_TAC"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName in m1_annotated, "m1_annotated was not annotated with %s value."
                                                            % annotationName)

        MutUtils.removeDir(tmpDir)
Exemple #56
0
def main():
    args = parseOptions()
    uniprot_swiss_fname = expanduser(args.swiss_file)
    uniprot_trembl_fname = expanduser(args.trembl_file)
    output_file = args.output_file
    gencode_ds_loc = expanduser(args.gencode_ds)

    out_tx_matches = args.out_tx_matches
    out_tx_matches_fp = file(out_tx_matches, 'w')

    setup_logging()

    uniprot_tsv = expanduser(args.uniprot_tsv)
    blast_exe = args.blast_exe

    gencode_ds = DatasourceFactory.createDatasource(
        configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc))
    #uniprotDS = DatasourceFactory.createDatasource(configFilename=uniprot_ds_loc, leafDir=os.path.dirname(uniprot_ds_loc))
    uniprotDS = GenericTranscriptDatasource(src_file=uniprot_tsv,
                                            title="UniProt",
                                            version="2014_12",
                                            geneColumnName="gene")

    tmp_dir = args.temp_pickle_store
    if tmp_dir is None:
        tmp_dir = mkdtemp(prefix="onco_unipickles_")
    swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data,
                                tmp_dir)
    trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data,
                                 tmp_dir)
    alignmentDB = Shove("file://" + output_file, "simple://")

    # Go through each transcript
    txs = gencode_ds.getTranscriptDict()
    tx_ids = txs.keys()
    num_tx_ids = len(tx_ids)
    swissKeys = swiss_data.keys()
    tremblKeys = trembl_data.keys()

    uniprotEntryNameKey = 'UniProt_uniprot_entry_name'

    numNotInProteinSeqs = 0
    numTranscriptsNotInUniprot = 0
    ctr = 0
    process_list = []
    tpool = Pool(processes=4)
    for tx_id in tx_ids:
        ctr += 1
        if (ctr % 2000) == 0:
            logging.getLogger(__name__).info(str(ctr) + "/" + str(num_tx_ids))

        tx_protein_seq = txs[tx_id].get_protein_seq()
        if tx_protein_seq is None or tx_protein_seq.strip(
        ) == "" or tx_protein_seq.strip() == "*":
            numNotInProteinSeqs += 1
            continue

        # Create a fake dummy mutation and annotate the gene and the simple_uniprot info
        m = MutationDataFactory.default_create()
        m.createAnnotation('gene', txs[tx_id].get_gene())
        m.createAnnotation('transcript_id', tx_id)
        m = uniprotDS.annotate_mutation(m)
        uniprot_entry_key = m[uniprotEntryNameKey]
        if uniprot_entry_key in swissKeys:
            uniprot_record = swiss_data[uniprot_entry_key]

        elif uniprot_entry_key in tremblKeys:
            uniprot_record = trembl_data[uniprot_entry_key]
        else:
            numTranscriptsNotInUniprot += 1
            continue
        uniprot_seq = uniprot_record.sequence

        # print(m['transcript_id'] + " " + m[uniprotEntryNameKey])
        # "/bulk/blast-2.2.26/bin/bl2seq" is blast_exe for Lee's laptop VM

        # When doing the comparison, tx protein includes stop codon at the end, uniprot does not.
        if tx_protein_seq[0:-1] == uniprot_seq:
            out_tx_matches_fp.write(tx_id + "\n")

        # runAlignment(tx_id, uniprot_entry_key, tx_protein_seq, uniprot_seq, tmp_dir, blast_exe, alignmentDB)
        p = (tx_id, uniprot_entry_key, tx_protein_seq, uniprot_seq, tmp_dir,
             blast_exe)
        process_list.append(p)

    # # Running all of the processes....
    logging.getLogger(
        __name__).info("Running big block of alignments across multicores (" +
                       str(len(process_list)) + " alignments)")
    alignment_data_tuples = tpool.map(run_alignment_given_tuple, process_list)
    # logging.getLogger(__name__).info("Running big block of alignments across one core (" + str(len(process_list)) + " alignments)")
    # alignment_data_tuples = [run_alignment_given_tuple(p) for p in process_list]

    logging.getLogger(__name__).info("Storing results")
    for t in alignment_data_tuples:
        alignmentDB[t[0]] = t[1]

    logging.getLogger(__name__).info("Could not get protein seq for " +
                                     str(numNotInProteinSeqs) +
                                     " transcripts.")
    logging.getLogger(__name__).info("Could not get uniprot seq for " +
                                     str(numTranscriptsNotInUniprot) +
                                     " transcripts.")
    logging.getLogger(__name__).info("Attempted " + str(ctr) + " muts")
    parser = ArgumentParser(description=desc, formatter_class=RawDescriptionHelpFormatter, epilog=epilog)
    parser.add_argument("gencode_ds_loc", type=str, help="Location of the GENCODE datasource config file -- E.g. /bulk/dbDir/gencode_ds/hg19/gencode_ds.config")
    parser.add_argument("output_file", type=str, help="TSV filename for output.  File will be overwritten if it already exists.")


    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parseOptions()
    output_file = expanduser(args.output_file)
    gencode_ds_loc = expanduser(args.gencode_ds_loc)

    # Instantiate a gencode datasource
    gencode_ds = DatasourceFactory.createDatasource(configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc))

    # Get all transcript IDs in the datasource
    txs = gencode_ds.getTranscriptDict()
    tx_ids = txs.keys()
    num_tx_ids = len(tx_ids)

    # Initialize output file
    fp = file(output_file, 'w')
    ctr = 0
    for tx_id in tx_ids:
        ctr += 1
        if (ctr % 2000) == 0:
            print(str(ctr) + "/" + str(num_tx_ids))
            fp.flush()
        m = MutationData()
        m.createAnnotation('gene', tDict[transcriptID]['gene'])
        m.createAnnotation('transcript_id', transcriptID)
        m = uniprotDS.annotate_mutation(m)
        yield m

if __name__ == '__main__':
    args = parseOptions()
    uniprot_swiss_fname = expanduser(args.swiss_file)
    uniprot_trembl_fname = expanduser(args.trembl_file)
    output_file = args.output_file
    gencode_ds_loc = expanduser(args.gencode_ds)
    uniprot_ds_loc = expanduser(args.simple_uniprot_ds)
    blast_exe = args.blast_exe

    gencode_ds = DatasourceFactory.createDatasource(configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc))
    uniprotDS = DatasourceFactory.createDatasource(configFilename=uniprot_ds_loc, leafDir=os.path.dirname(uniprot_ds_loc))

    tmp_dir = args.temp_pickle_store
    if tmp_dir is None:
        tmp_dir = mkdtemp(prefix="onco_unipickles_")
    swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data, tmp_dir)
    trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data, tmp_dir)
    alignmentDB = Shove("file://" + output_file, "simple://")

    # Go through each transcript
    txs = gencode_ds.getTranscriptDict()
    tx_ids = txs.keys()
    num_tx_ids = len(tx_ids)
    swissKeys = swiss_data.keys()
    tremblKeys = trembl_data.keys()