Esempio n. 1
0
    def testBasicAnnotation(self):
        """Annotate with a generic transcript-based TSV datasource and check the output header.

        Only the presence of the expected column names is verified; the
        annotation values themselves are not inspected.
        """
        # A transcript provider (gaf) datasource is needed so that the gene gets annotated.
        transcript_provider = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        tsv_datasource = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/")

        annotator = Annotator()
        input_creator = MafliteInputMutationCreator(
            'testdata/maflite/Patient0.snp.maf.txt')
        annotator.setInputCreator(input_creator)
        renderer = SimpleOutputRenderer('out/genericTranscriptTest.out.tsv')
        annotator.setOutputRenderer(renderer)
        for datasource in (transcript_provider, tsv_datasource):
            annotator.addDatasource(datasource)

        # The value returned by annotate() is used as the path of the rendered file.
        rendered_path = annotator.annotate()

        headers = GenericTsvReader(rendered_path).getFieldNames()
        for expected in ("refseq_test_mRNA_Id", "refseq_test_prot_Id"):
            self.assertTrue(
                expected in headers,
                expected + " not found in headers: " + str(headers))
    def test_full_seg_file_annotations(self):
        """Annotate a seg file in full and render it as SIMPLE_TSV.

        Checks that the segment columns are in the header and that every row
        has non-blank ``start``/``end`` values and a ``genes`` column.
        """
        seg_path = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_full_seg_file_annotations.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")

        # Get rid of output left behind by an earlier run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        annotator.initialize(RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "SIMPLE_TSV",
            seg_path,
            output_filename,
            datasource_dir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS))
        annotator.annotate()

        # Inspect what was rendered.
        output_reader = GenericTsvReader(output_filename)
        headers = output_reader.getFieldNames()
        for required in ("Sample", "Num_Probes", "Segment_Mean"):
            self.assertTrue(required in headers)

        for row in output_reader:
            for position_col in ('start', 'end'):
                self.assertTrue(row[position_col] is not None)
                self.assertTrue(row[position_col].strip() != "")
            self.assertTrue("genes" in row.keys())
            # NOTE(review): str.split always yields at least one element, so
            # this assertion cannot fail -- confirm what it was meant to check.
            self.assertTrue(len(row["genes"].split(",")) > 0)
    def testCreationAndAnnotation(self):
        """Create the natural-variation datasource and run a simple annotation with it.

        The single mutation in the input must be annotated with both expected
        natural variations (compared without regard to their order).
        """
        natvar_column = "UniProt_NatVar_natural_variations"
        expected_variations = sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|"))

        transcript_ds = TestUtils.createTranscriptProviderDatasource(self.config)
        natvar_ds = DatasourceFactory.createDatasource(
            "testdata/simple_uniprot_natvar/simple_uniprot_natvar.config",
            "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(
            SimpleOutputRenderer('out/genericGeneProteinPositionTest.out.tsv'))
        for datasource in (transcript_ds, natvar_ds):
            annotator.addDatasource(datasource)
        rendered_path = annotator.annotate()

        # The output must exist before its values can be checked.
        self.assertTrue(os.path.exists(rendered_path))

        rows_seen = 0
        for row in GenericTsvReader(rendered_path):
            observed = sorted(row[natvar_column].split("|"))
            self.assertTrue(observed == expected_variations,
                            "Annotation value did not match: " + row[natvar_column])
            rows_seen += 1

        self.assertTrue(rows_seen == 1, "Number of mutations incorrect (1): " + str(rows_seen))
    def test_basic_rendering(self):
        """Render a basic seg file as a gene list and sanity-check every output row."""
        seg_path = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_basic_rendering.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")

        # Get rid of output left behind by an earlier run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        annotator.initialize(RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "GENE_LIST",
            seg_path,
            output_filename,
            datasourceDir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS))
        annotator.annotate()

        # Inspect what was rendered.
        output_reader = GenericTsvReader(output_filename)
        headers = output_reader.getFieldNames()  # fetched, but nothing is asserted on it

        for row in output_reader:
            for boundary_col in ('segment_start', 'segment_end'):
                self.assertTrue(row[boundary_col] is not None)
                self.assertTrue(row[boundary_col].strip() != "")
            self.assertTrue("gene" in row.keys())
            self.assertTrue(len(row["gene"]) > 0)
            # Passes only when the value parses as a float and is non-zero.
            self.assertTrue(float(row["segment_num_probes"]))
            self.assertTrue(row['sample'] == "Patient0")
Esempio n. 5
0
    def testSNPsAndIndelStartAndEndPos(self):
        """
        Check that the start and end positions of SNPs and indels follow the NCI's MAF specification
        (https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification).

        Rows are recognized by their ``start`` value; rows with any other start are not checked.
        """
        vcf_path = os.path.join("testdata", "vcf", "example.snps.indels.vcf")
        tsv_path = os.path.join("out", "example.snps.indels.out.tsv")

        # start position -> end position expected in the rendered TSV
        expected_end_for_start = {
            "16890445": "16890445",
            "154524458": "154524459",
            "114189432": "114189433",
        }

        creator = VcfInputMutationCreator(vcf_path)
        creator.createMutations()
        renderer = SimpleOutputRenderer(tsv_path)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        for row in GenericTsvReader(tsv_path):
            expected_end = expected_end_for_start.get(row['start'])
            if expected_end is None:
                continue
            self.assertEqual(
                row["end"], expected_end,
                "The value should be %s but it was %s." % (expected_end, row["end"]))
    def testProperConversionVcfToMaf(self):
        """Test that ref, alt, and positions are properly populated in a TCGA MAF generated from a VCF.

        Insertions must carry "-" as the reference allele and deletions "-" as
        the alternate allele.  Positions, alleles and the barcode/UUID columns
        of every row are compared against the values expected for the test
        VCF, and exactly 8 mutations must be rendered.
        """

        # For this conversion, you must specify the barcodes manually.
        # Work on a copy: updating TCGA_MAF_DEFAULTS in place would leak these
        # barcodes into every other test that reads the shared class-level dict.
        override_annotations = dict(TcgaMafOutputRendererTest.TCGA_MAF_DEFAULTS)
        override_annotations.update({
            'tumor_barcode': 'Patient0-Tumor',
            'normal_barcode': 'Patient0-Normal'
        })

        outputFilename = self._annotateTest(
            'testdata/vcf/Patient0.somatic.strelka.indels.vcf',
            "out/testConversionFromVCF.maf.annotated",
            self._determine_db_dir(),
            inputFormat="VCF",
            outputFormat="TCGAMAF",
            override_annotations=override_annotations,
            is_skip_no_alts=True)

        # Sanity checks to make sure that the generated maf file is not junk.
        self._validateTcgaMafContents(outputFilename)

        # Check to make sure that the ref and alt are correct for a TCGA MAF.
        tsvReader = GenericTsvReader(outputFilename)

        ctr = 0

        for line_dict in tsvReader:
            ref = line_dict['Reference_Allele']
            alt = line_dict['Tumor_Seq_Allele2']

            # INS
            if len(alt) > len(ref):
                self.assertTrue(ref == "-",
                                "Invalid insertion with " + ref + "  " + alt)

            # DEL
            if len(ref) > len(alt):
                self.assertTrue(alt == "-",
                                "Invalid deletion with " + ref + "  " + alt)

            self.assertTrue(line_dict['Start_position'] in [
                "10089935", "57493929", "155301009", "64948169", "64948166",
                "64948167", "64948168"
            ])
            self.assertTrue(
                line_dict['Reference_Allele'] in ["-", "TC", "A", "TT", "TTT"])
            self.assertTrue(
                line_dict['Tumor_Seq_Allele2'] in ["-", "TC", "G", "T"])
            self.assertTrue(
                line_dict['Matched_Norm_Sample_Barcode'] == "Patient0-Normal")
            self.assertTrue(
                line_dict['Matched_Norm_Sample_UUID'] == "Patient0-Normal")
            self.assertTrue(
                line_dict['Tumor_Sample_Barcode'] == "Patient0-Tumor")
            self.assertTrue(line_dict['Tumor_Sample_UUID'] == "Patient0-Tumor")
            ctr += 1

        self.assertTrue(ctr == 8,
                        str(ctr) + " mutations found, but should have been 8.")
    def testProperConversionVcfToMafWithThirdSample(self):
        """Test that ref, alt, and positions are properly populated in a TCGA MAF generated from a VCF, but that the NORMAL is treated as any other sample, since this VCF has three samples in it.

        Beyond the generic MAF sanity checks, only the number of rendered
        mutations (24) is verified.
        """

        # For this conversion, you must specify the barcodes manually.
        # Work on a copy: updating TCGA_MAF_DEFAULTS in place would leak this
        # barcode into every other test that reads the shared class-level dict.
        override_annotations = dict(TcgaMafOutputRendererTest.TCGA_MAF_DEFAULTS)
        override_annotations.update({'tumor_barcode': 'NA'})

        outputFilename = self._annotateTest(
            os.path.join("testdata", "vcf",
                         "Patient0.somatic.strelka.indels.met.vcf"),
            os.path.join("out", "testConversionFromVCFv2.maf.annotated"),
            self._determine_db_dir(),
            inputFormat="VCF",
            outputFormat="TCGAMAF",
            override_annotations=override_annotations)

        # Sanity checks to make sure that the generated maf file is not junk.
        self._validateTcgaMafContents(outputFilename)

        # Count the rendered mutations.
        tsvReader = GenericTsvReader(outputFilename)
        ctr = sum(1 for _ in tsvReader)

        self.assertTrue(
            ctr == 24,
            str(ctr) + " mutations found, but should have been 24.")
Esempio n. 8
0
    def testDuplicateAnnotation(self):
        """
        Check that duplicate annotations in a VCF are parsed correctly: the rendered TSV must
        contain both ``variant_status`` and ``sample_variant_status``, with the expected values
        on the first row.
        """
        vcf_path = os.path.join("testdata", "vcf", "example.duplicate_annotation.vcf")
        tsv_path = os.path.join("out", "example.duplicate_annotation.out.tsv")

        creator = VcfInputMutationCreator(vcf_path)
        creator.createMutations()
        renderer = SimpleOutputRenderer(tsv_path)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(tsv_path)

        # (column, failure message) pairs for the header check
        header_checks = (
            ("variant_status", "variant_status field is missing in the header."),
            ("sample_variant_status", "sample_variant_status is missing in the header."),
        )
        fieldnames = tsvReader.getFieldNames()
        for column, message in header_checks:
            self.assertTrue(column in fieldnames, message)

        # Only the first data row is inspected.
        row = tsvReader.next()
        row_checks = (
            ("variant_status", "variant_status field is missing in the row."),
            ("sample_variant_status", "sample_variant_status is missing in the row."),
        )
        for column, message in row_checks:
            self.assertTrue(column in row, message)

        self.assertEqual("2", row["variant_status"], "Incorrect value of variant_status.")
        self.assertEqual("0", row["sample_variant_status"], "Incorrect value of sample_variant_status")
Esempio n. 9
0
    def sortFile(self, filename, func, length=50000):
        """
        Sort the input file (``self.readfilename``) and write the sorted result to ``filename``.

        The comments and the header line of the input are written first,
        followed by the merged output of the sorted partitions.

        :param filename: path of the sorted output file
        :param func: function that converts each row of the input file to a unique, sortable key
        :param length: maximum number of lines in a partition
        """
        reader = GenericTsvReader(filename=self.readfilename,
                                  commentPrepend=self.commentPrepend,
                                  delimiter=self.delimiter)
        comments = reader.getComments()

        # A reader that reports no header is treated as having no columns.
        fieldnames = reader.getFieldNames()
        if fieldnames is None:
            fieldnames = []

        # Column name -> column position, in header order.
        fieldnameIndexes = collections.OrderedDict(
            (name, index) for index, name in enumerate(fieldnames))

        iterable = iter(reader.getInputContentFP())
        partitions = self._yieldPartitions(iterable, func, fieldnameIndexes,
                                           length)

        # The path is passed positionally: the first parameter of open() is
        # named differently in Python 2 ("name") and Python 3 ("file").
        with open(filename, 'wb', 64 * 1024) as writer:
            writer.write(comments)
            writer.write(self.delimiter.join(fieldnames) + "\n")
            writer.writelines(
                self._merge(partitions)
            )  # generators are allowed as inputs to writelines function
Esempio n. 10
0
    def testManualAnnotations(self):
        """Check that manual annotations set on the Annotator appear unchanged on every output row."""
        annotator = Annotator()
        overrides = {
            'source': 'Capture',
            'status': 'Somatic',
            'phase': 'Phase_I',
            'sequencer': 'Illumina GAIIx',
        }
        annotator.setManualAnnotations(overrides)
        annotator.setInputCreator(
            MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(
            SimpleOutputRenderer("out/testManualAnnotationsFile.tsv"))

        rendered_path = annotator.annotate()

        # An empty file would make the per-row checks below pass vacuously.
        self.assertTrue(os.stat(rendered_path).st_size > 0,
                        "Generated TSV file (" + rendered_path + ") is empty.")

        for line_number, row in enumerate(GenericTsvReader(rendered_path), 1):
            where = str(line_number)
            for key in overrides:
                self.assertTrue(
                    row[key] != "__UNKNOWN__",
                    "__UNKNOWN__ value seen on line " + where + ", when it should be populated: " + key)
                self.assertTrue(
                    row[key] != "",
                    "Blank value seen on line " + where + ", when it should be populated: " + key)
                self.assertTrue(
                    row[key] == overrides[key],
                    "Value for " + key + " on line " + where + " did not match override: " +
                    str(row[key]) + " <> " + str(overrides[key]))
    def test_rendering_with_exons(self):
        """Render a seg file whose segments include exons at their end points.

        Every row must have a non-blank ``segment_start``; rows whose segment
        ends in MAPK1 must report ``8+`` as the exon of the segment end.
        """
        seg_path = "testdata/seg/Middle_of_exon.seg.txt"
        output_filename = "out/test_exon_seg2.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")

        # Get rid of output left behind by an earlier run.
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        annotator.initialize(RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "GENE_LIST",
            seg_path,
            output_filename,
            datasourceDir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS))
        annotator.annotate()

        # Inspect what was rendered.
        output_reader = GenericTsvReader(output_filename)
        headers = output_reader.getFieldNames()  # fetched, but nothing is asserted on it

        for row in output_reader:
            self.assertTrue(row['segment_start'] is not None)
            self.assertTrue(row['segment_start'].strip() != "")
            if row['segment_end_gene'] != "MAPK1":
                continue
            end_exon = row['segment_end_exon'].strip()
            self.assertTrue(end_exon == "8+",
                            "Should have been 8+, but saw: %s" % end_exon)
Esempio n. 12
0
    def test_overwriting_muts(self):
        """Ensure that, with overwriting enabled, a datasource may annotate over an existing annotation.

        The input carries a "Who" annotation that the gene datasource also
        tries to write.  No output row may end up with ``TJ_Data_Who`` equal
        to "Tromokratis".
        """
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            "MAFLITE",
            "TCGAMAF",
            "testdata/maflite/who_alt1_vs_alt2.maflite",
            output_filename,
            datasource_list=[gene_ds],
            other_opts={
                OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
                OptionConstants.NO_PREPEND: True
            })

        annotator = Annotator()
        annotator.initialize(run_spec)
        annotator.annotate()

        for row in GenericTsvReader(output_filename):
            who = row.get('TJ_Data_Who', "")
            self.assertTrue(who != "Tromokratis")
Esempio n. 13
0
    def __init__(self,
                 filename,
                 mutation_data_factory=None,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Set up a maflite reader and verify that the required headers are present.

        A required column counts as present when its own name, or one of the
        alternatives configured for it, appears in the header of the file.
        The ``build`` column is optional and never reported as missing.

        :param filename: maflite file to read
        :param mutation_data_factory: handed on to the parent constructor
        :param configFile: config file naming the required headers and their alternatives
        :param genomeBuild: genome build, stored on the instance
        :param other_options: handed on to the parent constructor
        :raises MafliteMissingRequiredHeaderException: when required headers are missing from the file
        """
        super(MafliteInputMutationCreator, self).__init__(
            filename, mutation_data_factory, configFile, genomeBuild,
            other_options)

        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Required column -> list of headers accepted in its place, plus the reverse lookup.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(
            self._alternativeDict)

        required_columns = sorted(
            self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        self.logger.info(
            "Initializing a maflite file with the following header: " +
            str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()
        header = self._specified_fields

        missingRequiredHeaders = []
        for col in required_columns:
            if col in header:
                continue
            alternatives = self._alternativeDict.get(col, [])
            if any(alt in header for alt in alternatives):
                continue
            # build is optional.
            if col != "build":
                missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        if missingRequiredHeaders:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " +
                ",".join(missingRequiredHeaders))
Esempio n. 14
0
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly.

        The output must be non-empty, start with a version comment, contain
        the two columns added by the gene datasource, and have exactly two
        more lines (data rows plus comments) than the input.
        """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        # The config is passed by keyword: the second positional parameter of
        # MafliteInputMutationCreator is not the config file in every version
        # of its constructor.
        tmp = MafliteInputMutationCreator(
            inputFilename, configFile='configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        outputRenderer = TcgaMafOutputRenderer(
            outputFilename, 'configs/tcgaMAF2.4_output.config')
        annotator = Annotator()

        annotator.setInputCreator(tmp)
        annotator.setOutputRenderer(outputRenderer)
        ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        annotator.addDatasource(ds)
        annotator.annotate()

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")
        tsvReaderIn = GenericTsvReader(inputFilename)
        tsvReader = GenericTsvReader(outputFilename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")
        self.assertTrue("i_TJ_Data_Why" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Why) from header")
        self.assertTrue("i_TJ_Data_Who" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Who) from header")

        # Line totals are data rows plus comment lines, for each file.
        ctrOut = sum(1 for _ in tsvReader)
        ctrIn = sum(1 for _ in tsvReaderIn)
        ctrIn += len(tsvReaderIn.getCommentsAsList())
        ctrOut += len(tsvReader.getCommentsAsList())

        self.assertTrue(
            ctrOut == (ctrIn + 2),
            "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): "
            + str(ctrIn) + ", " + str(ctrOut))
    def test_validation_correction(self):
        """ Check that blank validation allele fields are filled in automatically for an invalid mutation.

        A single mutation with validation status "Invalid" and blank
        validation alleles is rendered; in the output all validation alleles
        must equal the reference allele and the mutation status must be "None".
        """
        m = MutationDataFactory.default_create()
        m.chr = "3"
        m.start = "178948145"
        m.end = "178948145"
        m.alt_allele = "A"
        m.ref_allele = "G"
        m['validation_status'] = "Invalid"
        # The validation alleles are left blank on purpose.
        for blank_field in ('Match_Norm_Validation_Allele1',
                            'Match_Norm_Validation_Allele2',
                            'Tumor_Validation_Allele1',
                            'Tumor_Validation_Allele2'):
            m[blank_field] = ""
        m['Mutation_Status'] = "Somatic"

        output_filename = os.path.join("out",
                                       "test_validation_correction1.maf.tsv")
        config_path = os.path.join("configs", "tcgaMAF2.4_output.config")

        outputRenderer = TcgaMafOutputRenderer(output_filename,
                                               configFile=config_path)
        outputRenderer.renderMutations(iter([m]))

        for row in GenericTsvReader(output_filename):
            norm_allele = row['Match_Norm_Validation_Allele1']
            self.assertTrue(
                norm_allele == row['Match_Norm_Validation_Allele2'],
                "Matched norm alleles did not match.")
            self.assertTrue(
                row['Tumor_Validation_Allele1'] == row['Tumor_Validation_Allele2'],
                "Tumor alleles did not match for an invalid validation result."
            )
            self.assertTrue(
                norm_allele == row['Tumor_Validation_Allele2'],
                "Tumor alleles did not match normal alleles for an invalid validation result."
            )
            reference = row['Reference_Allele']
            self.assertTrue(
                norm_allele == reference,
                "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
                % (norm_allele, reference))
            self.assertTrue(
                "G" == reference,
                "Reference allele should have been G, but was " + reference)
            self.assertTrue(
                "None" == row['Mutation_Status'],
                "Mutation Status must be None when Validation Status is Invalid: "
                + row['Mutation_Status'])
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.

        The header must contain the CGC_Abridged columns, every row whose gene
        is in the trimmed census must have a CGC_Abridged_GeneID, and at least
        one such row must exist.
        '''

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        # The failure message names the same column that is being checked.
        self.assertTrue(
            'CGC_Abridged_Other Syndrome/Disease' in fields,
            "'CGC_Abridged_Other Syndrome/Disease' was not present in the header")
        self.assertTrue(
            'CGC_Abridged_Mutation Type' in fields,
            "'CGC_Abridged_Mutation Type' was not present in the header")

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(
                    lineDict['CGC_Abridged_GeneID'] != '',
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: "
                    + str(ctr))
                linesThatShouldBeAnnotated += 1
            ctr += 1
        # Without at least one row in the census the loop above proves nothing.
        self.assertTrue((linesThatShouldBeAnnotated) > 0,
                        "Bad data -- cannot test missed detects.")
Esempio n. 17
0
    def _renderSortedTsv(self, templateFilename, vcfFilename, tsvFilename, sampleNames, dataManager, inferGenotypes):
        """
        Render the rows of a TSV file as VCF records.

        Rows are added to the current record builder until
        ``_isNewVcfRecordNeeded`` asks for a new record, at which point the
        pending record is written.  Errors raised while rendering are logged,
        not propagated; the writer is closed in either case.

        :param templateFilename: VCF file read as the template for the writer
        :param vcfFilename: path of the VCF file to write
        :param tsvFilename: path of the TSV file the mutations are read from
        :param sampleNames: sample names handed to every RecordBuilder
        :param dataManager: passed through to ``_parseRecordBuilder``
        :param inferGenotypes: passed through to ``_parseRecordBuilder``
        """
        tempVcfReader = vcf.Reader(filename=templateFilename, strict_whitespace=True)
        pointer = open(vcfFilename, "w")
        vcfWriter = vcf.Writer(pointer, tempVcfReader, self.lineterminator)
        tsvReader = GenericTsvReader(tsvFilename, delimiter=self.delimiter)
        index = 0
        nrecords = 1000  # log progress and flush every nrecords written records
        chrom = None
        pos = None
        refAllele = None
        recordBuilder = None

        ctr = 0
        m = None
        try:
            try:
                for m in tsvReader:
                    ctr += 1
                    isNewRecord = self._isNewVcfRecordNeeded(chrom, m["chr"], pos, m["start"], refAllele, m["ref_allele"])
                    if isNewRecord:
                        # Write out the record that was being built before starting the next one.
                        if recordBuilder is not None:
                            record = recordBuilder.createRecord()
                            vcfWriter.write_record(record)
                            index += 1
                            if index % nrecords == 0:
                                self.logger.info("Rendered " + str(index) + " vcf records.")
                                vcfWriter.flush()

                        chrom = m["chr"]
                        pos = m["start"]
                        refAllele = m["ref_allele"]

                        recordBuilder = RecordBuilder(chrom, int(pos), refAllele, sampleNames)

                    recordBuilder = self._parseRecordBuilder(m, recordBuilder, dataManager, inferGenotypes)

                # The last record is still pending once the input is exhausted.
                if recordBuilder is not None:
                    record = recordBuilder.createRecord()
                    vcfWriter.write_record(record)
                    index += 1  # count it so that the final total is accurate
            finally:
                # Close the writer on the error path as well.
                vcfWriter.close()

        except Exception:
            self.logger.error(traceback.format_exc())
            # m is still None when the failure happened before the first row was read.
            if m is None:
                location = "(no mutation read)"
            else:
                location = str([m["chr"], m["start"], m["end"]])
            self.logger.error("Error at mutation " + str(ctr) + " " + location + ": ")

        self.logger.info("Rendered all " + str(index) + " vcf records.")
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

        Checks that the file is non-empty and has a version comment, and for
        every row that: an Entrez_Gene_Id of "0" goes with Hugo_Symbol
        "Unknown"; no value is "__UNKNOWN__" or contains a carriage return;
        and every column not found in the required/optional column settings
        of the MAF 2.3 output config starts with "i_".

        Note: This method has nothing to do with the TCGA validator.

        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).

        :param filename: path of the MAF file to check
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0,
                        "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")

        # The column settings are the same for every row and key, so the
        # config is read once instead of once per value.
        configFile = ConfigUtils.createConfigParser(
            'configs/tcgaMAF2.3_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        optionalColumns = configFile.get("general", "optionalColumns")
        # NOTE(review): the values are used exactly as returned by the config
        # parser.  If they are plain comma-separated strings, the `in` tests
        # below are substring matches -- confirm whether exact matching
        # against the split lists is intended.

        ctr = 1
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(
                    lineDict['Hugo_Symbol'] == "Unknown",
                    "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: "
                    + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue(
                    '\r' not in lineDict[k],
                    "Carriage return character found in an annotation value.")

                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(
                        k.startswith("i_"),
                        "Internal column was not prepended with 'i_'")

            # Report all unpopulated fields of the row in a single failure.
            unknownKeys.sort()
            self.assertTrue(
                len(unknownKeys) == 0, "__UNKNOWN__ values (" +
                str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
    def __init__(self,
                 filename,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Open a maflite file and verify that it carries the required headers.

        A required header is considered present when it, or any of its
        configured alternative names, appears among the file's field names.
        The "build" header is optional and is never reported as missing.

        Currently, this InputCreator does not support any other options.  The
        other_options parameter is ignored.

        :param filename: path of the maflite file to read
        :param configFile: config file listing required headers and alternatives
        :param genomeBuild: genome build stored on the instance
        :param other_options: ignored
        :raises MafliteMissingRequiredHeaderException: when one or more required
            headers (other than "build") cannot be found
        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # _alternativeDict: required column -> list of acceptable alternative
        # headers.  _reverseAlternativeDict is built from it.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(
            self._alternativeDict)

        specifiedFields = self._tsvReader.getFieldNames()
        raw_required = self.config.get("general", "required_headers")
        required_columns = sorted(raw_required.split(","))
        self._build = genomeBuild

        def _header_available(header):
            # Present directly, or through one of its alternative names.
            if header in specifiedFields:
                return True
            alternatives = self._alternativeDict.get(header, [])
            return any(alt in specifiedFields for alt in alternatives)

        # build is optional, so it is excluded from the missing list.
        missingRequiredHeaders = sorted(
            col for col in required_columns
            if not _header_available(col) and col != "build")

        self.logger.info(
            "Initializing a maflite file with the following header: " +
            str(self._tsvReader.getFieldNames()))
        if missingRequiredHeaders:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " +
                ",".join(missingRequiredHeaders))
    def test_splitting_allelic_depth_disabled(self):
        """Make sure that allelic depth is not split when told

        Annotates a VCF with SPLIT_ALLELIC_DEPTH disabled and checks that every
        output row keeps 'allelic_depth' and has neither 't_ref_count' nor
        't_alt_count'.
        """
        input_vcf_file = "testdata/m2_support/Dream4.chr20.oxoGinfo.vcf"
        output_tcgamaf_file = "out/m2_support/Dream4.chr20.oxoGinfo.vcf.noADSplit.maf.annotated"

        output_dir = os.path.abspath(os.path.dirname(output_tcgamaf_file))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # For this conversion, you must specify the barcodes manually.
        # Work on a copy: updating TCGA_MAF_DEFAULTS in place would leak these
        # barcodes into every other test that reads the shared class attribute.
        override_annotations = dict(TcgaMafOutputRendererTest.TCGA_MAF_DEFAULTS)
        override_annotations.update({
            'tumor_barcode': 'Patient0-Tumor',
            'normal_barcode': 'Patient0-Normal'
        })

        other_opts = {
            OptionConstants.COLLAPSE_FILTER_COLS: True,
            OptionConstants.NO_PREPEND: True,
            OptionConstants.SPLIT_ALLELIC_DEPTH: False
        }

        # Use an empty datasource dir in order to speed this up.
        self._annotateTest(input_vcf_file,
                           output_tcgamaf_file,
                           datasource_dir=None,
                           inputFormat="VCF",
                           is_skip_no_alts=True,
                           other_opts=other_opts,
                           override_annotations=override_annotations)

        # Check the output MAF
        tsv_reader = GenericTsvReader(output_tcgamaf_file)

        keys_to_check_existence = ['allelic_depth']

        keys_to_check_non_existence = {'t_ref_count', 't_alt_count'}

        for line_dict in tsv_reader:

            for ks in keys_to_check_existence:
                self.assertTrue(ks in line_dict.keys(),
                                "Key " + ks + " was not rendered.")
                # A blank value is tolerated only when either allele is "-".
                self.assertTrue(
                    line_dict[ks] != ""
                    or (line_dict['Reference_Allele'] == "-"
                        or line_dict['Tumor_Seq_Allele2'] == "-"),
                    "Key " + ks + " had a blank value." + str(line_dict))

            for ks in keys_to_check_non_existence:
                self.assertTrue(ks not in line_dict.keys(),
                                "Key " + ks + " should not have been rendered.")
Esempio n. 21
0
    def _create_tx_id_to_protein_id_mapping(self, mapping_file):
        """
        Build a dictionary of Ensembl transcript ID to Ensembl protein ID.

        Mapping file is assumed to have three columns and be a tsv:
        Ensembl Gene ID, Ensembl Transcript ID, and Ensembl Protein ID

        :param mapping_file: path to the mapping tsv; None or a blank string
            yields an empty mapping
        :return: dict keyed by the 'Ensembl Transcript ID' column with the
            'Ensembl Protein ID' column as the value (later rows win on
            duplicate transcript IDs)
        """
        if mapping_file is None or mapping_file.strip() == "":
            return dict()

        tsv_reader = GenericTsvReader(mapping_file)
        try:
            return {
                line_dict['Ensembl Transcript ID']: line_dict['Ensembl Protein ID']
                for line_dict in tsv_reader
            }
        finally:
            # Release the underlying file even if a row is missing a column.
            tsv_reader.close()
Esempio n. 22
0
    def _renderSortedTsv(self, templateFilename, vcfFilename, tsvFilename, sampleNames, dataManager, inferGenotypes):
        """
        Turn a sorted tsv into a VCF

        Consecutive tsv rows are accumulated into a single record builder until
        _isNewVcfRecordNeeded reports, from the chr/start/ref_allele values, that
        a new record must be started; the accumulated record is then written.

        :param templateFilename: basic VCF to model output VCF.
        :param vcfFilename: output VCF filename
        :param tsvFilename: input sorted tsv
        :param sampleNames: sample names that should be used in output
        :param dataManager: dataManager instance used in creating pyvcf records.
        :param inferGenotypes: whether we should try to infer the genotypes, since we may not have add GT explicitly
        on input
        """
        tempVcfReader = vcf.Reader(filename=templateFilename, strict_whitespace=True)
        # open() rather than file(): file() only exists on Python 2.
        pointer = open(vcfFilename, "w")
        try:
            tsvReader = GenericTsvReader(tsvFilename, delimiter=self.delimiter)
            try:
                index = 0  # number of vcf records written so far
                nrecords = 1000  # log progress and flush every nrecords records
                chrom = None
                pos = None
                refAllele = None
                recordBuilder = None

                vcfWriter = vcf.Writer(pointer, tempVcfReader, self.lineterminator)
                for m in tsvReader:
                    isNewRecord = self._isNewVcfRecordNeeded(chrom, m["chr"], pos, m["start"], refAllele, m["ref_allele"])
                    if isNewRecord:
                        # Emit the record accumulated so far before starting the next one.
                        if recordBuilder is not None:
                            record = recordBuilder.createRecord()
                            vcfWriter.write_record(record)
                            index += 1
                            if index % nrecords == 0:
                                self.logger.info("Rendered " + str(index) + " vcf records.")
                                vcfWriter.flush()

                        chrom = m["chr"]
                        pos = m["start"]
                        refAllele = m["ref_allele"]

                        recordBuilder = RecordBuilder(chrom, int(pos), refAllele, sampleNames)

                    recordBuilder = self._parseRecordBuilder(m, recordBuilder, dataManager, inferGenotypes)

                # The last accumulated record has not been written by the loop.
                if recordBuilder is not None:
                    record = recordBuilder.createRecord()
                    vcfWriter.write_record(record)
                    # Count it so the final log line reports every written record.
                    index += 1

                vcfWriter.close()
            finally:
                tsvReader.close()
        finally:
            # Harmless if the writer already closed the stream; guarantees the
            # handle is released when an error interrupts rendering.
            pointer.close()
        self.logger.info("Rendered all " + str(index) + " vcf records.")
    def testExposedColumns(self):
        """Columns that the config file lists as exposed must be rendered without the i_ prefix."""
        maf_path = self._annotateTest(
            'testdata/maflite/tiny_maflite.maf.txt',
            "out/testExposedCols.maf.tsv", self._determine_db_dir())

        # Make sure the generated maf file is not junk before looking closer.
        self._validateTcgaMafContents(maf_path)

        # The input carries a couple of exposed columns; inspect the header.
        field_names = GenericTsvReader(maf_path).getFieldNames()
        for exposed in ('t_alt_count', 't_ref_count'):
            prefixed = "i_" + exposed
            self.assertFalse(prefixed in field_names,
                             "i_ was prepended to " + exposed)
            self.assertTrue(exposed in field_names, exposed + " not found.")
    def test_simple_seg_file_annotations(self):
        """Test that we can read in a seg file, do GENCODE annotation, and output as SIMPLE_TSV"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_simple_seg_file_annotations.tsv"
        if os.path.exists(output_filename):
            os.remove(output_filename)
        ic = MafliteInputMutationCreator(inputFilename, None,
                                         'configs/seg_file_input.config')
        segs = ic.createMutations()

        # Count by exhausting the iterator; this is also correct (0) when the
        # input yields nothing.
        num_segs = sum(1 for _ in segs)
        self.assertTrue(
            num_segs == 27,
            "Found %d segments when there should have been 27." % num_segs)

        # The first pass consumed the segments, so create them again.
        ic = MafliteInputMutationCreator(inputFilename, None,
                                         'configs/seg_file_input.config')
        segs = ic.createMutations()

        gencode_ds = TestUtils._create_test_gencode_v19_ds(
            "out/seg_file_gencode_ds")

        segs_annotated = [gencode_ds.annotate_segment(seg) for seg in segs]

        outputRenderer = SimpleOutputRenderer(output_filename, '')
        outputRenderer.renderMutations(iter(segs_annotated))

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
        headers = output_reader.getFieldNames()
        for rcol in required_cols:
            self.assertTrue(rcol in headers,
                            rcol + " not found in headers: " + str(headers))

        for line_dict in output_reader:
            self.assertTrue(line_dict['start'] is not None, "start was None")
            self.assertTrue(line_dict['start'].strip() != "", "start was blank")
            self.assertTrue(line_dict['end'] is not None, "end was None")
            self.assertTrue(line_dict['end'].strip() != "", "end was blank")
            self.assertTrue("genes" in line_dict.keys(),
                            "genes column was not rendered")
    def testMissingFilter(self):
        """
        Tests that the missing FILTER fields are parsed correctly.

        Annotates a VCF into a simple TSV and compares it, column by column,
        with the expected TSV.  Two cells match when they are equal or both null.
        """
        inputFilename = os.path.join(
            *["testdata", "vcf", "example.missing_filters.vcf"])
        outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
        expectedOutputFilename = os.path.join(
            *["testdata", "vcf", "example.expected.missing_filters.out.tsv"])

        creator = VcfInputMutationCreator(inputFilename)
        # NOTE(review): the return value is discarded here; presumably the
        # annotator drives the creator itself -- confirm this call is needed.
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)

        # The header row of the generated file sits right after its comment lines.
        current = pandas.read_csv(outputFilename,
                                  sep='\t',
                                  header=len(tsvReader.getCommentsAsList()))
        expected = pandas.read_csv(expectedOutputFilename, sep='\t')

        currentColNames = set(current.columns)
        expectedColNames = set(expected.columns)

        # Compare by value (==); `is 0` tests object identity and only works
        # by accident of small-int caching.
        self.assertTrue(
            len(currentColNames.symmetric_difference(expectedColNames)) == 0,
            "Should have the same columns")
        self.assertTrue(
            len(current.index) == len(expected.index),
            "Should have the same number of rows")

        for colName in currentColNames:
            matches = ((current[colName] == expected[colName])
                       | (pandas.isnull(current[colName])
                          & pandas.isnull(expected[colName])))
            self.assertTrue(
                matches.all(),
                "Should have the same values in column " + colName)
    def testAnnotationWithExampleVcf(self):
        """
        Tests whether parsed annotations match the actual annotations in a simple TSV.  Missing format fields yield -->""  ".,." --> ","

        The generated TSV is compared, column by column, with the expected TSV.
        Two cells match when they are equal or both null.
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.vcf"])
        outputFilename = os.path.join("out", "example.out.tsv")
        expectedOutputFilename = os.path.join(
            *["testdata", "vcf", "example.expected.out.tsv"])

        creator = VcfInputMutationCreator(inputFilename)
        # NOTE(review): the return value is discarded here; presumably the
        # annotator drives the creator itself -- confirm this call is needed.
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)

        # The header row of the generated file sits right after its comment lines.
        current = pandas.read_csv(outputFilename,
                                  sep='\t',
                                  header=len(tsvReader.getCommentsAsList()))
        expected = pandas.read_csv(expectedOutputFilename, sep='\t')

        currentColNames = set(current.columns)
        expectedColNames = set(expected.columns)

        # Compare by value (==); `is 0` tests object identity and only works
        # by accident of small-int caching.
        self.assertTrue(
            len(currentColNames.symmetric_difference(expectedColNames)) == 0,
            "Should have the same columns")
        self.assertTrue(
            len(current.index) == len(expected.index),
            "Should have the same number of rows")

        for colName in currentColNames:
            matches = ((current[colName] == expected[colName])
                       | (pandas.isnull(current[colName])
                          & pandas.isnull(expected[colName])))
            self.assertTrue(
                matches.all(),
                "Should have the same values in column " + colName + ": \n" +
                str(current[colName]) + "\nvs\n" + str(expected[colName]))
    def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
        """The collapse-filter-cols option must render FILTER as a single i_filter column."""
        vcf_path = 'testdata/vcf/example.vcf'
        maf_path = 'out/example.one_filter_col.maf.txt'

        annotator = Annotator()
        spec = RunSpecificationFactory.create_run_spec(
            'VCF',
            'TCGAMAF',
            vcf_path,
            maf_path,
            other_opts={'collapse_filter_cols': True})
        annotator.initialize(spec)
        annotator.annotate()

        # Every rendered row needs an i_filter value from the expected set.
        accepted_filters = ['PASS', 'q10']
        for row in GenericTsvReader(maf_path):
            self.assertIn('i_filter', row)
            self.assertTrue(row['i_filter'] in accepted_filters)
Esempio n. 28
0
    def testMulticoreAnnotateFromChunkedFile(self):
        """Annotate a maflite file in fixed-size chunks on a process pool and check the rendered row count."""
        #TODO: Add unit test that Mutation data is pickle-able
        from itertools import islice

        inputFile = "testdata/maflite/Patient0.snp.maf.txt"
        outputFile = "out/testGAFMulticorePatient0.snp.maf.txt"
        chunkSize = 200
        numChunks = 4

        gafDatasource = TestUtils.createGafDatasourceProxy(self.config)
        ic = MafliteInputMutationCreator(inputFile)
        oc = SimpleOutputRenderer(outputFile)

        # createChunks
        muts = ic.createMutations()

        allAnnotatedChunksFlat = []
        p = LoggingPool(processes=numChunks)
        try:
            are_mutations_remaining = True
            while are_mutations_remaining:

                chunks = []
                for _ in range(numChunks):
                    # Take up to chunkSize mutations; a short chunk means the
                    # input is exhausted and this is the final round.
                    chunk = list(islice(muts, chunkSize))
                    if len(chunk) < chunkSize:
                        are_mutations_remaining = False

                    chunks.append((chunk, gafDatasource))

                annotatedChunks = p.map(annotate_mutations_global, chunks)
                annotatedChunksFlat = self._flattenChunks(annotatedChunks)
                allAnnotatedChunksFlat.append(annotatedChunksFlat)
        finally:
            # Always shut the workers down, even when annotation fails.
            p.close()
            p.join()

        annotatedMuts = chain.from_iterable(allAnnotatedChunksFlat)

        oc.renderMutations(annotatedMuts, Metadata())
        tsvReader = GenericTsvReader(outputFile)
        ctr = sum(1 for _ in tsvReader)
        self.assertTrue(ctr == 730,
                        "Should have read 730 variants, but read " + str(ctr))
Esempio n. 29
0
    def test_m2_phasing_from_file_easy(self):
        """Phasing/ONP combining on files, trivial case: the rendered MAF must hold exactly two rows."""
        input_vcf_file = "testdata/m2_support/phasingExample.vcf"
        output_tcgamaf_file = "out/phasingExample.vcf.maf.annotated"

        self._annotate_m2_vcf(input_vcf_file, output_tcgamaf_file)

        # Ground truth has three mutations combined into two, so count the
        # rows of the output MAF.
        num_rows = sum(1 for _ in GenericTsvReader(output_tcgamaf_file))

        self.assertTrue(
            num_rows == 2,
            "Should have had two mutations, but had " + str(num_rows) +
            " instead.")
    def testFullSNPOutput(self):
        """Render a TCGA MAF from a SNP TSV file and expect at least one dbSNP annotation."""
        self.logger.info("Initializing Maflite SNP Test...")

        testOutputFilename = self._annotateTest(
            'testdata/maflite/Patient0.snp.maf.txt',
            "out/testSNP_v2.4.maf.tsv", self._determine_db_dir())

        # Make sure the generated maf file is not junk before looking closer.
        self._validateTcgaMafContents(testOutputFilename)

        tsv_reader = GenericTsvReader(testOutputFilename)

        # Number of rows that carry a non-blank dbSNP value.
        ctr = sum(1 for lineDict in tsv_reader if lineDict["dbSNP_RS"] != "")

        self.assertTrue(ctr > 0, "No dbSNP entries seen.")