    def test_basic_rendering(self):
        """Test that we can render a basic seg file as a gene list"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_basic_rendering.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "GENE_LIST",
            inputFilename,
            output_filename,
            datasourceDir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            self.assertTrue(line_dict['segment_start'] is not None)
            self.assertTrue(line_dict['segment_start'].strip() != "")
            self.assertTrue(line_dict['segment_end'] is not None)
            self.assertTrue(line_dict['segment_end'].strip() != "")
            self.assertTrue("gene" in line_dict.keys())
            self.assertTrue(len(line_dict["gene"]) > 0)
            self.assertTrue(float(line_dict["segment_num_probes"]))
            self.assertTrue(line_dict['sample'] == "Patient0")
Example #2
    def testBasicAnnotation(self):
        ''' Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. '''
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config",
            "testdata/small_transcript_tsv_ds/")
        outputFilename = 'out/genericTranscriptTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue(
            "refseq_test_mRNA_Id" in headers,
            "refseq_test_mRNA_Id not found in headers: " + str(headers))
        self.assertTrue(
            "refseq_test_prot_Id" in headers,
            "refseq_test_prot_Id not found in headers: " + str(headers))
    def test_rendering_with_exons(self):
        """Test that we can render a seg file that includes exons at end points"""
        inputFilename = "testdata/seg/Middle_of_exon.seg.txt"
        output_filename = "out/test_exon_seg2.gene_list.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "GENE_LIST",
            inputFilename,
            output_filename,
            datasourceDir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            self.assertTrue(line_dict['segment_start'] is not None)
            self.assertTrue(line_dict['segment_start'].strip() != "")
            if line_dict['segment_end_gene'] == "MAPK1":
                self.assertTrue(
                    line_dict['segment_end_exon'].strip() == "8+",
                    "Should have been 8+, but saw: %s" %
                    line_dict['segment_end_exon'].strip())
    def _create_test_ds(self, input_tsv, dir_name, index_cols):
        base_name = "test_snp_leveldb"
        full_name = dir_name + "/" + base_name

        if os.path.exists(full_name):
            shutil.rmtree(full_name)
        os.makedirs(full_name)

        tsv_reader = GenericTsvReader(input_tsv, commentPrepend="%")
        annotation_cols = copy.copy(tsv_reader.getFieldNames())
        for icol in index_cols:
            if icol in annotation_cols:
                annotation_cols.remove(icol)

        ds_creator = SnpOnlyLevelDbDatasourceCreator()
        ds_creator.createDatasource(full_name, input_tsv, ",".join(index_cols),
                                    full_name + "/" + base_name + ".config", "snp_leveldb",
                                    base_name, "TEST", "exact", annotation_cols, [])

        config_filename = "out/test_simple_annotate_snp_only_leveldb/test_snp_leveldb/test_snp_leveldb.config"
        ds = DatasourceFactory.createDatasource(os.path.abspath(config_filename), os.path.dirname(config_filename))

        return ds
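A hedged sketch of how the datasource returned by _create_test_ds might be wired into a run, mirroring the run-spec pattern used elsewhere in these examples; the method name, input path, output path, and index column names are illustrative assumptions only.

    def test_simple_annotate_with_snp_leveldb_sketch(self):
        """Hypothetical usage of _create_test_ds above; paths and index columns are placeholders."""
        ds = self._create_test_ds("testdata/my_snp_annotations.tsv",
                                  "out/test_simple_annotate_snp_only_leveldb",
                                  ["chr", "start", "end", "ref_allele", "alt_allele"])

        # Annotate a MAFLITE input with just this one datasource (see the
        # create_run_spec_given_datasources usage in the overwriting-muts example below).
        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            "MAFLITE", "SIMPLE_TSV",
            "testdata/maflite/Patient0.snp.maf.txt",
            "out/test_simple_annotate_snp_only_leveldb.tsv",
            datasource_list=[ds])
        annotator = Annotator()
        annotator.initialize(run_spec)
        annotator.annotate()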
    def test_simple_seg_file_input(self):
        """Test that we can read in a seg file, do no annotation, and output as SIMPLE_TSV"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_simple_seg_file_input.tsv"
        if os.path.exists(output_filename):
            os.remove(output_filename)
        ic = MafliteInputMutationCreator(inputFilename, 'configs/seg_file_input.config')
        segs = ic.createMutations()

        i = 1
        for i,seg in enumerate(segs):
            pass

        self.assertTrue((i+1) == 27, "Found %d segments when there should have been 27." % (i+1))

        ic = MafliteInputMutationCreator(inputFilename, 'configs/seg_file_input.config')
        segs = ic.createMutations()


        outputRenderer = SimpleOutputRenderer(output_filename, '')
        outputRenderer.renderMutations(segs)

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
        headers = output_reader.getFieldNames()
        for rcol in required_cols:
            self.assertTrue(rcol in headers)

        for line_dict in output_reader:
            self.assertTrue(line_dict['start'] is not None)
            self.assertTrue(line_dict['start'].strip() != "")
            self.assertTrue(line_dict['end'] is not None)
            self.assertTrue(line_dict['end'].strip() != "")
    def test_full_seg_file_annotations(self):
        """Test that we can read in a seg file, do a proper full annotation, and output as SIMPLE_TSV"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_full_seg_file_annotations.tsv"
        db_dir = self.config.get('DEFAULT', "dbDir")
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec(
            "SEG_FILE",
            "SIMPLE_TSV",
            inputFilename,
            output_filename,
            datasource_dir=db_dir,
            annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
        headers = output_reader.getFieldNames()
        for rcol in required_cols:
            self.assertTrue(rcol in headers)

        for line_dict in output_reader:
            self.assertTrue(line_dict['start'] is not None)
            self.assertTrue(line_dict['start'].strip() != "")
            self.assertTrue(line_dict['end'] is not None)
            self.assertTrue(line_dict['end'].strip() != "")
            self.assertTrue("genes" in line_dict.keys())
            self.assertTrue(len(line_dict["genes"].split(",")) > 0)
    def test_full_seg_file_annotations(self):
        """Test that we can read in a seg file, do a proper full annotation, and output as SIMPLE_TSV"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_full_seg_file_annotations.tsv"
        db_dir = self.config.get('DEFAULT',"dbDir")
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec("SEG_FILE", "SIMPLE_TSV", inputFilename, output_filename,
                                                           datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
        headers = output_reader.getFieldNames()
        for rcol in required_cols:
            self.assertTrue(rcol in headers)

        for line_dict in output_reader:
            self.assertTrue(line_dict['start'] is not None)
            self.assertTrue(line_dict['start'].strip() != "")
            self.assertTrue(line_dict['end'] is not None)
            self.assertTrue(line_dict['end'].strip() != "")
            self.assertTrue("genes" in line_dict.keys())
            self.assertTrue(len(line_dict["genes"].split(",")) > 0)
Example #8
    def sortFile(self, filename, func, length=50000):
        """
        This method sorts the input file and writes out the sorted file to filename.

        :param filename: sorted filename
        :param func: function that converts each row of the input file to an unique, sortable key
        :param length: maximum number of lines in a partition
        """
        reader = GenericTsvReader(filename=self.readfilename, commentPrepend=self.commentPrepend,
                                  delimiter=self.delimiter)
        comments = reader.getComments()

        fieldnames = reader.getFieldNames()
        if fieldnames is None:
            fieldnames = []

        fieldnameIndexes = collections.OrderedDict()
        if fieldnames is not None:
            fieldnameIndexes = collections.OrderedDict([(x, i) for (i, x) in enumerate(fieldnames)])

        iterable = iter(reader.getInputContentFP())
        partitions = self._yieldPartitions(iterable, func, fieldnameIndexes, length)

        with open(name=filename, mode='wb', buffering=64 * 1024) as writer:
            writer.write(comments)
            writer.write(string.join(fieldnames, self.delimiter) + "\n")
            writer.writelines(self._merge(partitions))  # generators are allowed as inputs to writelines function
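The func argument above must map one row of the input to a sortable key. Below is a minimal, self-contained illustration of such a key function; the row representation as a column-name-to-value dict and the column names "chr" and "start" are assumptions for this sketch, not taken from sortFile itself.

# Hypothetical key function: order rows by chromosome name, then by numeric start position.
def position_key(row):
    return (row["chr"], int(row["start"]))

rows = [{"chr": "2", "start": "500"}, {"chr": "1", "start": "1200"}]
print(sorted(rows, key=position_key))  # the row on chromosome 1 sorts first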
Example #9
    def testDuplicateAnnotation(self):
        """
        Tests that the duplicate annotations are parsed correctly.
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.duplicate_annotation.vcf"])
        outputFilename = os.path.join("out", "example.duplicate_annotation.out.tsv")

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        fieldnames = tsvReader.getFieldNames()
        self.assertTrue("variant_status" in fieldnames, "variant_status field is missing in the header.")
        self.assertTrue("sample_variant_status" in fieldnames, "sample_variant_status is missing in the header.")

        row = tsvReader.next()
        self.assertTrue("variant_status" in row, "variant_status field is missing in the row.")
        self.assertTrue("sample_variant_status" in row, "sample_variant_status is missing in the row.")

        self.assertEqual("2", row["variant_status"], "Incorrect value of variant_status.")
        self.assertEqual("0", row["sample_variant_status"], "Incorrect value of sample_variant_status")
    def test_basic_rendering(self):
        """Test that we can render a basic seg file as a gene list"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_basic_rendering.gene_list.tsv"
        db_dir = self.config.get('DEFAULT',"dbDir")
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec("SEG_FILE", "GENE_LIST", inputFilename, output_filename,
                                                           datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            self.assertTrue(line_dict['segment_start'] is not None)
            self.assertTrue(line_dict['segment_start'].strip() != "")
            self.assertTrue(line_dict['segment_end'] is not None)
            self.assertTrue(line_dict['segment_end'].strip() != "")
            self.assertTrue("gene" in line_dict.keys())
            self.assertTrue(len(line_dict["gene"]) > 0)
            self.assertTrue(float(line_dict["segment_num_probes"]))
            self.assertTrue(line_dict['sample'] == "Patient0")
Example #11
    def sortFile(self, filename, func, length=50000):
        """
        This method sorts the input file and writes out the sorted file to filename.

        :param filename: sorted filename
        :param func: function that converts each row of the input file to an unique, sortable key
        :param length: maximum number of lines in a partition
        """
        reader = GenericTsvReader(filename=self.readfilename,
                                  commentPrepend=self.commentPrepend,
                                  delimiter=self.delimiter)
        comments = reader.getComments()

        fieldnames = reader.getFieldNames()
        if fieldnames is None:
            fieldnames = []

        fieldnameIndexes = collections.OrderedDict()
        if fieldnames is not None:
            fieldnameIndexes = collections.OrderedDict([
                (x, i) for (i, x) in enumerate(fieldnames)
            ])

        iterable = iter(reader.getInputContentFP())
        partitions = self._yieldPartitions(iterable, func, fieldnameIndexes,
                                           length)

        with open(name=filename, mode='wb', buffering=64 * 1024) as writer:
            writer.write(comments)
            writer.write(string.join(fieldnames, self.delimiter) + "\n")
            writer.writelines(
                self._merge(partitions)
            )  # generators are allowed as inputs to writelines function
    def createDatasource(self, destDir, ds_file, index_column_names, configFilename, ds_type, ds_name, ds_version,
                         ds_match_mode, annotation_column_names, indexCols):
        """


        :param destDir:
        :param ds_file:
        :param index_column_names:
        :param configFilename:
        :param ds_type:
        :param ds_name:
        :param ds_version:
        :param ds_match_mode:
        :param annotation_column_names: If blank, assume all in the tsv (minus the index columns)
        :param indexCols: list of the index columns.  Assumed to be five corresponding to chrom, start, end, ref, and alt.
        """
        index_column_names = index_column_names.split(",")

        output_filename = destDir + "/" + ds_name + ".leveldb"
        src_file = os.path.basename(output_filename)
        db = leveldb.LevelDB(output_filename, create_if_missing=True)

        comment_prepend = "#"
        if any([True for icol in index_column_names if icol.startswith("#")]):
            comment_prepend = "%"

        tsv_file = ds_file
        tsv_reader = GenericTsvReader(tsv_file, commentPrepend=comment_prepend)


        if annotation_column_names is None:
            annotation_column_names = copy.copy(tsv_reader.getFieldNames())
            for icol in index_column_names:
                if icol in annotation_column_names:
                    annotation_column_names.remove(icol)

        logging.getLogger(__name__).info("Creating SNP LevelDB for the following index headers: " + str(index_column_names))
        logging.getLogger(__name__).info("Creating SNP LevelDB for the following data headers: " + str(annotation_column_names))

        # Create the config file
        self._createConfigFile(configFilename, src_file, ds_name, ds_version, index_column_names, annotation_columns=annotation_column_names)

        batch = leveldb.WriteBatch()
        for i,line_dict in enumerate(tsv_reader):

            chrom = line_dict[index_column_names[0]]
            start = line_dict[index_column_names[1]]
            end = line_dict[index_column_names[2]]
            ref = line_dict[index_column_names[3]]
            alt = line_dict[index_column_names[4]]

            h = SnpOnlyLevelDbDatasource.generate_hash(chrom, start, end, ref, alt)
            if i % 5000 == 0:
                logging.getLogger(__name__).info("Rendering %d entries" % (i))

            line_list = [line_dict.get(k, "") for k in annotation_column_names]
            db.Put(h, ",".join(line_list))
        db.Write(batch, sync=True)
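A hedged sketch of reading one record back out of the LevelDB store created above; the path, coordinates, and annotation column list are illustrative assumptions, and the key must be built with the same generate_hash call used during creation.

import leveldb

# Hypothetical read-back of a single SNP record (values below are placeholders).
annotation_column_names = ["dbSNP_RS", "dbSNP_Val_Status"]                 # assumed columns
db = leveldb.LevelDB("my_ds_dir/my_snp_ds.leveldb")                        # assumed path
key = SnpOnlyLevelDbDatasource.generate_hash("1", "12345", "12345", "A", "T")
values = str(db.Get(key)).split(",")        # stored as the comma-joined annotation columns
print(dict(zip(annotation_column_names, values)))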
Example #13
    def __init__(self,
                 filename,
                 mutation_data_factory=None,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Constructor

        """

        super(MafliteInputMutationCreator,
              self).__init__(filename, mutation_data_factory, configFile,
                             genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Key is the required columns and the values are a list of valid alternative headers.
        # Key is column name to an alternative.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(
            self._alternativeDict)

        missingRequiredHeaders = []
        required_columns = sorted(
            self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        self.logger.info(
            "Initializing a maflite file with the following header: " +
            str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        for col in required_columns:
            if col not in self._specified_fields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in self._specified_fields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " +
                ",".join(missingRequiredHeaders))
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        '''

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            'ABL1', 'ABL2', 'ACSL3', 'AF15Q14', 'AF1Q', 'AF3p21', 'AF5q31',
            'AKAP9', 'AKT1', 'AKT2', 'ALDH2', 'ALK', 'ALO17', 'APC',
            'ARHGEF12', 'ARHH', 'ARID1A', 'ARID2', 'ARNT', 'ASPSCR1', 'ASXL1',
            'ATF1', 'ATIC', 'ATM', 'ATRX', 'BAP1', 'BCL10', 'BCL11A', 'BCL11B'
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(
            config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config",
            "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'

        annotator = Annotator()
        annotator.setInputCreator(
            MafliteInputMutationCreator(
                'testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        self.assertTrue(
            'CGC_Abridged_Other Syndrome/Disease' in fields,
            "'CGC_Other Syndrome/Disease' was not present in the header")
        self.assertTrue(
            'CGC_Abridged_Mutation Type' in fields,
            "'CGC_Abridged_Mutation Type' was not present in the header")

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(
                    lineDict['CGC_Abridged_GeneID'] != '',
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: "
                    + str(ctr))
                linesThatShouldBeAnnotated += 1
            ctr += 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0,
                        "Bad data -- cannot test missed detects.")
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.
        
        Note: This method has nothing to do with the TCGA validator.
        
        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0,
                        "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(
                    lineDict['Hugo_Symbol'] == "Unknown",
                    "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: "
                    + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue(
                    '\r' not in lineDict[k],
                    "Carriage return character found in an annotation value.")

                configFile = ConfigUtils.createConfigParser(
                    'configs/tcgaMAF2.3_output.config')
                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(
                        k.startswith("i_"),
                        "Internal column was not prepended with 'i_'")

            unknownKeys.sort()
            self.assertTrue(
                len(unknownKeys) == 0, "__UNKNOWN__ values (" +
                str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
    def __init__(self,
                 filename,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Constructor

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Key is the required columns and the values are a list of valid alternative headers.
        # Key is column name to an alternative.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(
            self._alternativeDict)

        missingRequiredHeaders = []
        specifiedFields = self._tsvReader.getFieldNames()
        required_columns = sorted(
            self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        for col in required_columns:
            if col not in specifiedFields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in specifiedFields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        self.logger.info(
            "Initializing a maflite file with the following header: " +
            str(self._tsvReader.getFieldNames()))
        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " +
                ",".join(missingRequiredHeaders))
Example #17
    def _renderSortedTsv(self, templateFilename, vcfFilename, tsvFilename, sampleNames, dataManager, inferGenotypes):
        """
        Turn a sorted tsv into a VCF

        :param templateFilename: basic VCF to model output VCF.
        :param vcfFilename: output VCF filename
        :param tsvFilename: input sorted tsv
        :param sampleNames: sample names that should be used in output
        :param dataManager: dataManager instance used in creating pyvcf records.
        :param inferGenotypes: whether we should try to infer the genotypes, since GT may not have been added
        explicitly on the input
        """
        tempVcfReader = vcf.Reader(filename=templateFilename, strict_whitespace=True)
        pointer = file(vcfFilename, "w")

        tsvReader = GenericTsvReader(tsvFilename, delimiter=self.delimiter)
        index = 0
        nrecords = 1000
        chrom = None
        pos = None
        refAllele = None
        recordBuilder = None

        vcfWriter = vcf.Writer(pointer, tempVcfReader, self.lineterminator)
        for ctr, m in enumerate(tsvReader):
            isNewRecord = self._isNewVcfRecordNeeded(chrom, m["chr"], pos, m["start"], refAllele, m["ref_allele"])
            if isNewRecord:
                if recordBuilder is not None:
                    record = recordBuilder.createRecord()
                    vcfWriter.write_record(record)
                    index += 1
                    if index % nrecords == 0:
                        self.logger.info("Rendered " + str(index) + " vcf records.")
                        vcfWriter.flush()

                chrom = m["chr"]
                pos = m["start"]
                refAllele = m["ref_allele"]

                recordBuilder = RecordBuilder(chrom, int(pos), refAllele, sampleNames)

            recordBuilder = self._parseRecordBuilder(m, recordBuilder, dataManager, inferGenotypes)

        if recordBuilder is not None:
            record = recordBuilder.createRecord()
            vcfWriter.write_record(record)

        vcfWriter.close()
        tsvReader.close()
        self.logger.info("Rendered all " + str(index) + " vcf records.")
    def testExposedColumns(self):
        """Test that columns listed in the config file as exposed do not get the i_ prepend"""
        testOutputFilename = self._annotateTest('testdata/maflite/tiny_maflite.maf.txt', "out/testExposedCols.maf.tsv", self._determine_db_dir())

        # Sanity checks to make sure that the generated maf file is not junk.
        self._validateTcgaMafContents(testOutputFilename)

        # Check the columns, since the input has a couple of exposed columns.
        tsvReader = GenericTsvReader(testOutputFilename)
        headers = tsvReader.getFieldNames()
        headersToCheck = ['t_alt_count', 't_ref_count']
        for h in headersToCheck:
            self.assertFalse(("i_" + h) in headers, "i_ was prepended to " + h)
            self.assertTrue(h in headers, h + " not found.")
    def _validateTcgaMafContents(self, filename):
        """ This is a utility, private method for unit tests to get a semblance that a valid maf file was created.  
        
        Note: This method has nothing to do with the TCGA validator.
        
        """
        configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:

            # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
            # if lineDict['Entrez_Gene_Id'] == "0":
            #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"], "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
            self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"], "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))
            uniprot_aa_xform_counter = 0
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                exposedColumns = configFile.get("general", "exposedColumns")
                if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                    self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")
            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

            if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
                self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

            if lineDict["Variant_Type"] == VariantClassification.VT_INS:
                self.assertTrue(lineDict["Reference_Allele"] == "-")

            unknownKeys.sort()
            self.assertTrue(len(unknownKeys) == 0, "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
            self.assertTrue(uniprot_aa_xform_counter < 10, "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + ").  This is probably an error.")

            ctr += 1
    def testProperConversionVcfToMafWithThirdSample(self):
        """Test that ref, alt, and positions are properly populated in a TCGA MAF generated from a VCF, but that the NORMAL is treated as any other sample, since this VCF has three samples in it. """

        # For this conversion, you must specify the barcodes manually
        override_annotations = TcgaMafOutputRendererTest.TCGA_MAF_DEFAULTS
        override_annotations.update({'tumor_barcode': 'NA'})

        outputFilename = self._annotateTest(
            os.path.join(*[
                "testdata", "vcf", "Patient0.somatic.strelka.indels.met.vcf"
            ]),
            os.path.join("out", "testConversionFromVCFv2.maf.annotated"),
            self._determine_db_dir(),
            inputFormat="VCF",
            outputFormat="TCGAMAF",
            override_annotations=override_annotations)

        # Sanity checks to make sure that the generated maf file is not junk.
        self._validateTcgaMafContents(outputFilename)

        # Check to make sure that the ref and alt are correct for a TCGA MAF.
        tsvReader = GenericTsvReader(outputFilename)

        ctr = 0

        for line_dict in tsvReader:
            ctr += 1

        self.assertTrue(
            ctr == 24,
            str(ctr) + " mutations found, but should have been 24.")
Example #21
    def test_overwriting_muts(self):
        """Ensure that (given correct configuration) we can annotate from a datasource, even if the datasource will overwrite an existing mutation."""
        # We will have an input with a "Who" annotation that this datasource will try to write.
        gene_ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        input_filename = "testdata/maflite/who_alt1_vs_alt2.maflite"
        output_filename = "out/who_alt1_vs_alt2.maf.annotated"
        input_format = "MAFLITE"
        output_format = "TCGAMAF"

        other_opts = {
            OptionConstants.ALLOW_ANNOTATION_OVERWRITING: True,
            OptionConstants.NO_PREPEND: True
        }

        run_spec = RunSpecificationFactory.create_run_spec_given_datasources(
            input_format,
            output_format,
            input_filename,
            output_filename,
            datasource_list=[gene_ds],
            other_opts=other_opts)
        annotator = Annotator()
        annotator.initialize(run_spec)

        annotator.annotate()

        tsv_reader = GenericTsvReader(output_filename)

        for i, line_dict in enumerate(tsv_reader):
            self.assertTrue(line_dict.get('TJ_Data_Who', "") != "Tromokratis")
Example #22
    def testManualAnnotations(self):
        """ Test that the manual annotation facility in the Annotator is working properly. """
        annotator = Annotator()
        overrides = {'source': 'Capture', 'status': 'Somatic', 'phase': 'Phase_I', 'sequencer': 'Illumina GAIIx'}
        annotator.setManualAnnotations(overrides)
        inputCreator = MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt')
        outputRenderer = SimpleOutputRenderer("out/testManualAnnotationsFile.tsv")
        annotator.setInputCreator(inputCreator)
        annotator.setOutputRenderer(outputRenderer)

        testOutputFilename = annotator.annotate()

        keysOfInterest = overrides.keys()

        statinfo = os.stat(testOutputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated TSV file (" + testOutputFilename + ") is empty.")

        tsvReader = GenericTsvReader(testOutputFilename)

        ctr = 1
        for lineDict in tsvReader:
            for k in keysOfInterest:
                self.assertTrue(lineDict[k] != "__UNKNOWN__",
                                "__UNKNOWN__ value seen on line " + str(ctr) + ", when it should be populated: " + k)
                self.assertTrue(lineDict[k] != "",
                                "Blank value seen on line " + str(ctr) + ", when it should be populated: " + k)
                self.assertTrue(lineDict[k] == overrides[k],
                                "Value for " + k + " on line " + str(ctr) + " did not match override: " + str(
                                    lineDict[k]) + " <> " + str(overrides[k]))
            ctr += 1
    def testExposedColumns(self):
        """Test that columns listed in the config file as exposed do not get the i_ prepend"""
        testOutputFilename = self._annotateTest(
            'testdata/maflite/tiny_maflite.maf.txt',
            "out/testExposedCols.maf.tsv", self._determine_db_dir())

        # Sanity checks to make sure that the generated maf file is not junk.
        self._validateTcgaMafContents(testOutputFilename)

        # Check the columns, since the input has a couple of exposed columns.
        tsvReader = GenericTsvReader(testOutputFilename)
        headers = tsvReader.getFieldNames()
        headersToCheck = ['t_alt_count', 't_ref_count']
        for h in headersToCheck:
            self.assertFalse(("i_" + h) in headers, "i_ was prepended to " + h)
            self.assertTrue(h in headers, h + " not found.")
    def testProperConversionVcfToMaf(self):
        """Test that ref, alt, and positions are properly populated in a TCGA MAF generated from a VCF """

        # For this conversion, you must specify the barcodes manually
        override_annotations = TcgaMafOutputRendererTest.TCGA_MAF_DEFAULTS
        override_annotations.update({
            'tumor_barcode': 'Patient0-Tumor',
            'normal_barcode': 'Patient0-Normal'
        })

        outputFilename = self._annotateTest(
            'testdata/vcf/Patient0.somatic.strelka.indels.vcf',
            "out/testConversionFromVCF.maf.annotated",
            self._determine_db_dir(),
            inputFormat="VCF",
            outputFormat="TCGAMAF",
            override_annotations=override_annotations,
            is_skip_no_alts=True)

        # Sanity checks to make sure that the generated maf file is not junk.
        self._validateTcgaMafContents(outputFilename)

        # Check to make sure that the ref and alt are correct for a TCGA MAF.
        tsvReader = GenericTsvReader(outputFilename)

        ctr = 0

        for line_dict in tsvReader:
            ref = line_dict['Reference_Allele']
            alt = line_dict['Tumor_Seq_Allele2']

            # INS
            if len(alt) > len(ref):
                self.assertTrue(ref == "-",
                                "Invalid insertion with " + ref + "  " + alt)

            # DEL
            if len(ref) > len(alt):
                self.assertTrue(alt == "-",
                                "Invalid deletion with " + ref + "  " + alt)

            self.assertTrue(line_dict['Start_position'] in [
                "10089935", "57493929", "155301009", "64948169", "64948166",
                "64948167", "64948168"
            ])
            self.assertTrue(
                line_dict['Reference_Allele'] in ["-", "TC", "A", "TT", "TTT"])
            self.assertTrue(
                line_dict['Tumor_Seq_Allele2'] in ["-", "TC", "G", "T"])
            self.assertTrue(
                line_dict['Matched_Norm_Sample_Barcode'] == "Patient0-Normal")
            self.assertTrue(
                line_dict['Matched_Norm_Sample_UUID'] == "Patient0-Normal")
            self.assertTrue(
                line_dict['Tumor_Sample_Barcode'] == "Patient0-Tumor")
            self.assertTrue(line_dict['Tumor_Sample_UUID'] == "Patient0-Tumor")
            ctr += 1

        self.assertTrue(ctr == 8,
                        str(ctr) + " mutations found, but should have been 8.")
Example #25
    def testSNPsAndIndelStartAndEndPos(self):
        """
        Tests that the start and end positions of SNPs and Indels are parsed as defined by the NCI's MAF specification
        (https://wiki.nci.nih.gov/display/TCGA/Mutation+Annotation+Format+(MAF)+Specification).
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.snps.indels.vcf"])
        outputFilename = os.path.join("out", "example.snps.indels.out.tsv")

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        for row in tsvReader:
            if row['start'] == "16890445":
                self.assertEqual(row["end"], "16890445", "The value should be %s but it was %s." % ("16890445",
                                                                                                    row["end"]))
            elif row["start"] == "154524458":
                self.assertEqual(row["end"], "154524459", "The value should be %s but it was %s." % ("154524459",
                                                                                                     row["end"]))
            elif row["start"] == "114189432":
                self.assertEqual(row["end"], "114189433", "The value should be %s but it was %s." % ("114189433",
                                                                                                     row["end"]))
    def testCreationAndAnnotation(self):
        """ Test the datasource creation and then do a simple annotation
        """
        outputFilename = 'out/genericGeneProteinPositionTest.out.tsv'

        gafDS = TestUtils.createTranscriptProviderDatasource(self.config)
        gppDS = DatasourceFactory.createDatasource("testdata/simple_uniprot_natvar/simple_uniprot_natvar.config", "testdata/simple_uniprot_natvar/")

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/tiny_maflite_natvar.maf.tsv'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDS)
        annotator.addDatasource(gppDS)
        testFilename = annotator.annotate()

        # Make sure that some values were populated
        self.assertTrue(os.path.exists(testFilename))
        tsvReader = GenericTsvReader(testFilename)

        ctr = 0
        for lineDict in tsvReader:
            colName = "UniProt_NatVar_natural_variations"
            self.assertTrue(sorted(lineDict[colName].split("|")) == sorted("R -> RR (in EDMD2).|R -> Q (in EDMD2).".split("|")), "Annotation value did not match: " + lineDict[colName])
            ctr += 1

        self.assertTrue(ctr == 1, "Number of mutations incorrect (1): " + str(ctr) )
    def test_simple_seg_file_annotations(self):
        """Test that we can read in a seg file, do GENCODE annotation, and output as SIMPLE_TSV"""
        inputFilename = "testdata/seg/Patient0.seg.txt"
        output_filename = "out/test_simple_seg_file_annotations.tsv"
        if os.path.exists(output_filename):
            os.remove(output_filename)
        ic = MafliteInputMutationCreator(inputFilename, None,
                                         'configs/seg_file_input.config')
        segs = ic.createMutations()

        i = 1
        for i, seg in enumerate(segs):
            pass

        self.assertTrue(
            (i + 1) == 27,
            "Found %d segments when there should have been 27." % (i + 1))

        ic = MafliteInputMutationCreator(inputFilename, None,
                                         'configs/seg_file_input.config')
        segs = ic.createMutations()

        gencode_ds = TestUtils._create_test_gencode_v19_ds(
            "out/seg_file_gencode_ds")
        annotator = Annotator()

        segs_annotated = []
        for seg in segs:
            segs_annotated.append(gencode_ds.annotate_segment(seg))

        outputRenderer = SimpleOutputRenderer(output_filename, '')
        outputRenderer.renderMutations(segs_annotated.__iter__())

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        required_cols = ["Sample", "Num_Probes", "Segment_Mean"]
        headers = output_reader.getFieldNames()
        for rcol in required_cols:
            self.assertTrue(rcol in headers)

        for line_dict in output_reader:
            self.assertTrue(line_dict['start'] is not None)
            self.assertTrue(line_dict['start'].strip() != "")
            self.assertTrue(line_dict['end'] is not None)
            self.assertTrue(line_dict['end'].strip() != "")
            self.assertTrue("genes" in line_dict.keys())
    def testMissingFilter(self):
        """
        Tests that the missing FILTER fields are parsed correctly.
        """
        inputFilename = os.path.join(
            *["testdata", "vcf", "example.missing_filters.vcf"])
        outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
        expectedOutputFilename = os.path.join(
            *["testdata", "vcf", "example.expected.missing_filters.out.tsv"])

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)

        current = pandas.read_csv(outputFilename,
                                  sep='\t',
                                  header=len(tsvReader.getCommentsAsList()))
        expected = pandas.read_csv(expectedOutputFilename, sep='\t')

        currentColNames = set()
        for i in range(len(current.columns)):
            currentColNames.add(current.columns[i])

        expectedColNames = set()
        for i in range(len(expected.columns)):
            expectedColNames.add(expected.columns[i])

        self.assertTrue(
            len(currentColNames.symmetric_difference(expectedColNames)) == 0,
            "Should have the same columns")
        self.assertTrue(
            len(current.index) == len(expected.index),
            "Should have the same number of rows")

        for colName in currentColNames:
            self.assertTrue(
                sum((current[colName] == expected[colName])
                    | (pandas.isnull(current[colName])
                       & pandas.isnull(expected[colName]))) == len(
                           current.index),
                "Should have the same values in column " + colName)
    def testAnnotationWithExampleVcf(self):
        """
        Tests whether parsed annotations match the actual annotations in a simple TSV.  Missing format fields yield "" and ".,." yields ",".
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.vcf"])
        outputFilename = os.path.join("out", "example.out.tsv")
        expectedOutputFilename = os.path.join(
            *["testdata", "vcf", "example.expected.out.tsv"])

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)

        current = pandas.read_csv(outputFilename,
                                  sep='\t',
                                  header=len(tsvReader.getCommentsAsList()))
        expected = pandas.read_csv(expectedOutputFilename, sep='\t')

        currentColNames = set()
        for i in range(len(current.columns)):
            currentColNames.add(current.columns[i])

        expectedColNames = set()
        for i in range(len(expected.columns)):
            expectedColNames.add(expected.columns[i])

        self.assertTrue(
            len(currentColNames.symmetric_difference(expectedColNames)) == 0,
            "Should have the same columns")
        self.assertTrue(
            len(current.index) == len(expected.index),
            "Should have the same number of rows")

        for colName in currentColNames:
            self.assertTrue(
                sum((current[colName] == expected[colName])
                    | (pandas.isnull(current[colName])
                       & pandas.isnull(expected[colName]))) == len(
                           current.index),
                "Should have the same values in column " + colName + ": \n" +
                str(current[colName]) + "\nvs\n" + str(expected[colName]))
    def test_validation_correction(self):
        """ Test that the validation allele fields are determined automatically when not specified by the user for invalid mutation.
        """
        m = MutationDataFactory.default_create()
        m.chr = "3"
        m.start = "178948145"
        m.end = "178948145"
        m.alt_allele = "A"
        m.ref_allele = "G"
        m['validation_status'] = "Invalid"
        m['Match_Norm_Validation_Allele1'] = ""
        m['Match_Norm_Validation_Allele2'] = ""
        m['Tumor_Validation_Allele1'] = ""
        m['Tumor_Validation_Allele2'] = ""
        m['Mutation_Status'] = "Somatic"

        output_filename = os.path.join("out",
                                       "test_validation_correction1.maf.tsv")

        outputRenderer = TcgaMafOutputRenderer(output_filename,
                                               configFile=os.path.join(
                                                   "configs",
                                                   "tcgaMAF2.4_output.config"))
        outputRenderer.renderMutations([m].__iter__())

        tsv_reader = GenericTsvReader(output_filename)

        for line_dict in tsv_reader:
            self.assertTrue(
                line_dict['Match_Norm_Validation_Allele1'] ==
                line_dict['Match_Norm_Validation_Allele2'],
                "Matched norm alleles did not match.")
            self.assertTrue(
                line_dict['Tumor_Validation_Allele1'] ==
                line_dict['Tumor_Validation_Allele2'],
                "Tumor alleles did not match for an invalid validation result."
            )
            self.assertTrue(
                line_dict['Match_Norm_Validation_Allele1'] ==
                line_dict['Tumor_Validation_Allele2'],
                "Tumor alleles did not match normal alleles for an invalid validation result."
            )
            self.assertTrue(
                line_dict['Match_Norm_Validation_Allele1'] ==
                line_dict['Reference_Allele'],
                "Norm validation alleles did not match reference (norm, reference): (%s, %s)"
                % (line_dict['Match_Norm_Validation_Allele1'],
                   line_dict['Reference_Allele']))
            self.assertTrue(
                "G" == line_dict['Reference_Allele'],
                "Reference allele should have been G, but was " +
                line_dict['Reference_Allele'])
            self.assertTrue(
                "None" == line_dict['Mutation_Status'],
                "Mutation Status must be None when Validation Status is Invalid: "
                + line_dict['Mutation_Status'])
Example #31
    def _renderSortedTsv(self, templateFilename, vcfFilename, tsvFilename, sampleNames, dataManager, inferGenotypes):
        """


        :param templateFilename:
        :param vcfFilename:
        :param tsvFilename:
        :param sampleNames:
        :param dataManager:
        """
        tempVcfReader = vcf.Reader(filename=templateFilename, strict_whitespace=True)
        pointer = file(vcfFilename, "w")
        vcfWriter = vcf.Writer(pointer, tempVcfReader, self.lineterminator)
        tsvReader = GenericTsvReader(tsvFilename, delimiter=self.delimiter)
        index = 0
        nrecords = 1000
        chrom = None
        pos = None
        refAllele = None
        recordBuilder = None

        ctr = 0
        m = None
        try:
            for m in tsvReader:
                ctr += 1
                isNewRecord = self._isNewVcfRecordNeeded(chrom, m["chr"], pos, m["start"], refAllele, m["ref_allele"])
                if isNewRecord:
                    if recordBuilder is not None:
                        record = recordBuilder.createRecord()
                        vcfWriter.write_record(record)
                        index += 1
                        if index % nrecords == 0:
                            self.logger.info("Rendered " + str(index) + " vcf records.")
                            vcfWriter.flush()

                    chrom = m["chr"]
                    pos = m["start"]
                    refAllele = m["ref_allele"]

                    recordBuilder = RecordBuilder(chrom, int(pos), refAllele, sampleNames)

                recordBuilder = self._parseRecordBuilder(m, recordBuilder, dataManager, inferGenotypes)

            if recordBuilder is not None:
                record = recordBuilder.createRecord()
                vcfWriter.write_record(record)
            vcfWriter.close()

        except Exception as e:
            self.logger.error(traceback.format_exc())
            self.logger.error("Error at mutation " + str(ctr) + " " + str([m["chr"], m["start"], m["end"]]) + ": ")

        self.logger.info("Rendered all " + str(index) + " vcf records.")
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.
        
        Note: This method has nothing to do with the TCGA validator.
        
        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                                "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

                configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

            unknownKeys.sort()
            self.assertTrue(len(unknownKeys) == 0,
                            "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(
                                ctr) + ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
    def testBasicAnnotation(self):
        """ Test annotation from a generic TSV based on a transcript annotation.  Only confirms the proper headers of the output. """
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        transcriptDS = DatasourceFactory.createDatasource(
            "testdata/small_transcript_tsv_ds/small_transcript_tsv_ds.config", "testdata/small_transcript_tsv_ds/"
        )
        outputFilename = "out/genericTranscriptTest.out.tsv"

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(transcriptDS)
        outputFilename = annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("refseq_test_mRNA_Id" in headers, "refseq_test_mRNA_Id not found in headers: " + str(headers))
        self.assertTrue("refseq_test_prot_Id" in headers, "refseq_test_prot_Id not found in headers: " + str(headers))
Example #34
0
    def testTCGAMAFAsInputAndQuickAnnotate(self):
        """ Test that we can take in a TCGA MAF (using MAFLITE), do annotating, and still render it properly """
        inputFilename = "testdata/maf/Patient0.maf.annotated"
        tmp = MafliteInputMutationCreator(inputFilename,
                                          'configs/maflite_input.config')
        outputFilename = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
        outputRenderer = TcgaMafOutputRenderer(
            outputFilename, 'configs/tcgaMAF2.4_output.config')
        annotator = Annotator()

        annotator.setInputCreator(tmp)
        annotator.setOutputRenderer(outputRenderer)
        ds = DatasourceFactory.createDatasource(
            "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
            "testdata/thaga_janakari_gene_ds/hg19/")
        annotator.addDatasource(ds)
        annotator.annotate()

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")
        tsvReaderIn = GenericTsvReader(inputFilename)
        tsvReader = GenericTsvReader(outputFilename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")
        self.assertTrue("i_TJ_Data_Why" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Why) from header")
        self.assertTrue("i_TJ_Data_Who" in tsvReader.getFieldNames(),
                        "New field missing (i_TJ_Data_Who) from header")

        ctrOut = 0
        for lineDict in tsvReader:
            ctrOut += 1
        ctrIn = 0
        for lineDict in tsvReaderIn:
            ctrIn += 1
        ctrIn += len(tsvReaderIn.getCommentsAsList())
        ctrOut += len(tsvReader.getCommentsAsList())

        self.assertTrue(
            ctrOut == (ctrIn + 2),
            "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file.  (In,Out): "
            + str(ctrIn) + ", " + str(ctrOut))
    def test_splitting_allelic_depth_disabled(self):
        """Make sure that allelic depth is not split when told"""
        input_vcf_file = "testdata/m2_support/Dream4.chr20.oxoGinfo.vcf"
        output_tcgamaf_file = "out/m2_support/Dream4.chr20.oxoGinfo.vcf.noADSplit.maf.annotated"

        if not os.path.exists(
                os.path.abspath(os.path.dirname(output_tcgamaf_file))):
            os.makedirs(os.path.abspath(os.path.dirname(output_tcgamaf_file)))

        # For this conversion, you must specify the barcodes manually
        override_annotations = TcgaMafOutputRendererTest.TCGA_MAF_DEFAULTS
        override_annotations.update({
            'tumor_barcode': 'Patient0-Tumor',
            'normal_barcode': 'Patient0-Normal'
        })

        other_opts = {
            OptionConstants.COLLAPSE_FILTER_COLS: True,
            OptionConstants.NO_PREPEND: True,
            OptionConstants.SPLIT_ALLELIC_DEPTH: False
        }

        # Use an empty datasource dir in order to speed this up.
        self._annotateTest(input_vcf_file,
                           output_tcgamaf_file,
                           datasource_dir=None,
                           inputFormat="VCF",
                           is_skip_no_alts=True,
                           other_opts=other_opts,
                           override_annotations=override_annotations)

        # Check the output MAF
        tsv_reader = GenericTsvReader(output_tcgamaf_file)

        keys_to_check_existence = ['allelic_depth']

        keys_to_check_non_existence = {'t_ref_count', 't_alt_count'}

        for line_dict in tsv_reader:

            for ks in keys_to_check_existence:
                self.assertTrue(ks in line_dict.keys(),
                                "Key " + ks + " was not rendered.")
                self.assertTrue(
                    line_dict[ks] != ""
                    or (line_dict['Reference_Allele'] == "-"
                        or line_dict['Tumor_Seq_Allele2'] == "-"),
                    "Key " + ks + " had a blank value." + str(line_dict))

            for ks in keys_to_check_non_existence:
                self.assertTrue(ks not in line_dict.keys())
    def testAnnotationWithExampleVcf(self):
        """
        Tests whether parsed annotations match the actual annotations in a simple TSV.  Missing format fields yield -->""  ".,." --> ","
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.vcf"])
        outputFilename = os.path.join("out", "example.out.tsv")
        expectedOutputFilename = os.path.join(*["testdata", "vcf", "example.expected.out.tsv"])

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)

        current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
        expected = pandas.read_csv(expectedOutputFilename, sep='\t')

        currentColNames = set()
        for i in range(len(current.columns)):
            currentColNames.add(current.columns[i])

        expectedColNames = set()
        for i in range(len(expected.columns)):
            expectedColNames.add(expected.columns[i])

        self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                        "Should have the same columns")
        self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

        for colName in currentColNames:
            self.assertTrue(sum((current[colName] == expected[colName]) | (pandas.isnull(current[colName]) &
                                                                           pandas.isnull(expected[colName]))) ==
                            len(current.index), "Should have the same values in column " + colName + ": \n" +
                            str(current[colName]) + "\nvs\n" + str(expected[colName]))
    def _create_tx_id_to_protein_id_mapping(self, mapping_file):
        """
        Mapping file is assumed to have three columns and be a tsv:
        Ensembl Gene ID, Ensembl Transcript ID, and Ensembl Protein ID

        """
        result = dict()
        if mapping_file is None or mapping_file.strip() == "":
            return result
        tsv_reader = GenericTsvReader(mapping_file)
        for line_dict in tsv_reader:
            result[line_dict['Ensembl Transcript ID']] = line_dict[
                'Ensembl Protein ID']
        return result
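
    # Illustrative sketch of the helper above (hypothetical file contents): a mapping tsv such as
    #
    #   Ensembl Gene ID    Ensembl Transcript ID    Ensembl Protein ID
    #   ENSG00000141510    ENST00000269305          ENSP00000269305
    #
    # would yield {"ENST00000269305": "ENSP00000269305"} from _create_tx_id_to_protein_id_mapping.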
    def testMissingFilter(self):
        """
        Tests that the missing FILTER fields are parsed correctly.
        """
        inputFilename = os.path.join(*["testdata", "vcf", "example.missing_filters.vcf"])
        outputFilename = os.path.join("out", "example.missing_filters.out.tsv")
        expectedOutputFilename = os.path.join(*["testdata", "vcf", "example.expected.missing_filters.out.tsv"])

        creator = VcfInputMutationCreator(inputFilename)
        creator.createMutations()
        renderer = SimpleOutputRenderer(outputFilename)
        annotator = Annotator()
        annotator.setInputCreator(creator)
        annotator.setOutputRenderer(renderer)
        annotator.annotate()

        tsvReader = GenericTsvReader(outputFilename)

        current = pandas.read_csv(outputFilename, sep='\t', header=len(tsvReader.getCommentsAsList()))
        expected = pandas.read_csv(expectedOutputFilename, sep='\t')

        currentColNames = set()
        for i in range(len(current.columns)):
            currentColNames.add(current.columns[i])

        expectedColNames = set()
        for i in range(len(expected.columns)):
            expectedColNames.add(expected.columns[i])

        self.assertTrue(len(currentColNames.symmetric_difference(expectedColNames)) == 0,
                        "Should have the same columns")
        self.assertTrue(len(current.index) == len(expected.index), "Should have the same number of rows")

        for colName in currentColNames:
            self.assertTrue(sum((current[colName] == expected[colName]) | (pandas.isnull(current[colName]) &
                                                                           pandas.isnull(expected[colName]))) ==
                            len(current.index), "Should have the same values in column " + colName)
    def testInternalFieldsSkipPrepend(self):
        """ Test that no prepending of "i_" is honored."""
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationDataFactory.default_create()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")

        outputRenderer = TcgaMafOutputRenderer(
            outputFilename,
            configFile='configs/tcgaMAF2.4_output.config',
            other_options={OptionConstants.NO_PREPEND: True})
        outputRenderer.renderMutations(iter([m]), ['No comments'])

        configFile = ConfigUtils.createConfigParser(
            'configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue(
            "Hugo_Symbol" in requiredColumns,
            " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified."
        )

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers,
                        "Hugo_Symbol not found in output headers")
        self.assertTrue(
            "i_TEST" not in headers,
            "i_TEST was found in output headers when prepend was disabled.")
        self.assertTrue("TEST" in headers,
                        "TEST was not found in output headers.")
    def testBasicAnnotation(self):
        ''' Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        ''' 
        
        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = ['ABL1','ABL2','ACSL3','AF15Q14','AF1Q','AF3p21','AF5q31','AKAP9','AKT1','AKT2','ALDH2','ALK','ALO17','APC','ARHGEF12','ARHH','ARID1A','ARID2','ARNT','ASPSCR1','ASXL1','ATF1','ATIC','ATM','ATRX','BAP1','BCL10','BCL11A','BCL11B']
        
        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource("testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/")
        outputFilename = 'out/genericGeneTest.out.tsv'
        
        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator('testdata/maflite/Patient0.snp.maf.txt'))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()
        
        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)
        
        fields = tsvReader.getFieldNames()
        self.assertTrue('CGC_Abridged_Other Syndrome/Disease' in fields, "'CGC_Other Syndrome/Disease' was not present in the header")
        self.assertTrue('CGC_Abridged_Mutation Type' in fields, "'CGC_Abridged_Mutation Type' was not present in the header")
        
        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue('gene' in lineDict.keys())
            if lineDict['gene'] in genesAvailable:
                self.assertTrue(lineDict['CGC_Abridged_GeneID'] != '', "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr))
                linesThatShouldBeAnnotated = linesThatShouldBeAnnotated + 1
            ctr = ctr + 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0, "Bad data -- cannot test missed detects.")
    def test_rendering_with_exons(self):
        """Test that we can render a seg file that includes exons at end points"""
        inputFilename = "testdata/seg/Middle_of_exon.seg.txt"
        output_filename = "out/test_exon_seg2.gene_list.tsv"
        db_dir = self.config.get('DEFAULT',"dbDir")
        if os.path.exists(output_filename):
            os.remove(output_filename)

        annotator = Annotator()
        run_spec = RunSpecificationFactory.create_run_spec("SEG_FILE", "GENE_LIST", inputFilename, output_filename,
                                                           datasourceDir=db_dir, annotating_type=RunSpecification.ANNOTATE_SEGMENTS)
        annotator.initialize(run_spec)
        annotator.annotate()

        # Now check the output
        output_reader = GenericTsvReader(output_filename)

        headers = output_reader.getFieldNames()

        for line_dict in output_reader:
            self.assertTrue(line_dict['segment_start'] is not None)
            self.assertTrue(line_dict['segment_start'].strip() != "")
            if line_dict['segment_end_gene'] == "MAPK1":
                self.assertTrue(line_dict['segment_end_exon'].strip() == "8+", "Should have been 8+, but saw: %s" % line_dict['segment_end_exon'].strip())
    def testInternalFields(self):
        """ Test that an annotation that is not listed explicitly in the required or optional columns is rendered with i_ prepended """
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationData()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")

        outputRenderer = TcgaMafOutputRenderer(
            outputFilename, configFile='configs/tcgaMAF2.4_output.config')
        outputRenderer.renderMutations(iter([m]), ['No comments'])

        configFile = ConfigUtils.createConfigParser(
            'configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue(
            "Hugo_Symbol" in requiredColumns,
            " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified."
        )

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers,
                        "Hugo_Symbol not found in output headers")
        self.assertTrue(
            "TEST" not in headers,
            "TEST was found in output headers when it should have been renamed to i_TEST"
        )
        self.assertTrue("i_TEST" in headers,
                        "i_TEST not found in output headers")
    def testInternalFieldsSkipPrepend(self):
        """ Test that no prepending of "i_" is honored."""
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationDataFactory.default_create()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")

        outputRenderer = TcgaMafOutputRenderer(outputFilename, configFile='configs/tcgaMAF2.4_output.config', other_options={OptionConstants.NO_PREPEND:True})
        outputRenderer.renderMutations(iter([m]), ['No comments'])

        configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue("Hugo_Symbol" in requiredColumns, " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + outputFilename + ") is empty.")

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
        self.assertTrue("i_TEST" not in headers, "i_TEST was found in output headers when prepend was disabled.")
        self.assertTrue("TEST" in headers, "TEST was not found in output headers.")
    def test_proper_conversion_vcf_to_maf_with_collapse_filter_cols(self):
        """Test FILTER col is properly rendered when using the collapse-filter-cols option."""

        input_fname = 'testdata/vcf/example.vcf'
        output_fname = 'out/example.one_filter_col.maf.txt'
        annotator = Annotator()
        other_opts = {'collapse_filter_cols': True}

        run_spec = RunSpecificationFactory.create_run_spec(
            'VCF', 'TCGAMAF', input_fname, output_fname, other_opts=other_opts)
        annotator.initialize(run_spec)
        annotator.annotate()

        tsv_reader = GenericTsvReader(output_fname)
        for line_dict in tsv_reader:
            self.assertIn('i_filter', line_dict)
            self.assertTrue(line_dict['i_filter'] in ['PASS', 'q10'])
Example #47
0
    def testMulticoreAnnotateFromChunkedFile(self):
        #TODO: Add unit test that Mutation data is pickle-able
        inputFile = "testdata/maflite/Patient0.snp.maf.txt"
        outputFile = "out/testGAFMulticorePatient0.snp.maf.txt"
        chunkSize = 200
        numChunks = 4

        gafDatasource = TestUtils.createGafDatasourceProxy(self.config)
        ic = MafliteInputMutationCreator(inputFile)
        oc = SimpleOutputRenderer(outputFile)

        # createChunks
        muts = ic.createMutations()

        allAnnotatedChunksFlat = []
        are_mutations_remaining = True
        p = LoggingPool(processes=numChunks)
        while are_mutations_remaining:

            chunks = []
            for j in xrange(0, numChunks):
                chunk = []
                for i in xrange(0, chunkSize):
                    try:
                        chunk.append(muts.next())
                    except StopIteration:
                        are_mutations_remaining = False
                        break

                chunks.append((chunk, gafDatasource))

            annotatedChunks = p.map(annotate_mutations_global, chunks)
            annotatedChunksFlat = self._flattenChunks(annotatedChunks)
            allAnnotatedChunksFlat.append(annotatedChunksFlat)
        p.close()
        p.join()

        annotatedMuts = chain.from_iterable(allAnnotatedChunksFlat)

        ctr = 0
        oc.renderMutations(annotatedMuts, Metadata())
        tsvReader = GenericTsvReader(outputFile)
        for line in tsvReader:
            ctr += 1
        self.assertTrue(ctr == 730,
                        "Should have read 730 variants, but read " + str(ctr))
    def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Constructor

        """

        super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)
        
        # Keys are the canonical required column names; values are lists of valid alternative
        # headers (aliases).  The reverse dictionary maps an alias back to its canonical name.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)
        
        missingRequiredHeaders = []
        required_columns = sorted(self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        for col in required_columns:
            if col not in self._specified_fields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in self._specified_fields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders)  )
    def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Constructor

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)
        
        # Keys are the canonical required column names; values are lists of valid alternative
        # headers (aliases).  The reverse dictionary maps an alias back to its canonical name.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)
        
        missingRequiredHeaders = []
        specifiedFields = self._tsvReader.getFieldNames()
        required_columns = sorted(self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        for col in required_columns:
            if col not in specifiedFields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in specifiedFields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()
        
        self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))
        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders)  )
    def testBasicAnnotation(self):
        """ Annotate from a basic tsv gene file.  Use the Gaf to annotate before trying the tsv -- required since the gene annotation must be populated.
        Using trimmed CancerGeneCensus as basis for this test.
        """

        # cut -f 1 oncotator/test/testdata/small_tsv_ds/CancerGeneCensus_Table_1_full_2012-03-15_trim.txt | egrep -v Symbol | sed -r "s/^/'/g" | sed ':a;N;$!ba;s/\n/,/g' | sed -r "s/,'/','/g"
        genesAvailable = [
            "ABL1",
            "ABL2",
            "ACSL3",
            "AF15Q14",
            "AF1Q",
            "AF3p21",
            "AF5q31",
            "AKAP9",
            "AKT1",
            "AKT2",
            "ALDH2",
            "ALK",
            "ALO17",
            "APC",
            "ARHGEF12",
            "ARHH",
            "ARID1A",
            "ARID2",
            "ARNT",
            "ASPSCR1",
            "ASXL1",
            "ATF1",
            "ATIC",
            "ATM",
            "ATRX",
            "BAP1",
            "BCL10",
            "BCL11A",
            "BCL11B",
        ]

        # We need a gaf data source to annotate gene

        gafDatasource = TestUtils.createTranscriptProviderDatasource(config=self.config)
        geneDS = DatasourceFactory.createDatasource(
            "testdata/small_tsv_ds/small_tsv_ds.config", "testdata/small_tsv_ds/"
        )
        outputFilename = "out/genericGeneTest.out.tsv"

        annotator = Annotator()
        annotator.setInputCreator(MafliteInputMutationCreator("testdata/maflite/Patient0.snp.maf.txt"))
        annotator.setOutputRenderer(SimpleOutputRenderer(outputFilename))
        annotator.addDatasource(gafDatasource)
        annotator.addDatasource(geneDS)
        annotator.annotate()

        # Check that there were actual annotations performed.
        tsvReader = GenericTsvReader(outputFilename)

        fields = tsvReader.getFieldNames()
        self.assertTrue(
            "CGC_Abridged_Other Syndrome/Disease" in fields,
            "'CGC_Other Syndrome/Disease' was not present in the header",
        )
        self.assertTrue(
            "CGC_Abridged_Mutation Type" in fields, "'CGC_Abridged_Mutation Type' was not present in the header"
        )

        ctr = 1
        linesThatShouldBeAnnotated = 0
        for lineDict in tsvReader:
            self.assertTrue("gene" in lineDict.keys())
            if lineDict["gene"] in genesAvailable:
                self.assertTrue(
                    lineDict["CGC_Abridged_GeneID"] != "",
                    "'CGC_Abridged_GeneID' was missing on a row that should have been populated.  Line: " + str(ctr),
                )
                linesThatShouldBeAnnotated += 1
            ctr += 1
        self.assertTrue((linesThatShouldBeAnnotated) > 0, "Bad data -- cannot test missed detects.")
    '''
    parser = ArgumentParser(description=desc, formatter_class=RawDescriptionHelpFormatter, epilog=epilog)
    parser.add_argument("ds_file", type=str, help="COSMIC datasource filename. For example, 'CosmicCompleteExport_v62_261112.tsv' ")
    parser.add_argument("output_file", type=str, help="TSV filename for output.  File will be overwritten if it already exists.")

    args = parser.parse_args()
    return args

if __name__ == '__main__':
    args = parseOptions()
    inputFilename = args.ds_file
    outputFilename = args.output_file

    outputHeaders = ['gene', 'total_alterations_in_gene', 'tissue_types_affected']

    tsvReader = GenericTsvReader(inputFilename)
    headers = tsvReader.getFieldNames()
    print('Found headers (input): ' + str(headers))
    if "Gene name" not in headers:
        raise NotImplementedError("Could not find Gene name column in the input file.")

    if 'Primary site' not in headers:
        raise NotImplementedError("Could not find Primary site column in the input file.")

    # Construct dictionary that is [gene][histology/tissue type] = count, where count is the total for that histology
    #   and that gene
    geneDictionary = dict()
    for line in tsvReader:
        gene = line['Gene name']
        # Skip blank genes
        if gene is None or gene.strip() == "":
Example #52
0
    def index(destDir, inputFilename, fileColumnNumList=None, preset=None):
        """
        Create a tabix index file for genomic position datasource tsv files.
        Prerequisites (for genomic position indexed):
            Input file has three columns that can be mapped to chromosome, start position, and end position without any modification.
                For example, ['hg19.oreganno.chrom', 'hg19.oreganno.chromStart', 'hg19.oreganno.chromEnd'] in oreganno.hg19.txt

        This will overwrite an existing index (since the force parameter is set to True in pysam.tabix_index() call).
        Also, in cases where the inputFilename doesn't end with ".gz", a compressed copy of the file will be created and indexed.

        If the gz and tbi files already exist, this will simply copy the files to the specified destination.

        :param destDir: destination directory
        :param fileColumnNumList: ordered list.  This list contains the corresponding entries (column numbers)
            in the tsv file. Typically, this would be [chr,start,end]  or [gene, startAA, endAA]
        :param inputFilename: tsv file input
        :param preset: if preset is provided, the column coordinates are taken from a preset. Valid values for preset
        are "gff", "bed", "sam", "vcf", "psltbl", and "pileup".  "tsv" is also recognized, but this will use the tabix
        generic indexing (after commenting out the header line)
        """
        fileColumnNumList = [] if fileColumnNumList is None else fileColumnNumList
        inputFilename = os.path.abspath(inputFilename)
        fileDir = os.path.dirname(inputFilename)
        fileName, fileExtension = os.path.splitext(os.path.basename(inputFilename))

        if fileExtension in (".gz",):
            # Ensure .gz.tbi file is there as well
            inputIndexFilename = os.path.join(fileDir, string.join([inputFilename, "tbi"], "."))
            if not os.path.exists(inputIndexFilename):
                msg = "Missing tabix index file %s." % inputIndexFilename
                raise TabixIndexerFileMissingError(msg)

            outputFilename = os.path.join(destDir, string.join([fileName, "gz"], "."))
            shutil.copyfile(inputFilename, outputFilename)

            outputIndexFilename = os.path.join(destDir, string.join([fileName, "gz", "tbi"], "."))
            shutil.copyfile(inputIndexFilename, outputIndexFilename)

            return outputFilename

        outputFilename = os.path.join(destDir, string.join([fileName, ".tabix_indexed", fileExtension], ""))

        # Load the file into a tsvReader.
        if preset in ("gff", "bed", "sam", "vcf", "psltbl", "pileup"):
            # Copy the input file to output file.
            shutil.copyfile(inputFilename, outputFilename)
            tabix_index = pysam.tabix_index(filename=outputFilename, force=True, preset=preset)
        else:

            # Need to comment out the header line with a "#", so we cannot simply copy the file.
            input_reader = GenericTsvReader(inputFilename)

            with file(outputFilename, 'w') as output_writer:
                output_writer.writelines(input_reader.getCommentsAsList())

                # Add "#" for the header line.
                output_writer.write("#")
                field_names = input_reader.getFieldNames()
                output_writer.write("\t".join(field_names))
                output_writer.write("\n")
                output_writer.flush()

                # Write the rest of the file
                # This might be too slow, since a raw reader would be pretty fast.
                for line_dict in input_reader:
                    line_list = [line_dict[k] for k in field_names]
                    line_rendered = "\t".join(line_list) + "\n"
                    output_writer.write(line_rendered)

            input_reader.close()
            tabix_index = pysam.tabix_index(filename=outputFilename, force=True, seq_col=fileColumnNumList[0],
                                            start_col=fileColumnNumList[1], end_col=fileColumnNumList[2])

        if tabix_index is None:
            raise OncotatorException("Could not create a tabix index from this input file: " + outputFilename)

        return tabix_index
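

# A minimal usage sketch for the generic-tsv branch of index() above.  The import path and the
# input filename are illustrative assumptions; the keyword arguments match the signature shown above.
def _example_index_generic_tsv():
    from oncotator.index.TabixIndexer import TabixIndexer  # assumed import path

    # Columns 0, 1, and 2 of the (hypothetical) input tsv hold chromosome, start, and end.
    return TabixIndexer.index(destDir="out",
                              inputFilename="testdata/example_genomic_ds.tsv",
                              fileColumnNumList=[0, 1, 2])
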
Example #53
0
    def indexGeneProteinPosition(geneColumn, proteinInfoColumn, inputFilename, outputFilename):
        """
        Creates an intermediate temporary file that includes two additional columns, startAA and endAA,
        sorts the file, writes the sorted file to outputFilename, and then indexes the sorted file.

        :param geneColumn: name of the gene column in the inputFilename
        :param proteinInfoColumn: name of the protein change or position column. Can be of formats: p.K128_R130del
        (position 128 through 130) For more examples, see MutUtilsTest.testProteinChange()
        :param inputFilename: input tsv filename
        :param outputFilename: output filename
        """
        startAACol = "startAA"
        endAACol = "endAA"

        # Create intermediate file.  Do not use '#' for comments, since header can start with '#'
        tsvReader = GenericTsvReader(inputFilename, commentPrepend=";")

        # These are the outputHeaders for the intermediate file.
        headers = tsvReader.getFieldNames()

        if startAACol not in headers:
            headers += [startAACol]
        if endAACol not in headers:
            headers += [endAACol]

        # Write to the intermediate temporary file.
        # The temporary file is created via tempfile.NamedTemporaryFile (i.e., in the system temp directory).
        temp = tempfile.NamedTemporaryFile()
        csvfile = file(temp.name, 'w')

        # Initialize the intermediate file's header.
        tsvWriter = csv.DictWriter(csvfile, headers, delimiter='\t', lineterminator='\n')
        # If the headers have a leading '#', get rid of it.
        for i in range(0, len(headers)):
            header = headers[i]
            if header.startswith("#"):
                headers[i] = header.replace("#", "")
        tsvWriter.writeheader()

        # Get indices of relevant columns.
        gene_i = headers.index(geneColumn)
        startAA_i = headers.index(startAACol)
        endAA_i = headers.index(endAACol)

        # Write each line of the intermediate file.
        for row in tsvReader:
            protein = row[proteinInfoColumn]
            if protein is None or not protein.strip():
                continue
            [startAA, endAA] = MutUtils.extractProteinPosition(protein)
            if not startAA.strip() or not endAA.strip():
                continue
            row[startAACol] = startAA
            row[endAACol] = endAA
            tsvWriter.writerow(row)
        csvfile.flush()
        csvfile.close()

        # Sort the intermediate tsv file.
        tsvSorter = TsvFileSorter(temp.name)
        func = lambda val: ((val["Gene name"]).lower(), int(val["startAA"]), int(val["endAA"]))

        # Use the whole file path name.
        outputFilename = os.path.abspath(outputFilename)
        tsvSorter.sortFile(outputFilename, func)

        return TabixIndexer.index(destDir=os.path.dirname(os.path.abspath(outputFilename)),
                                  inputFilename=outputFilename, fileColumnNumList=[gene_i, startAA_i, endAA_i])
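

# A minimal usage sketch for indexGeneProteinPosition() above.  The column names and filenames are
# illustrative assumptions; the positional arguments match the signature shown above.
def _example_index_gene_protein_position():
    from oncotator.index.TabixIndexer import TabixIndexer  # assumed import path

    # "Gene name" matches the sort key used above; "Mutation AA" stands in for a protein-change column.
    return TabixIndexer.indexGeneProteinPosition("Gene name", "Mutation AA",
                                                 "testdata/cosmic_gene_protein_ds.tsv",
                                                 "out/cosmic_gene_protein_ds.sorted.tsv")
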
class MafliteInputMutationCreator(InputMutationCreator):
    """
    A maflite file is a simple tsv file

    See the config file maflite_input.config for aliases and required headers.

    Additional columns can be included and will be annotated onto each mutation using the header name.

    IMPORTANT NOTE: maflite will look at all aliases for alt_allele (see maflite_input.config) and choose the first that does not match the ref_allele
    """

    def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Constructor

        """

        super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)
        
        # Keys are the canonical required column names; values are lists of valid alternative
        # headers (aliases).  The reverse dictionary maps an alias back to its canonical name.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)
        
        missingRequiredHeaders = []
        required_columns = sorted(self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        for col in required_columns:
            if col not in self._specified_fields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in self._specified_fields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders)  )

    def getComments(self):
        return self._tsvReader.getCommentsAsList()

    def getMetadata(self):
        result = Metadata()
        fieldNames = self._specified_fields
        fieldNameAliases = self._reverseAlternativeDict.keys()
        for fieldName in fieldNames:
            if fieldName in fieldNameAliases:
                fieldName = self._reverseAlternativeDict[fieldName]
            result[fieldName] = Annotation("", datasourceName="INPUT")
        return result

    def _find_alt_allele_in_other_field(self, raw_line_dict, ref_allele):
        """Check all the possible alt allele columns and choose the one that does not match the reference allele. """

        list_alternates = self._alternativeDict.get("alt_allele", [])

        for candidate_field in list_alternates:
            candidate_value = raw_line_dict.get(candidate_field, "").strip() #remove any trailing whitespace if present
            if candidate_value != "" and candidate_value != ref_allele:
                return candidate_value
        return ref_allele
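
    # Illustrative sketch (hypothetical alias columns): with ref_allele "A" and a raw maflite row where
    # tumor_seq_allele1 == "A" and tumor_seq_allele2 == "T", the helper above returns "T", because the
    # first alt_allele alias whose value differs from the reference wins.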

    def createMutations(self):
        """ No inputs.
        Returns a generator of mutations built from the specified maflite file. """

        aliasKeys = self._reverseAlternativeDict.keys()
        allColumns = self._specified_fields

        for line in self._tsvReader:

            # We only need to assign fields that are mutation attributes and have a different name in the maflite file.
            mut = self._mutation_data_factory.create(build=self._build)

            for col in allColumns:
                # Three scenarios:
                #   1) col is name of mutation data field -- simple createAnnotation
                #   2) col name is an alias for a mutation data field -- do lookup then createAnnotation
                #   3) col name is not an alias for a mutation data field -- simple createAnnotation
                if col in aliasKeys:
                    realKey = self._reverseAlternativeDict[col]
                    self.logger.debug(realKey + " found from " + col)
                    val = line[col]
                    if realKey == "chr":
                        val = MutUtils.convertChromosomeStringToMutationDataFormat(line[col])
                    mut.createAnnotation(realKey, val, 'INPUT')
                else:
                    # Scenario 1 and 3
                    # Make sure to convert chromosome values.
                    val = line[col]
                    if col == "chr":
                        val = MutUtils.convertChromosomeStringToMutationDataFormat(line[col])
                    mut.createAnnotation(col, val, 'INPUT') 

            mut.ref_allele, mut.alt_allele = mut.ref_allele.strip(), mut.alt_allele.strip() #remove any trailing whitespace if present

            # if the alt allele == ref_allele, check that this is not a case where there is an alt_allele2 that is different.
            if mut.alt_allele == mut.ref_allele:
                mut.alt_allele = self._find_alt_allele_in_other_field(line, mut.ref_allele)

            # FIXME: Support more than one alias in the reverse dictionary.  Then this line can be removed.
            if mut.start != "" and mut.end == "":
                mut.end = mut.start
            if mut.end != "" and mut.start == "":
                mut.start = mut.end

            yield mut
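

# A minimal usage sketch for the class above, assuming the maflite test file used elsewhere in these
# examples.  createMutations() yields one mutation per row, with chr/start/end/alleles populated.
def _example_read_maflite(filename="testdata/maflite/Patient0.snp.maf.txt"):
    creator = MafliteInputMutationCreator(filename)
    for mutation in creator.createMutations():
        print("%s:%s-%s %s>%s" % (mutation.chr, mutation.start, mutation.end,
                                  mutation.ref_allele, mutation.alt_allele))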