def sortFile(self, filename, func, length=50000):
    """
    This method sorts the input file and writes out the sorted file to filename.

    The comment block and the header row read from the input are written first, followed by
    the lines produced by merging the partitions of the input content.

    :param filename: sorted filename
    :param func: function that converts each row of the input file to an unique, sortable key
    :param length: maximum number of lines in a partition
    """
    reader = GenericTsvReader(filename=self.readfilename, commentPrepend=self.commentPrepend,
                              delimiter=self.delimiter)
    comments = reader.getComments()

    # getFieldNames() may return None; fall back to "no columns" so the header row and the
    # index map below can be built unconditionally.
    fieldnames = reader.getFieldNames()
    if fieldnames is None:
        fieldnames = []

    # Column name -> position in the header, in header order.
    fieldnameIndexes = collections.OrderedDict((name, idx) for (idx, name) in enumerate(fieldnames))

    iterable = iter(reader.getInputContentFP())
    partitions = self._yieldPartitions(iterable, func, fieldnameIndexes, length)

    # Arguments are passed positionally (path, mode, buffer size) so the call does not depend
    # on the name of open()'s first keyword argument.
    with open(filename, 'wb', 64 * 1024) as writer:
        writer.write(comments)
        writer.write(self.delimiter.join(fieldnames) + "\n")
        writer.writelines(self._merge(partitions))  # generators are allowed as inputs to writelines function
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """ Feed a TCGA MAF through the MAFLITE input creator, annotate it with one extra datasource,
    and check that the result is still rendered as a TCGA MAF with the new fields present. """
    mafIn = "testdata/maf/Patient0.maf.annotated"
    inputCreator = MafliteInputMutationCreator(mafIn, 'configs/maflite_input.config')
    mafOut = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    renderer = TcgaMafOutputRenderer(mafOut, 'configs/tcgaMAF2.4_output.config')

    annotator = Annotator()
    annotator.setInputCreator(inputCreator)
    annotator.setOutputRenderer(renderer)
    annotator.addDatasource(
        DatasourceFactory.createDatasource("testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
                                           "testdata/thaga_janakari_gene_ds/hg19/"))
    annotator.annotate()

    # The renderer must have produced a non-empty file.
    self.assertTrue(os.stat(mafOut).st_size > 0, "Generated MAF file (" + mafOut + ") is empty.")

    readerIn = GenericTsvReader(mafIn)
    readerOut = GenericTsvReader(mafOut)

    self.assertTrue(readerOut.getComments().find('#version') != -1,
                    "First line did not specify a version number")
    self.assertTrue("i_TJ_Data_Why" in readerOut.getFieldNames(),
                    "New field missing (i_TJ_Data_Why) from header")
    self.assertTrue("i_TJ_Data_Who" in readerOut.getFieldNames(),
                    "New field missing (i_TJ_Data_Who) from header")

    # Count data rows (output first, then input), then add each file's comment lines.
    rowsOut = sum(1 for _ in readerOut)
    rowsIn = sum(1 for _ in readerIn)
    totalIn = rowsIn + len(readerIn.getCommentsAsList())
    totalOut = rowsOut + len(readerOut.getCommentsAsList())

    self.assertTrue(totalOut == (totalIn + 2),
                    "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): " + str(totalIn) + ", " + str(totalOut))
def sortFile(self, filename, func, length=50000):
    """
    This method sorts the input file and writes out the sorted file to filename.

    Output layout: the comments read from the input, then the delimiter-joined header row,
    then the merged content of the partitions.

    :param filename: sorted filename
    :param func: function that converts each row of the input file to an unique, sortable key
    :param length: maximum number of lines in a partition
    """
    reader = GenericTsvReader(filename=self.readfilename, commentPrepend=self.commentPrepend,
                              delimiter=self.delimiter)
    comments = reader.getComments()

    fieldnames = reader.getFieldNames()
    if fieldnames is None:
        # No header available: write an empty header row and use an empty index map.
        fieldnames = []

    # Maps each column name to its position in the header (header order is kept).
    fieldnameIndexes = collections.OrderedDict((column, position)
                                               for (position, column) in enumerate(fieldnames))

    iterable = iter(reader.getInputContentFP())
    partitions = self._yieldPartitions(iterable, func, fieldnameIndexes, length)

    # Positional arguments: path, binary-write mode, 64 KiB buffer.
    with open(filename, 'wb', 64 * 1024) as writer:
        writer.write(comments)
        writer.write(self.delimiter.join(fieldnames) + "\n")
        writer.writelines(self._merge(partitions))  # generators are allowed as inputs to writelines function
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
    (to preserve self.assertTrue, etc).

    Checks performed:
      * the file is not empty and its comments contain '#version'
      * a row with Entrez_Gene_Id "0" has Hugo_Symbol "Unknown"
      * no value contains a carriage return
      * columns not named in the config's requiredColumns/optionalColumns start with "i_"
      * no value equals "__UNKNOWN__"

    :param filename: path of the generated MAF file to check
    """
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # The column lists are the same for every row and every key, so read the config once
    # instead of re-parsing it inside the loops.
    configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")

    ctr = 1  # 1-based row number, used only in failure messages
    for lineDict in tsvReader:
        if lineDict['Entrez_Gene_Id'] == "0":
            self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                            "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'. Line: " + str(ctr))

        unknownKeys = []
        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # NOTE(review): if configFile.get() returns the raw option string, these are
            # substring checks rather than exact column-name matches -- confirm.
            if (k not in requiredColumns) and (k not in optionalColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        # Sorted so the failure message lists the offending fields in a stable order.
        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
        ctr += 1
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    Checks performed:
      * the file is not empty and its comments contain '#version'
      * Tumor_Seq_Allele1 differs from Tumor_Seq_Allele2 and equals Reference_Allele
      * no value contains a carriage return
      * columns not named in the config's requiredColumns/optionalColumns/exposedColumns start with "i_"
      * deletions have Tumor_Seq_Allele2 "-" and insertions have Reference_Allele "-"
      * no value equals "__UNKNOWN__"
      * fewer than ten rows in the whole file have UniProt_AApos equal to "0"

    :param filename: path of the generated MAF file to check
    """
    configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # The column lists do not vary per row or per key, so look them up once.
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")
    exposedColumns = configFile.get("general", "exposedColumns")

    # Running total over the whole file. It must live outside the row loop: resetting it
    # per row would cap it at one and make the "< 10" check below impossible to fail.
    uniprot_aa_xform_counter = 0

    ctr = 1  # 1-based row number, used only in failure messages
    for lineDict in tsvReader:

        # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
        # if lineDict['Entrez_Gene_Id'] == "0":
        #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'. Line: " + str(ctr))

        unknownKeys = []
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"],
                        "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"],
                        "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))

        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # NOTE(review): if configFile.get() returns the raw option string, these are
            # substring checks rather than exact column-name matches -- confirm.
            if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        if lineDict['UniProt_AApos'] == "0":
            uniprot_aa_xform_counter += 1

        if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
            self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

        if lineDict["Variant_Type"] == VariantClassification.VT_INS:
            self.assertTrue(lineDict["Reference_Allele"] == "-")

        # Sorted so the failure message lists the offending fields in a stable order.
        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
        # Checked per row so the failure is reported as soon as the tenth zero is seen.
        self.assertTrue(uniprot_aa_xform_counter < 10,
                        "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + "). This is probably an error.")

        ctr += 1
def testTCGAMAFAsInputAndQuickAnnotate(self):
    """ Check that a TCGA MAF, read in as MAFLITE and annotated again, still renders properly. """
    srcMaf = "testdata/maf/Patient0.maf.annotated"
    creator = MafliteInputMutationCreator(srcMaf, 'configs/maflite_input.config')
    dstMaf = "out/testTCGAMAFAsInputAndQuickAnnotate.tsv"
    mafRenderer = TcgaMafOutputRenderer(dstMaf, 'configs/tcgaMAF2.4_output.config')

    # Wire the pipeline: input creator -> annotator (one test datasource) -> TCGA MAF renderer.
    annotator = Annotator()
    annotator.setInputCreator(creator)
    annotator.setOutputRenderer(mafRenderer)
    tjDatasource = DatasourceFactory.createDatasource(
        "testdata/thaga_janakari_gene_ds/hg19/tj_data.config",
        "testdata/thaga_janakari_gene_ds/hg19/")
    annotator.addDatasource(tjDatasource)
    annotator.annotate()

    outputSize = os.stat(dstMaf).st_size
    self.assertTrue(outputSize > 0, "Generated MAF file (" + dstMaf + ") is empty.")

    srcReader = GenericTsvReader(srcMaf)
    dstReader = GenericTsvReader(dstMaf)

    hasVersion = dstReader.getComments().find('#version') != -1
    self.assertTrue(hasVersion, "First line did not specify a version number")
    self.assertTrue("i_TJ_Data_Why" in dstReader.getFieldNames(),
                    "New field missing (i_TJ_Data_Why) from header")
    self.assertTrue("i_TJ_Data_Who" in dstReader.getFieldNames(),
                    "New field missing (i_TJ_Data_Who) from header")

    # Line totals = data rows + comment lines; the output is read before the input.
    dstLines = sum(1 for _ in dstReader)
    srcLines = sum(1 for _ in srcReader)
    srcLines += len(srcReader.getCommentsAsList())
    dstLines += len(dstReader.getCommentsAsList())

    self.assertTrue(dstLines == (srcLines + 2),
                    "Output file should have same number of lines plus two (for maf version and Oncotator version comments) as input file. (In,Out): " + str(srcLines) + ", " + str(dstLines))
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
    (to preserve self.assertTrue, etc).

    For every row this asserts that a zero Entrez_Gene_Id goes with Hugo_Symbol "Unknown", that no
    value contains a carriage return or equals "__UNKNOWN__", and that any column not named in the
    config's requiredColumns/optionalColumns starts with "i_".

    :param filename: path of the generated MAF file to check
    """
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # Parse the output config a single time; it is invariant across rows and keys.
    configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")

    ctr = 1  # 1-based row number for failure messages
    for lineDict in tsvReader:
        if lineDict['Entrez_Gene_Id'] == "0":
            self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                            "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'. Line: " + str(ctr))

        unknownKeys = []
        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # NOTE(review): presumably the config values are plain strings, which would make
            # these substring tests instead of exact name matches -- verify.
            if (k not in requiredColumns) and (k not in optionalColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        # Keep the reported field list in a deterministic order.
        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
        ctr += 1
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    Besides the file-level checks (non-empty, '#version' present in the comments), every row is
    checked for consistent alleles, absence of carriage returns and "__UNKNOWN__" values, the "i_"
    prefix on columns the config does not name, and "-" alleles for insertions/deletions.  The number
    of rows whose UniProt_AApos is "0" is accumulated over the whole file and must stay below ten.

    :param filename: path of the generated MAF file to check
    """
    configFile = ConfigUtils.createConfigParser(
        os.path.join("configs", "tcgaMAF2.4_output.config"))
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0,
                    "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1,
                    "First line did not specify a version number")

    # Loop-invariant config values: fetch once rather than once per key per row.
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")
    exposedColumns = configFile.get("general", "exposedColumns")

    # File-wide tally of rows with UniProt_AApos == "0".  Initialised before the row loop so
    # that it actually accumulates; a per-row reset would keep it at 0 or 1 forever.
    uniprot_aa_xform_counter = 0

    ctr = 1  # 1-based row number for failure messages
    for lineDict in tsvReader:

        # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
        # if lineDict['Entrez_Gene_Id'] == "0":
        #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'. Line: " + str(ctr))

        unknownKeys = []
        self.assertTrue(
            lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"],
            "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
        self.assertTrue(
            lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"],
            "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))

        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue(
                '\r' not in lineDict[k],
                "Carriage return character found in an annotation value.")

            # NOTE(review): presumably the config values are plain strings, which would make
            # these substring tests instead of exact name matches -- verify.
            if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                self.assertTrue(
                    k.startswith("i_"),
                    "Internal column was not prepended with 'i_'")

        if lineDict['UniProt_AApos'] == "0":
            uniprot_aa_xform_counter += 1

        if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
            self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

        if lineDict["Variant_Type"] == VariantClassification.VT_INS:
            self.assertTrue(lineDict["Reference_Allele"] == "-")

        # Keep the reported field list in a deterministic order.
        unknownKeys.sort()
        self.assertTrue(
            len(unknownKeys) == 0,
            "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
        # Evaluated on each row so the test stops at the row where the limit is reached.
        self.assertTrue(
            uniprot_aa_xform_counter < 10,
            "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + "). This is probably an error.")

        ctr += 1