def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
    """Set up the renderer: parse the output config and record rendering options.

    TODO: Need functionality for not prepending the i_ on internal fields.
    """
    self._filename = filename
    self.logger = logging.getLogger(__name__)

    self.config = ConfigUtils.createConfigParser(configFile)
    self.logger.info("Building alternative keys dictionary...")
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

    # TODO: Read missing options from the config file or specify that error should be thrown.
    self.options = other_options if other_options is not None else dict()

    # Internal (non-TCGA) columns get this prefix unless explicitly disabled via options.
    self._prepend = self.config.get("general", "prepend")
    if self.options.get(OptionConstants.NO_PREPEND, False):
        self._prepend = ""

    self.exposedColumns = set(self.config.get("general", "exposedColumns").split(','))
    self._is_entrez_id_message_logged = False
def testCreateDatasourceWithMissingValues(self):
    """ """
    dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.missing.txt")
    destDir = "out"
    datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.missing.tabix_indexed.txt.gz"
    indexColumnNames = "CHROM,POS,POS"
    dataSourceType = "indexed_tsv"
    dataSourceName = "ESP"
    dataSourceVersion = "6500SI-V2"
    dataSourceMatchMode = "overlap"
    annotationColumnNames = "EA_GTC,DP"
    configFilename = os.path.join("out", "esp_coverage.missing.config")

    # Build the tabix-indexed TSV datasource and its accompanying config file.
    builder = TabixIndexedTsvDatasourceCreator()
    builder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
                             dataSourceName, dataSourceVersion, dataSourceMatchMode,
                             annotationColumnNames,
                             DatasourceInstallUtils.getIndexCols(dataSourceType, indexColumnNames))

    parser = ConfigUtils.createConfigParser(configFilename)
    self.assertEqual(parser.get("general", "src_file"), datasourceFilename,
                     "Expected data source src_file is %s but was %s."
                     % (datasourceFilename, parser.get("general", "src_file")))

    # Data types should still be inferred despite missing values in the source columns.
    for column, expected in (("EA_GTC", "Float"), ("DP", "Integer")):
        actual = parser.get("data_types", column)
        self.assertEqual(actual, expected,
                         "Expected %s data type is %s but was %s." % (column, expected, actual))
def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
    """Initialize renderer state: output path, parsed config, and column alias lookup."""
    self.logger = logging.getLogger(__name__)
    self._filename = filename
    self.config = ConfigUtils.createConfigParser(configFile)
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    # Per-render caches: dbSNP IDs already seen, and the field-name mapping.
    self.seenDbSNPs = {}
    self.fieldMap = {}
def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
    """Initialize the TCGA MAF renderer from an output config and an options dict."""
    self._filename = filename
    self.logger = logging.getLogger(__name__)

    self.config = ConfigUtils.createConfigParser(configFile)
    self.logger.info("Building alternative keys dictionary...")
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

    self.options = {} if other_options is None else other_options

    # Internal columns are prefixed with this string unless NO_PREPEND was requested.
    self._prepend = self.config.get("general", "prepend")
    if self.options.get(OptionConstants.NO_PREPEND, False):
        self._prepend = ""

    # _is_reannotating is a flag to determine whether we should give precendence to annotations
    # that were not annotated as part of the INPUT.
    self._is_reannotating = self.options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)
    self._is_splitting_allelic_depth = self.options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

    self.exposedColumns = set(self.config.get("general", "exposedColumns").split(','))
    self._is_entrez_id_message_logged = False

    # Optional collapsing of numeric annotation columns; the un-collapsed values are
    # kept under a "_full"-suffixed column when enabled.
    self._is_collapsing_number_cols = self.options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
    if self._is_collapsing_number_cols:
        self._column_collapser = ColumnCollapser()
        self._column_collapser_suffix = "_full"
    else:
        self._column_collapser = None
        self._column_collapser_suffix = None
def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
    """Resolve which annotation column supplies the sample name for the given mutation."""
    self.logger = logging.getLogger(__name__)
    self.configFile = configFile

    config = ConfigUtils.createConfigParser(configFile)
    aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)

    # Look up each candidate annotation through its configured aliases.
    sampleAnnotation = self._getAnnotationFromAliases(mut, aliases["sample_name"])
    tumorAnnotation = self._getAnnotationFromAliases(mut, aliases["sample_tumor_name"])
    normalAnnotation = self._getAnnotationFromAliases(mut, aliases["sample_normal_name"])

    source_column = self._getSourceColumn(sampleAnnotation, tumorAnnotation, normalAnnotation)
    self._logSampleNameColumnDescription(source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)

    self.sampleNameGrabber = self._getSampleNameGrabber(source_column, sampleAnnotation,
                                                        tumorAnnotation, normalAnnotation)
    self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
    self.annotationSource = self._deriveAnnotationSource(source_column)
def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """Constructor: validate that the maflite header supplies every required column."""
    super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required column name; values are the valid alternative headers.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    self._build = genomeBuild
    # The specified fields are those that were given in the input.
    self._specified_fields = self._tsvReader.getFieldNames()
    self.logger.info("Initializing a maflite file with the following header: " + str(self._specified_fields))

    required_columns = sorted(self.config.get("general", "required_headers").split(","))
    missingRequiredHeaders = []
    for required in required_columns:
        if required in self._specified_fields:
            continue
        has_alternative = any(alt in self._specified_fields
                              for alt in self._alternativeDict.get(required, []))
        # "build" is optional, so its absence is never an error.
        if not has_alternative and required != "build":
            missingRequiredHeaders.append(required)
    missingRequiredHeaders.sort()

    if len(missingRequiredHeaders) > 0:
        raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def testCreateGPTsvConfigFile(self):
    """Verify the config file written for a genomic-position TSV datasource."""
    configFilename = "out/ccle_by_gp.config"
    datasourceFilename = "ccle_results_by_pos.hg19.import.txt"
    dataSourceType = "gp_tsv"
    dataSourceName = "CCLE_By_GP"
    dataSourceVersion = "09292010"
    genomicPositionColumnNames = "chr,start,end"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(configFilename=configFilename, baseDSFile=datasourceFilename,
                              ds_name=dataSourceName, ds_type=dataSourceType,
                              ds_version=dataSourceVersion,
                              indexCols=DatasourceInstallUtils.getIndexCols("gp_tsv", genomicPositionColumnNames))

    parser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    # Every expected option must be present in [general].
    for option in ("type", "src_file", "title", "version", "genomic_position_cols"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    # ... and each must carry the value the builder was given.
    for option, expected in (("type", dataSourceType),
                             ("src_file", datasourceFilename),
                             ("title", dataSourceName),
                             ("version", dataSourceVersion),
                             ("genomic_position_cols", genomicPositionColumnNames)):
        actual = parser.get("general", option)
        self.assertEqual(actual, expected,
                         "Expected data source %s is %s but was %s." % (option, expected, actual))
def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
    """Set up the TCGA VCF renderer from its output config."""
    self._filename = filename
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    # Caches populated while rendering.
    self.seenDbSNPs = dict()
    self.fieldMap = dict()
def test_intitialize(self):
    """Test a simple initialization of an ensembl datasource """
    base_config_location = "testdata/ensembl/saccer/"
    parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")

    # The datasource description lives in the config's [general] section.
    ensembl_ds = EnsemblTranscriptDatasource(title=parser.get("general", "title"),
                                             version=parser.get("general", "version"),
                                             src_file=parser.get("general", "src_file"))
    self.assertIsNotNone(ensembl_ds)

    # The tx mode setter should round-trip through the getter.
    ensembl_ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
    self.assertTrue(ensembl_ds.get_tx_mode() == TranscriptProvider.TX_MODE_BEST_EFFECT)
def test_intitialize(self):
    """Test a simple initialization of an ensembl datasource """
    config_parser = ConfigUtils.createConfigParser("testdata/ensembl/saccer/" + "ensembl.config")
    title = config_parser.get("general", "title")
    version = config_parser.get("general", "version")
    src_file = config_parser.get("general", "src_file")

    ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)
    self.assertIsNotNone(ds)

    # Setting the transcript mode must be reflected by the getter.
    ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
    self.assertTrue(ds.get_tx_mode() == TranscriptProvider.TX_MODE_BEST_EFFECT)
def getGeneTsvConfigFile(self):
    """Create a gene TSV datasource config file and verify its contents."""
    configFilename = "out/simple_uniprot.config"
    datasourceFilename = "simple_uniprot.out.2011_09.tsv"
    dataSourceType = "gene_tsv"
    dataSourceName = "UniProt"
    dataSourceVersion = "2011_09"
    geneColumnName = "gene"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(configFilename=configFilename, baseDSFile=datasourceFilename,
                              ds_name=dataSourceName, ds_type=dataSourceType,
                              ds_version=dataSourceVersion,
                              indexCols=DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumnName))

    parser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    for option in ("type", "src_file", "title", "version", "gene_col"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    for option, expected in (("type", dataSourceType),
                             ("src_file", datasourceFilename),
                             ("title", dataSourceName),
                             ("version", dataSourceVersion),
                             ("gene_col", geneColumnName)):
        actual = parser.get("general", option)
        self.assertEqual(actual, expected,
                         "Expected data source %s is %s but was %s." % (option, expected, actual))
def testCreateGPTsvDatasource(self):
    """ """
    datasourceFilename = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
    datasourceType = "gp_tsv"
    datasourceName = "ORegAnno"
    datasourceFoldername = "ORegAnno"
    datasourceVersion = "UCSC Track"
    genomeBuild = "hg19"
    genomicPositionColumnNames = "hg19.oreganno.chrom,hg19.oreganno.chromStart,hg19.oreganno.chromEnd"

    # Install the datasource into a scratch directory.
    tmpDir = tempfile.mkdtemp()
    destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
    os.makedirs(destDir)
    DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername,
                                             datasourceName, datasourceType, datasourceVersion,
                                             genomicPositionColumnNames)

    # The installed config should reference the basename, not the source path.
    datasourceFilename = "oreganno_trim.hg19.txt"
    configFilename = os.path.join(*[destDir, "ORegAnno.config"])
    parser = ConfigUtils.createConfigParser(configFilename)

    self.assertTrue(parser.has_section("general"), "general section is missing.")
    for option in ("type", "src_file", "title", "version", "genomic_position_cols"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    for option, expected in (("type", datasourceType),
                             ("src_file", datasourceFilename),
                             ("title", datasourceName),
                             ("version", datasourceVersion),
                             ("genomic_position_cols", genomicPositionColumnNames)):
        actual = parser.get("general", option)
        self.assertEqual(actual, expected,
                         "Expected data source %s is %s but was %s." % (option, expected, actual))

    # An md5 stamp should accompany the installed build directory.
    self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername, genomeBuild + ".md5"])),
                    "No md5 file was generated.")
    MutUtils.removeDir(tmpDir)
def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
    """Determine the sample-name annotation source for a mutation."""
    config = ConfigUtils.createConfigParser(configFile)
    self.logger = logging.getLogger(__name__)
    self.configFile = configFile

    aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)
    # Resolve the three candidate annotations through their configured aliases.
    annotations = dict((key, self._getAnnotationFromAliases(mut, aliases[key]))
                       for key in ("sample_name", "sample_tumor_name", "sample_normal_name"))
    sampleAnnotation = annotations["sample_name"]
    tumorAnnotation = annotations["sample_tumor_name"]
    normalAnnotation = annotations["sample_normal_name"]

    source_column = self._getSourceColumn(sampleAnnotation, tumorAnnotation, normalAnnotation)
    self._logSampleNameColumnDescription(source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)
    self.sampleNameGrabber = self._getSampleNameGrabber(source_column, sampleAnnotation,
                                                        tumorAnnotation, normalAnnotation)
    self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
    self.annotationSource = self._deriveAnnotationSource(source_column)
def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """ Constructor

    Currently, this InputCreator does not support any other options.  The parameter is ignored.
    """
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required column name; values are the valid alternative headers.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    self._build = genomeBuild
    specifiedFields = self._tsvReader.getFieldNames()
    required_columns = sorted(self.config.get("general", "required_headers").split(","))

    missingRequiredHeaders = []
    for required in required_columns:
        if required in specifiedFields:
            continue
        has_alternative = any(alt in specifiedFields
                              for alt in self._alternativeDict.get(required, []))
        # "build" is optional, so do not report it as missing.
        if not has_alternative and required != "build":
            missingRequiredHeaders.append(required)
    missingRequiredHeaders.sort()

    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))
    if len(missingRequiredHeaders) > 0:
        raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def testCreateIndexedVcfDatasource(self):
    """Install an indexed VCF datasource and verify config, md5 stamp, and tabix index."""
    datasourceFilename = "testdata/vcf/example.vcf"
    datasourceFoldername = "1000Genomes"
    datasourceName = "1000Genomes"
    datasourceType = "indexed_vcf"
    datasourceVersion = "V4.1"
    genomeBuild = "hg19"

    tmpDir = tempfile.mkdtemp()
    destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
    os.makedirs(destDir)
    DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername,
                                             datasourceName, datasourceType, datasourceVersion)

    # The installed config should reference the tabix-indexed basename.
    datasourceFilename = "example.tabix_indexed.vcf.gz"
    configFilename = os.path.join(*[destDir, "1000Genomes.config"])
    parser = ConfigUtils.createConfigParser(configFilename)

    self.assertTrue(parser.has_section("general"), "general section is missing.")
    for option in ("type", "src_file", "title", "version"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    for option, expected in (("type", datasourceType),
                             ("src_file", datasourceFilename),
                             ("title", datasourceName),
                             ("version", datasourceVersion)):
        actual = parser.get("general", option)
        self.assertEqual(actual, expected,
                         "Expected data source %s is %s but was %s." % (option, expected, actual))

    self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername, genomeBuild + ".md5"])),
                    "No md5 file was generated.")

    # Data source was created correctly: the tabix index should exist and be queryable.
    tabixIndexedFilename = os.path.join(*[destDir, "example.tabix_indexed.vcf.gz"])
    self.assertTrue(os.path.exists(tabixIndexedFilename), "No index file was generated.")

    vcfReader = vcf.Reader(filename=tabixIndexedFilename, compressed=True, strict_whitespace=True)
    for vcfRecord in vcfReader.fetch(chrom=20, start=1230237, end=1230237):
        self.assertEqual(vcfRecord.INFO["NS"], 3, "Expected %s but got %s." % (3, vcfRecord.INFO["NS"]))
        self.assertEqual(vcfRecord.INFO["DP"], 13, "Expected %s but got %s." % (13, vcfRecord.INFO["DP"]))
    MutUtils.removeDir(tmpDir)
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class (to preserve self.assertTrue, etc).

    :param filename: path of the generated MAF file to sanity-check
    """
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    # Fixed: `<>` is Python 2-only syntax; `!=` is equivalent and forward-compatible.
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # Fixed: the config file was re-parsed for EVERY key of EVERY row; parse it once here.
    # Also split the comma-joined column lists so membership tests match whole column
    # names instead of substrings of the joined string (e.g. "DP" would previously
    # "match" any column name containing "DP").
    configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
    requiredColumns = configFile.get("general", "requiredColumns").split(',')
    optionalColumns = configFile.get("general", "optionalColumns").split(',')

    ctr = 1
    for lineDict in tsvReader:
        # An Entrez ID of 0 must correspond to an unknown Hugo symbol.
        if lineDict['Entrez_Gene_Id'] == "0":
            self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                            "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

        unknownKeys = []
        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # Any column that is neither required nor optional must be internal ("i_" prefix).
            if (k not in requiredColumns) and (k not in optionalColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                        ", in fields: " + ", ".join(unknownKeys))
        ctr += 1
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    :param filename: path of the generated MAF file to sanity-check
    """
    configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))
    # Fixed: the three config.get calls ran for EVERY key of EVERY row; hoist them here.
    # Also split the comma-joined column lists so membership tests match whole column
    # names instead of substrings of the joined string.
    requiredColumns = configFile.get("general", "requiredColumns").split(',')
    optionalColumns = configFile.get("general", "optionalColumns").split(',')
    exposedColumns = configFile.get("general", "exposedColumns").split(',')

    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    ctr = 1
    for lineDict in tsvReader:
        # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
        # if lineDict['Entrez_Gene_Id'] == "0":
        #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

        unknownKeys = []
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"],
                        "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"],
                        "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))

        uniprot_aa_xform_counter = 0
        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # Any column that is not required/optional/exposed must be internal ("i_" prefix).
            if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

        if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
            self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

        if lineDict["Variant_Type"] == VariantClassification.VT_INS:
            self.assertTrue(lineDict["Reference_Allele"] == "-")

        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                        ", in fields: " + ", ".join(unknownKeys))
        self.assertTrue(uniprot_aa_xform_counter < 10,
                        "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + ").  This is probably an error.")
        ctr += 1
def getConfigTable(self, configFilename, filename):
    """Build a VCF input config table from a config file and a VCF file.

    :param configFilename: config file path (parsed case-sensitively, ignoreCase=False)
    :param filename: VCF file whose headers populate the table
    :return: populated VcfInputConfigTable
    """
    parser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
    table = VcfInputConfigTable()

    reader = vcf.Reader(filename=filename, strict_whitespace=True)
    self.createConfigTableKeys(configParser=parser, configTable=table)
    self.createConfigTable(vcfReader=reader, configTable=table)
    return table
def getConfigTable(self, configFilename, filename=None):
    """Build a VCF output config table from a config file.

    :param configFilename: config file path (parsed case-sensitively, ignoreCase=False)
    :param filename: unused; kept for interface symmetry with the input variant
    :return: populated VcfOutputConfigTable
    """
    parser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
    table = VcfOutputConfigTable()
    self.createConfigTableKeys(configParser=parser, configTable=table)
    return table
def getConfigTable(self, configFilename, filename=None):
    """Build a VCF output config table that remembers its config file path.

    :param configFilename: config file path (parsed case-sensitively, ignoreCase=False)
    :param filename: unused; kept for interface symmetry with the input variant
    :return: populated VcfOutputConfigTable
    """
    parser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
    table = VcfOutputConfigTable(configFilename)
    self.createConfigTableKeys(configParser=parser, configTable=table)
    return table
def test_simple_annotate(self):
    """ Annotate a simple example.
    """
    base_config_location = "testdata/ensembl/saccer/"
    parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
    ensembl_ds = EnsemblTranscriptDatasource(title=parser.get("general", "title"),
                                             version=parser.get("general", "version"),
                                             src_file=parser.get("general", "src_file"))

    # Build a single-base substitution and run it through the datasource.
    m = MutationData()
    m.chr = "22"
    m.start = "22161963"
    m.end = "22161963"
    m.ref_allele = "C"
    m.alt_allele = "A"
    m2 = ensembl_ds.annotate_mutation(m)
def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """ Constructor """
    super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required column name; values are the valid alternative headers.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    self._build = genomeBuild
    # The specified fields are those that were given in the input.
    self._specified_fields = self._tsvReader.getFieldNames()
    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

    missing = []
    for required in sorted(self.config.get("general", "required_headers").split(",")):
        if required in self._specified_fields:
            continue
        alternatives = self._alternativeDict.get(required, [])
        found = False
        for alternative in alternatives:
            if alternative in self._specified_fields:
                found = True
                break
        # "build" is optional, so it never counts as missing.
        if not found and required != "build":
            missing.append(required)
    missing.sort()

    if len(missing) > 0:
        raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missing))
def test_simple_annotate(self):
    """ Annotate a simple example.
    """
    config_parser = ConfigUtils.createConfigParser("testdata/ensembl/saccer/" + "ensembl.config")
    title = config_parser.get("general", "title")
    version = config_parser.get("general", "version")
    src_file = config_parser.get("general", "src_file")
    ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)

    # A single-base C>A substitution on chromosome 22.
    m = MutationData()
    m.chr = "22"
    m.start = "22161963"
    m.end = "22161963"
    m.ref_allele = "C"
    m.alt_allele = "A"
    m2 = ds.annotate_mutation(m)
def _validateTcgaMafContents(self, filename):
    """
    This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.

    TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class (to preserve self.assertTrue, etc).

    :param filename: path of the generated MAF file to sanity-check
    """
    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    # Fixed: `<>` is Python 2-only syntax; `!=` is equivalent and forward-compatible.
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    # Fixed: the config file was re-parsed for EVERY key of EVERY row; parse it once here.
    # Also split the comma-joined column lists so membership tests match whole column
    # names instead of substrings of the joined string.
    configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
    requiredColumns = configFile.get("general", "requiredColumns").split(',')
    optionalColumns = configFile.get("general", "optionalColumns").split(',')

    ctr = 1
    for lineDict in tsvReader:
        # An Entrez ID of 0 must correspond to an unknown Hugo symbol.
        if lineDict['Entrez_Gene_Id'] == "0":
            self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                            "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

        unknownKeys = []
        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # Any column that is neither required nor optional must be internal ("i_" prefix).
            if (k not in requiredColumns) and (k not in optionalColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0,
                        "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(
                            ctr) + ", in fields: " + ", ".join(unknownKeys))
        ctr += 1
def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
    """ Constructor

    Currently, this InputCreator does not support any other options.  The parameter is ignored.
    """
    self.logger = logging.getLogger(__name__)
    self.config = ConfigUtils.createConfigParser(configFile)
    self._tsvReader = GenericTsvReader(filename)

    # Key is the required column name; values are the valid alternative headers.
    self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
    self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)

    self._build = genomeBuild
    specifiedFields = self._tsvReader.getFieldNames()

    missingRequiredHeaders = []
    for required in sorted(self.config.get("general", "required_headers").split(",")):
        if required in specifiedFields:
            continue
        found = any(alternative in specifiedFields
                    for alternative in self._alternativeDict.get(required, []))
        # "build" is optional, so it never counts as missing.
        if not found and required != "build":
            missingRequiredHeaders.append(required)
    missingRequiredHeaders.sort()

    self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))
    if len(missingRequiredHeaders) > 0:
        raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders))
def testCreateDatasourceFromGZFile(self):
    """Build an indexed VCF datasource from a gzipped file and verify the written config."""
    ds_file = os.path.join("testdata", "example.vcf.gz")
    dest_dir = "out"
    config_filename = os.path.join("out", "esp.config")
    ds_src_file = "example.vcf.gz"
    ds_type = "indexed_vcf"
    ds_match_mode = "avg"
    ds_name = "ESP"
    ds_version = "6500SI-V2"

    builder = TabixIndexedVcfDatasourceCreator()
    builder.createDatasource(dest_dir, ds_file, config_filename, ds_type, ds_name, ds_version, ds_match_mode)

    parser = ConfigUtils.createConfigParser(config_filename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    # Every expected option must exist before its value is compared.
    for option in ("type", "src_file", "title", "version", "match_mode"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    value_checks = [("type", ds_type, "Expected data source type is %s but was %s."),
                    ("src_file", ds_src_file, "Expected data source src_file is %s but was %s."),
                    ("title", ds_name, "Expected data source title is %s but was %s."),
                    ("version", ds_version, "Expected data source version is %s but was %s."),
                    ("match_mode", ds_match_mode, "Expected data source match mode is %s but was %s.")]
    for option, expected, message in value_checks:
        actual = parser.get("general", option)
        self.assertEqual(actual, expected, message % (expected, actual))
def testCreateGPTsvConfigFile(self):
    """Verify the config file written for a genomic-position TSV datasource."""
    config_filename = "out/ccle_by_gp.config"
    ds_src_file = "ccle_results_by_pos.hg19.import.txt"
    ds_type = "gp_tsv"
    ds_name = "CCLE_By_GP"
    ds_version = "09292010"
    gp_column_names = "chr,start,end"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(configFilename=config_filename, baseDSFile=ds_src_file,
                              ds_name=ds_name, ds_type=ds_type, ds_version=ds_version,
                              indexCols=DatasourceInstallUtils.getIndexCols("gp_tsv", gp_column_names))

    parser = ConfigUtils.createConfigParser(config_filename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    for option in ("type", "src_file", "title", "version", "genomic_position_cols"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    value_checks = [("type", ds_type, "Expected data source type is %s but was %s."),
                    ("src_file", ds_src_file, "Expected data source src_file is %s but was %s."),
                    ("title", ds_name, "Expected data source title is %s but was %s."),
                    ("version", ds_version, "Expected data source version is %s but was %s."),
                    ("genomic_position_cols", gp_column_names,
                     "Expected data source genomic_position_cols is %s but was %s.")]
    for option, expected, message in value_checks:
        actual = parser.get("general", option)
        self.assertEqual(actual, expected, message % (expected, actual))
def getGeneTsvConfigFile(self):
    """Verify the config file written for a gene-indexed TSV datasource."""
    config_filename = "out/simple_uniprot.config"
    ds_src_file = "simple_uniprot.out.2011_09.tsv"
    ds_type = "gene_tsv"
    ds_name = "UniProt"
    ds_version = "2011_09"
    gene_column_name = "gene"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(configFilename=config_filename, baseDSFile=ds_src_file,
                              ds_name=ds_name, ds_type=ds_type, ds_version=ds_version,
                              indexCols=DatasourceInstallUtils.getIndexCols("gene_tsv", gene_column_name))

    parser = ConfigUtils.createConfigParser(config_filename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    for option in ("type", "src_file", "title", "version", "gene_col"):
        self.assertTrue(parser.has_option("general", option),
                        "%s option is missing in general section." % option)

    value_checks = [("type", ds_type, "Expected data source type is %s but was %s."),
                    ("src_file", ds_src_file, "Expected data source src_file is %s but was %s."),
                    ("title", ds_name, "Expected data source title is %s but was %s."),
                    ("version", ds_version, "Expected data source version is %s but was %s."),
                    ("gene_col", gene_column_name,
                     "Expected data source gene_col is %s but was %s.")]
    for option, expected, message in value_checks:
        actual = parser.get("general", option)
        self.assertEqual(actual, expected, message % (expected, actual))
def testInternalFieldsSkipPrepend(self):
    """ Test that no prepending of "i_" is honored."""
    outputFilename = "out/testInternalFields_v2.4.maf.tsv"
    mut = MutationDataFactory.default_create()
    mut.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
    # The next annotation is real and should not be considered internal.
    mut.createAnnotation("gene", "EGFR")

    output_renderer = TcgaMafOutputRenderer(outputFilename,
                                            configFile='configs/tcgaMAF2.4_output.config',
                                            other_options={OptionConstants.NO_PREPEND: True})
    output_renderer.renderMutations(iter([mut]), ['No comments'])

    config = ConfigUtils.createConfigParser('configs/tcgaMAF2.4_output.config')
    required_columns = config.get("general", "requiredColumns")
    self.assertTrue("Hugo_Symbol" in required_columns,
                    " This test assumes that Hugo_Symbol is a required column in the TCGA MAF. If not, the test must be modified.")

    statinfo = os.stat(outputFilename)
    self.assertTrue(statinfo.st_size > 0,
                    "Generated MAF file (" + outputFilename + ") is empty.")

    tsv_reader = GenericTsvReader(outputFilename)
    headers = tsv_reader.getFieldNames()
    self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
    self.assertTrue("i_TEST" not in headers, "i_TEST was found in output headers when prepend was disabled.")
    self.assertTrue("TEST" in headers, "TEST was not found in output headers.")
def testInternalFieldsSkipPrepend(self):
    """ Test that no prepending of "i_" is honored."""
    outputFilename = "out/testInternalFields_v2.4.maf.tsv"
    maf_config = 'configs/tcgaMAF2.4_output.config'

    mutation = MutationDataFactory.default_create()
    mutation.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
    # The next annotation is real and should not be considered internal.
    mutation.createAnnotation("gene", "EGFR")

    renderer = TcgaMafOutputRenderer(outputFilename, configFile=maf_config,
                                     other_options={OptionConstants.NO_PREPEND: True})
    renderer.renderMutations(iter([mutation]), ['No comments'])

    # Sanity check on the config this test relies on.
    self.assertTrue("Hugo_Symbol" in ConfigUtils.createConfigParser(maf_config).get("general", "requiredColumns"),
                    " This test assumes that Hugo_Symbol is a required column in the TCGA MAF. If not, the test must be modified.")

    self.assertTrue(os.stat(outputFilename).st_size > 0,
                    "Generated MAF file (" + outputFilename + ") is empty.")

    headers = GenericTsvReader(outputFilename).getFieldNames()
    self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
    self.assertTrue("i_TEST" not in headers, "i_TEST was found in output headers when prepend was disabled.")
    self.assertTrue("TEST" in headers, "TEST was not found in output headers.")
def testInternalFields(self):
    """ Test that an annotation that is not listed explicitly in the required or optional columns is rendered with i_ prepended """
    outputFilename = "out/testInternalFields_v2.4.maf.tsv"
    mut = MutationData()
    mut.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
    # The next annotation is real and should not be considered internal.
    mut.createAnnotation("gene", "EGFR")

    output_renderer = TcgaMafOutputRenderer(outputFilename,
                                            configFile='configs/tcgaMAF2.4_output.config')
    output_renderer.renderMutations(iter([mut]), ['No comments'])

    config = ConfigUtils.createConfigParser('configs/tcgaMAF2.4_output.config')
    required_columns = config.get("general", "requiredColumns")
    self.assertTrue("Hugo_Symbol" in required_columns,
                    " This test assumes that Hugo_Symbol is a required column in the TCGA MAF. If not, the test must be modified.")

    statinfo = os.stat(outputFilename)
    self.assertTrue(statinfo.st_size > 0,
                    "Generated MAF file (" + outputFilename + ") is empty.")

    tsv_reader = GenericTsvReader(outputFilename)
    headers = tsv_reader.getFieldNames()
    self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
    self.assertTrue("TEST" not in headers,
                    "TEST was found in output headers when it should have been renamed to i_TEST")
    self.assertTrue("i_TEST" in headers, "i_TEST not found in output headers")
def __init__(self, config_file="column_collapse.config"):
    """Load the column-collapse configuration and validate the collapse methods."""
    self._config_parser = ConfigUtils.createConfigParser(config_file, ignoreCase=False)
    self._columns_to_collapse = self._config_parser.options("columns_to_collapse")

    if len(self._columns_to_collapse) != len(set(self._columns_to_collapse)):
        logging.getLogger(__name__).warn("Duplicate keys in " + config_file + " seen. Some collapsing may produce unexpected values.")

    # Lookup table mapping each column to its collapsing method.
    self._method_dict = dict()
    for column in self._columns_to_collapse:
        method = self._config_parser.get("columns_to_collapse", column).strip()
        # Default to mean if not specified.
        if method is None or method == "":
            method = ColumnCollapser.MEAN
        self._method_dict[column] = method

    # Basic validation: every configured method must be a known value.
    problematic_method_assignments = dict((column, self._method_dict[column])
                                          for column in self._columns_to_collapse
                                          if self._method_dict[column] not in ColumnCollapser.VALID_VALUES)
    if len(problematic_method_assignments.keys()) > 0:
        raise ValueError("Invalid column collapsing specified: " + str(problematic_method_assignments))
def testInternalFields(self):
    """ Test that an annotation that is not listed explicitly in the required or optional columns is rendered with i_ prepended """
    outputFilename = "out/testInternalFields_v2.4.maf.tsv"
    maf_config = 'configs/tcgaMAF2.4_output.config'

    mutation = MutationData()
    mutation.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
    # The next annotation is real and should not be considered internal.
    mutation.createAnnotation("gene", "EGFR")

    renderer = TcgaMafOutputRenderer(outputFilename, configFile=maf_config)
    renderer.renderMutations(iter([mutation]), ['No comments'])

    # Sanity check on the config this test relies on.
    self.assertTrue("Hugo_Symbol" in ConfigUtils.createConfigParser(maf_config).get("general", "requiredColumns"),
                    " This test assumes that Hugo_Symbol is a required column in the TCGA MAF. If not, the test must be modified.")

    self.assertTrue(os.stat(outputFilename).st_size > 0,
                    "Generated MAF file (" + outputFilename + ") is empty.")

    headers = GenericTsvReader(outputFilename).getFieldNames()
    self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
    self.assertTrue("TEST" not in headers,
                    "TEST was found in output headers when it should have been renamed to i_TEST")
    self.assertTrue("i_TEST" in headers, "i_TEST not found in output headers")
def renderMutations(self, segments, metadata=None, comments=None):
    """Render segments into a gene list as described in the docs for this class.

    :param segments: iterable of MutationData
    :param metadata: unused here beyond defaulting; kept for interface compatibility
    :param comments: list of strings, each written as a "## " header line
    """
    config_parser = ConfigUtils.createConfigParser(self._config_file)
    logging.getLogger(__name__).info("Building alternative keys dictionary...")
    self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

    if metadata is None:
        metadata = OrderedDict()
    if comments is None:
        comments = []

    # TODO: Define constant for "genes", and other annotations
    headers = config_parser.options("alternatives")

    # open() instead of the Python-2-only file() builtin, and a try/finally so
    # the handle is not leaked if rendering raises part-way through.
    fp = open(self._filename, 'w')
    try:
        for c in comments:
            fp.write("## " + c + "\n")

        gene_to_segment_dict = dict()
        field_mapping = None
        segment_count = 0
        for seg in segments:
            segment_count += 1
            if field_mapping is None:
                # Build the header->annotation mapping from the first segment's keys.
                annotations = seg.keys()
                field_mapping = MutUtils.createFieldsMapping(headers, annotations, self._alternativeDictionary,
                                                             isRenderInternalFields=True, prepend="")
            gene_list = seg['genes'].split(",")
            for g in gene_list:
                # Start/end genes get their exon appended so partial overlaps are distinguishable.
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        # Fixed: the original tested the last enumerate index (i == 0), which
        # also fired when exactly one segment had been rendered.
        if segment_count == 0:
            logging.getLogger(__name__).info("No segments given. There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
        writer.writeheader()
        logging.getLogger(__name__).info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logging.getLogger(__name__).info("Rendered %d/%d genes ..." % ((i+1), num_genes))
    finally:
        fp.close()
def _validateTcgaMafContents(self, filename):
    """ This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

    Note: This method has nothing to do with the TCGA validator.
    """
    configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))

    # Hoisted out of the per-column loop below: the original re-read these
    # three config values for every annotation of every mutation line.
    requiredColumns = configFile.get("general", "requiredColumns")
    optionalColumns = configFile.get("general", "optionalColumns")
    exposedColumns = configFile.get("general", "exposedColumns")

    statinfo = os.stat(filename)
    self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

    tsvReader = GenericTsvReader(filename)
    self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

    ctr = 1
    for lineDict in tsvReader:
        # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
        # if lineDict['Entrez_Gene_Id'] == "0":
        #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'. Line: " + str(ctr))

        unknownKeys = []
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"], "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
        self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"], "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))

        uniprot_aa_xform_counter = 0
        for k in lineDict.keys():
            if lineDict[k] == "__UNKNOWN__":
                unknownKeys.append(k)

            self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

            # NOTE(review): substring tests against the comma-joined config strings.
            if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

            # Counted per key: a line with UniProt_AApos == "0" accumulates one
            # count per column, tripping the < 10 assertion below.
            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

        if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
            self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

        if lineDict["Variant_Type"] == VariantClassification.VT_INS:
            self.assertTrue(lineDict["Reference_Allele"] == "-")

        unknownKeys.sort()
        self.assertTrue(len(unknownKeys) == 0, "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
        self.assertTrue(uniprot_aa_xform_counter < 10, "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + "). This is probably an error.")
        ctr += 1
def testCreateDatasource(self):
    """Build an indexed TSV datasource, verify the config contents, then annotate a sample mutation."""
    dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
    # Never specify "out/"
    destDir = "out/create_ds_test/"
    if os.path.exists(destDir):
        shutil.rmtree(destDir)
    os.makedirs(destDir)
    datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    indexColumnNames = "CHROM,POS,POS,REF,ALT"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    configFilename = "out/esp_coverage.config"
    dataSourceType = "indexed_tsv"
    dataSourceName = "ESP"
    dataSourceVersion = "6500SI-V2"
    dataSourceMatchMode = "overlap"
    annotationColumnNames = "DBSNP,EA_GTC,DP"

    datasourceBuilder = TabixIndexedTsvDatasourceCreator()
    datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
                                       dataSourceName, dataSourceVersion, dataSourceMatchMode,
                                       annotationColumnNames,
                                       DatasourceInstallUtils.getIndexCols(dataSourceType, indexColumnNames))

    # The bgzipped file and its tabix index must both be produced.
    self.assertTrue(os.path.exists(destDir + datasourceFilename))
    self.assertTrue(os.path.exists(destDir + datasourceFilename + ".tbi"))

    configParser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(configParser.has_section("general"), "general section is missing.")
    self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")
    self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
    self.assertTrue(configParser.has_option("general", "src_file"), "src_file option is missing in general section.")
    self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
    self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
    self.assertTrue(configParser.has_option("general", "column_names"), "column_names option is missing in general section.")
    self.assertTrue(configParser.has_option("general", "annotation_column_names"), "annotation_column_names option is missing in general section.")
    self.assertTrue(configParser.has_option("general", "match_mode"), "match_mode option is missing in general section")
    self.assertTrue(configParser.has_option("general", "index_column_names"), "index_column_names option is missing in general section.")

    self.assertEqual(configParser.get("general", "type"), dataSourceType,
                     "Expected data source type is %s but was %s." % (dataSourceType, configParser.get("general", "type")))
    self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                     "Expected data source src_file is %s but was %s." % (datasourceFilename, configParser.get("general", "src_file")))
    self.assertEqual(configParser.get("general", "title"), dataSourceName,
                     "Expected data source title is %s but was %s." % (dataSourceName, configParser.get("general", "title")))
    self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                     "Expected data source version is %s but was %s." % (dataSourceVersion, configParser.get("general", "version")))
    self.assertEqual(configParser.get("general", "column_names"), columnNames,
                     "Expected data source column names is %s but was %s." % (columnNames, configParser.get("general", "column_names")))
    self.assertEqual(configParser.get("general", "annotation_column_names"), annotationColumnNames,
                     "Expected data source annotation column names is %s but was %s." % (annotationColumnNames, configParser.get("general", "annotation_column_names")))
    self.assertEqual(configParser.get("general", "match_mode"), dataSourceMatchMode,
                     "Expected data source match mode is %s but was %s." % (dataSourceMatchMode, configParser.get("general", "match_mode")))
    self.assertEqual(configParser.get("general", "index_column_names"), indexColumnNames,
                     "Expected data source index column names is %s but was %s." % (indexColumnNames, configParser.get("general", "index_column_names")))
    self.assertEqual(configParser.get("data_types", "EA_GTC"), "String",
                     "Expected EA_GTC data type is %s but was %s." % ("String", configParser.get("data_types", "EA_GTC")))
    self.assertEqual(configParser.get("data_types", "DP"), "Integer",
                     "Expected DP data type is %s but was %s." % ("Integer", configParser.get("data_types", "DP")))

    ds = DatasourceFactory.createDatasourceFromConfigParser(configParser, "out/create_ds_test/")
    mut = MutationData(chr="1", start="69428", end="69428", ref_allele="T", alt_allele="G")
    mut2 = ds.annotate_mutation(mut)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(mut2["ESP_DBSNP"], "dbSNP_134")
    self.assertEqual(mut2["ESP_EA_GTC"], "92,129,3203")
    self.assertEqual(mut2["ESP_DP"], "110")
def testLocalConfig(self):
    ''' Get a key from a local config and a config file that are the same basic name, but different values. '''
    config = ConfigUtils.createConfigParser("testdata/dummy_configs/dummy.config")
    # assertEqual reports the actual value on failure, unlike assertTrue(a == b).
    self.assertEqual(config.get("general", "dummy1"), "Super")
    self.assertEqual(config.get("general", "dummy2"), "world")
def testCreateDatasource(self):
    """Build an indexed TSV datasource, check the written config, and annotate a sample mutation."""
    ds_file = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
    # Never specify "out/"
    dest_dir = "out/create_ds_test/"
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.makedirs(dest_dir)
    ds_src_file = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    index_column_names = "CHROM,POS,POS,REF,ALT"
    column_names = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    config_filename = "out/esp_coverage.config"
    ds_type = "indexed_tsv"
    ds_name = "ESP"
    ds_version = "6500SI-V2"
    ds_match_mode = "overlap"
    annotation_column_names = "DBSNP,EA_GTC,DP"

    builder = TabixIndexedTsvDatasourceCreator()
    builder.createDatasource(dest_dir, ds_file, index_column_names, config_filename, ds_type,
                             ds_name, ds_version, ds_match_mode, annotation_column_names,
                             DatasourceInstallUtils.getIndexCols(ds_type, index_column_names))

    self.assertTrue(os.path.exists(dest_dir + ds_src_file))
    self.assertTrue(os.path.exists(dest_dir + ds_src_file + ".tbi"))

    parser = ConfigUtils.createConfigParser(config_filename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    self.assertTrue(parser.has_section("data_types"), "data_types section is missing.")

    # Option presence checks; note the match_mode message has no trailing period.
    option_checks = [("type", "type option is missing in general section."),
                     ("src_file", "src_file option is missing in general section."),
                     ("title", "title option is missing in general section."),
                     ("version", "version option is missing in general section."),
                     ("column_names", "column_names option is missing in general section."),
                     ("annotation_column_names", "annotation_column_names option is missing in general section."),
                     ("match_mode", "match_mode option is missing in general section"),
                     ("index_column_names", "index_column_names option is missing in general section.")]
    for option, message in option_checks:
        self.assertTrue(parser.has_option("general", option), message)

    value_checks = [("type", ds_type, "Expected data source type is %s but was %s."),
                    ("src_file", ds_src_file, "Expected data source src_file is %s but was %s."),
                    ("title", ds_name, "Expected data source title is %s but was %s."),
                    ("version", ds_version, "Expected data source version is %s but was %s."),
                    ("column_names", column_names, "Expected data source column names is %s but was %s."),
                    ("annotation_column_names", annotation_column_names,
                     "Expected data source annotation column names is %s but was %s."),
                    ("match_mode", ds_match_mode, "Expected data source match mode is %s but was %s."),
                    ("index_column_names", index_column_names,
                     "Expected data source index column names is %s but was %s.")]
    for option, expected, message in value_checks:
        actual = parser.get("general", option)
        self.assertEqual(actual, expected, message % (expected, actual))

    for column, expected_type in (("EA_GTC", "String"), ("DP", "Integer")):
        actual_type = parser.get("data_types", column)
        self.assertEqual(actual_type, expected_type,
                         "Expected %s data type is %s but was %s." % (column, expected_type, actual_type))

    ds = DatasourceFactory.createDatasourceFromConfigParser(parser, "out/create_ds_test/")
    mutation = MutationData(chr="1", start="69428", end="69428", ref_allele="T", alt_allele="G")
    annotated = ds.annotate_mutation(mutation)
    self.assertEquals(annotated["ESP_DBSNP"], "dbSNP_134")
    self.assertEquals(annotated["ESP_EA_GTC"], "92,129,3203")
    self.assertEquals(annotated["ESP_DP"], "110")
def testCreateConfigFile(self):
    """Verify the config file written for an indexed TSV coverage datasource."""
    config_filename = os.path.join("out", "esp_coverage.config")
    ds_src_file = "ESP6500SI-V2.coverage.txt.gz"
    ds_type = "indexed_tsv"
    ds_name = "ESP"
    ds_version = "6500SI-V2"
    ds_match_mode = "overlap"
    index_column_names = "Chromosome,Position,Position"
    column_names = "Chromosome,Position,TotalSamplesCovered,AvgSampleReadDepth,TotalEAsamplesCovered,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"
    annotation_column_names = "TotalSamplesCovered,AvgSampleReadDepth,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"

    builder = TabixIndexedTsvDatasourceCreator()
    builder._createConfigFile(configFilename=config_filename, baseDSFile=ds_src_file,
                              ds_type=ds_type, ds_name=ds_name, ds_version=ds_version,
                              column_names=column_names,
                              annotation_column_names=annotation_column_names,
                              ds_match_mode=ds_match_mode,
                              indexCols=DatasourceInstallUtils.getIndexCols(ds_type, index_column_names))

    parser = ConfigUtils.createConfigParser(config_filename)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    self.assertTrue(parser.has_section("data_types"), "data_types section is missing.")

    # Option presence checks; note the match_mode message has no trailing period.
    option_checks = [("type", "type option is missing in general section."),
                     ("src_file", "src_file option is missing in general section."),
                     ("title", "title option is missing in general section."),
                     ("version", "version option is missing in general section."),
                     ("column_names", "column_names option is missing in general section."),
                     ("annotation_column_names", "annotation_column_names option is missing in general section."),
                     ("match_mode", "match_mode option is missing in general section"),
                     ("index_column_names", "index_column_names option is missing in general section.")]
    for option, message in option_checks:
        self.assertTrue(parser.has_option("general", option), message)

    value_checks = [("type", ds_type, "Expected data source type is %s but was %s."),
                    ("src_file", ds_src_file, "Expected data source src_file is %s but was %s."),
                    ("title", ds_name, "Expected data source title is %s but was %s."),
                    ("version", ds_version, "Expected data source version is %s but was %s."),
                    ("column_names", column_names, "Expected data source column names is %s but was %s."),
                    ("annotation_column_names", annotation_column_names,
                     "Expected data source annotation column names is %s but was %s."),
                    ("match_mode", ds_match_mode, "Expected data source match mode is %s but was %s."),
                    ("index_column_names", index_column_names,
                     "Expected data source index column names is %s but was %s.")]
    for option, expected, message in value_checks:
        actual = parser.get("general", option)
        self.assertEqual(actual, expected, message % (expected, actual))
def testCreateDatasource(self):
    """Build an indexed_tsv datasource end-to-end and verify the generated config.

    Confirms the [general] and [data_types] sections exist, every expected
    option is present with the value passed in, and the inferred data types
    for the annotation columns are correct.
    """
    dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
    destDir = "out"
    datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    indexColumnNames = "CHROM,POS,POS,REF,ALT"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    configFilename = "out/esp_coverage.config"
    dataSourceType = "indexed_tsv"
    dataSourceName = "ESP"
    dataSourceVersion = "6500SI-V2"
    dataSourceMatchMode = "overlap"
    annotationColumnNames = "DBSNP,EA_GTC,DP"

    datasourceBuilder = TabixIndexedTsvDatasourceCreator()
    datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename,
                                       dataSourceType, dataSourceName, dataSourceVersion,
                                       dataSourceMatchMode, annotationColumnNames,
                                       DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                                           indexColumnNames))

    configParser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(configParser.has_section("general"), "general section is missing.")
    self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")

    # Presence checks, table-driven; exact original failure messages are kept
    # (note: the match_mode message historically has no trailing period).
    presence_checks = [
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("column_names", "column_names option is missing in general section."),
        ("annotation_column_names", "annotation_column_names option is missing in general section."),
        ("match_mode", "match_mode option is missing in general section"),
        ("index_column_names", "index_column_names option is missing in general section."),
    ]
    for option, message in presence_checks:
        self.assertTrue(configParser.has_option("general", option), message)

    # Value checks: (option, expected value, label used in the failure message).
    expected_general = [
        ("type", dataSourceType, "type"),
        ("src_file", datasourceFilename, "src_file"),
        ("title", dataSourceName, "title"),
        ("version", dataSourceVersion, "version"),
        ("column_names", columnNames, "column names"),
        ("annotation_column_names", annotationColumnNames, "annotation column names"),
        ("match_mode", dataSourceMatchMode, "match mode"),
        ("index_column_names", indexColumnNames, "index column names"),
    ]
    for option, expected, label in expected_general:
        actual = configParser.get("general", option)
        self.assertEqual(actual, expected,
                         "Expected data source %s is %s but was %s." % (label, expected, actual))

    # Data types inferred for the annotation columns.
    for column, expected_type in (("EA_GTC", "String"), ("DP", "Integer")):
        actual_type = configParser.get("data_types", column)
        self.assertEqual(actual_type, expected_type,
                         "Expected %s data type is %s but was %s." % (column, expected_type, actual_type))
def renderMutations(self, segments, metadata=None, comments=None):
    """Render segments into a gene list as described in the docs for this class.

    Writes a tab-separated file (``self._filename``) with one row per gene
    seen across the given segments.  Genes matching a segment's start/end
    gene are keyed together with the corresponding exon so boundary genes
    remain distinguishable.

    :param segments: iterable of MutationData; each must provide the
        'genes', 'start_gene', 'start_exon', 'end_gene', 'end_exon' keys
    :param metadata: optional; defaulted to an empty OrderedDict (currently
        not otherwise used in this method)
    :param comments: optional list of strings, each written as a "## " header line
    """
    config_parser = ConfigUtils.createConfigParser(self._config_file)
    logging.getLogger(__name__).info("Building alternative keys dictionary...")
    self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)
    if metadata is None:
        metadata = OrderedDict()
    if comments is None:
        comments = []
    # BUG FIX: replaced "fp = file(...)" (deprecated builtin, never closed on
    # an exception) with a context manager so the output file is always closed.
    with open(self._filename, 'w') as fp:
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")

        gene_to_segment_dict = dict()
        annotations = None
        field_mapping = None  # holds the mapping from the last segment processed
        num_segments = 0
        for seg in segments:
            num_segments += 1
            if annotations is None:
                annotations = seg.keys()
            field_mapping = FieldMapCreator.create_field_map(headers, seg,
                                                             self._alternativeDictionary,
                                                             is_render_internal_fields=True,
                                                             prepend="")
            gene_list = seg['genes'].split(",")
            for g in gene_list:
                # Boundary genes are keyed with their exon so a gene only
                # partially covered at a segment edge stays distinguishable.
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        # BUG FIX: the original checked "i == 0" after an enumerate loop, which
        # also fired when exactly one segment was rendered; warn only when no
        # segments were seen at all.
        if num_segments == 0:
            logging.getLogger(__name__).info(
                "No segments given. There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n",
                                extrasaction="ignore")
        writer.writeheader()
        logging.getLogger(__name__).info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                # Map each output header to its annotation field; fall back to
                # the header itself when no alternative mapping exists.
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logging.getLogger(__name__).info("Rendered %d/%d genes ..."
                                                 % ((i + 1), num_genes))
def testCreateIndexedTsvDatasource(self):
    """End-to-end check of installing an indexed_tsv datasource.

    Installs the datasource into a temp dir, verifies the generated config's
    contents and the md5 file, then annotates a mutation and checks which
    annotations were (and were not) applied.
    """
    datasourceFilename = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
    datasourceFoldername = "1000Genomes"
    datasourceName = "1000Genomes"
    datasourceType = "indexed_tsv"
    datasourceVersion = "V4.1"
    genomeBuild = "hg19"
    indexColumnNames = "CHROM,POS,POS"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    annotationColumnNames = "DBSNP,EA_AC,AA_AC,TAC"

    tmpDir = tempfile.mkdtemp()
    destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
    os.makedirs(destDir)
    DatasourceInstallUtils.create_datasource(destDir=destDir,
                                             ds_file=datasourceFilename,
                                             ds_foldername=datasourceFoldername,
                                             ds_name=datasourceName,
                                             ds_type=datasourceType,
                                             ds_version=datasourceVersion,
                                             index_columns=indexColumnNames,
                                             ds_annotation_columns=annotationColumnNames)

    # After installation the src_file should point at the tabix-indexed copy.
    datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    configFilename = os.path.join(*[destDir, "1000Genomes.config"])
    configParser = ConfigUtils.createConfigParser(configFilename)

    self.assertTrue(configParser.has_section("general"), "general section is missing.")
    for option in ("type", "src_file", "title", "version", "column_names",
                   "annotation_column_names"):
        self.assertTrue(configParser.has_option("general", option),
                        "%s option is missing in general section." % option)

    # Value checks: (option, expected value, label used in the failure message).
    expected_general = [
        ("type", datasourceType, "type"),
        ("src_file", datasourceFilename, "src_file"),
        ("title", datasourceName, "title"),
        ("version", datasourceVersion, "version"),
        ("column_names", columnNames, "column names"),
        ("annotation_column_names", annotationColumnNames, "annotation column names"),
    ]
    for option, expected, label in expected_general:
        actual = configParser.get("general", option)
        self.assertEqual(actual, expected,
                         "Expected data source %s is %s but was %s." % (label, expected, actual))

    self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername,
                                                  genomeBuild + ".md5"])),
                    "No md5 file was generated.")

    # Annotate a known position and check the applied annotations.
    datasource = DatasourceFactory.createDatasource(configFilename, destDir)
    m1 = MutationDataFactory.default_create()
    m1.chr = "1"
    m1.start = "802177"
    m1.end = "802177"
    m1.ref_allele = "T"
    m1.alt_allele = "C"
    m1_annotated = datasource.annotate_mutation(m1)

    m1_annotation = m1_annotated.getAnnotation("1000Genomes_AA_AC")
    cur_annotation = Annotation(value="2,866", datasourceName="1000Genomes",
                                dataType="String", description="",
                                tags=[TagConstants.INFO, TagConstants.NOT_SPLIT],
                                number=None)
    self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

    # Index columns (and non-annotation columns) must not leak into the mutation.
    for annotationName in ("1000Genomes_CHROM", "1000Genomes_POS", "1000Genomes_REF",
                           "1000Genomes_ALT", "1000Genomes_GWAS_PUBMED"):
        self.assertTrue(annotationName not in m1_annotated,
                        "m1_annotated was annotated with %s." % annotationName)

    # The requested annotation columns must all be present.
    for annotationName in ("1000Genomes_DBSNP", "1000Genomes_EA_AC",
                           "1000Genomes_AA_AC", "1000Genomes_TAC"):
        self.assertTrue(annotationName in m1_annotated,
                        "m1_annotated was not annotated with %s value." % annotationName)

    MutUtils.removeDir(tmpDir)