コード例 #1
0
    def createConfigTableKeys(self, configParser, configTable):
        """
        Register the field IDs described in the config file with configTable.

        Four sections are read, in this order: INFO, FORMAT, NOT_SPLIT_TAGS and SPLIT_TAGS.

        :param configParser: parsed config, handed to ConfigUtils to build each lookup table
        :param configTable: receives the entries through its add* methods
        """
        # INFO section: every (ID, name) pair becomes an info field ID.
        infoTable = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
        for infoID, infoName in infoTable.items():
            configTable.addInfoFieldID(infoID, infoName)

        # FORMAT section: every (ID, name) pair becomes a format field ID.
        formatTable = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
        for formatID, formatName in formatTable.items():
            configTable.addFormatFieldID(formatID, formatName)

        # NOT_SPLIT_TAGS section: per field type, the IDs added to the not-split set.
        notSplitTable = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
        for notSplitType, notSplitIDs in notSplitTable.items():
            configTable.addFieldIDsToNotSplitSet(notSplitType, notSplitIDs)

        # SPLIT_TAGS section: per field type, the IDs added to the split set.
        splitTable = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
        for splitType, splitIDs in splitTable.items():
            configTable.addFieldIDsToSplitSet(splitType, splitIDs)
コード例 #2
0
 def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
     """
     Store the output filename and load the config for this instance.

     :param filename: stored on the instance as _filename
     :param configFile: config file handed to ConfigUtils.createConfigParser
     :param otherOptions: accepted but not used by this constructor
     """
     self.logger = logging.getLogger(__name__)
     self._filename = filename

     parsedConfig = ConfigUtils.createConfigParser(configFile)
     self.config = parsedConfig
     self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(parsedConfig)

     # Both containers start out empty.
     self.fieldMap = {}
     self.seenDbSNPs = {}
コード例 #3
0
    def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
        """
        Set up the instance state from the output config and the optional flags.

        :param filename: stored on the instance as _filename
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param other_options: mapping looked up with OptionConstants keys; treated as empty when None
        """
        opts = other_options if other_options is not None else dict()

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        self.options = opts

        # The configured prepend is always read; NO_PREPEND replaces it with an empty string.
        configuredPrepend = self.config.get("general", "prepend")
        self._prepend = "" if opts.get(OptionConstants.NO_PREPEND, False) else configuredPrepend

        # _is_reannotating decides whether annotations that did not come from the INPUT
        #   are given precedence.
        self._is_reannotating = opts.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

        # Allelic depth splitting is on unless the option says otherwise.
        self._is_splitting_allelic_depth = opts.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

        exposedColumnNames = self.config.get("general", "exposedColumns").split(',')
        self.exposedColumns = set(exposedColumnNames)

        self._is_entrez_id_message_logged = False

        # The collapser and its suffix exist only when number-column collapsing is requested.
        self._is_collapsing_number_cols = opts.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
        if self._is_collapsing_number_cols:
            self._column_collapser = ColumnCollapser()
            self._column_collapser_suffix = "_full"
        else:
            self._column_collapser = None
            self._column_collapser_suffix = None
コード例 #4
0
    def __init__(self,
                 filename,
                 configFile="tcgaMAF2.4_output.config",
                 other_options=None):
        """
        Set up the instance state from the output config and the optional flags.

        TODO: Need functionality for not prepending the i_ on internal fields.

        :param filename: stored on the instance as _filename
        :param configFile: config file handed to ConfigUtils.createConfigParser
        :param other_options: mapping looked up with OptionConstants keys; treated as empty when None
        """
        opts = other_options if other_options is not None else dict()

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        #TODO: Read missing options from the config file or specify that error should be thrown.
        self.options = opts

        # The configured prepend is always read; NO_PREPEND replaces it with an empty string.
        configuredPrepend = self.config.get("general", "prepend")
        self._prepend = "" if opts.get(OptionConstants.NO_PREPEND, False) else configuredPrepend

        exposedColumnNames = self.config.get("general", "exposedColumns").split(',')
        self.exposedColumns = set(exposedColumnNames)

        self._is_entrez_id_message_logged = False
コード例 #5
0
 def __init__(self,
              mut,
              configFile="sample_name_selection.config",
              section="SAMPLE_NAME"):
     """
     Work out, from the given mutation, how the sample name is obtained.

     :param mut: passed to _getAnnotationFromAliases for each alias list
     :param configFile: config file handed to ConfigUtils.createConfigParser; also kept as configFile
     :param section: config section the alias dictionary is built from
     """
     config = ConfigUtils.createConfigParser(configFile)
     self.logger = logging.getLogger(__name__)
     aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)
     self.configFile = configFile

     # Resolve the three annotations in a fixed order: sample, tumor, normal.
     aliasKeys = ("sample_name", "sample_tumor_name", "sample_normal_name")
     sampleAnnotation, tumorAnnotation, normalAnnotation = [
         self._getAnnotationFromAliases(mut, aliases[aliasKey])
         for aliasKey in aliasKeys
     ]

     source_column = self._getSourceColumn(sampleAnnotation, tumorAnnotation, normalAnnotation)
     self._logSampleNameColumnDescription(
         source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)

     self.sampleNameGrabber = self._getSampleNameGrabber(
         source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)
     self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
     self.annotationSource = self._deriveAnnotationSource(source_column)
コード例 #6
0
    def __init__(self,
                 filename,
                 mutation_data_factory=None,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Open a maflite file and check that its header has the required columns.

        :param filename: maflite file, read through GenericTsvReader
        :param mutation_data_factory: forwarded unchanged to the parent constructor
        :param configFile: config providing the required headers and their alternative names
        :param genomeBuild: stored on the instance as _build
        :param other_options: forwarded unchanged to the parent constructor
        :raises MafliteMissingRequiredHeaderException: when a required header other than "build"
            appears neither under its own name nor under any configured alternative
        """
        super(MafliteInputMutationCreator, self).__init__(
            filename, mutation_data_factory, configFile, genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # _alternativeDict: required column name -> list of valid alternative headers.
        # _reverseAlternativeDict is built from it by ConfigUtils.
        alternatives = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._alternativeDict = alternatives
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(alternatives)

        requiredHeaderSetting = self.config.get("general", "required_headers")
        required_columns = sorted(requiredHeaderSetting.split(","))
        self._build = genomeBuild

        self.logger.info("Initializing a maflite file with the following header: " +
                         str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()
        given = self._specified_fields

        # A required column is missing when neither it nor any of its alternatives was
        # given.  "build" is optional and therefore never reported.
        missingRequiredHeaders = sorted(
            col for col in required_columns
            if col != "build"
            and col not in given
            and not any(alt in given for alt in alternatives.get(col, [])))

        if missingRequiredHeaders:
            message = ("Specified maflite file (" + filename +
                       ") missing required headers: " +
                       ",".join(missingRequiredHeaders))
            raise MafliteMissingRequiredHeaderException(message)
コード例 #7
0
 def __init__(self,
              filename,
              configFile="tcgaVCF1.1_output.config",
              otherOptions=None):
     """
     Store the output filename and load the config for this instance.

     :param filename: stored on the instance as _filename
     :param configFile: config file handed to ConfigUtils.createConfigParser
     :param otherOptions: accepted but not used by this constructor
     """
     self.logger = logging.getLogger(__name__)
     self._filename = filename

     loadedConfig = ConfigUtils.createConfigParser(configFile)
     self.config = loadedConfig
     self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(loadedConfig)

     # Both containers start out empty.
     self.fieldMap = {}
     self.seenDbSNPs = {}
コード例 #8
0
 def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
     """
     Work out, from the given mutation, how the sample name is obtained.

     :param mut: passed to _getAnnotationFromAliases for each alias list
     :param configFile: config file handed to ConfigUtils.createConfigParser; also kept as configFile
     :param section: config section the alias dictionary is built from
     """
     config = ConfigUtils.createConfigParser(configFile)
     self.logger = logging.getLogger(__name__)
     aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)
     self.configFile = configFile

     # Resolve the three annotations in a fixed order: sample, tumor, normal.
     sampleAnnotation, tumorAnnotation, normalAnnotation = [
         self._getAnnotationFromAliases(mut, aliases[aliasKey])
         for aliasKey in ("sample_name", "sample_tumor_name", "sample_normal_name")
     ]
     annotations = (sampleAnnotation, tumorAnnotation, normalAnnotation)

     source_column = self._getSourceColumn(*annotations)
     self._logSampleNameColumnDescription(source_column, *annotations)
     self.sampleNameGrabber = self._getSampleNameGrabber(source_column, *annotations)
     self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
     self.annotationSource = self._deriveAnnotationSource(source_column)
コード例 #9
0
    def __init__(self,
                 filename,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Open a maflite file and check that its header has the required columns.

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        :param filename: maflite file, read through GenericTsvReader
        :param configFile: config providing the required headers and their alternative names
        :param genomeBuild: stored on the instance as _build
        :param other_options: ignored
        :raises MafliteMissingRequiredHeaderException: when a required header other than "build"
            appears neither under its own name nor under any configured alternative
        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # _alternativeDict: required column name -> list of valid alternative headers.
        # _reverseAlternativeDict is built from it by ConfigUtils.
        alternatives = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._alternativeDict = alternatives
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(alternatives)

        specifiedFields = self._tsvReader.getFieldNames()
        requiredHeaderSetting = self.config.get("general", "required_headers")
        required_columns = sorted(requiredHeaderSetting.split(","))
        self._build = genomeBuild

        # A required column is missing when neither it nor any of its alternatives was
        # given.  "build" is optional and therefore never reported.
        missingRequiredHeaders = sorted(
            col for col in required_columns
            if col != "build"
            and col not in specifiedFields
            and not any(alt in specifiedFields for alt in alternatives.get(col, [])))

        self.logger.info("Initializing a maflite file with the following header: " +
                         str(self._tsvReader.getFieldNames()))
        if missingRequiredHeaders:
            message = ("Specified maflite file (" + filename +
                       ") missing required headers: " +
                       ",".join(missingRequiredHeaders))
            raise MafliteMissingRequiredHeaderException(message)
コード例 #10
0
ファイル: ConfigUtilTest.py プロジェクト: xingtech/oncotator
 def testLocate(self):
     ''' Call the locate command on a config file and make sure the location is proper. '''
     fp = ConfigUtils._locateConfigFile("testdata/dummy_configs/dummy.config", isRelaxedLogging=True)
     config = SafeConfigParser()
     try:
         config.readfp(fp)
     finally:
         # Release the handle even when parsing fails.
         # NOTE(review): assumes the object returned by _locateConfigFile supports close() -- confirm.
         fp.close()

     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(config.get("general", "dummy1"), "Hello")
     self.assertEqual(config.get("general", "dummy2"), "world")
コード例 #11
0
    def testCreateDatasourceWithMissingValues(self):
        """
        Create a tabix-indexed TSV datasource from an input file with missing values and
        check the src_file and data type entries of the generated config.
        """
        destDir = "out"
        dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.missing.txt")
        configFilename = os.path.join("out", "esp_coverage.missing.config")
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.missing.tabix_indexed.txt.gz"

        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        indexColumnNames = "CHROM,POS,POS"
        annotationColumnNames = "EA_GTC,DP"

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols(dataSourceType, indexColumnNames)
        datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename,
                                           dataSourceType, dataSourceName, dataSourceVersion,
                                           dataSourceMatchMode, annotationColumnNames, indexCols)

        configParser = ConfigUtils.createConfigParser(configFilename)

        # (section, option, expected value, failure message template)
        expectations = (
            ("general", "src_file", datasourceFilename,
             "Expected data source src_file is %s but was %s."),
            ("data_types", "EA_GTC", "Float",
             "Expected EA_GTC data type is %s but was %s."),
            ("data_types", "DP", "Integer",
             "Expected DP data type is %s but was %s."),
        )
        for section, option, wanted, template in expectations:
            self.assertEqual(configParser.get(section, option), wanted,
                             template % (wanted, configParser.get(section, option)))
コード例 #12
0
    def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Open a maflite file and check that its header has the required columns.

        :param filename: maflite file, read through GenericTsvReader
        :param mutation_data_factory: forwarded unchanged to the parent constructor
        :param configFile: config providing the required headers and their alternative names
        :param genomeBuild: stored on the instance as _build
        :param other_options: forwarded unchanged to the parent constructor
        :raises MafliteMissingRequiredHeaderException: when a required header other than "build"
            appears neither under its own name nor under any configured alternative
        """
        parent = super(MafliteInputMutationCreator, self)
        parent.__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # _alternativeDict: required column name -> list of valid alternative headers.
        # _reverseAlternativeDict is built from it by ConfigUtils.
        alternatives = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._alternativeDict = alternatives
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(alternatives)

        requiredHeaderSetting = self.config.get("general", "required_headers")
        required_columns = sorted(requiredHeaderSetting.split(","))
        self._build = genomeBuild

        headerDescription = str(self._tsvReader.getFieldNames())
        self.logger.info("Initializing a maflite file with the following header: " + headerDescription)

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()
        given = self._specified_fields

        # A required column is missing when neither it nor any of its alternatives was
        # given.  "build" is optional and therefore never reported.
        missingRequiredHeaders = sorted(
            col for col in required_columns
            if col != "build"
            and col not in given
            and not any(alt in given for alt in alternatives.get(col, [])))

        if missingRequiredHeaders:
            missingList = ",".join(missingRequiredHeaders)
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename + ") missing required headers: " + missingList)
コード例 #13
0
    def testCreateGPTsvConfigFile(self):
        """Write a gp_tsv config file and check the options stored in its general section."""
        configFilename = "out/ccle_by_gp.config"
        datasourceFilename = "ccle_results_by_pos.hg19.import.txt"
        dataSourceType = "gp_tsv"
        dataSourceName = "CCLE_By_GP"
        dataSourceVersion = "09292010"
        genomicPositionColumnNames = "chr,start,end"

        datasourceBuilder = GenericTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols("gp_tsv", genomicPositionColumnNames)
        datasourceBuilder._createConfigFile(configFilename=configFilename,
                                            baseDSFile=datasourceFilename,
                                            ds_name=dataSourceName,
                                            ds_type=dataSourceType,
                                            ds_version=dataSourceVersion,
                                            indexCols=indexCols)

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        # Every option must exist before its value is compared.
        presenceChecks = (
            ("type", "type option is missing in general section."),
            ("src_file", "src_file option is missing in general section."),
            ("title", "title option is missing in general section."),
            ("version", "version option is missing in general section."),
            ("genomic_position_cols", "genomic_position_cols option is missing in general section."),
        )
        for option, failureMessage in presenceChecks:
            self.assertTrue(configParser.has_option("general", option), failureMessage)

        # (option, expected value, failure message template)
        valueChecks = (
            ("type", dataSourceType,
             "Expected data source type is %s but was %s."),
            ("src_file", datasourceFilename,
             "Expected data source src_file is %s but was %s."),
            ("title", dataSourceName,
             "Expected data source title is %s but was %s."),
            ("version", dataSourceVersion,
             "Expected data source version is %s but was %s."),
            ("genomic_position_cols", genomicPositionColumnNames,
             "Expected data source genomic_position_cols is %s but was %s."),
        )
        for option, wanted, template in valueChecks:
            self.assertEqual(configParser.get("general", option), wanted,
                             template % (wanted, configParser.get("general", option)))
コード例 #14
0
    def createConfigTableKeys(self, configParser, configTable):
        """
        Register field names, descriptions and split settings from the config with configTable.

        Sections are read in this order: INFO, FORMAT, OTHER, INFO_DESCRIPTION,
        FORMAT_DESCRIPTION, FILTER_DESCRIPTION, SPLIT_TAGS, NOT_SPLIT_TAGS.

        :param configParser: parsed config, handed to ConfigUtils to build each lookup table
        :param configTable: receives the entries through its add* methods
        """
        # Parse fields from INFO section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(
            configParser, "INFO")
        for name, ID in table.items():
            configTable.addInfoFieldName(name, ID)

        # Parse fields from FORMAT section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(
            configParser, "FORMAT")
        for name, ID in table.items():
            configTable.addFormatFieldName(name, ID)

        # Parse fields from OTHER section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(
            configParser, "OTHER")
        for name, ID in table.items():
            configTable.addOtherFieldName(name, ID)

        # Each description arrives as a sequence of pieces and is stored as one
        # comma-joined string.  str.join replaces the deprecated string.join().
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "INFO_DESCRIPTION")
        for name, description in table.items():
            configTable.addInfoFieldNameDescription(name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "FORMAT_DESCRIPTION")
        for name, description in table.items():
            configTable.addFormatFieldNameDescription(name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "FILTER_DESCRIPTION")
        for name, description in table.items():
            configTable.addFilterFieldNameDescription(name, ",".join(description))

        # Parse fields from SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToSplitSet(fieldType, names)

        # Parse fields from NOT_SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            configParser, "NOT_SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToNotSplitSet(fieldType, names)
コード例 #15
0
    def test_intitialize(self):
        """Test a simple initialization of an ensembl datasource """
        base_config_location = "testdata/ensembl/saccer/"
        config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
        title = config_parser.get("general", "title")
        version = config_parser.get("general", "version")
        src_file = config_parser.get("general", "src_file")

        ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)
        self.assertIsNotNone(ensembl_ds)

        # The transcript mode that was set must be the one read back.  assertEqual
        # reports both values on failure, unlike assertTrue(a == b).
        ensembl_ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        self.assertEqual(TranscriptProvider.TX_MODE_BEST_EFFECT, ensembl_ds.get_tx_mode())
    def test_intitialize(self):
        """Test a simple initialization of an ensembl datasource """
        base_config_location = "testdata/ensembl/saccer/"
        config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
        title = config_parser.get("general", "title")
        version = config_parser.get("general", "version")
        src_file = config_parser.get("general", "src_file")

        ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)
        self.assertIsNotNone(ensembl_ds)

        # The transcript mode that was set must be the one read back.  assertEqual
        # reports both values on failure, unlike assertTrue(a == b).
        ensembl_ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        self.assertEqual(TranscriptProvider.TX_MODE_BEST_EFFECT, ensembl_ds.get_tx_mode())
コード例 #17
0
    def getGeneTsvConfigFile(self):
        """Write a gene_tsv config file and check the options stored in its general section."""
        configFilename = "out/simple_uniprot.config"
        datasourceFilename = "simple_uniprot.out.2011_09.tsv"
        dataSourceType = "gene_tsv"
        dataSourceName = "UniProt"
        dataSourceVersion = "2011_09"
        geneColumnName = "gene"

        datasourceBuilder = GenericTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumnName)
        datasourceBuilder._createConfigFile(configFilename=configFilename,
                                            baseDSFile=datasourceFilename,
                                            ds_name=dataSourceName,
                                            ds_type=dataSourceType,
                                            ds_version=dataSourceVersion,
                                            indexCols=indexCols)

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        # Every option must exist before its value is compared.
        presenceChecks = (
            ("type", "type option is missing in general section."),
            ("src_file", "src_file option is missing in general section."),
            ("title", "title option is missing in general section."),
            ("version", "version option is missing in general section."),
            ("gene_col", "gene_col option is missing in general section."),
        )
        for option, failureMessage in presenceChecks:
            self.assertTrue(configParser.has_option("general", option), failureMessage)

        # (option, expected value, failure message template)
        valueChecks = (
            ("type", dataSourceType,
             "Expected data source type is %s but was %s."),
            ("src_file", datasourceFilename,
             "Expected data source src_file is %s but was %s."),
            ("title", dataSourceName,
             "Expected data source title is %s but was %s."),
            ("version", dataSourceVersion,
             "Expected data source version is %s but was %s."),
            ("gene_col", geneColumnName,
             "Expected data source gene_col is %s but was %s."),
        )
        for option, wanted, template in valueChecks:
            self.assertEqual(configParser.get("general", option), wanted,
                             template % (wanted, configParser.get("general", option)))
コード例 #18
0
    def testCreateGPTsvDatasource(self):
        """
        Create a gp_tsv datasource in a temporary directory and check the generated config
        file and md5 file.

        The temporary directory is removed even when an assertion fails.
        """
        datasourceFilename = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
        datasourceType = "gp_tsv"
        datasourceName = "ORegAnno"
        datasourceFoldername = "ORegAnno"
        datasourceVersion = "UCSC Track"
        genomeBuild = "hg19"
        genomicPositionColumnNames = "hg19.oreganno.chrom,hg19.oreganno.chromStart,hg19.oreganno.chromEnd"

        tmpDir = tempfile.mkdtemp()
        try:
            destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
            os.makedirs(destDir)

            DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername, datasourceName,
                                                     datasourceType, datasourceVersion, genomicPositionColumnNames)

            # The config is compared against the base name of the source file.
            datasourceFilename = "oreganno_trim.hg19.txt"
            configFilename = os.path.join(destDir, "ORegAnno.config")
            configParser = ConfigUtils.createConfigParser(configFilename)
            self.assertTrue(configParser.has_section("general"), "general section is missing.")
            self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "src_file"),
                            "src_file option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "version"),
                            "version option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "genomic_position_cols"),
                            "genomic_position_cols option is missing in general section.")

            self.assertEqual(configParser.get("general", "type"), datasourceType,
                             "Expected data source type is %s but was %s."
                             % (datasourceType, configParser.get("general", "type")))
            self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                             "Expected data source src_file is %s but was %s."
                             % (datasourceFilename, configParser.get("general", "src_file")))
            self.assertEqual(configParser.get("general", "title"), datasourceName,
                             "Expected data source title is %s but was %s."
                             % (datasourceName, configParser.get("general", "title")))
            self.assertEqual(configParser.get("general", "version"), datasourceVersion,
                             "Expected data source version is %s but was %s."
                             % (datasourceVersion, configParser.get("general", "version")))
            self.assertEqual(configParser.get("general", "genomic_position_cols"), genomicPositionColumnNames,
                             "Expected data source genomic_position_cols is %s but was %s."
                             % (genomicPositionColumnNames, configParser.get("general", "genomic_position_cols")))

            self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                            "No md5 file was generated.")
        finally:
            # Clean up on both success and failure so failed runs do not leave temp dirs behind.
            MutUtils.removeDir(tmpDir)
コード例 #19
0
    def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Open a maflite file and check that its header has the required columns.

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        :param filename: maflite file, read through GenericTsvReader
        :param configFile: config providing the required headers and their alternative names
        :param genomeBuild: stored on the instance as _build
        :param other_options: ignored
        :raises MafliteMissingRequiredHeaderException: when a required header other than "build"
            appears neither under its own name nor under any configured alternative
        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # _alternativeDict: required column name -> list of valid alternative headers.
        # _reverseAlternativeDict is built from it by ConfigUtils.
        alternatives = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._alternativeDict = alternatives
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(alternatives)

        specifiedFields = self._tsvReader.getFieldNames()
        requiredHeaderSetting = self.config.get("general", "required_headers")
        required_columns = sorted(requiredHeaderSetting.split(","))
        self._build = genomeBuild

        # A required column is missing when neither it nor any of its alternatives was
        # given.  "build" is optional and therefore never reported.
        missingRequiredHeaders = sorted(
            col for col in required_columns
            if col != "build"
            and col not in specifiedFields
            and not any(alt in specifiedFields for alt in alternatives.get(col, [])))

        headerDescription = str(self._tsvReader.getFieldNames())
        self.logger.info("Initializing a maflite file with the following header: " + headerDescription)
        if missingRequiredHeaders:
            missingList = ",".join(missingRequiredHeaders)
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename + ") missing required headers: " + missingList)
コード例 #20
0
    def testCreateIndexedVcfDatasource(self):
        """
        Create an indexed_vcf datasource in a temporary directory and check the generated
        config file, md5 file and tabix-indexed VCF.

        The temporary directory is removed even when an assertion fails.
        """
        datasourceFilename = "testdata/vcf/example.vcf"
        datasourceFoldername = "1000Genomes"
        datasourceName = "1000Genomes"
        datasourceType = "indexed_vcf"
        datasourceVersion = "V4.1"
        genomeBuild = "hg19"

        tmpDir = tempfile.mkdtemp()
        try:
            destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
            os.makedirs(destDir)

            DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername, datasourceName,
                                                     datasourceType, datasourceVersion)

            # The config is compared against the name of the indexed output file.
            datasourceFilename = "example.tabix_indexed.vcf.gz"
            configFilename = os.path.join(destDir, "1000Genomes.config")
            configParser = ConfigUtils.createConfigParser(configFilename)
            self.assertTrue(configParser.has_section("general"), "general section is missing.")
            self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "src_file"),
                            "src_file option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
            self.assertTrue(configParser.has_option("general", "version"),
                            "version option is missing in general section.")

            self.assertEqual(configParser.get("general", "type"), datasourceType,
                             "Expected data source type is %s but was %s."
                             % (datasourceType, configParser.get("general", "type")))
            self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                             "Expected data source src_file is %s but was %s."
                             % (datasourceFilename, configParser.get("general", "src_file")))
            self.assertEqual(configParser.get("general", "title"), datasourceName,
                             "Expected data source title is %s but was %s."
                             % (datasourceName, configParser.get("general", "title")))
            self.assertEqual(configParser.get("general", "version"), datasourceVersion,
                             "Expected data source version is %s but was %s."
                             % (datasourceVersion, configParser.get("general", "version")))

            self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                            "No md5 file was generated.")

            # Data source was created correctly
            tabixIndexedFilename = os.path.join(destDir, "example.tabix_indexed.vcf.gz")
            self.assertTrue(os.path.exists(tabixIndexedFilename), "No index file was generated.")

            vcfReader = vcf.Reader(filename=tabixIndexedFilename, compressed=True, strict_whitespace=True)
            vcfRecords = vcfReader.fetch(chrom=20, start=1230237, end=1230237)
            for vcfRecord in vcfRecords:
                self.assertEqual(vcfRecord.INFO["NS"], 3, "Expected %s but got %s." % (3, vcfRecord.INFO["NS"]))
                self.assertEqual(vcfRecord.INFO["DP"], 13, "Expected %s but got %s." % (13, vcfRecord.INFO["DP"]))
        finally:
            # Clean up on both success and failure so failed runs do not leave temp dirs behind.
            MutUtils.removeDir(tmpDir)
コード例 #21
0
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

        Note: This method has nothing to do with the TCGA validator.

        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).

        Checks performed:
          * the file is not empty and its comments contain '#version';
          * a row whose Entrez_Gene_Id is "0" has Hugo_Symbol "Unknown";
          * no value contains a carriage return or equals "__UNKNOWN__";
          * every column name not found in the requiredColumns / optionalColumns settings of
            configs/tcgaMAF2.3_output.config starts with "i_".

        :param filename: path of the generated MAF file to check
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0,
                        "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        # "!=" replaces the Python-2-only "<>" operator; the comparison is the same.
        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")

        # The column settings do not depend on the row or column being checked, so the
        # config file is parsed once here instead of once per cell.
        configFile = ConfigUtils.createConfigParser(
            'configs/tcgaMAF2.3_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        optionalColumns = configFile.get("general", "optionalColumns")

        ctr = 1  # 1-based row number, only used in failure messages
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(
                    lineDict['Hugo_Symbol'] == "Unknown",
                    "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: "
                    + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue(
                    '\r' not in lineDict[k],
                    "Carriage return character found in an annotation value.")

                # NOTE(review): if configFile.get returns the raw option string, these are
                # substring tests rather than exact column-name matches -- confirm.
                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(
                        k.startswith("i_"),
                        "Internal column was not prepended with 'i_'")

            # Sorted so the failure message lists the offending fields in a stable order.
            unknownKeys.sort()
            self.assertTrue(
                len(unknownKeys) == 0, "__UNKNOWN__ values (" +
                str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
コード例 #22
0
    def getConfigTable(self, configFilename, filename):
        """
        Build a VcfInputConfigTable for the given config file and VCF file.

        The config file is parsed case-sensitively (ignoreCase=False).  The new table is then
        handed to createConfigTableKeys together with the parser, and to createConfigTable
        together with a vcf.Reader opened on the VCF file.

        :param configFilename: config file passed to ConfigUtils.createConfigParser
        :param filename: VCF file opened with vcf.Reader (strict_whitespace=True)
        :return: the VcfInputConfigTable instance
        """
        parser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
        table = VcfInputConfigTable()
        reader = vcf.Reader(filename=filename, strict_whitespace=True)

        self.createConfigTableKeys(configParser=parser, configTable=table)
        self.createConfigTable(vcfReader=reader, configTable=table)
        return table
コード例 #23
0
    def getConfigTable(self, configFilename, filename=None):
        """
        Build a VcfOutputConfigTable for the given config file.

        The config file is parsed case-sensitively (ignoreCase=False) and the new table is
        handed to createConfigTableKeys together with the parser.

        :param configFilename: config file passed to ConfigUtils.createConfigParser
        :param filename: not used by this method
        :return: the VcfOutputConfigTable instance
        """
        parser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
        table = VcfOutputConfigTable()
        self.createConfigTableKeys(configParser=parser, configTable=table)
        return table
コード例 #24
0
    def _validateTcgaMafContents(self, filename):
        """ This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

        Note: This method has nothing to do with the TCGA validator.

        Checks performed:
          * the file is not empty and its comments contain '#version';
          * Tumor_Seq_Allele1 differs from Tumor_Seq_Allele2 and equals Reference_Allele;
          * no value contains a carriage return or equals "__UNKNOWN__";
          * every column name not found in the requiredColumns / optionalColumns / exposedColumns
            settings of configs/tcgaMAF2.4_output.config starts with "i_";
          * deletions have Tumor_Seq_Allele2 == "-" and insertions have Reference_Allele == "-".

        :param filename: path of the generated MAF file to check
        """
        configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

        # These settings are the same for every cell, so they are read once here rather than
        # once per column of every row.
        requiredColumns = configFile.get("general", "requiredColumns")
        optionalColumns = configFile.get("general", "optionalColumns")
        exposedColumns = configFile.get("general", "exposedColumns")

        ctr = 1  # 1-based row number, only used in failure messages
        for lineDict in tsvReader:

            # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
            # if lineDict['Entrez_Gene_Id'] == "0":
            #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"],
                            "Reference and alternate were equal in TCGA MAF output on line %d (%s)"
                            % (ctr, lineDict["Tumor_Seq_Allele1"]))
            self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"],
                            "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))

            # NOTE(review): this counter is reset for every row and incremented at most once per
            # row, so the "< 10" assertion below cannot fail.  If the intent was a file-wide
            # count, the initialisation belongs before the row loop -- confirm before changing,
            # since that would make the check stricter for existing tests.
            uniprot_aa_xform_counter = 0
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

                # NOTE(review): if configFile.get returns the raw option string, these are
                # substring tests rather than exact column-name matches -- confirm.
                if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                    self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")
            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

            if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
                self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

            if lineDict["Variant_Type"] == VariantClassification.VT_INS:
                self.assertTrue(lineDict["Reference_Allele"] == "-")

            # Sorted so the failure message lists the offending fields in a stable order.
            unknownKeys.sort()
            self.assertTrue(len(unknownKeys) == 0,
                            "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr)
                            + ", in fields: " + ", ".join(unknownKeys))
            self.assertTrue(uniprot_aa_xform_counter < 10,
                            "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter)
                            + ").  This is probably an error.")

            ctr += 1
コード例 #25
0
    def getConfigTable(self, configFilename, filename=None):
        """
        Build a VcfOutputConfigTable for the given config file.

        The config file is parsed case-sensitively (ignoreCase=False).  The table is constructed
        from the config filename and then handed to createConfigTableKeys together with the
        parser.

        :param configFilename: config file passed to both the parser and the table constructor
        :param filename: not used by this method
        :return: the VcfOutputConfigTable instance
        """
        parser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
        table = VcfOutputConfigTable(configFilename)
        self.createConfigTableKeys(configParser=parser, configTable=table)
        return table
コード例 #26
0
    def test_simple_annotate(self):
        """ Annotate a simple example.

        Builds an EnsemblTranscriptDatasource from the title, version and src_file settings of
        testdata/ensembl/saccer/ensembl.config and annotates one C>A mutation at 22:22161963.
        No assertion is made on the result; the test only requires the call to complete.
        """
        config_path = "testdata/ensembl/saccer/" + "ensembl.config"
        settings = ConfigUtils.createConfigParser(config_path)

        datasource = EnsemblTranscriptDatasource(title=settings.get("general", "title"),
                                                 version=settings.get("general", "version"),
                                                 src_file=settings.get("general", "src_file"))

        mutation = MutationData()
        mutation.chr = "22"
        mutation.start = "22161963"
        mutation.end = "22161963"
        mutation.ref_allele = "C"
        mutation.alt_allele = "A"

        datasource.annotate_mutation(mutation)
    def test_simple_annotate(self):
        """ Annotate a simple example.

        Reads title, version and src_file from the "general" section of
        testdata/ensembl/saccer/ensembl.config, creates an EnsemblTranscriptDatasource from
        them and runs annotate_mutation on a single C>A mutation at 22:22161963.  The result
        is not inspected.
        """
        parser = ConfigUtils.createConfigParser("testdata/ensembl/saccer/" + "ensembl.config")
        ds_title = parser.get("general", "title")
        ds_version = parser.get("general", "version")
        ds_src_file = parser.get("general", "src_file")
        transcript_ds = EnsemblTranscriptDatasource(title=ds_title, version=ds_version, src_file=ds_src_file)

        mut = MutationData()
        # Attribute/value pairs describing the mutation under test.
        for attribute, value in (("chr", "22"),
                                 ("start", "22161963"),
                                 ("end", "22161963"),
                                 ("ref_allele", "C"),
                                 ("alt_allele", "A")):
            setattr(mut, attribute, value)

        transcript_ds.annotate_mutation(mut)
コード例 #28
0
    def createVcfHeader(self, m, commentString=""):
        """Create the VCF Header using a simple template.

        Every name in TcgaVcfOutputRenderer.requiredHeaderAnnotations is looked up in m; names
        that m does not provide are filled with ".".  The keys 'date' (today as YYYYMMDD),
        'comments' and 'tumor_subtype_upper' are added before the substitution.

        :param m: object supplying header annotations through keys() and item access
        :param commentString: text substituted for the 'comments' placeholder
        :return: the text of tcgaVCF1.1Header.template after safe_substitute
        :raises KeyError: if 'tumor_subtype' is not one of the required header annotations
        """
        sourceConfigFP = ConfigUtils.createTemplateFP("tcgaVCF1.1Header.template")
        try:
            sHeaderTemplate = Template(sourceConfigFP.read())
        finally:
            # Release the template handle once it has been read; it was previously left open.
            # NOTE(review): assumes createTemplateFP returns a file-like object with close().
            sourceConfigFP.close()

        headerSubKeys = dict()
        for reqHdr in TcgaVcfOutputRenderer.requiredHeaderAnnotations:
            if reqHdr in m.keys():
                headerSubKeys[reqHdr] = m[reqHdr]
            else:
                # "." stands in for a required annotation that m does not carry.
                headerSubKeys[reqHdr] = "."

        headerSubKeys['date'] = str(datetime.now().date()).replace('-', '')
        headerSubKeys['comments'] = commentString
        headerSubKeys['tumor_subtype_upper'] = headerSubKeys['tumor_subtype'].upper()
        # safe_substitute leaves placeholders without a matching key untouched instead of raising.
        return sHeaderTemplate.safe_substitute(headerSubKeys)
コード例 #29
0
    def createConfigTableKeys(self, configParser, configTable):
        """
        Copy the field settings of the config file into the config table.

        Sections read, in order: INFO, FORMAT, OTHER, INFO_DESCRIPTION, FORMAT_DESCRIPTION,
        FILTER_DESCRIPTION, SPLIT_TAGS and NOT_SPLIT_TAGS.  Description values are joined
        with "," before being stored.

        :param configParser: parsed config file handed to the ConfigUtils dictionary builders
        :param configTable: table receiving the entries through its add* methods
        """
        # Parse fields from INFO section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "INFO")
        for name, ID in table.items():
            configTable.addInfoFieldName(name, ID)

        # Parse fields from FORMAT section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "FORMAT")
        for name, ID in table.items():
            configTable.addFormatFieldName(name, ID)

        # Parse fields from OTHER section of the config file
        table = ConfigUtils.buildReverseAlternativeDictionaryFromConfig(configParser, "OTHER")
        for name, ID in table.items():
            configTable.addOtherFieldName(name, ID)

        # Descriptions: ",".join replaces the Python-2-only string.join(seq, ",") and
        # produces the same text.
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "INFO_DESCRIPTION")
        for name, description in table.items():
            configTable.addInfoFieldNameDescription(name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FORMAT_DESCRIPTION")
        for name, description in table.items():
            configTable.addFormatFieldNameDescription(name, ",".join(description))

        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "FILTER_DESCRIPTION")
        for name, description in table.items():
            configTable.addFilterFieldNameDescription(name, ",".join(description))

        # Parse fields from SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToSplitSet(fieldType, names)

        # Parse fields from NOT_SPLIT_TAGS section of the config file
        table = ConfigUtils.buildAlternateKeyDictionaryFromConfig(configParser, "NOT_SPLIT_TAGS")
        for fieldType, names in table.items():
            configTable.addFieldNamesToNotSplitSet(fieldType, names)
コード例 #30
0
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.

        Note: This method has nothing to do with the TCGA validator.

        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).

        Checks performed:
          * the file is not empty and its comments contain '#version';
          * a row whose Entrez_Gene_Id is "0" has Hugo_Symbol "Unknown";
          * no value contains a carriage return or equals "__UNKNOWN__";
          * every column name not found in the requiredColumns / optionalColumns settings of
            configs/tcgaMAF2.3_output.config starts with "i_".

        :param filename: path of the generated MAF file to check
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        # "!=" replaces the Python-2-only "<>" operator; the comparison is the same.
        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

        # The column settings are identical for every cell, so the config file is parsed once
        # here instead of once per column of every row.
        configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        optionalColumns = configFile.get("general", "optionalColumns")

        ctr = 1  # 1-based row number, only used in failure messages
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                                "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

                # NOTE(review): if configFile.get returns the raw option string, these are
                # substring tests rather than exact column-name matches -- confirm.
                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

            # Sorted so the failure message lists the offending fields in a stable order.
            unknownKeys.sort()
            self.assertTrue(len(unknownKeys) == 0,
                            "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(
                                ctr) + ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
コード例 #31
0
    def createVcfHeader(self, m, commentString=""):
        """Create the VCF Header using a simple template.

        Each name in TcgaVcfOutputRenderer.requiredHeaderAnnotations is taken from m when m
        provides it and set to "." otherwise.  'date' (today as YYYYMMDD), 'comments' and
        'tumor_subtype_upper' are added before the template is filled in.

        :param m: object supplying header annotations through keys() and item access
        :param commentString: text substituted for the 'comments' placeholder
        :return: the text of tcgaVCF1.1Header.template after safe_substitute
        :raises KeyError: if 'tumor_subtype' is not one of the required header annotations
        """
        sourceConfigFP = ConfigUtils.createTemplateFP(
            "tcgaVCF1.1Header.template")
        try:
            sHeaderTemplate = Template(sourceConfigFP.read())
        finally:
            # The template handle was previously never closed.
            # NOTE(review): assumes createTemplateFP returns a file-like object with close().
            sourceConfigFP.close()

        headerSubKeys = dict()
        for reqHdr in TcgaVcfOutputRenderer.requiredHeaderAnnotations:
            if reqHdr in m.keys():
                headerSubKeys[reqHdr] = m[reqHdr]
            else:
                # "." stands in for a required annotation that m does not carry.
                headerSubKeys[reqHdr] = "."

        headerSubKeys['date'] = str(datetime.now().date()).replace('-', '')
        headerSubKeys['comments'] = commentString
        headerSubKeys['tumor_subtype_upper'] = headerSubKeys[
            'tumor_subtype'].upper()
        # safe_substitute leaves placeholders without a matching key untouched instead of raising.
        return sHeaderTemplate.safe_substitute(headerSubKeys)
コード例 #32
0
    def testCreateDatasourceFromGZFile(self):
        """Create an indexed_vcf datasource from testdata/example.vcf.gz and check its config.

        The config written to out/esp.config must have a "general" section containing the
        type, src_file, title, version and match_mode options, each holding the expected value.
        """
        expectedType = "indexed_vcf"
        expectedSrcFile = "example.vcf.gz"
        expectedTitle = "ESP"
        expectedVersion = "6500SI-V2"
        expectedMatchMode = "avg"

        inputVcf = os.path.join("testdata", "example.vcf.gz")
        outputDir = "out"
        configPath = os.path.join("out", "esp.config")

        creator = TabixIndexedVcfDatasourceCreator()
        creator.createDatasource(outputDir, inputVcf, configPath, expectedType, expectedTitle,
                                 expectedVersion, expectedMatchMode)

        parser = ConfigUtils.createConfigParser(configPath)
        self.assertTrue(parser.has_section("general"), "general section is missing.")

        # One row per option: (option, expected value, "missing" message, "mismatch" template).
        checks = [
            ("type", expectedType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", expectedSrcFile,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", expectedTitle,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", expectedVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("match_mode", expectedMatchMode,
             "match_mode option is missing in general section.",
             "Expected data source match mode is %s but was %s."),
        ]

        # Presence of every option is verified before any value is compared.
        for option, _, missingMessage, _ in checks:
            self.assertTrue(parser.has_option("general", option), missingMessage)

        for option, expected, _, mismatchTemplate in checks:
            actual = parser.get("general", option)
            self.assertEqual(actual, expected, mismatchTemplate % (expected, actual))
コード例 #33
0
    def testCreateGPTsvConfigFile(self):
        """Write a gp_tsv datasource config with GenericTsvDatasourceCreator and check it.

        The config written to out/ccle_by_gp.config must have a "general" section containing
        the type, src_file, title, version and genomic_position_cols options, each holding the
        expected value.
        """
        configPath = "out/ccle_by_gp.config"
        expectedSrcFile = "ccle_results_by_pos.hg19.import.txt"
        expectedType = "gp_tsv"
        expectedTitle = "CCLE_By_GP"
        expectedVersion = "09292010"
        expectedPositionCols = "chr,start,end"

        creator = GenericTsvDatasourceCreator()
        indexColumns = DatasourceInstallUtils.getIndexCols("gp_tsv", expectedPositionCols)
        creator._createConfigFile(configFilename=configPath, baseDSFile=expectedSrcFile,
                                  ds_name=expectedTitle, ds_type=expectedType,
                                  ds_version=expectedVersion, indexCols=indexColumns)

        parser = ConfigUtils.createConfigParser(configPath)
        self.assertTrue(parser.has_section("general"), "general section is missing.")

        # One row per option: (option, expected value, "missing" message, "mismatch" template).
        checks = [
            ("type", expectedType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", expectedSrcFile,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", expectedTitle,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", expectedVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("genomic_position_cols", expectedPositionCols,
             "genomic_position_cols option is missing in general section.",
             "Expected data source genomic_position_cols is %s but was %s."),
        ]

        # Presence of every option is verified before any value is compared.
        for option, _, missingMessage, _ in checks:
            self.assertTrue(parser.has_option("general", option), missingMessage)

        for option, expected, _, mismatchTemplate in checks:
            actual = parser.get("general", option)
            self.assertEqual(actual, expected, mismatchTemplate % (expected, actual))
コード例 #34
0
    def getGeneTsvConfigFile(self):
        """Write a gene_tsv datasource config with GenericTsvDatasourceCreator and check it.

        The config written to out/simple_uniprot.config must have a "general" section
        containing the type, src_file, title, version and gene_col options, each holding the
        expected value.

        NOTE(review): the name does not start with "test", so a default unittest loader will
        presumably not collect this method -- confirm whether that is intentional.
        """
        configPath = "out/simple_uniprot.config"
        expectedSrcFile = "simple_uniprot.out.2011_09.tsv"
        expectedType = "gene_tsv"
        expectedTitle = "UniProt"
        expectedVersion = "2011_09"
        expectedGeneCol = "gene"

        creator = GenericTsvDatasourceCreator()
        indexColumns = DatasourceInstallUtils.getIndexCols("gene_tsv", expectedGeneCol)
        creator._createConfigFile(configFilename=configPath, baseDSFile=expectedSrcFile,
                                  ds_name=expectedTitle, ds_type=expectedType,
                                  ds_version=expectedVersion, indexCols=indexColumns)

        parser = ConfigUtils.createConfigParser(configPath)
        self.assertTrue(parser.has_section("general"), "general section is missing.")

        # One row per option: (option, expected value, "missing" message, "mismatch" template).
        checks = [
            ("type", expectedType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", expectedSrcFile,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", expectedTitle,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", expectedVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("gene_col", expectedGeneCol,
             "gene_col option is missing in general section.",
             "Expected data source gene_col is %s but was %s."),
        ]

        # Presence of every option is verified before any value is compared.
        for option, _, missingMessage, _ in checks:
            self.assertTrue(parser.has_option("general", option), missingMessage)

        for option, expected, _, mismatchTemplate in checks:
            actual = parser.get("general", option)
            self.assertEqual(actual, expected, mismatchTemplate % (expected, actual))
コード例 #35
0
    def testInternalFieldsSkipPrepend(self):
        """Check that NO_PREPEND keeps an unlisted annotation from being renamed with "i_".

        Renders one mutation carrying a "TEST" annotation with NO_PREPEND enabled and expects
        the output headers to contain "TEST" and "Hugo_Symbol" but not "i_TEST".
        """
        mafPath = "out/testInternalFields_v2.4.maf.tsv"
        configPath = 'configs/tcgaMAF2.4_output.config'

        mutation = MutationDataFactory.default_create()
        mutation.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
        # "gene" is a real annotation and should not be considered internal.
        mutation.createAnnotation("gene", "EGFR")

        renderer = TcgaMafOutputRenderer(mafPath, configFile=configPath,
                                         other_options={OptionConstants.NO_PREPEND: True})
        renderer.renderMutations(iter([mutation]), ['No comments'])

        # Guard for the premise of the header checks below.
        required = ConfigUtils.createConfigParser(configPath).get("general", "requiredColumns")
        self.assertTrue(
            "Hugo_Symbol" in required,
            " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        outputSize = os.stat(mafPath).st_size
        self.assertTrue(outputSize > 0, "Generated MAF file (" + mafPath + ") is empty.")

        reader = GenericTsvReader(mafPath)
        columnNames = reader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in columnNames, "Hugo_Symbol not found in output headers")
        self.assertTrue("i_TEST" not in columnNames,
                        "i_TEST was found in output headers when prepend was disabled.")
        self.assertTrue("TEST" in columnNames, "TEST was not found in output headers.")
コード例 #36
0
    def testInternalFields(self):
        """Check that an annotation not listed in the config is rendered with an "i_" prefix.

        Renders one mutation carrying a "TEST" annotation and expects the output headers to
        contain "i_TEST" and "Hugo_Symbol" but not "TEST".
        """
        mafPath = "out/testInternalFields_v2.4.maf.tsv"
        configPath = 'configs/tcgaMAF2.4_output.config'

        mutation = MutationData()
        mutation.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
        # "gene" is a real annotation and should not be considered internal.
        mutation.createAnnotation("gene", "EGFR")

        renderer = TcgaMafOutputRenderer(mafPath, configFile=configPath)
        renderer.renderMutations(iter([mutation]), ['No comments'])

        # Guard for the premise of the header checks below.
        required = ConfigUtils.createConfigParser(configPath).get("general", "requiredColumns")
        self.assertTrue(
            "Hugo_Symbol" in required,
            " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        outputSize = os.stat(mafPath).st_size
        self.assertTrue(outputSize > 0, "Generated MAF file (" + mafPath + ") is empty.")

        reader = GenericTsvReader(mafPath)
        columnNames = reader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in columnNames, "Hugo_Symbol not found in output headers")
        self.assertTrue("TEST" not in columnNames,
                        "TEST was found in output headers when it should have been renamed to i_TEST")
        self.assertTrue("i_TEST" in columnNames, "i_TEST not found in output headers")
コード例 #37
0
    def testInternalFieldsSkipPrepend(self):
        """Verify that the "i_" prefix is omitted when NO_PREPEND is set.

        A mutation with a "TEST" annotation is rendered with NO_PREPEND enabled; the output
        headers must then contain "TEST" and "Hugo_Symbol" and must not contain "i_TEST".
        """
        renderedMaf = "out/testInternalFields_v2.4.maf.tsv"
        rendererConfig = 'configs/tcgaMAF2.4_output.config'

        mut = MutationDataFactory.default_create()
        mut.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
        # A real annotation, which should not be treated as internal.
        mut.createAnnotation("gene", "EGFR")

        options = {OptionConstants.NO_PREPEND: True}
        mafRenderer = TcgaMafOutputRenderer(renderedMaf, configFile=rendererConfig, other_options=options)
        mafRenderer.renderMutations(iter([mut]), ['No comments'])

        parsedConfig = ConfigUtils.createConfigParser(rendererConfig)
        requiredColumnSetting = parsedConfig.get("general", "requiredColumns")
        self.assertTrue("Hugo_Symbol" in requiredColumnSetting,
                        " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        fileInfo = os.stat(renderedMaf)
        self.assertTrue(fileInfo.st_size > 0, "Generated MAF file (" + renderedMaf + ") is empty.")

        fieldNames = GenericTsvReader(renderedMaf).getFieldNames()
        self.assertTrue("Hugo_Symbol" in fieldNames, "Hugo_Symbol not found in output headers")
        self.assertTrue("i_TEST" not in fieldNames, "i_TEST was found in output headers when prepend was disabled.")
        self.assertTrue("TEST" in fieldNames, "TEST was not found in output headers.")
コード例 #38
0
    def testInternalFields(self):
        """Verify that an annotation missing from the configured columns gets the "i_" prefix.

        A mutation with a "TEST" annotation is rendered with the default options; the output
        headers must then contain "i_TEST" and "Hugo_Symbol" and must not contain "TEST".
        """
        renderedMaf = "out/testInternalFields_v2.4.maf.tsv"
        rendererConfig = 'configs/tcgaMAF2.4_output.config'

        mut = MutationData()
        mut.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
        # A real annotation, which should not be treated as internal.
        mut.createAnnotation("gene", "EGFR")

        mafRenderer = TcgaMafOutputRenderer(renderedMaf, configFile=rendererConfig)
        mafRenderer.renderMutations(iter([mut]), ['No comments'])

        parsedConfig = ConfigUtils.createConfigParser(rendererConfig)
        requiredColumnSetting = parsedConfig.get("general", "requiredColumns")
        self.assertTrue("Hugo_Symbol" in requiredColumnSetting,
                        " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        fileInfo = os.stat(renderedMaf)
        self.assertTrue(fileInfo.st_size > 0, "Generated MAF file (" + renderedMaf + ") is empty.")

        fieldNames = GenericTsvReader(renderedMaf).getFieldNames()
        self.assertTrue("Hugo_Symbol" in fieldNames, "Hugo_Symbol not found in output headers")
        self.assertTrue("TEST" not in fieldNames,
                        "TEST was found in output headers when it should have been renamed to i_TEST")
        self.assertTrue("i_TEST" in fieldNames, "i_TEST not found in output headers")
コード例 #39
0
    def __init__(self, config_file="column_collapse.config"):
        """Load the column-collapsing configuration.

        Reads the options of the "columns_to_collapse" section (case-sensitively) and records
        the collapsing method named for each column.  A column with an empty value gets
        ColumnCollapser.MEAN.

        :param config_file: config file handed to ConfigUtils.createConfigParser
        :raises ValueError: if any column names a method that is not in
            ColumnCollapser.VALID_VALUES
        """
        self._config_parser = ConfigUtils.createConfigParser(config_file, ignoreCase=False)
        self._columns_to_collapse = self._config_parser.options("columns_to_collapse")

        # Create a lookup table to get the method for each column
        self._method_dict = dict()
        if len(self._columns_to_collapse) != len(set(self._columns_to_collapse)):
            # Logger.warn is a deprecated alias; warning() logs the same record.
            logging.getLogger(__name__).warning("Duplicate keys in " + config_file + " seen.  Some collapsing may produce unexpected values.")

        for c in self._columns_to_collapse:
            method = self._config_parser.get("columns_to_collapse", c).strip()

            # Default to mean if not specified.  strip() always returns a string, so an
            # empty string is the only "not specified" case that can occur here.
            if method == "":
                method = ColumnCollapser.MEAN
            self._method_dict[c] = method

        # Basic validation
        problematic_method_assignments = dict()
        for c in self._columns_to_collapse:
            if self._method_dict[c] not in ColumnCollapser.VALID_VALUES:
                problematic_method_assignments[c] = self._method_dict[c]

        if problematic_method_assignments:
            raise ValueError("Invalid column collapsing specified: " + str(problematic_method_assignments))
コード例 #40
0
    def testCreateDatasource(self):
        """Create a tabix-indexed TSV datasource and verify its files, config and annotations.

        The datasource is built from a small ESP test file into a scratch
        directory.  The test checks that the data file and its ``.tbi`` index
        were written, that the generated config has the expected sections,
        options and values, and that a datasource created from that config
        annotates a mutation at 1:69428 with the expected values.
        """
        dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")

        # Never set destDir to "out/" itself: the directory is deleted and
        # recreated just below.
        destDir = "out/create_ds_test/"

        if os.path.exists(destDir):
            shutil.rmtree(destDir)
        os.makedirs(destDir)
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        indexColumnNames = "CHROM,POS,POS,REF,ALT"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        configFilename = "out/esp_coverage.config"
        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        annotationColumnNames = "DBSNP,EA_GTC,DP"

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
                                           dataSourceName, dataSourceVersion, dataSourceMatchMode,
                                           annotationColumnNames, DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                                                                      indexColumnNames))

        dataFile = os.path.join(destDir, datasourceFilename)
        self.assertTrue(os.path.exists(dataFile))
        self.assertTrue(os.path.exists(dataFile + ".tbi"))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")

        # (option, expected value, message if the option is missing, message template on mismatch)
        generalChecks = [
            ("type", dataSourceType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", datasourceFilename,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", dataSourceName,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", dataSourceVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("column_names", columnNames,
             "column_names option is missing in general section.",
             "Expected data source column names is %s but was %s."),
            ("annotation_column_names", annotationColumnNames,
             "annotation_column_names option is missing in general section.",
             "Expected data source annotation column names is %s but was %s."),
            ("match_mode", dataSourceMatchMode,
             "match_mode option is missing in general section",
             "Expected data source match mode is %s but was %s."),
            ("index_column_names", indexColumnNames,
             "index_column_names option is missing in general section.",
             "Expected data source index column names is %s but was %s."),
        ]

        # Check presence of every option first so a missing option is reported
        # as such rather than as a lookup error.
        for option, expected, missingMsg, mismatchMsg in generalChecks:
            self.assertTrue(configParser.has_option("general", option), missingMsg)
        for option, expected, missingMsg, mismatchMsg in generalChecks:
            actual = configParser.get("general", option)
            self.assertEqual(actual, expected, mismatchMsg % (expected, actual))

        dataTypeChecks = [
            ("EA_GTC", "String", "Expected EA_GTC data type is %s but was %s."),
            ("DP", "Integer", "Expected DP data type is %s but was %s."),
        ]
        for option, expected, mismatchMsg in dataTypeChecks:
            actual = configParser.get("data_types", option)
            self.assertEqual(actual, expected, mismatchMsg % (expected, actual))

        ds = DatasourceFactory.createDatasourceFromConfigParser(configParser, destDir)
        mut = MutationData(chr="1", start="69428", end="69428", ref_allele="T", alt_allele="G")
        mut2 = ds.annotate_mutation(mut)
        self.assertEqual(mut2["ESP_DBSNP"], "dbSNP_134")
        self.assertEqual(mut2["ESP_EA_GTC"], "92,129,3203")
        self.assertEqual(mut2["ESP_DP"], "110")
コード例 #41
0
    def _validateTcgaMafContents(self, filename):
        """Sanity-check a rendered TCGA MAF file; a private helper for unit tests.

        Note: This method has nothing to do with the TCGA validator.

        The file must be non-empty and its comments must contain ``#version``.
        For every data line the following is asserted:

        * Tumor_Seq_Allele1 equals Reference_Allele and differs from
          Tumor_Seq_Allele2.
        * No value is ``__UNKNOWN__`` and no value contains a carriage return.
        * Every column not found in the requiredColumns, optionalColumns or
          exposedColumns config entries starts with ``i_``.
        * Deletions have ``-`` as Tumor_Seq_Allele2 and insertions have ``-``
          as Reference_Allele.
        * Fewer than 10 of the lines read so far have a UniProt_AApos of "0".

        :param filename: path of the MAF file to check
        """
        configFile = ConfigUtils.createConfigParser(
            os.path.join("configs", "tcgaMAF2.4_output.config"))
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0,
                        "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")

        # The column lists are the same for every line and key, so read them once.
        # NOTE(review): these are presumably the raw config strings, in which
        # case the "in" tests below are substring matches -- confirm that is intended.
        requiredColumns = configFile.get("general", "requiredColumns")
        optionalColumns = configFile.get("general", "optionalColumns")
        exposedColumns = configFile.get("general", "exposedColumns")

        # Counts lines across the whole file.  It must live outside the loop:
        # resetting it per line would cap it at 1 and make the "< 10" check vacuous.
        uniprot_aa_xform_counter = 0

        for ctr, lineDict in enumerate(tsvReader, 1):

            # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
            # if lineDict['Entrez_Gene_Id'] == "0":
            #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            self.assertTrue(
                lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"],
                "Reference and alternate were equal in TCGA MAF output on line %d (%s)"
                % (ctr, lineDict["Tumor_Seq_Allele1"]))
            self.assertTrue(
                lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"],
                "Reference Allele should match Tumor_Seq_Allele1 on line " +
                str(ctr))

            for k, value in lineDict.items():
                if value == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue(
                    '\r' not in value,
                    "Carriage return character found in an annotation value.")

                isKnownColumn = (k in requiredColumns or k in optionalColumns
                                 or k in exposedColumns)
                if not isKnownColumn:
                    self.assertTrue(
                        k.startswith("i_"),
                        "Internal column was not prepended with 'i_'")

            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

            if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
                self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

            if lineDict["Variant_Type"] == VariantClassification.VT_INS:
                self.assertTrue(lineDict["Reference_Allele"] == "-")

            unknownKeys.sort()
            self.assertTrue(
                len(unknownKeys) == 0, "__UNKNOWN__ values (" +
                str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                ", in fields: " + ", ".join(unknownKeys))
            self.assertTrue(
                uniprot_aa_xform_counter < 10,
                "Too many uniprot aa xform values are zero (" +
                str(uniprot_aa_xform_counter) +
                ").  This is probably an error.")
コード例 #42
0
    def testCreateIndexedTsvDatasource(self):
        """Install an indexed TSV datasource into a temp directory and verify the result.

        Checks the generated config file (section, options and values), the
        presence of the ``.md5`` file next to the genome build directory, and
        that a datasource created from the config annotates a mutation at
        1:802177 with exactly the requested annotation columns.
        """
        dsFile = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
        datasourceFoldername = "1000Genomes"
        datasourceName = "1000Genomes"
        datasourceType = "indexed_tsv"
        datasourceVersion = "V4.1"
        genomeBuild = "hg19"
        indexColumnNames = "CHROM,POS,POS"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        annotationColumnNames = "DBSNP,EA_AC,AA_AC,TAC"

        tmpDir = tempfile.mkdtemp()
        # Register removal up front so the temp directory is deleted even when
        # an assertion below fails.
        self.addCleanup(MutUtils.removeDir, tmpDir)
        destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir=destDir, ds_file=dsFile,
                                                 ds_foldername=datasourceFoldername, ds_name=datasourceName,
                                                 ds_type=datasourceType, ds_version=datasourceVersion,
                                                 index_columns=indexColumnNames,
                                                 ds_annotation_columns=annotationColumnNames)

        # Name the config is expected to record as its src_file.
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        configFilename = os.path.join(destDir, "1000Genomes.config")
        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        # (option, expected value, message if the option is missing, message template on mismatch)
        generalChecks = [
            ("type", datasourceType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", datasourceFilename,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", datasourceName,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", datasourceVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("column_names", columnNames,
             "column_names option is missing in general section.",
             "Expected data source column names is %s but was %s."),
            ("annotation_column_names", annotationColumnNames,
             "annotation_column_names option is missing in general section.",
             "Expected data source annotation column names is %s but was %s."),
        ]
        for option, expected, missingMsg, mismatchMsg in generalChecks:
            self.assertTrue(configParser.has_option("general", option), missingMsg)
        for option, expected, missingMsg, mismatchMsg in generalChecks:
            actual = configParser.get("general", option)
            self.assertEqual(actual, expected, mismatchMsg % (expected, actual))

        self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                        "No md5 file was generated.")

        datasource = DatasourceFactory.createDatasource(configFilename, destDir)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "802177"
        m1.end = "802177"
        m1.ref_allele = "T"
        m1.alt_allele = "C"

        m1_annotated = datasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("1000Genomes_AA_AC")
        cur_annotation = Annotation(value="2,866", datasourceName="1000Genomes", dataType="String",
                                    description="",
                                    tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        # Columns that were not requested as annotation columns must be absent.
        annotationNames = ["1000Genomes_CHROM", "1000Genomes_POS", "1000Genomes_REF", "1000Genomes_ALT",
                           "1000Genomes_GWAS_PUBMED"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName not in m1_annotated, "m1_annotated was annotated with %s." % annotationName)

        # Every requested annotation column must be present.
        annotationNames = ["1000Genomes_DBSNP", "1000Genomes_EA_AC", "1000Genomes_AA_AC", "1000Genomes_TAC"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName in m1_annotated, "m1_annotated was not annotated with %s value."
                                                            % annotationName)
コード例 #43
0
ファイル: ConfigUtilTest.py プロジェクト: xingtech/oncotator
 def testLocalConfig(self):
     """Get a key from a local config and a config file that have the same basic name, but different values."""
     config = ConfigUtils.createConfigParser("testdata/dummy_configs/dummy.config")
     # assertEqual (rather than assertTrue(a == b)) reports the actual value on failure.
     self.assertEqual(config.get("general", "dummy1"), "Super")
     self.assertEqual(config.get("general", "dummy2"), "world")
コード例 #44
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        Each comment is written as a ``## `` line, followed by a tab-separated
        header row and one row per gene key, sorted by key.  A gene equal to a
        segment's start_gene or end_gene is keyed as "<gene> <exon>"; when
        several segments produce the same key, the last segment seen wins.

        :param segments: iterable of MutationData
        :param metadata: not used by this renderer
        :param comments: strings to write as comment lines at the top of the file
        """

        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logging.getLogger(__name__).info("Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

        if comments is None:
            comments = []

        # The context manager closes the output file even if rendering fails.
        with open(self._filename, 'w') as fp:
            for c in comments:
                fp.write("## " + c + "\n")

            # TODO: Define constant for "genes", and other annotations
            headers = config_parser.options("alternatives")
            gene_to_segment_dict = dict()
            field_mapping = None
            num_segments = 0
            for seg in segments:
                num_segments += 1
                if field_mapping is None:
                    # The first segment decides which annotation feeds each header.
                    field_mapping = MutUtils.createFieldsMapping(headers, seg.keys(), self._alternativeDictionary,
                                                                 isRenderInternalFields=True, prepend="")

                for g in seg['genes'].split(","):
                    if g == seg["start_gene"]:
                        gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                    elif g == seg["end_gene"]:
                        gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                    else:
                        gene_to_segment_dict[g] = seg

            # Count segments explicitly: an enumerate() index is still 0 after
            # exactly one segment, which would wrongly report "no segments".
            if num_segments == 0:
                logging.getLogger(__name__).info("No segments given.  There will be no genes in the list.")

            writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
            writer.writeheader()

            logging.getLogger(__name__).info("Rendering gene list...")
            all_genes_seen = sorted(gene_to_segment_dict.keys())
            num_genes = len(all_genes_seen)
            for i, gene in enumerate(all_genes_seen):
                seg = gene_to_segment_dict[gene]
                line_dict = dict()
                for h in headers:
                    annotation_field = field_mapping.get(h, h)
                    line_dict[h] = seg.get(annotation_field, "")
                # The gene key overrides whatever annotation was mapped to "gene".
                line_dict["gene"] = gene
                writer.writerow(line_dict)
                if i % 1000 == 0:
                    logging.getLogger(__name__).info("Rendered %d/%d genes ..." % ((i+1),num_genes))
コード例 #45
0
    def testCreateConfigFile(self):
        """Write a datasource config file and verify its sections, options and values."""
        cfgPath = os.path.join("out", "esp_coverage.config")
        srcFile = "ESP6500SI-V2.coverage.txt.gz"
        dsType = "indexed_tsv"
        dsName = "ESP"
        dsVersion = "6500SI-V2"
        matchMode = "overlap"
        indexCols = "Chromosome,Position,Position"
        allCols = "Chromosome,Position,TotalSamplesCovered,AvgSampleReadDepth,TotalEAsamplesCovered,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"
        annotationCols = "TotalSamplesCovered,AvgSampleReadDepth,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"

        creator = TabixIndexedTsvDatasourceCreator()
        creator._createConfigFile(configFilename=cfgPath, baseDSFile=srcFile,
                                  ds_type=dsType, ds_name=dsName,
                                  ds_version=dsVersion, column_names=allCols,
                                  annotation_column_names=annotationCols,
                                  ds_match_mode=matchMode,
                                  indexCols=DatasourceInstallUtils.getIndexCols(dsType, indexCols))

        parser = ConfigUtils.createConfigParser(cfgPath)
        self.assertTrue(parser.has_section("general"), "general section is missing.")
        self.assertTrue(parser.has_section("data_types"), "data_types section is missing.")

        # One row per option in the "general" section:
        # (option, expected value, message if missing, message template on mismatch)
        expectations = (
            ("type", dsType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", srcFile,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", dsName,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", dsVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("column_names", allCols,
             "column_names option is missing in general section.",
             "Expected data source column names is %s but was %s."),
            ("annotation_column_names", annotationCols,
             "annotation_column_names option is missing in general section.",
             "Expected data source annotation column names is %s but was %s."),
            ("match_mode", matchMode,
             "match_mode option is missing in general section",
             "Expected data source match mode is %s but was %s."),
            ("index_column_names", indexCols,
             "index_column_names option is missing in general section.",
             "Expected data source index column names is %s but was %s."),
        )

        # All presence checks run before any value check.
        for name, wanted, absentMsg, wrongMsg in expectations:
            self.assertTrue(parser.has_option("general", name), absentMsg)

        for name, wanted, absentMsg, wrongMsg in expectations:
            found = parser.get("general", name)
            self.assertEqual(found, wanted, wrongMsg % (wanted, found))
コード例 #46
0
    def testCreateDatasource(self):
        """Create a tabix-indexed TSV datasource in "out" and verify the generated config file."""
        inputFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
        outDir = "out"
        srcFile = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        indexCols = "CHROM,POS,POS,REF,ALT"
        allCols = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        cfgPath = "out/esp_coverage.config"
        dsType = "indexed_tsv"
        dsName = "ESP"
        dsVersion = "6500SI-V2"
        matchMode = "overlap"
        annotationCols = "DBSNP,EA_GTC,DP"

        creator = TabixIndexedTsvDatasourceCreator()
        creator.createDatasource(outDir, inputFile, indexCols, cfgPath, dsType,
                                 dsName, dsVersion, matchMode, annotationCols,
                                 DatasourceInstallUtils.getIndexCols(dsType, indexCols))

        parser = ConfigUtils.createConfigParser(cfgPath)
        self.assertTrue(parser.has_section("general"), "general section is missing.")
        self.assertTrue(parser.has_section("data_types"), "data_types section is missing.")

        # One row per option in the "general" section:
        # (option, expected value, message if missing, message template on mismatch)
        expectations = (
            ("type", dsType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", srcFile,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", dsName,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", dsVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("column_names", allCols,
             "column_names option is missing in general section.",
             "Expected data source column names is %s but was %s."),
            ("annotation_column_names", annotationCols,
             "annotation_column_names option is missing in general section.",
             "Expected data source annotation column names is %s but was %s."),
            ("match_mode", matchMode,
             "match_mode option is missing in general section",
             "Expected data source match mode is %s but was %s."),
            ("index_column_names", indexCols,
             "index_column_names option is missing in general section.",
             "Expected data source index column names is %s but was %s."),
        )

        # All presence checks run before any value check.
        for name, wanted, absentMsg, wrongMsg in expectations:
            self.assertTrue(parser.has_option("general", name), absentMsg)

        for name, wanted, absentMsg, wrongMsg in expectations:
            found = parser.get("general", name)
            self.assertEqual(found, wanted, wrongMsg % (wanted, found))

        # Declared data types of two of the annotation columns.
        typeExpectations = (
            ("EA_GTC", "String", "Expected EA_GTC data type is %s but was %s."),
            ("DP", "Integer", "Expected DP data type is %s but was %s."),
        )
        for name, wanted, wrongMsg in typeExpectations:
            found = parser.get("data_types", name)
            self.assertEqual(found, wanted, wrongMsg % (wanted, found))
コード例 #47
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        Each comment is written as a ``## `` line, followed by a tab-separated
        header row and one row per gene key, sorted by key.  A gene equal to a
        segment's start_gene or end_gene is keyed as "<gene> <exon>"; when
        several segments produce the same key, the last segment seen wins.

        :param segments: iterable of MutationData
        :param metadata: not used by this renderer
        :param comments: strings to write as comment lines at the top of the file
        """

        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logging.getLogger(__name__).info(
            "Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            config_parser)

        if comments is None:
            comments = []

        # The context manager closes the output file even if rendering fails.
        with open(self._filename, 'w') as fp:
            for c in comments:
                fp.write("## " + c + "\n")

            # TODO: Define constant for "genes", and other annotations
            headers = config_parser.options("alternatives")
            gene_to_segment_dict = dict()
            field_mapping = None
            num_segments = 0
            for seg in segments:
                num_segments += 1
                if field_mapping is None:
                    # The first segment decides which annotation feeds each header.
                    field_mapping = FieldMapCreator.create_field_map(
                        headers,
                        seg,
                        self._alternativeDictionary,
                        is_render_internal_fields=True,
                        prepend="")

                for g in seg['genes'].split(","):
                    if g == seg["start_gene"]:
                        gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                    elif g == seg["end_gene"]:
                        gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                    else:
                        gene_to_segment_dict[g] = seg

            # Count segments explicitly: an enumerate() index is still 0 after
            # exactly one segment, which would wrongly report "no segments".
            if num_segments == 0:
                logging.getLogger(__name__).info(
                    "No segments given.  There will be no genes in the list.")

            writer = csv.DictWriter(fp,
                                    headers,
                                    delimiter="\t",
                                    lineterminator="\n",
                                    extrasaction="ignore")
            writer.writeheader()

            logging.getLogger(__name__).info("Rendering gene list...")
            all_genes_seen = sorted(gene_to_segment_dict.keys())
            num_genes = len(all_genes_seen)
            for i, gene in enumerate(all_genes_seen):
                seg = gene_to_segment_dict[gene]
                line_dict = dict()
                for h in headers:
                    annotation_field = field_mapping.get(h, h)
                    line_dict[h] = seg.get(annotation_field, "")
                # The gene key overrides whatever annotation was mapped to "gene".
                line_dict["gene"] = gene
                writer.writerow(line_dict)
                if i % 1000 == 0:
                    logging.getLogger(__name__).info("Rendered %d/%d genes ..." %
                                                     ((i + 1), num_genes))
コード例 #48
0
ファイル: FieldMapCreator.py プロジェクト: xingtech/oncotator
    def create_field_map(headers, m, alternative_dict, is_render_internal_fields=True,
                            exposed_fields=None, prepend="i_", deprioritize_input_annotations=False,
                            additional_columns=None):
        """
        Create a mapping for output header to the best input annotation.

        This can handle prepend fields (attach the prepend to internal fields) and exposed fields (ones not in the
        list of headers, but which should not have a prepend).

        :param headers:  List of headers that need to be populated for rendering.  For example, the columns in a TCGA MAF
        :param m: MutationData to scrape available annotations
        :param alternative_dict: Dictionary of header to list of annotations.  Usually, created from a config file.
        :param is_render_internal_fields: Whether annotations not assigned to headers (or superseded by other annotations) should be included in this map.
        :type is_render_internal_fields bool
        :param exposed_fields: list of fields that, if found, should never receive a prepend.
        :param prepend: The prepend to put on internal fields, if any are detected.  None is treated as an empty
            string.  If is_render_internal_fields is False, this parameter does nothing.
        :param deprioritize_input_annotations: If an annotation with the exact name of the header is found AND it has a datasource of "INPUT",
            use one of the alternative annotations instead.  This is useful in cases where we want to reannotate.  This effectively handles aliases.
        :param additional_columns: a list of additional columns not found in the mutation nor the headers.  These will
            be considered internal fields with annotations of the exact same name.
        :type additional_columns list
        :return: dict of header:annotation name (one annotation name) that should be used for this output rendering.
            When internal fields are rendered, their (possibly prepended) column names are added as keys as well.
        """
        result = dict()
        if prepend is None:
            prepend = ""

        if exposed_fields is None:
            exposed_fields = set()

        # Process each header and find the first alias.  If an annotation exists with the exact same name as the header
        #   use that unless deprioritization is in effect.
        annotation_names = MutUtils.get_all_annotation_names(m)

        for h in headers:
            choice = FieldMapCreator.choose_best_annotation(h, m, alternative_dict, deprioritize_input_annotations)
            # Fall back to the header name itself when no annotation was chosen.
            result[h] = h if choice is None else choice

        # Now populate internal fields, if requested.
        if is_render_internal_fields:

            if additional_columns is None:
                additional_columns = []

            # Internal fields are the annotations not already consumed by a header, plus any additional columns.
            internal_fields = set(annotation_names).difference(result.values())
            internal_fields.update(additional_columns)

            # Create a dict to do a lookup of annotation to the column to use.
            reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(alternative_dict)

            # Loop-invariant: a blank (or whitespace-only) prepend means no column gets a prefix.
            is_prepend_blank = prepend.strip() == ""

            internal_field_dict = dict()
            for i in internal_fields:
                if i.startswith('_') or i == "transcripts":
                    continue

                no_prepend_name = i
                if prepend != "" and i.startswith(prepend):
                    # Strip only the leading prepend.  str.replace would also remove any later occurrence of the
                    #   prepend inside the name (e.g. "i_foo_i_bar" -> "foo_bar") and yield a bogus alias.
                    no_prepend_name = i[len(prepend):]

                # Offer both the prepended and the un-prepended spelling as aliases for this field.
                field_alt_dict = {i: [prepend + i, no_prepend_name]}
                choice = FieldMapCreator.choose_best_annotation(i, m, field_alt_dict, deprioritize_input_annotations)
                if choice is None:
                    choice = i

                key_to_use = reverseAlternativeDict.get(i, i)
                if is_prepend_blank or i.startswith(prepend) or i in exposed_fields:
                    internal_field_dict[key_to_use] = choice
                else:
                    internal_field_dict[prepend + key_to_use] = choice

            result.update(internal_field_dict)

        return result