コード例 #1
0
    def __init__(self,
                 filename,
                 configFile="tcgaMAF2.4_output.config",
                 other_options=None):
        """
        TODO: Need functionality for not prepending the i_ on internal fields.
        """
        options = dict() if other_options is None else other_options

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)

        #TODO: Read missing options from the config file or specify that error should be thrown.
        self.options = options

        self._prepend = self.config.get("general", "prepend")
        if self.options.get(OptionConstants.NO_PREPEND, False):
            self._prepend = ""

        self.exposedColumns = set(
            self.config.get("general", "exposedColumns").split(','))

        self._is_entrez_id_message_logged = False
コード例 #2
0
    def testCreateDatasourceWithMissingValues(self):
        """

        """
        dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.missing.txt")
        destDir = "out"
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.missing.tabix_indexed.txt.gz"
        indexColumnNames = "CHROM,POS,POS"
        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        annotationColumnNames = "EA_GTC,DP"
        configFilename = os.path.join("out", "esp_coverage.missing.config")

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType, dataSourceName,
                                           dataSourceVersion, dataSourceMatchMode, annotationColumnNames,
                                           DatasourceInstallUtils.getIndexCols(dataSourceType, indexColumnNames))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))

        self.assertEqual(configParser.get("data_types", "EA_GTC"), "Float",
                         "Expected EA_GTC data type is %s but was %s."
                         % ("Float", configParser.get("data_types", "EA_GTC")))
        self.assertEqual(configParser.get("data_types", "DP"), "Integer",
                         "Expected DP data type is %s but was %s."
                         % ("Integer", configParser.get("data_types", "DP")))
コード例 #3
0
 def __init__(self, filename, configFile="tcgaVCF1.1_output.config", otherOptions=None):
     self._filename = filename
     self.logger = logging.getLogger(__name__)
     self.config = ConfigUtils.createConfigParser(configFile)
     self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
     self.seenDbSNPs = dict()
     self.fieldMap = {}
コード例 #4
0
    def __init__(self, filename, configFile="tcgaMAF2.4_output.config", other_options=None):
        """
        """
        options = dict() if other_options is None else other_options

        self._filename = filename
        self.logger = logging.getLogger(__name__)
        self.config = ConfigUtils.createConfigParser(configFile)

        self.logger.info("Building alternative keys dictionary...")
        self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)

        self.options = options

        self._prepend = self.config.get("general", "prepend")
        if self.options.get(OptionConstants.NO_PREPEND, False):
            self._prepend = ""

        # _is_reannotating is a flag to determine whether we should give precendence to annotations that were not
        #   annotated as part of the INPUT.
        self._is_reannotating = options.get(OptionConstants.REANNOTATE_TCGA_MAF_COLS, False)

        self._is_splitting_allelic_depth = self.options.get(OptionConstants.SPLIT_ALLELIC_DEPTH, True)

        self.exposedColumns = set(self.config.get("general", "exposedColumns").split(','))

        self._is_entrez_id_message_logged = False

        self._is_collapsing_number_cols = options.get(OptionConstants.COLLAPSE_NUMBER_ANNOTATIONS, False)
        self._column_collapser = None
        self._column_collapser_suffix = None
        if self._is_collapsing_number_cols:
            self._column_collapser = ColumnCollapser()
            self._column_collapser_suffix = "_full"
コード例 #5
0
 def __init__(self,
              mut,
              configFile="sample_name_selection.config",
              section="SAMPLE_NAME"):
     config = ConfigUtils.createConfigParser(configFile)
     self.logger = logging.getLogger(__name__)
     aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
         config, section)
     self.configFile = configFile
     sampleAnnotation = self._getAnnotationFromAliases(
         mut, aliases["sample_name"])
     tumorAnnotation = self._getAnnotationFromAliases(
         mut, aliases["sample_tumor_name"])
     normalAnnotation = self._getAnnotationFromAliases(
         mut, aliases["sample_normal_name"])
     source_column = self._getSourceColumn(sampleAnnotation,
                                           tumorAnnotation,
                                           normalAnnotation)
     self._logSampleNameColumnDescription(source_column, sampleAnnotation,
                                          tumorAnnotation, normalAnnotation)
     self.sampleNameGrabber = self._getSampleNameGrabber(
         source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)
     self.outputAnnotationName = self._deriveOutputAnnotationName(
         sampleAnnotation)
     self.annotationSource = self._deriveAnnotationSource(source_column)
コード例 #6
0
    def __init__(self,
                 filename,
                 mutation_data_factory=None,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Constructor

        """

        super(MafliteInputMutationCreator,
              self).__init__(filename, mutation_data_factory, configFile,
                             genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Key is the required columns and the values are a list of valid alternative headers.
        # Key is column name to an alternative.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(
            self._alternativeDict)

        missingRequiredHeaders = []
        required_columns = sorted(
            self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        self.logger.info(
            "Initializing a maflite file with the following header: " +
            str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        for col in required_columns:
            if col not in self._specified_fields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in self._specified_fields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " +
                ",".join(missingRequiredHeaders))
コード例 #7
0
    def testCreateGPTsvConfigFile(self):
        configFilename = "out/ccle_by_gp.config"
        datasourceFilename = "ccle_results_by_pos.hg19.import.txt"
        dataSourceType = "gp_tsv"
        dataSourceName = "CCLE_By_GP"
        dataSourceVersion = "09292010"
        genomicPositionColumnNames = "chr,start,end"

        datasourceBuilder = GenericTsvDatasourceCreator()
        datasourceBuilder._createConfigFile(
            configFilename=configFilename,
            baseDSFile=datasourceFilename,
            ds_name=dataSourceName,
            ds_type=dataSourceType,
            ds_version=dataSourceVersion,
            indexCols=DatasourceInstallUtils.getIndexCols(
                "gp_tsv", genomicPositionColumnNames))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"),
                        "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"),
                        "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"),
                        "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"),
                        "version option is missing in general section.")
        self.assertTrue(
            configParser.has_option("general", "genomic_position_cols"),
            "genomic_position_cols option is missing in general section.")

        self.assertEqual(
            configParser.get("general", "type"), dataSourceType,
            "Expected data source type is %s but was %s." %
            (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(
            configParser.get("general", "src_file"), datasourceFilename,
            "Expected data source src_file is %s but was %s." %
            (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(
            configParser.get("general", "title"), dataSourceName,
            "Expected data source title is %s but was %s." %
            (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(
            configParser.get("general", "version"), dataSourceVersion,
            "Expected data source version is %s but was %s." %
            (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(
            configParser.get("general", "genomic_position_cols"),
            genomicPositionColumnNames,
            "Expected data source genomic_position_cols is %s but was %s." %
            (genomicPositionColumnNames,
             configParser.get("general", "genomic_position_cols")))
コード例 #8
0
 def __init__(self,
              filename,
              configFile="tcgaVCF1.1_output.config",
              otherOptions=None):
     self._filename = filename
     self.logger = logging.getLogger(__name__)
     self.config = ConfigUtils.createConfigParser(configFile)
     self.alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
         self.config)
     self.seenDbSNPs = dict()
     self.fieldMap = {}
コード例 #9
0
    def test_intitialize(self):
        """Test a simple initialization of an ensembl datasource """
        base_config_location = "testdata/ensembl/saccer/"
        config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
        title = config_parser.get("general", "title")
        version = config_parser.get("general", "version")
        src_file = config_parser.get("general", "src_file")

        ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)
        self.assertIsNotNone(ensembl_ds)
        ensembl_ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        self.assertTrue(TranscriptProvider.TX_MODE_BEST_EFFECT == ensembl_ds.get_tx_mode())
    def test_intitialize(self):
        """Test a simple initialization of an ensembl datasource """
        base_config_location = "testdata/ensembl/saccer/"
        config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
        title = config_parser.get("general", "title")
        version = config_parser.get("general", "version")
        src_file = config_parser.get("general", "src_file")

        ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)
        self.assertIsNotNone(ensembl_ds)
        ensembl_ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
        self.assertTrue(TranscriptProvider.TX_MODE_BEST_EFFECT == ensembl_ds.get_tx_mode())
コード例 #11
0
    def getGeneTsvConfigFile(self):
        configFilename = "out/simple_uniprot.config"
        datasourceFilename = "simple_uniprot.out.2011_09.tsv"
        dataSourceType = "gene_tsv"
        dataSourceName = "UniProt"
        dataSourceVersion = "2011_09"
        geneColumnName = "gene"

        datasourceBuilder = GenericTsvDatasourceCreator()
        datasourceBuilder._createConfigFile(
            configFilename=configFilename,
            baseDSFile=datasourceFilename,
            ds_name=dataSourceName,
            ds_type=dataSourceType,
            ds_version=dataSourceVersion,
            indexCols=DatasourceInstallUtils.getIndexCols(
                "gene_tsv", geneColumnName))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"),
                        "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"),
                        "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"),
                        "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"),
                        "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "gene_col"),
                        "gene_col option is missing in general section.")

        self.assertEqual(
            configParser.get("general", "type"), dataSourceType,
            "Expected data source type is %s but was %s." %
            (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(
            configParser.get("general", "src_file"), datasourceFilename,
            "Expected data source src_file is %s but was %s." %
            (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(
            configParser.get("general", "title"), dataSourceName,
            "Expected data source title is %s but was %s." %
            (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(
            configParser.get("general", "version"), dataSourceVersion,
            "Expected data source version is %s but was %s." %
            (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(
            configParser.get("general", "gene_col"), geneColumnName,
            "Expected data source gene_col is %s but was %s." %
            (geneColumnName, configParser.get("general", "gene_col")))
コード例 #12
0
    def testCreateGPTsvDatasource(self):
        """


        """
        datasourceFilename = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
        datasourceType = "gp_tsv"
        datasourceName = "ORegAnno"
        datasourceFoldername = "ORegAnno"
        datasourceVersion = "UCSC Track"
        genomeBuild = "hg19"
        genomicPositionColumnNames = "hg19.oreganno.chrom,hg19.oreganno.chromStart,hg19.oreganno.chromEnd"

        tmpDir = tempfile.mkdtemp()
        destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername, datasourceName,
                                                 datasourceType, datasourceVersion, genomicPositionColumnNames)

        datasourceFilename = "oreganno_trim.hg19.txt"
        configFilename = os.path.join(*[destDir, "ORegAnno.config"])
        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "genomic_position_cols"),
                        "genomic_position_cols option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), datasourceType,
                         "Expected data source type is %s but was %s."
                         % (datasourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), datasourceName,
                         "Expected data source title is %s but was %s."
                         % (datasourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), datasourceVersion,
                         "Expected data source version is %s but was %s."
                         % (datasourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "genomic_position_cols"), genomicPositionColumnNames,
                         "Expected data source genomic_position_cols is %s but was %s."
                         % (genomicPositionColumnNames, configParser.get("general", "genomic_position_cols")))

        self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername, genomeBuild + ".md5"])),
                        "No md5 file was generated.")

        MutUtils.removeDir(tmpDir)
コード例 #13
0
 def __init__(self, mut, configFile="sample_name_selection.config", section="SAMPLE_NAME"):
     config = ConfigUtils.createConfigParser(configFile)
     self.logger = logging.getLogger(__name__)
     aliases = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config, section)
     self.configFile=configFile
     sampleAnnotation = self._getAnnotationFromAliases(mut, aliases["sample_name"])
     tumorAnnotation = self._getAnnotationFromAliases(mut, aliases["sample_tumor_name"])
     normalAnnotation = self._getAnnotationFromAliases(mut, aliases["sample_normal_name"])
     source_column = self._getSourceColumn(sampleAnnotation,tumorAnnotation,normalAnnotation)
     self._logSampleNameColumnDescription(source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)
     self.sampleNameGrabber = self._getSampleNameGrabber(source_column, sampleAnnotation, tumorAnnotation, normalAnnotation)
     self.outputAnnotationName = self._deriveOutputAnnotationName(sampleAnnotation)
     self.annotationSource = self._deriveAnnotationSource(source_column)
コード例 #14
0
    def __init__(self,
                 filename,
                 configFile='maflite_input.config',
                 genomeBuild="hg19",
                 other_options=None):
        """
        Constructor

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)

        # Key is the required columns and the values are a list of valid alternative headers.
        # Key is column name to an alternative.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(
            self._alternativeDict)

        missingRequiredHeaders = []
        specifiedFields = self._tsvReader.getFieldNames()
        required_columns = sorted(
            self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        for col in required_columns:
            if col not in specifiedFields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in specifiedFields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        self.logger.info(
            "Initializing a maflite file with the following header: " +
            str(self._tsvReader.getFieldNames()))
        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException(
                "Specified maflite file (" + filename +
                ") missing required headers: " +
                ",".join(missingRequiredHeaders))
コード例 #15
0
    def testCreateIndexedVcfDatasource(self):
        datasourceFilename = "testdata/vcf/example.vcf"
        datasourceFoldername = "1000Genomes"
        datasourceName = "1000Genomes"
        datasourceType = "indexed_vcf"
        datasourceVersion = "V4.1"
        genomeBuild = "hg19"
        tmpDir = tempfile.mkdtemp()
        destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername, datasourceName,
                                                 datasourceType, datasourceVersion)

        datasourceFilename = "example.tabix_indexed.vcf.gz"
        configFilename = os.path.join(*[destDir, "1000Genomes.config"])
        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), datasourceType,
                         "Expected data source type is %s but was %s."
                         % (datasourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), datasourceName,
                         "Expected data source title is %s but was %s."
                         % (datasourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), datasourceVersion,
                         "Expected data source version is %s but was %s."
                         % (datasourceVersion, configParser.get("general", "version")))

        self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername, genomeBuild + ".md5"])),
                        "No md5 file was generated.")

        # Data source was created correctly
        tabixIndexedFilename = os.path.join(*[destDir, "example.tabix_indexed.vcf.gz"])
        self.assertTrue(os.path.exists(tabixIndexedFilename), "No index file was generated.")

        vcfReader = vcf.Reader(filename=tabixIndexedFilename, compressed=True, strict_whitespace=True)
        vcfRecords = vcfReader.fetch(chrom=20, start=1230237, end=1230237)
        for vcfRecord in vcfRecords:
            self.assertEqual(vcfRecord.INFO["NS"], 3, "Expected %s but got %s." % (3, vcfRecord.INFO["NS"]))
            self.assertEqual(vcfRecord.INFO["DP"], 13, "Expected %s but got %s." % (13, vcfRecord.INFO["DP"]))

        MutUtils.removeDir(tmpDir)
コード例 #16
0
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.
        
        Note: This method has nothing to do with the TCGA validator.
        
        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0,
                        "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') <> -1,
                        "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(
                    lineDict['Hugo_Symbol'] == "Unknown",
                    "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: "
                    + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue(
                    '\r' not in lineDict[k],
                    "Carriage return character found in an annotation value.")

                configFile = ConfigUtils.createConfigParser(
                    'configs/tcgaMAF2.3_output.config')
                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(
                        k.startswith("i_"),
                        "Internal column was not prepended with 'i_'")

            unknownKeys.sort()
            self.assertTrue(
                len(unknownKeys) == 0, "__UNKNOWN__ values (" +
                str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
コード例 #17
0
    def _validateTcgaMafContents(self, filename):
        """ This is a utility, private method for unit tests to get a semblance that a valid maf file was created.  
        
        Note: This method has nothing to do with the TCGA validator.
        
        """
        configFile = ConfigUtils.createConfigParser(os.path.join("configs", "tcgaMAF2.4_output.config"))
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1, "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:

            # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
            # if lineDict['Entrez_Gene_Id'] == "0":
            #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            self.assertTrue(lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"], "Reference and alternate were equal in TCGA MAF output on line %d (%s)" % (ctr, lineDict["Tumor_Seq_Allele1"]))
            self.assertTrue(lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"], "Reference Allele should match Tumor_Seq_Allele1 on line " + str(ctr))
            uniprot_aa_xform_counter = 0
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                exposedColumns = configFile.get("general", "exposedColumns")
                if (k not in requiredColumns) and (k not in optionalColumns) and (k not in exposedColumns):
                    self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")
            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

            if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
                self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

            if lineDict["Variant_Type"] == VariantClassification.VT_INS:
                self.assertTrue(lineDict["Reference_Allele"] == "-")

            unknownKeys.sort()
            self.assertTrue(len(unknownKeys) == 0, "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(ctr) + ", in fields: " + ", ".join(unknownKeys))
            self.assertTrue(uniprot_aa_xform_counter < 10, "Too many uniprot aa xform values are zero (" + str(uniprot_aa_xform_counter) + ").  This is probably an error.")

            ctr += 1
コード例 #18
0
    def getConfigTable(self, configFilename, filename):
        """


        :return:
        """
        configParser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
        configTable = VcfInputConfigTable()
        vcfReader = vcf.Reader(filename=filename, strict_whitespace=True)

        self.createConfigTableKeys(configParser=configParser, configTable=configTable)
        self.createConfigTable(vcfReader=vcfReader, configTable=configTable)

        return configTable
コード例 #19
0
    def getConfigTable(self, configFilename, filename=None):
        """



        :param configFilename:
        :param filename:
        :return:
        """
        configParser = ConfigUtils.createConfigParser(configFilename, ignoreCase=False)
        configTable = VcfOutputConfigTable()
        self.createConfigTableKeys(configParser=configParser, configTable=configTable)

        return configTable
コード例 #20
0
    def getConfigTable(self, configFilename, filename=None):
        """



        :param configFilename:
        :param filename:
        :return:
        """
        configParser = ConfigUtils.createConfigParser(configFilename,
                                                      ignoreCase=False)
        configTable = VcfOutputConfigTable(configFilename)
        self.createConfigTableKeys(configParser=configParser,
                                   configTable=configTable)

        return configTable
    def test_simple_annotate(self):
        """ Annotate a simple example.
        """
        base_config_location = "testdata/ensembl/saccer/"
        config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
        title = config_parser.get("general", "title")
        version = config_parser.get("general", "version")
        src_file = config_parser.get("general", "src_file")

        ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)

        m = MutationData()
        m.chr = "22"
        m.start = "22161963"
        m.end = "22161963"
        m.ref_allele = "C"
        m.alt_allele = "A"

        m2 = ensembl_ds.annotate_mutation(m)
コード例 #22
0
    def __init__(self, filename, mutation_data_factory=None, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Constructor

        """

        super(MafliteInputMutationCreator, self).__init__(filename, mutation_data_factory, configFile, genomeBuild, other_options)

        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)
        
        # Key is the required columns and the values are a list of valid alternative headers.
        # Key is column name to an alternative.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)
        
        missingRequiredHeaders = []
        required_columns = sorted(self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))

        # The specified fields are those that were given in the input.
        self._specified_fields = self._tsvReader.getFieldNames()

        for col in required_columns:
            if col not in self._specified_fields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in self._specified_fields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()

        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders)  )
コード例 #23
0
    def test_simple_annotate(self):
        """ Annotate a simple example.
        """
        base_config_location = "testdata/ensembl/saccer/"
        config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
        title = config_parser.get("general", "title")
        version = config_parser.get("general", "version")
        src_file = config_parser.get("general", "src_file")

        ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)

        m = MutationData()
        m.chr = "22"
        m.start = "22161963"
        m.end = "22161963"
        m.ref_allele = "C"
        m.alt_allele = "A"

        m2 = ensembl_ds.annotate_mutation(m)
コード例 #24
0
    def _validateTcgaMafContents(self, filename):
        """
        This is a utility, private method for unit tests to get a semblance that a valid maf file was created.
        
        Note: This method has nothing to do with the TCGA validator.
        
        TODO: This is code duplication from TCGA MAF Output RendererTest.  This should be refactored into a base class
        (to preserve self.assertTrue, etc).
        """
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') <> -1, "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:
            if lineDict['Entrez_Gene_Id'] == "0":
                self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown",
                                "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue('\r' not in lineDict[k], "Carriage return character found in an annotation value.")

                configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.3_output.config')
                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                if (k not in requiredColumns) and (k not in optionalColumns):
                    self.assertTrue(k.startswith("i_"), "Internal column was not prepended with 'i_'")

            unknownKeys.sort()
            self.assertTrue(len(unknownKeys) == 0,
                            "__UNKNOWN__ values (" + str(len(unknownKeys)) + ") seen on line " + str(
                                ctr) + ", in fields: " + ", ".join(unknownKeys))

            ctr += 1
コード例 #25
0
    def __init__(self, filename, configFile='maflite_input.config', genomeBuild="hg19", other_options=None):
        """
        Constructor

        Currently, this InputCreator does not support any other options.  The parameter is ignored.

        """
        self.logger = logging.getLogger(__name__)

        self.config = ConfigUtils.createConfigParser(configFile)
        self._tsvReader = GenericTsvReader(filename)
        
        # Key is the required columns and the values are a list of valid alternative headers.
        # Key is column name to an alternative.
        self._alternativeDict = ConfigUtils.buildAlternateKeyDictionaryFromConfig(self.config)
        self._reverseAlternativeDict = ConfigUtils.buildReverseAlternativeDictionary(self._alternativeDict)
        
        missingRequiredHeaders = []
        specifiedFields = self._tsvReader.getFieldNames()
        required_columns = sorted(self.config.get("general", "required_headers").split(","))
        self._build = genomeBuild

        for col in required_columns:
            if col not in specifiedFields:
                isAltFound = False
                for alt in self._alternativeDict.get(col, []):
                    if alt in specifiedFields:
                        isAltFound = True
                        break
                if not isAltFound:

                    # build is optional.
                    if col != "build":
                        missingRequiredHeaders.append(col)
        missingRequiredHeaders.sort()
        
        self.logger.info("Initializing a maflite file with the following header: " + str(self._tsvReader.getFieldNames()))
        if len(missingRequiredHeaders) > 0:
            raise MafliteMissingRequiredHeaderException("Specified maflite file (" + filename + ") missing required headers: " + ",".join(missingRequiredHeaders)  )
コード例 #26
0
    def testCreateDatasourceFromGZFile(self):
        dsFile = os.path.join("testdata", "example.vcf.gz")
        destDir = "out"
        configFilename = os.path.join("out", "esp.config")
        datasourceFilename = "example.vcf.gz"
        dataSourceType = "indexed_vcf"
        datasourceMatchMode = "avg"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"

        datasourceBuilder = TabixIndexedVcfDatasourceCreator()
        datasourceBuilder.createDatasource(destDir, dsFile, configFilename, dataSourceType, dataSourceName,
                                           dataSourceVersion, datasourceMatchMode)

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "match_mode"),
                        "match_mode option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), dataSourceType,
                         "Expected data source type is %s but was %s."
                         % (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), dataSourceName,
                         "Expected data source title is %s but was %s."
                         % (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                         "Expected data source version is %s but was %s."
                         % (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "match_mode"), datasourceMatchMode,
                         "Expected data source match mode is %s but was %s."
                         % (datasourceMatchMode, configParser.get("general", "match_mode")))
コード例 #27
0
    def testCreateGPTsvConfigFile(self):
        configFilename = "out/ccle_by_gp.config"
        datasourceFilename = "ccle_results_by_pos.hg19.import.txt"
        dataSourceType = "gp_tsv"
        dataSourceName = "CCLE_By_GP"
        dataSourceVersion = "09292010"
        genomicPositionColumnNames = "chr,start,end"

        datasourceBuilder = GenericTsvDatasourceCreator()
        datasourceBuilder._createConfigFile(configFilename=configFilename, baseDSFile=datasourceFilename,
                                           ds_name=dataSourceName, ds_type=dataSourceType, ds_version=dataSourceVersion,
                                           indexCols=DatasourceInstallUtils.getIndexCols("gp_tsv",
                                                                                         genomicPositionColumnNames))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "genomic_position_cols"),
                        "genomic_position_cols option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), dataSourceType,
                         "Expected data source type is %s but was %s."
                         % (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), dataSourceName,
                         "Expected data source title is %s but was %s."
                         % (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                         "Expected data source version is %s but was %s."
                         % (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "genomic_position_cols"), genomicPositionColumnNames,
                         "Expected data source genomic_position_cols is %s but was %s."
                         % (genomicPositionColumnNames, configParser.get("general", "genomic_position_cols")))
コード例 #28
0
    def getGeneTsvConfigFile(self):
        configFilename = "out/simple_uniprot.config"
        datasourceFilename = "simple_uniprot.out.2011_09.tsv"
        dataSourceType = "gene_tsv"
        dataSourceName = "UniProt"
        dataSourceVersion = "2011_09"
        geneColumnName = "gene"

        datasourceBuilder = GenericTsvDatasourceCreator()
        datasourceBuilder._createConfigFile(configFilename=configFilename, baseDSFile=datasourceFilename,
                                           ds_name=dataSourceName, ds_type=dataSourceType, ds_version=dataSourceVersion,
                                           indexCols=DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumnName))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "gene_col"),
                        "gene_col option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), dataSourceType,
                         "Expected data source type is %s but was %s."
                         % (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), dataSourceName,
                         "Expected data source title is %s but was %s."
                         % (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                         "Expected data source version is %s but was %s."
                         % (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "gene_col"), geneColumnName,
                         "Expected data source gene_col is %s but was %s."
                         % (geneColumnName, configParser.get("general", "gene_col")))
コード例 #29
0
    def testInternalFieldsSkipPrepend(self):
        """ Test that no prepending of "i_" is honored."""
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationDataFactory.default_create()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")

        outputRenderer = TcgaMafOutputRenderer(
            outputFilename,
            configFile='configs/tcgaMAF2.4_output.config',
            other_options={OptionConstants.NO_PREPEND: True})
        outputRenderer.renderMutations(iter([m]), ['No comments'])

        configFile = ConfigUtils.createConfigParser(
            'configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue(
            "Hugo_Symbol" in requiredColumns,
            " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified."
        )

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers,
                        "Hugo_Symbol not found in output headers")
        self.assertTrue(
            "i_TEST" not in headers,
            "i_TEST was found in output headers when prepend was disabled.")
        self.assertTrue("TEST" in headers,
                        "TEST was not found in output headers.")
コード例 #30
0
    def testInternalFieldsSkipPrepend(self):
        """ Test that no prepending of "i_" is honored."""
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationDataFactory.default_create()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")

        outputRenderer = TcgaMafOutputRenderer(outputFilename, configFile='configs/tcgaMAF2.4_output.config', other_options={OptionConstants.NO_PREPEND:True})
        outputRenderer.renderMutations(iter([m]), ['No comments'])

        configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue("Hugo_Symbol" in requiredColumns, " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + outputFilename + ") is empty.")

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
        self.assertTrue("i_TEST" not in headers, "i_TEST was found in output headers when prepend was disabled.")
        self.assertTrue("TEST" in headers, "TEST was not found in output headers.")
コード例 #31
0
    def testInternalFields(self):
        """ Test that an annotation that is not listed explicitly in the required or optional columns is rendered with i_ prepended """
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationData()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")

        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")

        outputRenderer = TcgaMafOutputRenderer(
            outputFilename, configFile='configs/tcgaMAF2.4_output.config')
        outputRenderer.renderMutations(iter([m]), ['No comments'])

        configFile = ConfigUtils.createConfigParser(
            'configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue(
            "Hugo_Symbol" in requiredColumns,
            " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified."
        )

        statinfo = os.stat(outputFilename)
        self.assertTrue(
            statinfo.st_size > 0,
            "Generated MAF file (" + outputFilename + ") is empty.")

        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers,
                        "Hugo_Symbol not found in output headers")
        self.assertTrue(
            "TEST" not in headers,
            "TEST was found in output headers when it should have been renamed to i_TEST"
        )
        self.assertTrue("i_TEST" in headers,
                        "i_TEST not found in output headers")
コード例 #32
0
    def __init__(self, config_file="column_collapse.config"):
        self._config_parser = ConfigUtils.createConfigParser(config_file, ignoreCase=False)
        self._columns_to_collapse = self._config_parser.options("columns_to_collapse")

        # Create a lookup table to get the method for each column
        self._method_dict = dict()
        if len(self._columns_to_collapse) != len(set(self._columns_to_collapse)):
            logging.getLogger(__name__).warn("Duplicate keys in " + config_file + " seen.  Some collapsing may produce unexpected values.")

        for c in self._columns_to_collapse:
            self._method_dict[c] = self._config_parser.get("columns_to_collapse", c).strip()

            # Default to mean if not specified
            if self._method_dict[c] is None or self._method_dict[c] == "":
                self._method_dict[c] = ColumnCollapser.MEAN

        # Basic validation
        problematic_method_assignments = dict()
        for c in self._columns_to_collapse:
            if self._method_dict[c] not in ColumnCollapser.VALID_VALUES:
                problematic_method_assignments[c] = self._method_dict[c]

        if len(problematic_method_assignments.keys()) > 0:
            raise ValueError("Invalid column collapsing specified: " + str(problematic_method_assignments))
コード例 #33
0
    def testInternalFields(self):
        """ Test that an annotation that is not listed explicitly in the required or optional columns is rendered with i_ prepended """
        outputFilename = "out/testInternalFields_v2.4.maf.tsv"
        m = MutationData()
        m.createAnnotation("TEST", "THIS IS A TEST", "TESTING")
        
        # The next annotation is real and should not be considered internal.
        m.createAnnotation("gene", "EGFR")
        
        outputRenderer = TcgaMafOutputRenderer(outputFilename, configFile='configs/tcgaMAF2.4_output.config')
        outputRenderer.renderMutations(iter([m]), ['No comments'])
        
        configFile = ConfigUtils.createConfigParser('configs/tcgaMAF2.4_output.config')
        requiredColumns = configFile.get("general", "requiredColumns")
        self.assertTrue("Hugo_Symbol" in requiredColumns, " This test assumes that Hugo_Symbol is a required column in the TCGA MAF.  If not, the test must be modified.")

        statinfo = os.stat(outputFilename)
        self.assertTrue(statinfo.st_size > 0, "Generated MAF file (" + outputFilename + ") is empty.")
        
        tsvReader = GenericTsvReader(outputFilename)
        headers = tsvReader.getFieldNames()
        self.assertTrue("Hugo_Symbol" in headers, "Hugo_Symbol not found in output headers")
        self.assertTrue("TEST" not in headers, "TEST was found in output headers when it should have been renamed to i_TEST")
        self.assertTrue("i_TEST" in headers, "i_TEST not found in output headers")
コード例 #34
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        :param segments: iterable of MutationData
        :param metadata:
        :param comments:
        """

        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logging.getLogger(__name__).info("Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(config_parser)

        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        fp = file(self._filename, 'w')
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        annotations = None
        i = 0
        for i, seg in enumerate(segments):
            if annotations is None:
                annotations = seg.keys()
                field_mapping = MutUtils.createFieldsMapping(headers, annotations, self._alternativeDictionary, isRenderInternalFields=True, prepend="")

            gene_list = seg['genes'].split(",")
            for g in gene_list:
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg


        if i == 0:
            logging.getLogger(__name__).info("No segments given.  There will be no genes in the list.")

        writer = csv.DictWriter(fp, headers, delimiter="\t", lineterminator="\n", extrasaction="ignore")
        writer.writeheader()

        logging.getLogger(__name__).info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i,gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logging.getLogger(__name__).info("Rendered %d/%d genes ..." % ((i+1),num_genes))

        fp.close()
コード例 #35
0
    def _validateTcgaMafContents(self, filename):
        """ This is a utility, private method for unit tests to get a semblance that a valid maf file was created.  
        
        Note: This method has nothing to do with the TCGA validator.
        
        """
        configFile = ConfigUtils.createConfigParser(
            os.path.join("configs", "tcgaMAF2.4_output.config"))
        statinfo = os.stat(filename)
        self.assertTrue(statinfo.st_size > 0,
                        "Generated MAF file (" + filename + ") is empty.")

        tsvReader = GenericTsvReader(filename)

        self.assertTrue(tsvReader.getComments().find('#version') != -1,
                        "First line did not specify a version number")

        ctr = 1
        for lineDict in tsvReader:

            # TODO: Re-enable when GENCODE and HGNC datasources are concordant (or Entrez_Gene_ID is in the gencode gtf)
            # if lineDict['Entrez_Gene_Id'] == "0":
            #     self.assertTrue(lineDict['Hugo_Symbol'] == "Unknown", "Entrez_Gene_Id was zero, but Hugo Symbol was not 'Unknown'.  Line: " + str(ctr))

            unknownKeys = []
            self.assertTrue(
                lineDict["Tumor_Seq_Allele1"] != lineDict["Tumor_Seq_Allele2"],
                "Reference and alternate were equal in TCGA MAF output on line %d (%s)"
                % (ctr, lineDict["Tumor_Seq_Allele1"]))
            self.assertTrue(
                lineDict["Tumor_Seq_Allele1"] == lineDict["Reference_Allele"],
                "Reference Allele should match Tumor_Seq_Allele1 on line " +
                str(ctr))
            uniprot_aa_xform_counter = 0
            for k in lineDict.keys():
                if lineDict[k] == "__UNKNOWN__":
                    unknownKeys.append(k)

                self.assertTrue(
                    '\r' not in lineDict[k],
                    "Carriage return character found in an annotation value.")

                requiredColumns = configFile.get("general", "requiredColumns")
                optionalColumns = configFile.get("general", "optionalColumns")
                exposedColumns = configFile.get("general", "exposedColumns")
                if (k not in requiredColumns) and (
                        k not in optionalColumns) and (k
                                                       not in exposedColumns):
                    self.assertTrue(
                        k.startswith("i_"),
                        "Internal column was not prepended with 'i_'")
            if lineDict['UniProt_AApos'] == "0":
                uniprot_aa_xform_counter += 1

            if lineDict["Variant_Type"] == VariantClassification.VT_DEL:
                self.assertTrue(lineDict["Tumor_Seq_Allele2"] == "-")

            if lineDict["Variant_Type"] == VariantClassification.VT_INS:
                self.assertTrue(lineDict["Reference_Allele"] == "-")

            unknownKeys.sort()
            self.assertTrue(
                len(unknownKeys) == 0, "__UNKNOWN__ values (" +
                str(len(unknownKeys)) + ") seen on line " + str(ctr) +
                ", in fields: " + ", ".join(unknownKeys))
            self.assertTrue(
                uniprot_aa_xform_counter < 10,
                "Too many uniprot aa xform values are zero (" +
                str(uniprot_aa_xform_counter) +
                ").  This is probably an error.")

            ctr += 1
コード例 #36
0
    def testCreateDatasource(self):
        """

        """
        dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")

        # Never specify "out/"
        destDir = "out/create_ds_test/"

        if os.path.exists(destDir):
            shutil.rmtree(destDir)
        os.makedirs(destDir)
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        indexColumnNames = "CHROM,POS,POS,REF,ALT"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        configFilename = "out/esp_coverage.config"
        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        annotationColumnNames = "DBSNP,EA_GTC,DP"

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
                                           dataSourceName, dataSourceVersion, dataSourceMatchMode,
                                           annotationColumnNames, DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                                                                      indexColumnNames))

        self.assertTrue(os.path.exists(destDir + datasourceFilename))
        self.assertTrue(os.path.exists(destDir + datasourceFilename + ".tbi"))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "column_names"),
                        "column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "annotation_column_names"),
                        "annotation_column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "match_mode"),
                        "match_mode option is missing in general section")
        self.assertTrue(configParser.has_option("general", "index_column_names"),
                        "index_column_names option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), dataSourceType,
                         "Expected data source type is %s but was %s."
                         % (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), dataSourceName,
                         "Expected data source title is %s but was %s."
                         % (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                         "Expected data source version is %s but was %s."
                         % (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "column_names"), columnNames,
                         "Expected data source column names is %s but was %s."
                         % (columnNames, configParser.get("general", "column_names")))
        self.assertEqual(configParser.get("general", "annotation_column_names"), annotationColumnNames,
                         "Expected data source annotation column names is %s but was %s."
                         % (annotationColumnNames, configParser.get("general", "annotation_column_names")))
        self.assertEqual(configParser.get("general", "match_mode"), dataSourceMatchMode,
                         "Expected data source match mode is %s but was %s."
                         % (dataSourceMatchMode, configParser.get("general", "match_mode")))
        self.assertEqual(configParser.get("general", "index_column_names"), indexColumnNames,
                         "Expected data source index column names is %s but was %s."
                         % (indexColumnNames, configParser.get("general", "index_column_names")))

        self.assertEqual(configParser.get("data_types", "EA_GTC"), "String",
                         "Expected EA_GTC data type is %s but was %s."
                         % ("String", configParser.get("data_types", "EA_GTC")))
        self.assertEqual(configParser.get("data_types", "DP"), "Integer",
                         "Expected DP data type is %s but was %s."
                         % ("Integer", configParser.get("data_types", "DP")))

        ds = DatasourceFactory.createDatasourceFromConfigParser(configParser, "out/create_ds_test/")
        mut = MutationData(chr="1", start="69428", end="69428", ref_allele="T", alt_allele="G")
        mut2 = ds.annotate_mutation(mut)
        self.assertEquals(mut2["ESP_DBSNP"], "dbSNP_134")
        self.assertEquals(mut2["ESP_EA_GTC"], "92,129,3203")
        self.assertEquals(mut2["ESP_DP"], "110")
コード例 #37
0
ファイル: ConfigUtilTest.py プロジェクト: xingtech/oncotator
 def testLocalConfig(self):
     ''' Get a key from a local config and a config file that are the same basic name, but different values. '''
     config = ConfigUtils.createConfigParser("testdata/dummy_configs/dummy.config")
     self.assertTrue(config.get("general", "dummy1") == "Super")
     self.assertTrue(config.get("general", "dummy2") == "world")
コード例 #38
0
    def testCreateDatasource(self):
        """

        """
        dsFile = os.path.join("testdata",
                              "ESP6500SI-V2.chr1.snps_indels.head.25.txt")

        # Never specify "out/"
        destDir = "out/create_ds_test/"

        if os.path.exists(destDir):
            shutil.rmtree(destDir)
        os.makedirs(destDir)
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        indexColumnNames = "CHROM,POS,POS,REF,ALT"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        configFilename = "out/esp_coverage.config"
        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        annotationColumnNames = "DBSNP,EA_GTC,DP"

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        datasourceBuilder.createDatasource(
            destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
            dataSourceName, dataSourceVersion, dataSourceMatchMode,
            annotationColumnNames,
            DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                indexColumnNames))

        self.assertTrue(os.path.exists(destDir + datasourceFilename))
        self.assertTrue(os.path.exists(destDir + datasourceFilename + ".tbi"))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"),
                        "general section is missing.")
        self.assertTrue(configParser.has_section("data_types"),
                        "data_types section is missing.")
        self.assertTrue(configParser.has_option("general", "type"),
                        "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"),
                        "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"),
                        "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "column_names"),
                        "column_names option is missing in general section.")
        self.assertTrue(
            configParser.has_option("general", "annotation_column_names"),
            "annotation_column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "match_mode"),
                        "match_mode option is missing in general section")
        self.assertTrue(
            configParser.has_option("general", "index_column_names"),
            "index_column_names option is missing in general section.")

        self.assertEqual(
            configParser.get("general", "type"), dataSourceType,
            "Expected data source type is %s but was %s." %
            (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(
            configParser.get("general", "src_file"), datasourceFilename,
            "Expected data source src_file is %s but was %s." %
            (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(
            configParser.get("general", "title"), dataSourceName,
            "Expected data source title is %s but was %s." %
            (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(
            configParser.get("general", "version"), dataSourceVersion,
            "Expected data source version is %s but was %s." %
            (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(
            configParser.get("general", "column_names"), columnNames,
            "Expected data source column names is %s but was %s." %
            (columnNames, configParser.get("general", "column_names")))
        self.assertEqual(
            configParser.get("general",
                             "annotation_column_names"), annotationColumnNames,
            "Expected data source annotation column names is %s but was %s." %
            (annotationColumnNames,
             configParser.get("general", "annotation_column_names")))
        self.assertEqual(
            configParser.get("general", "match_mode"), dataSourceMatchMode,
            "Expected data source match mode is %s but was %s." %
            (dataSourceMatchMode, configParser.get("general", "match_mode")))
        self.assertEqual(
            configParser.get("general",
                             "index_column_names"), indexColumnNames,
            "Expected data source index column names is %s but was %s." %
            (indexColumnNames, configParser.get("general",
                                                "index_column_names")))

        self.assertEqual(
            configParser.get("data_types", "EA_GTC"), "String",
            "Expected EA_GTC data type is %s but was %s." %
            ("String", configParser.get("data_types", "EA_GTC")))
        self.assertEqual(
            configParser.get("data_types", "DP"), "Integer",
            "Expected DP data type is %s but was %s." %
            ("Integer", configParser.get("data_types", "DP")))

        ds = DatasourceFactory.createDatasourceFromConfigParser(
            configParser, "out/create_ds_test/")
        mut = MutationData(chr="1",
                           start="69428",
                           end="69428",
                           ref_allele="T",
                           alt_allele="G")
        mut2 = ds.annotate_mutation(mut)
        self.assertEquals(mut2["ESP_DBSNP"], "dbSNP_134")
        self.assertEquals(mut2["ESP_EA_GTC"], "92,129,3203")
        self.assertEquals(mut2["ESP_DP"], "110")
コード例 #39
0
    def testCreateConfigFile(self):
        """

        """
        configFilename = os.path.join("out", "esp_coverage.config")
        datasourceFilename = "ESP6500SI-V2.coverage.txt.gz"
        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        indexColumnNames = "Chromosome,Position,Position"
        columnNames = "Chromosome,Position,TotalSamplesCovered,AvgSampleReadDepth,TotalEAsamplesCovered,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"
        annotationColumnNames = "TotalSamplesCovered,AvgSampleReadDepth,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        datasourceBuilder._createConfigFile(configFilename=configFilename, baseDSFile=datasourceFilename,
                                            ds_type=dataSourceType, ds_name=dataSourceName,
                                            ds_version=dataSourceVersion, column_names=columnNames,
                                            annotation_column_names=annotationColumnNames,
                                            ds_match_mode=dataSourceMatchMode,
                                            indexCols=DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                                                          indexColumnNames))
        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "column_names"),
                        "column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "annotation_column_names"),
                        "annotation_column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "match_mode"),
                        "match_mode option is missing in general section")
        self.assertTrue(configParser.has_option("general", "index_column_names"),
                        "index_column_names option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), dataSourceType,
                         "Expected data source type is %s but was %s."
                         % (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), dataSourceName,
                         "Expected data source title is %s but was %s."
                         % (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                         "Expected data source version is %s but was %s."
                         % (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "column_names"), columnNames,
                         "Expected data source column names is %s but was %s."
                         % (columnNames, configParser.get("general", "column_names")))
        self.assertEqual(configParser.get("general", "annotation_column_names"), annotationColumnNames,
                         "Expected data source annotation column names is %s but was %s."
                         % (annotationColumnNames, configParser.get("general", "annotation_column_names")))
        self.assertEqual(configParser.get("general", "match_mode"), dataSourceMatchMode,
                         "Expected data source match mode is %s but was %s."
                         % (dataSourceMatchMode, configParser.get("general", "match_mode")))
        self.assertEqual(configParser.get("general", "index_column_names"), indexColumnNames,
                         "Expected data source index column names is %s but was %s."
                         % (indexColumnNames, configParser.get("general", "index_column_names")))
コード例 #40
0
    def testCreateDatasource(self):
        """

        """
        dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
        destDir = "out"
        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        indexColumnNames = "CHROM,POS,POS,REF,ALT"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        configFilename = "out/esp_coverage.config"
        dataSourceType = "indexed_tsv"
        dataSourceName = "ESP"
        dataSourceVersion = "6500SI-V2"
        dataSourceMatchMode = "overlap"
        annotationColumnNames = "DBSNP,EA_GTC,DP"

        datasourceBuilder = TabixIndexedTsvDatasourceCreator()
        datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
                                           dataSourceName, dataSourceVersion, dataSourceMatchMode,
                                           annotationColumnNames, DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                                                                      indexColumnNames))

        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "column_names"),
                        "column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "annotation_column_names"),
                        "annotation_column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "match_mode"),
                        "match_mode option is missing in general section")
        self.assertTrue(configParser.has_option("general", "index_column_names"),
                        "index_column_names option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), dataSourceType,
                         "Expected data source type is %s but was %s."
                         % (dataSourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), dataSourceName,
                         "Expected data source title is %s but was %s."
                         % (dataSourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), dataSourceVersion,
                         "Expected data source version is %s but was %s."
                         % (dataSourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "column_names"), columnNames,
                         "Expected data source column names is %s but was %s."
                         % (columnNames, configParser.get("general", "column_names")))
        self.assertEqual(configParser.get("general", "annotation_column_names"), annotationColumnNames,
                         "Expected data source annotation column names is %s but was %s."
                         % (annotationColumnNames, configParser.get("general", "annotation_column_names")))
        self.assertEqual(configParser.get("general", "match_mode"), dataSourceMatchMode,
                         "Expected data source match mode is %s but was %s."
                         % (dataSourceMatchMode, configParser.get("general", "match_mode")))
        self.assertEqual(configParser.get("general", "index_column_names"), indexColumnNames,
                         "Expected data source index column names is %s but was %s."
                         % (indexColumnNames, configParser.get("general", "index_column_names")))

        self.assertEqual(configParser.get("data_types", "EA_GTC"), "String",
                         "Expected EA_GTC data type is %s but was %s."
                         % ("String", configParser.get("data_types", "EA_GTC")))
        self.assertEqual(configParser.get("data_types", "DP"), "Integer",
                         "Expected DP data type is %s but was %s."
                         % ("Integer", configParser.get("data_types", "DP")))
コード例 #41
0
    def renderMutations(self, segments, metadata=None, comments=None):
        """Render segments into a gene list as described in the docs for this class.

        :param segments: iterable of MutationData
        :param metadata:
        :param comments:
        """

        config_parser = ConfigUtils.createConfigParser(self._config_file)

        logging.getLogger(__name__).info(
            "Building alternative keys dictionary...")
        self._alternativeDictionary = ConfigUtils.buildAlternateKeyDictionaryFromConfig(
            config_parser)

        if metadata is None:
            metadata = OrderedDict()

        if comments is None:
            comments = []

        fp = file(self._filename, 'w')
        for c in comments:
            fp.write("## " + c + "\n")

        # TODO: Define constant for "genes", and other annotations
        headers = config_parser.options("alternatives")
        gene_to_segment_dict = dict()
        annotations = None
        i = 0
        for i, seg in enumerate(segments):
            if annotations is None:
                annotations = seg.keys()
                field_mapping = FieldMapCreator.create_field_map(
                    headers,
                    seg,
                    self._alternativeDictionary,
                    is_render_internal_fields=True,
                    prepend="")

            gene_list = seg['genes'].split(",")
            for g in gene_list:
                if g == seg["start_gene"]:
                    gene_to_segment_dict[g + " " + seg["start_exon"]] = seg
                elif g == seg["end_gene"]:
                    gene_to_segment_dict[g + " " + seg["end_exon"]] = seg
                else:
                    gene_to_segment_dict[g] = seg

        if i == 0:
            logging.getLogger(__name__).info(
                "No segments given.  There will be no genes in the list.")

        writer = csv.DictWriter(fp,
                                headers,
                                delimiter="\t",
                                lineterminator="\n",
                                extrasaction="ignore")
        writer.writeheader()

        logging.getLogger(__name__).info("Rendering gene list...")
        all_genes_seen = sorted(gene_to_segment_dict.keys())
        num_genes = len(all_genes_seen)
        for i, gene in enumerate(all_genes_seen):
            # This next line may be slow...
            line_dict = dict()
            seg = gene_to_segment_dict[gene]
            for h in headers:
                annotation_field = field_mapping.get(h, h)
                line_dict[h] = seg.get(annotation_field, "")
            line_dict["gene"] = gene
            writer.writerow(line_dict)
            if i % 1000 == 0:
                logging.getLogger(__name__).info("Rendered %d/%d genes ..." %
                                                 ((i + 1), num_genes))

        fp.close()
コード例 #42
0
    def testCreateIndexedTsvDatasource(self):
        datasourceFilename = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
        datasourceFoldername = "1000Genomes"
        datasourceName = "1000Genomes"
        datasourceType = "indexed_tsv"
        datasourceVersion = "V4.1"
        genomeBuild = "hg19"
        indexColumnNames = "CHROM,POS,POS"
        columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
        annotationColumnNames = "DBSNP,EA_AC,AA_AC,TAC"

        tmpDir = tempfile.mkdtemp()
        destDir = os.path.join(*[tmpDir, datasourceFoldername, genomeBuild])
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir=destDir, ds_file=datasourceFilename,
                                                 ds_foldername=datasourceFoldername, ds_name=datasourceName,
                                                 ds_type=datasourceType, ds_version=datasourceVersion,
                                                 index_columns=indexColumnNames,
                                                 ds_annotation_columns=annotationColumnNames)

        datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
        configFilename = os.path.join(*[destDir, "1000Genomes.config"])
        configParser = ConfigUtils.createConfigParser(configFilename)
        self.assertTrue(configParser.has_section("general"), "general section is missing.")
        self.assertTrue(configParser.has_option("general", "type"), "type option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "src_file"),
                        "src_file option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "title"), "title option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "version"), "version option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "column_names"),
                        "column_names option is missing in general section.")
        self.assertTrue(configParser.has_option("general", "annotation_column_names"),
                        "annotation_column_names option is missing in general section.")

        self.assertEqual(configParser.get("general", "type"), datasourceType,
                         "Expected data source type is %s but was %s."
                         % (datasourceType, configParser.get("general", "type")))
        self.assertEqual(configParser.get("general", "src_file"), datasourceFilename,
                         "Expected data source src_file is %s but was %s."
                         % (datasourceFilename, configParser.get("general", "src_file")))
        self.assertEqual(configParser.get("general", "title"), datasourceName,
                         "Expected data source title is %s but was %s."
                         % (datasourceName, configParser.get("general", "title")))
        self.assertEqual(configParser.get("general", "version"), datasourceVersion,
                         "Expected data source version is %s but was %s."
                         % (datasourceVersion, configParser.get("general", "version")))
        self.assertEqual(configParser.get("general", "column_names"), columnNames,
                         "Expected data source column names is %s but was %s."
                         % (columnNames, configParser.get("general", "column_names")))
        self.assertEqual(configParser.get("general", "annotation_column_names"), annotationColumnNames,
                         "Expected data source annotation column names is %s but was %s."
                         % (annotationColumnNames, configParser.get("general", "annotation_column_names")))

        self.assertTrue(os.path.exists(os.path.join(*[tmpDir, datasourceFoldername, genomeBuild + ".md5"])),
                        "No md5 file was generated.")

        datasource = DatasourceFactory.createDatasource(configFilename, destDir)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "802177"
        m1.end = "802177"
        m1.ref_allele = "T"
        m1.alt_allele = "C"

        m1_annotated = datasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("1000Genomes_AA_AC")
        cur_annotation = Annotation(value="2,866", datasourceName="1000Genomes", dataType="String",
                                    description="",
                                    tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        annotationNames = ["1000Genomes_CHROM", "1000Genomes_POS", "1000Genomes_REF", "1000Genomes_ALT",
                           "1000Genomes_GWAS_PUBMED"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName not in m1_annotated, "m1_annotated was annotated with %s." % annotationName)

        annotationNames = ["1000Genomes_DBSNP", "1000Genomes_EA_AC", "1000Genomes_AA_AC", "1000Genomes_TAC"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName in m1_annotated, "m1_annotated was not annotated with %s value."
                                                            % annotationName)

        MutUtils.removeDir(tmpDir)