def createDatasource(tmpDir):
    """
    Assemble a datasource under a temporary directory, then copy it into the database directory.

    Options come from parseOptions() and are checked by validateArgs(). The datasource is
    built in <tmpDir>/<ds_foldername>/<genome_build>; afterwards <tmpDir>/<ds_foldername>
    is copied to <dbDir>/<ds_foldername>.

    :param tmpDir: temporary directory
    """
    options = parseOptions()
    validateArgs(options)

    # Read every option before anything is created on disk.
    indexColumns = options.index_columns
    name = options.name
    version = options.version
    dsType = options.ds_type
    folderName = options.ds_foldername
    databaseDir = options.dbDir
    genomeBuild = options.genome_build
    sourceFile = options.ds_file
    matchMode = options.match_mode
    # Parameters for indexed_tsv only.
    annotationColumns = options.annotation_columns

    # Staging directory for this genome build inside the temporary directory.
    stagingDir = os.path.join(tmpDir, folderName, genomeBuild)
    os.makedirs(stagingDir)

    # File logging goes through basicConfig; a stream handler with the same format is
    # attached to the root logger so the console shows what the log file receives.
    logFormat = '%(asctime)s %(levelname)s [%(name)s:%(lineno)d] %(message)s'
    logging.basicConfig(filename="initializeDatasource.log", level=logging.INFO, format=logFormat)
    consoleHandler = logging.StreamHandler()
    consoleHandler.setLevel(logging.INFO)
    consoleHandler.setFormatter(logging.Formatter(logFormat))
    logging.getLogger('').addHandler(consoleHandler)

    # Build the datasource inside the genome build directory.
    DatasourceInstallUtils.create_datasource(
        stagingDir, sourceFile, folderName, name, dsType, version,
        indexColumns, annotationColumns, matchMode)
    log = logging.getLogger(__name__)
    configStem = os.path.join(stagingDir, folderName)
    log.info("Config file created: " + configStem + ".config")

    # Last step: copy the staged datasource folder to its final location.
    installedDir = os.path.join(databaseDir, folderName)
    shutil.copytree(src=os.path.join(tmpDir, folderName), dst=installedDir, symlinks=True)
    log.info("Datasource copied from temp location to " + installedDir)
def testCreateGPTsvDatasource(self):
    """
    Create a gp_tsv datasource in a fresh temporary directory and verify the
    generated config file and the md5 file.

    The temporary directory is removed in a ``finally`` clause so that a failing
    assertion (or a failure inside create_datasource) does not leave it behind.
    """
    datasourceFilename = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
    datasourceType = "gp_tsv"
    datasourceName = "ORegAnno"
    datasourceFoldername = "ORegAnno"
    datasourceVersion = "UCSC Track"
    genomeBuild = "hg19"
    genomicPositionColumnNames = "hg19.oreganno.chrom,hg19.oreganno.chromStart,hg19.oreganno.chromEnd"

    tmpDir = tempfile.mkdtemp()
    try:
        destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername,
                                                 datasourceName, datasourceType, datasourceVersion,
                                                 genomicPositionColumnNames)

        # The config is expected to reference the file by its base name only.
        expectedSrcFile = "oreganno_trim.hg19.txt"
        configFilename = os.path.join(destDir, "ORegAnno.config")
        configParser = ConfigUtils.createConfigParser(configFilename)

        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        expectedValues = (
            ("type", datasourceType),
            ("src_file", expectedSrcFile),
            ("title", datasourceName),
            ("version", datasourceVersion),
            ("genomic_position_cols", genomicPositionColumnNames),
        )
        # Presence of every option is checked before any value is compared.
        for option, _ in expectedValues:
            self.assertTrue(configParser.has_option("general", option),
                            "%s option is missing in general section." % option)
        for option, expected in expectedValues:
            actual = configParser.get("general", option)
            self.assertEqual(actual, expected,
                             "Expected data source %s is %s but was %s." % (option, expected, actual))

        md5Filename = os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")
        self.assertTrue(os.path.exists(md5Filename), "No md5 file was generated.")
    finally:
        MutUtils.removeDir(tmpDir)
def testCreateIndexedVcfDatasource(self):
    """
    Create an indexed_vcf datasource in a fresh temporary directory, verify the
    generated config and md5 files, and read the tabix-indexed output back with
    vcf.Reader.

    The temporary directory is removed in a ``finally`` clause so that a failing
    assertion (or a failure inside create_datasource) does not leave it behind.
    """
    datasourceFilename = "testdata/vcf/example.vcf"
    datasourceFoldername = "1000Genomes"
    datasourceName = "1000Genomes"
    datasourceType = "indexed_vcf"
    datasourceVersion = "V4.1"
    genomeBuild = "hg19"

    tmpDir = tempfile.mkdtemp()
    try:
        destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir, datasourceFilename, datasourceFoldername,
                                                 datasourceName, datasourceType, datasourceVersion)

        # Name the config is expected to reference, and the file expected inside destDir.
        expectedSrcFile = "example.tabix_indexed.vcf.gz"
        configFilename = os.path.join(destDir, "1000Genomes.config")
        configParser = ConfigUtils.createConfigParser(configFilename)

        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        expectedValues = (
            ("type", datasourceType),
            ("src_file", expectedSrcFile),
            ("title", datasourceName),
            ("version", datasourceVersion),
        )
        # Presence of every option is checked before any value is compared.
        for option, _ in expectedValues:
            self.assertTrue(configParser.has_option("general", option),
                            "%s option is missing in general section." % option)
        for option, expected in expectedValues:
            actual = configParser.get("general", option)
            self.assertEqual(actual, expected,
                             "Expected data source %s is %s but was %s." % (option, expected, actual))

        md5Filename = os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")
        self.assertTrue(os.path.exists(md5Filename), "No md5 file was generated.")

        # Data source was created correctly
        tabixIndexedFilename = os.path.join(destDir, expectedSrcFile)
        self.assertTrue(os.path.exists(tabixIndexedFilename), "No index file was generated.")

        vcfReader = vcf.Reader(filename=tabixIndexedFilename, compressed=True, strict_whitespace=True)
        vcfRecords = vcfReader.fetch(chrom=20, start=1230237, end=1230237)
        # NOTE(review): if fetch yields no records this loop body never runs and the
        # INFO checks are skipped - confirm the region is expected to be non-empty.
        for vcfRecord in vcfRecords:
            self.assertEqual(vcfRecord.INFO["NS"], 3, "Expected %s but got %s." % (3, vcfRecord.INFO["NS"]))
            self.assertEqual(vcfRecord.INFO["DP"], 13, "Expected %s but got %s." % (13, vcfRecord.INFO["DP"]))
    finally:
        MutUtils.removeDir(tmpDir)
def testCreateDatasourceWithMissingValues(self):
    """
    Build an indexed_tsv datasource from the "missing" ESP test file and check the
    src_file entry plus the EA_GTC and DP entries of the data_types section.
    """
    inputFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.missing.txt")
    outputDir = "out"
    expectedSrcFile = "ESP6500SI-V2.chr1.snps_indels.head.25.missing.tabix_indexed.txt.gz"
    indexColumns = "CHROM,POS,POS"
    dsType = "indexed_tsv"
    dsName = "ESP"
    dsVersion = "6500SI-V2"
    matchMode = "overlap"
    annotationColumns = "EA_GTC,DP"
    configPath = os.path.join("out", "esp_coverage.missing.config")

    creator = TabixIndexedTsvDatasourceCreator()
    creator.createDatasource(
        outputDir, inputFile, indexColumns, configPath, dsType, dsName, dsVersion, matchMode,
        annotationColumns, DatasourceInstallUtils.getIndexCols(dsType, indexColumns))

    parser = ConfigUtils.createConfigParser(configPath)

    # (section, option, expected value, failure message template)
    valueChecks = (
        ("general", "src_file", expectedSrcFile, "Expected data source src_file is %s but was %s."),
        ("data_types", "EA_GTC", "Float", "Expected EA_GTC data type is %s but was %s."),
        ("data_types", "DP", "Integer", "Expected DP data type is %s but was %s."),
    )
    for section, option, expected, template in valueChecks:
        self.assertEqual(parser.get(section, option), expected,
                         template % (expected, parser.get(section, option)))
def createDatasource(tmpDir):
    """
    Create a datasource in a scratch location and install it into the database directory.

    The options returned by parseOptions() are validated with validateArgs(). Work happens
    in <tmpDir>/<ds_foldername>/<genome_build>; once the datasource exists, the whole
    <tmpDir>/<ds_foldername> folder is copied to <dbDir>/<ds_foldername>.

    :param tmpDir: temporary directory
    """
    parsed = parseOptions()
    validateArgs(parsed)

    # All options are fetched here, ahead of any file system work.
    idxCols = parsed.index_columns
    title = parsed.name
    release = parsed.version
    kind = parsed.ds_type
    folder = parsed.ds_foldername
    dbRoot = parsed.dbDir
    build = parsed.genome_build
    inputFile = parsed.ds_file
    matchMode = parsed.match_mode
    # Parameters for indexed_tsv only.
    annotationCols = parsed.annotation_columns

    # Create appropriate subdirectory in tmp dir.
    workDir = os.path.join(tmpDir, folder, build)
    os.makedirs(workDir)

    # Log to a file, and mirror the same records on the console through the root logger.
    fmt = '%(asctime)s %(levelname)s [%(name)s:%(lineno)d] %(message)s'
    logging.basicConfig(filename="initializeDatasource.log", level=logging.INFO, format=fmt)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter(fmt))
    logging.getLogger('').addHandler(console)

    DatasourceInstallUtils.create_datasource(workDir, inputFile, folder, title, kind, release,
                                             idxCols, annotationCols, matchMode)
    moduleLogger = logging.getLogger(__name__)
    moduleLogger.info("Config file created: " + os.path.join(workDir, folder) + ".config")

    # Last step: Copy the directory to the destination dbDir.
    sourceTree = os.path.join(tmpDir, folder)
    targetTree = os.path.join(dbRoot, folder)
    shutil.copytree(sourceTree, targetTree, symlinks=True)
    moduleLogger.info("Datasource copied from temp location to " + targetTree)
def testCreateGPTsvConfigFile(self):
    """
    Write a gp_tsv config file with GenericTsvDatasourceCreator and check that its
    general section holds the expected options and values.
    """
    configPath = "out/ccle_by_gp.config"
    srcFile = "ccle_results_by_pos.hg19.import.txt"
    dsType = "gp_tsv"
    dsName = "CCLE_By_GP"
    dsVersion = "09292010"
    positionColumns = "chr,start,end"

    creator = GenericTsvDatasourceCreator()
    creator._createConfigFile(
        configFilename=configPath,
        baseDSFile=srcFile,
        ds_name=dsName,
        ds_type=dsType,
        ds_version=dsVersion,
        indexCols=DatasourceInstallUtils.getIndexCols("gp_tsv", positionColumns))

    parser = ConfigUtils.createConfigParser(configPath)
    self.assertTrue(parser.has_section("general"), "general section is missing.")

    # Every option has to be present before any value is compared.
    presenceChecks = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("genomic_position_cols", "genomic_position_cols option is missing in general section."),
    )
    for option, message in presenceChecks:
        self.assertTrue(parser.has_option("general", option), message)

    # (option, expected value, failure message template)
    valueChecks = (
        ("type", dsType, "Expected data source type is %s but was %s."),
        ("src_file", srcFile, "Expected data source src_file is %s but was %s."),
        ("title", dsName, "Expected data source title is %s but was %s."),
        ("version", dsVersion, "Expected data source version is %s but was %s."),
        ("genomic_position_cols", positionColumns,
         "Expected data source genomic_position_cols is %s but was %s."),
    )
    for option, expected, template in valueChecks:
        self.assertEqual(parser.get("general", option), expected,
                         template % (expected, parser.get("general", option)))
def getGeneTsvConfigFile(self):
    """
    Write a gene_tsv config file with GenericTsvDatasourceCreator and check that its
    general section holds the expected options and values.

    NOTE(review): the name does not start with "test", so default unittest discovery
    presumably does not run it - confirm whether that is intentional.
    """
    configPath = "out/simple_uniprot.config"
    srcFile = "simple_uniprot.out.2011_09.tsv"
    dsType = "gene_tsv"
    dsName = "UniProt"
    dsVersion = "2011_09"
    geneColumn = "gene"

    creator = GenericTsvDatasourceCreator()
    creator._createConfigFile(
        configFilename=configPath,
        baseDSFile=srcFile,
        ds_name=dsName,
        ds_type=dsType,
        ds_version=dsVersion,
        indexCols=DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumn))

    parser = ConfigUtils.createConfigParser(configPath)
    self.assertTrue(parser.has_section("general"), "general section is missing.")

    # Every option has to be present before any value is compared.
    presenceChecks = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("gene_col", "gene_col option is missing in general section."),
    )
    for option, message in presenceChecks:
        self.assertTrue(parser.has_option("general", option), message)

    # (option, expected value, failure message template)
    valueChecks = (
        ("type", dsType, "Expected data source type is %s but was %s."),
        ("src_file", srcFile, "Expected data source src_file is %s but was %s."),
        ("title", dsName, "Expected data source title is %s but was %s."),
        ("version", dsVersion, "Expected data source version is %s but was %s."),
        ("gene_col", geneColumn, "Expected data source gene_col is %s but was %s."),
    )
    for option, expected, template in valueChecks:
        self.assertEqual(parser.get("general", option), expected,
                         template % (expected, parser.get("general", option)))
def testCreateDatasourceWithMissingAnnotationColumns(self):
    """
    Creating an indexed_tsv datasource with the annotation columns
    "EA_GTC,DP,ESP_DBSNP" is expected to raise ValueError.
    """
    inputFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.missing.txt")
    outputDir = "out"
    indexColumns = "CHROM,POS,POS"
    dsType = "indexed_tsv"
    dsName = "ESP"
    dsVersion = "6500SI-V2"
    matchMode = "overlap"
    annotationColumns = "EA_GTC,DP,ESP_DBSNP"
    configPath = os.path.join("out", "esp_coverage.missing_annotation_cols.config")

    creator = TabixIndexedTsvDatasourceCreator()
    # The index column lookup stays inside the context manager, as in the call it feeds.
    with self.assertRaises(ValueError):
        creator.createDatasource(
            outputDir, inputFile, indexColumns, configPath, dsType, dsName, dsVersion, matchMode,
            annotationColumns, DatasourceInstallUtils.getIndexCols(dsType, indexColumns))
def testCreateGPTsvConfigFile(self):
    """
    Generate a gp_tsv config file through GenericTsvDatasourceCreator._createConfigFile
    and verify the options and values stored in its general section.
    """
    configPath = "out/ccle_by_gp.config"
    baseFile = "ccle_results_by_pos.hg19.import.txt"
    sourceType = "gp_tsv"
    sourceName = "CCLE_By_GP"
    sourceVersion = "09292010"
    gpColumns = "chr,start,end"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(
        configFilename=configPath, baseDSFile=baseFile, ds_name=sourceName, ds_type=sourceType,
        ds_version=sourceVersion, indexCols=DatasourceInstallUtils.getIndexCols("gp_tsv", gpColumns))

    config = ConfigUtils.createConfigParser(configPath)
    self.assertTrue(config.has_section("general"), "general section is missing.")

    # Option presence is asserted first, in the listed order.
    requiredOptions = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("genomic_position_cols", "genomic_position_cols option is missing in general section."),
    )
    for optionName, failureText in requiredOptions:
        self.assertTrue(config.has_option("general", optionName), failureText)

    # (option, expected value, failure message template)
    expectedSettings = (
        ("type", sourceType, "Expected data source type is %s but was %s."),
        ("src_file", baseFile, "Expected data source src_file is %s but was %s."),
        ("title", sourceName, "Expected data source title is %s but was %s."),
        ("version", sourceVersion, "Expected data source version is %s but was %s."),
        ("genomic_position_cols", gpColumns,
         "Expected data source genomic_position_cols is %s but was %s."),
    )
    for optionName, wanted, template in expectedSettings:
        self.assertEqual(config.get("general", optionName), wanted,
                         template % (wanted, config.get("general", optionName)))
def testCreateDatasourceWithMissingAnnotationColumns(self):
    """
    Creating an indexed_tsv datasource must raise ValueError when the requested
    annotation columns are "EA_GTC,DP,ESP_DBSNP".

    ESP_DBSNP is presumably the column missing from the input header - confirm
    against the test data. The previous ``try/except ValueError: pass`` form
    passed whether or not the error was raised; assertRaises makes the
    expectation enforceable.
    """
    dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
    destDir = "out"
    indexColumnNames = "CHROM,POS,POS"
    dataSourceType = "indexed_tsv"
    dataSourceName = "ESP"
    dataSourceVersion = "6500SI-V2"
    dataSourceMatchMode = "overlap"
    annotationColumnNames = "EA_GTC,DP,ESP_DBSNP"
    configFilename = os.path.join("out", "esp_coverage.missing.config")

    datasourceBuilder = TabixIndexedTsvDatasourceCreator()
    # getIndexCols stays inside the context manager so the guarded region matches
    # what the original try block covered.
    with self.assertRaises(ValueError):
        datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename,
                                           dataSourceType, dataSourceName, dataSourceVersion,
                                           dataSourceMatchMode, annotationColumnNames,
                                           DatasourceInstallUtils.getIndexCols(dataSourceType,
                                                                               indexColumnNames))
def testCreateDatasourceWithMissingColumns(self):
    """
    Run indexed_tsv datasource creation on the ESP test file, tolerating an
    InputMismatchException.

    NOTE(review): this test passes both when InputMismatchException is raised and
    when no exception is raised at all - confirm which outcome is actually expected.
    """
    inputFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
    outputDir = "out"
    indexColumns = "CHROM,POS,POS"
    dsType = "indexed_tsv"
    dsName = "ESP"
    dsVersion = "6500SI-V2"
    matchMode = "overlap"
    annotationColumns = "EA_GTC,DP"
    configPath = os.path.join("out", "esp_coverage.missing.config")

    creator = TabixIndexedTsvDatasourceCreator()
    try:
        creator.createDatasource(
            outputDir, inputFile, indexColumns, configPath, dsType, dsName, dsVersion, matchMode,
            annotationColumns, DatasourceInstallUtils.getIndexCols(dsType, indexColumns))
    except InputMismatchException:
        # Deliberately ignored; see the review note in the docstring.
        pass
def getGeneTsvConfigFile(self):
    """
    Generate a gene_tsv config file through GenericTsvDatasourceCreator._createConfigFile
    and verify the options and values stored in its general section.

    NOTE(review): the name does not start with "test", so default unittest discovery
    presumably does not run it - confirm whether that is intentional.
    """
    configPath = "out/simple_uniprot.config"
    baseFile = "simple_uniprot.out.2011_09.tsv"
    sourceType = "gene_tsv"
    sourceName = "UniProt"
    sourceVersion = "2011_09"
    geneColumn = "gene"

    builder = GenericTsvDatasourceCreator()
    builder._createConfigFile(
        configFilename=configPath, baseDSFile=baseFile, ds_name=sourceName, ds_type=sourceType,
        ds_version=sourceVersion, indexCols=DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumn))

    config = ConfigUtils.createConfigParser(configPath)
    self.assertTrue(config.has_section("general"), "general section is missing.")

    # Option presence is asserted first, in the listed order.
    requiredOptions = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("gene_col", "gene_col option is missing in general section."),
    )
    for optionName, failureText in requiredOptions:
        self.assertTrue(config.has_option("general", optionName), failureText)

    # (option, expected value, failure message template)
    expectedSettings = (
        ("type", sourceType, "Expected data source type is %s but was %s."),
        ("src_file", baseFile, "Expected data source src_file is %s but was %s."),
        ("title", sourceName, "Expected data source title is %s but was %s."),
        ("version", sourceVersion, "Expected data source version is %s but was %s."),
        ("gene_col", geneColumn, "Expected data source gene_col is %s but was %s."),
    )
    for optionName, wanted, template in expectedSettings:
        self.assertEqual(config.get("general", optionName), wanted,
                         template % (wanted, config.get("general", optionName)))
def main():
    """
    Read ``ds_dir`` from the options returned by parseOptions() and pass it to
    DatasourceInstallUtils.create_datasource_md5_file.
    """
    datasourceDir = parseOptions().ds_dir
    DatasourceInstallUtils.create_datasource_md5_file(datasourceDir)
def testCreateDatasource(self):
    """
    Build an indexed_tsv datasource in a dedicated output directory, verify the
    output files and the generated config, then annotate one mutation with the
    resulting datasource.

    Uses assertEqual throughout; the assertEquals alias is deprecated and has been
    removed from recent Python versions.
    """
    dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")

    # destDir is wiped below, so never specify the shared "out/" directory here.
    destDir = "out/create_ds_test/"
    if os.path.exists(destDir):
        shutil.rmtree(destDir)
    os.makedirs(destDir)

    datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    indexColumnNames = "CHROM,POS,POS,REF,ALT"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    configFilename = "out/esp_coverage.config"
    dataSourceType = "indexed_tsv"
    dataSourceName = "ESP"
    dataSourceVersion = "6500SI-V2"
    dataSourceMatchMode = "overlap"
    annotationColumnNames = "DBSNP,EA_GTC,DP"

    datasourceBuilder = TabixIndexedTsvDatasourceCreator()
    datasourceBuilder.createDatasource(
        destDir, dsFile, indexColumnNames, configFilename, dataSourceType, dataSourceName,
        dataSourceVersion, dataSourceMatchMode, annotationColumnNames,
        DatasourceInstallUtils.getIndexCols(dataSourceType, indexColumnNames))

    # Both the data file and its ".tbi" companion must exist in destDir.
    outputFilename = os.path.join(destDir, datasourceFilename)
    self.assertTrue(os.path.exists(outputFilename))
    self.assertTrue(os.path.exists(outputFilename + ".tbi"))

    configParser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(configParser.has_section("general"), "general section is missing.")
    self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")

    # Presence of every option is checked before any value is compared.
    requiredOptions = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("column_names", "column_names option is missing in general section."),
        ("annotation_column_names", "annotation_column_names option is missing in general section."),
        ("match_mode", "match_mode option is missing in general section"),
        ("index_column_names", "index_column_names option is missing in general section."),
    )
    for option, message in requiredOptions:
        self.assertTrue(configParser.has_option("general", option), message)

    # (section, option, expected value, failure message template)
    expectedValues = (
        ("general", "type", dataSourceType, "Expected data source type is %s but was %s."),
        ("general", "src_file", datasourceFilename, "Expected data source src_file is %s but was %s."),
        ("general", "title", dataSourceName, "Expected data source title is %s but was %s."),
        ("general", "version", dataSourceVersion, "Expected data source version is %s but was %s."),
        ("general", "column_names", columnNames,
         "Expected data source column names is %s but was %s."),
        ("general", "annotation_column_names", annotationColumnNames,
         "Expected data source annotation column names is %s but was %s."),
        ("general", "match_mode", dataSourceMatchMode,
         "Expected data source match mode is %s but was %s."),
        ("general", "index_column_names", indexColumnNames,
         "Expected data source index column names is %s but was %s."),
        ("data_types", "EA_GTC", "String", "Expected EA_GTC data type is %s but was %s."),
        ("data_types", "DP", "Integer", "Expected DP data type is %s but was %s."),
    )
    for section, option, expected, template in expectedValues:
        actual = configParser.get(section, option)
        self.assertEqual(actual, expected, template % (expected, actual))

    # Load the datasource that was just written and annotate a single mutation with it.
    ds = DatasourceFactory.createDatasourceFromConfigParser(configParser, destDir)
    mut = MutationData(chr="1", start="69428", end="69428", ref_allele="T", alt_allele="G")
    mut2 = ds.annotate_mutation(mut)
    self.assertEqual(mut2["ESP_DBSNP"], "dbSNP_134")
    self.assertEqual(mut2["ESP_EA_GTC"], "92,129,3203")
    self.assertEqual(mut2["ESP_DP"], "110")
def testCreateDatasource(self):
    """
    Build an indexed_tsv datasource into "out" and verify the sections, options and
    values of the generated config file.
    """
    inputFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")
    outputDir = "out"
    expectedSrcFile = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    indexColumns = "CHROM,POS,POS,REF,ALT"
    allColumns = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    configPath = "out/esp_coverage.config"
    dsType = "indexed_tsv"
    dsName = "ESP"
    dsVersion = "6500SI-V2"
    matchMode = "overlap"
    annotationColumns = "DBSNP,EA_GTC,DP"

    creator = TabixIndexedTsvDatasourceCreator()
    creator.createDatasource(
        outputDir, inputFile, indexColumns, configPath, dsType, dsName, dsVersion, matchMode,
        annotationColumns, DatasourceInstallUtils.getIndexCols(dsType, indexColumns))

    parser = ConfigUtils.createConfigParser(configPath)
    self.assertTrue(parser.has_section("general"), "general section is missing.")
    self.assertTrue(parser.has_section("data_types"), "data_types section is missing.")

    # Every option has to be present before any value is compared.
    presenceChecks = (
        ("type", "type option is missing in general section."),
        ("src_file", "src_file option is missing in general section."),
        ("title", "title option is missing in general section."),
        ("version", "version option is missing in general section."),
        ("column_names", "column_names option is missing in general section."),
        ("annotation_column_names", "annotation_column_names option is missing in general section."),
        ("match_mode", "match_mode option is missing in general section"),
        ("index_column_names", "index_column_names option is missing in general section."),
    )
    for option, message in presenceChecks:
        self.assertTrue(parser.has_option("general", option), message)

    # (section, option, expected value, failure message template)
    valueChecks = (
        ("general", "type", dsType, "Expected data source type is %s but was %s."),
        ("general", "src_file", expectedSrcFile, "Expected data source src_file is %s but was %s."),
        ("general", "title", dsName, "Expected data source title is %s but was %s."),
        ("general", "version", dsVersion, "Expected data source version is %s but was %s."),
        ("general", "column_names", allColumns,
         "Expected data source column names is %s but was %s."),
        ("general", "annotation_column_names", annotationColumns,
         "Expected data source annotation column names is %s but was %s."),
        ("general", "match_mode", matchMode,
         "Expected data source match mode is %s but was %s."),
        ("general", "index_column_names", indexColumns,
         "Expected data source index column names is %s but was %s."),
        ("data_types", "EA_GTC", "String", "Expected EA_GTC data type is %s but was %s."),
        ("data_types", "DP", "Integer", "Expected DP data type is %s but was %s."),
    )
    for section, option, expected, template in valueChecks:
        self.assertEqual(parser.get(section, option), expected,
                         template % (expected, parser.get(section, option)))
def testCreateConfigFile(self):
    """Write an indexed_tsv config with _createConfigFile and read it back.

    The config is written to out/esp_coverage.config, parsed again through
    ConfigUtils.createConfigParser, and checked for the "general" and
    "data_types" sections and for every "general" option matching the
    value that was handed to _createConfigFile.
    """
    config_path = os.path.join("out", "esp_coverage.config")
    src_file = "ESP6500SI-V2.coverage.txt.gz"
    ds_type = "indexed_tsv"
    ds_title = "ESP"
    ds_version = "6500SI-V2"
    match_mode = "overlap"
    index_cols = "Chromosome,Position,Position"
    all_cols = "Chromosome,Position,TotalSamplesCovered,AvgSampleReadDepth,TotalEAsamplesCovered,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"
    annotation_cols = "TotalSamplesCovered,AvgSampleReadDepth,AvgEAsampleReadDepth,TotalAAsamplesCovered,AvgAAsampleReadDepth"

    creator = TabixIndexedTsvDatasourceCreator()
    creator._createConfigFile(
        configFilename=config_path,
        baseDSFile=src_file,
        ds_type=ds_type,
        ds_name=ds_title,
        ds_version=ds_version,
        column_names=all_cols,
        annotation_column_names=annotation_cols,
        ds_match_mode=match_mode,
        indexCols=DatasourceInstallUtils.getIndexCols(ds_type, index_cols))

    parser = ConfigUtils.createConfigParser(config_path)

    # (section, failure message)
    section_checks = [
        ("general", "general section is missing."),
        ("data_types", "data_types section is missing."),
    ]
    for section, missing_msg in section_checks:
        self.assertTrue(parser.has_section(section), missing_msg)

    # (option, expected value, message when absent, message template on mismatch)
    general_checks = [
        ("type", ds_type,
         "type option is missing in general section.",
         "Expected data source type is %s but was %s."),
        ("src_file", src_file,
         "src_file option is missing in general section.",
         "Expected data source src_file is %s but was %s."),
        ("title", ds_title,
         "title option is missing in general section.",
         "Expected data source title is %s but was %s."),
        ("version", ds_version,
         "version option is missing in general section.",
         "Expected data source version is %s but was %s."),
        ("column_names", all_cols,
         "column_names option is missing in general section.",
         "Expected data source column names is %s but was %s."),
        ("annotation_column_names", annotation_cols,
         "annotation_column_names option is missing in general section.",
         "Expected data source annotation column names is %s but was %s."),
        ("match_mode", match_mode,
         "match_mode option is missing in general section",
         "Expected data source match mode is %s but was %s."),
        ("index_column_names", index_cols,
         "index_column_names option is missing in general section.",
         "Expected data source index column names is %s but was %s."),
    ]

    # Presence of every option is checked before any value is compared.
    for option, _, missing_msg, _ in general_checks:
        self.assertTrue(parser.has_option("general", option), missing_msg)

    for option, expected, _, mismatch_fmt in general_checks:
        actual = parser.get("general", option)
        self.assertEqual(actual, expected, mismatch_fmt % (expected, actual))
def main():
    """Build an "ensembl"-type datasource in a temp dir and copy it to the output dir.

    Steps, all driven by the options returned by parseOptions():

    1. Create a temporary directory containing a genome-build subdirectory.
    2. Write a config file for the datasource and add the requested
       transcript filter to its "general" section.
    3. Call GenomeBuildFactory.construct_ensembl_indices() on the GTF and
       FASTA files, then DatasourceInstallUtils.create_datasource_md5_file().
    4. Copy the whole temporary directory to the output directory.

    Any Exception raised during these steps is logged at FATAL level with its
    traceback and is not re-raised.  The temporary directory is removed in
    every case, including interruption by KeyboardInterrupt/SystemExit.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    args = parseOptions()
    gtf_files = args.gtf_files.split(",")
    fasta_files = args.fasta_files.split(",")
    output_dir = args.output_dir
    genome_build = args.genome_build
    name = args.name
    ver = args.version
    tx_filter = args.filter
    protein_map_file = args.protein_map_file

    # create temp dir
    tmpDir = tempfile.mkdtemp(prefix="onco_ensembl_ds_")
    try:
        logger.info("Creating tmp dir (" + tmpDir + ") ....")
        # The empty last component keeps the trailing separator that the
        # helpers and log messages below have always received.
        ds_build_dir = os.path.join(tmpDir, genome_build, "")
        os.mkdir(ds_build_dir)

        if "gencode" not in args.gtf_files.lower() and tx_filter == "basic":
            logger.warning("basic filter requested for (apparently) a non-gencode set of GTFs.  If this is an ENSEMBL run (not GENCODE), please specify dummy, using --filter.")

        logger.info("Creating config file...")
        config_filename = os.path.join(ds_build_dir, name + ".config")
        tmp_config_filename = config_filename + ".tmp"
        logger.info("config file being written to: " + os.path.abspath(config_filename))
        config_file_creator = GenericTsvDatasourceCreator()
        idx_cols = DatasourceInstallUtils.indexCols("dummy_option", "dummy_values")
        config_file_creator._createConfigFile(configFilename=tmp_config_filename,
                                              baseDSFile=os.path.basename(gtf_files[0]),
                                              ds_type="ensembl", ds_version=ver, ds_name=name,
                                              indexCols=idx_cols)

        # Re-read the generated config and add the transcript filter to it.
        # NOTE(review): the ".tmp" config is never deleted, so it is copied to
        # the output dir along with everything else -- confirm this is intended.
        config_parser = SafeConfigParser()
        with open(tmp_config_filename, 'r') as fp:
            config_parser.readfp(fp)

        config_parser.set("general", "transcript_filter", tx_filter)

        # Write updated config file
        with open(config_filename, 'w') as fp:
            config_parser.write(fp)

        logger.info("Starting index construction (temp location: " + ds_build_dir + ") ...")
        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(gtf_files, fasta_files,
                                          os.path.join(ds_build_dir, os.path.basename(gtf_files[0])),
                                          protein_id_mapping_file=protein_map_file)

        logger.info("Creating datasource md5...")
        DatasourceInstallUtils.create_datasource_md5_file(ds_build_dir)

        logger.info("Copying created datasource from temp directory to final location (" + output_dir + ")...")
        shutil.copytree(symlinks=True, src=tmpDir, dst=output_dir)

    except Exception as e:
        # Top-level boundary of the script: report the failure and fall
        # through to cleanup instead of propagating.
        import traceback
        logger.fatal(repr(e) + " " + traceback.format_exc())
        # NOTE(review): message text kept verbatim, including its leading quote.
        logger.info(""""If you are getting and error such as: KeyError: 'ENST00000474204.1'), then you may be out of disk space in /tmp/.""")
    finally:
        # Remove the tempdir, whether or not the build succeeded.
        logger.info("Done...")
        logger.info("Removing ..." + tmpDir + '/')
        shutil.rmtree(tmpDir)
def testCreateIndexedTsvDatasource(self):
    """Create an indexed_tsv datasource in a temp dir and verify the result.

    Checks, in order: the generated config file (sections, options and
    values), the presence of the genome-build ".md5" file, and that
    annotating one mutation yields only the requested annotation columns.
    The temp dir is removed even when a check fails.
    """
    sourceFile = "testdata/ESP6500SI-V2.chr1.snps_indels.head.25.txt"
    # src_file value the generated config is expected to contain.
    expectedSrcFile = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    datasourceFoldername = "1000Genomes"
    datasourceName = "1000Genomes"
    datasourceType = "indexed_tsv"
    datasourceVersion = "V4.1"
    genomeBuild = "hg19"
    indexColumnNames = "CHROM,POS,POS"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    annotationColumnNames = "DBSNP,EA_AC,AA_AC,TAC"

    tmpDir = tempfile.mkdtemp()
    try:
        destDir = os.path.join(tmpDir, datasourceFoldername, genomeBuild)
        os.makedirs(destDir)

        DatasourceInstallUtils.create_datasource(destDir=destDir, ds_file=sourceFile,
                                                 ds_foldername=datasourceFoldername,
                                                 ds_name=datasourceName, ds_type=datasourceType,
                                                 ds_version=datasourceVersion,
                                                 index_columns=indexColumnNames,
                                                 ds_annotation_columns=annotationColumnNames)

        configFilename = os.path.join(destDir, "1000Genomes.config")
        configParser = ConfigUtils.createConfigParser(configFilename)

        self.assertTrue(configParser.has_section("general"), "general section is missing.")

        # (option, expected value, message when absent, message template on mismatch)
        generalChecks = [
            ("type", datasourceType,
             "type option is missing in general section.",
             "Expected data source type is %s but was %s."),
            ("src_file", expectedSrcFile,
             "src_file option is missing in general section.",
             "Expected data source src_file is %s but was %s."),
            ("title", datasourceName,
             "title option is missing in general section.",
             "Expected data source title is %s but was %s."),
            ("version", datasourceVersion,
             "version option is missing in general section.",
             "Expected data source version is %s but was %s."),
            ("column_names", columnNames,
             "column_names option is missing in general section.",
             "Expected data source column names is %s but was %s."),
            ("annotation_column_names", annotationColumnNames,
             "annotation_column_names option is missing in general section.",
             "Expected data source annotation column names is %s but was %s."),
        ]
        for option, _, missingMsg, _ in generalChecks:
            self.assertTrue(configParser.has_option("general", option), missingMsg)
        for option, expected, _, mismatchFmt in generalChecks:
            actual = configParser.get("general", option)
            self.assertEqual(actual, expected, mismatchFmt % (expected, actual))

        self.assertTrue(os.path.exists(os.path.join(tmpDir, datasourceFoldername, genomeBuild + ".md5")),
                        "No md5 file was generated.")

        # Load the freshly created datasource and annotate a single mutation.
        datasource = DatasourceFactory.createDatasource(configFilename, destDir)

        m1 = MutationDataFactory.default_create()
        m1.chr = "1"
        m1.start = "802177"
        m1.end = "802177"
        m1.ref_allele = "T"
        m1.alt_allele = "C"

        m1_annotated = datasource.annotate_mutation(m1)
        m1_annotation = m1_annotated.getAnnotation("1000Genomes_AA_AC")
        cur_annotation = Annotation(value="2,866", datasourceName="1000Genomes", dataType="String",
                                    description="",
                                    tags=[TagConstants.INFO, TagConstants.NOT_SPLIT], number=None)
        self.assertTrue(m1_annotation.isEqual(cur_annotation), "Annotations do not match.")

        # Columns outside annotationColumnNames must not show up as annotations.
        annotationNames = ["1000Genomes_CHROM", "1000Genomes_POS", "1000Genomes_REF", "1000Genomes_ALT",
                           "1000Genomes_GWAS_PUBMED"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName not in m1_annotated,
                            "m1_annotated was annotated with %s." % annotationName)

        # Every requested annotation column must be present.
        annotationNames = ["1000Genomes_DBSNP", "1000Genomes_EA_AC", "1000Genomes_AA_AC", "1000Genomes_TAC"]
        for annotationName in annotationNames:
            self.assertTrue(annotationName in m1_annotated,
                            "m1_annotated was not annotated with %s value." % annotationName)
    finally:
        # Clean up even when an assertion above fails, so temp dirs do not pile up.
        MutUtils.removeDir(tmpDir)
def testCreateDatasource(self):
    """Build a tabix-indexed TSV datasource and verify files, config and annotation.

    Checks, in order: that the indexed data file and its ".tbi" companion
    exist in the destination dir, that the config file holds the expected
    "general" options and "data_types" entries, and that a datasource built
    from that config annotates one mutation with the expected values.
    """
    dsFile = os.path.join("testdata", "ESP6500SI-V2.chr1.snps_indels.head.25.txt")

    # destDir is deleted wholesale just below, so it must be a dedicated
    # subdirectory -- never "out/" itself.
    destDir = "out/create_ds_test/"
    if os.path.exists(destDir):
        shutil.rmtree(destDir)
    os.makedirs(destDir)

    datasourceFilename = "ESP6500SI-V2.chr1.snps_indels.head.25.tabix_indexed.txt.gz"
    indexColumnNames = "CHROM,POS,POS,REF,ALT"
    columnNames = "CHROM,POS,REF,ALT,DBSNP,EA_AC,AA_AC,TAC,MAF,GTS,EA_GTC,AA_GTC,GTC,DP,FG,GM,AA,AAC,PP,CDP,PH,CP,CG,GL,GS,CA,EXOME_CHIP,GWAS_PUBMED"
    configFilename = "out/esp_coverage.config"
    dataSourceType = "indexed_tsv"
    dataSourceName = "ESP"
    dataSourceVersion = "6500SI-V2"
    dataSourceMatchMode = "overlap"
    annotationColumnNames = "DBSNP,EA_GTC,DP"

    datasourceBuilder = TabixIndexedTsvDatasourceCreator()
    datasourceBuilder.createDatasource(destDir, dsFile, indexColumnNames, configFilename, dataSourceType,
                                       dataSourceName, dataSourceVersion, dataSourceMatchMode,
                                       annotationColumnNames,
                                       DatasourceInstallUtils.getIndexCols(dataSourceType, indexColumnNames))

    # destDir already ends with a separator, so plain concatenation is a valid path.
    self.assertTrue(os.path.exists(destDir + datasourceFilename))
    self.assertTrue(os.path.exists(destDir + datasourceFilename + ".tbi"))

    configParser = ConfigUtils.createConfigParser(configFilename)
    self.assertTrue(configParser.has_section("general"), "general section is missing.")
    self.assertTrue(configParser.has_section("data_types"), "data_types section is missing.")

    # (option, expected value, message when absent, message template on mismatch)
    generalChecks = [
        ("type", dataSourceType,
         "type option is missing in general section.",
         "Expected data source type is %s but was %s."),
        ("src_file", datasourceFilename,
         "src_file option is missing in general section.",
         "Expected data source src_file is %s but was %s."),
        ("title", dataSourceName,
         "title option is missing in general section.",
         "Expected data source title is %s but was %s."),
        ("version", dataSourceVersion,
         "version option is missing in general section.",
         "Expected data source version is %s but was %s."),
        ("column_names", columnNames,
         "column_names option is missing in general section.",
         "Expected data source column names is %s but was %s."),
        ("annotation_column_names", annotationColumnNames,
         "annotation_column_names option is missing in general section.",
         "Expected data source annotation column names is %s but was %s."),
        ("match_mode", dataSourceMatchMode,
         "match_mode option is missing in general section",
         "Expected data source match mode is %s but was %s."),
        ("index_column_names", indexColumnNames,
         "index_column_names option is missing in general section.",
         "Expected data source index column names is %s but was %s."),
    ]
    # Presence of every option is checked before any value is compared.
    for option, _, missingMsg, _ in generalChecks:
        self.assertTrue(configParser.has_option("general", option), missingMsg)
    for option, expected, _, mismatchFmt in generalChecks:
        actual = configParser.get("general", option)
        self.assertEqual(actual, expected, mismatchFmt % (expected, actual))

    # Data types recorded in the config for two of the annotation columns.
    dataTypeChecks = [
        ("EA_GTC", "String", "Expected EA_GTC data type is %s but was %s."),
        ("DP", "Integer", "Expected DP data type is %s but was %s."),
    ]
    for column, expectedType, mismatchFmt in dataTypeChecks:
        actualType = configParser.get("data_types", column)
        self.assertEqual(actualType, expectedType, mismatchFmt % (expectedType, actualType))

    # Build the datasource from the parsed config and annotate one mutation.
    ds = DatasourceFactory.createDatasourceFromConfigParser(configParser, destDir)
    mut = MutationData(chr="1", start="69428", end="69428", ref_allele="T", alt_allele="G")
    mut2 = ds.annotate_mutation(mut)
    self.assertEqual(mut2["ESP_DBSNP"], "dbSNP_134")
    self.assertEqual(mut2["ESP_EA_GTC"], "92,129,3203")
    self.assertEqual(mut2["ESP_DP"], "110")