def testCreateGPTsvDatasource(self):
        """Create a datasource from a small genomic-position TSV and check it exists.

        Calls ``GenericTsvDatasourceCreator._createDatabase`` on
        ``oreganno_trim.hg19.txt`` with ``out`` as the destination directory,
        then asserts that the returned file name exists under that directory.
        """
        dsFile = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
        destDir = "out"
        datasourceBuilder = GenericTsvDatasourceCreator()
        datasourceFilename = datasourceBuilder._createDatabase(destDir=destDir, ds_file=dsFile)
        # The returned name is resolved against destDir (presumably it is
        # relative to it -- confirm against _createDatabase). os.path.join
        # replaces the Python-2-only string.join + os.sep concatenation.
        datasourcePath = os.path.join(destDir, datasourceFilename)

        self.assertTrue(os.path.exists(datasourcePath), "No data source file was generated.")
    def testCreateGPTsvConfigFile(self):
        """Write a ``gp_tsv`` config file and verify its ``general`` section.

        The config is produced by
        ``GenericTsvDatasourceCreator._createConfigFile`` at
        ``out/ccle_by_gp.config`` and read back through
        ``ConfigUtils.createConfigParser``. The test first checks that the
        section and every expected option are present, then that each option
        holds the value that was passed in.
        """
        configPath = "out/ccle_by_gp.config"
        srcFile = "ccle_results_by_pos.hg19.import.txt"
        dsType = "gp_tsv"
        dsTitle = "CCLE_By_GP"
        dsVersion = "09292010"
        gpColumns = "chr,start,end"

        creator = GenericTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols("gp_tsv", gpColumns)
        creator._createConfigFile(configFilename=configPath,
                                  baseDSFile=srcFile,
                                  ds_name=dsTitle,
                                  ds_type=dsType,
                                  ds_version=dsVersion,
                                  indexCols=indexCols)

        parser = ConfigUtils.createConfigParser(configPath)
        self.assertTrue(parser.has_section("general"),
                        "general section is missing.")

        # (option name, expected value) pairs, in the order they are checked.
        expectedOptions = (
            ("type", dsType),
            ("src_file", srcFile),
            ("title", dsTitle),
            ("version", dsVersion),
            ("genomic_position_cols", gpColumns),
        )

        # First pass: every option must exist before any value is compared.
        for option, _ in expectedOptions:
            self.assertTrue(
                parser.has_option("general", option),
                "%s option is missing in general section." % option)

        # Second pass: each option must carry the value handed to the creator.
        for option, wanted in expectedOptions:
            found = parser.get("general", option)
            self.assertEqual(
                found, wanted,
                "Expected data source %s is %s but was %s." %
                (option, wanted, found))
    def testCreateGPTsvDatasource(self):
        """Create a datasource from a small genomic-position TSV and check it exists.

        ``GenericTsvDatasourceCreator._createDatabase`` is run on
        ``oreganno_trim.hg19.txt`` with ``out`` as the destination directory;
        the test passes when the file name it returns exists under ``out``.
        """
        dsFile = "testdata/small_genome_position_tsv_ds/oreganno_trim.hg19.txt"
        destDir = "out"
        datasourceBuilder = GenericTsvDatasourceCreator()
        datasourceFilename = datasourceBuilder._createDatabase(destDir=destDir,
                                                               ds_file=dsFile)
        # Build the path with os.path.join instead of the Python-2-only
        # string.join + os.sep concatenation. The returned name is presumably
        # relative to destDir -- confirm against _createDatabase.
        datasourcePath = os.path.join(destDir, datasourceFilename)

        self.assertTrue(os.path.exists(datasourcePath),
                        "No data source file was generated.")
    def getGeneTsvConfigFile(self):
        """Write a ``gene_tsv`` config file and verify its ``general`` section.

        The config is written to ``out/simple_uniprot.config`` by
        ``GenericTsvDatasourceCreator._createConfigFile`` and re-read with
        ``ConfigUtils.createConfigParser``. Presence of the section and of
        every expected option is asserted first; the option values are
        compared afterwards.

        NOTE(review): the method name has no ``test`` prefix, so unittest's
        default loader presumably does not collect it -- confirm whether that
        is intentional.
        """
        cfgPath = "out/simple_uniprot.config"
        srcFile = "simple_uniprot.out.2011_09.tsv"
        dsType = "gene_tsv"
        dsTitle = "UniProt"
        dsVersion = "2011_09"
        geneCol = "gene"

        creator = GenericTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols("gene_tsv", geneCol)
        creator._createConfigFile(configFilename=cfgPath,
                                  baseDSFile=srcFile,
                                  ds_name=dsTitle,
                                  ds_type=dsType,
                                  ds_version=dsVersion,
                                  indexCols=indexCols)

        parser = ConfigUtils.createConfigParser(cfgPath)

        # Presence checks: section first, then each option.
        hasGeneral = parser.has_section("general")
        self.assertTrue(hasGeneral, "general section is missing.")
        hasType = parser.has_option("general", "type")
        self.assertTrue(hasType, "type option is missing in general section.")
        hasSrcFile = parser.has_option("general", "src_file")
        self.assertTrue(hasSrcFile,
                        "src_file option is missing in general section.")
        hasTitle = parser.has_option("general", "title")
        self.assertTrue(hasTitle,
                        "title option is missing in general section.")
        hasVersion = parser.has_option("general", "version")
        self.assertTrue(hasVersion,
                        "version option is missing in general section.")
        hasGeneCol = parser.has_option("general", "gene_col")
        self.assertTrue(hasGeneCol,
                        "gene_col option is missing in general section.")

        # Value checks: each stored value must equal what was passed in.
        actualType = parser.get("general", "type")
        self.assertEqual(
            actualType, dsType,
            "Expected data source type is %s but was %s." %
            (dsType, actualType))
        actualSrcFile = parser.get("general", "src_file")
        self.assertEqual(
            actualSrcFile, srcFile,
            "Expected data source src_file is %s but was %s." %
            (srcFile, actualSrcFile))
        actualTitle = parser.get("general", "title")
        self.assertEqual(
            actualTitle, dsTitle,
            "Expected data source title is %s but was %s." %
            (dsTitle, actualTitle))
        actualVersion = parser.get("general", "version")
        self.assertEqual(
            actualVersion, dsVersion,
            "Expected data source version is %s but was %s." %
            (dsVersion, actualVersion))
        actualGeneCol = parser.get("general", "gene_col")
        self.assertEqual(
            actualGeneCol, geneCol,
            "Expected data source gene_col is %s but was %s." %
            (geneCol, actualGeneCol))
    def testCreateGPTsvConfigFile(self):
        """Write a ``gp_tsv`` config file and verify its ``general`` section.

        ``GenericTsvDatasourceCreator._createConfigFile`` writes
        ``out/ccle_by_gp.config``; the file is parsed again with
        ``ConfigUtils.createConfigParser``. The section and all expected
        options must be present, and each option must hold the value that was
        supplied when the file was created.
        """
        cfgPath = "out/ccle_by_gp.config"
        srcFile = "ccle_results_by_pos.hg19.import.txt"
        dsType = "gp_tsv"
        dsTitle = "CCLE_By_GP"
        dsVersion = "09292010"
        gpCols = "chr,start,end"

        creator = GenericTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols("gp_tsv", gpCols)
        creator._createConfigFile(configFilename=cfgPath,
                                  baseDSFile=srcFile,
                                  ds_name=dsTitle,
                                  ds_type=dsType,
                                  ds_version=dsVersion,
                                  indexCols=indexCols)

        parser = ConfigUtils.createConfigParser(cfgPath)

        # Presence checks: section first, then each option.
        hasGeneral = parser.has_section("general")
        self.assertTrue(hasGeneral, "general section is missing.")
        hasType = parser.has_option("general", "type")
        self.assertTrue(hasType, "type option is missing in general section.")
        hasSrcFile = parser.has_option("general", "src_file")
        self.assertTrue(hasSrcFile, "src_file option is missing in general section.")
        hasTitle = parser.has_option("general", "title")
        self.assertTrue(hasTitle, "title option is missing in general section.")
        hasVersion = parser.has_option("general", "version")
        self.assertTrue(hasVersion, "version option is missing in general section.")
        hasGpCols = parser.has_option("general", "genomic_position_cols")
        self.assertTrue(hasGpCols, "genomic_position_cols option is missing in general section.")

        # Value checks: each stored value must equal what was passed in.
        actualType = parser.get("general", "type")
        self.assertEqual(actualType, dsType,
                         "Expected data source type is %s but was %s." % (dsType, actualType))
        actualSrcFile = parser.get("general", "src_file")
        self.assertEqual(actualSrcFile, srcFile,
                         "Expected data source src_file is %s but was %s." % (srcFile, actualSrcFile))
        actualTitle = parser.get("general", "title")
        self.assertEqual(actualTitle, dsTitle,
                         "Expected data source title is %s but was %s." % (dsTitle, actualTitle))
        actualVersion = parser.get("general", "version")
        self.assertEqual(actualVersion, dsVersion,
                         "Expected data source version is %s but was %s." % (dsVersion, actualVersion))
        actualGpCols = parser.get("general", "genomic_position_cols")
        self.assertEqual(actualGpCols, gpCols,
                         "Expected data source genomic_position_cols is %s but was %s."
                         % (gpCols, actualGpCols))
    def getGeneTsvConfigFile(self):
        """Write a ``gene_tsv`` config file and verify its ``general`` section.

        ``GenericTsvDatasourceCreator._createConfigFile`` writes
        ``out/simple_uniprot.config``, which is then parsed with
        ``ConfigUtils.createConfigParser``. All expected options are checked
        for presence before any of their values are compared.

        NOTE(review): the method name has no ``test`` prefix, so unittest's
        default loader presumably does not collect it -- confirm whether that
        is intentional.
        """
        configPath = "out/simple_uniprot.config"
        srcFile = "simple_uniprot.out.2011_09.tsv"
        dsType = "gene_tsv"
        dsTitle = "UniProt"
        dsVersion = "2011_09"
        geneColumn = "gene"

        creator = GenericTsvDatasourceCreator()
        indexCols = DatasourceInstallUtils.getIndexCols("gene_tsv", geneColumn)
        creator._createConfigFile(configFilename=configPath, baseDSFile=srcFile, ds_name=dsTitle,
                                  ds_type=dsType, ds_version=dsVersion, indexCols=indexCols)

        parser = ConfigUtils.createConfigParser(configPath)
        self.assertTrue(parser.has_section("general"), "general section is missing.")

        # (option name, expected value) pairs, in the order they are checked.
        expectedOptions = (
            ("type", dsType),
            ("src_file", srcFile),
            ("title", dsTitle),
            ("version", dsVersion),
            ("gene_col", geneColumn),
        )

        # First pass: every option must exist before any value is compared.
        for option, _ in expectedOptions:
            self.assertTrue(parser.has_option("general", option),
                            "%s option is missing in general section." % option)

        # Second pass: each option must carry the value handed to the creator.
        for option, wanted in expectedOptions:
            found = parser.get("general", option)
            self.assertEqual(found, wanted,
                             "Expected data source %s is %s but was %s." % (option, wanted, found))
def main():
    """Build a transcript datasource in a temp directory and copy it to the output.

    Reads the options returned by ``parseOptions()`` and then, inside a
    scratch directory created with ``tempfile.mkdtemp``:

    1. writes ``<name>.config`` (datasource type ``ensembl``) and adds the
       ``transcript_filter`` option to its ``general`` section;
    2. builds the indices from the GTF and FASTA files via
       ``GenomeBuildFactory.construct_ensembl_indices``;
    3. creates the datasource md5 file;
    4. copies the scratch tree to ``args.output_dir``.

    Any ``Exception`` raised during these steps is logged with its traceback
    and is not re-raised. The scratch directory is removed in a ``finally``
    block, so it is cleaned up on every exit path.
    """
    setup_logging()
    logger = logging.getLogger(__name__)
    args = parseOptions()
    gtf_files = args.gtf_files.split(",")
    fasta_files = args.fasta_files.split(",")
    output_dir = args.output_dir
    genome_build = args.genome_build
    name = args.name
    ver = args.version
    tx_filter = args.filter
    protein_map_file = args.protein_map_file

    # create temp dir
    tmpDir = tempfile.mkdtemp(prefix="onco_ensembl_ds_")
    try:
        logger.info("Creating tmp dir (%s) ....", tmpDir)
        ds_build_dir = tmpDir + "/" + genome_build + "/"
        os.mkdir(ds_build_dir)

        # The "basic" filter is only expected with GENCODE GTFs; warn when the
        # file list does not mention gencode.
        if tx_filter == "basic" and "gencode" not in args.gtf_files.lower():
            logger.warning("basic filter requested for (apparently) a non-gencode set of GTFs.  If this is an ENSEMBL run (not GENCODE), please specify dummy, using --filter.")

        logger.info("Creating config file...")
        config_filename = ds_build_dir + "/" + name + ".config"
        logger.info("config file being written to: %s", os.path.abspath(config_filename))

        # NOTE(review): the intermediate ".tmp" config is never deleted, so it
        # is copied to the output with the rest of the tree -- confirm that
        # this is intended.
        config_file_creator = GenericTsvDatasourceCreator()
        idx_cols = DatasourceInstallUtils.indexCols("dummy_option", "dummy_values")
        config_file_creator._createConfigFile(
            configFilename=config_filename + ".tmp",
            baseDSFile=os.path.basename(gtf_files[0]),
            ds_type="ensembl", ds_version=ver, ds_name=name, indexCols=idx_cols)

        # Add the transcript filter to the generated config. (The protein map
        # file is not written here; it is only passed to the index builder.)
        config_parser = SafeConfigParser()
        with open(config_filename + ".tmp", 'r') as fp:
            config_parser.readfp(fp)
        config_parser.set("general", "transcript_filter", tx_filter)

        # Write updated config file
        with open(config_filename, 'w') as fp:
            config_parser.write(fp)

        logger.info("Starting index construction (temp location: %s) ...", ds_build_dir)
        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(
            gtf_files, fasta_files,
            ds_build_dir + os.path.basename(gtf_files[0]),
            protein_id_mapping_file=protein_map_file)

        logger.info("Creating datasource md5...")
        DatasourceInstallUtils.create_datasource_md5_file(ds_build_dir)

        logger.info("Copying created datasource from temp directory to final location (%s)...", output_dir)
        shutil.copytree(symlinks=True, src=tmpDir, dst=output_dir)

    except Exception as e:
        # Deliberate best-effort: report the failure and fall through to cleanup.
        import traceback
        logger.fatal("%s %s", repr(e), traceback.format_exc())
        logger.info(""""If you are getting and error such as:  KeyError: 'ENST00000474204.1'), then you may be out of disk space in /tmp/.""")

    finally:
        # Remove the tempdir, including on KeyboardInterrupt/SystemExit, which
        # the handler above does not catch.
        logger.info("Done...")
        logger.info("Removing ...%s/", tmpDir)
        shutil.rmtree(tmpDir)