コード例 #1
0
def main():
    """Build an ENSEMBL/GENCODE transcript datasource and copy it to its final location.

    Command-line options are obtained from ``parseOptions()``; the attributes
    read are ``gtf_files`` and ``fasta_files`` (comma-separated lists),
    ``output_dir``, ``genome_build``, ``name``, ``version``, ``filter`` and
    ``protein_map_file``.

    The datasource is assembled in a fresh temporary directory, under a
    subdirectory named after the genome build:

    1. A base config file is generated and then rewritten with the
       ``transcript_filter`` option added to its ``general`` section.
    2. The transcript indices are constructed from the GTF and FASTA files.
    3. An md5 file is created for the datasource directory.
    4. The whole temporary tree is copied to ``output_dir``
       (``shutil.copytree`` requires that the destination not exist yet).

    Any ``Exception`` raised during the build is logged at FATAL level together
    with its traceback and is NOT re-raised.  The temporary directory is
    removed on every exit path, including interruption.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    args = parseOptions()
    gtf_files = args.gtf_files.split(",")
    fasta_files = args.fasta_files.split(",")
    output_dir = args.output_dir
    genome_build = args.genome_build
    name = args.name
    ver = args.version
    tx_filter = args.filter
    protein_map_file = args.protein_map_file

    # Everything is built in a temp dir first so that a failed run never
    # leaves a partially-built datasource in output_dir.
    tmpDir = tempfile.mkdtemp(prefix="onco_ensembl_ds_")
    try:
        logger.info("Creating tmp dir (%s) ....", tmpDir)
        # Trailing separator is kept on purpose: this value is also handed to
        # project helpers and concatenated with file names below.
        ds_build_dir = tmpDir + "/" + genome_build + "/"
        os.mkdir(ds_build_dir)

        if tx_filter == "basic" and "gencode" not in args.gtf_files.lower():
            logger.warning("basic filter requested for (apparently) a non-gencode set of GTFs.  If this is an ENSEMBL run (not GENCODE), please specify dummy, using --filter.")

        # The datasource's base file is named after the first GTF.
        base_ds_file = os.path.basename(gtf_files[0])

        logger.info("Creating config file...")
        # ds_build_dir already ends with a separator; do not add another one.
        config_filename = ds_build_dir + name + ".config"
        tmp_config_filename = config_filename + ".tmp"
        logger.info("config file being written to: %s", os.path.abspath(config_filename))

        config_file_creator = GenericTsvDatasourceCreator()
        idx_cols = DatasourceInstallUtils.indexCols("dummy_option", "dummy_values")
        config_file_creator._createConfigFile(configFilename=tmp_config_filename, baseDSFile=base_ds_file, ds_type="ensembl", ds_version=ver, ds_name=name, indexCols=idx_cols)

        # Load the generated config and add the transcript filter to it.
        # NOTE(review): the ".tmp" config is left in the build directory and is
        # therefore copied to output_dir as well -- confirm whether it should
        # be deleted before the md5 is computed.
        config_parser = SafeConfigParser()
        with open(tmp_config_filename, 'r') as fp:
            config_parser.readfp(fp)
        config_parser.set("general", "transcript_filter", tx_filter)

        # Write updated config file
        with open(config_filename, 'w') as fp:
            config_parser.write(fp)

        logger.info("Starting index construction (temp location: %s) ...", ds_build_dir)
        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(gtf_files, fasta_files, ds_build_dir + base_ds_file, protein_id_mapping_file=protein_map_file)

        logger.info("Creating datasource md5...")
        DatasourceInstallUtils.create_datasource_md5_file(ds_build_dir)

        logger.info("Copying created datasource from temp directory to final location (%s)...", output_dir)
        shutil.copytree(symlinks=True, src=tmpDir, dst=output_dir)

    except Exception as e:
        # Top-level boundary: report the failure but do not propagate it.
        import traceback
        logger.fatal("%s %s", repr(e), traceback.format_exc())
        logger.info("If you are getting an error such as:  KeyError: 'ENST00000474204.1', then you may be out of disk space in /tmp/.")

    finally:
        # Remove the tempdir on every exit path (success, logged failure, or
        # an interrupt such as KeyboardInterrupt that bypasses the handler).
        logger.info("Done...")
        logger.info("Removing ...%s/", tmpDir)
        shutil.rmtree(tmpDir)