def main():
    """Export mRNA enrichment results into the BiomaRt exp_<infix>_trs tables.

    For every non-gene id-type table infix, the results table is created (if
    missing) and the feature name -> id map is loaded.  Then every
    (id_type, eid) received on the "id" port is resolved through the entity
    manager, its results file is parsed and its rows are batch-inserted,
    keyed by feature id, ICDO term id and experiment id.
    """

    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    id_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()

    table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

    # table infix -> {feature name : feature id}, loaded once up front
    feat_ids = {}
    for name in table_infixs:
        if name == "gene":
            # the gene results table is populated by a separate task
            continue

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS exp_{0}_trs (
              {0}_id int(11) NOT NULL,
              icdo_id int(11) NOT NULL,
              exp_id int(11) NOT NULL,
              upreg_total int(11) DEFAULT NULL,
              upreg_observed double DEFAULT NULL,
              upreg_expected double DEFAULT NULL,
              upreg_stdev double DEFAULT NULL,
              upreg_pvalue double DEFAULT NULL,
              upreg_cpvalue double DEFAULT NULL,
              downreg_total int(11) DEFAULT NULL,
              downreg_observed double DEFAULT NULL,
              downreg_expected double DEFAULT NULL,
              downreg_stdev double DEFAULT NULL,
              downreg_pvalue double DEFAULT NULL,
              downreg_cpvalue double DEFAULT NULL,
              PRIMARY KEY ({0}_id,icdo_id,exp_id),
              KEY icdo (icdo_id,exp_id),
              KEY exp (exp_id),
              CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
              CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
              CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
            ) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

        feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

    # lookup maps: (topography, morphology) -> icdo id, (study, platform) -> exp id
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    for id_type, eid in id_port:
        e = em.find(eid, types.MRNA_ENRICHMENT)
        if e is None:
            # FIX: was "{} not found: {1}", which mixes automatic and manual
            # field numbering and raises ValueError instead of logging.
            log.error("{0} not found: {1}".format(types.MRNA_ENRICHMENT, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)
        log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

        table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix),
                         ["{}_id".format(table_infix), "icdo_id", "exp_id",
                          "upreg_total", "upreg_observed", "upreg_expected",
                          "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
                          "downreg_total", "downreg_observed", "downreg_expected",
                          "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"],
                         insert_size)

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header: column name -> column index
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # FIX: was "except KeyError as e", which clobbered the entity 'e'.
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            ib.close()
            results_repo.close()
            continue

        skipped_ids = set()
        fids = feat_ids[table_infix]

        # read data: first column is the feature name, the rest are selected
        # by the header-derived indices
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            feat_name = data[0]
            data = [data[i] for i in col_indices]

            if feat_name not in fids:
                skipped_ids.add(feat_name)
                continue

            feat_id = fids[feat_name]
            ib.insert(feat_id, icdo_id, exp_id, *data)

        if len(skipped_ids) > 0:
            log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

        log.debug("{} results inserted".format(ib.count))

        ib.close()
        reader.close()

    # FIX: close the DB cursor and connection (previously leaked here),
    # matching the cleanup done by the sibling export tasks.
    cursor.close()
    conn.close()

    em.close()
    es.close()
    rs.close()
def main():
    """Export oncodrive gene results into the BiomaRt exp_gene_trs table.

    Loads the gene/ICDO/experiment lookup maps, creates the results table
    (if missing), and batch-inserts the rows of every results file received
    on the "id" port.  The table is kept write-locked during the load and is
    periodically unlocked and optimized after a threshold of inserted rows.
    """

    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()

    # lookup maps: gene name -> id, (topography, morphology) -> icdo id,
    # (study, platform) -> experiment id
    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS exp_gene_trs (
          gene_id int(11) NOT NULL,
          icdo_id int(11) NOT NULL,
          exp_id int(11) NOT NULL,
          upreg_total int(11) DEFAULT NULL,
          upreg_observed double DEFAULT NULL,
          upreg_expected double DEFAULT NULL,
          upreg_stdev double DEFAULT NULL,
          upreg_pvalue double DEFAULT NULL,
          upreg_cpvalue double DEFAULT NULL,
          downreg_total int(11) DEFAULT NULL,
          downreg_observed double DEFAULT NULL,
          downreg_expected double DEFAULT NULL,
          downreg_stdev double DEFAULT NULL,
          downreg_pvalue double DEFAULT NULL,
          downreg_cpvalue double DEFAULT NULL,
          PRIMARY KEY (gene_id,icdo_id,exp_id),
          KEY icdo (icdo_id,exp_id),
          KEY exp (exp_id),
          CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
          CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
          CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
        ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # Unlock + OPTIMIZE once this many rows have been inserted under the lock
    # (was a bare magic number).
    optimize_threshold = 1000000

    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)
        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(cursor, "exp_gene_trs",
                         ["gene_id", "icdo_id", "exp_id",
                          "upreg_total", "upreg_observed", "upreg_expected",
                          "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
                          "downreg_total", "downreg_observed", "downreg_expected",
                          "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"],
                         insert_size)

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header: column name -> column index
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # FIX: was "except KeyError as e", which clobbered the entity 'e'.
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # read data: first column is the gene name, the rest are selected by
        # the header-derived indices
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            gene_name = data[0]
            data = [data[i] for i in col_indices]

            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            gene_id = gene[gene_name]
            ib.insert(gene_id, icdo_id, exp_id, *data)

        if len(skipped_genes) > 0:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count
        ib.close()
        reader.close()

        if lock_count >= optimize_threshold:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")

    cursor.close()
    # FIX: close the connection as well (only the cursor was closed before).
    conn.close()

    em.close()
    es.close()
    rs.close()
def main():
    """Export experiment entities (study x platform pairs) to ent_experiment.

    For every (study_id, platform_id) received on the "experiment" port the
    study and platform entities are resolved, publication metadata is fetched
    from Pubmed (or reconstructed from study annotations when no pubmed id
    exists) and one row is batch-inserted into the ent_experiment table.
    """

    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    # optional map: study id prefix -> {name, home_url, link} of the source
    if "biomart.study_source" in conf:
        study_source_map = conf["biomart.study_source"]
    else:
        study_source_map = conf.create_element()

    log = task.logger()

    exp_port = task.ports("experiment")

    es = EntityServer(conf["entities"])
    em = es.manager()

    conn = biomart_db_connect(conf["biomart.db"], log)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE ent_experiment (
          id int(11) NOT NULL,
          exp_name varchar(64) NOT NULL,
          study_id varchar(32) NOT NULL,
          study_source varchar(32) DEFAULT NULL,
          study_source_url varchar(512) DEFAULT NULL,
          study_link varchar(512) DEFAULT NULL,
          pub_pubmed varchar(32) DEFAULT NULL,
          pub_title varchar(300) DEFAULT NULL,
          pub_authors varchar(300) DEFAULT NULL,
          pub_year varchar(16) DEFAULT NULL,
          pub_journal varchar(200) DEFAULT NULL,
          platf_id varchar(32) NOT NULL,
          platf_title varchar(250) DEFAULT NULL,
          platf_technology varchar(96) DEFAULT NULL,
          PRIMARY KEY (id),
          KEY exp_name (exp_name),
          KEY pub_pubmed (pub_pubmed),
          KEY pub_title (pub_title),
          KEY pub_authors (pub_authors),
          KEY pub_year (pub_year),
          KEY pub_journal (pub_journal),
          KEY platf_title (platf_title),
          KEY platf_technology (platf_technology)
        ) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine))

    ib = BatchInsert(cursor, "ent_experiment",
                     ["id", "exp_name", "study_id", "study_source", "study_source_url",
                      "study_link", "pub_title", "pub_authors", "pub_year", "pub_pubmed",
                      "pub_journal", "platf_id", "platf_title", "platf_technology"],
                     insert_size)

    pubmed = Pubmed()

    for i, exp in enumerate(exp_port, 1):
        study_id = exp[0]
        platform_id = exp[1]

        study = em.find(study_id, types.SOURCE_STUDY)
        if study is None:
            log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id))
            continue

        platf = em.find(platform_id, types.SOURCE_PLATFORM)
        if platf is None:
            log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id))

        # publication defaults; overwritten below when information is found
        pub = {}
        for k in ["title", "short_authors", "date", "journal"]:
            pub[k] = None

        if "pubmed" in study:
            pmid = study["pubmed"]
            if isinstance(pmid, (DataElementList, list)):
                pmid = pmid[0]
                log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid))

            log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid))

            try:
                found = pubmed.find(pmid)
                if len(found) == 0:
                    log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id))
                    # FIX: keep the all-None defaults; previously 'pub' stayed
                    # bound to the empty result list and pub.items() crashed.
                else:
                    pub = found[0]
            except Exception as ex:
                log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid))
                log.exception(ex)
        else:
            pmid = None
            log.warn("Study {} has no 'pubmed_id' annotation".format(study_id))

            if "title" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id))
            elif "SO/contact_details[0]/contact_name" not in study \
                    and "SO/contact_details/contact_name" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id))
            else:
                try:
                    pub["title"] = study["title"]
                    if "SO/contact_details[0]/contact_name" in study:
                        pub["short_authors"] = study["SO/contact_details[0]/contact_name"]
                    else:
                        pub["short_authors"] = study["SO/contact_details/contact_name"]
                    if "SO/submission/pub_date" in study:
                        pub["date"] = study["SO/submission/pub_date"]
                    else:
                        pub["date"] = ""
                except Exception as ex:
                    log.debug(study)
                    # FIX: was log.execption (typo) -> AttributeError on this path
                    log.exception(ex)

        # escape single quotes for the SQL insert
        for k, v in pub.items():
            if v is not None and isinstance(v, basestring):
                pub[k] = v.replace("'", r"\'")

        exp_name = "{}; {}".format(study_id, platform_id)

        study_source = None
        study_source_url = None
        study_link = None

        # derive the source (e.g. repository) from the study id prefix
        parts = study_id.split("-")
        if len(parts) >= 2 and parts[0] in study_source_map:
            ss = study_source_map[parts[0]]
            study_source = ss.get("name")
            study_source_url = ss.get("home_url")
            try:
                study_link = ss.get("link", "").format(parts[1])
            except Exception:
                # best-effort link; FIX: narrowed the previous bare except
                pass

        ib.insert(i, exp_name, study_id, study_source, study_source_url, study_link,
                  pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"],
                  platform_id, platf["SO/platform_title"], "")

    log.debug("{} experiments inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()
def _icdo_code_info(code, descriptions, kind, log):
    """Resolve an ICDO code to a (name, description) pair.

    An empty code matches anything and yields "ANY <kind>" for both values;
    an unknown code is logged and rendered with an empty name and a bare
    "[code]" description; otherwise the description is "<name> [<code>]".
    """
    if code == "":
        text = "ANY {}".format(kind)
        return text, text
    if code not in descriptions:
        log.error("Unknown {} description for code {}".format(kind, code))
        return "", "[{}]".format(code)
    name = descriptions[code]
    return name, "{} [{}]".format(name, code)


def main():
    """Export ICDO (topography, morphology) terms to the ent_icdo table.

    Loads the topography and morphology code description files, creates the
    ent_icdo table and batch-inserts one row per (topography, morphology)
    pair received on the "icdo" port.
    """

    task.check_conf(["entities", "repositories", "biomart.db",
                     "biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    log = task.logger()

    icdo_port = task.ports("icdo")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"]))
    icdo_repo, icdo_path = rs.from_url(conf["biomart.files.icdo_topography"])
    icdo_local_path = icdo_repo.get_local(icdo_path)
    icdo_topography = map_from_file(icdo_local_path)
    icdo_repo.close_local(icdo_path)
    icdo_repo.close()

    log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"]))
    icdo_repo, icdo_path = rs.from_url(conf["biomart.files.icdo_morphology"])
    icdo_local_path = icdo_repo.get_local(icdo_path)
    icdo_morphology = map_from_file(icdo_local_path)
    icdo_repo.close_local(icdo_path)
    icdo_repo.close()

    conn = biomart_db_connect(conf["biomart.db"], log)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE ent_icdo (
          id int(11) NOT NULL,
          icdo_name varchar(512) NOT NULL DEFAULT '',
          icdo_topography varchar(255) NOT NULL DEFAULT '',
          icdo_morphology varchar(255) NOT NULL DEFAULT '',
          icdo_topography_code varchar(24) NOT NULL DEFAULT '',
          icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
          icdo_topography_name varchar(255) NOT NULL DEFAULT '',
          icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
          PRIMARY KEY (id),
          KEY icdo_name (icdo_name),
          KEY icdo_tm (icdo_topography,icdo_morphology),
          KEY icdo_m (icdo_morphology),
          KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
          KEY icdo_m_c (icdo_morphology_code)
        ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

    ib = BatchInsert(cursor, "ent_icdo",
                     ["id", "icdo_name",
                      "icdo_topography", "icdo_topography_code", "icdo_topography_name",
                      "icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"],
                     insert_size)

    for i, tm in enumerate(icdo_port, 1):
        t_code = tm[0]
        m_code = tm[1]

        # DECOMPOSITION: the topography and morphology resolution logic was
        # duplicated verbatim; both now go through _icdo_code_info.
        t_name, t_desc = _icdo_code_info(t_code, icdo_topography, "topography", log)
        m_name, m_desc = _icdo_code_info(m_code, icdo_morphology, "morphology", log)

        name = "; ".join((t_desc, m_desc))

        log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

        ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

    log.debug("{} ICDO terms inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()
    rs.close()