def main(): conf = Config() logging.basicConfig(format = "%(asctime)s %(name)s %(levelname) -5s : %(message)s") log = logging.getLogger("reload_pubmed") log.setLevel(logging.DEBUG) pubmed = Pubmed() log.info("Connecting ...") conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() update_cursor = conn.cursor() log.info("Querying experiments ...") cursor.execute(""" select id, pub_pubmed, study_id, platf_id from ent_experiment where pub_pubmed is not NULL""") SPACES = re.compile("\s+") row = cursor.fetchone() while row is not None: id, pmid, study_id, platf_id = row log.info(">>> PMID: {}, STUDY: {}, PLATFORM: {}".format(pmid, study_id, platf_id)) pub = pubmed.find(pmid) if pub is None: log.error("PMID not found: {}".format(pmid)) continue pub = pub[0] for k, v in pub.items(): if v is not None and isinstance(v, basestring): pub[k] = v.replace("'", r"\'") sql = u""" update ent_experiment set pub_title='{}', pub_authors='{}', pub_year='{}', pub_journal='{}' where id={}""".format( pub["title"], pub["short_authors"], pub["date"], pub["journal"], id) log.debug(SPACES.sub(" ", sql.strip())) update_cursor.execute(sql) row = cursor.fetchone() cursor.close() update_cursor.close() conn.close()
def main(): conf = Config() logging.basicConfig(format = "%(asctime)s %(name)s %(levelname) -5s : %(message)s") log = logging.getLogger("reload_pubmed") log.setLevel(logging.DEBUG) log.info("Connecting ...") conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() update_cursor = conn.cursor() log.info("Querying experiments ...") cursor.execute("""select id, pub_pubmed, study_id, platf_id from ent_experiment""") SPACES = re.compile("\s+") row = cursor.fetchone() while row is not None: id, pmid, study_id, platf_id = row log.info(">>> PMID: {}, STUDY: {}, PLATFORM: {}".format(pmid, study_id, platf_id)) exp_name = "; ".join((study_id, platf_id)) sql = u""" update ent_experiment set exp_name='{}' where id={}""".format(exp_name, id) log.debug(SPACES.sub(" ", sql.strip())) update_cursor.execute(sql) row = cursor.fetchone() cursor.close() update_cursor.close() conn.close()
def main(): task.check_conf(["biomart.db"]) conf = task.conf log = task.logger() conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() cursor.execute( """DROP TABLE IF EXISTS omod_cmb, omod_exp, exp_gene_trs, exp_go_trs, exp_kegg_trs, exp_tfbs_trs, exp_mirna_trs, cmb_gene_trs, cmb_go_trs, cmb_kegg_trs, cmb_tfbs_trs, cmb_mirna_trs, exp_gene_cna, exp_go_cna, exp_kegg_cna, exp_tfbs_cna, exp_mirna_cna, cmb_gene_cna, cmb_go_cna, cmb_kegg_cna, cmb_tfbs_cna, cmb_mirna_cna, ent_experiment, ent_icdo""" ) cursor.close() conn.close()
def main(): task.check_conf(["entities", "repositories", "biomart.db"]) conf = task.conf db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE) log = task.logger() conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() log.info("Gain modules ...") cursor.execute(""" create table omod_cmb ( cond_id integer not null, icdo_id integer not null, gene_id integer not null, omod_cmb_name varchar(600) not null, primary key (cond_id, icdo_id, gene_id), index (icdo_id, gene_id), index (gene_id), foreign key (cond_id) references ent_condition(cond_id), foreign key (icdo_id) references ent_icdo(id), foreign key (gene_id) references ent_gene(id) ) ENGINE={}, DEFAULT CHARSET=latin1 select 1 as cond_id, r.icdo_id, r.gene_id, concat('gain; ', i.icdo_topography, '; ', i.icdo_morphology) as omod_cmb_name from cmb_gene_cna r left join ent_icdo i on r.icdo_id = i.id where gain_pvalue <= 0.05 order by icdo_id, gene_id""".format(db_engine)) log.info("Loss modules ...") cursor.execute(""" insert into omod_cmb select 2 as cond_id, r.icdo_id, r.gene_id, concat('loss; ', i.icdo_topography, '; ', i.icdo_morphology) as omod_cmb_name from cmb_gene_cna r left join ent_icdo i on r.icdo_id = i.id where loss_pvalue <= 0.05 order by icdo_id, gene_id""") log.info("Upregulation modules ...") cursor.execute(""" insert into omod_cmb select 3 as cond_id, r.icdo_id, r.gene_id, concat('upreg; ', i.icdo_topography, '; ', i.icdo_morphology) as omod_cmb_name from cmb_gene_trs r left join ent_icdo i on r.icdo_id = i.id where upreg_pvalue <= 0.05 order by icdo_id, gene_id""") log.info("Downregulation modules ...") cursor.execute(""" insert into omod_cmb select 4 as cond_id, r.icdo_id, r.gene_id, concat('downreg; ', i.icdo_topography, '; ', i.icdo_morphology) as omod_cmb_name from cmb_gene_trs r left join ent_icdo i on r.icdo_id = i.id where downreg_pvalue <= 0.05 order by icdo_id, gene_id""") cursor.close() conn.close()
def main(): conf = Config() logging.basicConfig(format = "%(asctime)s %(name)s %(levelname) -5s : %(message)s") log = logging.getLogger("reload_pubmed") log.setLevel(logging.DEBUG) log.info("Connecting ...") conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() update_cursor = conn.cursor() log.info("Querying experiments ...") cursor.execute("""select id, icdo_topography, icdo_topography_code, icdo_morphology, icdo_morphology_code from ent_icdo""") SPACES = re.compile("\s+") TOPOGRAPHY = re.compile(r"^\[C\d\d(\.\d)?\] (.+)") TOPOGRAPHY2 = re.compile(r"(.+) \[C\d\d(\.\d)?\]$") MORPHOLOGY = re.compile(r"^\[\d\d\d\d\/\d\] (.+)") row = cursor.fetchone() while row is not None: id, icdo_topography, icdo_topography_code, icdo_morphology, icdo_morphology_code = row row = cursor.fetchone() log.info(">>> ID: {}, T: {}, M: {}".format(id, icdo_topography, icdo_morphology)) # if icdo_morphology != "ANY morphology": # continue # m = TOPOGRAPHY.match(icdo_topography) # if m is None: # log.error("Wrong topography: {}".format(icdo_topography)) # continue # # icdo_topography = m.group(2) # m = MORPHOLOGY.match(icdo_morphology) # if m is None: # log.error("Wrong morphology: {}".format(icdo_morphology)) # continue # # icdo_morphology = m.group(1) #icdo_name = "{} [{}]; {} [{}]".format(icdo_topography, icdo_topography_code, icdo_morphology, icdo_morphology_code) # icdo_name = "{} [{}]; {}".format(icdo_topography, icdo_topography_code, icdo_morphology) # sql = u""" # update ent_icdo # set icdo_name='{}', # icdo_topography='{}', # icdo_morphology='{}' # where id={}""".format(icdo_name, icdo_topography, icdo_morphology, id) # icdo_topography_name = icdo_topography # icdo_topography = "{} [{}]".format(icdo_topography, icdo_topography_code) # # icdo_morphology_name = icdo_morphology # if icdo_morphology != "ANY morphology": # icdo_morphology = "{} [{}]".format(icdo_morphology, icdo_morphology_code) # sql = u""" # update ent_icdo # set icdo_topography='{}', icdo_morphology_name='{}', # icdo_morphology='{}', icdo_morphology_name='{}' # where id={}""".format(icdo_topography, icdo_topography_name, icdo_morphology, icdo_morphology_name, id) m = TOPOGRAPHY2.match(icdo_topography) if m is None: log.error("Wrong topography: {}".format(icdo_topography)) continue icdo_topography_name = m.group(1) sql = u""" update ent_icdo set icdo_topography_name='{}' where id={}""".format(icdo_topography_name, id) log.debug(SPACES.sub(" ", sql.strip())) update_cursor.execute(sql) cursor.close() update_cursor.close() conn.close()
def main(): task.check_conf(["entities", "repositories", "biomart.db"]) conf = task.conf insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int) db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE) log = task.logger() id_port = task.ports("id") es = EntityServer(conf["entities"]) em = es.manager() rs = RepositoryServer(conf["repositories"]) conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values()) feat_ids = {} for name in table_infixs: if name == "gene": continue cursor.execute(""" CREATE TABLE IF NOT EXISTS exp_{0}_trs ( {0}_id int(11) NOT NULL, icdo_id int(11) NOT NULL, exp_id int(11) NOT NULL, upreg_total int(11) DEFAULT NULL, upreg_observed double DEFAULT NULL, upreg_expected double DEFAULT NULL, upreg_stdev double DEFAULT NULL, upreg_pvalue double DEFAULT NULL, upreg_cpvalue double DEFAULT NULL, downreg_total int(11) DEFAULT NULL, downreg_observed double DEFAULT NULL, downreg_expected double DEFAULT NULL, downreg_stdev double DEFAULT NULL, downreg_pvalue double DEFAULT NULL, downreg_cpvalue double DEFAULT NULL, PRIMARY KEY ({0}_id,icdo_id,exp_id), KEY icdo (icdo_id,exp_id), KEY exp (exp_id), CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id), CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id), CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id) ) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine)) feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name)) icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo") exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment") for id_type, eid in id_port: e = em.find(eid, types.MRNA_ENRICHMENT) if e is None: log.error("{} not found: {1}".format(types.MRNA_ENRICHMENT, eid)) continue if "results_file" not in e: log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid)) continue study_id = e["study_id"] platform_id = e["platform_id"] icdo_topography = e["icdo_topography"] icdo_morphology = e["icdo_morphology"] okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type) log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid)) table_infix = ID_TYPE_TO_TABLE_INFIX[id_type] icdo_key = (icdo_topography, icdo_morphology) if icdo_key not in icdo: log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key))) continue icdo_id = icdo[icdo_key] exp_key = (study_id, platform_id) if exp_key not in exp: log.error("Experiment ({}) not found in the database".format(", ".join(exp_key))) continue exp_id = exp[exp_key] ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix), ["{}_id".format(table_infix), "icdo_id", "exp_id", "upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue", "downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"], insert_size) results_repo, results_path = rs.from_url(e["results_file"]) try: reader = results_repo.open_reader(results_path) except Exception as ex: log.exception(ex) ib.close() results_repo.close() continue # read header hdr_map = {} hdr = reader.readline().rstrip().split("\t") for i, name in enumerate(hdr): hdr_map[name] = i try: col_indices = [hdr_map[x] for x in __COLUMN_NAMES] except KeyError as e: log.warn("Column {} not found in results files, most probably because it is empty".format(e.args[0])) reader.close() ib.close() results_repo.close() continue skipped_ids = set() fids = feat_ids[table_infix] # read data for line in reader: line = line.rstrip() data = line.split("\t") feat_name = data[0] data = [data[i] for i in col_indices] if feat_name not in fids: skipped_ids.add(feat_name) continue feat_id = fids[feat_name] ib.insert(feat_id, icdo_id, exp_id, *data) if len(skipped_ids) > 0: log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids))) log.debug("{} results inserted".format(ib.count)) ib.close() reader.close() em.close() es.close() rs.close()
def main(): task.check_conf(["entities", "repositories", "biomart.db"]) conf = task.conf insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int) if "biomart.study_source" in conf: study_source_map = conf["biomart.study_source"] else: study_source_map = conf.create_element() log = task.logger() exp_port = task.ports("experiment") es = EntityServer(conf["entities"]) em = es.manager() conn = biomart_db_connect(conf["biomart.db"], log) db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE) cursor = conn.cursor() cursor.execute(""" CREATE TABLE ent_experiment ( id int(11) NOT NULL, exp_name varchar(64) NOT NULL, study_id varchar(32) NOT NULL, study_source varchar(32) DEFAULT NULL, study_source_url varchar(512) DEFAULT NULL, study_link varchar(512) DEFAULT NULL, pub_pubmed varchar(32) DEFAULT NULL, pub_title varchar(300) DEFAULT NULL, pub_authors varchar(300) DEFAULT NULL, pub_year varchar(16) DEFAULT NULL, pub_journal varchar(200) DEFAULT NULL, platf_id varchar(32) NOT NULL, platf_title varchar(250) DEFAULT NULL, platf_technology varchar(96) DEFAULT NULL, PRIMARY KEY (id), KEY exp_name (exp_name), KEY pub_pubmed (pub_pubmed), KEY pub_title (pub_title), KEY pub_authors (pub_authors), KEY pub_year (pub_year), KEY pub_journal (pub_journal), KEY platf_title (platf_title), KEY platf_technology (platf_technology) ) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine)) ib = BatchInsert(cursor, "ent_experiment", ["id", "exp_name", "study_id", "study_source", "study_source_url", "study_link", "pub_title", "pub_authors", "pub_year", "pub_pubmed", "pub_journal", "platf_id", "platf_title", "platf_technology"], insert_size) pubmed = Pubmed() for i, exp in enumerate(exp_port, 1): study_id = exp[0] platform_id = exp[1] study = em.find(study_id, types.SOURCE_STUDY) if study is None: log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id)) continue platf = em.find(platform_id, types.SOURCE_PLATFORM) if platf is None: log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id)) continue log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id)) pub = {} for k in ["title", "short_authors", "date", "journal"]: pub[k] = None if "pubmed" in study: pmid = study["pubmed"] if isinstance(pmid, (DataElementList, list)): pmid = pmid[0] log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid)) log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid)) try: pub = pubmed.find(pmid) if len(pub) == 0: log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id)) else: pub = pub[0] except Exception as ex: log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid)) log.exception(ex) else: pmid = None log.warn("Study {} has no 'pubmed_id' annotation".format(study_id)) if "title" not in study: log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id)) elif "SO/contact_details[0]/contact_name" not in study \ and "SO/contact_details/contact_name" not in study: log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id)) else: try: pub["title"] = study["title"] if "SO/contact_details[0]/contact_name" in study: pub["short_authors"] = study["SO/contact_details[0]/contact_name"] else: pub["short_authors"] = study["SO/contact_details/contact_name"] if "SO/submission/pub_date" in study: pub["date"] = study["SO/submission/pub_date"] else: pub["date"] = "" except Exception as ex: log.debug(study) log.execption(ex) for k, v in pub.items(): if v is not None and isinstance(v, basestring): pub[k] = v.replace("'", r"\'") exp_name = "{}; {}".format(study_id, platform_id) study_source = None study_source_url = None study_link = None parts = study_id.split("-") if len(parts) >= 2 and parts[0] in study_source_map: ss = study_source_map[parts[0]] study_source = ss.get("name") study_source_url = ss.get("home_url") try: study_link = ss.get("link", "").format(parts[1]) except: pass ib.insert(i, exp_name, study_id, study_source, study_source_url, study_link, pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"], platform_id, platf["SO/platform_title"], "") log.debug("{} experiments inserted".format(ib.count)) ib.close() cursor.close() conn.close() em.close() es.close()
def main(): task.check_conf(["entities", "repositories", "biomart.db"]) conf = task.conf insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int) db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE) log = task.logger() oncodrive_port = task.ports("id") es = EntityServer(conf["entities"]) em = es.manager() rs = RepositoryServer(conf["repositories"]) conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene") icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo") exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment") cursor.execute( """ CREATE TABLE IF NOT EXISTS exp_gene_trs ( gene_id int(11) NOT NULL, icdo_id int(11) NOT NULL, exp_id int(11) NOT NULL, upreg_total int(11) DEFAULT NULL, upreg_observed double DEFAULT NULL, upreg_expected double DEFAULT NULL, upreg_stdev double DEFAULT NULL, upreg_pvalue double DEFAULT NULL, upreg_cpvalue double DEFAULT NULL, downreg_total int(11) DEFAULT NULL, downreg_observed double DEFAULT NULL, downreg_expected double DEFAULT NULL, downreg_stdev double DEFAULT NULL, downreg_pvalue double DEFAULT NULL, downreg_cpvalue double DEFAULT NULL, PRIMARY KEY (gene_id,icdo_id,exp_id), KEY icdo (icdo_id,exp_id), KEY exp (exp_id), CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id), CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id), CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id) ) ENGINE={} DEFAULT CHARSET=latin1""".format( db_engine ) ) cursor.execute("LOCK TABLES exp_gene_trs WRITE") lock_count = 0 for eid in oncodrive_port: e = em.find(eid, types.MRNA_ONCODRIVE_GENES) if e is None: log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid)) continue if "results_file" not in e: log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid)) continue study_id = e["study_id"] platform_id = e["platform_id"] icdo_topography = e["icdo_topography"] icdo_morphology = e["icdo_morphology"] okey = (study_id, platform_id, icdo_topography, icdo_morphology) log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid)) icdo_key = (icdo_topography, icdo_morphology) if icdo_key not in icdo: log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key))) continue icdo_id = icdo[icdo_key] exp_key = (study_id, platform_id) if exp_key not in exp: log.error("Experiment ({}) not found in the database".format(", ".join(exp_key))) continue exp_id = exp[exp_key] ib = BatchInsert( cursor, "exp_gene_trs", [ "gene_id", "icdo_id", "exp_id", "upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue", "downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue", ], insert_size, ) results_repo, results_path = rs.from_url(e["results_file"]) try: reader = results_repo.open_reader(results_path) except Exception as ex: log.exception(ex) ib.close() results_repo.close() continue # read header hdr_map = {} hdr = reader.readline().rstrip().split("\t") for i, name in enumerate(hdr): hdr_map[name] = i try: col_indices = [hdr_map[x] for x in __COLUMN_NAMES] except KeyError as e: log.warn("Column {} not found in results files, most probably because it is empty".format(e.args[0])) reader.close() lock_count += ib.count ib.close() results_repo.close() continue skipped_genes = set() # read data for line in reader: line = line.rstrip() data = line.split("\t") gene_name = data[0] data = [data[i] for i in col_indices] if gene_name not in gene: skipped_genes.add(gene_name) continue gene_id = gene[gene_name] ib.insert(gene_id, icdo_id, exp_id, *data) if len(skipped_genes) > 0: log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes))) log.debug("{} gene results inserted".format(ib.count)) lock_count += ib.count ib.close() reader.close() if lock_count >= 1000000: cursor.execute("UNLOCK TABLES") cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs") cursor.execute("LOCK TABLES exp_gene_trs WRITE") lock_count = 0 cursor.execute("UNLOCK TABLES") cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs") cursor.close() em.close() es.close() rs.close()
def main(): task.check_conf(["entities", "repositories", "biomart.db", "biomart.files.icdo_topography", "biomart.files.icdo_morphology"]) conf = task.conf insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int) log = task.logger() icdo_port = task.ports("icdo") es = EntityServer(conf["entities"]) em = es.manager() rs = RepositoryServer(conf["repositories"]) log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"])) icdo_repo, icdo_path = rs.from_url(conf["biomart.files.icdo_topography"]) icdo_local_path = icdo_repo.get_local(icdo_path) icdo_topography = map_from_file(icdo_local_path) icdo_repo.close_local(icdo_path) icdo_repo.close() log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"])) icdo_repo, icdo_path = rs.from_url(conf["biomart.files.icdo_morphology"]) icdo_local_path = icdo_repo.get_local(icdo_path) icdo_morphology = map_from_file(icdo_local_path) icdo_repo.close_local(icdo_path) icdo_repo.close() conn = biomart_db_connect(conf["biomart.db"], log) db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE) cursor = conn.cursor() cursor.execute(""" CREATE TABLE ent_icdo ( id int(11) NOT NULL, icdo_name varchar(512) NOT NULL DEFAULT '', icdo_topography varchar(255) NOT NULL DEFAULT '', icdo_morphology varchar(255) NOT NULL DEFAULT '', icdo_topography_code varchar(24) NOT NULL DEFAULT '', icdo_morphology_code varchar(24) NOT NULL DEFAULT '', icdo_topography_name varchar(255) NOT NULL DEFAULT '', icdo_morphology_name varchar(255) NOT NULL DEFAULT '', PRIMARY KEY (id), KEY icdo_name (icdo_name), KEY icdo_tm (icdo_topography,icdo_morphology), KEY icdo_m (icdo_morphology), KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code), KEY icdo_m_c (icdo_morphology_code) ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine)) ib = BatchInsert(cursor, "ent_icdo", ["id", "icdo_name", "icdo_topography", "icdo_topography_code", "icdo_topography_name", "icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"], insert_size) for i, tm in enumerate(icdo_port, 1): t_code = tm[0] if t_code == "": t_name = t_desc = "ANY topography" elif t_code not in icdo_topography: log.error("Unknown topography description for code {}".format(t_code)) t_name = "" t_desc = "[{}]".format(t_code) else: t_name = icdo_topography[t_code] t_desc = "{} [{}]".format(t_name, t_code) m_code = tm[1] if m_code == "": m_name = m_desc = "ANY morphology" elif m_code not in icdo_morphology: log.error("Unknown morphology description for code {}".format(m_code)) m_name = "" m_desc = "[{}]".format(m_code) else: m_name = icdo_morphology[m_code] m_desc = "{} [{}]".format(m_name, m_code) name = "; ".join((t_desc, m_desc)) log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc)) ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name) log.debug("{} ICDO terms inserted".format(ib.count)) ib.close() cursor.close() conn.close() em.close() es.close() rs.close()
def main(): conf = task.conf insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int) log = task.logger() rs = RepositoryServer(conf["repositories"]) conn = biomart_db_connect(conf["biomart.db"], log) cursor = conn.cursor() log.info("Creating table ...") cursor.execute("DROP TABLE IF EXISTS ent_gene") cursor.execute(""" CREATE TABLE `ent_gene` ( `id` int(11) NOT NULL, `gene_name` varchar(32) NOT NULL, `gene_sym` varchar(48) DEFAULT NULL, `gene_desc` varchar(1020) DEFAULT '', `gene_chr` varchar(16) DEFAULT NULL, `gene_band` varchar(20) DEFAULT NULL, `gene_strand` int(2) DEFAULT NULL, `gene_pos_start` int(11) DEFAULT NULL, `gene_pos_end` int(11) DEFAULT NULL, `gene_biotype` varchar(22) DEFAULT NULL, `gene_status` varchar(12) DEFAULT NULL, `gene_source` varchar(14) DEFAULT NULL, `gene_transcript_count` int(3) DEFAULT NULL, `gene_gc_content` double DEFAULT NULL, PRIMARY KEY (`id`), KEY `gene_name` (`gene_name`), KEY `gene_sym` (`gene_sym`), KEY `gene_chr` (`gene_chr`), KEY `gene_band` (`gene_band`), KEY `gene_strand` (`gene_strand`), KEY `gene_pos_start` (`gene_pos_start`), KEY `gene_pos_end` (`gene_pos_end`), KEY `gene_biotype` (`gene_biotype`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1""") insert_sql = """INSERT INTO `ent_gene` ( `id`, `gene_name`, `gene_sym`, `gene_desc`, `gene_chr`, `gene_band`, `gene_strand`, `gene_pos_start`, `gene_pos_end`, `gene_biotype`, `gene_status`, `gene_source`, `gene_transcript_count`, `gene_gc_content`) VALUES""" ib = BatchInsert(cursor, insert_sql, insert_size) repo, path = rs.from_url(conf["biomart.files.gene"]) local_path = repo.get_local(path) f = FileReader(local_path) hdr = f.readline().rstrip().split("\t") hdr_index = {} for i, name in enumerate(hdr): hdr_index[name] = i __COLUMNS = ["gene_name", "gene_sym", "gene_desc", "gene_chr", "gene_band", "gene_strand", "gene_pos_start", "gene_pos_end", "gene_biotype", "gene_status", "gene_source", "gene_transcript_count", "gene_gc_content"] indices = [hdr_index[c] for c in __COLUMNS] for id, line in enumerate(f): fields = line.rstrip().split("\t") ib.insert([id] + [fields[idx] for idx in indices]) ib.close() f.close() repo.close_local(local_path) rs.close()