def run(task):
    # Initialization
    conf = task.conf
    log = task.logger()

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    # Run
    for k, v in vars(types).items():
        if k.startswith("CNV_"):
            log.info("Preparing '{0}' ...".format(v))
            em.ensure_collection_exists(v)
            path = rpath.absolute(v.replace(".", "/"))
            log.debug("\tData: {0}".format(path))
            data_repo.mkdir_if_not_exists(path)

    em.close()
    es.close()
    data_repo.close()
    rs.close()

    return 0
def main():
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    id_port = task.ports("mrna_normal_pool")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    results_base_path = "reports/" + types.CNV_COMBINATION.replace(".", "/")

    # Run
    for id in id_port:
        e = em.find(id, types.MRNA_LOG2R_TUMOUR_UNIT)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, id))
            continue

        repo, data_path = rs.from_url(e["data_file"])
        data_local_path = repo.get_local(data_path)

        # "script" is expected to be defined at module level (path to the R script)
        cmd = " ".join([
            conf["bin_paths.R"],
            "--vanilla --slave -f", script,
            "--args", results_base_path, id, data_local_path])

        log.debug(cmd)

        retcode = subprocess.call(args = cmd, shell = True)
        if retcode != 0:
            raise Exception("R script failed")

        repo.close_local(data_local_path)
        repo.close()

    em.close()
    es.close()
    data_repo.close()
    rs.close()
def main():
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    id_port = task.ports("mrna_normal_pool")

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run

    em.close()
    es.close()
def main():
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    mrna_log2r_tunit_port, mrna_normal_pool_port = \
        task.ports(["mrna_log2r_tunit", "mrna_normal_pool"])

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run

    # mrna preprocessing (see the extract_and_send sketch below)
    extract_and_send(log, em, types.MRNA_NORMAL_POOL, mrna_normal_pool_port)
    extract_and_send(log, em, types.MRNA_LOG2R_TUMOUR_UNIT, mrna_log2r_tunit_port)

    em.close()
    es.close()
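# A minimal sketch of the extract_and_send() helper used above. This is an
# assumption drawn from its call sites, not the verified implementation:
# iterate all entities of the given type and forward their ids through the
# output port.
def extract_and_send(log, em, entity_type, port):
    log.info("Sending {} ids ...".format(entity_type))
    count = 0
    for e in em.iter_all(entity_type):
        port.write(e["id"])
        count += 1
    log.debug("{} ids sent".format(count))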
def run(task):
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.data",
        "repositories.source", "mrna.enrichment", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["enrichment_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    enrichment_port = task.ports["enrichment_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # retrieve enrichment configurations
    ec = conf["mrna.enrichment"]

    if "default" in ec:
        default = ec["default"]
    else:
        default = conf.create_element()

    if "modules" not in ec:
        log.error("There is no enrichment modules section available in mrna.enrichment")
        return -1

    log.info("Reading modules configuration ...")

    econfs = list()
    for mod in ec["modules"]:
        m = ec.create_element()
        m.merge(default)
        m.merge(mod)
        mf = m.missing_fields(["id_type", "test", "modules_file"])
        if len(mf) > 0:
            log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
            log.error("Module configuration: {}".format(m))
        else:
            econfs.append(m)
            log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

    if len(econfs) == 0:
        log.error("There are no enrichment configurations available in mrna.enrichment")
        return 0

    results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")

    log.info("Indexing available enrichment results ...")
    enrichment_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
        types.MRNA_ENRICHMENT, unique = True)

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
            continue

        okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

        log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

        for ec in econfs:
            log.info("Module {} [{}] ...".format(ec["id_type"], ec["modules_file"]))

            key = (o["study_id"], o["platform_id"], o["icdo_topography"],
                o["icdo_morphology"], ec["id_type"])

            if key in enrichment_results_index:
                eid = enrichment_results_index[key][0]
                e = em.find(eid, types.MRNA_ENRICHMENT)
                if e is None:
                    log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
                    continue
            else:
                e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
                e["id"] = eid = str(uuid.uuid4())
                e["id_type"] = ec["id_type"]

            # enrichment results
            results_path = rpath.join(results_base_path, eid + ".tsv.gz")

            if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
                log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
                enrichment_port.write(eid)
                continue

            valid = enrichment(log, conf, rs, data_repo, results_path, o["results_file"], e, ec,
                ["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
                ["id", "upreg", "downreg"])

            # save mapped results
            if valid:
                em.persist(e, types.MRNA_ENRICHMENT)
                enrichment_port.write(eid)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
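# A minimal sketch of the skip_file() helper used above and in the other
# tasks (an assumption drawn from its call sites, not the verified
# implementation): skip when overwriting is disabled, the entity already
# references a results file, and that file is present in the repository.
def skip_file(overwrite, repo, path, stored_url):
    return not overwrite and stored_url is not None and repo.exists(path)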
def main():
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    enrichment_port, combination_port = \
        task.ports("enrichment_ids", "combinations")

    es = EntityServer(conf["entities"])
    em = es.manager()

    log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
    comb_results_index = em.group_ids(
        ["icdo_topography", "icdo_morphology", "id_type"],
        types.CNV_COMBINATION, unique = True)

    classif = {}

    log.info("Classifying enrichment results ...")

    for eid in enrichment_port:
        e = em.find(eid, types.CNV_ENRICHMENT)
        if e is None:
            log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid))
            continue

        ekey = (e["study_id"], e["platform_id"], e["icdo_topography"],
            e["icdo_morphology"], e["id_type"])

        key = (e["icdo_topography"], e["icdo_morphology"], e["id_type"])

        log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(
            ", ".join(ekey), eid, ", ".join(key)))

        if key in classif:
            classif[key] += [e]
        else:
            classif[key] = [e]

    log.info("Preparing combinations ...")

    for key in sorted(classif):
        if key in comb_results_index:
            cid = comb_results_index[key][0]
            c = em.find(cid, types.CNV_COMBINATION)
            if c is None:
                log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
                return
        else:
            c = DataElement(key_sep = "/")
            c["id"] = cid = str(uuid.uuid4())
            c["icdo_topography"] = key[0]
            c["icdo_morphology"] = key[1]
            c["id_type"] = key[2]

        elist = classif[key]

        log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(elist)))

        ids = c.create_list()
        flist = c.create_list()

        for e in elist:
            ids += [e["id"]]
            flist += [e["results_file"]]

        c["source"] = src = c.create_element()
        src["type"] = types.CNV_ENRICHMENT
        src["ids"] = ids

        c["files"] = flist

        combination_port.write(c.to_native())

    em.close()
    es.close()
def main():
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    id_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()

    table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

    feat_ids = {}
    for name in table_infixs:
        if name == "gene":
            continue

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS exp_{0}_trs (
              {0}_id int(11) NOT NULL,
              icdo_id int(11) NOT NULL,
              exp_id int(11) NOT NULL,
              upreg_total int(11) DEFAULT NULL,
              upreg_observed double DEFAULT NULL,
              upreg_expected double DEFAULT NULL,
              upreg_stdev double DEFAULT NULL,
              upreg_pvalue double DEFAULT NULL,
              upreg_cpvalue double DEFAULT NULL,
              downreg_total int(11) DEFAULT NULL,
              downreg_observed double DEFAULT NULL,
              downreg_expected double DEFAULT NULL,
              downreg_stdev double DEFAULT NULL,
              downreg_pvalue double DEFAULT NULL,
              downreg_cpvalue double DEFAULT NULL,
              PRIMARY KEY ({0}_id,icdo_id,exp_id),
              KEY icdo (icdo_id,exp_id),
              KEY exp (exp_id),
              CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
              CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
              CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
            ) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

        feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    for id_type, eid in id_port:
        e = em.find(eid, types.MRNA_ENRICHMENT)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

        log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

        table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix),
            ["{}_id".format(table_infix), "icdo_id", "exp_id",
            "upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev",
            "upreg_pvalue", "upreg_cpvalue",
            "downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev",
            "downreg_pvalue", "downreg_cpvalue"], insert_size)

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            ib.close()
            results_repo.close()
            continue

        skipped_ids = set()
        fids = feat_ids[table_infix]

        # read data
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            feat_name = data[0]
            data = [data[i] for i in col_indices]
            if feat_name not in fids:
                skipped_ids.add(feat_name)
                continue

            feat_id = fids[feat_name]
            ib.insert(feat_id, icdo_id, exp_id, *data)

        if len(skipped_ids) > 0:
            log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

        log.debug("{} results inserted".format(ib.count))

        ib.close()
        reader.close()

    em.close()
    es.close()
    rs.close()
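# A minimal sketch of the map_from_select() helper used above (an assumption
# from its call sites, not the verified implementation): the first selected
# column is the value; the remaining columns form the key, scalar when there
# is one of them, a tuple otherwise.
def map_from_select(cursor, query):
    m = {}
    cursor.execute(query)
    for row in cursor.fetchall():
        key = row[1] if len(row) == 2 else tuple(row[1:])
        m[key] = row[0]
    return m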
def run(task):
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["absi_tumour_unit_ids"])
    task.check_out_ports(["log2r_ids"])

    absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
    log2r_port = task.ports["log2r_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    # Index normal pools by study, platform, topography
    log.debug("Indexing normal pools by study, platform and topography ...")
    pools_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_NORMAL_POOL, unique = True)

    # Index log2r assays by absi_id
    log.debug("Indexing log2r assays by absi assay ...")
    log2r_index = em.group_ids(
        ["absi_id"], types.MRNA_LOG2R, unique = True)

    absi_tumour_unit_ids = absi_tumour_unit_port.read_all()

    log.info("Processing %i mrna absi tumour units ..." % len(absi_tumour_unit_ids))
    #log.debug("[%s]" % (", ".join(absi_tumour_unit_ids)))

    # For each abs intensity assay
    pool = None
    pool_data = {}

    for absi in iter_tumour_absi(conf, em, absi_tumour_unit_ids, log):
        absi_id = absi["id"]
        rpath = os.path.join(absi["data_file/path"], absi["data_file/name"])

        icdo_topography = absi["icdo_topography"]
        normal_counterpart = absi.get("normal_counterpart", icdo_topography)
        if icdo_topography != normal_counterpart:
            keystr = "(%s, %s, %s --> %s)" % (absi["study_id"], absi["platform_id"],
                icdo_topography, normal_counterpart)
        else:
            keystr = "(%s, %s, %s)" % (absi["study_id"], absi["platform_id"], icdo_topography)

        exists = (absi_id,) in log2r_index
        if exists:
            log2r_id = log2r_index[(absi_id,)][0]
        else:
            log2r_id = str(uuid.uuid4())

        data_file_path = types.MRNA_LOG2R.replace(".", "/")
        data_file_name = log2r_id + ".tsv.gz"
        dst_path = os.path.join(data_file_path, data_file_name)

        if not overwrite and exists and data_repo.exists(dst_path):
            log.debug("Skipping calculation of log2r for tumour assay %s %s as it is already calculated" % (keystr, absi_id))
            log2r_port.write(log2r_id)
            continue

        log.info("Processing tumour assay %s %s from %s ..." % (keystr, absi_id, rpath))

        repo = rs.repository(absi["data_file/repo"])
        if not repo.exists(rpath):
            log.error("File not found: %s" % rpath)
            continue

        # Get normal counterpart data
        if pool is None \
                or absi["study_id"] != pool["study_id"] \
                or absi["platform_id"] != pool["platform_id"] \
                or normal_counterpart != pool["icdo_topography"]:

            pool_key = (absi["study_id"], absi["platform_id"], normal_counterpart)
            if pool_key not in pools_index:
                log.error("Normal pool not found for tumour assay (%s) %s {%s}" % (
                    ", ".join(pool_key), absi_id, absi.get("source_path", "")))
                continue

            pool_id = pools_index[pool_key][0]
            pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
            if pool is None:
                log.error("Normal pool %s not found by the entity manager !" % pool_id)
                continue

            pool_data = read_pool_data(conf, rs, pool, log)
            if pool_data is None:
                pool = None
                continue

            log.info("Using normal pool ({}) [{}]".format(", ".join(pool_key), pool_id))

        # Calculate log2 ratios
        mr = MatrixReader(repo.open_reader(rpath))

        header = mr.read_header()
        if len(header.columns) != 2:
            log.error("Unexpected number of columns: %i" % len(header.columns))
            mr.close()
            continue

        warn_count = {
            "id_not_in_pool" : 0,
            "value_is_nan" : 0,
            "pool_value_is_nan" : 0,
            "value_is_inf" : 0,
            "pool_value_is_inf" : 0}

        data = {}
        for row in mr:
            if row.name in data:
                log.error("Skipping tumour assay, duplicated row %s at file %s" % (row.name, rpath))
                break

            value = row.values[0]
            value_is_nan = numpy.isnan(value)
            if value_is_nan:
                warn_count["value_is_nan"] += 1
            elif numpy.isinf(value):
                warn_count["value_is_inf"] += 1

            if row.name not in pool_data:
                pool_value = value = numpy.nan
                warn_count["id_not_in_pool"] += 1
            else:
                pool_value = pool_data[row.name]

            pool_value_is_nan = numpy.isnan(pool_value)
            if pool_value_is_nan:
                warn_count["pool_value_is_nan"] += 1
            elif numpy.isinf(pool_value):
                warn_count["pool_value_is_inf"] += 1

            if not value_is_nan and not pool_value_is_nan: # and value != 0.0 and pool_value != 0.0:
                log2r = value - pool_value
            else:
                log2r = numpy.nan

            if not numpy.isinf(log2r):
                data[row.name] = log2r
            #else:
            #    log.warn("row = %s, log2r = %f, value = %f, pool_value = %f" % (row.name, log2r, value, pool_value))

        mr.close()

        sb = ["{0}={1}".format(k, v) for k, v in warn_count.items() if v > 0]
        if len(sb) > 0:
            log.warn(", ".join(sb))

        # Save log2 ratios data and assay
        log2r = deepcopy(absi)
        log2r["id"] = log2r_id
        log2r["absi_id"] = absi_id
        log2r["normal_pool_id"] = pool["id"]
        log2r["data_file/repo"] = data_repo.name()
        log2r["data_file/path"] = data_file_path
        log2r["data_file/name"] = data_file_name

        msg = {True : "Overwriting", False : "Writing"}[exists]
        log.debug("%s log2 ratio data to %s ..." % (msg, dst_path))

        mw = MatrixWriter(data_repo.open_writer(dst_path))
        mw.write_header(["id", "value"])
        for name, value in sorted(data.items()):
            mw.write(name, [value])
        mw.close()

        em.persist(log2r, types.MRNA_LOG2R)
        log2r_port.write(log2r_id)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
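# A minimal sketch of the read_pool_data() helper used above (an assumption
# from its usage, not the verified implementation): load the normal pool
# matrix into a {row name: value} dict, returning None on failure so the
# caller can drop the pool and continue.
def read_pool_data(conf, rs, pool, log):
    repo = rs.repository(pool["data_file/repo"])
    path = os.path.join(pool["data_file/path"], pool["data_file/name"])
    if not repo.exists(path):
        log.error("Normal pool data file not found: %s" % path)
        return None
    data = {}
    mr = MatrixReader(repo.open_reader(path))
    mr.read_header()
    for row in mr:
        data[row.name] = row.values[0]
    mr.close()
    return data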
def main():
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    icdo_port, exp_port = task.ports(["icdo", "experiment"])

    mrna_oncodrive_gene_port, mrna_enrichment_port, mrna_combination_port = \
        task.ports(["mrna_oncodrive_gene", "mrna_enrichment", "mrna_combination"])

    cnv_oncodrive_gene_port, cnv_enrichment_port, cnv_combination_port = \
        task.ports(["cnv_oncodrive_gene", "cnv_enrichment", "cnv_combination"])

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Run

    exp = set()
    icdo = set()

    excludes = None
    if "biomart.excludes" in conf:
        excludes = conf["biomart.excludes"]

    # mrna oncodrive genes (see the extract sketch below)
    results = set()
    extract(log, em, types.MRNA_ONCODRIVE_GENES,
        (results, ("id",)),
        (exp, ("study_id", "platform_id")),
        (icdo, ("icdo_topography", "icdo_morphology")),
        excludes = excludes)

    log.info("Sending {} ids ...".format(types.MRNA_ONCODRIVE_GENES))
    for rid, in results:
        mrna_oncodrive_gene_port.write(rid)

    # mrna enrichment
    results = set()
    extract(log, em, types.MRNA_ENRICHMENT,
        (results, ("id_type", "id")),
        (icdo, ("icdo_topography", "icdo_morphology")),
        excludes = excludes)

    log.info("Sending {} ids ...".format(types.MRNA_ENRICHMENT))
    for r in sorted(results):
        mrna_enrichment_port.write(r)

    # mrna combination
    results = set()
    extract(log, em, types.MRNA_COMBINATION,
        (results, ("id_type", "id")),
        (icdo, ("icdo_topography", "icdo_morphology")),
        excludes = excludes)

    log.info("Sending {} ids ...".format(types.MRNA_COMBINATION))
    for r in sorted(results):
        mrna_combination_port.write(r)

    # cnv oncodrive genes
    results = set()
    extract(log, em, types.CNV_ONCODRIVE_GENES,
        (results, ("id",)),
        (exp, ("study_id", "platform_id")),
        (icdo, ("icdo_topography", "icdo_morphology")),
        excludes = excludes)

    log.info("Sending {} ids ...".format(types.CNV_ONCODRIVE_GENES))
    for rid, in results:
        cnv_oncodrive_gene_port.write(rid)

    # cnv enrichment
    results = set()
    extract(log, em, types.CNV_ENRICHMENT,
        (results, ("id_type", "id")),
        (icdo, ("icdo_topography", "icdo_morphology")),
        excludes = excludes)

    log.info("Sending {} ids ...".format(types.CNV_ENRICHMENT))
    for r in sorted(results):
        cnv_enrichment_port.write(r)

    # cnv combination
    results = set()
    extract(log, em, types.CNV_COMBINATION,
        (results, ("id_type", "id")),
        (icdo, ("icdo_topography", "icdo_morphology")),
        excludes = excludes)

    log.info("Sending {} ids ...".format(types.CNV_COMBINATION))
    for r in sorted(results):
        cnv_combination_port.write(r)

    # icdo
    log.info("Sending icdo's ...")
    for tm in icdo:
        icdo_port.write(tm)

    # exp
    log.info("Sending experiments ...")
    for e in exp:
        exp_port.write(e)

    em.close()
    es.close()
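# A minimal sketch of the extract() helper used above (assumed from its call
# sites; the real implementation may differ, and 'excludes' is treated here
# as a simple {field: value} mapping for illustration): walk all entities of
# a type and collect a tuple of the requested fields into each target set.
def extract(log, em, entity_type, *collectors, **kwargs):
    excludes = kwargs.get("excludes")
    for e in em.iter_all(entity_type):
        if excludes is not None and any(e.get(f) == v for f, v in excludes.items()):
            continue
        for target_set, fields in collectors:
            target_set.add(tuple(e[f] for f in fields))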
def run(task):
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

    log = task.logger()

    task.check_in_ports(["log2r_ids"])
    task.check_out_ports(["log2r_tumour_unit_ids"])

    log2r_port = task.ports["log2r_ids"]
    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    overwrite = conf.get("overwrite", False, dtype=bool)

    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    log.info("Indexing available mrna log2r tumour units ...")
    log2r_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

    units = {}

    for log2r_id in log2r_port:
        e = em.find(log2r_id, types.MRNA_LOG2R)
        if e is None:
            log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
            continue

        eid = e["id"]
        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e.get("icdo_morphology", "")

        log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (
            study_id, platform_id, icdo_topography, icdo_morphology, eid))

        keys = []

        m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
        if m is None:
            log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
            continue
        else:
            level1 = m.group(1)
            level2 = m.group(2)

        if len(icdo_morphology) > 0:
            m = _ICDO_MORPHOLOGY_PAT.match(icdo_morphology)
            if m is None:
                log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
                continue

        keys += [(study_id, platform_id, level1, "")]
        if len(icdo_morphology) > 0:
            keys += [(study_id, platform_id, level1, icdo_morphology)]
            #keys += [(study_id, platform_id, "", icdo_morphology)]

        if level2 is not None:
            keys += [(study_id, platform_id, icdo_topography, "")]
            if len(icdo_morphology) > 0:
                keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                log.debug("\t(%s) [excluded]" % ", ".join(key))
                continue

            log.debug("\t(%s)" % ", ".join(key))

            if key not in units:
                units[key] = [eid]
            else:
                units[key] += [eid]

    log.info("Persisting %i mrna log2r tumour units ..." % len(units))
    log.debug("Minimum size = %i" % min_tumour_unit_size)

    for key, ids in sorted(units.iteritems()):
        size = len(ids)
        if size < min_tumour_unit_size:
            log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
            continue
        else:
            log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

        if key in log2r_tumour_unit_index:
            uid = log2r_tumour_unit_index[key][0]
            if not overwrite:
                u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
            else:
                u = DataElement(key_sep = "/")
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")

        u["id"] = uid
        u["study_id"] = key[0]
        u["platform_id"] = key[1]
        u["icdo_topography"] = key[2]
        u["icdo_morphology"] = key[3]
        u["size"] = size
        u["mrna_log2r_ids"] = u.create_list(ids)

        em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
        log2r_tumour_unit_port.write(uid)

    em.close()
    es.close()
def main():
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    if "biomart.study_source" in conf:
        study_source_map = conf["biomart.study_source"]
    else:
        study_source_map = conf.create_element()

    log = task.logger()

    exp_port = task.ports("experiment")

    es = EntityServer(conf["entities"])
    em = es.manager()

    conn = biomart_db_connect(conf["biomart.db"], log)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE ent_experiment (
          id int(11) NOT NULL,
          exp_name varchar(64) NOT NULL,
          study_id varchar(32) NOT NULL,
          study_source varchar(32) DEFAULT NULL,
          study_source_url varchar(512) DEFAULT NULL,
          study_link varchar(512) DEFAULT NULL,
          pub_pubmed varchar(32) DEFAULT NULL,
          pub_title varchar(300) DEFAULT NULL,
          pub_authors varchar(300) DEFAULT NULL,
          pub_year varchar(16) DEFAULT NULL,
          pub_journal varchar(200) DEFAULT NULL,
          platf_id varchar(32) NOT NULL,
          platf_title varchar(250) DEFAULT NULL,
          platf_technology varchar(96) DEFAULT NULL,
          PRIMARY KEY (id),
          KEY exp_name (exp_name),
          KEY pub_pubmed (pub_pubmed),
          KEY pub_title (pub_title),
          KEY pub_authors (pub_authors),
          KEY pub_year (pub_year),
          KEY pub_journal (pub_journal),
          KEY platf_title (platf_title),
          KEY platf_technology (platf_technology)
        ) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine))

    ib = BatchInsert(cursor, "ent_experiment",
        ["id", "exp_name", "study_id", "study_source", "study_source_url", "study_link",
        "pub_title", "pub_authors", "pub_year", "pub_pubmed", "pub_journal",
        "platf_id", "platf_title", "platf_technology"], insert_size)

    pubmed = Pubmed()

    for i, exp in enumerate(exp_port, 1):
        study_id = exp[0]
        platform_id = exp[1]

        study = em.find(study_id, types.SOURCE_STUDY)
        if study is None:
            log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id))
            continue

        platf = em.find(platform_id, types.SOURCE_PLATFORM)
        if platf is None:
            log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id))

        pub = {}
        for k in ["title", "short_authors", "date", "journal"]:
            pub[k] = None

        if "pubmed" in study:
            pmid = study["pubmed"]
            if isinstance(pmid, (DataElementList, list)):
                pmid = pmid[0]
                log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid))

            log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid))

            try:
                results = pubmed.find(pmid)
                if len(results) == 0:
                    # keep the default pub dict when nothing is found
                    log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id))
                else:
                    pub = results[0]
            except Exception as ex:
                log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid))
                log.exception(ex)
        else:
            pmid = None
            log.warn("Study {} has no 'pubmed_id' annotation".format(study_id))

            if "title" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id))
            elif "SO/contact_details[0]/contact_name" not in study \
                    and "SO/contact_details/contact_name" not in study:
                log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id))
            else:
                try:
                    pub["title"] = study["title"]
                    if "SO/contact_details[0]/contact_name" in study:
                        pub["short_authors"] = study["SO/contact_details[0]/contact_name"]
                    else:
                        pub["short_authors"] = study["SO/contact_details/contact_name"]
                    if "SO/submission/pub_date" in study:
                        pub["date"] = study["SO/submission/pub_date"]
                    else:
                        pub["date"] = ""
                except Exception as ex:
                    log.debug(study)
                    log.exception(ex)

        for k, v in pub.items():
            if v is not None and isinstance(v, basestring):
                pub[k] = v.replace("'", r"\'")

        exp_name = "{}; {}".format(study_id, platform_id)

        study_source = None
        study_source_url = None
        study_link = None

        parts = study_id.split("-")
        if len(parts) >= 2 and parts[0] in study_source_map:
            ss = study_source_map[parts[0]]
            study_source = ss.get("name")
            study_source_url = ss.get("home_url")
            try:
                study_link = ss.get("link", "").format(parts[1])
            except:
                pass

        ib.insert(i, exp_name, study_id,
            study_source, study_source_url, study_link,
            pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"],
            platform_id, platf["SO/platform_title"], "")

    log.debug("{} experiments inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()
def main():
    # Initialization
    task.check_conf(["entities", "repositories", "cnv.background.ensg",
        "cnv.mapping.ensg", "bin_paths.bed_tools"])
    conf = task.conf

    log = task.logger()

    evt_tunit_port, joined_evt_tunit_port = \
        task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    mapping_file = conf["cnv.mapping.ensg"]
    log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
    mapping_repo, mapping_path = rs.from_url(mapping_file)
    mapping_local_path = mapping_repo.get_local(mapping_path)

    background_file = conf["cnv.background.ensg"]
    log.info("Loading background from {} ...".format(background_file))
    background = set()
    repo, path = rs.from_url(background_file)
    reader = repo.open_reader(path)
    for line in reader:
        line = line.rstrip()
        if len(line) == 0:
            continue
        background.add(line)
    reader.close()
    repo.close()

    for uid in evt_tunit_port:
        u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

        tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
        tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

        if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
            log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
            joined_evt_tunit_port.write(uid)
            continue

        log.info("Mapping and joining {} ({}) [{}] ...".format(
            types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

        cnv_evt_ids = u["cnv_evt_ids"]

        log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

        data = {}

        tmp_path = mkdtemp(prefix = "evt_map_and_join_")
        log.debug("Temporary directory: {}".format(tmp_path))

        try:
            for eid in cnv_evt_ids:
                e = em.find(eid, types.CNV_EVENTS)
                if e is None:
                    log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
                    continue

                data_file = e["data_file"]
                log.debug("{} ...".format(data_file))

                repo, path = rs.from_url(data_file)
                local_path = repo.get_local(path)

                # Fix wrong bed files generated by gunes (end should be 1 indexed instead of 0 indexed)
                # tmp_file = os.path.join(tmp_path, "".join([eid, "-fixed-bed.tsv"]))
                # writer = FileWriter(tmp_file)
                # reader = repo.open_reader(path)
                # for line in reader:
                #     if line.lstrip().startswith("#"):
                #         continue
                #     fields = line.rstrip().split("\t")
                #     end = int(fields[2]) + 0 # FIXME fix not necessary already
                #     fields[2] = str(end)
                #     writer.write("\t".join(fields))
                #     writer.write("\n")
                # writer.close()
                # reader.close()

                # Run BED tools to intersect event regions with gene names
                tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

                cmd = " ".join([
                    os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
                    "-a", mapping_local_path,
                    #"-b", tmp_file,
                    "-b", local_path,
                    "-s -wb",
                    ">{}".format(tmp_file2)])

                log.debug(cmd)

                retcode = subprocess.call(args = cmd, shell = True)
                if retcode != 0:
                    raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(
                        ", ".join(key), eid, retcode))

                repo.close_local(local_path)

                # Read BED tools results and load event data into memory
                reader = FileReader(tmp_file2)
                name_index = 3
                value_index = 12
                line_num = 1
                for line in reader:
                    try:
                        fields = line.rstrip().split("\t")
                        name = fields[name_index]
                        value = int(fields[value_index])
                        if value not in [1, 2]:
                            log.error("Unexpected value {} at line {} of data file {}".format(
                                value, line_num, data_file))
                            continue
                    except:
                        log.error("Error parsing line {} of data file {}".format(line_num, data_file))
                        continue

                    k = (eid, name)
                    if k in data:
                        prev_value = data[k]
                    else:
                        prev_value = 0

                    data[k] = prev_value | value

                    line_num += 1

                reader.close()
                repo.close()
        finally:
            if os.path.exists(tmp_path):
                log.debug("Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        # Write events data to data file and merge with background labels
        log.info("Writing data to {} ...".format(tunit_path))

        u["data_file"] = data_repo.url(tunit_path)
        #TODO u["data_timestamp"] = ...

        writer = data_repo.open_writer(tunit_path)

        # header
        for name in cnv_evt_ids:
            writer.write("\t")
            writer.write(name)
        writer.write("\n")

        # data
        for row_name in sorted(background):
            writer.write(row_name)
            for col_name in cnv_evt_ids:
                k = (col_name, row_name)
                if k in data:
                    value = data[k]
                else:
                    value = 0
                writer.write("\t")
                writer.write(str(value))
            writer.write("\n")

        writer.close()

        log.info("Writing {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        joined_evt_tunit_port.write(uid)

    em.close()
    es.close()
    mapping_repo.close_local(mapping_local_path)
    mapping_repo.close()
    data_repo.close()
    rs.close()
def main():
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)
    cursor = conn.cursor()

    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS exp_gene_trs (
          gene_id int(11) NOT NULL,
          icdo_id int(11) NOT NULL,
          exp_id int(11) NOT NULL,
          upreg_total int(11) DEFAULT NULL,
          upreg_observed double DEFAULT NULL,
          upreg_expected double DEFAULT NULL,
          upreg_stdev double DEFAULT NULL,
          upreg_pvalue double DEFAULT NULL,
          upreg_cpvalue double DEFAULT NULL,
          downreg_total int(11) DEFAULT NULL,
          downreg_observed double DEFAULT NULL,
          downreg_expected double DEFAULT NULL,
          downreg_stdev double DEFAULT NULL,
          downreg_pvalue double DEFAULT NULL,
          downreg_cpvalue double DEFAULT NULL,
          PRIMARY KEY (gene_id,icdo_id,exp_id),
          KEY icdo (icdo_id,exp_id),
          KEY exp (exp_id),
          CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
          CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
          CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
        ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)

        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(cursor, "exp_gene_trs",
            ["gene_id", "icdo_id", "exp_id",
            "upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev",
            "upreg_pvalue", "upreg_cpvalue",
            "downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev",
            "downreg_pvalue", "downreg_cpvalue"], insert_size)

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # read data
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            gene_name = data[0]
            data = [data[i] for i in col_indices]
            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            gene_id = gene[gene_name]
            ib.insert(gene_id, icdo_id, exp_id, *data)

        if len(skipped_genes) > 0:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count
        ib.close()
        reader.close()

        # periodically release the lock and let MySQL defragment the table
        if lock_count >= 1000000:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")

    cursor.close()
    em.close()
    es.close()
    rs.close()
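# A minimal sketch of the BatchInsert helper used throughout these exporters
# (an assumption from its call sites, not the verified implementation): it
# buffers rows and flushes them with a multi-row executemany() every
# 'batch_size' rows, tracking the running total in .count.
class BatchInsert(object):
    def __init__(self, cursor, table, columns, batch_size):
        self.cursor = cursor
        placeholders = ", ".join(["%s"] * len(columns))
        self.sql = "INSERT INTO {} ({}) VALUES ({})".format(
            table, ", ".join(columns), placeholders)
        self.batch_size = batch_size
        self.rows = []
        self.count = 0

    def insert(self, *values):
        self.rows.append(tuple(values))
        self.count += 1
        if len(self.rows) >= self.batch_size:
            self.flush()

    def flush(self):
        if len(self.rows) > 0:
            self.cursor.executemany(self.sql, self.rows)
            self.rows = []

    def close(self):
        self.flush()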
def main():
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay",
        "cnv.min_tumour_unit_size"])
    conf = task.conf

    log = task.logger()

    study_ids_port, evt_port, evt_tunit_port = \
        task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    source_repo = rs.repository("source")

    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
    evt_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS, unique = True)

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    evt_tunit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}
    skipped_assay_count = {}
    wrong_assays = {}
    wrong_samples = {}
    tumour_units = {}
    evt_dup = {}

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):
        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")
        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            doc_path = assay.get("__doc_path", "UNKNOWN")
            log.error("Assay {} in study {} missing required fields: ({}) ({})".format(
                assay_id, study_id, ", ".join(mf), assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]
        if study_id not in study_ids:
            log.debug("Assay {} not included in 'study_ids'".format(assay_id))
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]
        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]

        source_path = assay["source_path"]
        source_file = assay["assay_property/filename"]

        e = assay.transform([
            ("assay_id", "id"),
            "study_id", "platform_id", "sample_id", "source_path"])
        e["data_file"] = source_repo.url("assay", source_path, source_file)

        included = study_id in study_ids and study_type == "genomic"
        included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

        if not included:
            if study_type != "transcriptomic" and study_id in study_ids:
                s = ", ".join([" = ".join(v) for v in [
                    ("study_id", study_id), ("assay_design", assay_design),
                    ("data_type", data_type), ("study_type", study_type)]])
                log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
            map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        sample_source_path = sample.get("source_path", "")

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
        if len(mf) > 0:
            log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(
                sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id",
            "source_path",
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology") ])

        disease_state = sample["disease_state"]
        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '{}' for sample {} ({})".format(
                disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        orig_disease_state = disease_state
        disease_state = disease_state_map[disease_state]
        if disease_state not in ["tumour"]:
            log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(
                sample_id, assay_id, study_id, sample_source_path, orig_disease_state))
            continue

        e["disease_state"] = disease_state
        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo, rel_path = rs.from_url(e["data_file"])
        if not repo.exists(rel_path):
            log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])
        eid = None
        duplicated = False
        exists = False

        if e_key in evt_dup:
            duplicated = True
        elif e_key in evt_index:
            eid = evt_index[e_key][0]
            exists = True

        if duplicated:
            log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())
        e["id"] = eid

        u_key = (study_id, platform_id,
            e.get("normal_counterpart", e["icdo_topography"]),
            e.get("icdo_morphology", ""))

        keys = classify_by_experiment_and_icdo(
            u_key[0], u_key[1], u_key[2], u_key[3])

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                continue
            map_list_add(tumour_units, key, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True : "Overwriting", False : "Writing"}[exists]
        log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))

        em.persist(e, types.CNV_EVENTS)
        evt_port.write(eid)

        evt_dup[e_key] = eid

    min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

    log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    log.debug("Minimum size = {}".format(min_tumour_unit_size))

    for key in sorted(tumour_units):
        v = tumour_units[key]
        size = len(v)

        if size < min_tumour_unit_size:
            discard = True
            discard_text = "[skipped]"
        else:
            discard = False
            discard_text = ""

        if key in evt_tunit_index:
            uid = evt_tunit_index[key][0]
            u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
            if u is None:
                log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
                continue
            arrow_text = "==>"
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")
            u["id"] = uid
            u["study_id"] = key[0]
            u["platform_id"] = key[1]
            u["icdo_topography"] = key[2]
            u["icdo_morphology"] = key[3]
            arrow_text = "-->"

        log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

        if discard:
            continue

        u["size"] = len(v)
        u["cnv_evt_ids"] = u.create_list(v)

        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        evt_tunit_port.write(uid)

    sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(
        processed_assays, len(processed_studies), len(study_ids))]
    log.info("".join(sb))

    log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))
    log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))
    log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

    em.close()
    es.close()
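# Minimal sketches of the small map helpers used above (assumptions drawn
# from their call sites, not the verified implementations): map_inc counts
# occurrences per key, map_list_add accumulates values per key.
def map_inc(m, key):
    m[key] = m.get(key, 0) + 1

def map_list_add(m, key, value):
    if key in m:
        m[key].append(value)
    else:
        m[key] = [value]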
def main():
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    combinations_port, combination_ids_port = \
        task.ports("combinations", "combination_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    results_base_path = types.CNV_COMBINATION.replace(".", "/")

    conditions = ("gain", "loss")

    for c_dict in combinations_port:
        c = DataFactory.from_native(c_dict, key_sep = "/")

        """
        o = em.find(c, types.CNV_ONCODRIVE_GENES)
        if o is None:
            log.error("{0} not found: {1}".format(types.CNV_ONCODRIVE_GENES, c))
            continue

        okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])
        """

        cid = c["id"]

        key = (c["icdo_topography"], c["icdo_morphology"], c["id_type"])

        log.info("Processing combination for ({}) [{}] ...".format(", ".join(key), cid))

        #files = c["files"]
        #if len(files) == 1:
        #    log.info("No combination required, copied from {0}".format(files[0]))
        #    c["results_file"] = files[0]
        #else:

        results_path = rpath.join(results_base_path, cid + ".tsv.gz")
        results_url = data_repo.url(results_path)

        if skip_file(overwrite, data_repo, results_path, c.get("results_file")):
            log.warn("Skipping {} ({}) [{}] as it already exists".format(
                types.CNV_COMBINATION, ", ".join(key), cid))
            combination_ids_port.write(cid)
            continue

        c["results_file"] = results_url

        combination(log, conf, rs, c, data_repo, results_path, conditions)

        # save combination results
        em.persist(c, types.CNV_COMBINATION)
        combination_ids_port.write(cid)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
def main():
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf
    log = task.logger()

    oncodrive_port, combination_port = \
        task.ports("oncodrive_ids", "combinations")

    es = EntityServer(conf["entities"])
    em = es.manager()

    log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
    comb_results_index = em.group_ids(
        ["icdo_topography", "icdo_morphology", "id_type"],
        types.CNV_COMBINATION, unique = True)

    ENSEMBL_GENE = "ensembl:gene"

    classif = {}

    log.info("Classifying oncodrive results ...")

    for oid in oncodrive_port:
        o = em.find(oid, types.CNV_ONCODRIVE_GENES)
        if o is None:
            log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid))
            continue

        okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

        key = (o["icdo_topography"], o["icdo_morphology"], ENSEMBL_GENE)

        log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(
            ", ".join(okey), oid, ", ".join(key)))

        if key in classif:
            classif[key] += [o]
        else:
            classif[key] = [o]

    log.info("Preparing combinations ...")

    for key in sorted(classif):
        if key in comb_results_index:
            cid = comb_results_index[key][0]
            c = em.find(cid, types.CNV_COMBINATION)
            if c is None:
                log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
                return
        else:
            c = DataElement(key_sep = "/")
            c["id"] = cid = str(uuid.uuid4())
            c["icdo_topography"] = key[0]
            c["icdo_morphology"] = key[1]
            c["id_type"] = ENSEMBL_GENE

        olist = classif[key]

        log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(olist)))

        ids = c.create_list()
        flist = c.create_list()

        for o in olist:
            ids += [o["id"]]
            flist += [o["results_file"]]

        c["source"] = src = c.create_element()
        src["type"] = types.CNV_ONCODRIVE_GENES
        src["ids"] = ids

        c["files"] = flist

        combination_port.write(c.to_native())

    em.close()
    es.close()
def main():
    task.check_conf(["entities", "repositories", "biomart.db",
        "biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    log = task.logger()

    icdo_port = task.ports("icdo")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"]))
    icdo_repo, icdo_path = rs.from_url(conf["biomart.files.icdo_topography"])
    icdo_local_path = icdo_repo.get_local(icdo_path)
    icdo_topography = map_from_file(icdo_local_path)
    icdo_repo.close_local(icdo_path)
    icdo_repo.close()

    log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"]))
    icdo_repo, icdo_path = rs.from_url(conf["biomart.files.icdo_morphology"])
    icdo_local_path = icdo_repo.get_local(icdo_path)
    icdo_morphology = map_from_file(icdo_local_path)
    icdo_repo.close_local(icdo_path)
    icdo_repo.close()

    conn = biomart_db_connect(conf["biomart.db"], log)
    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE ent_icdo (
          id int(11) NOT NULL,
          icdo_name varchar(512) NOT NULL DEFAULT '',
          icdo_topography varchar(255) NOT NULL DEFAULT '',
          icdo_morphology varchar(255) NOT NULL DEFAULT '',
          icdo_topography_code varchar(24) NOT NULL DEFAULT '',
          icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
          icdo_topography_name varchar(255) NOT NULL DEFAULT '',
          icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
          PRIMARY KEY (id),
          KEY icdo_name (icdo_name),
          KEY icdo_tm (icdo_topography,icdo_morphology),
          KEY icdo_m (icdo_morphology),
          KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
          KEY icdo_m_c (icdo_morphology_code)
        ) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

    ib = BatchInsert(cursor, "ent_icdo",
        ["id", "icdo_name",
        "icdo_topography", "icdo_topography_code", "icdo_topography_name",
        "icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"], insert_size)

    for i, tm in enumerate(icdo_port, 1):
        t_code = tm[0]
        if t_code == "":
            t_name = t_desc = "ANY topography"
        elif t_code not in icdo_topography:
            log.error("Unknown topography description for code {}".format(t_code))
            t_name = ""
            t_desc = "[{}]".format(t_code)
        else:
            t_name = icdo_topography[t_code]
            t_desc = "{} [{}]".format(t_name, t_code)

        m_code = tm[1]
        if m_code == "":
            m_name = m_desc = "ANY morphology"
        elif m_code not in icdo_morphology:
            log.error("Unknown morphology description for code {}".format(m_code))
            m_name = ""
            m_desc = "[{}]".format(m_code)
        else:
            m_name = icdo_morphology[m_code]
            m_desc = "{} [{}]".format(m_name, m_code)

        name = "; ".join((t_desc, m_desc))

        log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

        ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

    log.debug("{} ICDO terms inserted".format(ib.count))

    ib.close()
    cursor.close()
    conn.close()
    em.close()
    es.close()
    rs.close()
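# A minimal sketch of the map_from_file() helper used above (an assumption
# from context: the ICD-O code files appear to be two-column, tab-separated
# code/name lists; the real implementation may differ).
def map_from_file(path):
    m = {}
    f = open(path, "r")
    for line in f:
        line = line.rstrip("\n")
        if len(line) == 0:
            continue
        fields = line.split("\t")
        m[fields[0]] = fields[1]
    f.close()
    return m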