def _merge_env(self, env1, env2):
    env = DataElement()
    if env1 is not None:
        env.merge(env1)
    if env2 is not None:
        env.merge(env2)
    return env

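# Usage sketch (hypothetical values; assumes DataElement.merge overlays the
# argument's keys onto the receiver, so env2 wins on conflicts):
#
#   base = DataElement(); base["PATH"] = "/usr/bin"
#   extra = DataElement(); extra["PATH"] = "/opt/bin"
#   env = self._merge_env(base, extra)  # env["PATH"] == "/opt/bin"
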
def resources(self):
    if self.parent is None:
        conf = DataElement()
    else:
        conf = self.parent.resources
    if self.model.resources is not None:
        conf.merge(self.model.resources)
    return conf

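# Resolution sketch: resources() merges bottom-up, so values set on this model
# override inherited ones (assuming DataElement.merge lets the argument win).
# E.g. if the parent chain yields {"memory": "4G", "cpus": 1} and
# self.model.resources is {"memory": "8G"}, the result is
# {"memory": "8G", "cpus": 1}.
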
def __init__(self, initial_conf = None, required = [], args_usage = "", add_options = None, expand_vars = False):
    DataElement.__init__(self)

    from optparse import OptionParser

    parser = OptionParser(usage = "usage: %prog [options] " + args_usage, version = VERSION)

    parser.add_option("-L", "--log-level", dest="log_level", default=None,
        choices=["debug", "info", "warn", "error", "critical", "notset"],
        help="Which log level: debug, info, warn, error, critical, notset")

    parser.add_option("-c", "--conf", action="append", dest="conf_files", default=[], metavar="FILE",
        help="Load configuration from a file. Multiple files can be specified")

    parser.add_option("-D", action="append", dest="data", default=[], metavar="PARAM=VALUE",
        help="External data value. example -D param1=value")

    if add_options is not None:
        add_options(parser)

    (self.options, self.args) = parser.parse_args()

    self.builder = ConfigBuilder()

    if initial_conf is not None:
        if isinstance(initial_conf, dict):
            initial_conf = DataFactory.from_native(initial_conf)
        self.builder.add_element(initial_conf)

    if self.options.log_level is not None:
        self.builder.add_value("wok.log.level", self.options.log_level)

    if len(self.options.conf_files) > 0:
        files = []
        for conf_file in self.options.conf_files:
            self.builder.add_file(conf_file)
            files.append(os.path.abspath(conf_file))
        self.builder.add_value("__files", DataFactory.from_native(files))

    for data in self.options.data:
        d = data.split("=")
        if len(d) != 2:
            raise Exception("Wrong data argument: " + data)
        self.builder.add_value(d[0], d[1])

    self.builder.merge_into(self)

    if len(required) > 0:
        self.check_required(required)

    if expand_vars:
        self.expand_vars()

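# Example invocation of a script built on this class (hypothetical file and
# key names; option semantics follow the parser definitions above, and the
# ordering assumes ConfigBuilder applies additions in insertion order):
#
#   python run.py -L debug -c base.conf -c site.conf -D wok.work.path=/tmp/wok
#
# merges base.conf then site.conf, sets wok.log.level to "debug", records the
# absolute paths of both files under "__files", and applies the -D value last
# so it overrides anything loaded from the files.
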
def to_element(self, e = None):
    if e is None:
        e = DataElement()
    e["name"] = self.name
    e["conf"] = self.conf
    self.root_node.update_tasks_count_by_state()
    self.root_node.update_modules_count_by_state()
    self.root_node.to_element(e.create_element("root"))
    return e

def _task_config_to_element(task):
    e = DataElement(key_sep = "/")
    e["id"] = task.id
    e["name"] = task.name
    e["index"] = task.index
    e["module"] = task.parent.id
    e["instance"] = task.instance.name
    e["conf"] = task.conf

    #TODO depends on module definition
    iteration = e.create_element("iteration")
    iteration["strategy"] = "dot"
    iteration["size"] = 0

    ports = e.create_element("ports")

    in_ports = ports.create_list("in")
    for i, port_node in enumerate(task.parent.in_ports):
        pe = DataElement(key_sep = "/")
        # pe["name"] = port_node.name
        # pe["serializer"] = port_node.serializer
        # pe["partition"] = pdata.partition
        # pe["start"] = pdata.start
        # pe["size"] = pdata.size
        #task.in_port_data[i].fill_element(pe.create_element("data"))
        task.in_port_data[i].fill_element(pe)
        in_ports.append(pe)

    out_ports = ports.create_list("out")
    for i, port_node in enumerate(task.parent.out_ports):
        pe = DataElement(key_sep = "/")
        # pe["name"] = port_node.name
        # pe["serializer"] = port_node.serializer
        # pe["partition"] = pdata.partition
        #task.out_port_data[i].fill_element(pe.create_element("data"))
        task.out_port_data[i].fill_element(pe)
        out_ports.append(pe)

    return e

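# For reference, the element built above has roughly this shape (a sketch;
# the exact port fields depend on what task.in_port_data[i].fill_element and
# task.out_port_data[i].fill_element write):
#
#   { id, name, index, module, instance, conf,
#     iteration: { strategy: "dot", size: 0 },
#     ports: { in: [ {...}, ... ], out: [ {...}, ... ] } }
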
def prepare(self, task):
    wok_conf = task.instance.conf.get("wok")
    if wok_conf is None:
        wok_conf = DataElement()

    lang = self.conf.get("language", "python")
    lang_key = "execution.mode.native.{}".format(lang)
    if lang_key in wok_conf:
        lang_conf = wok_conf[lang_key]
    else:
        lang_conf = DataElement()

    if "script_path" not in self.conf:
        raise MissingRequiredOption("script_path")

    script_path = self.conf["script_path"]

    if lang == "python":
        cmd = lang_conf.get("bin", "python")
        args = [self._task_absolute_path(task, script_path)]

        env = self._merge_env(lang_conf.get("env"), self.conf.get("env"))
        if "lib_path" in lang_conf:
            if "PYTHONPATH" in env:
                env["PYTHONPATH"] = ":".join(lang_conf["lib_path"]) + ":" + env["PYTHONPATH"]
            else:
                env["PYTHONPATH"] = ":".join(lang_conf["lib_path"])
    else:
        raise UnknownNativeCmdBuilderLanguage(lang)

    args += ["-D", "instance_name=" + task.instance.name,
             "-D", "module_path=" + ".".join([task.parent.namespace, task.parent.name]),
             "-D", "task_index=" + str(task.index)]

    for key, value in self._storage_conf(task.instance.engine.storage.basic_conf):
        args += ["-D", "storage.{}={}".format(key, value)]

    return cmd, args, env.to_native()

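# A sketch of what prepare() returns for a Python task (hypothetical paths
# and values, assuming lang_conf provides "bin" and "lib_path"):
#
#   cmd  = "/usr/bin/python"
#   args = ["/path/to/flow/script.py",
#           "-D", "instance_name=run1",
#           "-D", "module_path=ns.module1",
#           "-D", "task_index=0",
#           "-D", "storage.<key>=<value>", ...]
#   env  = {"PYTHONPATH": "/opt/wok/lib:/existing/pythonpath"}
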
def main(): # Initialization task.check_conf(["entities"]) conf = task.conf log = task.logger() oncodrive_port, combination_port = \ task.ports("oncodrive_ids", "combinations") es = EntityServer(conf["entities"]) em = es.manager() log.info("Indexing available {} results ...".format(types.CNV_COMBINATION)) comb_results_index = em.group_ids( ["icdo_topography", "icdo_morphology", "id_type"], types.CNV_COMBINATION, unique = True) ENSEMBL_GENE = "ensembl:gene" classif = {} log.info("Classifying oncodrive results ...") for oid in oncodrive_port: o = em.find(oid, types.CNV_ONCODRIVE_GENES) if o is None: log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid)) continue okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"]) key = (o["icdo_topography"], o["icdo_morphology"], ENSEMBL_GENE) log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(", ".join(okey), oid, ", ".join(key))) if key in classif: classif[key] += [o] else: classif[key] = [o] log.info("Preparing combinations ...") for key in sorted(classif): if key in comb_results_index: cid = comb_results_index[key][0] c = em.find(cid, types.CNV_COMBINATION) if c is None: log.error("{} not found: {}".format(types.CNV_COMBINATION, cid)) return else: c = DataElement(key_sep = "/") c["id"] = cid = str(uuid.uuid4()) c["icdo_topography"] = key[0] c["icdo_morphology"] = key[1] c["id_type"] = ENSEMBL_GENE olist = classif[key] log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(olist))) ids = c.create_list() flist = c.create_list() for o in olist: ids += [o["id"]] flist += [o["results_file"]] c["source"] = src = c.create_element() src["type"] = types.CNV_ONCODRIVE_GENES src["ids"] = ids c["files"] = flist combination_port.write(c.to_native()) em.close() es.close()
def main(): # Initialization task.check_conf(["entities"]) conf = task.conf log = task.logger() enrichment_port, combination_port = \ task.ports("enrichment_ids", "combinations") es = EntityServer(conf["entities"]) em = es.manager() log.info("Indexing available {} results ...".format(types.CNV_COMBINATION)) comb_results_index = em.group_ids( ["icdo_topography", "icdo_morphology", "id_type"], types.CNV_COMBINATION, unique = True) classif = {} log.info("Classifying enrichment results ...") for eid in enrichment_port: e = em.find(eid, types.CNV_ENRICHMENT) if e is None: log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid)) continue ekey = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"], e["id_type"]) key = (e["icdo_topography"], e["icdo_morphology"], e["id_type"]) log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(", ".join(ekey), eid, ", ".join(key))) if key in classif: classif[key] += [e] else: classif[key] = [e] log.info("Preparing combinations ...") for key in sorted(classif): if key in comb_results_index: cid = comb_results_index[key][0] c = em.find(cid, types.CNV_COMBINATION) if c is None: log.error("{} not found: {}".format(types.CNV_COMBINATION, cid)) return else: c = DataElement(key_sep = "/") c["id"] = cid = str(uuid.uuid4()) c["icdo_topography"] = key[0] c["icdo_morphology"] = key[1] c["id_type"] = key[2] elist = classif[key] log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(elist))) ids = c.create_list() flist = c.create_list() for e in elist: ids += [e["id"]] flist += [e["results_file"]] c["source"] = src = c.create_element() src["type"] = types.CNV_ENRICHMENT src["ids"] = ids c["files"] = flist combination_port.write(c.to_native()) em.close() es.close()
def run(task):

    # Initialization

    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

    log = task.logger()

    task.check_in_ports(["log2r_ids"])
    task.check_out_ports(["log2r_tumour_unit_ids"])

    log2r_port = task.ports["log2r_ids"]
    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    overwrite = conf.get("overwrite", False, dtype=bool)

    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    log.info("Indexing available mrna log2r tumour units ...")
    log2r_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

    units = {}

    for log2r_id in log2r_port:
        e = em.find(log2r_id, types.MRNA_LOG2R)
        if e is None:
            log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
            continue

        eid = e["id"]
        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e.get("icdo_morphology", "")

        log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (study_id, platform_id, icdo_topography, icdo_morphology, eid))

        keys = []

        m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
        if m is None:
            log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
            continue
        else:
            level1 = m.group(1)
            level2 = m.group(2)

        if len(icdo_morphology) > 0:
            m = _ICDO_MORPHOLOGY_PAT.match(icdo_morphology)
            if m is None:
                log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
                continue

        keys += [(study_id, platform_id, level1, "")]
        if len(icdo_morphology) > 0:
            keys += [(study_id, platform_id, level1, icdo_morphology)]
            #keys += [(study_id, platform_id, "", icdo_morphology)]

        if level2 is not None:
            keys += [(study_id, platform_id, icdo_topography, "")]
            if len(icdo_morphology) > 0:
                keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                log.debug("\t(%s) [excluded]" % ", ".join(key))
                continue

            log.debug("\t(%s)" % ", ".join(key))

            if key not in units:
                units[key] = [eid]
            else:
                units[key] += [eid]

    log.info("Persisting %i mrna log2r tumour units ..." % len(units))
    log.debug("Minimum size = %i" % min_tumour_unit_size)

    for key, ids in sorted(units.iteritems()):
        size = len(ids)
        if size < min_tumour_unit_size:
            log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
            continue
        else:
            log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

        if key in log2r_tumour_unit_index:
            uid = log2r_tumour_unit_index[key][0]
            if not overwrite:
                u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
            else:
                u = DataElement(key_sep = "/")
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")

        u["id"] = uid
        u["study_id"] = key[0]
        u["platform_id"] = key[1]
        u["icdo_topography"] = key[2]
        u["icdo_morphology"] = key[3]
        u["size"] = size
        u["mrna_log2r_ids"] = u.create_list(ids)

        em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)

        log2r_tumour_unit_port.write(uid)

    em.close()
    es.close()

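# Key expansion sketch: assuming _ICDO_TOPOGRAPHY_PAT captures the "C34" level
# in group(1) and the optional ".1" suffix in group(2), a log2r entity with
# topography "C34.1" and morphology "M8140/3" yields up to four unit keys:
#
#   (study_id, platform_id, "C34",   "")
#   (study_id, platform_id, "C34",   "M8140/3")
#   (study_id, platform_id, "C34.1", "")
#   (study_id, platform_id, "C34.1", "M8140/3")
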
def main():

    # Initialization

    task.check_conf(["entities", "repositories", "repositories.assay", "cnv.min_tumour_unit_size"])
    conf = task.conf

    log = task.logger()

    study_ids_port, evt_port, evt_tunit_port = \
        task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    source_repo = rs.repository("source")

    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
    evt_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS, unique = True)

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    evt_tunit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}
    skipped_assay_count = {}
    wrong_assays = {}
    wrong_samples = {}
    tumour_units = {}
    evt_dup = {}

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):
        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")

        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            doc_path = assay.get("__doc_path", "UNKNOWN")
            log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]
        if study_id not in study_ids:
            log.debug("Assay {} not included in 'study_ids'".format(assay_id))
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]

        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]

        source_path = assay["source_path"]
        source_file = assay["assay_property/filename"]

        e = assay.transform([
            ("assay_id", "id"),
            "study_id", "platform_id", "sample_id", "source_path"])

        e["data_file"] = source_repo.url("assay", source_path, source_file)

        included = study_id in study_ids and study_type == "genomic"
        included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

        if not included:
            if study_type != "transcriptomic" and study_id in study_ids:
                s = ", ".join([" = ".join(v) for v in [
                    ("study_id", study_id), ("assay_design", assay_design),
                    ("data_type", data_type), ("study_type", study_type)]])
                log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
                map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        # Resolve the sample source path up front so later log messages can
        # reference it even when all required fields are present
        sample_source_path = sample.get("source_path", "")

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
        if len(mf) > 0:
            log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id", "source_path",
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology") ])

        disease_state = sample["disease_state"]
        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        orig_disease_state = disease_state
        disease_state = disease_state_map[disease_state]

        if disease_state not in ["tumour"]:
            log.warn("Sample {} associated with assay {} in study {} does not have a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample_source_path, orig_disease_state))
            continue

        e["disease_state"] = disease_state
        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo, rel_path = rs.from_url(e["data_file"])
        if not repo.exists(rel_path):
            log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

        eid = None
        duplicated = False
        exists = False
        if e_key in evt_dup:
            duplicated = True
        elif e_key in evt_index:
            eid = evt_index[e_key][0]
            exists = True

        if duplicated:
            log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())

        e["id"] = eid

        u_key = (study_id, platform_id,
            e.get("normal_counterpart", e["icdo_topography"]),
            e.get("icdo_morphology", ""))

        keys = classify_by_experiment_and_icdo(
            u_key[0], u_key[1], u_key[2], u_key[3])

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                continue
            map_list_add(tumour_units, key, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True : "Overwriting", False : "Writing"}[exists]
        log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))

        em.persist(e, types.CNV_EVENTS)
        evt_port.write(eid)
        evt_dup[e_key] = eid

    min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

    log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    log.debug("Minimum size = {}".format(min_tumour_unit_size))

    for key in sorted(tumour_units):
        v = tumour_units[key]
        size = len(v)

        if size < min_tumour_unit_size:
            discard = True
            discard_text = "[skipped]"
        else:
            discard = False
            discard_text = ""

        if key in evt_tunit_index:
            uid = evt_tunit_index[key][0]
            u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
            if u is None:
                log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
                continue
            arrow_text = "==>"
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")
            u["id"] = uid
            u["study_id"] = key[0]
            u["platform_id"] = key[1]
            u["icdo_topography"] = key[2]
            u["icdo_morphology"] = key[3]
            arrow_text = "-->"

        log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

        if discard:
            continue

        u["size"] = len(v)
        u["cnv_evt_ids"] = u.create_list(v)

        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        evt_tunit_port.write(uid)

    sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(
        processed_assays, len(processed_studies), len(study_ids))]
    log.info("".join(sb))

    log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))
    log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))
    log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

    em.close()
    es.close()

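# The map helpers used above are assumed to behave as follows (a sketch, not
# the actual definitions, which live elsewhere in the codebase):
#
#   def map_list_add(m, key, value):  # append value to the list at m[key]
#       m.setdefault(key, []).append(value)
#
#   def map_inc(m, key):              # count occurrences of key
#       m[key] = m.get(key, 0) + 1
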
def run(task):

    # Initialization

    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["study_ids"])
    task.check_out_ports(["absi_ids", "absi_tumour_unit_ids", "normal_pool_ids", "log2r_source_ids"])

    study_ids_port = task.ports["study_ids"]
    absi_port = task.ports["absi_ids"]
    absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
    normal_pool_port = task.ports["normal_pool_ids"]
    log2r_source_port = task.ports["log2r_source_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    #overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Creating indices for {} ...".format(types.MRNA_ABS_INTENSITY))
    absi_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ABS_INTENSITY, unique = True)

    log.info("Creating indices for {} ...".format(types.MRNA_LOG2R_SOURCE))
    log2r_src_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_SOURCE, unique = True)

    log.info("Creating indices for {} ...".format(types.MRNA_ABSI_TUMOUR_UNIT))
    absi_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_ABSI_TUMOUR_UNIT, unique = True)

    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}
    skipped_assay_count = {}
    wrong_assays = {}
    wrong_samples = {}
    log2r_src_units = {}
    tumour_units = {}
    normal_pools = {}
    absi_dup = {}
    log2r_source_dup = {}

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):
        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")

        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            doc_path = assay.get("__doc_path", "UNKNOWN")
            log.error("Assay %s in study %s missing required fields: %s {%s}" % (assay_id, study_id, mf, assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]
        if study_id not in study_ids:
            log.debug("Assay %s not included in 'study_ids'" % assay_id)
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]

        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]

        e = assay.transform([
            ("assay_id", "id"),
            "study_id", "platform_id", "sample_id", "source_path",
            ("data_file/path", "source_path"),
            ("data_file/name", "assay_property/filename") ])

        e["data_file/repo"] = assay.get("data_file/repo", "assay")

        included = study_id in study_ids and study_type == "transcriptomic"
        included &= (assay_design == "cancer_and_normal" and data_type == "log_abs_readings") \
            or (assay_design == "cancer_vs_normal" and data_type == "log2ratios")

        if not included:
            if study_type != "genomic" and study_id in study_ids:
                s = ", ".join(["%s = %s" % (v[0], v[1]) for v in [
                    ("study_id", study_id), ("assay_design", assay_design),
                    ("data_type", data_type), ("study_type", study_type)]])
                log.warn("Skipping assay %s {%s}: %s." % (assay_id, assay_source_path, s))
                map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay %s references a non-existent sample: %s" % (assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
        if len(mf) > 0:
            sample_id = sample.get("id", "WITHOUT ID")
            doc_path = sample.get("__doc_path", "UNKNOWN")
            sample_source_path = sample.get("source_path", "")
            log.error("Sample %s associated with assay %s in study %s missing required fields: %s {%s}" % (sample_id, assay_id, study_id, mf, sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id",
            ("source_path", "source_path"),
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology") ])

        disease_state = sample["disease_state"]
        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '%s' for sample %s {%s}" % (disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        disease_state = disease_state_map[disease_state]

        if disease_state not in ["tumour", "normal"]:
            continue

        e["disease_state"] = disease_state
        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo = rs.repository(e["data_file/repo"])
        rel_path = os.path.join(e["data_file/path"], e["data_file/name"])
        if not repo.exists(rel_path):
            log.error("Assay %s in study %s missing data file: [%s]" % (assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

        eid = None
        duplicated = False
        exists = False
        if data_type == "log_abs_readings":
            if key in absi_dup:
                duplicated = True
            elif key in absi_index:
                eid = absi_index[key][0]
                exists = True
        elif data_type == "log2ratios":
            if key in log2r_source_dup:
                duplicated = True
            elif key in log2r_src_index:
                eid = log2r_src_index[key][0]
                exists = True

        if duplicated:
            log.error("Duplicated key (%s) for assay %s" % (", ".join(key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())

        e["id"] = eid

        if disease_state == "normal":
            if data_type == "log2ratios":
                k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
                map_list_add(log2r_src_units, k, eid)
            elif data_type == "log_abs_readings":
                map_list_add(normal_pools, (study_id, platform_id, e["icdo_topography"]), eid)
            else:
                log.error("Assay %s has an unexpected combination of (disease_state, assay_design, data_type): (%s, %s, %s)" % (assay_id, disease_state, assay_design, data_type))
                map_list_add(wrong_assays, study_id, assay_id)
                continue
        elif disease_state == "tumour":
            k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
            if data_type == "log_abs_readings":
                map_list_add(tumour_units, k, eid)
            elif data_type == "log2ratios":
                map_list_add(log2r_src_units, k, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True : "Overwriting", False : "Writing"}[exists]
        if data_type == "log_abs_readings":
            log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_ABS_INTENSITY, ", ".join(key), eid))
            em.persist(e, types.MRNA_ABS_INTENSITY)
            absi_port.write(eid)
            absi_dup[key] = eid
        elif data_type == "log2ratios":
            log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_LOG2R_SOURCE, ", ".join(key), eid))
            em.persist(e, types.MRNA_LOG2R_SOURCE)
            log2r_source_port.write(eid)
            log2r_source_dup[key] = eid

    log.info("Persisting mrna absi tumour units ...")
    for k, v in sorted(tumour_units.items()):
        key = (k[0], k[1], k[2])
        exists = key in absi_tumour_unit_index
        if exists:
            uid = absi_tumour_unit_index[key][0]
        else:
            uid = str(uuid.uuid4())

        u = DataElement(key_sep = "/")
        u["id"] = uid
        u["study_id"] = k[0]
        u["platform_id"] = k[1]
        u["icdo_topography"] = k[2]
        u["size"] = len(v)
        u["mrna_absi_ids"] = u.create_list(v)

        if exists:
            log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
        else:
            log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

        em.persist(u, types.MRNA_ABSI_TUMOUR_UNIT)
        absi_tumour_unit_port.write(uid)

    log.info("Creating indices for mrna normal pools ...")
    normal_pool_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_NORMAL_POOL, unique = True)

    log.info("Persisting mrna normal pools ...")
    for k, v in sorted(normal_pools.items()):
        key = (k[0], k[1], k[2])
        exists = key in normal_pool_index
        if exists:
            uid = normal_pool_index[key][0]
        else:
            uid = str(uuid.uuid4())

        u = DataElement(key_sep = "/")
        u["id"] = uid
        u["study_id"] = k[0]
        u["platform_id"] = k[1]
        u["icdo_topography"] = k[2]
        u["size"] = len(v)
        u["mrna_absi_ids"] = u.create_list(v)

        if exists:
            log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
        else:
            log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

        em.persist(u, types.MRNA_NORMAL_POOL)
        normal_pool_port.write(uid)

    sb = ["\n\nProcessed %i assays for %i studies (out of %i):\n\n" %
        (processed_assays, len(processed_studies), len(study_ids))]

    sb += ["%i mrna tumour units:\n\n" % (len(tumour_units))]
    for k, v in sorted(tumour_units.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\n%i mrna normal pools:\n\n" % (len(normal_pools))]
    for k, v in sorted(normal_pools.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\n%i mrna source log2r units:\n\n" % (len(log2r_src_units))]
    for k, v in sorted(log2r_src_units.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\nAssay counts by study and platform:\n\n"]
    for k, v in sorted(valid_assay_count.items()):
        sb += ["\t%s\t%i assays" % (k, v)]
        if k in wrong_assays:
            sb += ["\t%i failed assays" % len(wrong_assays[k])]
        if k in wrong_samples:
            sb += ["\t%i failed samples" % len(wrong_samples[k])]
        sb += ["\n"]

    log.info("".join(sb))

    if len(skipped_assay_count) > 0:
        log.info("Skipped assays:\n\n%s" % map_count_tostring(skipped_assay_count, indent = 1))

    if len(wrong_assays) > 0:
        log.info("Summary of failed assays:\n\n%s" % map_list_tostring(wrong_assays))

    if len(wrong_samples) > 0:
        log.info("Summary of failed samples:\n\n%s" % map_list_tostring(wrong_samples))

    em.close()
    es.close()

    return 0