def main():
    """Task entry point: group CNV oncodrive-genes results into combination
    entities keyed by (icdo_topography, icdo_morphology, id_type) and write
    each combination to the 'combinations' port.
    """
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf

    log = task.logger()

    # Input port of oncodrive result ids, output port for combinations.
    oncodrive_port, combination_port = \
        task.ports("oncodrive_ids", "combinations")

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Index of already-existing combination entities, so re-runs update
    # rather than duplicate them.
    log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
    comb_results_index = em.group_ids(
        ["icdo_topography", "icdo_morphology", "id_type"],
        types.CNV_COMBINATION, unique = True)

    # Oncodrive-genes results are always identified by Ensembl gene ids here.
    ENSEMBL_GENE = "ensembl:gene"

    # key -> list of oncodrive result entities sharing that key
    classif = {}

    log.info("Classifying oncodrive results ...")

    for oid in oncodrive_port:
        o = em.find(oid, types.CNV_ONCODRIVE_GENES)
        if o is None:
            log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid))
            continue

        # okey is only used for the debug message; key is the grouping key.
        okey = (o["study_id"], o["platform_id"],
                o["icdo_topography"], o["icdo_morphology"])

        key = (o["icdo_topography"], o["icdo_morphology"], ENSEMBL_GENE)

        log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(", ".join(okey), oid, ", ".join(key)))

        if key in classif:
            classif[key] += [o]
        else:
            classif[key] = [o]

    log.info("Preparing combinations ...")

    for key in sorted(classif):
        if key in comb_results_index:
            # Reuse the existing combination entity for this key.
            cid = comb_results_index[key][0]
            c = em.find(cid, types.CNV_COMBINATION)
            if c is None:
                log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
                # NOTE(review): this aborts the whole task instead of
                # skipping this key (other loops in this codebase use
                # `continue` here) — confirm this is intentional.
                return
        else:
            # Create a fresh combination entity with a new uuid.
            c = DataElement(key_sep = "/")
            c["id"] = cid = str(uuid.uuid4())
            c["icdo_topography"] = key[0]
            c["icdo_morphology"] = key[1]
            c["id_type"] = ENSEMBL_GENE

        olist = classif[key]

        log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(olist)))

        # Collect the source result ids and their result files.
        ids = c.create_list()
        flist = c.create_list()

        for o in olist:
            ids += [o["id"]]
            flist += [o["results_file"]]

        c["source"] = src = c.create_element()
        src["type"] = types.CNV_ONCODRIVE_GENES
        src["ids"] = ids

        c["files"] = flist

        combination_port.write(c.to_native())

    em.close()
    es.close()
def main():
    """Task entry point: group CNV enrichment results into combination
    entities keyed by (icdo_topography, icdo_morphology, id_type) and write
    each combination to the 'combinations' port.
    """
    # Initialization
    task.check_conf(["entities"])
    conf = task.conf

    log = task.logger()

    # Input port of enrichment result ids, output port for combinations.
    enrichment_port, combination_port = \
        task.ports("enrichment_ids", "combinations")

    es = EntityServer(conf["entities"])
    em = es.manager()

    # Index of already-existing combination entities, so re-runs update
    # rather than duplicate them.
    log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
    comb_results_index = em.group_ids(
        ["icdo_topography", "icdo_morphology", "id_type"],
        types.CNV_COMBINATION, unique = True)

    # key -> list of enrichment result entities sharing that key
    classif = {}

    log.info("Classifying enrichment results ...")

    for eid in enrichment_port:
        e = em.find(eid, types.CNV_ENRICHMENT)
        if e is None:
            log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid))
            continue

        # ekey is only used for the debug message; key is the grouping key.
        ekey = (e["study_id"], e["platform_id"],
                e["icdo_topography"], e["icdo_morphology"], e["id_type"])

        key = (e["icdo_topography"], e["icdo_morphology"], e["id_type"])

        log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(", ".join(ekey), eid, ", ".join(key)))

        if key in classif:
            classif[key] += [e]
        else:
            classif[key] = [e]

    log.info("Preparing combinations ...")

    for key in sorted(classif):
        if key in comb_results_index:
            # Reuse the existing combination entity for this key.
            cid = comb_results_index[key][0]
            c = em.find(cid, types.CNV_COMBINATION)
            if c is None:
                log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
                # NOTE(review): this aborts the whole task instead of
                # skipping this key — confirm this is intentional.
                return
        else:
            # Create a fresh combination entity with a new uuid.
            c = DataElement(key_sep = "/")
            c["id"] = cid = str(uuid.uuid4())
            c["icdo_topography"] = key[0]
            c["icdo_morphology"] = key[1]
            c["id_type"] = key[2]

        elist = classif[key]

        log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(elist)))

        # Collect the source result ids and their result files.
        ids = c.create_list()
        flist = c.create_list()

        for e in elist:
            ids += [e["id"]]
            flist += [e["results_file"]]

        c["source"] = src = c.create_element()
        src["type"] = types.CNV_ENRICHMENT
        src["ids"] = ids

        c["files"] = flist

        combination_port.write(c.to_native())

    em.close()
    es.close()
def main():
    """Task entry point: scan all source assays, build CNV event entities
    for valid genomic cancer-vs-normal binary assays, classify them into
    tumour units, and persist events and sufficiently large tumour units.

    Fix: ``sample_source_path`` used to be assigned only inside the
    missing-fields branch, so the non-tumour ``log.warn(...)`` below raised
    a NameError whenever a *complete* sample had a non-tumour disease
    state. It is now assigned unconditionally before the check.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay",
        "cnv.min_tumour_unit_size"])
    conf = task.conf

    log = task.logger()

    study_ids_port, evt_port, evt_tunit_port = \
        task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    source_repo = rs.repository("source")

    # Optional set of ICD-O topography codes whose tumour units are skipped.
    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    # Indices of already-persisted entities, so re-runs overwrite instead
    # of duplicating.
    log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
    evt_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS, unique = True)

    log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    evt_tunit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

    # Bookkeeping for the final summary.
    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}
    skipped_assay_count = {}
    wrong_assays = {}      # study_id -> [assay_id] that failed validation
    wrong_samples = {}     # study_id -> [sample_id] that failed validation
    tumour_units = {}      # unit key -> [event id]
    evt_dup = {}           # event key -> event id, to detect duplicates

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):
        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")

        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            doc_path = assay.get("__doc_path", "UNKNOWN")
            log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]

        if study_id not in study_ids:
            log.debug("Assay {} not included in 'study_ids'".format(assay_id))
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]
        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]
        source_path = assay["source_path"]
        source_file = assay["assay_property/filename"]

        # Start the event entity from the assay fields.
        e = assay.transform([
            ("assay_id", "id"),
            "study_id", "platform_id", "sample_id", "source_path"])
        e["data_file"] = source_repo.url("assay", source_path, source_file)

        # Only genomic, cancer-vs-normal, binary assays are of interest here.
        included = study_id in study_ids and study_type == "genomic"
        included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

        if not included:
            # Transcriptomic assays are another task's concern, so their
            # skipping is not reported.
            if study_type != "transcriptomic" and study_id in study_ids:
                s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
                log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
                map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state",
            "icdo/topography"])

        # FIX: assign unconditionally — previously only set inside the
        # missing-fields branch, making the non-tumour warn below a NameError.
        sample_source_path = sample.get("source_path", "")

        if len(mf) > 0:
            log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id", "source_path",
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology")])

        disease_state = sample["disease_state"]

        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        orig_disease_state = disease_state
        disease_state = disease_state_map[disease_state]

        # Only tumour samples produce CNV events.
        if disease_state not in ["tumour"]:
            log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample_source_path, orig_disease_state))
            continue

        e["disease_state"] = disease_state
        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo, rel_path = rs.from_url(e["data_file"])
        if not repo.exists(rel_path):
            log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        e_key = (study_id, platform_id, sample_id,
            e["icdo_topography"], e["icdo_morphology"])

        eid = None
        duplicated = False
        exists = False
        if e_key in evt_dup:
            # Already seen in this run: a duplicated source assay.
            duplicated = True
        elif e_key in evt_index:
            # Persisted by a previous run: reuse its id and overwrite.
            eid = evt_index[e_key][0]
            exists = True

        if duplicated:
            log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())
        e["id"] = eid

        # Classify the event into one or more tumour-unit keys; normal
        # counterpart topography takes precedence when present.
        u_key = (study_id, platform_id,
            e.get("normal_counterpart", e["icdo_topography"]),
            e.get("icdo_morphology", ""))

        keys = classify_by_experiment_and_icdo(
            u_key[0], u_key[1], u_key[2], u_key[3])

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                continue
            map_list_add(tumour_units, key, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True : "Overwritting", False : "Writting"}[exists]
        log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))

        em.persist(e, types.CNV_EVENTS)
        evt_port.write(eid)
        evt_dup[e_key] = eid

    # Persist tumour units that reach the configured minimum size.
    min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

    log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
    log.debug("Minimum size = {}".format(min_tumour_unit_size))

    for key in sorted(tumour_units):
        v = tumour_units[key]
        size = len(v)

        if size < min_tumour_unit_size:
            discard = True
            discard_text = "[skipped]"
        else:
            discard = False
            discard_text = ""

        if key in evt_tunit_index:
            uid = evt_tunit_index[key][0]
            u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
            if u is None:
                log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
                continue
            arrow_text = "==>"
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")
            u["id"] = uid
            u["study_id"] = key[0]
            u["platform_id"] = key[1]
            u["icdo_topography"] = key[2]
            u["icdo_morphology"] = key[3]
            arrow_text = "-->"

        log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

        if discard:
            continue

        u["size"] = len(v)
        u["cnv_evt_ids"] = u.create_list(v)

        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        evt_tunit_port.write(uid)

    # Final summary.
    sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids))]
    log.info("".join(sb))

    log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))
    log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))
    log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

    em.close()
    es.close()
def run(task):
    """Classify mrna log2r entities into tumour units by (study, platform,
    ICD-O topography level, morphology) and persist units that reach the
    configured minimum size.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    # Units smaller than this are skipped (default 20).
    min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

    log = task.logger()

    task.check_in_ports(["log2r_ids"])
    task.check_out_ports(["log2r_tumour_unit_ids"])

    log2r_port = task.ports["log2r_ids"]
    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Optional set of ICD-O topography codes whose units are skipped.
    if "excluded_topographies" in conf:
        excluded_topographies = set(conf.get("excluded_topographies"))
        log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
    else:
        excluded_topographies = set()

    # Run

    # Index of already-persisted units, so re-runs reuse their ids.
    log.info("Indexing available mrna log2r tumour units ...")
    log2r_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

    # unit key -> [log2r entity id]
    units = {}

    for log2r_id in log2r_port:
        e = em.find(log2r_id, types.MRNA_LOG2R)
        if e is None:
            log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
            continue

        eid = e["id"]
        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e.get("icdo_morphology", "")

        log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (study_id, platform_id, icdo_topography, icdo_morphology, eid))

        keys = []

        # Split the topography code into its two levels; reject malformed codes.
        m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
        if m is None:
            log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
            continue
        else:
            level1 = m.group(1)
            level2 = m.group(2)

        # Validate morphology (when present) before building any keys.
        if len(icdo_morphology) > 0:
            m = _ICDO_MORPHOLOGY_PAT.match(icdo_morphology)
            if m is None:
                log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
                continue

        # An entity contributes to up to four unit keys: level-1 topography
        # with and without morphology, and (when a second level exists)
        # the full topography with and without morphology.
        keys += [(study_id, platform_id, level1, "")]
        if len(icdo_morphology) > 0:
            keys += [(study_id, platform_id, level1, icdo_morphology)]
            #keys += [(study_id, platform_id, "", icdo_morphology)]
        if level2 is not None:
            keys += [(study_id, platform_id, icdo_topography, "")]
            if len(icdo_morphology) > 0:
                keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

        for key in keys:
            icdo_topography = key[2]
            if icdo_topography in excluded_topographies:
                log.debug("\t(%s) [excluded]" % ", ".join(key))
                continue

            log.debug("\t(%s)" % ", ".join(key))

            if key not in units:
                units[key] = [eid]
            else:
                units[key] += [eid]

    log.info("Persisting %i mrna log2r tumour units ..." % len(units))
    log.debug("Minimum size = %i" % min_tumour_unit_size)

    for key, ids in sorted(units.iteritems()):
        size = len(ids)

        if size < min_tumour_unit_size:
            log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
            continue
        else:
            log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

        if key in log2r_tumour_unit_index:
            uid = log2r_tumour_unit_index[key][0]
            if not overwrite:
                # NOTE(review): em.find may return None here (no check, unlike
                # other loops in this codebase) — confirm it cannot be missing.
                u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
            else:
                u = DataElement(key_sep = "/")
        else:
            uid = str(uuid.uuid4())
            u = DataElement(key_sep = "/")

        u["id"] = uid
        u["study_id"] = key[0]
        u["platform_id"] = key[1]
        u["icdo_topography"] = key[2]
        u["icdo_morphology"] = key[3]
        u["size"] = size
        u["mrna_log2r_ids"] = u.create_list(ids)

        em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
        log2r_tumour_unit_port.write(uid)

    em.close()
    es.close()
def run(task):
    """Scan all source assays and route valid transcriptomic ones into
    absolute-intensity entities, log2ratio source entities, tumour units
    and normal pools; persist everything and emit a summary.

    Fix: the "unexpected combination" error message had three ``%s``
    placeholders but four arguments, so hitting that error path raised
    ``TypeError`` instead of logging. The format string now has four
    placeholders.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["study_ids"])
    task.check_out_ports(["absi_ids", "absi_tumour_unit_ids",
        "normal_pool_ids", "log2r_source_ids"])

    study_ids_port = task.ports["study_ids"]
    absi_port = task.ports["absi_ids"]
    absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
    normal_pool_port = task.ports["normal_pool_ids"]
    log2r_source_port = task.ports["log2r_source_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    #overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    # Indices of already-persisted entities, so re-runs overwrite instead
    # of duplicating.
    log.info("Creating indices for {} ...".format(types.MRNA_ABS_INTENSITY))
    absi_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ABS_INTENSITY, unique = True)

    log.info("Creating indices for {} ...".format(types.MRNA_LOG2R_SOURCE))
    log2r_src_index = em.group_ids(
        ["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_SOURCE, unique = True)

    log.info("Creating indices for {} ...".format(types.MRNA_ABSI_TUMOUR_UNIT))
    absi_tumour_unit_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_ABSI_TUMOUR_UNIT, unique = True)

    # Bookkeeping for classification and the final summary.
    processed_studies = set()
    processed_assays = 0
    valid_assay_count = {}     # (study_id, platform_id) -> count
    skipped_assay_count = {}
    wrong_assays = {}          # study_id -> [assay_id] that failed
    wrong_samples = {}         # study_id -> [sample_id] that failed
    log2r_src_units = {}
    tumour_units = {}
    normal_pools = {}
    absi_dup = {}              # duplicate detection per entity type
    log2r_source_dup = {}

    study_ids = study_ids_port.read_all()
    log.info("Processing %i studies ..." % len(study_ids))

    for assay in em.iter_all(types.SOURCE_ASSAY):
        assay_id = assay.get("id", "WITHOUT ID")
        log.debug("Reading assay %s ..." % assay_id)

        mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
            "assay_property/assay_design", "assay_property/data_type",
            "assay_property/study_type", "assay_property/filename"])

        assay_source_path = assay.get("source_path", "")

        if len(mf) > 0:
            study_id = assay.get("study_id", "WITHOUT ID")
            doc_path = assay.get("__doc_path", "UNKNOWN")
            log.error("Assay %s in study %s missing required fields: %s {%s}" % (assay_id, study_id, mf, assay_source_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        study_id = assay["study_id"]

        if study_id not in study_ids:
            log.debug("Assay %s not included in 'study_ids'" % assay_id)
            continue

        platform_id = assay["platform_id"]
        sample_id = assay["sample_id"]
        assay_design = assay["assay_property/assay_design"]
        data_type = assay["assay_property/data_type"]
        study_type = assay["assay_property/study_type"]

        # Start the entity from the assay fields; data file location is
        # kept as (repo, path, name) components.
        e = assay.transform([
            ("assay_id", "id"),
            "study_id", "platform_id", "sample_id", "source_path",
            ("data_file/path", "source_path"),
            ("data_file/name", "assay_property/filename")])

        e["data_file/repo"] = assay.get("data_file/repo", "assay")

        # Only transcriptomic assays of the two supported design/data-type
        # combinations are of interest here.
        included = study_id in study_ids and study_type == "transcriptomic"
        included &= (assay_design == "cancer_and_normal" and data_type == "log_abs_readings") \
            or (assay_design == "cancer_vs_normal" and data_type == "log2ratios")

        if not included:
            # Genomic assays are another task's concern, so their skipping
            # is not reported.
            if study_type != "genomic" and study_id in study_ids:
                s = ", ".join(["%s = %s" % (v[0], v[1]) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
                log.warn("Skipping assay %s {%s}: %s." % (assay_id, assay_source_path, s))
                map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
            continue

        sample = em.find(sample_id, types.SOURCE_SAMPLE)
        if sample is None:
            log.error("Assay %s references a non-existent sample: %s" % (assay_id, sample_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        mf = sample.missing_fields(["id", "basic_sample_details/disease_state",
            "icdo/topography"])

        if len(mf) > 0:
            sample_id = sample.get("id", "WITHOUT ID")
            doc_path = sample.get("__doc_path", "UNKNOWN")
            sample_source_path = sample.get("source_path", "")
            log.error("Sample %s associated with assay %s in study %s missing required fields: %s {%s}" % (sample_id, assay_id, study_id, mf, sample_source_path))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        sample = sample.transform([
            "id",
            ("source_path", "source_path"),
            ("disease_state", "basic_sample_details/disease_state"),
            ("normal_counterpart", "normal_counterpart_location/topography"),
            ("icdo_topography", "icdo/topography"),
            ("icdo_morphology", "icdo/morphology")])

        disease_state = sample["disease_state"]

        if disease_state not in disease_state_map:
            log.error("Unknown disease_state '%s' for sample %s {%s}" % (disease_state, sample_id, sample.get("source_path", "")))
            map_list_add(wrong_samples, study_id, sample_id)
            continue

        disease_state = disease_state_map[disease_state]

        # Only tumour and normal samples are processed.
        if disease_state not in ["tumour", "normal"]:
            continue

        e["disease_state"] = disease_state
        e["icdo_topography"] = sample["icdo_topography"]
        e["icdo_morphology"] = sample.get("icdo_morphology", "")
        if "normal_counterpart" in sample:
            e["normal_counterpart"] = sample["normal_counterpart"]

        repo = rs.repository(e["data_file/repo"])
        rel_path = os.path.join(e["data_file/path"], e["data_file/name"])
        if not repo.exists(rel_path):
            log.error("Assay %s in study %s missing data file: [%s]" % (assay_id, study_id, rel_path))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        key = (study_id, platform_id, sample_id,
            e["icdo_topography"], e["icdo_morphology"])

        # Duplicate/overwrite detection, per target entity type.
        eid = None
        duplicated = False
        exists = False
        if data_type == "log_abs_readings":
            if key in absi_dup:
                duplicated = True
            elif key in absi_index:
                eid = absi_index[key][0]
                exists = True
        elif data_type == "log2ratios":
            if key in log2r_source_dup:
                duplicated = True
            elif key in log2r_src_index:
                eid = log2r_src_index[key][0]
                exists = True

        if duplicated:
            log.error("Duplicated key (%s) for assay %s" % (", ".join(key), assay_id))
            map_list_add(wrong_assays, study_id, assay_id)
            continue

        if eid is None:
            eid = str(uuid.uuid4())
        e["id"] = eid

        # Route the entity into the right grouping; normal counterpart
        # topography takes precedence for unit keys when present.
        if disease_state == "normal":
            if data_type == "log2ratios":
                k = (study_id, platform_id,
                    e.get("normal_counterpart", e["icdo_topography"]))
                map_list_add(log2r_src_units, k, eid)
            elif data_type == "log_abs_readings":
                map_list_add(normal_pools,
                    (study_id, platform_id, e["icdo_topography"]), eid)
            else:
                # FIX: was "(%s, %s)" with 4 args -> TypeError when hit.
                log.error("Assay %s has an unexpected combination of (disease_state, assay_design, data_type): (%s, %s, %s)" % (assay_id, disease_state, assay_design, data_type))
                map_list_add(wrong_assays, study_id, assay_id)
                continue
        elif disease_state == "tumour":
            k = (study_id, platform_id,
                e.get("normal_counterpart", e["icdo_topography"]))
            if data_type == "log_abs_readings":
                map_list_add(tumour_units, k, eid)
            elif data_type == "log2ratios":
                map_list_add(log2r_src_units, k, eid)

        processed_studies.add(study_id)
        processed_assays += 1
        map_inc(valid_assay_count, (study_id, platform_id))

        msg = {True : "Overwritting", False : "Writting"}[exists]
        if data_type == "log_abs_readings":
            log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_ABS_INTENSITY, ", ".join(key), eid))
            em.persist(e, types.MRNA_ABS_INTENSITY)
            absi_port.write(eid)
            absi_dup[key] = eid
        elif data_type == "log2ratios":
            log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_LOG2R_SOURCE, ", ".join(key), eid))
            em.persist(e, types.MRNA_LOG2R_SOURCE)
            log2r_source_port.write(eid)
            log2r_source_dup[key] = eid

    log.info("Persisting mrna absi tumour units ...")
    for k, v in sorted(tumour_units.items()):
        key = (k[0], k[1], k[2])
        exists = key in absi_tumour_unit_index
        if exists:
            uid = absi_tumour_unit_index[key][0]
        else:
            uid = str(uuid.uuid4())

        u = DataElement(key_sep = "/")
        u["id"] = uid
        u["study_id"] = k[0]
        u["platform_id"] = k[1]
        u["icdo_topography"] = k[2]
        u["size"] = len(v)
        u["mrna_absi_ids"] = u.create_list(v)

        if exists:
            log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
        else:
            log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

        em.persist(u, types.MRNA_ABSI_TUMOUR_UNIT)
        absi_tumour_unit_port.write(uid)

    log.info("Creating indices for mrna normal pools ...")
    normal_pool_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_NORMAL_POOL, unique = True)

    log.info("Persisting mrna normal pools ...")
    for k, v in sorted(normal_pools.items()):
        key = (k[0], k[1], k[2])
        exists = key in normal_pool_index
        if exists:
            uid = normal_pool_index[key][0]
        else:
            uid = str(uuid.uuid4())

        u = DataElement(key_sep = "/")
        u["id"] = uid
        u["study_id"] = k[0]
        u["platform_id"] = k[1]
        u["icdo_topography"] = k[2]
        u["size"] = len(v)
        u["mrna_absi_ids"] = u.create_list(v)

        if exists:
            log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
        else:
            log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

        em.persist(u, types.MRNA_NORMAL_POOL)
        normal_pool_port.write(uid)

    # Final summary.
    sb = ["\n\nProcessed %i assays for %i studies (out of %i):\n\n" % (processed_assays, len(processed_studies), len(study_ids))]

    sb += ["%i mrna tumour units:\n\n" % (len(tumour_units))]
    for k, v in sorted(tumour_units.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\n%i mrna normal pools:\n\n" % (len(normal_pools))]
    for k, v in sorted(normal_pools.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\n%i mrna source log2r units:\n\n" % (len(log2r_src_units))]
    for k, v in sorted(log2r_src_units.items()):
        sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

    sb += ["\nAssay counts by study and platform:\n\n"]
    for k, v in sorted(valid_assay_count.items()):
        sb += ["\t%s\t%i assays" % (k, v)]
        # NOTE(review): k is (study_id, platform_id) but wrong_assays /
        # wrong_samples are keyed by study_id alone, so these lookups
        # never match — confirm intended key.
        if k in wrong_assays:
            sb += ["\t%i failed assays" % len(wrong_assays[k])]
        if k in wrong_samples:
            sb += ["\t%i failed samples" % len(wrong_samples[k])]
        sb += ["\n"]

    log.info("".join(sb))

    if len(skipped_assay_count) > 0:
        log.info("Skipped assays:\n\n%s" % map_count_tostring(skipped_assay_count, indent = 1))

    if len(wrong_assays) > 0:
        log.info("Summary of failed assays:\n\n%s" % map_list_tostring(wrong_assays))

    if len(wrong_samples) > 0:
        log.info("Summary of failed samples:\n\n%s" % map_list_tostring(wrong_samples))

    em.close()

    return 0