import os
import uuid
from copy import deepcopy

import numpy

# MatrixReader, MatrixWriter, EntityServer, RepositoryServer, MeanPoolMethod,
# iter_tumour_absi and the `types` constants come from the pipeline's own
# packages; their module paths are not shown in this excerpt, so those import
# lines are left out rather than guessed.


def read_pool_data(conf, rs, pool, log):
    """Read a normal pool's two-column data matrix into a {row_name: value} dict."""
    pool_data = {}

    mf = pool.missing_fields(["study_id", "platform_id", "icdo_topography",
                              "data_file/repo", "data_file/path", "data_file/name"])
    if len(mf) > 0:
        log.error("Normal pool %s has missing fields: %s" % (pool["id"], ", ".join(mf)))
        return None

    key = "(%s, %s, %s)" % (pool["study_id"], pool["platform_id"], pool["icdo_topography"])
    log.info("Reading normal pool %s %s ..." % (key, pool["id"]))

    repo = rs.repository(pool["data_file/repo"])
    rpath = os.path.join(pool["data_file/path"], pool["data_file/name"])
    log.debug("Reading normal pool data from %s ..." % rpath)

    mr = MatrixReader(repo.open_reader(rpath))
    header = mr.read_header()
    if len(header.columns) != 2:
        log.error("Unexpected number of columns: %i" % len(header.columns))
        mr.close()
        return None

    for row in mr:
        pool_data[row.name] = row.values[0]
    mr.close()

    return pool_data
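
# Illustration only (not part of the pipeline): the pool data file parsed
# above is a two-column matrix -- a header row, then one "name<TAB>value"
# line per probe (the ".tsv.gz" extension and the ["id", "value"] header
# written elsewhere in this module imply tab separation) -- which
# read_pool_data() reduces to {name: value}. The probe ids are made up.
def _example_pool_file_shape():
    import io
    f = io.StringIO("id\tvalue\nprobe_a\t9.82\nprobe_b\t7.01\n")
    f.readline()  # skip the header row
    pool_like = {}
    for line in f:
        name, value = line.rstrip("\n").split("\t")
        pool_like[name] = float(value)
    assert pool_like == {"probe_a": 9.82, "probe_b": 7.01}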
def mask_filtering(input_path, output_path, mask):
    """Rewrite an integer matrix as 1/0 flags: 1 where the value intersects `mask`."""
    mr = MatrixReader(input_path, dtype=int)
    mw = MatrixWriter(output_path, dtype=int)
    mw.write_header(mr.read_header())
    for row in mr:
        values = [1 if (v & mask) != 0 else 0 for v in row.values]
        mw.write(row.name, values)
    mr.close()
    mw.close()
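
# Illustration only: the rule above flags a value with 1 exactly when its
# integer bit-field shares at least one set bit with `mask`. The mask value
# 0x4 below is arbitrary.
def _example_mask_rule():
    values = [0, 1, 4, 5, 6]
    mask = 0x4
    flags = [1 if (v & mask) != 0 else 0 for v in values]
    assert flags == [0, 0, 1, 1, 1]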
def run(task):
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["absi_tumour_unit_ids"])
    task.check_out_ports(["log2r_ids"])
    absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
    log2r_port = task.ports["log2r_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    # Index normal pools by study, platform and topography
    log.debug("Indexing normal pools by study, platform and topography ...")
    pools_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography"],
        types.MRNA_NORMAL_POOL, unique=True)

    # Index log2r assays by absi_id
    log.debug("Indexing log2r assays by absi assay ...")
    log2r_index = em.group_ids(
        ["absi_id"], types.MRNA_LOG2R, unique=True)

    absi_tumour_unit_ids = absi_tumour_unit_port.read_all()
    log.info("Processing %i mrna absi tumour units ..." % len(absi_tumour_unit_ids))
    #log.debug("[%s]" % (", ".join(absi_tumour_unit_ids)))

    # For each absolute intensity assay
    pool = None
    pool_data = {}
    for absi in iter_tumour_absi(conf, em, absi_tumour_unit_ids, log):
        absi_id = absi["id"]
        rpath = os.path.join(absi["data_file/path"], absi["data_file/name"])

        icdo_topography = absi["icdo_topography"]
        normal_counterpart = absi.get("normal_counterpart", icdo_topography)
        if icdo_topography != normal_counterpart:
            keystr = "(%s, %s, %s --> %s)" % (absi["study_id"], absi["platform_id"],
                                              icdo_topography, normal_counterpart)
        else:
            keystr = "(%s, %s, %s)" % (absi["study_id"], absi["platform_id"], icdo_topography)

        exists = (absi_id,) in log2r_index
        if exists:
            log2r_id = log2r_index[(absi_id,)][0]
        else:
            log2r_id = str(uuid.uuid4())

        data_file_path = types.MRNA_LOG2R.replace(".", "/")
        data_file_name = log2r_id + ".tsv.gz"
        dst_path = os.path.join(data_file_path, data_file_name)

        if not overwrite and exists and data_repo.exists(dst_path):
            log.debug("Skipping calculation of log2r for tumour assay %s %s as it is already calculated" % (keystr, absi_id))
            log2r_port.write(log2r_id)
            continue

        log.info("Processing tumour assay %s %s from %s ..." % (keystr, absi_id, rpath))

        repo = rs.repository(absi["data_file/repo"])
        if not repo.exists(rpath):
            log.error("File not found: %s" % rpath)
            continue

        # Get normal counterpart data; reuse the previous pool when it matches
        if pool is None \
                or absi["study_id"] != pool["study_id"] \
                or absi["platform_id"] != pool["platform_id"] \
                or normal_counterpart != pool["icdo_topography"]:

            pool_key = (absi["study_id"], absi["platform_id"], normal_counterpart)
            if pool_key not in pools_index:
                log.error("Normal pool not found for tumour assay (%s) %s {%s}" % (
                    ", ".join(pool_key), absi_id, absi.get("source_path", "")))
                continue

            pool_id = pools_index[pool_key][0]
            pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
            if pool is None:
                log.error("Normal pool %s not found by the entity manager !" % pool_id)
                continue

            pool_data = read_pool_data(conf, rs, pool, log)
            if pool_data is None:
                pool = None
                continue

            log.info("Using normal pool ({}) [{}]".format(", ".join(pool_key), pool_id))

        # Calculate log2 ratios
        mr = MatrixReader(repo.open_reader(rpath))
        header = mr.read_header()
        if len(header.columns) != 2:
            log.error("Unexpected number of columns: %i" % len(header.columns))
            mr.close()
            continue

        warn_count = {
            "id_not_in_pool": 0,
            "value_is_nan": 0,
            "pool_value_is_nan": 0,
            "value_is_inf": 0,
            "pool_value_is_inf": 0}

        data = {}
        duplicated_rows = False
        for row in mr:
            if row.name in data:
                log.error("Skipping tumour assay, duplicated row %s at file %s" % (row.name, rpath))
                duplicated_rows = True
                break

            value = row.values[0]
            value_is_nan = numpy.isnan(value)
            if value_is_nan:
                warn_count["value_is_nan"] += 1
            elif numpy.isinf(value):
                warn_count["value_is_inf"] += 1

            if row.name not in pool_data:
                pool_value = value = numpy.nan
                warn_count["id_not_in_pool"] += 1
            else:
                pool_value = pool_data[row.name]

            pool_value_is_nan = numpy.isnan(pool_value)
            if pool_value_is_nan:
                warn_count["pool_value_is_nan"] += 1
            elif numpy.isinf(pool_value):
                warn_count["pool_value_is_inf"] += 1

            if not value_is_nan and not pool_value_is_nan:  # and value != 0.0 and pool_value != 0.0:
                # Both files hold log2 intensities, so the ratio is a difference:
                # log2(tumour / normal) = log2(tumour) - log2(normal)
                log2r = value - pool_value
            else:
                log2r = numpy.nan

            # NaNs are kept (numpy.isinf(nan) is False); only infinities are dropped
            if not numpy.isinf(log2r):
                data[row.name] = log2r
            #else:
            #    log.warn("row = %s, log2r = %f, value = %f, pool_value = %f" % (row.name, log2r, value, pool_value))

        mr.close()

        if duplicated_rows:
            # Honour the "Skipping tumour assay" message above: do not persist
            # partial data for an assay with duplicated rows.
            continue

        sb = ["{0}={1}".format(k, v) for k, v in warn_count.items() if v > 0]
        if len(sb) > 0:
            log.warn(", ".join(sb))

        # Save log2 ratio data and assay
        log2r = deepcopy(absi)
        log2r["id"] = log2r_id
        log2r["absi_id"] = absi_id
        log2r["normal_pool_id"] = pool["id"]
        log2r["data_file/repo"] = data_repo.name()
        log2r["data_file/path"] = data_file_path
        log2r["data_file/name"] = data_file_name

        msg = {True: "Overwriting", False: "Writing"}[exists]
        log.debug("%s log2 ratio data to %s ..." % (msg, dst_path))

        mw = MatrixWriter(data_repo.open_writer(dst_path))
        mw.write_header(["id", "value"])
        for name, value in sorted(data.items()):
            mw.write(name, [value])
        mw.close()

        em.persist(log2r, types.MRNA_LOG2R)
        log2r_port.write(log2r_id)

    em.close()
    es.close()
    data_repo.close()
    rs.close()
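
# Why a plain subtraction yields the log2 ratio above: both the tumour assay
# and the normal pool store log2 intensities, so
#     log2(T / N) == log2(T) - log2(N)
# Illustration only, with arbitrary intensities:
def _example_log2_ratio():
    t, n = numpy.log2(800.0), numpy.log2(200.0)
    assert numpy.isclose(t - n, numpy.log2(800.0 / 200.0))  # both equal 2.0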
def run(task):
    # Initialization
    task.check_conf(["entities", "repositories", "repositories.assay"])
    conf = task.conf
    log = task.logger()

    task.check_in_ports(["normal_pool_ids"])
    normal_pool_port = task.ports["normal_pool_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Processing %i mrna normal pools ..." % normal_pool_port.size())

    for pool_id in normal_pool_port:
        pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
        if pool is None:
            log.error("%s not found: %s" % (types.MRNA_NORMAL_POOL, pool_id))
            continue

        mf = pool.missing_fields(["study_id", "platform_id", "icdo_topography",
                                  "size", "mrna_absi_ids"])
        if len(mf) > 0:
            log.error("Normal pool %s missing required fields: %s {%s}" % (
                pool_id, mf, pool.get("__doc_path", "")))
            continue

        key = (pool["study_id"], pool["platform_id"], pool["icdo_topography"])
        log.info("Normal pool (%s) [%s] with %i assays ..." % (", ".join(key), pool_id, pool["size"]))

        data_file_path = types.MRNA_NORMAL_POOL.replace(".", "/")
        data_file_name = pool_id + ".tsv.gz"
        dst_rel_path = os.path.join(data_file_path, data_file_name)
        #dst_path = os.path.join(conf["repo.data"], dst_rel_path)

        if not overwrite and data_repo.exists(dst_rel_path) \
                and "mrna_absi_ids" in pool and "pooled_assays" in pool \
                and len(pool["mrna_absi_ids"]) == pool.get("pooled_assays", dtype=int):
            log.warn("Skipping normal pool %s that already has data" % pool_id)
            continue

        method = MeanPoolMethod()

        pooled_assays = 0
        duplicated_rows = False
        for absi in em.iter_all(types.MRNA_ABS_INTENSITY, eids=pool["mrna_absi_ids"]):
            mf = absi.missing_fields(["data_file/path", "data_file/name"])
            if len(mf) > 0:
                log.error("Normal assay %s missing required fields: %s {%s}" % (
                    absi["id"], mf, absi.get("__doc_path", "")))
                continue

            data_file = absi["data_file"]
            rel_path = os.path.join(data_file["path"], data_file["name"])
            #filename = os.path.join(conf["repo.assays"], rel_path)
            repo = rs.repository(data_file["repo"])
            if not repo.exists(rel_path):
                log.error("File not found: %s" % rel_path)
                continue

            log.debug("Processing normal assay %s for source assay %s at %s ..." % (
                absi["id"], absi["assay_id"], rel_path))

            pooled_assays += 1

            mr = MatrixReader(repo.open_reader(rel_path))
            header = mr.read_header()
            if len(header.columns) != 2:
                log.error("Unexpected number of columns: %i" % len(header.columns))
                mr.close()
                continue

            row_names = set()
            for row in mr:
                if row.name in row_names:
                    log.error("Skipping normal assay, duplicated row %s at file %s" % (row.name, rel_path))
                    duplicated_rows = True
                    break
                else:
                    row_names.add(row.name)
                    # Assays store log2 intensities; pool in linear space and
                    # convert the mean back to log2 when writing below.
                    value = numpy.exp2(row.values[0])
                    method.process(row.name, value)
            mr.close()

        if not duplicated_rows and pooled_assays > 0:
            exists = data_repo.exists(dst_rel_path)
            msg = {True: "Overwriting", False: "Writing"}[exists]
            log.debug("%s pooled data to %s ..." % (msg, dst_rel_path))

            mw = MatrixWriter(data_repo.open_writer(dst_rel_path))
            mw.write_header(["id", "value"])
            for row in method.pooled_rows():
                value = numpy.log2(row.values[0])
                mw.write(row.name, [value])
            mw.close()

            pool["pooled_assays"] = pooled_assays
            pool["data_file/repo"] = "data"
            pool["data_file/path"] = data_file_path
            pool["data_file/name"] = data_file_name
            em.persist(pool, types.MRNA_NORMAL_POOL)

    em.close()

    return 0
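
# MeanPoolMethod ships with the pipeline and is not shown in this excerpt.
# The sketch below is what it is assumed to do, inferred from the call sites
# above: process(name, value) accumulates a value per row name across assays,
# and pooled_rows() yields rows (with a .name and a one-element .values list)
# holding the per-row mean. Skipping NaN values is an assumption of this
# sketch, not a documented behaviour of the real class.
from collections import namedtuple

_PooledRow = namedtuple("_PooledRow", ["name", "values"])

class MeanPoolMethodSketch(object):
    def __init__(self):
        self._sums = {}
        self._counts = {}

    def process(self, name, value):
        if numpy.isnan(value):
            return  # assumed: missing probes do not contribute to the mean
        self._sums[name] = self._sums.get(name, 0.0) + value
        self._counts[name] = self._counts.get(name, 0) + 1

    def pooled_rows(self):
        for name in sorted(self._sums):
            yield _PooledRow(name, [self._sums[name] / self._counts[name]])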