Example #1
0
def run(task):
	"""Ensure that every CNV entity collection and its data directory exist.

	Iterates over the attributes of the ``types`` module whose name starts
	with "CNV_". For each value the entity manager is asked to ensure the
	collection exists, and the "data" repository is asked to create the
	matching directory when it is missing.

	Returns 0 after the entity and repository resources have been closed.
	"""

	# Initialization

	settings = task.conf

	logger = task.logger()

	entity_server = EntityServer(settings["entities"])
	entity_manager = entity_server.manager()

	repo_server = RepositoryServer(settings["repositories"])
	data_repo = repo_server.repository("data")

	# Run

	for attr_name, type_name in vars(types).items():
		if not attr_name.startswith("CNV_"):
			continue

		logger.info("Preparing '{0}' ...".format(type_name))
		entity_manager.ensure_collection_exists(type_name)

		# the dotted type name is turned into a slash separated repository path
		data_dir = rpath.absolute(type_name.replace(".", "/"))
		logger.debug("\tData: {0}".format(data_dir))
		data_repo.mkdir_if_not_exists(data_dir)

	# Release resources

	entity_manager.close()
	entity_server.close()
	data_repo.close()
	repo_server.close()

	return 0
def main():
	"""Run the R report script once for every id received on the input port.

	For each id the matching ``types.MRNA_LOG2R_TUMOUR_UNIT`` entity is
	looked up, its data file is fetched locally and the R script is executed
	with the results base path, the id and the local data path as arguments.

	Raises:
		Exception: when the R script exits with a non-zero return code.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	id_port = task.ports("mrna_normal_pool")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	results_base_path = "reports/" + types.CNV_COMBINATION.replace(".", "/")

	# Run

	for unit_id in id_port:
		# the original code looked up an undefined name instead of the loop variable
		e = em.find(unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
		if e is None:
			log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, unit_id))
			continue

		repo, data_path = rs.from_url(e["data_file"])
		try:
			data_local_path = repo.get_local(data_path)
			try:
				# NOTE(review): `script` is not defined in this function; it is
				# presumably a module level constant -- confirm.
				cmd = " ".join([conf["bin_paths.R"],
					"--vanilla --slave -f", script,
					"--args", results_base_path, unit_id, data_local_path])

				log.debug(cmd)

				retcode = subprocess.call(args = cmd, shell = True)

				if retcode != 0:
					raise Exception("R script failed")
			finally:
				# release the local copy even when the script fails
				repo.close_local(data_local_path)
		finally:
			repo.close()

	em.close()
	es.close()
	data_repo.close()
	rs.close()
Example #3
0
def run(task):
	"""Run the enrichment analysis for every incoming mRNA oncodrive result.

	Enrichment module configurations are read from ``mrna.enrichment``: each
	entry of ``modules`` is merged on top of the optional ``default`` section
	and must define ``id_type``, ``test`` and ``modules_file``. For every
	oncodrive id read from the ``oncodrive_ids`` port and every valid module
	configuration, an existing or new ``types.MRNA_ENRICHMENT`` entity is
	resolved, the enrichment is calculated unless ``skip_file`` says the
	results can be reused, and the entity id is written to ``enrichment_ids``.

	Returns:
		-1 when ``mrna.enrichment`` has no ``modules`` section, 0 when none
		of the module configurations is valid, None otherwise.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "repositories.source",
		"mrna.enrichment", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["oncodrive_ids"])
	task.check_out_ports(["enrichment_ids"])

	oncodrive_port = task.ports["oncodrive_ids"]
	enrichment_port = task.ports["enrichment_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	data_repo = rs.repository("data")

	# everything below runs inside try/finally so that the early returns
	# (and unexpected errors) do not leave servers and repositories open
	try:
		overwrite = conf.get("overwrite", False, dtype=bool)

		# retrieve enrichment configurations
		enrichment_conf = conf["mrna.enrichment"]
		if "default" in enrichment_conf:
			default = enrichment_conf["default"]
		else:
			default = conf.create_element()

		if "modules" not in enrichment_conf:
			log.error("There is no enrichment modules section available in mrna.enrichment")
			return -1

		log.info("Reading modules configuration ...")

		econfs = []
		for mod in enrichment_conf["modules"]:
			# module specific values override the defaults
			m = enrichment_conf.create_element()
			m.merge(default)
			m.merge(mod)
			mf = m.missing_fields(["id_type", "test", "modules_file"])
			if len(mf) > 0:
				log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
				log.error("Module configuration: {}".format(m))
			else:
				econfs.append(m)
				log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

		if len(econfs) == 0:
			log.error("There are no enrichment configurations available in mrna.enrichment")
			return 0

		results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")

		log.info("Indexing available enrichment results ...")
		enrichment_results_index = em.group_ids(
			["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
			types.MRNA_ENRICHMENT, unique = True)

		for oid in oncodrive_port:
			o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
			if o is None:
				log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
				continue

			okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

			log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

			# a distinct name is used for the per module configuration so that
			# it does not shadow the mrna.enrichment section read above
			for module_conf in econfs:
				log.info("Module {} [{}] ...".format(module_conf["id_type"], module_conf["modules_file"]))

				key = okey + (module_conf["id_type"],)

				if key in enrichment_results_index:
					# reuse the entity already stored for this key
					eid = enrichment_results_index[key][0]
					e = em.find(eid, types.MRNA_ENRICHMENT)
					if e is None:
						log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
						continue
				else:
					e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
					e["id"] = eid = str(uuid.uuid4())

				e["id_type"] = module_conf["id_type"]

				# enrichment results

				results_path = rpath.join(results_base_path, eid + ".tsv.gz")

				if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
					log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
					enrichment_port.write(eid)
					continue

				valid = enrichment(log, conf, rs, data_repo, results_path, o["results_file"], e, module_conf,
							["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
							["id", "upreg", "downreg"])

				# save mapped results
				if valid:
					em.persist(e, types.MRNA_ENRICHMENT)
					enrichment_port.write(eid)
	finally:
		em.close()
		es.close()
		data_repo.close()
		rs.close()
def run(task):
    """Map probe level oncodrive results to gene level results.

    For every id read from the ``oncodrive_ids`` port the matching
    ``types.MRNA_ONCODRIVE_PROBES`` entity is loaded, the platform mapping
    file is located (falling back to the study specific vplatform for
    ``genbank_accession`` platforms), the external matrix mapping tool is
    executed and the repeated ids are merged. The resulting
    ``types.MRNA_ONCODRIVE_GENES`` entity is persisted and its id written to
    the ``mapped_oncodrive_ids`` port.

    Problems with a single result are logged and that result is skipped;
    they do not abort the task.
    """

    # Initialization

    task.check_conf(
        [
            "entities",
            "repositories",
            "repositories.data",
            "repositories.source",
            "bin_paths.python",
            "bin_paths.matrix_map",
        ]
    )
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["mapped_oncodrive_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    mapped_oncodrive_port = task.ports["mapped_oncodrive_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")
    source_repo = rs.repository("source")

    overwrite = conf.get("overwrite", False, dtype=bool)

    platform_base_path = "platform"
    vplatform_base_path = "vplatform"

    results_base_path = types.MRNA_ONCODRIVE_GENES.replace(".", "/")

    log.info("Indexing available oncodrive results for genes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"], types.MRNA_ONCODRIVE_GENES, unique=True
    )

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_PROBES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_PROBES, oid))
            continue

        study_id = o["study_id"]
        platform_id = o["platform_id"]
        key = (study_id, platform_id, o["icdo_topography"], o["icdo_morphology"])

        if key in oncodrive_results_index:
            # reuse the gene level entity already stored for this key
            mid = oncodrive_results_index[key][0]
            m = em.find(mid, types.MRNA_ONCODRIVE_GENES)
            if m is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, mid))
                continue
        else:
            m = o.transform(
                [
                    "study_id",
                    "platform_id",
                    "icdo_topography",
                    "icdo_morphology",
                    "log2r_tumour_unit_id",
                    ("oncodrive_probes_id", "id"),
                ]
            )
            m["id"] = mid = str(uuid.uuid4())

        # mapped oncodrive results

        results_path = rpath.join(results_base_path, mid + ".tsv.gz")
        gitools_results_path = rpath.join(results_base_path, mid + ".tdm.gz")

        if skip_file(overwrite, data_repo, results_path, m.get("results_file")):
            log.warn("Skipping ({0}) [{1}] as it already exists".format(", ".join(key), mid))
            mapped_oncodrive_port.write(mid)
            continue

        log.info("Mapping oncodriver results ({0}) [{1}] ...".format(", ".join(key), oid))

        # determine the mapping file
        map_file = None
        p = em.find(platform_id, types.SOURCE_PLATFORM)
        if p is None:
            log.error("{0} not found: {1}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        platform_id_type = p.get("SO/platform_id_type")
        if platform_id_type is None:
            log.error("Undefined annotation 'SO/platform_id_type' for platform '{0}'.".format(platform_id))
            continue

        if platform_id_type != "genbank_accession":  # affy_accession, custom, ...
            # the platform itself must provide the mapping file
            missing = p.missing_fields(["ensg_map", "ensg_map/file"])
            if len(missing) > 0:
                log.error("Missing required fields for platform '{0}': {1}".format(platform_id, ", ".join(missing)))
                continue
            map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if not source_repo.exists(map_file):
                log.error("Mapping file not found for platform '{0}': {1}".format(platform_id, map_file))
                continue
        else:
            # genbank_accession: use the platform mapping when available,
            # otherwise fall back to the study specific virtual platform
            if len(p.missing_fields(["ensg_map", "ensg_map/file"])) > 0:
                map_file = None
            else:
                map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if map_file is None or not source_repo.exists(map_file):
                vpid = "-".join([platform_id, study_id])
                vp = em.find(vpid, types.SOURCE_VPLATFORM)
                if vp is None:
                    log.error("{0} not found: {1}".format(types.SOURCE_VPLATFORM, vpid))
                    continue
                missing = vp.missing_fields(["ensg_map", "ensg_map/path", "ensg_map/file"])
                if len(missing) > 0:
                    log.error("Missing required fields for vplatform '{0}': {1}".format(vpid, ", ".join(missing)))
                    continue
                map_file = rpath.join(vplatform_base_path, vp["ensg_map/path"], vp["ensg_map/file"])
                if not source_repo.exists(map_file):
                    log.error(
                        "Mapping file not found for vplatform ({0}, {1}): {2}".format(platform_id, study_id, map_file)
                    )
                    continue

        log.debug("Mapping file: {0}".format(map_file))

        m["platform_map_file"] = source_repo.url(map_file)

        # oncodrive results file
        repo, repo_path = rs.from_url(o["results_file"])
        local_path = repo.get_local(repo_path)

        # mapped oncodrive results
        m["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)
        # NOTE(review): this local path is created and written by merge() but
        # never passed to put_local/close_local -- confirm whether that is intended.
        gitools_results_local_path = data_repo.create_local(gitools_results_path)

        mapping_path = rpath.join(results_base_path, mid + ".mapping.tsv.gz")
        m["mapping_file"] = data_repo.url(mapping_path)
        mapping_local_path = data_repo.create_local(mapping_path)

        # mkstemp returns an already open descriptor; only the path is needed,
        # so the descriptor is closed right away instead of being leaked
        map_results_fd, map_results_file = tempfile.mkstemp(prefix="mrna_oncodrive_map_", suffix=".tsv")
        os.close(map_results_fd)

        # stays None when fetching the mapping file fails, so that the cleanup
        # below does not trip over an unbound name
        local_map_file = None

        try:
            # run the mapping tool
            local_map_file = source_repo.get_local(map_file)

            log.debug("Mapping {0} to {1} ...".format(repo_path, map_results_file))

            cmd = " ".join(
                [
                    conf["bin_paths.python"],
                    conf["bin_paths.matrix_map"],
                    "-o",
                    map_results_file,
                    "-i",
                    mapping_local_path,
                    local_path,
                    local_map_file,
                ]
            )

            log.debug(cmd)

            retcode = subprocess.call(args=cmd, shell=True)

            if retcode != 0:
                raise Exception("There was an error mapping the results")

            # merge repeated ids

            log.debug("Merging {0} to {1} ...".format(map_results_file, results_path))
            log.debug("Gitools file: {0}".format(gitools_results_path))

            upreg_count, downreg_count = merge(log, map_results_file, results_local_path, gitools_results_local_path)
            if upreg_count == 0 and downreg_count == 0:
                log.error(
                    "The results of the mapping for ({0}) are empty. This could be because the annotated platform or the mapping file is wrong.".format(
                        ", ".join(key)
                    )
                )

            # close local paths
            data_repo.put_local(results_local_path)
            data_repo.put_local(mapping_local_path)

        except Exception as e:
            # a failure only skips this result; the remaining ids are still processed
            log.exception(e)

            data_repo.close_local(results_local_path)
            data_repo.close_local(mapping_local_path)
            continue

        finally:
            os.remove(map_results_file)
            repo.close_local(local_path)
            if local_map_file is not None:
                source_repo.close_local(local_map_file)

        # save mapped results
        em.persist(m, types.MRNA_ONCODRIVE_GENES)
        mapped_oncodrive_port.write(mid)

    em.close()
    es.close()
    data_repo.close()
    source_repo.close()
    rs.close()
Example #5
0
def main():
	"""Export mRNA enrichment results into the biomart ``exp_<infix>_trs`` tables.

	One table per table infix found in ``ID_TYPE_TO_TABLE_INFIX`` (except
	"gene") is created when missing. Then, for every ``(id_type, eid)`` pair
	read from the ``id`` port, the results file of the matching
	``types.MRNA_ENRICHMENT`` entity is read and one row per known feature
	is inserted through a ``BatchInsert``.

	Entities that are missing, have no results file, reference an unknown
	ICDO or experiment, or whose results file cannot be opened or lacks the
	expected columns are logged and skipped.
	"""
	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	log = task.logger()

	id_port = task.ports("id")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	cursor = conn.cursor()

	table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

	# feature name --> feature id lookup, one per table infix
	feat_ids = {}

	for name in table_infixs:
		if name == "gene":
			continue

		cursor.execute("""
			CREATE TABLE IF NOT EXISTS exp_{0}_trs (
			  {0}_id int(11) NOT NULL,
			  icdo_id int(11) NOT NULL,
			  exp_id int(11) NOT NULL,
			  upreg_total int(11) DEFAULT NULL,
			  upreg_observed double DEFAULT NULL,
			  upreg_expected double DEFAULT NULL,
			  upreg_stdev double DEFAULT NULL,
			  upreg_pvalue double DEFAULT NULL,
			  upreg_cpvalue double DEFAULT NULL,
			  downreg_total int(11) DEFAULT NULL,
			  downreg_observed double DEFAULT NULL,
			  downreg_expected double DEFAULT NULL,
			  downreg_stdev double DEFAULT NULL,
			  downreg_pvalue double DEFAULT NULL,
			  downreg_cpvalue double DEFAULT NULL,
			  PRIMARY KEY ({0}_id,icdo_id,exp_id),
			  KEY icdo (icdo_id,exp_id),
			  KEY exp (exp_id),
			  CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
			  CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
			  CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
			) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

		feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

	icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
	exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

	for id_type, eid in id_port:
		e = em.find(eid, types.MRNA_ENRICHMENT)
		if e is None:
			# "{}" and "{1}" cannot be mixed in one format string (ValueError),
			# so automatic numbering is used for both fields
			log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
			continue

		if "results_file" not in e:
			log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
			continue

		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e["icdo_morphology"]

		okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

		log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

		table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

		icdo_key = (icdo_topography, icdo_morphology)
		if icdo_key not in icdo:
			log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
			continue
		icdo_id = icdo[icdo_key]

		exp_key = (study_id, platform_id)
		if exp_key not in exp:
			log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
			continue
		exp_id = exp[exp_key]

		ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix),
				["{}_id".format(table_infix), "icdo_id", "exp_id",
						"upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
						"downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"], insert_size)

		results_repo, results_path = rs.from_url(e["results_file"])

		try:
			reader = results_repo.open_reader(results_path)
		except Exception as ex:
			log.exception(ex)
			ib.close()
			results_repo.close()
			continue

		# read header: column name --> column index
		hdr = reader.readline().rstrip().split("\t")
		hdr_map = {col_name: i for i, col_name in enumerate(hdr)}

		try:
			col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
		except KeyError as ex:
			# bound to `ex` so that the entity `e` is not clobbered
			log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
			reader.close()
			ib.close()
			results_repo.close()
			continue

		skipped_ids = set()

		fids = feat_ids[table_infix]

		# read data
		for line in reader:
			line = line.rstrip()
			data = line.split("\t")
			feat_name = data[0]
			data = [data[i] for i in col_indices]
			if feat_name not in fids:
				skipped_ids.add(feat_name)
				continue

			feat_id = fids[feat_name]

			ib.insert(feat_id, icdo_id, exp_id, *data)

		if len(skipped_ids) > 0:
			log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

		log.debug("{} results inserted".format(ib.count))

		ib.close()
		reader.close()
		# the repository is released on the success path too, as in the error paths
		results_repo.close()

	# the database handles are closed as in the ICDO exporter of this file
	cursor.close()
	conn.close()
	em.close()
	es.close()
	rs.close()
Example #6
0
def run(task):
	"""Calculate log2 ratios of tumour assays against their normal pool.

	For every absolute intensity tumour assay yielded by ``iter_tumour_absi``
	the normal pool with the same study, platform and normal counterpart
	topography is loaded (and reused while consecutive assays share it), the
	pool value is subtracted from the assay value row by row, and the result
	is written to the "data" repository. A ``types.MRNA_LOG2R`` entity is
	persisted and its id written to the ``log2r_ids`` port.

	Assays whose file or normal pool is missing, whose file does not have
	exactly two columns, or that contain a duplicated row are logged and
	skipped.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["absi_tumour_unit_ids"])
	task.check_out_ports(["log2r_ids"])

	absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
	log2r_port = task.ports["log2r_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	# Index normal pools by study, platform, topography
	log.debug("Indexing normal pools by study, platform and topography ...")
	pools_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_NORMAL_POOL, unique = True)

	# Index log2r assays by absi_id
	log.debug("Indexing log2r assays by absi assay ...")
	log2r_index = em.group_ids(
		["absi_id"],
		types.MRNA_LOG2R, unique = True)

	absi_tumour_unit_ids = absi_tumour_unit_port.read_all()

	log.info("Processing %i mrna absi tumour units ..." % len(absi_tumour_unit_ids))

	# For each abs intensity assay
	pool = None
	pool_data = {}
	for absi in iter_tumour_absi(conf, em, absi_tumour_unit_ids, log):

		absi_id = absi["id"]

		# named src_path so that it does not shadow the `rpath` name used
		# elsewhere in this file
		src_path = os.path.join(absi["data_file/path"], absi["data_file/name"])

		icdo_topography = absi["icdo_topography"]
		normal_counterpart = absi.get("normal_counterpart", icdo_topography)
		if icdo_topography != normal_counterpart:
			keystr = "(%s, %s, %s --> %s)" % (absi["study_id"], absi["platform_id"], icdo_topography, normal_counterpart)
		else:
			keystr = "(%s, %s, %s)" % (absi["study_id"], absi["platform_id"], icdo_topography)

		exists = (absi_id,) in log2r_index
		if exists:
			log2r_id = log2r_index[(absi_id,)][0]
		else:
			log2r_id = str(uuid.uuid4())

		data_file_path = types.MRNA_LOG2R.replace(".", "/")
		data_file_name = log2r_id + ".tsv.gz"
		dst_path = os.path.join(data_file_path, data_file_name)

		if not overwrite and exists and data_repo.exists(dst_path):
			log.debug("Skipping calculation of log2r for tumour assay %s %s as it is already calculated" % (keystr, absi_id))
			log2r_port.write(log2r_id)
			continue

		log.info("Processing tumour assay %s %s from %s ..." % (keystr, absi_id, src_path))

		repo = rs.repository(absi["data_file/repo"])
		if not repo.exists(src_path):
			log.error("File not found: %s" % src_path)
			continue

		# Get normal counterpart data (reloaded only when the pool changes)
		if pool is None \
			or absi["study_id"] != pool["study_id"] \
			or absi["platform_id"] != pool["platform_id"] \
			or normal_counterpart != pool["icdo_topography"]:

			pool_key = (absi["study_id"], absi["platform_id"], normal_counterpart)
			if pool_key not in pools_index:
				log.error("Normal pool not found for tumour assay (%s) %s {%s}" % (", ".join(pool_key), absi_id, absi.get("source_path", "")))
				continue

			pool_id = pools_index[pool_key][0]
			pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
			if pool is None:
				log.error("Normal pool %s not found by the entity manager !" % pool_id)
				continue

			pool_data = read_pool_data(conf, rs, pool, log)
			if pool_data is None:
				pool = None
				continue

		# The message is built from the pool actually in use: the loop locals
		# may be stale when an earlier lookup failed and the previous pool was kept
		used_pool_key = (pool["study_id"], pool["platform_id"], pool["icdo_topography"])
		log.info("Using normal pool ({}) [{}]".format(", ".join(used_pool_key), pool["id"]))

		# Calculate log2 ratios
		mr = MatrixReader(repo.open_reader(src_path))
		header = mr.read_header()
		if len(header.columns) != 2:
			log.error("Unexpected number of columns: %i" % len(header.columns))
			mr.close()
			continue

		warn_count = {
			"id_not_in_pool" : 0,
			"value_is_nan" : 0,
			"pool_value_is_nan" : 0,
			"value_is_inf" : 0,
			"pool_value_is_inf" : 0}

		data = {}
		duplicated = False
		for row in mr:
			if row.name in data:
				log.error("Skipping tumour assay, duplicated row %s at file %s" % (row.name, src_path))
				duplicated = True
				break

			value = row.values[0]

			value_is_nan = numpy.isnan(value)

			if value_is_nan:
				warn_count["value_is_nan"] += 1
			elif numpy.isinf(value):
				warn_count["value_is_inf"] += 1

			if row.name not in pool_data:
				# a missing pool value is treated as NaN, so it is also
				# counted as pool_value_is_nan below
				pool_value = numpy.nan
				warn_count["id_not_in_pool"] += 1
			else:
				pool_value = pool_data[row.name]

			pool_value_is_nan = numpy.isnan(pool_value)
			if pool_value_is_nan:
				warn_count["pool_value_is_nan"] += 1
			elif numpy.isinf(pool_value):
				warn_count["pool_value_is_inf"] += 1

			if not value_is_nan and not pool_value_is_nan:
				ratio = value - pool_value
			else:
				ratio = numpy.nan

			# infinite ratios are left out of the output
			if not numpy.isinf(ratio):
				data[row.name] = ratio

		mr.close()

		if duplicated:
			# really skip the assay, as the error message states, instead of
			# saving the rows read before the duplicate
			continue

		sb = ["{0}={1}".format(k, v) for k, v in warn_count.items() if v > 0]
		if len(sb) > 0:
			log.warn(", ".join(sb))

		# Save log2 ratios data and assay
		log2r = deepcopy(absi)

		log2r["id"] = log2r_id
		log2r["absi_id"] = absi_id
		log2r["normal_pool_id"] = pool["id"]

		log2r["data_file/repo"] = data_repo.name()
		log2r["data_file/path"] = data_file_path
		log2r["data_file/name"] = data_file_name

		msg = "Overwritting" if exists else "Writting"
		log.debug("%s log2 ratio data to %s ..." % (msg, dst_path))

		mw = MatrixWriter(data_repo.open_writer(dst_path))
		mw.write_header(["id", "value"])
		for name, value in sorted(data.items()):
			mw.write(name, [value])
		mw.close()

		em.persist(log2r, types.MRNA_LOG2R)
		log2r_port.write(log2r_id)

	em.close()
	es.close()

	data_repo.close()
	rs.close()
def main():
	"""Calculate CNV oncodrive results (gain and loss) for event tumour units.

	For every id read from the ``evt_tumour_unit_ids`` port the matching
	``types.CNV_EVENTS_TUMOUR_UNIT`` entity is loaded and an existing or new
	``types.CNV_ONCODRIVE_GENES`` entity is resolved. Unless ``skip_file``
	says the results can be reused, the data matrix is bit mask filtered and
	passed to ``run_oncodrive`` once for gain and once for loss, both results
	are joined in memory and written to the "data" repository. The entity is
	persisted and its id written to the ``oncodrive_results_ids`` port.

	A failure in the gain or loss calculation is logged and re-raised.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	evt_tumour_unit_port, oncodrive_results_port = \
		task.ports("evt_tumour_unit_ids", "oncodrive_results_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	log.info("Indexing available {} ...".format(types.CNV_ONCODRIVE_GENES))
	oncodrive_results_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_ONCODRIVE_GENES, unique = True)

	results_base_path = types.CNV_ONCODRIVE_GENES.replace(".", "/")

	for uid in evt_tumour_unit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
		if key in oncodrive_results_index:
			# reuse the entity already stored for this key
			eid = oncodrive_results_index[key][0]
			e = em.find(eid, types.CNV_ONCODRIVE_GENES)
			if e is None:
				log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, eid))
				continue
		else:
			e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
			eid = e["id"] = str(uuid.uuid4())

		# create oncodrive results entity
		e["evt_tumour_unit_id"] = uid

		results_path = rpath.join(results_base_path, eid + ".tsv.gz")

		if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
			log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
			oncodrive_results_port.write(eid)
			continue

		e["results_file"] = data_repo.url(results_path)

		# data matrix for oncodrive calculation
		matrix_repo, matrix_path = rs.from_url(u["data_file"])

		# Gain & Loss

		log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))
		log.debug("{} id is {}".format(types.CNV_ONCODRIVE_GENES, eid))

		tmp_path = mkdtemp(prefix = "cnv_oncodrive_calc_")
		log.debug("Temporary directory: {}".format(tmp_path))
		tmp_file = os.path.join(tmp_path, "filtered_data.tsv")

		matrix_local_path = matrix_repo.get_local(matrix_path)
		log.debug("Matrix path: {}".format(matrix_path))

		# The local matrix copy is released once, in the finally clause; the
		# handlers below only log and re-raise, so it is not closed twice.
		try:
			try:
				log.info("Calculating Gain ...")
				log.debug("Bit mask filtering (01) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 1)
				gain_results = run_oncodrive(
					conf, log, e, "gain", tmp_file, tmp_path)
			except Exception:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for gain failed".format(",".join(key), uid))
				raise

			try:
				log.info("Calculating Loss ...")
				log.debug("Bit mask filtering (10) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 2)
				loss_results = run_oncodrive(
					conf, log, e, "loss", tmp_file, tmp_path)
			except Exception:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for downreg failed".format(",".join(key), uid))
				raise

			# Join gain & loss results

			log.info("Joining upreg & downreg results into memory ...")

			# the join is done in memory with a map
			dmap = read_data_map(log, gain_results, loss_results)

			log.info("Writting joined data to {} ...".format(results_path))

			results_local_path = data_repo.create_local(results_path)

			write_data_map(dmap, results_local_path)

		finally:
			matrix_repo.close_local(matrix_local_path)
			matrix_repo.close()

			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		data_repo.put_local(results_local_path)

		em.persist(e, types.CNV_ONCODRIVE_GENES)
		oncodrive_results_port.write(eid)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
Example #8
0
def _load_icdo_codes(rs, url):
	"""Return the map read with ``map_from_file`` from the file at *url*.

	The file is fetched locally through the repository resolved from *url*;
	the local copy and the repository are released even when reading fails.
	"""
	repo, path = rs.from_url(url)
	try:
		local_path = repo.get_local(path)
		try:
			return map_from_file(local_path)
		finally:
			# close_local receives the path returned by get_local, as in the
			# other tasks of this file (the original passed the repository path)
			repo.close_local(local_path)
	finally:
		repo.close()

def main():
	"""Create and fill the biomart ``ent_icdo`` table.

	Topography and morphology code descriptions are loaded from the files
	configured in ``biomart.files.icdo_topography`` and
	``biomart.files.icdo_morphology``. Every (topography code, morphology
	code) pair read from the ``icdo`` port is inserted with a sequential id
	starting at 1. An empty code stands for "ANY"; a code without a known
	description is logged as an error and inserted with an empty name.
	"""
	task.check_conf(["entities", "repositories", "biomart.db",
		"biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	log = task.logger()

	icdo_port = task.ports("icdo")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"]))
	icdo_topography = _load_icdo_codes(rs, conf["biomart.files.icdo_topography"])

	log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"]))
	icdo_morphology = _load_icdo_codes(rs, conf["biomart.files.icdo_morphology"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	cursor = conn.cursor()

	cursor.execute("""
		CREATE TABLE  ent_icdo (
		  id int(11) NOT NULL,
		  icdo_name varchar(512) NOT NULL DEFAULT '',
		  icdo_topography varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology varchar(255) NOT NULL DEFAULT '',
		  icdo_topography_code varchar(24) NOT NULL DEFAULT '',
		  icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
		  icdo_topography_name varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
		  PRIMARY KEY (id),
		  KEY icdo_name (icdo_name),
		  KEY icdo_tm (icdo_topography,icdo_morphology),
		  KEY icdo_m (icdo_morphology),
		  KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
		  KEY icdo_m_c (icdo_morphology_code)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

	ib = BatchInsert(cursor, "ent_icdo",
			["id", "icdo_name", "icdo_topography", "icdo_topography_code", "icdo_topography_name",
				"icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"], insert_size)

	for i, tm in enumerate(icdo_port, 1):
		# topography: name and "name [code]" description
		t_code = tm[0]
		if t_code == "":
			t_name = t_desc = "ANY topography"
		elif t_code not in icdo_topography:
			log.error("Unknown topography description for code {}".format(t_code))
			t_name = ""
			t_desc = "[{}]".format(t_code)
		else:
			t_name = icdo_topography[t_code]
			t_desc = "{} [{}]".format(t_name, t_code)

		# morphology: name and "name [code]" description
		m_code = tm[1]
		if m_code == "":
			m_name = m_desc = "ANY morphology"
		elif m_code not in icdo_morphology:
			log.error("Unknown morphology description for code {}".format(m_code))
			m_name = ""
			m_desc = "[{}]".format(m_code)
		else:
			m_name = icdo_morphology[m_code]
			m_desc = "{} [{}]".format(m_name, m_code)

		name = "; ".join((t_desc, m_desc))

		log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

		ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

	log.debug("{} ICDO terms inserted".format(ib.count))

	ib.close()
	cursor.close()
	conn.close()
	em.close()
	es.close()
	rs.close()
Example #9
0
def main():
	"""Calculate and store CNV combination results.

	Every native object read from the ``combinations`` port is converted
	with ``DataFactory.from_native``. Unless ``skip_file`` reports that the
	results file can be reused, ``combination`` is run for the "gain" and
	"loss" conditions and the entity is persisted as
	``types.CNV_COMBINATION``. In both cases the combination id is written
	to the ``combination_ids`` port.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])
	settings = task.conf

	logger = task.logger()

	in_port, out_port = task.ports("combinations", "combination_ids")

	entity_server = EntityServer(settings["entities"])
	entity_manager = entity_server.manager()

	repo_server = RepositoryServer(settings["repositories"])
	data_repo = repo_server.repository("data")

	force = settings.get("overwrite", False, dtype=bool)

	base_path = types.CNV_COMBINATION.replace(".", "/")

	conditions = ("gain", "loss")

	# Run

	for native in in_port:
		comb = DataFactory.from_native(native, key_sep = "/")

		comb_id = comb["id"]

		key = (comb["icdo_topography"], comb["icdo_morphology"], comb["id_type"])
		key_label = ", ".join(key)

		logger.info("Processing combination for ({}) [{}] ...".format(key_label, comb_id))

		results_path = rpath.join(base_path, comb_id + ".tsv.gz")
		results_url = data_repo.url(results_path)

		# results that can be reused are only announced on the output port
		if skip_file(force, data_repo, results_path, comb.get("results_file")):
			logger.warn("Skipping {} ({}) [{}] as it already exists".format(types.CNV_COMBINATION, key_label, comb_id))
			out_port.write(comb_id)
			continue

		comb["results_file"] = results_url

		combination(logger, settings, repo_server, comb, data_repo, results_path, conditions)

		# save combination results
		entity_manager.persist(comb, types.CNV_COMBINATION)
		out_port.write(comb_id)

	# Release resources

	entity_manager.close()
	entity_server.close()
	data_repo.close()
	repo_server.close()
Example #10
0
def main():
	"""Map CNV event regions to genes and join them per tumour unit.

	For every id read from the "evt_tumour_unit_ids" port, each of the unit's
	CNV event files is intersected (BED tools intersectBed) with the gene
	regions file configured in "cnv.mapping.ensg". The values found are
	OR-ed per (event id, gene name) and written as a matrix with one row per
	background gene ("cnv.background.ensg") and one column per event id.
	The unit is then persisted and its id written to the
	"joined_evt_tumour_unit_ids" port.

	Raises:
		Exception: if intersectBed returns a non-zero exit code.
	"""

	# Initialization

	task.check_conf(["entities", "repositories",
		"cnv.background.ensg", "cnv.mapping.ensg",
		"bin_paths.bed_tools"])

	conf = task.conf

	log = task.logger()

	evt_tunit_port, joined_evt_tunit_port = \
		task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Columns of the intersectBed output holding the gene name and the event value
	name_index = 3
	value_index = 12

	# Run

	mapping_file = conf["cnv.mapping.ensg"]
	log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
	mapping_repo, mapping_path = rs.from_url(mapping_file)
	mapping_local_path = mapping_repo.get_local(mapping_path)

	background_file = conf["cnv.background.ensg"]
	log.info("Loading background from {} ...".format(background_file))

	# The background is the set of row names of every output matrix
	background = set()
	repo, path = rs.from_url(background_file)
	reader = repo.open_reader(path)
	for line in reader:
		line = line.rstrip()
		if len(line) == 0:
			continue
		background.add(line)
	reader.close()
	repo.close()

	for uid in evt_tunit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

		tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
		tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

		if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
			log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
			joined_evt_tunit_port.write(uid)
			continue

		log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

		cnv_evt_ids = u["cnv_evt_ids"]
		log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

		# (event id, gene name) -> OR of the values seen for that pair
		data = {}

		tmp_path = mkdtemp(prefix = "evt_map_and_join_")
		log.debug("Temporary directory: {}".format(tmp_path))

		try:
			for eid in cnv_evt_ids:
				e = em.find(eid, types.CNV_EVENTS)
				if e is None:
					log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
					continue

				data_file = e["data_file"]

				log.debug("{} ...".format(data_file))

				repo, path = rs.from_url(data_file)

				local_path = repo.get_local(path)

				# Run BED tools to intersect event regions with gene names.
				# The event file is used as is: the former end-coordinate fix
				# of the BED files was marked as no longer necessary.

				tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

				# The output redirection is why the command goes through the shell
				cmd = " ".join([
					os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
					"-a", mapping_local_path,
					"-b", local_path,
					"-s -wb",
					">{}".format(tmp_file2)])

				log.debug(cmd)

				retcode = subprocess.call(args = cmd, shell = True)

				if retcode != 0:
					raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

				repo.close_local(local_path)

				# Read BED tools results and load event data into memory

				reader = FileReader(tmp_file2)

				# enumerate() keeps the reported line number in step with the
				# file even when a line is rejected
				for line_num, line in enumerate(reader, 1):
					try:
						fields = line.rstrip().split("\t")
						name = fields[name_index]
						value = int(fields[value_index])
					except (IndexError, ValueError):
						# too few columns or a non-integer value
						log.error("Error parsing line {} of data file {}".format(line_num, data_file))
						continue

					if value not in (1, 2):
						log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
						continue

					k = (eid, name)
					data[k] = data.get(k, 0) | value

				reader.close()
				repo.close()

		finally:
			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		# Write events data to data file and merge with background labels

		log.info("Writing data to {} ...".format(tunit_path))

		u["data_file"] = data_repo.url(tunit_path)
		#TODO u["data_timestamp"] = ...

		writer = data_repo.open_writer(tunit_path)

		# header
		for name in cnv_evt_ids:
			writer.write("\t")
			writer.write(name)
		writer.write("\n")

		# data: pairs without any event value are written as 0
		for row_name in sorted(background):
			writer.write(row_name)
			for col_name in cnv_evt_ids:
				writer.write("\t")
				writer.write(str(data.get((col_name, row_name), 0)))
			writer.write("\n")

		writer.close()

		log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		joined_evt_tunit_port.write(uid)

	em.close()
	es.close()

	mapping_repo.close_local(mapping_local_path)
	mapping_repo.close()
	data_repo.close()
	rs.close()
Example #11
0
def main():
    """Export mrna oncodrive gene results into the biomart exp_gene_trs table.

    For every id read from the "id" port, the entity's results file is read
    and one row per known gene is inserted through BatchInsert. Entities
    whose ICDO or experiment keys are not present in the database, or whose
    results file cannot be opened or lacks the expected columns, are logged
    and skipped. The table is unlocked, optimized and locked again each time
    about a million rows have been inserted.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)

    cursor = conn.cursor()

    # Lookup tables from natural keys to database ids
    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute(
        """
		CREATE TABLE IF NOT EXISTS exp_gene_trs (
		  gene_id int(11) NOT NULL,
		  icdo_id int(11) NOT NULL,
		  exp_id int(11) NOT NULL,
		  upreg_total int(11) DEFAULT NULL,
		  upreg_observed double DEFAULT NULL,
		  upreg_expected double DEFAULT NULL,
		  upreg_stdev double DEFAULT NULL,
		  upreg_pvalue double DEFAULT NULL,
		  upreg_cpvalue double DEFAULT NULL,
		  downreg_total int(11) DEFAULT NULL,
		  downreg_observed double DEFAULT NULL,
		  downreg_expected double DEFAULT NULL,
		  downreg_stdev double DEFAULT NULL,
		  downreg_pvalue double DEFAULT NULL,
		  downreg_cpvalue double DEFAULT NULL,
		  PRIMARY KEY (gene_id,icdo_id,exp_id),
		  KEY icdo (icdo_id,exp_id),
		  KEY exp (exp_id),
		  CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
		  CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
		  CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(
            db_engine
        )
    )

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # Rows inserted since the table was last optimized
    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)

        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(
            cursor,
            "exp_gene_trs",
            [
                "gene_id",
                "icdo_id",
                "exp_id",
                "upreg_total",
                "upreg_observed",
                "upreg_expected",
                "upreg_stdev",
                "upreg_pvalue",
                "upreg_cpvalue",
                "downreg_total",
                "downreg_observed",
                "downreg_expected",
                "downreg_stdev",
                "downreg_pvalue",
                "downreg_cpvalue",
            ],
            insert_size,
        )

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # read header
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:
            # a distinct name keeps the entity bound to `e` intact
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # read data
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            gene_name = data[0]
            data = [data[i] for i in col_indices]
            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            gene_id = gene[gene_name]

            ib.insert(gene_id, icdo_id, exp_id, *data)

        if len(skipped_genes) > 0:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count

        ib.close()
        reader.close()
        # the success path has to release the repository as the error paths do
        results_repo.close()

        if lock_count >= 1000000:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
    cursor.close()
    conn.close()

    em.close()
    es.close()
    rs.close()
Example #12
0
def main():
	"""Classify the source assays of the selected studies into CNV events.

	Every genomic, cancer_vs_normal, binary assay belonging to one of the
	studies read from the "study_ids" port and referencing a valid tumour
	sample is persisted as types.CNV_EVENTS, and its id is written to the
	"evt_ids" port. The events are grouped into tumour units; the units
	with at least "cnv.min_tumour_unit_size" events are persisted as
	types.CNV_EVENTS_TUMOUR_UNIT and written to the "evt_tumour_unit_ids"
	port. A summary of skipped and failed assays and samples is logged at
	the end.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay",
				"cnv.min_tumour_unit_size"])

	conf = task.conf

	log = task.logger()

	study_ids_port, evt_port, evt_tunit_port = \
		task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	source_repo = rs.repository("source")

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()

	# Run

	# Indices of already persisted entities, used to reuse their ids

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
	evt_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS, unique = True)

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	evt_tunit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}
	skipped_assay_count = {}
	wrong_assays = {}
	wrong_samples = {}
	tumour_units = {}
	# event keys already persisted during this run
	evt_dup = {}

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])

		assay_source_path = assay.get("source_path", "")

		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay {} not included in 'study_ids'".format(assay_id))
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]

		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		source_path = assay["source_path"]
		source_file = assay["assay_property/filename"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path"])

		e["data_file"] = source_repo.url("assay", source_path, source_file)

		included = study_id in study_ids and study_type == "genomic"
		included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

		if not included:
			# transcriptomic assays are silently ignored here
			if study_type != "transcriptomic" and study_id in study_ids:
				s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_source_path = sample.get("source_path", "")
			log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			"source_path",
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])

		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		orig_disease_state = disease_state
		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour"]:
			# the path is read from the current sample: no variable holding it
			# is assigned on this code path
			log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample.get("source_path", ""), orig_disease_state))
			continue

		e["disease_state"] = disease_state

		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo, rel_path = rs.from_url(e["data_file"])

		if not repo.exists(rel_path):
			log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# A key seen earlier in this run is a duplicate; a key found only in
		# the index means an existing entity that is overwritten under its id
		eid = None
		duplicated = False
		exists = False
		if e_key in evt_dup:
			duplicated = True
		elif e_key in evt_index:
			eid = evt_index[e_key][0]
			exists = True

		if duplicated:
			log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())

		e["id"] = eid

		u_key = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]), e.get("icdo_morphology", ""))
		keys = classify_by_experiment_and_icdo(
					u_key[0], u_key[1], u_key[2], u_key[3])
		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				continue
			map_list_add(tumour_units, key, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))
		em.persist(e, types.CNV_EVENTS)
		evt_port.write(eid)
		evt_dup[e_key] = eid

	# NOTE(review): the value is compared with an int below; presumably the
	# configuration provides it as a number, confirm
	min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

	log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	log.debug("Minimum size = {}".format(min_tumour_unit_size))

	for key in sorted(tumour_units):
		v = tumour_units[key]
		size = len(v)
		if size < min_tumour_unit_size:
			discard = True
			discard_text = "[skipped]"
		else:
			discard = False
			discard_text = ""

		if key in evt_tunit_index:
			uid = evt_tunit_index[key][0]
			u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
			if u is None:
				log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
				continue

			arrow_text = "==>"
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")
			u["id"] = uid
			u["study_id"] = key[0]
			u["platform_id"] = key[1]
			u["icdo_topography"] = key[2]
			u["icdo_morphology"] = key[3]

			arrow_text = "-->"

		log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

		if discard:
			continue

		u["size"] = len(v)
		u["cnv_evt_ids"] = u.create_list(v)

		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		evt_tunit_port.write(uid)

	sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids))]
	log.info("".join(sb))

	log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))

	log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))

	log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

	em.close()
	es.close()
	source_repo.close()
	rs.close()
Example #13
0
def run(task):
	"""Pool the absolute intensities of the normal assays of each normal pool.

	For every id read from the "normal_pool_ids" port, the data files of the
	pool's mrna absolute intensity assays are read, their first value column
	is converted with exp2 and fed to a MeanPoolMethod, and the log2 of the
	pooled rows is written to the "data" repository. The pool entity is
	updated with the number of pooled assays and the data file location.
	A pool is not written when any of its assays has duplicated row names
	or when none of its assays could be pooled.

	Args:
		task: task object giving access to the configuration, logger and ports.

	Returns:
		0 always.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["normal_pool_ids"])

	normal_pool_port = task.ports["normal_pool_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	log.info("Processing %i mrna normal pools ..." % normal_pool_port.size())

	for pool_id in normal_pool_port:
		pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
		if pool is None:
			log.error("%s not found: %s" % (types.MRNA_NORMAL_POOL, pool_id))
			continue

		mf = pool.missing_fields(["study_id", "platform_id", "icdo_topography", "size", "mrna_absi_ids"])
		if len(mf) > 0:
			log.error("Normal pool %s missing required fields: %s {%s}" % (pool_id, mf, pool.get("__doc_path", "")))
			continue

		key = (pool["study_id"], pool["platform_id"], pool["icdo_topography"])
		log.info("Normal pool (%s) [%s] with %i assays ..." % (", ".join(key), pool_id, pool["size"]))

		data_file_path = types.MRNA_NORMAL_POOL.replace(".", "/")
		data_file_name = pool_id + ".tsv.gz"
		dst_rel_path = os.path.join(data_file_path, data_file_name)

		# A pool is up to date when its data file exists and it was built
		# from as many assays as it currently references
		if not overwrite and data_repo.exists(dst_rel_path) \
			and "mrna_absi_ids" in pool and "pooled_assays" in pool and \
					len(pool["mrna_absi_ids"]) == pool.get("pooled_assays", dtype=int):
			log.warn("Skipping normal pool %s that already has data" % pool_id)
			continue

		method = MeanPoolMethod()

		pooled_assays = 0
		duplicated_rows = False
		for absi in em.iter_all(types.MRNA_ABS_INTENSITY, eids = pool["mrna_absi_ids"]):
			mf = absi.missing_fields(["data_file/path", "data_file/name"])
			if len(mf) > 0:
				log.error("Normal assay %s missing required fields: %s {%s}" % (absi["id"], mf, absi.get("__doc_path", "")))
				continue

			data_file = absi["data_file"]
			rel_path = os.path.join(data_file["path"], data_file["name"])
			repo = rs.repository(data_file["repo"])
			if not repo.exists(rel_path):
				log.error("File not found: %s" % rel_path)
				continue

			log.debug("Processing normal assay %s for source assay %s at %s ..." % (absi["id"], absi["assay_id"], rel_path))

			mr = MatrixReader(repo.open_reader(rel_path))
			header = mr.read_header()
			if len(header.columns) != 2:
				log.error("Unexpected number of columns: %i" % len(header.columns))
				mr.close()
				continue

			# Counted only once the file is known to be usable, so that
			# rejected assays are not reported as pooled
			pooled_assays += 1

			row_names = set()
			for row in mr:
				if row.name in row_names:
					log.error("Skipping normal assay, duplicated row %s at file %s" % (row.name, rel_path))
					duplicated_rows = True
					break
				else:
					row_names.add(row.name)

				value = numpy.exp2(row.values[0])
				method.process(row.name, value)

			mr.close()

		if not duplicated_rows and pooled_assays > 0:
			exists = data_repo.exists(dst_rel_path)
			msg = {True : "Overwritting", False : "Writting"}[exists]
			log.debug("%s pooled data to %s ..." % (msg, dst_rel_path))

			mw = MatrixWriter(data_repo.open_writer(dst_rel_path))
			mw.write_header(["id", "value"])
			for row in method.pooled_rows():
				value = numpy.log2(row.values[0])
				mw.write(row.name, [value])
			mw.close()

			pool["pooled_assays"] = pooled_assays
			pool["data_file/repo"] = "data"
			pool["data_file/path"] = data_file_path
			pool["data_file/name"] = data_file_name
			em.persist(pool, types.MRNA_NORMAL_POOL)

	em.close()
	es.close()
	data_repo.close()
	rs.close()

	return 0
def run(task):
	"""Classify the transcriptomic source assays of the selected studies.

	Assays with log absolute readings are persisted as
	types.MRNA_ABS_INTENSITY and assays with log2 ratios as
	types.MRNA_LOG2R_SOURCE; their ids are written to the "absi_ids" and
	"log2r_source_ids" ports. Absolute intensity assays are then grouped
	into tumour units and normal pools, which are persisted and written to
	the "absi_tumour_unit_ids" and "normal_pool_ids" ports. A summary of
	the processed, skipped and failed assays is logged at the end.

	Args:
		task: task object giving access to the configuration, logger and ports.

	Returns:
		0 always.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["study_ids"])
	task.check_out_ports(["absi_ids", "absi_tumour_unit_ids", "normal_pool_ids", "log2r_source_ids"])

	study_ids_port = task.ports["study_ids"]
	absi_port = task.ports["absi_ids"]
	absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
	normal_pool_port = task.ports["normal_pool_ids"]
	log2r_source_port = task.ports["log2r_source_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	# Run

	# Indices of already persisted entities, used to reuse their ids

	log.info("Creating indices for {} ...".format(types.MRNA_ABS_INTENSITY))
	absi_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_ABS_INTENSITY, unique = True)

	log.info("Creating indices for {} ...".format(types.MRNA_LOG2R_SOURCE))
	log2r_src_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_LOG2R_SOURCE, unique = True)

	log.info("Creating indices for {} ...".format(types.MRNA_ABSI_TUMOUR_UNIT))
	absi_tumour_unit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_ABSI_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}
	skipped_assay_count = {}
	wrong_assays = {}
	wrong_samples = {}
	log2r_src_units = {}
	tumour_units = {}
	normal_pools = {}
	# keys already persisted during this run, per entity type
	absi_dup = {}
	log2r_source_dup = {}

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])

		assay_source_path = assay.get("source_path", "")

		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay %s in study %s missing required fields: %s {%s}" % (assay_id, study_id, mf, assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay %s not included in 'study_ids'" % assay_id)
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]

		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path",
			("data_file/path", "source_path"),
			("data_file/name", "assay_property/filename") ])

		e["data_file/repo"] = assay.get("data_file/repo", "assay")

		included = study_id in study_ids and study_type == "transcriptomic"
		included &= (assay_design == "cancer_and_normal" and data_type == "log_abs_readings") \
						or (assay_design == "cancer_vs_normal" and data_type == "log2ratios")

		if not included:
			# genomic assays are silently ignored here
			if study_type != "genomic" and study_id in study_ids:
				s = ", ".join(["%s = %s" % (v[0], v[1]) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.warn("Skipping assay %s {%s}: %s." % (assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay %s references a non-existent sample: %s" % (assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_id = sample.get("id", "WITHOUT ID")
			sample_source_path = sample.get("source_path", "")

			log.error("Sample %s associated with assay %s in study %s missing required fields: %s {%s}" % (sample_id, assay_id, study_id, mf, sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			("source_path", "source_path"),
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])

		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '%s' for sample %s {%s}" % (disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour", "normal"]:
			continue

		e["disease_state"] = disease_state

		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo = rs.repository(e["data_file/repo"])
		rel_path = os.path.join(e["data_file/path"], e["data_file/name"])

		if not repo.exists(rel_path):
			log.error("Assay %s in study %s missing data file: [%s]" % (assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# A key seen earlier in this run is a duplicate; a key found only in
		# the index means an existing entity that is overwritten under its id
		eid = None
		duplicated = False
		exists = False
		if data_type == "log_abs_readings":
			if key in absi_dup:
				duplicated = True
			elif key in absi_index:
				eid = absi_index[key][0]
				exists = True
		elif data_type == "log2ratios":
			if key in log2r_source_dup:
				duplicated = True
			elif key in log2r_src_index:
				eid = log2r_src_index[key][0]
				exists = True

		if duplicated:
			log.error("Duplicated key (%s) for assay %s" % (", ".join(key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())

		e["id"] = eid

		if disease_state == "normal":
			if data_type == "log2ratios":
				k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
				map_list_add(log2r_src_units, k, eid)
			elif data_type == "log_abs_readings":
				map_list_add(normal_pools, (study_id, platform_id, e["icdo_topography"]), eid)
			else:
				# one placeholder per reported field: the message used to have
				# fewer placeholders than arguments
				log.error("Assay %s has an unexpected combination of (disease_state, assay_design, data_type): (%s, %s, %s)" % (assay_id, disease_state, assay_design, data_type))
				map_list_add(wrong_assays, study_id, assay_id)
				continue
		elif disease_state == "tumour":
			k = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]))
			if data_type == "log_abs_readings":
				map_list_add(tumour_units, k, eid)
			elif data_type == "log2ratios":
				map_list_add(log2r_src_units, k, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		if data_type == "log_abs_readings":
			log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_ABS_INTENSITY, ", ".join(key), eid))
			em.persist(e, types.MRNA_ABS_INTENSITY)
			absi_port.write(eid)
			absi_dup[key] = eid
		elif data_type == "log2ratios":
			log.info("%s %s (%s) [%s] ..." % (msg, types.MRNA_LOG2R_SOURCE, ", ".join(key), eid))
			em.persist(e, types.MRNA_LOG2R_SOURCE)
			log2r_source_port.write(eid)
			log2r_source_dup[key] = eid

	log.info("Persisting mrna absi tumour units ...")

	for k, v in sorted(tumour_units.items()):
		key = (k[0], k[1], k[2])
		exists = key in absi_tumour_unit_index
		if exists:
			uid = absi_tumour_unit_index[key][0]
		else:
			uid = str(uuid.uuid4())

		u = DataElement(key_sep = "/")
		u["id"] = uid
		u["study_id"] = k[0]
		u["platform_id"] = k[1]
		u["icdo_topography"] = k[2]
		u["size"] = len(v)
		u["mrna_absi_ids"] = u.create_list(v)

		if exists:
			log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
		else:
			log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

		em.persist(u, types.MRNA_ABSI_TUMOUR_UNIT)
		absi_tumour_unit_port.write(uid)

	log.info("Creating indices for mrna normal pools ...")
	normal_pool_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_NORMAL_POOL, unique = True)

	log.info("Persisting mrna normal pools ...")

	for k, v in sorted(normal_pools.items()):
		key = (k[0], k[1], k[2])
		exists = key in normal_pool_index
		if exists:
			uid = normal_pool_index[key][0]
		else:
			uid = str(uuid.uuid4())

		u = DataElement(key_sep = "/")
		u["id"] = uid
		u["study_id"] = k[0]
		u["platform_id"] = k[1]
		u["icdo_topography"] = k[2]
		u["size"] = len(v)
		u["mrna_absi_ids"] = u.create_list(v)

		if exists:
			log.debug("\t(%s) ==> %s ..." % (", ".join(k), uid))
		else:
			log.debug("\t(%s) --> %s ..." % (", ".join(k), uid))

		em.persist(u, types.MRNA_NORMAL_POOL)
		normal_pool_port.write(uid)

	# Summary report

	sb = ["\n\nProcessed %i assays for %i studies (out of %i):\n\n" % (processed_assays, len(processed_studies), len(study_ids))]

	sb += ["%i mrna tumour units:\n\n" % (len(tumour_units))]

	for k, v in sorted(tumour_units.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\n%i mrna normal pools:\n\n" % (len(normal_pools))]

	for k, v in sorted(normal_pools.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\n%i mrna source log2r units:\n\n" % (len(log2r_src_units))]

	for k, v in sorted(log2r_src_units.items()):
		sb += ["\t(%s)\t%i assays\n" % (", ".join(k), len(v))]

	sb += ["\nAssay counts by study and platform:\n\n"]

	for k, v in sorted(valid_assay_count.items()):
		sb += ["\t%s\t%i assays" % (k, v)]
		# k is (study_id, platform_id) while failures are recorded per
		# study id, so the lookup has to use the study id alone
		failed_study_id = k[0]
		if failed_study_id in wrong_assays:
			sb += ["\t%i failed assays" % len(wrong_assays[failed_study_id])]
		if failed_study_id in wrong_samples:
			sb += ["\t%i failed samples" % len(wrong_samples[failed_study_id])]
		sb += ["\n"]

	log.info("".join(sb))

	if len(skipped_assay_count) > 0:
		log.info("Skipped assays:\n\n%s" % map_count_tostring(skipped_assay_count, indent = 1))

	if len(wrong_assays) > 0:
		log.info("Summary of failed assays:\n\n%s" % map_list_tostring(wrong_assays))

	if len(wrong_samples) > 0:
		log.info("Summary of failed samples:\n\n%s" % map_list_tostring(wrong_samples))

	em.close()
	es.close()
	rs.close()

	return 0
def run(task):
    """Calculate Oncodrive results for each mrna log2r tumour unit.

    Tumour unit ids are read from the ``log2r_tumour_unit_ids`` port. For
    each unit the matching ``types.MRNA_ONCODRIVE_PROBES`` entity is looked
    up by (study_id, platform_id, icdo_topography, icdo_morphology) or
    created with a fresh uuid. ``run_oncodrive`` is then called once for
    "upreg" and once for "downreg" with the cutoffs stored in the matching
    ``types.MRNA_LOG2R_CUTOFF`` entity. Both results are joined with
    ``read_data_map`` / ``write_data_map`` into ``<eid>.tsv.gz`` in the
    "data" repository, the entity is persisted and its id is written to the
    ``oncodrive_results_ids`` port.

    Units or cutoffs that cannot be found are logged and skipped. When
    ``skip_file`` returns true the entity id is forwarded without
    recalculating anything.

    Args:
        task: task object providing ``conf``, ``logger()``, ``ports`` and
            the ``check_conf`` / ``check_in_ports`` / ``check_out_ports``
            validators.

    Raises:
        Exception: whatever ``run_oncodrive`` (or any other step of the
            calculation) raises is propagated after the local matrix copy,
            the matrix repository and the temporary directory are released.
    """

    # Function-scope imports, as in the original code.
    import shutil
    from tempfile import mkdtemp

    # Initialization

    task.check_conf(["entities", "repositories", "bin_paths.gitools"])
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["log2r_tumour_unit_ids"])
    task.check_out_ports(["oncodrive_results_ids"])

    log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
    oncodrive_results_port = task.ports["oncodrive_results_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")

    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run

    log.info("Indexing available oncodrive results for probes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_ONCODRIVE_PROBES,
        unique=True)

    log.info("Indexing available mrna log2r cutoffs ...")
    log2r_cutoff_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
        types.MRNA_LOG2R_CUTOFF,
        unique=True)

    results_base_path = types.MRNA_ONCODRIVE_PROBES.replace(".", "/")

    for log2r_unit_id in log2r_tumour_unit_port:
        u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT,
                                                log2r_unit_id))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"],
               u["icdo_morphology"])
        if key in oncodrive_results_index:
            # Reuse the already registered results entity for this key.
            eid = oncodrive_results_index[key][0]
            e = em.find(eid, types.MRNA_ONCODRIVE_PROBES)
            if e is None:
                log.error("{} not found: {}".format(
                    types.MRNA_ONCODRIVE_PROBES, eid))
                continue
        else:
            e = u.transform([
                "study_id", "platform_id", "icdo_topography", "icdo_morphology"
            ])
            eid = e["id"] = str(uuid.uuid4())

        log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(
            types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
        log.debug("{} id is {}".format(types.MRNA_ONCODRIVE_PROBES, eid))

        # create oncodrive results entity
        e["log2r_tumour_unit_id"] = log2r_unit_id

        results_path = rpath.join(results_base_path, eid + ".tsv.gz")

        if skip_file(overwrite, data_repo, results_path,
                     e.get("results_file")):
            log.warn("Skipping ({}) [{}] as it already exists".format(
                ", ".join(key), eid))
            oncodrive_results_port.write(eid)
            continue

        e["results_file"] = data_repo.url(results_path)

        # Load calculated cutoff

        log.info("Loading mrna cutoff for key ({}) ...".format(", ".join(key)))

        if key not in log2r_cutoff_index:
            log.error("mrna log2r cuttof not found for key ({})".format(
                ", ".join(key)))
            continue

        cutoff_id = log2r_cutoff_index[key][0]
        cutoff = em.find(cutoff_id, types.MRNA_LOG2R_CUTOFF)
        if cutoff is None:
            log.error("mrna log2r cuttof for key ({}) [{}] couldn't be loaded".
                      format(", ".join(key), cutoff_id))
            continue

        log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, cutoff_id))

        # data matrix for oncodrive calculation; the repository is opened
        # only once the cutoff is known to exist, so the skip paths above
        # have nothing to release
        matrix_repo = rs.repository(u["data_file/repo"])
        matrix_path = os.path.join(u["data_file/path"], u["data_file/file"])

        # Upregulation & downregulation

        # Pre-bound so the cleanup below never hits an unbound name when
        # mkdtemp() or get_local() fail.
        tmp_path = None
        matrix_local_path = None

        try:
            tmp_path = mkdtemp(prefix="mrna_oncodrive_calc_")
            log.debug("Temporary directory: {}".format(tmp_path))

            matrix_local_path = matrix_repo.get_local(matrix_path)
            log.debug("Matrix path: {}".format(matrix_path))

            try:
                log.info("Calculating Upregulation with cutoff {} ...".format(
                    cutoff["upreg/cutoff"]))
                upreg_results = run_oncodrive(conf, log, e, "upreg",
                                              matrix_local_path, "gt",
                                              cutoff["upreg/cutoff"], tmp_path)
            except Exception:
                # Cleanup is left to the outer finally (single close_local).
                log.error("Oncodrive calculation for upreg failed")
                raise

            try:
                log.info(
                    "Calculating Downregulation with cutoff {} ...".format(
                        cutoff["downreg/cutoff"]))
                downreg_results = run_oncodrive(
                    conf, log, e, "downreg", matrix_local_path, "lt",
                    cutoff["downreg/cutoff"], tmp_path)
            except Exception:
                log.error("Oncodrive calculation for downreg failed")
                raise

            # Join upreg & downreg results

            log.info("Joining upreg & downreg results into memory ...")

            # the join is done in memory with a map
            dmap = read_data_map(log, upreg_results, downreg_results)

            log.info("Writting joined results to {} ...".format(results_path))

            results_local_path = data_repo.create_local(results_path)

            write_data_map(dmap, results_local_path)

        finally:
            if matrix_local_path is not None:
                matrix_repo.close_local(matrix_local_path)
            matrix_repo.close()

            if tmp_path is not None and os.path.exists(tmp_path):
                log.debug(
                    "Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        data_repo.put_local(results_local_path)

        em.persist(e, types.MRNA_ONCODRIVE_PROBES)
        oncodrive_results_port.write(eid)

    em.close()
    data_repo.close()
    rs.close()
def run(task):
	"""Calculate up/down regulation cutoffs for mrna log2r tumour units.

	Tumour unit ids are read from the ``log2r_tumour_unit_ids`` port. For
	each unit the matching ``types.MRNA_LOG2R_CUTOFF`` entity is looked up by
	(study_id, platform_id, icdo_topography, icdo_morphology) or created with
	a fresh uuid. ``calc_cutoff`` is called for "upreg" and "downreg", the
	resulting cutoffs and plot file locations are stored in the entity, the
	entity is persisted and the unit id is written to the
	``processed_log2r_tumour_unit_ids`` port.

	A unit whose cutoff entity already holds both cutoffs is forwarded
	without recalculation unless the ``overwrite`` option is set.

	Args:
		task: task object providing ``conf``, ``logger()``, ``ports`` and
			the ``check_conf`` / ``check_in_ports`` / ``check_out_ports``
			validators.

	Returns:
		-1 as soon as one ``calc_cutoff`` call raises; otherwise nothing
		(``None``), as in the original code.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "bin_paths.R"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["log2r_tumour_unit_ids"])
	task.check_out_ports(["processed_log2r_tumour_unit_ids"])

	log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
	processed_log2r_tumour_unit_port = task.ports["processed_log2r_tumour_unit_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# The slope does not depend on the unit, so it is resolved only once.
	if "mrna.log2r_slope_cutoff.slope" in conf:
		slope = conf["mrna.log2r_slope_cutoff.slope"]
	else:
		slope = str(-0.05)

	# Run

	# try/finally so that the entity manager and the data repository are
	# released on the early "return -1" paths and on unexpected errors too.
	try:
		log.info("Indexing available mrna log2r cutoffs ...")
		log2r_cutoff_index = em.group_ids(
			["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
			types.MRNA_LOG2R_CUTOFF, unique = True)

		cutoff_path = types.MRNA_LOG2R_CUTOFF.replace(".", "/")

		for log2r_unit_id in log2r_tumour_unit_port:
			u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
			if u is None:
				log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, log2r_unit_id))
				continue

			key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
			if key in log2r_cutoff_index:
				eid = log2r_cutoff_index[key][0]
				e = em.find(eid, types.MRNA_LOG2R_CUTOFF)
				if e is None:
					# Indexed but not loadable: skip instead of failing on
					# the membership tests below.
					log.error("%s not found: %s" % (types.MRNA_LOG2R_CUTOFF, eid))
					continue
				# Both cutoffs have to be present to consider the work done
				# (the original tested "upreg/cutoff" twice).
				if ("upreg/cutoff" in e) and ("downreg/cutoff" in e) and not overwrite:
					log.warn("Skipping (%s) [%s] as it already exists" % (", ".join(key), eid))
					processed_log2r_tumour_unit_port.write(log2r_unit_id)
					continue
			else:
				e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
				eid = e["id"] = str(uuid.uuid4())

			log.info("Calculating cutoffs for {} ({}) [{}] ...".format(types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))
			log.debug("{} id is {}".format(types.MRNA_LOG2R_CUTOFF, eid))

			# NOTE(review): matrix_repo is never closed in this block, as in
			# the original; confirm whether calc_cutoff takes care of it.
			file_repo = u["data_file/repo"]
			matrix_repo = rs.repository(file_repo)

			file_path = u["data_file/path"]
			file_name = u["data_file/file"]
			matrix_path = os.path.join(file_path, file_name)

			log.debug("slope = {}".format(slope))

			# Upregulation

			log.info("Upregulation ...")

			try:
				cutoff, cutoff_file, plot_file = calc_cutoff(
					conf, log, log2r_unit_id, matrix_repo, matrix_path, data_repo, cutoff_path, "upreg", slope)
			except Exception as ex:
				# Bound to "ex" so the cutoff entity "e" is not clobbered.
				log.error("Upreg cutoff calculation for {} ({}) [{}] failed".format(types.MRNA_LOG2R_TUMOUR_UNIT, ",".join(key), log2r_unit_id))
				log.exception(ex)
				return -1

			log.debug("Upregulation cutoff = {}".format(cutoff))

			e["upreg/cutoff"] = cutoff

			e["upreg/plot_file"] = pf = e.create_element()
			pf["repo"] = data_repo.name()
			pf["path"] = os.path.dirname(plot_file)
			pf["file"] = os.path.basename(plot_file)

			# Downregulation

			log.info("Downregulation ...")

			try:
				cutoff, cutoff_file, plot_file = calc_cutoff(
					conf, log, log2r_unit_id, matrix_repo, matrix_path, data_repo, cutoff_path, "downreg", slope)
			except Exception as ex:
				log.error("Downreg cutoff calculation for {} ({}) [{}] failed".format(types.MRNA_LOG2R_TUMOUR_UNIT, ",".join(key), log2r_unit_id))
				log.exception(ex)
				return -1

			log.debug("Downregulation cutoff = {}".format(cutoff))

			e["downreg/cutoff"] = cutoff

			e["downreg/plot_file"] = pf = e.create_element()
			pf["repo"] = data_repo.name()
			pf["path"] = os.path.dirname(plot_file)
			pf["file"] = os.path.basename(plot_file)

			em.persist(e, types.MRNA_LOG2R_CUTOFF)
			processed_log2r_tumour_unit_port.write(log2r_unit_id)
	finally:
		em.close()

		data_repo.close()
Example #17
0
def run(task):
	"""Join the log2r assay matrices of each mrna log2r tumour unit.

	Tumour unit ids are read from the ``log2r_tumour_unit_ids`` port. For
	each unit, the data files of all the ``types.MRNA_LOG2R`` assays listed
	in ``mrna_log2r_ids`` are fetched locally and joined into a single file
	by running the ``bin_paths.matrix_join`` script with the configured
	python interpreter. The joined file is stored in the unit repository,
	the unit ``data_file`` element is updated, the unit is persisted and its
	id is written to the ``joined_log2r_tumour_unit_ids`` port.

	Units without ``mrna_log2r_ids``, with missing assays or files, or for
	which the join command fails are logged and skipped. Units whose joined
	file already exists are forwarded untouched unless ``overwrite`` is set.

	Args:
		task: task object providing ``conf``, ``logger()``, ``ports`` and
			the ``check_conf`` / ``check_in_ports`` / ``check_out_ports``
			validators.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "bin_paths.matrix_join", "bin_paths.python"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["log2r_tumour_unit_ids"])
	task.check_out_ports(["joined_log2r_tumour_unit_ids"])

	log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]
	joined_log2r_tumour_unit_port = task.ports["joined_log2r_tumour_unit_ids"]

	python_bin = conf["bin_paths.python"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	unit_base_path = types.MRNA_LOG2R_TUMOUR_UNIT.replace(".", "/")

	for log2r_unit_id in log2r_tumour_unit_port:
		u = em.find(log2r_unit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
		if u is None:
			log.error("%s not found: %s" % (types.MRNA_LOG2R_TUMOUR_UNIT, log2r_unit_id))
			continue

		uid = u["id"]
		study_id = u["study_id"]
		platform_id = u["platform_id"]
		icdo_topography = u["icdo_topography"]
		icdo_morphology = u["icdo_morphology"]
		key = (study_id, platform_id, icdo_topography, icdo_morphology)

		log.info("Joining columns for {} ({}) [{}] ...".format(types.MRNA_LOG2R_TUMOUR_UNIT, ", ".join(key), log2r_unit_id))

		if "mrna_log2r_ids" not in u:
			log.warn("Discarding empty unit (%s) [%s]" % (", ".join(key), log2r_unit_id))
			continue

		# Where the joined matrix lives: the location already recorded in
		# the unit when there is one, a new file in the data repository
		# otherwise.
		unit_repo = data_repo

		if "data_file" in u:
			unit_repo = rs.repository(u["data_file/repo"])
			unit_repo_path = os.path.join(u["data_file/path"], u["data_file/file"])
			exists = unit_repo is not None and unit_repo.exists(unit_repo_path)
		else:
			unit_repo_path = os.path.join(unit_base_path, log2r_unit_id + ".tsv.gz")
			exists = False

		if exists and not overwrite:
			log.warn("Skipping log2r tumour unit data join (%s) [%s] as it already exists in %s" % (", ".join(key), log2r_unit_id, unit_repo_path))
			joined_log2r_tumour_unit_port.write(uid)
			continue

		valid = True
		repos = []
		files = []

		# The finally clause releases every local assay copy fetched so
		# far, also when an assay is missing or the join command fails
		# (the original only released them after a successful join).
		try:
			for log2r_id in u["mrna_log2r_ids"]:
				e = em.find(log2r_id, types.MRNA_LOG2R)
				if e is None:
					log.error("log2r assay '%s' not found" % log2r_id)
					valid = False
					break

				repo = rs.repository(e["data_file/repo"])
				repo_path = os.path.join(e["data_file/path"], e["data_file/name"])

				if repo is None or not repo.exists(repo_path):
					log.error("File not found: %s" % repo_path)
					valid = False
					break

				# Registered only once the local copy really exists, so
				# repos and files always stay aligned.
				local_path = repo.get_local(repo_path)
				repos.append(repo)
				files.append(local_path)

			if not valid:
				log.info("Skipping log2r tumour unit (%s) [%s] as there were errors" % (", ".join(key), log2r_unit_id))
				continue

			if exists:
				unit_local_path = unit_repo.get_local(unit_repo_path)
			else:
				unit_local_path = unit_repo.create_local(unit_repo_path)

			# NOTE(review): the command is a shell string and the input
			# file paths are not quoted; paths containing spaces or shell
			# metacharacters would break it.
			cmd = " ".join([
				python_bin, conf["bin_paths.matrix_join"],
				"-o '%s'" % unit_local_path,
				"-C '${filename_noext}'",
				"--skip-empty",
				" ".join(files)])

			log.debug(cmd)

			retcode = subprocess.call(args = cmd, shell = True)

			if retcode != 0:
				log.error("There was an error joining matrices:\n%s" % "\n".join(files))
				continue
		finally:
			for repo, local_path in zip(repos, files):
				repo.close_local(local_path)

		unit_repo.put_local(unit_local_path)

		df = u["data_file"] = u.create_element()
		df["repo"] = unit_repo.name()
		df["path"] = os.path.dirname(unit_repo_path)
		df["file"] = os.path.basename(unit_repo_path)

		em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
		joined_log2r_tumour_unit_port.write(uid)

	em.close()

	data_repo.close()