Example #1
0
def run(task):
	"""Prepare storage for every CNV_* type declared in the types module.

	For each type an entity collection is ensured and a matching data
	directory is created in the 'data' repository.  Returns 0.
	"""

	config = task.conf
	logger = task.logger()

	entity_server = EntityServer(config["entities"])
	manager = entity_server.manager()

	repo_server = RepositoryServer(config["repositories"])
	repository = repo_server.repository("data")

	# Collect the values of every CNV_* constant declared in `types`
	cnv_types = [value for name, value in vars(types).items() if name.startswith("CNV_")]

	for type_name in cnv_types:
		logger.info("Preparing '{0}' ...".format(type_name))
		manager.ensure_collection_exists(type_name)
		dir_path = rpath.absolute(type_name.replace(".", "/"))
		logger.debug("\tData: {0}".format(dir_path))
		repository.mkdir_if_not_exists(dir_path)

	manager.close()
	entity_server.close()
	repository.close()
	repo_server.close()

	return 0
def main():
	"""Run the R report script for every mrna log2r tumour unit id
	received on the 'mrna_normal_pool' port.

	Raises:
		Exception: when the R script exits with a non-zero return code.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	id_port = task.ports("mrna_normal_pool")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	# NOTE(review): 'overwrite' is read but never used in this task
	overwrite = conf.get("overwrite", False, dtype=bool)

	results_base_path = "reports/" + types.CNV_COMBINATION.replace(".", "/")

	# Run

	for tunit_id in id_port:
		# BUG FIX: the original called em.find(oid, ...) but the loop
		# variable was 'id' -- 'oid' was undefined and would raise a
		# NameError.  Renamed to 'tunit_id' (also avoids shadowing the
		# builtin 'id').
		e = em.find(tunit_id, types.MRNA_LOG2R_TUMOUR_UNIT)
		if e is None:
			log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, tunit_id))
			continue

		repo, data_path = rs.from_url(e["data_file"])
		data_local_path = repo.get_local(data_path)

		# 'script' is expected to be defined at module level -- TODO confirm
		cmd = " ".join([conf["bin_paths.R"],
			"--vanilla --slave -f", script,
			"--args", results_base_path, tunit_id, data_local_path])

		log.debug(cmd)

		retcode = subprocess.call(args = cmd, shell = True)

		if retcode != 0:
			raise Exception("R script failed")

		repo.close_local(data_local_path)
		repo.close()

	em.close()
	es.close()
	# Release repository resources (the original leaked these; sibling
	# tasks in this file close them)
	data_repo.close()
	rs.close()
def main():
	"""Skeleton task: validates the configuration, opens the entity
	server and the 'mrna_normal_pool' port, then shuts everything
	down.  The processing section is intentionally empty.
	"""

	# Initialization

	task.check_conf(["entities"])
	config = task.conf

	logger = task.logger()

	pool_port = task.ports("mrna_normal_pool")

	entity_server = EntityServer(config["entities"])
	manager = entity_server.manager()

	# Run -- no processing implemented yet

	manager.close()
	entity_server.close()
def main():
    """Extract the ids of the mrna preprocessing entity types and send
    them through their respective output ports."""

    # Initialization

    task.check_conf(["entities"])
    config = task.conf

    logger = task.logger()

    log2r_tunit_port, normal_pool_port = task.ports(["mrna_log2r_tunit", "mrna_normal_pool"])

    entity_server = EntityServer(config["entities"])
    manager = entity_server.manager()

    # Run: forward the ids for each mrna preprocessing type, normal
    # pools first, then log2r tumour units (same order as before)

    for entity_type, out_port in ((types.MRNA_NORMAL_POOL, normal_pool_port),
                                  (types.MRNA_LOG2R_TUMOUR_UNIT, log2r_tunit_port)):
        extract_and_send(logger, manager, entity_type, out_port)

    manager.close()
    entity_server.close()
def run(task):
	"""For every oncodrive-genes result id received on 'oncodrive_ids',
	run each configured enrichment analysis and emit the resulting
	enrichment entity ids on 'enrichment_ids'.

	Returns -1 when no 'modules' section is configured, 0 when no valid
	enrichment configuration could be built; otherwise falls through
	after processing all input ids.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "repositories.source",
						"mrna.enrichment", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["oncodrive_ids"])
	task.check_out_ports(["enrichment_ids"])

	oncodrive_port = task.ports["oncodrive_ids"]
	enrichment_port = task.ports["enrichment_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	data_repo = rs.repository("data")
	
	overwrite = conf.get("overwrite", False, dtype=bool)

	# retrieve enrichment configurations
	ec = conf["mrna.enrichment"]
	if "default" in ec:
		default = ec["default"]
	else:
		default = conf.create_element()

	if "modules" not in ec:
		log.error("There is no enrichment modules section available in mrna.enrichment")
		return -1

	log.info("Reading modules configuration ...")

	# Each module config is the 'default' element overridden by the
	# module-specific settings; configs missing required fields are
	# reported and dropped.
	econfs = list()
	for mod in ec["modules"]:
		m = ec.create_element()
		m.merge(default)
		m.merge(mod)
		mf = m.missing_fields(["id_type", "test", "modules_file"])
		if len(mf) > 0:
			log.error("Enrichment configuration missing required fields: {}".format(", ".join(mf)))
			log.error("Module configuration: {}".format(m))
		else:
			econfs.append(m)
			log.debug("{} -> {}".format(m["id_type"], m["modules_file"]))

	if len(econfs) == 0:
		log.error("There are no enrichment configurations available in mrna.enrichment")
		return 0

	results_base_path = types.MRNA_ENRICHMENT.replace(".", "/")
	
	# Index existing enrichment results so reruns reuse the same entity id
	log.info("Indexing available enrichment results ...")
	enrichment_results_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology", "id_type"],
		types.MRNA_ENRICHMENT, unique = True)

	for oid in oncodrive_port:
		o = em.find(oid, types.MRNA_ONCODRIVE_GENES)
		if o is None:
			log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, oid))
			continue

		okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

		log.info("Enrichment for oncodrive results ({0}) [{1}] ...".format(", ".join(okey), oid))

		# NOTE(review): the loop variable 'ec' shadows the 'mrna.enrichment'
		# config element of the same name above; the config is only read
		# before this loop, so behavior is unaffected.
		for ec in econfs:
			log.info("Module {} [{}] ...".format(ec["id_type"], ec["modules_file"]))

			key = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"], ec["id_type"])

			# Reuse the existing enrichment entity for this key, or create
			# a fresh one derived from the oncodrive result.
			if key in enrichment_results_index:
				eid = enrichment_results_index[key][0]
				e = em.find(eid, types.MRNA_ENRICHMENT)
				if e is None:
					log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
					continue
			else:
				e = o.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
				e["id"] = eid = str(uuid.uuid4())

			e["id_type"] = ec["id_type"]

			# enrichment results

			results_path = rpath.join(results_base_path, eid + ".tsv.gz")

			if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
				log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
				enrichment_port.write(eid)
				continue

			valid = enrichment(log, conf, rs, data_repo, results_path, o["results_file"], e, ec,
						["id", "upreg_corrected_right_p_value", "downreg_corrected_right_p_value"],
						["id", "upreg", "downreg"])

			# save mapped results
			if valid:
				em.persist(e, types.MRNA_ENRICHMENT)
				enrichment_port.write(eid)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
def main():
	"""Group CNV enrichment results by (topography, morphology, id_type)
	and emit one combination descriptor per group on the 'combinations'
	port.  Existing combination entities are reused by key.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	# NOTE(review): other tasks in this file call task.ports with a list
	# argument; confirm the varargs form is also supported.
	enrichment_port, combination_port = \
		task.ports("enrichment_ids", "combinations")

	es = EntityServer(conf["entities"])
	em = es.manager()

	# Index existing combinations so reruns reuse the same entity id
	log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
	comb_results_index = em.group_ids(
		["icdo_topography", "icdo_morphology", "id_type"],
		types.CNV_COMBINATION, unique = True)

	# key (icdo_topography, icdo_morphology, id_type) -> list of
	# enrichment entities belonging to that combination
	classif = {}

	log.info("Classifying enrichment results ...")

	for eid in enrichment_port:
		e = em.find(eid, types.CNV_ENRICHMENT)
		if e is None:
			log.error("{} not found: {}".format(types.CNV_ENRICHMENT, eid))
			continue

		ekey = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"], e["id_type"])

		key = (e["icdo_topography"], e["icdo_morphology"], e["id_type"])

		log.debug("Enrichment results ({}) [{}] classified into ({}) ...".format(", ".join(ekey), eid, ", ".join(key)))

		if key in classif:
			classif[key] += [e]
		else:
			classif[key] = [e]

	log.info("Preparing combinations ...")

	for key in sorted(classif):
		# Reuse an existing combination entity or create a new one
		if key in comb_results_index:
			cid = comb_results_index[key][0]
			c = em.find(cid, types.CNV_COMBINATION)
			if c is None:
				log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
				return
		else:
			c = DataElement(key_sep = "/")
			c["id"] = cid = str(uuid.uuid4())
			c["icdo_topography"] = key[0]
			c["icdo_morphology"] = key[1]
			c["id_type"] = key[2]

		elist = classif[key]
		
		log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(elist)))

		ids = c.create_list()
		flist = c.create_list()

		for e in elist:
			ids += [e["id"]]
			flist += [e["results_file"]]

		# Record where this combination came from
		c["source"] = src = c.create_element()
		src["type"] = types.CNV_ENRICHMENT
		src["ids"] = ids

		c["files"] = flist

		combination_port.write(c.to_native())

	em.close()
	es.close()
def main():
	"""Export mrna enrichment results into the biomart database.

	For each (id_type, enrichment id) pair received on the 'id' port,
	reads the results file and batch-inserts its rows into the
	exp_<infix>_trs table, resolving feature, icdo and experiment
	foreign keys through lookup maps loaded up front.
	"""
	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)
	
	log = task.logger()

	id_port = task.ports("id")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	cursor = conn.cursor()

	table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

	# per-infix map: feature name -> feature id
	feat_ids = {}

	for name in table_infixs:
		# genes are handled by a dedicated exporter, skip here
		if name == "gene":
			continue
			
		cursor.execute("""
			CREATE TABLE IF NOT EXISTS exp_{0}_trs (
			  {0}_id int(11) NOT NULL,
			  icdo_id int(11) NOT NULL,
			  exp_id int(11) NOT NULL,
			  upreg_total int(11) DEFAULT NULL,
			  upreg_observed double DEFAULT NULL,
			  upreg_expected double DEFAULT NULL,
			  upreg_stdev double DEFAULT NULL,
			  upreg_pvalue double DEFAULT NULL,
			  upreg_cpvalue double DEFAULT NULL,
			  downreg_total int(11) DEFAULT NULL,
			  downreg_observed double DEFAULT NULL,
			  downreg_expected double DEFAULT NULL,
			  downreg_stdev double DEFAULT NULL,
			  downreg_pvalue double DEFAULT NULL,
			  downreg_cpvalue double DEFAULT NULL,
			  PRIMARY KEY ({0}_id,icdo_id,exp_id),
			  KEY icdo (icdo_id,exp_id),
			  KEY exp (exp_id),
			  CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
			  CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
			  CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
			) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

		feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

	icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
	exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

	for id_type, eid in id_port:
		e = em.find(eid, types.MRNA_ENRICHMENT)
		if e is None:
			# BUG FIX: the original format string was "{} not found: {1}",
			# which mixes automatic and manual field numbering and raises
			# ValueError instead of logging.
			log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
			continue

		if "results_file" not in e:
			log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
			continue

		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e["icdo_morphology"]

		okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

		log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

		table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

		icdo_key = (icdo_topography, icdo_morphology)
		if icdo_key not in icdo:
			log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
			continue
		icdo_id = icdo[icdo_key]

		exp_key = (study_id, platform_id)
		if exp_key not in exp:
			log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
			continue
		exp_id = exp[exp_key]

		ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix),
				["{}_id".format(table_infix), "icdo_id", "exp_id",
						"upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
						"downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"], insert_size)

		results_repo, results_path = rs.from_url(e["results_file"])

		try:
			reader = results_repo.open_reader(results_path)
		except Exception as ex:
			log.exception(ex)
			ib.close()
			results_repo.close()
			continue
		
		# read header: map column name -> index
		hdr_map = {}
		hdr = reader.readline().rstrip().split("\t")
		for i, name in enumerate(hdr):
			hdr_map[name] = i

		try:
			col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
		except KeyError as ke:
			# renamed from 'e' so the entity variable is not clobbered
			log.warn("Column {} not found in results files, most probably because it is empty".format(ke.args[0]))
			reader.close()
			ib.close()
			results_repo.close()
			continue

		skipped_ids = set()

		fids = feat_ids[table_infix]

		# read data rows; first column is the feature name, the rest are
		# selected by the header-resolved column indices
		for line in reader:
			line = line.rstrip()
			data = line.split("\t")
			feat_name = data[0]
			data = [data[i] for i in col_indices]
			if feat_name not in fids:
				skipped_ids.add(feat_name)
				continue

			feat_id = fids[feat_name]
			
			ib.insert(feat_id, icdo_id, exp_id, *data)

		if len(skipped_ids) > 0:
			log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

		log.debug("{} results inserted".format(ib.count))

		ib.close()
		reader.close()

	# Release database resources (the original leaked cursor/conn;
	# the sibling exporter in this file closes them)
	cursor.close()
	conn.close()
	em.close()
	es.close()
	rs.close()
def run(task):
	"""Compute log2 ratio assays for tumour absolute-intensity assays.

	For each absi assay in the received tumour units, looks up the
	matching normal pool (by study, platform and normal-counterpart
	topography), computes per-row log2 ratios (value - pool value, both
	already in log2 space), writes them as a matrix to the data
	repository and persists/emits a MRNA_LOG2R entity id.
	"""
	
	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	log = task.logger()

	task.check_in_ports(["absi_tumour_unit_ids"])
	task.check_out_ports(["log2r_ids"])

	absi_tumour_unit_port = task.ports["absi_tumour_unit_ids"]
	log2r_port = task.ports["log2r_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)
	
	# Run
	
	# Index normal pools by study, platform, topography
	log.debug("Indexing normal pools by study, platform and topography ...")
	pools_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography"],
		types.MRNA_NORMAL_POOL, unique = True)

	# Index log2r assays by absi_id
	log.debug("Indexing log2r assays by absi assay ...")
	log2r_index = em.group_ids(
		["absi_id"],
		types.MRNA_LOG2R, unique = True)

	absi_tumour_unit_ids = absi_tumour_unit_port.read_all()
	
	log.info("Processing %i mrna absi tumour units ..." % len(absi_tumour_unit_ids))
	#log.debug("[%s]" % (", ".join(absi_tumour_unit_ids)))

	# For each abs intensity assay
	# The current pool and its data are cached across iterations and only
	# reloaded when the (study, platform, normal counterpart) key changes.
	pool = None
	pool_data = {}
	for absi in iter_tumour_absi(conf, em, absi_tumour_unit_ids, log):

		absi_id = absi["id"]

		# NOTE(review): this local 'rpath' shadows the 'rpath' path-helper
		# module used by other tasks in this file; here it is just the
		# assay data file path within its repository.
		rpath = os.path.join(absi["data_file/path"], absi["data_file/name"])
		
		icdo_topography = absi["icdo_topography"]
		normal_counterpart = absi.get("normal_counterpart", icdo_topography)
		if icdo_topography != normal_counterpart:
			keystr = "(%s, %s, %s --> %s)" % (absi["study_id"], absi["platform_id"], icdo_topography, normal_counterpart)
		else:
			keystr = "(%s, %s, %s)" % (absi["study_id"], absi["platform_id"], icdo_topography)

		# Reuse the existing log2r entity id for this absi assay if any
		exists = (absi_id,) in log2r_index
		if exists:
			log2r_id = log2r_index[(absi_id,)][0]
		else:
			log2r_id = str(uuid.uuid4())

		data_file_path = types.MRNA_LOG2R.replace(".", "/")
		data_file_name = log2r_id + ".tsv.gz"
		dst_path = os.path.join(data_file_path, data_file_name)

		if not overwrite and exists and data_repo.exists(dst_path):
			log.debug("Skipping calculation of log2r for tumour assay %s %s as it is already calculated" % (keystr, absi_id))
			log2r_port.write(log2r_id)
			continue

		log.info("Processing tumour assay %s %s from %s ..." % (keystr, absi_id, rpath))

		repo = rs.repository(absi["data_file/repo"])
		if not repo.exists(rpath):
			log.error("File not found: %s" % rpath)
			continue

		# Get normal counterpart data
		if pool is None \
			or absi["study_id"] != pool["study_id"] \
			or absi["platform_id"] != pool["platform_id"] \
			or normal_counterpart != pool["icdo_topography"]:

			pool_key = (absi["study_id"], absi["platform_id"], normal_counterpart)
			if pool_key not in pools_index:
				log.error("Normal pool not found for tumour assay (%s) %s {%s}" % (", ".join(pool_key), absi_id, absi.get("source_path", "")))
				continue

			pool_id = pools_index[pool_key][0]
			pool = em.find(pool_id, types.MRNA_NORMAL_POOL)
			if pool is None:
				log.error("Normal pool %s not found by the entity manager !" % pool_id)
				continue
			
			pool_data = read_pool_data(conf, rs, pool, log)
			if pool_data is None:
				# force a reload on the next iteration
				pool = None
				continue

		log.info("Using normal pool ({}) [{}]".format(", ".join(pool_key), pool_id))

		# Calculate log2 ratios
		mr = MatrixReader(repo.open_reader(rpath))
		header = mr.read_header()
		if len(header.columns) != 2:
			log.error("Unexpected number of columns: %i" % len(header.columns))
			mr.close()
			continue

		# counters for data-quality warnings, reported once per assay
		warn_count = {
			"id_not_in_pool" : 0,
			"value_is_nan" : 0,
			"pool_value_is_nan" : 0,
			"value_is_inf" : 0,
			"pool_value_is_inf" : 0}

		data = {}
		for row in mr:
			if row.name in data:
				log.error("Skipping tumour assay, duplicated row %s at file %s" % (row.name, rpath))
				break

			value = row.values[0]

			value_is_nan = numpy.isnan(value)

			if value_is_nan:
				warn_count["value_is_nan"] += 1
			elif numpy.isinf(value):
				warn_count["value_is_inf"] += 1

			# rows missing from the pool get NaN on both sides
			if row.name not in pool_data:
				pool_value = value = numpy.nan
				warn_count["id_not_in_pool"] += 1
			else:
				pool_value = pool_data[row.name]

			pool_value_is_nan = numpy.isnan(pool_value)
			if pool_value_is_nan:
				warn_count["pool_value_is_nan"] += 1
			elif numpy.isinf(pool_value):
				warn_count["pool_value_is_inf"] += 1

			if not value_is_nan and not pool_value_is_nan: # and value != 0.0 and pool_value != 0.0:
				log2r = value - pool_value
			else:
				log2r = numpy.nan

			# infinite ratios are dropped entirely (NaN rows are kept)
			if not numpy.isinf(log2r):
				data[row.name] = log2r
			#else:
			#	log.warn("row = %s, log2r = %f, value = %f, pool_value = %f" % (row.name, log2r, value, pool_value))

		mr.close()
		
		sb = ["{0}={1}".format(k, v) for k, v in warn_count.items() if v > 0]
		if len(sb) > 0:
			log.warn(", ".join(sb))

		# Save log2 ratios data and assay
		# NOTE(review): 'log2r' is reused here as the new entity name,
		# shadowing the float computed in the loop above.
		log2r = deepcopy(absi)

		log2r["id"] = log2r_id
		log2r["absi_id"] = absi_id
		log2r["normal_pool_id"] = pool["id"]

		log2r["data_file/repo"] = data_repo.name()
		log2r["data_file/path"] = data_file_path
		log2r["data_file/name"] = data_file_name

		msg = {True : "Overwritting", False : "Writting"}[exists]
		log.debug("%s log2 ratio data to %s ..." % (msg, dst_path))

		mw = MatrixWriter(data_repo.open_writer(dst_path))
		mw.write_header(["id", "value"])
		for name, value in sorted(data.items()):
			mw.write(name, [value])
		mw.close()

		em.persist(log2r, types.MRNA_LOG2R)
		log2r_port.write(log2r_id)

	em.close()
	es.close()

	data_repo.close()
	rs.close()
def main():
	"""Collect ids of all mrna/cnv result types and forward them, plus
	the accumulated (icdo, experiment) key sets, to the biomart export
	ports.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	icdo_port, exp_port = task.ports(["icdo", "experiment"])

	mrna_oncodrive_gene_port, mrna_enrichment_port, mrna_combination_port = \
		task.ports(["mrna_oncodrive_gene", "mrna_enrichment", "mrna_combination"])

	cnv_oncodrive_gene_port, cnv_enrichment_port, cnv_combination_port = \
		task.ports(["cnv_oncodrive_gene", "cnv_enrichment", "cnv_combination"])

	es = EntityServer(conf["entities"])
	em = es.manager()

	# Run

	# (study_id, platform_id) and (topography, morphology) keys are
	# accumulated across all extract() calls below
	exp = set()
	icdo = set()

	excludes = None
	if "biomart.excludes" in conf:
		excludes = conf["biomart.excludes"]

	# mrna oncodrive genes
	# NOTE(review): ("id") is a plain string, not a 1-tuple ("id",) --
	# the `for rid, in results` unpacking below implies extract() yields
	# tuples, so extract() presumably normalizes this; confirm.
	results = set()
	extract(log, em, types.MRNA_ONCODRIVE_GENES,
		(results, ("id")),
		(exp, ("study_id", "platform_id")),
		(icdo, ("icdo_topography", "icdo_morphology")),
		excludes = excludes)

	log.info("Sending {} ids ...".format(types.MRNA_ONCODRIVE_GENES))
	for rid, in results:
		mrna_oncodrive_gene_port.write(rid)

	# mrna enrichment
	results = set()
	extract(log, em, types.MRNA_ENRICHMENT,
		(results, ("id_type", "id")),
		(icdo, ("icdo_topography", "icdo_morphology")),
		excludes = excludes)

	log.info("Sending {} ids ...".format(types.MRNA_ENRICHMENT))
	for r in sorted(results):
		mrna_enrichment_port.write(r)

	# mrna combination
	results = set()
	extract(log, em, types.MRNA_COMBINATION,
		(results, ("id_type", "id")),
		(icdo, ("icdo_topography", "icdo_morphology")),
		excludes = excludes)

	log.info("Sending {} ids ...".format(types.MRNA_COMBINATION))
	for r in sorted(results):
		mrna_combination_port.write(r)

	# cnv oncodrive genes
	results = set()
	extract(log, em, types.CNV_ONCODRIVE_GENES,
		(results, ("id")),
		(exp, ("study_id", "platform_id")),
		(icdo, ("icdo_topography", "icdo_morphology")),
		excludes = excludes)

	log.info("Sending {} ids ...".format(types.CNV_ONCODRIVE_GENES))
	for rid, in results:
		cnv_oncodrive_gene_port.write(rid)

	# cnv enrichment
	results = set()
	extract(log, em, types.CNV_ENRICHMENT,
		(results, ("id_type", "id")),
		(icdo, ("icdo_topography", "icdo_morphology")),
		excludes = excludes)

	log.info("Sending {} ids ...".format(types.CNV_ENRICHMENT))
	for r in sorted(results):
		cnv_enrichment_port.write(r)

	# cnv combination
	results = set()
	extract(log, em, types.CNV_COMBINATION,
		(results, ("id_type", "id")),
		(icdo, ("icdo_topography", "icdo_morphology")),
		excludes = excludes)

	log.info("Sending {} ids ...".format(types.CNV_COMBINATION))
	for r in sorted(results):
		cnv_combination_port.write(r)

	# icdo

	log.info("Sending icdo's ...")
	for tm in icdo:
		icdo_port.write(tm)

	# exp

	log.info("Sending experiments ...")
	for e in exp:
		exp_port.write(e)

	em.close()
	es.close()
def run(task):
	"""Group mrna log2r assays into tumour units.

	Each assay is classified under one or more keys
	(study, platform, topography level, morphology); groups smaller
	than mrna.min_tumour_unit_size are skipped, the rest are persisted
	as MRNA_LOG2R_TUMOUR_UNIT entities and their ids emitted.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay"])
	conf = task.conf

	min_tumour_unit_size = conf.get("mrna.min_tumour_unit_size", 20, dtype=int)

	log = task.logger()

	task.check_in_ports(["log2r_ids"])
	task.check_out_ports(["log2r_tumour_unit_ids"])

	log2r_port = task.ports["log2r_ids"]
	log2r_tumour_unit_port = task.ports["log2r_tumour_unit_ids"]

	es = EntityServer(conf["entities"])
	em = es.manager()

	overwrite = conf.get("overwrite", False, dtype=bool)

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()
		
	# Run

	# Index existing units so reruns reuse the same entity ids
	log.info("Indexing available mrna log2r tumour units ...")
	log2r_tumour_unit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.MRNA_LOG2R_TUMOUR_UNIT, unique = True)

	# unit key -> list of member log2r entity ids
	units = {}
	for log2r_id in log2r_port:
		e = em.find(log2r_id, types.MRNA_LOG2R)
		if e is None:
			log.error("%s not found: %s" % (types.MRNA_LOG2R, log2r_id))
			continue

		eid = e["id"]
		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e.get("icdo_morphology", "")
		
		log.info("Classifying mrna log2r (%s, %s, %s, %s) [%s] ..." % (study_id, platform_id, icdo_topography, icdo_morphology, eid))
		
		keys = []
	
		# split the topography code into its coarse (level1) and fine
		# (level2) parts; assays are classified at both granularities
		m = _ICDO_TOPOGRAPHY_PAT.match(icdo_topography)
		if m is None:
			log.error("Wrong ICD-O Topography code: {0}".format(icdo_topography))
			continue
		else:
			level1 = m.group(1)
			level2 = m.group(2)

		if len(icdo_morphology) > 0:
			m = _ICDO_MORPHOLOGY_PAT.match(icdo_morphology)
			if m is None:
				log.error("Wrong ICD-O Morphology code: {0}".format(icdo_morphology))
				continue

		keys += [(study_id, platform_id, level1, "")]
		if len(icdo_morphology) > 0:
			keys += [(study_id, platform_id, level1, icdo_morphology)]
			#keys += [(study_id, platform_id, "", icdo_morphology)]
	
		if level2 is not None:
			keys += [(study_id, platform_id, icdo_topography, "")]
			if len(icdo_morphology) > 0:
				keys += [(study_id, platform_id, icdo_topography, icdo_morphology)]

		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				log.debug("\t(%s) [excluded]" % ", ".join(key))
				continue

			log.debug("\t(%s)" % ", ".join(key))
			
			if key not in units:
				units[key] = [eid]
			else:
				units[key] += [eid]

	log.info("Persisting %i mrna log2r tumour units ..." % len(units))
	log.debug("Minimum size = %i" % min_tumour_unit_size)

	# NOTE: dict.iteritems() -- this module targets Python 2
	for key, ids in sorted(units.iteritems()):
		
		size = len(ids)
		
		if size < min_tumour_unit_size:
			log.debug("\t(%s)\t%i assays [Skipped]" % (", ".join(key), size))
			continue
		else:
			log.debug("\t(%s)\t%i assays" % (", ".join(key), size))

		# reuse the existing unit entity unless overwriting
		if key in log2r_tumour_unit_index:
			uid = log2r_tumour_unit_index[key][0]
			if not overwrite:
				u = em.find(uid, types.MRNA_LOG2R_TUMOUR_UNIT)
			else:
				u = DataElement(key_sep = "/")
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")

		u["id"] = uid
		u["study_id"] = key[0]
		u["platform_id"] = key[1]
		u["icdo_topography"] = key[2]
		u["icdo_morphology"] = key[3]

		u["size"] = size
		u["mrna_log2r_ids"] = u.create_list(ids)
		
		em.persist(u, types.MRNA_LOG2R_TUMOUR_UNIT)
		log2r_tumour_unit_port.write(uid)
	
	em.close()
	es.close()
Example #11
0
def main():
	"""Export experiment (study x platform) rows into the biomart
	ent_experiment table, enriching each row with publication metadata
	from Pubmed or, failing that, from the study's own annotations.
	"""
	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	if "biomart.study_source" in conf:
		study_source_map = conf["biomart.study_source"]
	else:
		study_source_map = conf.create_element()

	log = task.logger()

	exp_port = task.ports("experiment")
	
	es = EntityServer(conf["entities"])
	em = es.manager()

	conn = biomart_db_connect(conf["biomart.db"], log)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	cursor = conn.cursor()

	cursor.execute("""
		CREATE TABLE ent_experiment (
		  id int(11) NOT NULL,
		  exp_name varchar(64) NOT NULL,
		  study_id varchar(32) NOT NULL,
		  study_source varchar(32) DEFAULT NULL,
		  study_source_url varchar(512) DEFAULT NULL,
		  study_link varchar(512) DEFAULT NULL,
		  pub_pubmed varchar(32) DEFAULT NULL,
		  pub_title varchar(300) DEFAULT NULL,
		  pub_authors varchar(300) DEFAULT NULL,
		  pub_year varchar(16) DEFAULT NULL,
		  pub_journal varchar(200) DEFAULT NULL,
		  platf_id varchar(32) NOT NULL,
		  platf_title varchar(250) DEFAULT NULL,
		  platf_technology varchar(96) DEFAULT NULL,
		  PRIMARY KEY (id),
		  KEY exp_name (exp_name),
		  KEY pub_pubmed (pub_pubmed),
		  KEY pub_title (pub_title),
		  KEY pub_authors (pub_authors),
		  KEY pub_year (pub_year),
		  KEY pub_journal (pub_journal),
		  KEY platf_title (platf_title),
		  KEY platf_technology (platf_technology)
		) ENGINE={} CHARACTER SET utf8 COLLATE utf8_general_ci""".format(db_engine))

	ib = BatchInsert(cursor, "ent_experiment",
			["id", "exp_name", "study_id", "study_source", "study_source_url", "study_link",
				"pub_title", "pub_authors", "pub_year", "pub_pubmed", "pub_journal",
				"platf_id", "platf_title", "platf_technology"], insert_size)

	pubmed = Pubmed()

	for i, exp in enumerate(exp_port, 1):
		study_id = exp[0]
		platform_id = exp[1]

		study = em.find(study_id, types.SOURCE_STUDY)
		if study is None:
			log.error("{} not found: {}".format(types.SOURCE_STUDY, study_id))
			continue

		platf = em.find(platform_id, types.SOURCE_PLATFORM)
		if platf is None:
			log.error("{} not found: {}".format(types.SOURCE_PLATFORM, platform_id))
			continue

		log.info("Experiment for study {} and platform {} ...".format(study_id, platform_id))

		# publication metadata defaults to all-None
		pub = {}
		for k in ["title", "short_authors", "date", "journal"]:
			pub[k] = None

		if "pubmed" in study:
			pmid = study["pubmed"]
			if isinstance(pmid, (DataElementList, list)):
				pmid = pmid[0]
				log.warn("Study {} with many pubmed_id's, only the first {} will be considered".format(study_id, pmid))

			log.debug("Retrieving information for pubmed_id '{}' ...".format(pmid))
			try:
				pub = pubmed.find(pmid)
				if len(pub) == 0:
					log.error("No publication information found for pubmed_id '{}' in experiment ({}, {})".format(pmid, study_id, platform_id))
				else:
					pub = pub[0]
			except Exception as ex:
				log.error("Error retrieving pubmed information for experiment ({}, {}) with pubmed_id '{}'".format(study_id, platform_id, pmid))
				log.exception(ex)
		else:
			# no pubmed id: fall back to the study's own annotations
			pmid = None
			log.warn("Study {} has no 'pubmed_id' annotation".format(study_id))

			if "title" not in study:
				log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'title'".format(study_id))
			elif "SO/contact_details[0]/contact_name" not in study \
					and "SO/contact_details/contact_name" not in study:
				log.error("Study {} doesn't have annotation for 'pubmed_id' nor 'SO.contact_details[0].contact_name'".format(study_id))
			else:
				try:
					pub["title"] = study["title"]

					if "SO/contact_details[0]/contact_name" in study:
						pub["short_authors"] = study["SO/contact_details[0]/contact_name"]
					else:
						pub["short_authors"] = study["SO/contact_details/contact_name"]

					if "SO/submission/pub_date" in study:
						pub["date"] = study["SO/submission/pub_date"]
					else:
						pub["date"] = ""
				except Exception as ex:
					log.debug(study)
					# BUG FIX: was log.execption(ex) -- a typo that would
					# raise AttributeError whenever this branch fired.
					log.exception(ex)

		# escape single quotes for the SQL insert
		for k, v in pub.items():
			if v is not None and isinstance(v, basestring):
				pub[k] = v.replace("'", r"\'")

		exp_name = "{}; {}".format(study_id, platform_id)

		study_source = None
		study_source_url = None
		study_link = None

		# derive source name/urls from the study id prefix (e.g. GEO-...)
		parts = study_id.split("-")
		if len(parts) >= 2 and parts[0] in study_source_map:
			ss = study_source_map[parts[0]]
			study_source = ss.get("name")
			study_source_url = ss.get("home_url")
			try:
				study_link = ss.get("link", "").format(parts[1])
			except Exception:
				# best-effort: a malformed link template leaves study_link
				# as None (narrowed from a bare except)
				pass

		ib.insert(i, exp_name, study_id, study_source, study_source_url, study_link,
			pub["title"], pub["short_authors"], pub["date"], pmid, pub["journal"],
			platform_id, platf["SO/platform_title"], "")

	log.debug("{} experiments inserted".format(ib.count))

	ib.close()
	cursor.close()
	conn.close()
	em.close()
	es.close()
def main():
	"""Map CNV event regions to Ensembl genes and join them per tumour unit.

	For every CNV events tumour unit, intersects each member's BED
	regions with the UCSC Ensembl gene mapping (via intersectBed),
	OR-combines the per-gene event values (1=gain, 2=loss presumably --
	TODO confirm), and writes a genes x events matrix padded with the
	background gene set.
	"""

	# Initialization

	task.check_conf(["entities", "repositories",
		"cnv.background.ensg", "cnv.mapping.ensg",
		"bin_paths.bed_tools"])

	conf = task.conf

	log = task.logger()

	evt_tunit_port, joined_evt_tunit_port = \
		task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")
	
	es = EntityServer(conf["entities"])
	em = es.manager()
	
	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	mapping_file = conf["cnv.mapping.ensg"]
	log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
	mapping_repo, mapping_path = rs.from_url(mapping_file)
	mapping_local_path = mapping_repo.get_local(mapping_path)

	background_file = conf["cnv.background.ensg"]
	log.info("Loading background from {} ...".format(background_file))

	# background: full set of gene labels every output matrix must cover
	background = set()
	repo, path = rs.from_url(background_file)
	reader = repo.open_reader(path)
	for line in reader:
		line = line.rstrip()
		if len(line) == 0:
			continue
		background.add(line)
	reader.close()
	repo.close()

	for uid in evt_tunit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

		tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
		tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

		if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
			log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
			joined_evt_tunit_port.write(uid)
			continue

		log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

		cnv_evt_ids = u["cnv_evt_ids"]
		log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

		# (event id, gene name) -> OR-combined event value
		data = {}
		
		tmp_path = mkdtemp(prefix = "evt_map_and_join_")
		log.debug("Temporary directory: {}".format(tmp_path))
		
		try:
			for eid in cnv_evt_ids:
				e = em.find(eid, types.CNV_EVENTS)
				if e is None:
					log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
					continue

				data_file = e["data_file"]

				log.debug("{} ...".format(data_file))

				repo, path = rs.from_url(data_file)

				local_path = repo.get_local(path)

				# Fix wrong bed files generated by gunes (end should be 1 indexed instead of 0 indexed)

#				tmp_file = os.path.join(tmp_path, "".join([eid, "-fixed-bed.tsv"]))

#				writer = FileWriter(tmp_file)
#				reader = repo.open_reader(path)
#				for line in reader:
#					if line.lstrip().startswith("#"):
#						continue
#					fields = line.rstrip().split("\t")
#					end = int(fields[2]) + 0 # FIXME fix not necessary already
#					fields[2] = str(end)
#					writer.write("\t".join(fields))
#					writer.write("\n")
#				writer.close()
#				reader.close()

				# Run BED tools to intersect event regions with gene names

				tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

				cmd = " ".join([
					os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
					"-a", mapping_local_path,
					#"-b", tmp_file,
					"-b", local_path,
					"-s -wb",
					">{}".format(tmp_file2)])

				log.debug(cmd)

				retcode = subprocess.call(args = cmd, shell = True)

				if retcode != 0:
					raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

				repo.close_local(local_path)

				# Read BED tools results and load event data into memory

				reader = FileReader(tmp_file2)

				# column 3 holds the gene name (from the mapping file),
				# column 12 the event value (from the events file)
				name_index = 3
				value_index = 12

				line_num = 1
				for line in reader:
					try:
						fields = line.rstrip().split("\t")
						name = fields[name_index]
						value = int(fields[value_index])
						if value not in [1, 2]:
							log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
							continue
					except:
						log.error("Error parsing line {} of data file {}".format(line_num, data_file))
						continue

					k = (eid, name)
					if k in data:
						prev_value = data[k]
					else:
						prev_value = 0

					# combine overlapping regions bitwise (1|2 == 3 when
					# both event kinds hit the same gene)
					data[k] = prev_value | value

					line_num += 1

				reader.close()
				repo.close()

		finally:
			# temporary intersect files are always cleaned up
			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		# Write events data to data file and merge with background labels

		log.info("Writing data to {} ...".format(tunit_path))

		u["data_file"] = data_repo.url(tunit_path)
		#TODO u["data_timestamp"] = ...

		writer = data_repo.open_writer(tunit_path)

		# header: one column per event id
		for name in cnv_evt_ids:
			writer.write("\t")
			writer.write(name)
		writer.write("\n")

		# data: one row per background gene, 0 where no event mapped
		for row_name in sorted(background):
			writer.write(row_name)
			for col_name in cnv_evt_ids:
				k = (col_name, row_name)
				if k in data:
					value = data[k]
				else:
					value = 0
				writer.write("\t")
				writer.write(str(value))
			writer.write("\n")

		writer.close()
		
		log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		joined_evt_tunit_port.write(uid)

	em.close()
	es.close()

	mapping_repo.close_local(mapping_local_path)
	mapping_repo.close()
	data_repo.close()
	rs.close()
def main():
    """Export mRNA oncodrive gene results into the BioMart database.

    For each MRNA_ONCODRIVE_GENES entity id received on the ``id`` port,
    reads the entity's results file and batch-inserts its up/down-regulation
    statistics into the ``exp_gene_trs`` table, resolving gene, ICDO and
    experiment natural keys into database ids.
    """

    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    # Number of rows per multi-row INSERT issued by BatchInsert
    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)

    cursor = conn.cursor()

    # Natural key -> database id lookup maps
    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute(
        """
		CREATE TABLE IF NOT EXISTS exp_gene_trs (
		  gene_id int(11) NOT NULL,
		  icdo_id int(11) NOT NULL,
		  exp_id int(11) NOT NULL,
		  upreg_total int(11) DEFAULT NULL,
		  upreg_observed double DEFAULT NULL,
		  upreg_expected double DEFAULT NULL,
		  upreg_stdev double DEFAULT NULL,
		  upreg_pvalue double DEFAULT NULL,
		  upreg_cpvalue double DEFAULT NULL,
		  downreg_total int(11) DEFAULT NULL,
		  downreg_observed double DEFAULT NULL,
		  downreg_expected double DEFAULT NULL,
		  downreg_stdev double DEFAULT NULL,
		  downreg_pvalue double DEFAULT NULL,
		  downreg_cpvalue double DEFAULT NULL,
		  PRIMARY KEY (gene_id,icdo_id,exp_id),
		  KEY icdo (icdo_id,exp_id),
		  KEY exp (exp_id),
		  CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
		  CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
		  CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(
            db_engine
        )
    )

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # Rows inserted since the last UNLOCK/OPTIMIZE cycle
    lock_count = 0

    for eid in oncodrive_port:
        e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
        if e is None:
            log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        if "results_file" not in e:
            log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
            continue

        study_id = e["study_id"]
        platform_id = e["platform_id"]
        icdo_topography = e["icdo_topography"]
        icdo_morphology = e["icdo_morphology"]

        okey = (study_id, platform_id, icdo_topography, icdo_morphology)

        log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

        # Resolve the ICDO (topography, morphology) pair to its database id
        icdo_key = (icdo_topography, icdo_morphology)
        if icdo_key not in icdo:
            log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
            continue
        icdo_id = icdo[icdo_key]

        # Resolve the experiment (study, platform) pair to its database id
        exp_key = (study_id, platform_id)
        if exp_key not in exp:
            log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
            continue
        exp_id = exp[exp_key]

        ib = BatchInsert(
            cursor,
            "exp_gene_trs",
            [
                "gene_id",
                "icdo_id",
                "exp_id",
                "upreg_total",
                "upreg_observed",
                "upreg_expected",
                "upreg_stdev",
                "upreg_pvalue",
                "upreg_cpvalue",
                "downreg_total",
                "downreg_observed",
                "downreg_expected",
                "downreg_stdev",
                "downreg_pvalue",
                "downreg_cpvalue",
            ],
            insert_size,
        )

        results_repo, results_path = rs.from_url(e["results_file"])

        try:
            reader = results_repo.open_reader(results_path)
        except Exception as ex:
            log.exception(ex)
            ib.close()
            results_repo.close()
            continue

        # Read the header and map column name -> column index
        hdr_map = {}
        hdr = reader.readline().rstrip().split("\t")
        for i, name in enumerate(hdr):
            hdr_map[name] = i

        try:
            col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
        except KeyError as ex:  # renamed from `e`: it shadowed the entity above
            log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
            reader.close()
            lock_count += ib.count
            ib.close()
            results_repo.close()
            continue

        skipped_genes = set()

        # Read the data rows; genes unknown to the database are skipped
        for line in reader:
            line = line.rstrip()
            data = line.split("\t")
            gene_name = data[0]
            data = [data[i] for i in col_indices]
            if gene_name not in gene:
                skipped_genes.add(gene_name)
                continue

            gene_id = gene[gene_name]

            ib.insert(gene_id, icdo_id, exp_id, *data)

        if len(skipped_genes) > 0:
            log.warn("There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes)))

        log.debug("{} gene results inserted".format(ib.count))

        lock_count += ib.count

        ib.close()
        reader.close()
        # fix: the repository was leaked on the success path (only the two
        # error paths above closed it)
        results_repo.close()

        # Periodically unlock and optimize to keep the table healthy
        if lock_count >= 1000000:
            cursor.execute("UNLOCK TABLES")
            cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
            cursor.execute("LOCK TABLES exp_gene_trs WRITE")
            lock_count = 0

    cursor.execute("UNLOCK TABLES")
    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
    cursor.close()

    em.close()
    es.close()
    rs.close()
def main():
	"""Import CNV assays and group them into tumour units.

	Scans every SOURCE_ASSAY entity, validates it together with its sample,
	persists the valid ones as CNV_EVENTS entities and groups them into
	CNV_EVENTS_TUMOUR_UNIT entities of at least `cnv.min_tumour_unit_size`
	assays, emitting the ids of both through the task ports.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay",
				"cnv.min_tumour_unit_size"])

	conf = task.conf

	log = task.logger()

	study_ids_port, evt_port, evt_tunit_port = \
		task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	source_repo = rs.repository("source")

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()

	# Run

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
	evt_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS, unique = True)

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	evt_tunit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}		# (study_id, platform_id) -> count
	skipped_assay_count = {}	# (study_id, design, data_type, study_type) -> count
	wrong_assays = {}			# study_id -> [assay_id, ...]
	wrong_samples = {}			# study_id -> [sample_id, ...]
	tumour_units = {}			# unit key -> [event id, ...]
	evt_dup = {}				# event key -> event id, for duplicate detection

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])

		assay_source_path = assay.get("source_path", "")

		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")
			doc_path = assay.get("__doc_path", "UNKNOWN")

			log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay {} not included in 'study_ids'".format(assay_id))
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]

		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		source_path = assay["source_path"]
		source_file = assay["assay_property/filename"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path"])

		e["data_file"] = source_repo.url("assay", source_path, source_file)

		# Only genomic cancer-vs-normal binary assays are imported
		included = study_id in study_ids and study_type == "genomic"
		included &= (assay_design == "cancer_vs_normal" and data_type == "binary")

		if not included:
			if study_type != "transcriptomic" and study_id in study_ids:
				s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_source_path = sample.get("source_path", "")
			log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			"source_path",
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])

		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		orig_disease_state = disease_state
		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour"]:
			# fix: `sample_source_path` was only bound inside the missing-fields
			# branch above, so this line raised NameError for valid samples
			log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample.get("source_path", ""), orig_disease_state))
			continue

		e["disease_state"] = disease_state

		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo, rel_path = rs.from_url(e["data_file"])

		if not repo.exists(rel_path):
			log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# Detect duplicated events and reuse ids of previously persisted ones
		eid = None
		duplicated = False
		exists = False
		if e_key in evt_dup:
			duplicated = True
		elif e_key in evt_index:
			eid = evt_index[e_key][0]
			exists = True

		if duplicated:
			log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		if eid is None:
			eid = str(uuid.uuid4())

		e["id"] = eid

		# Classify the event into its tumour unit(s); the unit key uses the
		# normal counterpart topography when available
		u_key = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]), e.get("icdo_morphology", ""))
		keys = classify_by_experiment_and_icdo(
					u_key[0], u_key[1], u_key[2], u_key[3])
		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				continue
			map_list_add(tumour_units, key, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))
		em.persist(e, types.CNV_EVENTS)
		evt_port.write(eid)
		evt_dup[e_key] = eid

	min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

	log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	log.debug("Minimum size = {}".format(min_tumour_unit_size))

	for key in sorted(tumour_units):
		v = tumour_units[key]
		size = len(v)
		if size < min_tumour_unit_size:
			discard = True
			discard_text = "[skipped]"
		else:
			discard = False
			discard_text = ""

		if key in evt_tunit_index:
			# Update an existing tumour unit
			uid = evt_tunit_index[key][0]
			u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
			if u is None:
				log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
				continue

			arrow_text = "==>"
		else:
			# Create a new tumour unit
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")
			u["id"] = uid
			u["study_id"] = key[0]
			u["platform_id"] = key[1]
			u["icdo_topography"] = key[2]
			u["icdo_morphology"] = key[3]

			arrow_text = "-->"

		log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

		if discard:
			continue

		u["size"] = len(v)
		u["cnv_evt_ids"] = u.create_list(v)

		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		evt_tunit_port.write(uid)

	sb = ["Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids))]
	log.info("".join(sb))

	log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))

	log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))

	log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

	em.close()
	es.close()
	# fix: release the repository resources (every sibling task closes them)
	source_repo.close()
	rs.close()
Example #15
0
def main():
	"""Run the combination of CNV oncodrive results.

	Reads combination descriptors from the `combinations` port, combines
	their oncodrive result files with gitools and persists the resulting
	CNV_COMBINATION entities, emitting their ids downstream.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.data", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	combinations_port, combination_ids_port = \
		task.ports("combinations", "combination_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	results_base_path = types.CNV_COMBINATION.replace(".", "/")

	conditions = ("gain", "loss")

	for native_comb in combinations_port:
		comb = DataFactory.from_native(native_comb, key_sep = "/")

		comb_id = comb["id"]
		comb_key = (comb["icdo_topography"], comb["icdo_morphology"], comb["id_type"])

		log.info("Processing combination for ({}) [{}] ...".format(", ".join(comb_key), comb_id))

		out_path = rpath.join(results_base_path, comb_id + ".tsv.gz")
		out_url = data_repo.url(out_path)

		# Honour previously generated results unless overwriting was requested
		if skip_file(overwrite, data_repo, out_path, comb.get("results_file")):
			log.warn("Skipping {} ({}) [{}] as it already exists".format(types.CNV_COMBINATION, ", ".join(comb_key), comb_id))
			combination_ids_port.write(comb_id)
			continue

		comb["results_file"] = out_url

		combination(log, conf, rs, comb, data_repo, out_path, conditions)

		# Persist the combined results and notify downstream
		em.persist(comb, types.CNV_COMBINATION)
		combination_ids_port.write(comb_id)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
def main():
	"""Group CNV oncodrive results into combination descriptors.

	Classifies every CNV_ONCODRIVE_GENES result by
	(icdo_topography, icdo_morphology, id type) and emits one combination
	descriptor per group through the `combinations` port, reusing the id of
	an existing CNV_COMBINATION entity when one matches the group key.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	oncodrive_port, combination_port = \
		task.ports("oncodrive_ids", "combinations")

	es = EntityServer(conf["entities"])
	em = es.manager()

	log.info("Indexing available {} results ...".format(types.CNV_COMBINATION))
	comb_results_index = em.group_ids(
		["icdo_topography", "icdo_morphology", "id_type"],
		types.CNV_COMBINATION, unique = True)

	ENSEMBL_GENE = "ensembl:gene"

	# (icdo_topography, icdo_morphology, id_type) -> [oncodrive result, ...]
	classif = {}

	log.info("Classifying oncodrive results ...")

	for oid in oncodrive_port:
		o = em.find(oid, types.CNV_ONCODRIVE_GENES)
		if o is None:
			log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, oid))
			continue

		okey = (o["study_id"], o["platform_id"], o["icdo_topography"], o["icdo_morphology"])

		key = (o["icdo_topography"], o["icdo_morphology"], ENSEMBL_GENE)

		log.debug("Oncodrive results ({}) [{}] classified into ({}) ...".format(", ".join(okey), oid, ", ".join(key)))

		if key in classif:
			classif[key] += [o]
		else:
			classif[key] = [o]

	log.info("Preparing combinations ...")

	for key in sorted(classif):
		if key in comb_results_index:
			# Reuse the already-persisted combination for this key
			cid = comb_results_index[key][0]
			c = em.find(cid, types.CNV_COMBINATION)
			if c is None:
				log.error("{} not found: {}".format(types.CNV_COMBINATION, cid))
				# fix: skip only this group; the previous `return` silently
				# aborted all remaining groups and leaked em/es
				continue
		else:
			# Create a fresh combination descriptor
			c = DataElement(key_sep = "/")
			c["id"] = cid = str(uuid.uuid4())
			c["icdo_topography"] = key[0]
			c["icdo_morphology"] = key[1]

		c["id_type"] = ENSEMBL_GENE

		olist = classif[key]

		log.info("({}) [{}] --> {} results".format(", ".join(key), cid, len(olist)))

		ids = c.create_list()
		flist = c.create_list()

		for o in olist:
			ids += [o["id"]]
			flist += [o["results_file"]]

		# Record the provenance of the combination
		c["source"] = src = c.create_element()
		src["type"] = types.CNV_ONCODRIVE_GENES
		src["ids"] = ids

		c["files"] = flist

		combination_port.write(c.to_native())

	em.close()
	es.close()
Example #17
0
def _load_code_map(rs, url):
	"""Fetch an ICDO code/description file from a repository and parse it.

	Returns the mapping produced by `map_from_file` for a local copy of the
	file referenced by `url`; the local copy and the repository are always
	released.
	"""
	repo, path = rs.from_url(url)
	local_path = repo.get_local(path)
	try:
		return map_from_file(local_path)
	finally:
		# fix: release the *local* copy returned by get_local(); the original
		# passed the repository path to close_local()
		repo.close_local(local_path)
		repo.close()

def main():
	"""Export ICDO topography/morphology terms into the BioMart `ent_icdo` table."""

	task.check_conf(["entities", "repositories", "biomart.db",
		"biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
	conf = task.conf

	# Number of rows per multi-row INSERT issued by BatchInsert
	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	log = task.logger()

	icdo_port = task.ports("icdo")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	log.info("Loading topography codes from {} ...".format(conf["biomart.files.icdo_topography"]))
	icdo_topography = _load_code_map(rs, conf["biomart.files.icdo_topography"])

	log.info("Loading morphology codes from {} ...".format(conf["biomart.files.icdo_morphology"]))
	icdo_morphology = _load_code_map(rs, conf["biomart.files.icdo_morphology"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	cursor = conn.cursor()

	cursor.execute("""
		CREATE TABLE  ent_icdo (
		  id int(11) NOT NULL,
		  icdo_name varchar(512) NOT NULL DEFAULT '',
		  icdo_topography varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology varchar(255) NOT NULL DEFAULT '',
		  icdo_topography_code varchar(24) NOT NULL DEFAULT '',
		  icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
		  icdo_topography_name varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
		  PRIMARY KEY (id),
		  KEY icdo_name (icdo_name),
		  KEY icdo_tm (icdo_topography,icdo_morphology),
		  KEY icdo_m (icdo_morphology),
		  KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
		  KEY icdo_m_c (icdo_morphology_code)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine))

	ib = BatchInsert(cursor, "ent_icdo",
			["id", "icdo_name", "icdo_topography", "icdo_topography_code", "icdo_topography_name",
				"icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"], insert_size)

	for i, tm in enumerate(icdo_port, 1):
		# Resolve topography code -> (name, human-readable description)
		t_code = tm[0]
		if t_code == "":
			t_name = t_desc = "ANY topography"
		elif t_code not in icdo_topography:
			log.error("Unknown topography description for code {}".format(t_code))
			t_name = ""
			t_desc = "[{}]".format(t_code)
		else:
			t_name = icdo_topography[t_code]
			t_desc = "{} [{}]".format(t_name, t_code)

		# Resolve morphology code -> (name, human-readable description)
		m_code = tm[1]
		if m_code == "":
			m_name = m_desc = "ANY morphology"
		elif m_code not in icdo_morphology:
			log.error("Unknown morphology description for code {}".format(m_code))
			m_name = ""
			m_desc = "[{}]".format(m_code)
		else:
			m_name = icdo_morphology[m_code]
			m_desc = "{} [{}]".format(m_name, m_code)

		name = "; ".join((t_desc, m_desc))

		log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

		ib.insert(i, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

	log.debug("{} ICDO terms inserted".format(ib.count))

	ib.close()
	cursor.close()
	conn.close()
	em.close()
	es.close()
	rs.close()