def main():
	"""Run the R script once for every entity id read from the input port.

	Each id read from the ``mrna_normal_pool`` port is looked up as a
	``types.MRNA_LOG2R_TUMOUR_UNIT`` entity. Its ``data_file`` is fetched to a
	local path and passed, together with the results base path and the id, to
	the R interpreter configured in ``bin_paths.R``.

	Raises:
		Exception: if the R script exits with a non-zero return code.
	"""

	# Initialization

	task.check_conf(["entities"])
	conf = task.conf

	log = task.logger()

	id_port = task.ports("mrna_normal_pool")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	results_base_path = "reports/" + types.CNV_COMBINATION.replace(".", "/")

	# Run

	# The loop variable is named ``oid`` (not ``id``) so that it matches the
	# lookup below and does not shadow the ``id`` builtin.
	for oid in id_port:
		e = em.find(oid, types.MRNA_LOG2R_TUMOUR_UNIT)
		if e is None:
			log.error("{} not found: {}".format(types.MRNA_LOG2R_TUMOUR_UNIT, oid))
			continue

		repo, data_path = rs.from_url(e["data_file"])
		data_local_path = repo.get_local(data_path)

		try:
			# NOTE(review): ``script`` is not defined in this function; it is
			# expected to be provided at module level.
			cmd = " ".join([conf["bin_paths.R"],
				"--vanilla --slave -f", script,
				"--args", results_base_path, oid, data_local_path])

			log.debug(cmd)

			retcode = subprocess.call(args = cmd, shell = True)

			if retcode != 0:
				raise Exception("R script failed")
		finally:
			# Release the local copy and the repository even when the script fails
			repo.close_local(data_local_path)
			repo.close()

	em.close()
	es.close()
# Example #2
# 0
def main():
	"""Export mRNA enrichment results into the biomart ``exp_<infix>_trs`` tables.

	For every table infix in ``ID_TYPE_TO_TABLE_INFIX`` except ``"gene"`` the
	results table is created if it does not exist and the lookup used to
	translate feature names into ids is loaded. Then, for each
	``(id_type, eid)`` pair read from the ``id`` port, the
	``types.MRNA_ENRICHMENT`` entity is looked up, its results file is read and
	the columns listed in ``__COLUMN_NAMES`` are batch-inserted for every
	feature name found in the lookup. Entities that cannot be exported are
	logged and skipped.
	"""

	task.check_conf(["entities", "repositories", "biomart.db"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	log = task.logger()

	id_port = task.ports("id")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	conn = biomart_db_connect(conf["biomart.db"], log)

	cursor = conn.cursor()

	table_infixs = set(ID_TYPE_TO_TABLE_INFIX.values())

	feat_ids = {}

	for name in table_infixs:
		# The "gene" table is not created or indexed by this task
		if name == "gene":
			continue

		cursor.execute("""
			CREATE TABLE IF NOT EXISTS exp_{0}_trs (
			  {0}_id int(11) NOT NULL,
			  icdo_id int(11) NOT NULL,
			  exp_id int(11) NOT NULL,
			  upreg_total int(11) DEFAULT NULL,
			  upreg_observed double DEFAULT NULL,
			  upreg_expected double DEFAULT NULL,
			  upreg_stdev double DEFAULT NULL,
			  upreg_pvalue double DEFAULT NULL,
			  upreg_cpvalue double DEFAULT NULL,
			  downreg_total int(11) DEFAULT NULL,
			  downreg_observed double DEFAULT NULL,
			  downreg_expected double DEFAULT NULL,
			  downreg_stdev double DEFAULT NULL,
			  downreg_pvalue double DEFAULT NULL,
			  downreg_cpvalue double DEFAULT NULL,
			  PRIMARY KEY ({0}_id,icdo_id,exp_id),
			  KEY icdo (icdo_id,exp_id),
			  KEY exp (exp_id),
			  CONSTRAINT exp_{0}_trs_{0}_id FOREIGN KEY ({0}_id) REFERENCES ent_{0} ({0}_id),
			  CONSTRAINT exp_{0}_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
			  CONSTRAINT exp_{0}_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
			) ENGINE={1} DEFAULT CHARSET=latin1""".format(name, db_engine))

		feat_ids[name] = map_from_select(cursor, "SELECT {0}_id, {0}_name FROM ent_{0}".format(name))

	icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
	exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

	for id_type, eid in id_port:
		e = em.find(eid, types.MRNA_ENRICHMENT)
		if e is None:
			# Only automatic field numbering here: mixing "{}" with "{1}"
			# makes str.format raise ValueError.
			log.error("{} not found: {}".format(types.MRNA_ENRICHMENT, eid))
			continue

		if "results_file" not in e:
			log.error("{} [{}] without results file.".format(types.MRNA_ENRICHMENT, eid))
			continue

		study_id = e["study_id"]
		platform_id = e["platform_id"]
		icdo_topography = e["icdo_topography"]
		icdo_morphology = e["icdo_morphology"]

		okey = (study_id, platform_id, icdo_topography, icdo_morphology, id_type)

		log.info("Exporting enrichment results ({}) [{}] ...".format(", ".join(okey), eid))

		table_infix = ID_TYPE_TO_TABLE_INFIX[id_type]

		icdo_key = (icdo_topography, icdo_morphology)
		if icdo_key not in icdo:
			log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
			continue
		icdo_id = icdo[icdo_key]

		exp_key = (study_id, platform_id)
		if exp_key not in exp:
			log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
			continue
		exp_id = exp[exp_key]

		ib = BatchInsert(cursor, "exp_{}_trs".format(table_infix),
				["{}_id".format(table_infix), "icdo_id", "exp_id",
						"upreg_total", "upreg_observed", "upreg_expected", "upreg_stdev", "upreg_pvalue", "upreg_cpvalue",
						"downreg_total", "downreg_observed", "downreg_expected", "downreg_stdev", "downreg_pvalue", "downreg_cpvalue"], insert_size)

		results_repo, results_path = rs.from_url(e["results_file"])

		try:
			reader = results_repo.open_reader(results_path)
		except Exception as ex:
			log.exception(ex)
			ib.close()
			results_repo.close()
			continue

		try:
			# read header
			hdr = reader.readline().rstrip().split("\t")
			hdr_map = {col_name: i for i, col_name in enumerate(hdr)}

			try:
				col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
			except KeyError as ex:
				# A distinct name keeps the entity bound to ``e`` intact
				log.warn("Column {} not found in results files, most probably because it is empty".format(ex.args[0]))
				continue

			skipped_ids = set()

			fids = feat_ids[table_infix]

			# read data
			for line in reader:
				line = line.rstrip()
				data = line.split("\t")
				feat_name = data[0]
				data = [data[i] for i in col_indices]
				if feat_name not in fids:
					skipped_ids.add(feat_name)
					continue

				feat_id = fids[feat_name]

				ib.insert(feat_id, icdo_id, exp_id, *data)

			if len(skipped_ids) > 0:
				log.warn("There were {} feature names not found:\n{}".format(len(skipped_ids), ",".join(skipped_ids)))

			log.debug("{} results inserted".format(ib.count))
		finally:
			# Runs on success, on the ``continue`` above and on errors, so the
			# batch insert, reader and repository are always released.
			ib.close()
			reader.close()
			results_repo.close()

	cursor.close()

	em.close()
	es.close()
	rs.close()
def run(task):
    """Map probe-level oncodrive results to genes for every id on the input port.

    For each id read from the ``oncodrive_ids`` port the
    ``types.MRNA_ONCODRIVE_PROBES`` entity is loaded and a matching
    ``types.MRNA_ONCODRIVE_GENES`` entity is reused or created. The platform
    (or, for ``genbank_accession`` platforms without a usable map, the
    vplatform) mapping file is located, the external ``matrix_map`` tool is run
    and its output is merged with ``merge``. On success the gene-level entity
    is persisted and its id is written to ``mapped_oncodrive_ids``; problems
    with a single id are logged and that id is skipped.

    Args:
        task: workflow task object providing ``conf``, ``logger()``, ``ports``
            and the ``check_*`` helpers used below.
    """

    # Initialization

    task.check_conf(
        [
            "entities",
            "repositories",
            "repositories.data",
            "repositories.source",
            "bin_paths.python",
            "bin_paths.matrix_map",
        ]
    )
    conf = task.conf

    log = task.logger()

    task.check_in_ports(["oncodrive_ids"])
    task.check_out_ports(["mapped_oncodrive_ids"])

    oncodrive_port = task.ports["oncodrive_ids"]
    mapped_oncodrive_port = task.ports["mapped_oncodrive_ids"]

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    data_repo = rs.repository("data")
    source_repo = rs.repository("source")

    overwrite = conf.get("overwrite", False, dtype=bool)

    platform_base_path = "platform"
    vplatform_base_path = "vplatform"

    results_base_path = types.MRNA_ONCODRIVE_GENES.replace(".", "/")

    log.info("Indexing available oncodrive results for genes ...")
    oncodrive_results_index = em.group_ids(
        ["study_id", "platform_id", "icdo_topography", "icdo_morphology"], types.MRNA_ONCODRIVE_GENES, unique=True
    )

    for oid in oncodrive_port:
        o = em.find(oid, types.MRNA_ONCODRIVE_PROBES)
        if o is None:
            log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_PROBES, oid))
            continue

        study_id = o["study_id"]
        platform_id = o["platform_id"]
        key = (study_id, platform_id, o["icdo_topography"], o["icdo_morphology"])

        # Reuse the already indexed gene-level entity or derive a new one
        if key in oncodrive_results_index:
            mid = oncodrive_results_index[key][0]
            m = em.find(mid, types.MRNA_ONCODRIVE_GENES)
            if m is None:
                log.error("{0} not found: {1}".format(types.MRNA_ONCODRIVE_GENES, mid))
                continue
        else:
            m = o.transform(
                [
                    "study_id",
                    "platform_id",
                    "icdo_topography",
                    "icdo_morphology",
                    "log2r_tumour_unit_id",
                    ("oncodrive_probes_id", "id"),
                ]
            )
            m["id"] = mid = str(uuid.uuid4())

        # mapped oncodrive results

        results_path = rpath.join(results_base_path, mid + ".tsv.gz")
        gitools_results_path = rpath.join(results_base_path, mid + ".tdm.gz")

        if skip_file(overwrite, data_repo, results_path, m.get("results_file")):
            log.warn("Skipping ({0}) [{1}] as it already exists".format(", ".join(key), mid))
            mapped_oncodrive_port.write(mid)
            continue

        log.info("Mapping oncodriver results ({0}) [{1}] ...".format(", ".join(key), oid))

        # determine the mapping file
        map_file = None
        p = em.find(platform_id, types.SOURCE_PLATFORM)
        if p is None:
            log.error("{0} not found: {1}".format(types.SOURCE_PLATFORM, platform_id))
            continue

        platform_id_type = p.get("SO/platform_id_type")
        if platform_id_type is None:
            log.error("Undefined annotation 'SO/platform_id_type' for platform '{0}'.".format(platform_id))
            continue

        if platform_id_type != "genbank_accession":  # affy_accession, custom, ...
            missing = p.missing_fields(["ensg_map", "ensg_map/file"])
            if len(missing) > 0:
                log.error("Missing required fields for platform '{0}': {1}".format(platform_id, ", ".join(missing)))
                continue
            map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if not source_repo.exists(map_file):
                log.error("Mapping file not found for platform '{0}': {1}".format(platform_id, map_file))
                continue
        else:
            # genbank_accession: use the platform map when it is declared and
            # present, otherwise fall back to the study-specific vplatform map.
            if len(p.missing_fields(["ensg_map", "ensg_map/file"])) > 0:
                map_file = None
            else:
                map_file = rpath.join(platform_base_path, p.get("ensg_map/path", ""), p["ensg_map/file"])
            if map_file is None or not source_repo.exists(map_file):
                vpid = "-".join([platform_id, study_id])
                vp = em.find(vpid, types.SOURCE_VPLATFORM)
                if vp is None:
                    log.error("{0} not found: {1}".format(types.SOURCE_VPLATFORM, vpid))
                    continue
                missing = vp.missing_fields(["ensg_map", "ensg_map/path", "ensg_map/file"])
                if len(missing) > 0:
                    log.error("Missing required fields for vplatform '{0}': {1}".format(vpid, ", ".join(missing)))
                    continue
                map_file = rpath.join(vplatform_base_path, vp["ensg_map/path"], vp["ensg_map/file"])
                if not source_repo.exists(map_file):
                    log.error(
                        "Mapping file not found for vplatform ({0}, {1}): {2}".format(platform_id, study_id, map_file)
                    )
                    continue

        log.debug("Mapping file: {0}".format(map_file))

        m["platform_map_file"] = source_repo.url(map_file)

        # oncodrive results file
        repo, repo_path = rs.from_url(o["results_file"])
        local_path = repo.get_local(repo_path)

        # mapped oncodrive results
        m["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)
        # NOTE(review): this local path is never passed to put_local/close_local
        # below, unlike the results and mapping paths -- confirm this is intended.
        gitools_results_local_path = data_repo.create_local(gitools_results_path)

        mapping_path = rpath.join(results_base_path, mid + ".mapping.tsv.gz")
        m["mapping_file"] = data_repo.url(mapping_path)
        mapping_local_path = data_repo.create_local(mapping_path)

        # mkstemp returns an open OS-level descriptor together with the path;
        # only the path is needed, so close the descriptor right away.
        map_results_fd, map_results_file = tempfile.mkstemp(prefix="mrna_oncodrive_map_", suffix=".tsv")
        os.close(map_results_fd)

        # Stays None if fetching the mapping file fails, so that the cleanup
        # below does not reference an unbound name.
        local_map_file = None

        try:
            # run the mapping tool
            local_map_file = source_repo.get_local(map_file)

            log.debug("Mapping {0} to {1} ...".format(repo_path, map_results_file))

            cmd = " ".join(
                [
                    conf["bin_paths.python"],
                    conf["bin_paths.matrix_map"],
                    "-o",
                    map_results_file,
                    "-i",
                    mapping_local_path,
                    local_path,
                    local_map_file,
                ]
            )

            log.debug(cmd)

            retcode = subprocess.call(args=cmd, shell=True)

            if retcode != 0:
                raise Exception("There was an error mapping the results")

            # merge repeated ids

            log.debug("Merging {0} to {1} ...".format(map_results_file, results_path))
            log.debug("Gitools file: {0}".format(gitools_results_path))

            upreg_count, downreg_count = merge(log, map_results_file, results_local_path, gitools_results_local_path)
            if upreg_count == 0 and downreg_count == 0:
                log.error(
                    "The results of the mapping for ({0}) are empty. This could be because the annotated platform or the mapping file is wrong.".format(
                        ", ".join(key)
                    )
                )

            # close local paths
            data_repo.put_local(results_local_path)
            data_repo.put_local(mapping_local_path)

        except Exception as ex:
            log.exception(ex)

            data_repo.close_local(results_local_path)
            data_repo.close_local(mapping_local_path)
            continue

        finally:
            os.remove(map_results_file)
            repo.close_local(local_path)
            if local_map_file is not None:
                source_repo.close_local(local_map_file)

        # save mapped results
        em.persist(m, types.MRNA_ONCODRIVE_GENES)
        mapped_oncodrive_port.write(mid)

    em.close()
    es.close()
    data_repo.close()
    source_repo.close()
    rs.close()
def main():
	"""Compute CNV oncodrive gene results (gain and loss) for each tumour unit.

	For every id read from the ``evt_tumour_unit_ids`` port the
	``types.CNV_EVENTS_TUMOUR_UNIT`` entity is loaded and a
	``types.CNV_ONCODRIVE_GENES`` entity is reused or created. The tumour unit
	matrix is filtered with ``mask_filtering`` (mask 1 for gain, mask 2 for
	loss), ``run_oncodrive`` is run on each filtered matrix, and both results
	are joined and written to the data repository. The entity is then
	persisted and its id written to ``oncodrive_results_ids``.

	A failure in either calculation is logged and re-raised after the local
	matrix copy and the temporary directory have been released.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "bin_paths.gitools"])
	conf = task.conf

	log = task.logger()

	evt_tumour_unit_port, oncodrive_results_port = \
		task.ports("evt_tumour_unit_ids", "oncodrive_results_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	log.info("Indexing available {} ...".format(types.CNV_ONCODRIVE_GENES))
	oncodrive_results_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_ONCODRIVE_GENES, unique = True)

	results_base_path = types.CNV_ONCODRIVE_GENES.replace(".", "/")

	for uid in evt_tumour_unit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u["icdo_morphology"])
		if key in oncodrive_results_index:
			eid = oncodrive_results_index[key][0]
			e = em.find(eid, types.CNV_ONCODRIVE_GENES)
			if e is None:
				log.error("{} not found: {}".format(types.CNV_ONCODRIVE_GENES, eid))
				continue
		else:
			e = u.transform(["study_id", "platform_id", "icdo_topography", "icdo_morphology"])
			eid = e["id"] = str(uuid.uuid4())

		# create oncodrive results entity
		e["evt_tumour_unit_id"] = uid

		results_path = rpath.join(results_base_path, eid + ".tsv.gz")

		if skip_file(overwrite, data_repo, results_path, e.get("results_file")):
			log.warn("Skipping ({}) [{}] as it already exists".format(", ".join(key), eid))
			oncodrive_results_port.write(eid)
			continue

		e["results_file"] = data_repo.url(results_path)

		# data matrix for oncodrive calculation
		matrix_repo, matrix_path = rs.from_url(u["data_file"])

		# Gain & Loss

		log.info("Calculating Oncodrive results for {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))
		log.debug("{} id is {}".format(types.CNV_ONCODRIVE_GENES, eid))

		tmp_path = mkdtemp(prefix = "cnv_oncodrive_calc_")
		log.debug("Temporary directory: {}".format(tmp_path))
		tmp_file = os.path.join(tmp_path, "filtered_data.tsv")

		matrix_local_path = matrix_repo.get_local(matrix_path)
		log.debug("Matrix path: {}".format(matrix_path))

		try:
			# The handlers below only log and re-raise: releasing the local
			# matrix copy is left to the ``finally`` clause so that it happens
			# exactly once.
			try:
				log.info("Calculating Gain ...")
				log.debug("Bit mask filtering (01) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 1)
				gain_results = run_oncodrive(
					conf, log, e, "gain", tmp_file, tmp_path)
			except Exception:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for gain failed".format(",".join(key), uid))
				raise

			try:
				log.info("Calculating Loss ...")
				log.debug("Bit mask filtering (10) {} to {} ...".format(matrix_local_path, tmp_file))
				mask_filtering(matrix_local_path, tmp_file, 2)
				loss_results = run_oncodrive(
					conf, log, e, "loss", tmp_file, tmp_path)
			except Exception:
				log.error("Oncodrive calculation for evt tumour unit ({}) [{}] for downreg failed".format(",".join(key), uid))
				raise

			# Join gain & loss results

			log.info("Joining upreg & downreg results into memory ...")

			# the join is done in memory with a map
			dmap = read_data_map(log, gain_results, loss_results)

			log.info("Writting joined data to {} ...".format(results_path))

			results_local_path = data_repo.create_local(results_path)

			write_data_map(dmap, results_local_path)

		finally:
			matrix_repo.close_local(matrix_local_path)
			matrix_repo.close()

			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		data_repo.put_local(results_local_path)

		em.persist(e, types.CNV_ONCODRIVE_GENES)
		oncodrive_results_port.write(eid)

	em.close()
	es.close()
	data_repo.close()
	rs.close()
# Example #5
# 0
def main():
	"""Map CNV events to gene regions and join them into one matrix per tumour unit.

	The gene region file (``cnv.mapping.ensg``) is fetched locally and the
	background row names (``cnv.background.ensg``) are loaded into a set. For
	every id read from the ``evt_tumour_unit_ids`` port, each of the tumour
	unit's ``cnv_evt_ids`` is intersected with the gene regions using
	``intersectBed``; the values found (1 or 2) are OR-ed together per
	``(event id, name)``. The result is written as a tab separated matrix with
	one row per background name and one column per event id, the tumour unit
	is persisted and its id is written to ``joined_evt_tumour_unit_ids``.

	Raises:
		Exception: if ``intersectBed`` exits with a non-zero return code.
	"""

	# Initialization

	task.check_conf(["entities", "repositories",
		"cnv.background.ensg", "cnv.mapping.ensg",
		"bin_paths.bed_tools"])

	conf = task.conf

	log = task.logger()

	evt_tunit_port, joined_evt_tunit_port = \
		task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	mapping_file = conf["cnv.mapping.ensg"]
	log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
	mapping_repo, mapping_path = rs.from_url(mapping_file)
	mapping_local_path = mapping_repo.get_local(mapping_path)

	background_file = conf["cnv.background.ensg"]
	log.info("Loading background from {} ...".format(background_file))

	background = set()
	repo, path = rs.from_url(background_file)
	reader = repo.open_reader(path)
	for line in reader:
		line = line.rstrip()
		if len(line) == 0:
			continue
		background.add(line)
	reader.close()
	repo.close()

	for uid in evt_tunit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

		tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
		tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

		if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
			log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
			joined_evt_tunit_port.write(uid)
			continue

		log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

		cnv_evt_ids = u["cnv_evt_ids"]
		log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

		# (event id, name) -> OR-ed value
		data = {}

		tmp_path = mkdtemp(prefix = "evt_map_and_join_")
		log.debug("Temporary directory: {}".format(tmp_path))

		try:
			for eid in cnv_evt_ids:
				e = em.find(eid, types.CNV_EVENTS)
				if e is None:
					log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
					continue

				data_file = e["data_file"]

				log.debug("{} ...".format(data_file))

				repo, path = rs.from_url(data_file)

				local_path = repo.get_local(path)

				# NOTE: a former pre-processing step rewrote the BED end
				# coordinate into a temporary file before intersecting. It is
				# no longer necessary, so the fetched file is used directly.

				tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

				try:
					# Run BED tools to intersect event regions with gene names

					cmd = " ".join([
						os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
						"-a", mapping_local_path,
						"-b", local_path,
						"-s -wb",
						">{}".format(tmp_file2)])

					log.debug(cmd)

					retcode = subprocess.call(args = cmd, shell = True)

					if retcode != 0:
						raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))
				finally:
					# The intersection output lives in tmp_file2, so the
					# fetched copy and its repository can be released here,
					# also when intersectBed fails.
					repo.close_local(local_path)
					repo.close()

				# Read BED tools results and load event data into memory

				reader = FileReader(tmp_file2)

				name_index = 3
				value_index = 12

				try:
					# enumerate keeps line_num in step with the file even for
					# lines that are skipped below.
					for line_num, line in enumerate(reader, 1):
						try:
							fields = line.rstrip().split("\t")
							name = fields[name_index]
							value = int(fields[value_index])
						except (IndexError, ValueError):
							log.error("Error parsing line {} of data file {}".format(line_num, data_file))
							continue

						if value not in [1, 2]:
							log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
							continue

						k = (eid, name)
						data[k] = data.get(k, 0) | value
				finally:
					reader.close()

		finally:
			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		# Write events data to data file and merge with background labels

		log.info("Writing data to {} ...".format(tunit_path))

		u["data_file"] = data_repo.url(tunit_path)
		#TODO u["data_timestamp"] = ...

		writer = data_repo.open_writer(tunit_path)

		# header
		for name in cnv_evt_ids:
			writer.write("\t")
			writer.write(name)
		writer.write("\n")

		# data: one row per background name, 0 where no event value was found
		for row_name in sorted(background):
			writer.write(row_name)
			for col_name in cnv_evt_ids:
				value = data.get((col_name, row_name), 0)
				writer.write("\t")
				writer.write(str(value))
			writer.write("\n")

		writer.close()

		log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		joined_evt_tunit_port.write(uid)

	em.close()
	es.close()

	mapping_repo.close_local(mapping_local_path)
	mapping_repo.close()
	data_repo.close()
	rs.close()
# Example #6
# 0
def main():
    """Export mRNA oncodrive gene results into the biomart ``exp_gene_trs`` table.

    The gene, ICDO and experiment lookups are loaded from the database and the
    ``exp_gene_trs`` table is created if it does not exist. The table is then
    write-locked and, for every id read from the ``id`` port, the
    ``types.MRNA_ONCODRIVE_GENES`` entity is looked up, its results file is
    read and the columns listed in ``__COLUMN_NAMES`` are batch-inserted for
    every gene name found in the lookup. After roughly a million inserted rows
    the lock is released, the table is optimized and the lock is taken again.
    Entities that cannot be exported are logged and skipped.
    """
    task.check_conf(["entities", "repositories", "biomart.db"])
    conf = task.conf

    insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

    db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

    log = task.logger()

    oncodrive_port = task.ports("id")

    es = EntityServer(conf["entities"])
    em = es.manager()

    rs = RepositoryServer(conf["repositories"])

    conn = biomart_db_connect(conf["biomart.db"], log)

    cursor = conn.cursor()

    gene = map_from_select(cursor, "SELECT id, gene_name FROM ent_gene")
    icdo = map_from_select(cursor, "SELECT id, icdo_topography_code, icdo_morphology_code FROM ent_icdo")
    exp = map_from_select(cursor, "SELECT id, study_id, platf_id FROM ent_experiment")

    cursor.execute(
        """
		CREATE TABLE IF NOT EXISTS exp_gene_trs (
		  gene_id int(11) NOT NULL,
		  icdo_id int(11) NOT NULL,
		  exp_id int(11) NOT NULL,
		  upreg_total int(11) DEFAULT NULL,
		  upreg_observed double DEFAULT NULL,
		  upreg_expected double DEFAULT NULL,
		  upreg_stdev double DEFAULT NULL,
		  upreg_pvalue double DEFAULT NULL,
		  upreg_cpvalue double DEFAULT NULL,
		  downreg_total int(11) DEFAULT NULL,
		  downreg_observed double DEFAULT NULL,
		  downreg_expected double DEFAULT NULL,
		  downreg_stdev double DEFAULT NULL,
		  downreg_pvalue double DEFAULT NULL,
		  downreg_cpvalue double DEFAULT NULL,
		  PRIMARY KEY (gene_id,icdo_id,exp_id),
		  KEY icdo (icdo_id,exp_id),
		  KEY exp (exp_id),
		  CONSTRAINT exp_gene_trs_gene_id FOREIGN KEY (gene_id) REFERENCES ent_gene (id),
		  CONSTRAINT exp_gene_trs_icdo_id FOREIGN KEY (icdo_id) REFERENCES ent_icdo (id),
		  CONSTRAINT exp_gene_trs_exp_id FOREIGN KEY (exp_id) REFERENCES ent_experiment (id)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(
            db_engine
        )
    )

    cursor.execute("LOCK TABLES exp_gene_trs WRITE")

    # Rows inserted since the table was last optimized
    lock_count = 0

    try:
        for eid in oncodrive_port:
            e = em.find(eid, types.MRNA_ONCODRIVE_GENES)
            if e is None:
                log.error("{} not found: {}".format(types.MRNA_ONCODRIVE_GENES, eid))
                continue

            if "results_file" not in e:
                log.error("{} [{}] without results file.".format(types.MRNA_ONCODRIVE_GENES, eid))
                continue

            study_id = e["study_id"]
            platform_id = e["platform_id"]
            icdo_topography = e["icdo_topography"]
            icdo_morphology = e["icdo_morphology"]

            okey = (study_id, platform_id, icdo_topography, icdo_morphology)

            log.info("Exporting oncodrive results ({}) [{}] ...".format(", ".join(okey), eid))

            icdo_key = (icdo_topography, icdo_morphology)
            if icdo_key not in icdo:
                log.error("ICDO ({}) not found in the database".format(", ".join(icdo_key)))
                continue
            icdo_id = icdo[icdo_key]

            exp_key = (study_id, platform_id)
            if exp_key not in exp:
                log.error("Experiment ({}) not found in the database".format(", ".join(exp_key)))
                continue
            exp_id = exp[exp_key]

            ib = BatchInsert(
                cursor,
                "exp_gene_trs",
                [
                    "gene_id",
                    "icdo_id",
                    "exp_id",
                    "upreg_total",
                    "upreg_observed",
                    "upreg_expected",
                    "upreg_stdev",
                    "upreg_pvalue",
                    "upreg_cpvalue",
                    "downreg_total",
                    "downreg_observed",
                    "downreg_expected",
                    "downreg_stdev",
                    "downreg_pvalue",
                    "downreg_cpvalue",
                ],
                insert_size,
            )

            results_repo, results_path = rs.from_url(e["results_file"])

            try:
                reader = results_repo.open_reader(results_path)
            except Exception as ex:
                log.exception(ex)
                ib.close()
                results_repo.close()
                continue

            try:
                # read header
                hdr = reader.readline().rstrip().split("\t")
                hdr_map = {col_name: i for i, col_name in enumerate(hdr)}

                try:
                    col_indices = [hdr_map[x] for x in __COLUMN_NAMES]
                except KeyError as ex:
                    # A distinct name keeps the entity bound to ``e`` intact
                    log.warn(
                        "Column {} not found in results files, most probably because it is empty".format(ex.args[0])
                    )
                    continue

                skipped_genes = set()

                # read data
                for line in reader:
                    line = line.rstrip()
                    data = line.split("\t")
                    gene_name = data[0]
                    data = [data[i] for i in col_indices]
                    if gene_name not in gene:
                        skipped_genes.add(gene_name)
                        continue

                    gene_id = gene[gene_name]

                    ib.insert(gene_id, icdo_id, exp_id, *data)

                if len(skipped_genes) > 0:
                    log.warn(
                        "There were {} gene names not found:\n{}".format(len(skipped_genes), ",".join(skipped_genes))
                    )

                log.debug("{} gene results inserted".format(ib.count))
            finally:
                # Runs on success, on the ``continue`` above and on errors, so
                # the row count is tracked and all per-file resources released.
                lock_count += ib.count
                ib.close()
                reader.close()
                results_repo.close()

            # Periodically release the lock and optimize the table
            if lock_count >= 1000000:
                cursor.execute("UNLOCK TABLES")
                cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
                cursor.execute("LOCK TABLES exp_gene_trs WRITE")
                lock_count = 0
    finally:
        # Never leave the table locked, even if an export fails midway
        cursor.execute("UNLOCK TABLES")

    cursor.execute("OPTIMIZE NO_WRITE_TO_BINLOG TABLE exp_gene_trs")
    cursor.close()

    em.close()
    es.close()
    rs.close()
def main():
	"""Build CNV event entities from source assays and group them into tumour units.

	Every ``types.SOURCE_ASSAY`` entity is visited. An assay is kept when it
	belongs to one of the studies read from the ``study_ids`` port, is a
	genomic / cancer_vs_normal / binary assay, references an existing sample
	whose disease state maps to ``"tumour"`` and whose data file exists in the
	repository. One ``types.CNV_EVENTS`` entity is persisted per kept assay and
	its id is written to the ``evt_ids`` port.

	Kept events are then grouped with ``classify_by_experiment_and_icdo`` and
	one ``types.CNV_EVENTS_TUMOUR_UNIT`` entity is persisted (and its id written
	to ``evt_tumour_unit_ids``) for each group holding at least
	``cnv.min_tumour_unit_size`` events.
	"""

	# Initialization

	task.check_conf(["entities", "repositories", "repositories.assay",
				"cnv.min_tumour_unit_size"])

	conf = task.conf

	log = task.logger()

	study_ids_port, evt_port, evt_tunit_port = \
		task.ports("study_ids", "evt_ids", "evt_tumour_unit_ids")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])
	source_repo = rs.repository("source")

	if "excluded_topographies" in conf:
		excluded_topographies = set(conf.get("excluded_topographies"))
		log.debug("Excluded topographies: {}".format(", ".join(excluded_topographies)))
	else:
		excluded_topographies = set()

	# Run

	# Indices of already persisted entities; each lookup below takes element [0]
	# of the indexed value as the existing entity id.
	log.info("Creating indices for {} ...".format(types.CNV_EVENTS))
	evt_index = em.group_ids(
		["study_id", "platform_id", "sample_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS, unique = True)

	log.info("Creating indices for {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	evt_tunit_index = em.group_ids(
		["study_id", "platform_id", "icdo_topography", "icdo_morphology"],
		types.CNV_EVENTS_TUMOUR_UNIT, unique = True)

	processed_studies = set()
	processed_assays = 0
	valid_assay_count = {}
	skipped_assay_count = {}
	wrong_assays = {}
	wrong_samples = {}
	tumour_units = {}
	evt_dup = {}	# keys of the events written during this run

	study_ids = study_ids_port.read_all()
	log.info("Processing %i studies ..." % len(study_ids))

	for assay in em.iter_all(types.SOURCE_ASSAY):

		assay_id = assay.get("id", "WITHOUT ID")
		log.debug("Reading assay %s ..." % assay_id)

		mf = assay.missing_fields(["id", "study_id", "platform_id", "sample_id",
			"assay_property/assay_design", "assay_property/data_type",
			"assay_property/study_type", "assay_property/filename"])

		assay_source_path = assay.get("source_path", "")

		if len(mf) > 0:
			study_id = assay.get("study_id", "WITHOUT ID")

			log.error("Assay {} in study {} missing required fields: ({}) ({})".format(assay_id, study_id, ", ".join(mf), assay_source_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		study_id = assay["study_id"]

		if study_id not in study_ids:
			log.debug("Assay {} not included in 'study_ids'".format(assay_id))
			continue

		platform_id = assay["platform_id"]
		sample_id = assay["sample_id"]

		assay_design = assay["assay_property/assay_design"]
		data_type = assay["assay_property/data_type"]
		study_type = assay["assay_property/study_type"]

		source_path = assay["source_path"]
		source_file = assay["assay_property/filename"]

		e = assay.transform([
			("assay_id", "id"),
			"study_id",
			"platform_id",
			"sample_id",
			"source_path"])

		e["data_file"] = source_repo.url("assay", source_path, source_file)

		# The study membership was already checked above, so only the assay
		# properties decide whether it is included.
		included = (study_type == "genomic"
			and assay_design == "cancer_vs_normal"
			and data_type == "binary")

		if not included:
			# Transcriptomic assays are skipped without being reported.
			if study_type != "transcriptomic":
				s = ", ".join([" = ".join(v) for v in [("study_id", study_id), ("assay_design", assay_design), ("data_type", data_type), ("study_type", study_type)]])
				log.debug("Skipping assay {} ({}): {}.".format(assay_id, assay_source_path, s))
				map_inc(skipped_assay_count, (study_id, assay_design, data_type, study_type))
			continue

		sample = em.find(sample_id, types.SOURCE_SAMPLE)
		if sample is None:
			log.error("Assay {} references a non-existent sample: {}".format(assay_id, sample_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		mf = sample.missing_fields(["id", "basic_sample_details/disease_state", "icdo/topography"])
		if len(mf) > 0:
			sample_source_path = sample.get("source_path", "")
			log.error("Sample {} associated with assay {} in study {} missing required fields: ({}) ({})".format(sample_id, assay_id, study_id, ", ".join(mf), sample_source_path))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		sample = sample.transform([
			"id",
			"source_path",
			("disease_state", "basic_sample_details/disease_state"),
			("normal_counterpart", "normal_counterpart_location/topography"),
			("icdo_topography", "icdo/topography"),
			("icdo_morphology", "icdo/morphology") ])

		disease_state = sample["disease_state"]
		if disease_state not in disease_state_map:
			log.error("Unknown disease_state '{}' for sample {} ({})".format(disease_state, sample_id, sample.get("source_path", "")))
			map_list_add(wrong_samples, study_id, sample_id)
			continue

		orig_disease_state = disease_state
		disease_state = disease_state_map[disease_state]
		if disease_state not in ["tumour"]:
			# Read the path from the current sample: the former local
			# 'sample_source_path' was only bound in the missing-fields branch
			# above, so it was either undefined or left over from another sample.
			log.warn("Sample {} associated with assay {} in study {} has not a tumour 'disease_state' ({}): {}".format(sample_id, assay_id, study_id, sample.get("source_path", ""), orig_disease_state))
			continue

		e["disease_state"] = disease_state

		e["icdo_topography"] = sample["icdo_topography"]
		e["icdo_morphology"] = sample.get("icdo_morphology", "")
		if "normal_counterpart" in sample:
			e["normal_counterpart"] = sample["normal_counterpart"]

		repo, rel_path = rs.from_url(e["data_file"])

		if not repo.exists(rel_path):
			log.error("Assay {} in study {} missing data file: [{}]".format(assay_id, study_id, rel_path))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		e_key = (study_id, platform_id, sample_id, e["icdo_topography"], e["icdo_morphology"])

		# A key already written during this run is a duplicate and is rejected.
		if e_key in evt_dup:
			log.error("Duplicated key ({}) for assay {}".format(", ".join(e_key), assay_id))
			map_list_add(wrong_assays, study_id, assay_id)
			continue

		# Reuse the id of a previously persisted event, otherwise create one.
		exists = e_key in evt_index
		eid = evt_index[e_key][0] if exists else None
		if eid is None:
			eid = str(uuid.uuid4())

		e["id"] = eid

		# The tumour unit is keyed by the normal counterpart when the sample
		# has one, falling back to the ICD-O topography.
		u_key = (study_id, platform_id, e.get("normal_counterpart", e["icdo_topography"]), e.get("icdo_morphology", ""))
		keys = classify_by_experiment_and_icdo(
					u_key[0], u_key[1], u_key[2], u_key[3])
		for key in keys:
			icdo_topography = key[2]
			if icdo_topography in excluded_topographies:
				continue
			map_list_add(tumour_units, key, eid)

		processed_studies.add(study_id)
		processed_assays += 1
		map_inc(valid_assay_count, (study_id, platform_id))

		msg = {True : "Overwritting", False : "Writting"}[exists]
		log.info("{} {} ({}) ...".format(msg, types.CNV_EVENTS, ", ".join(e_key)))
		em.persist(e, types.CNV_EVENTS)
		evt_port.write(eid)
		evt_dup[e_key] = eid

	min_tumour_unit_size = conf["cnv.min_tumour_unit_size"]

	log.info("Persisting {} ...".format(types.CNV_EVENTS_TUMOUR_UNIT))
	log.debug("Minimum size = {}".format(min_tumour_unit_size))

	for key in sorted(tumour_units):
		v = tumour_units[key]
		size = len(v)
		discard = size < min_tumour_unit_size
		discard_text = "[skipped]" if discard else ""

		if key in evt_tunit_index:
			uid = evt_tunit_index[key][0]
			u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
			if u is None:
				log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
				continue

			arrow_text = "==>"
		else:
			uid = str(uuid.uuid4())
			u = DataElement(key_sep = "/")
			u["id"] = uid
			u["study_id"] = key[0]
			u["platform_id"] = key[1]
			u["icdo_topography"] = key[2]
			u["icdo_morphology"] = key[3]

			arrow_text = "-->"

		log.info("\t[{}] ({})\t{} {} assays {}".format(uid, ", ".join(key), arrow_text, size, discard_text))

		# Units below the minimum size are reported above but not persisted.
		if discard:
			continue

		u["size"] = size
		u["cnv_evt_ids"] = u.create_list(v)

		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		evt_tunit_port.write(uid)

	log.info("Processed {} assays for {} studies (out of {}):\n\n".format(processed_assays, len(processed_studies), len(study_ids)))

	log.info("Skipped assays:\n\n{}".format(map_count_tostring(skipped_assay_count, indent = 1)))

	log.info("Summary of failed assays:\n\n{}".format(map_list_tostring(wrong_assays)))

	log.info("Summary of failed samples:\n\n{}".format(map_list_tostring(wrong_samples)))

	em.close()
	es.close()
	# Release the repository server too, as the sibling task scripts do.
	rs.close()
# Example #8
# 0
def main():
	"""Create the ``ent_icdo`` biomart table and fill it from the ``icdo`` port.

	The topography and morphology code files named in the configuration are
	loaded with ``map_from_file``. Each item read from the port is indexed as
	``item[0]`` (topography code) and ``item[1]`` (morphology code); one row is
	inserted per item, numbered from 1, with the code, its looked-up name and a
	"name [code]" description for both parts.
	"""
	task.check_conf(["entities", "repositories", "biomart.db",
		"biomart.files.icdo_topography", "biomart.files.icdo_morphology"])
	conf = task.conf

	insert_size = conf.get("biomart.insert_size", DEFAULT_INSERT_SIZE, dtype=int)

	log = task.logger()

	icdo_port = task.ports("icdo")

	es = EntityServer(conf["entities"])
	em = es.manager()

	rs = RepositoryServer(conf["repositories"])

	def load_codes(url):
		# Fetch the file behind `url`, parse it and release the repository.
		repo, path = rs.from_url(url)
		local_path = repo.get_local(path)
		codes = map_from_file(local_path)
		repo.close_local(path)
		repo.close()
		return codes

	topography_url = conf["biomart.files.icdo_topography"]
	log.info("Loading topography codes from {} ...".format(topography_url))
	icdo_topography = load_codes(topography_url)

	morphology_url = conf["biomart.files.icdo_morphology"]
	log.info("Loading morphology codes from {} ...".format(morphology_url))
	icdo_morphology = load_codes(morphology_url)

	conn = biomart_db_connect(conf["biomart.db"], log)

	db_engine = conf.get("biomart.db.engine", DEFAULT_DB_ENGINE)

	cursor = conn.cursor()

	create_table_sql = """
		CREATE TABLE  ent_icdo (
		  id int(11) NOT NULL,
		  icdo_name varchar(512) NOT NULL DEFAULT '',
		  icdo_topography varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology varchar(255) NOT NULL DEFAULT '',
		  icdo_topography_code varchar(24) NOT NULL DEFAULT '',
		  icdo_morphology_code varchar(24) NOT NULL DEFAULT '',
		  icdo_topography_name varchar(255) NOT NULL DEFAULT '',
		  icdo_morphology_name varchar(255) NOT NULL DEFAULT '',
		  PRIMARY KEY (id),
		  KEY icdo_name (icdo_name),
		  KEY icdo_tm (icdo_topography,icdo_morphology),
		  KEY icdo_m (icdo_morphology),
		  KEY icdo_tm_c (icdo_topography_code,icdo_morphology_code),
		  KEY icdo_m_c (icdo_morphology_code)
		) ENGINE={} DEFAULT CHARSET=latin1""".format(db_engine)
	cursor.execute(create_table_sql)

	columns = [
		"id", "icdo_name",
		"icdo_topography", "icdo_topography_code", "icdo_topography_name",
		"icdo_morphology", "icdo_morphology_code", "icdo_morphology_name"]
	ib = BatchInsert(cursor, "ent_icdo", columns, insert_size)

	for row_id, term in enumerate(icdo_port, 1):
		# Topography part: an empty code stands for "any", an unknown code is
		# reported and described by the bare code.
		t_code = term[0]
		if t_code == "":
			t_name = "ANY topography"
			t_desc = t_name
		elif t_code in icdo_topography:
			t_name = icdo_topography[t_code]
			t_desc = "{} [{}]".format(t_name, t_code)
		else:
			log.error("Unknown topography description for code {}".format(t_code))
			t_name = ""
			t_desc = "[{}]".format(t_code)

		# Morphology part, handled the same way.
		m_code = term[1]
		if m_code == "":
			m_name = "ANY morphology"
			m_desc = m_name
		elif m_code in icdo_morphology:
			m_name = icdo_morphology[m_code]
			m_desc = "{} [{}]".format(m_name, m_code)
		else:
			log.error("Unknown morphology description for code {}".format(m_code))
			m_name = ""
			m_desc = "[{}]".format(m_code)

		name = "; ".join((t_desc, m_desc))

		log.info("({}, {}) --> ({}, {})".format(t_code, m_code, t_desc, m_desc))

		ib.insert(row_id, name, t_desc, t_code, t_name, m_desc, m_code, m_name)

	log.debug("{} ICDO terms inserted".format(ib.count))

	# Release resources in the same order as before.
	for resource in (ib, cursor, conn, em, es, rs):
		resource.close()