Example no. 1
def map_from_file(path):
	"""Read a two-column tab-separated file into a dict (column 1 -> column 2)."""
	mapping = {}
	f = FileReader(path)
	try:
		for line in f:
			line = line.rstrip()
			fields = line.split("\t")
			if len(fields) != 2:
				raise ValueError("Unexpected number of columns: ({0})".format(", ".join(fields)))
			mapping[fields[0]] = fields[1]
	finally:
		f.close()
	return mapping
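A minimal usage sketch (not part of the original code), assuming FileReader behaves like a plain line-iterable file handle; "samples.tsv" is a hypothetical two-column tab-separated input:

# Hypothetical usage of map_from_file; "samples.tsv" is an invented
# input file with lines such as "sample_1<TAB>tumour".
labels = map_from_file("samples.tsv")
print(labels["sample_1"])  # -> "tumour"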
Example no. 2
	def read_header(self):
		# Parse the header line lazily, on first access only
		if self.header is None:
			line = FileReader.readline(self).rstrip()
			hdr = line.split("\t")
			self.header = MatrixHeader(hdr)

		return self.header
Example no. 3
	def __iter__(self):
		parser = self._parser_from_type(self.dtype)

		# Ensure the header line is consumed before yielding data rows
		self.read_header()

		for line in FileReader.__iter__(self):
			line = line.rstrip()

			fields = line.split("\t")
			row_name = fields[0]
			values = [parser(x) for x in fields[1:]]

			yield MatrixRow(row_name, values)
Example no. 4
def read_data_map(log, gain_results, loss_results):
	dmap = {}

	log.debug("Reading gain data from {} ...".format(gain_results))

	# read gain data
	uf = FileReader(gain_results)
	hdr = read_header(uf)
	count = 0
	for line in uf:
		k, d = read_data(line, hdr, "row", FIELDS)
		dmap[k] = d
		count += 1
	uf.close()

	log.debug("Total gain rows = {0}".format(count))

	log.debug("Reading loss data from {0} ...".format(loss_results))

	# read loss data and join with gain
	df = FileReader(loss_results)
	hdr = read_header(df)
	count = 0
	for line in df:
		k, d = read_data(line, hdr, "row", FIELDS)
		if k not in dmap:
			data = ["-"] * len(FIELDS)
		else:
			data = dmap[k]
		data += d
		dmap[k] = data
		count += 1
	df.close()

	log.debug("Total loss rows = {0}".format(count))

	return dmap
Example no. 5
def read_data_map(log, upreg_results, downreg_results):
    dmap = {}

    log.debug("Reading upreg data from {0} ...".format(upreg_results))

    # read upreg data
    uf = FileReader(upreg_results)
    hdr = read_header(uf)
    count = 0
    for line in uf:
        k, d = read_data(line, hdr, "row", FIELDS)
        dmap[k] = d
        count += 1
    uf.close()

    log.debug("Total upreg rows = {0}".format(count))

    log.debug("Reading downreg data from {0} ...".format(downreg_results))

    # read downreg data and join with upreg
    df = FileReader(downreg_results)
    hdr = read_header(df)
    count = 0
    for line in df:
        k, d = read_data(line, hdr, "row", FIELDS)
        if k not in dmap:
            data = ["-"] * len(FIELDS)
        else:
            data = dmap[k]
        data += d
        dmap[k] = data
        count += 1
    df.close()

    log.debug("Total downreg rows = {0}".format(count))

    return dmap
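The join in both variants of read_data_map is asymmetric: keys that first appear in the second file are padded with "-" placeholders for the first file's columns, while keys seen only in the first file keep their shorter rows. A self-contained sketch of that padding step, with FIELDS and all data invented for illustration:

# Illustrative reproduction of the join logic above; every name and
# value here is hypothetical.
FIELDS = ["pvalue", "n"]
dmap = {"geneA": ["0.01", "42"]}                      # from the first file
second_file_rows = [("geneA", ["0.2", "7"]), ("geneB", ["0.05", "3"])]
for k, d in second_file_rows:
    data = dmap.get(k, ["-"] * len(FIELDS))           # pad missing keys
    dmap[k] = data + d
# dmap["geneB"] == ["-", "-", "0.05", "3"]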
Example no. 6
def merge(log, input, output, gitools_output):
    """
	Merge repeated rows by the lowest pvalue, in case the pvalue is the same take the one with greater n
	"""

    f = FileReader(input)
    hdr = f.readline().rstrip().split("\t")

    upreg = {}
    downreg = {}

    upreg_count = 0
    downreg_count = 0

    # Column index separating the upreg block from the downreg block
    mid_index = 8

    for line in f:
        line = line.rstrip()
        if len(line) == 0:
            continue

        fields = line.split("\t")
        row_name = fields[0]

        upreg_count += merge_data(row_name, fields[1:mid_index], upreg)
        downreg_count += merge_data(row_name, fields[mid_index:], downreg)

    f.close()

    upreg_keys = upreg.keys()
    downreg_keys = downreg.keys()

    log.debug("Total rows: upreg = {}, downreg = {}".format(len(upreg_keys), len(downreg_keys)))
    log.debug("Merged rows: upreg = {}, downreg = {}".format(upreg_count, downreg_count))

    ofile = FileWriter(output)
    ofile.write("\t".join(hdr))
    ofile.write("\n")

    gfile = FileWriter(gitools_output)
    gfile.write("column\trow\t")
    gfile.write("\t".join([x[6:] for x in hdr if x.startswith("upreg_")]))
    gfile.write("\n")

    for row_name in upreg_keys:
        upreg_data = upreg[row_name]
        upreg_data_join = "\t".join(upreg_data)

        downreg_data = downreg[row_name]
        downreg_data_join = "\t".join(downreg_data)

        ofile.write(row_name)
        ofile.write("\t")
        ofile.write(upreg_data_join)
        ofile.write("\t")
        ofile.write(downreg_data_join)
        ofile.write("\n")

        gfile.write("upreg\t")
        gfile.write(row_name)
        gfile.write("\t")
        gfile.write(upreg_data_join)
        gfile.write("\n")
        gfile.write("downreg\t")
        gfile.write(row_name)
        gfile.write("\t")
        gfile.write(downreg_data_join)
        gfile.write("\n")

    ofile.close()
    gfile.close()

    return (upreg_count, downreg_count)
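merge_data is referenced above but not shown. A speculative sketch consistent with the docstring (keep the row with the lowest p-value, break ties by the greater n); the column positions of n and the p-value inside fields are pure guesses:

def merge_data(row_name, fields, results):
    # Hypothetical helper: returns 1 when a repeated row was merged,
    # 0 when the row was seen for the first time, matching how the
    # return value is summed into upreg_count/downreg_count above.
    if row_name not in results:
        results[row_name] = fields
        return 0
    prev = results[row_name]
    pval, prev_pval = float(fields[1]), float(prev[1])  # assumed p-value column
    n, prev_n = int(fields[0]), int(prev[0])            # assumed n column
    if pval < prev_pval or (pval == prev_pval and n > prev_n):
        results[row_name] = fields
    return 1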
Example no. 7
	def __init__(self, obj, dtype=float):
		FileReader.__init__(self, obj)

		# Parser type applied to matrix values (float by default)
		self.dtype = dtype

		self.header = None
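Examples 2, 3 and 7 read like fragments of a single FileReader subclass. A speculative assembly, with _parser_from_type reduced to a stub and MatrixHeader/MatrixRow assumed to exist as constructed above:

# Speculative assembly of Examples 2, 3 and 7 into one class; the
# _parser_from_type stub simply returns the dtype callable itself.
class MatrixReader(FileReader):
	def __init__(self, obj, dtype=float):
		FileReader.__init__(self, obj)
		self.dtype = dtype
		self.header = None

	def _parser_from_type(self, dtype):
		return dtype  # e.g. float("1.5") parses a cell

	def read_header(self):
		if self.header is None:
			line = FileReader.readline(self).rstrip()
			self.header = MatrixHeader(line.split("\t"))
		return self.header

	def __iter__(self):
		parser = self._parser_from_type(self.dtype)
		self.read_header()
		for line in FileReader.__iter__(self):
			fields = line.rstrip().split("\t")
			yield MatrixRow(fields[0], [parser(x) for x in fields[1:]])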
Example no. 8
def main():

	# Initialization

	task.check_conf(["entities", "repositories",
		"cnv.background.ensg", "cnv.mapping.ensg",
		"bin_paths.bed_tools"])

	conf = task.conf

	log = task.logger()

	evt_tunit_port, joined_evt_tunit_port = \
		task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")
	
	es = EntityServer(conf["entities"])
	em = es.manager()
	
	rs = RepositoryServer(conf["repositories"])
	data_repo = rs.repository("data")

	overwrite = conf.get("overwrite", False, dtype=bool)

	# Run

	mapping_file = conf["cnv.mapping.ensg"]
	log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
	mapping_repo, mapping_path = rs.from_url(mapping_file)
	mapping_local_path = mapping_repo.get_local(mapping_path)

	background_file = conf["cnv.background.ensg"]
	log.info("Loading background from {} ...".format(background_file))

	background = set()
	repo, path = rs.from_url(background_file)
	reader = repo.open_reader(path)
	for line in reader:
		line = line.rstrip()
		if len(line) == 0:
			continue
		background.add(line)
	reader.close()
	repo.close()

	for uid in evt_tunit_port:
		u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
		if u is None:
			log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
			continue

		key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

		tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
		tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")

		if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
			log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
			joined_evt_tunit_port.write(uid)
			continue

		log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

		cnv_evt_ids = u["cnv_evt_ids"]
		log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

		data = {}
		
		tmp_path = mkdtemp(prefix = "evt_map_and_join_")
		log.debug("Temporary directory: {}".format(tmp_path))
		
		try:
			for eid in cnv_evt_ids:
				e = em.find(eid, types.CNV_EVENTS)
				if e is None:
					log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
					continue

				data_file = e["data_file"]

				log.debug("{} ...".format(data_file))

				repo, path = rs.from_url(data_file)

				local_path = repo.get_local(path)

				# Fix wrong BED files generated by gunes (end should be 1-indexed instead of 0-indexed)

#				tmp_file = os.path.join(tmp_path, "".join([eid, "-fixed-bed.tsv"]))

#				writer = FileWriter(tmp_file)
#				reader = repo.open_reader(path)
#				for line in reader:
#					if line.lstrip().startswith("#"):
#						continue
#					fields = line.rstrip().split("\t")
#					end = int(fields[2]) + 0 # FIXME fix not necessary already
#					fields[2] = str(end)
#					writer.write("\t".join(fields))
#					writer.write("\n")
#				writer.close()
#				reader.close()

				# Run BED tools to intersect event regions with gene names

				tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))

				cmd = " ".join([
					os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
					"-a", mapping_local_path,
					#"-b", tmp_file,
					"-b", local_path,
					"-s -wb",
					">{}".format(tmp_file2)])

				log.debug(cmd)

				retcode = subprocess.call(args = cmd, shell = True)

				if retcode != 0:
					raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

				repo.close_local(local_path)

				# Read BED tools results and load event data into memory

				reader = FileReader(tmp_file2)

				name_index = 3
				value_index = 12

				# Use enumerate so skipped lines still advance the line counter
				for line_num, line in enumerate(reader, start=1):
					try:
						fields = line.rstrip().split("\t")
						name = fields[name_index]
						value = int(fields[value_index])
						if value not in [1, 2]:
							log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
							continue
					except (IndexError, ValueError):
						log.error("Error parsing line {} of data file {}".format(line_num, data_file))
						continue

					k = (eid, name)
					prev_value = data.get(k, 0)

					# Values 1 and 2 act as bit flags; OR-ing accumulates both states
					data[k] = prev_value | value

				reader.close()
				repo.close()

		finally:
			if os.path.exists(tmp_path):
				log.debug("Removing temporary directory {} ...".format(tmp_path))
				shutil.rmtree(tmp_path)

		# Write events data to data file and merge with background labels

		log.info("Writing data to {} ...".format(tunit_path))

		u["data_file"] = data_repo.url(tunit_path)
		#TODO u["data_timestamp"] = ...

		writer = data_repo.open_writer(tunit_path)

		# header
		for name in cnv_evt_ids:
			writer.write("\t")
			writer.write(name)
		writer.write("\n")

		# data
		for row_name in sorted(background):
			writer.write(row_name)
			for col_name in cnv_evt_ids:
				k = (col_name, row_name)
				if k in data:
					value = data[k]
				else:
					value = 0
				writer.write("\t")
				writer.write(str(value))
			writer.write("\n")

		writer.close()
		
		log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
		em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
		joined_evt_tunit_port.write(uid)

	em.close()
	es.close()

	mapping_repo.close_local(mapping_local_path)
	mapping_repo.close()
	data_repo.close()
	rs.close()
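The data[k] = prev_value | value accumulation in main() treats the values 1 and 2 as bit flags, so a key seen with both values across overlapping regions ends up as 3. A one-line demonstration of that behaviour:

# Bit-flag accumulation as in main(): 1 | 2 == 3 marks "both states seen"
state = 0
for value in (1, 2, 1):
    state |= value
print(state)  # -> 3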
Example no. 9
def enrichment(log, conf, rs, data_repo, results_path, data_file, e, ec,
				filtered_columns, filtered_columns_new_names):

	eid = e["id"]

	key = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"])

	# determine the modules file
	mod_repo, mod_path = rs.from_url(ec["modules_file"])
	mod_local_path = mod_repo.get_local(mod_path)

	# oncodrive data file
	matrix_repo, matrix_path = rs.from_url(data_file)
	matrix_local_path = matrix_repo.get_local(matrix_path)

	e["data_file"] = data_file
	e["modules_file"] = ec["modules_file"]

	results_local_path = None

	tmp_path = mkdtemp(prefix = "enrichment_")
	log.debug("Temporary directory: {}".format(tmp_path))

	valid = True

	try:
		log.info("Filtering pvalue columns from {} ...".format(data_file))

		# filter columns for pvalues
		data_local_path = os.path.join(tmp_path, "data.tsv")

		rf = of = None
		try:
			rf = FileReader(matrix_local_path)
			of = FileWriter(data_local_path)
			row_count = tsv.filter_columns(rf, of,
					filtered_columns, filtered_columns_new_names)
		finally:
			if rf is not None:
				rf.close()
			if of is not None:
				of.close()

		if row_count == 0:
			log.warn("Oncodrive results are empty: {}".format(matrix_path))
			raise EmptyResults

		# apply background if necessary
		if "population.file" in ec:
			pop_url = ec["population.file"]
			pop_missing_value = ec.get("population.missing_value", "-")
			log.info("Applying background from {} with missing value {} ...".format(pop_url, pop_missing_value))
			data2_local_path = os.path.join(tmp_path, "data-filtered.tsv")
			pop_repo, pop_path = rs.from_url(pop_url)
			pop_local_path = pop_repo.get_local(pop_path)
			cmd = " ".join([
				conf["bin_paths.python"], conf["bin_paths.matrix_background"],
				"--verbose --missing-value", pop_missing_value,
				"-o", data2_local_path,
				data_local_path, pop_local_path ])

			log.debug(cmd)
			retcode = subprocess.call(args = cmd, shell = True)

			if retcode != 0:
				raise Exception("Applying population background for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

			pop_repo.close_local(pop_local_path)
			data_local_path = data2_local_path

		# enrichment results
		e["results_file"] = data_repo.url(results_path)
		results_local_path = data_repo.create_local(results_path)

		log.info("Running enrichment ...")
		log.debug("\tData file: {}".format(data_local_path))
		log.debug("\tModules file: {}".format(ec["modules_file"]))

		gitools_enrichment_bin = os.path.join(conf["bin_paths.gitools"], "bin", "gitools-enrichment")

		sb = [ gitools_enrichment_bin,
			"-N", eid, "-w", tmp_path, "-p 1",
			"-mf tcm", "-m", mod_local_path,
			"-df cdm", "-d", data_local_path,
			"-t", ec["test"] ]

		if "filter" in ec:
			sb += ["-b", ec["filter"]]

		if ec.get("only_mapped_items", False, dtype=bool):
			sb += ["-only-mapped-items"]

		#if "population" in ec:
		#	pop_repo, pop_path = rs.from_url(ec["population"])
		#	pop_local_path = pop_repo.get_local(pop_path)
		#	sb += ["-P", pop_local_path]

		cmd = " ".join(sb)

		log.debug(cmd)

		retcode = subprocess.call(args = cmd, shell = True)

		sys.stdout.write("\n")
		sys.stdout.flush()

		if retcode != 0:
			raise Exception("Enrichment for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

		# flatten results

		log.info("Flattening results into {} ...".format(e["results_file"]))

		rf = of = None
		try:
			gitools_results = os.path.join(tmp_path, eid + "-results.tdm.gz")
			rf = FileReader(gitools_results)
			of = FileWriter(results_local_path)
			tdm.flatten(rf, of,
				{ "column" : str, "row" : str, "N" : int, "observed" : int,
				"expected-mean" : float, "expected-stdev" : float, "probability" : float,
				"right-p-value" : float, "corrected-right-p-value" : float },
				["N", "observed", "expected-mean", "expected-stdev",
				"probability", "right-p-value", "corrected-right-p-value"])
		finally:
			if rf is not None:
				rf.close()
			if of is not None:
				of.close()

		# close local paths
		data_repo.put_local(results_local_path)

	except EmptyResults:
		valid = False

	except Exception as ex:
		log.exception(ex)

		if results_local_path is not None:
			data_repo.close_local(results_local_path)

		valid = False

	finally:
		shutil.rmtree(tmp_path)
		mod_repo.close_local(mod_local_path)
		data_repo.close_local(matrix_local_path)
		#if "population" in ec:
		#	pop_repo.close_local(pop_local_path)

	return valid
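Both enrichment() and main() above build command strings and run them through subprocess.call(..., shell=True), leaving quoting and output redirection to the shell. A sketch of the equivalent list-args call for the intersectBed step, which avoids quoting issues with paths; the variable names reuse those from main() and are assumptions here:

# Alternative to the shell=True invocation in main(): argument list plus
# an explicit stdout redirection. intersect_bed, mapping_local_path,
# local_path and tmp_file2 stand for the same values as above.
with open(tmp_file2, "w") as out:
    retcode = subprocess.call(
        [intersect_bed, "-a", mapping_local_path,
         "-b", local_path, "-s", "-wb"],
        stdout=out)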