Beispiel #1
0
	def write(self, name, values):
		"""Write one row: the name, then every value preceded by a tab, then a newline.

		Each value is converted to text with the function returned by
		self._value_to_str_func(self.dtype) before it is written.
		"""
		to_text = self._value_to_str_func(self.dtype)
		emit = FileWriter.write

		emit(self, name)
		for item in values:
			emit(self, "\t")
			emit(self, to_text(item))
		emit(self, "\n")
Beispiel #2
0
	def write_header(self, header = None):
		"""Write the header line: the column names joined by tabs, then a newline.

		header may be a MatrixHeader (its columns attribute is used) or a
		list/tuple of names. Anything else, including the default None,
		raises an Exception.
		"""
		if isinstance(header, MatrixHeader):
			names = header.columns
		elif isinstance(header, (list, tuple)):
			names = header
		else:
			raise Exception("Unsupported headers type: {}".format(str(type(header))))

		FileWriter.write(self, "\t".join(names))
		FileWriter.write(self, "\n")
def merge(log, input, output, gitools_output):
    """
    Merge repeated rows of a tab separated file and write the merged rows
    to two output files.

    The first line of input is the header. In every other non-empty line
    the first field is the row name, fields 1..7 are passed to merge_data
    together with the upreg map and the remaining fields together with the
    downreg map. The selection rule itself lives in merge_data (per the
    original note: keep the lowest pvalue and, when the pvalue is the same,
    the one with greater n).

    log -- logger used for debug messages
    input -- source accepted by FileReader
    output -- destination accepted by FileWriter; receives the original
        header and one line per row name (upreg fields, then downreg fields)
    gitools_output -- destination accepted by FileWriter; receives a
        "column", "row" header followed by the names of the "upreg_"
        columns without that prefix, and two lines per row name
        ("upreg" and "downreg")

    Returns the tuple (upreg_count, downreg_count) with the sums of the
    values returned by merge_data.
    """

    # index of the first downreg field; fields[1:mid_index] are the upreg ones
    mid_index = 8

    upreg = {}
    downreg = {}

    upreg_count = 0
    downreg_count = 0

    f = FileReader(input)
    try:
        hdr = f.readline().rstrip().split("\t")

        for line in f:
            line = line.rstrip()
            if len(line) == 0:
                continue

            fields = line.split("\t")
            row_name = fields[0]

            upreg_count += merge_data(row_name, fields[1:mid_index], upreg)
            downreg_count += merge_data(row_name, fields[mid_index:], downreg)
    finally:
        # release the reader even when parsing or merging fails
        f.close()

    upreg_keys = upreg.keys()
    downreg_keys = downreg.keys()

    log.debug("Total rows: upreg = {}, downreg = {}".format(len(upreg_keys), len(downreg_keys)))
    log.debug("Merged rows: upreg = {}, downreg = {}".format(upreg_count, downreg_count))

    ofile = gfile = None
    try:
        ofile = FileWriter(output)
        ofile.write("\t".join(hdr))
        ofile.write("\n")

        gfile = FileWriter(gitools_output)
        gfile.write("column\trow\t")
        gfile.write("\t".join([x[6:] for x in hdr if x.startswith("upreg_")]))
        gfile.write("\n")

        for row_name in upreg_keys:
            upreg_data_join = "\t".join(upreg[row_name])
            downreg_data_join = "\t".join(downreg[row_name])

            ofile.write(row_name)
            ofile.write("\t")
            ofile.write(upreg_data_join)
            ofile.write("\t")
            ofile.write(downreg_data_join)
            ofile.write("\n")

            gfile.write("upreg\t")
            gfile.write(row_name)
            gfile.write("\t")
            gfile.write(upreg_data_join)
            gfile.write("\n")
            gfile.write("downreg\t")
            gfile.write(row_name)
            gfile.write("\t")
            gfile.write(downreg_data_join)
            gfile.write("\n")
    finally:
        # close whichever writers were created, also on failure
        if ofile is not None:
            ofile.close()
        if gfile is not None:
            gfile.close()

    return (upreg_count, downreg_count)
Beispiel #4
0
	def __init__(self, obj, dtype=float):
		"""Initialize the underlying FileWriter with obj and remember the value type.

		obj is passed unchanged to FileWriter.__init__; dtype (float by
		default) is stored as self.dtype.
		"""
		# the base initializer runs first, dtype is assigned afterwards
		FileWriter.__init__(self, obj)

		self.dtype = dtype
def write_data_map(dmap, path):
	"""Write dmap as a tab separated file at path.

	The header is "id" followed by one "gain_<field>" and one "loss_<field>"
	column per entry of FIELDS (field names lower-cased, "-" replaced by "_").
	Every entry of dmap becomes one line: the key followed by its values.
	A row holding exactly len(FIELDS) values is padded with one "-" per
	field so that it has as many columns as the header.

	dmap -- mapping from row id to a sequence of already formatted strings
	path -- destination accepted by FileWriter
	"""
	rf = FileWriter(path)
	try:
		hdr = ["id"]
		hdr.extend(["_".join(("gain", f.replace("-", "_").lower())) for f in FIELDS])
		hdr.extend(["_".join(("loss", f.replace("-", "_").lower())) for f in FIELDS])
		rf.write("\t".join(hdr) + "\n")
		for row, values in dmap.iteritems():
			rf.write(row)
			for v in values:
				rf.write("\t")
				rf.write(v)
			if len(values) == len(FIELDS):
				# every padding cell carries its own leading tab; without it
				# the first "-" would be glued to the last value of the row
				rf.write("\t-" * len(FIELDS))
			rf.write("\n")
	finally:
		# close the writer even when a row fails to be written
		rf.close()
Beispiel #6
0
def combination(log, conf, rs, c, data_repo, results_path, conditions):
	"""Combine several result files with gitools-combination and store the
	flattened results at results_path of data_repo.

	Steps, all performed inside a temporary directory that is always removed:
	1. every file of c["files"] is unflattened (tdm.unflatten) and joined
	   into a single data.tdm, pairing files[i] with ids[i]
	2. a columns.gmt file is written with one line per condition listing
	   "<id>_<condition>" for every id
	3. gitools-combination is run through the shell on both files
	4. <cid>-results.tdm.gz is flattened (tdm.flatten) into a local path
	   created by data_repo and handed back with data_repo.put_local

	log -- logger
	conf -- configuration, read for "bin_paths.gitools"
	rs -- object whose from_url(url) returns a (repo, path) pair
	c -- combination description, read for "id", "source/ids" and "files"
	data_repo -- repository receiving the results
	results_path -- path of the results inside data_repo
	conditions -- iterable of condition names

	Raises Exception when gitools-combination exits with a non-zero code.
	A failure while flattening is logged, the local results path is
	released and the error is re-raised.
	"""

	cid = c["id"]
	ids = c["source/ids"]
	files = c["files"]
	results_url = data_repo.url(results_path)

	# created before the try block so that the cleanup below never refers
	# to an unassigned tmp_path when mkdtemp itself fails
	tmp_path = mkdtemp(prefix = "cnv_combination_")

	try:
		# prepare temporary files
		data_file = os.path.join(tmp_path, "data.tdm")
		columns_file = os.path.join(tmp_path, "columns.gmt")
		tmp_file = os.path.join(tmp_path, "tmp.tdm")
		log.debug("Temporary directory: {}".format(tmp_path))

		# join files to combine in a single TDM file
		log.info("Joining files ...")
		outpf = FileWriter(data_file)
		try:
			log.debug("\t{} ...".format(files[0]))
			repo, path = rs.from_url(files[0])
			local_path = repo.get_local(path)
			ref_hdr = tdm.unflatten(local_path, outpf, row_column = "id",
				column_and_attr_func = lambda name: unflatten_filtered_names(name, ids[0]))
			repo.close_local(path)

			for i in xrange(1, len(files)):
				log.debug("\t{} ...".format(files[i]))
				repo, path = rs.from_url(files[i])
				local_path = repo.get_local(path)
				# sid is bound as a default so the function keeps its own id
				# regardless of later iterations
				hdr = tdm.unflatten(local_path, tmp_file, row_column = "id",
					column_and_attr_func = lambda name, sid = ids[i]: unflatten_filtered_names(name, sid))
				tdm.append(outpf, tmp_file, ref_hdr)
				repo.close_local(path)
		finally:
			outpf.close()

		# prepare conditions columns file in GMT format
		outpf = FileWriter(columns_file)
		try:
			for cond in conditions:
				outpf.write(cond)
				outpf.write("\t\t")
				outpf.write("\t".join(["_".join((sid, cond)) for sid in ids]))
				outpf.write("\n")
		finally:
			outpf.close()

		# run gitools-combination with data.tdm
		log.info("Running gitools combination ...")
		log.debug("\tData: {}".format(data_file))
		log.debug("\tColumns: {}".format(columns_file))

		gitools_combination_bin = os.path.join(conf["bin_paths.gitools"], "bin", "gitools-combination")

		# NOTE(review): the command is a single string run through the shell,
		# so paths containing spaces or shell metacharacters would break it
		cmd = " ".join([ gitools_combination_bin,
			"-N", cid, "-w", tmp_path,
			"-d", data_file,
			"-c", columns_file,
			"-pn", P_VALUE_FIELD,
			"-sn n",
			"-p 1", "-debug"])

		log.debug(cmd)

		retcode = subprocess.call(args = cmd, shell = True)

		sys.stdout.write("\n")
		sys.stdout.flush()

		if retcode != 0:
			raise Exception("Combination exit code = {}".format(retcode))

		# flatten results
		log.info("Flattening results into {} ...".format(results_url))

		results_local_path = None
		try:
			results_local_path = data_repo.create_local(results_path)
			tdm.flatten(os.path.join(tmp_path, cid + "-results.tdm.gz"), results_local_path,
				None, ["N", "z-score", "p-value"])

			data_repo.put_local(results_local_path)
		except Exception as ex:
			# release the local path, then report the failure instead of
			# silently returning as if the results had been stored
			log.exception(ex)
			if results_local_path is not None:
				data_repo.close_local(results_local_path)
			raise

	finally:
		shutil.rmtree(tmp_path)
Beispiel #7
0
def enrichment(log, conf, rs, data_repo, results_path, data_file, e, ec,
				filtered_columns, filtered_columns_new_names):
	"""Run gitools-enrichment on the filtered columns of data_file and store
	the flattened results at results_path of data_repo.

	Steps, all performed inside a temporary directory that is always removed:
	1. the columns filtered_columns of data_file are copied (renamed to
	   filtered_columns_new_names) into data.tsv with tsv.filter_columns
	2. when ec has "population.file", the matrix background script is run
	   through the shell and its output replaces data.tsv as the input
	3. gitools-enrichment is run through the shell with the modules file,
	   the test and the optional filter / only-mapped-items options of ec
	4. <eid>-results.tdm.gz is flattened (tdm.flatten) into a local path
	   created by data_repo and handed back with data_repo.put_local

	e is updated in place with "data_file", "modules_file" and, once the
	enrichment is about to run, "results_file".

	log -- logger
	conf -- configuration, read for "bin_paths.python",
		"bin_paths.matrix_background" and "bin_paths.gitools"
	rs -- object whose from_url(url) returns a (repo, path) pair
	data_repo -- repository receiving the results
	results_path -- path of the results inside data_repo
	data_file -- url of the input matrix
	e -- enrichment description, read for "id", "study_id", "platform_id",
		"icdo_topography" and "icdo_morphology"
	ec -- enrichment configuration

	Returns True on success. Returns False when the filtered data has no
	rows or when any step raises; in the latter case the exception is
	logged and not propagated.
	"""

	eid = e["id"]

	key = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"])

	# determine the modules file
	mod_repo, mod_path = rs.from_url(ec["modules_file"])
	mod_local_path = mod_repo.get_local(mod_path)

	# oncodrive data file
	matrix_repo, matrix_path = rs.from_url(data_file)
	matrix_local_path = matrix_repo.get_local(matrix_path)

	e["data_file"] = data_file
	e["modules_file"] = ec["modules_file"]

	results_local_path = None

	tmp_path = mkdtemp(prefix = "enrichment_")
	log.debug("Temporary directory: {}".format(tmp_path))

	valid = True

	try:
		log.info("Filtering pvalue columns from {} ...".format(data_file))

		# filter columns for pvalues
		data_local_path = os.path.join(tmp_path, "data.tsv")

		rf = of = None
		try:
			rf = FileReader(matrix_local_path)
			of = FileWriter(data_local_path)
			row_count = tsv.filter_columns(rf, of,
					filtered_columns, filtered_columns_new_names)
		finally:
			if rf is not None:
				rf.close()
			if of is not None:
				of.close()

		if row_count == 0:
			log.warn("Oncodrive results are empty: {}".format(matrix_path))
			raise EmptyResults

		# apply background if necessary
		if "population.file" in ec:
			pop_url = ec["population.file"]
			pop_missing_value = ec.get("population.missing_value", "-")
			log.info("Applying background from {} with missing value {} ...".format(pop_url, pop_missing_value))
			data2_local_path = os.path.join(tmp_path, "data-filtered.tsv")
			pop_repo, pop_path = rs.from_url(pop_url)
			pop_local_path = pop_repo.get_local(pop_path)
			try:
				cmd = " ".join([
					conf["bin_paths.python"], conf["bin_paths.matrix_background"],
					"--verbose --missing-value", pop_missing_value,
					"-o", data2_local_path,
					data_local_path, pop_local_path ])

				log.debug(cmd)
				retcode = subprocess.call(args = cmd, shell = True)

				if retcode != 0:
					raise Exception("Applying population background for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))
			finally:
				# release the population file also when the command fails
				pop_repo.close_local(pop_local_path)

			data_local_path = data2_local_path

		# enrichment results
		e["results_file"] = data_repo.url(results_path)
		results_local_path = data_repo.create_local(results_path)

		log.info("Running enrichment ...")
		log.debug("\tData file: {}".format(data_local_path))
		log.debug("\tModules file: {}".format(ec["modules_file"]))

		gitools_enrichment_bin = os.path.join(conf["bin_paths.gitools"], "bin", "gitools-enrichment")

		sb = [ gitools_enrichment_bin,
			"-N", eid, "-w", tmp_path, "-p 1",
			"-mf tcm", "-m", mod_local_path,
			"-df cdm", "-d", data_local_path,
			"-t", ec["test"] ]

		if "filter" in ec:
			sb += ["-b", ec["filter"]]

		if ec.get("only_mapped_items", False, dtype=bool):
			sb += ["-only-mapped-items"]

		# NOTE(review): the command is a single string run through the shell,
		# so paths containing spaces or shell metacharacters would break it
		cmd = " ".join(sb)

		log.debug(cmd)

		retcode = subprocess.call(args = cmd, shell = True)

		sys.stdout.write("\n")
		sys.stdout.flush()

		if retcode != 0:
			raise Exception("Enrichment for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

		# flatten results

		log.info("Flattening results into {} ...".format(e["results_file"]))

		# reset both handles so the cleanup below only closes what this
		# step opened, not the already closed handles of the filter step
		rf = of = None
		try:
			gitools_results = os.path.join(tmp_path, eid + "-results.tdm.gz")
			rf = FileReader(gitools_results)
			of = FileWriter(results_local_path)
			tdm.flatten(rf, of,
				{ "column" : str, "row" : str, "N" : int, "observed" : int,
				"expected-mean" : float, "expected-stdev" : float, "probability" : float,
				"right-p-value" : float, "corrected-right-p-value" : float },

				["N", "observed", "expected-mean", "expected-stdev",
				"probability", "right-p-value", "corrected-right-p-value"])
		finally:
			if rf is not None:
				rf.close()
			if of is not None:
				of.close()

		# close local paths
		data_repo.put_local(results_local_path)

	except EmptyResults:
		valid = False

	except Exception as ex:
		log.exception(ex)

		if results_local_path is not None:
			data_repo.close_local(results_local_path)

		valid = False

	finally:
		shutil.rmtree(tmp_path)
		mod_repo.close_local(mod_local_path)
		# the matrix was obtained from matrix_repo, so it is released there
		matrix_repo.close_local(matrix_local_path)

	return valid