def map_from_file(path):
    """Read a two-column TSV file into a dict mapping column 1 to column 2.

    Parameters:
        path: location of the tab-separated file to read.

    Returns:
        dict mapping the first field of each line to the second.

    Raises:
        Exception: if any line does not have exactly two columns.
    """
    # avoid shadowing the builtin `map` (original used it as the dict name)
    mapping = {}
    f = FileReader(path)
    try:
        for line in f:
            fields = line.rstrip().split("\t")
            if len(fields) != 2:
                raise Exception("Unexpected number of columns: ({0})".format(", ".join(fields)))
            mapping[fields[0]] = fields[1]
    finally:
        # ensure the reader is released even when a malformed line aborts the loop
        # (the original leaked the handle on that path)
        f.close()
    return mapping
def read_header(self):
    """Lazily read and cache the matrix header from the first line.

    Returns:
        MatrixHeader built from the tab-separated fields of the first
        line; the cached instance is returned on subsequent calls.
    """
    # use identity comparison with None instead of `== None`
    if self.header is None:
        # consume exactly one line from the underlying reader
        line = FileReader.readline(self).rstrip()
        hdr = line.split("\t")
        self.header = MatrixHeader(hdr)
    return self.header
def __iter__(self):
    """Yield a MatrixRow per data line, parsing values with the configured dtype."""
    parse = self._parser_from_type(self.dtype)
    # make sure the header line has been consumed before reading data rows
    self.read_header()
    for raw in FileReader.__iter__(self):
        cols = raw.rstrip().split("\t")
        yield MatrixRow(cols[0], [parse(cell) for cell in cols[1:]])
def read_data_map(log, gain_results, loss_results):
    """Read gain and loss result files and join them by row key.

    Parameters:
        log: logger used for progress messages.
        gain_results: path of the gain results file.
        loss_results: path of the loss results file.

    Returns:
        dict mapping each row key to the concatenated gain + loss fields;
        rows present only in the loss file get "-" placeholders for the
        missing gain fields.
    """
    dmap = {}

    log.debug("Reading gain data from {} ...".format(gain_results))
    # read gain data
    uf = FileReader(gain_results)
    try:
        hdr = read_header(uf)
        count = 0
        for line in uf:
            k, d = read_data(line, hdr, "row", FIELDS)
            dmap[k] = d
            count += 1
    finally:
        uf.close()
    log.debug("Total gain rows = {0}".format(count))

    log.debug("Reading loss data from {0} ...".format(loss_results))
    # read loss data and join with gain
    df = FileReader(loss_results)
    try:
        hdr = read_header(df)
        count = 0
        for line in df:
            k, d = read_data(line, hdr, "row", FIELDS)
            if k not in dmap:
                # row only present in the loss file: pad the gain columns
                data = ["-"] * len(FIELDS)
            else:
                data = dmap[k]
            data += d
            dmap[k] = data
            count += 1
    finally:
        # the original never closed this reader; release it like the gain reader
        df.close()
    log.debug("Total loss rows = {0}".format(count))

    # NOTE(review): rows that appear only in the gain file keep just their
    # gain fields (no loss padding) — preserved from the original behavior.
    return dmap
def read_data_map(log, upreg_results, downreg_results):
    """Read upreg and downreg result files and join them by row key.

    Parameters:
        log: logger used for progress messages.
        upreg_results: path of the upregulation results file.
        downreg_results: path of the downregulation results file.

    Returns:
        dict mapping each row key to the concatenated upreg + downreg
        fields; rows present only in the downreg file get "-" placeholders
        for the missing upreg fields.
    """
    dmap = {}

    log.debug("Reading upreg data from {0} ...".format(upreg_results))
    # read upreg data
    uf = FileReader(upreg_results)
    try:
        hdr = read_header(uf)
        count = 0
        for line in uf:
            k, d = read_data(line, hdr, "row", FIELDS)
            dmap[k] = d
            count += 1
    finally:
        uf.close()
    log.debug("Total upreg rows = {0}".format(count))

    log.debug("Reading downreg data from {0} ...".format(downreg_results))
    # read downreg data and join with upreg
    df = FileReader(downreg_results)
    try:
        hdr = read_header(df)
        count = 0
        for line in df:
            k, d = read_data(line, hdr, "row", FIELDS)
            if k not in dmap:
                # row only present in the downreg file: pad the upreg columns
                data = ["-"] * len(FIELDS)
            else:
                data = dmap[k]
            data += d
            dmap[k] = data
            count += 1
    finally:
        # the original never closed this reader; release it like the upreg reader
        df.close()
    log.debug("Total downreg rows = {0}".format(count))

    # NOTE(review): rows that appear only in the upreg file keep just their
    # upreg fields (no downreg padding) — preserved from the original behavior.
    return dmap
def merge(log, input, output, gitools_output):
    """Merge repeated rows by the lowest pvalue, in case the pvalue is the
    same take the one with greater n.

    Parameters:
        log: logger for progress messages.
        input: path of the combined upreg/downreg results file.
        output: path of the merged flat output file.
        gitools_output: path of the gitools-formatted (column/row) output file.

    Returns:
        (upreg_count, downreg_count): number of repeated rows merged on
        each side, as reported by merge_data.
    """
    # columns [1:mid_index) hold the upreg data, [mid_index:] the downreg data
    mid_index = 8

    upreg = {}
    downreg = {}
    upreg_count = 0
    downreg_count = 0

    f = FileReader(input)
    try:
        hdr = f.readline().rstrip().split("\t")
        for line in f:
            line = line.rstrip()
            if len(line) == 0:
                continue
            fields = line.split("\t")
            row_name = fields[0]
            upreg_count += merge_data(row_name, fields[1:mid_index], upreg)
            downreg_count += merge_data(row_name, fields[mid_index:], downreg)
    finally:
        # release the reader even if a malformed line raises (original leaked it)
        f.close()

    upreg_keys = upreg.keys()
    downreg_keys = downreg.keys()
    log.debug("Total rows: upreg = {}, downreg = {}".format(len(upreg_keys), len(downreg_keys)))
    log.debug("Merged rows: upreg = {}, downreg = {}".format(upreg_count, downreg_count))

    ofile = FileWriter(output)
    gfile = None
    try:
        ofile.write("\t".join(hdr))
        ofile.write("\n")

        gfile = FileWriter(gitools_output)
        gfile.write("column\trow\t")
        # strip the "upreg_" prefix to obtain the shared metric column names
        gfile.write("\t".join([x[6:] for x in hdr if x.startswith("upreg_")]))
        gfile.write("\n")

        for row_name in upreg_keys:
            upreg_data = upreg[row_name]
            upreg_data_join = "\t".join(upreg_data)
            # NOTE(review): assumes every upreg row also appears in downreg;
            # a row missing on the downreg side raises KeyError here
            # (preserved from the original behavior).
            downreg_data = downreg[row_name]
            downreg_data_join = "\t".join(downreg_data)

            ofile.write(row_name)
            ofile.write("\t")
            ofile.write(upreg_data_join)
            ofile.write("\t")
            ofile.write(downreg_data_join)
            ofile.write("\n")

            gfile.write("upreg\t")
            gfile.write(row_name)
            gfile.write("\t")
            gfile.write(upreg_data_join)
            gfile.write("\n")

            gfile.write("downreg\t")
            gfile.write(row_name)
            gfile.write("\t")
            gfile.write(downreg_data_join)
            gfile.write("\n")
    finally:
        # close both writers even on error (original leaked them on exception)
        ofile.close()
        if gfile is not None:
            gfile.close()

    return (upreg_count, downreg_count)
def __init__(self, obj, dtype=float):
    """Initialize the matrix reader.

    Parameters:
        obj: argument forwarded to FileReader.__init__ (source to read from).
        dtype: type used to parse matrix cell values (default: float).
    """
    FileReader.__init__(self, obj)
    # header is populated lazily by read_header()
    self.header = None
    self.dtype = dtype
def main():
    """Map CNV event regions to Ensembl genes and join them per tumour unit.

    For each CNV events tumour unit id read from ``evt_tunit_port``:
    intersect every event's data file with the configured Ensembl gene
    mapping via BED tools ``intersectBed``, accumulate per (event, gene)
    values, then write a genes-by-events matrix (rows taken from the
    configured background gene list) into the data repository and forward
    the unit id through ``joined_evt_tunit_port``.
    """
    # Initialization
    task.check_conf(["entities", "repositories", "cnv.background.ensg", "cnv.mapping.ensg", "bin_paths.bed_tools"])
    conf = task.conf
    log = task.logger()
    evt_tunit_port, joined_evt_tunit_port = \
        task.ports("evt_tumour_unit_ids", "joined_evt_tumour_unit_ids")
    es = EntityServer(conf["entities"])
    em = es.manager()
    rs = RepositoryServer(conf["repositories"])
    data_repo = rs.repository("data")
    overwrite = conf.get("overwrite", False, dtype=bool)

    # Run
    mapping_file = conf["cnv.mapping.ensg"]
    log.info("UCSC Ensembl gene regions at {} ...".format(mapping_file))
    mapping_repo, mapping_path = rs.from_url(mapping_file)
    mapping_local_path = mapping_repo.get_local(mapping_path)

    background_file = conf["cnv.background.ensg"]
    log.info("Loading background from {} ...".format(background_file))
    # background: set of gene ids that defines the output row space
    background = set()
    repo, path = rs.from_url(background_file)
    reader = repo.open_reader(path)
    for line in reader:
        line = line.rstrip()
        if len(line) == 0:
            continue
        background.add(line)
    reader.close()
    repo.close()

    for uid in evt_tunit_port:
        u = em.find(uid, types.CNV_EVENTS_TUMOUR_UNIT)
        if u is None:
            log.error("{} not found: {}".format(types.CNV_EVENTS_TUMOUR_UNIT, uid))
            continue

        key = (u["study_id"], u["platform_id"], u["icdo_topography"], u.get("icdo_morphology", ""))

        tunit_base_path = types.CNV_EVENTS_TUMOUR_UNIT.replace(".", "/")
        tunit_path = rpath.join(tunit_base_path, uid + ".tsv.gz")
        # skip units whose data file already exists unless overwrite is set
        if skip_file(overwrite, data_repo, tunit_path, u.get("data_file")):
            log.warn("Skipping ({}) [{}] as data file already exists".format(", ".join(key), uid))
            joined_evt_tunit_port.write(uid)
            continue

        log.info("Mapping and joining {} ({}) [{}] ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key), uid))

        cnv_evt_ids = u["cnv_evt_ids"]
        log.debug("{} {}".format(len(cnv_evt_ids), types.CNV_EVENTS))

        # data: (event id, gene name) -> bitwise-OR of observed values (1 and 2)
        data = {}

        tmp_path = mkdtemp(prefix = "evt_map_and_join_")
        log.debug("Temporary directory: {}".format(tmp_path))
        try:
            for eid in cnv_evt_ids:
                e = em.find(eid, types.CNV_EVENTS)
                if e is None:
                    log.error("{} not found: {}".format(types.CNV_EVENTS, eid))
                    continue

                data_file = e["data_file"]
                log.debug("{} ...".format(data_file))
                repo, path = rs.from_url(data_file)
                local_path = repo.get_local(path)

                # Fix wrong bed files generated by gunes (end should be 1 indexed instead of 0 indexed)
                # tmp_file = os.path.join(tmp_path, "".join([eid, "-fixed-bed.tsv"]))
                # writer = FileWriter(tmp_file)
                # reader = repo.open_reader(path)
                # for line in reader:
                #     if line.lstrip().startswith("#"):
                #         continue
                #     fields = line.rstrip().split("\t")
                #     end = int(fields[2]) + 0 # FIXME fix not necessary already
                #     fields[2] = str(end)
                #     writer.write("\t".join(fields))
                #     writer.write("\n")
                # writer.close()
                # reader.close()

                # Run BED tools to intersect event regions with gene names
                tmp_file2 = os.path.join(tmp_path, "".join([eid, "-intersect.tsv"]))
                cmd = " ".join([
                    os.path.join(conf["bin_paths.bed_tools"], "bin", "intersectBed"),
                    "-a", mapping_local_path,
                    #"-b", tmp_file,
                    "-b", local_path,
                    "-s -wb",
                    ">{}".format(tmp_file2)])
                log.debug(cmd)
                # shell=True is required here: the command uses shell redirection (>)
                retcode = subprocess.call(args = cmd, shell = True)
                if retcode != 0:
                    raise Exception("BED tools intersect for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))
                repo.close_local(local_path)

                # Read BED tools results and load event data into memory
                reader = FileReader(tmp_file2)
                # column 3 of the intersect output holds the gene name,
                # column 12 the event value — NOTE(review): assumes the
                # mapping BED has 9 columns before the -wb columns; confirm
                name_index = 3
                value_index = 12
                line_num = 1
                for line in reader:
                    try:
                        fields = line.rstrip().split("\t")
                        name = fields[name_index]
                        value = int(fields[value_index])
                        if value not in [1, 2]:
                            log.error("Unexpected value {} at line {} of data file {}".format(value, line_num, data_file))
                            continue
                    except:
                        # NOTE(review): bare except also hides unrelated errors;
                        # intended to skip unparsable lines only
                        log.error("Error parsing line {} of data file {}".format(line_num, data_file))
                        continue
                    k = (eid, name)
                    if k in data:
                        prev_value = data[k]
                    else:
                        prev_value = 0
                    # repeated observations for the same (event, gene) are
                    # combined with bitwise OR, so 1 and 2 act as flags
                    data[k] = prev_value | value
                    # NOTE(review): line_num is not incremented for skipped
                    # lines, so reported line numbers can lag the real ones
                    line_num += 1
                reader.close()
                repo.close()
        finally:
            # always remove the scratch directory, even on failure
            if os.path.exists(tmp_path):
                log.debug("Removing temporary directory {} ...".format(tmp_path))
                shutil.rmtree(tmp_path)

        # Write events data to data file and merge with background labels
        log.info("Writing data to {} ...".format(tunit_path))
        u["data_file"] = data_repo.url(tunit_path)
        #TODO u["data_timestamp"] = ...
        writer = data_repo.open_writer(tunit_path)
        # header: one tab-prefixed column per event id
        for name in cnv_evt_ids:
            writer.write("\t")
            writer.write(name)
        writer.write("\n")
        # data: one row per background gene, 0 where no event covered it
        for row_name in sorted(background):
            writer.write(row_name)
            for col_name in cnv_evt_ids:
                k = (col_name, row_name)
                if k in data:
                    value = data[k]
                else:
                    value = 0
                writer.write("\t")
                writer.write(str(value))
            writer.write("\n")
        writer.close()

        log.info("Writting {} ({}) ...".format(types.CNV_EVENTS_TUMOUR_UNIT, ", ".join(key)))
        em.persist(u, types.CNV_EVENTS_TUMOUR_UNIT)
        joined_evt_tunit_port.write(uid)

    em.close()
    es.close()

    mapping_repo.close_local(mapping_local_path)
    mapping_repo.close()
    data_repo.close()
    rs.close()
def enrichment(log, conf, rs, data_repo, results_path, data_file, e, ec, filtered_columns, filtered_columns_new_names):
    """Run one gitools enrichment analysis over an oncodrive result matrix.

    Steps: filter the pvalue columns out of *data_file*, optionally apply a
    population background, run the gitools-enrichment binary against the
    configured modules file, flatten its TDM results, and store them at
    *results_path* inside *data_repo*.

    Parameters:
        log: logger for progress messages.
        conf: configuration object providing bin paths.
        rs: repository server used to resolve file URLs.
        data_repo: repository where flattened results are stored.
        results_path: repository path for the flattened results file.
        data_file: URL of the oncodrive results matrix.
        e: entity dict of the analysis; mutated with data/modules/results file URLs.
        ec: enrichment configuration (modules_file, test, optional filter,
            population.file, population.missing_value, only_mapped_items).
        filtered_columns: names of the pvalue columns to keep.
        filtered_columns_new_names: replacement names for those columns.

    Returns:
        True when the enrichment completed and results were stored,
        False when results were empty or any step failed.
    """
    eid = e["id"]
    key = (e["study_id"], e["platform_id"], e["icdo_topography"], e["icdo_morphology"])

    # determine the modules file
    mod_repo, mod_path = rs.from_url(ec["modules_file"])
    mod_local_path = mod_repo.get_local(mod_path)

    # oncodrive data file
    matrix_repo, matrix_path = rs.from_url(data_file)
    matrix_local_path = matrix_repo.get_local(matrix_path)

    e["data_file"] = data_file
    e["modules_file"] = ec["modules_file"]

    results_local_path = None

    tmp_path = mkdtemp(prefix = "enrichment_")
    log.debug("Temporary directory: {}".format(tmp_path))

    valid = True
    try:
        log.info("Filtering pvalue columns from {} ...".format(data_file))

        # filter columns for pvalues
        data_local_path = os.path.join(tmp_path, "data.tsv")
        rf = of = None
        try:
            rf = FileReader(matrix_local_path)
            of = FileWriter(data_local_path)
            row_count = tsv.filter_columns(rf, of, filtered_columns, filtered_columns_new_names)
        finally:
            if rf is not None:
                rf.close()
            if of is not None:
                of.close()

        if row_count == 0:
            log.warn("Oncodrive results are empty: {}".format(matrix_path))
            # EmptyResults is caught below and turns into valid = False
            raise EmptyResults

        # apply background if necessary
        if "population.file" in ec:
            pop_url = ec["population.file"]
            pop_missing_value = ec.get("population.missing_value", "-")
            log.info("Applying background from {} with missing value {} ...".format(pop_url, pop_missing_value))
            data2_local_path = os.path.join(tmp_path, "data-filtered.tsv")
            pop_repo, pop_path = rs.from_url(pop_url)
            pop_local_path = pop_repo.get_local(pop_path)
            cmd = " ".join([
                conf["bin_paths.python"], conf["bin_paths.matrix_background"],
                "--verbose --missing-value", pop_missing_value,
                "-o", data2_local_path,
                data_local_path, pop_local_path ])
            log.debug(cmd)
            retcode = subprocess.call(args = cmd, shell = True)
            if retcode != 0:
                raise Exception("Applying population background for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))
            pop_repo.close_local(pop_local_path)
            # continue the pipeline with the background-applied matrix
            data_local_path = data2_local_path

        # enrichment results
        e["results_file"] = data_repo.url(results_path)
        results_local_path = data_repo.create_local(results_path)

        log.info("Running enrichment ...")
        log.debug("\tData file: {}".format(data_local_path))
        log.debug("\tModules file: {}".format(ec["modules_file"]))

        gitools_enrichment_bin = os.path.join(conf["bin_paths.gitools"], "bin", "gitools-enrichment")
        sb = [ gitools_enrichment_bin,
            "-N", eid, "-w", tmp_path, "-p 1",
            "-mf tcm", "-m", mod_local_path,
            "-df cdm", "-d", data_local_path,
            "-t", ec["test"] ]

        if "filter" in ec:
            sb += ["-b", ec["filter"]]

        if ec.get("only_mapped_items", False, dtype=bool):
            sb += ["-only-mapped-items"]

        #if "population" in ec:
        #    pop_repo, pop_path = rs.from_url(ec["population"])
        #    pop_local_path = pop_repo.get_local(pop_path)
        #    sb += ["-P", pop_local_path]

        cmd = " ".join(sb)
        log.debug(cmd)
        retcode = subprocess.call(args = cmd, shell = True)
        sys.stdout.write("\n")
        sys.stdout.flush()

        if retcode != 0:
            raise Exception("Enrichment for ({}) [{}] failed with code {}".format(", ".join(key), eid, retcode))

        # flatten results
        log.info("Flattening results into {} ...".format(e["results_file"]))
        try:
            # NOTE(review): rf/of still hold the (closed) handles from the
            # column-filter step here; if FileReader raises, the finally
            # below re-closes those stale handles instead of new ones.
            gitools_results = os.path.join(tmp_path, eid + "-results.tdm.gz")
            rf = FileReader(gitools_results)
            of = FileWriter(results_local_path)
            tdm.flatten(rf, of, {
                "column" : str, "row" : str, "N" : int, "observed" : int,
                "expected-mean" : float, "expected-stdev" : float, "probability" : float,
                "right-p-value" : float, "corrected-right-p-value" : float },
                ["N", "observed", "expected-mean", "expected-stdev", "probability",
                    "right-p-value", "corrected-right-p-value"])
        finally:
            if rf is not None:
                rf.close()
            if of is not None:
                of.close()

        # close local paths
        data_repo.put_local(results_local_path)

    except EmptyResults:
        valid = False
    except Exception as ex:
        log.exception(ex)
        if results_local_path is not None:
            data_repo.close_local(results_local_path)
        valid = False
    finally:
        # always remove the scratch directory and release local copies
        shutil.rmtree(tmp_path)
        mod_repo.close_local(mod_local_path)
        # NOTE(review): matrix_local_path was obtained from matrix_repo, not
        # data_repo — verify this is the intended repository to close it on
        data_repo.close_local(matrix_local_path)
        #if "population" in ec:
        #    pop_repo.close_local(pop_local_path)

    return valid