def load_data(self, data_paths, method=None):
    """Merge several two-column TSV files into a single matrix.

    Every file contributes one column; every distinct identifier seen in
    any file contributes one row, in order of first appearance.

    Args:
        data_paths: Paths of the files to read. Each one is opened with
            ``tsv.open`` and parsed as (identifier, float value) lines
            after a header, with ``"-"`` as the null value.
        method: Expected computation method, or ``None`` to adopt the
            ``method`` parameter of the first file that declares one.

    Returns:
        A tuple ``(row_names, col_names, data, method)`` where ``data`` is
        a ``len(row_names) x len(col_names)`` array whose cells are NaN
        wherever a file has no value for an identifier.
    """
    row_positions = {}   # identifier -> row index, in order of first sight
    col_names = []
    per_file_cells = []  # one list of (identifier, value) pairs per file

    for data_file in data_paths:
        self.log.debug(" > {0}".format(data_file))

        cells = []
        with tsv.open(data_file, "r") as f:
            # Default column name: file base name minus its last extension.
            col_name = os.path.splitext(os.path.basename(data_file))[0]

            params = tsv.params(f)
            if "slice" in params:
                col_name = params["slice"]

            if "method" in params:
                file_method = params["method"]
                if method is None:
                    method = file_method
                elif method != file_method:
                    self.log.warn("Different method of computation used for file {0}".format(data_file))

            for name, value in tsv.lines(f, (str, float), header=True, null_value="-"):
                if len(name) == 0:
                    self.log.warn("Empty identifier detected")
                    continue
                if name not in row_positions:
                    row_positions[name] = len(row_positions)
                cells.append((name, value))

        col_names.append(col_name)
        per_file_cells.append(cells)

    # Indices were handed out sequentially, so ordering the identifiers by
    # their index reproduces the first-appearance order.
    row_names = sorted(row_positions, key=row_positions.get)

    data = np.full((len(row_positions), len(per_file_cells)), np.nan)
    for col, cells in enumerate(per_file_cells):
        for name, value in cells:
            data[row_positions[name], col] = value

    return row_names, col_names, data, method
def drivers():
    """Populate the drivers database and write the driver-genes dataset.

    Steps, in order:

    1. Open the ``drivers.db`` SigDb located in the results path.
    2. Register every variant (chromosome, start) listed in the global
       variant/gene recurrences file.
    3. Collect the genes whose q-value is below the cutoff in the
       OncodriveFM and OncodriveCLUST per-cancer-site result files,
       recording the cancer sites of each gene through ``add_cancer_site``.
    4. Register every significant gene in the database, flagged by the
       method(s) that selected it.
    5. Write ``gene-driver_cancer_sites.tsv`` to the results path.
    6. Commit the database.

    The database is closed even when one of the steps raises; the commit
    only happens when every step succeeded.
    """
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    db_path = paths.results_path("drivers.db")
    db = SigDb(db_path)
    db.open()
    try:
        log.info("Variants ...")

        path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
        with tsv.open(path, "r") as f:
            types = (str, str, int, str)
            for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
                # Only chromosome and start are stored; strand and allele
                # are read to keep the column layout explicit.
                chrom, _strand, start, _allele = fields[:4]
                db.add_variant(chrom, start)

        log.info("Genes ...")

        gene_sites = {}

        # Cancer site codes that get the stricter OncodriveFM cutoff.
        # Previously ["C18", "C34"]; currently none.
        SPECIAL_THRESHOLD = []

        log.info(" OncodriveFM ...")

        gene_fm = _collect_significant_genes(
            paths.combination_path("oncodrivefm"),
            re.compile(r"gene-cancer_site-(.+)\.tsv\.gz"),
            gene_sites,
            threshold=0.01,
            site_thresholds={code: 1e-6 for code in SPECIAL_THRESHOLD})

        log.info(" OncodriveCLUST ...")

        gene_clust = _collect_significant_genes(
            paths.combination_path("oncodriveclust"),
            re.compile(r"cancer_site-(.+)\.tsv\.gz"),
            gene_sites,
            threshold=0.05)

        log.info(" Updating db ...")

        for gene in gene_fm | gene_clust:
            db.add_gene(gene, gene in gene_fm, gene in gene_clust)

        log.info("Saving driver genes cancer sites dataset ...")

        path = paths.results_path("gene-driver_cancer_sites.tsv")
        log.debug("> {}".format(path))
        with open(path, "w") as f:
            tsv.write_param(f, "date", datetime.now())
            tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
            for gene, sites in gene_sites.items():
                tsv.write_line(f, gene,
                               1 if gene in gene_fm else 0,
                               1 if gene in gene_clust else 0,
                               len(sites),
                               ", ".join(sorted(code for code, _name in sites)),
                               ", ".join(sorted(name for _code, name in sites)))

        db.commit()
    finally:
        # Release the database handle whether or not the steps succeeded.
        db.close()


def _collect_significant_genes(base_path, filename_re, gene_sites, threshold, site_thresholds=None):
    """Return the genes that are significant in any result file of a folder.

    Args:
        base_path: Directory containing the per-cancer-site result files.
        filename_re: Compiled pattern matched against each file name; its
            first group is the cancer site code. Non-matching files are
            skipped.
        gene_sites: Mapping updated in place through ``add_cancer_site``
            for every significant (gene, cancer site) pair.
        threshold: A gene is significant when its QVALUE is strictly
            below this value.
        site_thresholds: Optional mapping of cancer site code to a cutoff
            that replaces ``threshold`` for that site.

    Returns:
        The set of gene identifiers found significant in at least one file.
    """
    site_thresholds = site_thresholds or {}
    significant = set()

    for filename in os.listdir(base_path):
        m = filename_re.match(filename)
        if not m:
            continue

        cancer_site_code = m.group(1)
        cutoff = site_thresholds.get(cancer_site_code, threshold)

        with tsv.open(os.path.join(base_path, filename), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]

            for gene, qvalue in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                if qvalue < cutoff:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    significant.add(gene)

    return significant