def __exit__(self, exc_type, exc_val, exc_tb):
    self.f.close()
    if exc_val is None:
        self.arc.add(self.tmp, arcname=self.arcname)
    remove_temp(self.task, self.tmp)
    return False
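# __exit__ above belongs to a context-manager class (not shown in this excerpt)
# that stages output through a temp file and only adds it to the archive when
# the managed block exits cleanly; the temp file is removed either way, and
# returning False lets any exception propagate. A minimal sketch of the
# presumed surrounding class, with names inferred from the attributes used in
# __exit__ (hypothetical, not the actual definition):
#
#     class ArchiveTempFile(object):
#         def __init__(self, task, arc, arcname):
#             self.task = task        # task that owns the temp file
#             self.arc = arc          # archive to write into, e.g. an open tarfile.TarFile
#             self.arcname = arcname  # entry name inside the archive
#             self.tmp = make_temp_file(task)
#
#         def __enter__(self):
#             self.f = open(self.tmp, "w")
#             return self.f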
def combination_recurrences(projects_set):
    """Combine variant, gene and pathway recurrence counts from the projects of
    a classifier group into a temporary SQLite database and export them as
    compressed TSV files."""

    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("> {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()
        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                          (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                # The variant already exists (seen in another project): reuse its id.
                c.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                          (var.chr, var.strand, start, ref, alt))
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                          (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
            except sqlite3.IntegrityError:
                # Already registered for this gene: accumulate the sample frequency.
                c.execute("UPDATE variant_genes SET sample_freq=sample_freq + ? WHERE var_id=? AND gene_id=?",
                          (rec.sample_freq, var_id, afg.gene_id))

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
                          (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
                          (rec.sample_freq, gene.id))

            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
                          (pathway.id, rec.sample_freq))
            else:
                c.execute("UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
                          (rec.sample_freq, pathway.id))

            count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
                       "SAMPLE_FREQ", "SAMPLE_PROP", "PROT_CHANGES", "XREFS")
        for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(f, r["chr"], strand, r["start"], allele,
                           r["gene_id"], r["impact"], TransFIC.class_name(r["impact"]),
                           r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"],
                           null_value="-")

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
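# create_db() is defined elsewhere in the module. The INSERT/UPDATE statements
# in combination_recurrences() imply a schema roughly like the sketch below:
# the UNIQUE constraint on variants and the composite key on variant_genes are
# what make the sqlite3.IntegrityError fallbacks above work. This is a
# hypothetical helper; column types are assumptions, not the pipeline's actual
# definition.
def _create_db_sketch(conn):
    c = conn.cursor()
    c.execute("""CREATE TABLE variants (
                    var_id INTEGER PRIMARY KEY,
                    chr TEXT, strand TEXT, start INTEGER, ref TEXT, alt TEXT,
                    xrefs TEXT,
                    UNIQUE (chr, strand, start, ref, alt))""")
    c.execute("""CREATE TABLE variant_genes (
                    var_id INTEGER, gene_id TEXT, impact INTEGER,
                    coding_region INTEGER, prot_changes TEXT,
                    sample_freq INTEGER, sample_prop REAL,
                    PRIMARY KEY (var_id, gene_id))""")
    c.execute("CREATE TABLE genes (gene_id TEXT PRIMARY KEY, sample_freq INTEGER, sample_prop REAL)")
    c.execute("CREATE TABLE pathways (pathway_id TEXT PRIMARY KEY, sample_freq INTEGER, sample_prop REAL)")
    conn.commit()
    c.close()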
def combination_oncodrivefm(projects_set):
    """Export per-project OncodriveFM gene p-values and pathway z-scores and
    combine them across projects with the oncodrivefm-combine tool."""

    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Exporting project data ...")

    base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))
    log.debug("> {0}".format(base_path))

    project_ids = []
    gene_files = []
    pathway_files = []
    for project in projects:
        project_id = project["id"]
        project_ids += [project_id]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        log.info("    Genes ...")

        count = 0
        file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
        gene_files += [file_path]
        with open(file_path, "w") as f:
            tsv.write_param(f, "classifier", classifier_id)
            tsv.write_param(f, "group_id", group_name)
            tsv.write_param(f, "slice", project_id)
            tsv.write_line(f, "GENE_ID", "PVALUE")
            for gene in projdb.genes():
                if gene.fm_pvalue is not None:
                    tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
                    count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
        pathway_files += [file_path]
        with open(file_path, "w") as f:
            tsv.write_param(f, "classifier", classifier_id)
            tsv.write_param(f, "group_id", group_name)
            tsv.write_param(f, "slice", project_id)
            tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
            for pathway in projdb.pathways():
                if pathway.fm_zscore is not None:
                    tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
                    count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Combining ...")

    combination_path = paths.combination_path("oncodrivefm")

    log.info("  Genes ...")

    cmd = " ".join([
        "oncodrivefm-combine",
        "-m median-empirical",
        "-o '{0}'".format(combination_path),
        "-n 'gene-{0}'".format(group_file_prefix),
        "-D 'classifier={0}'".format(classifier_id),
        "-D 'group_id={0}'".format(group_name),
        "-D 'group_short_name={0}'".format(group_short_name),
        "-D 'group_long_name={0}'".format(group_long_name),
        "--output-format tsv.gz"
    ] + ["'{0}'".format(name) for name in gene_files])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)
    if ret_code != 0:
        raise Exception("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))

    log.info("  Pathways ...")

    cmd = " ".join([
        "oncodrivefm-combine",
        "-m median-zscore",
        "-o '{0}'".format(combination_path),
        "-n 'pathway-{0}'".format(group_file_prefix),
        "-D 'classifier={0}'".format(classifier_id),
        "-D 'group_id={0}'".format(group_name),
        "-D 'group_short_name={0}'".format(group_short_name),
        "-D 'group_long_name={0}'".format(group_long_name),
        "--output-format tsv.gz"
    ] + ["'{0}'".format(name) for name in pathway_files])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)
    if ret_code != 0:
        raise Exception("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))

    remove_temp(task, base_path)
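# For reference, each assembled command above expands to a single shell line of
# the form (illustrative values, paths abbreviated):
#
#   oncodrivefm-combine -m median-empirical -o '<combination_path>' \
#       -n 'gene-<group_file_prefix>' -D 'classifier=<classifier_id>' \
#       -D 'group_id=<group_name>' -D 'group_short_name=<short>' \
#       -D 'group_long_name=<long>' --output-format tsv.gz \
#       '<base_path>/<project_id>-genes.tsv' ...
#
# The pathway run is identical except for "-m median-zscore" and the
# 'pathway-<group_file_prefix>' analysis name.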
def scan_files(project):
    """Create the variants database for a project: load genes and pathways,
    download the project storage objects and parse the variants they contain."""

    log = task.logger
    conf = task.conf

    config = GlobalConfig(conf)
    paths = PathsConfig(config)

    projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

    project_id = project["id"]
    temp_path = project["temp_path"]
    project_path = project["path"]
    projdb_path = project["db"]
    assembly = project["assembly"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    # hg18 projects go through the liftover step first
    if assembly == "hg18":
        out_port = liftover_projects_port
    elif assembly == "hg19":
        out_port = projects_port
    else:
        raise Exception("Unexpected assembly: {0}".format(assembly))

    if os.path.exists(projdb_path):
        os.remove(projdb_path)

    log.info("Creating variants database ...")

    projdb_tmp_path = make_temp_file(task, suffix=".db")
    log.debug(projdb_tmp_path)

    projdb = ProjectDb(projdb_tmp_path).create()

    data_path = config.data_path

    log.info("Loading genes ...")
    projdb.load_genes(paths.data_ensembl_genes_path())

    log.info("Loading pathways ...")
    projdb.load_pathways(
        paths.data_kegg_def_path(),
        paths.data_kegg_ensg_map_path())

    log.info("Parsing variants ...")

    for obj_name in project["storage_objects"]:
        log.info("Downloading {} ...".format(obj_name))
        dst_path = os.path.join(project_path, "sources", os.path.basename(obj_name))
        dst_dirname = os.path.dirname(dst_path)
        if not os.path.exists(dst_dirname):
            os.makedirs(dst_dirname)
        # TODO: do not copy the source file (do not specify dst_path)
        task.storage.get_object(obj_name).get_data(dst_path)

        for container_name, path, name, ext, f in archived_files(dst_path):
            fname = os.path.join(path, name + ext)
            if container_name is not None:
                source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
            else:
                source_name = name + ext

            log.info("=> {0} ...".format(source_name))

            sample_id = os.path.basename(name)

            if ext.lower() in _SUPPORTED_EXTENSIONS:
                parser_type = ext[1:]
            else:
                parser_type = "tab"

            parser = create_variants_parser(parser_type, f, source_name, sample_id)

            source_id = projdb.add_source(source_name)

            var_ids = set()
            for var in parser:
                for line_num, text in parser.read_lines():
                    projdb.add_source_line(source_id, line_num, text)

                var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
                var_ids.add(var_id)

            for line_num, text in parser.read_lines():
                projdb.add_source_line(source_id, line_num, text)

            num_variants = len(var_ids)
            log.info("  {0} variants".format(num_variants))

            if num_variants == 0:
                raise Exception("No variants found in source '{}'. "
                                "Please check the documentation for the expected input for '{}' format.".format(
                                    source_name, parser.name))

    projdb.commit()
    projdb.close()

    log.info("Copying variants database ...")
    log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

    shutil.copy(projdb_tmp_path, projdb_path)

    remove_temp(task, projdb_tmp_path)

    out_port.send(project)
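# Note on the parsing loop in scan_files(): parser.read_lines() appears to
# yield the raw input lines consumed since the last variant was produced, so
# source lines are archived in projdb both between variants and, in the
# trailing loop, after the last variant. _SUPPORTED_EXTENSIONS and
# create_variants_parser() are defined elsewhere in the module; any extension
# outside that set falls back to the generic "tab" parser.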
def liftover(project):
    """Lift variant coordinates of an hg18 project over to hg19 with the UCSC
    liftOver tool and update the project variants database accordingly."""

    log = task.logger
    conf = task.conf

    config = GlobalConfig(conf)

    lifted_project_port = task.ports("lifted_projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    log.info("Preparing liftOver files ...")

    in_path = make_temp_file(task, suffix=".bed")
    in_file = open(in_path, "w")
    out_path = make_temp_file(task, suffix=".bed")
    unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

    projdb = ProjectDb(project["db"])

    for var in projdb.variants(order_by="position"):
        in_file.write(tsv.line_text("chr" + var.chr, var.start, var.start + len(var.ref), var.id))

    in_file.close()

    log.info("Running liftOver ...")

    project["from_assembly"] = project["assembly"]
    project["assembly"] = "hg19"

    cmd = " ".join([
        conf["liftover_bin"],
        in_path,
        os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
        out_path,
        unmapped_path])

    log.debug(cmd)

    subprocess.call(cmd, shell=True)

    log.info("Annotating unmapped variants ...")

    count = 0
    with open(unmapped_path, "r") as f:
        for line in f:
            if line.lstrip().startswith("#"):
                continue
            fields = line.rstrip().split("\t")
            var_id = int(fields[3])
            projdb.update_variant_start(var_id, start=None)
            count += 1

    log.info("  {0} unmapped variants annotated".format(count))

    log.info("Updating variants ...")

    count = 0
    with open(out_path, "r") as f:
        for line in f:
            fields = line.rstrip().split("\t")
            chr, start, end, var_id = fields
            projdb.update_variant_start(var_id, start=start)
            count += 1

    log.info("  {0} variants".format(count))

    remove_temp(task, in_path, out_path)

    projdb.commit()
    projdb.close()

    lifted_project_port.send(project)
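# The BED file handed to liftOver carries one line per variant, with the
# internal variant id in the fourth column so mapped coordinates can be written
# back through projdb.update_variant_start(). An illustrative line:
#
#   chr17   7517830   7517831   42
#
# Unmapped variants come back in liftover_unmapped.bed with the same layout,
# interleaved with "#" comment lines (which the loop above skips) stating why
# each entry failed to map.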