# Reconstructed from flattened source. Standard-library imports used by the
# tasks below; project-local names (task, GlobalConfig, PathsConfig, ProjectDb,
# tsv, TransFIC, so, SigDb, ...) are provided by the surrounding pipeline
# package and are not redeclared here.
import os
import re
import shutil
import sqlite3
import subprocess
from datetime import datetime
from string import Template


def create(project):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]
    temp_path = project["temp_path"]

    templates_path = config.website.templates_path
    if templates_path is None:
        log.warn("No website templates have been defined in the configuration. Skipping website creation.")
        return

    log.info("Creating website ...")
    website_path = paths.project_website_path(project_path)
    if os.path.exists(website_path):
        shutil.rmtree(website_path)

    log.info("Copying templates ...")
    shutil.copytree(templates_path, website_path)

    gitignore_path = os.path.join(website_path, ".gitignore")
    try:
        os.remove(gitignore_path)
    except OSError:  # was a bare except; only a missing/unremovable file is expected here
        pass

    log.info("Expanding templates ...")
    vars = dict(
        PROJECT_NAME=project_id,
        SHOW_ALL_TABS=not config.variants_only)

    tmpl_paths = [
        os.path.join(website_path, "css", "header.html"),
        os.path.join(website_path, "onexus-project.onx")
    ]

    for path in tmpl_paths:
        with open(path, "r") as f:
            t = Template(f.read())
        with open(path, "w") as f:
            f.write(t.safe_substitute(vars))

    # Create a soft link to the results folder
    project_results_path = paths.project_results_path(project_path)
    os.symlink(project_results_path, os.path.join(website_path, "results"))

    # Send the project to the next modules
    projects_port = task.ports("projects_out")
    project["website"] = website_path
    projects_port.send(project)
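# A minimal, self-contained sketch of what the template expansion above does:
# string.Template replaces ${NAME} placeholders, and safe_substitute() leaves
# unknown placeholders untouched instead of raising KeyError. The placeholder
# names below are illustrative, not taken from the real template files.
def _template_expansion_demo():
    t = Template("project: ${PROJECT_NAME}, tabs: ${SHOW_ALL_TABS}, keep: ${UNKNOWN}")
    return t.safe_substitute(PROJECT_NAME="demo", SHOW_ALL_TABS=True)
    # -> 'project: demo, tabs: True, keep: ${UNKNOWN}'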
def scan_projects(project_out):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    projects_path = paths.projects_path()

    log.info("Scanning projects ...")

    count = 0
    sent_projects = []
    for path, project in list_projects(log, projects_path):
        project_path = os.path.dirname(path)

        if "id" not in project:
            log.warn("Discarding project that doesn't have 'id': {0}".format(path))
            continue

        project_id = project["id"]

        if "name" in project:
            log.info("--- [{0}: {1}] ---------------------------------".format(project_id, project["name"]))
        else:
            log.info("--- [{0}] ---------------------------------".format(project_id))

        if "db" not in project:
            project["db"] = os.path.join(project_path, "project.db.gz")
        elif not os.path.isabs(project["db"]):
            project["db"] = os.path.join(project_path, project["db"])

        if not os.path.exists(project["db"]):
            log.error("Project database not found at {0}".format(os.path.relpath(project["db"], project_path)))
            continue

        if project["db"].endswith(".gz"):
            log.info("Uncompressing project database ...")
            retcode = subprocess.call("gunzip -fc {0} >{1}".format(project["db"], project["db"][:-3]), shell=True)
            if retcode != 0:
                log.error("Unexpected error while uncompressing the project database at {0}".format(
                    os.path.relpath(project["db"], projects_path)))
                continue
            project["db"] = project["db"][:-3]

        temp_path = paths.project_temp_path(project_path)
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)

        project["path"] = project_path
        project["temp_path"] = temp_path

        sent_projects += [project_id]
        project_out.send(project)
        count += 1

    log.info("Found {0} projects:\n - {1}".format(count, "\n - ".join(sent_projects)))
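# The "gunzip -fc src > dst" shell call above could also be done with the
# stdlib gzip module, which avoids shell quoting issues with unusual paths.
# A minimal sketch (not the pipeline's actual code):
def _gunzip_to(src_path, dst_path):
    """Decompress the .gz file at src_path into dst_path, overwriting it."""
    import gzip
    with gzip.open(src_path, "rb") as src, open(dst_path, "wb") as dst:
        shutil.copyfileobj(src, dst)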
def datasets(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    group_file_prefix = normalize_id(classifier_id)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Reading number of samples per project ...")

    project_ids = []
    total_samples = 0
    for project in projects:
        project_id = project["id"]
        project_ids += [project_id]

        log.info("  Project {0}".format(project["id"]))

        projdb = ProjectDb(project["db"])
        num_samples = projdb.get_total_affected_samples()
        total_samples += num_samples
        log.debug("    {0} samples".format(num_samples))
        projdb.close()

    log.debug("  {0} samples in total".format(total_samples))

    log.info("Updating ...")

    combination_path = paths.combination_path()

    path = os.path.join(combination_path, "{0}.tsv".format(group_file_prefix))

    # Write the header only when the file is first created
    if not os.path.exists(path):
        with open(path, "w") as f:
            tsv.write_line(f, "NAME", "SHORT_NAME", "LONG_NAME", "SAMPLES_TOTAL", "PROJECT_IDS")

    with open(path, "a") as f:
        tsv.write_line(f, group_name, group_short_name, group_long_name, total_samples, ",".join(project_ids))
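# The header-once-then-append pattern above, as a self-contained sketch using
# plain file I/O (the real code goes through the project's tsv helpers, which
# are assumed here to write one tab-separated line per call):
def _append_tsv_row(path, header, row):
    new_file = not os.path.exists(path)
    with open(path, "a") as f:
        if new_file:
            f.write("\t".join(header) + "\n")
        f.write("\t".join(str(v) for v in row) + "\n")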
def oncoclust(project):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    gene_transcripts_path = paths.data_ensembl_gene_transcripts_path()

    oclust = project["oncodriveclust"]

    data_paths = oclust["data_paths"]
    samples_threshold = oclust["samples_threshold"]

    oclust["results"] = os.path.join(project["temp_path"], "oncodriveclust-results.tsv")

    cmd = " ".join([
        "oncodriveclust",
        "-c",
        "-m", str(samples_threshold),
        "-o", oclust["results"],
        data_paths[0],
        data_paths[1],
        gene_transcripts_path
    ])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)

    if ret_code == 1:
        log.warn("No results were generated")
    elif ret_code != 0:
        log.error("Error while executing OncodriveCLUST:\n{0}".format(cmd))

    projects_out_port.send(project)
def finalize_all():
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    if config.results.use_storage:
        log.info("Uploading combination files ...")

        combination_path = paths.combination_path()

        exclude = ["sources/*", "project.db", "project.db.gz"]

        object_prefix = "results/combination"
        start_callback = lambda path: log.info("  {}".format(path))

        task.storage.upload(combination_path, object_prefix=object_prefix, exclude=exclude,
                            overwrite=True, start_callback=start_callback)

        if config.results.purge_after_upload:
            log.info("Purging combination files ...")
            shutil.rmtree(combination_path)
def combination_recurrences(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("  > {0}".format(db_path))

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    create_db(conn)

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()

        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
                continue

            start, end, ref, alt = var_to_tab(var)

            try:
                c.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                          (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                c.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                          (var.chr, var.strand, start, ref, alt))
                r = c.fetchone()
                var_id = r[0]

            try:
                c.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                          (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
            except sqlite3.IntegrityError:
                c.execute("""
                    UPDATE variant_genes
                    SET sample_freq=sample_freq + ?
                    WHERE var_id=? AND gene_id=?""",
                          (rec.sample_freq, var_id, afg.gene_id))

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
                          (gene.id, rec.sample_freq))
            else:
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
                          (rec.sample_freq, gene.id))
            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            if rec.sample_freq is None:
                continue

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
                          (pathway.id, rec.sample_freq))
            else:
                c.execute("UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
                          (rec.sample_freq, pathway.id))
            count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))

    c.close()
    conn.commit()

    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE", "GENE_ID",
                       "IMPACT", "IMPACT_CLASS", "SAMPLE_FREQ", "SAMPLE_PROP",
                       "PROT_CHANGES", "XREFS")
        for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)
            tsv.write_line(f, r["chr"], strand, r["start"], allele, r["gene_id"],
                           r["impact"], TransFIC.class_name(r["impact"]),
                           r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"],
                           null_value="-")

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    conn.close()

    remove_temp(task, db_path)
def scan_files(project):
    log = task.logger
    conf = task.conf

    config = GlobalConfig(conf)
    paths = PathsConfig(config)

    projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

    project_id = project["id"]
    temp_path = project["temp_path"]
    project_path = project["path"]
    projdb_path = project["db"]
    assembly = project["assembly"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    if assembly == "hg18":
        out_port = liftover_projects_port
    elif assembly == "hg19":
        out_port = projects_port
    else:
        raise Exception("Unexpected assembly: {0}".format(assembly))

    #if os.path.exists(projdb_path):
    #    log.warn("Variations database already created, skipping this step.")
    #    out_port.send(project)
    #    return

    if os.path.exists(projdb_path):
        os.remove(projdb_path)

    log.info("Creating variants database ...")

    projdb_tmp_path = make_temp_file(task, suffix=".db")
    log.debug(projdb_tmp_path)

    projdb = ProjectDb(projdb_tmp_path).create()

    data_path = config.data_path

    log.info("Loading genes ...")
    projdb.load_genes(paths.data_ensembl_genes_path())

    log.info("Loading pathways ...")
    projdb.load_pathways(
        paths.data_kegg_def_path(),
        paths.data_kegg_ensg_map_path())

    log.info("Parsing variants ...")

    for obj_name in project["storage_objects"]:
        log.info("Downloading {} ...".format(obj_name))
        dst_path = os.path.join(project_path, "sources", os.path.basename(obj_name))
        dst_dirname = os.path.dirname(dst_path)
        if not os.path.exists(dst_dirname):
            os.makedirs(dst_dirname)
        # TODO: do not copy the source file (do not specify dst_path)
        task.storage.get_object(obj_name).get_data(dst_path)

        for container_name, path, name, ext, f in archived_files(dst_path):
            fname = os.path.join(path, name + ext)

            if container_name is not None:
                source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
            else:
                source_name = name + ext

            log.info("=> {0} ...".format(source_name))

            sample_id = os.path.basename(name)

            if ext.lower() in _SUPPORTED_EXTENSIONS:
                parser_type = ext[1:]
            else:
                parser_type = "tab"

            parser = create_variants_parser(parser_type, f, source_name, sample_id)

            source_id = projdb.add_source(source_name)

            var_ids = set()
            for var in parser:
                for line_num, text in parser.read_lines():
                    projdb.add_source_line(source_id, line_num, text)

                var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
                var_ids.add(var_id)

            for line_num, text in parser.read_lines():
                projdb.add_source_line(source_id, line_num, text)

            num_variants = len(var_ids)
            log.info("   {0} variants".format(num_variants))

            if num_variants == 0:
                raise Exception("No variants found in source '{}'. "
                                "Please check the documentation for the expected input for '{}' format.".format(
                                    source_name, parser.name))

    projdb.commit()
    projdb.close()

    log.info("Copying variants database ...")
    log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

    shutil.copy(projdb_tmp_path, projdb_path)

    remove_temp(task, projdb_tmp_path)

    out_port.send(project)
def create_datasets(project):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))

    project_path = project["path"]
    temp_path = project["temp_path"]

    datasets_path = paths.project_results_path(project_path)
    ensure_path_exists(datasets_path)

    sigdb = SigDb(config.sigdb_path)
    sigdb.open()

    projdb = ProjectDb(project["db"])

    gene_sym = projdb.get_gene_symbols()

    total_samples = projdb.get_total_affected_samples()

    log.info("Exporting variant genes ...")

    # r"\N" (a literal backslash-N) is the NULL marker used in the exported
    # datasets; the raw string keeps it valid across Python versions.
    vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
    tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID",
                   "IMPACT", "IMPACT_CLASS", "SAMPLE_FREQ", "SAMPLE_PROP",
                   "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

    sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
    tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

    count = 0
    for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
        var = afg.var
        rec = afg.rec

        start, end, ref, alt = var_to_tab(var)

        allele = "{0}/{1}".format(ref, alt)

        xrefs = [xref for xref in var.xrefs]
        if sigdb.exists_variant(var.chr, start):
            xrefs += ["I:1"]
        xrefs = ",".join(xrefs)

        intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

        tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
                       afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
                       rec.sample_freq, rec.sample_prop,
                       afg.coding_region, afg.prot_changes, intogen_driver, xrefs,
                       null_value=r"\N")

        for sample in var.samples:
            tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name,
                           null_value=r"\N")

        count += 1

    vf.close()
    sf.close()

    log.info("  {0} variant genes".format(count))

    log.info("Exporting consequences ...")

    cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
    tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
                   "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
                   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
                   "IMPACT", "IMPACT_CLASS")

    count = 0
    for csq in projdb.consequences(join_variant=True):
        var = csq.var
        start, end, ref, alt = var_to_tab(var)

        allele = "{0}/{1}".format(ref, alt)

        uniprot = protein = protein_pos = aa_change = None
        sift_score = sift_tfic = sift_tfic_class = None
        pph2_score = pph2_tfic = pph2_tfic_class = None
        ma_score = ma_tfic = ma_tfic_class = None

        if so.match(csq.ctypes, so.ONCODRIVEFM):
            uniprot, protein = csq.uniprot, csq.protein

        if so.match(csq.ctypes, so.NON_SYNONYMOUS):
            protein_pos, aa_change = csq.protein_pos, csq.aa_change
            sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
            pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
            ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

        tsv.write_line(cf, var.id, var.chr, var.strand, start, allele,
                       csq.transcript, ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
                       uniprot, protein, protein_pos, aa_change,
                       sift_score, sift_tfic, sift_tfic_class,
                       pph2_score, pph2_tfic, pph2_tfic_class,
                       ma_score, ma_tfic, ma_tfic_class,
                       csq.impact, TransFIC.class_name(csq.impact),
                       null_value=r"\N")
        count += 1

    cf.close()

    log.info("  {0} consequences".format(count))

    log.info("Exporting genes ...")

    gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
    tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
                   "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

    for gene in projdb.genes(join_rec=True):
        rec = gene.rec

        if rec.sample_freq is None or rec.sample_freq == 0:
            continue

        intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

        tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
                       gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue,
                       gene.clust_exc_cause, gene.clust_coords,
                       rec.sample_freq or 0, rec.sample_prop or 0, intogen_driver,
                       null_value=r"\N")

    gf.close()

    log.info("Exporting pathways ...")

    pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
    tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
                   "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

    for pathway in projdb.pathways(join_rec=True):
        rec = pathway.rec

        if rec.sample_freq is None or rec.sample_freq == 0:
            continue

        tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore,
                       pathway.fm_pvalue, pathway.fm_qvalue,
                       rec.sample_freq or 0, rec.sample_prop or 0,
                       rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0,
                       null_value=r"\N")

    pf.close()

    if not config.skip_oncodrivefm:
        log.info("Exporting genes per sample functional impact ...")

        with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
            tsv.write_line(f, "GENE_ID", "SAMPLE",
                           "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
                           "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
                           "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")
            for fields in projdb.sample_gene_fimpacts():
                (gene, sample,
                 sift_score, sift_tfic, sift_tfic_class,
                 pph2_score, pph2_tfic, pph2_tfic_class,
                 ma_score, ma_tfic, ma_tfic_class) = fields
                tsv.write_line(f, gene, sample,
                               sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
                               pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
                               ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class),
                               null_value=r"\N")

    projdb.close()
    sigdb.close()

    log.info("Saving project configuration ...")

    projres = ProjectResults(project)
    with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
        names = ["ASSEMBLY", "SAMPLES_TOTAL"]
        values = [project["assembly"], total_samples]
        names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"],
                                                        names=names, values=values)
        tsv.write_line(f, *names)
        tsv.write_line(f, *values, null_value=r"\N")

    projects_port = task.ports("projects_out")
    projects_port.send(project)
def compute(project):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    ofm = Data.element(project["oncodrivefm"])

    feature = ofm["feature"]
    slice_name = ofm["slice"]
    estimator = ofm.get("estimator")
    num_cores = ofm.get("num_cores", dtype=str)
    num_samplings = ofm.get("num_samplings", dtype=str)
    samples_threshold = ofm.get("samples_threshold", dtype=str)
    filter_enabled = ofm.get("filter_enabled", dtype=bool)
    filter_path = ofm.get("filter_path", dtype=str)

    log.info("feature = {0}".format(feature))
    log.info("slice = {0}".format(slice_name))
    log.info("estimator = {0}".format(estimator))
    log.info("num_cores = {0}".format(num_cores))
    log.info("num_samplings = {0}".format(num_samplings))
    log.info("samples_threshold = {0}".format(samples_threshold))
    log.info("filter_enabled = {0}".format(filter_enabled))
    log.info("filter_path = {0}".format(os.path.basename(filter_path)))

    cmd = [
        "oncodrivefm-compute",
        "-o", project["temp_path"],
        "-n oncodrivefm-{0}".format(feature),
        "-N", num_samplings,
        "--threshold", samples_threshold,
        "-e {0}".format(estimator),
        "-j", num_cores,
        "--slices '{0}'".format(slice_name)]

    if filter_enabled:
        cmd += ["--filter", filter_path]

    if feature == "pathways":
        cmd += ["-m", paths.data_kegg_path("ensg_kegg.tsv")]

    cmd += [ofm["data"]]

    project["oncodrivefm"] = dict(
        feature=feature,
        slice=slice_name,
        results=os.path.join(project["temp_path"],
                             "oncodrivefm-{0}-{1}.tsv".format(feature, slice_name)))

    cmd = " ".join(cmd)

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)

    if ret_code != 0:
        raise Exception("OncodriveFM error while computing {0}:\n{1}".format(feature, cmd))

    projects_out_port.send(project)
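# The command above is joined into a single shell string, which breaks when a
# path contains spaces or shell metacharacters. A sketch of the same
# invocation as an argument list (assuming the oncodrivefm-compute flags shown
# above keep their meaning), needing no shell and no manual quoting:
def _run_oncodrivefm_compute(out_path, feature, num_samplings, threshold,
                             estimator, num_cores, slice_name, data_path):
    args = ["oncodrivefm-compute",
            "-o", out_path,
            "-n", "oncodrivefm-{0}".format(feature),
            "-N", num_samplings,
            "--threshold", threshold,
            "-e", estimator,
            "-j", num_cores,
            "--slices", slice_name,
            data_path]
    return subprocess.call(args)  # no shell=True: arguments are passed verbatim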
def drivers():
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    db_path = paths.results_path("drivers.db")

    db = SigDb(db_path)
    db.open()

    log.info("Variants ...")

    path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
    with tsv.open(path, "r") as f:
        types = (str, str, int, str)
        for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
            chr, strand, start, allele = fields[:4]
            db.add_variant(chr, start)

    log.info("Genes ...")

    gene_sites = {}

    gene_fm = set()
    gene_clust = set()

    #SPECIAL_THRESHOLD = ["C18", "C34"]
    SPECIAL_THRESHOLD = []

    log.info("  OncodriveFM ...")

    # Note: the trailing ".tsv.gz" dots are now escaped; the original pattern
    # left the second dot unescaped.
    filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv\.gz")
    base_path = paths.combination_path("oncodrivefm")
    for path in os.listdir(base_path):
        m = filename_re.match(path)
        if not m:
            continue

        cancer_site_code = m.group(1)

        if cancer_site_code in SPECIAL_THRESHOLD:
            threshold = 1e-6
        else:
            threshold = 0.01

        with tsv.open(os.path.join(base_path, path), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]
            for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                gene, qvalue = fields
                if qvalue < threshold:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    gene_fm.add(gene)

    log.info("  OncodriveCLUST ...")

    filename_re = re.compile(r"cancer_site-(.+)\.tsv\.gz")
    base_path = paths.combination_path("oncodriveclust")
    for path in os.listdir(base_path):
        m = filename_re.match(path)
        if not m:
            continue

        cancer_site_code = m.group(1)

        with tsv.open(os.path.join(base_path, path), "r") as f:
            params = tsv.params(f)
            cancer_site_name = params["group_long_name"]
            for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
                gene, qvalue = fields
                if qvalue < 0.05:
                    add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
                    gene_clust.add(gene)

    log.info("  Updating db ...")

    sig_genes = gene_fm | gene_clust
    for gene in sig_genes:
        db.add_gene(gene, gene in gene_fm, gene in gene_clust)

    log.info("Saving driver genes cancer sites dataset ...")

    path = paths.results_path("gene-driver_cancer_sites.tsv")
    log.debug("> {}".format(path))
    with open(path, "w") as f:
        tsv.write_param(f, "date", datetime.now())
        tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT",
                       "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
        for gene, sites in gene_sites.items():
            tsv.write_line(f, gene,
                           1 if gene in gene_fm else 0,
                           1 if gene in gene_clust else 0,
                           len(sites),
                           ", ".join(sorted([code for code, name in sites])),
                           ", ".join(sorted([name for code, name in sites])))

    db.commit()
    db.close()
def combination_oncodrivefm(projects_set):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    if len(group_values) == 0:
        group_file_prefix = classifier_id
    else:
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
        classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

    log.info("Exporting project data ...")

    base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

    log.debug("> {0}".format(base_path))

    project_ids = []
    gene_files = []
    pathway_files = []
    for project in projects:
        project_id = project["id"]
        project_ids += [project_id]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        log.info("    Genes ...")

        count = 0
        file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
        gene_files += [file_path]
        with open(file_path, "w") as f:
            tsv.write_param(f, "classifier", classifier_id)
            tsv.write_param(f, "group_id", group_name)
            tsv.write_param(f, "slice", project_id)
            tsv.write_line(f, "GENE_ID", "PVALUE")
            for gene in projdb.genes():
                if gene.fm_pvalue is not None:
                    tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
                    count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
        pathway_files += [file_path]
        with open(file_path, "w") as f:
            tsv.write_param(f, "classifier", classifier_id)
            tsv.write_param(f, "group_id", group_name)
            tsv.write_param(f, "slice", project_id)
            tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
            for pathway in projdb.pathways():
                if pathway.fm_zscore is not None:
                    tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
                    count += 1

        log.info("      {0} pathways".format(count))

        projdb.close()

    log.info("Combining ...")

    combination_path = paths.combination_path("oncodrivefm")

    log.info("  Genes ...")

    cmd = " ".join([
        "oncodrivefm-combine",
        "-m median-empirical",
        "-o '{0}'".format(combination_path),
        "-n 'gene-{0}'".format(group_file_prefix),
        "-D 'classifier={0}'".format(classifier_id),
        "-D 'group_id={0}'".format(group_name),
        "-D 'group_short_name={0}'".format(group_short_name),
        "-D 'group_long_name={0}'".format(group_long_name),
        "--output-format tsv.gz"
    ] + ["'{0}'".format(name) for name in gene_files])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)
    if ret_code != 0:
        #log.error("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))
        #return -1
        raise Exception("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))

    log.info("  Pathways ...")

    cmd = " ".join([
        "oncodrivefm-combine",
        "-m median-zscore",
        "-o '{0}'".format(combination_path),
        "-n 'pathway-{0}'".format(group_file_prefix),
        "-D 'classifier={0}'".format(classifier_id),
        "-D 'group_id={0}'".format(group_name),
        "-D 'group_short_name={0}'".format(group_short_name),
        "-D 'group_long_name={0}'".format(group_long_name),
        "--output-format tsv.gz"
    ] + ["'{0}'".format(name) for name in pathway_files])

    log.debug(cmd)

    ret_code = subprocess.call(cmd, shell=True)
    if ret_code != 0:
        #log.error("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))
        #return -1
        raise Exception("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))

    remove_temp(task, base_path)
def fimpact_run(partition):
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    ma_uniprot = {}
    ma_scores = {}
    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=paths.data_transfic_path())

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
    cf = open(tfi_path, "w")

    with open(partition["vep_path"], "r") as f:
        for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
            (var_id, gene, transcript, ct,
             protein_pos, aa_change, protein,
             sift_score, pph2_score) = fields

            ct = (ct or "").split(",")

            # Invert sift score
            if sift_score is not None:
                sift_score = 1.0 - sift_score

            ma_score = None

            uniprot = ma_uniprot.get(var_id)

            sift_impact = pph2_impact = ma_impact = None  # TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

            sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

            ct_type = None
            if so.match(ct, so.NON_SYNONYMOUS):  # missense
                ct_type = TransFIC.CT_NON_SYNONYMOUS
                ma_score = ma_scores.get(var_id)

                (sift_tfic, sift_class,
                 pph2_tfic, pph2_class,
                 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

                sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
                pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
                ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
            elif so.match(ct, so.STOP):  # stop
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.FRAMESHIFT):  # frameshift
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE_JUNCTION):  # splice junction
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE_REGION):  # splice region
                sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SYNONYMOUS):  # synonymous
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                sift_score = pph2_score = 0.0
                ma_score = -2
            else:
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

            aff_gene = (var_id, gene)

            # Try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
            prot_change = None
            if ct_type == TransFIC.CT_FRAMESHIFT:
                if protein_pos is None:
                    prot_change = "fs"
                else:
                    prot_change = "fs {0}".format(protein_pos)
                #log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif ct_type == "splice":
                prot_change = "r.spl?"
                #log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif protein_pos is not None and aa_change is not None:
                rc = ReContext()
                if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                    prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                    prot_change = "{0} {1}".format(aa_change, protein_pos)
                else:
                    log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                        gene, protein, protein_pos, aa_change, ", ".join(ct)))

            tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

            tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
                           sift_score, sift_tfic, sift_class, sift_impact,
                           pph2_score, pph2_tfic, pph2_class, pph2_impact,
                           ma_score, ma_tfic, ma_class, ma_impact,
                           null_value="-")

    cf.close()

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    results_port.send(partition)
def begin():
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    log.info("Creating combination folders ...")
    paths.create_combination_folders()

    log.info("Checking classifiers ...")

    """
    if CONF_KEY in conf:
        classifiers = conf[CONF_KEY].to_native()
    else:
        classifiers = []
    """
    classifiers = config.combination.classifiers

    results = []

    if len(classifiers) == 0:
        log.warn("No classifiers have been defined !!!")
        log.warn("Specify them in the configuration with '{0}'".format(CONF_KEY))

    updated_classifiers = []
    for index, classifier in enumerate(classifiers):
        classifier = classifier.to_native()

        if not isinstance(classifier, dict):
            raise Exception("Classifier at index {0} should be a dictionary".format(index))

        if "id" not in classifier:
            classifier["id"] = str(index)

        classifier_id = classifier["id"]

        if "name" not in classifier:
            classifier["name"] = "Classifier {0}".format(index)

        if "keys" not in classifier:
            classifier["keys"] = []
        elif not isinstance(classifier["keys"], list):
            raise Exception("'keys' for classifier at index {0} should be a list".format(classifier_id))

        keys_len = len(classifier["keys"])

        # Each of the following list fields defaults from an earlier one and
        # must match the length of 'keys'; see the helper sketch after this
        # function for the shared pattern.
        if "default_key_values" not in classifier:
            classifier["default_key_values"] = [""] * keys_len
        elif not isinstance(classifier["default_key_values"], list):
            raise Exception("'default_key_values' for classifier {0} should be a list".format(classifier_id))
        elif len(classifier["default_key_values"]) != keys_len:
            raise Exception("Number of values for 'default_key_values' should be the same as 'keys' in classifier '{0}'".format(classifier_id))

        if "short_names" not in classifier:
            classifier["short_names"] = classifier["keys"]
        elif not isinstance(classifier["short_names"], list):
            raise Exception("'short_names' for classifier {0} should be a list".format(classifier_id))
        elif len(classifier["short_names"]) != keys_len:
            raise Exception("Number of keys for 'short_names' should be the same as 'keys' in classifier '{0}'".format(classifier_id))

        if "default_short_values" not in classifier:
            classifier["default_short_values"] = classifier["default_key_values"]
        elif not isinstance(classifier["default_short_values"], list):
            raise Exception("'default_short_values' for classifier {0} should be a list".format(classifier_id))
        elif len(classifier["default_short_values"]) != keys_len:
            raise Exception("Number of values for 'default_short_values' should be the same as 'keys' in classifier '{0}'".format(classifier_id))

        if "long_names" not in classifier:
            classifier["long_names"] = classifier["short_names"]
        elif not isinstance(classifier["long_names"], list):
            raise Exception("'long_names' for classifier {0} should be a list".format(classifier_id))
        elif len(classifier["long_names"]) != keys_len:
            raise Exception("Number of keys for 'long_names' should be the same as 'keys' in classifier '{0}'".format(classifier["id"]))

        if "default_long_values" not in classifier:
            classifier["default_long_values"] = classifier["default_key_values"]
        elif not isinstance(classifier["default_long_values"], list):
            raise Exception("'default_long_values' for classifier {0} should be a list".format(classifier_id))
        elif len(classifier["default_long_values"]) != keys_len:
            raise Exception("Number of values for 'default_long_values' should be the same as 'keys' in classifier '{0}'".format(classifier_id))

        updated_classifiers += [classifier]
        results += [{}]

    task.context["classifiers"] = updated_classifiers
    task.context["results"] = results
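# The five nearly identical blocks in begin() could be factored through a
# small helper. A sketch with a hypothetical _default_list_field() (not part
# of the pipeline), shown only to make the shared validation pattern explicit:
def _default_list_field(classifier, classifier_id, field, default, expected_len):
    """Fill `field` with `default` when absent; otherwise check type and length."""
    if field not in classifier:
        classifier[field] = default
    elif not isinstance(classifier[field], list):
        raise Exception("'{0}' for classifier {1} should be a list".format(field, classifier_id))
    elif len(classifier[field]) != expected_len:
        raise Exception("Number of values for '{0}' should be the same as 'keys' in classifier '{1}'".format(field, classifier_id))

# e.g. _default_list_field(classifier, classifier_id, "short_names",
#                          classifier["keys"], keys_len)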