def process(time):
    """Wait for `time` seconds, then forward the value through port "time2".

    Fixes: removed long-dead commented-out configuration/early-return code,
    and corrected the log format from "{:.2}" (2 *significant digits*, which
    renders e.g. 12.5 as "1.2e+01") to "{:.2f}" (2 decimal places).

    NOTE(review): the parameter name `time` shadows the stdlib module name;
    it is kept because it is part of the public interface.
    """
    task.logger.info("{}: {}".format(type(time), repr(time)))
    task.logger.info("Waiting for {:.2f} seconds ...".format(time))
    sleep(time)
    task.ports("time2").send(time)
def projects(project):
    """Compute recurrence statistics for one project and forward it.

    Variant-gene recurrences are always computed; gene and pathway
    recurrences are skipped when the "variants_only" option is set.
    Projects without affected samples are dropped with a warning.
    """
    logger = task.logger
    settings = task.conf
    out_port = task.ports("projects_out")
    logger.info("--- [{0}] --------------------------------------------".format(project["id"]))
    db = ProjectDb(project["db"])
    sample_count = db.get_total_affected_samples()
    # Nothing to normalize against without affected samples.
    if sample_count == 0:
        logger.warn("There are no samples, recurrences cannot be calculated.")
        db.close()
        return
    logger.info("Calculating project recurrences for variant genes ...")
    db.compute_affected_genes_recurrences(sample_count)
    if not settings.get("variants_only", False):
        logger.info("Calculating project recurrences for genes ...")
        db.compute_gene_recurrences(sample_count)
        logger.info("Calculating project recurrences for pathways ...")
        db.compute_pathway_recurrences(sample_count)
    db.commit()
    db.close()
    out_port.send(project)
def compute(project):
    """Run `oncodrivefm-combine` for one project feature and forward the project.

    Records the combined results path in project["oncodrivefm"]["results"]
    and raises if the external command exits with a non-zero status.
    """
    logger = task.logger
    settings = task.conf
    out_port = task.ports("projects_out")
    project_id = project["id"]
    ofm = project["oncodrivefm"]
    feature = ofm["feature"]
    logger.info("--- [{0} @ {1}] --------------------------------------------".format(project_id, feature))
    args = [
        "oncodrivefm-combine",
        "-o", project["temp_path"],
        "-n oncodrivefm-{0}".format(feature)]
    args.extend(ofm["data"])
    ofm["results"] = os.path.join(project["temp_path"], "oncodrivefm-{0}.tsv".format(feature))
    command = " ".join(args)
    logger.debug(command)
    exit_code = subprocess.call(command, shell=True)
    if exit_code != 0:
        raise Exception("OncodriveFM error while combining {0}:\n{1}".format(feature, command))
    out_port.send(project)
def ma_run(partition):
    """Annotate one VEP partition with Mutation Assessor functional impact scores.

    Reads the partition's VEP output, collects the ids of variants with
    non-synonymous consequences, queries Mutation Assessor for each one and
    writes a TSV of (var_id, uniprot, fi_score). Finally forwards the
    partition (with "ma_path" set) through the "results" port.
    """
    log = task.logger
    conf = task.conf
    results_port = task.ports("results")
    project = partition["project"]
    log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))
    # Backend selection: local cache vs. remote web service.
    offline = conf["offline"]
    if offline == "yes":
        log.info("Running Mutation assessor in local mode.")
        ma = MaLocal(conf["ma_cache_path"])
    else:
        log.info("Running Mutation assessor using web services.")
        ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))
    # One result file per partition, named by the zero-padded partition index.
    results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))
    # "consequences_overwrite" defaults to True, so existing results are
    # recomputed unless explicitly disabled in the configuration.
    if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):
        log.info("Querying Mutation assessor for 'missense_variant' consequences ...")
        projdb = ProjectDb(project["db"])
        # Ids of variants whose consequence types match NON_SYNONYMOUS.
        missense_variants = set()
        with open(partition["vep_path"], "r") as f:
            for line in f:
                fields = line.rstrip().split("\t")
                var_id = int(fields[0])
                ctypes = fields[3].split(",")  # comma-separated consequence types
                if so.match(ctypes, so.NON_SYNONYMOUS):
                    missense_variants.add(var_id)
        with open(results_path, "w") as mf:
            for var_id in missense_variants:
                var = projdb.get_variant(var_id)
                start, end, ref, alt = var_to_tab(var)
                r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
                if r is not None:  # variants unknown to MA are silently skipped
                    tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")
        projdb.close()
    else:
        log.warn("Skipping MA, results already exist.")
        log.debug("MA results: {0}".format(results_path))
    ma.close()
    # Send results to the next module
    partition["ma_path"] = results_path
    results_port.send(partition)
def end():
    """Log the classification outcome and emit one project set per group.

    For every classifier, each group of classified projects is sent through
    the "project_sets" port as a (group-metadata dict, projects) pair.
    """
    logger = task.logger
    settings = task.conf
    out_port = task.ports("project_sets")
    classifiers = task.context["classifiers"]
    results = task.context["results"]
    logger.info("Classification results ...")
    for index, classifier in enumerate(classifiers):
        logger.info(" {0}:".format(classifier["name"]))
        grouped = results[index]
        for values in sorted(grouped.keys()):
            short_values, long_values, projects = grouped[values]
            name = str_join("-", values)
            short_name = str_join("-", short_values)
            long_name = str_join("; ", long_values)
            # Augment a copy of the classifier with the group identity.
            group_meta = dict(classifier,
                              group_name=name,
                              group_values=values,
                              group_short_name=short_name,
                              group_short_values=short_values,
                              group_long_name=long_name,
                              group_long_values=long_values)
            logger.info(" ({0}) -> {1} projects".format(name, len(projects)))
            out_port.send((group_meta, projects))
def create(project):
    """Build the project website from the configured templates.

    Copies the template tree into the project website path, expands template
    variables, links the results folder and forwards the project through
    "projects_out". Warns and returns when no templates are configured.

    Fixes: bare `except: pass` narrowed to `except OSError` (only a missing
    .gitignore is expected), local `vars` renamed (shadowed the builtin),
    unused `temp_path` local removed.
    """
    log = task.logger
    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    project_path = project["path"]
    templates_path = config.website.templates_path
    if templates_path is None:
        log.warn("No website templates have been defined in the configuration. Skipping website creation.")
        return
    log.info("Creating website ...")
    website_path = paths.project_website_path(project_path)
    # Start from a clean copy of the templates.
    if os.path.exists(website_path):
        shutil.rmtree(website_path)
    log.info("Copying templates ...")
    shutil.copytree(templates_path, website_path)
    # The template tree may ship a .gitignore; it must not reach the website.
    gitignore_path = os.path.join(website_path, ".gitignore")
    try:
        os.remove(gitignore_path)
    except OSError:
        pass
    log.info("Expanding templates ...")
    template_vars = dict(
        PROJECT_NAME=project_id,
        SHOW_ALL_TABS=not config.variants_only)
    tmpl_paths = [
        os.path.join(website_path, "css", "header.html"),
        os.path.join(website_path, "onexus-project.onx")]
    for path in tmpl_paths:
        with open(path, "r") as f:
            t = Template(f.read())
        with open(path, "w") as f:
            f.write(t.safe_substitute(template_vars))
    # Creating a soft link to the results folder
    project_results_path = paths.project_results_path(project_path)
    os.symlink(project_results_path, os.path.join(website_path, "results"))
    # Send project to the next modules
    projects_port = task.ports("projects_out")
    project["website"] = website_path
    projects_port.send(project)
def square(x):
    """Square `x`, log the computation and emit the result on port "x_square"."""
    result = x * x
    task.logger.info("x = {0}, x^2 = {1}".format(x, result))
    port = task.ports("x_square")
    port.send(result)
def update_db(project):
    """Accumulate a finished OncodriveFM sub-result, grouped by project id.

    Bug fix: the original did `task.context[project_id] += [project]`, which
    raises KeyError for the first project of each id — the sibling update_db
    in this file guards the first insertion explicitly. setdefault() handles
    both the first and the subsequent projects.
    """
    log = task.logger
    projects_out_port = task.ports("projects_out")
    project_id = project["id"]
    # Kept: fails fast if the project carries no OncodriveFM data.
    ofm = project["oncodrivefm"]
    task.context.setdefault(project_id, []).append(project)
def main():
    """Consume numbers from port "x" and emit their sum on port "sum"."""
    numbers, sum_port = task.ports("x", "sum")
    total = 0
    seen = 0
    for value in numbers:
        seen += 1
        total += value
    task.logger.info("Sum of {0} numbers = {1}".format(seen, total))
    sum_port.send(total)
def route(project):
    """Fan a project out to the analysis branches enabled in the task context.

    Every project is always sent downstream through "projects_out"; each
    optional branch is skipped when its skip_* flag is set in task.context.
    """
    logger = task.logger
    settings = task.conf
    main_port = task.ports("projects_out")
    branch_ports = [
        ("skip_recurrences", task.ports("recurrences_projects")),
        ("skip_oncodrivefm", task.ports("oncodrivefm_projects")),
        ("skip_oncodriveclust", task.ports("oncodriveclust_projects"))]
    logger.info(" {0}".format(project["id"]))
    main_port.send(project)
    for skip_key, port in branch_ports:
        if not task.context[skip_key]:
            port.send(project)
def end():
    """Emit the accumulated combination projects, ordered by combination key."""
    logger = task.logger
    out_port = task.ports("projects_out")
    logger.info("Sending projects ...")
    combinations = task.context["combinations"]
    for key in sorted(combinations):
        logger.debug(key)
        out_port.send(combinations[key])
def route(project):
    """Fan a project out to the analysis branches enabled in the configuration.

    The project always continues through "projects_out"; each optional
    branch is suppressed by its skip_* flag in the global configuration.
    """
    logger = task.logger
    config = GlobalConfig(task.conf)
    main_port = task.ports("projects_out")
    recurrences_port = task.ports("recurrences_projects")
    oncodrivefm_port = task.ports("oncodrivefm_projects")
    oncodriveclust_port = task.ports("oncodriveclust_projects")
    logger.info(" {0}".format(project["id"]))
    main_port.send(project)
    if not config.skip_recurrences:
        recurrences_port.send(project)
    if not config.skip_oncodrivefm:
        oncodrivefm_port.send(project)
    if not config.skip_oncodriveclust:
        oncodriveclust_port.send(project)
def update_db(project):
    """Accumulate a finished OncodriveFM sub-result, grouped by project id.

    Idiom: dict.setdefault replaces the explicit `if project_id in
    task.context` branch. Appending to the list returned by setdefault() is
    equivalent to the original in-place `+= [project]` and creates the list
    on first use — behavior is unchanged.
    """
    log = task.logger
    conf = task.conf
    projects_out_port = task.ports("projects_out")
    project_id = project["id"]
    # Kept: fails fast if the project carries no OncodriveFM data.
    ofm = project["oncodrivefm"]
    task.context.setdefault(project_id, []).append(project)
def vep_run(partition):
    """Run the Ensembl Variant Effect Predictor over one variant partition.

    Executes VEP (locally or through web services depending on the "offline"
    setting), saves its per-variant consequence rows to a partition-indexed
    TSV, sets partition["vep_path"] and forwards the partition through the
    "results" port.
    """
    log = task.logger
    conf = task.conf
    results_port = task.ports("results")
    project_id = partition["project"]["id"]
    log.info("--- [{0} @ {1}] --------------------------------------------".format(project_id, partition["index"]))
    # Backend selection: local perl script vs. remote web service.
    offline = conf["offline"]
    if offline == "yes":
        log.info("Running VEP in local mode.")
        vep = VepLocal(
            perl_path=conf["perl_bin"],
            lib_path=conf["perl_lib"],
            script_path=os.path.join(conf["ext_bin_path"], "variant_effect_predictor", "variant_effect_predictor.pl"),
            cache_path=os.path.join(conf["data_path"], "vep_cache"))
    else:
        log.info("Running VEP using web services.")
        vep = VepService(cache_path=os.path.join(conf["cache_path"], "vep.db"))
    # One result file per partition, named by the zero-padded partition index.
    results_path = os.path.join(partition["base_path"], "{0:08d}.vep".format(partition["index"]))
    # "consequences_overwrite" defaults to True, so existing results are
    # recomputed unless explicitly disabled in the configuration.
    if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):
        # Run VEP
        vep.run(partition["bed_path"])
        log.info("Saving results ...")
        log.debug("VEP results: {0}".format(vep.results_path))
        # Save results
        with open(results_path, "w") as f:
            for r in vep.results():
                tsv.write_line(f, r.var_id, r.gene, r.transcript, ",".join(r.consequences), r.protein_pos, r.aa_change, r.protein, r.sift, r.polyphen, null_value="-")
    else:
        log.warn("Skipping VEP, results already exist.")
        log.debug("VEP results: {0}".format(results_path))
    vep.close()
    # Send results to the next module
    partition["vep_path"] = results_path
    results_port.send(partition)
def end():
    """Regroup accumulated partitions by project and emit each project once.

    Each project dict gains a "partitions" list; the back-reference from
    every partition to its project is removed before embedding.
    """
    logger = task.logger
    out_port = task.ports("projects")
    for project_id, partitions in task.context.items():
        logger.info("Project {0}: {1} partitions".format(project_id, len(partitions)))
        # Every partition carries the same project; take it from the first one.
        project = partitions[0]["project"]
        collected = []
        for partition in partitions:
            del partition["project"]
            collected.append(partition)
        project["partitions"] = collected
        out_port.send(project)
def main():
    """Consume numbers from port "x"; emit their count and sum on "count"/"sum".

    Fix: the accumulator was named `sum`, shadowing the builtin; renamed to
    `total`. Behavior and emitted values are unchanged.
    """
    values, count_port, sum_port = task.ports("x", "count", "sum")
    count = 0
    total = 0
    for v in values:
        task.logger.info("value = {0}".format(v))
        count += 1
        total += v
    task.logger.info("Sum of {0} numbers = {1}".format(count, total))
    count_port.send(count)
    sum_port.send(total)
def split_variants(project):
    """Split the project's variants into fixed-size VEP input partitions.

    Variants are read ordered by position and written in chunks of
    config.vep_partition_size lines under <temp_path>/consequences; each
    chunk is announced on the "partitions" port when its file is opened.
    """
    log = task.logger
    config = GlobalConfig(task.conf)
    partition_port = task.ports("partitions")
    log.info("--- [{}] --------------------------------------------".format(project["id"]))
    projdb = ProjectDb(project["db"])
    log.info("Preparing variants for VEP ...")
    base_path = os.path.join(project["temp_path"], "consequences")
    ensure_path_exists(base_path)
    project["csq_path"] = base_path
    partition_size = config.vep_partition_size
    partition = -1  # becomes 0 when the first partition file is opened
    f = None
    count = 0
    for var in projdb.variants(order_by="position"):
        start, end, ref, alt = var_to_tab(var)
        # Roll over to a new partition file every partition_size variants.
        if count % partition_size == 0:
            if f is not None:
                f.close()
            partition += 1
            partition_path = os.path.join(base_path, "{0:08d}.vep_in".format(partition))
            f = open(partition_path, "w")
            # NOTE(review): the partition is announced before its file is fully
            # written — presumably consumers run only after this task ends; confirm.
            partition_port.send({"project": project, "index": partition, "bed_path": partition_path, "base_path": base_path})
        tsv.write_line(f, var.chr, start, end, ref + "/" + alt, var.strand, var.id)
        count += 1
    if f is not None:
        f.close()
    log.info("{} variants split into {} partitions".format(count, partition + 1))
    projdb.close()
def create(project):
    """Build the project website from the configured templates (legacy conf API).

    Copies the template tree into the website path, expands the template
    variables and forwards the project through "projects_out". Warns and
    returns when no templates are configured.

    Fixes: local `vars` renamed (shadowed the builtin); unused `temp_path`
    local removed; template path list renamed to avoid confusion with the
    PathsConfig-style `paths` used elsewhere in this file.
    """
    log = task.logger
    conf = task.conf
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    project_path = project["path"]
    templates_path = conf.get("website.templates_path")
    if templates_path is None:
        log.warn("No website templates have been defined in the configuration. Skipping website creation.")
        return
    log.info("Creating website ...")
    website_path = get_website_path(project_path)
    # Start from a clean copy of the templates.
    if os.path.exists(website_path):
        shutil.rmtree(website_path)
    log.info("Copying templates ...")
    shutil.copytree(templates_path, website_path)
    log.info("Expanding templates ...")
    template_vars = dict(
        PROJECT_NAME=project_id,
        SHOW_ALL_TABS=not conf.get("variants_only", False))
    tmpl_paths = [
        os.path.join(website_path, "css", "header.html"),
        os.path.join(website_path, "onexus-project.onx")]
    for path in tmpl_paths:
        with open(path, "r") as f:
            t = Template(f.read())
        with open(path, "w") as f:
            f.write(t.safe_substitute(template_vars))
    # Send project to the next modules
    projects_port = task.ports("projects_out")
    project["website"] = website_path
    projects_port.send(project)
def update_db(project):
    """Load OncodriveCLUST results into the project database and forward the project.

    Applies the excluded-gene causes first, then overwrites them for genes
    that actually produced results. NOTE(review): when the results file is
    missing the function returns without sending the project downstream —
    presumably intentional, confirm with the pipeline wiring.
    """
    log = task.logger
    conf = task.conf
    projects_out_port = task.ports("projects_out")
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    oclust = project["oncodriveclust"]
    # The oncodriveclust entry is consumed here and not propagated downstream.
    del project["oncodriveclust"]
    if not os.path.exists(oclust["results"]):
        log.warn("No results have been found. Skipping it.")
        return
    log.info("Updating the project database ...")
    projdb = ProjectDb(project["db"])
    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")
    log.info(" Excluded gene causes ...")
    log.debug(" > {0}".format(exc_path))
    count = 0
    with tsv.open(exc_path, "r") as exf:
        for gene, cause in tsv.lines(exf, (str, str), header=True):
            projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
            count += 1
    log.debug(" {0} genes excluded".format(count))
    log.info(" OncodriveCLUST results ...")
    with tsv.open(oclust["results"], "r") as f:
        types = (str, str, float, float, float)
        columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
        for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
            # Genes with results are no longer excluded (NO_GENE_EXC).
            projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore, clust_pvalue=pvalue, clust_qvalue=qvalue, clust_exc_cause=ProjectDb.NO_GENE_EXC))
    projdb.commit()
    projdb.close()
    projects_out_port.send(project)
def oncoclust(project):
    """Execute OncodriveCLUST for one project and forward the project downstream.

    The results path is recorded in project["oncodriveclust"]["results"]
    before the external command runs; failures are logged, not raised.
    """
    logger = task.logger
    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)
    out_port = task.ports("projects_out")
    project_id = project["id"]
    logger.info("--- [{0}] --------------------------------------------".format(project_id))
    gene_transcripts_path = paths.data_ensembl_gene_transcripts_path()
    oclust = project["oncodriveclust"]
    data_paths = oclust["data_paths"]
    samples_threshold = oclust["samples_threshold"]
    oclust["results"] = os.path.join(project["temp_path"], "oncodriveclust-results.tsv")
    command = " ".join([
        "oncodriveclust",
        "-c",
        "-m", str(samples_threshold),
        "-o", oclust["results"],
        data_paths[0],
        data_paths[1],
        gene_transcripts_path])
    logger.debug(command)
    status = subprocess.call(command, shell=True)
    # Exit code 1 means the tool produced no results; anything else non-zero is an error.
    if status == 1:
        logger.warn("No results were generated")
    elif status != 0:
        logger.error("Error while executing OncodriveCLUST:\n{0}".format(command))
    out_port.send(project)
def create_datasets(project):
    """Export the project database into the TSV datasets consumed by the website.

    Writes the variant_gene, variant-samples, consequence, gene, pathway,
    (optionally) gene_sample-fimpact and project.tsv datasets under the
    project results path, then forwards the project through "projects_out".

    NOTE(review): the "\N" null markers are Python-2-only escapes (literal
    backslash-N, MySQL-style NULL); under Python 3 they would be a syntax error.
    """
    log = task.logger
    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    project_path = project["path"]
    temp_path = project["temp_path"]
    datasets_path = paths.project_results_path(project_path)
    ensure_path_exists(datasets_path)
    # Known-drivers database used to flag variants/genes as IntOGen drivers.
    sigdb = SigDb(config.sigdb_path)
    sigdb.open()
    projdb = ProjectDb(project["db"])
    gene_sym = projdb.get_gene_symbols()
    total_samples = projdb.get_total_affected_samples()
    log.info("Exporting variant genes ...")
    vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
    tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS", "SAMPLE_FREQ", "SAMPLE_PROP", "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")
    sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
    tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")
    count = 0
    for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
        var = afg.var
        rec = afg.rec
        start, end, ref, alt = var_to_tab(var)
        allele = "{0}/{1}".format(ref, alt)
        xrefs = [xref for xref in var.xrefs]
        if sigdb.exists_variant(var.chr, start):
            xrefs += ["I:1"]  # mark as a known IntOGen variant
        xrefs = ",".join(xrefs)
        intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0
        tsv.write_line(vf, var.id, var.chr, var.strand, start, allele, afg.gene_id, afg.impact, TransFIC.class_name(afg.impact), rec.sample_freq, rec.sample_prop, afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value="\N")
        # One row per sample carrying this variant.
        for sample in var.samples:
            tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value="\N")
        count += 1
    vf.close()
    sf.close()
    log.info(" {0} variant genes".format(count))
    log.info("Exporting consequences ...")
    cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
    tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE", "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS", "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS", "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS", "IMPACT", "IMPACT_CLASS")
    count = 0
    for csq in projdb.consequences(join_variant=True):
        var = csq.var
        start, end, ref, alt = var_to_tab(var)
        allele = "{0}/{1}".format(ref, alt)
        # Functional-impact columns default to NULL and are filled only for
        # the consequence types that carry them.
        uniprot = protein = protein_pos = aa_change = None
        sift_score = sift_tfic = sift_tfic_class = None
        pph2_score = pph2_tfic = pph2_tfic_class = None
        ma_score = ma_tfic = ma_tfic_class = None
        if so.match(csq.ctypes, so.ONCODRIVEFM):
            uniprot, protein = csq.uniprot, csq.protein
        if so.match(csq.ctypes, so.NON_SYNONYMOUS):
            protein_pos, aa_change = csq.protein_pos, csq.aa_change
            sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
            pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
            ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)
        tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript, ",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene), uniprot, protein, protein_pos, aa_change, sift_score, sift_tfic, sift_tfic_class, pph2_score, pph2_tfic, pph2_tfic_class, ma_score, ma_tfic, ma_tfic_class, csq.impact, TransFIC.class_name(csq.impact), null_value="\N")
        count += 1
    cf.close()
    log.info(" {0} consequences".format(count))
    log.info("Exporting genes ...")
    gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
    tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE", "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS", "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")
    for gene in projdb.genes(join_rec=True):
        rec = gene.rec
        if rec.sample_freq is None or rec.sample_freq == 0:
            continue  # skip genes with no affected samples
        intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
        tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause, gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause, gene.clust_coords, rec.sample_freq or 0, rec.sample_prop or 0, intogen_driver, null_value="\N")
    gf.close()
    log.info("Exporting pathways ...")
    pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
    tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
    tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE", "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")
    for pathway in projdb.pathways(join_rec=True):
        rec = pathway.rec
        if rec.sample_freq is None or rec.sample_freq == 0:
            continue  # skip pathways with no affected samples
        tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue, rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0, null_value="\N")
    pf.close()
    if not config.skip_oncodrivefm:
        log.info("Exporting genes per sample functional impact ...")
        with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
            tsv.write_line(f, "GENE_ID", "SAMPLE", "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS", "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS", "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")
            for fields in projdb.sample_gene_fimpacts():
                (gene, sample, sift_score, sift_tfic, sift_tfic_class, pph2_score, pph2_tfic, pph2_tfic_class, ma_score, ma_tfic, ma_tfic_class) = fields
                tsv.write_line(f, gene, sample, sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class), pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class), ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class), null_value="\N")
    projdb.close()
    sigdb.close()
    log.info("Saving project configuration ...")
    projres = ProjectResults(project)
    with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
        names = ["ASSEMBLY", "SAMPLES_TOTAL"]
        values = [project["assembly"], total_samples]
        names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"], names=names, values=values)
        tsv.write_line(f, *names)
        tsv.write_line(f, *values, null_value="\N")
    projects_port = task.ports("projects_out")
    projects_port.send(project)
def prepare_files(project):
    """Prepare OncodriveCLUST input files for one project (GlobalConfig version).

    Writes the non-synonymous and synonymous mutation tables plus a dataset
    explaining why genes were excluded, then forwards the project with an
    "oncodriveclust" entry pointing at the generated files.
    """
    log = task.logger
    conf = task.conf
    projects_out_port = task.ports("projects_out")
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    config = GlobalConfig(conf)
    paths = PathsConfig(config)
    # avoid that project conf override path configurations
    config = GlobalConfig(conf, project["conf"])
    oclust = OncodriveClust(config.oncodriveclust, paths, log)
    project_results = ProjectResults(project)
    projdb = ProjectDb(project["db"])
    data = oclust.retrieve_data(projdb)
    projdb.close()
    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]
    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))
    # One output file per mutation class, indexed by NON_SYN / SYN.
    df = [tsv.open(path, "w") for path in data_paths]
    gene_sample_count = defaultdict(int)
    for key, value in data.items():
        findex, gene, sample = key
        transcript, transcript_len, protein_pos = value
        if findex == NON_SYN:
            gene_sample_count[gene] += 1
            # NOTE(review): the gene filter is applied to non-synonymous
            # entries only — grouping reconstructed from mangled source, confirm.
            if oclust.filter_enabled and not oclust.filter.valid(gene):
                continue
        tsv.write_line(df[findex], gene, sample, protein_pos)
    for f in df:
        f.close()
    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")
    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))
    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if oclust.filter_enabled and not oclust.filter.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < oclust.samples_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                # Causes are concatenated without separator — presumably
                # single-character codes; confirm against ProjectDb constants.
                tsv.write_line(exf, gene, "".join(causes))
    log.info("Sending project ...")
    projects_out_port.send(dict(project, oncodriveclust=dict(
        data_paths=data_paths,
        samples_threshold=oclust.samples_threshold)))
def prepare_files(project):
    """Build the OncodriveFM input matrix for one project and fan out sub-runs.

    Saves the sample/gene functional-impact matrix (and a gzipped TransFIC
    detail file), records per-gene exclusion causes, then emits one project
    copy per (feature, score slice) combination: genes and pathways, each
    for SIFT, PPH2 and MA. Python 2 only (uses dict.iteritems).
    """
    log = task.logger
    conf = task.conf
    projects_out_port = task.ports("projects_out")
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    project_results = ProjectResults(project)
    projdb = ProjectDb(project["db"])
    log.info("Retrieving functional impact scores for genes ...")
    data = retrieve_data(projdb)
    projdb.close()
    # save data matrix
    dst_path = os.path.join(project["temp_path"], "oncodrivefm-data.tdm")
    sgfi_path = os.path.join(project["temp_path"], "sample_gene-fimpact.tsv.gz")
    project["sample_gene_fi_data"] = sgfi_path
    log.info("Saving functional impact scores ...")
    log.debug("> {0}".format(dst_path))
    with open(dst_path, "w") as f:
        sgff = tsv.open(sgfi_path, "w")
        tsv.write_line(f, "SAMPLE", "GENE", "SIFT", "PPH2", "MA")
        tsv.write_line(sgff, "SAMPLE", "GENE", "SIFT_SCORE", "SIFT_TFIC", "SIFT_TFIC_CLASS", "PPH2_SCORE", "PPH2_TFIC", "PPH2_TFIC_CLASS", "MA_SCORE", "MA_TFIC", "MA_TFIC_CLASS")
        for key, values in data.iteritems():
            sample, gene = key
            (sift_score, sift_tfic, sift_tfic_class, pph2_score, pph2_tfic, pph2_tfic_class, ma_score, ma_tfic, ma_tfic_class) = values
            # The .tdm matrix keeps raw scores only; the gzipped file keeps
            # the TransFIC-transformed values as well.
            tsv.write_line(f, sample, gene, sift_score, pph2_score, ma_score)
            tsv.write_line(sgff, sample, gene, sift_score, sift_tfic, sift_tfic_class, pph2_score, pph2_tfic, pph2_tfic_class, ma_score, ma_tfic, ma_tfic_class, null_value="-")
        sgff.close()
    # count samples
    samples = set()
    gene_sample_count = {}
    for sample, gene in data.keys():
        samples.add(sample)
        if gene not in gene_sample_count:
            gene_sample_count[gene] = 1
        else:
            gene_sample_count[gene] += 1
    num_samples = len(samples)
    if num_samples == 0:
        log.warn("There are no samples data, skipping OncodriveFM for this project")
        return
    (num_cores, estimator, genes_num_samplings, genes_threshold, genes_filter_enabled, genes_filter, filt, pathways_num_samplings, pathways_threshold) = get_oncodrivefm_configuration(log, conf, project, num_samples)
    # Create a dataset with information on why some genes are not considered for calculation in OncodriveFM
    # There are basically two possible reasons:
    # - It does not pass the filter
    # - There are less samples mutated than the threshold
    exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")
    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))
    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < genes_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))
    # Shared settings for every emitted sub-run.
    ofm = dict(
        data=dst_path,
        num_cores=num_cores,
        estimator=estimator)
    for slice_name in ["SIFT", "PPH2", "MA"]:
        projects_out_port.send(dict(project, oncodrivefm=dict(ofm, feature="genes", slice=slice_name, num_samplings=genes_num_samplings, threshold=genes_threshold, filter_enabled=genes_filter_enabled, filter=genes_filter)))
    for slice_name in ["SIFT", "PPH2", "MA"]:
        projects_out_port.send(dict(project, oncodrivefm=dict(ofm, feature="pathways", slice=slice_name, num_samplings=pathways_num_samplings, threshold=pathways_threshold, filter_enabled=genes_filter_enabled, filter=genes_filter)))
def liftover(project):
    """Lift the project's variant coordinates from hg18 to hg19 with UCSC liftOver.

    Exports variants to a BED file, runs the external liftOver binary,
    nulls the start position of unmapped variants and updates the mapped
    ones, then forwards the project through "lifted_projects".
    """
    log = task.logger
    conf = task.conf
    config = GlobalConfig(conf)
    lifted_project_port = task.ports("lifted_projects")
    log.info("--- [{0}] --------------------------------------------".format(project["id"]))
    log.info("Preparing liftOver files ...")
    in_path = make_temp_file(task, suffix=".bed")
    in_file = open(in_path, "w")
    out_path = make_temp_file(task, suffix=".bed")
    unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")
    projdb = ProjectDb(project["db"])
    # BED interval: [start, start + len(ref)); the variant id travels in column 4.
    for var in projdb.variants(order_by="position"):
        in_file.write(tsv.line_text("chr" + var.chr, var.start, var.start + len(var.ref), var.id))
    in_file.close()
    log.info("Running liftOver ...")
    # The project is re-labelled before the conversion actually runs.
    project["from_assembly"] = project["assembly"]
    project["assembly"] = "hg19"
    cmd = " ".join([
        conf["liftover_bin"],
        in_path,
        os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
        out_path,
        unmapped_path])
    log.debug(cmd)
    # NOTE(review): the liftOver exit status is not checked here.
    subprocess.call(cmd, shell=True)
    log.info("Annotating unmapped variants ...")
    count = 0
    with open(unmapped_path, "r") as f:
        for line in f:
            if line.lstrip().startswith("#"):  # liftOver writes comment lines explaining failures
                continue
            fields = line.rstrip().split("\t")
            var_id = int(fields[3])
            # start=None marks the variant as unmapped in the database.
            projdb.update_variant_start(var_id, start=None)
            count += 1
    log.info(" {0} unmapped variants annotated".format(count))
    log.info("Updating variants ...")
    count = 0
    with open(out_path, "r") as f:
        for line in f:
            fields = line.rstrip().split("\t")
            chr, start, end, var_id = fields
            projdb.update_variant_start(var_id, start=start)
            count += 1
    log.info(" {0} variants".format(count))
    remove_temp(task, in_path, out_path)
    projdb.commit()
    projdb.close()
    lifted_project_port.send(project)
def prepare_files(project):
    """Prepare OncodriveCLUST input files for one project (legacy conf-based version).

    Writes the non-synonymous and synonymous mutation tables plus a dataset
    explaining why genes were excluded, then forwards the project with an
    "oncodriveclust" entry pointing at the generated files.
    """
    log = task.logger
    conf = task.conf
    projects_out_port = task.ports("projects_out")
    project_id = project["id"]
    log.info("--- [{0}] --------------------------------------------".format(project_id))
    project_results = ProjectResults(project)
    mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)
    log.info("Loading transcripts CDS length ...")
    cds_len = load_cds_len(conf)
    log.info("Retrieving gene alterations ...")
    projdb = ProjectDb(project["db"])
    data = retrieve_data(projdb, cds_len)
    projdb.close()
    data_paths = [
        os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
        os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]
    log.info("Saving data ...")
    log.debug("> {0}".format(data_paths[NON_SYN]))
    log.debug("> {0}".format(data_paths[SYN]))
    # One output file per mutation class, indexed by NON_SYN / SYN.
    df = [tsv.open(path, "w") for path in data_paths]
    gene_sample_count = {}
    for key, value in data.items():
        findex, gene, sample = key
        transcript, transcript_len, protein_pos = value
        if findex == NON_SYN:
            if gene not in gene_sample_count:
                gene_sample_count[gene] = 1
            else:
                gene_sample_count[gene] += 1
            # NOTE(review): the gene filter is applied to non-synonymous
            # entries only — grouping reconstructed from mangled source, confirm.
            if genes_filter_enabled and not filt.valid(gene):
                continue
        tsv.write_line(df[findex], gene, sample, protein_pos)
    for f in df:
        f.close()
    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")
    log.info("Saving excluded gene causes ...")
    log.debug("> {0}".format(exc_path))
    with tsv.open(exc_path, "w") as exf:
        tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
        for gene, sample_count in gene_sample_count.items():
            causes = []
            if genes_filter_enabled and not filt.valid(gene):
                causes += [ProjectDb.GENE_EXC_FILTER]
            if sample_count < mutations_threshold:
                causes += [ProjectDb.GENE_EXC_THRESHOLD]
            if len(causes) > 0:
                tsv.write_line(exf, gene, "".join(causes))
    log.info("Sending project ...")
    projects_out_port.send(dict(project, oncodriveclust=dict(
        data_paths=data_paths,
        mutations_threshold=mutations_threshold,
        genes_filter_enabled=genes_filter_enabled,  # not used
        genes_filter=genes_filter)))  # not used
def end():
    """Consolidate all OncodriveFM sub-results into each project's database.

    task.context maps project_id -> list of project copies (one per sub-run).
    The functional-impact matrix is loaded once per project (index == 0),
    then each copy's gene/pathway results are applied. The first copy is
    forwarded downstream per project id.
    """
    log = task.logger
    projects_out_port = task.ports("projects_out")
    log.info("Updating the projects database ...")
    for project_id, projects in task.context.items():
        log.info("[{0}]".format(project_id))
        for index, project in enumerate(projects):
            projdb = ProjectDb(project["db"])
            if index == 0:
                # All copies share the same fimpact file; load it only once.
                log.info(" Functional impact ...")
                projdb.delete_sample_gene_fimpact()
                with tsv.open(project["sample_gene_fi_data"], "r") as f:
                    types = (int, str, float, float, int, float, float, int, float, float, int)
                    for fields in tsv.lines(f, types, header=True, null_value="-"):
                        projdb.add_sample_gene_fimpact(*fields)
            # NOTE(review): ofm is iterated as (feature, results_path) pairs —
            # presumably a list/tuple of 2-tuples; verify against the producer.
            ofm = project["oncodrivefm"]
            del project["oncodrivefm"]
            exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")
            log.info(" Excluded gene causes ...")
            log.debug(" > {0}".format(exc_path))
            count = 0
            with tsv.open(exc_path, "r") as exf:
                for gene, cause in tsv.lines(exf, (str, str), header=True):
                    projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
                    count += 1
            log.debug(" {0} genes excluded".format(count))
            for feature, results_path in ofm:
                log.info(" {0} ...".format(feature))
                log.debug(" > {0}".format(results_path))
                if feature == "genes":
                    with tsv.open(results_path, "r") as f:
                        count = 0
                        for gene, pvalue, qvalue in tsv.lines(f, (str, float, float), header=True):
                            # Genes with results are no longer excluded (NO_GENE_EXC).
                            projdb.update_gene(Gene(id=gene, fm_pvalue=pvalue, fm_qvalue=qvalue, fm_exc_cause=ProjectDb.NO_GENE_EXC))
                            count += 1
                    log.info(" {0} genes".format(count))
                elif feature == "pathways":
                    with tsv.open(results_path, "r") as f:
                        count = 0
                        for pathway, zscore, pvalue, qvalue in tsv.lines(f, (str, float, float, float), header=True):
                            projdb.update_pathway(Pathway(id=pathway, fm_zscore=zscore, fm_pvalue=pvalue, fm_qvalue=qvalue))
                            count += 1
                    log.info(" {0} pathways".format(count))
            projdb.commit()
            projdb.close()
        projects_out_port.send(projects[0])
def scan_files(project):
    """Build the variants database for a project from its source files.

    Downloads each storage object, parses the contained variant files into
    a fresh project database (genes and pathways preloaded), and routes the
    project to ``liftover_projects`` (hg18) or ``projects_out`` (hg19).

    Raises:
        Exception: for an unexpected assembly or a source with no variants.
    """
    log = task.logger
    conf = task.conf

    config = GlobalConfig(conf)
    paths = PathsConfig(config)

    projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

    project_id = project["id"]
    temp_path = project["temp_path"]
    project_path = project["path"]
    projdb_path = project["db"]
    assembly = project["assembly"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    # hg18 projects need a liftover step before joining the main flow.
    if assembly == "hg18":
        out_port = liftover_projects_port
    elif assembly == "hg19":
        out_port = projects_port
    else:
        raise Exception("Unexpected assembly: {0}".format(assembly))

    #if os.path.exists(projdb_path):
    #	log.warn("Variations database already created, skipping this step.")
    #	out_port.send(project)
    #	return

    # Always rebuild: discard any previously created database.
    if os.path.exists(projdb_path):
        os.remove(projdb_path)

    log.info("Creating variants database ...")

    # Work on a temp copy; the final database is copied into place at the end.
    projdb_tmp_path = make_temp_file(task, suffix=".db")

    log.debug(projdb_tmp_path)

    projdb = ProjectDb(projdb_tmp_path).create()

    data_path = config.data_path

    log.info("Loading genes ...")
    projdb.load_genes(paths.data_ensembl_genes_path())

    log.info("Loading pathways ...")
    projdb.load_pathways(
        paths.data_kegg_def_path(),
        paths.data_kegg_ensg_map_path())

    log.info("Parsing variants ...")

    for obj_name in project["storage_objects"]:
        log.info("Downloading {} ...".format(obj_name))
        dst_path = os.path.join(project_path, "sources", os.path.basename(obj_name))
        dst_dirname = os.path.dirname(dst_path)
        if not os.path.exists(dst_dirname):
            os.makedirs(dst_dirname)
        # TODO: do not copy the source file (do not specify dst_path)
        task.storage.get_object(obj_name).get_data(dst_path)

        # archived_files transparently expands containers (e.g. archives)
        # into individual member files.
        for container_name, path, name, ext, f in archived_files(dst_path):
            fname = os.path.join(path, name + ext)

            if container_name is not None:
                source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
            else:
                source_name = name + ext

            log.info("=> {0} ...".format(source_name))

            sample_id = os.path.basename(name)

            # Choose a parser by extension; unknown extensions are
            # treated as tab-separated.
            if ext.lower() in _SUPPORTED_EXTENSIONS:
                parser_type = ext[1:]
            else:
                parser_type = "tab"

            parser = create_variants_parser(parser_type, f, source_name, sample_id)

            source_id = projdb.add_source(source_name)

            var_ids = set()
            for var in parser:
                # Persist the raw lines consumed before this variant.
                for line_num, text in parser.read_lines():
                    projdb.add_source_line(source_id, line_num, text)

                var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
                var_ids.add(var_id)

            # Persist any trailing lines after the last variant.
            for line_num, text in parser.read_lines():
                projdb.add_source_line(source_id, line_num, text)

            num_variants = len(var_ids)
            log.info(" {0} variants".format(num_variants))

            if num_variants == 0:
                raise Exception("No variants found in source '{}'. "
                    "Please check the documentation for the expected input for '{}' format.".format(
                        source_name, parser.name))

    projdb.commit()
    projdb.close()

    log.info("Copying variants database ...")
    log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

    shutil.copy(projdb_tmp_path, projdb_path)

    remove_temp(task, projdb_tmp_path)

    out_port.send(project)
def compute(project): log = task.logger config = GlobalConfig(task.conf) paths = PathsConfig(config) projects_out_port = task.ports("projects_out") project_id = project["id"] log.info("--- [{0}] --------------------------------------------".format(project_id)) ofm = Data.element(project["oncodrivefm"]) feature = ofm["feature"] slice_name = ofm["slice"] estimator = ofm.get("estimator") num_cores = ofm.get("num_cores", dtype=str) num_samplings = ofm.get("num_samplings", dtype=str) samples_threshold = ofm.get("samples_threshold", dtype=str) filter_enabled = ofm.get("filter_enabled", dtype=bool) filter_path = ofm.get("filter_path", dtype=str) log.info("feature = {0}".format(feature)) log.info("slice = {0}".format(slice_name)) log.info("estimator = {0}".format(estimator)) log.info("num_cores = {0}".format(num_cores)) log.info("num_samplings = {0}".format(num_samplings)) log.info("samples_threshold = {0}".format(samples_threshold)) log.info("filter_enabled = {0}".format(filter_enabled)) log.info("filter_path = {0}".format(os.path.basename(filter_path))) cmd = [ "oncodrivefm-compute", "-o", project["temp_path"], "-n oncodrivefm-{0}".format(feature), "-N", num_samplings, "--threshold", samples_threshold, "-e {0}".format(estimator), "-j", num_cores, "--slices '{0}'".format(slice_name)] if filter_enabled: cmd += ["--filter", filter_path] if feature == "pathways": cmd += ["-m", paths.data_kegg_path("ensg_kegg.tsv")] cmd += [ofm["data"]] project["oncodrivefm"] = dict( feature=feature, slice=slice_name, results=os.path.join(project["temp_path"], "oncodrivefm-{0}-{1}.tsv".format(feature, slice_name))) cmd = " ".join(cmd) log.debug(cmd) ret_code = subprocess.call(cmd, shell=True) if ret_code != 0: raise Exception("OncodriveFM error while computing {0}:\n{1}".format(feature, cmd)) projects_out_port.send(project)
def gene_impact(project):
    """Aggregate per-partition transcript impacts into variant-gene impacts.

    Reads every partition's ``tfi_path`` file, folds the per-transcript
    predictor impacts (SIFT, PolyPhen2, MutationAssessor) per
    (variant, gene) pair via ``update_attr``, writes the combined
    ``variant-gene_impact.tsv`` and forwards the project with ``gfi_path``
    set through the ``projects`` port.
    """
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    projects_port = task.ports("projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    partitions = project["partitions"]

    log.info("Reading {} partitions ...".format(len(partitions)))

    # (var_id, gene) -> accumulated attributes across all partitions.
    aff_gene_attrs = {}

    for partition in partitions:
        log.info(" Partition {} ...".format(partition["index"]))
        with open(partition["tfi_path"], "r") as f:
            # "-" (null) becomes None, which maps to False.
            bool_type = lambda val: bool(int(val)) if val is not None else False
            types = (int, str, str, bool_type, int, int, int, int)
            columns = [0, 2, 4, 5, 6, 10, 14, 18]
            for fields in tsv.lines(f, types, columns=columns, null_value="-"):
                (var_id, gene, prot_change, coding_region,
                 tr_impact, sift_impact, pph2_impact, ma_impact) = fields

                # NOTE(review): redundant — bool_type already yields a bool,
                # and True == 1 / False == 1 preserve the value.
                coding_region = coding_region == 1

                aff_gene = (var_id, gene)

                # update aggregated impact for all the predictors
                update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact,
                            update=TransFIC.higher_impact)
                update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact,
                            update=TransFIC.higher_impact)
                update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact,
                            update=TransFIC.higher_impact)

                # update whether the affected gene is a coding region or not
                update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
                            update=lambda prev_value, value: prev_value or value)

                # aggregate protein changes per affected_gene
                if prot_change is not None:
                    update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
                                new=lambda value: set([value]),
                                update=lambda prev_value, value: prev_value | set([value]))

    num_vars = len(set([var_id for var_id, gene in aff_gene_attrs.keys()]))
    num_genes = len(set([gene for var_id, gene in aff_gene_attrs.keys()]))

    log.info("Saving {} variant-gene impacts ({} variants and {} genes) ...".format(
        len(aff_gene_attrs), num_vars, num_genes))

    gfi_path = os.path.join(project["csq_path"], "variant-gene_impact.tsv")
    with open(gfi_path, "w") as vf:
        for aff_gene, attrs in aff_gene_attrs.items():
            var_id, gene = aff_gene

            # get the impact by trust priority: ma, pph2, sift
            impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = attrs.get("coding_region", False)
            coding_region = 1 if coding_region else 0

            prot_changes = attrs.get("prot_changes")
            prot_changes = ",".join(prot_changes) if prot_changes is not None else None

            tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

    # Send results to the next module
    project["gfi_path"] = gfi_path
    projects_port.send(project)
def fimpact_run(partition):
    """Compute transcript-level functional impact for one VEP partition.

    Combines MutationAssessor scores with VEP consequence types, runs
    TransFIC where applicable, writes per-transcript results (``.tfi``)
    and aggregated per (variant, gene) impacts (``.gfi``), then forwards
    the partition (with both paths set) through the ``results`` port.
    """
    log = task.logger
    conf = task.conf

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(
        project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    # var_id -> uniprot accession / MA functional impact score.
    ma_uniprot = {}
    ma_scores = {}
    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
    cf = open(tfi_path, "w")

    # (var_id, gene) -> aggregated attributes over all transcripts.
    aff_gene_attrs = {}

    with open(partition["vep_path"], "r") as f:
        for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
            (var_id, gene, transcript, ct,
             protein_pos, aa_change, protein,
             sift_score, pph2_score) = fields

            # ct is a comma-separated list of consequence types.
            if ct is not None:
                ct = ct.split(",")
            else:
                ct = []

            # Invert sift score
            if sift_score is not None:
                sift_score = 1.0 - sift_score

            ma_score = None

            uniprot = ma_uniprot[var_id] if var_id in ma_uniprot else None

            sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

            coding_region = so.match(ct, so.CODING_REGION)

            calculate_transfic = True

            # Classify the consequence type; some classes preassign fixed
            # impacts/scores, others defer to the TransFIC calculation.
            ct_type = None
            if so.match(ct, so.NON_SYNONYMOUS): # missense
                ct_type = TransFIC.CT_NON_SYNONYMOUS
                ma_score = ma_scores[var_id] if var_id in ma_scores else None
            elif so.match(ct, so.STOP): # stop
                ct_type = TransFIC.CT_STOP
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.FRAMESHIFT): # frameshift
                ct_type = TransFIC.CT_FRAMESHIFT
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                sift_score = pph2_score = 1.0
                ma_score = 3.5
            elif so.match(ct, so.SPLICE): # splice
                ct_type = "splice"
                # Only splice-junction variants are considered high impact.
                sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS if so.match(ct, so.SPLICE_JUNCTION) else TransFIC.UNKNOWN_IMPACT_CLASS
                calculate_transfic = False
            elif so.match(ct, so.SYNONYMOUS): # synonymous
                ct_type = TransFIC.CT_SYNONYMOUS
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                sift_score = pph2_score = 0.0
                ma_score = -2
            else:
                sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                calculate_transfic = False

            if calculate_transfic:
                (sift_tfic, sift_class,
                 pph2_tfic, pph2_class,
                 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

                # if the impact was not preassigned get it from the transFIC calculated class
                sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
                pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
                ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
            else:
                sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

            aff_gene = (var_id, gene)

            # update aggregated impact for all the predictors
            update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact,
                        update=TransFIC.higher_impact)
            update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact,
                        update=TransFIC.higher_impact)
            update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact,
                        update=TransFIC.higher_impact)

            # update whether the affected gene is a coding region or not
            update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
                        update=lambda prev_value, value: prev_value or value)

            # aggregate protein changes per affected_gene
            # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
            prot_change = None
            if ct_type == TransFIC.CT_FRAMESHIFT:
                if protein_pos is None:
                    prot_change = "fs"
                else:
                    prot_change = "fs {0}".format(protein_pos)
                #log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif ct_type == "splice":
                prot_change = "r.spl?"
                #log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
            elif protein_pos is not None and aa_change is not None:
                rc = ReContext()
                if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                    prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                    prot_change = "{0} {1}".format(aa_change, protein_pos)
                else:
                    log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                        gene, protein, protein_pos, aa_change, ", ".join(ct)))

            if prot_change is not None:
                update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
                            new=lambda value: set([value]),
                            update=lambda prev_value, value: prev_value | set([value]))

            # Combined impact by trust priority: ma, pph2, sift.
            impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

            tsv.write_line(cf, var_id, transcript, uniprot,
                           sift_score, sift_tfic, sift_class,
                           pph2_score, pph2_tfic, pph2_class,
                           ma_score, ma_tfic, ma_class,
                           impact, null_value="-")

    cf.close()

    log.info("Saving variant impacts ...")

    gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))
    vf = open(gfi_path, "w")
    for aff_gene, attrs in aff_gene_attrs.items():
        var_id, gene = aff_gene

        # get the impact by trust priority: ma, pph2, sift
        impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS

        coding_region = attrs.get("coding_region", False)
        coding_region = 1 if coding_region else 0

        prot_changes = attrs.get("prot_changes")
        prot_changes = ",".join(prot_changes) if prot_changes is not None else None

        tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")
    vf.close()

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    partition["gfi_path"] = gfi_path
    results_port.send(partition)
def fimpact_run(partition):
    """Compute transcript-level functional impact for one VEP partition.

    Combines MutationAssessor scores with VEP consequence types, runs
    TransFIC for missense variants, writes per-transcript results to a
    ``.tfi`` file, and forwards the partition (with ``tfi_path`` set)
    through the ``results`` port.
    """
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    results_port = task.ports("results")

    project = partition["project"]

    log.info("--- [{0} @ {1}] --------------------------------------------".format(
        project["id"], partition["index"]))

    log.info("Reading MA scores ...")

    # var_id -> uniprot accession / MA functional impact score.
    ma_uniprot = {}
    ma_scores = {}
    with open(partition["ma_path"], "r") as f:
        for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
            ma_uniprot[var_id] = uniprot
            ma_scores[var_id] = fi_score

    log.info("Reading VEP results and calculating functional impact ...")

    tfic = TransFIC(data_path=paths.data_transfic_path())

    tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

    # FIX: use a context manager so the output file is closed even if an
    # exception is raised while processing the VEP results.
    with open(tfi_path, "w") as cf:
        with open(partition["vep_path"], "r") as f:
            for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
                (var_id, gene, transcript, ct,
                 protein_pos, aa_change, protein,
                 sift_score, pph2_score) = fields

                # ct is a comma-separated list of consequence types.
                ct = (ct or "").split(",")

                # Invert sift score
                if sift_score is not None:
                    sift_score = 1.0 - sift_score

                ma_score = None

                uniprot = ma_uniprot.get(var_id)

                sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

                coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

                sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (
                    None, None, None, None, None, None)

                # Classify the consequence type. ct_type drives the
                # protein-change annotation below.
                ct_type = None
                if so.match(ct, so.NON_SYNONYMOUS): # missense
                    ct_type = TransFIC.CT_NON_SYNONYMOUS
                    ma_score = ma_scores.get(var_id)

                    (sift_tfic, sift_class,
                     pph2_tfic, pph2_class,
                     ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type,
                                                         sift_score, pph2_score, ma_score)

                    sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
                    pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
                    ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
                elif so.match(ct, so.STOP): # stop
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.FRAMESHIFT): # frameshift
                    # FIX: ct_type was never assigned in this branch, so the
                    # frameshift protein-change annotation below was dead code.
                    ct_type = TransFIC.CT_FRAMESHIFT
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SPLICE_JUNCTION): # splice junction
                    # FIX: mark as splice so prot_change gets "r.spl?" below.
                    ct_type = "splice"
                    sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SPLICE_REGION): # splice region
                    # FIX: mark as splice so prot_change gets "r.spl?" below.
                    ct_type = "splice"
                    sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
                    sift_score = pph2_score = 1.0
                    ma_score = 3.5
                elif so.match(ct, so.SYNONYMOUS): # synonymous
                    sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
                    sift_score = pph2_score = 0.0
                    ma_score = -2
                else:
                    sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

                # try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
                prot_change = None
                if ct_type == TransFIC.CT_FRAMESHIFT:
                    if protein_pos is None:
                        prot_change = "fs"
                    else:
                        prot_change = "fs {0}".format(protein_pos)
                    #log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
                elif ct_type == "splice":
                    prot_change = "r.spl?"
                    #log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
                elif protein_pos is not None and aa_change is not None:
                    rc = ReContext()
                    if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
                        prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
                    elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
                        prot_change = "{0} {1}".format(aa_change, protein_pos)
                    else:
                        log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
                            gene, protein, protein_pos, aa_change, ", ".join(ct)))

                # Combined impact by trust priority: ma, pph2, sift.
                tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

                tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
                               sift_score, sift_tfic, sift_class, sift_impact,
                               pph2_score, pph2_tfic, pph2_class, pph2_impact,
                               ma_score, ma_tfic, ma_class, ma_impact,
                               null_value="-")

    # Send results to the next module
    partition["tfi_path"] = tfi_path
    results_port.send(partition)