def get_paths(project_id, conf=None):
	"""Resolve the results, project and temp paths for *project_id*.

	Falls back to the default project configuration when *conf* is None,
	and makes sure the project and temp directories exist on disk.

	Returns a (results_path, project_path, project_temp_path) tuple.
	"""
	cfg = get_project_conf() if conf is None else conf

	resolved = (
		get_results_path(cfg),
		get_project_path(cfg, project_id),
		get_temp_path(cfg, project_id),
	)

	# Only the project and temp directories are created; the results
	# path is merely resolved.
	for required_path in resolved[1:]:
		ensure_path_exists(required_path)

	return resolved
def split_variants(project):
	"""Dump the project's variants into fixed-size partition files for VEP.

	Variants are read in position order and written in chunks of
	``config.vep_partition_size`` lines, each chunk going to an
	``NNNNNNNN.vep_in`` file under the project's temp "consequences"
	directory. Every partition is announced on the "partitions" port as
	soon as its file is opened, so downstream tasks can start early.
	"""
	log = task.logger
	config = GlobalConfig(task.conf)
	partition_port = task.ports("partitions")

	log.info("--- [{}] --------------------------------------------".format(project["id"]))

	projdb = ProjectDb(project["db"])

	log.info("Preparing variants for VEP ...")

	base_path = os.path.join(project["temp_path"], "consequences")
	ensure_path_exists(base_path)
	project["csq_path"] = base_path

	partition_size = config.vep_partition_size
	part_index = -1
	out_file = None
	num_written = 0
	for var in projdb.variants(order_by="position"):
		start, end, ref, alt = var_to_tab(var)

		# Rotate to a fresh partition file every partition_size variants.
		if num_written % partition_size == 0:
			if out_file is not None:
				out_file.close()
			part_index += 1
			partition_path = os.path.join(base_path, "{0:08d}.vep_in".format(part_index))
			out_file = open(partition_path, "w")
			partition_port.send({
				"project": project,
				"index": part_index,
				"bed_path": partition_path,
				"base_path": base_path})

		tsv.write_line(out_file, var.chr, start, end, ref + "/" + alt, var.strand, var.id)
		num_written += 1

	if out_file is not None:
		out_file.close()

	log.info("{} variants split into {} partitions".format(num_written, part_index + 1))

	projdb.close()
def init_project(logger, config, paths, storage, project):
	"""Prepare a project definition for a pipeline run.

	Optionally purges previous results, normalizes legacy top-level
	fields into the "annotations" and "conf" sub-elements, fills in
	defaults (assembly, paths, project db path) and verifies that every
	referenced storage object exists.

	:param logger: logger used for progress messages
	:param config: global configuration (reads config.results.purge_on_start)
	:param paths: paths helper resolving results/project/temp directories
	:param storage: object storage backend (list/delete/exists operations)
	:param project: project definition (a Data element)
	:return: the project converted to a native (JSON-serializable) structure
	:raises InternalError: if any referenced storage object is missing
	"""
	project_id = project["id"]

	results_path = paths.results_path()
	project_path = paths.project_path(project_id)
	project_temp_path = paths.project_temp_path(project_path)

	if config.results.purge_on_start:
		logger.info(" Purging previous results ...")
		if os.path.isdir(project_path):
			logger.info(" {} ...".format(os.path.relpath(project_path, results_path)))
			shutil.rmtree(project_path)
		# NOTE(review): purging of project_temp_path was deliberately
		# disabled (previously commented out) -- confirm before re-enabling.
		for obj_name in storage.list_objects(prefix="results/"):
			logger.info(" {} ...".format(obj_name))
			storage.delete_object("results/{}".format(obj_name))

	ensure_path_exists(project_path)
	ensure_path_exists(project_temp_path)

	projdb_path = os.path.join(project_path, "project.db")

	# Normalize the "annotations" field to a Data element.
	if "annotations" in project:
		annotations = project["annotations"]
		if not Data.is_element(annotations):
			logger.warn("Overriding project annotations field with an empty dictionary")
			project["annotations"] = annotations = Data.element()
	else:
		project["annotations"] = annotations = Data.element()

	# For backward compatibility: move unknown top-level keys into
	# "annotations". Iterate over a snapshot of the keys, since entries
	# are deleted inside the loop (mutating the mapping while iterating
	# raises RuntimeError on Python 3).
	for key in list(project.keys()):
		if key not in ["id", "assembly", "files", "storage_objects", "annotations",
					   "conf", "oncodriveclust", "oncodrivefm"]:
			value = project[key]
			del project[key]
			annotations[key] = value

	# Normalize the "conf" field to a Data element.
	project["conf"] = pconf = project.get("conf") or Data.element()
	if not Data.is_element(pconf):
		logger.warn("Overriding project conf field with an empty dictionary")
		project["conf"] = pconf = Data.element()

	# For backward compatibility: move analysis options under "conf"
	# (snapshot of the keys again, for the same reason as above).
	for key in list(project.keys()):
		if key in ["oncodriveclust", "oncodrivefm"]:
			value = project[key]
			del project[key]
			pconf[key] = value

	project["path"] = project_path
	project["temp_path"] = project_temp_path
	project["db"] = projdb_path
	if "assembly" not in project:
		project["assembly"] = DEFAULT_ASSEMBLY

	missing_objects = [obj_name for obj_name in project["storage_objects"]
					   if not storage.exists_object(obj_name)]

	if len(missing_objects) > 0:
		raise InternalError("Project {0} references some missing objects:\n{1}".format(
			project_id, "\n".join(missing_objects)))

	project["files"] = [str(f) for f in project["files"]] #unicode is not json serializable
	project["storage_objects"] = [str(f) for f in project["storage_objects"]] #unicode is not json serializable

	project = project.to_native()

	# save project.conf
	projres = ProjectResults(project)
	projres.save_def()

	return project
def create_datasets(project):
	"""Export the project's results database into TSV datasets.

	Writes the variant_gene, variant-samples, consequence, gene and
	pathway datasets (plus gene_sample-fimpact unless OncodriveFM is
	skipped and a project.tsv summary) under the project results path,
	then forwards the project on the "projects_out" port.

	NULL values are written as the literal "\\N" marker (null_value="\\N").
	"""
	log = task.logger
	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]
	temp_path = project["temp_path"]

	datasets_path = paths.project_results_path(project_path)
	ensure_path_exists(datasets_path)

	# Known-drivers signature database; consulted per variant and per gene.
	sigdb = SigDb(config.sigdb_path)
	sigdb.open()

	projdb = ProjectDb(project["db"])

	gene_sym = projdb.get_gene_symbols()

	total_samples = projdb.get_total_affected_samples()

	# --- variant_gene + variant-samples datasets -------------------------
	log.info("Exporting variant genes ...")

	vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
	tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

	sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
	tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

	count = 0
	for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
		var = afg.var
		rec = afg.rec

		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		xrefs = [xref for xref in var.xrefs]
		# "I:1" marks variants present in the internal signature db.
		if sigdb.exists_variant(var.chr, start):
			xrefs += ["I:1"]
		xrefs = ",".join(xrefs)

		intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

		tsv.write_line(vf, var.id, var.chr, var.strand, start, allele, afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
					   rec.sample_freq, rec.sample_prop, afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value="\N")

		# One row per affected sample of this variant.
		for sample in var.samples:
			tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value="\N")

		count += 1

	vf.close()
	sf.close()

	log.info("  {0} variant genes".format(count))

	# --- consequence dataset ---------------------------------------------
	log.info("Exporting consequences ...")

	cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
	tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
				   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
				   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
				   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
				   "IMPACT", "IMPACT_CLASS")
	count = 0
	for csq in projdb.consequences(join_variant=True):
		var = csq.var
		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		# Optional columns default to None (-> "\N") unless the
		# consequence type matches the relevant SO category.
		uniprot = protein = protein_pos = aa_change = None
		sift_score = sift_tfic = sift_tfic_class = None
		pph2_score = pph2_tfic = pph2_tfic_class = None
		ma_score = ma_tfic = ma_tfic_class = None

		if so.match(csq.ctypes, so.ONCODRIVEFM):
			uniprot, protein = csq.uniprot, csq.protein

		if so.match(csq.ctypes, so.NON_SYNONYMOUS):
			protein_pos, aa_change = csq.protein_pos, csq.aa_change
			sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
			pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
			ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

		tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript, ",".join(csq.ctypes),
					   csq.gene, gene_sym.get(csq.gene),
					   uniprot, protein, protein_pos, aa_change,
					   sift_score, sift_tfic, sift_tfic_class,
					   pph2_score, pph2_tfic, pph2_tfic_class,
					   ma_score, ma_tfic, ma_tfic_class,
					   csq.impact, TransFIC.class_name(csq.impact),
					   null_value="\N")
		count += 1
	cf.close()

	log.info("  {0} consequences".format(count))

	# --- gene dataset -----------------------------------------------------
	log.info("Exporting genes ...")

	gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
	tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
				   "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

	for gene in projdb.genes(join_rec=True):
		rec = gene.rec
		# Skip genes with no affected samples.
		if rec.sample_freq is None or rec.sample_freq == 0:
			continue
		intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
		tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
					   gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause, gene.clust_coords,
					   rec.sample_freq or 0, rec.sample_prop or 0, intogen_driver, null_value="\N")
	gf.close()

	# --- pathway dataset --------------------------------------------------
	log.info("Exporting pathways ...")

	pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
	tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

	for pathway in projdb.pathways(join_rec=True):
		rec = pathway.rec
		# Skip pathways with no affected samples.
		if rec.sample_freq is None or rec.sample_freq == 0:
			continue
		tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
					   rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0, null_value="\N")
	pf.close()

	# --- gene_sample-fimpact dataset (only when OncodriveFM ran) ----------
	if not config.skip_oncodrivefm:
		log.info("Exporting genes per sample functional impact ...")

		with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
			tsv.write_line(f, "GENE_ID", "SAMPLE",
						   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
						   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
						   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")
			for fields in projdb.sample_gene_fimpacts():
				(gene, sample,
				 sift_score, sift_tfic, sift_tfic_class,
				 pph2_score, pph2_tfic, pph2_tfic_class,
				 ma_score, ma_tfic, ma_tfic_class) = fields
				tsv.write_line(f, gene, sample,
							   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
							   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
							   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class), null_value="\N")

	projdb.close()
	sigdb.close()

	# --- project.tsv summary ----------------------------------------------
	log.info("Saving project configuration ...")

	projres = ProjectResults(project)
	with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
		names = ["ASSEMBLY", "SAMPLES_TOTAL"]
		values = [project["assembly"], total_samples]
		names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"], names=names, values=values)
		tsv.write_line(f, *names)
		tsv.write_line(f, *values, null_value="\N")

	projects_port = task.ports("projects_out")
	projects_port.send(project)
def save_quality_control(self, key, data):
	"""Persist *data* as ``<key>.json`` in the project's quality_control dir.

	The directory is created under the project path if it does not exist;
	the JSON is written indented with sorted keys for stable diffs.
	"""
	qc_dir = os.path.join(self.project["path"], "quality_control")
	ensure_path_exists(qc_dir)

	file_name = "{}.json".format(key)
	with open(os.path.join(qc_dir, file_name), "w") as out:
		json.dump(data, out, indent=True, sort_keys=True)