Beispiel #1
0
def get_paths(project_id, conf=None):
	if conf is None:
		conf = get_project_conf()

	results_path = get_results_path(conf)
	project_path = get_project_path(conf, project_id)
	project_temp_path = get_temp_path(conf, project_id)
	ensure_path_exists(project_path)
	ensure_path_exists(project_temp_path)
	return results_path, project_path, project_temp_path
Beispiel #2
0
def split_variants(project):
    log = task.logger

    config = GlobalConfig(task.conf)

    partition_port = task.ports("partitions")

    log.info("--- [{}] --------------------------------------------".format(project["id"]))

    projdb = ProjectDb(project["db"])

    log.info("Preparing variants for VEP ...")

    base_path = os.path.join(project["temp_path"], "consequences")

    ensure_path_exists(base_path)

    project["csq_path"] = base_path

    partition_size = config.vep_partition_size
    partition = -1
    f = None

    count = 0
    for var in projdb.variants(order_by="position"):
        start, end, ref, alt = var_to_tab(var)

        if count % partition_size == 0:
            if f is not None:
                f.close()

            partition += 1
            partition_path = os.path.join(base_path, "{0:08d}.vep_in".format(partition))
            f = open(partition_path, "w")
            partition_port.send(
                {"project": project, "index": partition, "bed_path": partition_path, "base_path": base_path}
            )

        tsv.write_line(f, var.chr, start, end, ref + "/" + alt, var.strand, var.id)

        count += 1

    if f is not None:
        f.close()

    log.info("{} variants split into {} partitions".format(count, partition + 1))

    projdb.close()
Beispiel #3
0
def init_project(logger, config, paths, storage, project):
	project_id = project["id"]

	results_path = paths.results_path()
	project_path = paths.project_path(project_id)
	project_temp_path = paths.project_temp_path(project_path)

	if config.results.purge_on_start:
		logger.info("  Purging previous results ...")
		if os.path.isdir(project_path):
			logger.info("    {} ...".format(os.path.relpath(project_path, results_path)))
			shutil.rmtree(project_path)
		#if os.path.isdir(project_temp_path):
		#	logger.info("    {} ...".format(os.path.relpath(project_temp_path, results_path)))
		#	shutil.rmtree(project_temp_path)

		for obj_name in storage.list_objects(prefix="results/"):
			logger.info("    {} ...".format(obj_name))
			storage.delete_object("results/{}".format(obj_name))

	ensure_path_exists(project_path)
	ensure_path_exists(project_temp_path)

	projdb_path = os.path.join(project_path, "project.db")

	if "annotations" in project:
		annotations = project["annotations"]
		if not Data.is_element(annotations):
			logger.warn("Overriding project annotations field with an empty dictionary")
			project["annotations"] = annotations = Data.element()
	else:
		project["annotations"] = annotations = Data.element()

	# for backward compatibility
	for key in project.keys():
		if key not in ["id", "assembly", "files", "storage_objects", "annotations", "conf", "oncodriveclust", "oncodrivefm"]:
			value = project[key]
			del project[key]
			annotations[key] = value

	project["conf"] = pconf = project.get("conf") or Data.element()
	if not Data.is_element(pconf):
		logger.warn("Overriding project conf field with an empty dictionary")
		project["conf"] = pconf = Data.element()

	# for backward compatibility
	for key in project.keys():
		if key in ["oncodriveclust", "oncodrivefm"]:
			value = project[key]
			del project[key]
			pconf[key] = value

	project["path"] = project_path
	project["temp_path"] = project_temp_path
	project["db"] = projdb_path

	if "assembly" not in project:
		project["assembly"] = DEFAULT_ASSEMBLY

	missing_objects = []

	for obj_name in project["storage_objects"]:
		if not storage.exists_object(obj_name):
			missing_objects += [obj_name]

	if len(missing_objects) > 0:
		raise InternalError("Project {0} references some missing objects:\n{1}".format(project_id, "\n".join(missing_objects)))

	project["files"] = [str(f) for f in project["files"]] #unicode is not json serializable
	project["storage_objects"] = [str(f) for f in project["storage_objects"]] #unicode is not json serializable

	project = project.to_native()

	# save project.conf
	projres = ProjectResults(project)
	projres.save_def()

	return project
Beispiel #4
0
def create_datasets(project):
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]
	temp_path = project["temp_path"]

	datasets_path = paths.project_results_path(project_path)
	ensure_path_exists(datasets_path)

	sigdb = SigDb(config.sigdb_path)
	sigdb.open()

	projdb = ProjectDb(project["db"])

	gene_sym = projdb.get_gene_symbols()

	total_samples = projdb.get_total_affected_samples()

	log.info("Exporting variant genes ...")

	vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
	tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE",
					"GENE_ID", "IMPACT", "IMPACT_CLASS",
					"SAMPLE_FREQ", "SAMPLE_PROP",
					"CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

	sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
	tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

	count = 0
	for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
		var = afg.var
		rec = afg.rec

		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		xrefs = [xref for xref in var.xrefs]
		if sigdb.exists_variant(var.chr, start):
			xrefs += ["I:1"]
		xrefs = ",".join(xrefs)

		intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

		tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
						afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
						rec.sample_freq, rec.sample_prop,
						afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value="\N")

		for sample in var.samples:
			tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value="\N")

		count += 1

	vf.close()
	sf.close()

	log.info("  {0} variant genes".format(count))

	log.info("Exporting consequences ...")

	cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
	tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
				   "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
					"SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
					"PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
					"MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
					"IMPACT", "IMPACT_CLASS")

	count = 0
	for csq in projdb.consequences(join_variant=True):
		var = csq.var
		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		uniprot = protein = protein_pos = aa_change = None
		sift_score = sift_tfic = sift_tfic_class = None
		pph2_score = pph2_tfic = pph2_tfic_class = None
		ma_score = ma_tfic = ma_tfic_class = None

		if so.match(csq.ctypes, so.ONCODRIVEFM):
			uniprot, protein = csq.uniprot, csq.protein

		if so.match(csq.ctypes, so.NON_SYNONYMOUS):
			protein_pos, aa_change = csq.protein_pos, csq.aa_change
			sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
			pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
			ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

		tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
						",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
						uniprot, protein, protein_pos, aa_change,
						sift_score, sift_tfic, sift_tfic_class,
						pph2_score, pph2_tfic, pph2_tfic_class,
						ma_score, ma_tfic, ma_tfic_class,
						csq.impact, TransFIC.class_name(csq.impact), null_value="\N")
		count += 1

	cf.close()

	log.info("  {0} consequences".format(count))

	log.info("Exporting genes ...")

	gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
	tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
				   "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")


	for gene in projdb.genes(join_rec=True):
		rec = gene.rec

		if rec.sample_freq is None or rec.sample_freq == 0:
			continue

		intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

		tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
					   gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause, gene.clust_coords,
					   rec.sample_freq or 0, rec.sample_prop or 0,
					   intogen_driver, null_value="\N")

	gf.close()

	log.info("Exporting pathways ...")

	pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
	tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

	for pathway in projdb.pathways(join_rec=True):
		rec = pathway.rec

		if rec.sample_freq is None or rec.sample_freq == 0:
			continue

		tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
						rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0, null_value="\N")

	pf.close()

	if not config.skip_oncodrivefm:

		log.info("Exporting genes per sample functional impact ...")

		with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
			tsv.write_line(f, "GENE_ID", "SAMPLE",
					   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
					   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
					   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

			for fields in projdb.sample_gene_fimpacts():
				(gene, sample,
					sift_score, sift_tfic, sift_tfic_class,
					pph2_score, pph2_tfic, pph2_tfic_class,
					ma_score, ma_tfic, ma_tfic_class) = fields
				tsv.write_line(f, gene, sample,
						   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
						   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
						   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class), null_value="\N")

	projdb.close()

	sigdb.close()

	log.info("Saving project configuration ...")

	projres = ProjectResults(project)

	with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
		names = ["ASSEMBLY", "SAMPLES_TOTAL"]
		values = [project["assembly"], total_samples]
		names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"], names=names, values=values)
		tsv.write_line(f, *names)
		tsv.write_line(f, *values, null_value="\N")

	projects_port = task.ports("projects_out")
	projects_port.send(project)
Beispiel #5
0
	def save_quality_control(self, key, data):
		qc_path = os.path.join(self.project["path"], "quality_control")
		ensure_path_exists(qc_path)
		with open(os.path.join(qc_path, "{}.json".format(key)), "w") as f:
			json.dump(data, f, indent=True, sort_keys=True)