Example #1
	def save_matrix(self, output_path, analysis_name, output_format,
						 row_names, col_names, data,
						 suffix="", params=None, valid_row=lambda row: True):
		"""Write a data matrix as a tab-separated file with a commented header.

		The file is created at
		``<output_path>/<analysis_name>[-<suffix>].<output_format>``.
		It starts with ``## key=value`` lines (version, date, then the pairs in
		`params` followed by `self.parameters`), then an ``ID`` + `col_names`
		header, then one line per row of `data` accepted by `valid_row`.
		NaN cells are passed to the writer as None with null_value="-".
		"""
		name_suffix = "-{0}".format(suffix) if len(suffix) > 0 else suffix
		extra_params = [] if params is None else params

		file_name = "{0}{1}.{2}".format(analysis_name, name_suffix, output_format)
		path = os.path.join(output_path, file_name)
		self.log.debug("  > {0}".format(path))

		with tsv.open(path, 'w') as out:
			tsv.write_line(out, "## version={0}".format(VERSION))
			timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
			tsv.write_line(out, "## date={0}".format(timestamp))
			for key, value in extra_params + self.parameters:
				tsv.write_line(out, "## {0}={1}".format(key, value))

			tsv.write_line(out, "ID", *col_names)

			for index, name in enumerate(row_names):
				if len(name) == 0:
					self.log.warn("Empty identifier detected")

				row = data[index, :]
				if not valid_row(row):
					continue

				cells = [None if np.isnan(cell) else cell for cell in row]
				tsv.write_line(out, name, *cells, null_value="-")
Example #2
def ma_run(partition):
	"""Query Mutation Assessor for the non-synonymous variants of a partition.

	Variant ids and consequence types are read from ``partition["vep_path"]``
	(columns 0 and 3). Every variant whose consequence types match
	``so.NON_SYNONYMOUS`` is looked up, and the hits are written as
	``var_id, uniprot, fi_score`` lines to ``<base_path>/<index:08d>.ma``.
	The results path is stored in ``partition["ma_path"]``.

	Existing results are reused only when the file exists and the
	``consequences_overwrite`` configuration value is false.
	"""
	log = task.logger
	conf = task.conf

	# NOTE(review): the port is obtained but never used in the visible code;
	# presumably the partition should be sent through it at the end — confirm.
	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	offline = conf["offline"]

	# Pick exactly one backend; without the else the web service client
	# always replaced the local one.
	if offline == "yes":
		log.info("Running Mutation assessor in local mode.")

		ma = MaLocal(conf["ma_cache_path"])
	else:
		log.info("Running Mutation assessor using web services.")

		ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))

	results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

	if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):

		log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

		projdb = ProjectDb(project["db"])

		# Collect the ids first so that each variant is queried only once,
		# even when it appears in several consequence lines.
		missense_variants = set()

		with open(partition["vep_path"], "r") as f:
			for line in f:
				fields = line.rstrip().split("\t")
				var_id = int(fields[0])
				ctypes = fields[3].split(",")
				if so.match(ctypes, so.NON_SYNONYMOUS):
					missense_variants.add(var_id)

		with open(results_path, "w") as mf:
			for var_id in missense_variants:
				var = projdb.get_variant(var_id)

				start, end, ref, alt = var_to_tab(var)

				r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
				if r is not None:
					tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")
	else:
		log.warn("Skipping MA, results already exist.")
		log.debug("MA results: {0}".format(results_path))

	# Send results to the next module
	partition["ma_path"] = results_path
Example #3
def datasets(projects_set):
	"""Append one summary line for a classifier group to its combination file.

	`projects_set` is a ``(classifier, projects)`` pair. The affected samples
	of every project are added up and a line with the group names, the total
	and the comma-joined project ids is appended to
	``<combination_path>/<normalized classifier id>.tsv``. The column header
	is written first when the file does not exist yet.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	# Read (and thereby require) the same classifier keys as before, even
	# though the value lists are not used below.
	group_values = classifier["group_values"]
	short_values = classifier["group_short_values"]
	long_values = classifier["group_long_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	group_file_prefix = normalize_id(classifier_id)

	banner = "--- [{0} ({1}) ({2}) ({3})] {4}".format(
		classifier["name"], group_long_name, group_short_name, group_name, "-" * 30)
	log.info(banner)

	log.info("Reading number of samples per project ...")

	project_ids = []
	total_samples = 0
	for proj in projects:
		project_ids.append(proj["id"])

		log.info("  Project {0}".format(proj["id"]))

		projdb = ProjectDb(proj["db"])

		sample_count = projdb.get_total_affected_samples()
		total_samples += sample_count

		log.debug("    {0} samples".format(sample_count))

	log.debug("  {0} samples in total".format(total_samples))

	log.info("Updating ...")

	file_name = "{0}.tsv".format(group_file_prefix)
	path = os.path.join(paths.combination_path(), file_name)

	# Appending creates the file when needed, so only the header is conditional.
	header_needed = not os.path.exists(path)

	with open(path, "a") as out:
		if header_needed:
			tsv.write_line(out, "NAME", "SHORT_NAME", "LONG_NAME", "SAMPLES_TOTAL", "PROJECT_IDS")
		tsv.write_line(out, group_name, group_short_name, group_long_name, total_samples, ",".join(project_ids))
Example #4
def vep_run(partition):
	"""Save the VEP results of one partition into a tab-separated file.

	A local or web-service VEP backend is chosen from the ``offline``
	configuration value. Its results are written to
	``<base_path>/<index:08d>.vep`` unless that file already exists and the
	``consequences_overwrite`` configuration value is false. The results path
	is stored in ``partition["vep_path"]``.
	"""
	log = task.logger
	conf = task.conf

	# NOTE(review): the port is obtained but never used in the visible code;
	# presumably the partition should be sent through it at the end — confirm.
	results_port = task.ports("results")

	project_id = partition["project"]["id"]
	log.info("--- [{0} @ {1}] --------------------------------------------".format(project_id, partition["index"]))

	offline = conf["offline"]

	# Pick exactly one backend; without the else, `vep` was left undefined
	# in online mode and replaced by the service client in offline mode.
	if offline == "yes":
		log.info("Running VEP in local mode.")

		vep = VepLocal(
				script_path=os.path.join(conf["ext_bin_path"], "variant_effect_predictor", "variant_effect_predictor.pl"),
				cache_path=os.path.join(conf["data_path"], "vep_cache"))
	else:
		log.info("Running VEP using web services.")

		vep = VepService(cache_path=os.path.join(conf["cache_path"], "vep.db"))

	results_path = os.path.join(partition["base_path"], "{0:08d}.vep".format(partition["index"]))

	if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):
		# Run VEP
		# NOTE(review): no call that actually launches VEP on the partition
		# input is visible here before the results are read — confirm.

		log.info("Saving results ...")
		log.debug("VEP results: {0}".format(vep.results_path))

		# Save results
		# NOTE(review): the readers in this file take the consequence types
		# from the 4th column of this file, but no such column is written
		# here — verify the column list against the consumers.

		with open(results_path, "w") as f:
			for r in vep.results():
				tsv.write_line(f, r.var_id, r.gene, r.transcript,
								r.protein_pos, r.aa_change, r.protein,
								r.sift, r.polyphen, null_value="-")
	else:
		log.warn("Skipping VEP, results already exist.")
		log.debug("VEP results: {0}".format(results_path))

	# Send results to the next module
	partition["vep_path"] = results_path
Example #5
def split_variants(project):
    """Split the project's variants into partition files used as VEP input.

    Variants are read from the project database ordered by position and
    written as tab-separated lines into files named ``<index:08d>.vep_in``
    under ``<temp_path>/consequences``. A new file is started every
    ``config.vep_partition_size`` variants. The directory is stored in
    ``project["csq_path"]``.

    NOTE(review): this excerpt appears to have lost several lines (see the
    notes below), so it does not parse as shown.
    """
    log = task.logger

    config = GlobalConfig(task.conf)

    # NOTE(review): never used in the visible code — presumably each
    # partition description is meant to be sent through this port; confirm.
    partition_port = task.ports("partitions")

    log.info("--- [{}] --------------------------------------------".format(project["id"]))

    projdb = ProjectDb(project["db"])

    log.info("Preparing variants for VEP ...")

    base_path = os.path.join(project["temp_path"], "consequences")


    project["csq_path"] = base_path

    partition_size = config.vep_partition_size
    # Index of the current partition file; -1 until the first one is opened.
    partition = -1
    f = None

    count = 0
    for var in projdb.variants(order_by="position"):
        start, end, ref, alt = var_to_tab(var)

        # Every `partition_size` variants a new partition file is started.
        if count % partition_size == 0:
            # NOTE(review): the body of this `if` is missing; presumably the
            # previous partition file is closed here — confirm.
            if f is not None:

            partition += 1
            partition_path = os.path.join(base_path, "{0:08d}.vep_in".format(partition))
            f = open(partition_path, "w")
            # NOTE(review): the dict below is an orphaned continuation line;
            # the call it was an argument of is not visible.
                {"project": project, "index": partition, "bed_path": partition_path, "base_path": base_path}

        tsv.write_line(f, var.chr, start, end, ref + "/" + alt, var.strand, var.id)

        count += 1

    # NOTE(review): the body of this `if` is missing; presumably the last
    # partition file is closed here — confirm.
    if f is not None:

    log.info("{} variants split into {} partitions".format(count, partition + 1))

Example #6
def fetch(db, muts_path, out_path, params=None, columns=None, maps=None, predictors=None,
		  labels=None, calc_labels=None, muts_header=False, logger=None):
	"""Fetch the rows matching a mutations file and save them as TSV.

	The output starts with parameter lines (``db-version`` when the database
	metadata has a version, ``fetched`` with the current time, and every pair
	in `params`), followed by a header made of ``ID``, the upper-cased
	`columns` and `maps`, the `predictors` and the `labels`. One line is then
	written for each row yielded by `fetch_iter`.

	:param columns: row keys to export; defaults to the lower-cased COORD_COLUMNS
	:param maps: keys looked up in ``row["xrefs"]``
	:param predictors: keys looked up in ``row["scores"]``
	:param labels: label names, in header order
	:param calc_labels: optional callable ``row -> dict`` giving the label
		values of a row; missing labels are written as ""
	:return: dict with the STATE_HITS and STATE_FAILS entries of the state
	"""
	params = params or {}
	columns = columns or [c.lower() for c in COORD_COLUMNS]
	maps = maps or []
	predictors = list(predictors or [])
	labels = list(labels or [])
	state = {}
	with tsv.open(out_path, "w") as wf:
		metadata = db.metadata
		if "version" in metadata:
			tsv.write_param(wf, "db-version", metadata["version"])
		tsv.write_param(wf, "fetched", dt.now().strftime("%Y-%m-%d %H:%M:%S"))
		for k, v in params.items():
			tsv.write_param(wf, k, v)

		header = [c.upper() for c in columns] + [m.upper() for m in maps] + predictors + labels
		tsv.write_line(wf, "ID", *header)

		for row in fetch_iter(db, muts_path, maps=maps, predictors=predictors,
							  muts_header=muts_header, state=state, logger=logger):
			# Keep the per-row values apart from the `labels` name list, so
			# that every line has exactly the columns announced in the header.
			if calc_labels is not None:
				row_labels = calc_labels(row) or {}
			else:
				row_labels = {}

			xrefs = row["xrefs"]
			scores = row["scores"]

			values = ([row[c] for c in columns]
				   + [xrefs[m] for m in maps]
				   + [scores[p] for p in predictors]
				   + [row_labels.get(l, "") for l in labels])

			tsv.write_line(wf, state[STATE_MUTATION].identifier, *values)

	return {k: state[k] for k in [STATE_HITS, STATE_FAILS]}
Example #7
def main():
    """Extract the substitutions of VCF files into a simple tabulated file.

    Columns 0, 1, 3 and 4 of every input file (chromosome, position,
    reference and alternate alleles) are read. Records whose alleles are
    empty or differ in length are skipped; the remaining ones are written as
    one ``CHR POS REF ALT`` line per base.

    When no output path is given, it is derived from the common prefix of the
    input file names (``genome`` when there is none) plus ``.tsv.gz``.
    """
    parser = argparse.ArgumentParser(description="Extract mutations in VCF and save as simple tabulated file")

    parser.add_argument("vcf_paths", metavar="PATH", nargs="+", help="The VCF files")

    parser.add_argument("-o", dest="out_path", metavar="PATH", help="Output file. Use - for standard output.")

    args = parser.parse_args()

    log = bglogging.get_logger("vcf-to-snvs")

    if args.out_path is None:
        names = []
        for path in args.vcf_paths:
            if path != "-":
                base_path, name, ext = tsv.split_path(path)
                names += [name]

        # commonprefix takes ONE list of strings (and returns "" for an empty
        # list); unpacking the names raised TypeError for several files and
        # compared single characters for one file.
        prefix = os.path.commonprefix(names).rstrip(".")
        if len(prefix) == 0:
            prefix = "genome"
        args.out_path = "{}.tsv.gz".format(prefix)

    with tsv.open(args.out_path, "w") as outf:
        tsv.write_line(outf, "CHR", "POS", "REF", "ALT")

        for path in args.vcf_paths:
            log.info("Reading {} ...".format(path))

            with tsv.open(path) as inf:
                types = (str, str, str, str)
                columns = [0, 1, 3, 4]
                for fields in tsv.lines(inf, types, columns=columns):
                    chrom, pos, ref, alt = fields

                    # Only same-length substitutions can be split into SNVs.
                    if len(ref) != len(alt) or len(ref) == 0:
                        continue

                    try:
                        pos = int(pos)
                    except ValueError:
                        log.info("Skipping record with a non numeric position: {}".format(pos))
                        continue

                    # One line per base; a single-base record yields one line.
                    for offset, (ref_base, alt_base) in enumerate(zip(ref, alt)):
                        tsv.write_line(outf, chrom, pos + offset, ref_base, alt_base)
Example #8
def main():
	"""Export the SNVs of the database into a tab-separated file.

	Every SNV is written as ``chr, start, start, strand, ref>alt, "S"``.

	:return: 0 on success, otherwise the value of ``cmd.handle_error()``
	"""
	parser = argparse.ArgumentParser(
		description="Export SNV's")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("dest_path", metavar="DEST",
						help="The destination file. Use - for standard output.")

	# The logger is bound to `log`; the previous references to an undefined
	# `logger` name have been unified below.
	args, log = cmd.parse_args("export-snvs")

	db = cmd.open_db()

	log.info("Exporting SNV's ...")

	try:
		# NOTE(review): nothing visible updates `progress` while iterating;
		# it is only used for the elapsed time reported at the end.
		progress = RatedProgress(log, name="SNVs")
		rows_count = 0
		with tsv.open(args.dest_path, "w") as f:
			for snv in db.snvs():
				rows_count += 1

				tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"], "{}>{}".format(snv["ref"], snv["alt"]), "S")

		log.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
	except Exception:
		# Delegate reporting to the command helper and propagate its status.
		return cmd.handle_error()

	return 0
Example #9
def create_datasets(project):
	"""Export the results of a project as tab-separated datasets.

	The visible code writes, through `open_dataset`, the datasets
	"variant_gene", "variant-samples", "consequence", "gene", "pathway",
	"gene_sample-fimpact" (unless ``config.skip_oncodrivefm``) and
	"project.tsv".

	NOTE(review): this excerpt appears to have lost many lines (truncated
	header calls, empty ``if`` bodies, a missing ``names`` assignment), so it
	does not parse as shown. ``"\N"`` is also not a valid string escape in
	Python 3; the code presumably targets Python 2 — confirm.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]
	temp_path = project["temp_path"]

	datasets_path = paths.project_results_path(project_path)

	sigdb = SigDb(config.sigdb_path)

	projdb = ProjectDb(project["db"])

	gene_sym = projdb.get_gene_symbols()

	total_samples = projdb.get_total_affected_samples()

	log.info("Exporting variant genes ...")

	vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
	tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
	# NOTE(review): the rest of this header call is missing (the argument
	# list is never closed).
	tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE",

	sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
	tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

	count = 0
	for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
		var = afg.var
		rec = afg.rec

		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		# Variants already known to the significance database get an extra
		# "I:1" cross reference.
		xrefs = [xref for xref in var.xrefs]
		if sigdb.exists_variant(var.chr, start):
			xrefs += ["I:1"]
		xrefs = ",".join(xrefs)

		intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

		tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
						afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
						rec.sample_freq, rec.sample_prop,
						afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value="\N")

		# One line per sample carrying the variant.
		for sample in var.samples:
			tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value="\N")

		count += 1

	# NOTE(review): `vf` and `sf` are not closed in the visible code.

	log.info("  {0} variant genes".format(count))

	log.info("Exporting consequences ...")

	cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
	# NOTE(review): the rest of this header call is missing.
	tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",

	count = 0
	for csq in projdb.consequences(join_variant=True):
		var = csq.var
		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		# Protein level fields stay null unless the consequence types match
		# the groups checked below.
		uniprot = protein = protein_pos = aa_change = None
		sift_score = sift_tfic = sift_tfic_class = None
		pph2_score = pph2_tfic = pph2_tfic_class = None
		ma_score = ma_tfic = ma_tfic_class = None

		if so.match(csq.ctypes, so.ONCODRIVEFM):
			uniprot, protein = csq.uniprot, csq.protein

		if so.match(csq.ctypes, so.NON_SYNONYMOUS):
			protein_pos, aa_change = csq.protein_pos, csq.aa_change
			sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
			pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
			ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

		tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
						",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
						uniprot, protein, protein_pos, aa_change,
						sift_score, sift_tfic, sift_tfic_class,
						pph2_score, pph2_tfic, pph2_tfic_class,
						ma_score, ma_tfic, ma_tfic_class,
						csq.impact, TransFIC.class_name(csq.impact), null_value="\N")
		count += 1


	log.info("  {0} consequences".format(count))

	log.info("Exporting genes ...")

	gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
	tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
	# NOTE(review): the rest of this header call is missing.
	tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",

	for gene in projdb.genes(join_rec=True):
		rec = gene.rec

		# NOTE(review): the body of this `if` is missing; presumably genes
		# without affected samples are skipped — confirm.
		if rec.sample_freq is None or rec.sample_freq == 0:

		intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

		tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
					   gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause, gene.clust_coords,
					   rec.sample_freq or 0, rec.sample_prop or 0,
					   intogen_driver, null_value="\N")


	log.info("Exporting pathways ...")

	pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
	tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
	# NOTE(review): the rest of this header call is missing.
	tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",

	for pathway in projdb.pathways(join_rec=True):
		rec = pathway.rec

		# NOTE(review): the body of this `if` is missing; presumably pathways
		# without affected samples are skipped — confirm.
		if rec.sample_freq is None or rec.sample_freq == 0:

		tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
						rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0, null_value="\N")


	if not config.skip_oncodrivefm:

		log.info("Exporting genes per sample functional impact ...")

		with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
			# NOTE(review): the rest of this header call is missing.
			tsv.write_line(f, "GENE_ID", "SAMPLE",

			for fields in projdb.sample_gene_fimpacts():
				(gene, sample,
					sift_score, sift_tfic, sift_tfic_class,
					pph2_score, pph2_tfic, pph2_tfic_class,
					ma_score, ma_tfic, ma_tfic_class) = fields
				tsv.write_line(f, gene, sample,
						   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
						   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
						   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class), null_value="\N")



	log.info("Saving project configuration ...")

	projres = ProjectResults(project)

	with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
		values = [project["assembly"], total_samples]
		# NOTE(review): `names` is passed below but is never assigned in the
		# visible code; an initial list of column names matching `values`
		# is presumably missing above — confirm.
		names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"], names=names, values=values)
		tsv.write_line(f, *names)
		tsv.write_line(f, *values, null_value="\N")

	# NOTE(review): the port is obtained but nothing is sent through it in
	# the visible code.
	projects_port = task.ports("projects_out")
Example #10
	def save_splited_results(self, output_path, analysis_name, output_format,
								matrix, mapping, method, results, slices, suffix=""):
		"""Write the results of a method into one file per slice.

		Each file is created at
		``<output_path>/<analysis_name>[-<suffix>]-<slice name>.<output_format>``
		and holds ``## key=value`` lines (version, slice, method, date and
		`self.parameters`), an ``ID`` + ``method.results_columns`` header and
		one line per group of `mapping` whose result is not NaN.
		"""
		name_suffix = "-{0}".format(suffix) if len(suffix) > 0 else suffix

		for results_row, slice_index in enumerate(slices):
			slice_name = matrix.slice_names[slice_index]
			file_name = "{0}{1}-{2}.{3}".format(
				analysis_name, name_suffix, slice_name, output_format)
			path = os.path.join(output_path, file_name)
			self.log.debug("  > {0}".format(path))

			with tsv.open(path, 'w') as out:
				tsv.write_line(out, "## version={0}".format(VERSION))
				tsv.write_line(out, "## slice={0}".format(slice_name))
				tsv.write_line(out, "## method={0}".format(method.name))
				timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
				tsv.write_line(out, "## date={0}".format(timestamp))
				for key, value in self.parameters:
					tsv.write_line(out, "## {0}={1}".format(key, value))

				tsv.write_line(out, "ID", *method.results_columns)

				for group_index, group_name in enumerate(mapping.group_names):
					value = results[results_row, group_index]
					if np.isnan(value):
						continue
					tsv.write_line(out, group_name, value, null_value="-")
Example #11
def drivers():
	"""Build the drivers database and the driver genes cancer sites dataset.

	Steps performed:

	- every variant of the global recurrences file is added to the database;
	- genes with a q-value below the threshold in the per cancer site
	  OncodriveFM and OncodriveCLUST results are collected together with the
	  cancer sites where they are significant;
	- the significant genes are added to the database, flagged by the method
	  that detected them;
	- ``gene-driver_cancer_sites.tsv`` is written with one line per gene.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	db_path = paths.results_path("drivers.db")
	db = SigDb(db_path)

	log.info("Variants ...")

	path = paths.combination_path("recurrences", "variant_gene-global-all.tsv.gz")
	with tsv.open(path, "r") as f:
		types = (str, str, int, str)
		for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
			chrom, strand, start, allele = fields[:4]
			db.add_variant(chrom, start)

	log.info("Genes ...")

	gene_sites = {}

	gene_fm = set()
	gene_clust = set()

	# NOTE(review): SPECIAL_THRESHOLD is not defined in this function (the
	# assignment below is commented out), so it has to exist at module level.
	#SPECIAL_THRESHOLD = ["C18", "C34"]

	log.info("  OncodriveFM ...")

	filename_re = re.compile(r"gene-cancer_site-(.+)\.tsv.gz")
	base_path = paths.combination_path("oncodrivefm")
	for path in os.listdir(base_path):
		m = filename_re.match(path)
		if not m:
			# Not a per cancer site results file.
			continue

		cancer_site_code = m.group(1)

		# Some cancer sites use a stricter significance threshold.
		if cancer_site_code in SPECIAL_THRESHOLD:
			threshold = 1e-6
		else:
			threshold = 0.01

		with tsv.open(os.path.join(base_path, path), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				gene, qvalue = fields
				if qvalue < threshold:
					# Remember which method found the gene; these sets drive
					# the database update and the flags of the output file.
					gene_fm.add(gene)
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

	log.info("  OncodriveCLUST ...")

	filename_re = re.compile(r"cancer_site-(.+)\.tsv.gz")
	base_path = paths.combination_path("oncodriveclust")
	for path in os.listdir(base_path):
		m = filename_re.match(path)
		if not m:
			continue

		cancer_site_code = m.group(1)

		with tsv.open(os.path.join(base_path, path), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for fields in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				gene, qvalue = fields
				if qvalue < 0.05:
					gene_clust.add(gene)
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)

	log.info("  Updating db ...")
	sig_genes = gene_fm | gene_clust
	for gene in sig_genes:
		db.add_gene(gene, gene in gene_fm, gene in gene_clust)

	log.info("Saving driver genes cancer sites dataset ...")
	path = paths.results_path("gene-driver_cancer_sites.tsv")
	log.debug("> {}".format(path))
	with open(path, "w") as f:
		tsv.write_param(f, "date", datetime.now())
		for gene, sites in gene_sites.items():
			tsv.write_line(f, gene,
						   1 if gene in gene_fm else 0,
						   1 if gene in gene_clust else 0,
						   ", ".join(sorted([code for code, name in sites])),
						   ", ".join(sorted([name for code, name in sites])))

Example #12
def gene_impact(project):
	"""Aggregate the functional impact of every (variant, gene) pair.

	The ``tfi_path`` file of each partition in ``project["partitions"]`` is
	read and, per (variant id, gene) pair, the SIFT, PPH2 and MA impacts are
	combined with ``TransFIC.higher_impact``, the coding region flags are
	or-ed and the protein changes are collected in a set. The result is
	written to ``<csq_path>/variant-gene_impact.tsv`` and its path stored in
	``project["gfi_path"]``.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	projects_port = task.ports("projects")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	partitions = project["partitions"]

	log.info("Reading {} partitions ...".format(len(partitions)))

	def parse_flag(val):
		# null values count as False
		return bool(int(val)) if val is not None else False

	def either(prev_value, value):
		return prev_value or value

	def single_set(value):
		return set([value])

	def with_value(prev_value, value):
		return prev_value | set([value])

	impacts_by_pair = {}

	for part in partitions:
		log.info(" Partition {} ...".format(part["index"]))
		with open(part["tfi_path"], "r") as tfi_file:
			column_types = (int, str, str, parse_flag, int, int, int, int)
			column_indices = [0, 2, 4, 5, 6, 10, 14, 18]
			for record in tsv.lines(tfi_file, column_types, columns=column_indices, null_value="-"):
				(var_id, gene, prot_change, coding_region, tr_impact,
					sift_impact, pph2_impact, ma_impact) = record

				coding_region = coding_region == 1

				pair = (var_id, gene)

				# keep the aggregated impact of each predictor
				for attr_name, attr_value in (("sift_impact", sift_impact),
											  ("pph2_impact", pph2_impact),
											  ("ma_impact", ma_impact)):
					update_attr(impacts_by_pair, pair, attr_name, attr_value, update=TransFIC.higher_impact)

				# the pair is a coding region as soon as one record says so
				update_attr(impacts_by_pair, pair, "coding_region", coding_region, update=either)

				# collect the distinct protein changes of the pair
				if prot_change is not None:
					update_attr(impacts_by_pair, pair, "prot_changes", prot_change,
								new=single_set, update=with_value)

	num_vars = len({var_id for var_id, _ in impacts_by_pair})
	num_genes = len({gene for _, gene in impacts_by_pair})
	log.info("Saving {} variant-gene impacts ({} variants and {} genes) ...".format(len(impacts_by_pair), num_vars, num_genes))

	gfi_path = os.path.join(project["csq_path"], "variant-gene_impact.tsv")
	with open(gfi_path, "w") as out:
		for (var_id, gene), attrs in impacts_by_pair.items():
			# first truthy impact by trust priority: ma, pph2, sift
			impact = TransFIC.UNKNOWN_IMPACT_CLASS
			for attr_name in ("ma_impact", "pph2_impact", "sift_impact"):
				candidate = attrs.get(attr_name)
				if candidate:
					impact = candidate
					break

			coding_flag = 1 if attrs.get("coding_region", False) else 0

			changes = attrs.get("prot_changes")
			if changes is not None:
				changes = ",".join(changes)

			tsv.write_line(out, var_id, gene, impact, coding_flag, changes, null_value="-")

	# Send results to the next module
	project["gfi_path"] = gfi_path
Example #13
def write_line(f, *values):
	"""Forward `values` to ``tsv.write_line`` on `f` with ``null_value="-"``."""
	tsv.write_line(f, *values, null_value="-")
Example #14
def prepare_files(project):
	"""Prepare the OncodriveFM input files of a project.

	The visible code retrieves the functional impact data of the project,
	writes it to ``oncodrivefm-data.tdm`` and
	``sample_gene-fimpact.tsv.gz`` under ``project["temp_path"]``, and writes
	``oncodrivefm-excluded-cause.tsv`` with the genes that are filtered out
	or below the samples threshold.

	NOTE(review): this excerpt appears to have lost several lines (truncated
	calls, missing branches and loop bodies), so it does not parse as shown.
	``dict.iteritems`` indicates the code targets Python 2.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_results = ProjectResults(project)

	projdb = ProjectDb(project["db"])

	log.info("Retrieving functional impact scores for genes ...")

	data = retrieve_data(projdb)


	# save data matrix

	dst_path = os.path.join(project["temp_path"], "oncodrivefm-data.tdm")
	sgfi_path = os.path.join(project["temp_path"], "sample_gene-fimpact.tsv.gz")
	project["sample_gene_fi_data"] = sgfi_path

	log.info("Saving functional impact scores ...")
	log.debug("> {0}".format(dst_path))

	with open(dst_path, "w") as f:
		# NOTE(review): `sgff` is not closed in the visible code.
		sgff = tsv.open(sgfi_path, "w")

		tsv.write_line(f, "SAMPLE", "GENE", "SIFT", "PPH2", "MA")
		# NOTE(review): the rest of this header call is missing (the argument
		# list is never closed).
		tsv.write_line(sgff, "SAMPLE", "GENE",

		# `data` maps (sample, gene) to the nine score values unpacked below.
		for key, values in data.iteritems():
			sample, gene = key

			(sift_score, sift_tfic, sift_tfic_class,
				pph2_score, pph2_tfic, pph2_tfic_class,
				ma_score, ma_tfic, ma_tfic_class) = values

			tsv.write_line(f, sample, gene, sift_score, pph2_score, ma_score)
			tsv.write_line(sgff, sample, gene,
						   sift_score, sift_tfic, sift_tfic_class,
						   pph2_score, pph2_tfic, pph2_tfic_class,
						   ma_score, ma_tfic, ma_tfic_class, null_value="-")


	# count samples

	# NOTE(review): nothing is added to `samples` in the visible code, so
	# `num_samples` below is always 0 as written.
	samples = set()
	gene_sample_count = {}
	for sample, gene in data.keys():
		# NOTE(review): an `else:` appears to be missing before the increment;
		# as written a new gene is set to 1 and incremented at once, and
		# further occurrences are not counted.
		if gene not in gene_sample_count:
			gene_sample_count[gene] = 1
			gene_sample_count[gene] += 1

	num_samples = len(samples)

	# NOTE(review): only a warning is logged here; whatever skipped the
	# project is not visible.
	if num_samples == 0:
		log.warn("There are no samples data, skipping OncodriveFM for this project")

	(num_cores, estimator,
		genes_num_samplings, genes_threshold, genes_filter_enabled, genes_filter, filt,
		pathways_num_samplings, pathways_threshold) = get_oncodrivefm_configuration(log, conf, project, num_samples)

	# Create a dataset with information on why some genes are not considered for calculation in OncodriveFM
	# There are basically two possible reasons:
	# - It does not pass the filter
	# - There are less samples mutated than the threshold

	exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < genes_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			# The cause codes are concatenated without a separator.
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	# NOTE(review): the arguments of this `dict(` call and the bodies of the
	# two loops below are missing.
	ofm = dict(

	for slice_name in ["SIFT", "PPH2", "MA"]:

	for slice_name in ["SIFT", "PPH2", "MA"]:
Example #15
def prepare_files(project):
	"""Prepare the OncodriveCLUST input files of a project.

	The data retrieved from the project database is split by its file index
	into ``oncodriveclust-non-syn-data.tsv`` and
	``oncodriveclust-syn-data.tsv`` (one ``gene, sample, protein_pos`` line
	per entry) under ``project["temp_path"]``. Non-synonymous entries of
	genes rejected by the filter are counted but not written.

	``oncodriveclust-excluded-cause.tsv`` is then written with the genes that
	are filtered out or have fewer entries than ``oclust.samples_threshold``.
	"""
	log = task.logger
	conf = task.conf

	# NOTE(review): the port is obtained but nothing is sent through it in
	# the visible code, although "Sending project ..." is logged at the end.
	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	config = GlobalConfig(conf)
	paths = PathsConfig(config) # avoid that project conf override path configurations
	config = GlobalConfig(conf, project["conf"])

	oclust = OncodriveClust(config.oncodriveclust, paths, log)

	project_results = ProjectResults(project)

	projdb = ProjectDb(project["db"])

	data = oclust.retrieve_data(projdb)

	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	df = [tsv.open(path, "w") for path in data_paths]

	gene_sample_count = defaultdict(int)

	try:
		for key, value in data.items():
			findex, gene, sample = key
			transcript, transcript_len, protein_pos = value

			if findex == NON_SYN:
				gene_sample_count[gene] += 1

				# Filtered genes are still counted, so that their exclusion
				# cause is reported below, but their data is not written.
				if oclust.filter_enabled and not oclust.filter.valid(gene):
					continue

			tsv.write_line(df[findex], gene, sample, protein_pos)
	finally:
		# Close both data files even if writing fails.
		for f in df:
			f.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if oclust.filter_enabled and not oclust.filter.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < oclust.samples_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			# The cause codes are concatenated without a separator.
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

Example #16
def main():
	"""Calculate the partial Baseline Tolerance statistics per feature.

	Every line of the scores file holds ``chrom, pos, ref, alt, feature``
	followed by one score column per predictor (empty when missing). For
	each feature and predictor the number of scores, their sum and the sum
	of their squares are accumulated, after applying the transforms
	configured for the predictor. The output has one line per feature with a
	``count/sum/sum of squares`` column for each predictor.

	:return: 0 on success, -1 when no predictor is given
	"""
	parser = argparse.ArgumentParser(
		description="Calculate Baseline Tolerance partial statistics per feature")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("scores_path", metavar="SCORES_PATH",
						help="The scores file")

	parser.add_argument("predictors", metavar="PREDICTORS",
						help="Comma separated list of predictors")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="Output file.")

	args, logger = cmd.parse_args("blt-partial")

	predictors = [p.strip() for p in args.predictors.split(",") if len(p.strip()) > 0]
	num_predictors = len(predictors)

	if num_predictors == 0:
		logger.error("At least one predictor is needed")
		return -1

	logger.info("Selected predictors: {}".format(", ".join(predictors)))

	transforms = cmd.get_transforms()

	stats = {}

	# Counted explicitly: the loop index is off by one as a total and is
	# undefined when the input is empty.
	num_snvs = 0
	lost_snvs = 0
	scores_path = args.scores_path

	logger.info("Reading scores from {} ...".format(
		os.path.basename(scores_path) if scores_path != "-" else "standard input"))

	with tsv.open(scores_path) as sf:
		for line_num, line in enumerate(sf):
			num_snvs += 1

			fields = line.rstrip("\n").split("\t")
			chrom, pos, ref, alt, feature = fields[:5]

			# SNVs without a feature cannot contribute to any statistic.
			if len(feature) == 0:
				lost_snvs += 1
				continue

			scores = fields[5:]

			# NOTE(review): `line_error` is defined elsewhere; the `continue`
			# statements only matter if it returns instead of aborting.
			if len(scores) != num_predictors:
				line_error(logger, scores_path, line_num, "Number of score columns does not match the number of predictors")
				continue

			try:
				scores = [float(v) if len(v) > 0 else None for v in scores]
			except ValueError:
				line_error(logger, scores_path, line_num, "Scores should be real numbers: {}".format(scores))
				continue

			if feature not in stats:
				stats[feature] = tuple([[0, 0.0, 0.0] for p in predictors])

			feature_stats = stats[feature]

			for i, score in enumerate(scores):
				if score is None:
					continue

				predictor = predictors[i]
				if predictor in transforms:
					for name, func in transforms[predictor]:
						try:
							score = func(score)
						except Exception:
							# Report which transform failed, then let the
							# error propagate instead of accumulating a
							# partially transformed score.
							logger.error("Error transforming the {} score {} with {}".format(predictor, score, name))
							raise

				feature_stats[i][0] += 1
				feature_stats[i][1] += score
				feature_stats[i][2] += score * score

	logger.info("Saving results into {} ...".format(
		os.path.basename(args.out_path) if args.out_path != "-" else "standard output"))

	with tsv.open(args.out_path, "w") as of:
		tsv.write_line(of, "FEATURE", *predictors)
		for feature in sorted(stats.keys()):
			sb = [feature]
			feature_stats = stats[feature]
			for i in range(num_predictors):
				sb += ["/".join([repr(v) for v in feature_stats[i]])]
			tsv.write_line(of, *sb)

	logger.info("Number of SNV's = {}, lost SNV's = {}, number of features = {}".format(num_snvs, lost_snvs, len(stats)))

	return 0
Example #17
def prepare_files(project):
	"""Prepare the OncodriveCLUST input files of a project.

	The visible code retrieves the gene alterations of the project, writes
	them split by file index into ``oncodriveclust-non-syn-data.tsv`` and
	``oncodriveclust-syn-data.tsv`` under ``project["temp_path"]``, and
	writes ``oncodriveclust-excluded-cause.tsv`` with the genes that are
	filtered out or below the mutations threshold.

	NOTE(review): this excerpt appears to have lost several lines (missing
	branches and loop bodies, and a call truncated at its beginning), so it
	does not parse as shown.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_results = ProjectResults(project)

	mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

	log.info("Loading transcripts CDS length ...")

	cds_len = load_cds_len(conf)

	log.info("Retrieving gene alterations ...")

	projdb = ProjectDb(project["db"])

	data = retrieve_data(projdb, cds_len)


	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	# One output file per data path, indexed like `data_paths`.
	df = [tsv.open(path, "w") for path in data_paths]

	gene_sample_count = {}

	# `data` maps (file index, gene, sample) to the three values unpacked below.
	for key, value in data.items():
		findex, gene, sample = key
		transcript, transcript_len, protein_pos = value

		if findex == NON_SYN:
			# NOTE(review): an `else:` appears to be missing before the
			# increment; as written a new gene is set to 1 and incremented
			# at once, and further occurrences are not counted.
			if gene not in gene_sample_count:
				gene_sample_count[gene] = 1
				gene_sample_count[gene] += 1

			# NOTE(review): the body of this `if` is missing; presumably the
			# entries of filtered genes are skipped — confirm.
			if genes_filter_enabled and not filt.valid(gene):

		tsv.write_line(df[findex], gene, sample, protein_pos)

	# NOTE(review): the loop body is missing; presumably each data file is
	# closed here — confirm.
	for f in df:

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < mutations_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			# The cause codes are concatenated without a separator.
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	# NOTE(review): the two lines below are the tail of a call whose
	# beginning is missing from this excerpt.
									genes_filter_enabled=genes_filter_enabled, # not used
									genes_filter=genes_filter))) # not used
Example #18
def fimpact_run(partition):
	"""Calculate the functional impact of the variants of one partition.

	Reads the Mutation Assessor scores from ``partition["ma_path"]`` and the
	VEP results from ``partition["vep_path"]``, classifies every consequence
	and writes two tab separated files under ``partition["base_path"]``:

	* ``<index>.tfi``: one line per VEP row with the scores, the transFIC
	  values and classes of each predictor, and the resulting impact.
	* ``<index>.gfi``: one line per (variant, gene) pair with the aggregated
	  impact, the coding region flag and the protein changes.

	The paths of both files are stored in ``partition["tfi_path"]`` and
	``partition["gfi_path"]``.

	:param partition: dict with at least the keys "project", "index",
		"ma_path", "vep_path" and "base_path".
	"""
	log = task.logger
	conf = task.conf

	# NOTE(review): the port is retrieved but nothing is sent through it in this
	# function; presumably the partition has to be sent at the end -- confirm.
	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

	aff_gene_attrs = {}

	# both files are handled by the with statement so they get closed even on error
	with open(tfi_path, "w") as cf, open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			# consequence types come as a comma separated list, or null
			if ct is not None:
				ct = ct.split(",")
			else:
				ct = []

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot.get(var_id)

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = so.match(ct, so.CODING_REGION)

			calculate_transfic = True

			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores.get(var_id)
			elif so.match(ct, so.STOP):               # stop
				ct_type = TransFIC.CT_STOP
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				ct_type = TransFIC.CT_FRAMESHIFT
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE):             # splice
				ct_type = "splice"
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS if so.match(ct, so.SPLICE_JUNCTION) else TransFIC.UNKNOWN_IMPACT_CLASS
				calculate_transfic = False
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				ct_type = TransFIC.CT_SYNONYMOUS
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:                                     # any other consequence
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				calculate_transfic = False

			if calculate_transfic:
				(sift_tfic, sift_class,
				 pph2_tfic, pph2_class,
				 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				# if the impact was not preassigned get it from the transFIC calculated class
				sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
			else:
				# nothing calculated: the columns are written as nulls
				sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			aff_gene = (var_id, gene)

			# update aggregated impact for all the predictors
			update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

			# update whether the affected gene is a coding region or not
			update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
						update=lambda prev_value, value: prev_value or value)

			# aggregate protein changes per affected_gene
			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
				#log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif ct_type == "splice":
				prot_change = "r.spl?"
				#log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					# only warn when none of the known patterns matched
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
													gene, protein, protein_pos, aa_change, ", ".join(ct)))

			if prot_change is not None:
				update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
								 new=lambda value: set([value]),
								 update=lambda prev_value, value: prev_value | set([value]))

			# get the impact by trust priority: ma, pph2, sift
			impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, uniprot,
						   sift_score, sift_tfic, sift_class,
						   pph2_score, pph2_tfic, pph2_class,
						   ma_score, ma_tfic, ma_class,
						   impact, null_value="-")

	log.info("Saving variant impacts ...")

	gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))
	with open(gfi_path, "w") as vf:
		for aff_gene, attrs in aff_gene_attrs.items():
			var_id, gene = aff_gene
			# get the impact by trust priority: ma, pph2, sift
			impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS
			coding_region = 1 if attrs.get("coding_region", False) else 0
			prot_changes = attrs.get("prot_changes")
			prot_changes = ",".join(prot_changes) if prot_changes is not None else None

			tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	partition["gfi_path"] = gfi_path
Example #19
def fimpact_run(partition):
	"""Calculate the functional impact of the variants of one partition.

	Reads the Mutation Assessor scores from ``partition["ma_path"]`` and the
	VEP results from ``partition["vep_path"]``, classifies every consequence
	and writes one line per VEP row into ``<index>.tfi`` under
	``partition["base_path"]``. The path of that file is stored in
	``partition["tfi_path"]``.

	:param partition: dict with at least the keys "project", "index",
		"ma_path", "vep_path" and "base_path".
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	# NOTE(review): the port is retrieved but nothing is sent through it in this
	# function; presumably the partition has to be sent at the end -- confirm.
	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=paths.data_transfic_path())

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))

	# both files are handled by the with statement so they get closed even on error
	with open(tfi_path, "w") as cf, open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			# consequence types come as a comma separated list, or null
			ct = (ct or "").split(",")

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot.get(var_id)

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

			# transFIC is only calculated for missense consequences
			sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			# NOTE(review): ct_type is only assigned in the missense branch, so the
			# frameshift and splice branches of the protein change below can never
			# be taken as written -- confirm whether that is intended.
			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores.get(var_id)

				(sift_tfic, sift_class,
				 pph2_tfic, pph2_class,
				 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
			elif so.match(ct, so.STOP):               # stop
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_JUNCTION):    # splice junction
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_REGION):      # splice region
				sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:                                     # any other consequence
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
				#log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif ct_type == "splice":
				prot_change = "r.spl?"
				#log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					# only warn when none of the known patterns matched
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
													gene, protein, protein_pos, aa_change, ", ".join(ct)))

			# get the impact by trust priority: ma, pph2, sift
			tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			# nulls are written as "-", the same marker used when reading the inputs
			tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
						   sift_score, sift_tfic, sift_class, sift_impact,
						   pph2_score, pph2_tfic, pph2_class, pph2_impact,
						   ma_score, ma_tfic, ma_class, ma_impact,
						   null_value="-")

	# Send results to the next module
	partition["tfi_path"] = tfi_path
Example #20
	def save_combined_results(self, output_path, analysis_name, output_format,
								method, row_names, col_names, data, suffix="combination"):
		"""Write the combination results as a tab separated file.

		The file starts with "## key=value" lines (slices, method, date and
		every pair in self.parameters), then a header line with "ID" plus
		method.combination_columns, then one line per row whose first column
		is not NaN. NaN values of the written rows are saved as "-".

		The destination is built from self.args (output_path, analysis_name
		and output_format) and suffix; the output_path, analysis_name and
		output_format arguments are not read by this method.
		"""

		self.log.info("Saving combination results ...")

		file_name = "{0}-{1}.{2}".format(self.args.analysis_name, suffix, self.args.output_format)
		path = os.path.join(self.args.output_path, file_name)

		self.log.debug("  > {0}".format(path))
		with tsv.open(path, 'w') as out:
			# metadata header
			slices = ",".join(col_names)
			tsv.write_line(out, "## slices={0}".format(slices))
			tsv.write_line(out, "## method={0}".format(method.name))
			timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
			tsv.write_line(out, "## date={0}".format(timestamp))
			for param_name, param_value in self.parameters:
				tsv.write_line(out, "## {0}={1}".format(param_name, param_value))

			# column names
			tsv.write_line(out, "ID", *method.combination_columns)

			# data rows
			for index, name in enumerate(row_names):
				# rows without a value in the first column are not saved
				if np.isnan(data[index, 0]):
					continue

				values = []
				for v in data[index, :]:
					values.append(None if np.isnan(v) else v)
				tsv.write_line(out, name, *values, null_value="-")
Example #21
def main():
	# Command line entry point: loads a tree of groups, the mapping between
	# groups and features and the partial statistics of every feature, then
	# writes the baseline tolerance of each node as JSON into OUTPUT_PATH and,
	# when --tsv is given, also as a tab separated file.
	# NOTE(review): several statements of this function look like they are
	# missing (see the notes below); as it stands it does not parse.
	parser = argparse.ArgumentParser(
		description="Calculate Baseline Tolerance statistics")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("tree_path", metavar="TREE_PATH",
						help="The groups descendant tree")

	parser.add_argument("root_group", metavar="ROOT_GROUP",
						help="Tree root group")

	parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
						help="Map between groups and features")

	parser.add_argument("stats_path", metavar="STATS_PATH",
						help="Partial feature statistics")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="Output feature statistics")

	parser.add_argument("--tsv", dest="tsv_path", metavar="PATH",
						help="Store baseline tolerance in tsv format too.")

	# NOTE(review): no type= is given for the two thresholds, so values passed
	# on the command line arrive as strings -- confirm the defaults' types.
	parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N", default=DEFAULT_COUNT_THRESHOLD,
						help="Minimum number of features per group")

	parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V", default=DEFAULT_STDEV_THRESHOLD,
						help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)")

	args, logger = cmd.parse_args("blt-groups")

	logger.info("Loading groups tree ...")

	# every line holds a group and the comma separated list of its children
	tree = Tree()
	with tsv.open(args.tree_path) as f:
		for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
			tree.add_node(group, children)

	logger.info("  Nodes: {}".format(tree.node_count))

	logger.info("Loading mappings between groups and features ...")

	# NOTE(review): all_groups and all_features are never filled in the visible
	# code, so the counts logged below are 0 and the tsv export loop further
	# down has nothing to iterate -- presumably they were updated in this loop.
	all_groups = set()
	all_features = set()
	with tsv.open(args.group_genes_path) as f:
		for group, features in tsv.lines(f, (str, lambda v: set(v.split(",")))):
			tree.add_node(group, features)

	logger.info("  Nodes: {}".format(tree.node_count))
	logger.info("  Groups: {}".format(len(all_groups)))
	logger.info("  Features: {}".format(len(all_features)))

	logger.info("Loading partial statistics ...")

	with tsv.open(args.stats_path) as f:
		# the first line is a header: a feature column followed by the predictor names
		predictors = f.readline().rstrip("\n").split("\t")[1:]
		num_predictors = len(predictors)
		num_features = 0
		for line in f:
				# NOTE(review): the extra indentation and the two unconditional
				# warnings suggest that try/except lines are missing around the
				# parsing of the line and of every predictor value.
				fields = line.rstrip("\n").split("\t")
				feature = fields[0]
				node = tree.get_or_create_node(feature)
				for p, ss in zip(predictors, fields[1:]):
						# each value is "s0/s1/s2": the first item is parsed as int, the rest as float
						s0, s1, s2 = [float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))]
						node.set_pblt(p, PartialBLT(s0, s1, s2, sources=set([feature])))
						import traceback
						logger.warn("Failed to parse partial baseline tolerance"
									" for {}/{} from {}".format(feature, p, ss))
				num_features += 1
				logger.warn("Failed to parse partial baseline tolerance"
									" for {} from {}".format(feature, line))

	logger.info("  Nodes: {}".format(tree.node_count))
	logger.info("  Features: {}".format(num_features))
	logger.info("  Predictors: {}".format(", ".join(predictors)))

	logger.info("Calculating baseline tolerance ...")

	for predictor in predictors:
		logger.info("For {} ...".format(predictor))

		# NOTE(review): the name of the function called with these arguments is
		# missing; only its argument list is left.
			parent=None, node=tree.get_or_create_node(args.root_group), predictor=predictor,
			count_threshold=args.count_threshold, stdev_threshold=args.stdev_threshold, logger=logger)

	# TODO log summary info

	logger.info("Writing results into {} ...".format(os.path.basename(args.out_path)))

	if args.tsv_path is not None:
		with tsv.open(args.tsv_path, "w") as of:
			tsv.write_line(of, "FEATURE", *predictors)
			for feature in all_features:
				sb = [feature]
				node = tree.get_node(feature)
				predictors_with_blt = 0
				for predictor in predictors:
					blt = node.get_blt(predictor)
					if blt is None or blt.n < args.count_threshold:
						sb += ["/".join(["-"] * 5)]
						# NOTE(review): execution falls through to the lines below,
						# which read blt.from_node even when blt is None --
						# presumably a "continue" is missing here.

					predictors_with_blt += 1
					sb += ["/".join(map(str, [blt.from_node, blt.scope, blt.n, blt.mean, blt.stdev]))]

				# features without any baseline tolerance are not written
				if predictors_with_blt > 0:
					tsv.write_line(of, *sb)

	with tsv.open(args.out_path, "w") as of:
		tree_blt = {}
		for node_name, node in tree.nodes.items():
			predictors_blt = {}
			for predictor in predictors:
				pred_blt = node.get_blt(predictor)
				# NOTE(review): this condition has no body; presumably it skipped
				# the predictor ("continue").
				if pred_blt is None or pred_blt.n < args.count_threshold:

				predictors_blt[predictor] = dict(
					from_node=pred_blt.from_node, scope=pred_blt.scope,
					N=pred_blt.n, mean=pred_blt.mean, stdev=pred_blt.stdev)

			if len(predictors_blt) > 0:
				tree_blt[node.name] = predictors_blt

		# NOTE(review): the dict( call below is never closed and tree_blt is
		# not referenced anywhere after being built; presumably the entry that
		# stores it and the closing parenthesis are missing.
		doc = dict(
			tree=None, # tree relations
			pblt=None, # TODO
		json.dump(doc, of, indent=True)

	return 0
Example #22
def combination_recurrences(projects_set):
    # Combines the recurrences of all the projects of a classifier group:
    # sample frequencies of variant genes, genes and pathways are accumulated
    # in a temporary SQLite database and then saved as three tsv.gz files
    # under paths.combination_path("recurrences").
    # projects_set is a (classifier, projects) pair.
    # NOTE(review): several statements of this function look like they are
    # missing (see the notes below); as it stands it does not parse.
    log = task.logger

    config = GlobalConfig(task.conf)
    paths = PathsConfig(config)

    classifier, projects = projects_set

    classifier_id = classifier["id"]

    group_values = classifier["group_values"]
    short_values = classifier["group_short_values"]
    long_values = classifier["group_long_values"]

    group_name = classifier["group_name"]
    group_short_name = classifier["group_short_name"]
    group_long_name = classifier["group_long_name"]

    # NOTE(review): the second assignment always overrides the first one;
    # presumably an "else:" is missing between them.
    if len(group_values) == 0:
        group_file_prefix = classifier_id
        group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

    group_file_prefix = normalize_id(group_file_prefix)

    # NOTE(review): the call that receives this message (presumably log.info)
    # and its closing parentheses are missing.
        "--- [{0} ({1}) ({2}) ({3})] {4}".format(
            classifier["name"], group_long_name, group_short_name, group_name, "-" * 30

    log.info("Creating database ...")

    db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
    log.debug("  > {0}".format(db_path))

    # rows are read by column name later on, hence the sqlite3.Row factory
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row

    # NOTE(review): the tables variants, variant_genes, genes and pathways are
    # used below but nothing visible here creates them.

    log.info("Combining recurrences ...")

    c = conn.cursor()

    sample_total = 0

    project_ids = []
    for project in projects:
        project_ids += [project["id"]]

        log.info("  Project {0}:".format(project["id"]))

        projdb = ProjectDb(project["db"])

        project_sample_total = projdb.get_total_affected_samples()

        sample_total += project_sample_total

        log.info("    Total samples = {0}".format(project_sample_total))

        log.info("    Variant genes ...")

        count = 0
        for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
            var = afg.var
            rec = afg.rec

            # NOTE(review): only a warning is logged; presumably the item was
            # skipped afterwards ("continue").
            if rec.sample_freq is None:
                log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))

            start, end, ref, alt = var_to_tab(var)

            # NOTE(review): the "try:" lines and the "c.execute(" openers of the
            # statements below are missing. What is left shows the pattern:
            # insert the row and, on sqlite3.IntegrityError, look up the
            # existing variant or add to the existing sample_freq.
                    "INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
                    (var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)),
                var_id = c.lastrowid
            except sqlite3.IntegrityError:
                    "SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
                    (var.chr, var.strand, start, ref, alt),
                r = c.fetchone()
                var_id = r[0]

                    "INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
                    (var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq),
            except sqlite3.IntegrityError:
					UPDATE variant_genes
					SET sample_freq=sample_freq + ?
					WHERE var_id=? AND gene_id=?""",
                    (rec.sample_freq, var_id, afg.gene_id),

            count += 1

        log.info("      {0} variant genes".format(count))

        log.info("    Genes ...")

        count = 0
        for gene in projdb.genes(join_xrefs=True, join_rec=True):
            rec = gene.rec

            # NOTE(review): this condition has no body; presumably it skipped
            # the gene ("continue").
            if rec.sample_freq is None:

            c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
            r = c.fetchone()
            # NOTE(review): the UPDATE runs right after the INSERT for new genes,
            # which would count the frequency twice; presumably an "else:" is
            # missing between them.
            if r[0] == 0:
                c.execute("INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)", (gene.id, rec.sample_freq))
                c.execute("UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?", (rec.sample_freq, gene.id))
            count += 1

        log.info("      {0} genes".format(count))

        log.info("    Pathways ...")

        count = 0
        for pathway in projdb.pathways(join_rec=True):
            rec = pathway.rec

            # NOTE(review): this condition has no body; presumably it skipped
            # the pathway ("continue").
            if rec.sample_freq is None:

            c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
            r = c.fetchone()
            # NOTE(review): same pattern as for genes; the "else:" and the
            # "c.execute(" opener of the UPDATE are missing.
            if r[0] == 0:
                c.execute("INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)", (pathway.id, rec.sample_freq))
                    "UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?", (rec.sample_freq, pathway.id)
            count += 1

        log.info("      {0} pathways".format(count))


    log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

    # proportion = accumulated frequency / total of samples; skipped when there
    # are no samples to avoid dividing by zero
    if sample_total > 0:
        c.execute("UPDATE variant_genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE genes SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))
        c.execute("UPDATE pathways SET sample_prop=CAST(sample_freq AS REAL)/{0}.0".format(sample_total))


    log.info("Saving results ...")

    c = conn.cursor()

    base_path = paths.combination_path("recurrences")

    log.info("  Variant genes ...")

    with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        # NOTE(review): the header line, the end of the "for" statement and the
        # line that writes every row are missing; allele is built but not used
        # in the visible code.
        for r in c.execute(
            "SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"
            strand, ref, alt = r["strand"], r["ref"], r["alt"]
            allele = "{0}/{1}".format(ref, alt)

    log.info("  Genes ...")

    with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
            tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

    log.info("  Pathways ...")

    with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
        tsv.write_param(f, "classifier", classifier["id"])
        tsv.write_param(f, "group_id", group_name)
        tsv.write_param(f, "group_short_name", group_short_name)
        tsv.write_param(f, "group_long_name", group_long_name)
        tsv.write_param(f, "projects", ",".join(project_ids))
        tsv.write_param(f, "SAMPLE_TOTAL", sample_total)
        tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
        for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
            tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")


    # NOTE(review): the connection is neither committed nor closed in the
    # visible code before the temporary database is removed.
    remove_temp(task, db_path)
Example #23
def main():
	# Command line entry point: reads the per chromosome files contained in a
	# dbNSFP zip file and exports one line per SNV and transcript, with the
	# protein change and the scores, into a tab separated file.
	# NOTE(review): several statements of this function look like they are
	# missing (see the notes below); as it stands it does not parse.
	parser = argparse.ArgumentParser(
		description="Export dbNSFP scores")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("source_path", metavar="SOURCE",
						help="The original zip file")

	parser.add_argument("ensp_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Ensembl transcript id's and Uniprot id's")

	parser.add_argument("uniprot_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Uniprot id's")

	parser.add_argument("-o", "--output", dest="out_path", metavar="OUT_PATH",
						help="The output file")

	parser.add_argument("--temp", dest="temp_path", metavar="TEMP_PATH",
						help="A temporary path for zip extraction")

	parser.add_argument("--chr", dest="chr", metavar="CHROMOSOMES",
						help="Chromosomes to include: list separated by commas.")

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	args, logger = cmd.parse_args("dbnsfp-export")

	# by default the output is named after the source file, in the working directory
	if args.out_path is None:
		basename = os.path.basename(args.source_path)
		prefix = os.path.splitext(basename)[0]
		args.out_path = "{}.tsv.gz".format(prefix)

	logger.info("Loading maps ...")

	# trs_map: transcript id -> protein id, uniprot_map: uniprot id -> protein id
	uniprot_map = {}
	trs_map = {}
	with tsv.open(args.ensp_map_path) as f:
		for ensp, enst in tsv.lines(f, (str, str)):
			if len(enst) > 0:
				trs_map[enst] = ensp

	with tsv.open(args.uniprot_map_path) as f:
		for ensp, uniprot_id in tsv.lines(f, (str, str)):
			if len(uniprot_id) > 0:
				uniprot_map[uniprot_id] = ensp

	logger.info("Opening {} ...".format(args.source_path))

	# None means that all the chromosomes are exported
	chromosomes = None
	if args.chr is not None:
		chromosomes = [c.strip().upper() for c in args.chr.split(",") if len(c.strip()) > 0]
		logger.info("Selected chromosomes: {}".format(", ".join(chromosomes)))
		chromosomes = set(chromosomes)

	# the chromosome is taken from the name of every zip entry
	name_pattern = re.compile(r"dbNSFP.+_variant.chr(.+)")

	# NOTE(review): the start and the end of this list are missing; presumably
	# it is the COLUMNS list read below. Only 13 names are visible here while
	# 20 fields are unpacked from every line.
		"#chr", "pos(1-coor)", "ref", "alt", "cds_strand",
		"genename", "Uniprot_id", "Uniprot_aapos", "aaref", "aaalt",
		"Ensembl_geneid", "Ensembl_transcriptid", "aapos",
#		"GERP_RS",
#		"PhyloP_score"

	tmp_prefix = args.temp_path or tempfile.gettempdir()
	# NOTE(review): this condition has no body; presumably it created the path.
	if not os.path.exists(tmp_prefix):
	# mkdtemp uses the prefix verbatim, so it has to end with a separator for
	# the temporary directory to be created inside of it
	if tmp_prefix[-1] != "/":
		tmp_prefix += "/"

	extract_path = tempfile.mkdtemp(prefix=tmp_prefix)

	# NOTE(review): the extra indentation from here on and the handle_error()
	# call at the end suggest that a "try:" line is missing here.
		logger.info("Output: {}".format(args.out_path if args.out_path != "-" else "standard output"))

		total_start_time = time.time()
		total_lines = 0
		with ZipFile(args.source_path, "r") as zf,\
			tsv.open(args.out_path, "w") as of: #,\
			#tsv.open(args.noprot_path, "w") as npf:

			tsv.write_line(of, "CHR", "STRAND", "START", "REF", "ALT", "TRANSCRIPT",
						   "PROTEIN", "AA_POS", "AA_REF", "AA_ALT",
						   "SIFT", "PPH2", "MA", "FATHMM", "MT", "GERPRS", "PHYLOP")

			#tsv.write_line(npf, "#CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")

			# collect the entries to read so they can be sorted by chromosome index
			entries = []
			for entry in zf.infolist():
				m = name_pattern.match(entry.filename)
				# NOTE(review): this condition has no body; presumably it skipped
				# the entry ("continue").
				if not m:

				chr = m.group(1)
				# chromosomes missing from CHR_INDEX are sorted last
				index = CHR_INDEX[chr] if chr in CHR_INDEX else 99

				# NOTE(review): only a message is logged, the entry is appended
				# anyway; presumably a "continue" is missing.
				if chromosomes is not None and chr not in chromosomes:
					logger.debug("Skipping chromosome {} ...".format(chr))

				entries += [(index, chr, entry)]

			for index, chr, entry in sorted(entries, key=lambda x: x[0]):
				logger.info("Reading chromosome {} ...".format(chr))

				zf.extract(entry, extract_path)
				fpath = os.path.join(extract_path, entry.filename)
				with open(fpath) as f:
					# Parse header
					hdr_line = f.readline()
					hdr = {}
					# NOTE: this loop reuses the name "index" of the outer loop
					for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
						hdr[name] = index
					# position of every wanted column, None when the file lacks it
					columns = [hdr[name] if name in hdr else None for name in COLUMNS]

					read = set()

					start_time = time.time()
					partial_start_time = start_time
					# line numbers start at 2 because the header was line 1
					for line_num, line in enumerate(f, start=2):
						fields = line.rstrip("\n").split("\t")

						# NOTE(review): the extra indentation and the "except"
						# further down suggest that a "try:" line is missing here.
							fields = [fields[i] if i is not None and i < len(fields) else None for i in columns]

							(chr, start, ref, alt, strand,
							 symbol, uniprot, uniprot_aapos, aa_ref, aa_alt,
							 gene, transcript, aapos,
							 sift, pph2, ma, fathmm,
							 mt, gerprs, phylop) = fields
							start = safe_int(start)
							ref = ref.upper() if ref is not None else None
							alt = alt.upper() if alt is not None else None
							aa_ref = aa_ref.upper() if aa_ref is not None else None
							aa_alt = aa_alt.upper() if aa_alt is not None else None
							# pph2 is not converted here: it holds one value per
							# protein and is split further down
							sift = safe_float(sift)
							ma = safe_float(ma)
							fathmm = safe_float(fathmm)
							mt = safe_float(mt)
							gerprs = safe_float(gerprs)
							phylop = safe_float(phylop)

							# NOTE(review): every branch only logs a warning and the
							# line is processed anyway; presumably each of them was
							# followed by a "continue".
							if start is None or ref is None or alt is None:
								logger.warn("None value for pos or ref or alt at line {}: {}".format(line_num, fields))
							elif ref not in BASE_INDEX or alt not in BASE_INDEX:
								logger.warn("Unknown ref or alt at line {}: {}".format(line_num, fields))
							elif len(ref) != 1 or len(alt) != 1:
								logger.warn("Length != 1 for ref or alt len at line {}: {}".format(line_num, fields))
							#elif aa_ref not in AA_INDEX or aa_alt not in AA_INDEX:
							#	logger.warn("Unknown aa_ref or aa_alt at line {}: {}".format(line_num, fields))
							#	continue
							elif transcript is None or aapos is None or uniprot is None or uniprot_aapos is None:
								logger.warn("None value for transcript or aapos or uniprot or uniprot_aapos at line {}: {}".format(line_num, fields))

							# unknown amino acids are exported as nulls
							if aa_ref not in AA_INDEX:
								aa_ref = None
							if aa_alt not in AA_INDEX:
								aa_alt = None

							# transcripts, proteins and their positions are ';' separated
							# lists; shorter lists are padded by repeating their last value
							trs_values = transcript.split(";")

							aapos_values = [safe_int(v) for v in aapos.split(";")]
							l = len(trs_values) - len(aapos_values)
							if l > 0:
								aapos_values += [aapos_values[-1]] * l

							uniprot_values = uniprot.split(";")
							uniprot_aapos_values = [safe_int(v) for v in uniprot_aapos.split(";")]
							l = len(uniprot_values) - len(uniprot_aapos_values)
							if l > 0:
								uniprot_aapos_values += [uniprot_aapos_values[-1]] * l

							pph2_values = [safe_float(v) for v in pph2.split(";")] if pph2 is not None else [None]
							l = len(uniprot_values) - len(pph2_values)
							if l > 0:
								pph2_values += [pph2_values[-1]] * l

							# protein position -> index into uniprot_values / pph2_values
							uniprot_index = {}
							for i, id in enumerate(uniprot_values):
								if uniprot_aapos_values[i] is not None:
									uniprot_index[uniprot_aapos_values[i]] = i

							for i, trs in enumerate(trs_values):
								pos = aapos_values[i]
								# negative positions are exported as nulls
								if pos < 0:
									pos = None

								# the uniprot entry and its pph2 score are matched to the
								# transcript through the protein position
								# NOTE(review): the last assignment always resets both
								# values; presumably an "else:" is missing before it.
								if pos is not None and pos in uniprot_index:
									j = uniprot_index[pos]
									uniprot_value = uniprot_values[j]
									pph2_value = pph2_values[j]
									uniprot_value = pph2_value = None

								# NOTE(review): the warning is logged even when the
								# uniprot id was mapped, and prot_id is left unassigned
								# when nothing matches; presumably an "else:" branch
								# is missing.
								if trs in trs_map:
									prot_id = trs_map[trs]
								elif uniprot_value in uniprot_map:
									prot_id = uniprot_map[uniprot_value]
									logger.warn("Couldn't map neither protein {} or transcript {} at line {}: {}".format(uniprot_value, trs, line_num, "|".join([str(v) for v in fields])))

								#if pos < 0:
								#	logger.warn("Negative protein position at line {}: {}".format(line_num, pos))
								#	continue
								#elif ...
								if pph2_value is not None and (pph2_value < 0.0 or pph2_value > 1.0):
									logger.warn("PPH2 score {} out of range at line {}: {}".format(pph2_value, line_num, fields))

								if aa_alt == "X": # fix stop codons having a sift score
									sift = None

								# NOTE(review): this condition has no body; presumably it
								# skipped the transcript ("continue").
								if args.skip_empty_scores and sift is None and pph2_value is None and ma is None \
										and mt is None and gerprs is None and phylop is None:

								#log.info((chr, strand, start, ref, alt, aapos_values[i], aa_ref, aa_alt, trs, sift, pph2_value, ma))

								# NOTE(review): the line is written even for incomplete
								# protein changes; presumably an "else:" is missing
								# before the write_line call.
								if pos is None or aa_ref is None or aa_alt is None:
									pass #tsv.write_line(npf, chr, start, ".", ref, alt, ".", "PASS",
										#		   "dbNSFP={}|{}|{}|{}|{}|{}".format(trs, prot_id,
										#					sift or "", pph2_value or "", ma or "", fathmm or ""))
									tsv.write_line(of, chr, strand, start, ref, alt, trs,
												   prot_id, pos, aa_ref, aa_alt,
												   sift, pph2_value, ma, fathmm,
												   mt, gerprs, phylop)

						# NOTE(review): a keyboard interrupt is reported as a malformed
						# line; presumably a second, generic "except" clause is
						# missing between these lines.
						except KeyboardInterrupt:
							logger.warn("Malformed line {}: {}".format(line_num, "|".join([str(v) for v in fields])))
							raise #continue

						# progress is reported at most every 5 seconds
						partial_time = time.time() - partial_start_time
						if partial_time >= 5.0:
							partial_start_time = time.time()
							elapsed_time = time.time() - start_time
							logger.debug("  {} lines, {:.1f} lines/second".format(hsize(line_num-1), (line_num-1) / float(elapsed_time)))

					# NOTE(review): line_num is only assigned by the loop above, so
					# it is unset (or left over from the previous file) when a file
					# has no data lines.
					total_lines += line_num

					logger.info("  >  {} lines, {:.1f} lines/second".format(hsize(line_num), line_num / float(time.time() - start_time)))
					logger.info("  >> {} lines, {:.1f} lines/second".format(hsize(total_lines), total_lines / float(time.time() - total_start_time)))


		total_elapsed_time = timedelta(seconds=time.time() - total_start_time)
		logger.info("Finished successfully. Elapsed time: {}".format(total_elapsed_time))

		# NOTE(review): presumably this return belongs to an "except:" clause
		# that is missing; nothing visible removes extract_path either.
		return cmd.handle_error()

	return 0
Example #24
def main():
	"""Command-line entry point that fetches scores for a list of SNVs.

	Parses the command line through DefaultCommandHelper, opens the scores
	database and reads the SNVs file line by line. For every result yielded
	by query_mutation() it writes one output row made of the mutation
	identifier, the selected columns, the selected annotations and the
	selected predictor scores.

	NOTE(review): this block is not valid Python as it stands; several
	statements appear to have been lost (see the inline notes). Confirm the
	missing pieces against the upstream project before relying on it.
	"""
	parser = argparse.ArgumentParser(
		description="Fetch Condel scores")

	cmd = DefaultCommandHelper(parser)


	parser.add_argument("muts_path", metavar="SNVS_PATH",
						help="SNV's to check. Use - for standard input.")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="The results path. Use - for standard output.")




	# NOTE(review): no database/predictor/annotation/column options are
	# registered in the visible code although cmd.get_selected_*() is used
	# below - presumably the helper registers them; verify.
	args, logger = cmd.parse_args("fetch")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

	# NOTE(review): the indentation increases here without an opening
	# statement; together with the trailing `return cmd.handle_error()` this
	# looks like a lost try/except wrapper - confirm.
		progress = RatedProgress(logger, name="SNVs")

		with tsv.open(args.muts_path) as f:
			with tsv.open(args.out_path, "w") as wf:
				# Header: ID, then upper-cased column and annotation names, then predictors
				tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

				# hit: input lines with at least one result; fail: the rest
				hit = fail = 0

				mut = DnaAndProtMutationParser()
				for line_num, line in enumerate(f, start=1):
					line = line.rstrip(" \n\r")
					# NOTE(review): this condition (empty or comment line) has no
					# body - presumably such lines should be skipped; confirm.
					if len(line) == 0 or line.startswith("#"):

					# NOTE(review): these handlers have no matching `try`; the
					# statement that feeds the line to `mut` is not visible, and
					# nothing here stops a failed line from being queried below.
					except PrematureEnd:
						logger.error("Missing fields at line {}".format(line_num))
						fail += 1
					except UnexpectedToken as ex:
						logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
						fail += 1

					# Set to True as soon as the query yields a row for this mutation
					exists = False
					for row in query_mutation(logger, db, mut, annotations, predictors):

						exists = True

						ann = row["annotations"]
						scores = row["scores"]

						# Same column order as the header written above
						tsv.write_line(wf, mut.identifier,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

						# Guarded so the message is only formatted when DEBUG is enabled
						if logger.isEnabledFor(logging.DEBUG):
							logger.debug("    --> {} {} {} {} {} {} {} {} {} {}".format(
										row["chr"], row["start"], row["ref"], row["alt"], row["transcript"],
										row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"],
										mut.identifier or "*"))


					# NOTE(review): as written both counters are incremented when a
					# result exists and neither when it does not - an `else:` before
					# `fail += 1` looks lost; confirm.
					if exists:
						hit += 1
						fail += 1


		# NOTE(review): `progress` is never updated in the visible code; only
		# its elapsed_time is read here.
		logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time))

		# NOTE(review): reached unconditionally as written - presumably this
		# belonged to an `except` clause; confirm.
		return cmd.handle_error()
Example #25
def main():
	"""Command-line entry point that exports the scores stored in the database.

	Rows come from db.query_scores(). In the default mode each row is written
	as a tab separated line (selected columns, annotations, predictors). With
	--json the rows sharing the same (chr, strand, start, ref, alt) are
	grouped into one document with a "transcripts" list and dumped with
	json.dump(). --sample, --start and --limit restrict what is exported.

	NOTE(review): this block is not valid Python as it stands; several
	statements appear to have been lost (see the inline notes). Confirm the
	missing pieces against the upstream project before relying on it.
	"""
	parser = argparse.ArgumentParser(
		description="Export Scores")

	cmd = DefaultCommandHelper(parser)


	parser.add_argument("dest_path", metavar="OUTPUT_PATH",
						help="The output file. Use - for standard output.")




	parser.add_argument("--json", dest="to_json", action="store_true", default=False,
						help="Export the results in json format")

	parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
						help="Export a random sample of PCT %%")

	parser.add_argument("--start", dest="start", type=int, metavar="N",
						help="Start to export from the SNV number N")

	parser.add_argument("--limit", dest="limit", type=int, metavar="N",
						help="Limit the number of SNVs to export to N")

	args, logger = cmd.parse_args("export")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Exporting ...")


	# NOTE(review): total_count and total_start_time are not used anywhere in
	# the visible code.
	total_count = 0
	total_start_time = time.time()

	# NOTE(review): the indentation increases here without an opening
	# statement; together with the trailing `return cmd.handle_error()` this
	# looks like a lost try/except wrapper - confirm.
		progress = RatedProgress(logger, name="SNVs")

		to_json = args.to_json
		sample = args.sample
		start = args.start or 0
		limit = args.limit

		doc = None         # JSON document being accumulated for the current position
		last_pos = None    # position key of the previous row
		rows_count = 0
		snvs_count = 0
		with tsv.open(args.dest_path, "w") as f:

			# The tabular output gets a header line; the JSON output does not
			if not to_json:
				tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

			for row in db.query_scores(predictors=predictors, maps=annotations):

				if not to_json:
					# NOTE(review): `start` is decremented but the row is not
					# skipped in the visible code - a `continue` looks lost.
					if start > 0:
						start -= 1

					# NOTE(review): this sampling condition has no body -
					# presumably the row should be skipped; confirm.
					if sample is not None and random.randint(1, 100) > sample:

				# Rows with the same key belong to the same SNV
				pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
				if last_pos != pos:
					if to_json:
						# NOTE(review): same as above, no skip after the decrement.
						if start > 0:
							start -= 1

						# Limit reached: the pending document is written out.
						# NOTE(review): nothing leaves the loop afterwards in the
						# visible code - a `break` looks lost.
						if limit is not None and snvs_count >= limit:
							if doc is not None:
								json.dump(doc, f)

					snvs_count += 1

				rows_count += 1

				ann = row["annotations"]
				scores = row["scores"]

				if to_json:
					# Per-transcript part of the document
					tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] +
										[(k,scores[k]) for k in predictors])

					if pos != last_pos:
						# A new position starts: emit the previous document, subject to sampling
						if doc is not None:
							# NOTE(review): as written snvs_count is decremented right
							# after a successful dump - an `else:` looks lost.
							if sample is None or random.randint(1, 100) <= sample:
								json.dump(doc, f)
								snvs_count -= 1

						doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] +
										[("transcripts", [tdoc])])
						# NOTE(review): as written tdoc is appended again to the list
						# just created with it - presumably this line belonged to an
						# `else:` of `if pos != last_pos`; confirm.
						doc["transcripts"] += [tdoc]

				# NOTE(review): the three lines below are the argument tail of a
				# call whose opening is not visible - presumably the
				# tsv.write_line() of the tabular branch; confirm.
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])


				last_pos = pos

				# NOTE(review): this limit check for the tabular mode has no body.
				if not to_json and limit is not None and rows_count >= limit:


		# NOTE(review): `progress` is never updated in the visible code; only
		# its elapsed_time is read here.
		logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time))
		# NOTE(review): reached unconditionally as written, which makes the
		# final `return 0` unreachable - presumably this belonged to an `except`.
		return cmd.handle_error()

	return 0
Example #26
def main():
    """Command-line entry point that computes per-feature statistics for each predictor.

    Loads a group tree, a group-to-features mapping and a table of partial
    statistics (one "a/b/c" triple per predictor and feature), derives
    (count, mean, standard deviation) for each feature and predictor, and
    writes one line per feature to the output file.

    NOTE(review): this block is not valid Python as it stands; several
    statements appear to have been lost (see the inline notes). Confirm the
    missing pieces against the upstream project before relying on it.
    """
    parser = argparse.ArgumentParser(description="Calculate Baseline Tolerance statistics per gene")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH", help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP", help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH", help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH", help="Partial gene statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH", help="Output gene statistics")

    # NOTE(review): the two `help=` lines below are orphan keyword arguments;
    # the add_argument() calls that own them are not visible. They presumably
    # define args.count_threshold and args.stdev_threshold, which are read
    # further down - flags, types and defaults unknown.
        help="Minimum number of features per group",

        help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)",

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    # group -> set of child groups; each line is "<group>\t<child>,<child>,..."
    group_children = defaultdict(set)
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_children[group] |= children

    logger.info("Loading mappings between groups and features ...")

    # group -> set of features, same line format as the tree file
    group_genes = defaultdict(set)
    with tsv.open(args.group_genes_path) as f:
        for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_genes[group] |= genes

    logger.info("Loading partial statistics ...")

    partial_stats = {}
    with tsv.open(args.stats_path) as f:
        # The header's first column is dropped; the rest are predictor names
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        for line in f:
            fields = line.rstrip("\n").split("\t")
            gene = fields[0]
            # Each cell is "v0/v1/..."; the first value is parsed as int, the others as float
            gene_stats = [[float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))] for ss in fields[1:]]
            partial_stats[gene] = gene_stats

    logger.info("  Predictors: {}".format(", ".join(predictors)))
    logger.info("  Features: {}".format(len(partial_stats.keys())))

    logger.info("Calculating features ...")

    stats = {}

    feat_count = 0
    feat_partial_count = [0] * num_predictors
    for feature, feat_partial_stats in partial_stats.items():
        feat_with_stats = False
        # One slot per predictor plus a last slot that is read as the group
        # label when writing the output
        feat_stats = [None] * (num_predictors + 1)
        for i in range(num_predictors):
            # presumably s0 = count, s1 = sum, s2 = sum of squares (the
            # formulas below are consistent with that) - TODO confirm
            s0, s1, s2 = feat_partial_stats[i]

            # NOTE(review): the four guards in this loop (s0 == 0.0, count
            # threshold, negative x, stdev threshold) have no body -
            # presumably each skipped this predictor. Without them s0 equal
            # to 0 or 1 divides by zero below.
            if s0 == 0.0:

            if s0 < args.count_threshold:

            # x is the variance estimate with an (s0 - 1) denominator
            x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
            if x < -1e-12:

            mean = s1 / s0
            # abs() keeps sqrt defined when x is slightly negative
            std = math.sqrt(abs(x))
            if std < args.stdev_threshold:

            feat_stats[i] = (int(s0), mean, std)
            feat_partial_count[i] += 1
            feat_with_stats = True

        # Only features with statistics for at least one predictor are kept
        if feat_with_stats:
            feat_count += 1
            stats[feature] = feat_stats
            # print feature, "\t".join(["/".join([str(v) for v in feat_stats[i] or []]) for i in range(num_predictors)])

    # NOTE(review): the statement that consumed this message (presumably a
    # logger.info call) is not visible, and the .format( call is never closed.
        "  {} ({}) features out of {} calculated directly from partial statistics".format(
            feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats)

    logger.info("Calculating groups ...")

    # NOTE(review): argument list of a call whose name is not visible; it
    # presumably fills `stats` for features that could not be calculated
    # directly - confirm.
        logger, args.root_group, args.count_threshold, group_children, group_genes, partial_stats, num_predictors, stats

    logger.info("  {} features calculated in total".format(len(stats)))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "GENE", "GROUP", *predictors)
        for gene in sorted(stats.keys()):
            gene_stats = stats[gene]
            sb = [gene]
            stats_group = gene_stats[num_predictors]
            # NOTE(review): as written both values are appended when a group
            # is present and none otherwise - an `else:` looks lost.
            if stats_group is not None:
                sb += [stats_group]
                sb += ["|" + ("-" * num_predictors)]

            for i in range(num_predictors):
                # NOTE(review): same pattern; "-/-/-" looks like the
                # placeholder for a predictor without statistics.
                if gene_stats[i] is not None:
                    sb += ["/".join([str(v) for v in gene_stats[i]])]
                    sb += ["-/-/-"]
            tsv.write_line(of, *sb)

    return 0
Example #27
def combination_oncodrivefm(projects_set):
	"""Combine the OncodriveFM results of several projects for one classifier group.

	For every project the gene p-values and the pathway z-scores stored in
	its project database are exported to temporary TSV files. Two shell
	commands are then run, one over the gene files ("median-empirical") and
	one over the pathway files ("median-zscore"), writing into the
	"oncodrivefm" combination path. The temporary directory is removed once
	both commands have succeeded.

	:param projects_set: a (classifier, projects) pair. The classifier is a
		mapping read for "id", "name", "group_values", "group_name",
		"group_short_name" and "group_long_name"; every project is a mapping
		read for "id" and "db".
	:raises Exception: when either combination command returns a non-zero
		exit code; the message contains the command line.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_values = classifier["group_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	# A classifier without group values is identified by its id alone,
	# otherwise the short group name is appended. The missing `else` made the
	# first assignment dead and left the name undefined for grouped classifiers.
	if len(group_values) == 0:
		group_file_prefix = classifier_id
	else:
		group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

	group_file_prefix = normalize_id(group_file_prefix)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(
		classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Exporting project data ...")

	base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

	log.debug("> {0}".format(base_path))

	gene_files = []
	pathway_files = []
	for project in projects:
		project_id = project["id"]

		log.info("  Project {0}:".format(project["id"]))

		projdb = ProjectDb(project["db"])

		log.info("    Genes ...")

		# Only genes that have a p-value are exported
		count = 0
		file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
		gene_files += [file_path]
		with open(file_path, "w") as f:
			tsv.write_param(f, "classifier", classifier_id)
			tsv.write_param(f, "group_id", group_name)
			tsv.write_param(f, "slice", project_id)
			tsv.write_line(f, "GENE_ID", "PVALUE")
			for gene in projdb.genes():
				if gene.fm_pvalue is not None:
					tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
					count += 1

		log.info("      {0} genes".format(count))

		log.info("    Pathways ...")

		# Only pathways that have a z-score are exported
		count = 0
		file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
		pathway_files += [file_path]
		with open(file_path, "w") as f:
			tsv.write_param(f, "classifier", classifier_id)
			tsv.write_param(f, "group_id", group_name)
			tsv.write_param(f, "slice", project_id)
			tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
			for pathway in projdb.pathways():
				if pathway.fm_zscore is not None:
					tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
					count += 1

		log.info("      {0} pathways".format(count))


	log.info("Combining ...")

	combination_path = paths.combination_path("oncodrivefm")

	log.info("  Genes ...")

	# NOTE(review): the command line starts with an option; the name of the
	# executable seems to be missing from the head of this list (and of the
	# pathways one below) - confirm against the upstream project.
	# NOTE(review): values are only wrapped in single quotes before going
	# through the shell, so a value containing a single quote breaks the command.
	cmd = " ".join([
			"-m median-empirical",
			"-o '{0}'".format(combination_path),
			"-n 'gene-{0}'".format(group_file_prefix),
			"-D 'classifier={0}'".format(classifier_id),
			"-D 'group_id={0}'".format(group_name),
			"-D 'group_short_name={0}'".format(group_short_name),
			"-D 'group_long_name={0}'".format(group_long_name),
			"--output-format tsv.gz"
	] + ["'{0}'".format(name) for name in gene_files])


	ret_code = subprocess.call(cmd, shell=True)
	if ret_code != 0:
		#log.error("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))
		#return -1
		raise Exception("OncodriveFM error while combining gene pvalues:\n{0}".format(cmd))

	log.info("  Pathways ...")

	cmd = " ".join([
			"-m median-zscore",
			"-o '{0}'".format(combination_path),
			"-n 'pathway-{0}'".format(group_file_prefix),
			"-D 'classifier={0}'".format(classifier_id),
			"-D 'group_id={0}'".format(group_name),
			"-D 'group_short_name={0}'".format(group_short_name),
			"-D 'group_long_name={0}'".format(group_long_name),
			"--output-format tsv.gz"
	] + ["'{0}'".format(name) for name in pathway_files])


	ret_code = subprocess.call(cmd, shell=True)
	if ret_code != 0:
		#log.error("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))
		#return -1
		raise Exception("OncodriveFM error while combining pathway zscores:\n{0}".format(cmd))

	# Only reached when both commands succeeded; on failure the temporary
	# files are left in place
	remove_temp(task, base_path)
Example #28
def pack_datasets(project):
	"""Write the result tables of a project into a "results.zip" archive.

	The archive is created in the project path and filled, through ArcFile,
	with the variant genes, variant samples, consequences, genes and pathways
	tables, the per sample functional impact table (unless
	config.skip_oncodrivefm is set) and the project configuration.

	:param project: mapping read for "id", "path", "temp_path", "db",
		"assembly" and "annotations".

	NOTE(review): this block is not valid Python as it stands; several
	statements appear to have been lost (see the inline notes). Confirm the
	missing pieces against the upstream project before relying on it.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	# NOTE(review): the message says the step is skipped but nothing returns
	# here in the visible code - a `return` looks lost.
	if not config.results.create_zip:
		log.info("Creation of the results compressed file is deactivated. Skipped.")

	project_path = project["path"]
	# NOTE(review): temp_path is not used anywhere in the visible code.
	temp_path = project["temp_path"]

	dest_path = os.path.join(project_path, "results.zip")

	sigdb = SigDb(config.sigdb_path)

	projdb = ProjectDb(project["db"])

	projres = ProjectResults(project)

	# Looked up with the gene id to fill the symbol columns below
	gene_sym = projdb.get_gene_symbols()

	total_samples = projdb.get_total_affected_samples()

	log.info("Compressing files ...")

	arc = None
	# NOTE(review): the indentation increases here without an opening
	# statement; together with `arc = None` above and the trailing
	# `if arc is not None:` this looks like a lost try/finally - confirm.
		arc = Archive(dest_path, mode="w", fmt="zip")

		log.info("  Variant genes ...")

		with ArcFile(task, arc, project_id, "variant_genes", "w") as vf:
			# NOTE(review): header call truncated; the remaining column names
			# and the closing parenthesis are not visible.
			write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE",

			for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
				var = afg.var
				rec = afg.rec

				start, end, ref, alt = var_to_tab(var)

				# "I:1" is added to the xrefs when sigdb knows the variant position
				xrefs = [xref for xref in var.xrefs]
				if sigdb.exists_variant(var.chr, start):
					xrefs += ["I:1"]
				xrefs = ",".join(xrefs)

				# 1 when sigdb knows the gene, 0 otherwise
				intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

				write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
								afg.gene_id, gene_sym.get(afg.gene_id),
								afg.impact, TransFIC.class_name(afg.impact),
								rec.sample_freq or 0, total_samples, rec.sample_prop or 0,
								afg.coding_region, afg.prot_changes, intogen_driver, xrefs)

		log.info("  Variant samples ...")

		with ArcFile(task, arc, project_id, "variant_samples", "w") as vf:
			write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLES")

			for var in projdb.variants(join_samples=True):
				start, end, ref, alt = var_to_tab(var)
				write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
						   ",".join([s.name for s in var.samples]))

		log.info("  Consequences ...")

		with ArcFile(task, arc, project_id, "consequences", "w") as cf:
			# NOTE(review): header call truncated, as above.
			write_line(cf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT",

			for csq in projdb.consequences(join_variant=True):
				var = csq.var
				start, end, ref, alt = var_to_tab(var)
				allele = "{0}/{1}".format(ref, alt)

				# Protein and score columns stay empty unless the consequence
				# types match the sets checked below
				uniprot = protein = protein_pos = aa_change = None
				sift_score = sift_tfic = sift_tfic_class = None
				pph2_score = pph2_tfic = pph2_tfic_class = None
				ma_score = ma_tfic = ma_tfic_class = None
				if so.match(csq.ctypes, so.ONCODRIVEFM):
					uniprot, protein = csq.uniprot, csq.protein
				if so.match(csq.ctypes, so.NON_SYNONYMOUS):
					protein_pos, aa_change = csq.protein_pos, csq.aa_change
					sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
					pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
					ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

				write_line(cf, project_id, var.chr, var.strand, start, allele, csq.transcript,
							",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
							uniprot, protein, protein_pos, aa_change,
							sift_score, sift_tfic, sift_tfic_class,
							pph2_score, pph2_tfic, pph2_tfic_class,
							ma_score, ma_tfic, ma_tfic_class,
							csq.impact, TransFIC.class_name(csq.impact))

		log.info("  Genes ...")

		with ArcFile(task, arc, project_id, "genes", "w") as gf:
			# NOTE(review): header call truncated, as above.
			write_line(gf, "PROJECT_ID", "GENE_ID", "SYMBOL", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",

			for gene in projdb.genes(join_xrefs=True, join_rec=True):
				# Only genes with a positive sample frequency are written
				if gene.rec.sample_freq is not None and gene.rec.sample_freq > 0:
					intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
					write_line(gf, project_id, gene.id, gene.symbol, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
									gene.rec.sample_freq, total_samples, gene.rec.sample_prop or 0,
									gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause,
									gene.clust_coords, intogen_driver, ",".join(gene.xrefs))

		log.info("  Pathways ...")

		with ArcFile(task, arc, project_id, "pathways", "w") as pf:
			# NOTE(review): unlike the other tables no header line is written
			# here in the visible code - it looks lost.

			for pathway in projdb.pathways(join_rec=True):
				# Only pathways with a positive sample frequency are written
				if pathway.rec.sample_freq is not None and pathway.rec.sample_freq > 0:
					write_line(pf, project_id, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
									pathway.rec.sample_freq or 0, total_samples, pathway.rec.sample_prop or 0,
									pathway.rec.gene_freq or 0, pathway.gene_count, pathway.rec.gene_prop or 0)

		if not config.skip_oncodrivefm:

			log.info("  Genes per sample functional impact ...")

			with ArcFile(task, arc, project_id, "fimpact.gitools.tdm", "w") as f:
				# NOTE(review): header call truncated, as above.
				write_line(f, "SAMPLE", "GENE_ID",
				for fields in projdb.sample_gene_fimpacts():
					# The query yields the gene first; the row is written sample first
					(gene, sample,
						 sift_score, sift_tfic, sift_tfic_class,
						 pph2_score, pph2_tfic, pph2_tfic_class,
						 ma_score, ma_tfic, ma_tfic_class) = fields
					write_line(f, sample, gene,
							   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
							   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
							   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class))

		log.info("Saving project configuration ...")

		with ArcFile(task, arc, project_id, "project", "w") as f:
			values = [project_id, project["assembly"], total_samples]
			# NOTE(review): `names` is passed here before any visible
			# assignment - the line defining the matching column names for
			# `values` looks lost.
			names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"], names=names, values=values)
			tsv.write_line(f, *names)
			tsv.write_line(f, *values, null_value="-")
		# NOTE(review): this condition has no body - presumably the archive
		# is closed here; confirm.
		if arc is not None:
Example #29
def main():
	"""Command-line entry point that merges two training sets into one SNV dataset.

	Reads the positive and the negative training set files (protein,
	position, two amino acid columns) and writes every record to the output
	file followed by its set label, "POS" or "NEG". When a mapping file is
	given, protein identifiers are looked up in it and one output line is
	written per mapped identifier.

	NOTE(review): this block is not valid Python as it stands; several
	statements appear to have been lost (see the inline notes). Confirm the
	missing pieces against the upstream project before relying on it.
	"""
	parser = argparse.ArgumentParser(
		description="Prepare SNV's dataset from individual training sets")

	parser.add_argument("pos_path", metavar="POS_SET",
						help="The positive training set file")

	parser.add_argument("neg_path", metavar="NEG_SET",
						help="The negative training set file")

	parser.add_argument("-m", "--map", dest="map_path", metavar="MAP",
						help="Optional mapping file for feature id's. Format: DST SRC")

	parser.add_argument("-o", dest="out_path", metavar="PATH",
						help="Output file. Use - for standard output.")


	args = parser.parse_args()


	logger = bglogging.get_logger("training-sets")

	# Without -o the output goes to "<prefix>-training.tsv" in the current directory
	if args.out_path is None:
		# NOTE(review): this call is truncated; the list elements and the
		# closing brackets are not visible - presumably the prefix is taken
		# from the two input file names; confirm.
		prefix = os.path.commonprefix([

		prefix = prefix.rstrip(".")

		args.out_path = os.path.join(os.getcwd(), "{}-training.tsv".format(prefix))

	if args.map_path is not None:
		logger.info("Loading map ...")

		# source feature id -> set of destination feature ids
		prot_map = {}
		with tsv.open(args.map_path) as f:
			for dst_feature, src_feature in tsv.lines(f, (str, str)):
					# NOTE(review): these lines sit one level deeper than the
					# loop body, so an enclosing statement looks lost; the
					# branch adding to an existing set is not visible either.
					if len(src_feature) > 0:
						if src_feature not in prot_map:
							prot_map[src_feature] = set([dst_feature])
		# NOTE(review): as written the map is discarded right after being
		# loaded and is undefined when no map is given - an `else:` before
		# this line looks lost.
		prot_map = None
	logger.info("Processing ...")

	# Per-set counters: records processed and records without a mapping
	hits = dict(POS=0, NEG=0)
	fails = dict(POS=0, NEG=0)

	start_time = datetime.now()

	with tsv.open(args.out_path, "w") as wf:

		for event_type, path in (("POS", args.pos_path), ("NEG", args.neg_path)):

			logger.info("  [{}] Reading {} ...".format(event_type, path))

			with tsv.open(path) as f:
				types = (str, int, str, str)
				for protein, pos, aa1, aa2 in tsv.lines(f, types):
					protein = protein.strip()

					if prot_map is not None:
						# NOTE(review): nothing skips an unmapped protein in the
						# visible code, so the lookup below would raise KeyError -
						# a `continue` looks lost.
						if protein not in prot_map:
							logger.debug("[{}] Unmapped protein: {}".format(event_type, protein))
							fails[event_type] += 1
						proteins = prot_map[protein]
						# NOTE(review): as written this overrides the mapped
						# identifiers and is not reached when there is no map -
						# an `else:` looks lost.
						proteins = [protein]

					hits[event_type] += 1

					for p in proteins:
						tsv.write_line(wf, p, pos, aa1.strip(), aa2.strip(), event_type)

	# Summary table of the counters
	logger.info("               POS       NEG")
	logger.info("SNVs      {POS:>8}  {NEG:>8}".format(**hits))
	if args.map_path is not None:
		logger.info("unmapped  {POS:>8}  {NEG:>8}".format(**fails))

	logger.info("Finished. Elapsed time: {}".format(datetime.now() - start_time))