コード例 #1
0
ファイル: dbnsfp_export.py プロジェクト: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Export dbNSFP scores")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("source_path", metavar="SOURCE",
						help="The original zip file")

	parser.add_argument("ensp_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Ensembl transcript id's and Uniprot id's")

	parser.add_argument("uniprot_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Uniprot id's")

	parser.add_argument("-o", "--output", dest="out_path", metavar="OUT_PATH",
						help="The output file")

	parser.add_argument("--temp", dest="temp_path", metavar="TEMP_PATH",
						help="A temporary path for zip extraction")

	parser.add_argument("--chr", dest="chr", metavar="CHROMOSOMES",
						help="Chromosomes to include: list separated by commas.")

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	args, logger = cmd.parse_args("dbnsfp-export")

	if args.out_path is None:
		basename = os.path.basename(args.source_path)
		prefix = os.path.splitext(basename)[0]
		args.out_path = "{}.tsv.gz".format(prefix)

	logger.info("Loading maps ...")

	uniprot_map = {}
	trs_map = {}
	with tsv.open(args.ensp_map_path) as f:
		for ensp, enst in tsv.lines(f, (str, str)):
			if len(enst) > 0:
				trs_map[enst] = ensp

	with tsv.open(args.uniprot_map_path) as f:
		for ensp, uniprot_id in tsv.lines(f, (str, str)):
			if len(uniprot_id) > 0:
				uniprot_map[uniprot_id] = ensp

	logger.info("Opening {} ...".format(args.source_path))

	chromosomes = None
	if args.chr is not None:
		chromosomes = [c.strip().upper() for c in args.chr.split(",") if len(c.strip()) > 0]
		logger.info("Selected chromosomes: {}".format(", ".join(chromosomes)))
		chromosomes = set(chromosomes)

	name_pattern = re.compile(r"dbNSFP.+_variant.chr(.+)")

	COLUMNS = [
		"#chr", "pos(1-coor)", "ref", "alt", "cds_strand",
		"genename", "Uniprot_id", "Uniprot_aapos", "aaref", "aaalt",
		"Ensembl_geneid", "Ensembl_transcriptid", "aapos",
		"SIFT_score",
		"Polyphen2_HVAR_score",
		"MutationAssessor_score",
		"FATHMM_score",
		"MutationTaster_score",
#		"GERP_RS",
		"GERP++_RS",
#		"PhyloP_score"
		"phyloP"
	]

	tmp_prefix = args.temp_path or tempfile.gettempdir()
	if not os.path.exists(tmp_prefix):
		os.makedirs(tmp_prefix)
	if tmp_prefix[-1] != "/":
		tmp_prefix += "/"

	extract_path = tempfile.mkdtemp(prefix=tmp_prefix)

	try:
		logger.info("Output: {}".format(args.out_path if args.out_path != "-" else "standard output"))

		total_start_time = time.time()
		total_lines = 0
		with ZipFile(args.source_path, "r") as zf,\
			tsv.open(args.out_path, "w") as of: #,\
			#tsv.open(args.noprot_path, "w") as npf:

			tsv.write_line(of, "CHR", "STRAND", "START", "REF", "ALT", "TRANSCRIPT",
						   "PROTEIN", "AA_POS", "AA_REF", "AA_ALT",
						   "SIFT", "PPH2", "MA", "FATHMM", "MT", "GERPRS", "PHYLOP")

			#tsv.write_line(npf, "#CHR", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")

			entries = []
			for entry in zf.infolist():
				m = name_pattern.match(entry.filename)
				if not m:
					continue

				chr = m.group(1)
				index = CHR_INDEX[chr] if chr in CHR_INDEX else 99

				if chromosomes is not None and chr not in chromosomes:
					logger.debug("Skipping chromosome {} ...".format(chr))
					continue

				entries += [(index, chr, entry)]

			for index, chr, entry in sorted(entries, key=lambda x: x[0]):
				logger.info("Reading chromosome {} ...".format(chr))

				zf.extract(entry, extract_path)
				fpath = os.path.join(extract_path, entry.filename)
				with open(fpath) as f:
					# Parse header
					hdr_line = f.readline()
					hdr = {}
					for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
						hdr[name] = index
					columns = [hdr[name] if name in hdr else None for name in COLUMNS]

					read = set()

					start_time = time.time()
					partial_start_time = start_time
					for line_num, line in enumerate(f, start=2):
						fields = line.rstrip("\n").split("\t")

						try:
							fields = [fields[i] if i is not None and i < len(fields) else None for i in columns]

							(chr, start, ref, alt, strand,
							 symbol, uniprot, uniprot_aapos, aa_ref, aa_alt,
							 gene, transcript, aapos,
							 sift, pph2, ma, fathmm,
							 mt, gerprs, phylop) = fields
							
							start = safe_int(start)
							ref = ref.upper() if ref is not None else None
							alt = alt.upper() if alt is not None else None
							aa_ref = aa_ref.upper() if aa_ref is not None else None
							aa_alt = aa_alt.upper() if aa_alt is not None else None
							sift = safe_float(sift)
							ma = safe_float(ma)
							fathmm = safe_float(fathmm)
							mt = safe_float(mt)
							gerprs = safe_float(gerprs)
							phylop = safe_float(phylop)

							if start is None or ref is None or alt is None:
								logger.warn("None value for pos or ref or alt at line {}: {}".format(line_num, fields))
								continue
							elif ref not in BASE_INDEX or alt not in BASE_INDEX:
								logger.warn("Unknown ref or alt at line {}: {}".format(line_num, fields))
								continue
							elif len(ref) != 1 or len(alt) != 1:
								logger.warn("Length != 1 for ref or alt len at line {}: {}".format(line_num, fields))
								continue
							#elif aa_ref not in AA_INDEX or aa_alt not in AA_INDEX:
							#	logger.warn("Unknown aa_ref or aa_alt at line {}: {}".format(line_num, fields))
							#	continue
							elif transcript is None or aapos is None or uniprot is None or uniprot_aapos is None:
								logger.warn("None value for transcript or aapos or uniprot or uniprot_aapos at line {}: {}".format(line_num, fields))
								continue

							if aa_ref not in AA_INDEX:
								aa_ref = None
							if aa_alt not in AA_INDEX:
								aa_alt = None

							trs_values = transcript.split(";")

							aapos_values = [safe_int(v) for v in aapos.split(";")]
							l = len(trs_values) - len(aapos_values)
							if l > 0:
								aapos_values += [aapos_values[-1]] * l

							uniprot_values = uniprot.split(";")
							uniprot_aapos_values = [safe_int(v) for v in uniprot_aapos.split(";")]
							l = len(uniprot_values) - len(uniprot_aapos_values)
							if l > 0:
								uniprot_aapos_values += [uniprot_aapos_values[-1]] * l

							pph2_values = [safe_float(v) for v in pph2.split(";")] if pph2 is not None else [None]
							l = len(uniprot_values) - len(pph2_values)
							if l > 0:
								pph2_values += [pph2_values[-1]] * l

							uniprot_index = {}
							for i, id in enumerate(uniprot_values):
								if uniprot_aapos_values[i] is not None:
									uniprot_index[uniprot_aapos_values[i]] = i

							for i, trs in enumerate(trs_values):
								pos = aapos_values[i]
								if pos < 0:
									pos = None

								if pos is not None and pos in uniprot_index:
									j = uniprot_index[pos]
									uniprot_value = uniprot_values[j]
									pph2_value = pph2_values[j]
								else:
									uniprot_value = pph2_value = None

								if trs in trs_map:
									prot_id = trs_map[trs]
								elif uniprot_value in uniprot_map:
									prot_id = uniprot_map[uniprot_value]
								else:
									logger.warn("Couldn't map neither protein {} or transcript {} at line {}: {}".format(uniprot_value, trs, line_num, "|".join([str(v) for v in fields])))
									continue

								#if pos < 0:
								#	logger.warn("Negative protein position at line {}: {}".format(line_num, pos))
								#	continue
								#elif ...
								if pph2_value is not None and (pph2_value < 0.0 or pph2_value > 1.0):
									logger.warn("PPH2 score {} out of range at line {}: {}".format(pph2_value, line_num, fields))
									continue

								if aa_alt == "X": # fix stop codons having a sift score
									sift = None

								if args.skip_empty_scores and sift is None and pph2_value is None and ma is None \
										and mt is None and gerprs is None and phylop is None:
									continue

								#log.info((chr, strand, start, ref, alt, aapos_values[i], aa_ref, aa_alt, trs, sift, pph2_value, ma))

								if pos is None or aa_ref is None or aa_alt is None:
									pass #tsv.write_line(npf, chr, start, ".", ref, alt, ".", "PASS",
										#		   "dbNSFP={}|{}|{}|{}|{}|{}".format(trs, prot_id,
										#					sift or "", pph2_value or "", ma or "", fathmm or ""))
								else:
									tsv.write_line(of, chr, strand, start, ref, alt, trs,
												   prot_id, pos, aa_ref, aa_alt,
												   sift, pph2_value, ma, fathmm,
												   mt, gerprs, phylop)

						except KeyboardInterrupt:
							raise
						except:
							logger.warn("Malformed line {}: {}".format(line_num, "|".join([str(v) for v in fields])))
							raise #continue

						partial_time = time.time() - partial_start_time
						if partial_time >= 5.0:
							partial_start_time = time.time()
							elapsed_time = time.time() - start_time
							logger.debug("  {} lines, {:.1f} lines/second".format(hsize(line_num-1), (line_num-1) / float(elapsed_time)))

					total_lines += line_num

					logger.info("  >  {} lines, {:.1f} lines/second".format(hsize(line_num), line_num / float(time.time() - start_time)))
					logger.info("  >> {} lines, {:.1f} lines/second".format(hsize(total_lines), total_lines / float(time.time() - total_start_time)))

				os.remove(fpath)

		total_elapsed_time = timedelta(seconds=time.time() - total_start_time)
		logger.info("Finished successfully. Elapsed time: {}".format(total_elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		shutil.rmtree(extract_path)

	return 0
コード例 #2
0
ファイル: calc_label.py プロジェクト: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Calculate Condel label")

	parser.add_argument("db_path", metavar="DB_PATH",
						help="Functional scores database")

	parser.add_argument("weights_path", metavar="WEIGHTS",
						help="File containing the scores weights and cutoffs")

	parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
						help="Comma separated list of predictors")

	parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
						help="Updated predictor names")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	log = bglogging.get_logger("calculate-label")

	log.info("Opening functional scores database ...")

	db = FannsSQLiteDb(args.db_path)
	db.open()

	log.info("Loading state ...")

	state = load_weights(args.weights_path)

	avail_predictors, precision, step, stats = [state[k] for k in ["predictor_names", "precision", "step", "stats"]]
	if args.predictors is not None:
		predictors = [p for p in [p.strip() for p in args.predictors.split(",")] if p in avail_predictors]
		if len(predictors) == 0:
			log.error("Unknown predictors: {}".format(args.predictors))
			log.error("Available predictor names are: {}".format(", ".join(avail_predictors)))
			exit(-1)
	else:
		predictors = avail_predictors

	if args.updated_predictors is not None:
		updated_predictors = [p.strip() for p in args.updated_predictors.split(",")]
		if len(predictors) != len(updated_predictors):
			log.error("Number of updated predictors does not match with the list of number of predictors")
			exit(-1)
	else:
		updated_predictors = ["{}_CLASS".format(p.upper()) for p in predictors]

	log.info("Available predictors: {}".format(", ".join(avail_predictors)))
	log.info("Selected predictors: {}".format(", ".join(predictors)))

	for predictor, updated_predictor in zip(predictors, updated_predictors):
		log.info("Creating predictor {} ...".format(updated_predictor))
		db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

	cutoffs = []
	for predictor in predictors:
		cutoff, mcc, acc = [stats[predictor][v] for v in ["cutoff", "cutoff_mcc", "cutoff_acc"]]
		log.info("{}: cutoff={}, MCC={}, accuracy={}".format(predictor, cutoff, mcc, acc))
		cutoffs += [cutoff]

	log.info("Calculating ...")

	start_time = partial_start_time = time.time()
	try:
		for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
			scores = row["scores"]
			d = {}
			for i, predictor in enumerate(predictors):
				score = scores[predictor]
				if score is None:
					continue

				cutoff = cutoffs[i]
				updated_predictor = updated_predictors[i]

				d[updated_predictor] = 0.0 if score < cutoff else 1.0

			db.update_scores(row["id"], d)

			partial_time = time.time() - partial_start_time
			if partial_time > 5.0:
				partial_start_time = time.time()
				elapsed_time = time.time() - start_time
				log.debug("  {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time))

		db.commit()
	except KeyboardInterrupt:
		log.warn("Interrupted by Ctrl-C")
		db.rollback()
	except:
		db.rollback()
		raise
	finally:
		db.close()
コード例 #3
0
ファイル: calc.py プロジェクト: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Calculate Condel score")

	parser.add_argument("db_path", metavar="DB_PATH",
						help="Functional scores database")

	parser.add_argument("weights_path", metavar="WEIGHTS",
						help="File containing the scores weights and cutoffs")

	parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
						help="Comma separated list of predictors")

	parser.add_argument("-u", "--updated-predictor", dest="updated_predictor", metavar="NAME",
						help="Updated predictor name")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	log = bglogging.get_logger("calculate")

	log.info("Opening functional scores database ...")

	db = FannsSQLiteDb(args.db_path)
	db.open()

	updated_predictor = args.updated_predictor or "CONDEL"

	predictors = set([p["id"] for p in db.predictors()])
	if updated_predictor not in predictors:
		log.info("  Creating predictor {} ...".format(updated_predictor))
		db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=predictors)

	log.info("Loading state ...")

	state = load_weights(args.weights_path)

	avail_predictors, precision, step, stats = [state[k] for k in ["predictor_names", "precision", "step", "stats"]]
	if args.predictors is not None:
		predictors = [p for p in [p.strip() for p in args.predictors.split(",")] if p in avail_predictors]
		if len(predictors) == 0:
			log.error("Unknown predictors: {}".format(args.predictors))
			log.error("Available predictor names are: {}".format(", ".join(avail_predictors)))
			exit(-1)
	else:
		predictors = avail_predictors

	log.info("Available predictors: {}".format(", ".join(avail_predictors)))
	log.info("Selected predictors: {}".format(", ".join(predictors)))

	log.info("Calculating ...")

	start_time = partial_start_time = time.time()
	try:
		for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
			scores = row["scores"]
			condel = wsum = 0
			for predictor, score in scores.items():
				if score is None:
					continue

				predictor_stats = stats[predictor]
				rmin, rmax, dim, size, cdp, cdn, cutoff = [predictor_stats[k] for k in [
																"rmin", "rmax", "dim", "size", "cdp", "cdn", "cutoff"]]

				if predictor in PREDICTOR_TRANSFORM:
					score = PREDICTOR_TRANSFORM[predictor](score)

				r = (score - rmin) / dim
				index = int(r * size) if score < rmax else size - 1

				if score < cutoff:
					w = 1 - cdn[index]
				else:
					w = 1 - cdp[index]

				wsum += w
				condel += w * score

				#log.info("{}={}, w={} -> {}".format(predictor_name, score, w, score * w))

			if wsum != 0:
				condel /= wsum

				d = {updated_predictor : condel}
				db.update_scores(row["id"], d)

				#log.info(">>> CONDEL={}".format(condel))
			else:
				log.warn("wsum = 0, condel={}, scores={}".format(condel, repr(scores)))

			partial_time = time.time() - partial_start_time
			if partial_time > 5.0:
				partial_start_time = time.time()
				elapsed_time = time.time() - start_time
				log.debug("  {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time))

		log.info("Commit ...")
		db.commit()
	except KeyboardInterrupt:
		log.warn("Interrupted by Ctrl-C")
		db.rollback()
	except:
		db.rollback()
		raise
	finally:
		db.close()