Example #1
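# A minimal sketch of the imports this example appears to rely on. argparse
# and os are standard library; tsv, DefaultCommandHelper and line_error are
# assumed to be project-specific helpers whose real module paths are not
# shown here, so they are only indicated as commented-out placeholders.
import argparse
import os

# from <project> import tsv                                # assumed helper
# from <project> import DefaultCommandHelper, line_error   # assumed helpers
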
def main():
	parser = argparse.ArgumentParser(
		description="Calculate Baseline Tolerance partial statistics per feature")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("scores_path", metavar="SCORES_PATH",
						help="The scores file")

	parser.add_argument("predictors", metavar="PREDICTORS",
						help="Comma separated list of predictors")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="Output file.")

	cmd.add_transform_args()

	args, logger = cmd.parse_args("blt-partial")

	predictors = [p.strip() for p in args.predictors.split(",") if len(p.strip()) > 0]
	num_predictors = len(predictors)

	if len(predictors) == 0:
		logger.error("At least one predictor is needed")
		exit(-1)

	logger.info("Selected predictors: {}".format(", ".join(predictors)))

	transforms = cmd.get_transforms()

	stats = {}
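	# stats maps each feature to a tuple holding one [n, sum, sum_sq] triple per
	# predictor (see the updates further down); from these partial statistics the
	# mean (sum / n) and the population variance (sum_sq / n - mean**2) can be
	# derived in a later aggregation step, which is not part of this example.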

	line_num = 0  # last line number read; stays 0 if the scores file is empty
	lost_snvs = 0
	scores_path = args.scores_path

	logger.info("Reading scores from {} ...".format(
		os.path.basename(scores_path) if scores_path != "-" else "standard input"))

	with tsv.open(scores_path) as sf:
		for line_num, line in enumerate(sf, start=1):  # 1-based line numbers for error reporting and the final count
			fields = line.rstrip("\n").split("\t")
			chrom, pos, ref, alt, feature = fields[:5]

			if len(feature) == 0:
				lost_snvs += 1
				continue

			scores = fields[5:]

			if len(scores) != num_predictors:
				line_error(logger, scores_path, line_num, "Number of score columns does not match the number of predictors")

			try:
				scores = [float(v) if len(v) > 0 else None for v in scores]
			except ValueError:
				line_error(logger, scores_path, line_num, "Scores should be real numbers: {}".format(scores))

			if feature not in stats:
				stats[feature] = tuple([[0, 0.0, 0.0] for p in predictors])

			feature_stats = stats[feature]

			for i, score in enumerate(scores):
				if score is not None:
					predictor = predictors[i]
					if predictor in transforms:
						for name, func in transforms[predictor]:
							try:
								score = func(score)
							except Exception:
								logger.error("Error transforming the {} score {} with {}".format(predictor, score, name))
								exit(-1)

					feature_stats[i][0] += 1
					feature_stats[i][1] += score
					feature_stats[i][2] += score * score

	logger.info("Saving results into {} ...".format(
		os.path.basename(args.out_path) if args.out_path != "-" else "standard output"))

	with tsv.open(args.out_path, "w") as of:
		tsv.write_line(of, "FEATURE", *predictors)
		for feature in sorted(stats.keys()):
			sb = [feature]
			feature_stats = stats[feature]
			for i in range(num_predictors):
				sb += ["/".join([repr(v) for v in feature_stats[i]])]
			tsv.write_line(of, *sb)

	logger.info("Number of SNVs = {}, lost SNVs = {}, number of features = {}".format(line_num, lost_snvs, len(stats)))

	return 0
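
# Hypothetical invocation, assuming this module is exposed as the "blt-partial"
# command (the name passed to cmd.parse_args above); the predictor names and
# file paths below are placeholders:
#
#   blt-partial scores.tsv SIFT,PPH2,MA blt-partial-stats.tsv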
Example #2
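# A minimal sketch of the imports this example appears to rely on. argparse,
# json and os are standard library; tsv, DefaultCommandHelper, COORD_COLUMNS,
# load_events and save_weights are assumed to be project-specific helpers
# whose real module paths are not shown here.
import argparse
import json
import os

# from <project> import tsv                                          # assumed
# from <project> import DefaultCommandHelper                         # assumed
# from <project> import COORD_COLUMNS, load_events, save_weights     # assumed
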
def main():
	parser = argparse.ArgumentParser(
		description="Calculate TransFIC cutoffs")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("ranges_path", metavar="RANGES_PATH",
						help="JSON file generated with pred-list containing predictor stats. Only min and max are used.")

	parser.add_argument("scores_path", metavar="SCORES_PATH",
						help="The dataset with scores for the non-recurrent and highly recurrent sets. The ID column should be NON/HIGH for the non-recurrent/highly-recurrent datasets.")

	parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
						help="The file where cutoffs will be saved. Use - for standard output.")

	cmd.add_selected_predictors_args()

	parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
						help="Distribution precision")

	cmd.add_transform_args()

	args, logger = cmd.parse_args("cutoffs")

	if args.out_path is None:
		prefix = os.path.splitext(os.path.basename(args.scores_path))[0]
		if prefix.endswith("-scores"):
			prefix = prefix[:-7]
		args.out_path = os.path.join(os.getcwd(), "{}-cutoffs.json".format(prefix))

	try:
		logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

		with open(args.ranges_path) as f:
			pred_stats = json.load(f)

		predictor_range = {}
		for pid, pstats in pred_stats.items():
			predictor_range[pid] = (pstats["min"], pstats["max"])

		transforms = cmd.get_transforms()

		logger.info("Reading datasets from {} ...".format(args.scores_path if args.scores_path != "-" else "standard input"))

		with tsv.open(args.scores_path) as f:

			# Select the predictors to use from those available in the dataset, optionally restricted by the user selection

			column_names, column_indices = tsv.header(f)
			excluded_columns = set(COORD_COLUMNS) | set(["ID"])
			available_predictors = [c for c in column_names if c not in excluded_columns]
			predictors = cmd.get_selected_predictors(available_predictors)

			# Initialize statistics

			step = 1.0 / 10**args.precision

			stats = dict()

			state = dict(
				predictors = predictors,
				stats = stats,
				transforms=dict([(p, [e for e, _ in t]) for p, t in transforms.items()]),
				precision = args.precision,
				step = step)

			for predictor in predictors:
				rmin, rmax = predictor_range[predictor] if predictor in predictor_range else (0.0, 1.0)
				dim = rmax - rmin
				size = int(dim / step) + 1
				values = [(x * step) + rmin for x in range(size)]

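				# An interpretation of the accumulator fields, based on how they are used
				# below: dp/dn hold per-bin event counts for the two ID classes (presumably
				# HIGH and NON), cdp/cdn their cumulative counts from the highest bin
				# downwards, and vmin/vmax the observed score range (apparently filled in
				# by load_events).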
				stats[predictor] = dict(
					rmin = rmin,
					rmax = rmax,
					dim = dim,
					values = values,
					size = size,
					vmin = None,
					vmax = None,
					dp = [0] * size,
					dn = [0] * size,
					cdp = [0] * size,
					cdn = [0] * size,
					cump = 0,
					cumn = 0,
					cutoff = None,
					cutoff_index = None)

			counts = load_events(f, column_indices, predictors, transforms, stats, logger)

			logger.info("  {}".format(", ".join(["{}={}".format(n, c) for n, c in counts.items()])))

		logger.info("Calculating cumulative distribution ...")

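		# After this loop, cdp[i] and cdn[i] hold the number of recorded events whose
		# bin index is >= i (i.e. whose binned score is at least values[i]), counted
		# separately for the two classes, and cump/cumn the corresponding totals.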
		for predictor in predictors:
			predictor_stats = stats[predictor]
			dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]
			cump = 0
			cumn = 0
			# Walk the bins from the highest value downwards, accumulating counts.
			for i in reversed(range(len(dp))):
				cump += dp[i]
				cdp[i] = cump

				cumn += dn[i]
				cdn[i] = cumn

			predictor_stats["cump"] = cump
			predictor_stats["cumn"] = cumn

			logger.info("  {}: cump={}, cumn={}".format(predictor, cump, cumn))

		logger.info("Calculating cutoffs ...")

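		# A reading of the cutoff scan, not a definitive interpretation: since cdp
		# and cdn are non-increasing, each while loop stops at the first bin where
		# the ratio drops below the threshold, so cutoff_low_mid is the highest bin
		# value that still keeps at least 95% of the "p" events at or above it, and
		# cutoff_mid_high the highest bin value that keeps at least 20% of the "n"
		# events at or above it.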
		for predictor in predictors:
			predictor_stats = stats[predictor]
			values, size, vmin, vmax, cump, cumn, cdp, cdn = [predictor_stats[k] for k in [
				"values", "size", "vmin", "vmax", "cump", "cumn", "cdp", "cdn"]]

			cutoff_low_mid_index = -1
			i = 0
			while (i < size) and (cdp[i] / float(cump) >= 0.95):
				cutoff_low_mid_index = i
				i += 1

			cutoff_low_mid = values[cutoff_low_mid_index]
			predictor_stats["cutoff_low_mid"] = cutoff_low_mid
			predictor_stats["cutoff_low_mid_index"] = cutoff_low_mid_index

			cutoff_mid_high_index = -1
			i = 0
			while (i < size) and (cdn[i] / float(cumn) >= 0.20):
				cutoff_mid_high_index = i
				i += 1

			cutoff_mid_high = values[cutoff_mid_high_index]
			predictor_stats["cutoff_mid_high"] = cutoff_mid_high
			predictor_stats["cutoff_mid_high_index"] = cutoff_mid_high_index

			logger.info("  {}: cutoffs: vmin={}, low_mid={}, mid_high={}, vmax={}".format(predictor, vmin, cutoff_low_mid, cutoff_mid_high, vmax))

		logger.info("Saving state ...")

		out_path = args.out_path
		save_weights(out_path, state)

	except Exception:
		cmd.handle_error()

	return 0
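
# Hypothetical invocation, assuming this module is exposed as the "cutoffs"
# command (the name passed to cmd.parse_args above); file names below are
# placeholders (without -o the output defaults to <prefix>-cutoffs.json in
# the current working directory):
#
#   cutoffs predictors-ranges.json dataset-scores.tsv -o dataset-cutoffs.json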