Example #1
# NOTE: Python 2 code (it uses xrange). Module-level imports are not shown in
# the original example; these are the ones main() relies on. bglogging, tsv,
# COORD_COLUMNS and save_weights are project-specific helpers assumed to be
# importable from the surrounding package.
import argparse
import json
import os
import sys
from math import sqrt

import numpy as np
import pandas as pd


def main():
	parser = argparse.ArgumentParser(
		description="Calculate weights")

	parser.add_argument("ranges_path", metavar="RANGES_PATH",
						help="JSON file generated with pred-list containing predictor statistics. Only min and max are used.")

	parser.add_argument("training_path", metavar="TRAINING_PATH",
						help="The training set scores. The ID column should be POS/NEG for the positive/negative sets.")

	parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
						help="The file where weights will be saved. Use - for standard output.")

	parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
						help="Comma-separated list of predictors to fetch")

	parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
						help="Distribution precision")

	parser.add_argument("-f", "--full-state", dest="full_state", action="store_true", default=False,
						help="Save intermediate calculations to allow further exploration and plotting")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	logger = bglogging.get_logger("weights")
	
	if args.out_path is None:
		prefix = os.path.splitext(os.path.basename(args.training_path))[0]
		if prefix.endswith("-scores"):
			prefix = prefix[:-7]
		args.out_path = os.path.join(os.getcwd(), "{}-weights.json".format(prefix))

	if args.predictors is not None:
		args.predictors = [p.strip() for p in args.predictors.split(",")]

	logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

	with open(args.ranges_path) as f:
		pred_stats = json.load(f)

	# Map each predictor id to its (min, max) score range. Note: this example
	# never uses the map below; rmin/rmax are re-derived from the data itself.
	predictor_range = {}
	for pid, pstats in pred_stats.items():
		predictor_range[pid] = (pstats["min"], pstats["max"])

	logger.info("Reading training set {} ...".format(args.training_path if args.training_path != "-" else "from standard input"))

	with tsv.open(args.training_path) as f:

		# Select the predictors to use: all available in the dataset, or the user's selection

		column_names, column_indices = tsv.header(f)
		excluded_columns = set(COORD_COLUMNS) | set(["ID"])  # exclude coordinate and ID columns
		available_predictors = [c for c in column_names if c not in excluded_columns]
		if args.predictors is None:
			predictors = available_predictors
		else:
			available_set = set(available_predictors)
			missing_predictors = [p for p in args.predictors if p not in available_set]
			if len(missing_predictors) > 0:
				logger.error("Missing predictors: {}".format(", ".join(missing_predictors)))
				sys.exit(-1)
			predictors = args.predictors

	# NOTE: the scores file is read a second time here (the header was already
	# scanned above), so passing "-" for TRAINING_PATH would fail at this point.
	data = pd.read_csv(args.training_path, sep="\t", index_col=False,
					   usecols=["ID"] + predictors,
					   true_values=["POS"], false_values=["NEG"])

	# The ID column is now boolean (True for POS); rename it to EVT
	data.rename(columns={"ID": "EVT"}, inplace=True)

	# Initialize statistics

	logger.info("Initializing metrics ...")

	# Bin width from the requested precision, e.g. precision=3 -> step = 0.001
	step = 1.0 / 10**args.precision

	stats = dict()

	state = dict(
		predictor_names = predictors,
		precision = args.precision,
		step = step,
		stats = stats)

	for predictor in predictors:
		d = data[["EVT", predictor]]
		d = d[np.isfinite(d.iloc[:, 1])]  # keep only rows with a finite score

		nump = d.iloc[:, 0].sum()
		numn = d.shape[0] - nump

		rmin, rmax = d.iloc[:, 1].min(), d.iloc[:, 1].max()

		dim = rmax - rmin
		size = int(dim / step) + 1
		values = [(x * step) + rmin for x in xrange(size)]

		logger.info("  {:10}: p={}, n={}, min={}, max={}, bins={}".format(predictor, nump, numn, rmin, rmax, size))

		# Per-predictor state: dp/dn hold per-bin positive/negative counts,
		# cdp/cdn their top-down cumulative sums, and tp/tn/fp/fn the confusion
		# matrix at each candidate threshold (auc/cutoff_auc are allocated but
		# never filled in this example).
		stats[predictor] = dict(
			rmin = rmin,
			rmax = rmax,
			dim = dim,
			values = values,
			size = size,
			vmin = rmin,
			vmax = rmax,
			dp = [0] * size,
			dn = [0] * size,
			cdp = [0] * size,
			cdn = [0] * size,
			cump = 0,
			cumn = 0,
			tp = [0] * size,
			tn = [0] * size,
			fp = [0] * size,
			fn = [0] * size,
			mcc = [0] * size,
			acc = [0] * size,
			auc = [0] * size,
			cutoff = None,
			cutoff_index = None,
			cutoff_mcc = None,
			cutoff_acc = None,
			cutoff_auc = None)

	positive_count = data.iloc[:, 0].sum()
	negative_count = data.shape[0] - positive_count

	logger.info("  TOTAL     : positive={}, negative={}".format(positive_count, negative_count))

	logger.info("Calculating scores distribution and confusion matrices ...")
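	# NOTE: the score-distribution / confusion-matrix computation announced by
	# the log message above is missing from this example. A minimal sketch of
	# what it plausibly did, assuming variants scoring >= values[i] count as
	# predicted positive (hypothetical reconstruction, not the original code):
	for predictor in predictors:
		ps = stats[predictor]
		d = data[["EVT", predictor]]
		d = d[np.isfinite(d.iloc[:, 1])]
		for evt, value in zip(d.iloc[:, 0], d.iloc[:, 1]):
			i = min(int((value - ps["rmin"]) / step), ps["size"] - 1)
			if evt:
				ps["dp"][i] += 1
			else:
				ps["dn"][i] += 1
		# Confusion matrix at each candidate threshold (quadratic, but fine
		# for a sketch): bin i and above are predicted positive.
		for i in xrange(ps["size"]):
			ps["tp"][i] = sum(ps["dp"][i:])
			ps["fp"][i] = sum(ps["dn"][i:])
			ps["fn"][i] = sum(ps["dp"][:i])
			ps["tn"][i] = sum(ps["dn"][:i])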
	logger.info("Calculating cumulative distribution ...")

	for predictor in predictors:
		predictor_stats = stats[predictor]
		dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]
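		# Walk the bins from highest to lowest so that cdp[i]/cdn[i] end up
		# counting events with score >= values[i]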
		cump = 0
		cumn = 0
		i = len(dp) - 1
		while i >= 0:
			cdp[i] = dp[i] + cump
			cump += dp[i]

			cdn[i] = dn[i] + cumn
			cumn += dn[i]

			i -= 1

		predictor_stats["cump"] = cump
		predictor_stats["cumn"] = cumn

		logger.info("  {}: cump={}, cumn={}".format(predictor, cump, cumn))

	logger.info("Calculating accuracy and cutoff ...")

	for predictor in predictors:
		predictor_stats = stats[predictor]
		values, size, tp, tn, fp, fn, mcc, acc = [predictor_stats[k] for k in [
													"values", "size", "tp", "tn", "fp", "fn", "mcc", "acc"]]

		cutoff = -1
		cutoff_index = -1
		best_mcc = -1e6
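		# Evaluate every candidate threshold and keep the one with the best MCC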
		for i in xrange(size):
			try:
				#http://en.wikipedia.org/wiki/Matthews_correlation_coefficient
				mcc[i] = (tp[i] * tn[i] - fp[i] * fn[i]) / sqrt((tp[i] + fp[i]) * (tp[i] + fn[i]) * (tn[i] + fp[i]) * (tn[i] + fn[i]))

				#http://en.wikipedia.org/wiki/Accuracy
				acc[i] = (tp[i] + tn[i]) / float(tp[i] + fp[i] + fn[i] + tn[i])
			except ZeroDivisionError:
				mcc[i] = 0
				acc[i] = 0

			if mcc[i] > best_mcc:
				cutoff = values[i]
				cutoff_index = i
				best_mcc = mcc[i]

		# Best accuracy over all thresholds, not the accuracy at the MCC cutoff
		best_acc = max(acc)

		predictor_stats["cutoff"] = cutoff
		predictor_stats["cutoff_index"] = cutoff_index
		predictor_stats["cutoff_mcc"] = best_mcc
		predictor_stats["cutoff_acc"] = best_acc

		logger.info("  {}: cutoff={:.3f}, mcc={:.2f}, accuracy={:.2f}".format(
			predictor, cutoff, best_mcc * 100.0, best_acc * 100.0))

	if args.full_state:
		logger.info("Saving weights with full state ...")

		out_path = args.out_path
		save_weights(out_path, state)

	else:
		logger.info("Saving weights ...")

		# Keep only what is needed to apply the weights; drop the full matrices
		stats = {}

		reduced_state = dict(
			predictor_names=state["predictor_names"],
			precision=state["precision"],
			step=state["step"],
			stats=stats)

		for predictor in state["predictor_names"]:
			predictor_stats = state["stats"][predictor]
			stats[predictor] = dict(
				rmin=predictor_stats["rmin"],
				rmax=predictor_stats["rmax"],
				dim=predictor_stats["dim"],
				values=predictor_stats["values"],
				size=predictor_stats["size"],
				cdp=predictor_stats["cdp"],
				cdn=predictor_stats["cdn"],
				cutoff=predictor_stats["cutoff"],
				cutoff_index=predictor_stats["cutoff_index"])

		save_weights(args.out_path, reduced_state)

	return 0
Example #2
# NOTE: as in Example #1, module-level imports are not shown; argparse, json
# and os are needed, and tsv, COORD_COLUMNS, DefaultCommandHelper, load_events
# and save_weights are project-specific helpers assumed to exist.
import argparse
import json
import os


def main():
	parser = argparse.ArgumentParser(
		description="Calculate TransFIC cutoffs")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("ranges_path", metavar="RANGES_PATH",
						help="JSON file generated with pred-list containing predictor statistics. Only min and max are used.")

	parser.add_argument("scores_path", metavar="SCORES_PATH",
						help="The dataset with scores for the non-recurrent and highly recurrent sets. The ID column should be NON/HIGH for the non-recurrent/highly-recurrent sets.")

	parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
						help="The file where cutoffs will be saved. Use - for standard output.")

	cmd.add_selected_predictors_args()

	parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
						help="Distribution precision")

	cmd.add_transform_args()

	args, logger = cmd.parse_args("cutoffs")

	if args.out_path is None:
		prefix = os.path.splitext(os.path.basename(args.scores_path))[0]
		if prefix.endswith("-scores"):
			prefix = prefix[:-7]
		args.out_path = os.path.join(os.getcwd(), "{}-cutoffs.json".format(prefix))

	try:
		logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

		with open(args.ranges_path) as f:
			pred_stats = json.load(f)

		# Map each predictor id to its (min, max) score range
		predictor_range = {}
		for pid, pstats in pred_stats.items():
			predictor_range[pid] = (pstats["min"], pstats["max"])

		transforms = cmd.get_transforms()

		logger.info("Reading datasets {} ...".format(args.scores_path if args.scores_path != "-" else "from standard input"))

		with tsv.open(args.scores_path) as f:

			# Select the predictors to use: all available in the dataset, or the user's selection

			column_names, column_indices = tsv.header(f)
			excluded_columns = set(COORD_COLUMNS) | set(["ID"])
			available_predictors = [c for c in column_names if c not in excluded_columns]
			predictors = cmd.get_selected_predictors(available_predictors)

			# Initialize statistics

			# Bin width from the requested precision, e.g. precision=3 -> step = 0.001
			step = 1.0 / 10**args.precision

			stats = dict()

			state = dict(
				predictors = predictors,
				stats = stats,
				transforms = dict([(p, [e for e, _ in t]) for p, t in transforms.items()]),
				precision = args.precision,
				step = step)

			for predictor in predictors:
				# Fall back to a [0, 1] range when the predictor is not in the ranges file
				rmin, rmax = predictor_range[predictor] if predictor in predictor_range else (0.0, 1.0)
				dim = rmax - rmin
				size = int(dim / step) + 1
				values = [(x * step) + rmin for x in xrange(size)]

				stats[predictor] = dict(
					rmin = rmin,
					rmax = rmax,
					dim = dim,
					values = values,
					size = size,
					vmin = None,
					vmax = None,
					dp = [0] * size,
					dn = [0] * size,
					cdp = [0] * size,
					cdn = [0] * size,
					cump = 0,
					cumn = 0,
					cutoff = None,
					cutoff_index = None)

			counts = load_events(f, column_indices, predictors, transforms, stats, logger)

			logger.info("  {}".format(", ".join(["{}={}".format(n, c) for n, c in counts.items()])))

		logger.info("Calculating cumulative distribution ...")

		for predictor in predictors:
			predictor_stats = stats[predictor]
			dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]
			cump = 0
			cumn = 0
			i = len(dp) - 1
			while i >= 0:
				# Reverse scan: cdp[i]/cdn[i] count events scoring >= values[i]
				cump += dp[i]
				cdp[i] = cump

				cumn += dn[i]
				cdn[i] = cumn

				i -= 1

			predictor_stats["cump"] = cump
			predictor_stats["cumn"] = cumn

			logger.info("  {}: cump={}, cumn={}".format(predictor, cump, cumn))

		logger.info("Calculating cutoffs ...")

		for predictor in predictors:
			predictor_stats = stats[predictor]
			values, size, vmin, vmax, cump, cumn, cdp, cdn = [predictor_stats[k] for k in [
				"values", "size", "vmin", "vmax", "cump", "cumn", "cdp", "cdn"]]

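			# low_mid: the highest score that still keeps >= 95% of HIGH events
			# at or above it; mid_high: the highest score with >= 20% of NON
			# events at or above it. If even the first bin fails the test the
			# index stays -1, so values[-1] (the top bin) is silently used.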
			cutoff_low_mid_index = -1
			i = 0
			while (i < size) and (cdp[i] / float(cump) >= 0.95):
				cutoff_low_mid_index = i
				i += 1

			cutoff_low_mid = values[cutoff_low_mid_index]
			predictor_stats["cutoff_low_mid"] = cutoff_low_mid
			predictor_stats["cutoff_low_mid_index"] = cutoff_low_mid_index

			cutoff_mid_high_index = -1
			i = 0
			while (i < size) and (cdn[i] / float(cumn) >= 0.20):
				cutoff_mid_high_index = i
				i += 1

			cutoff_mid_high = values[cutoff_mid_high_index]
			predictor_stats["cutoff_mid_high"] = cutoff_mid_high
			predictor_stats["cutoff_mid_high_index"] = cutoff_mid_high_index

			logger.info("  {}: cutoffs: vmin={}, low_mid={}, mid_high={}, vmax={}".format(predictor, vmin, cutoff_low_mid, cutoff_mid_high, vmax))

		logger.info("Saving state ...")

		out_path = args.out_path
		save_weights(out_path, state)

	except Exception:
		cmd.handle_error()

	return 0