Example #1
def load_events(f, column_indices, predictors, transforms, stats, logger):
	# Number of events seen per event type.
	count = [0, 0]
	# Last position seen per event type; built with a comprehension so the
	# two inner lists are independent objects rather than aliases.
	last_pos = [[None] * 4 for _ in range(2)]

	id_index = column_indices["ID"]
	pos_indices = [column_indices[name] for name in POS_HEADERS]

	for fields in tsv.rows(f):
		try:
			event_type = EVENT_TYPES[fields[id_index]]
		except KeyError:
			raise Exception("Unknown event type: {}".format(fields[id_index]))

		current_pos = [fields[i] for i in pos_indices]
		if last_pos[event_type] == current_pos:
			continue

		last_pos[event_type] = current_pos

		count[event_type] += 1

		protein, pos, aa_ref, aa_alt = current_pos
		scores = [score_value(fields[column_indices[p]]) for p in predictors]

		for predictor, score in zip(predictors, scores):
			if score is None or predictor not in stats:
				continue

			if predictor in transforms:
				for expr, func in transforms[predictor]:
					try:
						score = func(score)
					except Exception:
						logger.error("Error applying transformation {} to score {}".format(expr, score))

			predictor_stats = stats[predictor]
			(rmin, rmax, dim, vmin, vmax, size, dp, dn) = [predictor_stats[k] for k in [
											"rmin", "rmax", "dim", "vmin", "vmax", "size", "dp", "dn"]]

			# Map the score into one of `size` histogram bins spanning [rmin, rmax);
			# scores at or above rmax fall into the last bin.
			r = (score - rmin) / dim
			index = int(r * size) if score < rmax else size - 1

			if vmin is None or score < vmin:
				predictor_stats["vmin"] = score
			if vmax is None or score > vmax:
				predictor_stats["vmax"] = score

			if event_type == HIGH_REC_EVENT:
				dp[index] += 1
			elif event_type == NON_REC_EVENT:
				dn[index] += 1

	return { "high_recurrent" : count[HIGH_REC_EVENT], "non_recurrent" : count[NON_REC_EVENT] }
Example #2
def main():
	parser = argparse.ArgumentParser(
		description="Update scores in the database")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	cmd.add_selected_predictors_args()

	parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
						help="Update of the predictors.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("update")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors(check_missing=False)

	try:
		progress = RatedProgress(logger, name="SNVs")

		total_lines = 0

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header
			hdr_line = f.readline()
			hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

			db_predictors = set([p["id"] for p in db.predictors()])

			if len(predictors) == 0:
				predictors = [name for name in hdr if name in db_predictors]
				if len(predictors) == 0:
					raise Exception("Any input file header match the available predictors in the database. Please specify them using -p.")

			logger.info("Predictors: {}".format(", ".join(predictors)))

			for predictor in filter(lambda p: p not in db_predictors, predictors):
				logger.info("Creating predictor {} ...".format(predictor))
				db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
			use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

			if not use_genome_coords and not use_protein_coords:
				raise Exception("No coordinate columns found. "
								"Use {} for genomic coordinates or {} for protein coordinates.".format(
									GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
			elif use_genome_coords and use_protein_coords:
				logger.warn("Both genomic and protein coordinate columns found; using genomic coordinates by default")

			if use_genome_coords:
				coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
				coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_dna
			elif use_protein_coords:
				coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
				coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_protein

			coord_column_indices = [hdr[n] for n in coord_column_names]
			score_indices = [hdr[n] for n in predictors]
			max_column_index = max(coord_column_indices + score_indices)

			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				if len(fields) <= max_column_index:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns at line {}".format(line_num))
					continue

				try:
					coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip(
													coord_column_names, coord_column_types, coord_column_indices)])

					scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					# Skip this line; coords/scores are not valid here.
					continue

				try:
					for row in db.query_scores(fields=[], **coords):
						db.update_scores(row["id"], scores)
				except Exception as ex:
					logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
					logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
					if not args.ignore_errors:
						raise

				progress.update()

		progress.log_totals()

		logger.info("Finalizing database ...")

		if args.update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
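
Both this command and the event loaders rely on a score_value helper to turn a raw TSV field into a number. A minimal sketch of what it is assumed to do (treating an empty field or a common missing-value placeholder as "no score"; the exact placeholder strings are an assumption, and the real helper may differ):

def score_value(field):
	# Hypothetical implementation: an empty field or a missing-value
	# placeholder means the predictor did not score this variant.
	if field is None or field.strip() in ("", "-", "NA"):
		return None
	return float(field)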
Example #3
def load_events(f, column_indices, predictors, stats, logger):
	# Number of events seen per event type.
	count = [0, 0]
	# Last position seen per event type; built with a comprehension so the
	# two inner lists are independent objects rather than aliases.
	last_pos = [[None] * 4 for _ in range(2)]

	id_index = column_indices["ID"]
	pos_indices = [column_indices[name] for name in POS_COLUMNS]
	pred_indices = [column_indices[p] for p in predictors]

	for fields in tsv.rows(f):
		try:
			event_type = EVENT_TYPES[fields[id_index]]
		except KeyError:
			raise Exception("Unknown event type: {}".format(fields[id_index]))

		current_pos = [fields[i] for i in pos_indices]
		if last_pos[event_type] == current_pos:
			continue

		last_pos[event_type] = current_pos

		count[event_type] += 1

		protein, pos, aa_ref, aa_alt = current_pos
		scores = [score_value(fields[pi]) for pi in pred_indices]

		for predictor, score in zip(predictors, scores):
			if score is None or predictor not in stats:
				continue

			if predictor in PREDICTOR_TRANSFORM:
				score = PREDICTOR_TRANSFORM[predictor](score)

			predictor_stats = stats[predictor]
			(rmin, rmax, dim, vmin, vmax, size,
			 dp, dn, tp, tn, fp, fn) = [predictor_stats[k] for k in [
											"rmin", "rmax", "dim", "vmin", "vmax", "size",
											"dp", "dn", "tp", "tn", "fp", "fn"]]

			# Map the score into one of `size` histogram bins spanning [rmin, rmax);
			# scores at or above rmax fall into the last bin.
			r = (score - rmin) / dim
			index = int(r * size) if score < rmax else size - 1

			if vmin is None or score < vmin:
				predictor_stats["vmin"] = score
			if vmax is None or score > vmax:
				predictor_stats["vmax"] = score

			if event_type == POS_EVENT:
				dp[index] += 1
				# Interpreting each bin boundary as a score cutoff (higher
				# score = predicted positive): a positive in bin `index` is a
				# true positive for every cutoff below the bin and a false
				# negative for every cutoff at or above it.
				for i in xrange(0, index):
					tp[i] += 1
				for i in xrange(index, size):
					fn[i] += 1

			elif event_type == NEG_EVENT:
				dn[index] += 1
				# Symmetrically, a negative counts as a false positive below
				# its bin and a true negative at or above it.
				for i in xrange(0, index):
					fp[i] += 1
				for i in xrange(index, size):
					tn[i] += 1

	return count[POS_EVENT], count[NEG_EVENT]
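
Because tp, fp, tn and fn accumulate one confusion matrix per candidate cutoff, per-cutoff performance metrics fall out directly once loading is done. A hedged sketch of such a post-processing step (cutoff_metrics is a hypothetical helper, not part of the original code; sensitivity and specificity follow their standard definitions):

def cutoff_metrics(predictor_stats):
	# Hypothetical post-processing: derive sensitivity and specificity for
	# each candidate cutoff from the counters filled in by load_events.
	tp, tn = predictor_stats["tp"], predictor_stats["tn"]
	fp, fn = predictor_stats["fp"], predictor_stats["fn"]
	metrics = []
	for i in xrange(predictor_stats["size"]):
		sens = float(tp[i]) / (tp[i] + fn[i]) if tp[i] + fn[i] > 0 else None
		spec = float(tn[i]) / (tn[i] + fp[i]) if tn[i] + fp[i] > 0 else None
		metrics.append((i, sens, spec))
	return metrics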