Code example #1
File: pred_update.py  Project: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Update predictors min, max and count")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	cmd.add_selected_predictors_args()

	args, logger = cmd.parse_args("pred-update")

	db = cmd.open_db()

	try:
		predictors = cmd.get_selected_predictors(default_all=True)

		logger.info("Updating predictors ...")

		start_time = datetime.now()

		db.update_predictors(predictors)

		db.commit()

		logger.info("Finished. elapsed={}".format(datetime.now() - start_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
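
All four examples share the same command skeleton: build an argparse parser, open the database, run the work inside try/except/finally so the connection is always closed, and return an exit code (the excerpts omit the imports and helpers defined elsewhere in each file). Below is a minimal self-contained sketch of that skeleton, assuming a hypothetical open_db() stand-in for the project's DefaultCommandHelper / cmd.open_db():

# Minimal sketch of the command skeleton shared by the examples above.
# The real code delegates to the project's DefaultCommandHelper; the
# open_db() stand-in below is hypothetical, for illustration only.
import argparse
import logging
from datetime import datetime

def open_db():  # hypothetical stand-in for cmd.open_db()
	class Db:
		def update_predictors(self, predictors=None): pass
		def commit(self): pass
		def close(self): pass
	return Db()

def main():
	parser = argparse.ArgumentParser(description="Example command skeleton")
	parser.parse_args()

	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger("example")

	db = open_db()
	try:
		start_time = datetime.now()
		db.update_predictors()
		db.commit()
		logger.info("Finished. elapsed={}".format(datetime.now() - start_time))
	except Exception:
		logger.exception("Command failed")
		return -1
	finally:
		db.close()

	return 0

if __name__ == "__main__":
	exit(main())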
Code example #2
File: fetch.py  Project: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Fetch Condel scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("muts_path", metavar="SNVS_PATH",
						help="SNV's to check. Use - for standard input.")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="The results path. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	args, logger = cmd.parse_args("fetch")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

	try:
		progress = RatedProgress(logger, name="SNVs")

		with tsv.open(args.muts_path) as f:
			with tsv.open(args.out_path, "w") as wf:
				tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

				hit = fail = 0

				mut = DnaAndProtMutationParser()
				for line_num, line in enumerate(f, start=1):
					line = line.rstrip(" \n\r")
					if len(line) == 0 or line.startswith("#"):
						continue

					try:
						mut.parse(line)
					except PrematureEnd:
						logger.error("Missing fields at line {}".format(line_num))
						fail += 1
						continue
					except UnexpectedToken as ex:
						logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
						fail += 1
						continue

					exists = False
					for row in query_mutation(logger, db, mut, annotations, predictors):

						exists = True

						ann = row["annotations"]
						scores = row["scores"]

						tsv.write_line(wf, mut.identifier,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

						"""
						if logger.isEnabledFor(logging.DEBUG):
							logger.debug("    --> {} {} {} {} {} {} {} {} {} {}".format(
										row["chr"], row["start"], row["ref"], row["alt"], row["transcript"],
										row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"],
										mut.identifier or "*"))
						"""

					progress.update()

					if exists:
						hit += 1
					else:
						fail += 1

		progress.log_totals()

		logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()
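
The fetch loop uses a common TSV-scanning pattern: iterate with enumerate(f, start=1) so errors can report the line number, skip blank and comment lines, and keep separate hit/fail counters. Here is a small self-contained sketch of that pattern, using plain in-memory input instead of the project's tsv helper and mutation parser (both assumptions here):

# Sketch of the scan-and-count loop used in fetch.py, with plain file I/O
# standing in for the project's tsv helper (assumed here for illustration).
import io

def scan(lines):
	hit = fail = 0
	for line_num, line in enumerate(lines, start=1):
		line = line.rstrip(" \n\r")
		if len(line) == 0 or line.startswith("#"):
			continue  # skip blank lines and comments
		fields = line.split("\t")
		if len(fields) < 2:
			print("Missing fields at line {}".format(line_num))
			fail += 1
			continue
		hit += 1
	return hit, fail

print(scan(io.StringIO("# header\nchr1\t12345\n\nbroken-line\n")))  # -> (1, 1)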
Code example #3
File: update.py  Project: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Update scores in the database")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	cmd.add_selected_predictors_args()

	parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
						help="Update of the predictors.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("update")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors(check_missing=False)

	try:
		progress = RatedProgress(logger, name="SNVs")

		total_lines = 0

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header
			hdr_line = f.readline()
			hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

			db_predictors = set([p["id"] for p in db.predictors()])

			if len(predictors) == 0:
				predictors = [name for name in hdr if name in db_predictors]
				if len(predictors) == 0:
					raise Exception("Any input file header match the available predictors in the database. Please specify them using -p.")

			logger.info("Predictors: {}".format(", ".join(predictors)))

			for predictor in filter(lambda p: p not in db_predictors, predictors):
				logger.info("Creating predictor {} ...".format(predictor))
				db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
			use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

			if not use_genome_coords and not use_protein_coords:
				raise Exception("No coordinate columns found. "
								"Use {} for genomic coordinates or {} for protein coordinates.".format(
									GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
			elif use_genome_coords and use_protein_coords:
				logger.warn("Both, genomic and protein coordinates columns found. Using genomic coordinates by default")

			if use_genome_coords:
				coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
				coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_dna
			elif use_protein_coords:
				coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
				coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_protein

			coord_column_indices = [hdr[n] for n in coord_column_names]
			score_indices = [hdr[n] for n in predictors]
			max_column_index = max(coord_column_indices + score_indices)

			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				if len(fields) <= max_column_index:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns at line {}".format(line_num))
					continue  # skip this line when ignoring errors

				try:
					coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip(
													coord_column_names, coord_column_types, coord_column_indices)])

					scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					continue  # skip this line when ignoring errors

				try:
					for row in db.query_scores(fields=[], **coords):
						db.update_scores(row["id"], scores)
				except Exception as ex:
					logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
					logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
					if not args.ignore_errors:
						raise

				progress.update()

		progress.log_totals()

		logger.info("Finalizing database ...")

		if args.update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
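
update.py builds a name-to-index map from the header line and then casts each coordinate column with a per-column type before querying the database. Below is a self-contained sketch of that header/typed-column handling; the column names, casts, and sample row are illustrative stand-ins, not the project's actual GENOME_COORD_COLUMN_TYPE / PROTEIN_COORD_COLUMN_TYPE tables:

# Sketch of the header-index / typed-column parsing used in update.py.
# COORD_COLUMN_TYPES and the sample data are assumptions for illustration.
COORD_COLUMN_TYPES = {"CHR": str, "START": int, "REF": str, "ALT": str}

hdr_line = "CHR\tSTART\tREF\tALT\tSIFT\n"
hdr = {name: index for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))}

coord_names = [n for n in hdr if n in COORD_COLUMN_TYPES]
coord_types = [COORD_COLUMN_TYPES[n] for n in coord_names]
coord_indices = [hdr[n] for n in coord_names]

fields = "chr17\t7578406\tC\tT\t0.01".split("\t")
coords = {name.lower(): cast(fields[index])
		  for name, cast, index in zip(coord_names, coord_types, coord_indices)}
print(coords)  # {'chr': 'chr17', 'start': 7578406, 'ref': 'C', 'alt': 'T'}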
Code example #4
File: export.py  Project: chris-zen/phd-thesis
def main():
	parser = argparse.ArgumentParser(
		description="Export Scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="OUTPUT_PATH",
						help="The output file. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	parser.add_argument("--json", dest="to_json", action="store_true", default=False,
						help="Export the results in json format")

	parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
						help="Export a random sample of PCT %%")

	parser.add_argument("--start", dest="start", type=int, metavar="N",
						help="Start to export from the SNV number N")

	parser.add_argument("--limit", dest="limit", type=int, metavar="N",
						help="Limit the number of SNVs to export to N")

	args, logger = cmd.parse_args("export")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Exporting ...")

	random.seed(time.time())

	total_count = 0
	total_start_time = time.time()

	try:
		progress = RatedProgress(logger, name="SNVs")

		to_json = args.to_json
		sample = args.sample
		start = args.start or 0
		limit = args.limit

		doc = None
		last_pos = None
		rows_count = 0
		snvs_count = 0
		with tsv.open(args.dest_path, "w") as f:

			if not to_json:
				tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

			for row in db.query_scores(predictors=predictors, maps=annotations):

				if not to_json:
					if start > 0:
						start -= 1
						continue

					if sample is not None and random.randint(1, 100) > sample:
						continue

				pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
				if last_pos != pos:
					if to_json:
						if start > 0:
							start -= 1
							continue

						if limit is not None and snvs_count >= limit:
							if doc is not None:
								json.dump(doc, f)
								f.write("\n")
							break

					snvs_count += 1

				rows_count += 1

				ann = row["annotations"]
				scores = row["scores"]

				if to_json:
					tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] +
										[(k,scores[k]) for k in predictors])

					if pos != last_pos:
						if doc is not None:
							if sample is None or random.randint(1, 100) <= sample:
								json.dump(doc, f)
								f.write("\n")
							else:
								snvs_count -= 1

						doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] +
										[("transcripts", [tdoc])])
					else:
						doc["transcripts"] += [tdoc]

				else:
					tsv.write_line(f,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

				progress.update()

				last_pos = pos

				if not to_json and limit is not None and rows_count >= limit:
					break

		progress.log_totals()

		logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
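
In JSON mode, export.py groups consecutive per-transcript rows that share the same SNV position into one document with a "transcripts" list, flushing the previous document each time the position changes. Here is a compact self-contained sketch of that grouping, with made-up rows and only the grouping fields kept:

# Sketch of the group-consecutive-rows-by-position logic used by export.py
# in JSON mode. The rows below are made-up illustrations.
import json

rows = [
	{"chr": "1", "start": 100, "ref": "A", "alt": "G", "transcript": "T1"},
	{"chr": "1", "start": 100, "ref": "A", "alt": "G", "transcript": "T2"},
	{"chr": "2", "start": 200, "ref": "C", "alt": "T", "transcript": "T3"},
]

doc = None
last_pos = None
for row in rows:
	pos = (row["chr"], row["start"], row["ref"], row["alt"])
	tdoc = {"transcript": row["transcript"]}
	if pos != last_pos:
		if doc is not None:
			print(json.dumps(doc))  # flush the previous SNV document
		doc = {"chr": row["chr"], "start": row["start"],
			   "ref": row["ref"], "alt": row["alt"],
			   "transcripts": [tdoc]}
	else:
		doc["transcripts"].append(tdoc)
	last_pos = pos

if doc is not None:
	print(json.dumps(doc))  # flush the last document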