Example #1
def extract_snvs(fanns_db, data_path, logger=None):

	logger = logger or logging.getLogger("perf-cosmic")

	snvs = dict()

	logger.info("Reading mutations ...")
	
	progress = RatedProgress(logger, name="mutations")
	
	with tsv.open(data_path, "r") as df:
		columns = [
			"Genome-wide screen",
			"Mutation Description",
			"Mutation CDS",
			"Mutation AA",
			"Mutation GRCh37 genome position",
			"Mutation GRCh37 strand",
			"Accession Number",
			"ID_sample"]

		total_rows = queried_rows = dbfound_rows = 0
		for fields in tsv.rows(df, columns=columns, header=True):
			total_rows += 1
			wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields

			# wide_screen is not used to filter rows here; below it only decides whether the sample is counted in wsamples or msamples
			if mut_desc != "Substitution - Missense":
				continue

			queried_rows += 1
			for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
				dbfound_rows += 1
				k = tuple([row[k] for k in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
				if k not in snvs:
					snvs[k] = snv = dict(
						transcript=row["transcript"],
						symbol=row["xrefs"]["symbol"],
						msamples=set(), wsamples=set())
				else:
					snv = snvs[k]
				
				if wide_screen == "y":
					snv["wsamples"].add(sample_id)
				else:
					snv["msamples"].add(sample_id)
			
			progress.update()

		progress.log_totals()

	logger.info("Counting the number of samples per mutation ...")
	
	for data in snvs.itervalues():
		data["msamples"] = len(data["msamples"])
		data["wsamples"] = len(data["wsamples"])
    
	logger.info("Total: total_rows={}, queried_rows={}, found_rows={}, protein_changes={}".format(total_rows, queried_rows, dbfound_rows, len(snvs)))

	return snvs
Example #2
def main():
	parser = argparse.ArgumentParser(
		description="Export SNV's")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="DEST",
						help="The destination file. Use - for standard output.")

	args, logger = cmd.parse_args("export-snvs")

	db = cmd.open_db()

	logger.info("Exporting SNV's ...")

	total_count = 0
	total_start_time = time.time()

	try:
		progress = RatedProgress(logger, name="SNVs")
		rows_count = 0
		with tsv.open(args.dest_path, "w") as f:
			for snv in db.snvs():
				rows_count += 1

				tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"], "{}>{}".format(snv["ref"], snv["alt"]), "S")

				progress.update()

		log.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #3
def main():
    parser = argparse.ArgumentParser(description="Map score values")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    cmd.add_transform_args()

    parser.add_argument(
        "--skip-empty-scores",
        dest="skip_empty_scores",
        action="store_true",
        default=False,
        help="Skip transformation for empty scores",
    )

    args, logger = cmd.parse_args("scores-transform")

    db = cmd.open_db()

    try:
        transforms = cmd.get_transforms()

        predictors = transforms.keys()

        logger.info("Transforming scores ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors):
            rows_count += 1

            scores = row["scores"]
            upd_scores = {}

            for predictor in transforms:
                score = scores[predictor]
                if args.skip_empty_scores and score is None:
                    continue

                prev_score = score
                for name, func in transforms[predictor]:
                    try:
                        score = func(score)
                    except:
                        raise Exception("Error transforming the {} score {} with {}".format(predictor, score, name))

                if prev_score != score:
                    upd_scores[predictor] = score

            if len(upd_scores) > 0:
                db.update_scores(row["id"], upd_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")

        db.commit()

        logger.info(
            "Finished. Total rows = {}, updated rows = {}, elapsed time = {}".format(
                rows_count, updated_count, progress.elapsed_time
            )
        )
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
Example #4
def create_datasets(snvs, cgc_path, tdrivers_path, pdrivers_path, output_prefix, logger=None):

	logger = logger or logging.getLogger("perf-cosmic")

	prefix = output_prefix or "cosmic-"

	logger.info("Loading CGC genes ...")
	cgc_genes = set()
	with open(cgc_path, "r") as f:
		for line in f:
			cgc_genes.add(line.rstrip("\n"))

	logger.info("Loading TD drivers ...")
	tdrivers = set()
	with open(tdrivers_path, "r") as f:
		for line in f:
			tdrivers.add(line.rstrip("\n").split("\t")[0])

	logger.info("Loading PD drivers ...")
	pdrivers = set()
	with open(pdrivers_path, "r") as f:
		for line in f:
			pdrivers.add(line.rstrip("\n").split("\t")[0])

	logger.info("Creating datasets ...")

	progress = RatedProgress(logger, name="mutations")

	with Dataset(prefix + "1") as rec1,\
		Dataset(prefix + "2") as rec2,\
		Dataset(prefix + "4") as rec4,\
		Dataset(prefix + "CGC") as cgc,\
		Dataset(prefix + "noCGC") as nocgc,\
		Dataset(prefix + "TD") as td,\
		Dataset(prefix + "noTD") as notd,\
		Dataset(prefix + "PD") as pd,\
		Dataset(prefix + "noPD") as nopd:

		for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
			num_samples = len(snv["samples"])
			line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])
			symbol = snv["symbol"] or ""
			if isinstance(symbol, basestring):
				symbol = set([symbol])
			elif isinstance(symbol, list):
				symbol = set(symbol)

			if num_samples == 1:
				rec1.write(line)

			if num_samples >= 2:
				rec2.write(line)

			if num_samples >= 4:
				rec4.write(line)

			if len(symbol & cgc_genes) > 0:
				cgc.write(line)
			elif num_samples == 1:
				nocgc.write(line)

			if len(symbol & tdrivers) > 0:
				td.write(line)
			elif num_samples == 1:
				notd.write(line)

			if len(symbol & pdrivers) > 0:
				pd.write(line)
			elif num_samples == 1:
				nopd.write(line)

			progress.update()

		progress.log_totals()

		logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
			rec1, rec2, rec4, cgc, nocgc, td, notd, pd, nopd]])))
Example #5
def main():
	parser = argparse.ArgumentParser(
		description="Fetch Condel scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("muts_path", metavar="SNVS_PATH",
						help="SNV's to check. Use - for standard input.")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="The results path. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	args, logger = cmd.parse_args("fetch")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

	try:
		progress = RatedProgress(logger, name="SNVs")

		with tsv.open(args.muts_path) as f:
			with tsv.open(args.out_path, "w") as wf:
				tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

				hit = fail = 0

				mut = DnaAndProtMutationParser()
				for line_num, line in enumerate(f, start=1):
					line = line.rstrip(" \n\r")
					if len(line) == 0 or line.startswith("#"):
						continue

					try:
						mut.parse(line)
					except PrematureEnd:
						logger.error("Missing fields at line {}".format(line_num))
						fail += 1
						continue
					except UnexpectedToken as ex:
						logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
						fail += 1
						continue

					exists = False
					for row in query_mutation(logger, db, mut, annotations, predictors):

						exists = True

						ann = row["annotations"]
						scores = row["scores"]

						tsv.write_line(wf, mut.identifier,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

						"""
						if logger.isEnabledFor(logging.DEBUG):
							logger.debug("    --> {} {} {} {} {} {} {} {} {} {}".format(
										row["chr"], row["start"], row["ref"], row["alt"], row["transcript"],
										row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"],
										mut.identifier or "*"))
						"""

					progress.update()

					if exists:
						hit += 1
					else:
						fail += 1

		progress.log_totals()

		logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()
Example #6
def main():
	parser = argparse.ArgumentParser(
		description="Update scores in the database")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	cmd.add_selected_predictors_args()

	parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
						help="Update of the predictors.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("update")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors(check_missing=False)

	try:
		progress = RatedProgress(logger, name="SNVs")

		total_lines = 0

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header
			hdr_line = f.readline()
			hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

			db_predictors = set([p["id"] for p in db.predictors()])

			if len(predictors) == 0:
				predictors = [name for name in hdr if name in db_predictors]
				if len(predictors) == 0:
					raise Exception("Any input file header match the available predictors in the database. Please specify them using -p.")

			logger.info("Predictors: {}".format(", ".join(predictors)))

			for predictor in filter(lambda p: p not in db_predictors, predictors):
				logger.info("Creating predictor {} ...".format(predictor))
				db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
			use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

			if not use_genome_coords and not use_protein_coords:
				raise Exception("No coordinate columns found. "
								"Use {} for genomic coordinates or {} for protein coordinates.".format(
									GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
			elif use_genome_coords and use_protein_coords:
				logger.warn("Both, genomic and protein coordinates columns found. Using genomic coordinates by default")

			if use_genome_coords:
				coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
				coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_dna
			elif use_protein_coords:
				coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
				coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]
				#get_rows = db.get_transcripts_by_protein

			coord_column_indices = [hdr[n] for n in coord_column_names]
			score_indices = [hdr[n] for n in predictors]
			max_column_index = max(coord_column_indices + score_indices)

			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				if len(fields) < max_column_index:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns at line {}".format(line_num))
					continue

				try:
					coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip(
													coord_column_names, coord_column_types, coord_column_indices)])

					scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					continue

				try:
					for row in db.query_scores(fields=[], **coords):
						db.update_scores(row["id"], scores)
				except Exception as ex:
					logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
					logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
					if not args.ignore_errors:
						raise

				progress.update()

		progress.log_totals()

		logger.info("Finalizing database ...")

		if args.update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #7
def main():
	parser = argparse.ArgumentParser(
		description="Filter for the longest transcript")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("len_path", metavar="PATH",
						help="The tsv containing the transcripts length")

	parser.add_argument("data_path", metavar="PATH",
						help="The data file")

	parser.add_argument("out_path", metavar="PATH",
						help="Output file. Use - for standard output.")

	parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
						help="List of columns that conforms the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

	args, logger = cmd.parse_args("filter-transcript")

	try:
		logger.info("Loading transcripts length ...")
		trslen = defaultdict(int)
		with tsv.open(args.len_path) as f:
			for name, length in tsv.rows(f):
				trslen[name] = int(length)  # cast to int so the length comparison below is numeric

		logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

		total_count = filter_count = 0

		progress = RatedProgress(logger, name="mutations")

		key_columns = args.key.split(",")
		with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
			hdr_line = df.readline()
			of.write(hdr_line)
			_, hdr = tsv.header_from_line(hdr_line)
			key_indices = [hdr[name] for name in key_columns]
			trs_index = hdr["TRANSCRIPT"]

			last_key = None
			longest = (0, "")

			for line in df:
				total_count += 1

				fields = line.rstrip("\n").split("\t")
				key = tuple([fields[index] for index in key_indices])
				trs = fields[trs_index]

				tl = trslen[trs]

				if last_key != key:
					if last_key is not None:
						of.write(longest[1])
						filter_count += 1
					longest = (tl, line)
					last_key = key

				elif tl > longest[0]:
					longest = (tl, line)

				progress.update()

			filter_count += 1
			of.write(longest[1])

		progress.log_totals()

		logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
			total_count, filter_count, total_count - filter_count, progress.elapsed_time))
	except:
		cmd.handle_error()

	return 0
Example #8
def main():
	parser = argparse.ArgumentParser(
		description="Calculate TransFIC labels")

	cmd = Command.withtraits(DbTrait, PredictorsInDbTrait)(parser)
	
	cmd.add_db_args()
	
	parser.add_argument("cutoffs_path", metavar="CUTOFFS",
						help="File containing the cutoffs")

	cmd.add_selected_predictors_args()

	parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
						help="Updated predictor names")

	args, logger = cmd.parse_args("calc-label")

	db = cmd.open_db()

	try:
		logger.info("Loading state ...")

		state = load_weights(args.cutoffs_path)

		avail_predictors, stats = [state[k] for k in ["predictors", "stats"]]

		predictors = cmd.get_selected_predictors(default_all=True)
		missing_predictors = [p for p in predictors if p not in set(avail_predictors)]
		if len(missing_predictors) > 0:
			raise Exception("Missing cutoff stats for predictors: {}".format(", ".join(missing_predictors)))

		if args.updated_predictors is not None:
			# assumes -u takes a comma-separated list of names
			updated_names = args.updated_predictors.split(",")
			if len(predictors) != len(updated_names):
				raise Exception("The number of selected predictors does not match the number of predictor names to update")
			updated_predictors = dict([(p, u) for p, u in zip(predictors, updated_names)])
		else:
			updated_predictors = dict([(p, "{}_LABEL".format(p)) for p in predictors])

		# create predictors in the database if required

		db_predictors = set([p["id"] for p in db.predictors()])
		for predictor, updated_predictor in updated_predictors.items():
			if updated_predictor not in db_predictors:
				logger.info("Creating predictor {} ...".format(updated_predictor))
				db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

		cutoffs = {}
		for predictor in predictors:
			cutoff_low_mid, cutoff_mid_high = [stats[predictor][v] for v in ["cutoff_low_mid", "cutoff_mid_high"]]
			logger.info("{}: cutoffs: low_mid={}, mid_high={}".format(predictor, cutoff_low_mid, cutoff_mid_high))
			cutoffs[predictor] = (cutoff_low_mid, cutoff_mid_high)

		logger.info("Calculating ...")

		progress = RatedProgress(logger, name="SNVs")

		for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
			scores = row["scores"]
			uscores = {}
			for predictor in predictors:
				score = scores[predictor]
				if score is None:
					continue

				cutoff_low_mid, cutoff_mid_high = cutoffs[predictor]
				updated_predictor = updated_predictors[predictor]

				uscores[updated_predictor] = 0.0 if score < cutoff_low_mid else 1.0 if score < cutoff_mid_high else 2.0

			if len(uscores) > 0:
				db.update_scores(row["id"], uscores)

			progress.update()

		db.commit()
	except:
		cmd.handle_error()

	return 0
Example #9
def main():
	parser = argparse.ArgumentParser(
		description="Import scores into the database")

	cmd = Command.withtraits(DbTrait, PredictorsTrait)(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	# TODO: document which columns hold the coordinates

	cmd.add_selected_predictors_args()

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	parser.add_argument("--skip-update-predictors", dest="skip_update_predictors", action="store_true", default=False,
						help="Skip the update of the predictors.")

	parser.add_argument("--skip-create-index", dest="skip_create_index", action="store_true", default=False,
						help="Skip the creation of the database indices.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("import")

	db = cmd.open_db()

	try:
		progress = RatedProgress(logger, name="SNVs")

		total_lines = 0

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header
			hdr_line = f.readline()
			hdr = {}
			for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
				hdr[name] = index

			# Predictors to update from the user selection and source availability
			db_predictors = set([p["id"] for p in db.predictors()])
			src_predictors = [name for name in hdr if name not in COORD_COLUMNS]
			predictors = cmd.get_selected_predictors(available_predictors=src_predictors)
			for predictor in predictors:
				if predictor not in db_predictors:
					logger.info("Creating non existing predictor: {}".format(predictor))
					db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			logger.info("Predictors: {}".format(", ".join(predictors)))

			all_columns = COORD_COLUMNS + predictors
			types = COORD_TYPES + ([score_value] * len(predictors))

			missing_columns = [name for name in all_columns if name not in hdr]
			if len(missing_columns) > 0:
				raise Exception("The following columns are missing: {}".format(", ".join(missing_columns)))

			columns = [hdr[name] for name in all_columns]
			max_column = max(columns)

			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				if len(fields) < max_column:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns at line {}".format(line_num))
					continue

				try:
					fields = [type_cast(fields[index]) for type_cast, index in zip(types, columns)]
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					continue

				(chr, strand, start, ref, alt, transcript,
				 aa_pos, aa_ref, aa_alt, protein) = fields[:10]

				scores = fields[10:]

				if args.skip_empty_scores and sum([0 if s is None else 1 for s in scores]) == 0:
					continue

				try:
					db.add_snv(
								chr=chr, strand=strand, start=start, ref=ref, alt=alt, transcript=transcript,
								protein=protein, aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt,
								scores=dict(zip(predictors, scores)))
				except Exception as ex:
					logger.error("Error importing SNV at line {}: {}".format(line_num, str(ex)))
					if not args.ignore_errors:
						raise

				progress.update()

			total_lines += line_num

		progress.log_totals()

		logger.info("Finalizing database ...")

		if not args.skip_update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		if not args.skip_create_index:
			logger.info("Creating indices ...")
			db.create_indices()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #10
def main():
	parser = argparse.ArgumentParser(
		description="Export Scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="OUTPUT_PATH",
						help="The output file. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	parser.add_argument("--json", dest="to_json", action="store_true", default=False,
						help="Export the results in json format")

	parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
						help="Export a random sample of PCT %%")

	parser.add_argument("--start", dest="start", type=int, metavar="N",
						help="Start to export from the SNV number N")

	parser.add_argument("--limit", dest="limit", type=int, metavar="N",
						help="Limit the number of SNVs to export to N")

	args, logger = cmd.parse_args("export")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Exporting ...")

	random.seed(time.time())

	total_count = 0
	total_start_time = time.time()

	try:
		progress = RatedProgress(logger, name="SNVs")

		to_json = args.to_json
		sample = args.sample
		start = args.start or 0
		limit = args.limit

		doc = None
		last_pos = None
		rows_count = 0
		snvs_count = 0
		with tsv.open(args.dest_path, "w") as f:

			if not to_json:
				tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

			for row in db.query_scores(predictors=predictors, maps=annotations):

				if not to_json:
					if start > 0:
						start -= 1
						continue

					if sample is not None and random.randint(1, 100) > sample:
						continue

				pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
				if last_pos != pos:
					if to_json:
						if start > 0:
							start -= 1
							continue

						if limit is not None and snvs_count >= limit:
							if doc is not None:
								json.dump(doc, f)
								f.write("\n")
							break

					snvs_count += 1

				rows_count += 1

				ann = row["annotations"]
				scores = row["scores"]

				if to_json:
					tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] +
										[(k,scores[k]) for k in predictors])

					if pos != last_pos:
						if doc is not None:
							if sample is None or random.randint(1, 100) <= sample:
								json.dump(doc, f)
								f.write("\n")
							else:
								snvs_count -= 1

						doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] +
										[("transcripts", [tdoc])])
					else:
						doc["transcripts"] += [tdoc]

				else:
					tsv.write_line(f,
							   *[row[c] for c in columns]
							   + [ann[a] for a in annotations]
							   + [scores[p] for p in predictors])

				progress.update()

				last_pos = pos

				if not to_json and limit is not None and rows_count >= limit:
					break

		progress.log_totals()

		logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Example #11
def main():
	parser = argparse.ArgumentParser(
		description="Generate datasets needed to evaluate performance from Cosmic mutations")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("data_path", metavar="PATH",
						help="The CosmicMutantExport tsv file")

	parser.add_argument("cgc_path", metavar="PATH",
						help="The list of CGC genes")

	parser.add_argument("drivers_path", metavar="PATH",
						help="The list of CHASM drivers (drivers.tmps)")

	parser.add_argument("-o", dest="prefix", metavar="PREFIX",
						help="Output prefix.")

	args, logger = cmd.parse_args("perf-cosmic")

	prefix = args.prefix or "cosmic-"

	fanns_db = cmd.open_db()

	try:
		snvs = dict()

		logger.info("Counting the number of samples per mutation ...")
		with tsv.open(args.data_path, "r") as df:
			columns = [
				#"Genome-wide screen",
				"Mutation Description",
				"Mutation CDS",
				"Mutation AA",
				"Mutation GRCh37 genome position",
				"Mutation GRCh37 strand",
				"Accession Number",
				"ID_sample"]

			total_rows = queried_rows = 0
			for fields in tsv.rows(df, columns=columns, header=True):
				total_rows += 1
				#wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				wide_screen = "y"  # the genome-wide screen filter is effectively disabled
				if wide_screen != "y" or mut_desc != "Substitution - Missense":
					continue

				queried_rows += 1
				for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
					k = tuple([row[k] for k in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
					if k not in snvs:
						symbol = row["xrefs"]["symbol"]
						snvs[k] = dict(
							transcript=row["transcript"],
							symbol=symbol,
							samples=set([sample_id]))
					else:
						snvs[k]["samples"].add(sample_id)

		logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(total_rows, queried_rows, len(snvs)))

		logger.info("Loading CGC genes ...")
		cgc_genes = set()
		with open(args.cgc_path, "r") as f:
			for line in f:
				cgc_genes.add(line.rstrip("\n"))

		logger.info("Loading CHASM drivers ...")
		drivers = set()
		with open(args.drivers_path, "r") as f:
			for line in f:
				drivers.add(line.rstrip("\n").split("\t")[0])

		logger.info("Creating datasets ...")

		progress = RatedProgress(logger, name="mutations")

		with Dataset(prefix + "1") as rec1,\
			Dataset(prefix + "2") as rec2,\
			Dataset(prefix + "4") as rec4,\
			Dataset(prefix + "CGC") as cgc,\
			Dataset(prefix + "noCGC") as nocgc,\
			Dataset(prefix + "D") as drv,\
			Dataset(prefix + "O") as nodrv:

			for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
				num_samples = len(snv["samples"])
				line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])
				if num_samples == 1:
					rec1.write(line)
				if num_samples >= 2:
					rec2.write(line)
				if num_samples >= 4:
					rec4.write(line)
				
				symbol = snv["symbol"]
				if symbol is not None and ((isinstance(symbol, basestring) and symbol in cgc_genes) or len(set(symbol) & cgc_genes) > 0):
					cgc.write(line)
				elif num_samples == 1:
					nocgc.write(line)
			
				if snv["transcript"] in drivers:
					drv.write(line)
				elif num_samples == 1:
					nodrv.write(line)
                    
				progress.update()

			progress.log_totals()

			logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
				rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))

	except:
		cmd.handle_error()

	return 0
Example #12
def main():
    parser = argparse.ArgumentParser(description="Calculate TransFIC for the selected scores")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait, TransformsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument(
        "feature_name",
        metavar="FEATURE_COLUMN",
        help="The column name with the features. It can be transcript, protein or any of the available annotations.",
    )

    parser.add_argument("blt_path", metavar="BLT_PATH", help="The baseline tolerance statistics.")

    cmd.add_selected_predictors_args()

    parser.add_argument(
        "-u", "--updated-predictors", dest="updated_predictors", metavar="NAME", help="Updated predictor names"
    )

    cmd.add_transform_args()

    args, logger = cmd.parse_args("calc")

    db = cmd.open_db()

    # initialize feature selection

    db_annotations = [a["id"] for a in db.maps()]
    if args.feature_name not in set(["transcript", "protein"] + db_annotations):
        logger.error("Feature name not available in the database: {}".format(args.feature_name))
        logger.error("Available annotations: {}".format(", ".join(db_annotations)))
        exit(-1)

    if args.feature_name.lower() in ["transcript", "protein"]:
        annotations = None
        feature_getter = lambda row: row[args.feature_name]
    else:
        annotations = [args.feature_name]
        feature_getter = lambda row: row["annotations"][args.feature_name]

    # predictors, transforms, and updated_predictors

    predictors = cmd.get_selected_predictors(default_all=True)

    transforms = cmd.get_transforms()

    if args.updated_predictors is not None:
        # assumes -u takes a comma-separated list of names
        updated_names = args.updated_predictors.split(",")
        if len(predictors) != len(updated_names):
            logger.error("The number of selected predictors does not match the number of predictor names to update")
            exit(-1)
        updated_predictors = dict([(p, u) for p, u in zip(predictors, updated_names)])
    else:
        updated_predictors = dict([(p, "TFIC_{}".format(p)) for p in predictors])

    # create predictors in the database if required

    db_predictors = set([p["id"] for p in db.predictors()])
    for predictor, updated_predictor in updated_predictors.items():
        if updated_predictor not in db_predictors:
            logger.info("Creating predictor {} ...".format(updated_predictor))
            db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

    try:
        logger.info("Loading baseline tolerance statistics ...")

        with tsv.open(args.blt_path) as f:
            doc = json.load(f)
            blt_predictors = doc["predictors"]
            features = doc["features"]
            blt_stats = doc["blt"]
            num_predictors = len(blt_predictors)

        logger.info("  Predictors: {}".format(", ".join(blt_predictors)))
        logger.info("  Features: {}".format(len(features)))

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors, maps=annotations):
            rows_count += 1

            scores = row["scores"]

            feature = feature_getter(row)
            if feature not in blt_stats:
                continue

            feature_stats = blt_stats[feature]

            tfic_scores = calculate_tfic(predictors, updated_predictors, feature_stats, scores, transforms)

            if len(tfic_scores) > 0:
                db.update_scores(row["id"], tfic_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")

        db.commit()

        logger.info(
            "Finished. Total rows = {}, updated rows = {}, elapsed_time = {}".format(
                rows_count, updated_count, progress.elapsed_time
            )
        )

    except:
        cmd.handle_error()

    return 0
Example #13
def fetch_iter(db, muts_path, maps=None, predictors=None, muts_header=False, state=None, logger=None):
	"""
	Iterator that fetches scores from the database from the mutations in a file.
	
	:param db: FannsDb interface.
	:param muts_path: The input path for mutations.
	:param maps: Map transcript/protein ensembl identifiers with external identifiers (swissprot_id, ...)
	:param predictors: Predictors for which to obtain the scores.
	:param muts_header: Whether the muts_path has a header or not.
	:param state: The state of the iteration: hits, fails.
	:param logger: Logger to use. If not specified a new one is created.
	"""

	def query_mutation(logger, db, mut, maps, predictors):

		if mut.coord == Mutation.GENOMIC:
			if logger.isEnabledFor(logging.DEBUG):
				logger.debug("  Querying {} {} {} {} {} {} {} ...".format(
					mut.chr, mut.start, mut.end or "*", mut.ref or "*", mut.alt, mut.strand or "*", mut.identifier or "*"))

			for row in db.query_scores(chr=mut.chr, start=mut.start,
											ref=mut.ref, alt=mut.alt, strand=mut.strand,
											predictors=predictors, maps=maps):
				yield row

		elif mut.coord == Mutation.PROTEIN:
			if logger.isEnabledFor(logging.DEBUG):
				logger.debug("  Querying {} {} {} {} {} ...".format(
					mut.protein, mut.start, mut.ref or "*", mut.alt, mut.identifier or "*"))

			for row in db.query_scores(protein=mut.protein, aa_pos=mut.start, aa_ref=mut.ref, aa_alt=mut.alt,
											predictors=predictors, maps=maps):
				yield row

		else:
			logger.warn("Unknown coordinates system: {}".format(mut.line))

	if logger is None:
		logger = logging.getLogger("fannsdb.fetch")

	state = state if state is not None else {}
	state[STATE_HITS] = state[STATE_FAILS] = 0
	maps = maps if maps is not None else []
	predictors = predictors if predictors is not None else []
	
	logger.info("Reading {} ...".format(os.path.basename(muts_path) if muts_path != "-" else "from standard input"))

	progress = RatedProgress(logger, name="SNVs")

	with tsv.open(muts_path) as f:
		if muts_header:
			tsv.skip_comments_and_empty(f)  # returns the first line that is neither empty nor a comment (the header)

		mutparser = DnaAndProtMutationParser()
		for line_num, line in enumerate(f, start=1):
			line = line.rstrip(" \n\r")
			if len(line) == 0 or line.startswith("#"):
				continue

			try:
				mut = mutparser.parse(line)
			except PrematureEnd:
				logger.error("Missing fields at line {}".format(line_num))
				state[STATE_FAILS] += 1
				continue
			except UnexpectedToken as ex:
				logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
				state[STATE_FAILS] += 1
				continue

			state.update({
				STATE_LINE_NUM : line_num,
				STATE_LINE : line,
				STATE_MUTATION : mut})

			exists = False
			for row in query_mutation(logger, db, mut, maps, predictors):
				exists = True

				yield row

			progress.update()

			if exists:
				state[STATE_HITS] += 1
			else:
				state[STATE_FAILS] += 1

	progress.log_totals()

	hits, fails = [state[k] for k in [STATE_HITS, STATE_FAILS]]
	logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hits + fails, hits, fails, progress.elapsed_time))