Example #1
def extract_snvs(fanns_db, data_path, logger=None):

	logger = logger or logging.getLogger("perf-cosmic")

	snvs = dict()

	logger.info("Reading mutations ...")
	
	progress = RatedProgress(logger, name="mutations")
	
	with tsv.open(data_path, "r") as df:
		columns = [
			"Genome-wide screen",
			"Mutation Description",
			"Mutation CDS",
			"Mutation AA",
			"Mutation GRCh37 genome position",
			"Mutation GRCh37 strand",
			"Accession Number",
			"ID_sample"]

		total_rows = queried_rows = dbfound_rows = 0
		for fields in tsv.rows(df, columns=columns, header=True):
			total_rows += 1
			wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields

			# Unlike the perf-cosmic variant in Example #5, rows are not filtered
			# on the genome-wide screen flag here; screened and non-screened
			# samples are tallied separately below.
			if mut_desc != "Substitution - Missense":
				continue

			queried_rows += 1
			for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
				dbfound_rows += 1
				k = tuple(row[c] for c in ("protein", "aa_pos", "aa_ref", "aa_alt"))
				if k not in snvs:
					snvs[k] = snv = dict(
						transcript=row["transcript"],
						symbol=row["xrefs"]["symbol"],
						msamples=set(), wsamples=set())
				else:
					snv = snvs[k]
				
				if wide_screen == "y":
					snv["wsamples"].add(sample_id)
				else:
					snv["msamples"].add(sample_id)
			
			progress.update()

		progress.log_totals()

	logger.info("Counting the number of samples per mutation ...")
	
	for data in snvs.itervalues():
		data["msamples"] = len(data["msamples"])
		data["wsamples"] = len(data["wsamples"])
    
	logger.info("Total: total_rows={}, queried_rows={}, found_rows={}, protein_changes={}".format(total_rows, queried_rows, dbfound_rows, len(snvs)))

	return snvs
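
These examples lean on a project-specific tsv module (tsv.open, tsv.rows, tsv.header_from_line) that is not part of the standard library. Below is a minimal stand-in, inferred purely from how the helpers are called in these examples; the real module may behave differently (for instance around mapping "-" to stdio, which the help texts mention).

# tsv.py -- hypothetical stand-in for the project's tsv helpers, inferred
# from usage in these examples; not the real implementation.
import io
import gzip

def open(path, mode="r"):
	# Transparently handle gzip-compressed files.
	if path.endswith(".gz"):
		return gzip.open(path, mode)
	return io.open(path, mode)

def header_from_line(line):
	# Return (names, {name: index}) for a tab-separated header line.
	names = line.rstrip("\n").split("\t")
	return names, dict((name, i) for i, name in enumerate(names))

def rows(f, columns=None, header=False):
	# Yield each data line as a list of fields. With header=True the first
	# line is parsed as a header, and `columns` selects and orders the
	# fields yielded from every row.
	indices = None
	if header:
		_, hdr = header_from_line(f.readline())
		if columns is not None:
			indices = [hdr[name] for name in columns]
	for line in f:
		fields = line.rstrip("\n").split("\t")
		yield [fields[i] for i in indices] if indices is not None else fields
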
Example #2
def load_events(f, column_indices, predictors, transforms, stats, logger):
	count = [0, 0]
	last_pos = [[None] * 4 for _ in range(2)]  # independent entry per event type

	id_index = column_indices["ID"]
	pos_indices = [column_indices[name] for name in POS_HEADERS]

	for fields in tsv.rows(f):
		try:
			event_type = EVENT_TYPES[fields[id_index]]
		except KeyError:
			raise ValueError("Unknown event type: {}".format(fields[id_index]))

		current_pos = [fields[i] for i in pos_indices]
		if last_pos[event_type] == current_pos:
			continue

		last_pos[event_type] = current_pos

		count[event_type] += 1

		protein, pos, aa_ref, aa_alt = current_pos
		scores = [score_value(fields[column_indices[p]]) for p in predictors]

		for predictor, score in zip(predictors, scores):
			if score is None or predictor not in stats:
				continue

			if predictor in transforms:
				for expr, func in transforms[predictor]:
					try:
						score = func(score)
					except Exception:
						# log and keep the score as-is; the next transform still runs
						logger.error("Error applying transformation {} to score {}".format(expr, score))

			predictor_stats = stats[predictor]
			(rmin, rmax, dim, vmin, vmax, size, dp, dn) = [predictor_stats[k] for k in [
											"rmin", "rmax", "dim", "vmin", "vmax", "size", "dp", "dn"]]

			r = (score - rmin) / dim
			index = int(r * size) if score < rmax else size - 1

			if vmin is None or score < vmin:
				predictor_stats["vmin"] = score
			if vmax is None or score > vmax:
				predictor_stats["vmax"] = score

			if event_type == HIGH_REC_EVENT:
				dp[index] += 1
			elif event_type == NON_REC_EVENT:
				dn[index] += 1

	return { "high_recurrent" : count[HIGH_REC_EVENT], "non_recurrent" : count[NON_REC_EVENT] }
Example #3
def load_events(f, column_indices, predictors, stats, logger):
	count = [0, 0]
	last_pos = [[None] * 4 for _ in range(2)]  # independent entry per event type

	id_index = column_indices["ID"]
	pos_indices = [column_indices[name] for name in POS_COLUMNS]
	pred_indices = [column_indices[p] for p in predictors]

	for fields in tsv.rows(f):
		try:
			event_type = EVENT_TYPES[fields[id_index]]
		except KeyError:
			raise ValueError("Unknown event type: {}".format(fields[id_index]))

		current_pos = [fields[i] for i in pos_indices]
		if last_pos[event_type] == current_pos:
			continue

		last_pos[event_type] = current_pos

		count[event_type] += 1

		protein, pos, aa_ref, aa_alt = current_pos
		scores = [score_value(fields[pi]) for pi in pred_indices]

		for predictor, score in zip(predictors, scores):
			if score is None or predictor not in stats:
				continue

			if predictor in PREDICTOR_TRANSFORM:
				score = PREDICTOR_TRANSFORM[predictor](score)

			predictor_stats = stats[predictor]
			(rmin, rmax, dim, vmin, vmax, size,
			 dp, dn, tp, tn, fp, fn) = [predictor_stats[k] for k in [
											"rmin", "rmax", "dim", "vmin", "vmax", "size",
											"dp", "dn", "tp", "tn", "fp", "fn"]]

			r = (score - rmin) / dim
			index = int(r * size) if score < rmax else size - 1

			if vmin is None or score < vmin:
				predictor_stats["vmin"] = score
			if vmax is None or score > vmax:
				predictor_stats["vmax"] = score

			if event_type == POS_EVENT:
				dp[index] += 1
				for i in xrange(0, index):
					tp[i] += 1
				for i in xrange(index, size):
					fn[i] += 1

			elif event_type == NEG_EVENT:
				dn[index] += 1
				for i in xrange(0, index):
					fp[i] += 1
				for i in xrange(index, size):
					tn[i] += 1

	return count[POS_EVENT], count[NEG_EVENT]
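
This variant additionally maintains, for every candidate threshold bin i, running confusion counts: a positive event landing in bin `index` is a true positive for every threshold below it (tp[i] for i < index) and a false negative for every threshold at or above it (fn[i] for i >= index), and symmetrically for negative events. The same arrays can be recovered after the fact from the dp/dn histograms alone; a sketch, assuming the indexing conventions above:

def counts_from_histograms(dp, dn):
	# Per-threshold confusion counts equivalent to the incremental updates
	# above: at threshold i, events in bins strictly above i are called
	# positive (tp/fp) and events in bins at or below i negative (fn/tn).
	size = len(dp)
	tp = [sum(dp[i + 1:]) for i in range(size)]
	fn = [sum(dp[:i + 1]) for i in range(size)]
	fp = [sum(dn[i + 1:]) for i in range(size)]
	tn = [sum(dn[:i + 1]) for i in range(size)]
	return tp, fn, fp, tn

# e.g. dp=[0, 2, 1], dn=[3, 0, 1] gives tp=[3, 1, 0] and fp=[1, 1, 0]:
# one (tp, fn, fp, tn) point per candidate threshold -- enough for a ROC curve.
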
Example #4
def main():
	parser = argparse.ArgumentParser(
		description="Filter for the longest transcript")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("len_path", metavar="PATH",
						help="The tsv containing the transcripts length")

	parser.add_argument("data_path", metavar="PATH",
						help="The data file")

	parser.add_argument("out_path", metavar="PATH",
						help="Output file. Use - for standard output.")

	parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
						help="List of columns that conforms the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

	args, logger = cmd.parse_args("filter-transcript")

	try:
		logger.info("Loading transcripts length ...")
		trslen = defaultdict(int)
		with tsv.open(args.len_path) as f:
			for name, length in tsv.rows(f):
				trslen[name] = int(length)  # lengths are read as strings

		logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

		total_count = filter_count = 0

		progress = RatedProgress(logger, name="mutations")

		key_columns = args.key.split(",")
		with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
			hdr_line = df.readline()
			of.write(hdr_line)
			_, hdr = tsv.header_from_line(hdr_line)
			key_indices = [hdr[name] for name in key_columns]
			trs_index = hdr["TRANSCRIPT"]

			last_key = None
			longest = (0, "")

			for line in df:
				total_count += 1

				fields = line.rstrip("\n").split("\t")
				key = tuple([fields[index] for index in key_indices])
				trs = fields[trs_index]

				tl = trslen[trs]

				if last_key != key:
					if last_key is not None:
						of.write(longest[1])
						filter_count += 1
					longest = (tl, line)
					last_key = key

				elif tl > longest[0]:
					longest = (tl, line)

				progress.update()

			if last_key is not None:  # flush the last group; guards against empty input
				of.write(longest[1])
				filter_count += 1

		progress.log_totals()

		logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
			total_count, filter_count, total_count - filter_count, progress.elapsed_time))
	except:
		cmd.handle_error()

	return 0
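
The command keeps, for each protein-change key, only the row whose transcript is longest, and it relies on the input being grouped by key (a group is flushed whenever the key changes). The same reduction on in-memory rows, sketched with itertools.groupby under that same already-grouped assumption; the rows below are illustrative placeholders:

from itertools import groupby

def longest_per_group(rows, key_fn, length_fn):
	# groupby only merges adjacent equal keys, so rows must arrive grouped,
	# exactly as the filter above assumes; ties keep the first row seen.
	for _, group in groupby(rows, key=key_fn):
		yield max(group, key=length_fn)

rows = [
	("BRCA1", "ENST0001", 500),
	("BRCA1", "ENST0002", 1200),
	("TP53", "ENST0003", 800),
]
# keeps ("BRCA1", "ENST0002", 1200) and ("TP53", "ENST0003", 800)
filtered = list(longest_per_group(rows, lambda r: r[0], lambda r: r[2]))
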
Example #5
def main():
	parser = argparse.ArgumentParser(
		description="Generate datasets needed to evaluate performance from Cosmic mutations")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("data_path", metavar="PATH",
						help="The CosmicMutantExport tsv file")

	parser.add_argument("cgc_path", metavar="PATH",
						help="The list of CGC genes")

	parser.add_argument("drivers_path", metavar="PATH",
						help="The list of CHASM drivers (drivers.tmps)")

	parser.add_argument("-o", dest="prefix", metavar="PREFIX",
						help="Output prefix.")

	args, logger = cmd.parse_args("perf-cosmic")

	prefix = args.prefix or "cosmic-"

	fanns_db = cmd.open_db()

	try:
		snvs = dict()

		logger.info("Counting the number of samples per mutation ...")
		with tsv.open(args.data_path, "r") as df:
			columns = [
				#"Genome-wide screen",
				"Mutation Description",
				"Mutation CDS",
				"Mutation AA",
				"Mutation GRCh37 genome position",
				"Mutation GRCh37 strand",
				"Accession Number",
				"ID_sample"]

			total_rows = queried_rows = 0
			for fields in tsv.rows(df, columns=columns, header=True):
				total_rows += 1
				#wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				wide_screen = "y"  # screen filtering disabled: the column is commented out above
				if wide_screen != "y" or mut_desc != "Substitution - Missense":
					continue

				queried_rows += 1
				for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
					k = tuple(row[c] for c in ("protein", "aa_pos", "aa_ref", "aa_alt"))
					if k not in snvs:
						symbol = row["xrefs"]["symbol"]
						snvs[k] = dict(
							transcript=row["transcript"],
							symbol=symbol,
							samples=set([sample_id]))
					else:
						snvs[k]["samples"].add(sample_id)

		logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(total_rows, queried_rows, len(snvs)))

		logger.info("Loading CGC genes ...")
		cgc_genes = set()
		with open(args.cgc_path, "r") as f:
			for line in f:
				cgc_genes.add(line.rstrip("\n"))

		logger.info("Loading CHASM drivers ...")
		drivers = set()
		with open(args.drivers_path, "r") as f:
			for line in f:
				drivers.add(line.rstrip("\n").split("\t")[0])

		logger.info("Creating datasets ...")

		progress = RatedProgress(logger, name="mutations")

		with Dataset(prefix + "1") as rec1,\
			Dataset(prefix + "2") as rec2,\
			Dataset(prefix + "4") as rec4,\
			Dataset(prefix + "CGC") as cgc,\
			Dataset(prefix + "noCGC") as nocgc,\
			Dataset(prefix + "D") as drv,\
			Dataset(prefix + "O") as nodrv:

			for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
				num_samples = len(snv["samples"])
				line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])
				if num_samples == 1:
					rec1.write(line)
				if num_samples >= 2:
					rec2.write(line)
				if num_samples >= 4:
					rec4.write(line)
				
				symbol = snv["symbol"]
				if symbol is not None and ((isinstance(symbol, basestring) and symbol in cgc_genes) or len(set(symbol) & cgc_genes) > 0):
					cgc.write(line)
				elif num_samples == 1:
					nocgc.write(line)
			
				if snv["transcript"] in drivers:
					drv.write(line)
				elif num_samples == 1:
					nodrv.write(line)
                    
				progress.update()

			progress.log_totals()

			logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
				rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))

	except:
		cmd.handle_error()

	return 0
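
The Dataset context manager is project code; the attributes and methods used above (name, size, write, and `with` support) suggest roughly the following shape. A hypothetical stand-in, not the real class:

class Dataset(object):
	# Hypothetical stand-in inferred from usage: writes one record per line
	# to a file called `name` and counts the records written in `size`.
	def __init__(self, name):
		self.name = name
		self.size = 0
		self._f = None

	def __enter__(self):
		self._f = open(self.name, "w")
		return self

	def write(self, line):
		self._f.write(line + "\n")
		self.size += 1

	def __exit__(self, exc_type, exc_value, traceback):
		self._f.close()
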