Exemple #1
0
def main():
	"""Entry point of the "pred-update" command.

	Asks the database to update the predictors returned by the command helper
	(requested with default_all=True), commits and logs the elapsed time.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	The database is closed in both cases.
	"""
	arg_parser = argparse.ArgumentParser(description="Update predictors min, max and count")

	cmd = DefaultCommandHelper(arg_parser)
	cmd.add_db_args()
	cmd.add_selected_predictors_args()

	args, logger = cmd.parse_args("pred-update")

	db = cmd.open_db()
	try:
		selected = cmd.get_selected_predictors(default_all=True)

		logger.info("Updating predictors ...")

		began = datetime.now()
		db.update_predictors(selected)
		db.commit()
		elapsed = datetime.now() - began

		logger.info("Finished. elapsed={}".format(elapsed))
	except:
		# The command helper reports the active exception and supplies the exit status
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #2
0
def main():
	"""Entry point of the "perf-cosmic" command.

	Extracts the SNVs from the Cosmic export file and creates the evaluation
	datasets from them together with the CGC, TD drivers and PD drivers lists.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	The database is always closed before returning.
	"""
	parser = argparse.ArgumentParser(
		description="Generate datasets needed to evaluate performance from Cosmic mutations")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("data_path", metavar="PATH",
		help="The CosmicMutantExport tsv file")

	parser.add_argument("cgc_path", metavar="PATH",
		help="The list of CGC genes")

	parser.add_argument("tdrivers_path", metavar="PATH",
		help="The list of TD drivers")

	parser.add_argument("pdrivers_path", metavar="PATH",
		help="The list of PD drivers")

	parser.add_argument("-o", dest="prefix", metavar="PREFIX",
		help="Output prefix.")

	args, logger = cmd.parse_args("perf-cosmic")

	fanns_db = cmd.open_db()

	try:
		snvs = extract_snvs(fanns_db, args.data_path, logger=logger)

		create_datasets(snvs, args.cgc_path, args.tdrivers_path, args.pdrivers_path, args.prefix)
	except:
		# Propagate the failure status instead of always reporting success
		return cmd.handle_error()
	finally:
		# Release the database connection whatever the outcome
		fanns_db.close()

	return 0
Exemple #3
0
def main():
	"""Entry point of the "ann-rm" command.

	Removes the annotations whose identifiers are given on the command line.
	When "*" is among the identifiers, every annotation listed by the database
	is removed instead. Changes are committed once at the end.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	arg_parser = argparse.ArgumentParser(description="Remove annotations")

	cmd = DefaultCommandHelper(arg_parser)
	cmd.add_db_args()

	arg_parser.add_argument(
		"id", metavar="ID", nargs="+",
		help="Annotation identifier.")

	args, logger = cmd.parse_args("ann-rm")

	db = cmd.open_db()
	try:
		if "*" not in args.id:
			# Explicit list of identifiers
			for requested_id in args.id:
				logger.info("Removing annotation {} ...".format(requested_id))
				db.remove_map(requested_id)
		else:
			# Wildcard: drop every annotation known to the database
			logger.info("Removing all the annotations ...")
			for annotation in db.maps():
				logger.info("  {} {} ...".format(annotation["id"], annotation["name"]))
				db.remove_map(annotation["id"])

		db.commit()
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #4
0
def main():
	"""Entry point of the "create" command.

	Creates a new functional scores database, registers each predictor given
	on the command line as a source predictor, marks the database as
	initialized and commits.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	arg_parser = argparse.ArgumentParser(description="Create a functional scores database")

	cmd = DefaultCommandHelper(arg_parser)
	cmd.add_db_args()

	arg_parser.add_argument(
		"predictors", metavar="PREDICTORS", nargs="*",
		help="Predictor identifiers")

	args, logger = cmd.parse_args("create")

	db = cmd.create_db()
	try:
		for new_predictor in args.predictors:
			logger.info("Adding predictor {} ...".format(new_predictor))
			db.add_predictor(new_predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

		# Flag the database as ready only after all the predictors were added
		db.set_initialized()
		db.commit()
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #5
0
def main():
	"""Entry point of the "index" command.

	Runs the requested index operations ("drop" and/or "create") in the order
	given on the command line and logs the total elapsed time.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	arg_parser = argparse.ArgumentParser(description="Manipulate database indices")

	cmd = DefaultCommandHelper(arg_parser)
	cmd.add_db_args()

	arg_parser.add_argument(
		"ops", metavar="OPERATIONS", nargs="+", choices=["drop", "create"],
		help="The operations to perform on the indices.")

	args, logger = cmd.parse_args("index")

	db = cmd.open_db()
	try:
		began = datetime.now()

		# Operation name -> (log message, database call)
		actions = {
			"drop": ("Dropping indices ...", db.drop_indices),
			"create": ("Creating indices ...", db.create_indices),
		}

		for requested in args.ops:
			action = actions.get(requested)
			if action is None:
				continue
			message, run = action
			logger.info(message)
			run()

		logger.info("Done. Elapsed time: {}".format(datetime.now() - began))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #6
0
def main():
	"""Entry point of the "pred-list" command.

	Prints the predictors stored in the database to standard output, either
	as a tab separated table (default) or as a JSON object keyed by predictor
	id when --json is given.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="List predictors")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("--json", dest="to_json", action="store_true", default=False,
		help="Print the results in json format")

	args, log = cmd.parse_args("pred-list")

	db = cmd.open_db()

	try:
		# print(...) with a single argument behaves the same on Python 2 and 3
		if args.to_json:
			import json
			json_fields = ["type", "source", "min", "max", "count"]
			d = {}
			for pred in db.predictors():
				d[pred["id"]] = {k: pred[k] for k in json_fields}
			print(json.dumps(d, indent=True))
		else:
			table_fields = ["id", "type", "source", "min", "max", "count"]
			print("\t".join(["ID", "TYPE", "SOURCE", "MIN", "MAX", "COUNT"]))
			for pred in db.predictors():
				print("\t".join([str(pred[k]) for k in table_fields]))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #7
0
def main():
    """Entry point of the "ann-add" command.

    Creates an annotation (map) in the database and loads its items from a
    two column tsv file. Lines where either column is empty are skipped.
    Everything is committed once at the end.

    Returns 0 on success, otherwise the value returned by cmd.handle_error().
    """
    parser = argparse.ArgumentParser(description="Add annotations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("id", metavar="ID", help="Annotation identifier.")

    parser.add_argument("name", metavar="NAME", help="Annotation name.")

    parser.add_argument(
        "type", metavar="TYPE", choices=["transcript", "protein"], help="Annotation type: transcript, protein"
    )

    parser.add_argument("path", metavar="PATH", help="Annotation items")

    # type=int keeps a command line value consistent with the integer default;
    # without it the priority would reach the database as a string.
    parser.add_argument(
        "--priority",
        dest="priority",
        type=int,
        default=0,
        help="Priority for translating input annotations. 0 means not considered for translation. Default 0.",
    )

    parser.add_argument(
        "--header",
        dest="header",
        action="store_true",
        default=False,
        help="Specify that the annotation items file have a header.",
    )

    args, logger = cmd.parse_args("ann-add")

    db = cmd.open_db()

    try:
        logger.info("Creating annotation {} ...".format(args.name))

        db.add_map(args.id, args.name, args.type, args.priority)

        logger.info("Loading items ...")

        with tsv.open(args.path) as f:
            for source, value in tsv.lines(f, (str, str), header=args.header):
                # Only complete (source, value) pairs are stored
                if len(source) > 0 and len(value) > 0:
                    db.add_map_item(args.id, source, value)

        db.commit()
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
Exemple #8
0
def main():
	"""Entry point of the "export-snvs" command.

	Writes one line per SNV returned by the database to the destination file
	with the columns: chr, start, start, strand, "ref>alt" and the literal "S".

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="Export SNV's")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="DEST",
		help="The destination file. Use - for standard output.")

	# The logger was previously bound as "log" while "logger" was used below,
	# which raised a NameError before anything was exported.
	args, logger = cmd.parse_args("export-snvs")

	db = cmd.open_db()

	logger.info("Exporting SNV's ...")

	try:
		progress = RatedProgress(logger, name="SNVs")
		rows_count = 0
		with tsv.open(args.dest_path, "w") as f:
			for snv in db.snvs():
				rows_count += 1

				# The start position is written twice (second and third columns)
				tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"],
					"{}>{}".format(snv["ref"], snv["alt"]), "S")

				progress.update()

		logger.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #9
0
def main():
	"""Entry point of the "ann-list" command.

	Prints a tab separated table with the id, name, type and priority of
	every annotation (map) in the database.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="List annotations")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	args, logger = cmd.parse_args("ann-list")

	db = cmd.open_db()

	try:
		# print(...) with a single argument behaves the same on Python 2 and 3
		fields = ["id", "name", "type", "priority"]
		print("\t".join(["ID", "NAME", "TYPE", "PRIORITY"]))
		for ann in db.maps():
			print("\t".join([str(ann[k]) for k in fields]))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #10
0
def main():
    """Entry point of the "scores-transform" command.

    For every row returned by the scores query, applies the configured chain
    of transformation functions to each predictor score and stores only the
    scores whose value changed. A single commit is done at the end.

    Returns 0 on success, otherwise the value returned by cmd.handle_error().
    """
    arg_parser = argparse.ArgumentParser(description="Map score values")

    cmd = DefaultCommandHelper(arg_parser)
    cmd.add_db_args()
    cmd.add_transform_args()

    arg_parser.add_argument(
        "--skip-empty-scores",
        dest="skip_empty_scores",
        action="store_true",
        default=False,
        help="Skip transformation for empty scores",
    )

    args, logger = cmd.parse_args("scores-transform")

    db = cmd.open_db()
    try:
        transforms = cmd.get_transforms()
        predictor_ids = transforms.keys()

        logger.info("Transforming scores ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = 0
        updated_count = 0
        for row in db.query_scores(predictors=predictor_ids):
            rows_count += 1

            current_scores = row["scores"]
            changed = {}

            for predictor in transforms:
                original = current_scores[predictor]
                if args.skip_empty_scores and original is None:
                    continue

                # Run the whole chain of transformations for this predictor
                value = original
                for name, func in transforms[predictor]:
                    try:
                        value = func(value)
                    except:
                        message = "Error transforming the {} score {} with {}".format(predictor, value, name)
                        raise Exception(message)

                # Only scores that actually changed are written back
                if original != value:
                    changed[predictor] = value

            if len(changed) > 0:
                db.update_scores(row["id"], changed)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")
        db.commit()

        summary = "Finished. Total rows = {}, updated rows = {}, elapsed time = {}".format(
            rows_count, updated_count, progress.elapsed_time
        )
        logger.info(summary)
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    """Entry point of the "blt-groups" command.

    Loads the groups tree, the group to features mapping and the partial
    statistics per feature (count / sum / sum of squares for each predictor,
    as used by the mean and deviation formulas below). Features with enough
    observations get their statistics calculated directly; the rest are
    delegated to calculate_group(). The result is written to the output file
    with one line per feature.

    Returns 0.
    """
    parser = argparse.ArgumentParser(description="Calculate Baseline Tolerance statistics per gene")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH", help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP", help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH", help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH", help="Partial gene statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH", help="Output gene statistics")

    # Both thresholds are compared against numbers below, so command line
    # values have to be converted; otherwise they would be compared as strings.
    parser.add_argument(
        "-c",
        "--count-threshold",
        dest="count_threshold",
        metavar="N",
        type=int,
        default=DEFAULT_COUNT_THRESHOLD,
        help="Minimum number of features per group",
    )

    parser.add_argument(
        "--stdev-threshold",
        dest="stdev_threshold",
        metavar="V",
        type=float,
        default=DEFAULT_STDEV_THRESHOLD,
        help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)",
    )

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    group_children = defaultdict(set)
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_children[group] |= children

    logger.info("Loading mappings between groups and features ...")

    group_genes = defaultdict(set)
    with tsv.open(args.group_genes_path) as f:
        for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_genes[group] |= genes

    logger.info("Loading partial statistics ...")

    partial_stats = {}
    with tsv.open(args.stats_path) as f:
        # The header holds the predictor names after the first column
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        for line in f:
            fields = line.rstrip("\n").split("\t")
            gene = fields[0]
            # Each cell is "a/b/c": the first value is parsed as int, the others as float
            gene_stats = [[float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))] for ss in fields[1:]]
            partial_stats[gene] = gene_stats

    logger.info("  Predictors: {}".format(", ".join(predictors)))
    logger.info("  Features: {}".format(len(partial_stats)))

    logger.info("Calculating features ...")

    stats = {}

    feat_count = 0
    feat_partial_count = [0] * num_predictors
    for feature, feat_partial_stats in partial_stats.items():
        feat_with_stats = False
        # One slot per predictor plus a last slot read back as the group column on output
        feat_stats = [None] * (num_predictors + 1)
        for i in range(num_predictors):
            s0, s1, s2 = feat_partial_stats[i]

            # The formula below divides by s0 * (s0 - 1), so fewer than two
            # observations cannot be used (it would divide by zero).
            if s0 < 2 or s0 < args.count_threshold:
                continue

            x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
            # Slightly negative values are tolerated as rounding noise
            if x < -1e-12:
                continue

            mean = s1 / s0
            std = math.sqrt(abs(x))
            if std < args.stdev_threshold:
                continue

            feat_stats[i] = (int(s0), mean, std)
            feat_partial_count[i] += 1
            feat_with_stats = True

        if feat_with_stats:
            feat_count += 1
            stats[feature] = feat_stats

    logger.info(
        "  {} ({}) features out of {} calculated directly from partial statistics".format(
            feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats)
        )
    )

    logger.info("Calculating groups ...")

    calculate_group(
        logger, args.root_group, args.count_threshold, group_children, group_genes, partial_stats, num_predictors, stats
    )

    logger.info("  {} features calculated in total".format(len(stats)))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "GENE", "GROUP", *predictors)
        for gene in sorted(stats.keys()):
            gene_stats = stats[gene]
            sb = [gene]
            stats_group = gene_stats[num_predictors]
            if stats_group is not None:
                sb += [stats_group]
            else:
                # Placeholder used when the last slot was left empty
                sb += ["|" + ("-" * num_predictors)]

            for i in range(num_predictors):
                if gene_stats[i] is not None:
                    sb += ["/".join([str(v) for v in gene_stats[i]])]
                else:
                    sb += ["-/-/-"]
            tsv.write_line(of, *sb)

    return 0
Exemple #12
0
def main():
	"""Entry point of the "fetch" command.

	Parses one mutation per input line (blank lines and lines starting with
	"#" are skipped), queries the database for it and writes one output line
	per matching row with the selected columns, annotations and predictor
	scores. Lines that cannot be parsed or that have no match count as fails.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="Fetch Condel scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("muts_path", metavar="SNVS_PATH",
		help="SNV's to check. Use - for standard input.")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
		help="The results path. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	args, logger = cmd.parse_args("fetch")

	db = cmd.open_db()

	try:
		# Resolved inside the try block so that the database is closed and the
		# error reported through the helper if any selection is invalid
		predictors = cmd.get_selected_predictors()

		annotations = cmd.get_selected_annotations()

		columns = cmd.get_selected_columns()

		logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

		progress = RatedProgress(logger, name="SNVs")

		with tsv.open(args.muts_path) as f:
			with tsv.open(args.out_path, "w") as wf:
				tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

				hit = fail = 0

				mut = DnaAndProtMutationParser()
				for line_num, line in enumerate(f, start=1):
					line = line.rstrip(" \n\r")
					if len(line) == 0 or line.startswith("#"):
						continue

					try:
						mut.parse(line)
					except PrematureEnd:
						logger.error("Missing fields at line {}".format(line_num))
						fail += 1
						continue
					except UnexpectedToken as ex:
						logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
						fail += 1
						continue

					exists = False
					for row in query_mutation(logger, db, mut, annotations, predictors):

						exists = True

						ann = row["annotations"]
						scores = row["scores"]

						tsv.write_line(wf, mut.identifier,
							*[row[c] for c in columns]
							+ [ann[a] for a in annotations]
							+ [scores[p] for p in predictors])

					progress.update()

					# A mutation is a hit when the query returned at least one row
					if exists:
						hit += 1
					else:
						fail += 1

		progress.log_totals()

		logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	# Explicit success status, as in the other commands
	return 0
Exemple #13
0
def main():
	"""Entry point of the "update" command.

	Reads a tsv file whose header names coordinate columns (genomic or
	protein) and predictor columns, and updates the scores of every database
	row matching the coordinates of each line. Predictors missing from the
	database are created. With --ignore-errors, faulty lines are reported and
	skipped instead of aborting.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="Update scores in the database")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
		help="The source file. Use - for standard input.")

	cmd.add_selected_predictors_args()

	parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
		help="Update of the predictors.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
		help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("update")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors(check_missing=False)

	try:
		progress = RatedProgress(logger, name="SNVs")

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header: column name -> column index
			hdr_line = f.readline()
			hdr = {name: index for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))}

			db_predictors = set([p["id"] for p in db.predictors()])

			if len(predictors) == 0:
				# No explicit selection: use the header columns that are known predictors
				predictors = [name for name in hdr if name in db_predictors]
				if len(predictors) == 0:
					raise Exception("None of the input file headers match the available predictors in the database. Please specify them using -p.")

			logger.info("Predictors: {}".format(", ".join(predictors)))

			for predictor in filter(lambda p: p not in db_predictors, predictors):
				logger.info("Creating predictor {} ...".format(predictor))
				db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			genome_columns = set(GENOME_COORD_COLUMNS)
			protein_columns = set(PROTEIN_COORD_COLUMNS)

			use_genome_coords = len(set(hdr.keys()) & genome_columns) > 0
			use_protein_coords = len(set(hdr.keys()) & protein_columns) > 0

			if not use_genome_coords and not use_protein_coords:
				raise Exception("No coordinate columns found. "
					"Use {} for genomic coordinates or {} for protein coordinates.".format(
						GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
			elif use_genome_coords and use_protein_coords:
				logger.warn("Both, genomic and protein coordinates columns found. Using genomic coordinates by default")

			if use_genome_coords:
				coord_column_names = [n for n in hdr if n in genome_columns]
				coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
			elif use_protein_coords:
				coord_column_names = [n for n in hdr if n in protein_columns]
				coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]

			coord_column_indices = [hdr[n] for n in coord_column_names]
			score_indices = [hdr[n] for n in predictors]
			max_column_index = max(coord_column_indices + score_indices)

			# Data starts at the second line of the file
			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				# Index max_column_index must exist, so max_column_index + 1 fields are required
				if len(fields) <= max_column_index:
					logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise Exception("Missing columns for line {}".format(line_num))
					continue

				try:
					coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip(
						coord_column_names, coord_column_types, coord_column_indices)])

					scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					# Skip the line: coords/scores would otherwise be stale values
					# from a previous line (or unbound) in the update below
					continue

				try:
					for row in db.query_scores(fields=[], **coords):
						db.update_scores(row["id"], scores)
				except Exception as ex:
					logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
					logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
					if not args.ignore_errors:
						raise

				progress.update()

		progress.log_totals()

		logger.info("Finalizing database ...")

		if args.update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
def main():
	"""Entry point of the "filter-transcript" command.

	Copies the header of the data file to the output and, for every run of
	consecutive lines sharing the same key columns, writes only the line whose
	transcript has the greatest length (the first one wins on ties). Lines
	with the same key are only grouped when they are adjacent in the input.
	Transcripts missing from the lengths file count as length 0.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="Filter for the longest transcript")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("len_path", metavar="PATH",
		help="The tsv containing the transcripts length")

	parser.add_argument("data_path", metavar="PATH",
		help="The data file")

	parser.add_argument("out_path", metavar="PATH",
		help="Output file. Use - for standard output.")

	parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
		help="List of columns that conforms the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

	args, logger = cmd.parse_args("filter-transcript")

	try:
		logger.info("Loading transcripts length ...")
		trslen = defaultdict(int)
		with tsv.open(args.len_path) as f:
			for name, length in tsv.rows(f):
				# Stored as a number so that lengths compare numerically, also
				# against the integer default used for unknown transcripts
				trslen[name] = int(length)

		logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

		total_count = filter_count = 0

		progress = RatedProgress(logger, name="mutations")

		key_columns = args.key.split(",")
		with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
			hdr_line = df.readline()
			of.write(hdr_line)
			_, hdr = tsv.header_from_line(hdr_line)
			key_indices = [hdr[name] for name in key_columns]
			trs_index = hdr["TRANSCRIPT"]

			last_key = None
			# (length, raw line) of the best candidate for the current key
			longest = (0, "")

			for line in df:
				total_count += 1

				fields = line.rstrip("\n").split("\t")
				key = tuple([fields[index] for index in key_indices])
				trs = fields[trs_index]

				tl = trslen[trs]

				if last_key != key:
					# Key changed: emit the winner of the previous group
					if last_key is not None:
						of.write(longest[1])
						filter_count += 1
					longest = (tl, line)
					last_key = key

				elif tl > longest[0]:
					longest = (tl, line)

				progress.update()

			# Emit the last group; there is none when the data file has no rows
			if last_key is not None:
				filter_count += 1
				of.write(longest[1])

		progress.log_totals()

		logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
			total_count, filter_count, total_count - filter_count, progress.elapsed_time))
	except:
		# Propagate the failure status instead of always reporting success
		return cmd.handle_error()

	return 0
Exemple #15
0
def main():
	"""Entry point of the "export" command.

	Exports the rows returned by the scores query either as tsv (one line per
	row) or, with --json, as one JSON document per SNV holding the list of its
	transcripts. Rows of the same SNV are only merged when they are
	consecutive in the query results.

	--start, --limit and --sample apply to rows in tsv mode. In JSON mode
	--limit and --sample apply to SNVs, and --start is only checked on rows
	that start a new position.

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	"""
	parser = argparse.ArgumentParser(
		description="Export Scores")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("dest_path", metavar="OUTPUT_PATH",
		help="The output file. Use - for standard output.")

	cmd.add_selected_predictors_args()

	cmd.add_selected_annotations_args()

	cmd.add_selected_columns_args()

	parser.add_argument("--json", dest="to_json", action="store_true", default=False,
		help="Export the results in json format")

	parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
		help="Export a random sample of PCT %%")

	parser.add_argument("--start", dest="start", type=int, metavar="N",
		help="Start to export from the SNV number N")

	parser.add_argument("--limit", dest="limit", type=int, metavar="N",
		help="Limit the number of SNVs to export to N")

	args, logger = cmd.parse_args("export")

	db = cmd.open_db()

	predictors = cmd.get_selected_predictors()

	annotations = cmd.get_selected_annotations()

	columns = cmd.get_selected_columns()

	logger.info("Exporting ...")

	random.seed(time.time())

	try:
		progress = RatedProgress(logger, name="SNVs")

		to_json = args.to_json
		sample = args.sample
		start = args.start or 0
		limit = args.limit

		doc = None  # JSON document being accumulated for the current SNV
		last_pos = None
		rows_count = 0
		snvs_count = 0
		with tsv.open(args.dest_path, "w") as f:

			if not to_json:
				tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors)

			for row in db.query_scores(predictors=predictors, maps=annotations):

				if not to_json:
					if start > 0:
						start -= 1
						continue

					if sample is not None and random.randint(1, 100) > sample:
						continue

				pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
				if last_pos != pos:
					if to_json:
						if start > 0:
							start -= 1
							continue

						if limit is not None and snvs_count >= limit:
							# Limit reached: write the pending document and stop
							if doc is not None:
								json.dump(doc, f)
								f.write("\n")
							break

					snvs_count += 1

				rows_count += 1

				ann = row["annotations"]
				scores = row["scores"]

				if to_json:
					tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] +
						[(k,scores[k]) for k in predictors])

					if pos != last_pos:
						# A new SNV starts: the previous document is complete
						if doc is not None:
							if sample is None or random.randint(1, 100) <= sample:
								json.dump(doc, f)
								f.write("\n")
							else:
								snvs_count -= 1

						doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] +
							[("transcripts", [tdoc])])
					else:
						doc["transcripts"] += [tdoc]

				else:
					tsv.write_line(f,
						*[row[c] for c in columns]
						+ [ann[a] for a in annotations]
						+ [scores[p] for p in predictors])

				progress.update()

				last_pos = pos

				if not to_json and limit is not None and rows_count >= limit:
					break
			else:
				# The query was exhausted without hitting the limit: the document
				# of the last SNV is still pending and would otherwise be lost
				if to_json and doc is not None:
					if sample is None or random.randint(1, 100) <= sample:
						json.dump(doc, f)
						f.write("\n")
					else:
						snvs_count -= 1

		progress.log_totals()

		logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time))
	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
Exemple #16
0
def main():
	"""Entry point of the "perf-cosmic" command (CHASM drivers variant).

	Counts the distinct samples per protein change among the missense
	substitutions of the Cosmic export file, then distributes the protein
	changes over the datasets:

	- "1", "2", "4": seen in exactly 1, at least 2, at least 4 samples
	- "CGC" / "noCGC": gene symbol in the CGC list / not in it and seen once
	- "D" / "O": transcript in the drivers list / not in it and seen once

	Returns 0 on success, otherwise the value returned by cmd.handle_error().
	The database is always closed before returning.
	"""
	parser = argparse.ArgumentParser(
		description="Generate datasets needed to evaluate performance from Cosmic mutations")

	cmd = DefaultCommandHelper(parser)

	cmd.add_db_args()

	parser.add_argument("data_path", metavar="PATH",
		help="The CosmicMutantExport tsv file")

	parser.add_argument("cgc_path", metavar="PATH",
		help="The list of CGC genes")

	parser.add_argument("drivers_path", metavar="PATH",
		help="The list of CHASM drivers (drivers.tmps)")

	parser.add_argument("-o", dest="prefix", metavar="PREFIX",
		help="Output prefix.")

	args, logger = cmd.parse_args("perf-cosmic")

	prefix = args.prefix or "cosmic-"

	fanns_db = cmd.open_db()

	try:
		# (protein, aa_pos, aa_ref, aa_alt) -> transcript, symbol and set of samples
		snvs = dict()

		logger.info("Counting the number of samples per mutation ...")
		with tsv.open(args.data_path, "r") as df:
			columns = [
				"Mutation Description",
				"Mutation CDS",
				"Mutation AA",
				"Mutation GRCh37 genome position",
				"Mutation GRCh37 strand",
				"Accession Number",
				"ID_sample"]

			total_rows = queried_rows = 0
			for fields in tsv.rows(df, columns=columns, header=True):
				total_rows += 1
				mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
				# NOTE: the "Genome-wide screen" column filter is disabled, only
				# the mutation description is checked
				if mut_desc != "Substitution - Missense":
					continue

				queried_rows += 1
				for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
					key = tuple([row[name] for name in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
					if key not in snvs:
						symbol = row["xrefs"]["symbol"]
						snvs[key] = dict(
							transcript=row["transcript"],
							symbol=symbol,
							samples=set([sample_id]))
					else:
						snvs[key]["samples"].add(sample_id)

		logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(total_rows, queried_rows, len(snvs)))

		logger.info("Loading CGC genes ...")
		cgc_genes = set()
		with open(args.cgc_path, "r") as f:
			for line in f:
				cgc_genes.add(line.rstrip("\n"))

		logger.info("Loading CHASM drivers ...")
		drivers = set()
		with open(args.drivers_path, "r") as f:
			for line in f:
				# Only the first column of each line is used
				drivers.add(line.rstrip("\n").split("\t")[0])

		logger.info("Creating datasets ...")

		progress = RatedProgress(logger, name="mutations")

		with Dataset(prefix + "1") as rec1,\
			Dataset(prefix + "2") as rec2,\
			Dataset(prefix + "4") as rec4,\
			Dataset(prefix + "CGC") as cgc,\
			Dataset(prefix + "noCGC") as nocgc,\
			Dataset(prefix + "D") as drv,\
			Dataset(prefix + "O") as nodrv:

			for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
				num_samples = len(snv["samples"])
				line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])
				if num_samples == 1:
					rec1.write(line)
				if num_samples >= 2:
					rec2.write(line)
				if num_samples >= 4:
					rec4.write(line)

				# The symbol may be a single name or a collection of names. A
				# single name must be matched as a whole: turning it into a set
				# would compare its individual characters with the gene names.
				symbol = snv["symbol"]
				if symbol is None:
					in_cgc = False
				elif isinstance(symbol, basestring):
					in_cgc = symbol in cgc_genes
				else:
					in_cgc = len(set(symbol) & cgc_genes) > 0

				if in_cgc:
					cgc.write(line)
				elif num_samples == 1:
					nocgc.write(line)

				if snv["transcript"] in drivers:
					drv.write(line)
				elif num_samples == 1:
					nodrv.write(line)

				progress.update()

			progress.log_totals()

			logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
				rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))

	except:
		# Propagate the failure status instead of always reporting success
		return cmd.handle_error()
	finally:
		# Release the database connection whatever the outcome
		fanns_db.close()

	return 0
Exemple #17
0
def main():
	"""Export dbNSFP scores from the original zip archive into one tsv file.

	Every archive member whose name matches ``dbNSFP.+_variant.chr(.+)`` is
	extracted into a temporary directory, read line by line and written to the
	output with one row per transcript listed on the source line.

	A source line is skipped with a warning when position, ref or alt are
	missing, when ref/alt are not single entries of ``BASE_INDEX``, or when
	the transcript / protein position columns are missing. A transcript row is
	skipped when neither the transcript nor the Uniprot id can be mapped to a
	protein, when the PolyPhen2 score is outside [0, 1], or (silently) when it
	has no usable protein position or amino acid change.

	Returns:
		0 on success, otherwise the value returned by ``cmd.handle_error()``.
	"""
	parser = argparse.ArgumentParser(
		description="Export dbNSFP scores")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("source_path", metavar="SOURCE",
						help="The original zip file")

	parser.add_argument("ensp_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Ensembl transcript id's and Uniprot id's")

	parser.add_argument("uniprot_map_path", metavar="MAP",
						help="The mapping between Ensembl protein id's and Uniprot id's")

	parser.add_argument("-o", "--output", dest="out_path", metavar="OUT_PATH",
						help="The output file")

	parser.add_argument("--temp", dest="temp_path", metavar="TEMP_PATH",
						help="A temporary path for zip extraction")

	parser.add_argument("--chr", dest="chr", metavar="CHROMOSOMES",
						help="Chromosomes to include: list separated by commas.")

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	args, logger = cmd.parse_args("dbnsfp-export")

	# Default output name is derived from the source archive name
	if args.out_path is None:
		basename = os.path.basename(args.source_path)
		prefix = os.path.splitext(basename)[0]
		args.out_path = "{}.tsv.gz".format(prefix)

	logger.info("Loading maps ...")

	# transcript id -> protein id, and uniprot id -> protein id
	uniprot_map = {}
	trs_map = {}
	with tsv.open(args.ensp_map_path) as f:
		for ensp, enst in tsv.lines(f, (str, str)):
			if len(enst) > 0:
				trs_map[enst] = ensp

	with tsv.open(args.uniprot_map_path) as f:
		for ensp, uniprot_id in tsv.lines(f, (str, str)):
			if len(uniprot_id) > 0:
				uniprot_map[uniprot_id] = ensp

	logger.info("Opening {} ...".format(args.source_path))

	chromosomes = None
	if args.chr is not None:
		chromosomes = [c.strip().upper() for c in args.chr.split(",") if len(c.strip()) > 0]
		logger.info("Selected chromosomes: {}".format(", ".join(chromosomes)))
		chromosomes = set(chromosomes)

	name_pattern = re.compile(r"dbNSFP.+_variant.chr(.+)")

	# Source columns, in the order they are unpacked for every line below
	COLUMNS = [
		"#chr", "pos(1-coor)", "ref", "alt", "cds_strand",
		"genename", "Uniprot_id", "Uniprot_aapos", "aaref", "aaalt",
		"Ensembl_geneid", "Ensembl_transcriptid", "aapos",
		"SIFT_score",
		"Polyphen2_HVAR_score",
		"MutationAssessor_score",
		"FATHMM_score",
		"MutationTaster_score",
#		"GERP_RS",
		"GERP++_RS",
#		"PhyloP_score"
		"phyloP"
	]

	tmp_dir = args.temp_path or tempfile.gettempdir()
	if not os.path.exists(tmp_dir):
		os.makedirs(tmp_dir)

	# Use dir= (not prefix=) so that a relative --temp path resolves against
	# the same location that was just created above.
	extract_path = tempfile.mkdtemp(dir=tmp_dir)

	try:
		logger.info("Output: {}".format(args.out_path if args.out_path != "-" else "standard output"))

		total_start_time = time.time()
		total_lines = 0
		with ZipFile(args.source_path, "r") as zf, tsv.open(args.out_path, "w") as of:

			tsv.write_line(of, "CHR", "STRAND", "START", "REF", "ALT", "TRANSCRIPT",
							"PROTEIN", "AA_POS", "AA_REF", "AA_ALT",
							"SIFT", "PPH2", "MA", "FATHMM", "MT", "GERPRS", "PHYLOP")

			# Collect the per-chromosome members so they can be processed in CHR_INDEX order
			entries = []
			for entry in zf.infolist():
				m = name_pattern.match(entry.filename)
				if not m:
					continue

				chrom = m.group(1)
				chrom_order = CHR_INDEX[chrom] if chrom in CHR_INDEX else 99

				if chromosomes is not None and chrom not in chromosomes:
					logger.debug("Skipping chromosome {} ...".format(chrom))
					continue

				entries += [(chrom_order, chrom, entry)]

			for chrom_order, chrom, entry in sorted(entries, key=lambda x: x[0]):
				logger.info("Reading chromosome {} ...".format(chrom))

				zf.extract(entry, extract_path)
				fpath = os.path.join(extract_path, entry.filename)
				with open(fpath) as f:
					# Parse header: map every wanted column to its position (None when absent)
					hdr_line = f.readline()
					hdr = {}
					for col_idx, name in enumerate(hdr_line.rstrip("\n").split("\t")):
						hdr[name] = col_idx
					columns = [hdr.get(name) for name in COLUMNS]

					start_time = time.time()
					partial_start_time = start_time

					# The header is line 1; keeps line_num defined when the file has no data lines
					line_num = 1
					for line_num, line in enumerate(f, start=2):
						fields = line.rstrip("\n").split("\t")

						try:
							fields = [fields[i] if i is not None and i < len(fields) else None for i in columns]

							(row_chr, start, ref, alt, strand,
							 symbol, uniprot, uniprot_aapos, aa_ref, aa_alt,
							 gene, transcript, aapos,
							 sift, pph2, ma, fathmm,
							 mt, gerprs, phylop) = fields

							start = safe_int(start)
							ref = ref.upper() if ref is not None else None
							alt = alt.upper() if alt is not None else None
							aa_ref = aa_ref.upper() if aa_ref is not None else None
							aa_alt = aa_alt.upper() if aa_alt is not None else None
							sift = safe_float(sift)
							ma = safe_float(ma)
							fathmm = safe_float(fathmm)
							mt = safe_float(mt)
							gerprs = safe_float(gerprs)
							phylop = safe_float(phylop)

							if start is None or ref is None or alt is None:
								logger.warn("None value for pos or ref or alt at line {}: {}".format(line_num, fields))
								continue
							elif ref not in BASE_INDEX or alt not in BASE_INDEX:
								logger.warn("Unknown ref or alt at line {}: {}".format(line_num, fields))
								continue
							elif len(ref) != 1 or len(alt) != 1:
								logger.warn("Length != 1 for ref or alt len at line {}: {}".format(line_num, fields))
								continue
							elif transcript is None or aapos is None or uniprot is None or uniprot_aapos is None:
								logger.warn("None value for transcript or aapos or uniprot or uniprot_aapos at line {}: {}".format(line_num, fields))
								continue

							# Unknown amino acids are not an error, but such rows are not exported below
							if aa_ref not in AA_INDEX:
								aa_ref = None
							if aa_alt not in AA_INDEX:
								aa_alt = None

							# Multi-valued columns are ';' separated; shorter lists are
							# padded by repeating their last value.
							trs_values = transcript.split(";")

							aapos_values = [safe_int(v) for v in aapos.split(";")]
							l = len(trs_values) - len(aapos_values)
							if l > 0:
								aapos_values += [aapos_values[-1]] * l

							uniprot_values = uniprot.split(";")
							uniprot_aapos_values = [safe_int(v) for v in uniprot_aapos.split(";")]
							l = len(uniprot_values) - len(uniprot_aapos_values)
							if l > 0:
								uniprot_aapos_values += [uniprot_aapos_values[-1]] * l

							pph2_values = [safe_float(v) for v in pph2.split(";")] if pph2 is not None else [None]
							l = len(uniprot_values) - len(pph2_values)
							if l > 0:
								pph2_values += [pph2_values[-1]] * l

							# uniprot protein position -> index into uniprot_values / pph2_values
							uniprot_index = {}
							for i, _uniprot_id in enumerate(uniprot_values):
								if uniprot_aapos_values[i] is not None:
									uniprot_index[uniprot_aapos_values[i]] = i

							for i, trs in enumerate(trs_values):
								pos = aapos_values[i]
								# pos is None when it could not be parsed; only compare real numbers
								if pos is not None and pos < 0:
									pos = None

								if pos is not None and pos in uniprot_index:
									j = uniprot_index[pos]
									uniprot_value = uniprot_values[j]
									pph2_value = pph2_values[j]
								else:
									uniprot_value = pph2_value = None

								if trs in trs_map:
									prot_id = trs_map[trs]
								elif uniprot_value in uniprot_map:
									prot_id = uniprot_map[uniprot_value]
								else:
									logger.warn("Couldn't map neither protein {} or transcript {} at line {}: {}".format(uniprot_value, trs, line_num, "|".join([str(v) for v in fields])))
									continue

								if pph2_value is not None and (pph2_value < 0.0 or pph2_value > 1.0):
									logger.warn("PPH2 score {} out of range at line {}: {}".format(pph2_value, line_num, fields))
									continue

								if aa_alt == "X": # fix stop codons having a sift score
									sift = None

								# NOTE(review): fathmm is exported but is not part of this
								# emptiness check - confirm whether that is intended.
								if args.skip_empty_scores and sift is None and pph2_value is None and ma is None \
										and mt is None and gerprs is None and phylop is None:
									continue

								# Rows without a usable protein position / amino acid change are not exported
								if pos is not None and aa_ref is not None and aa_alt is not None:
									tsv.write_line(of, row_chr, strand, start, ref, alt, trs,
													prot_id, pos, aa_ref, aa_alt,
													sift, pph2_value, ma, fathmm,
													mt, gerprs, phylop)

						except Exception:
							# Report the offending line, then abort the export
							logger.warn("Malformed line {}: {}".format(line_num, "|".join([str(v) for v in fields])))
							raise

						# Progress report at most every 5 seconds
						partial_time = time.time() - partial_start_time
						if partial_time >= 5.0:
							partial_start_time = time.time()
							elapsed_time = time.time() - start_time
							logger.debug("  {} lines, {:.1f} lines/second".format(hsize(line_num-1), (line_num-1) / float(elapsed_time)))

					total_lines += line_num

					logger.info("  >  {} lines, {:.1f} lines/second".format(hsize(line_num), line_num / float(time.time() - start_time)))
					logger.info("  >> {} lines, {:.1f} lines/second".format(hsize(total_lines), total_lines / float(time.time() - total_start_time)))

				os.remove(fpath)

		total_elapsed_time = timedelta(seconds=time.time() - total_start_time)
		logger.info("Finished successfully. Elapsed time: {}".format(total_elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		# Always drop the extraction directory, including partially processed files
		shutil.rmtree(extract_path)

	return 0
Exemple #18
0
def main():
	"""Calculate baseline tolerance statistics for a tree of groups and features.

	Loads the group tree and the group-to-feature mappings, attaches the
	partial per-feature statistics read from ``stats_path`` to the tree nodes,
	runs ``calculate_blt`` from ``root_group`` for every predictor and writes
	the results as JSON to ``out_path`` (and optionally as tsv to ``--tsv``).

	Malformed ``s0/s1/s2`` entries in the statistics file are reported and
	skipped individually; any other failure while loading a line is reported
	and the line is skipped.

	Returns:
		0 once the results have been written.
	"""
	parser = argparse.ArgumentParser(
		description="Calculate Baseline Tolerance statistics")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("tree_path", metavar="TREE_PATH",
						help="The groups descendant tree")

	parser.add_argument("root_group", metavar="ROOT_GROUP",
						help="Tree root group")

	parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
						help="Map between groups and features")

	parser.add_argument("stats_path", metavar="STATS_PATH",
						help="Partial feature statistics")

	parser.add_argument("out_path", metavar="OUTPUT_PATH",
						help="Output feature statistics")

	parser.add_argument("--tsv", dest="tsv_path", metavar="PATH",
						help="Store baseline tolerance in tsv format too.")

	# The thresholds are compared with numbers below, so command line values
	# must be converted instead of being kept as strings.
	parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N", type=int, default=DEFAULT_COUNT_THRESHOLD,
						help="Minimum number of features per group")

	parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V", type=float, default=DEFAULT_STDEV_THRESHOLD,
						help="Skip feature statistics with a standard deviation less than V (it will be calculated at the level of groups)")

	args, logger = cmd.parse_args("blt-groups")

	logger.info("Loading groups tree ...")

	tree = Tree()
	with tsv.open(args.tree_path) as f:
		for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
			tree.add_node(group, children)

	logger.info("  Nodes: {}".format(tree.node_count))

	logger.info("Loading mappings between groups and features ...")

	all_groups = set()
	all_features = set()
	with tsv.open(args.group_genes_path) as f:
		for group, features in tsv.lines(f, (str, lambda v: set(v.split(",")))):
			tree.add_node(group, features)
			all_groups.add(group)
			all_features.update(features)

	logger.info("  Nodes: {}".format(tree.node_count))
	logger.info("  Groups: {}".format(len(all_groups)))
	logger.info("  Features: {}".format(len(all_features)))

	logger.info("Loading partial statistics ...")

	with tsv.open(args.stats_path) as f:
		# Header: first column is the feature, the remaining ones are predictor names
		predictors = f.readline().rstrip("\n").split("\t")[1:]
		num_features = 0
		for line in f:
			fields = line.rstrip("\n").split("\t")
			feature = fields[0]
			try:
				node = tree.get_or_create_node(feature)
				for p, ss in zip(predictors, fields[1:]):
					# Each entry is "s0/s1/s2": s0 is parsed as int, s1 and s2 as float
					try:
						s0, s1, s2 = [float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))]
					except ValueError:
						logger.warn("Failed to parse partial baseline tolerance"
									" for {}/{} from {}".format(feature, p, ss))
						continue
					node.set_pblt(p, PartialBLT(s0, s1, s2, sources=set([feature])))
				num_features += 1
			except Exception:
				logger.warn("Failed to parse partial baseline tolerance"
									" for {} from {}".format(feature, line))
				continue

	logger.info("  Nodes: {}".format(tree.node_count))
	logger.info("  Features: {}".format(num_features))
	logger.info("  Predictors: {}".format(", ".join(predictors)))

	logger.info("Calculating baseline tolerance ...")

	for predictor in predictors:
		logger.info("For {} ...".format(predictor))

		calculate_blt(
			parent=None, node=tree.get_or_create_node(args.root_group), predictor=predictor,
			count_threshold=args.count_threshold, stdev_threshold=args.stdev_threshold, logger=logger)

	# TODO log summary info

	logger.info("Writing results into {} ...".format(os.path.basename(args.out_path)))

	if args.tsv_path is not None:
		with tsv.open(args.tsv_path, "w") as of:
			tsv.write_line(of, "FEATURE", *predictors)
			for feature in all_features:
				sb = [feature]
				node = tree.get_node(feature)
				predictors_with_blt = 0
				for predictor in predictors:
					blt = node.get_blt(predictor)
					if blt is None or blt.n < args.count_threshold:
						# Placeholder for the five from_node/scope/n/mean/stdev values
						sb.append("/".join(["-"] * 5))
						continue

					predictors_with_blt += 1
					sb.append("/".join(map(str, [blt.from_node, blt.scope, blt.n, blt.mean, blt.stdev])))

				# Features without any usable predictor are left out of the tsv
				if predictors_with_blt > 0:
					tsv.write_line(of, *sb)

	with tsv.open(args.out_path, "w") as of:
		# node name -> predictor -> baseline tolerance, only for values over the count threshold
		tree_blt = {}
		for node in tree.nodes.values():
			predictors_blt = {}
			for predictor in predictors:
				pred_blt = node.get_blt(predictor)
				if pred_blt is None or pred_blt.n < args.count_threshold:
					continue

				predictors_blt[predictor] = dict(
					from_node=pred_blt.from_node, scope=pred_blt.scope,
					N=pred_blt.n, mean=pred_blt.mean, stdev=pred_blt.stdev)

			if len(predictors_blt) > 0:
				tree_blt[node.name] = predictors_blt

		doc = dict(
			created=str(datetime.now()),
			predictors=predictors,
			count_threshold=args.count_threshold,
			stdev_threshold=args.stdev_threshold,
			tree=None, # tree relations
			features=list(all_features),
			pblt=None, # TODO
			blt=tree_blt
		)
		json.dump(doc, of, indent=True)


	return 0