def main():
    parser = argparse.ArgumentParser(
        description="Update predictors min, max and count")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()
    cmd.add_selected_predictors_args()

    args, logger = cmd.parse_args("pred-update")

    db = cmd.open_db()

    try:
        predictors = cmd.get_selected_predictors(default_all=True)

        logger.info("Updating predictors ...")
        start_time = datetime.now()

        db.update_predictors(predictors)

        db.commit()

        logger.info("Finished. elapsed={}".format(datetime.now() - start_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Generate datasets needed to evaluate performance from Cosmic mutations")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("data_path", metavar="PATH",
                        help="The CosmicMutantExport tsv file")

    parser.add_argument("cgc_path", metavar="PATH",
                        help="The list of CGC genes")

    parser.add_argument("tdrivers_path", metavar="PATH",
                        help="The list of TD drivers")

    parser.add_argument("pdrivers_path", metavar="PATH",
                        help="The list of PD drivers")

    parser.add_argument("-o", dest="prefix", metavar="PREFIX",
                        help="Output prefix.")

    args, logger = cmd.parse_args("perf-cosmic")

    fanns_db = cmd.open_db()

    try:
        snvs = extract_snvs(fanns_db, args.data_path, logger=logger)

        create_datasets(snvs, args.cgc_path, args.tdrivers_path, args.pdrivers_path, args.prefix)
    except:
        return cmd.handle_error()
    finally:
        fanns_db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Remove annotations")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("id", metavar="ID", nargs="+",
                        help="Annotation identifier.")

    args, logger = cmd.parse_args("ann-rm")

    db = cmd.open_db()

    try:
        if "*" in args.id:
            logger.info("Removing all the annotations ...")
            for ann in db.maps():
                logger.info("  {} {} ...".format(ann["id"], ann["name"]))
                db.remove_map(ann["id"])
        else:
            for ann_id in args.id:
                logger.info("Removing annotation {} ...".format(ann_id))
                db.remove_map(ann_id)

        db.commit()
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Create a functional scores database")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("predictors", metavar="PREDICTORS", nargs="*",
                        help="Predictor identifiers")

    args, logger = cmd.parse_args("create")

    db = cmd.create_db()

    try:
        for predictor_id in args.predictors:
            logger.info("Adding predictor {} ...".format(predictor_id))
            db.add_predictor(predictor_id, FannsDb.SOURCE_PREDICTOR_TYPE)

        db.set_initialized()
        db.commit()
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Manipulate database indices")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("ops", metavar="OPERATIONS", nargs="+", choices=["drop", "create"],
                        help="The operations to perform on the indices.")

    args, logger = cmd.parse_args("index")

    db = cmd.open_db()

    try:
        start_time = datetime.now()

        for op in args.ops:
            if op == "drop":
                logger.info("Dropping indices ...")
                db.drop_indices()
            elif op == "create":
                logger.info("Creating indices ...")
                db.create_indices()

        elapsed_time = datetime.now() - start_time
        logger.info("Done. Elapsed time: {}".format(elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="List predictors")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("--json", dest="to_json", action="store_true", default=False,
                        help="Print the results in json format")

    args, log = cmd.parse_args("pred-list")

    db = cmd.open_db()

    try:
        if args.to_json:
            d = {}
            for pred in db.predictors():
                d[pred["id"]] = dict([(k, pred[k]) for k in ["type", "source", "min", "max", "count"]])
            import json
            print json.dumps(d, indent=True)
        else:
            print "\t".join(["ID", "TYPE", "SOURCE", "MIN", "MAX", "COUNT"])
            for pred in db.predictors():
                print "\t".join([str(pred[k]) for k in ["id", "type", "source", "min", "max", "count"]])
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(description="Add annotations")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("id", metavar="ID",
                        help="Annotation identifier.")

    parser.add_argument("name", metavar="NAME",
                        help="Annotation name.")

    parser.add_argument("type", metavar="TYPE", choices=["transcript", "protein"],
                        help="Annotation type: transcript, protein")

    parser.add_argument("path", metavar="PATH",
                        help="Annotation items")

    parser.add_argument("--priority", dest="priority", default=0,
                        help="Priority for translating input annotations."
                             " 0 means not considered for translation. Default 0.")

    parser.add_argument("--header", dest="header", action="store_true", default=False,
                        help="Specify that the annotation items file has a header.")

    args, logger = cmd.parse_args("ann-add")

    db = cmd.open_db()

    try:
        logger.info("Creating annotation {} ...".format(args.name))

        db.add_map(args.id, args.name, args.type, args.priority)

        logger.info("Loading items ...")

        with tsv.open(args.path) as f:
            for source, value in tsv.lines(f, (str, str), header=args.header):
                if len(source) > 0 and len(value) > 0:
                    db.add_map_item(args.id, source, value)

        db.commit()
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
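# The annotation items file loaded above is assumed to be a two-column TSV
# mapping a source identifier to a value; the identifiers below are
# hypothetical examples. Rows with an empty source or value are skipped by
# the loader.
#
#   ENST00000269305    ENSP00000269305
#   ENST00000413465    ENSP00000410739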
def main():
    parser = argparse.ArgumentParser(
        description="Export SNV's")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="DEST",
                        help="The destination file. Use - for standard output.")

    args, logger = cmd.parse_args("export-snvs")

    db = cmd.open_db()

    logger.info("Exporting SNV's ...")

    try:
        progress = RatedProgress(logger, name="SNVs")

        rows_count = 0
        with tsv.open(args.dest_path, "w") as f:
            for snv in db.snvs():
                rows_count += 1
                # Single-nucleotide variant: start and end positions are the same
                tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"],
                               "{}>{}".format(snv["ref"], snv["alt"]), "S")

                progress.update()

        logger.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="List annotations")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    args, logger = cmd.parse_args("ann-list")

    db = cmd.open_db()

    try:
        print "\t".join(["ID", "NAME", "TYPE", "PRIORITY"])
        for ann in db.maps():
            print "\t".join([str(ann[k]) for k in ["id", "name", "type", "priority"]])
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(description="Map score values")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()
    cmd.add_transform_args()

    parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
                        help="Skip transformation for empty scores")

    args, logger = cmd.parse_args("scores-transform")

    db = cmd.open_db()

    try:
        transforms = cmd.get_transforms()
        predictors = transforms.keys()

        logger.info("Transforming scores ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors):
            rows_count += 1

            scores = row["scores"]

            upd_scores = {}
            for predictor in transforms:
                score = scores[predictor]
                if args.skip_empty_scores and score is None:
                    continue

                prev_score = score
                for name, func in transforms[predictor]:
                    try:
                        score = func(score)
                    except:
                        raise Exception("Error transforming the {} score {} with {}".format(predictor, score, name))

                if prev_score != score:
                    upd_scores[predictor] = score

            if len(upd_scores) > 0:
                db.update_scores(row["id"], upd_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")
        db.commit()

        logger.info("Finished. Total rows = {}, updated rows = {}, elapsed time = {}".format(
            rows_count, updated_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
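# A minimal sketch of the transforms structure consumed above, assuming
# cmd.get_transforms() returns {predictor: [(name, callable), ...]} with the
# callables applied in order. The predictor names and functions below are
# hypothetical examples, not options of the real CLI.
import math

EXAMPLE_TRANSFORMS = {
    "SIFT": [("neg", lambda s: -s)],              # invert the scale
    "MT": [("log10", lambda s: math.log10(s))],   # compress a wide range
}

def apply_transform_chain(chain, score):
    # Apply each (name, func) pair in order, as the loop above does
    for name, func in chain:
        score = func(score)
    return score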
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance statistics per gene")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH",
                        help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP",
                        help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
                        help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH",
                        help="Partial gene statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="Output gene statistics")

    parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N", type=int,
                        default=DEFAULT_COUNT_THRESHOLD,
                        help="Minimum number of features per group")

    parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V", type=float,
                        default=DEFAULT_STDEV_THRESHOLD,
                        help="Skip feature statistics with a standard deviation less than V"
                             " (it will be calculated at the level of groups)")

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    group_children = defaultdict(set)
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_children[group] |= children

    logger.info("Loading mappings between groups and features ...")

    group_genes = defaultdict(set)
    with tsv.open(args.group_genes_path) as f:
        for group, genes in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            group_genes[group] |= genes

    logger.info("Loading partial statistics ...")

    partial_stats = {}
    with tsv.open(args.stats_path) as f:
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        for line in f:
            fields = line.rstrip("\n").split("\t")
            gene = fields[0]
            # Each field is a "s0/s1/s2" triplet: count / sum / sum of squares
            gene_stats = [[float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))]
                          for ss in fields[1:]]
            partial_stats[gene] = gene_stats

    logger.info("  Predictors: {}".format(", ".join(predictors)))
    logger.info("  Features: {}".format(len(partial_stats)))

    logger.info("Calculating features ...")

    stats = {}
    feat_count = 0
    feat_partial_count = [0] * num_predictors
    for feature, feat_partial_stats in partial_stats.items():
        feat_with_stats = False
        feat_stats = [None] * (num_predictors + 1)
        for i in range(num_predictors):
            s0, s1, s2 = feat_partial_stats[i]

            if s0 == 0.0:
                continue

            if s0 < args.count_threshold:
                continue

            x = (s0 * s2 - s1 * s1) / (s0 * (s0 - 1))
            if x < -1e-12:
                continue

            mean = s1 / s0
            std = math.sqrt(abs(x))

            if std < args.stdev_threshold:
                continue

            feat_stats[i] = (int(s0), mean, std)
            feat_partial_count[i] += 1
            feat_with_stats = True

        if feat_with_stats:
            feat_count += 1
            stats[feature] = feat_stats

    logger.info("  {} ({}) features out of {} calculated directly from partial statistics".format(
        feat_count, "/".join(map(str, feat_partial_count)), len(partial_stats)))

    logger.info("Calculating groups ...")

    calculate_group(logger, args.root_group, args.count_threshold,
                    group_children, group_genes, partial_stats, num_predictors, stats)

    logger.info("  {} features calculated in total".format(len(stats)))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "GENE", "GROUP", *predictors)
        for gene in sorted(stats.keys()):
            gene_stats = stats[gene]
            sb = [gene]
            stats_group = gene_stats[num_predictors]
            if stats_group is not None:
                sb += [stats_group]
            else:
                sb += ["|" + ("-" * num_predictors)]
            for i in range(num_predictors):
                if gene_stats[i] is not None:
                    sb += ["/".join([str(v) for v in gene_stats[i]])]
                else:
                    sb += ["-/-/-"]
            tsv.write_line(of, *sb)

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Fetch Condel scores")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("muts_path", metavar="SNVS_PATH",
                        help="SNV's to check. Use - for standard input.")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="The results path. Use - for standard output.")

    cmd.add_selected_predictors_args()
    cmd.add_selected_annotations_args()
    cmd.add_selected_columns_args()

    args, logger = cmd.parse_args("fetch")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors()
    annotations = cmd.get_selected_annotations()
    columns = cmd.get_selected_columns()

    logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

    try:
        progress = RatedProgress(logger, name="SNVs")

        with tsv.open(args.muts_path) as f, tsv.open(args.out_path, "w") as wf:
            tsv.write_line(wf, "ID", *[c.upper() for c in columns]
                                     + [a.upper() for a in annotations]
                                     + predictors)

            hit = fail = 0

            mut = DnaAndProtMutationParser()
            for line_num, line in enumerate(f, start=1):
                line = line.rstrip(" \n\r")
                if len(line) == 0 or line.startswith("#"):
                    continue

                try:
                    mut.parse(line)
                except PrematureEnd:
                    logger.error("Missing fields at line {}".format(line_num))
                    fail += 1
                    continue
                except UnexpectedToken as ex:
                    logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
                    fail += 1
                    continue

                exists = False
                for row in query_mutation(logger, db, mut, annotations, predictors):
                    exists = True

                    ann = row["annotations"]
                    scores = row["scores"]

                    tsv.write_line(wf, mut.identifier,
                                   *[row[c] for c in columns]
                                    + [ann[a] for a in annotations]
                                    + [scores[p] for p in predictors])

                progress.update()

                if exists:
                    hit += 1
                else:
                    fail += 1

        progress.log_totals()

        logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(
            hit + fail, hit, fail, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Update scores in the database")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")

    cmd.add_selected_predictors_args()

    parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
                        help="Update the predictors after loading the scores.")

    parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
                        help="On errors in the input file, report them but continue processing the input.")

    args, logger = cmd.parse_args("update")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors(check_missing=False)

    try:
        progress = RatedProgress(logger, name="SNVs")

        logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse header
            hdr_line = f.readline()
            hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

            db_predictors = set([p["id"] for p in db.predictors()])

            if len(predictors) == 0:
                predictors = [name for name in hdr if name in db_predictors]
                if len(predictors) == 0:
                    raise Exception("No input file header matches the available predictors in the database."
                                    " Please specify them using -p.")

            logger.info("Predictors: {}".format(", ".join(predictors)))

            for predictor in filter(lambda p: p not in db_predictors, predictors):
                logger.info("Creating predictor {} ...".format(predictor))
                db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
            use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

            if not use_genome_coords and not use_protein_coords:
                raise Exception("No coordinate columns found."
                                " Use {} for genomic coordinates or {} for protein coordinates.".format(
                                    GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
            elif use_genome_coords and use_protein_coords:
                logger.warn("Both genomic and protein coordinate columns found."
                            " Using genomic coordinates by default")

            if use_genome_coords:
                coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
                coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
            elif use_protein_coords:
                coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
                coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]

            coord_column_indices = [hdr[n] for n in coord_column_names]
            score_indices = [hdr[n] for n in predictors]
            max_column_index = max(coord_column_indices + score_indices)

            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                if len(fields) <= max_column_index:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise Exception("Missing columns for line {}".format(line_num))

                try:
                    coords = dict([(name.lower(), type_cast(fields[index]))
                                   for name, type_cast, index in zip(
                                       coord_column_names, coord_column_types, coord_column_indices)])

                    scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise

                try:
                    for row in db.query_scores(fields=[], **coords):
                        db.update_scores(row["id"], scores)
                except Exception as ex:
                    logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
                    logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
                    if not args.ignore_errors:
                        raise

                progress.update()

        progress.log_totals()

        logger.info("Finalizing database ...")

        if args.update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
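# score_value() is used above but defined elsewhere in the package; a minimal
# sketch of its assumed behaviour (empty field -> None, anything else -> float):
def score_value(text):
    text = text.strip() if text is not None else ""
    return float(text) if len(text) > 0 else None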
def main():
    parser = argparse.ArgumentParser(
        description="Filter for the longest transcript")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("len_path", metavar="PATH",
                        help="The tsv containing the transcripts length")

    parser.add_argument("data_path", metavar="PATH",
                        help="The data file")

    parser.add_argument("out_path", metavar="PATH",
                        help="Output file. Use - for standard output.")

    parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
                        help="List of columns that make up the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

    args, logger = cmd.parse_args("filter-transcript")

    try:
        logger.info("Loading transcripts length ...")

        trslen = defaultdict(int)
        with tsv.open(args.len_path) as f:
            for name, length in tsv.rows(f):
                trslen[name] = int(length)

        logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

        total_count = filter_count = 0

        progress = RatedProgress(logger, name="mutations")

        key_columns = args.key.split(",")
        with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
            hdr_line = df.readline()
            of.write(hdr_line)
            _, hdr = tsv.header_from_line(hdr_line)
            key_indices = [hdr[name] for name in key_columns]
            trs_index = hdr["TRANSCRIPT"]

            last_key = None
            longest = (0, "")

            for line in df:
                total_count += 1

                fields = line.rstrip("\n").split("\t")
                key = tuple([fields[index] for index in key_indices])
                trs = fields[trs_index]

                tl = trslen[trs]

                if last_key != key:
                    if last_key is not None:
                        of.write(longest[1])
                        filter_count += 1
                    longest = (tl, line)
                    last_key = key
                elif tl > longest[0]:
                    longest = (tl, line)

                progress.update()

            # Flush the last group
            if last_key is not None:
                of.write(longest[1])
                filter_count += 1

        progress.log_totals()

        logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
            total_count, filter_count, total_count - filter_count, progress.elapsed_time))
    except:
        return cmd.handle_error()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Export Scores")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="OUTPUT_PATH",
                        help="The output file. Use - for standard output.")

    cmd.add_selected_predictors_args()
    cmd.add_selected_annotations_args()
    cmd.add_selected_columns_args()

    parser.add_argument("--json", dest="to_json", action="store_true", default=False,
                        help="Export the results in json format")

    parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
                        help="Export a random sample of PCT %%")

    parser.add_argument("--start", dest="start", type=int, metavar="N",
                        help="Start to export from the SNV number N")

    parser.add_argument("--limit", dest="limit", type=int, metavar="N",
                        help="Limit the number of SNVs to export to N")

    args, logger = cmd.parse_args("export")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors()
    annotations = cmd.get_selected_annotations()
    columns = cmd.get_selected_columns()

    logger.info("Exporting ...")

    random.seed(time.time())

    try:
        progress = RatedProgress(logger, name="SNVs")

        to_json = args.to_json
        sample = args.sample
        start = args.start or 0
        limit = args.limit

        doc = None
        last_pos = None
        rows_count = 0
        snvs_count = 0

        with tsv.open(args.dest_path, "w") as f:
            if not to_json:
                tsv.write_line(f, *[c.upper() for c in columns]
                                   + [a.upper() for a in annotations]
                                   + predictors)

            for row in db.query_scores(predictors=predictors, maps=annotations):
                if not to_json:
                    if start > 0:
                        start -= 1
                        continue

                    if sample is not None and random.randint(1, 100) > sample:
                        continue

                pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
                if last_pos != pos:
                    if to_json:
                        if start > 0:
                            start -= 1
                            continue

                        if limit is not None and snvs_count >= limit:
                            if doc is not None:
                                json.dump(doc, f)
                                f.write("\n")
                                doc = None
                            break

                    snvs_count += 1

                rows_count += 1

                ann = row["annotations"]
                scores = row["scores"]

                if to_json:
                    tdoc = dict([(k, row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]]
                                + [(k, scores[k]) for k in predictors])

                    if pos != last_pos:
                        if doc is not None:
                            if sample is None or random.randint(1, 100) <= sample:
                                json.dump(doc, f)
                                f.write("\n")
                            else:
                                snvs_count -= 1

                        doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]]
                                   + [("transcripts", [tdoc])])
                    else:
                        doc["transcripts"] += [tdoc]
                else:
                    tsv.write_line(f, *[row[c] for c in columns]
                                       + [ann[a] for a in annotations]
                                       + [scores[p] for p in predictors])

                progress.update()

                last_pos = pos

                if not to_json and limit is not None and rows_count >= limit:
                    break

            # Flush the last pending JSON document, which the loop above never writes
            if to_json and doc is not None:
                if sample is None or random.randint(1, 100) <= sample:
                    json.dump(doc, f)
                    f.write("\n")

        progress.log_totals()

        logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(
            rows_count, snvs_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
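# With --json, each output line is a standalone JSON document that groups all
# transcript-level rows of one SNV. Reconstructed from the code above, with
# hypothetical values, the shape is roughly:
#
#   {"chr": "1", "strand": "+", "start": 12345, "ref": "A", "alt": "G",
#    "transcripts": [{"transcript": "ENST00000000001", "protein": "ENSP00000000001",
#                     "aa_pos": 42, "aa_ref": "R", "aa_alt": "Q",
#                     "SIFT": 0.05, "PPH2": 0.998}]}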
def main():
    parser = argparse.ArgumentParser(
        description="Generate datasets needed to evaluate performance from Cosmic mutations")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()

    parser.add_argument("data_path", metavar="PATH",
                        help="The CosmicMutantExport tsv file")

    parser.add_argument("cgc_path", metavar="PATH",
                        help="The list of CGC genes")

    parser.add_argument("drivers_path", metavar="PATH",
                        help="The list of CHASM drivers (drivers.tmps)")

    parser.add_argument("-o", dest="prefix", metavar="PREFIX",
                        help="Output prefix.")

    args, logger = cmd.parse_args("perf-cosmic")

    prefix = args.prefix or "cosmic-"

    fanns_db = cmd.open_db()

    try:
        snvs = dict()

        logger.info("Counting the number of samples per mutation ...")

        with tsv.open(args.data_path, "r") as df:
            columns = [
                #"Genome-wide screen",
                "Mutation Description",
                "Mutation CDS",
                "Mutation AA",
                "Mutation GRCh37 genome position",
                "Mutation GRCh37 strand",
                "Accession Number",
                "ID_sample"]

            total_rows = queried_rows = 0
            for fields in tsv.rows(df, columns=columns, header=True):
                total_rows += 1

                #wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                wide_screen = "y"

                if wide_screen != "y" or mut_desc != "Substitution - Missense":
                    continue

                queried_rows += 1
                for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
                    k = tuple([row[c] for c in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
                    if k not in snvs:
                        symbol = row["xrefs"]["symbol"]
                        snvs[k] = dict(
                            transcript=row["transcript"],
                            symbol=symbol,
                            samples=set([sample_id]))
                    else:
                        snvs[k]["samples"].add(sample_id)

        logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(
            total_rows, queried_rows, len(snvs)))

        logger.info("Loading CGC genes ...")

        cgc_genes = set()
        with open(args.cgc_path, "r") as f:
            for line in f:
                cgc_genes.add(line.rstrip("\n"))

        logger.info("Loading CHASM drivers ...")

        drivers = set()
        with open(args.drivers_path, "r") as f:
            for line in f:
                drivers.add(line.rstrip("\n").split("\t")[0])

        logger.info("Creating datasets ...")

        progress = RatedProgress(logger, name="mutations")

        with Dataset(prefix + "1") as rec1,\
             Dataset(prefix + "2") as rec2,\
             Dataset(prefix + "4") as rec4,\
             Dataset(prefix + "CGC") as cgc,\
             Dataset(prefix + "noCGC") as nocgc,\
             Dataset(prefix + "D") as drv,\
             Dataset(prefix + "O") as nodrv:

            for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
                num_samples = len(snv["samples"])

                line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])

                if num_samples == 1:
                    rec1.write(line)
                if num_samples >= 2:
                    rec2.write(line)
                if num_samples >= 4:
                    rec4.write(line)

                symbol = snv["symbol"]
                # symbol may be a single name or a collection of names
                if symbol is not None and ((isinstance(symbol, basestring) and symbol in cgc_genes)
                        or (not isinstance(symbol, basestring) and len(set(symbol) & cgc_genes) > 0)):
                    cgc.write(line)
                elif num_samples == 1:
                    nocgc.write(line)

                if snv["transcript"] in drivers:
                    drv.write(line)
                elif num_samples == 1:
                    nodrv.write(line)

                progress.update()

        progress.log_totals()

        logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size)
                                                     for d in [rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))
    except:
        return cmd.handle_error()

    return 0
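# Dataset is used above as a context manager exposing write(), name and size;
# it is defined elsewhere in the package. A minimal sketch of the assumed
# interface, writing one line per mutation and counting them:
class Dataset(object):
    def __init__(self, name):
        self.name = name
        self.size = 0
        self._f = None

    def __enter__(self):
        self._f = open(self.name, "w")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._f.close()

    def write(self, line):
        self._f.write(line + "\n")
        self.size += 1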
def main():
    parser = argparse.ArgumentParser(
        description="Export dbNSFP scores")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The original zip file")

    parser.add_argument("ensp_map_path", metavar="MAP",
                        help="The mapping between Ensembl protein id's and Ensembl transcript id's")

    parser.add_argument("uniprot_map_path", metavar="MAP",
                        help="The mapping between Ensembl protein id's and Uniprot id's")

    parser.add_argument("-o", "--output", dest="out_path", metavar="OUT_PATH",
                        help="The output file")

    parser.add_argument("--temp", dest="temp_path", metavar="TEMP_PATH",
                        help="A temporary path for zip extraction")

    parser.add_argument("--chr", dest="chr", metavar="CHROMOSOMES",
                        help="Chromosomes to include: list separated by commas.")

    parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
                        help="Skip SNV's where all the scores are empty")

    args, logger = cmd.parse_args("dbnsfp-export")

    if args.out_path is None:
        basename = os.path.basename(args.source_path)
        prefix = os.path.splitext(basename)[0]
        args.out_path = "{}.tsv.gz".format(prefix)

    logger.info("Loading maps ...")

    uniprot_map = {}
    trs_map = {}
    with tsv.open(args.ensp_map_path) as f:
        for ensp, enst in tsv.lines(f, (str, str)):
            if len(enst) > 0:
                trs_map[enst] = ensp

    with tsv.open(args.uniprot_map_path) as f:
        for ensp, uniprot_id in tsv.lines(f, (str, str)):
            if len(uniprot_id) > 0:
                uniprot_map[uniprot_id] = ensp

    logger.info("Opening {} ...".format(args.source_path))

    chromosomes = None
    if args.chr is not None:
        chromosomes = [c.strip().upper() for c in args.chr.split(",") if len(c.strip()) > 0]
        logger.info("Selected chromosomes: {}".format(", ".join(chromosomes)))
        chromosomes = set(chromosomes)

    name_pattern = re.compile(r"dbNSFP.+_variant.chr(.+)")

    COLUMNS = [
        "#chr", "pos(1-coor)", "ref", "alt",
        "cds_strand", "genename",
        "Uniprot_id", "Uniprot_aapos",
        "aaref", "aaalt",
        "Ensembl_geneid", "Ensembl_transcriptid", "aapos",
        "SIFT_score",
        "Polyphen2_HVAR_score",
        "MutationAssessor_score",
        "FATHMM_score",
        "MutationTaster_score",
        # "GERP_RS",
        "GERP++_RS",
        # "PhyloP_score"
        "phyloP"
    ]

    tmp_prefix = args.temp_path or tempfile.gettempdir()
    if not os.path.exists(tmp_prefix):
        os.makedirs(tmp_prefix)
    if tmp_prefix[-1] != "/":
        tmp_prefix += "/"

    extract_path = tempfile.mkdtemp(prefix=tmp_prefix)

    try:
        logger.info("Output: {}".format(args.out_path if args.out_path != "-" else "standard output"))

        total_start_time = time.time()
        total_lines = 0

        with ZipFile(args.source_path, "r") as zf,\
             tsv.open(args.out_path, "w") as of:

            tsv.write_line(of, "CHR", "STRAND", "START", "REF", "ALT", "TRANSCRIPT",
                           "PROTEIN", "AA_POS", "AA_REF", "AA_ALT",
                           "SIFT", "PPH2", "MA", "FATHMM", "MT", "GERPRS", "PHYLOP")

            entries = []
            for entry in zf.infolist():
                m = name_pattern.match(entry.filename)
                if not m:
                    continue

                chr = m.group(1)
                index = CHR_INDEX[chr] if chr in CHR_INDEX else 99

                if chromosomes is not None and chr not in chromosomes:
                    logger.debug("Skipping chromosome {} ...".format(chr))
                    continue

                entries += [(index, chr, entry)]

            for index, chr, entry in sorted(entries, key=lambda x: x[0]):
                logger.info("Reading chromosome {} ...".format(chr))

                zf.extract(entry, extract_path)
                fpath = os.path.join(extract_path, entry.filename)
                with open(fpath) as f:
                    # Parse header
                    hdr_line = f.readline()
                    hdr = {}
                    for col_index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
                        hdr[name] = col_index

                    columns = [hdr[name] if name in hdr else None for name in COLUMNS]

                    start_time = time.time()
                    partial_start_time = start_time
                    for line_num, line in enumerate(f, start=2):
                        fields = line.rstrip("\n").split("\t")

                        try:
                            fields = [fields[i] if i is not None and i < len(fields) else None for i in columns]

                            (chr, start, ref, alt, strand, symbol,
                             uniprot, uniprot_aapos, aa_ref, aa_alt,
                             gene, transcript, aapos,
                             sift, pph2, ma, fathmm, mt, gerprs, phylop) = fields

                            start = safe_int(start)
                            ref = ref.upper() if ref is not None else None
                            alt = alt.upper() if alt is not None else None
                            aa_ref = aa_ref.upper() if aa_ref is not None else None
                            aa_alt = aa_alt.upper() if aa_alt is not None else None
                            sift = safe_float(sift)
                            ma = safe_float(ma)
                            fathmm = safe_float(fathmm)
                            mt = safe_float(mt)
                            gerprs = safe_float(gerprs)
                            phylop = safe_float(phylop)

                            if start is None or ref is None or alt is None:
                                logger.warn("None value for pos or ref or alt at line {}: {}".format(line_num, fields))
                                continue
                            elif ref not in BASE_INDEX or alt not in BASE_INDEX:
                                logger.warn("Unknown ref or alt at line {}: {}".format(line_num, fields))
                                continue
                            elif len(ref) != 1 or len(alt) != 1:
                                logger.warn("Length != 1 for ref or alt at line {}: {}".format(line_num, fields))
                                continue
                            elif transcript is None or aapos is None or uniprot is None or uniprot_aapos is None:
                                logger.warn("None value for transcript or aapos or uniprot or uniprot_aapos"
                                            " at line {}: {}".format(line_num, fields))
                                continue

                            if aa_ref not in AA_INDEX:
                                aa_ref = None
                            if aa_alt not in AA_INDEX:
                                aa_alt = None

                            trs_values = transcript.split(";")

                            # Pad value lists that are shorter than the transcript list
                            aapos_values = [safe_int(v) for v in aapos.split(";")]
                            l = len(trs_values) - len(aapos_values)
                            if l > 0:
                                aapos_values += [aapos_values[-1]] * l

                            uniprot_values = uniprot.split(";")
                            uniprot_aapos_values = [safe_int(v) for v in uniprot_aapos.split(";")]
                            l = len(uniprot_values) - len(uniprot_aapos_values)
                            if l > 0:
                                uniprot_aapos_values += [uniprot_aapos_values[-1]] * l

                            pph2_values = [safe_float(v) for v in pph2.split(";")] if pph2 is not None else [None]
                            l = len(uniprot_values) - len(pph2_values)
                            if l > 0:
                                pph2_values += [pph2_values[-1]] * l

                            uniprot_index = {}
                            for i, id in enumerate(uniprot_values):
                                if uniprot_aapos_values[i] is not None:
                                    uniprot_index[uniprot_aapos_values[i]] = i

                            for i, trs in enumerate(trs_values):
                                pos = aapos_values[i]
                                if pos is not None and pos < 0:
                                    pos = None

                                if pos is not None and pos in uniprot_index:
                                    j = uniprot_index[pos]
                                    uniprot_value = uniprot_values[j]
                                    pph2_value = pph2_values[j]
                                else:
                                    uniprot_value = pph2_value = None

                                if trs in trs_map:
                                    prot_id = trs_map[trs]
                                elif uniprot_value in uniprot_map:
                                    prot_id = uniprot_map[uniprot_value]
                                else:
                                    logger.warn("Couldn't map protein {} nor transcript {} at line {}: {}".format(
                                        uniprot_value, trs, line_num, "|".join([str(v) for v in fields])))
                                    continue

                                if pph2_value is not None and (pph2_value < 0.0 or pph2_value > 1.0):
                                    logger.warn("PPH2 score {} out of range at line {}: {}".format(
                                        pph2_value, line_num, fields))
                                    continue

                                if aa_alt == "X":  # fix stop codons having a sift score
                                    sift = None

                                if args.skip_empty_scores and sift is None and pph2_value is None and ma is None \
                                        and mt is None and gerprs is None and phylop is None:
                                    continue

                                if pos is not None and aa_ref is not None and aa_alt is not None:
                                    tsv.write_line(of, chr, strand, start, ref, alt, trs, prot_id,
                                                   pos, aa_ref, aa_alt,
                                                   sift, pph2_value, ma, fathmm, mt, gerprs, phylop)
                        except KeyboardInterrupt:
                            raise
                        except:
                            logger.warn("Malformed line {}: {}".format(line_num, "|".join([str(v) for v in fields])))
                            raise

                        partial_time = time.time() - partial_start_time
                        if partial_time >= 5.0:
                            partial_start_time = time.time()
                            elapsed_time = time.time() - start_time
                            logger.debug("  {} lines, {:.1f} lines/second".format(
                                hsize(line_num - 1), (line_num - 1) / float(elapsed_time)))

                    total_lines += line_num

                    logger.info("  > {} lines, {:.1f} lines/second".format(
                        hsize(line_num), line_num / float(time.time() - start_time)))
                    logger.info("  >> {} lines, {:.1f} lines/second".format(
                        hsize(total_lines), total_lines / float(time.time() - total_start_time)))

                os.remove(fpath)

        total_elapsed_time = timedelta(seconds=time.time() - total_start_time)
        logger.info("Finished successfully. Elapsed time: {}".format(total_elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        shutil.rmtree(extract_path)

    return 0
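# safe_int()/safe_float() are used above but defined elsewhere in the package;
# dbNSFP encodes missing values as "." or empty fields, so a minimal sketch of
# the assumed behaviour is a cast that returns None instead of raising:
def safe_int(text):
    try:
        return int(text)
    except (TypeError, ValueError):
        return None

def safe_float(text):
    try:
        return float(text)
    except (TypeError, ValueError):
        return None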
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance statistics")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("tree_path", metavar="TREE_PATH",
                        help="The groups descendant tree")

    parser.add_argument("root_group", metavar="ROOT_GROUP",
                        help="Tree root group")

    parser.add_argument("group_genes_path", metavar="GROUP_FEATS_PATH",
                        help="Map between groups and features")

    parser.add_argument("stats_path", metavar="STATS_PATH",
                        help="Partial feature statistics")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="Output feature statistics")

    parser.add_argument("--tsv", dest="tsv_path", metavar="PATH",
                        help="Store baseline tolerance in tsv format too.")

    parser.add_argument("-c", "--count-threshold", dest="count_threshold", metavar="N", type=int,
                        default=DEFAULT_COUNT_THRESHOLD,
                        help="Minimum number of features per group")

    parser.add_argument("--stdev-threshold", dest="stdev_threshold", metavar="V", type=float,
                        default=DEFAULT_STDEV_THRESHOLD,
                        help="Skip feature statistics with a standard deviation less than V"
                             " (it will be calculated at the level of groups)")

    args, logger = cmd.parse_args("blt-groups")

    logger.info("Loading groups tree ...")

    tree = Tree()
    with tsv.open(args.tree_path) as f:
        for group, children in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            tree.add_node(group, children)

    logger.info("  Nodes: {}".format(tree.node_count))

    logger.info("Loading mappings between groups and features ...")

    all_groups = set()
    all_features = set()
    with tsv.open(args.group_genes_path) as f:
        for group, features in tsv.lines(f, (str, lambda v: set(v.split(",")))):
            tree.add_node(group, features)
            all_groups.add(group)
            all_features.update(features)

    logger.info("  Nodes: {}".format(tree.node_count))
    logger.info("  Groups: {}".format(len(all_groups)))
    logger.info("  Features: {}".format(len(all_features)))

    logger.info("Loading partial statistics ...")

    with tsv.open(args.stats_path) as f:
        predictors = f.readline().rstrip("\n").split("\t")[1:]
        num_predictors = len(predictors)
        num_features = 0
        for line in f:
            try:
                fields = line.rstrip("\n").split("\t")
                feature = fields[0]
                node = tree.get_or_create_node(feature)
                for p, ss in zip(predictors, fields[1:]):
                    try:
                        # "s0/s1/s2" triplet: count / sum / sum of squares
                        s0, s1, s2 = [float(v) if i > 0 else int(v) for i, v in enumerate(ss.split("/"))]
                        node.set_pblt(p, PartialBLT(s0, s1, s2, sources=set([feature])))
                    except:
                        logger.warn("Failed to parse partial baseline tolerance"
                                    " for {}/{} from {}".format(feature, p, ss))
                        continue
                num_features += 1
            except:
                logger.warn("Failed to parse partial baseline tolerance"
                            " for {} from {}".format(feature, line))
                continue

    logger.info("  Nodes: {}".format(tree.node_count))
    logger.info("  Features: {}".format(num_features))
    logger.info("  Predictors: {}".format(", ".join(predictors)))

    logger.info("Calculating baseline tolerance ...")

    for predictor in predictors:
        logger.info("For {} ...".format(predictor))

        calculate_blt(
            parent=None,
            node=tree.get_or_create_node(args.root_group),
            predictor=predictor,
            count_threshold=args.count_threshold,
            stdev_threshold=args.stdev_threshold,
            logger=logger)

    # TODO log summary info

    logger.info("Writing results into {} ...".format(os.path.basename(args.out_path)))

    if args.tsv_path is not None:
        with tsv.open(args.tsv_path, "w") as of:
            tsv.write_line(of, "FEATURE", *predictors)
            for feature in all_features:
                sb = [feature]
                node = tree.get_node(feature)
                predictors_with_blt = 0
                for predictor in predictors:
                    blt = node.get_blt(predictor)
                    if blt is None or blt.n < args.count_threshold:
                        sb += ["/".join(["-"] * 5)]
                        continue

                    predictors_with_blt += 1
                    sb += ["/".join(map(str, [blt.from_node, blt.scope, blt.n, blt.mean, blt.stdev]))]

                if predictors_with_blt > 0:
                    tsv.write_line(of, *sb)

    with tsv.open(args.out_path, "w") as of:
        tree_blt = {}
        for node_name, node in tree.nodes.items():
            predictors_blt = {}
            for predictor in predictors:
                pred_blt = node.get_blt(predictor)
                if pred_blt is None or pred_blt.n < args.count_threshold:
                    continue

                predictors_blt[predictor] = dict(
                    from_node=pred_blt.from_node,
                    scope=pred_blt.scope,
                    N=pred_blt.n,
                    mean=pred_blt.mean,
                    stdev=pred_blt.stdev)

            if len(predictors_blt) > 0:
                tree_blt[node.name] = predictors_blt

        doc = dict(
            created=str(datetime.now()),
            predictors=predictors,
            count_threshold=args.count_threshold,
            stdev_threshold=args.stdev_threshold,
            tree=None,  # TODO tree relations
            features=list(all_features),
            pblt=None,  # TODO partial statistics
            blt=tree_blt)

        json.dump(doc, of, indent=True)

    return 0