def extract_snvs(fanns_db, data_path, logger=None):
    """Collect the missense protein changes found in a mutations file.

    Rows whose "Mutation Description" is not "Substitution - Missense" are
    ignored.  Every remaining row is resolved through ``get_transcripts`` and
    each returned record is grouped by its
    ``(protein, aa_pos, aa_ref, aa_alt)`` tuple.

    :param fanns_db: database handle, passed through to ``get_transcripts``.
    :param data_path: path of the tab separated mutations file.
    :param logger: logger to report to; defaults to the "perf-cosmic" logger.
    :return: dict mapping the protein change tuple to a dict with the keys
             ``transcript``, ``symbol``, ``msamples`` and ``wsamples``.  The
             last two hold the number of distinct sample ids seen with a
             "Genome-wide screen" value different from / equal to "y".
    """
    if not logger:
        logger = logging.getLogger("perf-cosmic")

    wanted_columns = [
        "Genome-wide screen",
        "Mutation Description",
        "Mutation CDS",
        "Mutation AA",
        "Mutation GRCh37 genome position",
        "Mutation GRCh37 strand",
        "Accession Number",
        "ID_sample"]
    key_names = ["protein", "aa_pos", "aa_ref", "aa_alt"]

    snvs = {}

    logger.info("Reading mutations ...")

    progress = RatedProgress(logger, name="mutations")

    with tsv.open(data_path, "r") as df:
        total_rows = queried_rows = dbfound_rows = 0
        for record in tsv.rows(df, columns=wanted_columns, header=True):
            total_rows += 1
            (wide_screen, mut_desc, mut_cds, mut_aa,
             mut_pos, mut_strand, acc, sample_id) = record

            # Rows are not filtered by genome-wide screen; that flag only
            # decides which sample set is updated below.
            if mut_desc != "Substitution - Missense":
                continue

            queried_rows += 1
            # Each sample id goes into exactly one of the two sets.
            bucket = "wsamples" if wide_screen == "y" else "msamples"

            matches = get_transcripts(
                fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger)
            for row in matches:
                dbfound_rows += 1
                change = tuple([row[name] for name in key_names])
                entry = snvs.get(change)
                if entry is None:
                    entry = snvs[change] = {
                        "transcript": row["transcript"],
                        "symbol": row["xrefs"]["symbol"],
                        "msamples": set(),
                        "wsamples": set()}
                entry[bucket].add(sample_id)

            progress.update()

        progress.log_totals()

    logger.info("Counting the number of samples per mutation ...")

    # Replace each set of sample ids by its size.
    for entry in snvs.itervalues():
        for bucket in ("msamples", "wsamples"):
            entry[bucket] = len(entry[bucket])

    logger.info("Total: total_rows={}, queried_rows={}, found_rows={}, protein_changes={}".format(total_rows, queried_rows, dbfound_rows, len(snvs)))

    return snvs
def load_events(f, column_indices, predictors, transforms, stats, logger):
    """Accumulate per-predictor score histograms from an events file.

    Each row read from ``f`` is classified through ``EVENT_TYPES`` using its
    "ID" column.  A row whose position columns (``POS_HEADERS``) equal those
    of the previous row *of the same event type* is skipped.  For every other
    row, each predictor score is transformed, binned and counted in the
    ``dp`` (``HIGH_REC_EVENT``) or ``dn`` (``NON_REC_EVENT``) histogram of
    that predictor.  ``vmin`` / ``vmax`` in ``stats`` are updated with the
    observed score extremes.

    :param f: open file handed to ``tsv.rows``.
    :param column_indices: mapping from column name to field index.
    :param predictors: sequence of predictor column names.
    :param transforms: mapping predictor -> iterable of ``(expr, func)``
                       pairs; the ``func``s are applied to the score in order.
    :param stats: mapping predictor -> dict with the keys ``rmin``, ``rmax``,
                  ``dim``, ``vmin``, ``vmax``, ``size``, ``dp`` and ``dn``.
                  Predictors missing from ``stats`` are ignored.
    :param logger: logger used to report failing transformations.
    :return: dict with the number of counted ``high_recurrent`` and
             ``non_recurrent`` events.
    :raises Exception: when a row has an ID that is not in ``EVENT_TYPES``.
    """
    count = [0, 0]
    # One slot per event type.  Slots are rebound (never mutated in place).
    last_pos = [[None] * 4, [None] * 4]

    id_index = column_indices["ID"]
    pos_indices = [column_indices[name] for name in POS_HEADERS]
    # Loop invariant: resolve the predictor columns once, not once per row.
    pred_indices = [column_indices[p] for p in predictors]

    for fields in tsv.rows(f):
        try:
            event_type = EVENT_TYPES[fields[id_index]]
        except KeyError:
            raise Exception("Unknown event type: {}".format(fields[id_index]))

        current_pos = [fields[i] for i in pos_indices]
        if last_pos[event_type] == current_pos:
            continue  # consecutive duplicate of the same event type

        last_pos[event_type] = current_pos
        count[event_type] += 1

        scores = [score_value(fields[i]) for i in pred_indices]

        for predictor, score in zip(predictors, scores):
            if score is None or predictor not in stats:
                continue

            if predictor in transforms:
                for expr, func in transforms[predictor]:
                    try:
                        score = func(score)
                    except Exception:
                        # Best effort: report and keep the score as it was
                        # before this transformation.  Only Exception is
                        # caught so Ctrl-C / SystemExit still propagate.
                        logger.error("Error applying transformation {} to score {}".format(expr, score))

            predictor_stats = stats[predictor]
            rmin = predictor_stats["rmin"]
            rmax = predictor_stats["rmax"]
            dim = predictor_stats["dim"]
            size = predictor_stats["size"]
            dp = predictor_stats["dp"]
            dn = predictor_stats["dn"]

            if score < rmax:
                r = (score - rmin) / dim
                # Clamp: a score below rmin would give a negative index,
                # which Python resolves from the END of the list (wrong
                # bin); rounding could also yield ``size``.
                index = min(max(int(r * size), 0), size - 1)
            else:
                index = size - 1

            vmin = predictor_stats["vmin"]
            if vmin is None or score < vmin:
                predictor_stats["vmin"] = score

            vmax = predictor_stats["vmax"]
            if vmax is None or score > vmax:
                predictor_stats["vmax"] = score

            if event_type == HIGH_REC_EVENT:
                dp[index] += 1
            elif event_type == NON_REC_EVENT:
                dn[index] += 1

    return {
        "high_recurrent": count[HIGH_REC_EVENT],
        "non_recurrent": count[NON_REC_EVENT]}
def load_events(f, column_indices, predictors, stats, logger):
    """Accumulate score histograms and per-bin confusion counts.

    Each row read from ``f`` is classified through ``EVENT_TYPES`` using its
    "ID" column.  A row whose position columns (``POS_COLUMNS``) equal those
    of the previous row *of the same event type* is skipped.  For every other
    row, each predictor score is optionally transformed through
    ``PREDICTOR_TRANSFORM`` and binned, and then:

    * ``POS_EVENT``: ``dp[bin]`` is incremented, ``tp`` is incremented for
      every bin below ``bin`` and ``fn`` for every bin from ``bin`` upwards;
    * ``NEG_EVENT``: ``dn[bin]`` is incremented, ``fp`` is incremented for
      every bin below ``bin`` and ``tn`` for every bin from ``bin`` upwards.

    ``vmin`` / ``vmax`` in ``stats`` are updated with the observed extremes.

    :param f: open file handed to ``tsv.rows``.
    :param column_indices: mapping from column name to field index.
    :param predictors: sequence of predictor column names.
    :param stats: mapping predictor -> dict with the keys ``rmin``, ``rmax``,
                  ``dim``, ``vmin``, ``vmax``, ``size``, ``dp``, ``dn``,
                  ``tp``, ``tn``, ``fp`` and ``fn``.  Predictors missing from
                  ``stats`` are ignored.
    :param logger: not used by this function; kept for interface
                   compatibility.
    :return: tuple ``(positive_count, negative_count)`` of counted events.
    :raises Exception: when a row has an ID that is not in ``EVENT_TYPES``.
    """
    count = [0, 0]
    # One slot per event type.  Slots are rebound (never mutated in place).
    last_pos = [[None] * 4, [None] * 4]

    id_index = column_indices["ID"]
    pos_indices = [column_indices[name] for name in POS_COLUMNS]
    pred_indices = [column_indices[p] for p in predictors]

    for fields in tsv.rows(f):
        try:
            event_type = EVENT_TYPES[fields[id_index]]
        except KeyError:
            raise Exception("Unknown event type: {}".format(fields[id_index]))

        current_pos = [fields[i] for i in pos_indices]
        if last_pos[event_type] == current_pos:
            continue  # consecutive duplicate of the same event type

        last_pos[event_type] = current_pos
        count[event_type] += 1

        scores = [score_value(fields[pi]) for pi in pred_indices]

        for predictor, score in zip(predictors, scores):
            if score is None or predictor not in stats:
                continue

            if predictor in PREDICTOR_TRANSFORM:
                score = PREDICTOR_TRANSFORM[predictor](score)

            predictor_stats = stats[predictor]
            rmin = predictor_stats["rmin"]
            rmax = predictor_stats["rmax"]
            dim = predictor_stats["dim"]
            size = predictor_stats["size"]
            dp = predictor_stats["dp"]
            dn = predictor_stats["dn"]
            tp = predictor_stats["tp"]
            tn = predictor_stats["tn"]
            fp = predictor_stats["fp"]
            fn = predictor_stats["fn"]

            if score < rmax:
                r = (score - rmin) / dim
                # Clamp: a score below rmin would give a negative index.
                # That selects a bin from the END of the lists and makes
                # xrange(index, size) visit wrapped-around bins twice.
                index = min(max(int(r * size), 0), size - 1)
            else:
                index = size - 1

            vmin = predictor_stats["vmin"]
            if vmin is None or score < vmin:
                predictor_stats["vmin"] = score

            vmax = predictor_stats["vmax"]
            if vmax is None or score > vmax:
                predictor_stats["vmax"] = score

            if event_type == POS_EVENT:
                dp[index] += 1
                for i in xrange(0, index):
                    tp[i] += 1
                for i in xrange(index, size):
                    fn[i] += 1
            elif event_type == NEG_EVENT:
                dn[index] += 1
                for i in xrange(0, index):
                    fp[i] += 1
                for i in xrange(index, size):
                    tn[i] += 1

    return count[POS_EVENT], count[NEG_EVENT]
def main():
    """Keep, for every key, only the data line with the longest transcript.

    Command line arguments:

    * ``len_path``: tsv with two columns, transcript name and length.
    * ``data_path``: tsv data file with a header; must contain a
      ``TRANSCRIPT`` column and the key columns.
    * ``out_path``: output file (the header is copied verbatim).
    * ``-k/--key``: comma separated list of columns forming the key.

    Lines sharing a key must be consecutive in the data file: a run of equal
    keys is reduced to the line whose transcript has the greatest length
    (the first one wins on ties; transcripts without a known length count
    as 0).

    :return: 0
    """
    parser = argparse.ArgumentParser(
        description="Filter for the longest transcript")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("len_path", metavar="PATH",
                        help="The tsv containing the transcripts length")

    parser.add_argument("data_path", metavar="PATH",
                        help="The data file")

    parser.add_argument("out_path", metavar="PATH",
                        help="Output file. Use - for standard output.")

    parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
                        help="List of columns that conforms the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

    args, logger = cmd.parse_args("filter-transcript")

    try:
        logger.info("Loading transcripts length ...")

        # Lengths are stored as integers so they compare numerically below.
        # NOTE(review): assumes tsv.rows yields the raw text fields; as
        # strings, "900" > "1000" would pick the wrong transcript.
        trslen = {}
        bad_lengths = 0
        with tsv.open(args.len_path) as f:
            for name, length in tsv.rows(f):
                try:
                    trslen[name] = int(length)
                except (TypeError, ValueError):
                    # e.g. a header row; the transcript keeps the default 0
                    bad_lengths += 1
        if bad_lengths:
            logger.info("Ignored {} rows without an integer length".format(bad_lengths))

        logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

        total_count = filter_count = 0

        progress = RatedProgress(logger, name="mutations")

        key_columns = args.key.split(",")
        with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
            hdr_line = df.readline()
            of.write(hdr_line)
            _, hdr = tsv.header_from_line(hdr_line)
            key_indices = [hdr[name] for name in key_columns]
            trs_index = hdr["TRANSCRIPT"]

            last_key = None
            longest_len, longest_line = 0, ""

            for line in df:
                total_count += 1

                fields = line.rstrip("\n").split("\t")
                key = tuple([fields[index] for index in key_indices])
                # .get() so unknown transcripts do not grow the table
                tl = trslen.get(fields[trs_index], 0)

                if last_key != key:
                    # A new run of keys starts: flush the winner of the
                    # previous run (if any) and restart with this line.
                    if last_key is not None:
                        of.write(longest_line)
                        filter_count += 1
                    longest_len, longest_line = tl, line
                    last_key = key
                elif tl > longest_len:
                    longest_len, longest_line = tl, line

                progress.update()

            # Flush the last run; nothing to flush (or count) when the data
            # file had no rows at all.
            if last_key is not None:
                of.write(longest_line)
                filter_count += 1

        progress.log_totals()

        logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
            total_count, filter_count, total_count - filter_count, progress.elapsed_time))
    except:
        # Top-level boundary: every failure is delegated to the command
        # helper.  Kept as a bare except on purpose so whatever the helper
        # handled before is still handled.
        cmd.handle_error()

    return 0
def main():
    """Generate the Cosmic based datasets used to evaluate performance.

    Command line arguments (besides the database ones added by the helper):

    * ``data_path``: the CosmicMutantExport tsv file.
    * ``cgc_path``: text file with one CGC gene per line.
    * ``drivers_path``: tab separated file whose first column is a driver
      identifier (compared against the transcript of each mutation).
    * ``-o``: output prefix, "cosmic-" when not given.

    Missense substitutions are resolved through ``get_transcripts`` and
    grouped by ``(protein, aa_pos, aa_ref, aa_alt)`` together with the set of
    sample ids they were seen in.  Seven datasets are then written:

    * ``1`` / ``2`` / ``4``: seen in exactly 1, at least 2, at least 4 samples;
    * ``CGC``: the symbol is a CGC gene; ``noCGC``: it is not and the
      mutation was seen in exactly one sample;
    * ``D``: the transcript is in the drivers list; ``O``: it is not and the
      mutation was seen in exactly one sample.

    :return: 0
    """
    parser = argparse.ArgumentParser(
        description="Generate datasets needed to evaluate performance from Cosmic mutations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("data_path", metavar="PATH",
                        help="The CosmicMutantExport tsv file")

    parser.add_argument("cgc_path", metavar="PATH",
                        help="The list of CGC genes")

    parser.add_argument("drivers_path", metavar="PATH",
                        help="The list of CHASM drivers (drivers.tmps)")

    parser.add_argument("-o", dest="prefix", metavar="PREFIX",
                        help="Output prefix.")

    args, logger = cmd.parse_args("perf-cosmic")

    prefix = args.prefix or "cosmic-"

    fanns_db = cmd.open_db()

    try:
        snvs = dict()

        logger.info("Counting the number of samples per mutation ...")

        with tsv.open(args.data_path, "r") as df:
            # NOTE: the "Genome-wide screen" column is deliberately not read;
            # rows are only filtered by their mutation description.
            columns = [
                "Mutation Description",
                "Mutation CDS",
                "Mutation AA",
                "Mutation GRCh37 genome position",
                "Mutation GRCh37 strand",
                "Accession Number",
                "ID_sample"]

            key_names = ["protein", "aa_pos", "aa_ref", "aa_alt"]

            total_rows = queried_rows = 0
            for fields in tsv.rows(df, columns=columns, header=True):
                total_rows += 1
                mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                if mut_desc != "Substitution - Missense":
                    continue

                queried_rows += 1
                for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
                    k = tuple([row[name] for name in key_names])
                    if k not in snvs:
                        snvs[k] = dict(
                            transcript=row["transcript"],
                            symbol=row["xrefs"]["symbol"],
                            samples=set([sample_id]))
                    else:
                        snvs[k]["samples"].add(sample_id)

        logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(total_rows, queried_rows, len(snvs)))

        logger.info("Loading CGC genes ...")

        cgc_genes = set()
        with open(args.cgc_path, "r") as f:
            for line in f:
                cgc_genes.add(line.rstrip("\n"))

        logger.info("Loading CHASM drivers ...")

        drivers = set()
        with open(args.drivers_path, "r") as f:
            for line in f:
                drivers.add(line.rstrip("\n").split("\t")[0])

        logger.info("Creating datasets ...")

        progress = RatedProgress(logger, name="mutations")

        with Dataset(prefix + "1") as rec1,\
                Dataset(prefix + "2") as rec2,\
                Dataset(prefix + "4") as rec4,\
                Dataset(prefix + "CGC") as cgc,\
                Dataset(prefix + "noCGC") as nocgc,\
                Dataset(prefix + "D") as drv,\
                Dataset(prefix + "O") as nodrv:

            for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
                num_samples = len(snv["samples"])
                line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])

                if num_samples == 1:
                    rec1.write(line)
                if num_samples >= 2:
                    rec2.write(line)
                if num_samples >= 4:
                    rec4.write(line)

                # The symbol may be missing, a single name or a collection of
                # names.  A single name must be looked up as a whole: feeding
                # a string to set() would compare its individual characters
                # against the gene names.
                symbol = snv["symbol"]
                if symbol is None:
                    in_cgc = False
                elif isinstance(symbol, basestring):
                    in_cgc = symbol in cgc_genes
                else:
                    in_cgc = any(name in cgc_genes for name in symbol)

                if in_cgc:
                    cgc.write(line)
                elif num_samples == 1:
                    nocgc.write(line)

                if snv["transcript"] in drivers:
                    drv.write(line)
                elif num_samples == 1:
                    nodrv.write(line)

                progress.update()

            progress.log_totals()

            logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
                rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))

    except:
        # Top-level boundary: every failure is delegated to the command
        # helper.  Kept as a bare except on purpose so whatever the helper
        # handled before is still handled.
        cmd.handle_error()

    return 0