def load_events(f, column_indices, predictors, transforms, stats, logger):
    """Scan an events TSV stream and accumulate per-predictor score histograms.

    For each non-duplicated (event_type, position) row, every selected
    predictor's score is optionally transformed, then binned into that
    predictor's ``dp`` (high-recurrent) or ``dn`` (non-recurrent) histogram
    inside ``stats``; observed ``vmin``/``vmax`` are updated in place.

    :param f: open TSV input accepted by ``tsv.rows``.
    :param column_indices: dict mapping header names to column positions;
        must contain "ID", every name in ``POS_HEADERS`` and each predictor.
    :param predictors: predictor names whose score columns are read.
    :param transforms: dict predictor -> list of (expr, callable) applied in
        order to the raw score.
    :param stats: dict predictor -> mutable stats dict with keys
        rmin, rmax, dim, vmin, vmax, size, dp, dn. Mutated in place.
    :param logger: logger used to report transformation failures.
    :returns: dict with "high_recurrent" and "non_recurrent" event counts.
    :raises Exception: when a row's ID is not a known event type.
    """
    count = [0, 0]
    # One independent "last seen position" per event type. A comprehension is
    # used because [[None]*4]*2 would make both entries alias one inner list.
    last_pos = [[None] * 4 for _ in range(2)]
    id_index = column_indices["ID"]
    pos_indices = [column_indices[name] for name in POS_HEADERS]

    for fields in tsv.rows(f):
        try:
            event_type = EVENT_TYPES[fields[id_index]]
        except KeyError:
            raise Exception("Unknown event type: {}".format(fields[id_index]))

        current_pos = [fields[i] for i in pos_indices]
        # Skip consecutive duplicates of the same position for this event type
        if last_pos[event_type] == current_pos:
            continue
        last_pos[event_type] = current_pos

        count[event_type] += 1
        protein, pos, aa_ref, aa_alt = current_pos
        scores = [score_value(fields[column_indices[p]]) for p in predictors]

        for predictor, score in zip(predictors, scores):
            if score is None or predictor not in stats:
                continue

            if predictor in transforms:
                for expr, func in transforms[predictor]:
                    try:
                        score = func(score)
                    except Exception:
                        # Best-effort: report and keep the pre-transform score
                        # (original behavior — the row is not discarded).
                        logger.error("Error applying transformation {} to score {}".format(expr, score))

            predictor_stats = stats[predictor]
            (rmin, rmax, dim, vmin, vmax, size, dp, dn) = [
                predictor_stats[k] for k in [
                    "rmin", "rmax", "dim", "vmin", "vmax", "size", "dp", "dn"]]

            # Map score into [0, size) bins over the [rmin, rmax) range;
            # anything at or above rmax is clamped into the last bin.
            r = (score - rmin) / dim
            index = int(r * size) if score < rmax else size - 1

            if vmin is None or score < vmin:
                predictor_stats["vmin"] = score
            if vmax is None or score > vmax:
                predictor_stats["vmax"] = score

            if event_type == HIGH_REC_EVENT:
                dp[index] += 1
            elif event_type == NON_REC_EVENT:
                dn[index] += 1

    return {
        "high_recurrent": count[HIGH_REC_EVENT],
        "non_recurrent": count[NON_REC_EVENT]}
def main():
    """CLI entry point: stream a scores file and update SNV scores in the DB.

    Reads a TSV (path or stdin), matches its header against the database's
    predictors and either genomic or protein coordinate columns, then updates
    the score of every matching SNV row. Returns 0 on success; on failure the
    command helper's error handler decides the exit status.
    """
    parser = argparse.ArgumentParser(
        description="Update scores in the database")

    cmd = DefaultCommandHelper(parser)
    cmd.add_db_args()
    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")
    cmd.add_selected_predictors_args()
    parser.add_argument("--update-predictors", dest="update_predictors",
                        action="store_true", default=False,
                        help="Update of the predictors.")
    parser.add_argument("--ignore-errors", dest="ignore_errors",
                        action="store_true", default=False,
                        help="When errors on the input file, report them but continue processing the input.")

    args, logger = cmd.parse_args("update")
    db = cmd.open_db()
    predictors = cmd.get_selected_predictors(check_missing=False)

    try:
        progress = RatedProgress(logger, name="SNVs")

        logger.info("Reading {} ...".format(
            args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse header: column name -> index
            hdr_line = f.readline()
            hdr = dict([(name, index) for index, name in enumerate(
                hdr_line.rstrip("\n").split("\t"))])

            db_predictors = set([p["id"] for p in db.predictors()])

            # No predictors selected: take every header that the DB knows about
            if len(predictors) == 0:
                predictors = [name for name in hdr if name in db_predictors]
                if len(predictors) == 0:
                    raise Exception("None of the input file headers match the available predictors in the database. Please specify them using -p.")

            logger.info("Predictors: {}".format(", ".join(predictors)))

            for predictor in filter(lambda p: p not in db_predictors, predictors):
                logger.info("Creating predictor {} ...".format(predictor))
                db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
            use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

            if not use_genome_coords and not use_protein_coords:
                raise Exception("No coordinate columns found. "
                                "Use {} for genomic coordinates or {} for protein coordinates.".format(
                                    GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
            elif use_genome_coords and use_protein_coords:
                logger.warn("Both, genomic and protein coordinates columns found. Using genomic coordinates by default")

            if use_genome_coords:
                coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
                coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
            elif use_protein_coords:
                coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
                coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]

            coord_column_indices = [hdr[n] for n in coord_column_names]
            score_indices = [hdr[n] for n in predictors]
            max_column_index = max(coord_column_indices + score_indices)

            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                # fields[max_column_index] needs max_column_index + 1 columns,
                # so <= is the correct bound (the original < was off by one).
                if len(fields) <= max_column_index:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        # A bare `raise` here has no active exception; raise
                        # an explicit error instead.
                        raise Exception("Missing columns for line {}".format(line_num))
                    continue  # short row cannot be parsed; skip it

                try:
                    coords = dict([(name.lower(), type_cast(fields[index]))
                                   for name, type_cast, index in zip(
                                       coord_column_names, coord_column_types, coord_column_indices)])
                    scores = dict([(p, score_value(fields[i]))
                                   for p, i in zip(predictors, score_indices)])
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise
                    continue  # don't fall through with stale/undefined coords

                try:
                    for row in db.query_scores(fields=[], **coords):
                        db.update_scores(row["id"], scores)
                except Exception as ex:
                    logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
                    logger.error("SNV: {}".format(", ".join(
                        ["{}={}".format(k, v) for k, v in coords.items()])))
                    if not args.ignore_errors:
                        raise

                progress.update()

        progress.log_totals()

        logger.info("Finalizing database ...")

        if args.update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        # Top-level boundary: delegate *any* failure (including interrupts)
        # to the command helper, which inspects the active exception.
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def load_events(f, column_indices, predictors, stats, logger):
    """Scan an events TSV stream and accumulate per-predictor ROC-style stats.

    For each non-duplicated (event_type, position) row, every predictor's
    score (after any ``PREDICTOR_TRANSFORM``) is binned into ``dp``/``dn``
    and the cumulative tp/fn (positive events) or fp/tn (negative events)
    counters are updated across all thresholds. ``stats`` is mutated in place.

    :param f: open TSV input accepted by ``tsv.rows``.
    :param column_indices: dict mapping header names to column positions;
        must contain "ID", every name in ``POS_COLUMNS`` and each predictor.
    :param predictors: predictor names whose score columns are read.
    :param stats: dict predictor -> mutable stats dict with keys
        rmin, rmax, dim, vmin, vmax, size, dp, dn, tp, tn, fp, fn.
    :param logger: logger (currently unused in the body; kept for interface
        compatibility with callers).
    :returns: (positive_count, negative_count) tuple.
    :raises Exception: when a row's ID is not a known event type.
    """
    count = [0, 0]
    # Independent "last seen position" per event type ([[None]*4]*2 would
    # alias one inner list, so build them with a comprehension).
    last_pos = [[None] * 4 for _ in range(2)]
    id_index = column_indices["ID"]
    pos_indices = [column_indices[name] for name in POS_COLUMNS]
    pred_indices = [column_indices[p] for p in predictors]

    for fields in tsv.rows(f):
        try:
            event_type = EVENT_TYPES[fields[id_index]]
        except KeyError:
            raise Exception("Unknown event type: {}".format(fields[id_index]))

        current_pos = [fields[i] for i in pos_indices]
        # Skip consecutive duplicates of the same position for this event type
        if last_pos[event_type] == current_pos:
            continue
        last_pos[event_type] = current_pos

        count[event_type] += 1
        protein, pos, aa_ref, aa_alt = current_pos
        scores = [score_value(fields[pi]) for pi in pred_indices]

        for predictor, score in zip(predictors, scores):
            if score is None or predictor not in stats:
                continue

            if predictor in PREDICTOR_TRANSFORM:
                score = PREDICTOR_TRANSFORM[predictor](score)

            predictor_stats = stats[predictor]
            (rmin, rmax, dim, vmin, vmax, size,
             dp, dn, tp, tn, fp, fn) = [
                predictor_stats[k] for k in [
                    "rmin", "rmax", "dim", "vmin", "vmax", "size",
                    "dp", "dn", "tp", "tn", "fp", "fn"]]

            # Map score into [0, size) bins over the [rmin, rmax) range;
            # anything at or above rmax is clamped into the last bin.
            r = (score - rmin) / dim
            index = int(r * size) if score < rmax else size - 1

            if vmin is None or score < vmin:
                predictor_stats["vmin"] = score
            if vmax is None or score > vmax:
                predictor_stats["vmax"] = score

            # Cumulative confusion counts: thresholds below the score bin see
            # this event as predicted-positive, the rest as predicted-negative.
            if event_type == POS_EVENT:
                dp[index] += 1
                for i in xrange(index):
                    tp[i] += 1
                for i in xrange(index, size):
                    fn[i] += 1
            elif event_type == NEG_EVENT:
                dn[index] += 1
                for i in xrange(index):
                    fp[i] += 1
                for i in xrange(index, size):
                    tn[i] += 1

    return count[POS_EVENT], count[NEG_EVENT]