def main(args):
    """Train and/or apply a random-forest variant-QC model for the DDD/ELGH/UKBB cohorts.

    Driven by CLI flags: ``--train_rf`` trains and registers a new model run;
    ``--apply_rf`` applies a previously trained run to the training table and
    prints a grouped confusion summary.
    """
    print("main")
    # Per-variant annotation table used as RF input for training.
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht'
    )
    if args.train_rf:
        # Draw an 8-char run id from a UUID4, retrying until it is unused.
        run_hash = str(uuid.uuid4())[:8]
        rf_runs = get_rf_runs(f'{tmp_dir}/ddd-elgh-ukbb/')
        while run_hash in rf_runs:
            run_hash = str(uuid.uuid4())[:8]
        ht_result, rf_model = train_rf(ht, args)
        print("Writing out ht_training data")
        # Checkpoint so downstream global reads (features_importance, test_results)
        # don't recompute the training pipeline.
        ht_result = ht_result.checkpoint(
            get_rf(data="training", run_hash=run_hash).path, overwrite=True)
        # f'{tmp_dir}/ddd-elgh-ukbb/Sanger_RF_training_data.ht', overwrite=True)
        rf_runs[run_hash] = get_run_data(
            vqsr_training=False,
            transmitted_singletons=True,
            test_intervals=args.test_intervals,
            adj=False,
            features_importance=hl.eval(ht_result.features_importance),
            test_results=hl.eval(ht_result.test_results),
        )
        # NOTE(review): runs are loaded from a tmp_dir directory above but the
        # registry is written to a plot_dir JSON here — confirm both refer to
        # the same rf_runs store, otherwise saved runs are never reloaded and
        # run-id collision checking is ineffective.
        with hl.hadoop_open(
                f'{plot_dir}/ddd-elgh-ukbb/variant_qc/rf_runs.json', "w") as f:
            json.dump(rf_runs, f)
        logger.info("Saving RF model")
        save_model(rf_model, get_rf(data="model", run_hash=run_hash), overwrite=True)
        # f'{tmp_dir}/ddd-elgh-ukbb/rf_model.model')
    else:
        # Reuse an existing run id supplied on the command line.
        run_hash = args.run_hash
    if args.apply_rf:
        logger.info(f"Applying RF model {run_hash}...")
        rf_model = load_model(get_rf(data="model", run_hash=run_hash))
        ht = get_rf(data="training", run_hash=run_hash).ht()
        # Feature list is stored as a table global on the training table.
        features = hl.eval(ht.features)
        ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL)
        logger.info("Finished applying RF model")
        ht = ht.annotate_globals(rf_hash=run_hash)
        ht = ht.checkpoint(
            get_rf("rf_result", run_hash=run_hash).path,
            overwrite=True,
        )
        # Sanity summary: row counts per (tp, fp, train, label, prediction) cell.
        ht_summary = ht.group_by(
            "tp", "fp", TRAIN_COL, LABEL_COL,
            PREDICTION_COL).aggregate(n=hl.agg.count())
        ht_summary.show(n=20)
def main(args):
    """Entry point for the variant-QC random-forest workflow.

    Depending on the flags in ``args`` this lists previous RF runs, builds
    the RF annotation table, trains a new model (registering it in the run
    list), and/or applies a trained model and prints a confusion summary.
    """
    hl.init(log="/variant_qc_random_forest.log")

    if args.list_rf_runs:
        logger.info("RF runs:")
        pretty_print_runs(get_rf_runs(rf_run_path()))

    if args.annotate_for_rf:
        annotation_ht = create_rf_ht(
            impute_features=args.impute_features,
            adj=args.adj,
            n_partitions=args.n_partitions,
            checkpoint_path=get_checkpoint_path("rf_annotation"),
        )
        annotation_ht.write(
            get_rf_annotations(args.adj).path,
            overwrite=args.overwrite,
        )
        logger.info(
            "Completed annotation wrangling for random forests model training"
        )

    if args.train_rf:
        rf_runs = get_rf_runs(rf_run_path())

        def _fresh_model_id():
            # 8 hex chars of a UUID4, prefixed so ids are recognisable RF runs.
            return f"rf_{str(uuid.uuid4())[:8]}"

        model_id = _fresh_model_id()
        while model_id in rf_runs:
            model_id = _fresh_model_id()

        training_ht, rf_model = train_rf(
            get_rf_annotations(args.adj).ht(),
            fp_to_tp=args.fp_to_tp,
            num_trees=args.num_trees,
            max_depth=args.max_depth,
            no_transmitted_singletons=args.no_transmitted_singletons,
            no_inbreeding_coeff=args.no_inbreeding_coeff,
            vqsr_training=args.vqsr_training,
            vqsr_model_id=args.vqsr_model_id,
            filter_centromere_telomere=args.filter_centromere_telomere,
            test_intervals=args.test_intervals,
        )
        # Checkpoint before evaluating globals so they are read from disk.
        training_ht = training_ht.checkpoint(
            get_rf_training(model_id=model_id).path,
            overwrite=args.overwrite,
        )

        logger.info("Adding run to RF run list")
        # Transmitted-singleton usage is meaningless for VQSR-based training.
        if args.vqsr_training:
            used_transmitted_singletons = None
        else:
            used_transmitted_singletons = not args.no_transmitted_singletons
        rf_runs[model_id] = get_run_data(
            input_args={
                "transmitted_singletons": used_transmitted_singletons,
                "adj": args.adj,
                "vqsr_training": args.vqsr_training,
                "filter_centromere_telomere": args.filter_centromere_telomere,
            },
            test_intervals=args.test_intervals,
            features_importance=hl.eval(training_ht.features_importance),
            test_results=hl.eval(training_ht.test_results),
        )
        with hl.hadoop_open(rf_run_path(), "w") as run_file:
            json.dump(rf_runs, run_file)

        logger.info("Saving RF model")
        save_model(
            rf_model,
            get_rf_model_path(model_id=model_id),
            overwrite=args.overwrite,
        )
    else:
        model_id = args.model_id

    if args.apply_rf:
        logger.info(f"Applying RF model {model_id}...")
        rf_model = load_model(get_rf_model_path(model_id=model_id))
        result_ht = get_rf_training(model_id=model_id).ht()
        # Feature list is stored as a table global on the training table.
        result_ht = apply_rf_model(
            result_ht,
            rf_model,
            hl.eval(result_ht.features),
            label=LABEL_COL,
        )
        logger.info("Finished applying RF model")
        result_ht = result_ht.annotate_globals(rf_model_id=model_id)
        result_ht = result_ht.checkpoint(
            get_rf_result(model_id=model_id).path,
            overwrite=args.overwrite,
        )
        # Sanity summary: row counts per (tp, fp, train, label, prediction).
        summary = result_ht.group_by(
            "tp", "fp", TRAIN_COL, LABEL_COL, PREDICTION_COL
        ).aggregate(n=hl.agg.count())
        summary.show(n=20)
def main(args):
    """Train, apply, and/or finalize the CHD-UKBB random-forest variant-QC model.

    Flags in ``args`` select the stages: ``--train_rf`` trains and registers a
    new run, ``--apply_rf`` applies an existing run, ``--finalize`` builds the
    final filtered table from the applied results.
    """
    print("importing main table")
    # Per-variant annotation table used as RF input.
    ht = hl.read_table(
        f'{nfs_dir}/hail_data/variant_qc/chd_ukbb.table_for_RF_by_variant_type_all_cols.ht'
    )
    if args.train_rf:
        # Draw an 8-char run id from a UUID4, retrying until it is unused.
        run_hash = str(uuid.uuid4())[:8]
        rf_runs = get_rf_runs(f'{tmp_dir}/rf_runs.json')
        while run_hash in rf_runs:
            run_hash = str(uuid.uuid4())[:8]
        ht_result, rf_model = train_rf(ht, args)
        print("Writing out ht_training data")
        # Checkpoint so the global reads below don't recompute training.
        ht_result = ht_result.checkpoint(
            get_rf(data="training", run_hash=run_hash).path, overwrite=True)
        rf_runs[run_hash] = get_run_data(
            vqsr_training=False,
            transmitted_singletons=True,
            test_intervals=args.test_intervals,
            adj=True,
            features_importance=hl.eval(ht_result.features_importance),
            test_results=hl.eval(ht_result.test_results),
        )
        with hl.hadoop_open(f'{tmp_dir}/rf_runs.json', "w") as f:
            json.dump(rf_runs, f)
        pretty_print_runs(rf_runs)
        logger.info("Saving RF model")
        save_model(rf_model, get_rf(data="model", run_hash=run_hash),
                   overwrite=True)
    else:
        # Reuse an existing run id supplied on the command line.
        run_hash = args.run_hash

    if args.apply_rf:
        logger.info(f"Applying RF model {run_hash}...")
        rf_model = load_model(get_rf(data="model", run_hash=run_hash))
        ht = get_rf(data="training", run_hash=run_hash).ht()
        # Feature list is stored as a table global on the training table.
        features = hl.eval(ht.features)
        ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL)
        logger.info("Finished applying RF model")
        ht = ht.annotate_globals(rf_hash=run_hash)
        ht = ht.checkpoint(
            get_rf("rf_result_chd_ukbb", run_hash=run_hash).path,
            overwrite=True,
        )
        # Sanity summary: row counts per (tp, fp, train, label, prediction).
        ht_summary = ht.group_by(
            "tp", "fp", TRAIN_COL, LABEL_COL,
            PREDICTION_COL).aggregate(n=hl.agg.count())
        ht_summary.show(n=20)

    if args.finalize:
        # TODO: Adjust this step to run on the CHD-UKBB cohort
        run_hash = args.run_hash
        ht = hl.read_table(
            f'{tmp_dir}/variant_qc/models/{run_hash}/rf_result_ac_added.ht')
        freq_ht = hl.read_table(
            f'{tmp_dir}/variant_qc/mt_sampleQC_FILTERED_FREQ_adj.ht')
        freq = freq_ht[ht.key]
        # BUG FIX: bin_ht was referenced below but never created, raising
        # NameError when --finalize ran. Build the quantile-bin table the same
        # way the sibling Sanger-cohorts pipeline does.
        bin_ht = create_quantile_bin_ht(
            run_hash, n_bins=100, vqsr=False, overwrite=True)
        print("created bin ht")
        ht = generate_final_rf_ht(
            ht,
            ac0_filter_expr=freq.freq[0].AC == 0,
            ts_ac_filter_expr=freq.freq[1].AC == 1,
            # Monoallelic = every (or no) called allele is the alt allele.
            mono_allelic_fiter_expr=(
                (freq.freq[1].AF == 1) | (freq.freq[1].AF == 0)),
            snp_cutoff=args.snp_cutoff,
            indel_cutoff=args.indel_cutoff,
            determine_cutoff_from_bin=False,
            aggregated_bin_ht=bin_ht,
            bin_id=bin_ht.bin,
            inbreeding_coeff_cutoff=INBREEDING_COEFF_HARD_CUTOFF,
        )
        # This column is added by the RF module based on a 0.5 threshold which
        # doesn't correspond to what we use
        # ht = ht.drop(ht[PREDICTION_COL])
        ht.write(f'{tmp_dir}/rf_final.ht', overwrite=True)
def main(args):
    """Train, apply, and/or finalize the Sanger-cohorts random-forest variant-QC model.

    Flags in ``args`` select the stages: ``--train_rf`` trains and registers a
    new run, ``--apply_rf`` applies an existing run to the unfiltered cohort
    table, ``--finalize`` builds the final filtered table.
    """
    print("main")
    # Per-variant annotation table used as RF input for training.
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_table_for_RF_by_variant_type.ht'
    )
    if args.train_rf:
        # Draw an 8-char run id from a UUID4, retrying until it is unused.
        run_hash = str(uuid.uuid4())[:8]
        rf_runs = get_rf_runs(f'{tmp_dir}/ddd-elgh-ukbb/')
        while run_hash in rf_runs:
            run_hash = str(uuid.uuid4())[:8]
        ht_result, rf_model = train_rf(ht, args)
        print("Writing out ht_training data")
        # Checkpoint so the global reads below don't recompute training.
        ht_result = ht_result.checkpoint(
            get_rf(data="training", run_hash=run_hash).path, overwrite=True)
        rf_runs[run_hash] = get_run_data(
            vqsr_training=False,
            transmitted_singletons=True,
            test_intervals=args.test_intervals,
            adj=False,
            features_importance=hl.eval(ht_result.features_importance),
            test_results=hl.eval(ht_result.test_results),
        )
        with hl.hadoop_open(
                f'{plot_dir}/ddd-elgh-ukbb/variant_qc/rf_runs.json', "w") as f:
            json.dump(rf_runs, f)
        pretty_print_runs(rf_runs)
        logger.info("Saving RF model")
        save_model(rf_model, get_rf(data="model", run_hash=run_hash),
                   overwrite=True)
    else:
        # Reuse an existing run id supplied on the command line.
        run_hash = args.run_hash

    if args.apply_rf:
        logger.info(f"Applying RF model {run_hash}...")
        # BUG FIX: the previous code tried to hl.read_table() the serialized
        # model file (.../model.model) and then used that Table as an RF model
        # (rf_model.features, apply_rf_model(..., rf_model, ...)), which cannot
        # work -- a model file is not a Hail table. Load the model properly and
        # take the feature list from the training table's global.
        rf_model = load_model(get_rf(data="model", run_hash=run_hash))
        training_ht = get_rf(data="training", run_hash=run_hash).ht()
        features = hl.eval(training_ht.features)
        # Apply to the full unfiltered cohort table, not just the training set.
        ht = hl.read_table(
            f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_for_RF_unfiltered.ht'
        )
        ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL)
        logger.info("Finished applying RF model")
        ht = ht.annotate_globals(rf_hash=run_hash)
        ht = ht.checkpoint(
            get_rf("rf_result_sanger_cohorts", run_hash=run_hash).path,
            overwrite=True,
        )
        # Sanity summary: row counts per (tp, fp, train, label, prediction).
        ht_summary = ht.group_by(
            "tp", "fp", TRAIN_COL, LABEL_COL,
            PREDICTION_COL).aggregate(n=hl.agg.count())
        ht_summary.show(n=20)

    if args.finalize:
        run_hash = args.run_hash
        ht = hl.read_table(
            f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/rf_result_ac_added.ht'
        )
        freq_ht = hl.read_table(
            f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-20-XY_sampleQC_FILTERED_FREQ_adj.ht'
        )
        freq = freq_ht[ht.key]
        # Quantile bins over the RF score, used to pick SNP/indel cutoffs.
        bin_ht = create_quantile_bin_ht(
            run_hash, n_bins=100, vqsr=False, overwrite=True)
        print("created bin ht")
        ht = generate_final_rf_ht(
            ht,
            ac0_filter_expr=freq.freq[0].AC == 0,
            ts_ac_filter_expr=freq.freq[1].AC == 1,
            # Monoallelic = every (or no) called allele is the alt allele.
            mono_allelic_fiter_expr=(
                (freq.freq[1].AF == 1) | (freq.freq[1].AF == 0)),
            snp_cutoff=args.snp_cutoff,
            indel_cutoff=args.indel_cutoff,
            determine_cutoff_from_bin=False,
            aggregated_bin_ht=bin_ht,
            bin_id=bin_ht.bin,
            inbreeding_coeff_cutoff=INBREEDING_COEFF_HARD_CUTOFF,
        )
        # This column is added by the RF module based on a 0.5 threshold which
        # doesn't correspond to what we use
        # ht = ht.drop(ht[PREDICTION_COL])
        ht.write(f'{tmp_dir}/rf_final.ht', overwrite=True)
def main(args):
    """Run the selected exomes/genomes variant-QC random-forest pipeline stages.

    Stages (chosen via flags in ``args``): list previous RF runs, build RF
    annotations, train a model, apply a model (adding ranks when the required
    annotations are present), and finalize the RF table.
    """
    hl.init(log='/variantqc.log')
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.debug:
        logger.setLevel(logging.DEBUG)

    if args.list_rf_runs:
        logger.info(f"RF runs for {data_type}:")
        random_forest.pretty_print_runs(get_rf_runs(data_type))

    if args.annotate_for_rf:
        ht = create_rf_ht(
            data_type,
            n_variants_median=args.n_variants_median,
            impute_features_by_variant_type=not args.impute_features_no_variant_type,
            group='adj' if args.adj else 'raw')
        ht.write(rf_annotated_path(data_type, args.adj),
                 overwrite=args.overwrite)

    # Train a new model (returns its run hash) or reuse an existing run.
    run_hash = train_rf(data_type, args) if args.train_rf else args.run_hash

    if args.apply_rf:
        logger.info(f"Applying RF model {run_hash} to {data_type}.")
        rf_model = random_forest.load_model(
            rf_path(data_type, data='model', run_hash=run_hash))
        ht = hl.read_table(
            rf_path(data_type, data='training', run_hash=run_hash))
        ht = random_forest.apply_rf_model(
            ht,
            rf_model,
            get_features_list(True, not args.vqsr_features, args.vqsr_features),
            label=LABEL_COL)

        if 'singleton' in ht.row and 'was_split' in ht.row:
            # Needed for backwards compatibility for RF runs that happened
            # prior to updating annotations
            ht = add_rank(
                ht,
                score_expr=ht.rf_probability['FP'],
                subrank_expr={
                    'singleton_rank': ht.singleton,
                    'biallelic_rank': ~ht.was_split,
                    'biallelic_singleton_rank': ~ht.was_split & ht.singleton,
                })
        else:
            # BUG FIX: logger.warn is a deprecated (and in recent Python
            # versions removed) alias of logger.warning.
            logger.warning(
                "Ranking was not added because of missing annotations -- please run 'create_ranked_scores.py' to add rank."
            )
        ht.write(rf_path(data_type, 'rf_result', run_hash=run_hash),
                 overwrite=args.overwrite)

    if args.finalize:
        ht = prepare_final_ht(data_type, args.run_hash,
                              args.snp_bin_cutoff, args.indel_bin_cutoff)
        ht.write(annotations_ht_path(data_type, 'rf'), args.overwrite)
# Training the model: fit an RF on tp/fp labels, holding out the test
# intervals for evaluation.
rf_trained_ht, rf_model = train_rf_model(
    rf_ht,
    rf_features=features,
    tp_expr=rf_ht.tp,
    fp_expr=rf_ht.fp,
    test_expr=hl.literal(test_intervals).any(
        lambda interval: interval.contains(rf_ht.locus)),
)

# Joining results: bring train/label columns back onto the full table and
# apply the fitted model.
ht = rf_ht.join(rf_trained_ht, how="left")
rf_results = apply_rf_model(
    ht=ht,
    rf_model=rf_model,
    features=hl.eval(rf_trained_ht.features),
    label="rf_label",
    prediction_col_name="rf_prediction",
)
# Confusion-style summary: counts per (tp, fp, train, label, prediction) cell.
rf_summary_ht = rf_results.group_by(
    "tp", "fp", "rf_train", "rf_label",
    "rf_prediction").aggregate(n=hl.agg.count())

# Join with the sample_qc passing matrix table
variantqc_pass = mt.annotate_rows(**rf_results[mt.locus, mt.alleles])
# BUG FIX: filter_rows is a MatrixTable method, not a module-level hail
# function; the previous `hl.filter_rows(...)` call would raise
# AttributeError at runtime.
variantqc_pass = variantqc_pass.filter_rows(
    variantqc_pass.rf_prediction == "TP")
variantqc_pass.write("variantqc_pass.mt", overwrite=True)