def get_passing_fracs(array_counts, array_labels_df, keep_n_columns):
    """
    Compute fraction of passing samples per cohort for all intervals
    """

    cohorts = array_counts.keys()
    cohort_totals = {c: sum(v.values()) for c, v in array_counts.items()}

    # Copy coordinate columns so we don't modify a view of array_labels_df
    fracs_df = array_labels_df.iloc[:, :keep_n_columns].copy()

    for cohort in cohorts:
        npass = pd.Series(np.zeros(len(fracs_df)), index=fracs_df.index)
        for array in array_counts[cohort].keys():
            n_samps = array_counts[cohort][array]
            npass[array_labels_df[array]] += n_samps
        fracs_df[cohort] = npass / cohort_totals[cohort]

    return float_cleanup(fracs_df, 6, keep_n_columns)
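# Illustrative call for get_passing_fracs() (hypothetical inputs, not part of the
# module; array_counts maps cohort -> {array name: sample count}, and array_labels_df
# is assumed to carry one boolean pass/fail column per array after the first
# keep_n_columns coordinate columns):
#
#   array_counts = {'cohortA': {'array1': 120, 'array2': 80}}
#   fracs_df = get_passing_fracs(array_counts, array_labels_df, keep_n_columns=4)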
def get_cohort_means(array_counts, intervals, tnames, header, keep_n_columns):
    """
    Compute weighted mean of average number of probes per cohort for all intervals
    """

    cohort_totals = {c: sum(v.values()) for c, v in array_counts.items()}
    cohort_df = pd.DataFrame.from_dict(array_counts).fillna(value=0)

    probes_df = pd.read_csv(intervals.fn, sep='\t', header=None).iloc[:, keep_n_columns:]
    probes_df.columns = tnames
    probes_df = probes_df.loc[:, probes_df.columns.isin(cohort_df.index)]

    probe_sums = probes_df @ cohort_df
    probe_means = probe_sums / np.array(list(cohort_totals.values()))

    coords_df = pd.read_csv(intervals.fn, sep='\t', header=None).iloc[:, :keep_n_columns]
    coords_df.columns = header

    df_out = pd.concat([coords_df, probe_means], axis=1)

    return float_cleanup(df_out, 1, keep_n_columns)
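# Illustrative call for get_cohort_means() (hypothetical inputs, not part of the
# module; `intervals` is assumed to be a pbt.BedTool whose .fn points at a BED file
# with per-array probe counts in the columns after the first keep_n_columns
# coordinate columns, named by tnames):
#
#   means_df = get_cohort_means(array_counts, intervals, tnames=array_names,
#                               header=['#chr', 'start', 'end'], keep_n_columns=3)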
def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat, bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
    if not raw_mu:
        preds = log10(preds)
    preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
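# Illustrative call for mu_predict() (hypothetical file paths, not part of the
# module; the model file is assumed to be a torch-saved model such as one produced
# by mu_train() below):
#
#   mu_predict('pairs.features.bed.gz', 'mu_model.pt', 'pairs.mu.bed.gz',
#              raw_mu=False, keep_features=False, maxfloat=8, bgzip=True)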
def mu_train(training_data, model_class, model_out, stats_out, cal_out, hypers,
             maxfloat, quiet):
    """
    Master function to train a single mutation rate model
    """

    # Unpack certain hypers
    seed = hypers['seed']

    # Load and process all training BEDs
    if not quiet:
        status_msg = '[{0}] athena mu-train: Loading training datasets.'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
    data_dict = mututils.load_all_beds(training_data)

    # Perform cross-validation, if optioned
    if hypers['cv_eval']:

        # Assign chromosomes to be held out for cross-validation
        cv_k = min([len(data_dict), hypers['max_cv_k']])
        random.seed(seed)
        # Note: random.sample() requires a sequence (not dict_keys) as of Python 3.11
        cv_test_contigs = sorted(random.sample(list(data_dict.keys()), cv_k))
        if not quiet:
            status_msg = '[{0}] athena mu-train: Holding out data for the following ' + \
                         '{1} contigs as cross-validation test sets: {2}'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    cv_k, ', '.join(cv_test_contigs)))

        # Evaluate training & testing performance with cross-validation
        cv_res = {}
        for test_contig in cv_test_contigs:
            fit_model, training_info, train_stats, test_stats = \
                contig_cv(data_dict, test_contig, model_class, hypers)
            stop_reason = training_info['stop_reason']
            cv_res[test_contig] = {'model': fit_model,
                                   'train_stats': train_stats,
                                   'test_stats': test_stats,
                                   'epochs': training_info['epochs_trained'],
                                   'stop_reason': stop_reason}
            if not quiet:
                stop_explain = interpret_stop_reason(stop_reason)
                status_msg = '[{0}] athena mu-train: Cross-validation results for {1} ' + \
                             'after {2} epochs (stopped due to {3}):'
                print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                        test_contig, training_info['epochs_trained'],
                                        stop_explain))
                print('  TRAINING:')
                for key, value in train_stats.items():
                    print('    * {} = {}'.format(key, round(value, 6)))
                print('  TESTING:')
                for key, value in test_stats.items():
                    print('    * {} = {}'.format(key, round(value, 6)))

    # After cross-validation, train model on all data for best predictive power
    # Stop after median number of epochs from CV, above
    all_features, all_labels = \
        mututils.pool_tensors(data_dict, xchroms=[], seed=seed)
    model, optimizer, criterion = \
        models.initialize_torch_model(model_class, all_features, hypers)
    if hypers['cv_eval']:
        avg_epochs = int(median([vals['epochs'] for vals in cv_res.values()]))
    else:
        avg_epochs = hypers.get('max_epochs', 10e6)
    final_model, training_info = \
        models.train_torch_model(all_features, all_labels, model, optimizer,
                                 criterion, stop_early=False, epochs=avg_epochs)

    # Evaluate training performance of final model
    final_model.eval()
    final_stats = eval_model(final_model(all_features), all_labels)
    if not quiet:
        status_msg = '[{0}] athena mu-train: Training performance for full model ' + \
                     'after {1} epochs:'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                avg_epochs))
        for key, value in final_stats.items():
            print('    * {} = {}'.format(key, round(value, 6)))

    # Save trained model to model_out, if optioned
    # (note: model is intentionally exported in .eval() mode)
    if model_out is not None:
        torch.save(final_model, model_out)

    # Compile & save training stats, if optioned
    if stats_out is not None:
        stats_df = make_stats_df(cv_res, final_stats, avg_epochs)
        stats_df = dfutils.float_cleanup(stats_df, maxfloat=maxfloat, start_idx=6)
        stats_df.to_csv(stats_out, sep='\t', index=False)

    # Compile & save data for calibration analysis, if optioned
    if cal_out is not None:
        cal_df = make_calibration_df(model, all_features, all_labels)
        cal_df = dfutils.float_cleanup(cal_df, maxfloat=maxfloat, start_idx=0)
        cal_df.to_csv(cal_out, sep='\t', index=False)
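# Illustrative call for mu_train() (hypothetical paths, model class label, and
# hyperparameter values, not part of the module; the hypers dict is assumed to carry
# at least the keys read above):
#
#   hypers = {'seed': 2021, 'cv_eval': True, 'max_cv_k': 5, 'max_epochs': 1000}
#   mu_train('training_beds.tsv', 'logit', 'mu_model.pt', 'mu_train_stats.tsv',
#            'mu_calibration.tsv', hypers, maxfloat=8, quiet=False)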
def decompose_bins(bins, bins_outfile=None, parameters_outfile=None,
                   precomp_model=None, components=10, minvar=None,
                   trans_dict=None, whiten=False, fill_missing=0,
                   first_column=3, maxfloat=5, max_pcs=100, pca_stats=None,
                   eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary
    # (guard against the default of trans_dict=None when no model is preloaded)
    if trans_dict is None:
        trans_dict = {}
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform, sqrt_transform,
                                exp_transform, square_transform, boxcox_transform,
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_)
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1),
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components,
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats,
                          eigen_prefix, components)
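# Illustrative call for decompose_bins() (hypothetical paths and feature names, not
# part of the module; retains enough PCs to explain 95% of variance and whitens the
# resulting eigenfeatures):
#
#   decompose_bins('bins.annotated.bed.gz', bins_outfile='bins.eigen.bed',
#                  parameters_outfile='eigen_params.pkl', minvar=0.95,
#                  trans_dict={'log': ['snv_mu']}, whiten=True, bgzip=True)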
def annotate_bins(bins, chroms, ranges, tracks, ucsc_tracks, ucsc_ref, actions,
                  fasta, snv_mus, maxfloat, ucsc_chromsplit, quiet):
    """
    Master bin annotation function
    """

    # Parse & sanity check all track inputs
    n_all_tracks = len(tracks) + len(ucsc_tracks)
    if len(actions) != n_all_tracks:
        from sys import exit
        err = 'INPUT ERROR: Number of actions ({0}) does not match number ' + \
              'of tracks ({1}).'
        exit(err.format(len(actions), n_all_tracks))
    if len(ucsc_tracks) > 0:
        if ucsc_ref is None:
            from sys import exit
            exit('INPUT ERROR: --ucsc-ref must be specified if any UCSC ' +
                 'tracks are requested.')

    # Load bins. Note: must read contents from file due to odd utf-8 decoding
    # behavior for bgzipped BED files
    ftype = determine_filetype(bins)
    if ftype is None:
        ftype = 'unknown'
    if 'compressed' in ftype:
        bins = ''.join(s.decode('utf-8') for s in GzipFile(bins).readlines())
    else:
        bins = ''.join(open(bins, 'r').readlines())
    firstline = bins.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    bins = pbt.BedTool(bins, from_string=True)

    # Subset bins to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        bins = bins.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        bins = bins.intersect(ranges, wa=True).saveas()

    # Note: more efficient (and stable) when adding many annotations to hold
    # pd.DataFrame of bins with annotations in memory and convert entire
    # pd.DataFrame back to pbt.BedTool after adding all annotations as columns
    # This appears to be due to peculiarities in pyBedTools handling of wide BED files
    bins_bt = bins.cut(range(3)).saveas()
    bins_df = bins.to_dataframe(names=colnames, comment='#')

    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_local_track(bins_bt, track, action, quiet)
            track_counter += 1

    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        db = ucsc.ucsc_connect(ucsc_ref)
        query_regions = ucsc.collapse_query_regions(bins).saveas()

        # Iterate over tracks
        for track in ucsc_tracks:
            # Check whether the db connection is still active (UCSC may time out
            # over sequential long queries); if it has timed out, open a new connection
            try:
                db.ping(True)
            except:
                try:
                    db.close()
                except:
                    pass
                db = ucsc.ucsc_connect(ucsc_ref)

            # Submit UCSC query
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_ucsc_track(bins_bt, db, track, action, query_regions,
                               ucsc_ref, ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()

    # Annotate bins with nucleotide content, if optioned
    if fasta is not None:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Adding nucleotide ' + \
                         'content from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        bins_df['pct_gc'] = add_nuc_content(bins, fasta, maxfloat)

    # Annotate bins with SNV mutation rates, if optioned
    if snv_mus is not None:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Adding SNV mutation ' + \
                         'rates from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        bins_df['snv_mu'] = add_snv_mu(bins, fasta, snv_mus, maxfloat)

    # Clean up long floats
    bins_df = float_cleanup(bins_df, maxfloat, start_idx=n_cols_old)

    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(bins_df)
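# Illustrative call for annotate_bins() (hypothetical tracks, actions, and paths,
# not part of the module; one action per track, with local tracks counted before
# UCSC tracks):
#
#   annotated_bt = annotate_bins('bins.bed.gz', chroms='chr21,chr22', ranges=None,
#                                tracks=['repeats.bed.gz'], ucsc_tracks=['rmsk'],
#                                ucsc_ref='hg38', actions=['count', 'coverage'],
#                                fasta='hg38.fa', snv_mus=None, maxfloat=8,
#                                ucsc_chromsplit=True, quiet=False)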
def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs,
             sv_ci, maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        bins_header = gzip.open(bins_in, 'r').readline().decode('utf-8').rstrip().split('\t')
    else:
        bins_header = open(bins_in, 'r').readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format
    # If breakpoints == False, will return simple four-column BED with variant ID
    # in fourth column
    # If breakpoints == True, will return two rows per record where each record
    # is one breakpoint with columns 4 = variant ID, 5 = POS or END, 6 = original
    # POS or END coordinate, 7 = std dev of left side of breakpoint, 8 = std dev of
    # right side of breakpoint, and 9 = number of std deviations extended left &
    # right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf, breakpoints=breakpoints, add_ci_to_bkpts=probs, ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt, binsize=binsize, add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both
    # 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series([min([1, int(x[-1])])
                                   for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series([int(x[-1]) for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(pd.concat([bins_df, sv_column, feats_df], axis=1),
                                   maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
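# Illustrative call for count_sv() (hypothetical paths, not part of the module;
# counts SV breakpoints per 2D bin-pair using 95% confidence intervals around each
# breakpoint):
#
#   count_sv('pairs.bed.gz', 'DEL.vcf.gz', 'pairs.DEL_counts.bed.gz', paired=True,
#            binsize=None, breakpoints=True, probs=False, sv_ci=0.95, maxfloat=8,
#            bgzip=True)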
def annotate_pairs(pairs, chroms, ranges, tracks, ucsc_tracks, actions,
                   track_names, ucsc_ref, fasta, binsize, homology_cutoffs,
                   ucsc_chromsplit, maxfloat, quiet):
    """
    Master pair annotation function
    """

    # Infer binsize and filetype
    if binsize is None:
        binsize = calc_binsize(pairs)
    ftype = determine_filetype(pairs)

    # Load pairs. Note: must read contents from file due to odd utf-8 decoding
    # behavior for bgzipped BED files with pybedtools
    if 'compressed' in ftype:
        pairs = ''.join(s.decode('utf-8') for s in GzipFile(pairs).readlines())
    else:
        pairs = ''.join(open(pairs, 'r').readlines())
    firstline = pairs.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    pairs = pbt.BedTool(pairs, from_string=True)

    # Subset pairs to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        pairs = pairs.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        pairs = pairs.intersect(ranges, wa=True).saveas()

    # Note: more efficient (and stable) when adding many annotations to hold
    # pd.DataFrame of pairs with annotations in memory and convert entire
    # pd.DataFrame back to pbt.BedTool after adding all annotations as columns
    # This appears to be due to peculiarities in pyBedTools handling of wide BED files
    pairs_bt = pairs.cut(range(3)).saveas()
    pairs_df = pairs.to_dataframe(names=colnames, comment='#')
    pairs_bedpe_bt = _pairs_bed_to_bedpe(pairs_bt, binsize)

    # Make master pbt.BedTool of all bins from all pairs
    split_pair_bts = [_split_pairs(p, binsize) for p in pairs_bt]
    allbins_bt = split_pair_bts[0].cat(*split_pair_bts[1:],
                                       postmerge=False).sort().merge(d=-1)
    query_regions = ucsc.collapse_query_regions(allbins_bt).saveas()

    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_local_track(pairs_bedpe_bt, track, action,
                                         query_regions, binsize, quiet)
            track_counter += 1

    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        db = ucsc.ucsc_connect(ucsc_ref)

        # Iterate over tracks
        for track in ucsc_tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_ucsc_track(pairs_bedpe_bt, db, track, action,
                                        query_regions, binsize, ucsc_ref,
                                        ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()

    # Annotate pairs based on nucleotide content, if optioned
    if fasta is not None:
        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Adding sequence homology ' + \
                         'features from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                                    fasta))
        for identity in homology_cutoffs:
            for rev in True, False:
                pairs_df['newtrack_{}'.format(track_counter)] = \
                    add_homology(pairs_bt, fasta, binsize, identity, rev)
                track_counter += 1

    # Clean up long floats
    pairs_df = float_cleanup(pairs_df, maxfloat, start_idx=3)

    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(pairs_df)
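# Illustrative call for annotate_pairs() (hypothetical tracks, names, and paths, not
# part of the module; homology_cutoffs gives the percent-identity thresholds used
# for the sequence homology features):
#
#   pairs_bt = annotate_pairs('pairs.bed.gz', chroms=None, ranges=None,
#                             tracks=['segdups.bed.gz'], ucsc_tracks=[],
#                             actions=['coverage'], track_names=['segdup_cov'],
#                             ucsc_ref=None, fasta='hg38.fa', binsize=None,
#                             homology_cutoffs=[1.0, 0.95], ucsc_chromsplit=False,
#                             maxfloat=8, quiet=False)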