Example #1
def get_passing_fracs(array_counts, array_labels_df, keep_n_columns):
    """
    Compute fraction of passing samples per cohort for all intervals
    """

    cohorts = array_counts.keys()
    cohort_totals = {c: sum(v.values()) for c, v in array_counts.items()}

    fracs_df = array_labels_df.iloc[:, :keep_n_columns].copy()

    for cohort in cohorts:
        npass = pd.Series(np.zeros(len(fracs_df)), index=fracs_df.index)
        for array in array_counts[cohort].keys():
            n_samps = array_counts[cohort][array]
            npass[array_labels_df[array]] += n_samps
        fracs_df[cohort] = npass / cohort_totals[cohort]

    return float_cleanup(fracs_df, 6, keep_n_columns)
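
A minimal, self-contained sketch of the passing-fraction computation in get_passing_fracs() on hypothetical toy inputs (the cohort and array names are made up, and the float_cleanup() call from the surrounding module is omitted):

import numpy as np
import pandas as pd

# Hypothetical per-cohort sample counts for each genotyping array
array_counts = {'cohortA': {'arrayX': 10, 'arrayY': 30},
                'cohortB': {'arrayX': 5}}

# Interval coordinates plus one boolean column per array (True = interval passes)
array_labels_df = pd.DataFrame({'chrom': ['chr1', 'chr1'],
                                'start': [0, 1000],
                                'end': [1000, 2000],
                                'arrayX': [True, False],
                                'arrayY': [True, True]})

keep_n_columns = 3
fracs_df = array_labels_df.iloc[:, :keep_n_columns].copy()
for cohort, counts in array_counts.items():
    npass = pd.Series(np.zeros(len(fracs_df)), index=fracs_df.index)
    for array, n_samps in counts.items():
        npass[array_labels_df[array]] += n_samps
    fracs_df[cohort] = npass / sum(counts.values())

print(fracs_df)
# cohortA: 40/40 = 1.0 and 30/40 = 0.75; cohortB: 5/5 = 1.0 and 0/5 = 0.0
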
Example #2
def get_cohort_means(array_counts, intervals, tnames, header, keep_n_columns):
    """
    Compute weighted mean of average number of probes per cohort for all intervals
    """

    cohorts = array_counts.keys()
    cohort_totals = {c: sum(v.values()) for c, v in array_counts.items()}
    cohort_df = pd.DataFrame.from_dict(array_counts).fillna(value=0)

    probes_df = pd.read_csv(intervals.fn, sep='\t',
                            header=None).iloc[:, keep_n_columns:]
    probes_df.columns = tnames
    probes_df = probes_df.loc[:, probes_df.columns.isin(cohort_df.index)]

    probe_sums = probes_df @ cohort_df
    probe_means = probe_sums / np.array(list(cohort_totals.values()))

    coords_df = pd.read_csv(intervals.fn, sep='\t',
                            header=None).iloc[:, :keep_n_columns]
    coords_df.columns = header
    df_out = pd.concat([coords_df, probe_means], axis=1)

    return float_cleanup(df_out, 1, keep_n_columns)
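
The weighted mean in get_cohort_means() reduces to a single matrix product; a toy sketch with made-up numbers (the coordinate handling and float_cleanup() above are omitted):

import pandas as pd

# Hypothetical average probe counts per interval (rows) and array (columns)
probes_df = pd.DataFrame({'arrayX': [100, 50], 'arrayY': [80, 40]})

# Samples per array (rows) and cohort (columns), as built from array_counts above
cohort_df = pd.DataFrame({'cohortA': {'arrayX': 10, 'arrayY': 30}}).fillna(value=0)
cohort_totals = cohort_df.sum()

# interval-by-array @ array-by-cohort gives probe sums weighted by sample counts;
# dividing by cohort sizes yields the weighted mean probe count per interval
probe_means = (probes_df @ cohort_df) / cohort_totals
print(probe_means)
# cohortA: (100*10 + 80*30) / 40 = 85.0 and (50*10 + 40*30) / 40 = 42.5
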
Example #3
def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat,
               bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
        if not raw_mu:
            preds = log10(preds)
        preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
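
The inference pattern used by mu_predict(), sketched with a stand-in model (a real run would load a trained model from model_pkl via torch.load(); the tiny Sequential network and random features below are purely illustrative):

import numpy as np
import pandas as pd
import torch

# Stand-in for a trained mutation rate model; any torch.nn.Module works here
model = torch.nn.Sequential(torch.nn.Linear(4, 1), torch.nn.Softplus())
model.eval()  # disable dropout/batch-norm training behavior

coords = pd.DataFrame({'chrom': ['chr1'] * 3,
                       'start': [0, 1000, 2000],
                       'end': [1000, 2000, 3000]})
feats = torch.rand(3, 4)  # placeholder feature tensor (one row per bin-pair)

# Predict without tracking gradients, then log10-scale (mirrors raw_mu=False)
with torch.no_grad():
    preds = model(feats).numpy().flatten()
preds_df = pd.DataFrame(np.log10(preds), columns=['mu'])

out_df = pd.concat([coords, preds_df], axis=1)
out_df.to_csv('pairs_with_mu.bed', sep='\t', index=False)
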
Example #4
def mu_train(training_data, model_class, model_out, stats_out, cal_out, hypers,
             maxfloat, quiet):
    """
    Master function to train a single mutation rate model
    """

    # Unpack certain hypers
    seed = hypers['seed']

    # Load and process all training BEDs
    if not quiet:
        status_msg = '[{0}] athena mu-train: Loading training datasets.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
    data_dict = mututils.load_all_beds(training_data)

    # Perform cross-validation, if optioned
    if hypers['cv_eval']:

        # Assign chromosomes to be held out for cross-validation
        cv_k = min([len(data_dict), hypers['max_cv_k']])
        random.seed(seed)
        cv_test_contigs = sorted(random.sample(sorted(data_dict.keys()), cv_k))
        if not quiet:
            status_msg = '[{0}] athena mu-train: Holding out data for the following ' + \
                         '{1} contigs as cross-validation test sets: {2}'
            print(
                status_msg.format(
                    datetime.now().strftime('%b %d %Y @ %H:%M:%S'), cv_k,
                    ', '.join(cv_test_contigs)))

        # Evaluate training & testing performance with cross-validation
        cv_res = {}
        for test_contig in cv_test_contigs:
            fit_model, training_info, train_stats, test_stats = \
                contig_cv(data_dict, test_contig, model_class, hypers)
            stop_reason = training_info['stop_reason']
            cv_res[test_contig] = {
                'model': fit_model,
                'train_stats': train_stats,
                'test_stats': test_stats,
                'epochs': training_info['epochs_trained'],
                'stop_reason': stop_reason
            }
            if not quiet:
                stop_explain = interpret_stop_reason(stop_reason)
                status_msg = '[{0}] athena mu-train: Cross-validation results for {1} ' + \
                             'after {2} epochs (stopped due to {3}):'
                print(
                    status_msg.format(
                        datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                        test_contig, training_info['epochs_trained'],
                        stop_explain))
                print('  TRAINING:')
                for key, value in train_stats.items():
                    print('    * {} = {}'.format(key, round(value, 6)))
                print('  TESTING:')
                for key, value in test_stats.items():
                    print('    * {} = {}'.format(key, round(value, 6)))

    # After cross-validation, train model on all data for best predictive power
    # Stop after median number of epochs from CV, above
    all_features, all_labels = \
        mututils.pool_tensors(data_dict, xchroms=[], seed=seed)
    model, optimizer, criterion = \
        models.initialize_torch_model(model_class, all_features, hypers)
    if hypers['cv_eval']:
        avg_epochs = int(median([vals['epochs'] for vals in cv_res.values()]))
    else:
        avg_epochs = hypers.get('max_epochs', 10e6)
    final_model, training_info = \
        models.train_torch_model(all_features, all_labels, model, optimizer,
                                 criterion, stop_early=False, epochs=avg_epochs)

    # Evaluate training performance of final model
    final_model.eval()
    final_stats = eval_model(final_model(all_features), all_labels)
    if not quiet:
        status_msg = '[{0}] athena mu-train: Training performance for full model ' + \
                     'after {1} epochs:'
        print(
            status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                              avg_epochs))
        for key, value in final_stats.items():
            print('    * {} = {}'.format(key, round(value, 6)))

    # Save trained model to model_out, if optioned
    # (note: model is intentionally exported in .eval() mode)
    if model_out is not None:
        torch.save(final_model, model_out)

    # Compile & save training stats, if optioned
    if stats_out is not None:
        stats_df = make_stats_df(cv_res, final_stats, avg_epochs)
        stats_df = dfutils.float_cleanup(stats_df,
                                         maxfloat=maxfloat,
                                         start_idx=6)
        stats_df.to_csv(stats_out, sep='\t', index=False)

    # Compile & save data for calibration analysis, if optioned
    if cal_out is not None:
        cal_df = make_calibration_df(final_model, all_features, all_labels)
        cal_df = dfutils.float_cleanup(cal_df, maxfloat=maxfloat, start_idx=0)
        cal_df.to_csv(cal_out, sep='\t', index=False)
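
A hypothetical invocation sketch of mu_train(). The hyperparameter keys are inferred from how hypers is consumed above; the paths, model class string, and values are placeholders rather than documented defaults:

hypers = {
    'seed': 2021,          # RNG seed for contig sampling and tensor pooling
    'cv_eval': True,       # run leave-one-contig-out cross-validation first
    'max_cv_k': 4,         # cap on the number of held-out contigs
    'max_epochs': 1000,    # fallback epoch limit when cv_eval is False
    # ...plus whatever keys models.initialize_torch_model() expects
}

mu_train(training_data='training_beds.tsv',   # placeholder path
         model_class='logit',                 # placeholder model class name
         model_out='mu_model.pt',
         stats_out='mu_training_stats.tsv',
         cal_out='mu_calibration.tsv',
         hypers=hypers,
         maxfloat=8,
         quiet=False)
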
Example #5
def decompose_bins(bins, bins_outfile=None, parameters_outfile=None, precomp_model=None, 
                   components=10, minvar=None, trans_dict=None, whiten=False, 
                   fill_missing=0, first_column=3, maxfloat=5, max_pcs=100, 
                   pca_stats=None, eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary (guard against trans_dict=None)
    if trans_dict is None:
        trans_dict = {}
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform, sqrt_transform, 
                                exp_transform, square_transform, boxcox_transform,
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_) \
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1), 
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components, 
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats, 
                          eigen_prefix, components)
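
The scaling / PCA / whitening core of decompose_bins(), sketched in isolation with scikit-learn on random data (the feature transformations, file I/O, and precomputed-model handling above are omitted):

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
df_annos = pd.DataFrame(rng.normal(size=(200, 6)),
                        columns=['anno_{}'.format(i) for i in range(6)])

# Standardize annotations before eigendecomposition
scaler = StandardScaler().fit(df_annos)
X = scaler.transform(df_annos)

# Fit PCA, then keep components whose cumulative explained variance is < minvar
minvar = 0.9
pca = PCA(n_components=min(X.shape[1], 100)).fit(X)
components = len([v for v in np.cumsum(pca.explained_variance_ratio_) if v < minvar])

pcs = pca.transform(X)[:, :components]
eigen_names = ['eigenfeature_{}'.format(i + 1) for i in range(components)]
df_pcs = pd.DataFrame(pcs, columns=eigen_names)

# Optional "whitening": re-standardize the eigenfeatures themselves
whitener = StandardScaler().fit(df_pcs)
df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)
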
Example #6
def annotate_bins(bins, chroms, ranges, tracks, ucsc_tracks, ucsc_ref, actions,
                  fasta, snv_mus, maxfloat, ucsc_chromsplit, quiet):
    """
    Master bin annotation function
    """

    # Parse & sanity check all track inputs
    n_all_tracks = len(tracks) + len(ucsc_tracks)
    if len(actions) != n_all_tracks:
        from sys import exit
        err = 'INPUT ERROR: Number of actions ({0}) does not match number ' + \
              'of tracks ({1}).'
        exit(err.format(len(actions), n_all_tracks))

    if len(ucsc_tracks) > 0:
        if ucsc_ref is None:
            from sys import exit
            exit('INPUT ERROR: --ucsc-ref must be specified if any UCSC ' +
                 'tracks are requested.')

    # Load bins. Note: must read contents from file due to odd utf-8 decoding
    # behavior for bgzipped BED files
    ftype = determine_filetype(bins)
    if ftype is None:
        ftype = 'unknown'
    if 'compressed' in ftype:
        bins = ''.join(s.decode('utf-8') for s in GzipFile(bins).readlines())
    else:
        bins = ''.join(open(bins, 'r').readlines())
    firstline = bins.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    bins = pbt.BedTool(bins, from_string=True)

    # Subset bins to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        bins = bins.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        bins = bins.intersect(ranges, wa=True).saveas()

    # Note: when adding many annotations, it is more efficient (and more stable)
    # to hold a pd.DataFrame of bins with annotations in memory and convert the
    # entire pd.DataFrame back to a pbt.BedTool after all annotations have been
    # added as columns. This appears to be due to peculiarities in pyBedTools'
    # handling of wide BED files
    bins_bt = bins.cut(range(3)).saveas()
    bins_df = bins.to_dataframe(names=colnames, comment='#')

    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_local_track(bins_bt, track, action, quiet)
            track_counter += 1

    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(
                status_msg.format(
                    datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        db = ucsc.ucsc_connect(ucsc_ref)
        query_regions = ucsc.collapse_query_regions(bins).saveas()

        # Iterate over tracks
        for track in ucsc_tracks:
            # Ping db to confirm the connection is still active (UCSC may time
            # out over sequential long queries); if it has timed out, reopen a
            # new connection
            try:
                db.ping(True)
            except Exception:
                try:
                    db.close()
                except Exception:
                    pass
                db = ucsc.ucsc_connect(ucsc_ref)

            # Submit UCSC query
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_ucsc_track(bins_bt, db, track, action, query_regions,
                               ucsc_ref, ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()

    # Annotate bins with nucleotide content, if optioned
    if fasta is not None:

        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Adding nucleotide ' + \
                         'content from reference fasta "{1}".'
            print(
                status_msg.format(
                    datetime.now().strftime('%b %d %Y @ %H:%M:%S'), fasta))

        bins_df['pct_gc'] = add_nuc_content(bins, fasta, maxfloat)

        # Annotate bins with SNV mutation rates, if optioned
        if snv_mus is not None:
            if quiet is False:
                status_msg = '[{0}] athena annotate-bins: Adding SNV mutation ' + \
                             'rates from reference fasta "{1}".'
                print(
                    status_msg.format(
                        datetime.now().strftime('%b %d %Y @ %H:%M:%S'), fasta))

            bins_df['snv_mu'] = add_snv_mu(bins, fasta, snv_mus, maxfloat)

    # Clean up long floats
    bins_df = float_cleanup(bins_df, maxfloat, start_idx=n_cols_old)

    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(bins_df)
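
A hypothetical call to annotate_bins(), mirroring the athena annotate-bins CLI. Track paths, actions, and the reference build are placeholders; whichever action strings are supplied must be ones add_local_track() / add_ucsc_track() accept, listed in the same order as the tracks (local tracks first):

annotated_bt = annotate_bins(
    bins='bins.bed.gz',
    chroms='chr21,chr22',              # comma-delimited subset, or None for all
    ranges=None,
    tracks=['my_local_track.bed.gz'],  # placeholder local track
    ucsc_tracks=['rmsk'],              # placeholder UCSC table name
    ucsc_ref='hg38',                   # required whenever ucsc_tracks is non-empty
    actions=['count', 'coverage'],     # placeholders; one action per track
    fasta='GRCh38.fa',                 # adds pct_gc when provided
    snv_mus=None,
    maxfloat=8,
    ucsc_chromsplit=True,
    quiet=False)
annotated_bt.saveas('bins.annotated.bed')
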
Example #7
def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs,
             sv_ci, maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        bins_header = gzip.open(
            bins_in, 'r').readline().decode('utf-8').rstrip().split('\t')
    else:
        bins_header = open(bins_in, 'r').readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format
    # If breakpoints == False, will return simple four-column BED with variant ID in fourth column
    # If breakpoints == True, will return two rows per record where each record
    # is one breakpoint with columns 4 = variant ID, 5 = POS or END, 6 = original
    # POS or END coordinate, 7 = std dev of left side of breakpoint, 8 = std dev of
    # right side of breakpoint, and 9 = number of std deviations extended left & right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf,
                     breakpoints=breakpoints,
                     add_ci_to_bkpts=probs,
                     ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt,
                                   binsize=binsize,
                                   add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series(
                [min([1, int(x[-1])]) for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series(
                [int(x[-1]) for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(
        pd.concat([bins_df, sv_column, feats_df], axis=1), maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
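
A hypothetical invocation of count_sv() (file names are placeholders; the SV input may be either a VCF or a BED, as handled above):

count_sv(bins_in='bins.annotated.bed.gz',
         sv_in='candidate_svs.vcf.gz',   # placeholder VCF of structural variants
         outfile='bins.sv_counts.bed.gz',
         paired=False,                   # True when bins_in contains 2D bin-pairs
         binsize=None,                   # inferred from the bins when omitted
         breakpoints=True,               # count breakpoints rather than any overlap
         probs=False,                    # report probabilities instead of counts
         sv_ci=0.95,                     # CI used for breakpoint uncertainty
         maxfloat=8,
         bgzip=True)
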
Example #8
def annotate_pairs(pairs, chroms, ranges, tracks, ucsc_tracks, actions, track_names, 
                   ucsc_ref, fasta, binsize, homology_cutoffs, ucsc_chromsplit, 
                   maxfloat, quiet):
    """
    Master pair annotation function
    """

    # Infer binsize and filetype
    if binsize is None:
        binsize = calc_binsize(pairs)
    ftype = determine_filetype(pairs)


    # Load pairs. Note: must read contents from file due to odd utf-8 decoding 
    # behavior for bgzipped BED files with pybedtools
    if 'compressed' in ftype:
        pairs = ''.join(s.decode('utf-8') for s in GzipFile(pairs).readlines())
    else:
        pairs = ''.join(open(pairs, 'r').readlines())
    firstline = pairs.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    pairs = pbt.BedTool(pairs, from_string=True)


    # Subset pairs to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        pairs = pairs.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        pairs = pairs.intersect(ranges, wa=True).saveas()


    # Note: when adding many annotations, it is more efficient (and more stable)
    # to hold a pd.DataFrame of pairs with annotations in memory and convert the
    # entire pd.DataFrame back to a pbt.BedTool after all annotations have been
    # added as columns. This appears to be due to peculiarities in pyBedTools'
    # handling of wide BED files
    pairs_bt = pairs.cut(range(3)).saveas()
    pairs_df = pairs.to_dataframe(names=colnames, comment='#')
    pairs_bedpe_bt = _pairs_bed_to_bedpe(pairs_bt, binsize)


    # Make master pbt.BedTool of all bins from all pairs
    split_pair_bts = [_split_pairs(p, binsize) for p in pairs_bt]
    allbins_bt = split_pair_bts[0].cat(*split_pair_bts[1:], postmerge=False).sort().merge(d=-1)
    query_regions = ucsc.collapse_query_regions(allbins_bt).saveas()


    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_local_track(pairs_bedpe_bt, track, action, query_regions, 
                                         binsize, quiet)
            track_counter += 1


    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(status_msg.format(
                datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        db = ucsc.ucsc_connect(ucsc_ref)

        # Iterate over tracks
        for track in ucsc_tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_ucsc_track(pairs_bedpe_bt, db, track, action, query_regions, 
                                        binsize, ucsc_ref, ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()


    # Annotate pairs based on nucleotide content, if optioned
    if fasta is not None:

        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Adding sequence homology ' + \
                         'features from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'), 
                                    fasta))

        for identity in homology_cutoffs:
            for rev in True, False:
                pairs_df['newtrack_{}'.format(track_counter)] = \
                    add_homology(pairs_bt, fasta, binsize, identity, rev)
                track_counter += 1
    

    # Clean up long floats
    pairs_df = float_cleanup(pairs_df, maxfloat, start_idx=3)


    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(pairs_df)
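
A hypothetical call to annotate_pairs(), mirroring the athena annotate-pairs CLI. Paths, tracks, actions, and thresholds below are placeholders; homology_cutoffs supplies the percent-identity thresholds used when adding the forward/reverse homology features:

annotated_pairs_bt = annotate_pairs(
    pairs='pairs.bed.gz',
    chroms=None,
    ranges=None,
    tracks=['my_local_track.bed.gz'],   # placeholder local track
    ucsc_tracks=[],
    actions=['count-pairwise'],         # placeholder; one action per track
    track_names=['my_track'],
    ucsc_ref=None,                      # only needed when ucsc_tracks is non-empty
    fasta='GRCh38.fa',                  # enables the sequence homology features
    binsize=None,                       # inferred from the pairs when omitted
    homology_cutoffs=[1.0, 0.95],
    ucsc_chromsplit=False,
    maxfloat=8,
    quiet=False)
annotated_pairs_bt.saveas('pairs.annotated.bed')
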