Example #1
def add_snv_mu(bins, fasta, snv_mus, maxfloat):
    """
    Extract SNV mutation rate (as list) for all bins from a reference fasta
    """

    # Extend all bins by 1bp at start and end (need trinucleotide context for mu)
    bins.saveas()

    def _increment_bin(feat, dist=1, start=True, end=True):
        if start:
            feat.start = max([0, feat.start - dist])
        if end:
            feat.end = feat.end + dist
        return feat

    buffbins = pbt.BedTool(bins).each(_increment_bin).saveas()

    snv_mu_dict = load_snv_mus(snv_mus)

    if 'compressed' in determine_filetype(fasta):
        fasta = GzipFile(fasta)

    values = []
    for seq in get_seqs_from_bt(buffbins, fasta):
        mu = snv_mu_from_seq(seq.rstrip(), snv_mu_dict)
        values.append(mu)

    return values
Example #2
def _parse_remote_inputs_tsv(urls_tsv, local_suffix='local_slice'):
    """
    Parse a tsv of remote files to localize
    """

    urls_dict = {}

    with open(urls_tsv) as f_in:
        for k, line in enumerate(f_in):

            url, mdata = line.rstrip().split('\t', 1)
            basename_orig = os.path.basename(url)
            ftype, fext = determine_filetype(basename_orig,
                                             return_extension=True)
            sliced_ext = '{}.{}'.format(local_suffix, fext)
            basename_sliced = basename_orig[:-len(fext)] + sliced_ext
            index_path = _find_remote_index(url, ftype)

            # Confirm the file format can be indexed with tabix or samtools
            if ftype not in 'compressed-bed compressed-vcf bam cram'.split():
                err = 'INPUT ERROR: format not recognized as tabix or samtools ' + \
                      'compliant for input file {}'
                exit(err.format(url))

            urls_dict[k] = {
                'url': url,
                'ftype': ftype,
                'metadata': mdata,
                'basename_orig': basename_orig,
                'basename_sliced': basename_sliced,
                'index_path': index_path
            }

    return urls_dict
Example #3
def add_bedtool_track(bins, track, action):
    """
    Extract feature values (as list) for all bins vs. a single BedTool (or BAM/CRAM)
    """

    if isinstance(track, str):
        ftype = determine_filetype(track)
    else:
        ftype = None

    if action == 'count':
        if ftype in 'bam cram'.split():
            values = [int(f[-4]) for f in bins.coverage(track, sorted=True)]
        else:
            values = [
                int(f[-1]) for f in bins.intersect(track, c=True, wa=True)
            ]

    elif action == 'count-unique':
        gfile = bedtool_to_genome_file(bins)
        bedtool = pbt.BedTool(track).sort(g=gfile).merge()
        values = [int(f[-1]) for f in bins.intersect(bedtool, c=True, wa=True)]

    elif action == 'coverage':
        values = [float(f[-1]) for f in bins.coverage(track)]

    elif action == 'any-overlap':
        values = [min([1, int(f[-1])]) for f \
                  in bins.intersect(track, c=True, wa=True)]

    else:
        from sys import exit
        exit('INPUT ERROR: --action {0} not recognized.'.format(action))

    return values
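
A minimal usage sketch of the counting pattern used above (a sketch only; pybedtools is assumed to be installed, and 'bins.bed' and 'track.bed' are placeholder paths to BED files):

import pybedtools as pbt

bins = pbt.BedTool('bins.bed')    # hypothetical bins
track = pbt.BedTool('track.bed')  # hypothetical annotation track

# c=True appends the number of overlapping track features to each bin;
# wa=True keeps the original bin coordinates in the output
counts = [int(f[-1]) for f in bins.intersect(track, c=True, wa=True)]

# Binary any-overlap flag, as in the 'any-overlap' action
any_overlap = [min(1, c) for c in counts]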
Example #4
def add_bedtool_track(bins, track, action, header_compliance='loose'):
    """
    Extract feature values (as list) for all bins vs. a single BedTool (or BAM/CRAM)
    """

    if isinstance(track, str):
        ftype = determine_filetype(track)
    else:
        ftype = None

    # Check for header inconsistencies for indexed tracks
    if ftype in 'bam cram compressed-vcf'.split():
        query_bins = copy.deepcopy(bins)
        query_bins = check_header_compliance(track, query_bins,
                                             header_compliance)
    else:
        query_bins = bins

    if action == 'count':
        if ftype in 'bam cram'.split():
            values = [
                int(f[-4]) for f in query_bins.coverage(track, sorted=True)
            ]
        else:
            values = [
                int(f[-1])
                for f in query_bins.intersect(track, c=True, wa=True)
            ]

    elif action == 'count-unique':
        gfile = bedtool_to_genome_file(query_bins)
        chroms = set([f.split('\t')[0] for f in open(gfile).readlines()])
        bedtool = pbt.BedTool(track).filter(lambda f: f.chrom in chroms).sort(
            g=gfile).merge()
        values = [
            int(f[-1]) for f in query_bins.intersect(bedtool, c=True, wa=True)
        ]

    elif action == 'coverage':
        values = [float(f[-1]) for f in query_bins.coverage(track)]

    elif action == 'any-overlap':
        values = [min([1, int(f[-1])]) for f \
                  in query_bins.intersect(track, c=True, wa=True)]

    else:
        from sys import exit
        exit('INPUT ERROR: --action {0} not recognized.'.format(action))

    if ftype in 'bam cram compressed-vcf'.split():
        header_compliance_cleanup(track)

    return values
Example #5
def add_nuc_content(bins, fasta, maxfloat):
    """
    Extract GC content (as list) for all bins from a reference fasta
    """

    if 'compressed' in determine_filetype(fasta):
        fasta = GzipFile(fasta)

    pct_gc = [
        float(f[4]) for f in bins.cut(range(3)).nucleotide_content(fi=fasta)
    ]

    return pct_gc
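
A hypothetical call to the function above, assuming add_nuc_content and its helpers are in scope, bins is a 3-column BED of intervals, and 'ref.fa' is a placeholder uncompressed reference FASTA:

import pybedtools as pbt

bins = pbt.BedTool('bins.bed')   # hypothetical 3-column BED of bins
pct_gc = add_nuc_content(bins, 'ref.fa', maxfloat=5)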
Example #6
def load_intervals(bed_in, min_size, cols_to_keep):
    """
    Load and expand intervals
    """

    if 'compressed' in determine_filetype(bed_in):
        fin = gzip.open(bed_in, 'rt')
    else:
        fin = open(bed_in)
    header = fin.readline().rstrip().split('\t')[:cols_to_keep]
    fin.close()

    bt = pbt.BedTool(bed_in).each(expand_interval, min_size=min_size).\
                             cut(range(cols_to_keep)).\
                             saveas('resized_intervals.bed',
                                    trackline='\t'.join(header))

    return bt, header
Example #7
def get_seqs_from_bt(bt, fasta, return_headers=False):
    """
    Extract a list of nucleotide sequences corresponding to all records in a pbt.BedTool
    """

    if 'compressed' in determine_filetype(fasta):
        fasta = GzipFile(fasta)
    fseqs = bt.sequence(fasta).seqfn

    seqs = []
    headers = []
    with open(fseqs) as fin:
        for seqheader, seq in itertools.zip_longest(*[fin] * 2):
            headers.append(seqheader.rstrip().replace('>', ''))
            seqs.append(seq.rstrip().upper())

    if return_headers:
        return seqs, headers
    else:
        return seqs
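
The two-line-at-a-time parsing above is a generic idiom for the FASTA written by BedTool.sequence(). A standalone sketch of just that idiom, assuming 'regions.fa' is a placeholder FASTA with one header line and one sequence line per record:

import itertools

headers, seqs = [], []
with open('regions.fa') as fin:  # hypothetical path
    # zip_longest over two references to the same iterator yields
    # (header, sequence) line pairs for a two-line-per-record FASTA
    for seqheader, seq in itertools.zip_longest(*[fin] * 2):
        headers.append(seqheader.rstrip().lstrip('>'))
        seqs.append(seq.rstrip().upper())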
Example #8
def add_pairwise_local_track(pairs_bedpe_bt, track, action, query_regions, binsize, quiet):
    """
    Wrapper function to extract values for a single local track
    """

    ftype = determine_filetype(track)

    if quiet is False:
        status_msg = '[{0}] athena annotate-pairs: Adding track "{1}" ' + \
                     'with action "{2}"'
        print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'), 
                                track, action))

    # Convert BAM/CRAM records to BEDPE, if necessary
    if ftype in 'bam cram'.split():
        track = _bam_to_bedpe(track, query_regions)

    if action in 'count-pairs pairwise-coverage any-pairwise-overlap'.split():
        values = add_pairwise_bedtool_track(pairs_bedpe_bt, track, action, binsize)

    return values
Example #9
def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat,
               bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
        if not raw_mu:
            preds = log10(preds)
        preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
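
The prediction step above follows the standard PyTorch inference pattern: load the model, switch to eval mode, and run the forward pass under torch.no_grad(). A self-contained sketch with a toy model standing in for the trained mutation rate model (layer sizes and the feature matrix are illustrative only):

import numpy as np
import torch

# Toy stand-in for the model unpickled via torch.load() in mu_predict()
model = torch.nn.Sequential(torch.nn.Linear(10, 1), torch.nn.Sigmoid())
model.eval()                  # disable training-specific layer behavior

feats = torch.rand(100, 10)   # illustrative bins-by-features matrix

with torch.no_grad():         # no autograd bookkeeping during inference
    preds = model(feats).numpy().flatten()

# As in mu_predict(), rates can optionally be reported on a log10 scale
log_preds = np.log10(preds)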
Example #10
def add_local_track(bins, track, action, quiet):
    """
    Wrapper function to add a single local track
    """

    ftype = determine_filetype(track)

    if quiet is False:
        status_msg = '[{0}] athena annotate-bins: Adding track "{1}" ' + \
                     'with action "{2}"'
        print(
            status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'),
                              track, action))

    if action in 'count count-unique coverage any-overlap'.split():
        values = add_bedtool_track(bins, track, action)

    elif 'map-' in action:
        if ftype == 'bigwig':
            values = add_bigwig_track(bins, track, action)
        else:
            values = add_bedgraph_track(bins, track, action)

    return values
Example #11
def decompose_bins(bins, bins_outfile=None, parameters_outfile=None, precomp_model=None, 
                   components=10, minvar=None, trans_dict=None, whiten=False, 
                   fill_missing=0, first_column=3, maxfloat=5, max_pcs=100, 
                   pca_stats=None, eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform, sqrt_transform, 
                                exp_transform, square_transform,  boxcox_transform, 
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_) \
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1), 
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components, 
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats, 
                          eigen_prefix, components)
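
The component-selection logic above keeps the leading principal components whose cumulative explained variance stays below minvar, after scaling features to zero mean and unit variance. A minimal sklearn sketch of that step, with a random matrix standing in for the bin annotation dataframe:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = np.random.rand(500, 20)   # stand-in for the bin annotation matrix
minvar = 0.95                 # illustrative variance threshold

# Scale each feature before learning the covariance structure
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=min(X_scaled.shape[1], 100)).fit(X_scaled)

# Count leading components with cumulative explained variance < minvar
components = int(sum(np.cumsum(pca.explained_variance_ratio_) < minvar))

pcs = pca.transform(X_scaled)[:, :components]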
Example #12
def main():
    """
    Main block
    """

    # Parse arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('bed', help='BED file of intervals to be evaluated')
    parser.add_argument('probes_tsv',
                        help='.tsv of path to probeset BED and ' +
                        'array name')
    parser.add_argument('samples_tsv',
                        help='.tsv matrix of sample counts for ' +
                        'arrays (rows) X cohorts (columns)')
    parser.add_argument('cohorts', help='.tsv of metacohort assignments')
    parser.add_argument('-o',
                        '--outfile',
                        help='Output BED file annotated with ' +
                        'exclusion criteria [default: stdout]',
                        default='stdout')
    parser.add_argument('--probecounts-outfile',
                        help='Output BED file annotated with ' +
                        'number of probes per interval [optional]')
    parser.add_argument(
        '--control-mean-counts-outfile',
        help='Output BED file ' +
        'annotated with mean number of probes in controls per ' +
        'cohort, computed as the weighted average per platform ' +
        '[optional]')
    parser.add_argument(
        '--frac-pass-outfile',
        help='Output BED file annotated with ' +
        'fraction of passing samples per cohort per interval [optional]')
    parser.add_argument('--min-interval-size',
                        dest='min_size',
                        type=int,
                        default=100000,
                        help='Uniformly expand small intervals to ' +
                        'be at least this size [default: 100,000]')
    parser.add_argument(
        '--min-probes',
        type=int,
        default=10,
        help='Minimum number ' +
        'of probes required per interval to pass [default: 10]')
    parser.add_argument(
        '--min-frac-samples',
        type=float,
        default=0.9,
        help='Minimum fraction of samples required per interval ' +
        'to pass [default: 0.9]')
    parser.add_argument('-k',
                        '--keep-n-columns',
                        type=int,
                        default=3,
                        help='Number of columns from input BED to keep in ' +
                        '--outfile [default: 3]')
    parser.add_argument(
        '-z',
        '--bgzip',
        action='store_true',
        default=False,
        help='Compress --outfile and --probecounts-outfile with bgzip')
    args = parser.parse_args()

    # Step 1. Load intervals, and expand (as necessary)
    intervals, header = load_intervals(args.bed, args.min_size,
                                       args.keep_n_columns)

    # Step 2. Annotate intervals with probe counts using athena
    tracks, tnames = parse_probesets(args.probes_tsv)
    intervals = annotate_bins(bins=intervals.fn,
                              chroms=None,
                              ranges=None,
                              tracks=tracks,
                              ucsc_tracks=[],
                              ucsc_ref=None,
                              actions=['count' for i in range(len(tracks))],
                              fasta=None,
                              snv_mus=None,
                              maxfloat=8,
                              ucsc_chromsplit=False,
                              quiet=False)
    intervals = replace_coords(intervals, args.bed, args.keep_n_columns)
    counts_outfile = args.probecounts_outfile
    if counts_outfile is not None:
        if 'compressed' in determine_filetype(counts_outfile):
            counts_outfile = path.splitext(counts_outfile)[0]
        intervals.saveas(counts_outfile, trackline='\t'.join(header + tnames))
        if args.bgzip:
            bgzip(counts_outfile)

    # Step 3. Determine pass/fail labels per interval per array
    array_labels_df = label_array_fails(intervals, args.min_probes, header,
                                        tnames)

    # Step 4. Compute fraction of passing samples per interval per cohort
    array_counts = load_array_counts(args.samples_tsv, args.cohorts, tnames)
    cohort_fracs_df = get_passing_fracs(array_counts, array_labels_df,
                                        args.keep_n_columns)
    fracs_outfile = args.frac_pass_outfile
    if fracs_outfile is not None:
        if 'compressed' in determine_filetype(fracs_outfile):
            fracs_outfile = path.splitext(fracs_outfile)[0]
        cohort_fracs_df.to_csv(fracs_outfile, sep='\t', index=False)
        if args.bgzip:
            bgzip(fracs_outfile)

    # Step 5. Label each interval with cohorts to be excluded
    cohort_labels_df = label_cohort_fails(cohort_fracs_df,
                                          args.min_frac_samples,
                                          args.keep_n_columns)

    # Step 6. Format output file and write out
    if args.outfile in '- stdout /dev/stdout'.split():
        cohort_labels_df.to_csv(stdout, sep='\t', index=False)
    else:
        outfile = args.outfile
        if 'compressed' in determine_filetype(outfile):
            outfile = path.splitext(outfile)[0]
        cohort_labels_df.to_csv(outfile, sep='\t', index=False)
        if args.bgzip:
            bgzip(outfile)

    # [Optional] Step 7. Compute average number of probes per cohort per interval
    means_outfile = args.control_mean_counts_outfile
    if means_outfile is not None:
        cohort_means_df = get_cohort_means(array_counts, intervals, tnames,
                                           header, args.keep_n_columns)
        if 'compressed' in determine_filetype(means_outfile):
            means_outfile = path.splitext(means_outfile)[0]
        cohort_means_df.to_csv(means_outfile, sep='\t', index=False)
        if args.bgzip:
            bgzip(means_outfile)
Example #13
def annotate_pairs(pairs, chroms, ranges, tracks, ucsc_tracks, actions, track_names, 
                   ucsc_ref, fasta, binsize, homology_cutoffs, ucsc_chromsplit, 
                   maxfloat, quiet):
    """
    Master pair annotation function
    """

    # Infer binsize and filetype
    if binsize is None:
        binsize = calc_binsize(pairs)
    ftype = determine_filetype(pairs)


    # Load pairs. Note: must read contents from file due to odd utf-8 decoding 
    # behavior for bgzipped BED files with pybedtools
    if 'compressed' in ftype:
        pairs = ''.join(s.decode('utf-8') for s in GzipFile(pairs).readlines())
    else:
        pairs = ''.join(open(pairs, 'r').readlines())
    firstline = pairs.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    pairs = pbt.BedTool(pairs, from_string=True)


    # Subset pairs to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        pairs = pairs.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        pairs = pairs.intersect(ranges, wa=True).saveas()


    # Note: more efficient (and stable) when adding many annotations to hold 
    # pd.DataFrame of pairs with annotations in memory and convert entire 
    # pd.DataFrame back to pbt.BedTool after adding all annotations as columns
    # This appears to be due to peculiarities in pyBedTools handling of wide BED files
    pairs_bt = pairs.cut(range(3)).saveas()
    pairs_df = pairs.to_dataframe(names=colnames, comment='#')
    pairs_bedpe_bt = _pairs_bed_to_bedpe(pairs_bt, binsize)


    # Make master pbt.BedTool of all bins from all pairs
    split_pair_bts = [_split_pairs(p, binsize) for p in pairs_bt]
    allbins_bt = split_pair_bts[0].cat(*split_pair_bts[1:], postmerge=False).sort().merge(d=-1)
    query_regions = ucsc.collapse_query_regions(allbins_bt).saveas()


    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_local_track(pairs_bedpe_bt, track, action, query_regions, 
                                         binsize, quiet)
            track_counter += 1


    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'), 
                                    fasta))
        db = ucsc.ucsc_connect(ucsc_ref)

        # Iterate over tracks
        for track in ucsc_tracks:
            action = actions[track_counter]
            pairs_df['newtrack_{}'.format(track_counter)] = \
                add_pairwise_ucsc_track(pairs_bedpe_bt, db, track, action, query_regions, 
                                        binsize, ucsc_ref, ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()


    # Annotate pairs based on nucleotide content, if optioned
    if fasta is not None:

        if quiet is False:
            status_msg = '[{0}] athena annotate-pairs: Adding sequence homology ' + \
                         'features from reference fasta "{1}".'
            print(status_msg.format(datetime.now().strftime('%b %d %Y @ %H:%M:%S'), 
                                    fasta))

        for identity in homology_cutoffs:
            for rev in True, False:
                pairs_df['newtrack_{}'.format(track_counter)] = \
                    add_homology(pairs_bt, fasta, binsize, identity, rev)
                track_counter += 1
    

    # Clean up long floats
    pairs_df = float_cleanup(pairs_df, maxfloat, start_idx=3)


    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(pairs_df)
Example #14
def annotatepairs(pairs, outfile, chroms, ranges, track, ucsc_track, actions,
                  track_names, track_list, ucsc_list, ucsc_ref, fasta, binsize,
                  homology_cutoffs, no_ucsc_chromsplit, maxfloat, bgzip,
                  quiet):
    """
    Annotate pairs
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    tracks = list(track)
    ucsc_tracks = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])
    if len(homology_cutoffs) > 0:
        homology_cutoffs = list(homology_cutoffs)
    else:
        homology_cutoffs = [1.0]

    # Parse file with lists of tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(
            track_list)
        tracks = tracks + supp_tracks
        n_ucsc_tracks = len(ucsc_tracks)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                            + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = mutrate.parse_track_file(
            ucsc_list)
        ucsc_tracks = ucsc_tracks + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    if 'compressed' in determine_filetype(pairs):
        header = GzipFile(pairs).readline().decode('utf-8').rstrip()
    else:
        header = open(pairs, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-pairs: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    if len(track_names) > 0:
        newheader = header + '\t' + '\t'.join(list(track_names))
    else:
        newheader = header
    if fasta is not None:
        for k in homology_cutoffs:
            for direction in 'fwd rev'.split():
                newheader += '\t' + 'longest_{}_kmer_{}pct_identity'.format(
                    direction, int(round(100 * k)))

    # Annotate pairs
    newpairs = mutrate.annotate_pairs(pairs, chroms, ranges, tracks,
                                      ucsc_tracks, actions, track_names,
                                      ucsc_ref, fasta, binsize,
                                      homology_cutoffs, ucsc_chromsplit,
                                      maxfloat, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newpairs.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
Example #15
def annotatebins(bins, outfile, include_chroms, ranges, track, ucsc_track,
                 actions, track_names, track_list, ucsc_list, ucsc_ref, fasta,
                 snv_mus, no_ucsc_chromsplit, maxfloat, bgzip, quiet):
    """
    Annotate bins
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    track = list(track)
    ucsc_track = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])

    # Parse file with lists of tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(
            track_list)
        track = track + supp_tracks
        n_ucsc_tracks = len(ucsc_track)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                            + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = mutrate.parse_track_file(
            ucsc_list)
        ucsc_track = ucsc_track + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    n_tracks = len(track) + len(ucsc_track)
    if n_tracks != len(track_names):
        err = 'INPUT ERROR: Number of supplied track names ({0}) does not ' + \
              'match number of tracks ({1}).'
        exit(err.format(len(track_names), n_tracks))
    if 'compressed' in determine_filetype(bins):
        header = GzipFile(bins).readline().decode('utf-8').rstrip()
    else:
        header = open(bins, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-bins: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    newheader = header + '\t' + '\t'.join(list(track_names))
    if fasta is not None:
        newheader = '\t'.join([newheader, 'pct_gc'])
        if snv_mus is not None:
            newheader = '\t'.join([newheader, 'snv_mu'])

    # Annotate bins
    newbins = mutrate.annotate_bins(bins, include_chroms, ranges, track,
                                    ucsc_track, ucsc_ref, actions, fasta,
                                    snv_mus, maxfloat, ucsc_chromsplit, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newbins.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
Example #16
def annotate_bins(bins, chroms, ranges, tracks, ucsc_tracks, ucsc_ref, actions,
                  fasta, snv_mus, maxfloat, ucsc_chromsplit, quiet):
    """
    Master bin annotation function
    """

    # Parse & sanity check all track inputs
    n_all_tracks = len(tracks) + len(ucsc_tracks)
    if len(actions) != n_all_tracks:
        from sys import exit
        err = 'INPUT ERROR: Number of actions ({0}) does not match number ' + \
              'of tracks ({1}).'
        exit(err.format(len(actions), n_all_tracks))

    if len(ucsc_tracks) > 0:
        if ucsc_ref is None:
            from sys import exit
            exit('INPUT ERROR: --ucsc-ref must be specified if any UCSC ' +
                 'tracks are requested.')

    # Load bins. Note: must read contents from file due to odd utf-8 decoding
    # behavior for bgzipped BED files
    ftype = determine_filetype(bins)
    if ftype is None:
        ftype = 'unknown'
    if 'compressed' in ftype:
        bins = ''.join(s.decode('utf-8') for s in GzipFile(bins).readlines())
    else:
        bins = ''.join(open(bins, 'r').readlines())
    firstline = bins.split('\n')[0].split('\t')
    if firstline[0].startswith('#'):
        colnames = firstline
    else:
        colnames = None
    n_cols_old = len(firstline)
    bins = pbt.BedTool(bins, from_string=True)

    # Subset bins to specific chromosomes/ranges, if optioned
    if chroms is not None:
        chrlist = chroms.split(',')
        bins = bins.filter(lambda x: x.chrom in chrlist).saveas()
    if ranges is not None:
        bins = bins.intersect(ranges, wa=True).saveas()

    # Note: more efficient (and stable) when adding many annotations to hold
    # pd.DataFrame of bins with annotations in memory and convert entire
    # pd.DataFrame back to pbt.BedTool after adding all annotations as columns
    # This appears to be due to peculiarities in pyBedTools handling of wide BED files
    bins_bt = bins.cut(range(3)).saveas()
    bins_df = bins.to_dataframe(names=colnames, comment='#')

    # Annotate bins with all local tracks
    track_counter = 0
    if len(tracks) > 0:
        for track in tracks:
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_local_track(bins_bt, track, action, quiet)
            track_counter += 1

    # Annotate bins with all UCSC tracks
    if len(ucsc_tracks) > 0:
        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Connecting to UCSC ' + \
                         'Genome Browser database'
            print(
                status_msg.format(
                    datetime.now().strftime('%b %d %Y @ %H:%M:%S'), fasta))
        db = ucsc.ucsc_connect(ucsc_ref)
        query_regions = ucsc.collapse_query_regions(bins).saveas()

        # Iterate over tracks
        for track in ucsc_tracks:
            # Ping to check the db connection is still active (UCSC may time out over sequential long queries)
            # If the UCSC connection has timed out, reopen a new connection
            try:
                db.ping(True)
            except:
                try:
                    db.close()
                except:
                    pass
                db = ucsc.ucsc_connect(ucsc_ref)

            # Submit UCSC query
            action = actions[track_counter]
            bins_df['newtrack_{}'.format(track_counter)] = \
                add_ucsc_track(bins_bt, db, track, action, query_regions,
                               ucsc_ref, ucsc_chromsplit, quiet)
            track_counter += 1

        # Close UCSC connection
        db.close()

    # Annotate bins with nucleotide content, if optioned
    if fasta is not None:

        if quiet is False:
            status_msg = '[{0}] athena annotate-bins: Adding nucleotide ' + \
                         'content from reference fasta "{1}".'
            print(
                status_msg.format(
                    datetime.now().strftime('%b %d %Y @ %H:%M:%S'), fasta))

        bins_df['pct_gc'] = add_nuc_content(bins, fasta, maxfloat)

        # Annotate bins with SNV mutation rates, if optioned
        if snv_mus is not None:
            if quiet is False:
                status_msg = '[{0}] athena annotate-bins: Adding SNV mutation ' + \
                             'rates from reference fasta "{1}".'
                print(
                    status_msg.format(
                        datetime.now().strftime('%b %d %Y @ %H:%M:%S'), fasta))

            bins_df['snv_mu'] = add_snv_mu(bins, fasta, snv_mus, maxfloat)

    # Clean up long floats
    bins_df = float_cleanup(bins_df, maxfloat, start_idx=n_cols_old)

    # Return bins as pbt.BedTool
    return pbt.BedTool.from_dataframe(bins_df)
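
The in-memory note above reflects a design choice: annotations are accumulated as columns of a pandas DataFrame and converted back to a pbt.BedTool only once at the end, which sidesteps pybedtools' handling of very wide BED files. A minimal sketch of that roundtrip with placeholder intervals and made-up annotation values:

import pybedtools as pbt

bt = pbt.BedTool('chr1\t100\t200\nchr1\t300\t400\n', from_string=True)

# BedTool -> DataFrame: add annotation columns cheaply in pandas
df = bt.to_dataframe()
df['newtrack_0'] = [5, 2]        # illustrative per-interval values
df['pct_gc'] = [0.41, 0.57]

# DataFrame -> BedTool: convert back once all columns are attached
annotated = pbt.BedTool.from_dataframe(df)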
Example #17
def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs,
             sv_ci, maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        bins_header = gzip.open(
            bins_in, 'r').readline().decode('utf-8').rstrip().split('\t')
    else:
        bins_header = open(bins_in, 'r').readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format
    # If breakpoints == False, will return simple four-column BED with variant ID in fourth column
    # If breakpoints == True, will return two rows per record, where each row
    # is one breakpoint with columns 4 = variant ID, 5 = POS or END, 6 = original
    # POS or END coordinate, 7 = std dev of left side of breakpoint, 8 = std dev of
    # right side of breakpoint, and 9 = number of std deviations extended left & right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf,
                     breakpoints=breakpoints,
                     add_ci_to_bkpts=probs,
                     ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt,
                                   binsize=binsize,
                                   add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series(
                [min([1, int(x[-1])]) for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series(
                [int(x[-1]) for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(
        pd.concat([bins_df, sv_column, feats_df], axis=1), maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)