Example 1
def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat,
               bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
        if not raw_mu:
            preds = log10(preds)
        preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
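
A minimal, hypothetical invocation of mu_predict is sketched below; the file paths and argument values are illustrative placeholders rather than values taken from the source, and they assume a model previously serialized with torch.save().

# Hypothetical example call (all paths are placeholders)
mu_predict(pairs='pairs.annotated.bed.gz',   # BED3 + feature columns
           model_pkl='mu_model.pt',          # model saved with torch.save()
           outfile='pairs.mu.bed.gz',        # '.gz' is stripped before writing
           raw_mu=False,                     # False: report log10-scaled mu
           keep_features=True,               # carry input features into the output
           maxfloat=5,                       # decimals kept by float_cleanup
           bgzip=True)                       # re-compress the output
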
Example 2
def filter_vcf(vcf, out, chroms, xchroms, svtypes, exclusion_list, minAF,
               maxAF, minAC, maxAC, minAN, filters, minQUAL, maxQUAL, HWE,
               af_field, keep_infos, bgzip):

    # Open connection to input VCF
    if vcf in '- stdin'.split():
        invcf = pysam.VariantFile(stdin)
    else:
        invcf = pysam.VariantFile(vcf)
    header = invcf.header

    # Clean undesired INFO fields from header
    if keep_infos != 'ALL':
        if keep_infos is None:
            keep_infos = []
        else:
            keep_infos = keep_infos.split(',')
        for key in 'END CHR2 SVTYPE SVLEN'.split():
            keep_infos.append(key)
        for key in header.info.keys():
            if key not in keep_infos:
                header.info.remove_header(key)

    # Open connection to output VCF
    if out in '- stdout'.split():
        outvcf = pysam.VariantFile(stdout, 'w', header=header)
    else:
        if out.endswith('.gz'):
            out = path.splitext(out)[0]
        outvcf = pysam.VariantFile(out, 'w', header=header)

    # Parse filtering options
    if chroms is not None:
        chroms = chroms.split(',')
    else:
        chroms = header.contigs.keys()
    if xchroms is not None:
        xchroms = xchroms.split(',')
        chroms = [c for c in chroms if c not in xchroms]
    if svtypes is not None:
        if 'SVTYPE' not in header.info.keys():
            sys.exit('SVTYPE filtering was specified, but input VCF ' +
                     'does not have SVTYPE entry in INFO.')
        else:
            svtypes = svtypes.split(',')
    if filters is not None:
        filters = filters.split(',')
    if exclusion_list is not None:
        bl = pybedtools.BedTool(exclusion_list)

    # Raise warning if AF or AC are missing from VCF
    for key in [af_field, 'AC']:
        if key not in header.info.keys():
            import warnings
            warning_message = '{0} not found in VCF INFO, so {0}-based filtering ' + \
                              'will be ignored'
            warning_message = warning_message.format(key)
            warnings.warn(warning_message, RuntimeWarning)

    # Raise exception if HWE enabled but any necessary fields missing
    if HWE is not None:
        for key in 'N_HOMREF N_HET N_HOMALT'.split():
            if key not in header.info.keys():
                error_message = 'Hardy-Weinberg filtering not possible due to ' + \
                                'missing {0} in VCF INFO'
                sys.exit(error_message.format(key))

    # Iterate over vcf & filter records
    for record in invcf.fetch():
        # Filter by chromosome
        if chroms is not None \
        and record.chrom not in chroms:
            continue

        # Filter by svtype
        if svtypes is not None \
        and record.info['SVTYPE'] not in svtypes:
            continue

        # Exclude records where end < start
        if record.stop < record.start:
            continue

        # Filter by AF/AC
        if af_field in record.info.keys():
            if minAF is not None:
                if np.nansum(record.info[af_field]) < minAF:
                    continue
            if maxAF is not None:
                if np.nansum(record.info[af_field]) > maxAF:
                    continue
        if 'AC' in record.info.keys():
            if minAC is not None:
                if np.nansum(record.info['AC']) < minAC:
                    continue
            if maxAC is not None:
                if np.nansum(record.info['AC']) > maxAC:
                    continue

        # Filter by AN
        if 'AN' in record.info.keys():
            if minAN is not None:
                if record.info['AN'] < minAN:
                    continue

        # Filter by VCF FILTER
        if filters is not None:
            if len([f for f in record.filter if f not in filters]) > 0:
                continue

        # Filter by QUAL score
        if minQUAL is not None \
        and record.qual is not None:
            if record.qual < minQUAL:
                continue
        if maxQUAL is not None \
        and record.qual is not None:
            if record.qual > maxQUAL:
                continue

        # Filter by Hardy-Weinberg equilibrium
        if HWE is not None and len(record.alts) < 3:
            if np.nansum(record.info[af_field]) < 1:
                if hwe_chisq(record) < HWE:
                    continue

        # Clean record
        if keep_infos != 'ALL':
            for key in record.info.keys():
                if key not in keep_infos:
                    record.info.pop(key)

        # Write filter-passing records to output VCF
        outvcf.write(record)

    outvcf.close()

    # Filter remaining records against exclusion_list
    if exclusion_list is not None:
        prebl_vcf = pybedtools.BedTool(out)
        prebl_vcf.intersect(bl, header=True, v=True).saveas(out)

    # Bgzip output VCF, if optioned
    if bgzip:
        bgz(out)
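
A hedged sketch of how filter_vcf might be invoked; every value below is an illustrative placeholder (the function itself defines no defaults), and the field names assume a VCF whose INFO block carries AF, AC, AN and SVTYPE.

# Hypothetical example call (paths and thresholds are placeholders)
filter_vcf(vcf='sv_calls.vcf.gz', out='sv_calls.filtered.vcf.gz',
           chroms=None,                     # None: keep all contigs in the header
           xchroms='chrX,chrY',             # comma-delimited contigs to exclude
           svtypes='DEL,DUP',               # comma-delimited SVTYPEs to keep
           exclusion_list='excluded_regions.bed',
           minAF=0.01, maxAF=None, minAC=1, maxAC=None, minAN=100,
           filters='PASS',                  # allowed FILTER values
           minQUAL=None, maxQUAL=None,
           HWE=None,                        # cutoff compared against hwe_chisq()
           af_field='AF', keep_infos='ALL', bgzip=True)
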
Example 3
def pair_bins(query_bins,
              all_bins,
              outfile,
              max_dist,
              exclusion_list,
              excl_buffer,
              annotate_dist,
              sort_features,
              annotate_absdiff,
              maxfloat,
              bgzip,
              input_has_header=True):
    """
    Create pairs of bins from input BED
    """

    # Open connection to infiles & outfile
    if determine_filetype(query_bins) == 'compressed-bed':
        fin = gzip.open(query_bins, 'rt')
    else:
        fin = open(query_bins)
    if input_has_header:
        colnames = [
            k.replace('#', '') for k in fin.readline().rstrip().split('\t')
        ]
    else:
        # Without a header line, feature names cannot be recovered from the input
        colnames = []
    if all_bins is None:
        bins_tabix = TabixFile(query_bins)
    else:
        bins_tabix = TabixFile(all_bins)
    xbt = load_exclusion_bts(exclusion_list, excl_buffer)

    # Open connection to output file
    out_ftype, out_ext = determine_filetype(outfile, return_extension=True)
    if 'compressed' in out_ftype:
        outpath = outfile.replace(out_ext, 'bed')
    else:
        outpath = outfile
    fout = open(outpath, 'w')

    # Format header and write to outfile
    header = '#chr start end'.split()
    if annotate_dist:
        header.append('distance')
    for fname in colnames[3:]:
        if sort_features:
            fname_suffixes = ['min', 'max']
        else:
            fname_suffixes = ['left', 'right']
        if annotate_absdiff:
            fname_suffixes.append('absdiff')
        header += ['_'.join([fname, v]) for v in fname_suffixes]
    fout.write('\t'.join(header) + '\n')

    # Identify and curate all pairs for each bin in fin
    for query_line in fin:
        query_vals = query_line.rstrip().split('\t')
        _get_pairs(fout, query_vals, bins_tabix, max_dist, xbt,
                   annotate_dist, sort_features, annotate_absdiff,
                   maxfloat)

    # Clean up
    fin.close()
    fout.close()
    if bgzip:
        bgz(outpath)
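
A hypothetical call to pair_bins is sketched below; paths and thresholds are placeholders, and all_bins is assumed to be a bgzipped, tabix-indexed BED since it is opened with pysam's TabixFile.

# Hypothetical example call (paths and distances are placeholders)
pair_bins(query_bins='bins.annotated.bed.gz',  # bins to pair (BED3+, with header)
          all_bins='bins.annotated.bed.gz',    # tabix-indexed universe of bins
          outfile='pairs.bed.gz',
          max_dist=1000000,                    # maximum pair distance, in bp
          exclusion_list=None, excl_buffer=0,
          annotate_dist=True,                  # add a 'distance' column
          sort_features=True,                  # min/max rather than left/right
          annotate_absdiff=True,               # add per-feature absolute difference
          maxfloat=5, bgzip=True)
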
Example 4
def decompose_bins(bins, bins_outfile=None, parameters_outfile=None, precomp_model=None, 
                   components=10, minvar=None, trans_dict=None, whiten=False, 
                   fill_missing=0, first_column=3, maxfloat=5, max_pcs=100, 
                   pca_stats=None, eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary
    if trans_dict is None:
        trans_dict = {}
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform, sqrt_transform, 
                                exp_transform, square_transform,  boxcox_transform, 
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_) \
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1), 
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components, 
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats, 
                          eigen_prefix, components)
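
A minimal, hypothetical invocation of decompose_bins; the paths, the minvar threshold, and the feature name inside trans_dict are placeholders used only to show the expected shape of each argument.

# Hypothetical example call (paths, threshold, and feature name are placeholders)
decompose_bins(bins='bins.annotated.bed.gz',
               bins_outfile='bins.eigen.bed.gz',
               parameters_outfile='eigen_params.pkl',  # reusable via precomp_model
               minvar=0.99,                            # variance-based PC cutoff
               trans_dict={'log': ['some_feature']},   # log-transform one feature
               whiten=True,                            # rescale the kept PCs
               bgzip=True)
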
Example 5
def annotatepairs(pairs, outfile, chroms, ranges, track, ucsc_track, actions,
                  track_names, track_list, ucsc_list, ucsc_ref, fasta, binsize,
                  homology_cutoffs, no_ucsc_chromsplit, maxfloat, bgzip,
                  quiet):
    """
    Annotate pairs
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    tracks = list(track)
    ucsc_tracks = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])
    if len(homology_cutoffs) > 0:
        homology_cutoffs = list(homology_cutoffs)
    else:
        homology_cutoffs = [1.0]

    # Parse file with lists of tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(
            track_list)
        tracks = tracks + supp_tracks
        n_ucsc_tracks = len(ucsc_tracks)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                            + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = mutrate.parse_track_file(
            ucsc_list)
        ucsc_tracks = ucsc_tracks + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    if 'compressed' in determine_filetype(pairs):
        header = GzipFile(pairs).readline().decode('utf-8').rstrip()
    else:
        header = open(pairs, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-pairs: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    if len(track_names) > 0:
        newheader = header + '\t' + '\t'.join(list(track_names))
    else:
        newheader = header
    if fasta is not None:
        for k in homology_cutoffs:
            for direction in 'fwd rev'.split():
                newheader += '\t' + 'longest_{}_kmer_{}pct_identity'.format(
                    direction, int(round(100 * k)))

    # Annotate pairs
    newpairs = mutrate.annotate_pairs(pairs, chroms, ranges, tracks,
                                      ucsc_tracks, actions, track_names,
                                      ucsc_ref, fasta, binsize,
                                      homology_cutoffs, ucsc_chromsplit,
                                      maxfloat, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newpairs.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
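
The sketch below shows one hypothetical way annotatepairs could be called; the track paths, UCSC table names, action names, and reference build are illustrative assumptions, not values documented in the source.

# Hypothetical example call (tracks, actions, and reference are placeholders)
annotatepairs(pairs='pairs.bed.gz', outfile='pairs.annotated.bed.gz',
              chroms=None, ranges=None,
              track=['conservation.bw'],         # local track(s)
              ucsc_track=['rmsk'],               # UCSC table(s)
              actions=['map-mean', 'coverage'],  # one action per track
              track_names=['cons_mean', 'rmsk_cov'],
              track_list=None, ucsc_list=None,
              ucsc_ref='hg38', fasta=None, binsize=None,
              homology_cutoffs=[1.0],
              no_ucsc_chromsplit=False, maxfloat=5,
              bgzip=True, quiet=False)
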
Example 6
def annotatebins(bins, outfile, include_chroms, ranges, track, ucsc_track,
                 actions, track_names, track_list, ucsc_list, ucsc_ref, fasta,
                 snv_mus, no_ucsc_chromsplit, maxfloat, bgzip, quiet):
    """
    Annotate bins
    """

    # Sanitize & format inputs
    ucsc_chromsplit = not no_ucsc_chromsplit
    track = list(track)
    ucsc_track = list(ucsc_track)
    actions = tuple([a.lower() for a in actions])

    # Parse file with lists of tracks (if provided) and add to track lists
    if track_list is not None:
        supp_tracks, supp_actions, supp_names = mutrate.parse_track_file(
            track_list)
        track = track + supp_tracks
        n_ucsc_tracks = len(ucsc_track)
        if n_ucsc_tracks > 0:
            actions = tuple(list(actions[:n_ucsc_tracks]) + supp_actions \
                            + list(actions[n_ucsc_tracks:]))
            track_names = tuple(list(track_names[:n_ucsc_tracks]) + supp_names \
                            + list(track_names[n_ucsc_tracks:]))
        else:
            actions = tuple(list(actions) + supp_actions)
            track_names = tuple(list(track_names) + supp_names)

    # Parse file with list of UCSC tracks (if provided) and add to track lists
    if ucsc_list is not None:
        supp_ucsc_tracks, supp_ucsc_actions, supp_ucsc_names = mutrate.parse_track_file(
            ucsc_list)
        ucsc_track = ucsc_track + supp_ucsc_tracks
        actions = tuple(list(actions) + supp_ucsc_actions)
        track_names = tuple(list(track_names) + supp_ucsc_names)

    # Handle header reformatting
    n_tracks = len(track) + len(ucsc_track)
    if n_tracks != len(track_names):
        err = 'INPUT ERROR: Number of supplied track names ({0}) does not ' + \
              'match number of tracks ({1}).'
        exit(err.format(len(track_names), n_tracks))
    if 'compressed' in determine_filetype(bins):
        header = GzipFile(bins).readline().decode('utf-8').rstrip()
    else:
        header = open(bins, 'r').readline().rstrip()
    if not header.startswith('#'):
        status_msg = '[{0}] athena annotate-bins: No header line detected. ' + \
                     'Adding default header.'
        print(status_msg.format(
            datetime.now().strftime('%b %d %Y @ %H:%M:%S')))
        n_extra_cols = len(header.split('\t')) - 3
        header = make_default_bed_header(n_extra_cols)
    if len(track_names) > 0:
        newheader = header + '\t' + '\t'.join(list(track_names))
    else:
        newheader = header
    if fasta is not None:
        newheader = '\t'.join([newheader, 'pct_gc'])
        if snv_mus is not None:
            newheader = '\t'.join([newheader, 'snv_mu'])

    # Annotate bins
    newbins = mutrate.annotate_bins(bins, include_chroms, ranges, track,
                                    ucsc_track, ucsc_ref, actions, fasta,
                                    snv_mus, maxfloat, ucsc_chromsplit, quiet)

    # Save annotated bins
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    newbins.saveas(outfile, trackline=newheader)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
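
As above, a hypothetical call to annotatebins; the track path, action, and column name are placeholders, and the number of track names must equal the total number of local plus UCSC tracks to pass the input check.

# Hypothetical example call (track, action, and name are placeholders)
annotatebins(bins='bins.bed.gz', outfile='bins.annotated.bed.gz',
             include_chroms=None, ranges=None,
             track=['conservation.bw'], ucsc_track=[],
             actions=['map-mean'], track_names=['cons_mean'],
             track_list=None, ucsc_list=None, ucsc_ref=None,
             fasta='reference.fa',             # adds a pct_gc column
             snv_mus=None,
             no_ucsc_chromsplit=False, maxfloat=5,
             bgzip=True, quiet=False)
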
Example 7
def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs,
             sv_ci, maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        bins_header = gzip.open(
            bins_in, 'r').readline().decode('utf-8').rstrip().split('\t')
    else:
        bins_header = open(bins_in, 'r').readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format.
    # If breakpoints == False, this returns a simple four-column BED with the
    # variant ID in the fourth column. If breakpoints == True, it returns two
    # rows per record (one per breakpoint) with columns:
    #   4 = variant ID
    #   5 = POS or END
    #   6 = original POS or END coordinate
    #   7 = std dev of the left side of the breakpoint
    #   8 = std dev of the right side of the breakpoint
    #   9 = number of std deviations extended left & right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf,
                     breakpoints=breakpoints,
                     add_ci_to_bkpts=probs,
                     ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt,
                                   binsize=binsize,
                                   add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series(
                [min([1, int(x[-1])]) for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series(
                [int(x[-1]) for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(
        pd.concat([bins_df, sv_column, feats_df], axis=1), maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
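
Finally, a hypothetical invocation of count_sv; the paths and the sv_ci value are placeholders, and the flags are set to illustrate the breakpoint-based branch described in the comments above.

# Hypothetical example call (paths and confidence interval are placeholders)
count_sv(bins_in='pairs.bed.gz',           # 1D bins or 2D bin-pairs
         sv_in='sv_calls.vcf.gz',          # SVs in VCF or BED format
         outfile='pairs.sv_counts.bed.gz',
         paired=True,                      # bins_in holds 2D bin-pairs
         binsize=None,                     # inferred from bins_in when None
         breakpoints=True,                 # count breakpoints, not overlaps
         probs=False,                      # report counts rather than probabilities
         sv_ci=0.95,                       # CI passed to vcf2bed for breakpoints
         maxfloat=5, bgzip=True)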