Ejemplo n.º 1
0
def prepare_snp_sfs(vcf, call, n, sel_sfs_regions, call_sel_reg):
    """
    Assemble the SNP site frequency data that goes into an anavar control file.

    Two spectra are requested from vcf2sfs in 'snp' mode with auto_only=True:
    one restricted to ``sel_sfs_regions`` and one restricted to ``degen=4``.
    Both are converted with sfs2counts and paired with a callable sites value
    looked up in ``call``.

    :param vcf: str, vcf file handed to vcf2sfs as ``vcf_name``
    :param call: dict, indexed here as call['ALL'][region]['pol']
    :param n: int, forwarded to sfs2counts (presumably the sample size - confirm)
    :param sel_sfs_regions: list, regions passed to vcf2sfs for the selected class
    :param call_sel_reg: str, key in call['ALL'] giving the selected callable sites
    :return: dict, {'selected_SNP': (counts, callable), 'neutral_SNP': (counts, callable)}
    """

    # keyword arguments shared by both vcf2sfs requests
    shared_opts = dict(vcf_name=vcf, mode='snp', auto_only=True)

    # pull the two spectra out of the vcf
    selected_spectrum = vcf2sfs(regions=sel_sfs_regions, **shared_opts)
    neutral_spectrum = vcf2sfs(degen=4, **shared_opts)

    # reshape into the count format anavar expects
    selected_counts = sfs2counts(selected_spectrum, n)
    neutral_counts = sfs2counts(neutral_spectrum, n)

    # callable sites for each site class
    callable_all = call['ALL']
    selected_callable = callable_all[call_sel_reg]['pol']
    neutral_callable = callable_all['4fold']['pol']

    # pair counts with callable sites, keyed by the control file labels
    return {
        'selected_SNP': (selected_counts, selected_callable),
        'neutral_SNP': (neutral_counts, neutral_callable),
    }
Ejemplo n.º 2
0
def prepare_indel_sfs(vcf, call, n, length_list):
    """
    Assemble the INDEL site frequency data that goes into an anavar control file.

    Deletion and insertion spectra are requested from vcf2sfs for the CDS
    regions ('CDS_frameshift', 'CDS_non_frameshift') and for the non-coding
    regions ('intergenic', 'intron'), always with auto_only=True,
    skip_hetero=True and ``lengths=length_list``.

    :param vcf: str, vcf file handed to vcf2sfs as ``vcf_name``
    :param call: dict, indexed here as call['ALL'][region]['pol']
    :param n: int, forwarded to sfs2counts (presumably the sample size - confirm)
    :param length_list: set, forwarded to vcf2sfs as ``lengths``
    :return: dict, keyed 'selected_INS', 'selected_DEL', 'neutral_INS' and
             'neutral_DEL', each value a (counts, callable sites) tuple
    """

    coding_regions = ('CDS_frameshift', 'CDS_non_frameshift')
    noncoding_regions = ('intergenic', 'intron')

    def _spectrum(variant_mode, region_names):
        # every request differs only in mode and regions; a fresh list is
        # handed over each time, as separate list literals would be
        return vcf2sfs(vcf_name=vcf,
                       mode=variant_mode,
                       auto_only=True,
                       skip_hetero=True,
                       lengths=length_list,
                       regions=list(region_names))

    # pull the four spectra out of the vcf
    coding_del = _spectrum('del', coding_regions)
    noncoding_del = _spectrum('del', noncoding_regions)
    coding_ins = _spectrum('ins', coding_regions)
    noncoding_ins = _spectrum('ins', noncoding_regions)

    # reshape into the count format anavar expects
    counts_sel_ins = sfs2counts(coding_ins, n)
    counts_sel_del = sfs2counts(coding_del, n)
    counts_neu_ins = sfs2counts(noncoding_ins, n)
    counts_neu_del = sfs2counts(noncoding_del, n)

    # callable sites: CDS for selected, intergenic plus intron for neutral
    callable_all = call['ALL']
    selected_callable = callable_all['CDS']['pol']
    neutral_callable = (callable_all['intergenic']['pol'] +
                        callable_all['intron']['pol'])

    # pair counts with callable sites, keyed by the control file labels
    return {
        'selected_INS': (counts_sel_ins, selected_callable),
        'selected_DEL': (counts_sel_del, selected_callable),
        'neutral_INS': (counts_neu_ins, neutral_callable),
        'neutral_DEL': (counts_neu_del, neutral_callable),
    }
Ejemplo n.º 3
0
def prepare_indel_sfs(vcf,
                      call,
                      n,
                      sel_sfs_regions,
                      call_sel_reg,
                      ar_ref=True):
    """
    Assemble the INDEL site frequency data that goes into an anavar control file.

    Insertion and deletion spectra are requested from vcf2sfs (auto_only=True)
    for the caller-supplied selected regions and for a neutral reference that
    depends on ``ar_ref``.

    :param vcf: str, vcf file handed to vcf2sfs as ``vcf_name``
    :param call: dict, indexed here as call['ALL'][region]['pol']
    :param n: int, forwarded to sfs2counts (presumably the sample size - confirm)
    :param sel_sfs_regions: list, regions passed to vcf2sfs for the selected class
    :param call_sel_reg: str, key in call['ALL'] giving the selected callable sites
    :param ar_ref: bool, tested for truthiness; when true the neutral reference
                   is ['intergenic_ar', 'intron_ar'] with callable key 'AR',
                   otherwise ['intergenic', 'intron'] with callable key 'noncoding'
    :return: dict, keyed 'selected_INS', 'selected_DEL', 'neutral_INS' and
             'neutral_DEL', each value a (counts, callable sites) tuple
    """

    # choose the neutral reference regions and the matching callable sites key
    neu_sfs_regions, neu_call_reg = (
        (['intergenic_ar', 'intron_ar'], 'AR') if ar_ref
        else (['intergenic', 'intron'], 'noncoding'))

    def _spectrum(variant_mode, region_names):
        # every request differs only in mode and regions
        return vcf2sfs(vcf_name=vcf,
                       mode=variant_mode,
                       auto_only=True,
                       regions=region_names)

    # pull the four spectra out of the vcf
    selected_del = _spectrum('del', sel_sfs_regions)
    neutral_del = _spectrum('del', neu_sfs_regions)
    selected_ins = _spectrum('ins', sel_sfs_regions)
    neutral_ins = _spectrum('ins', neu_sfs_regions)

    # reshape into the count format anavar expects
    counts_sel_ins = sfs2counts(selected_ins, n)
    counts_sel_del = sfs2counts(selected_del, n)
    counts_neu_ins = sfs2counts(neutral_ins, n)
    counts_neu_del = sfs2counts(neutral_del, n)

    # callable sites for each site class
    callable_all = call['ALL']
    selected_callable = callable_all[call_sel_reg]['pol']
    neutral_callable = callable_all[neu_call_reg]['pol']

    # pair counts with callable sites, keyed by the control file labels
    return {
        'selected_INS': (counts_sel_ins, selected_callable),
        'selected_DEL': (counts_sel_del, selected_callable),
        'neutral_INS': (counts_neu_ins, neutral_callable),
        'neutral_DEL': (counts_neu_del, neutral_callable),
    }
def main():
    """
    Print a tab separated table of per-window non-coding INDEL statistics.

    For every line of the gzipped window bed file (chromosome, start, stop,
    window id) the insertion, deletion and folded indel spectra are requested
    from vcf2sfs for the 'intron' and 'intergenic' regions, the callable sites
    in the window are obtained from window_call_sites, and one row is printed
    to stdout holding the window's recombination rate, callable sites, variant
    counts, theta, pi, Tajima's D and the fraction of indels that were
    polarised. Statistics are reported as 'NA' when a window has no
    insertions, no deletions or no callable sites.

    The hidden ``-correct_sfs`` flag passes the insertion and deletion spectra
    through correct_sfs before the statistics are calculated.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('-correct_sfs',
                        default=False,
                        action='store_true',
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    # paths - the data root differs between a Darwin machine and elsewhere
    if platform.uname()[0] == 'Darwin':
        stem = '/Users/henryjuho/sharc_fastdata/'
    else:
        stem = '/fastdata/bop15hjb/h_j_b/'

    vcf = stem + 'GT_data/BGI_BWA_GATK/Analysis_ready_data/' \
                 'final/bgi_10birds.filtered_indels.pol.anno.recomb.line.vcf.gz'

    rec = stem + 'GT_data/BGI_BWA_GATK/anavar_analysis/window_analysis/gt_2Mb_window_rec_rates.txt'

    windows = stem + 'GT_data/BGI_BWA_GATK/anavar_analysis/window_analysis/gt_windows.2Mb.bed.gz'

    call_sites = pysam.FastaFile(stem + 'GT_ref/bgi_10birds.callable.fa')

    noncoding_bed = stem + 'GT_ref/gt_noncoding.bed.gz'
    rec_data = rec_rates(rec)
    nc_bed = pysam.TabixFile(noncoding_bed)

    # value handed to theta_w, pi and tajimas_d as their first argument
    # (presumably the number of sampled chromosomes, 2 x 10 birds - confirm)
    sample_size = 20

    # header
    print('chr',
          'start',
          'stop',
          'window',
          'rec_rate',
          'callable',
          'n_ins',
          'theta_ins',
          'pi_ins',
          'tajd_ins',
          'n_del',
          'theta_del',
          'pi_del',
          'tajd_del',
          'n_indel',
          'pol_success',
          sep='\t')

    # the with-block makes sure the gzip handle is closed once all windows
    # are processed, or if a window raises
    with gzip.open(windows) as window_file:
        for line in window_file:

            chromo, start, stop, wind_id = line.split()

            # convert the coordinates once per window; the original text is
            # still what gets printed
            start_pos = int(start)
            stop_pos = int(stop)

            # sfs
            ins = vcf2sfs(vcf,
                          mode='ins',
                          chromo=chromo,
                          start=start_pos,
                          stop=stop_pos,
                          regions=['intron', 'intergenic'])

            dels = vcf2sfs(vcf,
                           mode='del',
                           chromo=chromo,
                           start=start_pos,
                           stop=stop_pos,
                           regions=['intron', 'intergenic'])

            # correct if specified
            if args.correct_sfs:
                ins, dels = correct_sfs(list(ins), list(dels))
            else:
                ins = list(ins)
                dels = list(dels)

            # folded spectrum of all indels, only its length is used
            indels = vcf2sfs(vcf,
                             mode='indel',
                             fold=True,
                             chromo=chromo,
                             start=start_pos,
                             stop=stop_pos,
                             regions=['intron', 'intergenic'])

            n_indels = len(list(indels))

            # callsites
            n_call = window_call_sites(call_sites, nc_bed,
                                       (chromo, start_pos, stop_pos))

            if len(ins) == 0 or len(dels) == 0 or n_call == 0:
                # nothing to summarise, and dividing by n_call would fail
                ins_t = ins_pi = ins_taj = 'NA'
                dels_t = dels_pi = dels_taj = 'NA'
                pol_success = 'NA'
            else:
                # proportion of indels that were polarised as ins or del
                pol_success = (len(ins) + len(dels)) / float(n_indels)

                # summary stats, theta and pi are per callable site
                ins_t = theta_w(sample_size, len(ins)) / float(n_call)
                ins_pi = pi(sample_size, ins) / float(n_call)
                ins_taj = tajimas_d(sample_size, ins)

                dels_t = theta_w(sample_size, len(dels)) / float(n_call)
                dels_pi = pi(sample_size, dels) / float(n_call)
                dels_taj = tajimas_d(sample_size, dels)

            # rec rate
            rr = rec_data[wind_id]

            print(chromo,
                  start,
                  stop,
                  wind_id,
                  rr,
                  n_call,
                  len(ins),
                  ins_t,
                  ins_pi,
                  ins_taj,
                  len(dels),
                  dels_t,
                  dels_pi,
                  dels_taj,
                  n_indels,
                  pol_success,
                  sep='\t')
Ejemplo n.º 5
0
def main():
    """
    Write an anavar control file and submit an anavar job for every bin bed.

    With ``-sub`` the script resubmits its own command line (minus ``-sub``)
    through q_sub and exits. Otherwise it:

    1. calculates the neutral insertion and deletion counts once, from the
       'intergenic_ar' and 'intron_ar' regions of the vcf, and reads the
       neutral callable sites from the ``-call_txt`` file;
    2. for each '.bed.gz' file listed in ``-bed_list``, obtains the insertion
       and deletion spectra by piping ``bedtools intersect`` into
       ``vcf2raw_sfs.py``, and sums the callable sites over every interval in
       the bed file that is not on 'chrZ';
    3. writes ``<out_pre>_bin<id>.control.txt`` and submits an anavar job for
       that bin through q_sub, seeded with the integer bin id.
    """
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf', help='vcf file with indels', required=True)
    parser.add_argument('-bed_list', help='list of bin beds', required=True)
    parser.add_argument('-call_fa',
                        help='callable sites fasta file',
                        required=True)
    parser.add_argument(
        '-call_txt',
        default='/home/bop15hjb/parus_indel/summary_analyses/bgi10_call.txt',
        help=argparse.SUPPRESS)
    parser.add_argument(
        '-equal_theta',
        help=
        'if specified runs with equal mutation rates between neu and sel sites',
        default=False,
        action='store_true')
    parser.add_argument(
        '-dfe',
        help='determines type of distribution to have in model',
        choices=['discrete', 'continuous'],
        default='continuous')
    parser.add_argument('-out_pre', help='output prefix', required=True)
    parser.add_argument('-evolgen',
                        help='if specified will run on lab queue',
                        default=False,
                        action='store_true')
    parser.add_argument('-sub',
                        help='If specified will submit script to cluster',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    # submission loop - rerun this command line as a job, without -sub
    if args.sub:
        command_line = [' '.join([x for x in sys.argv if x != '-sub'])]
        q_sub(command_line,
              args.out_pre + '.control_job',
              evolgen=args.evolgen)
        sys.exit()

    # flags
    constraint = 'equal_mutation_rate' if args.equal_theta else 'none'

    # files
    call_fasta = pysam.FastaFile(args.call_fa)

    # read the list inside a with-block so the handle is closed straight away
    with open(args.bed_list) as bed_list_file:
        listed_paths = [entry.rstrip() for entry in bed_list_file]

    # make a sorted list of form [('1', 'bin1.bed.gz'), ('2', 'bin2.bed.gz')]
    # NOTE: bin ids are strings, so the order is lexicographic ('10' sorts
    # before '2'); this only affects the order bins are processed in
    bed_files = sorted((bed_path.split('.')[-3].replace('bin', ''), bed_path)
                       for bed_path in listed_paths
                       if bed_path.endswith('.bed.gz'))

    # neu reference
    n_i_sfs = vcf2sfs(vcf_name=args.vcf,
                      mode='ins',
                      auto_only=True,
                      regions=['intergenic_ar', 'intron_ar'])

    n_d_sfs = vcf2sfs(vcf_name=args.vcf,
                      mode='del',
                      auto_only=True,
                      regions=['intergenic_ar', 'intron_ar'])

    sfs_ni = sfs2counts(n_i_sfs, 20)
    sfs_nd = sfs2counts(n_d_sfs, 20)

    neu_m = read_callable_txt(args.call_txt)['ALL']['AR']['pol']

    # command templates, identical for every bin
    sfs_cmd = (
        'bedtools intersect -header -a {} -b {} | '
        '~/sfs_utils/vcf2raw_sfs.py -region intergenic -region intron -mode {} -auto_only'
    )

    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'

    anavar_cmd = '{path}anavar1.22 {ctl} {rslts} {log} {seed}'

    # everything else per window
    for bin_id, bin_bed in bed_files:

        out_stem = '{}_bin{}'.format(args.out_pre, bin_id)

        # one sfs entry per line of the pipeline's stdout; [:-1] drops the
        # empty string left after the final newline
        # NOTE(review): splitting on a str relies on communicate() returning
        # str, which is the Python 2 behaviour - confirm before porting
        ins_sfs = subprocess.Popen(
            sfs_cmd.format(args.vcf, bin_bed, 'ins'),
            shell=True,
            stdout=subprocess.PIPE).communicate()[0].split('\n')[:-1]
        del_sfs = subprocess.Popen(
            sfs_cmd.format(args.vcf, bin_bed, 'del'),
            shell=True,
            stdout=subprocess.PIPE).communicate()[0].split('\n')[:-1]

        # get sel call sites, summed over the bin's intervals; the with-block
        # closes the gzip handle before the next bin is opened
        call_sites = 0
        with gzip.open(bin_bed) as bin_intervals:
            for coord_set in bin_intervals:

                coords = coord_set.split()

                # intervals on chrZ do not contribute callable sites
                if coords[0] == 'chrZ':
                    continue

                call_sites += window_call_sites(
                    call_fasta, None,
                    (coords[0], int(coords[1]), int(coords[2])))

        ins_counts = sfs2counts(ins_sfs, 20)
        del_counts = sfs2counts(del_sfs, 20)

        # anavar setup
        sfs_data = {
            'selected_INS': (ins_counts, call_sites),
            'selected_DEL': (del_counts, call_sites),
            'neutral_INS': (sfs_ni, neu_m),
            'neutral_DEL': (sfs_nd, neu_m)
        }

        # sort file names
        ctl_name = out_stem + '.control.txt'
        result_name = out_stem + '.results.txt'
        log_name = out_stem + '.log.txt'

        # make control file
        ctl = an.IndelNeuSelControlFile()

        ctl.set_alg_opts(search=500,
                         alg='NLOPT_LD_SLSQP',
                         key=3,
                         epsabs=1e-20,
                         epsrel=1e-9,
                         rftol=1e-9,
                         maxtime=3600,
                         optional=True)

        ctl.set_data(sfs_data,
                     20,
                     dfe=args.dfe,
                     c=1,
                     theta_r=(1e-10, 0.1),
                     r_r=(0.01, 100),
                     scale_r=(0.1, 5000.0),
                     gamma_r=(-5e4, 1e2))

        ctl.set_constraint(constraint)
        ctl_contents = ctl.construct()

        with open(ctl_name, 'w') as control:
            control.write(ctl_contents)

        # submit anavar window job, seeded with the bin number
        window_cmd = anavar_cmd.format(path=anavar_path,
                                       ctl=ctl_name,
                                       rslts=result_name,
                                       log=log_name,
                                       seed=int(bin_id))

        q_sub([window_cmd], out=out_stem, t=24, evolgen=args.evolgen)