def prepare_snp_sfs(vcf, call, n, sel_sfs_regions, call_sel_reg):

    """
    builds the selected and neutral snp sfs entries used as anavar input
    :param vcf: str, passed to vcf2sfs as vcf_name
    :param call: dict, indexed as call['ALL'][region]['pol']
    :param n: int, passed to sfs2counts alongside each sfs
    :param sel_sfs_regions: list, regions handed to vcf2sfs for the selected sfs
    :param call_sel_reg: str, key in call['ALL'] for the selected callable sites
    :return: dict with keys 'selected_SNP' and 'neutral_SNP', each a (counts, callable sites) tuple
    """

    # site frequencies: requested regions for selected, degen=4 sites for neutral
    raw_selected = vcf2sfs(vcf_name=vcf, mode='snp', auto_only=True, regions=sel_sfs_regions)
    raw_neutral = vcf2sfs(vcf_name=vcf, mode='snp', auto_only=True, degen=4)

    # reshape into the count format anavar takes
    selected_counts = sfs2counts(raw_selected, n)
    neutral_counts = sfs2counts(raw_neutral, n)

    # callable site numbers for each class
    callable_all = call['ALL']
    selected_callable = callable_all[call_sel_reg]['pol']
    neutral_callable = callable_all['4fold']['pol']

    # sfs section of the control file
    return {
        'selected_SNP': (selected_counts, selected_callable),
        'neutral_SNP': (neutral_counts, neutral_callable),
    }
def prepare_indel_sfs(vcf, call, n, length_list):

    """
    builds the selected (CDS) and neutral (intergenic + intron) insertion and
    deletion sfs entries used as anavar input
    :param vcf: str, passed to vcf2sfs as vcf_name
    :param call: dict, indexed as call['ALL'][region]['pol']
    :param n: int, passed to sfs2counts alongside each sfs
    :param length_list: set, passed to vcf2sfs as lengths
    :return: dict with keys 'selected_INS', 'selected_DEL', 'neutral_INS' and 'neutral_DEL',
             each a (counts, callable sites) tuple
    """

    # keyword arguments common to all four vcf2sfs calls
    shared = dict(vcf_name=vcf, auto_only=True, skip_hetero=True, lengths=length_list)
    coding = ('CDS_frameshift', 'CDS_non_frameshift')
    noncoding = ('intergenic', 'intron')

    # site frequencies, a fresh region list is given to every call
    raw_sel_del = vcf2sfs(mode='del', regions=list(coding), **shared)
    raw_neu_del = vcf2sfs(mode='del', regions=list(noncoding), **shared)
    raw_sel_ins = vcf2sfs(mode='ins', regions=list(coding), **shared)
    raw_neu_ins = vcf2sfs(mode='ins', regions=list(noncoding), **shared)

    # reshape into the count format anavar takes
    sel_ins_counts = sfs2counts(raw_sel_ins, n)
    sel_del_counts = sfs2counts(raw_sel_del, n)
    neu_ins_counts = sfs2counts(raw_neu_ins, n)
    neu_del_counts = sfs2counts(raw_neu_del, n)

    # callable sites: CDS for selected, intergenic plus intron for neutral
    callable_all = call['ALL']
    selected_callable = callable_all['CDS']['pol']
    neutral_callable = callable_all['intergenic']['pol'] + callable_all['intron']['pol']

    # sfs section of the control file
    return {
        'selected_INS': (sel_ins_counts, selected_callable),
        'selected_DEL': (sel_del_counts, selected_callable),
        'neutral_INS': (neu_ins_counts, neutral_callable),
        'neutral_DEL': (neu_del_counts, neutral_callable),
    }
def prepare_indel_sfs(vcf, call, n, sel_sfs_regions, call_sel_reg, ar_ref=True):

    """
    builds the selected and neutral insertion and deletion sfs entries used as anavar input
    :param vcf: str, passed to vcf2sfs as vcf_name
    :param call: dict, indexed as call['ALL'][region]['pol']
    :param n: int, passed to sfs2counts alongside each sfs
    :param sel_sfs_regions: list, regions handed to vcf2sfs for the selected sfs
    :param call_sel_reg: str, key in call['ALL'] for the selected callable sites
    :param ar_ref: flag (default True), if truthy the neutral reference uses the
                   'intergenic_ar'/'intron_ar' regions and the 'AR' callable sites,
                   otherwise 'intergenic'/'intron' and 'noncoding'
    :return: dict with keys 'selected_INS', 'selected_DEL', 'neutral_INS' and 'neutral_DEL',
             each a (counts, callable sites) tuple
    """

    # pick the neutral reference regions and the matching callable sites key
    neu_sfs_regions, neu_call_reg = (
        (['intergenic_ar', 'intron_ar'], 'AR') if ar_ref
        else (['intergenic', 'intron'], 'noncoding')
    )

    # site frequencies
    raw_sel_del = vcf2sfs(vcf_name=vcf, mode='del', auto_only=True, regions=sel_sfs_regions)
    raw_neu_del = vcf2sfs(vcf_name=vcf, mode='del', auto_only=True, regions=neu_sfs_regions)
    raw_sel_ins = vcf2sfs(vcf_name=vcf, mode='ins', auto_only=True, regions=sel_sfs_regions)
    raw_neu_ins = vcf2sfs(vcf_name=vcf, mode='ins', auto_only=True, regions=neu_sfs_regions)

    # reshape into the count format anavar takes
    sel_ins_counts = sfs2counts(raw_sel_ins, n)
    sel_del_counts = sfs2counts(raw_sel_del, n)
    neu_ins_counts = sfs2counts(raw_neu_ins, n)
    neu_del_counts = sfs2counts(raw_neu_del, n)

    # callable site numbers for each class
    callable_all = call['ALL']
    selected_callable = callable_all[call_sel_reg]['pol']
    neutral_callable = callable_all[neu_call_reg]['pol']

    # sfs section of the control file
    return {
        'selected_INS': (sel_ins_counts, selected_callable),
        'selected_DEL': (sel_del_counts, selected_callable),
        'neutral_INS': (neu_ins_counts, neutral_callable),
        'neutral_DEL': (neu_del_counts, neutral_callable),
    }
def main():

    """
    prints a tab delimited table with one row per window in the windows bed file,
    giving the recombination rate, callable sites, and theta, pi and Tajima's D
    for insertions and deletions in intron and intergenic regions

    command line: -correct_sfs (hidden flag), if given the insertion and deletion
    sfs are passed through correct_sfs before the statistics are calculated
    :return: None
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('-correct_sfs', default=False, action='store_true', help=argparse.SUPPRESS)
    args = parser.parse_args()

    # paths
    if platform.uname()[0] == 'Darwin':
        stem = '/Users/henryjuho/sharc_fastdata/'
    else:
        stem = '/fastdata/bop15hjb/h_j_b/'

    vcf = stem + 'GT_data/BGI_BWA_GATK/Analysis_ready_data/' \
                 'final/bgi_10birds.filtered_indels.pol.anno.recomb.line.vcf.gz'
    rec = stem + 'GT_data/BGI_BWA_GATK/anavar_analysis/window_analysis/gt_2Mb_window_rec_rates.txt'
    windows = stem + 'GT_data/BGI_BWA_GATK/anavar_analysis/window_analysis/gt_windows.2Mb.bed.gz'
    call_sites = pysam.FastaFile(stem + 'GT_ref/bgi_10birds.callable.fa')
    noncoding_bed = stem + 'GT_ref/gt_noncoding.bed.gz'

    rec_data = rec_rates(rec)
    nc_bed = pysam.TabixFile(noncoding_bed)

    # value handed to theta_w, pi and tajimas_d as their first argument
    # NOTE(review): presumably the number of chromosomes sampled (10 birds) - confirm
    n = 20

    # header
    print('chr', 'start', 'stop', 'window', 'rec_rate', 'callable',
          'n_ins', 'theta_ins', 'pi_ins', 'tajd_ins',
          'n_del', 'theta_del', 'pi_del', 'tajd_del',
          'n_indel', 'pol_success', sep='\t')

    # the with block makes sure the windows file is closed, including when a window raises
    # NOTE(review): gzip.open defaults to binary mode, under Python 3 the fields
    # below would be bytes - confirm the interpreter this script targets
    with gzip.open(windows) as window_file:
        for line in window_file:

            chromo, start, stop, wind_id = line.split()

            # convert once, the original strings are kept for the output row
            start_pos, stop_pos = int(start), int(stop)

            # sfs
            ins = vcf2sfs(vcf, mode='ins',
                          chromo=chromo, start=start_pos, stop=stop_pos,
                          regions=['intron', 'intergenic'])

            dels = vcf2sfs(vcf, mode='del',
                           chromo=chromo, start=start_pos, stop=stop_pos,
                           regions=['intron', 'intergenic'])

            # correct if specified
            if args.correct_sfs:
                ins, dels = correct_sfs(list(ins), list(dels))
            else:
                ins = list(ins)
                dels = list(dels)

            # folded sfs of all indels, only its length is used
            indels = vcf2sfs(vcf, mode='indel', fold=True,
                             chromo=chromo, start=start_pos, stop=stop_pos,
                             regions=['intron', 'intergenic'])

            n_indels = len(list(indels))

            # callsites
            n_call = window_call_sites(call_sites, nc_bed, (chromo, start_pos, stop_pos))

            # windows without insertions, deletions or callable sites get NA for every statistic
            if len(ins) == 0 or len(dels) == 0 or n_call == 0:
                ins_t, ins_pi, ins_taj, dels_t, dels_pi, dels_taj, pol_success = ('NA',) * 7

            else:
                # proportion of the window's indels present in the ins and del sfs
                pol_success = (len(ins) + len(dels)) / float(n_indels)

                # summary stats, theta and pi are divided by the callable sites
                ins_t = theta_w(n, len(ins)) / float(n_call)
                ins_pi = pi(n, ins) / float(n_call)
                ins_taj = tajimas_d(n, ins)

                dels_t = theta_w(n, len(dels)) / float(n_call)
                dels_pi = pi(n, dels) / float(n_call)
                dels_taj = tajimas_d(n, dels)

            # rec rate
            rr = rec_data[wind_id]

            print(chromo, start, stop, wind_id, rr, n_call,
                  len(ins), ins_t, ins_pi, ins_taj,
                  len(dels), dels_t, dels_pi, dels_taj,
                  n_indels, pol_success, sep='\t')
def main():

    """
    writes an anavar control file for every bin bed file in the bed list and
    submits one anavar job per bin with q_sub

    the neutral reference sfs ('intergenic_ar' and 'intron_ar' regions) is taken
    once from the whole vcf, the selected sfs comes from the vcf intersected with
    each bin's bed file, and the selected callable sites are summed over the
    bin's bed intervals, skipping chrZ

    with -sub the script resubmits its own command line (minus -sub) through
    q_sub and exits without doing any work itself
    :return: None
    """

    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf', help='vcf file with indels', required=True)
    parser.add_argument('-bed_list', help='list of bin beds', required=True)
    parser.add_argument('-call_fa', help='callable sites fasta file', required=True)
    parser.add_argument('-call_txt',
                        default='/home/bop15hjb/parus_indel/summary_analyses/bgi10_call.txt',
                        help=argparse.SUPPRESS)
    parser.add_argument('-equal_theta',
                        help='if specified runs with equal mutation rates between neu and sel sites',
                        default=False, action='store_true')
    parser.add_argument('-dfe', help='determines type of distribution to have in model',
                        choices=['discrete', 'continuous'], default='continuous')
    parser.add_argument('-out_pre', help='output prefix', required=True)
    parser.add_argument('-evolgen', help='if specified will run on lab queue',
                        default=False, action='store_true')
    parser.add_argument('-sub', help='If specified will submit script to cluster',
                        action='store_true', default=False)
    args = parser.parse_args()

    # submission loop
    if args.sub:
        command_line = [' '.join([x for x in sys.argv if x != '-sub'])]
        q_sub(command_line, args.out_pre + '.control_job', evolgen=args.evolgen)
        sys.exit()

    # flags
    if args.equal_theta:
        constraint = 'equal_mutation_rate'
    else:
        constraint = 'none'

    # value given to sfs2counts and ctl.set_data
    # NOTE(review): presumably the number of chromosomes sampled - confirm
    n = 20

    # files
    call_fasta = pysam.FastaFile(args.call_fa)

    # make a sorted list of form [('1', 'bin1.bed.gz'), ('2', 'bin2.bed.gz')]
    # the list file is read inside a with block so it is closed straight away
    with open(args.bed_list) as bed_list:
        bed_paths = [entry.rstrip() for entry in bed_list]

    # bin ids are compared as strings, so '10' sorts before '2'
    bed_files = sorted(
        (path.split('.')[-3].replace('bin', ''), path)
        for path in bed_paths if path.endswith('.bed.gz'))

    # neu reference
    n_i_sfs = vcf2sfs(vcf_name=args.vcf, mode='ins',
                      auto_only=True,
                      regions=['intergenic_ar', 'intron_ar'])

    n_d_sfs = vcf2sfs(vcf_name=args.vcf, mode='del',
                      auto_only=True,
                      regions=['intergenic_ar', 'intron_ar'])

    sfs_ni = sfs2counts(n_i_sfs, n)
    sfs_nd = sfs2counts(n_d_sfs, n)

    neu_m = read_callable_txt(args.call_txt)['ALL']['AR']['pol']

    # command templates are the same for every bin, so build them once
    # NOTE: the vcf and bed paths are put into the shell pipeline unquoted, so
    # they must not contain spaces or shell metacharacters
    sfs_cmd = ('bedtools intersect -header -a {} -b {} | '
               '~/sfs_utils/vcf2raw_sfs.py -region intergenic -region intron -mode {} -auto_only')

    anavar_path = '/shared/evolgen1/shared_data/program_files/sharc/'
    anavar_cmd = '{path}anavar1.22 {ctl} {rslts} {log} {seed}'

    # everything else per window
    for bin_id, bin_bed in bed_files:

        out_stem = '{}_bin{}'.format(args.out_pre, bin_id)

        # one sfs entry per output line, the empty string after the last newline is dropped
        # NOTE(review): split('\n') on the pipe output needs str, under Python 3
        # communicate() returns bytes here - confirm the interpreter targeted
        ins_sfs = subprocess.Popen(
            sfs_cmd.format(args.vcf, bin_bed, 'ins'),
            shell=True, stdout=subprocess.PIPE).communicate()[0].split('\n')[:-1]

        del_sfs = subprocess.Popen(
            sfs_cmd.format(args.vcf, bin_bed, 'del'),
            shell=True, stdout=subprocess.PIPE).communicate()[0].split('\n')[:-1]

        # get sel call sites, summed over the bin's intervals
        call_sites = 0
        with gzip.open(bin_bed) as bed_handle:
            for coord_set in bed_handle:

                coords = coord_set.split()

                if coords[0] == 'chrZ':
                    continue

                call_sites += window_call_sites(
                    call_fasta, None, (coords[0], int(coords[1]), int(coords[2])))

        ins_counts = sfs2counts(ins_sfs, n)
        del_counts = sfs2counts(del_sfs, n)

        # anavar setup
        sfs_data = {'selected_INS': (ins_counts, call_sites),
                    'selected_DEL': (del_counts, call_sites),
                    'neutral_INS': (sfs_ni, neu_m),
                    'neutral_DEL': (sfs_nd, neu_m)}

        # sort file names
        ctl_name = out_stem + '.control.txt'
        result_name = out_stem + '.results.txt'
        log_name = out_stem + '.log.txt'

        # make control file
        ctl = an.IndelNeuSelControlFile()

        ctl.set_alg_opts(search=500, alg='NLOPT_LD_SLSQP', key=3,
                         epsabs=1e-20, epsrel=1e-9, rftol=1e-9,
                         maxtime=3600, optional=True)

        ctl.set_data(sfs_data, n, dfe=args.dfe, c=1,
                     theta_r=(1e-10, 0.1), r_r=(0.01, 100),
                     scale_r=(0.1, 5000.0), gamma_r=(-5e4, 1e2))

        ctl.set_constraint(constraint)
        ctl_contents = ctl.construct()

        with open(ctl_name, 'w') as control:
            control.write(ctl_contents)

        # submit anavar window job, the bin number doubles as the seed
        window_cmd = anavar_cmd.format(path=anavar_path, ctl=ctl_name,
                                       rslts=result_name, log=log_name,
                                       seed=int(bin_id))

        q_sub([window_cmd], out=out_stem, t=24, evolgen=args.evolgen)