import argparse
import glob

# getDict, mergeDicts, mergeTheSFS and pi are helper functions defined
# elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(
        description="Extract the SFS from a bunch of merged SLiM output files")
    parser.add_argument("-i", "--input", required=True, dest="input", type=str,
                        help="The name of the input file (or the input directory)")
    parser.add_argument("-o", "--output", required=True, dest="output", type=str,
                        help="The name of the output file you want to write to")
    parser.add_argument("-d", "--dir", required=False, dest="dir",
                        action='store_true', default=False,
                        help="Use this flag if you want to combine a number "
                             "of files in the same directory")
    args = parser.parse_args()

    if args.dir:
        files = glob.glob(args.input + '/*sfs')
    else:
        files = [args.input]

    # Each input file corresponds to 1 Mbp of simulated sequence
    sites = float(len(files)) * 1e6

    # Get a list of dictionaries containing the SFSs
    dicts = [getDict(i) for i in files]
    nonsynSFS, synSFS = mergeDicts(dicts)

    # Pad the monomorphic class so that each SFS sums to the total number of
    # sites, assuming a 3:1 ratio of nonsynonymous to synonymous sites
    nonsynSFS[0] += (sites * 0.75) - sum(nonsynSFS[1:])
    synSFS[0] += (sites * 0.25) - sum(synSFS[1:])

    # Divergence: the proportion of sites in the fixed class
    ds = float(synSFS[-1]) / sum(synSFS)
    dn = float(nonsynSFS[-1]) / sum(nonsynSFS)

    print('nonsyn pi:', pi(nonsynSFS))
    print('syn pi:', pi(synSFS))
    print('pi / pi_0:', pi(synSFS) / 0.01)
    print('dN:', dn)
    print('dS:', ds)
    # Contribution of m2 mutations to the pool of segregating m1 + m2 mutations
    print('adv. cont:',
          float(sum(mergeTheSFS([i['m2'] for i in dicts])[1:-1])) /
          (sum(mergeTheSFS([i['m1'] for i in dicts])[1:-1]) +
           sum(mergeTheSFS([i['m2'] for i in dicts])[1:-1])))
    print('dN/dS:', dn / ds)


if __name__ == '__main__':
    main()
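# ---------------------------------------------------------------------------
# pi() is imported from elsewhere and not shown in this excerpt. As a point
# of reference, here is a minimal sketch of the standard calculation it
# presumably performs: the mean number of pairwise differences from an
# unfolded SFS, where sfs[i] counts sites with derived-allele count i in a
# sample of n chromosomes. pi_sketch is an illustrative name, not part of
# the project's modules.
def pi_sketch(sfs):
    n = len(sfs) - 1               # sample size (number of chromosomes)
    pairs = n * (n - 1) / 2.0      # number of pairwise comparisons
    # A site with derived count i differs in i * (n - i) of the pairs
    return sum(i * (n - i) * sfs[i] for i in range(1, n)) / pairs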
import argparse
import glob
import subprocess

import SFS

# getSFSfromSLiM is a helper function defined elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(
        description="Extract the SFS from a bunch of merged SLiM output files")
    parser.add_argument("-i", "--input", required=True, dest="input", type=str,
                        help="The name of the directory containing the SLiM output")
    parser.add_argument("-o", "--output", required=True, dest="output", type=str,
                        help="The name of the output file you want to write to")
    args = parser.parse_args()

    output = []
    # SLiM mutation types m1 to m8
    for m in range(1, 9):
        print('m' + str(m))
        full_sfs = []
        for i in glob.glob(args.input + '/R*'):
            num = i.split('/')[-1].split('.')[2]  # replicate number
            # Pull the lines for this mutation type out of the gzipped output
            process = subprocess.Popen(
                ['zgrep', 'm' + str(m), i],
                stdout=subprocess.PIPE).communicate()[0].decode()
            fixations, sfs = getSFSfromSLiM(process)
            if fixations is None:
                continue
            # Fold fixed differences into the top class and zero the
            # monomorphic class
            sfs[-1] += fixations
            sfs[0] = 0
            # Per-replicate diagnostics (140 kb is the assumed region size)
            print(SFS.pi(sfs) / 140000.)
            print(':'.join(map(str, sfs)))
            if len(full_sfs) == 0:
                full_sfs = sfs
            else:
                full_sfs = SFS.merge_SFS(full_sfs, sfs)
        output.append(['m' + str(m), full_sfs])

    txt = open(args.output, 'w')
    for i in output:
        print(i)
        txt.write(i[0] + '\n')
        txt.write(' '.join(map(str, i[1])) + '\n')
    txt.close()


if __name__ == '__main__':
    main()
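# ---------------------------------------------------------------------------
# SFS.merge_SFS() is not shown in this excerpt. Assuming both SFSs come from
# samples of the same size, merging two spectra is just an element-wise sum
# of their site counts; a sketch under that assumption (merge_SFS_sketch is
# a hypothetical name):
def merge_SFS_sketch(sfs_a, sfs_b):
    assert len(sfs_a) == len(sfs_b), 'SFSs must share a sample size'
    return [a + b for a, b in zip(sfs_a, sfs_b)]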
import argparse
import gzip

import pandas as pd

import SFS_tools


def main():
    parser = argparse.ArgumentParser(
        description="Combine all the SFS files coming out of the "
                    "sfs_from_slim_update_bootstrap.py script")
    parser.add_argument("-i", "--input", required=True, dest="input", type=str,
                        help="The name of the file that contains the sfs files")
    parser.add_argument("-o", "--output", required=True, dest="output", type=str,
                        help="The name of the output file")
    args = parser.parse_args()

    # Collect the bootstrap SFSs for each region
    sfs_dict = {}
    for line in gzip.open(args.input, 'rt'):
        z = line.split('[')
        region = z[1].replace("'", '')
        sfs_temp = [int(x) for x in
                    z[2].replace(']', '').replace(',', '').strip().split(' ')]
        try:
            sfs_dict[region].append(sfs_temp)
        except KeyError:
            sfs_dict[region] = [sfs_temp]

    data = []
    for region in sfs_dict.keys():
        # Merge all the bootstrap SFSs for this region into one
        sfs = sfs_dict[region][0]
        for j in sfs_dict[region][1:]:
            sfs = SFS_tools.merge_SFS(sfs, j)
        # Region names look like 'u.100-200': the upstream/downstream stream
        # followed by the window interval
        stream = region.split('.')[0]
        dist = [int(x) for x in
                region.replace(',', '').split('.')[1].split('-')]
        # Upstream windows get negative distances
        mult = -1 if stream == 'u' else 1
        mid = mult * sum(dist) // 2  # signed window midpoint
        data.append([mid, SFS_tools.pi(sfs), SFS_tools.tajima(sfs)])

    pd.DataFrame(data, columns=['dist', 'pi', 'TD']).to_csv(args.output)


if __name__ == '__main__':
    main()
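# ---------------------------------------------------------------------------
# SFS_tools.tajima() is not shown here; it presumably implements the
# standard Tajima (1989) D statistic. A self-contained sketch under that
# assumption, taking an unfolded SFS whose first and last classes hold
# monomorphic and fixed sites (tajima_sketch is a hypothetical name):
def tajima_sketch(sfs):
    n = len(sfs) - 1                                 # sample size
    S = sum(sfs[1:-1])                               # segregating sites
    if S == 0:
        return float('nan')
    a1 = sum(1.0 / i for i in range(1, n))
    a2 = sum(1.0 / i ** 2 for i in range(1, n))
    b1 = (n + 1.0) / (3.0 * (n - 1))
    b2 = 2.0 * (n ** 2 + n + 3) / (9.0 * n * (n - 1))
    c1 = b1 - 1.0 / a1
    c2 = b2 - (n + 2.0) / (a1 * n) + a2 / a1 ** 2
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)
    pairs = n * (n - 1) / 2.0
    pi = sum(i * (n - i) * sfs[i] for i in range(1, n)) / pairs
    theta_w = S / a1                                 # Watterson's estimator
    # D is the difference between the two theta estimators, normalised by
    # the standard deviation of that difference
    return (pi - theta_w) / ((e1 * S + e2 * S * (S - 1)) ** 0.5)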
import argparse
import glob

import numpy as np
import pandas as pd
import pylab

import SFS

# combinedSel, processSLiM and mergeManySFS are helper functions defined
# elsewhere in this project.


def main():
    parser = argparse.ArgumentParser(
        description="Give a directory and I'll analyse the patterns of "
                    "diversity around a simulated exon")
    parser.add_argument("-i", "--input", required=True, dest="input", type=str,
                        help="The name of the file that contains the SLiM output")
    args = parser.parse_args()

    # Recombination-distance bins: fine-grained close to the exon, coarser
    # further away
    r_bins = ([float(i) / 10 for i in range(1, 500, 1)]
              + [1. * i for i in range(50, 500)])

    # Plot the analytical approximation for diversity under selection
    approx = combinedSel(np.array(r_bins))
    pylab.plot(r_bins, approx * 0.01, 'r')

    # Read in at most 50 replicate SLiM output files
    count = 0
    for i in glob.glob(args.input + '*.out.gz'):
        if count == 0:
            data = processSLiM(i)
        else:
            temp = processSLiM(i)
            data = pd.concat([data, temp])
        count += 1
        if count == 50:
            break

    top = len(r_bins) - 1
    analysis = []
    for r in range(len(r_bins)):
        lower = r_bins[r]
        upper = 1e8 if r == top else r_bins[r + 1]
        in_range = list(data[(data.r_dist_true >= lower)
                             & (data.r_dist_true < upper)].sfs)
        if len(in_range) == 0:
            continue
        analysis.append([r_bins[r], SFS.pi(mergeManySFS(in_range))])

    true_r = pd.DataFrame(analysis, columns=['dist', 'pi'])
    pylab.plot(true_r.dist, true_r.pi, 'b')
    pylab.show()


if __name__ == '__main__':
    main()
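# ---------------------------------------------------------------------------
# mergeManySFS() is not shown in this excerpt; given merge-by-summation as
# sketched earlier, folding a list of spectra into one is a simple reduction.
# A sketch (mergeManySFS_sketch is a hypothetical name):
from functools import reduce


def mergeManySFS_sketch(sfs_list):
    # Element-wise sum across all spectra, assuming equal sample sizes
    return reduce(lambda a, b: [x + y for x, y in zip(a, b)], sfs_list)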
def get_summary(sfs_dict, label):
    out = []
    for key in sfs_dict.keys():
        mid = key
        if sum(sfs_dict[key][1:-1]) == 0:
            # No segregating sites in this window: the statistics are
            # undefined, so report NaNs alongside the site count
            out.append([label, mid, 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN',
                        sum(sfs_dict[key])])
        elif sum(sfs_dict[key][1:-1]) > 0:
            out.append([
                label,
                mid,
                SFS.pi(sfs_dict[key]),
                SFS.xsi(sfs_dict[key]),
                SFS.pi2(sfs_dict[key]),
                SFS.fwh(sfs_dict[key]),
                SFS.theta_W(sfs_dict[key]),
                SFS.tajima(sfs_dict[key]),
                # SFS.KZl(sfs_dict[key]),
                sum(sfs_dict[key])
            ])
    return out
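# ---------------------------------------------------------------------------
# Of the statistics used above, SFS.theta_W and SFS.fwh are assumed to be
# Watterson's (1975) estimator and Fay and Wu's (2000) H; sketches under
# that assumption follow (SFS.xsi and SFS.pi2 are not reconstructed here,
# since their definitions can't be inferred from this excerpt).
def theta_W_sketch(sfs):
    n = len(sfs) - 1
    a1 = sum(1.0 / i for i in range(1, n))   # harmonic number
    return sum(sfs[1:-1]) / a1               # segregating sites / a1


def fwh_sketch(sfs):
    n = len(sfs) - 1
    pairs = n * (n - 1) / 2.0
    pi = sum(i * (n - i) * sfs[i] for i in range(1, n)) / pairs
    # theta_H up-weights high-frequency derived variants
    theta_h = sum(i ** 2 * sfs[i] for i in range(1, n)) / pairs
    return pi - theta_h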
import argparse

import numpy as np
import pandas as pd

import pgm
import SFS_tools

# summariseChunk is defined elsewhere in this project; it returns a combined
# SFS and a pair of divergence counts for the windows it is given.


def main():
    parser = argparse.ArgumentParser(
        description="Takes the summary file of recombination distances and "
                    "summarises diversity and divergence in bins of distance")
    parser.add_argument("-i", "--input", required=True, dest="input", type=str,
                        help="The name of the sorted, bed file of segments")
    parser.add_argument("-o", "--output", required=True, dest="output", type=str,
                        help="The name of the output file")
    parser.add_argument("-l", "--label", required=False, dest="label", type=str,
                        default='Autosomes',
                        help="Add a label to this data, e.g. which chromosome "
                             "do they come from? The default will be 'Autosomes'")
    parser.add_argument("--ncpg", required=False, dest="ncpg",
                        action='store_true', default=False,
                        help="Add this flag if you want to analyse the non-CpG sites")
    parser.add_argument("--cne", required=False, dest="cne",
                        action='store_true', default=False,
                        help="Add this flag if you are analysing CNEs")
    parser.add_argument("--Cox", required=False, dest="Cox",
                        action='store_true', default=False,
                        help="Add this flag if you want to use the Cox map")
    parser.add_argument("--GC", required=False, dest="GC",
                        action='store_true', default=False,
                        help="Add this flag if you want to include gene "
                             "conversion according to the Paigen et al. estimates")
    args = parser.parse_args()

    data = pd.read_csv(args.input, compression='gzip', header=None, sep='\t')
    # Upstream ('u') windows get negative distances, downstream positive
    data['scale'] = [-1 if x.split('.')[0] == 'u' else 1 for x in data[0]]

    # The following gene conversion parameters come from Paigen et al. 2008
    # PLoS Genetics
    nc_gc_ratio = 0.105  # rate of NCO gene conversion relative to CO events
    tract_length = 144   # average gene conversion tract length (bp)

    if args.Cox:
        data['r_co'] = data[7] * 426200 * 4
        # recPos names the dataframe column where recombination rates are held
        recPos = 'r_co'
        data['r_gc'] = (426200 * 4 * (nc_gc_ratio * data[7] / data[3])
                        * tract_length
                        * (1 - np.exp((-1. * data[3]) / tract_length)))
        data['joint'] = data['r_gc'] + data['r_co']
        if args.GC:
            recPos = 'joint'
    else:
        data['r_co'] = data[2]
        recPos = 'r_co'
        data['r_gc'] = ((nc_gc_ratio * data[2] / data[3]) * tract_length
                        * (1 - np.exp((-1. * data[3]) / tract_length)))
        data['joint'] = data['r_gc'] + data['r_co']
        if args.GC:
            recPos = 'joint'

    data['dist'] = data[recPos] * data['scale']

    if args.cne:
        roundBy = 2
        bins = np.logspace(0, 2.477122, 100) - 1  # up to ~300; for CNEs
        # bins = range(0, 200, roundBy)  # For CNEs
    else:
        roundBy = 30
        bins = np.logspace(0, 3.477122, 100) - 1  # up to ~3000; for exons
        # bins = range(0, 3000, roundBy)  # For exons

    output_lines = []    # all sites in each bin
    output_lines_2 = []  # upstream and downstream sites summarised separately
    for i in range(len(bins)):
        if i < 99:
            chunk = data.loc[(data[recPos] >= bins[i])
                             & (data[recPos] < bins[i + 1])]
        else:
            chunk = data.loc[data[recPos] >= bins[i]]
        if len(chunk) == 0:
            continue
        SFS, div = summariseChunk(chunk, ncpg=args.ncpg)

        up_chunk = chunk[chunk['dist'] < 0]
        if len(up_chunk) == 0:
            continue
        SFS_up, div_up = summariseChunk(up_chunk, ncpg=args.ncpg)

        down_chunk = chunk[chunk['dist'] > 0]
        if len(down_chunk) == 0:
            continue
        SFS_down, div_down = summariseChunk(down_chunk, ncpg=args.ncpg)

        if sum(SFS) == 0:
            continue
        outline = [
            bins[i],
            args.label,
            SFS_tools.pi(SFS),
            pgm.jukes_cantor(float(div[0]) / sum(SFS)),
            pgm.jukes_cantor(float(div[1]) / sum(SFS)),
            SFS_tools.tajima(SFS),
            sum(SFS),
        ]
        if sum(SFS_up) == 0:
            continue
        outline_up = [
            -1 * bins[i],
            args.label,
            SFS_tools.pi(SFS_up),
            pgm.jukes_cantor(float(div_up[0]) / sum(SFS_up)),
            pgm.jukes_cantor(float(div_up[1]) / sum(SFS_up)),
            SFS_tools.tajima(SFS_up),
            sum(SFS_up),
        ]
        if sum(SFS_down) == 0:
            continue
        outline_down = [
            bins[i],
            args.label,
            SFS_tools.pi(SFS_down),
            pgm.jukes_cantor(float(div_down[0]) / sum(SFS_down)),
            pgm.jukes_cantor(float(div_down[1]) / sum(SFS_down)),
            SFS_tools.tajima(SFS_down),
            sum(SFS_down),
        ]
        output_lines.append(outline)
        output_lines_2.append(outline_up)
        output_lines_2.append(outline_down)

    columns = ['mid', 'label', 'pi', 'fam_div_jc', 'rat_div_jc', 'tajima',
               'sites']
    pd.DataFrame(output_lines, columns=columns).to_csv(args.output)
    pd.DataFrame(output_lines_2, columns=columns).to_csv('split_' + args.output)


if __name__ == '__main__':
    main()
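# ---------------------------------------------------------------------------
# pgm.jukes_cantor() is called above on a raw divergence proportion p; it is
# assumed to apply the standard Jukes-Cantor (1969) multiple-hits
# correction, d = -(3/4) * ln(1 - 4p/3). A sketch under that assumption:
import math


def jukes_cantor_sketch(p):
    return -0.75 * math.log(1.0 - (4.0 / 3.0) * p)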
import glob
import sys

import pandas as pd

import SFS

# Collate the per-window SFS files into one sorted summary table; the output
# CSV path is taken as the first command-line argument
headings = ['dist', 'sites', 'pi', 'xsi', 'pi2', 'theta_W', 'fwh', 'tajima',
            'prop_fixed']
out_lines = []
for i in glob.glob('*/*sfs'):
    name = i.split('/')[0]
    # Upstream ('u') windows get negative distances, downstream ('d') positive
    if name[0] == 'u':
        mult = -1
    elif name[0] == 'd':
        mult = 1
    interval = name.split('.')[1]
    dist = mult * (int(interval.split('-')[0])
                   + int(interval.split('-')[1])) // 2  # window midpoint
    x = open(i).readlines()
    sfs = [float(j) for j in x[0].strip().split(',')]
    if sfs == [0.0]:
        continue
    outline = [
        dist,
        sum(sfs),
        SFS.pi(sfs),
        SFS.xsi(sfs),
        SFS.pi2(sfs),
        SFS.theta_W(sfs),
        SFS.fwh(sfs),
        SFS.tajima(sfs),
        sfs[-1] / sum(sfs)
    ]
    out_lines.append(outline)

data = pd.DataFrame(out_lines, columns=headings)
data.sort_values('dist').to_csv(sys.argv[1])
import argparse
import glob

import pandas as pd

import pgm
import SFS_tools

# summariseChunk is defined elsewhere in this project; it returns a combined
# SFS and a pair of divergence counts for the windows it is given.


def main():
    parser = argparse.ArgumentParser(
        description="Takes the summary file of recombination distances and "
                    "summarises diversity and divergence in bins of distance")
    parser.add_argument("-i", "--input", required=True, dest="input", type=str,
                        help="The name of the sorted, bed file of segments")
    parser.add_argument("-o", "--output", required=True, dest="output", type=str,
                        help="The name of the output file")
    parser.add_argument("-l", "--label", required=False, dest="label", type=str,
                        default='Autosomes',
                        help="Add a label to this data, e.g. which chromosome "
                             "do they come from? The default will be 'Autosomes'")
    parser.add_argument("--ncpg", required=False, dest="ncpg",
                        action='store_true', default=False,
                        help="Add this flag if you want to analyse the non-CpG sites")
    parser.add_argument("--dir", required=False, dest="dir",
                        action='store_true', default=False,
                        help="Add this flag if you are analysing a directory of files")
    args = parser.parse_args()

    if not args.dir:
        data = pd.read_csv(args.input, header=None, sep='\t')
    else:
        data = pd.concat([
            pd.read_csv(i, header=None, sep='\t')
            for i in glob.glob(args.input + '*')
        ])

    ranges = range(0, 100, 1)  # For CNEs
    # ranges = range(0, 3000, 30)  # For exons

    output_lines = []
    for i in range(len(ranges)):
        if i < 99:
            chunk = data.loc[(data[2] >= ranges[i])
                             & (data[2] < ranges[i + 1])]
        else:
            chunk = data.loc[data[2] >= ranges[i]]
        if len(chunk) == 0:
            continue
        SFS, div = summariseChunk(chunk, ncpg=args.ncpg)
        if sum(SFS) == 0:
            continue
        outline = [
            ranges[i],
            args.label,
            SFS_tools.pi(SFS),
            pgm.jukes_cantor(float(div[0]) / sum(SFS)),
            pgm.jukes_cantor(float(div[1]) / sum(SFS)),
            SFS_tools.tajima(SFS),
            sum(SFS),
        ]
        output_lines.append(outline)

    output = pd.DataFrame(output_lines,
                          columns=['distance', 'label', 'pi', 'fam_div_jc',
                                   'rat_div_jc', 'tajima', 'sites'])
    output.to_csv(args.output)


if __name__ == '__main__':
    main()
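# ---------------------------------------------------------------------------
# Example invocations (the script and file names here are hypothetical):
#   python bin_by_distance.py -i cne_distances.tsv -o cne_summary.csv --ncpg
#   python bin_by_distance.py -i distance_files/ -o summary.csv --dir -l ChrX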
def summariseChunk(chunk):
    # Mean recombination distance across the windows in this chunk
    recDist = chunk[1].mean()
    # Sum the per-window SFS columns (columns 2 onwards) into one combined SFS
    sfs = [chunk[i].sum() for i in list(chunk.columns)[2:]]
    return [recDist, SFS_tools.pi(sfs)]
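# ---------------------------------------------------------------------------
# Toy illustration of summariseChunk. The input layout is an assumption from
# the code above: column 0 a window name, column 1 a recombination distance,
# and columns 2 onwards the SFS classes. Two windows combine into one SFS
# before pi is computed:
#
#   toy = pd.DataFrame([['w1', 0.5, 10, 3, 1, 0],
#                       ['w2', 0.7, 12, 2, 0, 1]])
#   summariseChunk(toy)  # -> [0.6, SFS_tools.pi([22, 5, 1, 1])]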