def main(recom):
    """Print recombination rates for two hard-coded intervals, then exit.

    recom -- list of [pos, rate] rows; a leading [1, -99] row is prepended
             before the rows are loaded into a DataFrame.

    The function always terminates the interpreter through sys.exit() once
    the two intervals have been printed, so the per-simulation loop below
    it is never reached.
    """
    number_of_slims = 0
    reader = slim_reader_gzip
    rec = pd.DataFrame([[1, -99]] + recom, columns=['pos', 'rate'])

    interval = [[173800, 193200], [1, 428]]
    for start, end in interval:
        print(recombination_rates(rec, start, end))
    sys.exit()

    # Unreachable while the sys.exit() above is in place.
    for raw in reader(sys.argv[1]):
        x = slim(raw)
        if sanity_checks(x) == "insane":
            continue
        print(x.recomb_intervals)
        recombination = pd.DataFrame([[1, -99]] + x.recomb_intervals,
                                     columns=['pos', 'rate'])
        # rate multiplied by the distance to the previous row's position
        gap = recombination['pos'] - recombination['pos'].shift(1)
        recombination['cumulative'] = recombination['rate'] * gap
        total = recombination['cumulative'].sum()
        freq_mean = 4. * x.N * total / recombination['pos'].max()
        break
def generate_seq_dicts(slim_in, mat_dict, number_of_sequences,
                       all_individuals=True, singletons=False):
    """Build full sequences for randomly chosen diploid individuals.

    slim_in             -- raw SLiM output handed to tom_slim.slim
    mat_dict            -- dict providing the "mut_mat" and "mut_freqs" entries
    number_of_sequences -- how many distinct individuals to draw
    all_individuals     -- accepted for interface compatibility only; the
                           simulation is always loaded with all_individuals=True
    singletons          -- truthy: request mutations with minFreq=0,
                           falsy: minFreq=1

    Returns (seqs, slim.name). seqs maps the two genome labels of every chosen
    individual ("p1:<2k>" and "p1:<2k-1>") to a list holding one base per site.

    Raises ValueError when more individuals are requested than exist; the
    rejection-sampling loop below could otherwise never terminate.
    """
    slim = tom_slim.slim(slim_in, fixed=True, give_genomes=True,
                         all_individuals=True)
    mut_mat = mat_dict["mut_mat"]
    mut_freqs = mat_dict["mut_freqs"]
    # Ancestral sequence: one base per site of the SLiM chromosome, each
    # drawn with get_ref(mut_mat).
    reference = tuple([get_ref(mut_mat) for _ in xrange(slim.length)])
    print(slim.name)

    # TODO (original author, Jan 20th): build a dict of the mutation
    # frequencies so that mutate() can choose the derived base from the
    # matrix the same way get_ref() chooses the ancestral one.

    # Truthiness instead of "== True" / "== False": min_frequency used to be
    # left unbound for any value that was not exactly a boolean.
    min_frequency = 0 if singletons else 1
    mut_dict_raw = slim.mutations_dict(minFreq=min_frequency)
    for mut_id in mut_dict_raw:
        # Same text as the Python 2 statement "print a, b".
        print("%s %s" % (mut_id, mut_dict_raw[mut_id]))

    # mutation id -> [0-based position, derived base]
    mut_dict = {}
    for mut_id in mut_dict_raw.keys():
        pos = mut_dict_raw[mut_id] - 1
        ref_at_pos = reference[pos]
        mut_dict[mut_id] = [pos, mutate(ref_at_pos, mut_freqs)]

    genomes = slim.genome_dict()
    # Two genomes per individual; floor division keeps this an int.
    n_individuals = len(genomes) // 2
    if number_of_sequences > n_individuals:
        raise ValueError(
            "number_of_sequences (%s) exceeds the %s individuals available"
            % (number_of_sequences, n_individuals))

    seqs = {}
    individuals_chosen = set()
    while len(individuals_chosen) < number_of_sequences:
        individual_to_choose = random.randint(1, n_individuals)
        if individual_to_choose in individuals_chosen:
            continue
        individuals_chosen.add(individual_to_choose)
        seq_1 = "p1:" + str(individual_to_choose * 2)
        seq_2 = "p1:" + str(individual_to_choose * 2 - 1)
        for p in (seq_1, seq_2):
            seq = list(reference)
            for allele in genomes[p]:
                # Alleles missing from mut_dict are skipped.
                mutation = mut_dict.get(int(allele))
                if mutation is not None:
                    seq[mutation[0]] = mutation[1]
            seqs[p] = seq
    return seqs, slim.name
def generate_seq_dicts(slim_in, mat_dict, number_of_sequences,
                       all_individuals=True):
    """Pick random diploid individuals and return their raw haplotypes.

    slim_in             -- raw SLiM output handed to tom_slim.slim
    mat_dict            -- dict providing the "mut_mat" and "mut_freqs" entries
    number_of_sequences -- how many distinct individuals to draw
    all_individuals     -- accepted for interface compatibility only; the
                           simulation is always loaded with all_individuals=True

    Returns (haps, slim.name, mut_dict):
      haps     -- OrderedDict mapping "p1:<2k>" / "p1:<2k-1>" to the entry
                  stored under that label in slim.genome_dict(), in the order
                  the individuals were drawn
      mut_dict -- mutation id -> [0-based position, ancestral base, derived base]

    Raises ValueError when more individuals are requested than exist; the
    rejection-sampling loop below could otherwise never terminate.
    """
    slim = tom_slim.slim(slim_in, fixed=False, give_genomes=True,
                         all_individuals=True)
    mut_mat = mat_dict["mut_mat"]
    mut_freqs = mat_dict["mut_freqs"]
    # Ancestral sequence: one base per site of the SLiM chromosome, each
    # drawn with get_ref(mut_mat).
    reference = tuple([get_ref(mut_mat) for _ in xrange(slim.length)])

    # TODO (original author, Jan 20th): build a dict of the mutation
    # frequencies so that mutate() can choose the derived base from the
    # matrix the same way get_ref() chooses the ancestral one.
    mut_dict_raw = slim.mutations_dict(minFreq=1)

    mut_dict = {}
    for mut_id in mut_dict_raw.keys():
        pos = mut_dict_raw[mut_id] - 1
        ref_at_pos = reference[pos]
        alt_at_pos = mutate(ref_at_pos, mut_freqs)
        mut_dict[mut_id] = [pos, ref_at_pos, alt_at_pos]

    samples = slim.genome_dict()
    # Two genomes per individual; floor division keeps this an int.
    n_individuals = len(samples) // 2
    if number_of_sequences > n_individuals:
        raise ValueError(
            "number_of_sequences (%s) exceeds the %s individuals available"
            % (number_of_sequences, n_individuals))

    haps = collections.OrderedDict()
    individuals_chosen = set()
    while len(individuals_chosen) < number_of_sequences:
        individual_to_choose = random.randint(1, n_individuals)
        if individual_to_choose in individuals_chosen:
            continue
        individuals_chosen.add(individual_to_choose)
        seq_1 = "p1:" + str(individual_to_choose * 2)
        seq_2 = "p1:" + str(individual_to_choose * 2 - 1)
        for p in (seq_1, seq_2):
            haps[p] = samples[p]
    return haps, slim.name, mut_dict
def get_sfs_dict_from_sample(slim_input):
    """Build per-element SFS data from 20 randomly drawn genomes of a gzipped run.

    slim_input -- path to a gzip-compressed SLiM output file

    Returns [x.name, elDict] where elDict is the result of combineElements(),
    or [None, None] when the parsed simulation reports x.sanity as false.
    """
    # Read inside a context manager so the gzip handle is always closed.
    with gzip.open(slim_input) as handle:
        data = [line.strip() for line in handle]

    x = ts.slim(data, fixed=True, give_genomes=True)
    if not x.sanity:
        return [None, None]

    genomes = x.genome_dict()
    # Result is not used below; the call is kept in case it has side effects
    # on x -- TODO confirm and drop.
    mutations = x.mutations_dict()
    lengthDict = parseLengths(x.organ_lengths())

    # NOTE(review): random.choice draws WITH replacement, so one genome can be
    # counted more than once -- confirm this is intended.
    genome_names = list(genomes.keys())
    individuals = [random.choice(genome_names) for _ in range(20)]

    muts_by_organ = x.organ_mutations()

    # Number of sampled genomes carrying each mutation.
    new_muts = Counter()
    for g in individuals:
        for m in genomes[g]:
            new_muts[m] += 1

    polyDict = {}
    for h in muts_by_organ.keys():
        # m[1] -> list of sample counts, for mutations seen in the sample
        mTypeDict = {}
        for m in muts_by_organ[h]:
            count = new_muts[m[0]]
            if count == 0:
                continue
            mTypeDict.setdefault(m[1], []).append(count)
        mPoly = {}
        for k in mTypeDict.keys():
            mPoly[k] = sfs_tools.SFS_from_all_frequencies(mTypeDict[k], 20)
        polyDict[h] = mPoly

    fixedDict = x.organ_fixed(threshold=int(x.N) * 20)
    fixD = orgFixDict(fixedDict)
    polyfix = combinePolyFix(polyDict, fixD)
    elDict = combineElements(polyfix, lengthDict)
    print('processed ' + x.name)
    return [x.name, elDict]
def generate_seq_dicts(slim_in, mat_dict, number_of_sequences,
                       all_individuals=True):
    """Build full sequences for randomly chosen diploid individuals.

    slim_in             -- raw SLiM output handed to tom_slim.slim
    mat_dict            -- dict providing the "mut_mat" and "mut_freqs" entries
    number_of_sequences -- how many distinct individuals to draw
    all_individuals     -- accepted for interface compatibility only; the
                           simulation is always loaded with all_individuals=True

    Returns (seqs, slim.name). seqs maps the two genome labels of every chosen
    individual ("p1:<2k>" and "p1:<2k-1>") to a list holding one base per site.

    Raises ValueError when more individuals are requested than exist; the
    rejection-sampling loop below could otherwise never terminate.
    """
    slim = tom_slim.slim(slim_in, fixed=False, give_genomes=True,
                         all_individuals=True)
    mut_mat = mat_dict["mut_mat"]
    mut_freqs = mat_dict["mut_freqs"]
    # Ancestral sequence: one base per site of the SLiM chromosome, each
    # drawn with get_ref(mut_mat).
    reference = tuple([get_ref(mut_mat) for _ in xrange(slim.length)])
    print(slim.name)

    # TODO (original author, Jan 20th): build a dict of the mutation
    # frequencies so that mutate() can choose the derived base from the
    # matrix the same way get_ref() chooses the ancestral one.
    mut_dict_raw = slim.mutations_dict(minFreq=1)

    # mutation id -> [0-based position, derived base]
    mut_dict = {}
    for mut_id in mut_dict_raw.keys():
        pos = mut_dict_raw[mut_id] - 1
        ref_at_pos = reference[pos]
        mut_dict[mut_id] = [pos, mutate(ref_at_pos, mut_freqs)]

    genomes = slim.genome_dict()
    # Two genomes per individual; floor division keeps this an int.
    n_individuals = len(genomes) // 2
    if number_of_sequences > n_individuals:
        raise ValueError(
            "number_of_sequences (%s) exceeds the %s individuals available"
            % (number_of_sequences, n_individuals))

    seqs = {}
    individuals_chosen = set()
    while len(individuals_chosen) < number_of_sequences:
        individual_to_choose = random.randint(1, n_individuals)
        if individual_to_choose in individuals_chosen:
            continue
        individuals_chosen.add(individual_to_choose)
        seq_1 = "p1:" + str(individual_to_choose * 2)
        seq_2 = "p1:" + str(individual_to_choose * 2 - 1)
        for p in (seq_1, seq_2):
            seq = list(reference)
            for allele in genomes[p]:
                # Alleles missing from mut_dict are skipped.
                mutation = mut_dict.get(int(allele))
                if mutation is not None:
                    seq[mutation[0]] = mutation[1]
            seqs[p] = seq
    return seqs, slim.name
def get_sfs_dict(slim_input, num=-1):
    """Parse one SLiM run and combine its polymorphism, fixation and length data.

    slim_input -- raw SLiM output handed to ts.slim
    num        -- not read by this function

    Returns [sim.name, elements], or [None, None] when the parsed run reports
    sanity as false.
    """
    sim = ts.slim(slim_input, fixed=True, give_genomes=True)
    if not sim.sanity:
        return [None, None]

    # Computed but never read; the fixation threshold below is built
    # separately from int(sim.N).
    thresh = sim.N * 10

    poly_by_organ = orgPolyDict(sim.organ_mutations(), sim.sampleN)
    lengths = parseLengths(sim.organ_lengths())
    fixed_by_organ = orgFixDict(sim.organ_fixed(threshold=int(sim.N) * 10))
    elements = combineElements(combinePolyFix(poly_by_organ, fixed_by_organ),
                               lengths)

    print('processed ' + sim.name)
    return [sim.name, elements]
def SFS_by_organ(raw_slim):
    """Compute neutral and selected site frequency spectra for each organ.

    raw_slim -- raw SLiM output handed to tom_slim.slim

    Returns [sim.name, sfs_dict] where sfs_dict maps every key of
    organ_mutations() to [neutral SFS, selected SFS].
    """
    sim = tom_slim.slim(raw_slim)
    mutations_by_organ = sim.organ_mutations()
    lengths_by_organ = sim.organ_lengths()

    sfs_dict = {}
    for organ, organ_muts in mutations_by_organ.items():
        freq_sel = []
        freq_neu = []
        for mut in organ_muts:
            # Field 3 equal to zero marks the mutation as neutral; field 7 is
            # the value passed on to SFS().
            bucket = freq_sel if float(mut[3]) != 0.0 else freq_neu
            bucket.append(int(mut[7]))

        # Split the organ's length into selected and neutral site counts
        # using its entry in sel_prop_dict.
        organ_length = lengths_by_organ[organ]
        sel_sites = sel_prop_dict[organ] * organ_length
        neu_sites = (1.0 - sel_prop_dict[organ]) * organ_length

        sel_sfs = SFS(freq_sel, sel_sites, sim.sampleN)
        neu_sfs = SFS(freq_neu, neu_sites, sim.sampleN)
        sfs_dict[organ] = [neu_sfs, sel_sfs]

    return [sim.name, sfs_dict]
def get_sfs_dict_from_sample(slim_input):
    """Build per-element SFS data from 20 randomly drawn genomes of one run.

    slim_input -- raw SLiM output handed to ts.slim

    Returns [sim.name, elements], or [None, None] when the parsed run reports
    sanity as false.
    """
    sim = ts.slim(slim_input, fixed=True, give_genomes=True)
    if not sim.sanity:
        return [None, None]

    genomes = sim.genome_dict()
    mutations = sim.mutations_dict()  # fetched but not read below
    lengths = parseLengths(sim.organ_lengths())

    # 20 independent draws from the genome labels (repeats are possible).
    genome_names = genomes.keys()
    individuals = [random.choice(genome_names) for _ in range(20)]

    muts_by_organ = sim.organ_mutations()

    # How many of the drawn genomes carry each mutation.
    sample_counts = Counter()
    for label in individuals:
        for mut_id in genomes[label]:
            sample_counts[mut_id] += 1

    poly_by_organ = {}
    for organ, organ_muts in muts_by_organ.items():
        counts_by_type = {}
        for mut in organ_muts:
            seen = sample_counts[mut[0]]
            if seen == 0:
                # mutation absent from the sample
                continue
            counts_by_type.setdefault(mut[1], []).append(seen)
        sfs_by_type = {}
        for mut_type in counts_by_type.keys():
            sfs_by_type[mut_type] = sfs_tools.SFS_from_all_frequencies(
                counts_by_type[mut_type], 20)
        poly_by_organ[organ] = sfs_by_type

    thresh = sim.N * 10  # computed but not read below
    fixed_by_organ = orgFixDict(sim.organ_fixed(threshold=int(sim.N) * 10))
    elements = combineElements(combinePolyFix(poly_by_organ, fixed_by_organ),
                               lengths)
    print('processed ' + sim.name)
    return [sim.name, elements]
def get_both_stats(input_args):
    """Write windowed site frequency spectra around the elements of each simulation.

    input_args -- 2-item sequence (index, args). From args this function reads
                  boundary, window, input, output, gz, orientation, element
                  and threshold.

    For every simulation yielded by the reader that passes sanity_checks(),
    windows of `window` sites are laid out on both sides of each organ whose
    type equals args.element. For each usable window one line is written to
    args.output + '.sfs':
        [label, r_dist * x.N * 4., p_dist, sfs_window]

    Returns None. The function also returns early, without closing the output
    file, when an element has no entry in strandDict.
    """
    index = input_args[0]
    args = input_args[1]
    boundary = args.boundary
    window = args.window
    file_name = args.input
    output_raw = args.output
    output = args.output + '.sfs'
    ### use class: slim instead... quite a bit faster
    number = 0
    test_number = 0
    # Choose the reader according to whether the input is gzip-compressed.
    if args.gz:
        reader = slim_reader_gzip
    else:
        reader = slim_reader
    for i in reader(file_name):
        number = number + 1
        name = "non"
        x = slim(i)
        all_rates = pd.DataFrame(x.recomb_intervals, columns=['pos', 'rate'])
        # `fixed` is hard-wired to False, so the `if fixed:` branch further
        # down never runs.
        fixed = False
        check_point = sanity_checks(x)
        if check_point == "insane":
            continue
        else:
            pass
        ## This next little snippet gets a dict of exon starting positions
        ## and the strand of those exons
        if args.orientation != 'No':
            # tem = x.name.split('/')[-1]
            # print args.orientation
            # Scan the orientation file up to and including its first line
            # that starts with '/'; `head` is that line with '/' characters
            # stripped from both ends (or the last line read if none match).
            # NOTE(review): the file handle is never closed explicitly, and
            # this loop rebinds the outer loop variable `i`.
            for i in open(args.orientation):
                head = i.strip('/')
                if i.startswith('/'):
                    break
            # print head
            # `head` is parsed as "<name>:<start>-<end>" for the tabix query.
            element_look_up = pysam.Tabixfile(
                "/home/booker/mouse_genome/all_elements/combined_elements/combined_elements_sorted.bed.gz"
            )
            els1 = [
                m.strip().split() for m in element_look_up.fetch(
                    head.split(':')[0],
                    int(head.split(':')[1].split('-')[0]),
                    int(head.split(':')[1].split('-')[1]))
            ]
            # Drop INTERGENIC records that span exactly one position.
            els = [
                ment for ment in els1
                if not (int(ment[2]) - int(ment[1]) == 1
                        and ment[3] == 'INTERGENIC')
            ]
            # Pair the remaining records with x.organs by position and record
            # field 4 of every CDS record under the paired organ's h[1]
            # (presumably strand keyed by organ start -- TODO confirm).
            strandDict = {}
            for g, h in zip(els, x.organs):
                # print g, h
                if g[3] == 'CDS':
                    strandDict[h[1]] = g[4]
                    # print g,h, g[4]
        name2 = x.name.split('/')[-1]
        #print name2
        individuals = x.sampleN
        length = x.length
        # print length
        element_positions = []
        # Truncate the output file, then reopen it in append mode.
        # NOTE(review): this happens once per simulation, so lines written
        # for earlier simulations in the same input are discarded -- confirm
        # that is intended.
        temp = open(output, "w")
        temp.close()
        temp_out = open(output, "a")
        ### this bit here gets the selected site lists from
        ### the slim object and then gets all of the sites
        ### for all selected sites in the simulation
        sites_dict = x.sites_dict()
        ## ALSO RETURN A KEY OF SITE TYPES?
        # Collect the positions of every "selected" entry whose type is not "g0".
        for f in sites_dict["selected"]:
            if f[0][0] != "g0":
                element_positions += f[1]
        if fixed:
            non_element_subs = [
                v for v in x.fixed if int(v[2]) not in element_positions
            ]
        # position -> mutation record, for mutations lying outside the
        # collected element positions.
        # NOTE(review): element_positions is a list, so each `not in` test is
        # a linear scan.
        mutations_dict = {}
        for key in [
                b for b in x.mutations
                if int(b[2]) not in element_positions
        ]:
            mutations_dict[int(key[2])] = key
        position_keys = sorted(set(mutations_dict.keys()))
        print("Processing file: " + str(number) + "\n\t" + args.element +
              " make up " +
              str(round(len(element_positions) * 100.0 / x.length, 2)) +
              "% of the " + str(int(x.length / 1000)) +
              "Kb simulated chromosome\n")
        # Window offsets run from 1 up to min(boundary, chromosome length) in
        # steps of `window`, and are mirrored to negative values for the
        # other side.
        if boundary >= x.length:
            distances_raw = range(1, x.length + window, window)
        elif boundary < x.length:
            distances_raw = range(1, boundary + window, window)
        distances = distances_raw + [i * -1 for i in distances_raw]
        exons = [g for g in x.organs if g[0] == args.element]
        #print strandDict
        #print exons
        for point in exons:
            #print point
            lower_lim, upper_lim = get_analysis_limits(
                exons, point,
                x.length)  # Find the limits for the analysis of this exon
            for k in distances:
                low_bound, up_bound = get_window_bounds(point, window, k)
                # Skip windows that fall outside this element's analysis limits.
                if up_bound > upper_lim or low_bound < lower_lim:
                    continue
                # print
                mid_window = (up_bound + low_bound) / 2
                # The distance is measured between the window midpoint and
                # point[1] for negative offsets, point[2] for positive ones.
                if k < 0:
                    dist_start = mid_window
                    dist_end = int(point[1])
                if k > 0:
                    dist_end = mid_window
                    dist_start = int(point[2])
                # Skip windows within args.threshold of either chromosome end.
                if up_bound > x.length - args.threshold:
                    # print 'check 2'
                    continue
                elif low_bound < 0 + args.threshold:
                    # print 'check 3' #, low_bound
                    continue
                else:
                    # Field 7 of every retained mutation located in the window.
                    frequencies = [
                        int(mutations_dict[j][7]) for j in position_keys
                        if j >= low_bound and j <= up_bound
                        and up_bound < length
                    ]
                    # frequencies = [int(hh[7]) for hh in muts_in_window] # replace muts_in_windows with a condensed
                    # if fixed:
                    #     subs_in_window = [j for j in non_element_subs if int(j[2]) > low_bound and int(j[2]) < up_bound and up_bound < length]
                    #     print subs_in_window
                    # elements_in_window = [p for p in element_positions if int(p) > low_bound and int(p) < up_bound]
                    # Window size minus the element positions that fall in it.
                    bin_width = window - len([
                        p for p in element_positions
                        if int(p) >= low_bound and int(p) <= up_bound
                    ])
                    if bin_width == 0 or bin_width < 0:
                        continue
                    else:
                        sfs_window = SFS_from_frequencies(
                            frequencies, bin_width, individuals)
                        window_name = str(
                            abs(k)) + '-' + str(abs(k) + window - 1)
                        r_dist, p_dist = getDistance(dist_start, dist_end,
                                                     all_rates)
                        # print r_dist * x.N*4. , p_dist
                        # Label prefix "u." / "d." (presumably upstream /
                        # downstream -- TODO confirm): negative offsets get
                        # "u." unless the strand is '-', which swaps the two.
                        if args.orientation != 'No':
                            try:
                                strand = strandDict[point[1]]
                            except KeyError:
                                # NOTE(review): returns with temp_out still open.
                                print x.name
                                return
                            # NOTE(review): `y` is left unset (or stale) when
                            # strand is neither '+' nor '-'.
                            if strand == '+':
                                if k < 0:
                                    y = "u." + window_name
                                elif k > 0:
                                    y = "d." + window_name
                            elif strand == '-':
                                if k > 0:
                                    y = "u." + window_name
                                elif k < 0:
                                    y = "d." + window_name
                        else:
                            if k < 0:
                                y = "u." + window_name
                            elif k > 0:
                                y = "d." + window_name
                        temp_out.write(
                            str([y, r_dist * x.N * 4., p_dist, sfs_window]) +
                            "\n")
        temp_out.close()