Exemple #1
0
def main(recom):
    """Print recombination rates for two hard-coded intervals, then exit.

    ``recom`` is a list of ``[pos, rate]`` rows; a sentinel row ``[1, -99]``
    is prepended before the rows are loaded into a DataFrame with columns
    ``pos`` and ``rate``.

    The function always terminates the interpreter through ``sys.exit()``
    after printing, so the per-simulation loop at the bottom never runs.
    """
    number_of_slims = 0
    reader = slim_reader_gzip

    rows = [[1, -99]] + recom
    rec = pd.DataFrame(rows, columns=['pos', 'rate'])

    for start, end in [[173800, 193200], [1, 428]]:
        print(recombination_rates(rec, start, end))

    sys.exit()

    # ------------------------------------------------------------------
    # Unreachable: sys.exit() above always fires first.  Kept as written
    # by the original author (work in progress).
    # ------------------------------------------------------------------
    for record in reader(sys.argv[1]):
        sim = slim(record)
        if sanity_checks(sim) == "insane":
            continue
        print(sim.recomb_intervals)
        table = pd.DataFrame([[1, -99]] + sim.recomb_intervals,
                             columns=['pos', 'rate'])
        segment_lengths = table['pos'] - table['pos'].shift(1)
        table['cumulative'] = table['rate'] * segment_lengths
        freq_mean = 4. * sim.N * table['cumulative'].sum() / table['pos'].max()
        break
def generate_seq_dicts(slim_in,
                       mat_dict,
                       number_of_sequences,
                       all_individuals=True,
                       singletons=False):
    """Build full sequences for randomly chosen individuals of a SLiM run.

    A random ancestral reference is drawn site by site with ``get_ref``,
    every mutation returned by ``slim.mutations_dict`` is assigned a derived
    base with ``mutate``, and each sampled haplotype is produced by copying
    the reference and applying the mutations that haplotype carries.

    Args:
        slim_in: SLiM output, handed unchanged to ``tom_slim.slim``.
        mat_dict: mapping with the keys ``"mut_mat"`` (passed to ``get_ref``)
            and ``"mut_freqs"`` (passed to ``mutate``).
        number_of_sequences: number of distinct individuals to sample; two
            sequences (keys ``"p1:<2k>"`` and ``"p1:<2k-1>"``) are produced
            per individual.
        all_individuals: accepted for interface compatibility only.
            NOTE(review): the parser is always called with
            ``all_individuals=True`` regardless of this value -- confirm
            whether it should be forwarded.
        singletons: truthy -> ``minFreq=0`` is requested from
            ``mutations_dict``; falsy -> ``minFreq=1``.

    Returns:
        ``(seqs, slim.name)`` where ``seqs`` maps a genome key to a list of
        bases with the same length as the reference.

    Raises:
        ValueError: if more individuals are requested than are available
            (the previous rejection loop would never have terminated).
    """
    slim = tom_slim.slim(slim_in,
                         fixed=True,
                         give_genomes=True,
                         all_individuals=True)

    mut_mat = mat_dict["mut_mat"]
    mut_freqs = mat_dict["mut_freqs"]
    # Ancestral sequence: one independent get_ref() draw per site.
    reference = tuple(get_ref(mut_mat) for _ in xrange(slim.length))
    print(slim.name)

    # Any truthy/falsy value is accepted, so min_frequency is always bound.
    min_frequency = 0 if singletons else 1
    mut_dict_raw = slim.mutations_dict(minFreq=min_frequency)

    for mut_id in mut_dict_raw:
        print("{0} {1}".format(mut_id, mut_dict_raw[mut_id]))

    mut_dict = {}
    for mut_id, raw_pos in mut_dict_raw.items():
        # The raw position minus one is used as the index into `reference`
        # (presumably the raw positions are 1-based -- TODO confirm).
        pos = raw_pos - 1
        mut_dict[mut_id] = [pos, mutate(reference[pos], mut_freqs)]

    genomes = slim.genome_dict()
    # Two genomes per individual; floor division made explicit.
    n_individuals = len(genomes) // 2
    if number_of_sequences > n_individuals:
        raise ValueError(
            "requested %s individuals but only %s are available" %
            (number_of_sequences, n_individuals))

    seqs = {}
    individuals_chosen = set()
    while len(individuals_chosen) < number_of_sequences:
        individual = random.randint(1, n_individuals)
        if individual in individuals_chosen:
            continue
        individuals_chosen.add(individual)
        for hap in ("p1:" + str(individual * 2),
                    "p1:" + str(individual * 2 - 1)):
            sequence = list(reference)
            for allele in genomes[hap]:
                # Alleles missing from mut_dict (e.g. filtered out by
                # minFreq) leave the reference base in place.
                mutation = mut_dict.get(int(allele))
                if mutation is not None:
                    sequence[mutation[0]] = mutation[1]
            seqs[hap] = sequence
    return seqs, slim.name
Exemple #3
0
def generate_seq_dicts(slim_in,
                       mat_dict,
                       number_of_sequences,
                       all_individuals=True):
    """Sample haplotypes from a SLiM run and describe every mutation.

    Unlike a full-sequence builder, this variant returns the raw allele
    lists of the sampled genomes together with a lookup table giving, for
    each mutation, its position, the reference base and the derived base.

    Args:
        slim_in: SLiM output, handed unchanged to ``tom_slim.slim``.
        mat_dict: mapping with the keys ``"mut_mat"`` (passed to ``get_ref``)
            and ``"mut_freqs"`` (passed to ``mutate``).
        number_of_sequences: number of distinct individuals to sample; two
            genomes (keys ``"p1:<2k>"`` and ``"p1:<2k-1>"``) are returned
            per individual.
        all_individuals: accepted for interface compatibility only.
            NOTE(review): the parser is always called with
            ``all_individuals=True`` regardless of this value -- confirm
            whether it should be forwarded.

    Returns:
        ``(haps, slim.name, mut_dict)``. ``haps`` is an ``OrderedDict``
        (insertion order = sampling order) mapping genome key to the entry
        of ``slim.genome_dict()``; ``mut_dict`` maps mutation id to
        ``[pos, ref_base, alt_base]``.

    Raises:
        ValueError: if more individuals are requested than are available
            (the previous rejection loop would never have terminated).
    """
    slim = tom_slim.slim(slim_in,
                         fixed=False,
                         give_genomes=True,
                         all_individuals=True)
    mut_mat = mat_dict["mut_mat"]
    mut_freqs = mat_dict["mut_freqs"]
    # Ancestral sequence: one independent get_ref() draw per site.
    reference = tuple(get_ref(mut_mat) for _ in xrange(slim.length))

    mut_dict_raw = slim.mutations_dict(minFreq=1)

    mut_dict = {}
    for mut_id, raw_pos in mut_dict_raw.items():
        # The raw position minus one is used as the index into `reference`
        # (presumably the raw positions are 1-based -- TODO confirm).
        pos = raw_pos - 1
        ref_at_pos = reference[pos]
        mut_dict[mut_id] = [pos, ref_at_pos, mutate(ref_at_pos, mut_freqs)]

    samples = slim.genome_dict()
    # Two genomes per individual; floor division made explicit.
    n_individuals = len(samples) // 2
    if number_of_sequences > n_individuals:
        raise ValueError(
            "requested %s individuals but only %s are available" %
            (number_of_sequences, n_individuals))

    haps = collections.OrderedDict()
    individuals_chosen = set()
    while len(individuals_chosen) < number_of_sequences:
        individual = random.randint(1, n_individuals)
        if individual in individuals_chosen:
            continue
        individuals_chosen.add(individual)
        for hap in ("p1:" + str(individual * 2),
                    "p1:" + str(individual * 2 - 1)):
            haps[hap] = samples[hap]

    return haps, slim.name, mut_dict
Exemple #4
0
def get_sfs_dict_from_sample(slim_input):
    """Compute per-element SFS data from 20 genomes drawn from a gzipped run.

    The gzipped SLiM output is read, parsed with ``ts.slim`` and, if the
    parsed object reports ``sanity``, 20 genome keys are drawn at random.
    Mutation counts within that draw are grouped by organ and by ``m[1]`` of
    each mutation record, turned into spectra with
    ``sfs_tools.SFS_from_all_frequencies`` and combined with the fixed
    differences and element lengths.

    Args:
        slim_input: path to a gzip-compressed SLiM output file.

    Returns:
        ``[x.name, elDict]`` on success, ``[None, None]`` when the parsed
        simulation fails its sanity flag.
    """
    n_sampled = 20  # number of genomes drawn; also the SFS sample size

    # Context manager guarantees the gzip handle is closed after reading.
    with gzip.open(slim_input) as handle:
        data = [line.strip() for line in handle]

    x = ts.slim(data, fixed=True, give_genomes=True)
    if not x.sanity:
        return [None, None]

    genomes = x.genome_dict()
    # Result is unused; the call is retained in case it has side effects on
    # `x` -- TODO confirm and drop if it is pure.
    x.mutations_dict()
    lengthDict = parseLengths(x.organ_lengths())

    # NOTE(review): random.choice draws WITH replacement, so the same genome
    # can be counted more than once -- confirm this is intended.
    genome_keys = genomes.keys()
    individuals = [random.choice(genome_keys) for _ in range(n_sampled)]

    muts_by_organ = x.organ_mutations()

    # Occurrences of each mutation among the drawn genomes.
    new_muts = Counter()
    for g in individuals:
        for m in genomes[g]:
            new_muts[m] += 1

    polyDict = {}
    for organ, organ_muts in muts_by_organ.items():
        counts_by_type = {}
        for m in organ_muts:
            count = new_muts[m[0]]
            if count == 0:
                # Mutation absent from the drawn genomes.
                continue
            counts_by_type.setdefault(m[1], []).append(count)
        polyDict[organ] = {
            mut_type: sfs_tools.SFS_from_all_frequencies(counts, n_sampled)
            for mut_type, counts in counts_by_type.items()
        }

    fixedDict = x.organ_fixed(threshold=int(x.N) * 20)

    fixD = orgFixDict(fixedDict)
    polyfix = combinePolyFix(polyDict, fixD)
    elDict = combineElements(polyfix, lengthDict)

    print('processed ' + x.name)
    return [x.name, elDict]
def generate_seq_dicts(slim_in,mat_dict,number_of_sequences,all_individuals=True):
    """Build full sequences for randomly chosen individuals of a SLiM run.

    A random ancestral reference is drawn site by site with ``get_ref``,
    every mutation returned by ``slim.mutations_dict(minFreq=1)`` is
    assigned a derived base with ``mutate``, and each sampled haplotype is
    produced by copying the reference and applying the mutations that
    haplotype carries.

    Args:
        slim_in: SLiM output, handed unchanged to ``tom_slim.slim``.
        mat_dict: mapping with the keys ``"mut_mat"`` (passed to ``get_ref``)
            and ``"mut_freqs"`` (passed to ``mutate``).
        number_of_sequences: number of distinct individuals to sample; two
            sequences (keys ``"p1:<2k>"`` and ``"p1:<2k-1>"``) are produced
            per individual.
        all_individuals: accepted for interface compatibility only.
            NOTE(review): the parser is always called with
            ``all_individuals=True`` regardless of this value -- confirm
            whether it should be forwarded.

    Returns:
        ``(seqs, slim.name)`` where ``seqs`` maps a genome key to a list of
        bases with the same length as the reference.

    Raises:
        ValueError: if more individuals are requested than are available
            (the previous rejection loop would never have terminated).
    """
    slim = tom_slim.slim(slim_in, fixed=False, give_genomes=True,
                         all_individuals=True)

    mut_mat = mat_dict["mut_mat"]
    mut_freqs = mat_dict["mut_freqs"]
    # Ancestral sequence: one independent get_ref() draw per site.
    reference = tuple(get_ref(mut_mat) for _ in xrange(slim.length))
    print(slim.name)

    mut_dict_raw = slim.mutations_dict(minFreq=1)

    mut_dict = {}
    for mut_id, raw_pos in mut_dict_raw.items():
        # The raw position minus one is used as the index into `reference`
        # (presumably the raw positions are 1-based -- TODO confirm).
        pos = raw_pos - 1
        mut_dict[mut_id] = [pos, mutate(reference[pos], mut_freqs)]

    genomes = slim.genome_dict()
    # Two genomes per individual; floor division made explicit.
    n_individuals = len(genomes) // 2
    if number_of_sequences > n_individuals:
        raise ValueError(
            "requested %s individuals but only %s are available" %
            (number_of_sequences, n_individuals))

    seqs = {}
    individuals_chosen = set()
    while len(individuals_chosen) < number_of_sequences:
        individual = random.randint(1, n_individuals)
        if individual in individuals_chosen:
            continue
        individuals_chosen.add(individual)
        for hap in ("p1:" + str(individual * 2),
                    "p1:" + str(individual * 2 - 1)):
            sequence = list(reference)
            for allele in genomes[hap]:
                # Alleles missing from mut_dict (e.g. filtered out by
                # minFreq) leave the reference base in place.
                mutation = mut_dict.get(int(allele))
                if mutation is not None:
                    sequence[mutation[0]] = mutation[1]
            seqs[hap] = sequence
    return seqs, slim.name
Exemple #6
0
def get_sfs_dict(slim_input, num=-1):
    """Compute per-element SFS data for one SLiM run using all sampled genomes.

    The run is parsed with ``ts.slim``; polymorphism data from
    ``organ_mutations`` and fixed differences from ``organ_fixed`` are
    combined and then merged with the element lengths.

    Args:
        slim_input: SLiM output, handed unchanged to ``ts.slim``.
        num: not used by this function; kept so existing callers that pass
            it continue to work.

    Returns:
        ``[x.name, elDict]`` on success, ``[None, None]`` when the parsed
        simulation fails its sanity flag.
    """
    x = ts.slim(slim_input, fixed=True, give_genomes=True)
    if not x.sanity:
        return [None, None]

    polyDict = orgPolyDict(x.organ_mutations(), x.sampleN)
    lengthDict = parseLengths(x.organ_lengths())
    # Threshold handed to organ_fixed is 10 * N.
    fixedDict = x.organ_fixed(threshold=int(x.N) * 10)

    polyfix = combinePolyFix(polyDict, orgFixDict(fixedDict))
    elDict = combineElements(polyfix, lengthDict)

    print('processed ' + x.name)
    return [x.name, elDict]
def SFS_by_organ(raw_slim):
    """Return neutral and selected spectra for every organ of a SLiM run.

    Mutations of each organ are split on field 3 of the mutation record
    (non-zero -> selected, zero -> neutral) and field 7 is collected as the
    frequency.  The number of selected / neutral sites is the organ length
    scaled by the module-level ``sel_prop_dict`` proportion.

    Returns:
        ``[x.name, sfs_dict]`` where ``sfs_dict[organ]`` is
        ``[neutral_sfs, selected_sfs]``.
    """
    sim = tom_slim.slim(raw_slim)
    muts_by_organ = sim.organ_mutations()
    lengths_by_organ = sim.organ_lengths()

    spectra = {}
    for organ in muts_by_organ.keys():
        selected_freqs = []
        neutral_freqs = []
        for record in muts_by_organ[organ]:
            is_neutral = float(record[3]) == 0.0
            target = neutral_freqs if is_neutral else selected_freqs
            target.append(int(record[7]))

        # Partition the organ length into selected and neutral sites.
        n_selected_sites = sel_prop_dict[organ] * lengths_by_organ[organ]
        n_neutral_sites = (1.0 - sel_prop_dict[organ]) * lengths_by_organ[organ]

        selected_sfs = SFS(selected_freqs, n_selected_sites, sim.sampleN)
        neutral_sfs = SFS(neutral_freqs, n_neutral_sites, sim.sampleN)
        spectra[organ] = [neutral_sfs, selected_sfs]

    return [sim.name, spectra]
Exemple #8
0
def get_sfs_dict_from_sample(slim_input):
    """Compute per-element SFS data from 20 genomes drawn from a SLiM run.

    The run is parsed with ``ts.slim`` and, if the parsed object reports
    ``sanity``, 20 genome keys are drawn at random.  Mutation counts within
    that draw are grouped by organ and by ``m[1]`` of each mutation record,
    turned into spectra with ``sfs_tools.SFS_from_all_frequencies`` and
    combined with the fixed differences and element lengths.

    Args:
        slim_input: SLiM output, handed unchanged to ``ts.slim``.

    Returns:
        ``[x.name, elDict]`` on success, ``[None, None]`` when the parsed
        simulation fails its sanity flag.
    """
    n_sampled = 20  # number of genomes drawn; also the SFS sample size

    x = ts.slim(slim_input, fixed=True, give_genomes=True)
    if not x.sanity:
        return [None, None]

    genomes = x.genome_dict()
    # Result is unused; the call is retained in case it has side effects on
    # `x` -- TODO confirm and drop if it is pure.
    x.mutations_dict()
    lengthDict = parseLengths(x.organ_lengths())

    # NOTE(review): random.choice draws WITH replacement, so the same genome
    # can be counted more than once -- confirm this is intended.
    genome_keys = genomes.keys()
    individuals = [random.choice(genome_keys) for _ in range(n_sampled)]

    muts_by_organ = x.organ_mutations()

    # Occurrences of each mutation among the drawn genomes.
    new_muts = Counter()
    for g in individuals:
        for m in genomes[g]:
            new_muts[m] += 1

    polyDict = {}
    for organ, organ_muts in muts_by_organ.items():
        counts_by_type = {}
        for m in organ_muts:
            count = new_muts[m[0]]
            if count == 0:
                # Mutation absent from the drawn genomes.
                continue
            counts_by_type.setdefault(m[1], []).append(count)
        polyDict[organ] = {
            mut_type: sfs_tools.SFS_from_all_frequencies(counts, n_sampled)
            for mut_type, counts in counts_by_type.items()
        }

    fixedDict = x.organ_fixed(threshold=int(x.N) * 10)
    fixD = orgFixDict(fixedDict)
    polyfix = combinePolyFix(polyDict, fixD)
    elDict = combineElements(polyfix, lengthDict)

    print('processed ' + x.name)
    return [x.name, elDict]
def get_both_stats(input_args):
    """Write windowed site-frequency spectra around elements of one type.

    ``input_args`` is an ``(index, args)`` pair.  ``index`` is read but not
    used here.  ``args`` must expose the attributes read below: ``boundary``,
    ``window``, ``input``, ``output``, ``gz``, ``orientation``, ``element``
    and ``threshold``.

    For every simulation yielded by the reader that is not flagged
    ``"insane"`` by ``sanity_checks``, the file ``args.output + '.sfs'`` is
    truncated and one line is written per (element, window) pair that passes
    the filters.  Each line is
    ``str([label, r_dist * x.N * 4., p_dist, sfs_window])`` where ``label``
    is ``"u."`` or ``"d."`` followed by the window name.

    NOTE(review): because the output file is reopened in ``"w"`` mode inside
    the loop, only the lines of the last processed simulation survive --
    confirm that each input holds a single simulation.

    Returns:
        None.  When orientation is requested and an element start is missing
        from the strand lookup, the simulation name is printed and the
        function returns early without reaching ``temp_out.close()``.
    """
    index = input_args[0]
    args = input_args[1]
    boundary = args.boundary
    window = args.window
    file_name = args.input
    output_raw = args.output  # not used below
    output = args.output + '.sfs'
    ### use class: slim instead...  quite a bit faster
    number = 0  # count of simulations read so far (used in the progress message)
    test_number = 0  # not used below
    # Pick the reader that matches the compression of the input file.
    if args.gz:
        reader = slim_reader_gzip
    else:
        reader = slim_reader

    for i in reader(file_name):

        number = number + 1
        name = "non"  # not used below

        x = slim(i)

        # Recombination intervals as a (pos, rate) table, later handed to getDistance().
        all_rates = pd.DataFrame(x.recomb_intervals, columns=['pos', 'rate'])

        ## This next little snippet gets a dict of exon starting positions and the strand of those exons
        # `fixed` is hard-coded to False, so the `if fixed:` branch further down never runs.
        fixed = False
        check_point = sanity_checks(x)
        if check_point == "insane":
            continue
        else:
            pass

        if args.orientation != 'No':
            #			tem = x.name.split('/')[-1]
            #			print args.orientation
            # Scan the orientation file: `head` ends up as the first line that
            # starts with '/' (or the last line if none does), with '/'
            # characters stripped from both ends.
            # NOTE(review): this loop rebinds the outer loop variable `i`,
            # and the file handle is never explicitly closed.
            # NOTE(review): `head` is unbound if the file is empty.
            for i in open(args.orientation):
                head = i.strip('/')
                if i.startswith('/'): break
        #	print head
            # NOTE(review): hard-coded absolute path to the element annotation.
            element_look_up = pysam.Tabixfile(
                "/home/booker/mouse_genome/all_elements/combined_elements/combined_elements_sorted.bed.gz"
            )
            # `head` is parsed as "<name>:<start>-<end>" and used as the fetch region.
            els1 = [
                m.strip().split() for m in element_look_up.fetch(
                    head.split(':')[0], int(head.split(':')[1].split('-')[0]),
                    int(head.split(':')[1].split('-')[1]))
            ]
            # Drop entries labelled INTERGENIC whose end - start equals 1.
            els = [
                ment for ment in els1
                if not (int(ment[2]) -
                        int(ment[1]) == 1 and ment[3] == 'INTERGENIC')
            ]
            # Pair annotation rows with x.organs positionally; for CDS rows
            # record g[4] (used below as the strand) under the key h[1].
            strandDict = {}
            for g, h in zip(els, x.organs):
                #			print g, h
                if g[3] == 'CDS':
                    strandDict[h[1]] = g[4]
#					print g,h, g[4]

        name2 = x.name.split('/')[-1]  # not used below
        #print name2
        individuals = x.sampleN
        length = x.length
        #		print length
        element_positions = []
        # Truncate the output file, then reopen it for appending.
        temp = open(output, "w")
        temp.close()
        temp_out = open(output, "a")
        ### this bit here gets the selected site lists from
        ### the slim object and then gets all of the sites
        ### for all  selected sites in the simulation
        sites_dict = x.sites_dict()  ## ALSO RETURN A KEY OF SITE  TYPES?
        # Collect the positions (f[1]) of every selected entry whose f[0][0] is not "g0".
        for f in sites_dict["selected"]:
            if f[0][0] != "g0":
                element_positions += f[1]

        if fixed:
            non_element_subs = [
                v for v in x.fixed if int(v[2]) not in element_positions
            ]
        # Index mutations lying outside the element positions by int(position);
        # a later mutation at the same position overwrites an earlier one.
        # NOTE(review): `not in element_positions` is a linear scan of a list
        # for every mutation.
        mutations_dict = {}
        for key in [
                b for b in x.mutations if int(b[2]) not in element_positions
        ]:
            mutations_dict[int(key[2])] = key
        position_keys = sorted(set(mutations_dict.keys()))

        print("Processing file: " + str(number) + "\n\t" + args.element +
              " make up " +
              str(round(len(element_positions) * 100.0 / x.length, 2)) +
              "% of the " + str(int(x.length / 1000)) +
              "Kb simulated chromosome\n")
        # Window start offsets run from 1 up to min(boundary, x.length) in steps of `window`.
        if boundary >= x.length:
            distances_raw = range(1, x.length + window, window)
        elif boundary < x.length:
            distances_raw = range(1, boundary + window, window)

        # Same offsets on both sides: positive and negative.  The list
        # concatenation relies on range() returning a list.
        distances = distances_raw + [i * -1 for i in distances_raw]

        # Only organs whose type (g[0]) equals args.element are analysed.
        exons = [g for g in x.organs if g[0] == args.element]
        #print strandDict
        #print exons
        for point in exons:
            #print point
            lower_lim, upper_lim = get_analysis_limits(
                exons, point,
                x.length)  # Find the limits for the analysis of this exon

            for k in distances:
                low_bound, up_bound = get_window_bounds(point, window, k)

                # Skip windows that fall outside the limits for this element.
                if up_bound > upper_lim or low_bound < lower_lim:
                    continue
#				print
                mid_window = (up_bound + low_bound) / 2

                # Distance is measured between the window midpoint and
                # point[1] (negative offsets) or point[2] (positive offsets).
                if k < 0:
                    dist_start = mid_window
                    dist_end = int(point[1])
                if k > 0:
                    dist_end = mid_window
                    dist_start = int(point[2])
                # Skip windows within args.threshold of either end of the chromosome.
                if up_bound > x.length - args.threshold:
                    #					print 'check 2'
                    continue
                elif low_bound < 0 + args.threshold:
                    #					print 'check 3' #, low_bound
                    continue
                else:

                    # Field 7 of every indexed mutation inside [low_bound, up_bound].
                    frequencies = [
                        int(mutations_dict[j][7]) for j in position_keys if
                        j >= low_bound and j <= up_bound and up_bound < length
                    ]
                    #
                    #					frequencies = [int(hh[7]) for hh in muts_in_window] # replace muts_in_windows with a condensed

                    #					if fixed:
                    #						subs_in_window = [j for j in non_element_subs if int(j[2]) > low_bound and int(j[2]) < up_bound and up_bound < length]
                    #					print subs_in_window
                    #			elements_in_window = [p for p in element_positions if int(p) > low_bound and int(p) < up_bound]

                    # Window size minus the number of element positions that fall inside it.
                    bin_width = window - len([
                        p for p in element_positions
                        if int(p) >= low_bound and int(p) <= up_bound
                    ])

                    if bin_width == 0 or bin_width < 0:
                        continue
                    else:
                        sfs_window = SFS_from_frequencies(
                            frequencies, bin_width, individuals)
                        window_name = str(
                            abs(k)) + '-' + str(abs(k) + window - 1)
                        r_dist, p_dist = getDistance(dist_start, dist_end,
                                                     all_rates)
                        #						print r_dist * x.N*4. , p_dist
                        # Label the window "u." or "d." (presumably upstream /
                        # downstream -- TODO confirm).  On the '-' strand the
                        # sign-to-label mapping is reversed.
                        if args.orientation != 'No':
                            try:
                                strand = strandDict[point[1]]
                            except KeyError:
                                # NOTE(review): returns without closing temp_out.
                                print x.name
                                return
                            if strand == '+':

                                if k < 0:
                                    y = "u." + window_name
                                elif k > 0:
                                    y = "d." + window_name
                            elif strand == '-':
                                if k > 0:
                                    y = "u." + window_name
                                elif k < 0:
                                    y = "d." + window_name
                        else:
                            if k < 0:
                                y = "u." + window_name
                            elif k > 0:
                                y = "d." + window_name

                        temp_out.write(
                            str([y, r_dist * x.N * 4., p_dist, sfs_window]) +
                            "\n")
        temp_out.close()