예제 #1
0
def test_nsl():
    """Check the type, shape and dtype of the nsl() result.

    A random 0/1 haplotype matrix is scored once with threads enabled and
    once without; each result must be a float64 ndarray with one value
    per variant.
    """
    shape = (1000, 20)  # (variants, haplotypes)
    haplotypes = np.random.randint(0, 2, size=shape).astype('i1')
    expected_shape = shape[:1]
    expected_dtype = np.dtype('f8')

    for threaded in (True, False):
        result = nsl(haplotypes, use_threads=threaded)
        assert_is_instance(result, np.ndarray)
        eq(expected_shape, result.shape)
        eq(expected_dtype, result.dtype)
def nsl(haplotype, pos_vec=None, window=None):
    """
    Compute the standardized number of segregating sites by length (nSL)
    for each variant, comparing the reference and alternate alleles,
    after Ferrer-Admetlla et al. (2014).

    Parameters
    ----------
    haplotype
        Haplotype data handed to ``allel.nsl``; it must also provide a
        ``count_alleles()`` method.
    pos_vec
        Values binned with ``pandas.cut`` when ``window`` is given.
        Required for the windowed statistic.
    window
        Number of equal-width bins. It is also used to build the bin
        labels ``1..window``, so it has to be an integer. A falsy value
        (the default) disables windowing.

    Returns
    -------
    The standardized per-variant scores when ``window`` is falsy,
    otherwise a pandas Series with the mean standardized score of each
    bin, indexed by the bin label.

    Raises
    ------
    ValueError
        If ``window`` is given without ``pos_vec``.
    """
    # Fail before the expensive computation instead of deep inside
    # pandas.cut with an unrelated-looking message.
    if window and pos_vec is None:
        raise ValueError("pos_vec is required when window is given")

    nsl_stats = allel.nsl(haplotype)
    # Row 1 of the transposed allele counts is the count of allele index 1
    # (presumably the alternate allele - confirm against the caller's data).
    nsl_stand, _ = allel.standardize_by_allele_count(
        nsl_stats, haplotype.count_alleles().T[1], diagnostics=False)
    if not window:
        return nsl_stand

    dn = pd.DataFrame(nsl_stand, columns=["nSL"])
    dn["pos_cat"] = pd.cut(pos_vec, window, labels=range(1, window + 1))
    return dn.groupby("pos_cat").nSL.mean()
예제 #3
0
# For every chromosome: load the phased calls, compute iHS and nSL,
# standardize both by allele count, plot them and keep the standardized
# scores in seldict.
for c in chromlist:
    # NOTE(review): the HDF5 file is never closed; the chunked genotype
    # array below is built directly on the open dataset, so closing it here
    # may invalidate g/h for later code - confirm before adding a `with`.
    callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c), mode='r')
    samples = callset['samples'][:]
    sample_name = [sid.decode() for sid in samples.tolist()]
    g = allel.GenotypeChunkedArray(callset["calldata/GT"])
    h = g.to_haplotypes()
    pos = allel.SortedIndex(callset["variants/POS"][:])
    # count of allele index 1 at every variant
    acc = h.count_alleles()[:, 1]
    # ihs
    ihs = allel.ihs(h, pos, include_edges=True)
    ihs_std = allel.standardize_by_allele_count(ihs, acc)
    plt.plot(pos, -np.log10(ihs_std[0]))
    # despite its name, `nan` is True where the score is NOT NaN
    nan = ~np.isnan(ihs)
    ihs_real = ihs[nan]
    pos_ihs = pos[nan]
    # nsl
    nsl = allel.nsl(h)
    nsl_std = allel.standardize_by_allele_count(nsl, acc)
    plt.plot(pos, -np.log10(nsl_std[0]))
    # the mask and the filtered values must come from nsl itself; they were
    # previously taken from ihs by mistake
    nan = ~np.isnan(nsl)
    nsl_real = nsl[nan]
    pos_nsl = pos[nan]
    seldict[c] = (ihs_std[0], nsl_std[0])
    # EHH decay is site dependent, so it is not computed in this loop.
    # TODO: H12

def _read_simulation_lines(path1, file_name):
	"""Return the lines of a simulation file, line endings included.

	Files ending in ".gz" are decompressed first, files ending in ".txt" are
	read as plain text. Any other extension raises ValueError.
	"""
	if file_name.endswith(".gz"):
		with gzip.open(path1 + file_name, 'rb') as handle:
			content = handle.read()
		# gzip hands back bytes on Python 3; decode them so that the lines can
		# be compared with the '//' marker and split into single characters
		if isinstance(content, bytes):
			content = content.decode()
		return content.splitlines(True)
	if file_name.endswith(".txt"):
		with open(path1 + file_name) as handle:
			return handle.readlines()
	raise ValueError("unsupported simulation file (expected .txt or .gz): " + file_name)


def image_simulation(path1,path2,S, N, file_name, NCHROMS, threshold, apply_threshold,sort,maj_min):
	"""
	Generate one black and white image per iteration of a simulation file
	and compute summary statistics for each iteration.

	Both plain ".txt" files and gzip-compressed ".gz" files are accepted.
	An iteration that raises while being processed is skipped and reported
	in the first element of the returned tuple.

	Arguments:
		path1 (string) -- Prefix (directory) the simulation file is read from
		path2 (string) -- Prefix (directory) the images are written to
		S -- Selection coefficient; only stored (as a string) in the statistics
		N -- The positions of every iteration are multiplied by N; also stored
			(as a string) in the statistics as 'Population size'
		file_name (string) -- Name of the simulation file (.txt or .gz)
		NCHROMS (int) -- Number of haplotype rows read for every iteration
		threshold (float) -- Columns whose minor allele frequency is <= threshold
			are handed to delete_simulation for removal
		apply_threshold (Boolean) -- Whether or not to apply the threshold
		sort (int) -- 2: order rows, 3: order columns, 4: order rows and then
			columns (all through order_data); any other value: no ordering
		maj_min (Boolean) -- True: colour by major (1) / minor (0) allele,
			False: colour ancestral as 1 and derived as 0

	Returns:
		simulation_error (list) -- Line index of the first haplotype row of every
			iteration that failed
		statistics_list (list) -- One dictionary of summary statistics per
			successfully processed iteration
		dim (list) -- Width (first element of the image size) of every image created

	Raises:
		ValueError -- file_name ends with neither ".txt" nor ".gz"
	"""
	# NOTE(review): the nSL scores are written to the module-level name `nsl`,
	# exactly as before; this looks accidental - confirm and make it local.
	global nsl
	simulation_error = []
	statistics_list = []
	dim = []

	lines = _read_simulation_lines(path1, file_name)

	# Every iteration starts with a '//' line; its haplotype rows begin three
	# lines further down and the positions line sits just above them.
	find = [i + 3 for i, line in enumerate(lines) if line == '//\n']

	for ITER, pointer in enumerate(find):
		try:
			# --- build the haplotype matrix (rows: chromosomes, columns: sites) ---
			# the last character of every row is the line ending
			n_columns = len(lines[pointer]) - 1
			# positions line: drop the leading label and scale by N
			position_it = np.array(lines[pointer - 1].split()[1:], dtype='float') * N
			croms = np.zeros((NCHROMS,n_columns),dtype=int)
			for j in range(NCHROMS):
				F = np.array(list(lines[pointer + j])[:-1], dtype=int)
				if F.size != n_columns:
					# never let a short row broadcast silently over the whole line
					raise ValueError("haplotype rows of different length")
				croms[j,:] = F
			# untouched copy for the summary statistics; croms itself is
			# filtered, recoloured and reordered below
			crom_array = croms.copy()
			n_pos = croms.shape[1]

			# --- apply the minor allele frequency threshold ---
			# (flags are compared explicitly so non-boolean values behave as before)
			if apply_threshold == True:
				freq = croms.sum(axis=0,dtype=float) / float(NCHROMS)
				# fold the derived allele frequency into the minor allele frequency
				freq = np.minimum(freq, 1 - freq)
				positions = np.where(freq<=threshold)
				croms,n_pos,freq = delete_simulation(n_pos,croms,freq,positions)

			# --- colouring ---
			if maj_min == True:
				# 1 = major allele, 0 = minor allele. Assumes the matrix only
				# holds 0 (ancestral) and 1 (derived). Allele 1 is the major
				# one only when strictly more than half of the rows carry it;
				# a tie counts allele 0 as major. Columns fixed for allele 1
				# therefore stay 1 (they used to be inverted by mistake).
				derived_count = croms.sum(axis=0)
				derived_is_major = derived_count * 2 > NCHROMS
				croms = np.where(derived_is_major, croms, 1 - croms).astype(int)
			if maj_min == False:
				# ancestral = 1 (white), derived = 0 (black)
				croms = np.ones((NCHROMS,n_pos)) - croms

			# --- order rows and/or columns ---
			if sort in (2, 4):
				# rows (chromosomes)
				croms = order_data(croms)
			if sort in (3, 4):
				# columns (genetic positions)
				croms = order_data(croms.transpose()).transpose()

			# --- image generation ---
			bw_croms_uint8 = np.uint8(croms)
			bw_croms_im = Image.fromarray (bw_croms_uint8*255, mode = 'L')
			dim.append(bw_croms_im.size[0])
			image_path = path2 + file_name + "_"+ str(ITER+1) + str(maj_min)+ str(sort) + ".bmp"
			bw_croms_im.save(image_path)

			# --- summary statistics (always on the unmodified matrix) ---
			# float() keeps this a true division on Python 2 as well
			freq_crom = crom_array.sum(axis=0) / float(NCHROMS)
			# keep every site except those with a derived frequency below 0.50
			mask_1 = ~(freq_crom < 0.50)
			h = allel.HaplotypeArray(np.transpose(crom_array))
			ac = h.count_alleles()
			# tajimasd
			TjD = allel.stats.tajima_d(ac)
			# watterson
			theta_hat_w = allel.stats.watterson_theta(position_it, ac)
			# nsl
			nsl = allel.nsl(h)
			nsl = nsl[mask_1]
			# NOTE(review): np.max returns NaN if any remaining score is NaN -
			# confirm whether np.nanmax is what is wanted here.
			if np.size(nsl) == 0:
				nsl_max = 0
			else:
				nsl_max = np.max(nsl)
			statistics_dictionary = {'simulation_file': file_name, 'Selection coefficient':str(S),'Population size':str(N),'Iteration':str(ITER+1), 'Tajimas D':TjD,'Watterson':theta_hat_w,'nsl':nsl_max}
			statistics_list.append(statistics_dictionary)
		except Exception:
			# best effort: remember the failing iteration and carry on, but let
			# KeyboardInterrupt and SystemExit through
			simulation_error.append(pointer)
			continue
	return(simulation_error,statistics_list,dim)
예제 #5
0
# For every chromosome: load the phased calls, compute iHS and nSL,
# standardize both by allele count, plot them and keep the standardized
# scores in seldict.
for c in chromlist:
    # NOTE(review): the HDF5 file is never closed; the chunked genotype
    # array below is built directly on the open dataset, so closing it here
    # may invalidate g/h for later code - confirm before adding a `with`.
    callset = h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                        mode='r')
    samples = callset['samples'][:]
    sample_name = [sid.decode() for sid in samples.tolist()]
    g = allel.GenotypeChunkedArray(callset["calldata/GT"])
    h = g.to_haplotypes()
    pos = allel.SortedIndex(callset["variants/POS"][:])
    # count of allele index 1 at every variant
    acc = h.count_alleles()[:, 1]
    # ihs
    ihs = allel.ihs(h, pos, include_edges=True)
    ihs_std = allel.standardize_by_allele_count(ihs, acc)
    plt.plot(pos, -np.log10(ihs_std[0]))
    # despite its name, `nan` is True where the score is NOT NaN
    nan = ~np.isnan(ihs)
    ihs_real = ihs[nan]
    pos_ihs = pos[nan]
    # nsl
    nsl = allel.nsl(h)
    nsl_std = allel.standardize_by_allele_count(nsl, acc)
    plt.plot(pos, -np.log10(nsl_std[0]))
    # the mask and the filtered values must come from nsl itself; they were
    # previously taken from ihs by mistake
    nan = ~np.isnan(nsl)
    nsl_real = nsl[nan]
    pos_nsl = pos[nan]
    seldict[c] = (ihs_std[0], nsl_std[0])
    # EHH decay is site dependent, so it is not computed in this loop.
    # TODO: H12
def statistics (S, N, file_name, NCHROMS,
                sim_dir="/home/lucrezialorenzon/Simulations/Results_decompressed/",
                out_csv="/home/lucrezialorenzon/Simulations/summarystatistics.csv",
                region_length=100000):
    """
    Append the summary statistics of every iteration of a simulation file
    to a CSV file.

    For each iteration Tajima's D, Watterson's theta and the maximum nSL
    over the sites whose derived allele frequency lies in [0.70, 0.90] are
    computed and written as one row.

    Arguments:
        S -- Selection coefficient; only written (as a string) to the CSV
        N -- Population size; only written (as a string) to the CSV
        file_name (string) -- Name of the simulation file inside sim_dir
        NCHROMS (int) -- Number of haplotype rows read for every iteration
        sim_dir (string) -- Prefix (directory) the simulation file is read from
        out_csv (string) -- CSV file the rows are appended to
        region_length -- Factor the positions of every iteration are
            multiplied by (length of the simulated region)

    The module-level flag `once` must exist before the first call: while it
    is 0 the CSV header is written first and the flag is then set to 1.

    Returns:
        None
    """
    global once
    # NOTE(review): the nSL scores are written to the module-level name `nsl`,
    # exactly as before; this looks accidental - confirm and make it local.
    global nsl
    # read the simulation file
    with open(sim_dir + file_name) as handle:
        lines = handle.readlines()
    # Every iteration starts with a '//' line; its haplotype rows begin three
    # lines further down and the positions line sits just above them.
    find = [i + 3 for i, line in enumerate(lines) if line == '//\n']
    header = ['Selection coefficient','Population size','Iteration','Tajimas D','Watterson','nsl']
    for ITER,pointer in enumerate(find):
        # positions line: drop the leading label and scale to the region length
        pos = np.array(lines[pointer - 1].split()[1:], dtype='float') * region_length
        # croms is the full matrix: one row per chromosome, one column per
        # site (the last character of every row is the line ending)
        croms = np.vstack([np.array(list(lines[pointer + j])[:-1], dtype=int)
                           for j in range(NCHROMS)])
        # float() keeps this a true division on Python 2 as well
        freq = croms.sum(axis=0) / float(NCHROMS)
        # keep the sites whose derived allele frequency is neither below 0.70
        # nor above 0.90
        keep = ~(freq < 0.70) & ~(freq > 0.90)

        # SUMMARY STATISTICS
        h = allel.HaplotypeArray(np.transpose(croms))
        # tajimasd
        ac = h.count_alleles()
        TjD = allel.stats.tajima_d(ac)
        # watterson
        theta_hat_w = allel.stats.watterson_theta(pos, ac)
        # nsl
        nsl = allel.nsl(h)[keep]
        # NOTE(review): np.max returns NaN if any remaining score is NaN -
        # confirm whether np.nanmax is what is wanted here.
        if np.size(nsl) == 0:
            nsl_max = 0
        else:
            nsl_max = np.max(nsl)
        # append the row to the csv file
        row = {'Selection coefficient':str(S),'Population size':str(N),'Iteration':str(ITER+1),
               'Tajimas D':TjD,'Watterson':theta_hat_w,'nsl':nsl_max}
        with open(out_csv,'a+') as f:
            writer = csv.DictWriter(f,fieldnames=header)
            if once == 0:
                # the header goes out only once per run
                writer.writeheader()
                once = 1
            writer.writerow(row)