def test_nsl():
    """Smoke-test ``nsl`` on a random 0/1 haplotype matrix.

    For both threaded and non-threaded execution the result must be a
    NumPy array of float64 with one entry per variant.
    """
    num_variants, num_haplotypes = 1000, 20
    random_calls = np.random.randint(
        0, 2, size=(num_variants, num_haplotypes)
    )
    haplotypes = random_calls.astype('i1')

    expected_shape = (num_variants, )
    expected_dtype = np.dtype('f8')

    for threaded in (True, False):
        result = nsl(haplotypes, use_threads=threaded)
        assert_is_instance(result, np.ndarray)
        eq(expected_shape, result.shape)
        eq(expected_dtype, result.dtype)
def nsl(haplotype, pos_vec=None, window=None):
    """Compute the standardized nSL score for each variant.

    nSL (number of segregating sites by length) compares the reference and
    alternate alleles, after Ferrer-Admetlla et al. (2014). The raw scores
    from ``allel.nsl`` are standardized by alternate-allele count.

    Args:
        haplotype: haplotype array accepted by ``allel.nsl`` that also
            provides ``count_alleles()``.
        pos_vec: variant positions, one per variant. Required when
            ``window`` is given; ignored otherwise.
        window (int): number of equal-width position bins over which the
            standardized scores are averaged. Falsy (``None`` or ``0``)
            disables windowing.

    Returns:
        The standardized per-variant scores when ``window`` is falsy;
        otherwise a pandas Series of mean standardized nSL per position
        bin, indexed by bin label ``1..window``.

    Raises:
        ValueError: if ``window`` is given without ``pos_vec``.
    """
    # Fail fast: without positions the windowed branch can only blow up
    # inside pd.cut, after the expensive nSL computation has already run.
    if window and pos_vec is None:
        raise ValueError("pos_vec is required when window is given")

    nsl_stats = allel.nsl(haplotype)
    alt_allele_counts = haplotype.count_alleles().T[1]
    nsl_stand, _bins = allel.standardize_by_allele_count(
        nsl_stats, alt_allele_counts, diagnostics=False)

    if not window:
        return nsl_stand

    dn = pd.DataFrame(nsl_stand, columns=["nSL"])
    dn["pos_cat"] = pd.cut(pos_vec, window, labels=range(1, window + 1))
    # observed=False keeps one row per window even when a bin holds no
    # variants; this is the historical default, now pinned explicitly.
    return dn.groupby("pos_cat", observed=False).nSL.mean()
# Per-chromosome selection scan: compute iHS and nSL, standardize both by
# alternate-allele count, plot them, and store the standardized scores in
# ``seldict`` keyed by chromosome.
for c in chromlist:
    # The context manager closes each per-chromosome HDF5 file once its
    # statistics have been computed, instead of leaking one handle per loop.
    with h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                   mode='r') as callset:
        samples = callset['samples'][:]
        sample_name = [sid.decode() for sid in samples.tolist()]
        g = allel.GenotypeChunkedArray(callset["calldata/GT"])
        h = g.to_haplotypes()
        pos = allel.SortedIndex(callset["variants/POS"][:])
        # Alternate-allele count per variant, used for standardization.
        acc = h.count_alleles()[:, 1]

        # ihs
        ihs = allel.ihs(h, pos, include_edges=True)
        ihs_std = allel.standardize_by_allele_count(ihs, acc)
        # NOTE(review): np.log10 of a negative standardized score is NaN, so
        # only positive scores show up in this plot -- confirm that is intended.
        plt.plot(pos, -np.log10(ihs_std[0]))
        # Despite its name, ``nan`` is True where the score is NOT NaN.
        nan = ~np.isnan(ihs)
        ihs_real = ihs[nan]
        pos_ihs = pos[nan]

        # nsl
        nsl = allel.nsl(h)
        nsl_std = allel.standardize_by_allele_count(nsl, acc)
        plt.plot(pos, -np.log10(nsl_std[0]))
        # Mask and values must come from nsl (previously copied from ihs).
        nan = ~np.isnan(nsl)
        nsl_real = nsl[nan]
        pos_nsl = pos[nan]

        seldict[c] = (ihs_std[0], nsl_std[0])

        ## ehh is site dependent
        #ehh = allel.ehh_decay(h)
        #nan = ~np.isnan(ihs)
        #ehh_real = ihs[nan]
        #pos_ehh = pos[nan]

        # H12
def _read_simulation_lines(path):
    """Return the lines (line endings kept) of a ``.txt`` or ``.gz`` file."""
    if path.endswith(".gz"):
        # Text mode is required: binary mode yields bytes, which never compare
        # equal to the '//\n' iteration marker, so no iteration would be found.
        with gzip.open(path, 'rt') as handle:
            return handle.readlines()
    if path.endswith(".txt"):
        with open(path) as handle:
            return handle.readlines()
    raise ValueError(
        "unsupported simulation file (expected .txt or .gz): " + path)


def _colour_by_major_allele(croms):
    """Recode a (chromosome x position) 0/1 matrix so that 1 = major allele."""
    derived_count = croms.sum(axis=0)
    # Strict inequality: on a tie the ancestral allele (0) counts as major.
    # Comparing counts directly also handles columns fixed for allele 1.
    derived_is_major = derived_count * 2 > croms.shape[0]
    return np.where(derived_is_major, croms, 1 - croms).astype(int)


def image_simulation(path1, path2, S, N, file_name, NCHROMS, threshold,
                     apply_threshold, sort, maj_min):
    """Generate one black/white image per iteration of a simulation file.

    Reads a plain-text or gzip-compressed simulation file, finds every
    iteration (introduced by a ``//`` line), builds the chromosome x
    position 0/1 matrix, optionally filters, recodes and sorts it, saves it
    as a ``.bmp`` image and computes summary statistics for the iteration.

    Keyword Arguments:
        path1 (string) -- Prefix prepended to ``file_name`` to locate the
            simulation file (plain string concatenation)
        path2 (string) -- Prefix prepended to the image file name
        S (float) -- Selection coefficient; only recorded in the statistics
        N (int) -- N parameter of the simulation; multiplies the relative
            positions and is recorded in the statistics
        file_name (string) -- Simulation file name, ending in ``.txt`` or
            ``.gz``
        NCHROMS (int) -- Number of chromosome rows to read per iteration
        threshold (float) -- Minor-allele-frequency cutoff; positions whose
            folded frequency is <= threshold are handed to
            ``delete_simulation``
        apply_threshold (Boolean) -- Whether or not to apply the threshold
        sort (int) -- 2: order rows, 3: order columns, 4: rows then columns
            (all via ``order_data``); any other value leaves the order as is
        maj_min (Boolean) -- True: 1 encodes the major allele; False: the
            matrix is inverted so the ancestral allele (0 in the file)
            becomes 1

    Returns:
        simulation_error (list) -- Line index of the first chromosome row of
            every iteration that raised an error
        statistics_list (list) -- One dictionary of summary statistics per
            successfully processed iteration
        dim (list) -- First element of the image size of every saved image

    Raises:
        ValueError -- if ``file_name`` ends in neither ``.txt`` nor ``.gz``
    """
    # NOTE(review): the nSL scores are stored in a module-level ``nsl`` name,
    # as before; this looks accidental -- confirm nothing reads it before
    # turning it into a local.
    global nsl

    simulation_error = []
    statistics_list = []
    dim = []

    lines = _read_simulation_lines(path1 + file_name)

    # The chromosome rows start three lines after each '//' marker; the line
    # just before them holds the positions.
    find = [i + 3 for i, line in enumerate(lines) if line == '//\n']

    for ITER, pointer in enumerate(find):
        try:
            # ---- chromosome matrix ----
            # The first token of the positions line is a label; drop it.
            position_it = np.array(lines[pointer - 1].split()[1:],
                                   dtype='float') * N
            crom_array = np.vstack([
                np.array(list(lines[pointer + j].rstrip('\n')), dtype=int)
                for j in range(NCHROMS)
            ])
            # crom_array stays untouched for the statistics; croms is the
            # working copy that becomes the image.
            croms = crom_array.copy()
            n_pos = np.size(croms, 1)

            # ---- minor-allele-frequency threshold ----
            if apply_threshold:
                freq = croms.sum(axis=0, dtype=float) / float(NCHROMS)
                # Fold the derived-allele frequency into the minor one.
                freq = np.where(freq > 0.5, 1 - freq, freq)
                positions = np.where(freq <= threshold)
                croms, n_pos, freq = delete_simulation(
                    n_pos, croms, freq, positions)

            # ---- colouring (image value 1 is white, 0 is black) ----
            if maj_min:
                croms = _colour_by_major_allele(croms)
            else:
                # Invert so ancestral (0 in the file) is white, derived black.
                croms = np.ones((NCHROMS, n_pos)) - croms

            # ---- row / column ordering ----
            if sort in (2, 4):
                croms = order_data(croms)
            if sort in (3, 4):
                croms = order_data(croms.transpose()).transpose()

            # ---- image generation ----
            bw_croms_im = Image.fromarray(np.uint8(croms) * 255, mode='L')
            dim.append(bw_croms_im.size[0])
            bw_croms_im.save(
                path2 + file_name + "_" + str(ITER+1) + str(maj_min)
                + str(sort) + ".bmp")

            # ---- summary statistics ----
            # NOTE(review): computed on the unfiltered, un-recoded matrix, so
            # they ignore the threshold and major/minor recoding above.
            freq_crom = crom_array.sum(axis=0) / NCHROMS
            # Keep only positions whose derived frequency is at least 0.50.
            mask_1 = ~(freq_crom < 0.50)

            h = allel.HaplotypeArray(np.transpose(crom_array))
            ac = h.count_alleles()
            TjD = allel.stats.tajima_d(ac)
            theta_hat_w = allel.stats.watterson_theta(position_it, ac)

            nsl = allel.nsl(h)[mask_1]
            # NOTE(review): np.max propagates NaN; if the scores can contain
            # NaN, np.nanmax may be what is wanted -- confirm.
            nsl_max = np.max(nsl) if np.size(nsl) else 0

            statistics_list.append({
                'simulation_file': file_name,
                'Selection coefficient': str(S),
                'Population size': str(N),
                'Iteration': str(ITER+1),
                'Tajimas D': TjD,
                'Watterson': theta_hat_w,
                'nsl': nsl_max,
            })
        except Exception:
            # Best effort: record the failing iteration and carry on, but let
            # KeyboardInterrupt / SystemExit propagate.
            simulation_error.append(pointer)

    return (simulation_error, statistics_list, dim)
def statistics(S, N, file_name, NCHROMS,
               results_dir="/home/lucrezialorenzon/Simulations/Results_decompressed/",
               csv_path="/home/lucrezialorenzon/Simulations/summarystatistics.csv",
               region_length=100000):
    """Append per-iteration summary statistics of a simulation file to a CSV.

    For every iteration (introduced by a ``//`` line) the chromosome matrix
    is rebuilt and Tajima's D, Watterson's theta and the maximum nSL score
    over positions with derived-allele frequency in [0.70, 0.90] are written
    as one CSV row. The header is written once per process, tracked through
    the module-level ``once`` flag.

    Args:
        S: selection coefficient; only recorded in the CSV.
        N: population size; only recorded in the CSV.
        file_name (str): name of the simulation file inside ``results_dir``.
        NCHROMS (int): number of chromosome rows to read per iteration.
        results_dir (str): prefix prepended to ``file_name`` (plain string
            concatenation).
        csv_path (str): CSV file the rows are appended to.
        region_length: factor that scales the relative positions; the
            simulated region has 100000 positions by default.

    Returns:
        None. The results are written to ``csv_path``.
    """
    global once
    # NOTE(review): the nSL scores are stored in a module-level ``nsl`` name,
    # as before; this looks accidental -- confirm nothing reads it.
    global nsl

    # Read the whole file up front and close the handle right away.
    with open(results_dir + file_name) as handle:
        lines = handle.readlines()

    # The chromosome rows start three lines after each '//' marker.
    find = [i + 3 for i, line in enumerate(lines) if line == '//\n']

    header = ['Selection coefficient', 'Population size', 'Iteration',
              'Tajimas D', 'Watterson', 'nsl']

    for ITER, pointer in enumerate(find):
        # The first token of the positions line is a label; drop it.
        pos = np.array(lines[pointer - 1].split()[1:],
                       dtype='float') * region_length

        # croms is the full chromosome x position matrix.
        croms = np.vstack([
            np.array(list(lines[pointer + j].rstrip('\n')), dtype=int)
            for j in range(NCHROMS)
        ])

        # Keep positions whose derived-allele frequency lies in [0.70, 0.90].
        freq = croms.sum(axis=0) / NCHROMS
        keep = ~(freq < 0.70) & ~(freq > 0.90)

        # Summary statistics.
        h = allel.HaplotypeArray(np.transpose(croms))
        ac = h.count_alleles()
        TjD = allel.stats.tajima_d(ac)
        theta_hat_w = allel.stats.watterson_theta(pos, ac)

        nsl = allel.nsl(h)[keep]
        # NOTE(review): np.max propagates NaN; if the scores can contain NaN,
        # np.nanmax may be what is wanted -- confirm.
        nsl_max = np.max(nsl) if np.size(nsl) else 0

        # newline='' lets the csv module control line endings itself.
        with open(csv_path, 'a+', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=header)
            if once == 0:
                writer.writeheader()
            writer.writerow({
                'Selection coefficient': str(S),
                'Population size': str(N),
                'Iteration': str(ITER+1),
                'Tajimas D': TjD,
                'Watterson': theta_hat_w,
                'nsl': nsl_max,
            })
            once = 1