def read_dist_out(self, SNP_Z):
    """Collect weighted SNPs per gene from the paired-line distance file.

    The file named by ``P.sed_file`` is consumed two lines at a time.
    Every line is split on whitespace and the following fields are used:
    index 1 (integer record type), index 2 (SNP identifier), index 3
    (gene identifier) and index 5 (integer handed to ``U.weight_f``).

    Args:
        SNP_Z: Container of SNP identifiers; only SNPs found in it are
            appended to the result.

    Returns:
        dict mapping a gene identifier to a list of
        ``(SNP identifier, U.weight_f(int(field 5)))`` tuples.  A gene is
        registered (possibly with an empty list) as soon as one of its
        lines is selected, even if the SNP itself is filtered out.

    Raises:
        IndexError: if a line has too few fields, which includes a
            dangling last line without its partner.
        ValueError: if field 1 or field 5 is not an integer.
    """
    file = P.sed_file
    gene_SNP_out_w = dict()

    def record(fields):
        # Register the gene first so that it appears in the result even
        # when its SNP is not part of SNP_Z.
        weighted = gene_SNP_out_w.setdefault(fields[3], [])
        if fields[2] in SNP_Z:
            weighted.append((fields[2], U.weight_f(int(fields[5]))))

    # The context manager closes the file even when a malformed line
    # raises; the previous bare open() never closed it.
    with open(file) as in_:
        for first in in_:
            sln1 = first.split()
            # Partner line of the pair; '' at end of file gives an empty
            # field list, which fails below exactly like a short line.
            sln2 = next(in_, '').split()
            type1 = int(sln1[1])
            if type1 in {1, 2}:
                record(sln1)
                record(sln2)
            # NOTE(review): the three type checks are treated as
            # independent, as reconstructed from the collapsed source.
            # With a first line of type 1/2 and a second line of type 3
            # the second line is recorded twice -- confirm this is intended.
            if type1 == 3:
                record(sln1)
            if int(sln2[1]) == 3:
                record(sln2)
    print('Distances SNP to exons, SNP outside exons and inside genes, from',
          file)
    return gene_SNP_out_w
def create_seq(self):
    """Build the merged, sorted position lists and the SNP-to-gene map.

    Sets three attributes:
      * ``self.seq_pos``: result of ``U.read_exon()``, with the SNP
        entries of the same index appended and each list sorted in place.
      * ``self.SNP_pos``: first element of the pair returned by
        ``U.read_SNP()``.
      * ``self.SNP_io``: result of ``self.SNP_in_gene()``.

    The 24 indices are presumably chromosomes -- TODO confirm.
    """
    self.seq_pos = U.read_exon()
    self.SNP_pos, _ = U.read_SNP()
    # A plain loop instead of a list comprehension: sort() is called only
    # for its side effect, so there is no result list worth building.
    for c in range(24):
        self.seq_pos[c].extend(self.SNP_pos[c])
        self.seq_pos[c].sort()
    self.SNP_io = self.SNP_in_gene()
def fit_distance(self):
    """Fit ``U.gamma_cdf`` to the cumulative SNP distances and plot it.

    The distances come from ``self.read_distance()``; their cumulative
    curve is computed by ``U.cumul_number``, fitted with ``U.curve_fit``
    and drawn by ``U.gamma_draw_fit`` with a title naming the data source.
    """
    print('Distance Between SNP, all SNP')
    distances = self.read_distance()
    xs, ys = U.cumul_number(distances)
    fitted = U.curve_fit(xs, ys, U.gamma_cdf)
    plot_title = 'Fit distance between SNP, ' + P.datas[P.data_src]
    U.gamma_draw_fit(xs, ys, U.gamma_cdf, fitted, title=plot_title)
def fit_lin_log(self, inf_pv, sup_pv):
    """Fit a line to log10(cumulative number) against log10(p-value).

    ``self.SNP_pv`` is sorted in place, turned into a cumulative curve by
    ``U.cumul_number`` and restricted to the points whose p-value lies
    strictly between ``inf_pv`` and ``sup_pv``.  A linear regression is
    computed on the log10 of both axes, plotted in a new figure and its
    statistics are printed.

    Args:
        inf_pv: Exclusive lower bound of the p-values kept for the fit.
        sup_pv: Exclusive upper bound of the p-values kept for the fit.

    Raises:
        IndexError: if no point is above ``inf_pv`` or none is below
            ``sup_pv``.
    """
    print(
        '\nStep 2- Compute correlation coefficient between log of SNP p-values'
    )
    # Report the bounds that are actually applied below (the arguments),
    # not the module-level defaults, so the log cannot disagree with the fit.
    print('Fixed limits of p-values', inf_pv, sup_pv)
    self.SNP_pv.sort()
    cx, cy = U.cumul_number(self.SNP_pv)
    # The first point of the cumulative curve is discarded
    # (presumably to keep log10 finite -- TODO confirm).
    cx, cy = cx[1:], cy[1:]
    i_min = np.where(cx > inf_pv)[0][0]
    i_max = np.where(cx < sup_pv)[0][-1]
    # i_max is the last index that satisfies the upper bound and is the
    # value reported as the closed upper limit, so it must be part of the
    # fitted window: the slice end is therefore i_max + 1.
    window = slice(i_min, i_max + 1)
    lx = np.log10(cx[window])
    ly = np.log10(cy[window])
    plt.figure()
    plt.title(
        P.data + ' linear adjustment of log p-value and log cumulative number')
    plt.xticks(ticks=[], labels=[])
    plt.yticks(ticks=[], labels=[])
    plt.plot(lx, ly)
    slope, intercept, r_value, p_value, std_err = stats.linregress(lx, ly)
    plt.plot(lx, lx * slope + intercept,
             label='R=' + '{:.4f}'.format(r_value))
    plt.legend()
    print('number of p-values', len(lx), 'out of a total of',
          len(self.SNP_pv))
    print('Real limits of p_values: [', cx[i_min], ', ', cx[i_max], ']',
          sep='')
    print('r_value:', r_value)
    print('p_value:', p_value)
    print('std_err:', std_err)
def outliers(self):
    """Print quantiles of the SNP p-values and plot their histogram.

    Prints the step-1 instructions, then one ``quantile<TAB>value`` row
    per fixed quantile level computed with ``np.quantile`` over
    ``self.SNP_pv``, and finally draws a 1000-bin histogram through
    ``U.plot_histo``.
    """
    pvalues = self.SNP_pv
    levels = (
        0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2,
        0.8, 0.85, 0.9, 0.95, 0.99, 0.995, 0.999,
    )
    bounds = ('[', P.inf_pv, ', ', P.sup_pv, ']')

    print(
        'Step 1- Choice of fork of p-values to eliminate outlier p-values')
    print('Quantiles are computed for p-values in', *bounds)
    print(
        'Once choosen, update the parameter to compute correlation coefficient at step 2'
    )
    print(
        'Warning: Python cannot compute the Z-score for p-value < 5.552e-17'
    )

    print('quantile\tvalue')
    for level in levels:
        value = np.quantile(pvalues, level)
        print(level, value, sep='\t')

    histo_title = P.data + ' Histogram of p-values'
    U.plot_histo(pvalues, 1000, title=histo_title)
def SNP_in_gene(self):
    """Map every SNP to the set of genes that are open at its position.

    For each of the 24 indices (presumably chromosomes -- TODO confirm)
    the entries of ``U.read_gene()`` are merged with ``self.SNP_pos``,
    sorted, and swept in order.  Each entry ``sq`` is interpreted through
    ``sq[1]``:
      * ``1``: ``sq[2]`` is added to the set of open genes;
      * ``2``: ``sq[2]`` is removed from that set;
      * anything else: the entry is a SNP and ``sq[2]`` is mapped to a
        copy of the current set.

    Returns:
        list of 24 dicts, one per index, mapping a SNP identifier to the
        set of gene identifiers open when the SNP was reached.

    Raises:
        KeyError: if a type-2 entry closes a gene that is not open.
    """
    seq_pos = U.read_gene()
    SNP_io = [dict() for _ in range(24)]
    # A plain loop instead of a list comprehension: sort() is called only
    # for its side effect, so there is no result list worth building.
    for c in range(24):
        seq_pos[c].extend(self.SNP_pos[c])
        seq_pos[c].sort()
    for c in range(24):
        in_gene = set()
        for sq in seq_pos[c]:
            if sq[1] == 1:
                in_gene.add(sq[2])
            elif sq[1] == 2:
                in_gene.remove(sq[2])
            else:
                # Snapshot the set: in_gene keeps changing during the sweep.
                SNP_io[c][sq[2]] = set(in_gene)
    return SNP_io
def SNP_compute_Z():
    """Write the Z-score of every SNP p-value to ``P.pvZ_file``.

    The p-values are the second element returned by ``U.read_SNP``.  For
    each SNP the score is ``stats.norm.ppf(1 - p)``.  Finite (and NaN)
    scores are written as ``snp<TAB>p<TAB>z`` lines; infinite scores are
    not written but printed and counted, and a summary of the counts is
    printed at the end.
    """
    # log = True to get details of excluded p-values
    log = False
    _, SNP_pv = U.read_SNP(log)
    inf_pos, inf_neg = 0, 0
    # The context manager guarantees the output file is flushed and closed
    # even if computing or writing a score raises.
    with open(P.pvZ_file, mode='w') as out:
        print('Infinity result of computing Z score')
        for snp, pv in SNP_pv.items():
            # NOTE(review): 1 - pv rounds to 1.0 for very small p-values,
            # which is what produces +inf here; stats.norm.isf(pv) is the
            # same quantity without that cancellation.  Left unchanged to
            # keep the output identical -- consider switching.
            z = stats.norm.ppf(1 - pv)
            if math.isinf(z):
                if z == math.inf:
                    inf_pos += 1
                else:
                    inf_neg += 1
                print(snp, z, pv, sep='\t')
            else:
                out.write('\t'.join((snp, str(pv), str(z))) + '\n')
    print('Z computed from p-values of SNP:', inf_pos, '+inf, ', inf_neg,
          '-inf /', len(SNP_pv))
def __init__(self):
    """Load the SNP p-values into ``self.SNP_pv`` as a list.

    The values come from the second element of the pair returned by
    ``U.read_SNP``; the first element is ignored.
    """
    # NOTE(review): `log` is not bound inside this method, so it has to
    # resolve to a name defined outside it -- confirm it exists.
    _unused, pv_by_snp = U.read_SNP(log=log)
    self.SNP_pv = [*pv_by_snp.values()]