def main_loop(init_param, X, K, iter=1000, tol=1e-6):
    """
    Gaussian Mixture Model fitted with EM.

    Arguments:
    - `init_param`: Dict with initial 'coff' (mixing coefficients),
      'mean' and 'cov' per component.
    - `X`: Input data (2D array, [[x11, x12, ..., x1D], ..., [xN1, ..., xND]]).
    - `K`: Number of clusters.
    - `iter`: Maximum number of EM iterations to run.
    - `tol`: Convergence tolerance on the log likelihood.
    """
    # `sp` is the old SciPy alias for the NumPy namespace (`import numpy as sp`
    # is a drop-in), and `gauss_mixture_calculate(x, mu, sigma)` is assumed to
    # evaluate a single Gaussian density.
    X = sp.asarray(X)
    N, D = X.shape
    pi = sp.asarray(init_param["coff"])
    mu = sp.asarray(init_param["mean"])
    sigma = sp.asarray(init_param["cov"])
    L = sp.inf
    for i in range(iter):
        # E-step: responsibilities gamma[n, k] ~ pi_k * N(x_n | mu_k, sigma_k)
        gamma = sp.apply_along_axis(
            lambda x: sp.fromiter(
                (pi[k] * gauss_mixture_calculate(x, mu[k], sigma[k])
                 for k in range(K)),
                dtype=float),
            1, X)
        gamma /= sp.sum(gamma, 1)[:, sp.newaxis]
        # M-step: re-estimate weights, means and covariances
        Nk = sp.sum(gamma, 0)
        mu = sp.sum(X * gamma.T[..., sp.newaxis], 1) / Nk[..., sp.newaxis]
        xmu = X[:, sp.newaxis, :] - mu
        sigma = sp.sum(
            gamma[..., sp.newaxis, sp.newaxis]
            * xmu[:, :, sp.newaxis, :]
            * xmu[:, :, :, sp.newaxis], 0) / Nk[..., sp.newaxis, sp.newaxis]
        pi = Nk / N
        # Log likelihood (base-2 log; the convergence test only needs a
        # monotone transform of the likelihood)
        Lnew = sp.sum(sp.log2(sp.sum(
            sp.apply_along_axis(
                lambda x: sp.fromiter(
                    (pi[k] * gauss_mixture_calculate(x, mu[k], sigma[k])
                     for k in range(K)),
                    dtype=float),
                1, X),
            1)))
        if abs(L - Lnew) < tol:
            break
        L = Lnew
        print("log likelihood=%s" % L)
    return dict(pi=pi, mu=mu, sigma=sigma, gamma=gamma)
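# A minimal usage sketch for main_loop above. Everything here is an
# assumption for illustration: `sp` is bound to numpy (a drop-in for the old
# `import scipy as sp` these snippets rely on), and the helper
# `gauss_mixture_calculate`, which the snippet leaves undefined, is stubbed
# with scipy.stats.multivariate_normal.
import numpy as sp
from scipy.stats import multivariate_normal

def gauss_mixture_calculate(x, mu, sigma):
    # hypothetical stand-in for the undefined component-density helper
    return multivariate_normal.pdf(x, mean=mu, cov=sigma)

X = sp.random.randn(200, 2)
X[:100] += 3.0  # two loose clusters
init_param = {
    "coff": [0.5, 0.5],                # mixing coefficients pi_k
    "mean": [[0.0, 0.0], [3.0, 3.0]],  # component means
    "cov": [sp.eye(2), sp.eye(2)],     # component covariances
}
result = main_loop(init_param, X, K=2)
print(result["pi"], result["mu"])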
def meddis_compute(samples):
    # Meddis inner-hair-cell model. The reservoir state (init, kt, spont, c,
    # q, w, zeroVector) and the constants (A, B, g, y, l, r, x, M, h, gdt,
    # ydt, ldt, rdt, xdt, substractSpont) come from the enclosing scope;
    # one-element lists are used as mutable cells so the closures can
    # update them.
    nchannels = samples.shape[1]
    if init[0]:
        # Steady-state (spontaneous) values used to initialize the reservoirs.
        kt[0] = g * A / (A + B)
        spont[0] = M * y * kt[0] / (l * kt[0] + y * (l + r))
        c[0] = spont[0] * scipy.ones(nchannels)
        q[0] = c[0] * (l + r) / kt[0]
        w[0] = c[0] * r / x
        zeroVector[0] = scipy.zeros(nchannels)
        init[0] = False

    def meddis_iteration(row):
        # One time slice: update permeability and the three reservoirs.
        # The maximum() calls implement the "if (0 >" clipping test.
        limitedSt = scipy.maximum(row + A, 0.)
        kt[0] = gdt * limitedSt / (limitedSt + B)
        replenish = scipy.maximum(ydt * (M - q[0]), zeroVector[0])
        eject = kt[0] * q[0]
        loss = ldt * c[0]
        reuptake = rdt * c[0]
        reprocess = xdt * w[0]
        q[0] += replenish - eject + reprocess
        c[0] += eject - loss - reuptake
        w[0] += reuptake - reprocess
        out = h * c[0]
        if substractSpont:
            out = scipy.maximum(0., out - spont[0])
        return out

    # Now iterate through each time slice of the data.
    return scipy.apply_along_axis(meddis_iteration, 1, samples)
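# The Meddis snippet above keeps its filter state in one-element lists
# (init, kt, q, c, w, ...). That is the classic workaround for closures that
# need to update enclosing-scope state in Python 2, which lacks `nonlocal`:
# a nested function cannot rebind an outer name, but it can mutate a list it
# closes over. A minimal sketch of the pattern:
def make_counter():
    count = [0]  # mutable cell shared with the closure

    def bump():
        count[0] += 1  # mutate the cell instead of rebinding the name
        return count[0]

    return bump

bump = make_counter()
assert bump() == 1 and bump() == 2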
def append_spectral_features(df, path_to_tradb_file, **kwargs):
    """
    :param df: AE hits & features dataframe, arbitrarily filtered.
    :param path_to_tradb_file: str path to waveforms data (.tradb file)
    :param kwargs: unused
    :return: AE hits dataframe with appended columns
        ['Ef_95_150', 'Ef_150_250', 'Ef_250_350', 'Ef_350_500', 'Ef_500_850']
    """
    # read_tradb, Bar (progress), fft and sci (scipy) come from the
    # enclosing module.
    df['TRAI'] = df['TRAI'].astype(object)
    trai = sorted(df.loc[df['TRAI'] > 0, 'TRAI'].dropna().astype(int).tolist())
    ae_waveforms = read_tradb(path_to_tradb_file, trai=trai)
    # Frequency axis of the one-sided spectrum, in kHz, and the bin indices
    # bounding each energy band.
    f_khz = np.linspace(0, 1000, 1025)
    locs = [np.argmax(f_khz >= 95), np.argmax(f_khz >= 150),
            np.argmax(f_khz >= 250), np.argmax(f_khz >= 350),
            np.argmax(f_khz >= 500), np.argmin(f_khz < 850)]
    progress_bar = Bar('Processing', max=len(trai))
    # Timing notes (1000 of 455677 hits):
    #   4 threads: 1:25 min | 3 threads: 1:24 min | 2 threads: 1:22 min
    #   linear, one row at a time: 1:14 min
    #   linear, python lists only: 0:00.155 min
    #   2 threads, python lists only: 0:00.23 min
    # Compute FFT magnitudes and per-band energies
    spectra_list = []
    for tr in trai:
        fft_result = fft.rfft(ae_waveforms[tr])
        fft_result = sci.apply_along_axis(lambda x: sci.absolute(x), 0,
                                          fft_result)
        spectra_list.append([sum(fft_result[locs[0]:locs[1] + 1]),
                             sum(fft_result[locs[1]:locs[2] + 1]),
                             sum(fft_result[locs[2]:locs[3] + 1]),
                             sum(fft_result[locs[3]:locs[4] + 1]),
                             sum(fft_result[locs[4]:locs[5] + 1])])
        progress_bar.next()
    progress_bar.finish()
    sf = pd.DataFrame(spectra_list, index=trai,
                      columns=['Ef_95_150', 'Ef_150_250', 'Ef_250_350',
                               'Ef_350_500', 'Ef_500_850'])
    # Temporarily re-index on TRAI so the spectral features line up per hit.
    oldindex = df.index.name
    df[oldindex] = df.index
    df.set_index('TRAI', drop=False, inplace=True)
    # reindex() keeps only df's rows (replacement for the removed pandas
    # `join_axes` keyword).
    df = pd.concat([df, sf], axis=1).reindex(df.index)
    df.set_index(oldindex, inplace=True)
    return df
def grad(self, x):
    """Evaluate the gradient at x."""
    if x.ndim == 1:
        x = x.reshape(1, x.size)

    def gradfx(x):
        return 2 * self.alpha * (x - self.center)

    ans = scipy.apply_along_axis(gradfx, axis=1, arr=x)
    # Equivalent vectorized form: gradfx = 2 * self.alpha * (x - self.center)
    return bound(ans)
def bound(vec, unitlen=1):
    # Row norms; vec is (N, D), norm is (N,).
    norm = scipy.sqrt((vec * vec).sum(axis=1))

    def normalize(col):
        return col / norm

    # Divide every column by the row norms, i.e. normalize each row.
    ans = scipy.apply_along_axis(normalize, axis=0, arr=vec)
    norm = norm.reshape(norm.size, 1)
    # Rows whose norm exceeds unitlen are rescaled to length unitlen
    # (boolean row mask so that all columns are updated).
    outliers = norm.ravel() > unitlen
    ans[outliers] = vec[outliers] / norm[outliers] * unitlen
    # Zero rows produced NaNs in the division above; reset them to zero.
    ans[norm.ravel() == 0] = 0
    return ans
def compute(nn_params):
    m = Y.shape[0]
    # Reshape nn_params back into the parameters theta_1 and theta_2
    theta_1 = nn_params[0:(hidden_layer_size * (input_layer_size + 1))] \
        .reshape([hidden_layer_size, input_layer_size + 1])
    theta_2 = nn_params[(hidden_layer_size * (input_layer_size + 1)):] \
        .reshape([num_labels, hidden_layer_size + 1])
    # Copies with zeroed bias columns, used for regularization only
    theta_1_reg = sp.copy(theta_1)
    theta_1_reg[:, 0] = 0
    theta_2_reg = sp.copy(theta_2)
    theta_2_reg[:, 0] = 0
    # Forward propagation
    f = forward_prop(X)(theta_1, theta_2)
    # Activations (with bias) and pre-activations per layer
    a = f['a']
    a_1 = a[0]
    a_2 = a[1]
    a_3 = a[2]
    z = f['z']
    z_2 = z[0]
    z_3 = z[1]
    # Transform Y into one-hot rows
    b = sp.matrix(
        sp.apply_along_axis(
            lambda n: sp.int_(sp.array(range(1, num_labels + 1)) == n),
            1, Y))
    DEL_1 = sp.matrix(sp.zeros((hidden_layer_size, input_layer_size + 1)))
    DEL_2 = sp.matrix(sp.zeros((num_labels, hidden_layer_size + 1)))
    # Back propagation, one training example at a time
    for i in range(0, m):
        del_3 = a_3[i, :].T - b[i, :].T
        del_2 = sp.multiply(theta_2[:, 1:].T * del_3,
                            sigmoid_gradient(z_2[i, :].T))
        DEL_2 = DEL_2 + del_3 * a_2[i, :]
        DEL_1 = DEL_1 + del_2 * a_1[i, :]
    # Regularize
    theta_1_grad = DEL_1 / m + (_lambda / m) * theta_1_reg
    theta_2_grad = DEL_2 / m + (_lambda / m) * theta_2_reg
    grad = sp.concatenate([sp.ravel(theta_1_grad), sp.ravel(theta_2_grad)])
    return grad
def sgrad(self, x, ndata=None):
    """Return a stochastic gradient at x: the partial derivative of one
    uniformly random coordinate, embedded in an otherwise-zero vector
    ([0 0 0 ... g_i ... 0 0 0])."""
    if x.ndim == 1:
        x = x.reshape(1, x.size)
    i = scipy.random.randint(0, x.size)

    def gradfx(row, i):
        return 2 * self.alpha[i] * (row[i] - self.center[i])

    # Positional arguments after `arr` are forwarded to the function.
    ans = scipy.apply_along_axis(gradfx, 1, x, i)
    grad = scipy.zeros_like(x)
    grad[0, i] = ans[0]  # keep only the sampled coordinate's partial
    return bound(grad)
def filterbank_compute(samples):
    # Gammatone-style filterbank: four cascaded second-order sections per
    # channel. The coefficient arrays (B0, B11, B12, B13, B14, B2, A0, A1,
    # A2, gain) and the filter-state array `zi` come from the enclosing scope.
    v = samples
    x = scipy.resize(v, (gain.shape[0], v.shape[0]))
    if zi.shape[0] != gain.shape[0]:
        zi.resize((gain.shape[0], 4, 2))

    def filt(x):
        # apply_along_axis passes no row index, so a one-element list is
        # used as a mutable counter to track the current channel.
        ch = row[0]
        coeffsB1 = scipy.array([B0[ch] / gain[ch],
                                B11[ch] / gain[ch],
                                B2[ch] / gain[ch]])
        a = scipy.array([A0[ch], A1[ch], A2[ch]])
        y1, zi[ch, 0, :] = scipy.signal.lfilter(coeffsB1, a, x,
                                                zi=zi[ch, 0, :])
        y2, zi[ch, 1, :] = scipy.signal.lfilter([B0[ch], B12[ch], B2[ch]],
                                                a, y1, zi=zi[ch, 1, :])
        y3, zi[ch, 2, :] = scipy.signal.lfilter([B0[ch], B13[ch], B2[ch]],
                                                a, y2, zi=zi[ch, 2, :])
        y4, zi[ch, 3, :] = scipy.signal.lfilter([B0[ch], B14[ch], B2[ch]],
                                                a, y3, zi=zi[ch, 3, :])
        row[0] += 1
        return y4

    row = [0]
    y = scipy.apply_along_axis(filt, 1, x)
    return y.T
def compute(nn_params):
    m = Y.shape[0]
    # Reshape nn_params back into the parameters theta_1 and theta_2
    theta_1 = nn_params[0:(hidden_layer_size * (input_layer_size + 1))] \
        .reshape([hidden_layer_size, input_layer_size + 1])
    theta_2 = nn_params[(hidden_layer_size * (input_layer_size + 1)):] \
        .reshape([num_labels, hidden_layer_size + 1])
    # Copies with zeroed bias columns, used for regularization only
    theta_1_reg = sp.copy(theta_1)
    theta_1_reg[:, 0] = 0
    theta_2_reg = sp.copy(theta_2)
    theta_2_reg[:, 0] = 0
    # Forward propagation
    f = forward_prop(X)(theta_1, theta_2)
    a = f['a']
    a_3 = a[2]
    # Transform Y into one-hot rows
    b = sp.matrix(
        sp.apply_along_axis(
            lambda n: sp.int_(sp.array(range(1, num_labels + 1)) == n),
            1, Y))
    # Accumulate the cross-entropy cost (1 / m is true division in Python 3)
    J = 0
    for i in range(0, m):
        J = J + (1 / m) * (-b[i, :] * sp.log(a_3[i, :].T) -
                           (1 - b[i, :]) * sp.log(1 - a_3[i, :].T))[0, 0]
    # Regularize (bias columns excluded via the *_reg copies)
    J = J + (_lambda / (2 * m)) * (sp.sum(sp.power(theta_1_reg, 2)) +
                                   sp.sum(sp.power(theta_2_reg, 2))).real
    return J
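# For reference, the quantity the loop in compute() accumulates is the
# regularized cross-entropy cost (h = a_3 is the network output; the *_reg
# copies exclude the bias columns from the penalty):
#
#   J = (1/m) * sum_{i,k} ( -y_ik*log(h_ik) - (1 - y_ik)*log(1 - h_ik) )
#       + (lambda/(2m)) * ( sum(theta_1[:,1:]**2) + sum(theta_2[:,1:]**2) )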
# (The statement producing `start` is truncated in the source; it draws on
#     scipy.arange(ruin, victory + 1, dtype=int)
# i.e. one starting value per column, broadcast over the walks.)
paths = scipy.zeros((n + 1, k, interval), dtype=int)
paths[1:n + 1, :, :] = totals
paths = paths + start

def match(a, b, nomatch=None):
    """Position of the first match of scalar `a` in python list `b`, or
    `nomatch` if `a` is not there. Modeled on the R function "match", but
    with less generality."""
    return b.index(a) if a in b else nomatch

hitVictory = scipy.apply_along_axis(
    lambda x: match(victory, x.tolist(), nomatch=n + 2), 0, paths)
hitRuin = scipy.apply_along_axis(
    lambda x: match(ruin, x.tolist(), nomatch=n + 2), 0, paths)
# If no ruin or victory on a walk, nomatch=n+2 sets the hitting time to be
# two more than the number of steps, one more than the column length.
probRuinBeforeVictory = scipy.mean(hitRuin < hitVictory, axis=0)
# note that the bools can be treated as binary data!
startValues = scipy.arange(ruin, victory + 1, dtype=int)
ruinFunction = scipy.polyfit(startValues, probRuinBeforeVictory, 1)
print("Ruin function Intercept:", ruinFunction[1])
print("Ruin function Slope:", ruinFunction[0])
# should return a slope near -1/(victory-ruin) and an intercept near 0.5
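# Hedged sketch of the setup the fragment above assumes (the names n, k,
# interval, ruin, victory, totals and start are the fragment's; the values
# are illustrative): k independent +/-1 random walks of n steps for each
# starting value between ruin and victory.
import numpy as scipy  # drop-in for the old scipy-as-numpy aliases used here
n, k = 300, 1000
ruin, victory = 0, 10
interval = victory - ruin + 1
coinFlips = scipy.random.random((n, k, interval))
totals = scipy.cumsum(scipy.where(coinFlips <= 0.5, 1, -1), axis=0)
start = scipy.arange(ruin, victory + 1, dtype=int)  # broadcasts over walks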
def calc_mcdonald_kreitman_stat(geno_species=['gsB', 'gsC'],
                                min_num_strains=30,
                                min_num_sub_pol=10,
                                gt_hdf5_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/snps.hdf5',
                                fig_dir='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/figures',
                                out_file='mk_stats_gsB_gsC.hdf5'):
    ni_stats = []
    pop = parse_pop_map()
    pop_map = pop.keys()
    ct_array = pop.values()
    codon_syn_map = get_codon_syn_map()
    h5f = h5py.File(gt_hdf5_file, 'r')
    ag = h5f['alignments']
    gene_groups = sorted(ag.keys())
    num_parsed_genes = 0
    dn_ds_ratio_dict = {}
    oh5f = h5py.File(out_file, 'w')
    for gg in gene_groups:
        g = ag[gg]

        # 0. Check if there is evidence for CNVs/paralogs
        seq_ids = g['strains']
        strains_list = sp.array([x.split('-')[0] for x in seq_ids])
        gs_list = sp.array([pop[strain]['genospecies'] for strain in strains_list])
        gs_filters = [sp.in1d(gs_list, [gs]) for gs in geno_species]
        common_filter = sp.zeros(len(gs_list), dtype='bool8')
        for i in range(len(geno_species)):
            common_filter += gs_filters[i]

        gs_strains_lists = [strains_list[gs_filter] for gs_filter in gs_filters]
        gs_strains = []
        has_paralogs = False
        for gs_strains_list in gs_strains_lists:
            gs_strains = sp.unique(gs_strains_list)
            has_paralogs = len(gs_strains) < len(gs_strains_list)
            if has_paralogs:
                break
        num_strains = []
        for gs_strains_list in gs_strains_lists:
            num_strains.append(len(gs_strains_list))
        num_strains = sp.array(num_strains)

        if has_paralogs:
            print('Evidence for paralogs/CNVs')
        elif sp.all(num_strains > min_num_strains):
            gs_strains = gs_strains_lists
            all_gs_strains = strains_list[common_filter]
            gs_list = sp.array([pop[strain]['genospecies'] for strain in all_gs_strains])
            gs_filters = [sp.in1d(gs_list, [gs]) for gs in geno_species]

            # 1. Filter rows with indels and missing data
            nt_mat = g['nsequences'][...]
            nt_mat = nt_mat[common_filter]
            no_gaps_no_missing = sp.all(nt_mat < 5, 0)
            nt_mat = sp.transpose(nt_mat)
            if sp.sum(no_gaps_no_missing) > 5:
                raw_snps = nt_mat[no_gaps_no_missing]
                print('Working on gene group: %s' % gg)

                # First calc within-genospecies pn/ps
                d = {}
                for i, gs in enumerate(geno_species):
                    gs_filter = gs_filters[i]
                    gs_raw_snps = raw_snps[:, gs_filter]
                    num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1,
                                                   nt_mat[:, gs_filter])
                    ok_num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1,
                                                      gs_raw_snps)
                    const_seq_filter = ok_num_vars == 1
                    good_snp_filter = ok_num_vars == 2
                    num_bin_snps = sp.sum(good_snp_filter)
                    if num_bin_snps > 5:
                        M, N = nt_mat.shape
                        non_gap_positions = sp.arange(M)[no_gaps_no_missing]

                        # 3. Identify good SNPs (dimorphic SNPs)
                        ok_snps = gs_raw_snps[good_snp_filter]
                        snp_positions = non_gap_positions[good_snp_filter]
                        assert len(ok_snps) == len(snp_positions), 'A bug detected!'

                        # 4. Call good SNPs
                        sequences = (g['sequences'][...])[common_filter]
                        good_snps_dict = call_good_snps(
                            sequences[0], ok_snps, snp_positions,
                            codon_syn_map=codon_syn_map,
                            ok_seq_filter=no_gaps_no_missing,
                            seq_num_vars=num_vars)
                        is_synonimous_snp = good_snps_dict['is_synonimous_snp']
                        num_syn_sites = good_snps_dict['num_syn_sites']
                        num_non_syn_sites = good_snps_dict['num_non_syn_sites']

                        # Calculate pn/ps ratios
                        num_syn_pol = sp.sum(is_synonimous_snp)
                        num_non_syn_pol = len(is_synonimous_snp) - num_syn_pol
                        if num_syn_pol > 0:
                            pn_ps_ratio = ((num_non_syn_pol / num_non_syn_sites) /
                                           (num_syn_pol / num_syn_sites))
                        else:
                            pn_ps_ratio = -1
                        d[gs] = {'pn_ps_ratio': pn_ps_ratio,
                                 'num_syn_pol': num_syn_pol,
                                 'num_non_syn_pol': num_non_syn_pol,
                                 'M': len(nt_mat),
                                 'const_seq_filter': const_seq_filter,
                                 'num_syn_sites': num_syn_sites,
                                 'num_non_syn_sites': num_non_syn_sites}
                    else:
                        d[gs] = {'pn_ps_ratio': -1,
                                 'num_syn_pol': 0,
                                 'num_non_syn_pol': 0,
                                 'M': len(nt_mat),
                                 'const_seq_filter': const_seq_filter,
                                 'num_syn_sites': 0,
                                 'num_non_syn_sites': 0}

                # Get the constrained seq filter for the two genospecies
                gs1 = geno_species[0]
                gs2 = geno_species[1]
                gs_pair = '%s_%s' % (gs1, gs2)
                const_seq_filter1 = d[gs1]['const_seq_filter']
                const_seq_filter2 = d[gs2]['const_seq_filter']
                constrained_seq_filter = const_seq_filter1 * const_seq_filter2

                # Filter the seq_num_var array to the two genospecies considered
                gs_filter = gs_filters[0] + gs_filters[1]
                num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1,
                                               nt_mat[:, gs_filter])
                constr_seq_len = sp.sum(constrained_seq_filter)
                if constr_seq_len > 5:
                    constr_seq = raw_snps[constrained_seq_filter]
                    constr_num_vars = sp.apply_along_axis(
                        lambda x: len(sp.unique(x)), 1, constr_seq)
                    constr_bin_snps_filter = constr_num_vars == 2
                    num_const_seq_bin_snps = sp.sum(constr_bin_snps_filter)
                    if num_const_seq_bin_snps > 5:
                        gs_specific_snps = constr_seq[constr_bin_snps_filter]

                        # Get positions for constrained SNPs
                        non_gap_positions = sp.arange(len(nt_mat))[no_gaps_no_missing]
                        constrained_positions = non_gap_positions[constrained_seq_filter]
                        constrained_snps_positions = constrained_positions[constr_bin_snps_filter]

                        # 4. Call good SNPs
                        good_snps_dict = call_good_snps(
                            g['sequences'][0], gs_specific_snps,
                            constrained_snps_positions,
                            codon_syn_map=codon_syn_map,
                            ok_seq_filter=no_gaps_no_missing,
                            seq_num_vars=num_vars)
                        is_synonimous_snp = good_snps_dict['is_synonimous_snp']
                        num_syn_sites = good_snps_dict['num_syn_sites']
                        num_non_syn_sites = good_snps_dict['num_non_syn_sites']

                        # Calculate dn/ds ratios
                        num_syn_subt = sp.sum(is_synonimous_snp)
                        num_non_syn_subt = len(is_synonimous_snp) - num_syn_subt
                        if num_syn_subt > 0:
                            dn_ds_ratio = ((num_non_syn_subt / num_non_syn_sites) /
                                           (num_syn_subt / num_syn_sites))
                        else:
                            dn_ds_ratio = -1
                        d[gs_pair] = {'dn_ds_ratio': dn_ds_ratio,
                                      'num_syn_subt': num_syn_subt,
                                      'num_non_syn_subt': num_non_syn_subt,
                                      'constr_seq_len': constr_seq_len,
                                      'num_const_seq_bin_snps': num_const_seq_bin_snps}
                    else:
                        print('No binary variants were found to be specific to '
                              'either genospecies within the gene.')
                        d[gs_pair] = {'dn_ds_ratio': -1,
                                      'num_syn_subt': 0,
                                      'num_non_syn_subt': 0,
                                      'constr_seq_len': constr_seq_len,
                                      'num_const_seq_bin_snps': num_const_seq_bin_snps}
                else:
                    print('No sequence was found to be constrained in both '
                          'genospecies within the gene.')
                    d[gs_pair] = {'dn_ds_ratio': -1,
                                  'num_syn_subt': 0,
                                  'num_non_syn_subt': 0,
                                  'constr_seq_len': constr_seq_len,
                                  'num_const_seq_bin_snps': 0}

                # Pool polymorphism counts across the two genospecies
                num_syn_pol = d[gs1]['num_syn_pol'] + d[gs2]['num_syn_pol']
                num_non_syn_pol = d[gs1]['num_non_syn_pol'] + d[gs2]['num_non_syn_pol']
                num_syn_pol_sites = d[gs1]['num_syn_sites'] + d[gs2]['num_syn_sites']
                num_non_syn_pol_sites = d[gs1]['num_non_syn_sites'] + d[gs2]['num_non_syn_sites']
                if num_syn_pol > 0:
                    pn_ps_ratio = ((num_non_syn_pol / num_non_syn_pol_sites) /
                                   (num_syn_pol / num_syn_pol_sites))
                else:
                    pn_ps_ratio = -1

                num_subt = d[gs_pair]['num_syn_subt'] + d[gs_pair]['num_non_syn_subt']
                num_pol = (d[gs1]['num_syn_pol'] + d[gs1]['num_non_syn_pol'] +
                           d[gs2]['num_syn_pol'] + d[gs2]['num_non_syn_pol'])

                # Now calculate the neutrality index (MK statistic)
                if d[gs_pair]['dn_ds_ratio'] > 0 and pn_ps_ratio >= 0:
                    ni_stat = float(pn_ps_ratio / float(d[gs_pair]['dn_ds_ratio']))
                    if num_subt > min_num_sub_pol and num_pol > min_num_sub_pol:
                        print('Found NI stat to be %0.3f' % ni_stat)
                        ni_stats.append(ni_stat)
                else:
                    ni_stat = -1
                mk_alpha = 1 - ni_stat
                d[gs_pair]['ni_stat'] = ni_stat
                d[gs_pair]['MK_alpha'] = mk_alpha
                d[gs_pair]['num_subt'] = num_subt
                d[gs_pair]['num_pol'] = num_pol
                dn_ds_ratio_dict[gg] = d

                o_gg = oh5f.create_group(gg)
                o_gg.create_dataset('ni_stat', data=ni_stat)
                o_gg.create_dataset('mk_alpha', data=mk_alpha)
                o_gg.create_dataset('num_subt', data=num_subt)
                o_gg.create_dataset('num_pol', data=num_pol)
                # Read the substitution counts from d so that genes that take
                # an early exit above still store well-defined values.
                o_gg.create_dataset('num_syn_subt', data=d[gs_pair]['num_syn_subt'])
                o_gg.create_dataset('num_non_syn_subt', data=d[gs_pair]['num_non_syn_subt'])
                o_gg.create_dataset('num_non_syn_pol', data=num_non_syn_pol)  # from both groups
                o_gg.create_dataset('num_syn_pol', data=num_syn_pol)  # from both groups
                o_gg.create_dataset('num_syn_pol_sites', data=num_syn_pol_sites)
                o_gg.create_dataset('num_non_syn_pol_sites', data=num_non_syn_pol_sites)
                o_gg.create_dataset('pn_ps_ratio1', data=d[gs1]['pn_ps_ratio'])
                o_gg.create_dataset('pn_ps_ratio2', data=d[gs2]['pn_ps_ratio'])
                o_gg.create_dataset('pn_ps_ratio', data=pn_ps_ratio)
                o_gg.create_dataset('dn_ds_ratio', data=d[gs_pair]['dn_ds_ratio'])
                num_parsed_genes += 1
        else:
            pass  # too few strains

    print('Parsed %d' % num_parsed_genes)
    oh5f.close()
    print('Number of NI stats: %d' % len(ni_stats))
    ni_stats = sp.array(ni_stats)
    ni_stats[ni_stats < 0.005] = 0.005
    log_nis = sp.log10(ni_stats)
    pylab.hist(log_nis, bins=100)
    pylab.xlabel(r'$\log_{10}(NI)$ (McDonald-Kreitman Neutrality Index)')
    pylab.savefig(fig_dir + '/MK_stats_%s_%s.png' % (geno_species[0], geno_species[1]))
    return dn_ds_ratio_dict, ni_stats
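# For reference, the McDonald-Kreitman quantities computed above:
#   pn_ps_ratio = (Pn / non-syn sites) / (Ps / syn sites)   # polymorphism
#   dn_ds_ratio = (Dn / non-syn sites) / (Ds / syn sites)   # divergence
#   ni_stat     = pn_ps_ratio / dn_ds_ratio                 # neutrality index
#   mk_alpha    = 1 - ni_stat      # proportion of adaptive substitutions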
def flawed_mcUnit(func, numPoints, dims):
    # Deliberately flawed sampler: (2 - .05) * u - 0.95 maps U[0, 1) onto
    # [-0.95, 1.0), not onto the intended [-1, 1).
    points = sp.rand(numPoints, dims)
    points = (2 - .05) * points - 0.95
    total = sp.sum(sp.apply_along_axis(func, 1, points))
    return float(total) / numPoints
def mcUnit(func, numPoints, dims):
    # Monte Carlo sample mean of `func` over the cube [-1, 1)^dims.
    points = sp.rand(numPoints, dims)
    points = 2 * (points - .5)
    total = sp.sum(sp.apply_along_axis(func, 1, points))
    return float(total) / numPoints
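# Why mcUnit's transform matters: 2*(u - .5) maps U[0, 1) onto [-1, 1),
# while the flawed (2 - .05)*u - 0.95 maps onto [-0.95, 1.0), whose midpoint
# is 0.025 per coordinate, so odd integrands pick up a bias. A quick check,
# assuming `sp` carries the old scipy-style aliases these snippets use:
import numpy as sp
sp.rand = sp.random.rand  # restore the old sp.rand alias the snippets rely on

f = lambda p: p.sum()                # an odd integrand
print(mcUnit(f, 100000, 2))          # ~ 0.0
print(flawed_mcUnit(f, 100000, 2))   # ~ 0.05 (two dims * 0.025 bias each)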
fft_wave_l = fft(wave_l)  # Fourier transform of left wave
fft_wave_r = fft(wave_r)  # Fourier transform of right wave
sz_l = fft_wave_l.shape[0]
sz_r = fft_wave_r.shape[0]
assert sz_l == sz_r
# Cross-power Spectrum Phase (CSP) analysis; the buffer must be complex,
# otherwise the phase information is lost when assigning into it.
csp = sp.zeros(sz_l, dtype=complex)
for i in range(fft_wave_l.shape[0]):
    dividend = fft_wave_l[i] * fft_wave_r[i].conj()
    divisor = abs(fft_wave_l[i]) * abs(fft_wave_r[i])
    csp[i] = dividend / divisor
csp = ifft(csp)
a_csp = sp.apply_along_axis(lambda x: abs(x), 0, csp)
# Peak value scaled by the 16 kHz sample rate (treated as a time delay below)
max_val = a_csp.max(0) / 16000
# 34000: speed of sound wave (cm/s), 10: distance of microphone array (cm)
degree = sp.arccos(max_val * 34000 / 10)
# Find the noise direction: zero out the two largest peaks, then take the
# next maximum.
max_idx = a_csp.argmax(0)
a_csp[max_idx] = 0
max_idx = a_csp.argmax(0)
a_csp[max_idx] = 0
max_n = a_csp.max(0) / 16000  # noise time interval
max_n_idx = a_csp.argmax(0)
degree_n = sp.arccos(max_n * 34000 / 10)
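# The per-bin loop above is equivalent to this vectorized form, which
# avoids the Python-level loop entirely:
#   csp = (fft_wave_l * fft_wave_r.conj()) / (abs(fft_wave_l) * abs(fft_wave_r))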
def normalize():
    traject = sp.asarray(trajectory)
    mean = np.mean(traject, axis=0)
    # Subtract the per-column means from every row (normalization)
    return sp.apply_along_axis(lambda x: x - mean, 1, traject)
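# The apply_along_axis call in normalize() is equivalent to plain
# broadcasting, which avoids a Python call per row:
#   return traject - mean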
def fit(in_data, output, num):
    # Basically numpy's apply_along_axis for one part of the data: apply
    # `add` along the last axis of a 4-D chunk and put the result on the
    # queue, tagged with the chunk number for reassembly.
    axis = 3
    out_part = sp.apply_along_axis(add, axis, in_data)
    output.put((num, out_part))
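# Hedged sketch of how a worker like fit() is typically driven: split the
# array along its first axis, process each chunk in its own process, and
# reassemble by the `num` tag. The reducer `add` and the chunking are
# illustrative assumptions, not part of the original snippet.
import multiprocessing as mp
import numpy as sp

def add(v):
    return v.sum()

if __name__ == '__main__':
    data = sp.random.rand(4, 3, 2, 8)
    output = mp.Queue()
    chunks = sp.array_split(data, 2)
    procs = [mp.Process(target=fit, args=(chunk, output, num))
             for num, chunk in enumerate(chunks)]
    for p in procs:
        p.start()
    # Drain the queue before join() so the workers can flush and exit.
    parts = dict(output.get() for _ in procs)
    for p in procs:
        p.join()
    result = sp.concatenate([parts[num] for num in sorted(parts)])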
def showVariables(self, normalize=True):
    """Normalize the variables over their parameter ranges and report how
    well the population predicts the sign of each goal variable.

    Normalization should probably always be on; without it the plot below
    is not meaningful.
    """
    variables = scipy.zeros([len(self.goalGene.getVariableList()),
                             len(self.population)])
    for i in range(len(self.population)):
        variables[:, i] = self.population[i].getVariableList()
    # A variable only has to be predicted as positive or negative.
    correctlyPredicted = []
    bestCorrectlyPredicted = []
    pnz = []
    predictedPositive = []
    predictedNegative = []
    predictedZero = []
    trueValues = scipy.array(self.goalGene.getVariableList())
    if normalize:
        for i in self.analysableVariables:  # for each variable
            mi = self.goalGene.minParRangeList[i]
            ma = self.goalGene.maxParRangeList[i]
            if trueValues[i] > 0:  # variable is positive
                correctlyPredicted.append(sum(variables[i, :] > 0))
                bestCorrectlyPredicted.append(variables[i, 0] > 0)
                pnz.append(1)
            elif trueValues[i] < 0:  # variable is negative
                correctlyPredicted.append(sum(variables[i, :] < 0))
                bestCorrectlyPredicted.append(variables[i, 0] < 0)
                pnz.append(-1)
            else:  # variable == 0: counting either sign as correct would be
                   # arbitrary, so count none
                correctlyPredicted.append(0)
                bestCorrectlyPredicted.append(0)
                pnz.append(0)
            predictedNegative.append(sum(variables[i, :] < 0))
            predictedPositive.append(sum(variables[i, :] > 0))
            predictedZero.append(sum(variables[i, :] == 0))  # should be rare
            # Map the variable onto the range [0, 1].
            addToGetMinToZero = 0 - mi
            fullrange = ma - mi
            variables[i, :] += addToGetMinToZero
            variables[i, :] = variables[i, :] / fullrange
            trueValues[i] += addToGetMinToZero
            trueValues[i] = trueValues[i] / fullrange
    averages = scipy.apply_along_axis(scipy.average, 1,
                                      variables[self.analysableVariables])
    # First individual; the population is assumed sorted on fitness.
    bestValues = variables[self.analysableVariables, 0]
    correctlyPredicted = scipy.array(correctlyPredicted).astype(float)
    bestCorrectlyPredicted = scipy.array(bestCorrectlyPredicted).astype(float)
    correctlyPredictedPercentage = (
        (correctlyPredicted / len(self.population)) * 100).astype(int)
    correctlyPredictedPercentageGT50 = (
        sum(correctlyPredictedPercentage[correctlyPredictedPercentage > 50]) /
        len(correctlyPredictedPercentage)) * 100
    print('--------------------------------------------------------------------------')
    print('nr of individuals    : ' + str(len(self.population)))
    print('--------------------------------------------------------------------------')
    print('true variables       : ' + str(self.goalGene.getVariableList()))
    print('average variables    : ' + str(averages))
    print('best variables       : ' + str(bestValues))
    print('positive/negative/0  : ' + str(pnz))
    print('# predicted negative : ' + str(predictedNegative))
    print('# predicted positive : ' + str(predictedPositive))
    print('# predicted zero     : ' + str(predictedZero))
    print('correctly predicted #: ' + str(correctlyPredicted))
    print('correctly predicted %: ' + str(correctlyPredictedPercentage))
    print('best predicted       : ' + str(bestCorrectlyPredicted))
    print('--------------------------------------------------------------------------')
    print('average predicted >50% correct %: ' + str(correctlyPredictedPercentageGT50))
    # Note: the "best predicted %" also counts the zero variables.
    print('best predicted %     : ' +
          str((sum(bestCorrectlyPredicted) / len(bestCorrectlyPredicted)) * 100))
    x = self.analysableVariables
    p1 = pylab.plot(x, variables[self.analysableVariables], linewidth=0,
                    marker='.', color='#cccccc',
                    markeredgecolor='#cccccc')  # #cccccc = light gray
    pylab.xlim(-1, len(self.analysableVariables))
    p2 = pylab.plot(x, averages, color='yellow')
    p3 = pylab.plot(x, bestValues, color='red')
    p4 = pylab.plot(x, trueValues[self.analysableVariables], color='blue')
    pylab.legend((p1[0], p2[0], p3[0], p4[0]),
                 ('values', 'average', 'best predicted', 'actual'))
    print(self.goalGene.variableNames)
    pylab.xticks(self.analysableVariables,
                 self.goalGene.variableNames[self.analysableVariables],
                 rotation=45)
    pylab.ylabel('normalized values')
    pylab.title('variables')
    pylab.show()
def op_mulvec(self, uvec):
    # Stack the two vectors into a 2-row array and apply cfunc down each
    # column (axis 0), i.e. elementwise over (csq_i, u_i) pairs.
    return apply_along_axis(cfunc, 0, (self.csqvec, uvec))
paths[1:n + 1, :] = scipy.sqrt(Delta) * totals

def match(x, arry, nomatch=None):
    """Index just before the first entry of `arry` that is >= `x`, or
    `nomatch` if no entry qualifies. Loosely modeled on the R function
    "match", but with less generality."""
    if arry[scipy.where(arry >= x)].any():
        return scipy.where(arry >= x)[0][0] - 1
    else:
        return nomatch

hitIndex = scipy.apply_along_axis(lambda x: match(a, x, nomatch=n + 2),
                                  0, paths)
# If the path never reaches level a, nomatch=n+2 sets the hitting time to be
# two more than the number of steps, one more than the column length.
hittingTime = Delta * hitIndex
probHitlessTa = scipy.sum(hittingTime < time).astype('float') / k
probMax = scipy.sum(
    scipy.amax(paths[0:int(scipy.floor(time / Delta)) + 1, :], axis=0) >= a
).astype('float') / k
from scipy.stats import norm
theoreticalProb = 2 * (1 - norm.cdf(a / scipy.sqrt(time)))
print("Empirical probability Wiener process paths hit", a, "before", time,
      "is", probHitlessTa)
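# The theoretical value used above comes from the reflection principle for
# standard Brownian motion:
#   P( max_{s <= t} W_s >= a ) = 2 * P( W_t >= a ) = 2 * (1 - Phi(a / sqrt(t)))
# which is what `theoreticalProb` computes with t = time.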
def call_variants(gt_hdf5_file='/Users/PM/Dropbox/Cavassim_et_al_2019_Rhizobium_data/final_snps.hdf5',
                  out_file='/Users/PM/Dropbox/Cavassim_et_al_2019_Rhizobium_data/newsnps_100.hdf5',
                  min_num_strains=100):
    # blosum62_file='/project/NChain/faststorage/rhizobium/ld/blosum62.txt'
    """
    Generate a new set of SNPs to look at.

    For all nts:
        if it is a SNP, count # of variants.
        check AA changes
        quantify AA change severity
    """
    pop_map = parse_pop_map()
    print(pop_map)
    # blosum62_matrix, blosum62_dict = parse_blosum62(blosum62_file)
    codon_syn_map = get_codon_syn_map()
    h5f = h5py.File(gt_hdf5_file, 'r')
    ag = h5f['alignments']
    oh5f = h5py.File(out_file, 'w')
    gene_groups = sorted(ag.keys())
    num_parsed_genes = 0
    for gg in gene_groups:
        g = ag[gg]

        # 0. Check if there is evidence for CNVs/paralogs
        seq_ids = g['strains']
        strains_list = [x.split('-')[0] for x in seq_ids]
        strains, strain_counts = sp.unique(strains_list, return_counts=True)
        if len(strains) < len(strains_list):
            print('Evidence for paralogs/CNVs')
            print(strain_counts)
            print('%d strains have unique gene copies' % len(strains))
        elif len(seq_ids) >= min_num_strains:
            strains = [x[0:4] for x in seq_ids]

            # 1. Filter indel/bad rows
            nt_mat = g['nsequences'][...]
            num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 0, nt_mat)
            no_gaps_no_missing = sp.all(nt_mat < 5, 0)
            nt_mat = sp.transpose(nt_mat)
            bad_rows_filter = (num_vars < 5) * no_gaps_no_missing
            if sp.sum(bad_rows_filter) > 0:
                print('passed bad filter control')
                raw_snps = nt_mat[bad_rows_filter]

                # Calculate nucleotide diversity and ANI
                M, N = raw_snps.shape
                diversity = 0.0
                ani = 0.0
                for i in range(N - 1):
                    for j in range(i + 1, N):
                        diversity += sp.sum(raw_snps[:, i] != raw_snps[:, j])
                        ani += sp.sum(raw_snps[:, i] == raw_snps[:, j])
                diversity = diversity / len(raw_snps)
                diversity = 2 * diversity / (N * (N - 1.0))
                ani = ani / len(raw_snps)
                ani = 2 * ani / (N * (N - 1.0))

                # 2. Filter non-variable rows
                ok_num_vars = num_vars[bad_rows_filter]
                var_filter = ok_num_vars > 1
                num_raw_snps = sp.sum(var_filter)
                if num_raw_snps > 0:
                    print('Working on gene group: %s' % gg)
                    M, N = nt_mat.shape
                    non_gap_positions = sp.arange(M)[bad_rows_filter]
                    all_snps = raw_snps[var_filter]
                    all_snp_positions = non_gap_positions[var_filter]

                    # 3. Identify good SNPs (dimorphic SNPs)
                    good_snp_filter = ok_num_vars == 2
                    ok_snps = raw_snps[good_snp_filter]
                    snp_positions = non_gap_positions[good_snp_filter]
                    assert len(ok_snps) == len(snp_positions), 'A bug detected!'

                    # 4. Call good SNPs
                    good_snps_dict = call_good_snps(
                        g['sequences'][0], ok_snps, snp_positions,
                        codon_syn_map=codon_syn_map,
                        ok_seq_filter=no_gaps_no_missing,
                        seq_num_vars=num_vars)
                    snps = good_snps_dict['snps']
                    nts = good_snps_dict['nts']
                    codon_snps = good_snps_dict['codon_snps']
                    codon_snp_positions = good_snps_dict['codon_snp_positions']
                    codons = good_snps_dict['codons']
                    aacids = good_snps_dict['aacids']
                    is_synonimous_snp = good_snps_dict['is_synonimous_snp']
                    num_syn_sites = good_snps_dict['num_syn_sites']
                    num_non_syn_sites = good_snps_dict['num_non_syn_sites']

                    # Normalize SNPs to zero mean and unit variance
                    norm_snps = sp.transpose(snps)
                    freqs = sp.mean(norm_snps, 0)
                    norm_snps = (norm_snps - freqs) / sp.sqrt(freqs * (1 - freqs))
                    norm_snps = sp.transpose(norm_snps)

                    norm_codon_snps = sp.transpose(codon_snps)
                    codon_snp_freqs = sp.mean(norm_codon_snps, 0)
                    norm_codon_snps = ((norm_codon_snps - codon_snp_freqs) /
                                       sp.sqrt(codon_snp_freqs * (1 - codon_snp_freqs)))
                    norm_codon_snps = sp.transpose(norm_codon_snps)

                    # Calculate dn/ds ratios
                    num_syn_subt = sp.sum(is_synonimous_snp)
                    num_non_syn_subt = len(is_synonimous_snp) - num_syn_subt
                    if num_syn_subt > 0:
                        dn_ds_ratio = ((num_non_syn_subt / num_non_syn_sites) /
                                       (num_syn_subt / num_syn_sites))
                    else:
                        dn_ds_ratio = -1

                    # Calculate McDonald-Kreitman statistics..

                    # Store everything in the output HDF5 file
                    og = oh5f.create_group(gg)
                    og.create_dataset('num_vars', data=num_vars)
                    og.create_dataset('raw_snps',
                                      data=sp.array(all_snps, dtype='int8'),
                                      compression='lzf')
                    og.create_dataset('raw_snp_positions', data=all_snp_positions)
                    og.create_dataset('snps', data=sp.array(snps, dtype='int8'),
                                      compression='lzf')
                    og.create_dataset('norm_snps',
                                      data=sp.array(norm_snps, dtype='single'),
                                      compression='lzf')
                    og.create_dataset('freqs', data=sp.array(freqs, dtype='single'))
                    og.create_dataset('snp_positions', data=snp_positions)
                    og.create_dataset('codon_snps',
                                      data=sp.array(codon_snps, dtype='single'),
                                      compression='lzf')
                    og.create_dataset('norm_codon_snps',
                                      data=sp.array(norm_codon_snps, dtype='single'),
                                      compression='lzf')
                    og.create_dataset('codon_snp_freqs',
                                      data=sp.array(codon_snp_freqs, dtype='single'))
                    og.create_dataset('is_synonimous_snp', data=is_synonimous_snp)
                    og.create_dataset('strains', data=strains)
                    og.create_dataset('codon_snp_positions', data=codon_snp_positions)
                    # og.create_dataset('blosum62_scores', data=blosum62_scores)
                    og.create_dataset('aacids', data=sp.array(aacids))
                    og.create_dataset('nts', data=sp.array(nts))
                    og.create_dataset('codons', data=sp.array(codons))
                    og.create_dataset('num_syn_sites', data=num_syn_sites)
                    og.create_dataset('num_non_syn_sites', data=num_non_syn_sites)
                    og.create_dataset('dn_ds_ratio', data=dn_ds_ratio)
                    og.create_dataset('diversity', data=diversity)
                    og.create_dataset('ani', data=ani)
                    oh5f.flush()
                    num_parsed_genes += 1
        else:
            print('Too few strains..')
    print('Parsed %d' % num_parsed_genes)
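# For reference, the SNP normalization applied above is the usual
# standardization of a 0/1 genotype column with allele frequency p:
#   x_norm = (x - p) / sqrt(p * (1 - p))
# giving each SNP zero mean and unit variance before downstream analyses.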
def op_bvec(self, uvec):
    # Same column-wise application as op_mulvec, minus the Laplacian term.
    return apply_along_axis(bfunc, 0, (self.csqvec, uvec)) - (self.lapmat * uvec)