def main_loop(init_param, X, K, iter=1000, tol=1e-6):
    """
    Fit a Gaussian mixture model to data with the EM algorithm.
    Arguments:
    - `init_param`: dict of initial parameters with keys "coff" (mixing
      coefficients), "mean" (component means), and "cov" (component covariances).
    - `X`: Input data (2D array, [[x11, x12, ..., x1D], ..., [xN1, ... xND]]).
    - `K`: Number of clusters.
    - `iter`: Maximum number of EM iterations to run.
    - `tol`: Convergence tolerance on the log-likelihood.
    Returns a dict with the fitted `pi`, `mu`, `sigma` and responsibilities `gamma`.
    """
    X = sp.asarray(X)
    N, D = X.shape
    pi = sp.asarray(init_param["coff"])
    mu = sp.asarray(init_param["mean"])
    sigma = sp.asarray(init_param["cov"])

    L = sp.inf

    for i in xrange(iter):
        # E-step
        gamma = sp.apply_along_axis(
            lambda x: sp.fromiter(
                (pi[k] * gauss_mixture_calculate(x, mu[k], sigma[k]) for k in xrange(K)), dtype=float
            ),
            1,
            X,
        )
        gamma /= sp.sum(gamma, 1)[:, sp.newaxis]

        # M-step
        Nk = sp.sum(gamma, 0)
        mu = sp.sum(X * gamma.T[..., sp.newaxis], 1) / Nk[..., sp.newaxis]
        xmu = X[:, sp.newaxis, :] - mu
        sigma = (
            sp.sum(gamma[..., sp.newaxis, sp.newaxis] * xmu[:, :, sp.newaxis, :] * xmu[:, :, :, sp.newaxis], 0)
            / Nk[..., sp.newaxis, sp.newaxis]
        )
        pi = Nk / N

        # Likelihood
        Lnew = sp.sum(
            sp.log2(
                sp.sum(
                    sp.apply_along_axis(
                        lambda x: sp.fromiter(
                            (pi[k] * gauss_mixture_calculate(x, mu[k], sigma[k]) for k in xrange(K)), dtype=float
                        ),
                        1,
                        X,
                    ),
                    1,
                )
            )
        )
        if abs(L - Lnew) < tol:
            break
        L = Lnew
        print "log likelihood=%s" % L

    return dict(pi=pi, mu=mu, sigma=sigma, gamma=gamma)
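The helper gauss_mixture_calculate is not shown; a minimal sketch of that Gaussian density plus a hypothetical call, assuming the Python 2 / old-scipy environment the snippet targets:

import scipy as sp
from scipy import linalg

def gauss_mixture_calculate(x, mu, sigma):
    # Multivariate normal density N(x | mu, sigma); a minimal sketch.
    D = len(x)
    diff = x - mu
    expo = -0.5 * diff.dot(linalg.inv(sigma)).dot(diff)
    return sp.exp(expo) / sp.sqrt(((2 * sp.pi) ** D) * linalg.det(sigma))

X = sp.random.randn(100, 2)                      # toy data
init_param = {"coff": [0.5, 0.5],                # hypothetical K=2 initialization
              "mean": [[-1.0, 0.0], [1.0, 0.0]],
              "cov": [sp.eye(2), sp.eye(2)]}
result = main_loop(init_param, X, K=2)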
Example #3
def meddis_compute(samples):
    nchannels = samples.shape[1]
    
    if init[0]:
        kt[0] = g*A/(A+B)
        spont[0] = M*y*kt[0]/(l*kt[0]+y*(l+r))
        c[0] = spont[0] * scipy.ones(nchannels)
        q[0] = c[0]*(l+r)/kt[0]
        w[0] = c[0]*r/x
        zeroVector[0] = scipy.zeros(nchannels)
        init[0] = False
        
    def meddis_iteration(row):
        limitedSt = scipy.maximum(row + A, 0.)
        kt[0] = gdt * limitedSt / (limitedSt + B)
        replenish = scipy.maximum(ydt * (M-q[0]), zeroVector[0])
        eject = kt[0] * q[0]
        loss = ldt * c[0]
        reuptake = rdt * c[0]
        reprocess = xdt * w[0]
        
        q[0] += replenish - eject + reprocess
        c[0] += eject - loss - reuptake
        w[0] += reuptake - reprocess
        
        # The scipy.maximum calls above implement the "if (0 >" tests from
        # the original model; each call to this function handles one time
        # slice of the data.
        out = h * c[0]
        
        if substractSpont:
            out = scipy.maximum(0., out - spont[0])
            
        return out
    
    return scipy.apply_along_axis(meddis_iteration, 1, samples)
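The one-element lists (init, kt, spont, c, q, w, zeroVector) are mutable cells: Python 2 closures cannot rebind outer names, so state that meddis_iteration must carry between time slices lives in list slots. The same pattern in miniature:

import scipy

def make_accumulator():
    total = [0.0]  # one-element list: writable from the closure below
    def add_row(row):
        total[0] += row.sum()
        return total[0]
    return add_row

running = scipy.apply_along_axis(make_accumulator(), 1, scipy.ones((3, 4)))
# running == [4., 8., 12.]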
Example #5
def append_spectral_features(df, path_to_tradb_file, **kwargs):
    """
    :param df:                      AE hits & features dataframe, arbitrarily filtered.
    :param path_to_tradb_file:      str path to waveforms data (.tradb file)
    :param kwargs:                  unused
    :return:                        AE hits dataframe with appended columns
                                    ['Ef_95_150', 'Ef_150_250', 'Ef_250_350', 'Ef_350_500', 'Ef_500_850']
    """
    df['TRAI'] = df['TRAI'].astype(object)
    trai = sorted(df.loc[df['TRAI'] > 0, 'TRAI'].dropna().astype(int).tolist())

    ae_waveforms = read_tradb(path_to_tradb_file, trai=trai)
    f_khz = np.linspace(0, 1000, 1025)
    locs = [np.argmax(f_khz >= 95), np.argmax(f_khz >= 150), np.argmax(f_khz >= 250),
            np.argmax(f_khz >= 350), np.argmax(f_khz >= 500), np.argmax(f_khz >= 850)]

    progress_bar = Bar('Processing', max=len(trai))

    """
    4 threads:                          1000/455677 in 1:25 min
    3 threads:                          1000/455677 in 1:24 min
    2 threads:                          1000/455677 in 1:22 min
    linear process; one row a time:     1000/455677 in 1:14 min
    linear process; python lists only:  1000/455677 in 0:00.155 min
    2 threads; python lists only:       1000/455677 in 0:00.23 min
    """

    # Compute fft
    spectra_list = []
    for tr in trai:
        fft_result = fft.rfft(ae_waveforms[tr])
        fft_result = sci.apply_along_axis(lambda x: sci.absolute(x), 0, fft_result)
        spectra_list.append([sum(fft_result[locs[0]:locs[1] + 1]),
                            sum(fft_result[locs[1]:locs[2] + 1]),
                            sum(fft_result[locs[2]:locs[3] + 1]),
                            sum(fft_result[locs[3]:locs[4] + 1]),
                            sum(fft_result[locs[4]:locs[5] + 1])])
        progress_bar.next()
    progress_bar.finish()
    sf = pd.DataFrame(spectra_list,
                      index=trai, columns=['Ef_95_150', 'Ef_150_250', 'Ef_250_350', 'Ef_350_500', 'Ef_500_850'])

    # print sf.head(10)

    oldindex = df.index.name
    df[oldindex] = df.index
    df.set_index('TRAI', drop=False, inplace=True)
    # print len(df.index)
    df = pd.concat([df, sf], axis=1).reindex(df.index)  # join_axes was removed in newer pandas
    # print len(df.index)
    df.set_index(oldindex, inplace=True)
    # print df.dropna(subset=['TRAI']).head(10)

    return df
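The band edges above lean on np.argmax over a boolean array, which returns the index of the first True; a self-contained check of that trick on a stand-in waveform:

import numpy as np

f_khz = np.linspace(0, 1000, 1025)      # rfft bin frequencies for a 2048-sample frame
lo = np.argmax(f_khz >= 95)             # first bin at or above 95
hi = np.argmax(f_khz >= 150)            # first bin at or above 150
wave = np.random.randn(2048)            # stand-in waveform
spectrum = np.abs(np.fft.rfft(wave))    # 1025 bins, matching f_khz
band_energy = np.sum(spectrum[lo:hi + 1])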
Example #6
    def grad(self, x):
        """Evaluate the gradient at x."""
        if x.ndim == 1:
            x = x.reshape(1, x.size)

        def gradfx(x):
            return 2 * self.alpha * (x - self.center)

        ans = scipy.apply_along_axis(gradfx, axis=1, arr=x)
        # gradfx = 2 * self.alpha * ( x - self.center )
        return bound(ans)
Example #7
def bound(vec, unitlen=1):
    norm = scipy.sqrt((vec * vec).sum(axis=1))

    def normalize(col):
        return col / norm

    # Divide each column by the per-row norms.
    ans = scipy.apply_along_axis(normalize, axis=0, arr=vec)
    norm = norm.reshape(norm.size, 1)
    # Re-scale whole rows whose norm exceeds unitlen so they end up at unitlen.
    outliers = scipy.where(norm.ravel() > unitlen)[0]
    ans[outliers] = vec[outliers] / norm[outliers] * unitlen
    ans[norm.ravel() == 0] = 0
    return ans
Example #8
    def compute(nn_params):
        m = Y.shape[0]

        # Reshape nn_params back into the parameters theta_1 and theta_2
        theta_1 = nn_params[0:(hidden_layer_size*(input_layer_size+1))]. \
                    reshape([hidden_layer_size, input_layer_size+1])
        theta_2 = nn_params[(hidden_layer_size*(input_layer_size+1)):]. \
                    reshape([num_labels, hidden_layer_size+1])

        theta_1_reg = sp.copy(theta_1)
        theta_1_reg[:, 0] = 0
        theta_2_reg = sp.copy(theta_2)
        theta_2_reg[:, 0] = 0

        # Forward propagation
        f = forward_prop(X)(theta_1, theta_2)

        # Initialize variables for back propagation
        a = f['a']

        # Add bias
        a_1 = a[0]
        a_2 = a[1]
        a_3 = a[2]

        z = f['z']
        z_2 = z[0]
        z_3 = z[1]

        # Transform Y
        b = sp.matrix(
            sp.apply_along_axis(
                lambda n: sp.int_(sp.array(range(1, num_labels + 1)) == n), 1,
                Y))

        DEL_1 = sp.matrix(sp.zeros((hidden_layer_size, input_layer_size + 1)))
        DEL_2 = sp.matrix(sp.zeros((num_labels, hidden_layer_size + 1)))

        for i in range(0, m):
            del_3 = a_3[i, :].T - b[i, :].T
            del_2 = sp.multiply(theta_2[:, 1:].T * del_3,
                                sigmoid_gradient(z_2[i, :].T))

            DEL_2 = DEL_2 + del_3 * a_2[i, :]
            DEL_1 = DEL_1 + del_2 * a_1[i, :]

        # Regularize
        theta_1_grad = DEL_1 / m + (_lambda / m) * theta_1_reg
        theta_2_grad = DEL_2 / m + (_lambda / m) * theta_2_reg
        grad = sp.concatenate([sp.ravel(theta_1_grad), sp.ravel(theta_2_grad)])

        return grad
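A backpropagated gradient like the one returned above is conventionally verified against central finite differences; a generic sketch, assuming a scalar cost function J over a flattened float parameter vector:

import scipy as sp

def numerical_gradient(J, theta, eps=1e-4):
    # Central differences, one coordinate at a time: (J(t+e) - J(t-e)) / 2eps.
    grad = sp.zeros_like(theta)
    for i in range(theta.size):
        e = sp.zeros_like(theta)
        e[i] = eps
        grad[i] = (J(theta + e) - J(theta - e)) / (2 * eps)
    return grad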
Example #9
    def sgrad(self, x, ndata=None):
        """Return a stochastic gradient at x.
        Returns the gradient of a uniformly random summand."""
        ### Pick a random coordinate i and return a gradient vector of the
        ### form [0 0 0 ... g_i ... 0 0 0 0 0] etc.
        if x.ndim == 1:
            x = x.reshape(1, x.size)
        i = scipy.random.randint(0, x.size)

        def gradfx(row, i):
            return 2 * self.alpha[i] * (row[i] - self.center[i])

        # Evaluate the i-th partial derivative for each row, then scatter it
        # into an otherwise-zero gradient vector.
        ans = scipy.apply_along_axis(gradfx, 1, x, i)
        grad = scipy.zeros_like(x)
        grad[0][i] = ans[0]
        return bound(grad)
Example #10
def filterbank_compute(samples):
    v = samples
    x = scipy.resize(v, (gain.shape[0], v.shape[0]))
    
    if zi.shape[0] != gain.shape[0]:
        zi.resize((gain.shape[0], 4, 2))
        
    def filt(x):
        coeffsB1 = scipy.array([B0[row[0]] / gain[row[0]],
                                B11[row[0]]/ gain[row[0]],
                                B2[row[0]] / gain[row[0]]])

        a = scipy.array([A0[row[0]], A1[row[0]], A2[row[0]]])

        y1, zi[row[0],0,:] = scipy.signal.lfilter(coeffsB1,
                                                  a,
                                                  x, zi = zi[row[0],0,:])
        
        y2, zi[row[0],1,:] = scipy.signal.lfilter([B0[row[0]],
                                                   B12[row[0]],
                                                   B2[row[0]]],
                                                  a,
                                                  y1, zi = zi[row[0],1,:])
        
        y3, zi[row[0],2,:] = scipy.signal.lfilter([B0[row[0]],
                                                   B13[row[0]],
                                                   B2[row[0]]],
                                                  a,
                                                  y2, zi = zi[row[0],2,:])
        
        y4, zi[row[0],3,:] = scipy.signal.lfilter([B0[row[0]],
                                                   B14[row[0]],
                                                   B2[row[0]]],
                                                  a,
                                                  y3, zi = zi[row[0],3,:])
        row[0] += 1
        return y4

    # row is a one-element list used as a mutable channel counter shared
    # with filt (Python 2 closures cannot rebind outer names).
    row = [0]
    y = scipy.apply_along_axis(filt, 1, x)
    return y.T
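Each of the four stages above threads its filter state back in through lfilter's zi argument, which is what lets the cascade process the data row by row; the pattern in isolation, on a hypothetical first-order filter:

import numpy as np
import scipy.signal

b, a = scipy.signal.butter(1, 0.2)                  # any small IIR filter
zi = scipy.signal.lfilter_zi(b, a)
x = np.random.randn(100)
y1, zf = scipy.signal.lfilter(b, a, x[:50], zi=zi)
y2, _ = scipy.signal.lfilter(b, a, x[50:], zi=zf)   # continues where y1 left off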
Example #11
    def compute(nn_params):
        m = Y.shape[0]

        # Reshape nn_params back into the parameters theta_1 and theta_2
        theta_1 = nn_params[0:(hidden_layer_size*(input_layer_size+1))]. \
                    reshape([hidden_layer_size, input_layer_size+1])
        theta_2 = nn_params[(hidden_layer_size*(input_layer_size+1)):]. \
                    reshape([num_labels, hidden_layer_size+1])

        theta_1_reg = sp.copy(theta_1)
        theta_1_reg[:, 0] = 0
        theta_2_reg = sp.copy(theta_2)
        theta_2_reg[:, 0] = 0

        # Forward propagation
        f = forward_prop(X)(theta_1, theta_2)
        a = f['a']
        a_3 = a[2]

        # Transform Y
        b = sp.matrix(
            sp.apply_along_axis(
                lambda n: sp.int_(sp.array(range(1, num_labels + 1)) == n), 1,
                Y))

        J = 0

        for i in range(0, m):
            J = J + (1 / m) * (-b[i, :] * sp.log(a_3[i, :].T) -
                               (1 - b[i, :]) * sp.log(1 - a_3[i, :].T))[0, 0]

        # Regularize
        J = J + (_lambda / (2 * m)) * (sp.sum(sp.power(theta_1_reg, 2)) +
                                       sp.sum(sp.power(theta_2_reg, 2))).real

        return J
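The b matrix built above is a one-hot encoding of the label column; the same transform in isolation:

import scipy as sp

Y = sp.array([[1], [3], [2]])
num_labels = 3
b = sp.apply_along_axis(
    lambda n: sp.int_(sp.array(range(1, num_labels + 1)) == n), 1, Y)
# b -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]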
Example #12
                             scipy.arange(ruin, victory + 1, dtype=int))
paths = scipy.zeros((n + 1, k, interval), dtype=int)
paths[1:n + 1, :, :] = totals
paths = paths + start


def match(a, b, nomatch=None):
    return b.index(a) if a in b else nomatch


# arguments: a is a scalar, b is a python list, nomatch is a scalar
# returns the position of the first match of its first argument in its
# second argument; if a is not there, returns the value nomatch
# modeled on the R function "match", but with less generality

hitVictory = scipy.apply_along_axis(
    lambda x: (match(victory, x.tolist(), nomatch=n + 2)), 0, paths)
hitRuin = scipy.apply_along_axis(
    lambda x: match(ruin, x.tolist(), nomatch=n + 2), 0, paths)
# If no ruin or victory on a walk, nomatch=n+2 sets the hitting
# time to be two more than the number of steps, one more than
# the column length.

probRuinBeforeVictory = scipy.mean((hitRuin < hitVictory), axis=0)
# note that you can treat the bools as binary data!

startValues = scipy.arange(ruin, victory + 1, dtype=int)
ruinFunction = scipy.polyfit(startValues, probRuinBeforeVictory, 1)
print("Ruin function Intercept:", ruinFunction[1])
print("Ruin function Slope:", ruinFunction[0])
# should return a slope near -1/(victory-ruin) and an intercept near 0.5
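A quick check of match on a plain list:

print match(3, [5, 3, 8])              # -> 1
print match(7, [5, 3, 8], nomatch=-1)  # -> -1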
Example #13
def calc_mcdonald_kreitman_stat(geno_species=['gsB', 'gsC'], min_num_strains=30, min_num_sub_pol=10,
                                gt_hdf5_file='C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/snps.hdf5',
                                fig_dir = 'C:/Users/MariaIzabel/Desktop/MASTER/PHD/Bjarnicode/figures',
                                out_file = 'mk_stats_gsB_gsC.hdf5'):

    ni_stats = []
    pop = parse_pop_map()
    pop_map = pop.keys()
    ct_array = pop.values()
    codon_syn_map = get_codon_syn_map()
    h5f = h5py.File(gt_hdf5_file, 'r')
    ag = h5f['alignments']
    gene_groups = sorted(ag.keys())
    num_parsed_genes = 0
    dn_ds_ratio_dict = {}
    oh5f = h5py.File(out_file,'w')
    for gg in gene_groups:
        g = ag[gg]
        
        #0. Check whether there is evidence for CNVs/paralogs.
        seq_ids = g['strains']
        strains_list = sp.array(map(lambda x: x.split('-')[0], seq_ids))

        gs_list = sp.array([pop[strain]['genospecies'] for strain in strains_list])

        gs_filters = [sp.in1d(gs_list,[gs]) for gs in geno_species]
        #print gs_filters
        common_filter = sp.zeros((len(gs_list)),dtype='bool8')
        for i in range(len(geno_species)):
            common_filter += gs_filters[i]

        gs_strains_lists = [strains_list[gs_filter] for gs_filter in gs_filters]
        gs_strains = [ ]
        has_paralogs = False
        for gs_strains_list in gs_strains_lists:
            gs_strains = sp.unique(gs_strains_list)
            has_paralogs = len(gs_strains)<len(gs_strains_list)
            if has_paralogs:
                break
        num_strains = []
        for gs_strains_list in gs_strains_lists:
            num_strains.append(len(gs_strains_list))
        num_strains = sp.array(num_strains)
        #print num_strains
        
        if has_paralogs:
            #pass
            print 'Evidence for paralogs/CNVs'
        elif sp.all(num_strains>min_num_strains):
            gs_strains = gs_strains_lists
            all_gs_strains = strains_list[common_filter]

            gs_list = sp.array([pop[strain]['genospecies'] for strain in all_gs_strains])
            gs_filters = [sp.in1d(gs_list,[gs]) for gs in geno_species]
                        
            #1. Filter rows with indels and missing data
            nt_mat = g['nsequences'][...]
            nt_mat = nt_mat[common_filter]
            
            no_gaps_no_missing = sp.all(nt_mat<5,0)
            nt_mat = sp.transpose(nt_mat)
            if sp.sum(no_gaps_no_missing)>5:
                raw_snps = nt_mat[no_gaps_no_missing]
                
                print 'Working on gene group: %s'%gg
                #First calc within-genospecies Ka/Ks
                d = {}
                for i, gs in enumerate(geno_species):
                    gs_filter = gs_filters[i]
                    gs_raw_snps = raw_snps[:,gs_filter]
                    
                    num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1, nt_mat[:,gs_filter])
                    ok_num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1, gs_raw_snps)
                    const_seq_filter = ok_num_vars==1
                    good_snp_filter = ok_num_vars==2

                    num_bin_snps = sp.sum(good_snp_filter)
                    if num_bin_snps>5:
                        
                        M,N = nt_mat.shape
                        non_gap_positions = sp.arange(M)[no_gaps_no_missing]
                        
                        #3. Identify good SNPs (dimorphic SNPs)
                        ok_snps = gs_raw_snps[good_snp_filter]
                        snp_positions = non_gap_positions[good_snp_filter]
                        assert len(ok_snps)==len(snp_positions), 'A bug detected!'
                        
                        #4. Call good SNPs                        
                        sequences = (g['sequences'][...])[common_filter] 
                        good_snps_dict = call_good_snps(sequences[0], ok_snps, snp_positions, codon_syn_map = codon_syn_map,
                                                        ok_seq_filter = no_gaps_no_missing, seq_num_vars = num_vars)
                        
#                         codon_snps = good_snps_dict['codon_snps']
                        is_synonimous_snp = good_snps_dict['is_synonimous_snp']
                        num_syn_sites = good_snps_dict['num_syn_sites']
                        num_non_syn_sites = good_snps_dict['num_non_syn_sites']
                                                
#                         norm_codon_snps = sp.transpose(codon_snps)
#                         codon_snp_freqs = sp.mean(norm_codon_snps,0)
                        
                        #Calculate dn/ds ratios
                        num_syn_pol = sp.sum(is_synonimous_snp)
                        num_non_syn_pol = len(is_synonimous_snp)-num_syn_pol
                        if num_syn_pol>0:
                            pn_ps_ratio = (num_non_syn_pol/num_non_syn_sites)/(num_syn_pol/num_syn_sites)
                        else:
                            pn_ps_ratio=-1

                        d[gs]={'pn_ps_ratio':pn_ps_ratio, 'num_syn_pol':num_syn_pol, 'num_non_syn_pol':num_non_syn_pol, 
                               'M':len(nt_mat), 'const_seq_filter':const_seq_filter, 'num_syn_sites':num_syn_sites, 
                               'num_non_syn_sites':num_non_syn_sites}
                    else:
                        d[gs]={'pn_ps_ratio':-1, 'num_syn_pol':0, 'num_non_syn_pol':0, 
                               'M':len(nt_mat), 'const_seq_filter':const_seq_filter,
                               'num_syn_sites':0, 'num_non_syn_sites':0}
                
                
                #Get the constrained seq filter for the two genospecies
                gs1 = geno_species[0]
                gs2 = geno_species[1]                
                const_seq_filter1 = d[gs1]['const_seq_filter']
                const_seq_filter2 = d[gs2]['const_seq_filter']
                constrained_seq_filter = const_seq_filter1 * const_seq_filter2
                
                
                #Filter seq_num_var array to the two genospecies considered
                gs_filter = gs_filters[0]+gs_filters[1]
                num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1, nt_mat[:,gs_filter])


                constr_seq_len = sp.sum(constrained_seq_filter)
                if constr_seq_len>5:
                    constr_seq = raw_snps[constrained_seq_filter]
                    constr_num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 1, constr_seq)
                    constr_bin_snps_filter = constr_num_vars==2
                    num_const_seq_bin_snps = sp.sum(constr_bin_snps_filter)
                    if num_const_seq_bin_snps>5:
                        gs_specific_snps = constr_seq[constr_bin_snps_filter]
                        
                        #Get positions for constrained SNPs
                        non_gap_positions = sp.arange(len(nt_mat))[no_gaps_no_missing]
                        constrained_positions = non_gap_positions[constrained_seq_filter]
                        constrained_snps_positions = constrained_positions[constr_bin_snps_filter]

                        #4. Call good SNPs                        
                        good_snps_dict = call_good_snps(g['sequences'][0], gs_specific_snps, constrained_snps_positions, codon_syn_map=codon_syn_map,
                                        ok_seq_filter = no_gaps_no_missing, seq_num_vars=num_vars)
                        
                        is_synonimous_snp = good_snps_dict['is_synonimous_snp']
                        num_syn_sites = good_snps_dict['num_syn_sites']
                        num_non_syn_sites = good_snps_dict['num_non_syn_sites']
                                                
#                         norm_codon_snps = sp.transpose(codon_snps)
#                         codon_snp_freqs = sp.mean(norm_codon_snps,0)
                        
                        #Calculate dn/ds ratios
                        num_syn_subt = sp.sum(is_synonimous_snp)
                        num_non_syn_subt = len(is_synonimous_snp)-num_syn_subt
                        if num_syn_subt>0:
                            dn_ds_ratio = (num_non_syn_subt/num_non_syn_sites)/(num_syn_subt/num_syn_sites)
                        else:
                            dn_ds_ratio=-1


                        d['%s_%s'%(gs1,gs2)]={'dn_ds_ratio':dn_ds_ratio, 'num_syn_subt':num_syn_subt, 
                                              'num_non_syn_subt':num_non_syn_subt, 
                                              'constr_seq_len':constr_seq_len, 
                                              'num_const_seq_bin_snps':num_const_seq_bin_snps}                        
                        
                    else:
                        print 'No binary variants were found to be specific to either genospecies within the gene.'
                        d['%s_%s'%(gs1,gs2)]={'dn_ds_ratio':-1, 'num_syn_subt':0, 'num_non_syn_subt':0, 
                                              'constr_seq_len':constr_seq_len, 
                                              'num_const_seq_bin_snps':num_const_seq_bin_snps}
 
                else:
                    print 'No sequence was found to be constrained in both genospecies within the gene.'
                    d['%s_%s'%(gs1,gs2)]={'dn_ds_ratio':-1, 'num_syn_subt':0, 'num_non_syn_subt':0, 
                                            'constr_seq_len':constr_seq_len, 
                                            'num_const_seq_bin_snps':0}

                num_syn_pol = d[gs1]['num_syn_pol']+d[gs2]['num_syn_pol']
                num_non_syn_pol = d[gs1]['num_non_syn_pol']+d[gs2]['num_non_syn_pol']
                num_syn_pol_sites = d[gs1]['num_syn_sites']+d[gs2]['num_syn_sites']
                num_non_syn_pol_sites = d[gs1]['num_non_syn_sites']+d[gs2]['num_non_syn_sites']
                
                if num_syn_pol>0:
                    pn_ps_ratio = (num_non_syn_pol/num_non_syn_pol_sites)/(num_syn_pol/num_syn_pol_sites)
                else:
                    pn_ps_ratio = -1
                    
                num_subt = d['%s_%s'%(gs1,gs2)]['num_syn_subt']+d['%s_%s'%(gs1,gs2)]['num_non_syn_subt']
                num_pol = d[gs1]['num_syn_pol']+d[gs1]['num_non_syn_pol'] + d[gs2]['num_syn_pol']+d[gs2]['num_non_syn_pol']
                #Now calculate the neutrality index (MK statistic)
                if d['%s_%s'%(gs1,gs2)]['dn_ds_ratio']>0 and pn_ps_ratio>=0:
                    ni_stat = float(pn_ps_ratio/float(d['%s_%s'%(gs1,gs2)]['dn_ds_ratio']))
                    if num_subt>min_num_sub_pol and num_pol>min_num_sub_pol:
                        print 'Found NI stat to be %0.3f'%ni_stat
                        ni_stats.append(ni_stat)
                else:
                    ni_stat = -1
                
                mk_alpha = 1-ni_stat
                    
                d['%s_%s'%(gs1,gs2)]['ni_stat']=ni_stat
                d['%s_%s'%(gs1,gs2)]['MK_alpha']=mk_alpha
                d['%s_%s'%(gs1,gs2)]['num_subt']=num_subt
                d['%s_%s'%(gs1,gs2)]['num_pol']=num_pol
                dn_ds_ratio_dict[gg]=d
               
                o_gg = oh5f.create_group(gg)
                o_gg.create_dataset('ni_stat',data=ni_stat)
                o_gg.create_dataset('mk_alpha',data=mk_alpha)
                o_gg.create_dataset('num_subt',data=num_subt)
                o_gg.create_dataset('num_pol',data=num_pol)
                o_gg.create_dataset('num_syn_subt',data=d['%s_%s'%(gs1,gs2)]['num_syn_subt'])  # read from d: the local variable may be stale here
                o_gg.create_dataset('num_non_syn_subt', data = d['%s_%s'%(gs1,gs2)]['num_non_syn_subt'])
                #o_gg.create_dataset('pn', data = (num_non_syn_pol+1/num_non_syn_pol_sites+1))
                #o_gg.create_dataset('ps', data = (num_syn_pol+1/num_syn_pol_sites+1)) 
                o_gg.create_dataset('num_non_syn_pol', data = num_non_syn_pol) # from both groups
                o_gg.create_dataset('num_syn_pol', data = num_syn_pol) # from both groups
                o_gg.create_dataset('num_syn_pol_sites', data = num_syn_pol_sites)
                o_gg.create_dataset('num_non_syn_pol_sites', data = num_non_syn_pol_sites)
                
                o_gg.create_dataset('pn_ps_ratio1',data=d[gs1]['pn_ps_ratio'])
                o_gg.create_dataset('pn_ps_ratio2',data=d[gs2]['pn_ps_ratio'])
                o_gg.create_dataset('pn_ps_ratio',data=pn_ps_ratio)
                o_gg.create_dataset('dn_ds_ratio',data=d['%s_%s'%(gs1,gs2)]['dn_ds_ratio'])

                num_parsed_genes +=1
        else:
            pass
#             print 'Too few strains..'
    print 'Parsed %d'%num_parsed_genes
    oh5f.close()
    
    print 'Number of NI stats: %d'%len(ni_stats)
    ni_stats = sp.array(ni_stats)
    ni_stats[ni_stats<0.005]=0.005
    log_nis = sp.log10(ni_stats)
    pylab.hist(log_nis,bins=100)
    pylab.xlabel(r'$\log_{10}(NI)$ (McDonald-Kreitman Neutrality Index)')
    pylab.savefig(fig_dir+'/MK_stats_%s_%s.png'%(geno_species[0],geno_species[1]))


    return  dn_ds_ratio_dict, ni_stats
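In the code above the neutrality index is NI = (pn/ps) / (dn/ds) and the MK alpha is 1 - NI; with toy numbers:

pn_ps, dn_ds = 0.8, 0.4  # toy values only
ni = pn_ps / dn_ds       # 2.0: an excess of non-synonymous polymorphism
alpha = 1 - ni           # -1.0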
Example #14
def flawed_mcUnit(func, numPoints, dims):
    points = sp.rand(numPoints, dims)
    # Skewed on purpose: samples cover [-0.95, 1.0) rather than [-1, 1), hence "flawed".
    points = (2 - .05) * (points) - 0.95
    total = sp.sum(sp.apply_along_axis(func, 1, points))
    return float(total) / numPoints
Example #15
def mcUnit(func, numPoints, dims):
    points = sp.rand(numPoints, dims)
    points = 2 * (points - .5)  # uniform on [-1, 1)^dims
    total = sp.sum(sp.apply_along_axis(func, 1, points))
    return float(total) / numPoints
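mcUnit averages func over uniform samples in [-1, 1)^dims, so with a unit-disc indicator the estimate should approach pi/4; a quick sanity check:

import scipy as sp

def in_unit_disc(p):
    return 1.0 if (p * p).sum() <= 1 else 0.0

print mcUnit(in_unit_disc, 20000, 2)  # ~ pi/4 = 0.785...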
Example #16
    fft_wave_l = fft(wave_l)  #fourier transform of left wave
    fft_wave_r = fft(wave_r)  #fourier transform of right wave

    sz_l = fft_wave_l.shape[0]
    sz_r = fft_wave_r.shape[0]
    assert sz_l == sz_r

    csp = sp.zeros(sz_l, dtype=complex)  # the whitened ratio assigned below is complex

    for i in range(fft_wave_l.shape[0]):
        dividend = fft_wave_l[i] * (fft_wave_r[i].conj())
        divisor = abs(fft_wave_l[i]) * abs(fft_wave_r[i])
        csp[i] = dividend / divisor  ##calculate Cross-power Spectrum Phase analysis

    csp = ifft(csp)
    a_csp = sp.apply_along_axis(lambda x: abs(x), 0, csp)
    max_t = a_csp.max(0) / 16000  ##peak value (renamed: don't shadow the builtin max)

    degree = sp.arccos(
        max_t * 34000 /
        10)  ##34000: speed of sound wave, 10: distance of microphone array

    ##find noise direction
    ##delete 2 max value
    max_idx = a_csp.argmax(0)
    a_csp[max_idx] = 0
    max_idx = a_csp.argmax(0)
    a_csp[max_idx] = 0

    #find second max value
    max_n = a_csp.max(0) / 16000  #noise time interval
    max_n_idx = a_csp.argmax(0)
    degree_n = sp.arccos(max_n * 34000 / 10)
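The loop above is the PHAT-weighted cross-power spectrum; a miniature end-to-end check (with a synthetic 5-sample shift) that the IFFT peak lands at the circular delay:

import scipy as sp
from scipy.fftpack import fft, ifft

x = sp.random.randn(256)
y = sp.roll(x, 5)                        # y is x delayed by 5 samples
R = fft(x) * fft(y).conj()
R /= abs(R)                              # PHAT whitening, as in the loop above
delay = sp.absolute(ifft(R)).argmax()    # -> 251, i.e. -5 mod 256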
Example #19
def normalize():
    traject = sp.asarray(trajectory)
    mean = np.mean(traject, axis=0)
    return sp.apply_along_axis(lambda x: x - mean, 1, traject)  # normalization
Example #20
def fit(in_data, output, num):
    # basically numpy apply_along_axis for parts of the data
    axis = 3
    out_part = sp.apply_along_axis(add, axis, in_data)
    output.put((num, out_part))
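fit looks like a worker for farming apply_along_axis out over chunks of a 4-D array; a hedged driving sketch, with add standing in for whatever 1-D reducer the surrounding project defines:

import multiprocessing as mp
import scipy as sp

def add(v):                  # hypothetical 1-D reducer that fit expects to find
    return v.sum()

output = mp.Queue()
fit(sp.ones((2, 3, 4, 5)), output, num=0)
num, part = output.get()     # axis 3 reduced away -> part.shape == (2, 3, 4)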
Example #21
    def showVariables(self,normalize=True):
        """
            maybe add a normalize thing ... over total range... not really the data...

            i think this doesnt work without normalized stuff atm...
        """

        variables=scipy.zeros([len(self.goalGene.getVariableList()),len(self.population)])
        for i in range(len(self.population)):
            variables[:,i]=self.population[i].getVariableList()

        correctlyPredicted=[] # only has to predict whether the variable is positive or negative
        bestCorrectlyPredicted=[]
        pnz=[]
        predictedPositive=[]
        predictedNegative=[]
        predictedZero=[]

        trueValues=scipy.array(self.goalGene.getVariableList())
##        print(variables.shape)
##        print(len(self.goalGene.getVariableList()))
##        print(range(1,len(self.goalGene.getVariableList())+1))
        if normalize: # weird place for this? probably needs to be always on, else nothing happens...
            for i in self.analysableVariables: # for each variable
##                print(i)
                mi=self.goalGene.minParRangeList[i]
                ma=self.goalGene.maxParRangeList[i]

                if trueValues[i]>0: # variable is positive
                    correctlyPredicted.append(sum(variables[i,:]>0))
                    bestCorrectlyPredicted.append(variables[i,0]>0)
                    pnz.append(1)
                elif trueValues[i]<0: # variable is negative
                    correctlyPredicted.append(sum(variables[i,:]<0))
                    bestCorrectlyPredicted.append(variables[i,0]<0)
                    pnz.append(-1)
                else: # variable = 0
                    correctlyPredicted.append(0)
                    bestCorrectlyPredicted.append(0)
                    pnz.append(0)
                    # everything is good... or everything is bad... but that's so negative...
                    # maybe only good if near 0... but... meh...

                predictedNegative.append(sum(variables[i,:]<0))
                predictedPositive.append(sum(variables[i,:]>0))
                predictedZero.append(sum(variables[i,:]==0)) # should not happen very often...


                # want it in a range from 0-1
                addToGetMinToZero=0-mi
                fullrange=ma-mi

                # normalize
                variables[i,:]+=addToGetMinToZero
                variables[i,:]=variables[i,:]/fullrange

                trueValues[i]+=addToGetMinToZero
                trueValues[i]=trueValues[i]/fullrange



##        variables=scipy.array([1,2,3])
##        avarages=scipy.average(variables)
        avarages=scipy.apply_along_axis(scipy.average,1,variables[self.analysableVariables])
        bestValues=variables[self.analysableVariables,0] # take the first individual, assume that the population is sorted on fitness
        correctlyPredicted=scipy.array(correctlyPredicted).astype(float)
        bestCorrectlyPredicted=scipy.array(bestCorrectlyPredicted).astype(float)

        correctyPredictedPercentage=((correctlyPredicted/len(self.population))*100).astype(int)
        correctyPredictedPercentageGT50=(sum(correctyPredictedPercentage[correctyPredictedPercentage>50])/len(correctyPredictedPercentage))*100

        print('--------------------------------------------------------------------------')
        print('nr of individuals    : '+str(len(self.population)))
        print('--------------------------------------------------------------------------')
        print('true variables       : '+str(self.goalGene.getVariableList()))
        print('avarage variables    : '+str(avarages))
        print('best variables       : '+str(bestValues))
        print('positive/negative/0  : '+str(pnz))
        print('# predicted negative : '+str(predictedNegative))
        print('# predicted positive : '+str(predictedPositive))
        print('# predicted zero     : '+str(predictedZero))
        print('correctly predicted #: '+str(correctlyPredicted))
##        correctlyPredicted[correctlyPredicted==-1]=0 # set it so that 0 is set to 0%... else its just confusing...

        print('correctly predicted %: '+str(correctyPredictedPercentage))
        print('best predicted       : '+str(bestCorrectlyPredicted))
        print('--------------------------------------------------------------------------')


        print('average predicted >50% correct % :'+str(correctyPredictedPercentageGT50))
        print('best predicted %     : '+str((sum(bestCorrectlyPredicted)/len(bestCorrectlyPredicted))*100)) # also counts zeros... which isn't great



##        x=range(1,len(self.goalGene.getVariableList())+1)
        x=self.analysableVariables
        p1=pylab.plot(x, variables[self.analysableVariables], linewidth=0, marker='.', color='#cccccc', markeredgecolor='#cccccc')# cccccc = light gray
        pylab.xlim(xmin=-1,xmax=len(self.analysableVariables))
        p2=pylab.plot(x, avarages, color='yellow')
        p3=pylab.plot(x, bestValues, color='red' )
        p4=pylab.plot(x, trueValues[self.analysableVariables], color='blue')
        pylab.legend((p1[0],p2[0],p3[0],p4[0]),('values','average','best predicted','actual'))
        print(self.goalGene.variableNames)
        pylab.xticks(self.analysableVariables,self.goalGene.variableNames[self.analysableVariables], rotation=45)
##        pylab.xlabel('variables')
        pylab.ylabel('normalized values')
        pylab.title('variables')
        pylab.show()
Example #22
    def op_mulvec(self, uvec):
        return apply_along_axis(cfunc, 0, (self.csqvec, uvec))
Example #23
paths[1:n + 1, :] = scipy.sqrt(Delta) * totals


def match(x, arry, nomatch=None):
    idx = scipy.where(arry >= x)[0]
    return idx[0] - 1 if idx.size > 0 else nomatch


# arguments: x is a scalar, arry is a numpy array, nomatch is a scalar
# returns the index just before the first entry of arry that is >= x,
# but if no entry qualifies, returns the value nomatch
# modeled on the R function "match", but with less generality

hitIndex = scipy.apply_along_axis(lambda x: (match(a, x, nomatch=n + 2)), 0,
                                  paths)
# If no ruin or victory on a walk, nomatch=n+2 sets the hitting
# time to be two more than the number of steps, one more than
# the column length.

hittingTime = Delta * hitIndex

probHitlessTa = (scipy.sum(hittingTime < time).astype('float')) / k
probMax = (scipy.sum(
    scipy.amax(paths[0:int(scipy.floor(time / Delta)) + 1, :],
               axis=0) >= a).astype('float')) / k
from scipy.stats import norm

theoreticalProb = 2 * (1 - norm.cdf(a / scipy.sqrt(time)))

print "Empirical probability Wiener process paths hit ", a, "before ", time, "is ", probHitlessTa
def call_variants(
        gt_hdf5_file='/Users/PM/Dropbox/Cavassim_et_al_2019_Rhizobium_data/final_snps.hdf5',
        out_file='/Users/PM/Dropbox/Cavassim_et_al_2019_Rhizobium_data/newsnps_100.hdf5',
        min_num_strains=100):
    #blosum62_file='/project/NChain/faststorage/rhizobium/ld/blosum62.txt'):
    """
	Generate a new set of SNPs to look at.
	
	For all nts:
		if it is a SNP
			count # of variants. 
			check AA changes
			quantify AA change severity    
	
	"""
    pop_map = parse_pop_map()
    print pop_map

    from itertools import izip
    #     blosum62_matrix, blosum62_dict = parse_blosum62(blosum62_file)
    codon_syn_map = get_codon_syn_map()
    h5f = h5py.File(gt_hdf5_file, 'r')
    ag = h5f['alignments']
    oh5f = h5py.File(out_file, 'a')
    gene_groups = sorted(ag.keys())
    num_parsed_genes = 0
    for gg in gene_groups:
        g = ag[gg]

        #0. Check whether there is evidence for CNVs/paralogs.
        seq_ids = g['strains']
        strains_list = map(lambda x: x.split('-')[0], seq_ids)
        strains, strain_counts = sp.unique(strains_list, return_counts=True)
        if len(strains) < len(strains_list):
            print 'Evidence for paralogs/CNVs'
            print strain_counts
            print '%d strains have unique gene copies' % len(strains)
        elif len(seq_ids) >= min_num_strains:
            strains = map(lambda x: x[0:4], seq_ids)

            #1. Filter indel/bad rows
            nt_mat = g['nsequences'][...]
            num_vars = sp.apply_along_axis(lambda x: len(sp.unique(x)), 0,
                                           nt_mat)
            no_gaps_no_missing = sp.all(nt_mat < 5, 0)
            nt_mat = sp.transpose(nt_mat)
            bad_rows_filter = (num_vars < 5) * no_gaps_no_missing  # True marks rows that are OK to keep
            if sp.sum(bad_rows_filter) > 0:
                print 'passed bad filter control'
                raw_snps = nt_mat[bad_rows_filter]

                #Calculate nucleotide diversity and ani
                M, N = raw_snps.shape
                diversity = 0.0
                ani = 0.0
                for i in range(N - 1):
                    for j in range(i + 1, N):
                        diversity += sp.sum(raw_snps[:, i] != raw_snps[:, j])
                        ani += sp.sum(raw_snps[:, i] == raw_snps[:, j])

                diversity = diversity / len(raw_snps)
                diversity = 2 * diversity / (N * (N - 1.0))
                ani = ani / len(raw_snps)
                ani = 2 * ani / (N * (N - 1.0))

                #2. Filter non-variable rows
                ok_num_vars = num_vars[bad_rows_filter]
                var_filter = ok_num_vars > 1
                num_raw_snps = sp.sum(var_filter)
                if num_raw_snps > 0:
                    print 'Working on gene group: %s' % gg

                    M, N = nt_mat.shape
                    non_gap_positions = sp.arange(M)[bad_rows_filter]
                    all_snps = raw_snps[var_filter]
                    all_snp_positions = non_gap_positions[var_filter]

                    #3. Identify good SNPs (dimorphic SNPs)
                    good_snp_filter = ok_num_vars == 2
                    ok_snps = raw_snps[good_snp_filter]
                    snp_positions = non_gap_positions[good_snp_filter]
                    assert len(ok_snps) == len(
                        snp_positions), 'A bug detected!'

                    #4. Call good SNPs
                    good_snps_dict = call_good_snps(
                        g['sequences'][0],
                        ok_snps,
                        snp_positions,
                        codon_syn_map=codon_syn_map,
                        ok_seq_filter=no_gaps_no_missing,
                        seq_num_vars=num_vars)

                    snps = good_snps_dict['snps']
                    nts = good_snps_dict['nts']
                    codon_snps = good_snps_dict['codon_snps']
                    codon_snp_positions = good_snps_dict['codon_snp_positions']
                    codons = good_snps_dict['codons']
                    aacids = good_snps_dict['aacids']
                    is_synonimous_snp = good_snps_dict['is_synonimous_snp']
                    num_syn_sites = good_snps_dict['num_syn_sites']
                    num_non_syn_sites = good_snps_dict['num_non_syn_sites']

                    #Normalize SNPs
                    norm_snps = sp.transpose(snps)
                    freqs = sp.mean(norm_snps, 0)
                    norm_snps = (norm_snps - freqs) / sp.sqrt(freqs *
                                                              (1 - freqs))
                    norm_snps = sp.transpose(norm_snps)

                    norm_codon_snps = sp.transpose(codon_snps)
                    codon_snp_freqs = sp.mean(norm_codon_snps, 0)
                    norm_codon_snps = (norm_codon_snps - codon_snp_freqs
                                       ) / sp.sqrt(codon_snp_freqs *
                                                   (1 - codon_snp_freqs))
                    norm_codon_snps = sp.transpose(norm_codon_snps)

                    #Calculate dn/ds ratios
                    num_syn_subt = sp.sum(is_synonimous_snp)
                    num_non_syn_subt = len(is_synonimous_snp) - num_syn_subt
                    if num_syn_subt > 0:
                        dn_ds_ratio = (num_non_syn_subt / num_non_syn_sites
                                       ) / (num_syn_subt / num_syn_sites)
                    else:
                        dn_ds_ratio = -1

                    #Calculate McDonald-Kreitman Statistics..

                    #Store everything to a HDF5 file
                    og = oh5f.create_group(gg)
                    og.create_dataset('num_vars', data=num_vars)
                    og.create_dataset('raw_snps',
                                      data=sp.array(all_snps, dtype='int8'),
                                      compression='lzf')
                    og.create_dataset('raw_snp_positions',
                                      data=all_snp_positions)
                    og.create_dataset('snps',
                                      data=sp.array(snps, dtype='int8'),
                                      compression='lzf')
                    og.create_dataset('norm_snps',
                                      data=sp.array(norm_snps, dtype='single'),
                                      compression='lzf')
                    og.create_dataset('freqs',
                                      data=sp.array(freqs, dtype='single'))
                    og.create_dataset('snp_positions', data=snp_positions)
                    og.create_dataset('codon_snps',
                                      data=sp.array(codon_snps,
                                                    dtype='single'),
                                      compression='lzf')
                    og.create_dataset('norm_codon_snps',
                                      data=sp.array(norm_codon_snps,
                                                    dtype='single'),
                                      compression='lzf')
                    og.create_dataset('codon_snp_freqs',
                                      data=sp.array(codon_snp_freqs,
                                                    dtype='single'))
                    og.create_dataset('is_synonimous_snp',
                                      data=is_synonimous_snp)
                    og.create_dataset('strains', data=strains)
                    og.create_dataset('codon_snp_positions',
                                      data=codon_snp_positions)
                    #                     og.create_dataset('blosum62_scores', data=blosum62_scores)
                    og.create_dataset('aacids', data=sp.array(aacids))
                    og.create_dataset('nts', data=sp.array(nts))
                    og.create_dataset('codons', data=sp.array(codons))
                    og.create_dataset('num_syn_sites', data=num_syn_sites)
                    og.create_dataset('num_non_syn_sites',
                                      data=num_non_syn_sites)
                    og.create_dataset('dn_ds_ratio', data=dn_ds_ratio)
                    og.create_dataset('diversity', data=diversity)
                    og.create_dataset('ani', data=ani)
                    oh5f.flush()
                    num_parsed_genes += 1
        else:
            print 'Too few strains..'
    print 'Parsed %d' % num_parsed_genes
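The O(N^2) pairwise column loop above can be collapsed one level with broadcasting; a sketch computing the same diversity value for a (sites x strains) matrix:

import scipy as sp

def pairwise_diversity(raw_snps):
    # Mean pairwise difference per site, matching the double loop above.
    M, N = raw_snps.shape
    diffs = 0
    for i in range(N - 1):
        diffs += sp.sum(raw_snps[:, i + 1:] != raw_snps[:, i:i + 1])
    return 2.0 * diffs / (M * N * (N - 1.0))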
Example #25
    def op_bvec(self, uvec):
        return apply_along_axis(bfunc, 0, (self.csqvec, uvec)) - (self.lapmat * uvec)