Example no. 1
import numpy as np
import pandas as pd

import em  # project-local EM module (assumed)
# bootstrap() is assumed to be a project-local resampling helper already in scope
def bootstrap_EM(df, kgene, part=0.5, n_boot=100, maxIter=1000, alpha=1e-5):
    '''Given BLAST output as a dataframe, bootstrap fraction part of reads and
    run Expectation-Maximization algorithm on bootstrapped output n_boot times.
    Return mean of bootstrapped allele probability estimates & allele names.'''
    # read in the list of possible alleles from a file, not from the BLAST output
    df_tr = pd.read_csv('../ref_files/KIRallele_translation.txt',
                        sep='\t',
                        index_col=0)
    tdict = df_tr['Colon_Name'].to_dict()

    # load list of alleles and possible 2-allele combinations
    df_kirvars = pd.read_csv('../ref_files/KIR_variants.txt', sep='\t')
    sortedAlleles = sorted([tdict[a] for a in df_kirvars[kgene].dropna()])

    # set up bootstrapping
    Pmat = np.zeros((n_boot, len(sortedAlleles)))
    for i in range(n_boot):
        df_bs = bootstrap(df, part=part)

        if len(df_bs) != 0:
            P, alleles = em.run_EM(df_bs, maxIter=maxIter, alpha=alpha)

            pdict = {a: P[j] for j, a in enumerate(alleles)}
            for a in sortedAlleles:
                if a not in pdict:
                    pdict[a] = 0

            # Make sure alleles are in same order every time
            sortedP = np.array([pdict[a] for a in sortedAlleles])

            if len(sortedP) == len(Pmat[i]):
                Pmat[i] = sortedP

    return np.mean(Pmat, axis=0), sortedAlleles
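
A minimal usage sketch for Example 1; the input file name and the kgene value are hypothetical, and df is assumed to be BLAST tabular output loaded with pandas:

df_blast = pd.read_csv('blast_hits.tsv', sep='\t')  # hypothetical input file
probs, alleles = bootstrap_EM(df_blast, kgene='KIR3DL1', part=0.5, n_boot=100)
# Report the five alleles with the highest mean bootstrapped probability
for allele, p in sorted(zip(alleles, probs), key=lambda t: -t[1])[:5]:
    print(allele, round(p, 3))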
Example no. 2
import math

import numpy as np
import scipy.special
import scipy.stats

import em  # project-local EM module (assumed)
def run(TFIntervaldict, pad, threshold, bins):
    distances = dict()
    for TF in TFIntervaldict:
        x = list()
        for array in TFIntervaldict[TF]:
            for interval in array:
                for position in interval:
                    if position[2] != np.inf and position[2] > threshold:
                        x.append((position[0] + position[1]/2) - pad)
        if len(x) > 0:
            counts, edges = np.histogram(x, bins=bins)
            edges = edges[1:]
            X = np.zeros((len(counts), 2))
            X[:, 0] = edges
            X[:, 1] = counts
            w = em.fit(X)

            # Summary statistics of the distance distribution
            start, stop = min(x), max(x)
            sigma = np.std(x)
            mu = np.mean(x)
            N = len(x)

            # Evenly spaced stand-in for a uniform sample over [start, stop]
            y = np.linspace(start, stop, N)

            # One-sample z-test of the mean against 0, upper-tail p-value
            z = mu / (sigma / math.sqrt(N))
            p = 1 - scipy.special.ndtr(z)

            # KS test: observed distances vs. the uniform stand-in
            k = scipy.stats.ks_2samp(x, y)

            # Indicator: 0 if the mode of x lies near zero, else 1
            m = scipy.stats.mode(x)[0][0]
            m = 0 if -0.25 < m < 0.25 else 1

            distances[TF] = [w, k[1], p, m, x]
        
    return distances
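
A hypothetical call for Example 2, assuming the project-local em module is importable. The nesting of TFIntervaldict (per-TF lists of arrays of intervals, each position carrying a score in its third field) is inferred from the loops above, so the field meanings and values here are illustrative only:

TFIntervaldict = {
    'CTCF': [[[(1480, 1520, 3.2), (1490, 1515, np.inf), (1400, 1450, 2.1)]]],
}
distances = run(TFIntervaldict, pad=1500, threshold=1.0, bins=50)
print(distances['CTCF'][1])  # KS p-value against the uniform stand-in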
Example no. 3
import em  # project-local EM module (assumed)
import ut  # project-local utilities (module name assumed)
def main(articles_file, topics):
    #get the topics
    topics_model = ut.get_the_topics_lst(topics)
    #get the training set
    headers, articles, words_freqs, articles_freqs = ut.make_train_set(
        articles_file)

    #divide the words into clusters
    words_into_clusters = ut.divide_clusters(articles)

    #get the good weights for our model
    w_model = em.run_em_algorithm(articles_freqs, words_freqs,
                                  words_into_clusters, len(topics_model))

    #create the confusion matrix
    conf_matrix, clusters_and_topics, articles_of_clusters = ut.make_conf_matrix(
        w_model, articles_freqs, topics_model, headers)
    # conf_matrix_descending_order = sorted(conf_matrix, key=lambda line: line[-1], reverse=True)
    print(conf_matrix)

    #add the topic to the articles
    articles_by_topic = ut.add_tag_to_articles(clusters_and_topics,
                                               articles_of_clusters)
    #print a blank line
    print("\n")
    #compute the accuracy of the model
    accuracy = ut.compute_accuracy(headers, articles_by_topic)
    print "the accuracy of our model is- ", accuracy
Example no. 4
import numpy as np

import ass  # project-local assessment module (assumed)
import em   # project-local EM module (assumed)
# objects_from_edges() is assumed to be defined elsewhere in the same module

def discriminatory_ob_gene(results, n_ass, n_objects, n_object_per_assessment):
    ass.symmetrized_Votes(results)
    Delta_est, Assessors_est, S_est = em.EM_est(results, n_objects, 0.001)
    # Flat argsort of the score matrix, decoded into (row, col) index pairs
    s = np.argsort(S_est, axis=None)
    n = np.shape(S_est)[0]
    sorted_ind = [divmod(idx, n) for idx in s]
    res = objects_from_edges(sorted_ind, n_object_per_assessment)
    return res
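
objects_from_edges is not shown in any of these snippets; below is one plausible reading consistent with both call sites (pairs of object indices in, a capped list of distinct objects out). It is an assumption, not the original helper:

def objects_from_edges(edges, n_object_per_assessment):
    # Hypothetical: collect distinct objects from (i, j) pairs until the quota is met
    chosen = []
    for i, j in edges:
        for obj in (i, j):
            if obj not in chosen:
                chosen.append(obj)
            if len(chosen) == n_object_per_assessment:
                return chosen
    return chosen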
Example no. 5
# Uses the same project-local ass/em modules as Example no. 4, plus
# gra (graph helpers, assumed) and numpy as np.

def graph_ob_gene(results, n_ass, n_objects, n_object_per_assessment):
    # Compute distances
    ass.symmetrized_Votes(results)
    Delta_est, Assessors_est, S_est = em.EM_est(results, n_objects, 0.00001)
    Delta = np.ones(np.shape(Delta_est)) - Delta_est
    Delta = Delta / np.max(Delta, axis=None)
    sorted_Delta = np.sort(Delta, axis=None)
    # Set the threshold as the median of the nonzero values
    threshold = np.median(sorted_Delta[np.nonzero(sorted_Delta)])
    # Create graph and compute betweenness of edges
    G = gra.init_graph(Delta, threshold)
    edges_betweeness = gra.betweeness(G)
    # Return the appropriate number of objects, highest-betweenness edges first
    edges, scores = zip(*edges_betweeness.items())
    best_ind = np.argsort(scores)[-n_object_per_assessment:]
    best_edges = [edges[i] for i in best_ind]
    res = objects_from_edges(best_edges, n_object_per_assessment)
    return res
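
gra is a project-local module; the following is a minimal sketch of what its init_graph and betweeness functions could look like, written here with networkx purely as an assumption, not the original implementation:

import networkx as nx

def init_graph(Delta, threshold):
    # Connect object pairs whose normalized distance falls below the threshold
    n = Delta.shape[0]
    G = nx.Graph()
    G.add_nodes_from(range(n))
    for i in range(n):
        for j in range(i + 1, n):
            if Delta[i, j] < threshold:
                G.add_edge(i, j, weight=Delta[i, j])
    return G

def betweeness(G):
    # Edge betweenness: fraction of shortest paths that traverse each edge
    return nx.edge_betweenness_centrality(G)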
Example no. 6
import math
import os

import numpy as np
import scipy.special
import scipy.stats

import em         # project-local EM module (assumed)
import ds         # project-local simulation helper (assumed)
import ds2        # project-local helper (assumed)
import Functions  # project-local distance utilities (assumed)

def run(bidirfile, fimodir):

    distances = dict()
    directorylist = [fimodir + '/' + item for item in os.listdir(fimodir) if 'fimo_out' in item]
    for item in directorylist:
        print(item)
        TF = os.path.basename(item).split('_')[0]
        x = Functions.get_distances_pad_v3(bidirfile, item + "/fimo.cut.txt", True, 1500)
        # Rescale distances by 1500 (matches the pad passed to get_distances_pad_v3)
        for i in range(len(x)):
            x[i] = x[i] * 1500
            
        if len(x) != 0:
            counts, edges = np.histogram(x, bins=200)
            edges = edges[1:]
            X = np.zeros((len(counts), 2))
            X[:, 0] = edges
            X[:, 1] = counts
            w = em.fit(X)
            w2 = ds2.get_w(X)
            # Mean KS p-value of x against 1000 simulated distance sets
            ks = list()
            for a in range(1000):
                d = ds.simulate()  # original lacked the call parentheses; arguments unknown
                ks.append(scipy.stats.ks_2samp(x, d)[1])
            d = np.mean(ks)
            # Summary statistics of the distance distribution
            start, stop = min(x), max(x)
            sigma = np.std(x)
            mu = np.mean(x)
            N = len(x)

            # Evenly spaced stand-in for a uniform sample over [start, stop]
            y = np.linspace(start, stop, N)

            # One-sample z-test of the mean against 0, upper-tail p-value
            z = mu / (sigma / math.sqrt(N))
            p = 1 - scipy.special.ndtr(z)

            # KS test: observed distances vs. the uniform stand-in
            k = scipy.stats.ks_2samp(x, y)

            # Indicator: 0 if the mode of x lies near zero, else 1
            m = scipy.stats.mode(x)[0][0]
            m = 0 if -0.25 < m < 0.25 else 1

            distances[TF] = [w, w2, k[1], d, p, m, x]
        
    return distances
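
A hypothetical invocation of Example 6, assuming the project-local modules are importable and fimodir contains FIMO output directories whose names start with the TF (e.g. CTCF_fimo_out):

distances = run('bidirectionals.bed', 'motif_scans')  # hypothetical paths
for TF, (w, w2, ks_p, sim_p, z_p, mode_flag, x) in distances.items():
    print(TF, ks_p, mode_flag)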
Example no. 7
import scipy.io as sio
import numpy as np
import funcs
import EM_algorithm

# First we need to load the data which is in matlab format
# This returns a dictionary
ratings = sio.loadmat('ratings.mat')

# Extract the array of ratings
X = ratings['X']

# Log-likelihoods for K = 2..4 across five random seeds
# (row 0 of `log` is never written by the loop below)
log = np.empty([4, 5])
for i in range(1, 4):
    for j in range(5):
        K = i + 1
        mixture, post = funcs.init(X, K, seed=j)
        mix, post, loglike = EM_algorithm.run(X, mixture, post)
        log[i, j] = loglike
    # Note: plots the model from the last seed, not the best-scoring one
    funcs.plot(X, mix, post, "Gaussian Mixture Model with K = " + str(K))
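
If the goal is to plot the best of the five restarts rather than the last, a small variant (a sketch keeping the same funcs/EM_algorithm interfaces) could rerun the top-scoring seed:

for i in range(1, 4):
    K = i + 1
    best_seed = int(np.argmax(log[i]))  # seed with the highest log-likelihood
    mixture, post = funcs.init(X, K, seed=best_seed)
    mix, post, loglike = EM_algorithm.run(X, mixture, post)
    funcs.plot(X, mix, post, "Gaussian Mixture Model with K = " + str(K))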
Example no. 8
# Uses the same project-local ass/em modules as Example no. 4.

def discriminatory_ass_gene(results, n_ass, n_objects):
    ass.symmetrized_Votes(results)
    Delta_est, Assessors_est, S_est = em.EM_est(results, n_objects, 0.001)
    # Index of the assessor with the smallest estimated value
    return np.argsort(Assessors_est)[0]