import numpy as np
import pandas as pd

import em  # local module providing run_EM


def bootstrap_EM(df, kgene, part=0.5, n_boot=100, maxIter=1000, alpha=1e-5):
    '''Given BLAST output as a dataframe, bootstrap fraction `part` of reads
    and run the Expectation-Maximization algorithm on the bootstrapped output
    n_boot times. Return the mean of the bootstrapped allele probability
    estimates and the allele names.'''
    # Read the list of possible alleles from file, not from the BLAST output.
    df_tr = pd.read_csv('../ref_files/KIRallele_translation.txt', sep='\t', index_col=0)
    tdict = df_tr['Colon_Name'].to_dict()
    # Load the list of alleles and possible 2-allele combinations.
    df_kirvars = pd.read_csv('../ref_files/KIR_variants.txt', sep='\t')
    sortedAlleles = sorted([tdict[a] for a in df_kirvars[kgene].dropna()])
    # Set up bootstrapping: one row of allele probabilities per replicate.
    Pmat = np.zeros((n_boot, len(sortedAlleles)))
    for i in range(n_boot):
        df_bs = bootstrap(df, part=part)
        if len(df_bs) != 0:
            P, alleles = em.run_EM(df_bs, maxIter=maxIter, alpha=alpha)
            pdict = {a: P[j] for j, a in enumerate(alleles)}
            # Alleles the EM run never saw get probability 0.
            for a in sortedAlleles:
                if a not in pdict:
                    pdict[a] = 0
            # Make sure alleles are in the same order every time.
            sortedP = np.array([pdict[a] for a in sortedAlleles])
            if len(sortedP) == len(Pmat[i]):
                Pmat[i] = sortedP
    return np.mean(Pmat, axis=0), sortedAlleles
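
# The `bootstrap` helper called above is not defined in this file. A minimal
# sketch, assuming it resamples a fraction `part` of the BLAST hits with
# replacement (the original helper's sampling scheme may differ):
def bootstrap(df, part=0.5):
    '''Sample fraction `part` of the rows of df with replacement.'''
    n = int(len(df) * part)
    return df.sample(n=n, replace=True)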


import math

import numpy as np
import scipy.special
import scipy.stats

import em  # local module providing fit


def run(TFIntervaldict, pad, threshold, bins):
    distances = dict()
    for TF in TFIntervaldict:
        x = list()
        for array in TFIntervaldict[TF]:
            for interval in array:
                for position in interval:
                    # Keep positions with a finite score above the threshold;
                    # record the hit center (start + half its span), shifted
                    # by the pad.
                    if position[2] != np.inf and position[2] > threshold:
                        x.append((position[0] + position[1] / 2) - pad)
        if len(x) > 0:
            counts, edges = np.histogram(x, bins=bins)
            edges = edges[1:]  # use the right edge of each bin
            X = np.zeros((len(counts), 2))
            X[:, 0] = edges
            X[:, 1] = counts
            w = em.fit(X)
            start = min(x)
            stop = max(x)
            sigma = np.std(x)
            mu = np.mean(x)
            N = len(x)
            # Evenly spaced reference sample on [start, stop] for the KS test.
            y = np.linspace(start, stop, N)
            # One-sided z-test of the mean against 0.
            z = mu / (sigma / math.sqrt(N))
            p = 1 - scipy.special.ndtr(z)
            k = scipy.stats.ks_2samp(x, y)
            # Flag whether the mode of the distances is near 0.
            m = scipy.stats.mode(x)[0][0]
            m = 0 if -0.25 < m < 0.25 else 1
            distances[TF] = [w, k[1], p, m, x]
    return distances
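
# A minimal usage sketch with a toy input, assuming each position is a
# (start, span, score) triple nested as TFIntervaldict[TF] -> arrays ->
# intervals -> positions, and that the local `em` module is importable.
# The TF name and numbers below are illustrative only:
if __name__ == '__main__':
    toy = {'CTCF': [[[(10, 20, 5.0), (40, 10, 2.0), (60, 30, 3.0)]]]}
    result = run(toy, pad=25, threshold=1.0, bins=10)
    print(result['CTCF'][2])  # z-test p-value for CTCF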


import em  # local module providing run_em_algorithm
import ut  # local utility module (name assumed from the alias used below)


def main(articles_file, topics):
    # Get the topics.
    topics_model = ut.get_the_topics_lst(topics)
    # Get the training set.
    headers, articles, words_freqs, articles_freqs = ut.make_train_set(
        articles_file)
    # Divide the words into clusters.
    words_into_clusters = ut.divide_clusters(articles)
    # Get the trained weights for our model.
    w_model = em.run_em_algorithm(articles_freqs, words_freqs,
                                  words_into_clusters, len(topics_model))
    # Create the confusion matrix.
    conf_matrix, clusters_and_topics, articles_of_clusters = ut.make_conf_matrix(
        w_model, articles_freqs, topics_model, headers)
    # conf_matrix_descending_order = sorted(conf_matrix, key=lambda line: line[-1], reverse=True)
    print(conf_matrix)
    # Tag each article with its cluster's topic.
    articles_by_topic = ut.add_tag_to_articles(clusters_and_topics,
                                               articles_of_clusters)
    # Print an empty line.
    print("\n")
    # Compute the accuracy of the model.
    accuracy = ut.compute_accuracy(headers, articles_by_topic)
    print("the accuracy of our model is:", accuracy)
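
# A minimal command-line entry point, assuming the script is invoked as
# `python main.py <articles_file> <topics_file>`. The argument handling is an
# assumption; the original driver may differ:
if __name__ == '__main__':
    import sys
    main(sys.argv[1], sys.argv[2])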


import numpy as np

import ass  # local module providing symmetrized_Votes
import em   # local module providing EM_est
import gra  # local graph module, used by graph_ob_gene below


def discriminatory_ob_gene(results, n_ass, n_objects, n_object_per_assessment):
    ass.symmetrized_Votes(results)
    Delta_est, Assessors_est, S_est = em.EM_est(results, n_objects, 0.001)
    # Sort the flattened similarity matrix and recover (row, col) index pairs.
    s = np.argsort(S_est, axis=None)
    n = np.shape(S_est)[0]
    sorted_ind = [divmod(int(idx), n) for idx in s]
    res = objects_from_edges(sorted_ind, n_object_per_assessment)
    return res
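
# `objects_from_edges`, used above and below, is not defined in this file.
# A minimal sketch, assuming it walks the edge list in order and collects
# distinct endpoint objects until n_object_per_assessment have been chosen
# (the original helper may apply additional constraints):
def objects_from_edges(edges, n_object_per_assessment):
    '''Collect distinct objects from (i, j) edge pairs, in order.'''
    res = []
    for i, j in edges:
        for obj in (i, j):
            if obj not in res:
                res.append(obj)
            if len(res) == n_object_per_assessment:
                return res
    return res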

def graph_ob_gene(results, n_ass, n_objects, n_object_per_assessment):
    # Compute distances.
    ass.symmetrized_Votes(results)
    Delta_est, Assessors_est, S_est = em.EM_est(results, n_objects, 0.00001)
    Delta = np.ones(np.shape(Delta_est)) - Delta_est
    Delta = Delta / np.max(Delta, axis=None)
    sorted_Delta = np.sort(Delta, axis=None)
    # Set the threshold as the median of the non-zero values.
    threshold = np.median(sorted_Delta[np.nonzero(sorted_Delta)])
    # Create the graph and compute the betweenness of its edges.
    G = gra.init_graph(Delta, threshold)
    edges_betweeness = gra.betweeness(G)
    # Return the appropriate number of objects, choosing high betweenness first.
    edges = list(edges_betweeness.keys())
    scores = list(edges_betweeness.values())
    best_ind = np.argsort(scores)[-n_object_per_assessment:]
    best_edges = [edges[i] for i in best_ind]
    res = objects_from_edges(best_edges, n_object_per_assessment)
    return res


import math
import os

import numpy as np
import scipy.special
import scipy.stats

import Functions  # local module providing get_distances_pad_v3
import em         # local module providing fit
import ds         # local module providing simulate
import ds2        # local module providing get_w


def run(bidirfile, fimodir):
    distances = dict()
    directorylist = [fimodir + '/' + item for item in os.listdir(fimodir)
                     if 'fimo_out' in item]
    for item in directorylist:
        print(item)
        # Take the TF name from the directory name, e.g. "CTCF_fimo_out".
        TF = os.path.basename(item).split('_')[0]
        x = Functions.get_distances_pad_v3(bidirfile, item + "/fimo.cut.txt",
                                           True, 1500)
        # Rescale the normalized distances back to base pairs.
        for i in range(len(x)):
            x[i] = x[i] * 1500
        if len(x) != 0:
            counts, edges = np.histogram(x, bins=200)
            edges = edges[1:]  # use the right edge of each bin
            X = np.zeros((len(counts), 2))
            X[:, 0] = edges
            X[:, 1] = counts
            w = em.fit(X)
            w2 = ds2.get_w(X)
            # Compare x against simulated distances over 1000 draws and keep
            # the mean KS p-value.
            ks = list()
            for a in range(1000):
                d = ds.simulate()  # assumed no-argument signature
                ks.append(scipy.stats.ks_2samp(x, d)[1])
            d_mean = np.mean(ks)
            start = min(x)
            stop = max(x)
            sigma = np.std(x)
            mu = np.mean(x)
            N = len(x)
            # Evenly spaced reference sample on [start, stop] for the KS test.
            y = np.linspace(start, stop, N)
            # One-sided z-test of the mean against 0.
            z = mu / (sigma / math.sqrt(N))
            p = 1 - scipy.special.ndtr(z)
            k = scipy.stats.ks_2samp(x, y)
            # Flag whether the mode of the distances is near 0.
            m = scipy.stats.mode(x)[0][0]
            m = 0 if -0.25 < m < 0.25 else 1
            distances[TF] = [w, w2, k[1], d_mean, p, m, x]
    return distances
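
# A minimal usage sketch. The paths are illustrative placeholders: run()
# expects a bidirectional-calls file and a directory whose 'fimo_out'
# subdirectories each contain a fimo.cut.txt:
if __name__ == '__main__':
    distances = run('bidirectionals.bed', '/path/to/fimo_dirs')
    for TF, stats in distances.items():
        print(TF, stats[2])  # uniform-reference KS p-value per TF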


import scipy.io as sio
import numpy as np

import funcs         # local helpers: init and plot
import EM_algorithm  # local module providing run

# First we need to load the data, which is in MATLAB format.
# loadmat returns a dictionary keyed by variable name.
ratings = sio.loadmat('ratings.mat')

# Extract the array of ratings.
X = ratings['X']

# Log-likelihoods for K = 2..4 mixtures, one column per random seed.
log = np.empty([3, 5])
for i in range(3):
    for j in range(5):
        K = i + 2
        mixture, post = funcs.init(X, K, seed=j)
        mix, post, loglike = EM_algorithm.run(X, mixture, post)
        log[i, j] = loglike
        funcs.plot(X, mix, post, "Gaussian Mixture Model with K = " + str(K))
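
# The table above makes it easy to keep only the best random restart per K.
# A short follow-up sketch (argmax over the seed axis; purely illustrative):
best_seeds = np.argmax(log, axis=1)
for i, seed in enumerate(best_seeds):
    print("K =", i + 2, "best seed =", seed, "log-likelihood =", log[i, seed])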


import numpy as np

import ass  # local module providing symmetrized_Votes
import em   # local module providing EM_est


def discriminatory_ass_gene(results, n_ass, n_objects):
    ass.symmetrized_Votes(results)
    Delta_est, Assessors_est, S_est = em.EM_est(results, n_objects, 0.001)
    # Return the index of the assessor with the smallest estimate.
    return np.argsort(Assessors_est)[0]
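
# A usage sketch tying the generators together, assuming the local ass/em
# modules are on the path. The shape of `results` (assessors x objects x
# objects votes array) and all counts are hypothetical placeholders:
if __name__ == '__main__':
    results = np.zeros((10, 20, 20))  # hypothetical votes array
    objs = discriminatory_ob_gene(results, n_ass=10, n_objects=20,
                                  n_object_per_assessment=5)
    worst = discriminatory_ass_gene(results, n_ass=10, n_objects=20)
    print(objs, worst)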