def family_exemplar_structs(rfid, refseq_method = None, sp_method = None, aff_type = None):
    '''
    Debug-state routine that clusters RNA structures and draws them in
    PCA space.

    NOTE(review): as written this function cannot run to completion. Its
    fourth statement is an unconditional ``raise Exception()``, and it
    reads ``spairs``, ``ungapped_ref`` and ``sp`` before anything in this
    function assigns them (presumably module globals or leftovers of
    deleted code -- confirm).
    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm the loop structure.

    inputs:
      rfid:           family id, passed to rutils.family_suboptimals
      refseq_method:  never read in this function
      sp_method:      never read in this function
      aff_type:       forwarded to rutils.affinity_matrix

    outputs:
      The final statement returns (pca_vecs, exemplar_structs), but it is
      preceded by a second unconditional raise, and ``exemplar_structs``
      is only assigned in a commented-out line.
    '''
    suboptimals = rutils.family_suboptimals(rfid)
    c2 = rutils.cluster_2(spairs, ungapped_ref)
    arr = rutils.rna_draw(ungapped_ref.seq,
                          rutils.pairs_stk(sp,len(ungapped_ref)),
                          'name' )
    # Everything below this raise is unreachable.
    raise Exception()

    # Two affinity matrices: one of the caller-selected type, one 'easy'
    # matrix with ss_multiplier = .5 that drives the clustering.
    affinities, ss = rutils.affinity_matrix(spairs, aff_type = aff_type)
    aff_shape, ss_shape = rutils.affinity_matrix(spairs, aff_type = 'easy', ss_multiplier = .5)
    # Project each affinity matrix onto its own principal axes.
    pca_vecs = mlab.PCA(affinities).project(affinities)
    pca_vecs_shape = mlab.PCA(aff_shape).project(aff_shape)

    inds = compute_clusters(aff_shape, ss_shape)
    # Distinct values of inds; used below to pick one color per cluster.
    exemplars = list(set(inds))

    import compbio.utils.colors as mycolors
    ct = mycolors.getct(len(exemplars))

    import matplotlib.pyplot as plt
    f = plt.gcf()
    plt.clf()

    # One subplot per embedding (2 rows, 1 column).
    for idx0, embeddings in enumerate([pca_vecs, pca_vecs_shape]):
        ax = f.add_subplot('21{0}'.format(idx0 +1))
        lims =[ [min(embeddings[:,0]),max(embeddings[:,0])],
                [min(embeddings[:,1]),max(embeddings[:,1])] ]
        # NOTE(review): lims is a plain list here, so += extends it with
        # extra rows instead of padding the limits element-wise -- confirm
        # whether an array was intended.
        lims += [-.5,.5] *squeeze(diff(lims,1))[:,newaxis]
        ax.set_xlim(lims[0])
        ax.set_ylim(lims[1])
        print sum(embeddings)
        for idx, embedding in enumerate(embeddings):
            # idx is an integer, so a remainder modulo 1 is presumably
            # always 0 and nothing is skipped (looks like a disabled
            # subsampling stride).
            if mod(idx,1) != 0: continue
            sp = spairs[idx]
            arr = rutils.rna_draw(ungapped_ref.seq,
                                  rutils.pairs_stk(sp,len(ungapped_ref)),
                                  'name' )
            struct_emb = arr + embedding[0:2]
            #plt.plot(*struct_emb.T)
            # Color by cluster; structures whose own index appears in inds
            # are drawn thick and opaque, the rest thin and faint.
            pkw = {'color':ct[exemplars.index(inds[idx])],
                   'lw':8 if idx in inds else 1,
                   'alpha': 1 if idx in inds else .2}
            lc = rplots.show_rna(embedding, arr, pkw = pkw)

    #exemplar_structs = [spairs[e] for e in set(inds)]
    raise Exception()
    return pca_vecs, exemplar_structs
def main():
    """Plot two corpora in PCA space and report a separating threshold.

    Builds feature rows for 'corpus1.txt' (sonnets) and 'corpus2.txt'
    (Anna Karenina), runs a standardized PCA over the stacked rows, plots
    the first two components, and prints the percentage of each corpus
    lying on its side of the hand-picked threshold -4.2 along the first
    component.
    """
    sonnet_corpus = open_corpus('corpus1.txt')
    novel_corpus = open_corpus('corpus2.txt')
    sonnet_rows = make_features(sonnet_corpus)
    novel_rows = make_features(novel_corpus)

    # Sonnet rows come first in the stacked matrix, novel rows after them.
    stacked = np.vstack((sonnet_rows, novel_rows))
    model = mlab.PCA(stacked, True)
    n_sonnets = len(sonnet_rows)
    print(model.Wt)

    sonnet_points = model.Y[:n_sonnets]
    novel_points = model.Y[n_sonnets:]
    # Green circles: Anna Karenina; blue squares: the sonnets.
    plt.plot(novel_points[:, 0], novel_points[:, 1], 'og',
             sonnet_points[:, 0], sonnet_points[:, 1], 'sb')

    # Assignment question: is there a linear combination of features (the
    # value along the first axis of the PCA-transformed data) and a
    # threshold such that more than 70% of the texts of each genre lie on
    # one side of the threshold? Write a program genre-by-pos.py that
    # demonstrates the answer.
    # Judging by the plot, the answer appears to be yes.
    print(
        'Линейная комбинация и пороговое значение, при которых больше 70% текстов каждого жанра находятся с одной стороны от порогового значения, существуют.'
    )
    # plt.savefig('result.png')
    plt.show()

    # The threshold was picked by eye from the plot so that more than 70%
    # of the Anna Karenina sentences lie to its right and more than 70% of
    # the sonnet sentences lie to its left.
    print('Пороговое значение: -4.2')
    novel_pc1 = novel_points[:, 0]
    sonnet_pc1 = sonnet_points[:, 0]
    print(
        sum(novel_pc1 > -4.2) / len(novel_pc1) * 100,
        '- процент предложений "Анны Карениной", которые лежат справа от порога'
    )
    print(
        sum(sonnet_pc1 < -4.2) / len(sonnet_pc1) * 100,
        '- процент предложений сонетов, которые лежат слева от порога')
def get_test_data(filename='C:/Users/sommerc/data/Chromatin-Microtubles/Analysis/H2b_aTub_MD20x_exp911_2_channels_nozip/dump_save/two_positions.hdf5'):
    """Load per-event features and labels from the first position of a file.

    Features of all events are stacked, cleaned with
    remove_constant_columns, z-scored, projected onto their principal
    axes and reshaped to (events, items per event, components).

    Parameters
    ----------
    filename : str
        Path handed to ``File``. Defaults to the location that used to be
        hard-coded, so existing zero-argument calls behave as before.

    Returns
    -------
    (feature_matrix, labels2)
        ``labels2`` is the label array with every 7 replaced by 1 and
        then 1 added to all entries.

    Notes
    -----
    Events whose ``item_features`` is None are skipped; the first reshape
    dimension is the number of events actually kept, so skipped events
    can no longer produce a silently mis-shaped result.
    """
    f = File(filename)
    try:
        pos = f[f.positions[0]]
        print(f.positions[0])
        events = pos.get_objects('event')

        kept_features = []
        labels = []
        for e in events:
            item_features = e.item_features
            item_labels = e.item_labels
            if item_features is not None:
                kept_features.append(item_features)
                labels.append(item_labels)

        feature_matrix = numpy.concatenate(kept_features)
        feature_matrix = remove_constant_columns(feature_matrix)
        feature_matrix = stats.zscore(feature_matrix)

        pca = mlab.PCA(feature_matrix)
        feature_matrix = pca.project(feature_matrix)
        # Take the per-event length from a kept event instead of from
        # whatever the loop variable last held (possibly None).
        feature_matrix = feature_matrix.reshape(len(kept_features),
                                                len(kept_features[0]), -1)

        # Materialize the labels while the file is still open.
        labels2 = numpy.asarray(labels)
    finally:
        # Close the handle even when loading or the PCA fails.
        f.close()

    labels2[labels2 == 7] = 1
    labels2 += 1
    return feature_matrix, labels2
def preprocess(self):
    """Preprocess data for further analysis

    1) remove every column that contains at least one zero
    2) remove columns with NAN's (via self._filter_nans)
    3) z-score data
    4) perform pca and keep the leading components

    Returns the tuple (data_pca, nodes).

    Raises EventSelectionError when the data matrix does not have more
    rows (objects) than columns (features).
    """

    data, nodes = self.data_matrix()

    if data.shape[0] <= data.shape[1]:
        # One string built by implicit concatenation. The previous
        # comma-separated version handed a 3-tuple to the exception.
        msg = ("Not enough objects in data set to proceed. "
               "Number of objects is smaller than the number of features "
               "(%d <= %d)" % tuple(data.shape))
        raise EventSelectionError(msg)

    # delete columns with zeros: np.where(...)[1] lists the column index
    # of every zero entry
    ind = np.where(data == 0)[1]
    data = np.delete(data, ind, 1)

    # remove columns with nans
    data, nodes = self._filter_nans(data, nodes)

    data_zs = stats.zscore(data)
    pca = mlab.PCA(data_zs)

    # Zero-based index of the first component at which the cumulative
    # variance fraction exceeds self.varfrac. An IndexError is raised
    # when no cumulative value exceeds it.
    # NOTE(review): the slice below stops just before that component, so
    # the kept components explain at most varfrac (and none at all when
    # the first component already exceeds it) -- confirm whether
    # num_features + 1 was intended.
    num_features = np.nonzero(np.cumsum(pca.fracs) > self.varfrac)[0][0]
    data_pca = pca.project(data_zs)[:, 0:num_features]

    return data_pca, nodes
def plotKMedoid(K, X):
    """Cluster X with K medoids and plot the result in 2-D PCA space.

    Each sample is annotated with its row number, colored by cluster id;
    medoids are drawn as large dots.

    K -- number of medoids, passed to trainKMedoid / assignKMedoids
    X -- data matrix, passed to the clustering helpers and to mlab.PCA

    The four-color palette is cycled, so the plot also works for K other
    than 4 (handing a fixed 4-entry list to scatter for a different
    number of medoids fails inside matplotlib).
    """
    # Used demo from https://stackoverflow.com/questions/9847026/plotting-output-of-kmeanspycluster-impl
    # cluster
    kMedoids = trainKMedoid(K, X)
    clustersIds = assignKMedoids(K, X, kMedoids)

    # reduce dimensionality: using the second component's variance
    # fraction as minfrac keeps the axes at least that strong
    iris_pca = mlab.PCA(X)
    cutoff = iris_pca.fracs[1]
    iris_2d = iris_pca.project(X, minfrac=cutoff)
    medoids_2d = iris_pca.project(list(kMedoids.values()), minfrac=cutoff)

    # make a plot
    palette = ['red', 'green', 'blue', 'yellow']
    # exactly one color per plotted medoid
    medoid_colors = [palette[i % len(palette)] for i in range(len(medoids_2d))]
    plt.figure()
    plt.xlim([iris_2d[:, 0].min() - .5, iris_2d[:, 0].max() + .5])
    plt.ylim([iris_2d[:, 1].min() - .5, iris_2d[:, 1].max() + .5])
    plt.xticks([], [])
    plt.yticks([], [])  # numbers aren't meaningful

    # show the centroids
    plt.scatter(medoids_2d[:, 0], medoids_2d[:, 1], marker='o',
                c=medoid_colors, s=100)

    # show user numbers, colored by their cluster id
    # (assumes cluster ids are small non-negative integers -- TODO confirm)
    for i, ((x, y), kls) in enumerate(zip(iris_2d, list(clustersIds.values()))):
        plt.annotate(str(i), xy=(x, y), xytext=(0, 0),
                     textcoords='offset points',
                     color=palette[kls % len(palette)])
def pca(self, feature, var_lim):
    """Project ``feature`` onto its leading principal components.

    Components are added in order until their accumulated variance
    fraction exceeds ``var_lim`` (or until none are left), and the
    projection onto exactly those components is returned.

    feature -- 2-D data matrix handed to mlab.PCA
    var_lim -- variance fraction the kept components must exceed

    Previously the component count was computed and then discarded in
    favor of a fixed five columns, and the counting loop could index
    past the end of ``fracs`` for a large ``var_lim``.
    """
    results = mlab.PCA(feature)
    fracs = results.fracs
    explained = 0
    n_components = 0
    while explained <= var_lim and n_components < len(fracs):
        explained += fracs[n_components]
        n_components += 1
    return results.Y[:, 0:n_components]
def surf_segmentation(points, config): global ELAPSE_SEG config.slice_count = min(int(len(points) / config.origin_points), config.slice_count) assert len(points) / config.slice_count >= config.origin_points surfs = [] npoints = point_normalize(points) # cov = np.cov(npoints) pca_md = mlab.PCA(np.copy(npoints)) projection0 = pca_md.Y[:, 0] step_count = len(projection0) / config.slice_count pointsets = [np.array([]).reshape(0, 3)] * config.slice_count starttime = time.clock() # projection0_index = np.hstack((projection0, np.arange(len(projection0)))) sorted_projection0_index = np.argsort(projection0) # sorted_projection0 = projection0[sorted_projection0_index] current_slot_count, ptsetid = 0, 0 # for (index, value) in zip(sorted_projection0_index, sorted_projection0): for index in sorted_projection0_index: pointsets[ptsetid] = np.vstack((pointsets[ptsetid], npoints[index])) current_slot_count += 1 if current_slot_count > step_count: current_slot_count = 0 ptsetid += 1 partial_surfs = [] for ptset in pointsets: print "before segment", len(partial_surfs), len(ptset) if len(ptset) > 0: partial_surfs, _ = identifysurf(np.copy(ptset), AdaSurfConfig({ 'origin_points': config.origin_points, 'most_combination_points': config.most_combination_points, 'same_threshold': config.same_threshold, 'filter_rate': config.filter_rate, 'ori_adarate': config.ori_adarate }), donorm=False, surfs=partial_surfs) print "after segment", len(partial_surfs) surfs.extend(partial_surfs) return surfs, npoints
def get_pca_variance(df, dates, loopback=30):
    """
    computes the variance of each dimension per date window

    For every start day in ``dates`` the window runs from that day to
    ``day + BDay(loopback - 1)`` (i.e. forward in business days). The
    squared ``sigma`` of a PCA over that slice of ``df`` is stored under
    the window's END day.

    df       -- DataFrame sliced by date label
    dates    -- iterable of window start days
    loopback -- window length in business days

    Returns a DataFrame with one row per successfully processed window
    and the columns of ``df``. Windows whose computation fails are
    reported on stdout and skipped.
    """
    result = {}
    for day in dates:
        try:
            end_day = day + BDay(loopback - 1)
            sd = mlab.PCA(df.ix[day:end_day]).sigma
            variance = [x**2 for x in sd]
            result[end_day] = Series(variance, index=df.columns)
            print('%s done' % day)
        except Exception:
            # Deliberately best-effort, but no longer a bare except:
            # KeyboardInterrupt and SystemExit now propagate.
            print('error in PCA computation of %s' % day)
    return DataFrame.from_dict(result, orient='index')
def runMUSIC():
    """Main loop of the PCA adapter: decay state, fit PCA once, stream projections.

    Runs until ``runtime.time()`` reaches ``stoptime``, calling
    ``runtime.tick()`` once per iteration. Works entirely on module
    globals. ``tau`` and ``PROJ_HIST_LENGTH`` are read here but are not
    listed in the global statement (they are only read, never assigned).

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; in particular confirm which statements
    belong to the ``t % 50 == 0`` branch.
    """
    global runtime, stoptime, timestep, state, state_hist, d, PCA_HIST_LENGTH, spikes, SPIKE_HIST_LENGTH, proj_hist

    print "running PCA adapter"

    t = 0
    pca_created = False
    while runtime.time() < stoptime:

        # Fit the PCA exactly once, on the state history collected
        # during the first PCA_HIST_LENGTH of runtime.
        if runtime.time() > PCA_HIST_LENGTH and not pca_created:
            pca = mlab.PCA(state_hist['states'])
            pca_created = True

        # Exponential decay of the state over one timestep.
        state = state * np.exp(-timestep/ tau)

        # Every 50th iteration: record history and push data to d.
        if t % 50 == 0:
            if runtime.time() < PCA_HIST_LENGTH:
                state_hist['states'] = np.append(state_hist['states'], [state], axis = 0)
                state_hist['times'] = np.append(state_hist['times'], [runtime.time()], axis = 0)

                # Keep only samples newer than (latest - PCA_HIST_LENGTH).
                state_hist_mask = np.where(state_hist['times'] > max(state_hist['times']) - PCA_HIST_LENGTH)
                state_hist['times'] = state_hist['times'][state_hist_mask]
                state_hist['states'] = state_hist['states'][state_hist_mask]
                #print "states", state_hist['states']

            if runtime.time() > PCA_HIST_LENGTH:
                projection = pca.project(state)
                #print "proj", len(projection)
                # Only the first three components are kept.
                projection = projection[:3]

                proj_hist['projs'] = np.append(proj_hist['projs'], [projection], axis = 0)
                proj_hist['times'] = np.append(proj_hist['times'], [runtime.time()], axis = 0)

                # Keep only projections newer than (latest - PROJ_HIST_LENGTH).
                proj_hist_mask = np.where(proj_hist['times'] > max(proj_hist['times']) - PROJ_HIST_LENGTH)
                proj_hist['times'] = proj_hist['times'][proj_hist_mask]
                proj_hist['projs'] = proj_hist['projs'][proj_hist_mask]

            # Keep only spikes newer than (latest - SPIKE_HIST_LENGTH).
            spike_hist_mask = np.where(spikes['times'] > max(spikes['times']) - SPIKE_HIST_LENGTH)
            spikes['times'] = spikes['times'][spike_hist_mask]
            spikes['senders'] = spikes['senders'][spike_hist_mask]

            d.on_running(spikes['times'], spikes['senders'], proj_hist['projs'])

        #print spikes
        #print t, runtime.time()
        runtime.tick()
        t += 1
def surf_segmentation(points, config): global ELAPSE_SEG assert len(points) / config.slice_count >= config.origin_points npoints = point_normalize(points) # cov = np.cov(npoints) pca_md = mlab.PCA(npoints) projection0 = pca_md.Y[:, 0] projection0min, projection0max = np.min(projection0), np.max(projection0) slice_step = (projection0max - projection0min) / config.slice_count pointsets = [np.array([]).reshape(0,3)] * config.slice_count surfs = [] starttime = time.clock() for row_id in xrange(len(projection0)): if projection0[row_id] == projection0max: ptsetid = config.slice_count - 1 else: ptsetid = int((projection0[row_id]-projection0min) / slice_step) pointsets[ptsetid] = np.vstack((pointsets[ptsetid], npoints[row_id])) ELAPSE_SEG += time.clock() - starttime partial_surfs = [] for ptset in pointsets: print "before segment", len(partial_surfs) if len(ptset) > 0: partial_surfs, _ = identifysurf(ptset, AdaSurfConfig( {'origin_points': config.origin_points, 'most_combination_points': config.most_combination_points, 'same_threshold': config.same_threshold}), donorm = False, surfs = partial_surfs) # # 注意这里不能简单地extend,应当将surfs和partial_surfs去重 # if len(partial_surfs) > 0: # surfs.extend(partial_surfs) print "after segment", len(partial_surfs) surfs.extend(partial_surfs) # print np.std(pca_md.Y[:, 0]),np.std(pca_md.Y[:, 1]),np.std(pca_md.Y[:, 2]) # print pca_md.Y[:, 0] # print projection0.shape, npoints.shape # print np.linalg.norm(pca_md.Wt[0]) # fig = pl.figure() # ax = fig.add_subplot(111, projection='3d') # ax.scatter(npoints[:, 0], npoints[:, 1], npoints[:, 2], c='r') # x = np.linspace(0, pca_md.Wt[0, 0], 300) # y = np.linspace(0, pca_md.Wt[0, 1], 300) # z = np.linspace(0, pca_md.Wt[0, 2], 300) # ax.plot(x, y, z, c='k') # x = np.linspace(0, pca_md.Wt[1, 0], 300) # y = np.linspace(0, pca_md.Wt[1, 1], 300) # z = np.linspace(0, pca_md.Wt[1, 2], 300) # ax.plot(x, y, z, c='g') # pl.show() return surfs, npoints
def _run_pca(self): self.feature_matrix = [] self.item_colors = [] for well_key in self.data_provider.positions: for pos_key in self.data_provider.positions[well_key]: position = self.data_provider.get_position(well_key, pos_key) events = position.get_events() for t in events: item_features = t.item_features if item_features is not None: self.feature_matrix.append(item_features) item_colors = t.item_colors if item_colors is not None: self.item_colors.extend(item_colors) print 'number ofevents', len(self.feature_matrix) self.feature_matrix = numpy.concatenate(self.feature_matrix) nan_index = ~numpy.isnan(self.feature_matrix).any(1) self.feature_matrix = self.feature_matrix[nan_index, :] self.item_colors = numpy.asarray(self.item_colors)[nan_index] print self.feature_matrix.shape, self.item_colors.shape temp_pca = mlab.PCA(self.feature_matrix) result = temp_pca.project(self.feature_matrix)[:, :4] for cnt, (i, j) in enumerate([(1, 2), (1, 3), (2, 3), (1, 4)]): self.axes = self.fig.add_subplot(221 + cnt) means = kmeans(result[:, [i - 1, j - 1]], 7)[0] self.axes.scatter(result[:, i - 1], result[:, j - 1], c=self.item_colors) self.axes.plot(means[:, 0], means[:, 1], 'or', markeredgecolor='r', markerfacecolor='None', markersize=12, markeredgewidth=3) self.axes.set_xlabel('Principle component %d' % i) self.axes.set_ylabel('Principle component %d' % j) self.axes.set_title('Events in PCA Subspace %d' % (cnt + 1))
def evaluate(self):
    """
    Run one PCA per (state-variable, mode) slice of the time_series.

    The data array is indexed as [time, state-variable, node, mode].
    For every state-variable/mode pair the [time, node] slice is handed
    to mlab.PCA; its variance fractions and weight matrix are collected
    and returned wrapped in a mode_decompositions.PrincipalComponents.

    Raises Exception when there are fewer time points than nodes.
    """
    owner_name = self.__class__.__name__+".time_series"
    self.time_series.trait["data"].log_debug(owner = owner_name)

    shape = self.time_series.data.shape
    n_tpts, n_nodes = shape[0], shape[2]

    #Need more measurements than variables
    if n_tpts < n_nodes:
        msg = "PCA requires a longer timeseries (tpts > number of nodes)."
        LOG.error(msg)
        raise Exception(msg)

    n_vars, n_modes = shape[1], shape[3]

    #(nodes, nodes, state-variables, modes)
    weights_shape = (n_nodes, n_nodes, n_vars, n_modes)
    LOG.info("weights shape will be: %s" % str(weights_shape))

    #(nodes, state-variables, modes)
    fractions_shape = (n_nodes, n_vars, n_modes)
    LOG.info("fractions shape will be: %s" % str(fractions_shape))

    weights = numpy.zeros(weights_shape)
    fractions = numpy.zeros(fractions_shape)

    #One decomposition for each state-var & mode.
    for mode in range(n_modes):
        for var in range(n_vars):
            decomposition = mlab.PCA(self.time_series.data[:, var, :, mode])
            fractions[:, var, mode] = decomposition.fracs
            weights[:, :, var, mode] = decomposition.Wt

    util.log_debug_array(LOG, fractions, "fractions")
    util.log_debug_array(LOG, weights, "weights")

    return mode_decompositions.PrincipalComponents(
        source = self.time_series,
        fractions = fractions,
        weights = weights,
        use_storage = False)
def main():
    """Compare three corpora in PCA space and in raw feature space.

    Reads 'corpus1.txt' (sonnets), 'corpus2.txt' (Anna Karenina) and
    'corpus3.txt' (news), runs a standardized PCA over the stacked
    feature rows and writes:

    * 'PCA-result.png' -- first two principal components;
    * one '<feature>_vs_<feature>.png' scatter plot for every pair
      among the three features with the largest absolute ``p.s`` value.

    In every plot sonnets are green circles, Anna Karenina blue squares
    and news red crosses.
    """
    print('Читаю тексты...')
    sonets = Text('corpus1.txt')
    anna = Text('corpus2.txt')
    news = Text('corpus3.txt')

    print('Считаю характеристики...')
    sonets_data = sonets.make_features()
    anna_data = anna.make_features()
    news_data = news.make_features()

    print('Использую метод главных компонент...')
    # Row order in the stacked matrix: sonnets, then novel, then news.
    data = np.vstack((sonets_data, anna_data, news_data))
    p = mlab.PCA(data, True)
    N1 = len(sonets_data)
    N2 = len(anna_data)

    print('Значимые признаки:')
    # (name, score, column index) triples; the column index is counted
    # instead of being capped at a hard-coded 12 features.
    main_features = sorted(zip(Text.features, p.s, itertools.count()),
                           key=lambda triple: -abs(triple[1]))[:3]
    print('\r\n'.join(' ' + par[0] + ' - ' + str(par[1]) for par in main_features))

    print('Рисую графики...')
    plt.figure()
    plt.plot(p.Y[:N1, 0], p.Y[:N1, 1], 'og',
             p.Y[N1:N2 + N1, 0], p.Y[N1:N2 + N1, 1], 'sb',
             p.Y[N2 + N1:, 0], p.Y[N2 + N1:, 1], 'xr')
    plt.savefig('PCA-result.png')
    plt.close()

    for i, j in itertools.combinations(main_features, 2):
        plt.figure()
        # x is feature i and y is feature j for all three corpora. The
        # news series used to plot column j against itself.
        plt.plot(data[:N1, i[2]], data[:N1, j[2]], 'og',
                 data[N1:N2 + N1, i[2]], data[N1:N2 + N1, j[2]], 'sb',
                 data[N2 + N1:, i[2]], data[N2 + N1:, j[2]], 'xr')
        plt.savefig('%s_vs_%s.png' % (Text.features[i[2]], Text.features[j[2]]))
        plt.close()

    print('Ура! Конец!')
def train(d, attrCount):
    """Estimate per-class mean vectors and covariances in PCA space.

    Up to 100 mean-centered, flattened images are stacked per class
    label (0-9); every further image goes to the returned test list.
    For each class the stacked matrix is transposed, passed through
    mlab.PCA (standardize=False), and the first ``attrCount`` rows of
    the transposed projection are summarized.

    d         -- NOTE(review): this argument is overwritten by read()
                 below and therefore ignored; confirm whether the
                 caller's data should be used instead.
    attrCount -- number of leading projection rows to keep

    Returns (u, sigma, test): per-class lists of row means, per-class
    covariance matrices, and the images not used for training.

    Each element of the data is indexed as img[0] (class label) and
    img[1] (pixel array).
    """
    n_classes = 10
    matrix = [0] * n_classes
    count = [0] * n_classes
    test = []
    u = []
    sigma = []

    d = read()
    for img in d:
        label = img[0]
        # Subtract mean
        centered = np.reshape(np.subtract(img[1], np.mean(img[1])), -1).T
        if count[label] == 0:
            # First sample of this class starts the matrix. This is now
            # exclusive with the branch below, which used to stack the
            # same sample a second time.
            matrix[label] = centered
            count[label] = 1
        elif count[label] < 100:
            count[label] += 1
            matrix[label] = np.vstack((matrix[label], centered))
        else:
            test.append(img)

    for num in matrix:
        pca = (mlab.PCA(num.T, standardize=False)).Y.T
        pca = pca[0:attrCount]
        cov = np.cov(pca.T)
        show(cov)
        sigma.append(cov)
        u.append([np.mean(pca[i]) for i in range(attrCount)])

    return u, sigma, test
def test_colinear_pca():
    """PCA of the built-in colinear data set has nothing beyond two components.

    Both the variance fractions and the projected coordinates from the
    third component onward must be (numerically) zero.
    """
    colinear = mlab.PCA._get_colinear()
    model = mlab.PCA(colinear)

    trailing_fracs = model.fracs[2:]
    assert np.allclose(trailing_fracs, 0.)

    trailing_coords = model.Y[:, 2:]
    assert np.allclose(trailing_coords, 0.)
c.append(y) c.append(len(i.split())) c2data.append(c) anna_data = [] sonet_data = [] for t in range(len(dataa)): anna_data.append(c1data[t] + dataa[t]) for t in range(len(datas)): sonet_data.append(c2data[t] + datas[t]) data = np.vstack((anna_data, sonet_data)) N = len(anna_data) p = mlab.PCA(data, True) a = p.Wt[0] di = {} di[a[0]] = 'A' di[a[1]] = 'S' di[a[2]] = 'V' di[a[3]] = 'ADV' di[a[4]] = 'SPRO' di[a[5]] = 'len_in_letters' di[a[6]] = 'len_in_diff_letters' di[a[7]] = 'len_in_vowels' di[a[8]] = 'median_length_of_words' di[a[9]] = 'mean_length_of_words' di[a[10]] = 'median_vowels_in_words' di[a[11]] = 'length_of_sent_in_words'
# -*- coding: utf-8 -*- """ Created on Tue Nov 15 17:47:42 2011 @author: Sat Kumar Tomer @website: www.ambhas.com @email: [email protected] """ import matplotlib.mlab as ml import numpy as np mean = [0, 0, 0] cov = [[1, 0.2, 0.5], [0.2, 1, 0.8], [0.5, 0.8, 1]] print(np.array(cov)) data = np.random.multivariate_normal(mean, cov, 100) foo = ml.PCA(data)
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
                   draw_distances = draw_all_easy,
                   draw_clusters = draw_all_easy,
                   draw_single_cluster = draw_all_hard):
    '''
    Cluster the sequences of an Rfam family and return them grouped by
    cluster.

    1) Fetch the family alignment, tree and infos with rfam.get_fam.
    2) Get pairwise distances through mem.getOrSet(setDistances, ...)
       and cut them into clusters with
       maxclust_dists(k = 5, method = 'complete').
    3) Optionally save diagnostic figures.

    kwds:
      rfid                  Rfam family id.
      reset                 Forwarded to mem.getOrSet.
      tree                  NOTE(review): this argument is overwritten by
                            the tree returned from rfam.get_fam before it
                            is ever read, so it has no effect.
      draw_distances        If true, save a scatter plot of tree distance
                            vs. Levenshtein distance with its R-squared.
      draw_clusters         If true, save a scatter plot of the sequences
                            in the first two PCs, colored by cluster.
      draw_single_cluster   Never read in this function.

    returns:
      A list with one entry per cluster; each entry is the list of
      alignment records (ali[idx]) belonging to that cluster.

    NOTE(review): uses cgrps.values()[...] indexing, which relies on
    dict.values() returning a list.
    '''
    rutils = utils
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        # Compare the two distance measures element by element.
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        # lin[2] is squared to give the annotated R-squared value.
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. \nBioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')
        ax.scatter(dtf, dlf, 100)

        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid,
                         register = rfid,
                         on_fail = 'compute',
                         reset = reset)

    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # Shift the labels down by one; they index the color table below
    # (presumably maxclust_dists labels start at 1 -- confirm).
    clusters -= 1

    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')
        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)

        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    #now take the largest cluster and do the analysis.
    # Group (sequence index, cluster label) pairs by label. The pairs are
    # sorted by label first because groupby only merges adjacent items.
    cgrps = dict([ (k, list(g)) for k , g in it.groupby(\
                sorted( list(enumerate(clusters)),key = lambda x: x[1]),
                key = lambda x: x[1])])
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ]
    csize = len(cluster_seqs)
    seqs =[ali[c] for c in cluster_seqs]

    # Disabled diagnostic plot for the largest cluster (never runs).
    if 0:
        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))]

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. \nPC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        # NOTE(review): the format string has one placeholder but two
        # arguments; the second value is silently dropped.
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')

        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2), color = colors, alpha = .2)

        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)

    # One list of sequence indices per cluster, then the matching records.
    clusters_final = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
def run(self):
    """ Task run method.

        Computes the principal component decomposition of the input
        images and populates the output eigenimages and projection
        matrix.

        Every 2-D input image is flattened into one column of the PCA
        input, so each pixel is one observation. The task adds a
        statistics table, the projection matrix, the covariance matrix
        and one image BDP per eigenimage to its outputs, and stores a
        summary entry under the key "pca".

        Note that clipping is applied to the input arrays in place, so
        the residual cross-check near the end compares against the
        clipped inputs.

        Parameters
        ----------
        None

        Returns
        -------
        None
    """
    self._summary = {}

    # BDP output uses the alias name if provided, else a flow-unique one.
    stem = self._alias
    if not stem:
        stem = "pca%d" % (self.id())

    inum = 0
    data = []
    icols = []
    for ibdp in self._bdp_in:
        # Convert input CASA images to numpy arrays.
        istem = ibdp.getimagefile(bt.CASA)
        ifile = ibdp.baseDir() + istem
        # Column label: file name without extension, reduced to its
        # directory part when it has one.
        icols.append(os.path.splitext(istem)[0])
        if os.path.dirname(icols[-1]):
            icols[-1] = os.path.dirname(icols[-1])  # Typical line cube case.
        img = admit.casautil.getdata(ifile, zeromask=True).data
        data.append(img)
        admit.logging.info("%s shape=%s min=%g max=%g" %
            (icols[-1], str(img.shape), np.amin(img), np.amax(img)))
        # inum still indexes the image just appended at this point.
        assert len(data[0].shape) == 2, "Only 2-D input images supported"
        assert data[0].shape == data[inum].shape, "Input shapes must match"
        inum += 1

    # At least two inputs required for meaningful PCA!
    assert inum >= 2, "At least two input images required"

    # Each 2-D input image is a plane in a single multi-color image.
    # Each color multiplet (one per pixel) is an observation.
    # For PCA we collate the input images into a vector of observations.
    shape = data[0].shape
    npix = shape[0] * shape[1]
    clip = self.getkey('clipvals')
    if not clip:
        clip = [0 for i in range(inum)]
    assert len(clip) >= inum, "Too few clipvals provided"

    # Clip input values and stack into a vector of observations.
    pca_data = []
    for i in range(inum):
        # nd aliases data[i]; the clipping below modifies it in place.
        nd = data[i]
        nd[nd < clip[i]] = 0.0
        pca_data.append(np.reshape(nd, (npix,1)))
    pca_in = np.hstack(pca_data)
    pca = mlab.PCA(pca_in)

    # Input statistics and output variance fractions.
    #print "fracs:", pca.fracs
    #print "mean:", pca.mu
    #print "sdev:", pca.sigma
    obdp = admit.Table_BDP(stem+"_stats")
    obdp.table.setData(np.vstack([pca.mu, pca.sigma,pca.fracs]).T)
    obdp.table.columns = ["Input mean", "Input deviation",
                          "Eigenimage variance fraction"]
    obdp.table.description = "PCA Image Statistics"
    self.addoutput(obdp)

    # Pre-format columns for summary output.
    # This is required when mixing strings and numbers in a table.
    # (NumPy will output the array as all strings.)
    table1 = admit.Table()
    table1.setData(np.vstack([[i for i in range(inum)],
                              icols,
                              ["%.3e" % x for x in pca.mu],
                              ["%.3e" % x for x in pca.sigma],
                              ["%s_eigen/%d.im" % (stem, i)
                               for i in range(inum)],
                              ["%.4f" % x for x in pca.fracs]]).T)
    table1.columns = ["Index", "Input", "Input mean", "Input deviation",
                      "Eigenimage", "Eigenimage variance fraction"]
    table1.description = "PCA Image Statistics"

    # Projection matrix (eigenvectors).
    #print "projection:", pca.Wt
    obdp = admit.Table_BDP(stem + "_proj")
    obdp.table.setData(pca.Wt)
    obdp.table.columns = icols
    obdp.table.description = \
        "PCA Projection Matrix (normalized input to output)"
    self.addoutput(obdp)

    # Covariance matrix.
    # rowvar=0: each column of pca.a is one variable (one input image).
    covar = np.cov(pca.a, rowvar=0, bias=1)
    #print "covariance:", covar
    obdp = admit.Table_BDP(stem + "_covar")
    obdp.table.setData(covar)
    obdp.table.columns = icols
    obdp.table.description = "PCA Covariance Matrix"
    self.addoutput(obdp)

    # Collate projected observations into eigenimages and save output.
    os.mkdir(self.baseDir()+stem+"_eigen")
    pca_out = np.hsplit(pca.Y, inum)
    odata = []
    for i in range(inum):
        ofile = "%s_eigen/%d" % (stem, i)
        img = np.reshape(pca_out[i], shape)
        odata.append(img)
        #print ofile, "shape, min, max:", img.shape, np.amin(img), np.amax(img)

        aplot = admit.util.APlot(figno=inum, abspath=self.baseDir(),
                                 ptype=admit.util.PlotControl.PNG)
        aplot.map1(np.rot90(img), title=ofile, figname=ofile)
        aplot.final()

        # Currently the output eigenimages are stored as PNG files only.
        # NOTE(review): the comment above looks stale -- the lines below
        # also write a ".im" image and register it with a CASA format
        # descriptor. ifile here is the last input file left over from
        # the reading loop above.
        admit.casautil.putdata_raw(self.baseDir()+ofile+".im", img, ifile)

        oimg = admit.Image()
        oimg.addimage(admit.imagedescriptor(ofile+".im", format=bt.CASA))
        oimg.addimage(admit.imagedescriptor(ofile+".png", format=bt.PNG))
        obdp = admit.Image_BDP(ofile)
        obdp.addimage(oimg)
        self.addoutput(obdp)

    # As a cross-check, reconstruct input images and compute differences.
    for k in range(inum):
        # Weighted sum of all eigenimages using column k of Wt ...
        ximg = pca.Wt[0][k]*odata[0]
        for l in range(1,inum):
            ximg += pca.Wt[l][k]*odata[l]
        # ... then undo the normalization with the input mean/deviation.
        ximg = pca.mu[k] + pca.sigma[k]*ximg
        admit.logging.regression("PCA: %s residual: " % icols[k] +
                                 str(np.linalg.norm(ximg - data[k])))

    # Collect large covariance values for summary.
    cvmin = self.getkey('covarmin')
    cvsum = []
    cvmax = 0.0
    # Upper triangle only: each unordered input pair is visited once.
    for i in range(inum):
        for j in range(i+1, inum):
            if abs(covar[i][j]) >= cvmax:
                cvmax = abs(covar[i][j])
            if abs(covar[i][j]) >= cvmin:
                cvsum.append([icols[i], icols[j], "%.4f" % (covar[i][j])])
    admit.logging.regression("PCA: Covariances > %.4f: %s (max: %.4f)" % (cvmin,str(cvsum),cvmax))

    table2 = admit.Table()
    table2.columns = ["Input1", "Input2", "Covariance"]
    table2.setData(cvsum)
    table2.description = "PCA High Covariance Summary"

    keys = "covarmin=%.4f clipvals=%s" % (cvmin, str(clip))
    self._summary["pca"] = admit.SummaryEntry([table1.serialize(),
                                               table2.serialize()
                                              ],
                                              "PrincipalComponent_AT",
                                              self.id(True), keys)
def surf_segmentation(points, config, paint_when_end=False): global ELAPSE_SEG config.slice_count = min(int(len(points) / config.origin_points), config.slice_count) assert len(points) / config.slice_count >= config.origin_points adasurconfig = AdaSurfConfig({ 'origin_points': config.origin_points, 'most_combination_points': config.most_combination_points, 'same_threshold': config.same_threshold, 'filter_rate': config.filter_rate, 'ori_adarate': config.ori_adarate, 'step_adarate': config.step_adarate, 'max_adarate': config.max_adarate, 'pointsame_threshold': config.pointsame_threshold, 'filter_count': config.filter_count, 'weak_abort': config.weak_abort }) surfs = [] slice_fig = [] npoints = point_normalize(points) starttime = time.clock() xlim = (np.min(npoints[:, 0]), np.max(npoints[:, 0])) ylim = (np.min(npoints[:, 1]), np.max(npoints[:, 1])) zlim = (np.min(npoints[:, 2]), np.max(npoints[:, 2])) pca_md = mlab.PCA(np.copy(npoints)) projection0_direction = None # projection0_direction = pca_md.Y[0] # projection0 = np.inner(projection0_direction, npoints) projection0 = npoints[:, 0] if config.split_by_count: step_count = len(projection0) / config.slice_count pointsets = [np.array([]).reshape(0, 3)] * config.slice_count sorted_projection0_index = np.argsort(projection0) current_slot_count, ptsetid = 0, 0 for index in sorted_projection0_index: pointsets[ptsetid] = np.vstack( (pointsets[ptsetid], npoints[index, :])) current_slot_count += 1 if current_slot_count > step_count: current_slot_count = 0 ptsetid += 1 else: projection0min, projection0max = np.min(projection0), np.max( projection0) step_len = (projection0max - projection0min) / config.slice_count pointsets = [np.array([]).reshape(0, 3)] * config.slice_count for i in xrange(len(projection0)): if projection0[i] == projection0max: ptsetid = config.slice_count - 1 else: ptsetid = int((projection0[i] - projection0min) / step_len) pointsets[ptsetid] = np.vstack((pointsets[ptsetid], npoints[i])) # 
random.shuffle(pointsets) partial_surfs, fail = [], np.array([]).reshape(0, 3) # for (ptset, ptsetindex) in zip(pointsets, range(len(pointsets))): # print "slice", len(ptset), xlim, ylim, zlim # paint_points(ptset, xlim = xlim, ylim = ylim, zlim = zlim) for (ptset, ptsetindex) in zip(pointsets, range(len(pointsets))): print "--------------------------------------" print "before segment", ptsetindex, '/', len(pointsets) print 'derived surfs:' # print '---000', ptset.shape, np.array(fail).shape, np.array(fail), fail if fail == None: allptfortest = np.array(ptset) else: allptfortest = np.vstack((ptset, np.array(fail).reshape(-1, 3))) print "len of surf is: ", len( partial_surfs), ", len of points is: ", len(allptfortest) if allptfortest != None and len(allptfortest) > 0: partial_surfs, _, fail, extradata = identifysurf( allptfortest, adasurconfig, donorm=False, surfs=partial_surfs, title=str(ptsetindex), paint_when_end=paint_when_end, current_direction=projection0_direction) if paint_when_end: slice_fig.append(extradata[0]) if fail == None: print "after segment", ptsetindex, "len of surf", len( partial_surfs), "fail is None", fail else: print "after segment", ptsetindex, "len of surf", len( partial_surfs), "len of fail", len(fail) for x in partial_surfs: x.printf() surfs.extend(partial_surfs) # fig = pl.figure() # ax = fig.add_subplot(111, projection='3d') # ax.scatter(npoints[:, 0], npoints[:, 1], npoints[:, 2], c='r') # x = np.linspace(0, pca_md.Wt[0, 0] * 100, 300) # y = np.linspace(0, pca_md.Wt[0, 1] * 100, 300) # z = np.linspace(0, pca_md.Wt[0, 2] * 100, 300) # ax.plot(x, y, z, c='k') # x = np.linspace(0, pca_md.Wt[1, 0] * 100, 300) # y = np.linspace(0, pca_md.Wt[1, 1] * 100, 300) # z = np.linspace(0, pca_md.Wt[1, 2] * 100, 300) # ax.plot(x, y, z, c='g') # pl.show() return surfs, npoints, (slice_fig, )
def test_colinear_pca(self):
    """PCA of the built-in colinear data set has nothing beyond two components.

    Variance fractions and projected coordinates from the third
    component onward must be zero within an absolute tolerance of 1e-8.
    """
    colinear = mlab.PCA._get_colinear()
    model = mlab.PCA(colinear)

    trailing_fracs = model.fracs[2:]
    np.testing.assert_allclose(trailing_fracs, 0., atol=1e-8)

    trailing_coords = model.Y[:, 2:]
    np.testing.assert_allclose(trailing_coords, 0., atol=1e-8)
def project_structs(structs, ptype ='l', affinities = None, n_comp = None, l = None, vecs = None):
    '''
    Project RNA structures in any one of several ways.

    Different projections require different inputs.

    inputs:
      structs:     structures to project (used by 'l' and 'full_pairs')
      ptype:       ['pca', 'rnd', 'l', 'full_pairs']
      affinities:  aff matrix for pca
      n_comp:      number of components for pca / rnd
      l:           length for l projections, pairs projections and rnd
      vecs:        vecs from l/pairs projection for random projections

    outputs:
      'pca':        the first n_comp columns of the PCA projection of
                    the affinity matrix.
      'l':          array with one struct_project_l(p, l) row per
                    structure.
      'full_pairs': list with one struct_project_l2(p, l) entry per
                    structure.
      'rnd':        vecs multiplied by a random (n_comp, l) matrix whose
                    entries are +-1/sqrt(l). This branch also displays
                    the random matrix with plt.imshow as a side effect.

    raises:
      AssertionError if an input required by the chosen ptype is None.
      Exception for an unknown ptype.

    Required inputs are compared with ``is not None``: array arguments
    compared with ``!= None`` yield an element-wise result that cannot
    be used as an assertion condition.
    '''
    if ptype == 'pca':
        assert affinities is not None
        assert n_comp is not None
        pca_vecs = mlab.PCA(affinities).project(affinities)
        pca_vecs = pca_vecs[:,0:n_comp]
        return pca_vecs
    elif ptype == 'l':
        assert l is not None
        return array([struct_project_l(p, l) for p in structs])
    elif ptype == 'full_pairs':
        assert l is not None
        return [struct_project_l2(p, l) for p in structs]
    elif ptype == 'rnd':
        assert n_comp is not None
        assert vecs is not None
        # l sizes the random matrix, so it is required here as well.
        assert l is not None
        # Entries start as 0/1, then are mapped to -1/+1 and scaled.
        mat = array(np.round(random.rand(n_comp, l)),float)
        mat *= 2
        mat -= 1
        mat/= sqrt(l)
        plt.imshow(mat)
        cvecs = dot(mat,vecs.T).T
        return cvecs
    else:
        raise Exception('Projection type: {0} not yet implemented'.format(ptype))
# <markdowncell> # Principal Components Analysis and Display # ----------------------------------------- # # The first three principal components identify the three major risk factors, and account for 95% of the total variance: # # * The first factor represents an approximate parallel shift # * The second factor represents a twist # * The third factor represents a change in convexity # <codecell> # PCA on rate change zc_pca = ml.PCA(np.diff(zc_rate, axis=0)) fig = plt.figure() fig.set_size_inches(10, 6) ax = fig.add_subplot(121) # compute x-axis limits dtCalc = dtObs[0] ts = get_term_structure(df_libor, dtCalc) (dtMat, zc) = zero_curve(ts, days, dtCalc) dtMin = dtMat[0] dtCalc = dtObs[-1] ts = get_term_structure(df_libor, dtCalc) (dtMat, zc) = zero_curve(ts, days, dtCalc)
# Top words per topic: sort each topic's weights in descending order and
# keep the first num_top_words vocabulary entries.
topic_words = []
for topic in clf.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

# Print at most 15 words per topic.
for t in range(len(topic_words)):
    print("Topic {}: {}".format(t, ' '.join(topic_words[t][:15])))

# Cluster the document-topic matrix into 11 groups (fixed seed).
kmeans = KMeans(n_clusters=11, random_state=0).fit(doctopic)
clusters = kmeans.predict(doctopic)
# Column vector of cluster labels.
clusters = clusters.reshape(-1, 1)
centroid = kmeans.cluster_centers_
clusterid = kmeans.labels_

# Reduce documents and centroids with the same PCA. Using the second
# component's variance fraction as minfrac keeps the axes at least that
# strong.
doctopic_pca = mlab.PCA(doctopic)
cutoff = doctopic_pca.fracs[1]
doctopic_2d = doctopic_pca.project(doctopic, minfrac=cutoff)
centroid_2d = doctopic_pca.project(centroid, minfrac=cutoff)

# One color per cluster (11 entries for the 11 clusters above).
colors = [
    'red', 'green', 'blue', 'yellow', 'black', 'cyan', 'magenta', 'brown',
    'tomato', 'c', 'slateblue'
]

plt.figure()
# Axis limits padded by 0.5 around the projected documents.
plt.xlim([doctopic_2d[:, 0].min() - 0.5, doctopic_2d[:, 0].max() + 0.5])
plt.ylim([doctopic_2d[:, 1].min() - 0.5, doctopic_2d[:, 1].max() + 0.5])
# Hide the tick marks and labels.
plt.xticks([], [])
plt.yticks([], [])
# Draw the cluster centroids.
plt.scatter(centroid_2d[:, 0], centroid_2d[:, 1], marker='o', c=colors, s=100)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# Load the parsed results and work on the transposed table for the PCA.
pca_df = pd.read_csv('../data/parsed/results/DataFrameProto.csv')
pca_T = pca_df.T

# NOTE(review): the columns are addressed with integer labels here
# (pca_df[0]) but with string labels on the transposed frame below
# (pca_T['0']) -- confirm that both lookups match the actual labels.
x = pca_df[0]
y = pca_df[1]
mean_x = np.mean(x)
mean_y = np.mean(y)

pca = mlab.PCA(pca_T)

# Standard deviations used to scale the x axis and place the guides.
sig_x = np.std(pca_T['0'])  # ~ 2.01
sig_y = np.std(pca_T['1'])  # ~ 0.96

plt.figure(1)
# First two principal-component coordinates of every sample.
plt.plot(pca.Y[0:, 0], pca.Y[0:, 1], 'o', alpha=0.5, color='blue')
#plt.axis('equal')
plt.title('Transformed PCA samples')

# x axis spans +-3 sigma with one tick per sigma from -2 to +2.
plt.xlim(xmin=(sig_x * -3), xmax=(sig_x * 3))
plt.xticks(np.arange((sig_x * -2), (sig_x * 3), sig_x))
# Dotted vertical guides at +-1 and +-2 sigma.
plt.axvline(x=sig_x * -2, linestyle='dotted')
plt.axvline(x=sig_x * -1, linestyle='dotted')
plt.axvline(x=sig_x * 1, linestyle='dotted')
plt.axvline(x=sig_x * 2, linestyle='dotted')
#from code import operators import code from code.operators import * from math import * from code.utils.mathlogic import * from code.rencode import * from code.evodevo import * from random import sample import matplotlib.mlab as mlab from numpy import array as nparray import numpy numclasses = 5 allclasses = nparray( [int(c) for c in open('datafiles/MIREXclasses.txt').readlines()]) allfeatures = nparray([ map(lambda t: float(t), l.split(',')) for l in open('datafiles/FMnorm.txt').readlines() ]) pcafeats = mlab.PCA(allfeatures) #print pcafeats.fracs #print pcafeats.Y[0,:] projected = pcafeats.project(allfeatures, 0.03) print projected[:1] numpy.save('datafiles/projectedfeat-03', projected)