def do_fast_ica(pca_first):
    mo1_cj_inverse = numpy.array(mo1_cj).T
    mo2_cj_inverse = numpy.array(mo2_cj).T
    if pca_first:
        mo1_cj_array = mdp.pca(mo1_cj_inverse, input_dim=4, output_dim=3)
        mo2_cj_array = mdp.pca(mo2_cj_inverse, input_dim=4, output_dim=3)
    else:
        mo1_cj_array = mo1_cj_inverse
        mo2_cj_array = mo2_cj_inverse
    a = mdp.fastica(mo1_cj_array)
    b = mdp.fastica(mo2_cj_array)
    return a, b
def _pca(self):
    #self.pca_box_surface_area = 2*( self.pca_lengths[0]*self.pca_lengths[1]
    #                              + self.pca_lengths[1]*self.pca_lengths[2]
    #                              + self.pca_lengths[2]*self.pca_lengths[0] )
    '''
    2 * ( 1/2. * self.pca_lengths[0] * numpy.sqrt(numpy.square(self.pca_lengths[1]/2) + numpy.square(self.pca_lengths[2]/2))
        + 1/2. * self.pca_lengths[0] * numpy.sqrt(numpy.square(self.pca_lengths[2]/2) + numpy.square(self.pca_lengths[1]/2)) )
    '''
    #self.pca_rhombus = self.pca_lengths[0] * numpy.sqrt(numpy.square(self.pca_lengths[2]) + numpy.square(self.pca_lengths[1]))
    mins = [float('inf'), float('inf'), float('inf')]
    maxs = [float('-inf'), float('-inf'), float('-inf')]
    for x in mdp.pca(numpy.array([[compartment.x, compartment.y, compartment.z]
                                  for compartment in self.morphology.compartments])):
        for d in xrange(3):
            if x[d] < mins[d]:
                mins[d] = x[d]
            if x[d] > maxs[d]:
                maxs[d] = x[d]
    self._pca_length_x = maxs[0] - mins[0]
    self._pca_length_y = maxs[1] - mins[1]
    self._pca_length_z = maxs[2] - mins[2]
    self._pca_lengths = (self._pca_length_x, self._pca_length_y, self._pca_length_z)
def tweet_pca_reduce(tweets_train, tweets_test, output_dim):
    # convert dictionary feature vecs to numpy array
    print '--> Converting dictionaries to NumPy arrays'
    train_arr = numpy.array([tweet_features.tweet_dict_to_nparr(t) for (t, s) in tweets_train])
    test_arr = numpy.array([tweet_features.tweet_dict_to_nparr(t) for (t, s) in tweets_test])

    # compute principal components over training set
    print '--> Computing PCA'
    pca_array = mdp.pca(train_arr.transpose(), svd=True, output_dim=output_dim)

    # project both train and test sets to PC space
    print '--> Projecting feature vectors to PC space'
    train_arr = numpy.dot(train_arr, pca_array)
    test_arr = numpy.dot(test_arr, pca_array)

    # convert projected vecs back to reduced dictionaries
    print '--> Converting NumPy arrays to dictionaries'
    reduced_train = zip([tweet_features.tweet_nparr_to_dict(v) for v in train_arr],
                        [s for (t, s) in tweets_train])
    reduced_test = zip([tweet_features.tweet_nparr_to_dict(v) for v in test_arr],
                       [s for (t, s) in tweets_test])

    return (reduced_train, reduced_test)
def pca(self):
    import numpy
    import mdp
    m = morphjongleur.model.morphology.Morphology(
        name=self.name,
        file_origin=self.file_origin,
        description=self.description,
        datetime_recording=self.datetime_recording
    )
    assert m.number_of_compartments == 0
    pca_cs = mdp.pca(numpy.array([[c.x, c.y, c.z] for c in self.compartments]))
    assert self.number_of_compartments == len(pca_cs)
    for i in xrange(self.number_of_compartments):
        m.add_compartment(
            morphjongleur.model.morphology.Compartment(
                self._compartments[i].compartment_id,
                self._compartments[i].compartment_parent_id,
                self._compartments[i].radius,
                x=pca_cs[i][0],
                y=pca_cs[i][1],
                z=pca_cs[i][2]
            )
        )
    assert self.number_of_compartments == m.number_of_compartments
    return m
def eval_func(chromosome):
    """ The evaluation function """
    indices_values = []
    sellTrendVector = []
    buyTrendVector = []
    for gene in chromosome:
        indices_values.append(gene.getResult())
        sellTrendVector.append(gene.getResult()[:tradingGA.sellTrendBeginning])
        buyTrendVector.append(gene.getResult()[:tradingGA.buyTrendBeginning])
    #raw_input("Press ENTER to exit")
    indices_values = indicesNormalizer().normalize(indices_values)
    indices_values = numpy.asarray(indices_values)
    result = mdp.pca(indices_values.T, reduce=True)  #, svd=True)
    sell_center = calculate_centroid_center(result[:4], sellTrendVector)
    buy_center = calculate_centroid_center(result[:4], buyTrendVector)
    #print sell_center, len(sell_center)
    #print buy_center, len(buy_center)
    wynik = numpy.linalg.norm(numpy.asarray(sell_center) - numpy.asarray(buy_center))
    return wynik
def PCA(data):
    NBD = np.zeros((len(data) - 1, len(data[0]) - 1))
    for k in range(1, len(data), 1):
        row = []
        for k1 in range(1, len(data[0]), 1):
            if is_number(data[k][k1]):
                row.append(float(data[k][k1]))
        NBD[k - 1] = row
    pca = mdp.pca(NBD, svd=True)
    return pca
def _reduce_dimensions(self, vectors, output_dim=6):
    """ Scales image data vectors to lower dimension """
    matrix = np.array(vectors, dtype='float32')
    scaled = mdp.pca(matrix, output_dim=output_dim)
    return scaled
def simple_pca(a1):
    M = np.zeros((len(a1), len(a1)))
    for i in range(len(a1)):
        for j in range(i, len(a1)):
            M[i, j] = M[j, i] = (a1[i] - a1[j]) ** 2
    import mdp
    return mdp.pca(M, output_dim=2)
def pca_distance(m):
    # perform PCA, add random noise based on how much the data spread
    comps = mdp.pca(m, output_dim=2)
    a = comps[1, 0]
    comps[:, 0] += jitters(comps[:, 0])
    b = comps[1, 0]
    assert a != b
    comps[:, 1] += jitters(comps[:, 1])
    return comps
def pca_distance(aDistanceMatrix, dim=2):
    comps = mdp.pca(aDistanceMatrix.M, output_dim=dim)
    a = comps[1, 0]
    comps[:, 0] += jitters(comps[:, 0])
    b = comps[1, 0]
    #assert a != b
    comps[:, 1] += jitters(comps[:, 1])
    return comps
def getPcaTransformedMatrix(samples, group2samples, type2intersectGenes, selectedvjs,
                            genetype, abs, outfile, options):
    m, rownames = preparePcaMatrix(samples, group2samples, type2intersectGenes,
                                   selectedvjs, genetype, abs)
    transformedM = mdp.pca(m, output_dim=4)

    # Write to text file
    f = open("%s.txt" % outfile, 'w')
    for i, r in enumerate(transformedM):
        f.write("%s\t%s\t%s\n" % (rownames[i][0], rownames[i][1],
                                  "\t".join([str(c) for c in r])))
    f.close()

    # Draw plot:
    drawPca(rownames, transformedM, outfile, options)
def Main(self, model):
    # self.model = model
    data = array(model.GetCurrentData()[:])
    k = wx.GetNumberFromUser("PCA Dialog", "Enter number of principal components", "k", 1)
    pca_data = mdp.pca(data, output_dim=k)
    # ica_data = r.fastICA(data, k, alg_typ="deflation", fun="logcosh", alpha=1,
    #                      method="R", row_norm=0, maxit=200, tol=0.0001, verbose=1)
    fields = ['Comp%02d' % c for c in range(1, k + 1)]
    model.updateHDF('PcaPY', pca_data, fields=fields)
def computeClusterCentre(chromosome, trendBeginning):
    indices_values = []
    trendVector = []
    for gene in chromosome:
        indices_values.append(gene.getResult())
        trendVector.append(gene.getResult()[:trendBeginning])
    indices_values = indicesNormalizer().normalize(indices_values)
    indices_values = numpy.asarray(indices_values)
    result = mdp.pca(indices_values.T, reduce=True)
    center = calculate_centroid_center(result[:4], trendVector)
    return center
def PCAAlg(x, fps):
    global lastValue
    global counter
    global pca_bpms
    primo = True
    prova = -1
    x = np.transpose(x)
    #print("dopo x: " + str(x))
    y = mdp.pca(x)
    #print("pca: " + str(y))
    secondComponent = y[:, 1]
    #print("second: " + str(secondComponent))
    freqs, pruned = searchFreqs(secondComponent, fps, len(secondComponent))
    prova, index = calcolaProssimaFreqSensata(freqs, pruned)
    #print("pca: " + str(prova))
    pca_bpms.append(prova)
def plot_clusters(self, spikes, noise_cov=None):
    """:spikeplot.cluster: and :spikeplot.cluster_projection: plots

    There will be two plots visualizing the clustering and discrimination
    of the sorting. One showing the clustering of units (scatter plot
    using the first two principal components). The initial cluster labels
    are preserved as colorization in the projected data. Additionally
    there will be a plot showing the projection of each cluster coupling
    onto the vector connecting the corresponding cluster means/centers.

    :type spikes: dict
    :param spikes: one set of waveforms per unit {k:[n,samples]}
    :type noise_cov: ndarray
    :param noise_cov: noise covariance matrix compatible with the
        dimension of individual observations in :spikes:
    """
    # prepare data
    tf = sum(self.parameters['cut'])
    # TODO: prewhiten !!!
    data_stacked = pca(sp.vstack(spikes.values()), output_dim=4)
    data = {}
    idx = 0
    for k, v in spikes.items():
        n = v.shape[0]
        data[k] = data_stacked[idx:idx + n]
        idx += n

    # produce scatter plots
    for pcs in [(0, 1), (2, 3)]:
        self.result.append(
            cluster(
                data,
                data_dim=pcs,
                plot_mean=True,
                title='cluster plot',
                xlabel='PC%s' % (pcs[0] + 1),
                ylabel='PC%s' % (pcs[1] + 1)))

    # cluster projection
    self.result.append(cluster_projection(data))
def compute(self, waveforms, sampling_rate=None, output_dim=2,
            start_sample=0, num_samples=0):
    """Computes PCA of waveforms concatenated across recording points.

    waveforms : ndarray of waveforms, of shape
        (N_spikes, N_recordingpoints, len(waveform))
    sampling_rate : not used
    output_dim : Number of features (eigenvalues) to return per waveform
    start_sample : Index of first sample in each waveform to slice out
        to use for PCA
    num_samples : Number of samples of each waveform to use for PCA.
        The default is '0', which means to use all samples, regardless
        of the value of `start_sample`.

    Returns : pca_mat, a matrix of components, of shape
        (N_spikes, N_features)
    """
    lenwf = waveforms.shape[2]
    if num_samples > 0:
        # We're not using all samples
        if start_sample < 0 or start_sample >= lenwf:
            # garbage input, use all samples
            print "warning: start_sample must be in [0, %d)" % lenwf
            start_sample = 0
        # slice
        waveforms = waveforms[:, :, start_sample:(start_sample + num_samples)]

    # reshape into the format PCA expects. Each row is now a concatenation
    # of waveforms from each channel in the group.
    waveforms2 = waveforms.reshape(waveforms.shape[0],
                                   waveforms.shape[1] * waveforms.shape[2])

    # do PCA and return results in (N_spikes, N_features) shape
    pca_mat = mdp.pca(waveforms2, output_dim=output_dim)
    return pca_mat
def tweet_pca_reduce(tweets_train, tweets_test, output_dim):
    # convert dictionary feature vecs to numpy array
    print '--> Converting dictionaries to NumPy arrays'
    train_arr = numpy.array([tweet_dict_to_nparr(t) for (t, s) in tweets_train])
    test_arr = numpy.array([tweet_dict_to_nparr(t) for (t, s) in tweets_test])

    # compute principal components over training set
    print '--> Computing PCA'
    pca_array = mdp.pca(train_arr.transpose(), svd=True, output_dim=output_dim)

    # project both train and test sets to PC space
    print '--> Projecting feature vectors to PC space'
    train_arr = numpy.dot(train_arr, pca_array)
    test_arr = numpy.dot(test_arr, pca_array)

    # convert projected vecs back to reduced dictionaries
    print '--> Converting NumPy arrays to dictionaries'
    reduced_train = zip([tweet_nparr_to_dict(v) for v in train_arr],
                        [s for (t, s) in tweets_train])
    reduced_test = zip([tweet_nparr_to_dict(v) for v in test_arr],
                       [s for (t, s) in tweets_test])

    return (reduced_train, reduced_test)
def reduce_dimensions(myarray):
    # with the standard test input the default settings raise:
    # "Covariance matrix may be singular. Try instantiating the node with svd=True."
    return mdp.pca(myarray, svd=True)
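The error quoted in the comment above is what PCA reports when the covariance matrix of the data is rank-deficient. A minimal sketch of that situation, using a hypothetical random matrix with a duplicated column, shows how svd=True lets mdp.pca proceed anyway:

import numpy as np
import mdp

# hypothetical rank-deficient input: the third column duplicates the first,
# so the covariance matrix is singular and the default eigensolver may refuse to run
x = np.random.random((100, 3))
x[:, 2] = x[:, 0]

# SVD-based PCA tolerates the singular covariance matrix
y = mdp.pca(x, svd=True, output_dim=2)
print(y.shape)  # (100, 2)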
import mdp

# x is matrix of all instances and features
y = mdp.pca(x)  # possibly check whether mdp.ica yields anything useful
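As a minimal, self-contained sketch of the call above (with a hypothetical random matrix standing in for x), mdp.pca expects instances on the rows and features on the columns, and returns the data projected onto the leading principal components:

import numpy as np
import mdp

# hypothetical data matrix: 200 instances (rows) x 5 features (columns)
x = np.random.random((200, 5))

# project onto the first two principal components
y = mdp.pca(x, output_dim=2)
print(y.shape)  # (200, 2)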
def featureSelection(trainData, labels, featureSelectionMechanism, numFeatures,
                     minNumSongs, maxNumSongs, trainSongs, featureExtractor):
    """
    Given the feature set of example, will give a reduced feature set.
    @param list of training data
    @param list of strings - a list of the unique labels
    @param string - type of feature selection we want to perform
    @param number of features we want to choose
    @param number of min songs for num_songs feature selection
    @param number of max songs for num_songs feature selection
    @return list with a reduced set of features
    """
    informationGains = []
    featureNames = []
    featureLibraryInfo = []
    featureLibrary = []

    # Calculate all of the features (not just those from the example)
    print("Calculate all of the given features")
    allFeatures = Counter()
    featureArray = [fs for (fs, label) in trainData]
    for featureSet in featureArray:
        allFeatures.update(featureSet)

    if numFeatures < len(allFeatures) and numFeatures != 0:
        if featureSelectionMechanism == "information_gain":
            selected_features_info = []
            selected_features = []
            print("Using information gain to select features")
            # Loop through all of the features and calculate the information gain for each
            for feature in allFeatures:
                print("Calculating information gain for %s " % feature)
                informationGains.append(informationGain(trainData, allFeatures, feature, labels))
                featureNames.append(feature)
            informationGains = np.array(informationGains)
            sortedargs = np.argsort(informationGains)
            featureNames = [featureNames[i] for i in sortedargs]
            print informationGains
            #informationGains.reverse()
            #featureNames.reverse()
            # Add the top numFeatures to the counter.
            # If requesting too many features, change the number of requested features.
            if numFeatures > len(featureNames):
                numFeatures = len(featureNames)
            for i in range(0, numFeatures):
                selected_features_info.append(informationGains[i])
                selected_features.append(featureNames[i])
            # print featureLibraryInfo
        elif featureSelectionMechanism == "num_songs":
            print("Using min/max song metric to select features")
            selected_features = getDict(minNumSongs, maxNumSongs, trainSongs, featureExtractor)
        else:
            print("Using PCA to select features")
            # Create a matrix with the relevant labels
            allFeaturePairList = list(allFeatures.items())
            allFeatureKeyList = [pair[0] for pair in allFeaturePairList]
            index = 0
            data_features_matrix = np.zeros((len(trainData), len(allFeatures)))
            print("Creating the matrix for PCA input")
            for (features, label) in trainData:
                # Loop through each feature and populate matrix
                for feature in features:
                    data_features_matrix[index][allFeatureKeyList.index(feature)] = allFeatures[feature]
                index = index + 1
            # Run PCA to reduce feature size.
            # print(data_features_matrix)
            print("Using PCA to reduce the number of features")
            reduced_features = mdp.pca(transpose(data_features_matrix), output_dim=2, svd=True)
            u1 = reduced_features[:, 1]
            order = np.argsort(u1)[::-1]
            order = order[1:numFeatures]
            print("Populate the selected_features based on representation in first principal component")
            selected_features = [allFeatureKeyList[index] for index in order]
            print(selected_features)
    else:
        for feature in allFeatures:
            featureNames.append(feature)
        selected_features = featureNames

    # Return the selected features regardless of algorithm used.
    return selected_features
def runPCA(paramFile):
    data = np.loadtxt(paramFile)
    y = mdp.pca(data, reduce=True)
    print y
def pca(data, singleValueDecomp=True):
    return mdp.pca(data, svd=singleValueDecomp)
    if ideology != prev:
        stats.append([])
        cursor2 = connection.cursor()
        cursor2.execute("select ideology from ideology where id=%s" % ideology)
        idrow = cursor2.fetchone()
        idname = idrow[0]
        if idname == "":
            idname = "[ideology #%d]" % ideology
        ideologies.append(idname)
    stats[-1].append(float(row[2]))
    prev = ideology
    row = cursor.fetchone()

raw = array(stats)
cooked = mdp.pca(raw, output_dim=2)  # see http://nullege.com/codes/search/mdp.pca
(xmax, ymax) = cooked.max(0)  # max value in each column vector of y, see http://mathesaurus.sourceforge.net/numeric-numpy.html
(xmin, ymin) = cooked.min(0)  # And min. These will be used to interpolate the x,y coordinates for plotting

idmap = Image.new("RGB", (width + 240, height + 12), (128, 128, 128))
draw = ImageDraw.Draw(idmap)
for i in range(len(ideologies)):
    ts = draw.textsize(ideologies[i])
    # center the name over its coordinates
    x = width * (cooked[i, 0] - xmin) / (xmax - xmin) - math.trunc(ts[0] / 2) + 120
    y = height * (cooked[i, 1] - ymin) / (ymax - ymin) - math.trunc(ts[1] / 2) + 6
    newcolor = []
    for j in range(3):
        # generate a random color, one that contrasts w. midtone gray background
        newrand = random.random() + random.random()
from mdp.nodes import RBMNode
import mdp
from numpy import *
import time
import read_spro

X = read_spro.load_mfcc_file()
rbm = RBMNode(10, X.shape[1])
x2 = X.dot(X.T)
print x2.shape
mdp_pca = mdp.pca(x2)
print X.shape
print "read data" for line in co_occ_file: if count < 100: count += 1 line = line.replace("\n", "") instance = line.split(" ") if (indexes[instance[0]] == -1): indexes[instance[0]] = current_index current_index += 1 if (indexes[instance[1]] == -1): indexes[instance[1]] = current_index current_index += 1 matrix[indexes[instance[0]], indexes[instance[1]]] = float(instance[2]) else: break if count % 10000 == 0: print count, "entries processed" co_occ_file.close() if current_index != nr_functional_words - 1: print "not the same", current_index, nr_functional_words print "perform pca" matrix = mdp.pca(matrix, 30) print "pca done, start tsne" #Y = tsne.tsne(X, no_dims, perplexity) y = tsne.tsne(matrix, 2, nr_functional_words, perplexity)
random.seed([3])  # setting the random seed
Y = get_headers()
for i in Y:
    F = split()
    for j in K:
        # should return a number between 0 and Y.size()
        # The algorithm should delete a random subset of classes
        keepCols = []
        deletedCols = []
        for jj in K:
            if random.randrange(2) == 1:
                keepCols.append(jj)  # every column has a 50% chance to be deleted
            else:
                deletedCols.append(jj)
        # the 'p' means that's a prime
        Xijp = bootstrap(F[keepCols], 1, len(F) * 0.75)
        # option 2: http://climateecology.wordpress.com/2013/08/19/r-vs-python-speed-comparison-for-bootstrapping/
        Cij = mdp.pca(Xijp)
        # arranging the rotation matrix
        Ri = [[0] * len(K) for _ in range(len(Cij))]
        id = 0
        for a in range(len(Cij)):
            aux = 0
            for b in K:
                aux += Cij[a][b]
            if id == a:
                Ri[a][a] = aux  # fills the diagonal
            id += 1
        # It should have the same order but without some columns, so is ok
try:
    cur = con.cursor()
    cur.execute("select * from bu_cat")
    rows = cur.fetchall()
    A = rows[0]
    for row in rows[1:]:
        A = np.vstack([A, list(row)])
except mdb.Error, e:
    print e
    sys.exit(1)
finally:
    if con:
        con.close()

A = mdp.pca(A.astype('float32'), reduce=True)
##distances = pdist(A, cosine)
##distances_2d = squareform(distances)
clusters = hierarchy.linkage(A, method='complete', metric='cosine')
flat_clusters = hierarchy.fcluster(clusters.clip(0, 100000), 0.8, 'inconsistent')

plt.scatter(*numpy.transpose(A), c=clusters)
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh, len(set(clusters)))
plt.title(title)
plt.show()

with open('Clusters.dat', 'w+') as f:
    count = 0
    for v in flat_clusters:
        count += 1
        f.write(str(count) + "\t" + str(v) + "\n")
print "read data" for line in co_occ_file: if count < 100: count += 1 line = line.replace("\n", "") instance = line.split(" ") if(indexes[instance[0]] == -1): indexes[instance[0]] = current_index current_index+=1 if(indexes[instance[1]] == -1): indexes[instance[1]] = current_index current_index+=1 matrix[ indexes[instance[0]], indexes[instance[1]] ] = float(instance[2]) else: break if count%10000 == 0: print count, "entries processed" co_occ_file.close() if current_index != nr_functional_words-1: print "not the same", current_index, nr_functional_words print "perform pca" matrix = mdp.pca(matrix, 30) print "pca done, start tsne" #Y = tsne.tsne(X, no_dims, perplexity) y = tsne.tsne(matrix, 2, nr_functional_words, perplexity)
abec = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z']
correcto = ['P', 'W', '3', '7', '6', '8', '8', 'D', 'Z', 'G',
            '2', '0', '1', 'R', 'S', '2', '0', '0']

for k in range(len(correcto)):
    im = Image.open("muestras/%s.png" % k)
    pix = im.load()
    w, h = im.size
    x = []
    for i in range(w):
        tmp = []
        for j in range(h):
            if pix[i, j] == (255, 255, 255):
                tmp.append(1)
            else:
                tmp.append(0)
        x.append(tmp)
    y = mdp.pca(np.array(x, dtype=np.float64), output_dim=7)
    y = y.transpose()
    res = []
    for value in y:
        res.append(value.sum() * 1.0e+14)
    y = res
    f.write("%s " % bin(abec.index(correcto[k]))[2:].zfill(7))
    for value in y:
        f.write("%s " % str(value))
    f.write("\n")
    if debug:
        print "Dato para la imagen %s: %s\n" % (k, y)
def do_pca(self, args):
    '''
    PCA -> "pca gaeta_coor_blind50.txt 1,3,6"
    Automatically measures PCA from the coordinates file name and shows two interactive plots.
    With the second (optional) argument you can select the columns and the multiplier factor
    to use for the PCA (e.g. "1,3*50,6,8x10,9"). Don't use spaces. "*" and "x" mean the same thing.
    Without a second argument it reads the pca_config.txt file.
    (c)Paolo Pancaldi, Massimo Sandal 2009
    '''
    # reads the columns of pca
    if self.config['hookedir'][0] == '/':
        slash = '/'  # a Unix or Unix-like system
    else:
        slash = '\\'
    self.my_hooke_dir = self.config['hookedir'] + slash
    #self.my_work_dir = os.getcwd()+slash+"pCluster_"+time.strftime("%Y%m%d_%H%M")+slash
    #self.my_curr_dir = os.path.basename(os.getcwd())
    conf = open(self.my_hooke_dir + "pca_config.txt")
    config = conf.readlines()
    conf.close()

    self.plot_myCoord = []        # holds the coordinates taken directly from the file created with pCluster
    self.plot_origCoord = []      # holds the coordinates of only the chosen columns, multiplied by the chosen factors
    self.plot_pcaCoord = []       # holds the two PCA columns
    self.plot_pcaCoordTr = []     # holds the two transposed PCA columns
    self.plot_FiltOrigCoord = []  # holds the coordinates of only the density-filtered points
    self.plot_FiltPaths = []      # holds the paths of only the density-filtered plots
    self.plot_paths = []          # holds the paths of all plots
    self.plot_NewPcaCoord = []    # holds the two density-filtered PCA columns
    self.plot_NewPcaCoordTr = []  # holds the two transposed, density-filtered PCA columns
    plot_path_temp = ""

    # takes one arg as input (the file name)
    # and a second with the columns to work on (optional, e.g. "1,2,3")
    arg = args.split(" ")
    if arg[0] == args:
        file_name = args
    else:
        file_name = arg[0]
        config[0] = arg[1]

    # build the "plot_myCoord" array with all plot coordinates
    # and the plot_paths array with all plot paths
    nPlotTot = -3  # skip the first 3 lines of the file
    f = open(file_name)
    rows = f.readlines()
    for row in rows:
        if row[0] != " " and row[0] != "":
            nPlotTot = nPlotTot + 1
            plot_path_temp = row
        if row[0] == " " and row.find('nan') == -1 and row.find("-1.#IND") == -1:
            row = row[row.index(";", 2) + 2:].split(" ; ")  # ignore the first column with the #peaks
            row = [float(i) for i in row]
            #0:Mean delta, 1:Median delta, 2:Mean force, 3:Median force, 4:First peak length, 5:Last peak length
            #6:Max delta 7:Min delta 8:Max force 9:Min force 10:Std delta 11:Std force
            if (row[0] < 500 and row[1] < 500 and row[2] < 500 and row[3] < 500 and
                    row[4] < 500 and row[5] < 500 and row[6] < 500 and row[7] < 500 and
                    row[8] < 500 and row[9] < 500 and row[10] < 500 and row[11] < 500):
                if (row[0] > 0 and row[1] > 0 and row[2] > 0 and row[3] > 0 and
                        row[4] > 0 and row[5] > 0 and row[6] > 0 and row[7] > 0 and
                        row[8] > 0 and row[9] > 0 and row[10] > 0 and row[11] > 0):
                    #row = row[0], row[2], row[3]*3, row[6], row[7]*56, row[8]
                    self.plot_myCoord.append(row)
                    self.plot_paths.append(plot_path_temp)
    f.close()

    # build the array with selected columns, each multiplied by its factor
    for row in self.plot_myCoord:
        res = []
        for cols in config[0].split(","):
            if cols.find("*") != -1:
                col = int(cols.split("*")[0])
                molt = int(cols.split("*")[1])
            elif cols.find("x") != -1:
                col = int(cols.split("x")[0])
                molt = int(cols.split("x")[1])
            else:
                col = int(cols)
                molt = 1
            res.append(row[col] * molt)
        self.plot_origCoord.append(res)

    # array convert, calculate PCA, transpose
    self.plot_origCoord = np.array(self.plot_origCoord, dtype='float')
    #print self.plot_origCoord.shape
    self.plot_pcaCoord = pca(self.plot_origCoord, output_dim=2)  # other way -> y = mdp.nodes.PCANode(output_dim=2)(array)
    self.plot_pcaCoordTr = np.transpose(self.plot_pcaCoord)
    pca_X = np.array(self.plot_pcaCoordTr[0], dtype='float')
    pca_Y = np.array(self.plot_pcaCoordTr[1], dtype='float')

    '''
    # Start section of testing with good plots                                  # 4 TESTING!
    Xsyn_1=[]
    Ysyn_1=[]
    Xgb1_1=[]
    Ygb1_1=[]
    Xbad_1=[]
    Ybad_1=[]
    goodnamefile=open(file_name.replace("coordinate", "good"),'r')
    goodnames=goodnamefile.readlines()
    nPlotGood = len(goodnames)-2 # skip the first and last lines
    goodnames=[i.split()[0] for i in goodnames[1:]]
    for index in range(len(self.plot_paths)):
        if self.plot_paths[index][:-1] in goodnames:
            Xsyn_1.append(pca_X[index])
            Ysyn_1.append(pca_Y[index])
        else:
            Xbad_1.append(pca_X[index])
            Ybad_1.append(pca_Y[index])
    # Stop section of testing with good plots                                   # 4 TESTING!
    '''

    # print first plot
    clustplot1 = lhc.PlotObject()
    clustplot1.add_set(pca_X, pca_Y)
    #clustplot1.add_set(Xbad_1,Ybad_1) # 4 TESTING!
    #clustplot1.add_set(Xsyn_1,Ysyn_1) # 4 TESTING!
    clustplot1.normalize_vectors()
    clustplot1.styles = ['scatter', 'scatter', 'scatter']
    clustplot1.colors = [None, 'red', 'green']
    clustplot1.destination = 0
    self._send_plot([clustplot1])
    self.clustplot1 = clustplot1

    # density and filter estimation
    kernel = sp.stats.kde.gaussian_kde(sp.c_[pca_X, pca_Y].T)
    tallest = 0
    for i in range(len(pca_X)):
        kern_value = kernel.evaluate([pca_X[i], pca_Y[i]])
        if tallest < kern_value:
            tallest = float(kern_value)
    if float(config[1]) == 0:
        my_filter = float(tallest / 3.242311147)
    else:
        my_filter = float(config[1])

    '''
    # section useful only for graphic printing
    xmin = pca_X.min()
    xmax = pca_X.max()
    ymin = pca_Y.min()
    ymax = pca_Y.max()
    mX, mY = sp.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    Z = sp.rot90(sp.fliplr(sp.reshape(kernel(sp.c_[mX.ravel(), mY.ravel()].T).T, mX.T.shape)))
    axis_X = np.linspace(xmin,xmax,num=100)
    axis_Y = np.linspace(ymin,ymax,num=100)
    '''

    # density filtering:
    # use "kernel.evaluate" to find the score (height) of each coordinate and decide whether to keep it
    filtered_pca_X = []
    filtered_pca_Y = []
    filtered_PcaCoordTr = []
    filtered_PcaCoord = []
    for i in range(len(pca_X)):
        kern_value = kernel.evaluate([pca_X[i], pca_Y[i]])
        if kern_value > my_filter:
            filtered_pca_X.append(pca_X[i])
            filtered_pca_Y.append(pca_Y[i])
    filtered_PcaCoordTr.append(filtered_pca_X)
    filtered_PcaCoordTr.append(filtered_pca_Y)
    filtered_PcaCoord = np.transpose(filtered_PcaCoordTr)

    # build the two arrays "plot_FiltOrigCoord" and "plot_FiltPaths" containing only the high-density filtered data
    for index in range(len(self.plot_pcaCoord)):
        if self.plot_pcaCoord[index] in filtered_PcaCoord:
            self.plot_FiltOrigCoord.append(self.plot_myCoord[index])
            self.plot_FiltPaths.append(self.plot_paths[index])

    '''
    # START PCA#2: USELESS!!!
    # build the array with selected columns, each multiplied by its factor
    temp_coord = []
    for row in self.plot_FiltOrigCoord:
        res=[]
        for cols in config[2].split(","):
            if cols.find("*")!=-1:
                col = int(cols.split("*")[0])
                molt = int(cols.split("*")[1])
            elif cols.find("x")!=-1:
                col = int(cols.split("x")[0])
                molt = int(cols.split("x")[1])
            else:
                col = int(cols)
                molt = 1
            res.append(row[col]*molt)
        temp_coord.append(res)
    self.plot_FiltOrigCoord = temp_coord

    # recompute the PCA: array convert, calculate PCA, transpose
    self.plot_FiltOrigCoord = np.array(self.plot_FiltOrigCoord,dtype='float')
    #print self.plot_FiltOrigCoord.shape
    self.plot_NewPcaCoord = pca(self.plot_FiltOrigCoord, output_dim=2) # other way -> y = mdp.nodes.PCANode(output_dim=2)(array)
    self.plot_NewPcaCoordTr = np.transpose(self.plot_NewPcaCoord)
    pca_X2=np.array(self.plot_NewPcaCoordTr[0],dtype='float')
    pca_Y2=np.array(self.plot_NewPcaCoordTr[1],dtype='float')

    # Start section of testing with good plots                                  # 4 TESTING!
    Xsyn_2=[]
    Ysyn_2=[]
    Xbad_2=[]
    Ybad_2=[]
    for index in range(len(self.plot_FiltPaths)):
        if self.plot_FiltPaths[index][:-1] in goodnames:
            Xsyn_2.append(pca_X2[index])
            Ysyn_2.append(pca_Y2[index])
        else:
            Xbad_2.append(pca_X2[index])
            Ybad_2.append(pca_Y2[index])

    # print second plot
    clustplot2=lhc.PlotObject()
    #clustplot2.add_set(pca_X2,pca_Y2)
    clustplot2.add_set(Xbad_2,Ybad_2) # 4 TESTING!
    clustplot2.add_set(Xsyn_2,Ysyn_2) # 4 TESTING!
    clustplot2.normalize_vectors()
    clustplot2.styles=['scatter', 'scatter','scatter']
    clustplot2.colors=[None,'red','green']
    clustplot2.destination=1
    self._send_plot([clustplot2])
    self.clustplot2=clustplot2
    '''

    # PRINT density plot
    clustplot2 = lhc.PlotObject()
    clustplot2.add_set(filtered_pca_X, filtered_pca_Y)
    clustplot2.normalize_vectors()
    clustplot2.styles = ['scatter', 'scatter', 'scatter']
    clustplot2.colors = [None, 'red', 'green']
    clustplot2.destination = 1
    self._send_plot([clustplot2])
    self.clustplot2 = clustplot2

    # printing results
    config_pca1 = config[0].replace("*", "x").rstrip("\n")
    config_pca2 = config[2].replace("*", "x").rstrip("\n")
    print ""
    print "- START: " + file_name
    print "Curve totali: ", nPlotTot
    #print "Curve totali good: ", nPlotGood                                     # 4 TESTING!
    print "- FILTRO 1: 0-500 e NaN"
    print "Curve totali rimaste: ", len(self.plot_origCoord)
    #print 'Curve good rimaste: ', len(Xsyn_1)                                  # 4 TESTING!
    print "- FILTRO 2: PCA:" + config_pca1 + " e DENSITA:" + str(my_filter)
    print "Curve totali rimaste: ", len(self.plot_FiltOrigCoord)
    #print 'Curve good rimaste: ', len(Xsyn_2)                                  # 4 TESTING!
    print "Piu alta: ", tallest
    #print "- FILTRO 3: 2'PCA:"+config_pca2
    print ""

    # -- exporting coordinates and plot of PCA in debug mode! --
    if config[3].find("true") != -1:
        # 1st PCA: save plot and build coordinates file
        self.do_export(file_name.replace("coordinate_", "debug_pca1graph_").replace('.txt', '_' + config_pca1) + " 0")
        f = open(file_name.replace("coordinate_", "debug_pca1coor_").replace('.txt', '_' + config_pca1 + '.txt'), 'w')
        for i in range(len(pca_X)):
            f.write(str(i) + "\t" + str(pca_X[i]) + "\t" + str(pca_Y[i]) + "\n")
        f.close()
        # 2nd PCA: save plot and build coordinates file
        #self.do_export(file_name.replace("coordinate_", "debug_pca2graph_").replace('.txt','_'+config_pca2) + " 1")
        #f = open(file_name.replace("coordinate_", "debug_pca2coor_").replace('.txt','_'+config_pca2+'.txt'),'w')
        #for i in range(len(pca_X2)):
        #    f.write(str(i) + "\t" + str(pca_X2[i]) + "\t" + str(pca_Y2[i]) + "\n")
        #f.close()

        # DENSITY: save plot
        self.do_export(file_name.replace("coordinate_", "debug_densitygraph_").replace('.txt', '_' + config_pca1 + '_' + str(my_filter).replace(".", ",")) + " 1")
        f = open(file_name.replace("coordinate_", "debug_densitycoor_").replace('.txt', '_' + config_pca1 + '_' + str(my_filter).replace(".", ",") + '.txt'), 'w')
        for i in range(len(filtered_pca_X)):
            f.write(str(i) + "\t" + str(filtered_pca_X[i]) + "\t" + str(filtered_pca_Y[i]) + "\n")
        f.close()

        # ALL GOOD COORDINATES (without NaN and 0<x<500)
        f = open(file_name.replace("coordinate_", "debug_allgoodcoor_"), 'w')
        for i in range(len(self.plot_myCoord)):
            for cel in self.plot_myCoord[i]:
                f.write(" ; " + str(cel))
            f.write("\n")
        f.close()

    # pCLUSTER SAVING!!!
    import shutil
    pcl_name = file_name.replace("coordinate_", "goodplots_").replace('.txt', '_' + config_pca1 + '_' + str(my_filter).replace(".", ","))
    if os.path.exists(pcl_name + slash):
        shutil.rmtree(pcl_name)
    os.mkdir(pcl_name + slash)
    f = open(pcl_name + '.txt', 'w')
    for i in range(len(self.plot_FiltPaths)):
        myfile = str(self.plot_FiltPaths[i]).rstrip("\n")
        f.write(myfile + "\n")
        shutil.copy2(myfile, pcl_name)
    f.close()
def PCA(x):
    # http://mdp-toolkit.sourceforge.net/
    return mdp.pca(x)