def __getitem__(self, index):
    if isinstance(index, slice):
        instances = self._instances[index]
        if self.sparse:
            instances = np.array(csr_matrix.todense(instances))
        labels = self._labels[index]
        length = len(instances)
        # Randomly draw one instance for each label in the batch.
        sampled_instances = [
            self._instances[random.choice(
                range(self._partition[label], self._partition[label + 1]))]
            for label in labels
        ]
        if self.sparse:
            sampled_instances = [
                np.array(csr_matrix.todense(sampled_instance))
                for sampled_instance in sampled_instances
            ]
        return [(instances[i], labels[i], sampled_instances[i])
                for i in range(length)]
    else:
        instance = self._instances[index]
        if self.sparse:
            instance = np.array(csr_matrix.todense(instance))
        label = self._labels[index]
        sampled_instance = self._instances[random.choice(
            range(self._partition[label], self._partition[label + 1]))]
        if self.sparse:
            sampled_instance = np.array(csr_matrix.todense(sampled_instance))
        return instance, label, sampled_instance
def select_dataset(name):
    x_size, y_size, train_data, train_label, test_data, test_label = 0, 0, [], [], [], []  # initialize
    if name == 'cifar':
        dataset = cifar.CIFAR()
        train_data, train_label, test_data, test_label = dataset.getdata()
        train_data = train_data.reshape(-1, 3072)
        test_data = test_data.reshape(-1, 3072)
        x_size = 3072
        y_size = 10
    elif name == 'svhn':
        dataset = svhn.SVHN()
        train_data, train_label = dataset.get_trainset()
        test_data, test_label = dataset.get_testset()
        train_data = train_data.reshape(-1, 3072)
        test_data = test_data.reshape(-1, 3072)
        x_size = 3072
        y_size = 10
    elif name == 'mnist':
        dataset = mnist.read_data_sets(flags.MNIST_DIR, one_hot=True)
        train_data, train_label, test_data, test_label = dataset.train.images, dataset.train.labels, \
            dataset.test.images, dataset.test.labels
        x_size = 784
        y_size = 10
    elif name == 'news':
        trainset = fetch_20newsgroups(data_home=flags.NEWS_DIR, subset='train')
        testset = fetch_20newsgroups(data_home=flags.NEWS_DIR, subset='test')

        vectorizer = TfidfVectorizer(analyzer='word', max_features=3072)
        vectorizer.fit(trainset.data)

        train_data = vectorizer.transform(trainset.data)
        train_data = csr_matrix.todense(train_data)
        train_label = trainset.target
        train_label = NNutils.onehot(train_label, 20, list=True)
        # print(train_label.shape)

        test_data = vectorizer.transform(testset.data)
        test_data = csr_matrix.todense(test_data)
        test_label = testset.target
        test_label = NNutils.onehot(test_label, 20, list=True)

        x_size = 3072
        y_size = 20

    return Dataset(name, x_size, y_size, train_data, train_label, test_data, test_label)
def test_sparseMatConn(self):
    conn_mat = np.random.randint(2, size=(5, 3), dtype=bp.math.bool_)
    sparse_mat = csr_matrix(conn_mat)
    conn = bp.conn.SparseMatConn(sparse_mat)(pre_size=sparse_mat.shape[0],
                                             post_size=sparse_mat.shape[1])

    print(conn.requires('pre2post'))
    print(conn.requires('conn_mat'))
    print(csr_matrix.todense(sparse_mat))

    assert bp.math.array_equal(
        conn_mat,
        bp.math.asarray(csr_matrix.todense(sparse_mat), dtype=bp.math.bool_))
def batch_generator(self, A_X, y, batch_size):
    number_of_batches = ceil(y.shape[0] / batch_size)
    counter = 0
    shuffle_index = np.arange(np.shape(y)[0])
    np.random.shuffle(shuffle_index)
    A_ = A_X[0]
    X = A_X[1]
    A_ = A_[shuffle_index]
    X = X[shuffle_index]
    y = y[shuffle_index]
    while 1:
        index_batch = shuffle_index[batch_size * counter:
                                    min(batch_size * (counter + 1), y.shape[0])]
        if len(A_.shape) == 1:
            A_batch = np.array(
                list(map(lambda a: csr_matrix.todense(a),
                         A_[index_batch].tolist())))
        else:
            A_batch = A_[index_batch]
        X_batch = X[index_batch]
        y_batch = y[index_batch]
        counter += 1
        yield ([A_batch, X_batch], y_batch)
        if counter == number_of_batches:
            # Epoch finished: reshuffle and start over.
            np.random.shuffle(shuffle_index)
            counter = 0
def update_iterative(self, inc_mat, inc_transpose, inv_len_mat, flux_mat):
    """One step update

    Parameters:
        inc_mat: sparse.matrix, oriented incidence matrix
        inc_transpose: sparse.matrix, oriented incidence matrix transposed
        inv_len_mat: sparse.matrix, diagonal matrix 1/l_e
        flux_mat: np.array, fluxes

    Returns:
        flux: np.array, updated fluxes
    """
    if self.coupling == "l2":
        flux_norm = np.linalg.norm(flux_mat, axis=1)**2
    if self.coupling == "l1":
        flux_norm = np.linalg.norm(flux_mat, axis=1, ord=1)**2

    # computing scaling and updating conductivities
    temp = (np.sum(self.length * flux_norm**((2 - self.pflux) / (3 - self.pflux))))**(1 / (2 - self.pflux))
    self.tdens = (1 / temp) * flux_norm**(1 / (3 - self.pflux))
    td_mat = diags(self.tdens, 0)

    # computing fluxes
    temp_pinv = np.linalg.pinv(
        csr_matrix.todense(inc_mat * td_mat * inv_len_mat * inc_transpose))
    lagrange_mult = temp_pinv * self.forcing
    flux = td_mat * inv_len_mat * inc_transpose * lagrange_mult

    return flux
def frag_matrix_extract(hicfile, chrN1, chrN2, binsize, start1, start2,
                        lastend1, lastend2, shiftsize, Step):
    end1 = start1 + Step + shiftsize
    end2 = start2 + Step + shiftsize
    #if end1 > lastend1:
    #    end1 = lastend1
    #if end2 > lastend2:
    #    end2 = lastend2
    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN2), 'BP', binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]
    N = max(chrs_length[chrN2] // binsize + Step // binsize,
            chrs_length[chrN1] // binsize + Step // binsize) + 1
    #N = max(max(row)+1, max(col) + 1)
    #print(N)

    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)

    rowix = range(start1 // binsize, end1 // binsize + 1)
    colix = range(start2 // binsize, end2 // binsize + 1)
    #print(rowix, colix)
    M = M[np.ix_(rowix, colix)]
    N = M.shape[1]
    return (M, N)
def general_transform(inputs, sparse):
    instance, label = inputs
    if sparse:
        instance = np.array(csr_matrix.todense(instance))
    return instance, label
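A minimal usage sketch for the helper above; the single-row CSR matrix and the label value are made up for illustration and are not part of the original project.

import numpy as np
from scipy.sparse import csr_matrix

# Toy (instance, label) pair with a sparse instance.
row = csr_matrix(np.array([[0.0, 1.0, 0.0, 2.0]]))
x, y = general_transform((row, 3), sparse=True)
print(x.shape, y)  # (1, 4) 3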
def todense(self, samples):
    densed_samples = []
    for sample in samples:
        densed_samples.append(csr_matrix.todense(sample))
    densed_samples_np = np.asarray(densed_samples).reshape(len(samples), -1)
    return densed_samples_np
def categorize_dates(df, date_enc=None):
    onehot_cols = 'month,dow,year'.split(',')
    if not date_enc:
        date_enc = OneHotEncoder()
        date_enc.fit(df[onehot_cols])
    X_date = date_enc.transform(df[onehot_cols])
    X_date = csr_matrix.todense(X_date)
    return X_date, date_enc
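A small usage sketch, assuming a pandas DataFrame that already carries the month, dow and year columns the function expects; the values below are illustrative only.

import pandas as pd

# Toy frame with the three columns categorize_dates one-hot encodes.
df = pd.DataFrame({'month': [1, 2, 1],
                   'dow': [0, 3, 6],
                   'year': [2019, 2020, 2020]})

X_date, date_enc = categorize_dates(df)              # fits a fresh OneHotEncoder
X_new, _ = categorize_dates(df, date_enc=date_enc)   # reuses the fitted encoder
print(X_date.shape)  # (3, 7): one column per distinct month/dow/year value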
def index_sparse_arr(self):
    self.sparse_doc = csr_matrix(self.test_arr)
    print(self.sparse_doc)
    self.dense_arr = csr_matrix.todense(self.sparse_doc)
    self.im = Image.fromarray(self.dense_arr * 225)
    self.im.show()
def todense(self, samples):
    densed_samples = []
    for sample in samples:
        # print([csr_matrix.todense(sample).tolist()])
        densed_samples.append(csr_matrix.todense(sample))
    densed_samples_np = np.asarray(densed_samples).reshape(len(samples), -1)
    # print(densed_samples_np.shape)
    return densed_samples_np
def createTrainData_nparray(data, seqLength, predLength=1, stride=1):
    data = csr_matrix.todense(data)
    i = 0
    dataX = []
    dataY = []
    while (i < (len(data) - seqLength - predLength)):
        dataX.append(data[i:i + seqLength])
        dataY.append(data[i + seqLength:(i + seqLength + predLength)])
        i += stride
    return np.array(dataX), np.array(dataY)
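An illustrative call, assuming data is a SciPy CSR matrix of shape (timesteps, features); the random matrix and window sizes are placeholders, not values from the original code.

import numpy as np
from scipy.sparse import csr_matrix

data = csr_matrix(np.random.rand(100, 8))   # 100 timesteps, 8 features
dataX, dataY = createTrainData_nparray(data, seqLength=10, predLength=1, stride=1)
print(dataX.shape, dataY.shape)  # (89, 10, 8) (89, 1, 8)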
def lgc_solver(self, mu):
    d_neg_half = np.diag(np.power(self.degrees, -0.5))
    d_neg_half = csr_matrix(d_neg_half)
    step = d_neg_half.dot(self.weight_matrix)
    S = (1 / (1 + mu)) * step.dot(d_neg_half)
    S = scipy.sparse.identity(self.node_number) - S
    S = csr_matrix.todense(S)
    S = np.linalg.inv(S)
    S = (mu / (1 + mu)) * S
    output_labels = np.linalg.multi_dot((S, self.Y))
    return output_labels
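For reference, the chain of sparse products in lgc_solver appears to assemble the closed-form solution of local-and-global-consistency (LGC) label propagation; this reading is inferred from the code itself rather than from any accompanying documentation. With S = D^(-1/2) W D^(-1/2) and alpha = 1/(1 + mu), the returned labels are

    F = (mu / (1 + mu)) * (I - S / (1 + mu))^(-1) * Y
      = (1 - alpha) * (I - alpha * S)^(-1) * Y

where W is self.weight_matrix, D is the diagonal degree matrix built from self.degrees, and Y is the initial label matrix self.Y.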
def laplacian_graph(df_train, df_test):
    test = df_test.copy()
    test['rate'] = 0
    la_train = pd.concat([df_train, test])
    la_train = la_train.set_index(np.arange(len(la_train)))
    la_train = la_train.astype({'item': 'int'})
    la_train = la_train.astype({'user': 'int'})
    a = list(set(la_train.user.tolist()))
    b = list(set(la_train.item.tolist()))

    B = nx.Graph()
    # Add nodes with the node attribute "bipartite"
    B.add_nodes_from(a, bipartite=1)
    B.add_nodes_from(b, bipartite=0)
    # Add edges only between nodes of opposite node sets
    B.add_weighted_edges_from(get_edgelist())
    bottom_nodes, top_nodes = bipartite.sets(B)

    G1 = bipartite.weighted_projected_graph(B, top_nodes, ratio=False)     # movie
    G2 = bipartite.weighted_projected_graph(B, bottom_nodes, ratio=False)  # user
    la_movie = nx.normalized_laplacian_matrix(G1, nodelist=None, weight='weight')
    la_user = nx.normalized_laplacian_matrix(G2, nodelist=None, weight='weight')

    from scipy.sparse import csr_matrix
    la_movie = csr_matrix.todense(la_movie)
    la_user = csr_matrix.todense(la_user)

    m = nx.to_numpy_array(G1)
    u = nx.to_numpy_array(G2)
    Max1 = np.amax(m)
    Max2 = np.amax(u)
    norm_la_movie = np.true_divide(m, Max1)
    norm_la_user = np.true_divide(u, Max2)

    return la_movie, la_user, norm_la_movie, norm_la_user
def train_matrix_extract(chrN1, binsize, hicfile):
    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN1), 'BP', binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]
    N = max(max(row) + 1, max(col) + 1)
    #print(N)
    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)
    M = np.array(M)
    # mirror the nonzero entries so the contact matrix is symmetric
    x, y = np.where(M != 0)
    M[y, x] = M[x, y]
    return (M)
def matrix_extract(chrN1, chrN2, binsize, hicfile):
    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN2), 'BP', binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]

    Nrow = max(row) + 1
    Ncol = max(col) + 1
    N = max(Nrow, Ncol)

    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)
    return (M)
def __getitem__(self, index):
    if isinstance(index, slice):
        raise NotImplementedError
        # Dead code: never reached because slice indexing is not supported.
        batches = [dataset[index] for dataset in self._datasets]
        instances = [tuple([self.transform(instance) for instance in batches[2]])]
        clusters = [tuple([cluster for cluster in batches[0]])]
        classes = [tuple([_class for _class in batches[1]])]
        return [instances, clusters, classes]
    else:
        batches = [dataset[index] for dataset in self._datasets]
        instance, cluster, _class = tuple(batches)
        if self.transform is not None:
            instance = self.transform(instance)
        if self.sparse:
            instance = np.array(csr_matrix.todense(instance))
        return instance, cluster, _class
def check_cluster(model, train, num_classes, num_cluster,
                  batchsize=128, device=-1, sparse=False):
    with chainer.using_config('train', False):
        i, N = 0, len(train)
        cc = None
        ss = None
        while i < N:
            train_batch = train[i:i + batchsize]
            if sparse:
                train_batch = np.array(csr_matrix.todense(train_batch))
            # concat_examples returns (instances, labels).
            xx = F.softmax(
                model(
                    chainer.dataset.convert.concat_examples(
                        train_batch, device=device)[0])).data
            if device >= 0:
                xx = cuda.to_cpu(xx)
            if cc is None:
                cc = np.argmax(xx, axis=1)
            else:
                cc = np.append(cc, np.argmax(xx, axis=1))
            if ss is None:
                ss = np.sum(xx, axis=0)
            else:
                ss = ss + np.sum(xx, axis=0)
            i += batchsize
        ss /= N
        partition = train._partition
        cluster = [
            tuple(
                np.sum(cc[partition[k]:partition[k + 1]] == c)
                for c in range(num_cluster)) for k in range(num_classes)
        ]
        return cluster, ss
def parse_dataset(dataset, hasher, kth, batch_size):
    a = hasher.transform(
        tokens(d[0])
        for d in dataset.train[kth * batch_size:(kth + 1) * batch_size])
    sample_size = a.shape[0]

    labels = list()
    class_dict = dict()
    last_class_index = 0
    for i in range(sample_size):
        i = kth * batch_size + i
        if dataset.train[i][1] in class_dict:
            labels.append(class_dict[dataset.train[i][1]])
        else:
            class_dict[dataset.train[i][1]] = last_class_index
            labels.append(last_class_index)
            last_class_index += 1

    return csr_matrix.todense(a).T, convert_to_one_hot(labels).T
def matrix_extract(chrN1, binsize, hicfile):
    result = straw.straw('NONE', hicfile, str(chrN1), str(chrN1), 'BP', binsize)
    row = [r // binsize for r in result[0]]
    col = [c // binsize for c in result[1]]
    value = result[2]
    N = max(max(row) + 1, max(col) + 1)
    #print(N)

    M = csr_matrix((value, (row, col)), shape=(N, N))
    M = csr_matrix.todense(M)
    M = np.array(M)
    x, y = np.where(M != 0)
    M[y, x] = M[x, y]

    #rowix = range(start1//binsize, end1//binsize+1)
    #colix = range(start2//binsize, end2//binsize+1)
    #print(rowix,colix)
    #M = M[np.ix_(rowix, colix)]
    #N = M.shape[1]
    return (M)
def topic_proportions(self, bow_matrix, embeds):
    # We assume up to batch_size documents to compute their proportions
    with tf.Session(graph=self.graph) as session:
        saver = tf.train.Saver()
        saver.restore(session, self.net_file)
        n_batches = int(np.floor(self.ntrain / self.batch_size))
        order = np.arange(bow_matrix.shape[0])
        topic_prop = np.zeros(
            [bow_matrix.shape[0], self.network_params['out_dim']])
        loglik = 0.0
        for i in range(n_batches):
            idx_batch = self.next_batch(order, i)
            bow_batch = np.zeros(
                [self.batch_size, self.network_params['input_dim']])
            embed_batch = np.zeros(
                [self.batch_size, self.network_params['embedding_dim']])
            # We complete with zeros (last batch only)
            bow_batch[:idx_batch.shape[0], :] = csr_matrix.todense(
                bow_matrix[idx_batch, :])
            embed_batch[:idx_batch.shape[0], :] = embeds[idx_batch, :]
            topic_prop[idx_batch, :], ll = self.vae_graph.topic_prop(
                bow_batch, embed_batch, session)
            loglik += ll / self.ntrain * self.batch_size
        session.close()
    return topic_prop, loglik
def rarefy(self):
    """
    For each BIOM file, a rarefaction filter is applied.
    A minimum read depth can be specified; samples with reads lower than
    this read depth are removed, and then samples are rarefied to equal depth.

    :return:
    """
    all_bioms = {'otu': self.otu, 'genus': self.genus,
                 'family': self.family, 'order': self.order,
                 'class': self.class_, 'phylum': self.phylum}
    batchcopy = deepcopy(all_bioms)
    for level in all_bioms:
        for name in all_bioms[level]:
            try:
                if self.inputs['rar'] == 'True':
                    lowest_count = int(min(all_bioms[level][name].sum(axis='sample')))
                else:
                    lowest_count = int(self.inputs['rar'])
                data = all_bioms[level][name].matrix_data
                data = csr_matrix.todense(data)
                keep_samples = list()
                mincount = np.sum(data, axis=0)
                for y in range(mincount.shape[1]):
                    if mincount.item(y) >= lowest_count:
                        keep_samples.append(all_bioms[level][name]._sample_ids[y])
                keep = all_bioms[level][name].filter(keep_samples, axis="sample", inplace=False)
                batchcopy[level][name] = keep.subsample(n=lowest_count, axis='sample')
            except Exception:
                logger.error("Unable to rarefy file", exc_info=True)
        for name in list(all_bioms[level]):
            all_bioms[level][name] = batchcopy[level][name]
    self.otu = all_bioms['otu']
    self.genus = all_bioms['genus']
    self.family = all_bioms['family']
    self.order = all_bioms['order']
    self.class_ = all_bioms['class']
    self.phylum = all_bioms['phylum']
def generate_figures(self):
    """Generates figures for diagnostics canvas.

    Also sets the split file params.
    """
    file = self.file_list.GetSelection()
    if file != -1:
        file = self.file_list.GetString(file)
        biomfile = biom.load_table(file)
        if biomfile.metadata(axis='sample'):
            varlist = list(biomfile.metadata_to_dataframe(axis='sample').columns)
            varlist.sort()
            self.split_list.Set(varlist)
        else:
            if self.meta:
                if file in self.meta:
                    varlist = self.meta[file]
                    varlist.sort()
                    self.split_list.Set(varlist)
        if self.split:
            split = self.split_list.FindString(self.split)
            self.split_list.SetSelection(split)
        data = biomfile.matrix_data
        data = csr_matrix.todense(data)
        fracs = np.count_nonzero(data, axis=1)
        nsamples = data.shape[1]
        fracs = fracs / nsamples
        self.prevfig.clear()
        self.prevfig.hist(fracs, bins=20)
        self.prevfig.set_xlabel('Prevalence')
        self.prevfig.set_title('Taxon prevalence')
        self.prevfig.set_ylabel('Number of taxa')
        sample_sums = np.transpose(np.count_nonzero(data, axis=0))
        self.rarfig.clear()
        self.rarfig.hist(sample_sums, bins=40)
        self.rarfig.set_xlabel('Count number')
        self.rarfig.set_title('Sample counts')
        self.rarfig.set_ylabel('Number of samples')
        self.canvas1.draw()
        self.canvas2.draw()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 10 14:59:37 2020

Copyright 2020 by Hadrien Montanelli.
"""
# %% Imports.

# Standard library imports:
import numpy as np
from scipy.sparse import csr_matrix

# Chebpy imports:
from chebpy.nla import sptoeplitz

# %% Test 1.
col = np.array([1, 1, 2, 4, 5, 0, 0])
row = np.array([1, 3, 4, 0])
T = sptoeplitz(col, row)
print(csr_matrix.todense(T))

# %% Test 2.
n = 10
col = np.zeros(n)
row = np.zeros(2*n + 1)
row[int(n/2)] = 1
T = sptoeplitz(col, row)
print(csr_matrix.todense(T))
def __init__(self, init_type, **kwargs):
    # init_type: {Directly, Random, From File, Random Tree, Random Chimera}
    if (init_type == "Directly"):
        if 'Const' in kwargs.keys():
            Const = kwargs['Const']
        else:
            Const = 0.0
        if 'Pot' in kwargs.keys():
            Pot = kwargs['Pot']
        else:
            Pot = np.zeros((1, len(kwargs['Inter'])))
        (self.Inter, self.Pot, self.Const) = (kwargs['Inter'], Pot, Const)
    elif (init_type == "Random"):
        if 'n' in kwargs.keys():
            n = kwargs['n']
        else:
            print("Should specify number of vertices n=")
        if 'p' in kwargs.keys():
            p = kwargs['p']
        else:
            print("Should specify the probability of an edge p=")
        if 'seed' in kwargs.keys():
            seed = kwargs['seed']
        else:
            seed = None
        G = nx.gnp_random_graph(n, p, seed)
        A = csr_matrix.todense(nx.adjacency_matrix(G))
        (self.Inter, self.Pot, self.Const) = (Laplacian(A)/4, np.zeros((n)), 0)
    elif (init_type == "From File"):
        if 'filename' in kwargs.keys():
            filename = kwargs['filename']
        else:
            print("You should specify the filename!")
        import os
        name, extension = os.path.splitext(filename)
        if (extension == '.json'):
            (self.Inter, self.Pot, self.Const) = BQPJSON(filename)
            self.Inter = 1*self.Inter
            self.Pot = 1*self.Pot
            self.Const = 1*self.Const
        #elif (file_extension == '.mat'):
            #retrieve a dense graph from .mat file
        #elif (file_extension == '.sparse'):
            #retrieve a sparse graph from .sparse file
        else:
            print("Wrong File Extension")
    elif (init_type == "Random Chimera"):
        import dwave_networkx as dnx
        G = dnx.chimera_graph(kwargs['M'], kwargs['N'], kwargs['L'])
        A = csr_matrix.todense(nx.adjacency_matrix(G))
        n = A.shape[0]  # number of vertices in the Chimera graph
        (self.Inter, self.Pot, self.Const) = (Laplacian(A)/4, np.zeros((n)), 0)
    elif (init_type == "Random Tree"):
        if 'seed' in kwargs.keys():
            seed = kwargs['seed']
        else:
            seed = None
        if 'n' in kwargs.keys():
            n = kwargs['n']
        else:
            n = random.randint(10, 100)
        G = nx.random_tree(n, seed)
        A = csr_matrix.todense(nx.adjacency_matrix(G))
        (self.Inter, self.Pot, self.Const) = (Laplacian(A)/4, np.zeros((n)), 0)
def Laplacian(Adjacency):
    G = nx.from_numpy_matrix(Adjacency)
    L = csr_matrix.todense(nx.laplacian_matrix(G))
    return L
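A quick sanity check for the wrapper above, using a hand-written 3-node path graph; the adjacency matrix is only an example, and the call relies on nx.from_numpy_matrix, which networkx 2.x provides but networkx 3.x has removed.

import numpy as np

A = np.array([[0, 1, 0],
              [1, 0, 1],
              [0, 1, 0]])
L = Laplacian(A)
print(L)
# Expected degree-minus-adjacency matrix:
# [[ 1 -1  0]
#  [-1  2 -1]
#  [ 0 -1  1]]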
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 10 14:59:37 2020

Copyright 2020 by Hadrien Montanelli.
"""
# %% Imports.

# Standard library imports:
import numpy as np
from scipy.sparse import csr_matrix

# Chebpy imports:
from chebpy.nla import sphankel

# %% Test 1.
col = np.array([1, 2, 3, 4])
H = sphankel(col)
print(csr_matrix.todense(H))
def cluster_biom(self):
    """
    First normalizes bioms so clustering is not affected,
    performs transformation and then applies clustering.
    Note that the returned biom files are not normalized;
    this is just for the clustering process.
    Many network inference tools require absolute counts.
    Silhouette score is used to determine the optimal number of clusters.
    Clustering adds metadata info to the samples.
    Splitting according to cluster ID is done by wrapping the split_biom function.

    :return:
    """
    inputs = self.inputs
    if inputs['nclust'] is not None:
        nums = list(range(2, (int(inputs['nclust']) + 1)))
    else:
        nums = list(range(2, 5))
    new_dict = {}
    if type(self.otu) is not dict:
        logger.warning('Cluster_biom requires a dictionary of biom files to be supplied. \n', exc_info=True)
        raise ValueError("Cluster_biom requires a dictionary of biom files to be supplied.")
    normbatch = self.normalize_transform(mode='clr')  # CLR transform places data in Euclidean space
    for x in list(self.otu):
        try:
            # define topscore and bestcluster for no cluster
            norm_table = normbatch.otu[x]
            topscore = 0
            bestcluster = [1] * len(norm_table.ids())
            data = csr_matrix.todense(norm_table.matrix_data)
            data = np.matrix.transpose(data)
            data = PCA(n_components=2).fit_transform(data)
            randomclust = np.random.randint(2, size=len(data))
            sh_score = [silhouette_score(data, randomclust)]
            # K-means clustering, tests 2-4 clusters
            if inputs['cluster'] == 'K-means':
                for i in nums:
                    clusters = KMeans(i).fit_predict(data)
                    silhouette_avg = silhouette_score(data, clusters)
                    sh_score.append(silhouette_avg)
                topscore = int(np.argmax(sh_score) + 1)
                bestcluster = KMeans(topscore).fit_predict(data)
            # DBSCAN clustering, automatically finds optimal cluster size
            if inputs['cluster'] == 'DBSCAN':
                bestcluster = DBSCAN().fit_predict(data)
                topscore = len(set(bestcluster)) - (1 if -1 in bestcluster else 0)
            # Gaussian Mixture Model (gmm) probability distribution
            if inputs['cluster'] == 'Gaussian':
                for i in nums:
                    fit = GaussianMixture(i).fit(data)
                    clusters = fit.predict(data)
                    silhouette_avg = silhouette_score(data, clusters)
                    sh_score.append(silhouette_avg)
                topscore = int(np.argmax(sh_score) + 1)
                bestfit = GaussianMixture(topscore).fit(data)
                bestcluster = bestfit.predict(data)
            # Spectral Clustering
            if inputs['cluster'] == 'Spectral':
                for i in nums:
                    clusters = SpectralClustering(i).fit_predict(data)
                    silhouette_avg = silhouette_score(data, clusters)
                    sh_score.append(silhouette_avg)
                topscore = int(np.argmax(sh_score) + 1)
                bestcluster = SpectralClustering(topscore).fit_predict(data)
            # Affinity Propagation clustering
            if inputs['cluster'] == 'Affinity':
                bestcluster = AffinityPropagation().fit_predict(data)
                topscore = len(set(bestcluster)) - (1 if -1 in bestcluster else 0)
            if max(sh_score) < 0.25:
                raise ValueError("Silhouette score too low: please try a different algorithm. "
                                 "Your data may not be suitable for clustering.")
            new_dict[x] = deepcopy(self.otu[x])
            for i in range(topscore):
                mask, = np.where(bestcluster == i)
                for j in mask:
                    new_dict[x]._sample_metadata[j]['cluster'] = inputs['cluster'] + '_' + str(i)
            self.otu = new_dict
            if inputs['split'] is not None:
                if inputs['split'] == 'TRUE':
                    inputs['split'] = 'cluster'
                self.split_biom()
        except Exception:
            logger.error("Error occurred when clustering samples", exc_info=True)
def prev_filter(self, mode='prev'):
    """
    Filters all OTU tables in a Batch object by prevalence or minimum abundance.
    Taxa that do not pass the filter are collapsed into a single "Bin" taxon,
    so total counts per sample are preserved; the tables are modified in place.

    :param mode: prev or min, specifies whether taxa should be filtered
        based on prevalence or minimum abundance.
        The values are stored in the batch.inputs dictionary.
    :return:
    """
    for level in self.levels:
        for name in self.levels[level]:
            data = self.levels[level][name].matrix_data
            data = csr_matrix.todense(data)
            keep_otus = list()
            binotu = None
            try:
                if mode == 'prev':
                    # calculates prevalence
                    fracs = np.count_nonzero(data, axis=1)
                    nsamples = data.shape[1]
                    fracs = fracs / nsamples
                    for y in range(0, len(fracs)):
                        if fracs[y] >= (float(self.inputs['prev']) / 100):
                            keep_otus.append(self.levels[level][name]._observation_ids[y])
                        else:
                            binotu = self.levels[level][name]._observation_ids[y]
                    if binotu is not None and 'Bin' not in keep_otus:
                        keep_otus.append(binotu)
            except Exception:
                logger.error("Could not set prevalence filter", exc_info=True)
            try:
                if mode == 'min':
                    mincount = np.sum(data, axis=1)
                    for y in range(0, len(mincount)):
                        if mincount[y] >= (int(self.inputs['min'])):
                            keep_otus.append(self.levels[level][name]._observation_ids[y])
                        else:
                            binotu = self.levels[level][name]._observation_ids[y]
                    if binotu is not None:
                        keep_otus.append(binotu)
            except Exception:
                logger.error("Could not set a minimum count filter", exc_info=True)
            keep = self.levels[level][name].filter(keep_otus, axis="observation", inplace=False)
            try:
                if binotu is not None:
                    bin = self.levels[level][name].filter(keep_otus[:-1], axis="observation",
                                                          inplace=False, invert=True)
                    binsums = np.sum(bin.matrix_data, axis=0)  # sums all binned OTUs
                    # need to recreate keep._data as lil matrix, is more efficient
                    orig = keep._data.tolil(copy=True)
                    if 'Bin' not in keep_otus:
                        bin_id = keep._obs_index[binotu]
                        orig[bin_id] = binsums
                        keep._observation_ids[bin_id] = "Bin"
                        keep._obs_index["Bin"] = keep._obs_index.pop(binotu)
                    if 'Bin' in keep_otus:
                        # necessary to prevent duplicate Bin ID
                        old_bin_id = keep._obs_index["Bin"]
                        old_bin_sums = keep._data[old_bin_id]
                        new_bin_sums = binsums + old_bin_sums
                        orig[old_bin_id] = new_bin_sums
                    # update keep._data with orig
                    keep._data = orig.tocsr()
            except Exception:
                logger.error("Could not preserve binned taxa", exc_info=True)
            self.levels[level][name] = keep
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Remove last blank column
    # plt.xlim( (0, tfidf_matrix.shape[1]) )

    for i in range(tfidf_matrix.shape[0]):
        for j in range(tfidf_matrix.shape[1]):
            c = round(tfidf_matrix[i, j], 2)
            ax.text(j, i, str(c))

    plt.show()


if __name__ == "__main__":
    corpus = [
        'this is the one document.',
        'this is the second document.',
        'and this is the third one, which is very similar to first one.',
        'is this the first document relates to politics?',
    ]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    print(csr_matrix.todense(X))
    heatmap(np.array(csr_matrix.todense(X)), "", "", "",
            vectorizer.get_feature_names(), corpus)