def build_rank(self, tweets):
    """Build tf-idf ranking score for terms in the corpus.

    Note:
        The code in this method could have been extracted to other smaller
        methods, improving legibility. This extraction has not been done so
        that its runtime complexity can be computed easily (the runtime
        complexity can be improved).

    Args:
        tweets (list of Indexable): List of indexed tweets that will be
            considered during tf-idf score computation.
    """
    self.__build_vocabulary(tweets)

    n_terms = len(self.vocabulary)
    n_docs = len(tweets)
    ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

    logging.info("[Ranker] Vocabulary assembled with terms count %s, docs count %s"
                 % ("{:,}".format(n_terms), "{:,}".format(n_docs)))

    # compute tf
    logging.info("[Ranker] Starting tf computation ...")
    for index, indexable in enumerate(tweets):
        for word in indexable.words_generator(self.stop_words):
            word_index_in_vocabulary = self.vocabulary[word]
            doc_word_count = indexable.count_for_word(word)
            ft_matrix[index, word_index_in_vocabulary] = doc_word_count

    # keep a copy of this matrix in compressed sparse column format
    self.ft_matrix = ft_matrix.tocsc()

    # compute idf with smoothing
    logging.info("[Ranker] Starting tf-idf computation ...")
    df = np.diff(self.ft_matrix.indptr) + self.smoothing
    n_docs_smooth = n_docs + self.smoothing

    # create diagonal matrix to be multiplied with ft
    idf = np.log(float(n_docs_smooth) / df) + 1.0
    self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

    # compute tf-idf
    self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
    self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

    # compute tf-idf normalization (l2, row-wise, in place)
    norm = self.tf_idf_matrix.tocsr(copy=True)
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                           self.tf_idf_matrix.shape[1],
                           self.tf_idf_matrix.indptr,
                           self.tf_idf_matrix.indices,
                           self.tf_idf_matrix.data,
                           norm)
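The idf step above leans on the CSC layout: np.diff(self.ft_matrix.indptr) is the per-term document frequency, because indptr stores the cumulative number of stored entries per column. A minimal standalone sketch (toy matrix and names chosen here, not part of the original class) of that step and the smoothed idf:

import numpy as np
import scipy.sparse as sp

# 3 documents x 3 terms; zeros are not stored in the sparse matrix
ft = sp.csc_matrix(np.array([[1., 0., 2.],
                             [0., 0., 1.],
                             [3., 0., 0.]]))

df = np.diff(ft.indptr)      # -> array([2, 0, 2]): docs containing each term
smoothing = 1
idf = np.log(float(ft.shape[0] + smoothing) / (df + smoothing)) + 1.0
print(idf)                   # smoothed idf per term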
def distr_chips_row(matrix, chips, n_jobs=-1, norm=True, dist_zero_rows=True, mode="integers"):
    '''
    Trial roulette method for eliciting Dirichlet priors from an expressed hypothesis matrix.
    This function works row-based; thus, each row will receive the given number of chips!

    :param matrix: csr_matrix A_k expressing theory H_k
    :param chips: number of (single row) chips C to distribute
    :param n_jobs: number of jobs, default -1
    :param norm: set False if matrix does not need to be normalized (row-based)
    :param dist_zero_rows: if set to False, the method does not distribute chips to rows
                           with only zeros (use with caution)
    :param mode: sets the mode of the distribution; "integers" means that the distributed
                 pseudo clicks are integers; "reals" means that the pseudo clicks
                 (hyperparameters) can also be positive reals
    :return: Dirichlet hyperparameters in the shape of a matrix
    '''
    if mode not in ['integers', 'reals']:
        raise Exception("Mode needs to be 'integers' or 'reals'!")

    chips = float(chips)
    if chips.is_integer() == False and mode == "integers":
        raise Exception("If mode is 'integers' then only use integer chip counts!")

    if norm == True:
        # row-normalize the hypothesis matrix in place
        norma = matrix.sum(axis=1)
        n_nzeros = np.where(norma > 0)
        n_zeros, _ = np.where(norma == 0)
        norma[n_nzeros] = 1.0 / norma[n_nzeros]
        norma = norma.T[0]
        csr_scale_rows(matrix.shape[0], matrix.shape[1],
                       matrix.indptr, matrix.indices, matrix.data, norma)

    if mode == "integers":
        r = Parallel(n_jobs=n_jobs)(
            delayed(distr_chips)(matrix[i, :], chips,
                                 dist_zero_matrix=dist_zero_rows, norm=False)
            for i in xrange(matrix.shape[0]))
        return scipy.sparse.vstack(r)

    if mode == "reals":
        matrix = matrix * chips
        if dist_zero_rows == True:
            # if some rows have 100% sparsity, we equally distribute the chips
            n, m = matrix.shape
            if norm == False:
                norma = matrix.sum(axis=1)
                n_zeros, _ = np.where(norma == 0)
            if len(n_zeros) > 0:
                # with numpy 1.10 dev, the next line is not needed
                if int(np.version.short_version.split(".")[1]) < 10:
                    n_zeros = np.array(n_zeros)[0]
                matrix[n_zeros, :] = chips / m
        return matrix
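In "reals" mode the result is simply each row normalized to sum to 1 and scaled by C, with rows that are entirely zero receiving C/m in every cell. A standalone toy sketch of that outcome (dense arithmetic and invented numbers, only to illustrate the distribution):

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[1., 1., 0.],
                         [0., 0., 0.],
                         [0., 4., 4.]]))
chips = 10.0

D = A.toarray()
row_sums = D.sum(axis=1)
prior = np.zeros_like(D)
nz = row_sums > 0
prior[nz] = D[nz] / row_sums[nz, None] * chips   # normalized rows scaled by C
prior[~nz] = chips / D.shape[1]                  # empty rows: spread chips evenly
print(prior)
# [[ 5.    5.    0.  ]
#  [ 3.33  3.33  3.33]
#  [ 0.    5.    5.  ]]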
def norm(hypothesis):
    # work on a copy so the caller's matrix is left untouched
    hypothesis = hypothesis.copy()

    # inverse of each non-empty row sum
    norma = hypothesis.sum(axis=1)
    n_nzeros = np.where(norma > 0)
    n_zeros, _ = np.where(norma == 0)
    norma[n_nzeros] = 1.0 / norma[n_nzeros]
    norma = norma.T[0]

    # scale each row in place so that non-empty rows sum to 1
    csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                   hypothesis.indptr, hypothesis.indices,
                   hypothesis.data, norma)
    return hypothesis
def normalize(pairs):
    """ Normalize rows of a csr matrix (sum to 1) """
    if not pairs.format == "csr":
        raise ValueError("csr only")

    # inverse of each non-zero row sum
    factor = pairs.sum(axis=1)
    nnzeros = np.where(factor > 0)
    factor[nnzeros] = 1.0 / factor[nnzeros]
    factor = np.array(factor).T[0]

    # scale the rows of the matrix in place
    sparsetools.csr_scale_rows(pairs.shape[0], pairs.shape[1],
                               pairs.indptr, pairs.indices,
                               pairs.data, factor)
    return pairs
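The same row scaling can be done without the private sparsetools routine by expanding the per-row factors over the stored entries: np.diff(indptr) gives the number of stored entries per row, so repeating each factor that many times lines it up with the data array. A minimal sketch with toy data, not taken from any of the snippets above:

import numpy as np
from scipy.sparse import csr_matrix

pairs = csr_matrix(np.array([[2., 2., 0.],
                             [0., 0., 0.],
                             [1., 0., 3.]]))

row_sums = np.asarray(pairs.sum(axis=1)).ravel()
factor = np.where(row_sums > 0, 1.0 / np.where(row_sums == 0, 1.0, row_sums), 0.0)

# np.diff(indptr) is the number of stored entries per row
pairs.data *= np.repeat(factor, np.diff(pairs.indptr))
print(pairs.toarray())   # every non-empty row now sums to 1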
def l2_norm(sparse_csc_matrix):
    # convert csc_matrix to csr_matrix, which is done in linear time
    norm = sparse_csc_matrix.tocsr(copy=True)

    # compute the inverse of the l2 norm of non-zero elements
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    # modify sparse_csc_matrix in place
    csr_scale_rows(sparse_csc_matrix.shape[0],
                   sparse_csc_matrix.shape[1],
                   sparse_csc_matrix.indptr,
                   sparse_csc_matrix.indices,
                   sparse_csc_matrix.data,
                   norm)
def test_scale_rows_and_cols(self):
    D = matrix([[1, 0, 0, 2, 3],
                [0, 4, 0, 5, 0],
                [0, 0, 6, 7, 0]])

    # TODO expose through function
    S = csr_matrix(D)
    v = array([1, 2, 3])
    csr_scale_rows(3, 5, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), diag(v) * D)

    S = csr_matrix(D)
    v = array([1, 2, 3, 4, 5])
    csr_scale_columns(3, 5, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), D * diag(v))

    # blocks
    E = kron(D, [[1, 2], [3, 4]])
    S = bsr_matrix(E, blocksize=(2, 2))
    v = array([1, 2, 3, 4, 5, 6])
    bsr_scale_rows(3, 5, 2, 2, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), diag(v) * E)

    S = bsr_matrix(E, blocksize=(2, 2))
    v = array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    bsr_scale_columns(3, 5, 2, 2, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), E * diag(v))

    E = kron(D, [[1, 2, 3], [4, 5, 6]])
    S = bsr_matrix(E, blocksize=(2, 3))
    v = array([1, 2, 3, 4, 5, 6])
    bsr_scale_rows(3, 5, 2, 3, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), diag(v) * E)

    S = bsr_matrix(E, blocksize=(2, 3))
    v = array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
    bsr_scale_columns(3, 5, 2, 3, S.indptr, S.indices, S.data, v)
    assert_equal(S.todense(), E * diag(v))
def build_rank(self, objects):
    self.__build_vocabulary(objects)

    n_terms = len(self.vocabulary)
    n_docs = len(objects)
    ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

    # compute tf
    for index, indexable in enumerate(objects):
        for word in indexable.words_generator(self.stop_words):
            word_index_in_vocabulary = self.vocabulary[word]
            doc_word_count = indexable.count_for_word(word)
            ft_matrix[index, word_index_in_vocabulary] = doc_word_count

    self.ft_matrix = ft_matrix.tocsc()
    logger.info('Results will be displayed from higher to lower ranking...')

    # compute idf with smoothing
    df = np.diff(self.ft_matrix.indptr) + self.smoothing
    n_docs_smooth = n_docs + self.smoothing
    idf = np.log(float(n_docs_smooth) / df) + 1.0
    self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

    # compute tf-idf
    self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
    self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

    # l2-normalize the tf-idf rows in place
    norm = self.tf_idf_matrix.tocsr(copy=True)
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                           self.tf_idf_matrix.shape[1],
                           self.tf_idf_matrix.indptr,
                           self.tf_idf_matrix.indices,
                           self.tf_idf_matrix.data,
                           norm)
def csr_scale_rows(A, x):
    # scale row i of the csr matrix A by x[i], in place
    sparsetools.csr_scale_rows(A.shape[0], A.shape[1],
                               A.indptr, A.indices, A.data, x)
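A hypothetical usage of this wrapper on a toy matrix, assuming the sparsetools import from the wrapper's module is in scope (the compiled routine lives in scipy.sparse.sparsetools in older releases and scipy.sparse._sparsetools in newer ones, with the same signature):

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[1., 0., 2.],
                         [0., 3., 0.]]))
x = np.array([10.0, 0.5])

csr_scale_rows(A, x)   # wrapper defined above: row i is multiplied by x[i], in place
print(A.toarray())     # [[10.   0.  20. ], [ 0.   1.5  0. ]]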
def normalized_entropy():
    transition_matrix = pickle.load(open(SSD_HOME + "pickle/transition_matrix", "rb"))
    print "loaded transitions"

    graph = pickle.load(open(SSD_HOME + "pickle/graph", "rb"))
    print "loaded graph"

    values = pickle.load(open(SSD_HOME + "pickle/values", "rb"))
    vocab = pickle.load(open(SSD_HOME + "pickle/vocab", "rb"))
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    # structural hypothesis
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)
    transitions = csr_matrix((transition_matrix[2],
                              (transition_matrix[0], transition_matrix[1])),
                             shape=shape)
    del transition_matrix

    # delete all zero rows, see
    # http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape
    hyp_structural = hyp_structural[unique_nonzero_indice]

    # norm the data (rows sum to 1)
    norm_h = hyp_data.sum(axis=1)
    n_nzeros = np.where(norm_h > 0)
    norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
    norm_h = np.array(norm_h).T[0]
    csr_scale_rows(hyp_data.shape[0], hyp_data.shape[1],
                   hyp_data.indptr, hyp_data.indices, hyp_data.data, norm_h)

    # calculate the entropy for each row
    # entropy = np.apply_along_axis(entropy_step, axis=1, arr=hyp_data)
    entropy = []
    c = 0
    for i in range(0, hyp_data.shape[0]):
        c += 1
        if c % 100000 == 0:
            print c
        x = hyp_data.getrow(i)
        entropy.append(entropy_step(x))
    print "entropy"

    # number of links per row, needed for normalization of the entropy
    norm_h = hyp_structural.sum(axis=1)
    # print norm_h

    normalized_entropy = []
    for i, x in enumerate(entropy):
        if i % 100000 == 0:
            print i
        if math.log(norm_h[i][0]) == 0:
            normalized_entropy.append(0)
        else:
            e = x / math.log(norm_h[i][0])
            normalized_entropy.append(e)
    print "normed entropy"

    write_pickle('output/normalized_entropy.obj', normalized_entropy)
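A minimal sketch of the per-row quantity computed above. entropy_step is not shown in the snippet, so this assumes it returns the Shannon entropy of the normalized row; dividing by the log of the out-degree (number of links in the structural row) puts the value in [0, 1]:

import numpy as np

row = np.array([8., 1., 1.])                 # transition counts of one row (toy data)
p = row / row.sum()                          # the row normalization done via csr_scale_rows
entropy = -np.sum(p[p > 0] * np.log(p[p > 0]))

out_degree = np.count_nonzero(row)           # number of links for this row
normalized = entropy / np.log(out_degree) if out_degree > 1 else 0.0
print(normalized)                            # 1.0 = uniform clicks, 0.0 = fully concentrated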
def gini_random_rows(row_normed):
    print "gini"
    print row_normed

    transition_matrix = pickle.load(open(SSD_HOME + "pickle/transition_matrix", "rb"))
    print "loaded transitions"

    graph = pickle.load(open(SSD_HOME + "pickle/graph", "rb"))
    print "loaded graph"

    values = pickle.load(open(SSD_HOME + "pickle/values", "rb"))
    vocab = pickle.load(open(SSD_HOME + "pickle/vocab", "rb"))
    print "loaded vocab"

    state_count = len(vocab)
    states = vocab.keys()
    shape = (state_count, state_count)

    # structural hypothesis
    hyp_structural = csr_matrix((values, (graph[0], graph[1])),
                                shape=shape, dtype=np.float)
    transitions = csr_matrix((transition_matrix[2],
                              (transition_matrix[0], transition_matrix[1])),
                             shape=shape)
    del transition_matrix

    # delete all zero rows, see
    # http://stackoverflow.com/questions/31188141/scipy-sparse-matrix-remove-the-rows-whose-all-elements-are-zero
    print transitions.shape
    nonzero_row_indice, _ = transitions.nonzero()
    unique_nonzero_indice = np.unique(nonzero_row_indice)
    transitions = transitions[unique_nonzero_indice]
    print transitions.shape

    hyp_data = csr_matrix(transitions, copy=True)
    print hyp_data.shape
    hyp_structural = hyp_structural[unique_nonzero_indice]

    if row_normed:
        # norm the data (rows sum to 1)
        norm_h = hyp_data.sum(axis=1)
        n_nzeros = np.where(norm_h > 0)
        norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
        norm_h = np.array(norm_h).T[0]
        csr_scale_rows(hyp_data.shape[0], hyp_data.shape[1],
                       hyp_data.indptr, hyp_data.indices, hyp_data.data, norm_h)

    # calculate the gini coefficient for a random sample of rows
    gini = []
    c = 0
    # for i in range(0, hyp_data.shape[0]):
    import random
    for i in random.sample(range(0, hyp_data.shape[0]), 10000):
        c += 1
        if c % 1000 == 0:
            print c
        counts = hyp_data.getrow(i).toarray()[0]
        links = hyp_structural.getrow(i).toarray()[0]
        indices_of_links = links > 0
        gini_data = counts[indices_of_links]
        gini.append(gini_step(gini_data))
    print "gini"

    if row_normed:
        write_pickle('output/gini_random_rows_row_normed.obj', gini)
    else:
        write_pickle('output/gini_random_rows.obj', gini)
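gini_step is not included in the snippet above. A common implementation of the Gini coefficient that such a step might use, shown here only as an assumption (sorted-index formulation; 0 means the counts are spread evenly over the links, values near 1 mean they are concentrated on a few links):

import numpy as np

def gini(values):
    # mean-difference formulation over ascending sorted values
    values = np.sort(np.asarray(values, dtype=float))
    n = len(values)
    if n == 0 or values.sum() == 0:
        return 0.0
    index = np.arange(1, n + 1)
    return (2.0 * np.sum(index * values) / (n * values.sum())) - (n + 1.0) / n

print(gini([1., 1., 1., 1.]))   # 0.0
print(gini([0., 0., 0., 10.]))  # 0.75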
# f = open('X_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)

# X = normalize(X)

# compute the inverse of the l2 norm of non-zero elements
X.data **= 2
norm = X.sum(axis=1)
n_nzeros = np.where(norm > 0)
norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
norm = np.array(norm).T[0]
X.data = np.sqrt(X.data)

# modify the sparse matrix in place
sparsetools.csr_scale_rows(X.shape[0], X.shape[1],
                           X.indptr, X.indices, X.data, norm)

print X.shape
print X[0, :].sum()

sparse_save(X, "../data/tfidf_norm.h5")

# f = open('X_norm_data.p', 'wb')
# cPickle.dump(X.data, f, protocol=-1)
# f = open('X__norm_indices.p', 'wb')
# cPickle.dump(X.indices, f, protocol=-1)
# f = open('X_norm_indptr.p', 'wb')
# cPickle.dump(X.indptr, f, protocol=-1)
def evidence(self, hypothesis, structur, k=1, prior=1., norm=True):
    """
    Determines Bayesian evidence given fitted model and hypothesis

    Args:
        hypothesis: Hypothesis csr matrix, indices need to map to those of the
            transition matrix
        k: Concentration parameter k
        prior: proto Dirichlet prior
        norm: Flag for normalizing the hypothesis matrix
    Returns:
        evidence
    """
    # care with copy here
    hypothesis = csr_matrix(hypothesis, copy=True)
    structur = csr_matrix(structur, copy=True)

    pseudo_counts = k * self.state_count

    if hypothesis.size != 0:
        # in case of memory issues set copy to False, but then care about the
        # changed hypothesis matrix
        if norm == True:
            norm_h = hypothesis.sum(axis=1)
            n_nzeros = np.where(norm_h > 0)
            norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
            norm_h = np.array(norm_h).T[0]

            # modify the sparse matrix in place
            csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                           hypothesis.indptr, hypothesis.indices,
                           hypothesis.data, norm_h)

            # distribute pseudo counts to matrix, row-based approach
            hypothesis = hypothesis * pseudo_counts

            # also consider those rows which only include zeros
            norma = hypothesis.sum(axis=1)
            n_zeros, _ = np.where(norma == 0)
            hypothesis[n_zeros, :] = pseudo_counts / self.state_count
        else:
            norm_h = hypothesis.sum(axis=1)
            n_nzeros = np.where(norm_h > 0)
            norm_h[n_nzeros] = 1.0 / norm_h[n_nzeros]
            norm_h = np.array(norm_h).T[0]

            # modify the sparse matrix in place
            csr_scale_rows(hypothesis.shape[0], hypothesis.shape[1],
                           hypothesis.indptr, hypothesis.indices,
                           hypothesis.data, norm_h)

            # distribute pseudo counts to matrix, row-based approach
            # TODO check if this line should be placed after the zero_rows_norm() call
            hypothesis = hypothesis * pseudo_counts

            # self.zero_rows_norm(hypothesis, structur, k)
            self.zero_rows_norm_eff1(hypothesis, structur, k)
    else:
        # if the hypothesis matrix is empty, we can simply increase the proto
        # prior parameter
        prior += k

    # transition matrix with additional Dirichlet prior (not memory efficient)
    transitions_prior = self.transitions.copy()
    transitions_prior = transitions_prior + hypothesis

    # calculate the evidence
    evidence = 0
    evidence += gammaln(hypothesis.sum(axis=1) + self.state_count * prior).sum()
    evidence -= gammaln(self.transitions.sum(axis=1) + hypothesis.sum(axis=1)
                        + self.state_count * prior).sum()
    evidence += gammaln(transitions_prior.data + prior).sum()
    evidence -= gammaln(hypothesis.data + prior).sum() + \
                (len(transitions_prior.data) - len(hypothesis.data)) * gammaln(prior)

    return evidence
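Written out for a single row, the gammaln bookkeeping above is the log marginal likelihood of a Dirichlet-multinomial, with Dirichlet parameters alpha_j = hypothesis_ij + prior and observed transition counts n_j. A small sketch with toy numbers, mirroring only the formula, not the class:

import numpy as np
from scipy.special import gammaln

n = np.array([5., 0., 2.])            # observed transition counts for one row
alpha = np.array([1.5, 1.0, 2.5])     # elicited pseudo counts + proto prior

# log p(n | alpha) = log [ Gamma(sum a) / Gamma(sum a + sum n)
#                          * prod_j Gamma(a_j + n_j) / Gamma(a_j) ]
log_evidence = (gammaln(alpha.sum()) - gammaln(n.sum() + alpha.sum())
                + np.sum(gammaln(n + alpha) - gammaln(alpha)))
print(log_evidence)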
def build_rank(self, objects, isBinaryWord=False):
    """Build tf-idf ranking score for terms in the corpus.

    Note:
        The code in this method could have been extracted to other smaller
        methods, improving legibility. This extraction has not been done so
        that its runtime complexity can be computed easily (the runtime
        complexity can be improved).

    Args:
        objects (list of Indexable): List of indexed objects that will be
            considered during tf-idf score computation.
    """
    self.__build_vocabulary(objects, isBinaryWord)

    n_terms = len(self.vocabulary)
    n_docs = len(objects)
    ft_matrix = sp.lil_matrix((n_docs, n_terms), dtype=np.dtype(float))

    logger.info('Vocabulary assembled with terms count %s', n_terms)

    # compute tf
    logger.info('Starting tf computation...')
    for index, indexable in enumerate(objects):
        for word in indexable.words_generator(self.stop_words):
            word_index_in_vocabulary = self.vocabulary[word]
            doc_word_count = indexable.count_for_word(word)
            ft_matrix[index, word_index_in_vocabulary] = doc_word_count

    # Add synword's tf
    # for word in self.vocabulary_withoutsynword.keys():
    #     for synword in sy.synonymwords(word)[0:4]:
    #         word_index_in_vocabulary = self.vocabulary[word]
    #         synword_index_in_vocabulary = self.vocabulary[synword]
    #         if synword not in self.vocabulary_withoutsynword.keys():
    #             # print "origin word: ", word, " synword: ", synword
    #             ft_matrix[:, synword_index_in_vocabulary] = ft_matrix[:, word_index_in_vocabulary]
    #         elif synword != word:
    #             newmatrix1 = 0.6 * ft_matrix[:, word_index_in_vocabulary] + 0.4 * ft_matrix[:, synword_index_in_vocabulary]
    #             newmatrix2 = 0.4 * ft_matrix[:, word_index_in_vocabulary] + 0.6 * ft_matrix[:, synword_index_in_vocabulary]
    #             ft_matrix[:, word_index_in_vocabulary] = newmatrix1
    #             ft_matrix[:, synword_index_in_vocabulary] = newmatrix2

    self.ft_matrix = ft_matrix.tocsc()

    logger.info('Starting tf-idf computation...')
    # compute idf with smoothing
    df = np.diff(self.ft_matrix.indptr) + self.smoothing
    n_docs_smooth = n_docs + self.smoothing

    # create diagonal matrix to be multiplied with ft
    idf = np.log(float(n_docs_smooth) / df) + 1.0
    self.ifd_diag_matrix = sp.spdiags(idf, diags=0, m=n_terms, n=n_terms)

    # compute tf-idf
    self.tf_idf_matrix = self.ft_matrix * self.ifd_diag_matrix
    self.tf_idf_matrix = self.tf_idf_matrix.tocsr()

    # compute tf-idf normalization (l2, row-wise, in place)
    norm = self.tf_idf_matrix.tocsr(copy=True)
    norm.data **= 2
    norm = norm.sum(axis=1)
    n_nzeros = np.where(norm > 0)
    norm[n_nzeros] = 1.0 / np.sqrt(norm[n_nzeros])
    norm = np.array(norm).T[0]

    sptools.csr_scale_rows(self.tf_idf_matrix.shape[0],
                           self.tf_idf_matrix.shape[1],
                           self.tf_idf_matrix.indptr,
                           self.tf_idf_matrix.indices,
                           self.tf_idf_matrix.data,
                           norm)