def appendMatrix(matrix01, matrix02):
    """Column-wise append ``matrix02`` to ``matrix01`` when their rows are unaligned.

    One output row is produced for every entry of the module-level
    ``riskFactor_userIDs``, in that order. If the user also occurs in the
    module-level ``regex_userIDs``, the row of ``matrix02`` at the first
    matching position is used; otherwise an all-zero row is used. The
    realigned rows are then stacked to the right of ``matrix01``.

    Args:
        matrix01: Matrix to extend; it must have one row per entry of
            ``riskFactor_userIDs`` for the horizontal stack to succeed.
        matrix02: Sparse matrix supporting ``matrix02[i, :]`` row indexing.
            Presumably its rows follow ``regex_userIDs`` order -- confirm
            against the caller.

    Returns:
        A new ``csr_matrix`` holding the columns of ``matrix01`` followed by
        the realigned columns of ``matrix02``. The inputs are not modified.
    """
    n_cols = matrix02.shape[1]
    # asarray makes the element-wise comparison below work for plain lists too.
    regex_ids = numpy.asarray(regex_userIDs)

    # Seed with one empty float64 row: it keeps the stack valid when there are
    # no users and keeps the result dtype identical to stacking onto an empty
    # float64 matrix. It is sliced off again below.
    row_blocks = [csr_matrix((1, n_cols))]
    for user in riskFactor_userIDs:
        # A single scan per user answers both "is present" and "where".
        hits = numpy.where(numpy.atleast_1d(regex_ids == user))[0]
        if hits.size:
            row_blocks.append(matrix02[hits[0], :])
        else:
            row_blocks.append(csr_matrix((1, n_cols)))

    # Stack once instead of rebuilding the accumulator on every iteration.
    aligned = csr_matrix(vstack(row_blocks))[1:, :]
    return csr_matrix(hstack([matrix01, aligned]))
def _compute_heat_diffusion(self, lap):
    """Compute one heat-diffusion result per scale in ``self.taus_``.

    Two code paths are selected by ``self.proc_``:

    * ``'exact'``: takes ``(eigenvals, U)`` from ``self._get_eigens(lap)`` and
      evaluates ``U @ diag(exp(-tau * eigenvals)) @ U.T @ self.initial_condition``
      for every ``tau``.
    * anything else: builds the matrices ``M_0 = I``, ``M_1 = lap - I`` and
      ``M_k = 2 (lap - I) M_{k-1} - M_{k-2}`` up to ``self.order_`` (the
      Chebyshev three-term recurrence) and, for every ``tau``, sums them
      weighted by ``self._compute_cheb_coeff_basis(tau, self.order_)``.

    In both paths, entries whose absolute value is below ``1e-9`` are set to
    zero before the result is stored.

    Args:
        lap: Matrix exposing ``.shape``; the approximate path subtracts an
            identity of size ``lap.shape[0]``, so it is presumably square and
            sparse -- confirm against the caller.

    Returns:
        A list with one array per entry of ``self.taus_``, in the same order.
    """
    eps = 1e-9
    n_simplices = lap.shape[0]

    def _zero_small(values):
        # Mask-based clean-up keeps the input dtype. The previous
        # np.vectorize(lambda ...) inferred its output dtype from the first
        # element, so a leading near-zero entry (lambda returning int 0)
        # silently truncated every other value to an integer.
        cleaned = np.array(values, copy=True, subok=True)
        cleaned[np.abs(cleaned) < eps] = 0
        return cleaned

    heat = []
    if self.proc_ == 'exact':
        eigenvals, U = self._get_eigens(lap)
        for tau in self.taus_:
            decay = np.diagflat(np.exp(-tau * eigenvals).flatten())
            temp = U.dot(decay).dot(U.T).dot(self.initial_condition)
            heat.append(_zero_small(temp))
        return heat

    identity = sp.sparse.eye(n_simplices)
    shifted = lap - identity  # hoisted: reused by every recurrence step
    monome = {0: identity, 1: shifted}
    for k in range(2, self.order_ + 1):
        monome[k] = 2 * shifted.dot(monome[k - 1]) - monome[k - 2]

    for tau in self.taus_:
        coeffs = self._compute_cheb_coeff_basis(tau, self.order_)
        # np.sum is what the deprecated top-level scipy alias `sp.sum`
        # forwarded to.
        temp = np.sum([coeffs[k] * monome[k]
                       for k in range(self.order_ + 1)])
        # toarray() is the supported spelling of the legacy `.A` attribute;
        # the clean-up then removes the small coefficients.
        heat.append(_zero_small(temp.toarray()))
    return heat
import os, codecs import numpy from scipy.sparse import hstack, csr_matrix from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import roc_auc_score # Loading pickle file data into numpy array called data f = open("smoking_1_analytic_data_mapreduce.pkl", "rb") data = numpy.load(f) data = numpy.array(data) f.close() # Loading smoking posts_matrix data posts_matrix = data[0] # 11616 rows x 605107 columns rows = csr_matrix.get_shape(posts_matrix)[0] users_vector = data[3] labels_vector = data[4] keywords_vector = data[2] # Empty matrix for load columns loader_matrix = numpy.empty([rows, 1]) # List of RegExs with open("collocation_smoker_regexs.txt", "r") as f: queries = [l.strip() for l in f] keywords_vector.extend(queries) # Extend keywords list so all columns names accessible # Appends each regex column to loader_matrix for query in queries:
import codecs
import os
import string
import numpy
from scipy.sparse import csr_matrix
from nltk import sent_tokenize, word_tokenize

# Loading pickle file data into numpy array called data.
# The context manager closes the file even if loading raises.
# NOTE(security): allow_pickle=True unpickles arbitrary objects, so only load
# files from a trusted source. It is required because recent NumPy releases
# default to allow_pickle=False and refuse to read pickled data otherwise.
with open('smoking_1_analytic_data_mapreduce.pkl', 'rb') as f:
    data = numpy.load(f, allow_pickle=True)
    # NOTE(review): if the pickled container is ragged, newer NumPy may need
    # dtype=object here -- confirm against the actual file contents.
    data = numpy.array(data)

# Loading smoking posts_matrix data
posts_matrix = data[0]  # 11616 rows x 605107 columns
rows = posts_matrix.shape[0]
users_vector = data[3]
labels_vector = data[4]
keywords_vector = data[2]

# Empty matrix for load columns (uninitialised values, shape 1 x 7)
loader_matrix = numpy.empty([1, 7])


def word_count(tokens):
    """Return the number of tokens that are not found in ``string.punctuation``.

    The check is ``token not in string.punctuation``, i.e. a substring test
    against the punctuation string, so a multi-character token is skipped
    only when it occurs there as a contiguous run.
    """
    return sum(1 for token in tokens if token not in string.punctuation)


def sent_per_status(file_text):
    # NOTE(review): this definition appears to continue beyond the visible
    # excerpt; only the lines shown are reproduced here, unchanged.
    content = file_text.readlines()