def load_train_matrix(*, train_dv=None, train_coll_cls, stack_per_sample=3000):
    train_bows = None
    train_labels = []
    matrix_cache = []
    for count, train_bow_obj in enumerate(train_coll_cls.objects):
        if count % 1000 == 0:
            print("Train load curr at: {}".format(count))
        curr_bow_matrix = train_dv.transform(train_bow_obj.attr_map)[0]
        matrix_cache.append(curr_bow_matrix)
        train_labels.append(train_bow_obj.short_genre)
        if len(matrix_cache) > stack_per_sample:
            train_bows = sp.vstack(matrix_cache)
            matrix_cache = [train_bows]
            print("stacked, train bow size:{},labels size: {}".format(train_bows.shape[0], len(train_labels)))
    if len(matrix_cache) > 1:
        print("stacking")
        train_bows = sp.vstack(matrix_cache)
        matrix_cache = []
    print("Final training size: {}".format(train_bows.shape[0]))
    return train_bows, np.asarray(train_labels)
def update(self, rclient, new):
    texts = []
    urls = self.load_urls("global")
    url_length = len(urls)
    sources = {}
    vectorizer = self.load_vectorizer()
    dst = os.path.join(self.pickles_directory, self.prefix)
    tmp = os.path.join(self.pickles_directory, self.tmp_prefix)
    with TmpDirectory(dst, tmp):
        logger.info("extracting data")
        for i, (text, url, source) in enumerate(self.extract_data(rclient, new)):
            texts.append(text)
            urls.append(url)
            sources.setdefault(source, array('I')).append(i)
        logger.info("update global vectorizer")
        tfidf = vectorizer.transform(texts)
        tfidf = sp.vstack((self.load_tfidf("global"), tfidf), format='csr')
        self.save_tfidf(tfidf, "global")
        self.save_urls(urls, "global")
        for source, indices in sources.iteritems():
            logger.info("update %s vectorizer", source)
            source_texts = (texts[i] for i in indices)
            source_urls = self.load_urls(source) + \
                [urls[i + url_length] for i in indices]
            tfidf = vectorizer.transform(source_texts)
            tfidf = sp.vstack((self.load_tfidf(source), tfidf), format='csr')
            self.save_tfidf(tfidf, source)
            self.save_urls(source_urls, source)
def crawl(addr_in):
    print "Processing {}".format(addr_in)
    path_in = os.path.join(addr_in, "output_bin.npy")
    with open(path_in, "r") as file_in:
        X = smio.load_sparse_csr(file_in)
    path_out_pos = os.path.join(addr_in, "output_bin_pos.npy")
    path_out_neg = os.path.join(addr_in, "output_bin_neg.npy")
    list_pos = []
    list_neg = []
    for line in X:
        res = line[np.size(X, 1) - 1]
        if res == 1:
            list_pos.append(csr_matrix(line))
        else:
            list_neg.append(csr_matrix(line))
    X_pos = vstack(list_pos)
    X_neg = vstack(list_neg)
    file_pos = open(path_out_pos, "w")
    smio.save_sparse_csr(file_pos, X_pos)
    file_pos.close()
    file_neg = open(path_out_neg, "w")
    smio.save_sparse_csr(file_neg, X_neg)
    file_neg.close()
def evalDeriv(self, prob, v=None, adjoint=False):
    if prob._formulation == 'HJ':
        if adjoint:
            if self.modelType == "Head":
                srcDeriv = - prob.hDeriv.T * prob.Grad.T * self.MfLiI.T * (prob.Div.T * v)
            elif self.modelType == "CurrentSource":
                srcDeriv = prob.qDeriv.T * v
            elif self.modelType == "CurrentDensity":
                jsDeriv = sp.vstack((prob.jsxDeriv, prob.jsyDeriv, prob.jszDeriv))
                srcDeriv = - jsDeriv.T * prob.mesh.aveF2CCV * (prob.Div.T * v)
            else:
                raise NotImplementedError()
        else:
            if self.modelType == "Head":
                srcDeriv = -prob.Div * self.MfLiI * prob.Grad * (prob.hDeriv * v)
            elif self.modelType == "CurrentSource":
                srcDeriv = prob.qDeriv * v
            elif self.modelType == "CurrentDensity":
                jsDeriv = sp.vstack((prob.jsxDeriv, prob.jsyDeriv, prob.jszDeriv))
                srcDeriv = -prob.Div * prob.mesh.aveF2CCV.T * (jsDeriv * v)
            else:
                raise NotImplementedError()
    elif prob._formulation == 'EB':
        raise NotImplementedError()
    return srcDeriv
def fit_simplified(self, x_train, y_train):
    c_training_examples = []
    c_training_scores = []
    h_training_examples = []
    h_training_scores = []
    start_time = time.clock()
    print "Number of examples in training set: " + str(len(x_train))
    for i in xrange(len(x_train)):
        flipbit = FlipBit(x_train[i], self.number_of_labels, self.scoring_function,
                          true_output=y_train[i])
        outputs = flipbit.greedy_search(self.depth_of_search)
        h_training_examples.extend(flipbit.get_training_examples())
        h_training_scores.extend(flipbit.get_training_scores())
        for j in xrange(len(outputs)):
            example = construct_sparse_attributes(x_train[i], outputs[j])
            score = calculate_loss(self.scoring_function, outputs[j], y_train[i],
                                   self.number_of_labels)
            c_training_examples.append(example)
            c_training_scores.append(score)
    generating_end_time = time.clock()
    self.h_regressor.fit(vstack(h_training_examples, format='csr'), h_training_scores)
    print "Number of H regression learning examples: " + str(len(h_training_examples))
    self.c_regressor.fit(vstack(c_training_examples, format='csr'), c_training_scores)
    print "Number of C regression learning examples: " + str(len(c_training_examples))
    fit_time = time.clock()
    construction_time = (generating_end_time - start_time)
    learning_time = (fit_time - generating_end_time)
    print("Construction time: {0:.4f}, Learning HC time: {1:.4f}".format(construction_time, learning_time))
def concatNpysIntoOneBigMatrix(npy_list):
    print "Start reading eval images..."
    result = None
    tmpArr = None
    count = 0
    for npy in npy_list:
        if (count + 1) % 1000 == 0:
            sys.stderr.write(str(count + 1) + " eval images loaded\n")
        # convert to sparse format
        # csr means row major
        f = csr_matrix(np.load(npy), dtype='float64')
        # concat into one big sparse matrix
        if result is None:
            result = f
        else:
            if tmpArr is None:
                tmpArr = f
            # merge into main array every 1000 npy loaded (hope it could make the IO less)
            else:
                tmpArr = vstack([tmpArr, f])
            if count % 1000 == 0:
                result = vstack([result, tmpArr])
                tmpArr = None
        count += 1
    result = vstack([result, tmpArr])
    print result.shape
    return result
def maketrain():
    for idx, im in enumerate(trainimages):
        if (idx % 100) == 0:
            print idx, im
        for j in jab[int(im[21:27])]:
            cap_train.append((j, idx))
    with open(path + '/coco_align.train.pkl', 'wb') as f:
        cPickle.dump(cap_train, f)
    sp = []
    for idx, im in enumerate(trainimages):
        data = loadmat('../coco_cnn4/' + im)
        sp.append(csr_matrix(numpy.asarray(data['o24'])))
        if (idx % 10000) == 9999:
            print idx
            with open(path + '/train.pkl' + str(idx + 1), 'wb') as f:
                cPickle.dump(vstack(sp), f, protocol=cPickle.HIGHEST_PROTOCOL)
            sp = []
    with open(path + '/train.pkl' + str(idx + 1), 'wb') as f:
        cPickle.dump(vstack(sp), f, protocol=cPickle.HIGHEST_PROTOCOL)
    # COCO_train2014_000000286899.jpg
    return 0
def blockLiftingAnalysis(self):
    """ Build, lift and analyze block matrix form of the collected LPs. """
    # Extract lists of matrices for each LP element from the collected LPs.
    # The coo_matrix calls are needed because vectors are delivered in dense format
    # by the (block) grounder and the stacking below needs a sparse representation.
    am = [x["a"] for x in self.ground]
    bm = [sp.coo_matrix(x["b"]) for x in self.ground]
    cm = [sp.coo_matrix(x["c"]) for x in self.ground]
    gm = [x["g"] for x in self.ground]
    hm = [sp.coo_matrix(x["h"]) for x in self.ground]
    # stack it
    block_a = sp.block_diag(am)
    block_b = sp.vstack(bm)
    block_c = sp.vstack(cm)
    block_g = sp.block_diag(gm)
    block_h = sp.vstack(hm)
    # lift it
    ground = mdict(block_a, block_b, block_c, block_g, block_h)
    lifted = lift(ground, self.sparse, self.orbits)
    # say it
    print >> self.report, "BLOCK LP LIFTING"
    reportToFile(self.report, ground, lifted, self.dumpBlockMatrix)
def __init__(self, Y, a, b, c, d, H, q=None):
    '''
    Q_{i,j} = a*(y_i*y_j)^3 + b*(y_i*y_j)^2 + c*(y_i+y_j) + d - h_i*h_j
    q = q
    '''
    super(AMF_deg3_BQP, self).__init__()
    n, l = Y.shape
    self.a = a
    self.b = b
    self.c = c
    self.d = d
    self.Y1 = Y
    self.H = H  # mostly updated
    if q is None:
        self.q = np.zeros(n)
    else:
        self.q = q
    # consider 2nd and 3rd power of Y
    Y2_tmp = []
    Y3_tmp = []
    for i in xrange(n):
        y = Y.getrow(i)
        Y2_tmp.append(kron(y, y))
        Y3_tmp.append(kron(kron(y, y), y))
    self.Y2 = vstack(Y2_tmp).tocsr()
    self.Y3 = vstack(Y3_tmp).tocsr()
def getInterpolationMatCartMesh(self, Mrect, locType="CC", locTypeTo=None):
    """
    Takes a cartesian mesh and returns a projection to translate onto
    the cartesian grid.
    """
    assert self.isSymmetric, (
        "Currently we have not taken into account "
        "other projections for more complicated "
        "CylMeshes"
    )
    if locTypeTo is None:
        locTypeTo = locType
    if locType == "F":
        # do this three times for each component
        X = self.getInterpolationMatCartMesh(Mrect, locType="Fx", locTypeTo=locTypeTo + "x")
        Y = self.getInterpolationMatCartMesh(Mrect, locType="Fy", locTypeTo=locTypeTo + "y")
        Z = self.getInterpolationMatCartMesh(Mrect, locType="Fz", locTypeTo=locTypeTo + "z")
        return sp.vstack((X, Y, Z))
    if locType == "E":
        X = self.getInterpolationMatCartMesh(Mrect, locType="Ex", locTypeTo=locTypeTo + "x")
        Y = self.getInterpolationMatCartMesh(Mrect, locType="Ey", locTypeTo=locTypeTo + "y")
        Z = Utils.spzeros(getattr(Mrect, "n" + locTypeTo + "z"), self.nE)
        return sp.vstack((X, Y, Z))
    grid = getattr(Mrect, "grid" + locTypeTo)
    # This is unit circle stuff, 0 to 2*pi, starting at x-axis, rotating
    # counter clockwise in an x-y slice
    theta = -np.arctan2(grid[:, 0] - self.cartesianOrigin[0],
                        grid[:, 1] - self.cartesianOrigin[1]) + np.pi / 2
    theta[theta < 0] += np.pi * 2.0
    r = ((grid[:, 0] - self.cartesianOrigin[0]) ** 2 +
         (grid[:, 1] - self.cartesianOrigin[1]) ** 2) ** 0.5
    if locType in ["CC", "N", "Fz", "Ez"]:
        G, proj = np.c_[r, theta, grid[:, 2]], np.ones(r.size)
    else:
        dotMe = {
            "Fx": Mrect.normals[:Mrect.nFx, :],
            "Fy": Mrect.normals[Mrect.nFx:(Mrect.nFx + Mrect.nFy), :],
            "Fz": Mrect.normals[-Mrect.nFz:, :],
            "Ex": Mrect.tangents[:Mrect.nEx, :],
            "Ey": Mrect.tangents[Mrect.nEx:(Mrect.nEx + Mrect.nEy), :],
            "Ez": Mrect.tangents[-Mrect.nEz:, :],
        }[locTypeTo]
        if "F" in locType:
            normals = np.c_[np.cos(theta), np.sin(theta), np.zeros(theta.size)]
            proj = (normals * dotMe).sum(axis=1)
        if "E" in locType:
            tangents = np.c_[-np.sin(theta), np.cos(theta), np.zeros(theta.size)]
            proj = (tangents * dotMe).sum(axis=1)
        G = np.c_[r, theta, grid[:, 2]]
    interpType = locType
    if interpType == "Fy":
        interpType = "Fx"
    elif interpType == "Ex":
        interpType = "Ey"
    Pc2r = self.getInterpolationMat(G, interpType)
    Proj = Utils.sdiag(proj)
    return Proj * Pc2r
def build_file_data(self, class_names, features_file):
    for c in class_names:
        t0 = time.time()
        train_p, train_n, test_p, test_n = self.data_obj.split_by_class(c)
        t1 = time.time()
        print("Split by class time: ", t1 - t0, "s")
        numntrain = train_n.shape[0]
        numptrain = train_p.shape[0]
        X = sparse.vstack([train_p, train_n])
        y = [1] * numptrain + [0] * numntrain
        if self.split_type == "CROSSVALIDATION":
            ftr_file_name = 'features/data_%s_%s' % (c, features_file)
        else:
            ftr_file_name = 'features/train_%s_%s' % (c, features_file)
        print('Writing %s...' % ftr_file_name)
        self.write_data(ftr_file_name, X, y)
        numntest = test_n.shape[0]
        numptest = test_p.shape[0]
        X = sparse.vstack([test_p, test_n])
        y = [1] * numptest + [0] * numntest
        if self.split_type == "CROSSVALIDATION":
            ftr_file_name = 'features/data_%s_%s' % (c, features_file)
        else:
            ftr_file_name = 'features/test_%s_%s' % (c, features_file)
        print('Writing %s...' % ftr_file_name)
        self.write_data(ftr_file_name, X, y)
def eval_jac_g(x, flag, user_data=None):
    """Calculates the Jacobian matrix.

    If the flag is true, returns a tuple (row, col) to indicate the
    sparse Jacobian matrix's structure.
    If the flag is false, returns the values of the Jacobian matrix
    with length nnzj.
    """
    Js = user_data['Js']
    if flag:
        return (Js.row, Js.col)
    else:
        om = user_data['om']
        Ybus = user_data['Ybus']
        Yf = user_data['Yf']
        Yt = user_data['Yt']
        ppopt = user_data['ppopt']
        il = user_data['il']
        A = user_data['A']
        _, _, dhn, dgn = opf_consfcn(x, om, Ybus, Yf, Yt, ppopt, il)
        if A is not None and issparse(A):
            J = vstack([dgn.T, dhn.T, A], 'coo')
        else:
            J = vstack([dgn.T, dhn.T], 'coo')
        ## FIXME: Extend PyIPOPT to handle changes in sparsity structure
        nnzj = Js.nnz
        Jd = zeros(nnzj)
        Jc = J.tocsc()
        for i in range(nnzj):
            Jd[i] = Jc[Js.row[i], Js.col[i]]
        return Jd
def generate_offspring(self, vectorpopulation, parameterpopulation, parameter_options, fitness,
                       elite=0.1, tournament_size=2, crossover_prob=0.9, n_crossovers=1,
                       mutation_rate=0.3, win_condition='highest'):
    fitness_numbered = [[i, x] for i, x in enumerate(fitness)]
    fitness_sorted = sorted(fitness_numbered, key=lambda k: k[1], reverse=True) \
        if win_condition == 'highest' else sorted(fitness_numbered, key=lambda k: k[1])
    new_population = [vectorpopulation[x[0], :]
                      for x in fitness_sorted[:int(elite * vectorpopulation.shape[0])]]
    new_parameterpopulation = [parameterpopulation[x[0]]
                               for x in fitness_sorted[:int(elite * vectorpopulation.shape[0])]]
    fitness_candidates = fitness_sorted[int(elite * vectorpopulation.shape[0]):]
    while len(new_population) < vectorpopulation.shape[0]:
        # select
        selections = self.tournament_selection(fitness_candidates, tournament_size, win_condition)
        parents = vectorpopulation[selections, :]
        parameterparents = parameterpopulation[selections, :]
        # generate and mutate
        if random.random() < crossover_prob:
            offspring = []
            paramoffspring = []
            for generation in range(2):
                child = self.offspring_crossover(parents, n_crossovers)
                child_mutated = self.mutate(child, mutation_rate)
                while child_mutated.count_nonzero() == 0:
                    child_mutated = self.mutate(child, mutation_rate)
                offspring.append(child_mutated)
                paramoffspring.append(self.random_parameterpopulation(parameter_options, 1)[0])
        else:
            offspring = parents
            paramoffspring = parameterparents
        # accept
        new_population.extend(offspring)
        new_parameterpopulation.extend(paramoffspring)
    return sparse.vstack(new_population), sparse.vstack(new_parameterpopulation)
def cluster(train_data, test_data, tag_matrix, k):
    km = MiniBatchKMeans(k)
    training_labels = km.fit_predict(train_data)
    testing_labels = km.predict(test_data)
    training_matrices = []
    testing_matrices = []
    tag_matrices = []
    for i in xrange(k):
        train_rows = [train_data.getrow(j) for j in xrange(train_data.shape[0])
                      if training_labels[j] == i]
        test_rows = [test_data.getrow(j) for j in xrange(test_data.shape[0])
                     if testing_labels[j] == i]
        if len(train_rows) == 0:
            training_matrices.append(sparse.csr_matrix((1, 1)))
            testing_matrices.append(sparse.csr_matrix((1, 1)))
            tag_matrices.append(tag_matrix.getrow(0) - tag_matrix.getrow(0))
            continue
        training_matrices.append(sparse.vstack(train_rows))
        if len(test_rows) == 0:
            testing_matrices.append(sparse.csr_matrix((1, 1)))
            tag_matrices.append(tag_matrix.getrow(0) - tag_matrix.getrow(0))
            continue
        testing_matrices.append(sparse.vstack(test_rows))
        ktags = sum([tag_matrix.getrow(j) for j in xrange(tag_matrix.shape[0])
                     if training_labels[j] == i])
        for j in xrange(len(ktags.data)):
            ktags.data[j] /= ktags.data[j]
        tag_matrices.append(ktags)
    tag_matrix = sparse.vstack(tag_matrices)
    save_matrix('tag_matrix.txt', tag_matrix)
    for i in xrange(k):
        save_matrix('training_matrix_%d.txt' % i, training_matrices[i].tocoo())
        save_matrix('testing_matrix_%d.txt' % i, testing_matrices[i].tocoo())
    predictions = []
    for i in xrange(len(testing_labels)):
        predictions.append(tag_matrices[testing_labels[i]])
    return sparse.vstack(predictions)
def multiclass_to_ranking(X, y):
    n_classes = y.shape[1]
    n_samples = X.shape[0]
    # create extended X matrix
    X_features = X.copy()
    for i in range(n_classes - 1):
        X_features = sp.vstack([X_features, X])
    X_labels = None
    for i in range(n_classes):
        X_tmp = sp.csc_matrix((n_samples, n_classes))
        X_tmp[:, i] = 1
        if X_labels is not None:
            X_labels = sp.vstack([X_labels, X_tmp])
        else:
            X_labels = X_tmp
    X_ext = sp.hstack([X_labels, X_features])
    # create all combinations
    compars = []
    for i_row, row in enumerate(y.tocsr()):
        # over all true labels
        for i in row.indices:
            for c in range(n_classes):
                if c not in row.indices:
                    offset = i_row * n_classes
                    compars.append([offset + i, offset + c])
    compars = np.vstack(compars)
    compars = compars.astype(np.float64)
    return X_ext, compars
def _get_aug_mat(self, k, j):
    """
    Generate the matrix [[A, E], [0, A]] where
        A is the overall dynamics generator
        E is the control dynamics generator
    for a given timeslot and control
    returns this augmented matrix
    """
    dyn = self.parent
    dg = dyn._get_phased_dyn_gen(k)
    if dyn.oper_dtype == Qobj:
        A = dg.data * dyn.tau[k]
        E = dyn._get_phased_ctrl_dyn_gen(k, j).data * dyn.tau[k]
        Z = sp.csr_matrix(dg.data.shape)
        aug = Qobj(sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])]))
    elif dyn.oper_dtype == np.ndarray:
        A = dg * dyn.tau[k]
        E = dyn._get_phased_ctrl_dyn_gen(k, j) * dyn.tau[k]
        Z = np.zeros(dg.shape)
        aug = np.vstack([np.hstack([A, E]), np.hstack([Z, A])])
    else:
        A = dg * dyn.tau[k]
        E = dyn._get_phased_ctrl_dyn_gen(k, j) * dyn.tau[k]
        Z = dg * 0.0
        aug = sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])])
    return aug
def block2full(ht, sparse=False):
    """Convert a heterostructure with block diagonal Hamiltonian
    into the full form"""
    if not ht.block_diagonal:
        return ht  # stop
    ho = ht.copy()
    ho.block_diagonal = False  # set in false from now on
    nb = len(ht.central_intra)  # number of blocks
    lc = [csc_matrix(ht.central_intra[i][i].shape) for i in range(nb)]
    rc = [csc_matrix(ht.central_intra[i][i].shape) for i in range(nb)]
    lc[0] = csc_matrix(ht.left_coupling)
    rc[nb - 1] = csc_matrix(ht.right_coupling)
    # convert the central to sparse form
    central = [[None for i in range(nb)] for j in range(nb)]
    for i in range(nb):
        for j in range(nb):
            if ht.central_intra[i][j] is None:
                continue
            else:
                central[i][j] = csc_matrix(ht.central_intra[i][j])
    from scipy.sparse import vstack
    if sparse:
        ho.left_coupling = vstack(lc)
        ho.right_coupling = vstack(rc)
        ho.central_intra = bmat(ht.central_intra)  # as sparse matrix
    else:
        ho.left_coupling = vstack(lc).todense()
        ho.right_coupling = vstack(rc).todense()
        ho.central_intra = bmat(central).todense()  # as dense matrix
    return ho
def _split_data(self, factor, factor_null_val):
    """
    Splits self.data into two sparse matrices.

    Arguments:

    Returns:
        A tuple of two sparse matrices containing rows for which
        factor's 'factor' values:
        (1) equal 'factor_null_val'
        (2) do not equal 'factor_null_val'
    """
    fac_len, rows_len = self.fac_len, self.data.shape[0]
    fac_ind = self.col_names.index(factor)
    non_null_set = []
    null_set = []
    m_csr = self.data.tocsr()
    for row_ind in range(rows_len):
        arow = np.ravel(m_csr.getrow(row_ind).todense())
        if arow[fac_ind] == factor_null_val:
            null_set.append(m_csr.getrow(row_ind))
        else:
            non_null_set.append(m_csr.getrow(row_ind))
    return (sparse.vstack(null_set).tolil(),
            sparse.vstack(non_null_set).tolil())
def integrateObservation(self, obs):
    """This method stores the observation inside the agent"""
    start_time = time.time()
    self.obs = obs
    if (len(obs) != 8):
        self.isEpisodeOver = True
    else:
        self.mayMarioJump, self.isMarioOnGround, self.marioFloats, self.enemiesFloats, \
            self.levelScene, dummy, action, self.obsArray = obs
        self.obsArray = csr_matrix(self.obsArray)
        self.should_take_action = action
        if (self.count > 5):
            if (self.initialTraining):
                self.actions = numpy.vstack((self.actions, numpy.array([action])))
                self.states = vstack((self.states, self.prev_obs.T))
                self.human_input += 1
            elif self.isLearning:
                if self.count > 6 and action != self.actionTaken:
                    self.mistakes += 1
                if ((self.actionTaken != action)):
                    self.actions = numpy.vstack((self.actions, numpy.array([action])))
                    self.states = vstack((self.states, self.prev_obs.T))
                    self.human_input += 1
                self.human_input += 1
        self.prev_obs = self.obsArray
        self.count += 1
def contiguous_train_test_split(X, y, train_size):
    N = len(y)
    N_train = np.int(N * train_size)
    start_train = np.random.randint(N)
    end_train = start_train + N_train
    if end_train <= N:
        print("start: " + str(start_train))
        print("end: " + str(end_train))
        print("total: " + str(N))
        X_train = X[start_train:end_train, :]
        y_train = y[start_train:end_train]
        X1 = X[:start_train, :]
        X2 = X[end_train:, :]
        X_test = sparse.vstack((X1, X2))
        y_test = np.r_[y[:start_train], y[end_train:]]
    else:
        end_train = end_train - N
        print("start: " + str(start_train))
        print("end: " + str(end_train))
        print("total: " + str(N))
        X1 = X[:end_train, :]
        X2 = X[start_train:, :]
        X_train = sparse.vstack((X1, X2))
        y_train = np.r_[y[:end_train], y[start_train:]]
        X_test = X[end_train:start_train, :]
        y_test = y[end_train:start_train]
    return X_train, X_test, y_train, y_test
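# Hedged usage sketch (not part of the original module): exercise
# contiguous_train_test_split on a random CSR matrix. The demo inputs below
# are illustrative; they assume `sparse` is scipy.sparse and `np` is numpy,
# as in the function above.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X_demo = sparse.random(100, 20, density=0.1, format="csr", random_state=rng)
    y_demo = rng.randint(0, 2, size=100)
    X_tr, X_te, y_tr, y_te = contiguous_train_test_split(X_demo, y_demo, train_size=0.7)
    print(X_tr.shape, X_te.shape, len(y_tr), len(y_te))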
def Jfull(self, m=None, f=None):
    if f is None:
        f = self.fields(m)
    nn = len(f) - 1
    Asubs, Adiags, Bs = list(range(nn)), list(range(nn)), list(range(nn))
    for ii in range(nn):
        dt = self.timeSteps[ii]
        bc = self.getBoundaryConditions(ii, f[ii])
        Asubs[ii], Adiags[ii], Bs[ii] = self.diagsJacobian(
            m, f[ii], f[ii + 1], dt, bc
        )
    Ad = sp.block_diag(Adiags)
    zRight = Utils.spzeros(
        (len(Asubs) - 1) * Asubs[0].shape[0], Adiags[0].shape[1]
    )
    zTop = Utils.spzeros(
        Adiags[0].shape[0], len(Adiags) * Adiags[0].shape[1]
    )
    As = sp.vstack((zTop, sp.hstack((sp.block_diag(Asubs[1:]), zRight))))
    A = As + Ad
    B = np.array(sp.vstack(Bs).todense())
    Ainv = self.Solver(A, **self.solverOpts)
    AinvB = Ainv * B
    z = np.zeros((self.mesh.nC, B.shape[1]))
    du_dm = np.vstack((z, AinvB))
    J = self.survey.deriv(f, du_dm_v=du_dm)  # not multiplied by v
    return J
def splitDataByClass(data, label, percentage=0.3):
    category = np.unique(label)
    labeled = None
    unlabeled = None
    y_labeled = None
    y_unlabeled = None
    first = False
    for c in category:
        split = np.nonzero(label == c)[0]
        sz = int(percentage * len(split))
        choice = np.random.choice(split, sz, replace=False)
        remaining = np.setdiff1d(split, choice)
        # print(choice.shape, remaining.shape, split.shape)
        if first:
            # labeled = np.concatenate((labeled, data[choice, :]), axis=0)
            labeled = vstack((labeled, data[choice, :]))
            unlabeled = vstack((unlabeled, data[remaining, :]))
            y_labeled = np.concatenate((y_labeled, label[choice]))
            y_unlabeled = np.concatenate((y_unlabeled, label[remaining]))
        else:
            labeled = data[choice, :]
            unlabeled = data[remaining, :]
            y_labeled = label[choice]
            y_unlabeled = label[remaining]
            first = True
    return ((labeled, y_labeled), (unlabeled, y_unlabeled))
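# Hedged usage sketch (not from the original module): split a small random
# sparse dataset 30/70 per class. The demo matrix and labels are illustrative
# inputs, not part of the original code.
if __name__ == "__main__":
    import scipy.sparse as _sp_demo
    X_demo = _sp_demo.random(60, 10, density=0.2, format="csr")
    y_demo = np.repeat([0, 1, 2], 20)
    (lab, y_lab), (unlab, y_unlab) = splitDataByClass(X_demo, y_demo, percentage=0.3)
    print(lab.shape, unlab.shape, y_lab.shape, y_unlab.shape)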
def init_params(self, x_labeled, x_unlabeled, y):
    self.L = y.shape[0]
    self.U = x_unlabeled.shape[0]
    if self.iprint:
        print('training SVM ...')
    self.clf.fit(x_labeled, y)
    if self.iprint:
        print('training SVM complete')
    self.support_vector = self.clf.support_vector
    self.bias = self.clf.bias
    self.alpha = self.clf.alpha_times_y
    self.C = np.zeros(self.L + 2 * self.U + 1)
    self.C[1:self.L + 1] = self.clf.C
    self.C[self.L + 1:] = self.C_unlabel
    if self.sparse:
        x = sp.vstack((x_labeled, x_unlabeled))
        x = sp.vstack((x, x_unlabeled))
    else:
        x = np.vstack((x_labeled, x_unlabeled))
        x = np.vstack((x, x_unlabeled))
    y_all = np.append(1, y)
    y_all = np.append(y_all, np.ones(self.U))
    y_all = np.append(y_all, -np.ones(self.U))
    return x, y_all
def combine_matrix():
    X000 = [sio.loadmat(filein_name[:-4] + '0X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '1X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '2X000.mat')['X000']]
    X001 = [sio.loadmat(filein_name[:-4] + '0X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '1X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '2X001.mat')['X001']]
    X010 = [sio.loadmat(filein_name[:-4] + '0X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '1X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '2X010.mat')['X010']]
    X100 = [sio.loadmat(filein_name[:-4] + '0X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '1X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '2X100.mat')['X100']]
    X_000 = sp.vstack([X000[0], X000[1], X000[2]])
    X_001 = sp.vstack([X001[0], X001[1], X001[2]])
    X_010 = sp.vstack([X010[0], X010[1], X010[2]])
    X_100 = sp.vstack([X100[0], X100[1], X100[2]])
    print(X_000.shape)
    X_model_100 = sp.hstack([X_000, X_100])
    sio.savemat(filein_name[:-4] + 'X100-model.mat', {'X100': X_model_100})
    X_model_010 = sp.hstack([X_000, X_010])
    sio.savemat(filein_name[:-4] + 'X010-model.mat', {'X010': X_model_010})
    X_model_001 = sp.hstack([X_000, X_001])
    sio.savemat(filein_name[:-4] + 'X001-model.mat', {'X001': X_model_001})
def initPatients(self, patientSet="train"):
    visitIDs = file(self.settings.find('./patients').attrib['src'])
    self.visitShelf = shelve.open(self.settings.find('./patients').attrib['shelf'])
    self.wordShelf = shelve.open(self.settings.find('./vocab').attrib['shelf'])
    start = int(filter(lambda s: s.attrib['name'] == "train",
                       self.settings.findall('./patientSets/set'))[0].attrib['start'])
    end = int(filter(lambda s: s.attrib['name'] == "train",
                     self.settings.findall('./patientSets/set'))[0].attrib['end'])
    visit_ids = [z.strip() for z in visitIDs.readlines()[start:end]]
    self.visitIDs = visit_ids
    print "reading in patients", len(visit_ids)
    print 'from shelve'
    sparse_X = []
    s = time.time()
    for i, v in enumerate(self.visitIDs):
        if i % 1000 == 0:
            print i, time.time() - s
        if i > end:
            break
        pat = self.visitShelf[v]
        pat['anchors'] = set()
        self.patients[v] = pat
        sparse_X.append(pat['sparse_X'])
    # print self.patients.keys()
    self.sparse_X = sparse.vstack(sparse_X, 'lil')
    self.train_patient_ids = visit_ids
    self.patientList = [self.patients[v] for v in self.visitIDs]
    self.patientIndex = dict(zip([pat['index'] for pat in self.patientList],
                                 xrange(len(self.patientList))))
    visitIDs.seek(0)
    start = int(filter(lambda s: s.attrib['name'] == "validate",
                       self.settings.findall('./patientSets/set'))[0].attrib['start'])
    end = int(filter(lambda s: s.attrib['name'] == "validate",
                     self.settings.findall('./patientSets/set'))[0].attrib['end'])
    visit_ids = [z.strip() for z in visitIDs.readlines()[start:end]]
    self.validate_patient_set = set(visit_ids)
    self.validate_patient_ids = visit_ids
    self.validate_patient_list = []
    print "reading in validate patients", len(visit_ids)
    print 'from shelve'
    sparse_X_validate = []
    s = time.time()
    for i, v in enumerate(visit_ids):
        if i % 1000 == 0:
            print i, time.time() - s
        if i > end:
            break
        pat = self.visitShelf[v]
        pat['anchors'] = set()
        self.patients[v] = pat
        self.validate_patient_list.append(pat)
        sparse_X_validate.append(pat['sparse_X'])
    self.sparse_X_validate = sparse.vstack(sparse_X_validate, 'lil')
def get_clean_data(rsa_file_path, rsa_format, ano_file_path, ano_format, meta_data,
                   only_first_month=False):
    cmd_codes = meta_data['cmd']
    stay_type_codes = meta_data['stay_type']
    stay_complexity_codes = meta_data['stay_complexity']
    ano_data = list()
    exit_month_data = list()
    chunk = 1000
    sex_data_first_col = 0
    age_in_year_data_first_col = sex_data_first_col + 2
    age_in_day_data_first_col = age_in_year_data_first_col + formats.age_in_year_cols_count
    stay_length_data_first_col = age_in_day_data_first_col + formats.age_in_day_cols_count
    cmd_codes_first_col = stay_length_data_first_col + formats.stay_length_cols
    stay_type_codes_first_col = cmd_codes_first_col + len(cmd_codes)
    stay_complexity_codes_first_col = stay_type_codes_first_col + len(stay_type_codes)
    cols_count = stay_complexity_codes_first_col + len(stay_complexity_codes)
    np_data = np.zeros((chunk, cols_count), dtype=np.int)
    rsa_data = sparse.csr_matrix((0, cols_count))
    index = 0
    global_index = 0
    lines_count = 0
    with open(rsa_file_path) as rsa_file:
        with open(ano_file_path) as ano_file:
            while True:
                if index == chunk:
                    rsa_data = vstack([rsa_data, sparse.csr_matrix(np_data)])
                    np_data.fill(0)
                    index = 0
                rsa_line = rsa_file.readline()
                ano_line = ano_file.readline()
                if ano_tools.is_ano_ok(ano_line, ano_format) and rsa_tools.is_rsa_ok(rsa_line, rsa_format):
                    rsa = rsa_tools.get_rsa(rsa_line, rsa_format)
                    exit_month = rsa['exit_month']
                    if only_first_month and exit_month != 1:
                        continue
                    exit_month_data.append(exit_month)
                    ano = ano_tools.get_ano(ano_line, ano_format, global_index)
                    ano_data.append(ano)
                    np_data[index, sex_data_first_col + rsa['sex']] = 1
                    np_data[index, age_in_year_data_first_col + rsa['age_in_year_cat']] = 1
                    np_data[index, age_in_day_data_first_col + rsa['age_in_day_cat']] = 1
                    np_data[index, stay_length_data_first_col + rsa['stay_length_cat']] = 1
                    if rsa['cmd'] != '':
                        np_data[index, cmd_codes_first_col + cmd_codes.index(rsa['cmd'])] = 1
                    if rsa['stay_type'] != '':
                        np_data[index, stay_type_codes_first_col + stay_type_codes.index(rsa['stay_type'])] = 1
                    if rsa['stay_complexity'] != '':
                        np_data[index, stay_complexity_codes_first_col + stay_complexity_codes.index(rsa['stay_complexity'])] = 1
                    index += 1
                    global_index += 1
                if lines_count % 10000 == 0:
                    print '\rProcessed %s \t added %s' % (lines_count, global_index),
                lines_count += 1
                if not rsa_line and not ano_line:
                    break
    if index % chunk != 0:
        rsa_data = vstack([rsa_data, sparse.csr_matrix(np_data[0:index, :])])
    return {'anos': ano_data, 'rsas': rsa_data, 'exit_month_data': exit_month_data}
def extract_features(utterances):
    logger.info('Extracting features.')
    # This might be inefficient, because the space object is passed to the pool.
    X = pool.map(space_compose, ((u, composer) for u in utterances), chunksize=CHUNK_SIZE)
    logger.debug('Stacking %d rows.', len(X))
    X = vstack(X, format='csr')
    if concatinate_prev_utterace:
        logger.debug('Getting previous utterances.')
        # It is basically the same X, just shifted one row up.
        prev_X = vstack([csr_matrix((1, X.shape[1])), csr_matrix(X)[:-1]], format='csr')
        # Reset prev. utterance vectors to 0 for the first utterance in a conversation.
        prev_conversation_no = None
        for row, u in enumerate(utterances):
            conversation_no = u.conversation_no
            if conversation_no != prev_conversation_no:
                prev_X.data[prev_X.indptr[row]:prev_X.indptr[row + 1]] = 0
                prev_conversation_no = conversation_no
        prev_X.eliminate_zeros()
        assert (X[0] == prev_X[1]).todense().all()
        logger.debug('Hstacking utterances with their previous utterances.')
        X = hstack([X, prev_X], format='csr')
    return X
def compose_all(self, phrases):
    """
    Composes all `phrases` and returns all unigrams and `phrases` as a matrix.
    Does NOT store the composed vectors. Unigram vectors must be brought in
    by extending classes.

    :param phrases: iterable of `str` or `DocumentFeature`
    :return: a tuple of:
        1) `csr_matrix` containing all vectors, unigram and composed
        2) the columns (features) of the unigram space that was used for composition
        3) a row index- dict {Feature: Row}. Maps from a feature to the row in 1)
           where the vector for that feature is. Note: This is the opposite of
           what IO functions in discoutils expect
    """
    composable_phrases = [foo for foo in phrases if foo in self]
    logging.info('Composing... %s able to compose %d/%d phrases using %d unigrams',
                 self.name, len(composable_phrases), len(phrases),
                 len(self.unigram_source.name2row))
    if not composable_phrases:
        raise ValueError('%s cannot compose any of the provided phrases' % self.name)
    new_matrix = sp.vstack(self.get_vector(foo) for foo in composable_phrases)
    old_len = len(self.unigram_source.name2row)
    all_rows = deepcopy(self.unigram_source.name2row)  # can't mutate the unigram datastructure
    for i, phrase in enumerate(composable_phrases):
        key = phrase if isinstance(phrase, str) else str(phrase)
        # phrase shouldn't be in the unigram source.
        assert key not in all_rows
        all_rows[key] = i + old_len
        # this will not append to all_rows if phrase is contained in unigram_source
    all_vectors = sp.vstack([self.unigram_source.matrix, new_matrix], format='csr')
    assert all_vectors.shape == (len(all_rows), len(self.unigram_source.columns)), 'Shape mismatch'
    return all_vectors, self.unigram_source.columns, all_rows
def kron_mat(lin_op):
    """Returns the coefficient matrix for KRON linear op.

    Parameters
    ----------
    lin_op : LinOp
        The conv linear op.

    Returns
    -------
    list of SciPy CSC matrix
        The matrix representing the Kronecker product.
    """
    constant = const_mat(lin_op.data)
    lh_rows, lh_cols = constant.shape
    rh_rows, rh_cols = lin_op.args[0].size
    # Stack sections for each column of the output.
    col_blocks = []
    for j in range(lh_cols):
        # Vertically stack A_{ij}Identity.
        blocks = []
        for i in range(lh_rows):
            blocks.append(constant[i, j] * sp.eye(rh_rows))
        column = sp.vstack(blocks)
        # Make block diagonal matrix by repeating column.
        col_blocks.append(sp.block_diag(rh_cols * [column]))
    coeff = sp.vstack(col_blocks).tocsc()
    return [coeff]
def makePropertyTensor(M, tensor):
    if tensor is None:  # default is ones
        tensor = np.ones(M.nC)
    if isScalar(tensor):
        tensor = tensor * np.ones(M.nC)
    propType = TensorType(M, tensor)
    if propType == 1:  # Isotropic!
        Sigma = sp.kron(sp.identity(M.dim), sdiag(mkvc(tensor)))
    elif propType == 2:  # Diagonal tensor
        Sigma = sdiag(mkvc(tensor))
    elif M.dim == 2 and tensor.size == M.nC * 3:  # Fully anisotropic, 2D
        tensor = tensor.reshape((M.nC, 3), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 2])))
        row2 = sp.hstack((sdiag(tensor[:, 2]), sdiag(tensor[:, 1])))
        Sigma = sp.vstack((row1, row2))
    elif M.dim == 3 and tensor.size == M.nC * 6:  # Fully anisotropic, 3D
        tensor = tensor.reshape((M.nC, 6), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 3]), sdiag(tensor[:, 4])))
        row2 = sp.hstack((sdiag(tensor[:, 3]), sdiag(tensor[:, 1]), sdiag(tensor[:, 5])))
        row3 = sp.hstack((sdiag(tensor[:, 4]), sdiag(tensor[:, 5]), sdiag(tensor[:, 2])))
        Sigma = sp.vstack((row1, row2, row3))
    else:
        raise Exception('Unexpected shape of tensor')
    return Sigma
    r1 = sp.hstack((train_adj['adj_0_1'].transpose(), train_adj['adj_1_1'],
                    train_adj['adj_1_2']), format="csr")
    r2 = sp.hstack((train_adj['adj_0_2'].transpose(), train_adj['adj_1_2'].transpose(),
                    train_adj['adj_2_2']), format="csr")
    super_mask = [[1, 1, 1], [0, 1, 1], [0, 0, 1]]
else:
    all_sub_adj, node_types, features, one_hot_labels = load_aminer()
    train_adj, train_mask, val_mask, test_mask = load_train_val_test2(all_sub_adj)
    n2 = train_adj['adj_0_2'].shape[1]
    n1 = train_adj['adj_0_1'].shape[1]
    empty_mat = sp.csr_matrix(np.zeros(shape=(n1, n2)))
    r0 = sp.hstack((train_adj['adj_0_0'], train_adj['adj_0_1'],
                    train_adj['adj_0_2']), format="csr")
    r1 = sp.hstack((train_adj['adj_0_1'].transpose(), train_adj['adj_1_1'],
                    empty_mat), format="csr")
    r2 = sp.hstack((train_adj['adj_0_2'].transpose(), empty_mat.transpose(),
                    train_adj['adj_2_2']), format="csr")
    super_mask = [[1, 1, 1], [0, 1, 0], [0, 0, 1]]

train_adj = sp.vstack((r0, r1, r2))
n_nodes = train_adj.shape[0]
n_features = features.shape[1]
n_types = node_types.shape[1]
n_labels = one_hot_labels.shape[1]

if FLAGS.model == 'gcn':
    support = [preprocess_adj(train_adj)]
    n_supports = 1
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(train_adj, FLAGS.max_degree)
    n_supports = 1 + FLAGS.max_degree
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))
print('Supports Created!')
n_hidden_2 = 50

print "Building positive and negative report matrices..."
pos_reports = io.mmread('model_0_posreports.mtx')
pos_reports = pos_reports.tocsr()
neg_reports = io.mmread('model_0_negreports.mtx')
neg_reports = neg_reports.tocsr()
for reportblock in range(1, 50):
    print "Processing", reportblock
    thispos = io.mmread('model_' + str(reportblock) + '_posreports.mtx')
    thispos = thispos.tocsr()
    pos_reports = vstack((pos_reports, thispos))
    thisneg = io.mmread('model_' + str(reportblock) + '_negreports.mtx')
    thisneg = thisneg.tocsr()
    neg_reports = vstack((neg_reports, thisneg))
print "Done."
neg_ind = np.arange(neg_reports.shape[0])
pos_ind = np.arange(pos_reports.shape[0])
subset_neg_ind = np.random.choice(neg_ind, pos_reports.shape[0], replace=False)
neg_reports_subset = neg_reports[subset_neg_ind, :]
def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None,
                                  download_if_missing=True, return_X_y=False):
    """Load the 20 newsgroups dataset and vectorize it into token counts \
(classification).

    Download it if necessary.

    This is a convenience function; the transformation is done using the
    default settings for
    :class:`sklearn.feature_extraction.text.CountVectorizer`. For more
    advanced usage (stopword filtering, n-gram extraction, etc.), combine
    fetch_20newsgroups with a custom
    :class:`sklearn.feature_extraction.text.CountVectorizer`,
    :class:`sklearn.feature_extraction.text.HashingVectorizer`,
    :class:`sklearn.feature_extraction.text.TfidfTransformer` or
    :class:`sklearn.feature_extraction.text.TfidfVectorizer`.

    =================   ==========
    Classes                     20
    Samples total            18846
    Dimensionality          130107
    Features                  real
    =================   ==========

    Read more in the :ref:`User Guide <20newsgroups_dataset>`.

    Parameters
    ----------
    subset : 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    remove : tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        newsgroup posts, preventing classifiers from overfitting on
        metadata.

        'headers' removes newsgroup headers, 'footers' removes blocks at the
        ends of posts that look like signatures, and 'quotes' removes lines
        that appear to be quoting another post.

    data_home : optional, default: None
        Specify an download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False.
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

        .. versionadded:: 0.20

    Returns
    -------
    bunch : Bunch object with the following attribute:
        - bunch.data: sparse matrix, shape [n_samples, n_features]
        - bunch.target: array, shape [n_samples]
        - bunch.target_names: a list of categories of the returned data,
          length [n_classes].
        - bunch.DESCR: a description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20
    """
    data_home = get_data_home(data_home=data_home)
    filebase = '20newsgroup_vectorized'
    if remove:
        filebase += 'remove-' + ('-'.join(remove))
    target_file = _pkl_filepath(data_home, filebase + ".pkl")

    # we shuffle but use a fixed seed for the memoization
    data_train = fetch_20newsgroups(data_home=data_home,
                                    subset='train',
                                    categories=None,
                                    shuffle=True,
                                    random_state=12,
                                    remove=remove,
                                    download_if_missing=download_if_missing)

    data_test = fetch_20newsgroups(data_home=data_home,
                                   subset='test',
                                   categories=None,
                                   shuffle=True,
                                   random_state=12,
                                   remove=remove,
                                   download_if_missing=download_if_missing)

    if os.path.exists(target_file):
        X_train, X_test = _joblib.load(target_file)
    else:
        vectorizer = CountVectorizer(dtype=np.int16)
        X_train = vectorizer.fit_transform(data_train.data).tocsr()
        X_test = vectorizer.transform(data_test.data).tocsr()
        _joblib.dump((X_train, X_test), target_file, compress=9)

    # the data is stored as int16 for compactness
    # but normalize needs floats
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    normalize(X_train, copy=False)
    normalize(X_test, copy=False)

    target_names = data_train.target_names

    if subset == "train":
        data = X_train
        target = data_train.target
    elif subset == "test":
        data = X_test
        target = data_test.target
    elif subset == "all":
        data = sp.vstack((X_train, X_test)).tocsr()
        target = np.concatenate((data_train.target, data_test.target))
    else:
        raise ValueError("%r is not a valid subset: should be one of "
                         "['train', 'test', 'all']" % subset)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return data, target

    return Bunch(data=data,
                 target=target,
                 target_names=target_names,
                 DESCR=fdescr)
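# Hedged usage sketch (not part of the scikit-learn module above): fetch the
# combined train+test split as a single CSR matrix and check its shape. Note
# this downloads and caches the dataset on first use.
if __name__ == "__main__":
    bunch = fetch_20newsgroups_vectorized(subset="all")
    print(bunch.data.shape, bunch.target.shape)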
def solve_loop(qp_matrices, solver='osqp'):
    """
    Solve portfolio optimization loop for all gammas
    """
    # Shorter name for qp_matrices
    qp = qp_matrices

    # Get dimensions
    n = len(qp.lx)
    k = len(qp.l) - 1

    print('n = %d and solver %s' % (n, solver))

    # Get number of problems to solve
    n_prob = qp.q.shape[1]

    # Initialize time vector
    time = np.zeros(n_prob)

    # Initialize number of iterations vector
    niter = np.zeros(n_prob)

    if solver == 'osqp':
        # Construct qp matrices
        Aosqp = spa.vstack(
            (qp.A,
             spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        losqp = np.append(qp.l, qp.lx)
        uosqp = np.append(qp.u, qp.ux)

        # Setup OSQP
        m = osqp.OSQP()
        m.setup(qp.P, qp.q[:, 0], Aosqp, losqp, uosqp,
                auto_rho=True,
                polish=False,
                verbose=False)

        for i in range(n_prob):
            q = qp.q[:, i]

            # Update linear cost
            m.update(q=q)

            # Solve
            results = m.solve()
            x = results.x
            y = results.y
            status = results.info.status_val
            niter[i] = results.info.iter
            time[i] = results.info.run_time

            # Check if status correct
            if status != m.constant('OSQP_SOLVED'):
                import ipdb
                ipdb.set_trace()
                raise ValueError('OSQP did not solve the problem!')

            # DEBUG
            # solve with gurobi
            # prob = mpbpy.QuadprogProblem(qp.P, q, Aosqp, losqp, uosqp)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=False)
            # print('Norm difference OSQP-GUROBI %.3e' %
            #       np.linalg.norm(x - res.x))
            # import ipdb; ipdb.set_trace()

    elif solver == 'osqp_coldstart':
        # Construct qp matrices
        Aosqp = spa.vstack(
            (qp.A,
             spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        losqp = np.append(qp.l, qp.lx)
        uosqp = np.append(qp.u, qp.ux)

        # Setup OSQP
        m = osqp.OSQP()
        m.setup(qp.P, qp.q[:, 0], Aosqp, losqp, uosqp,
                warm_start=False,
                auto_rho=True,
                polish=False,
                verbose=False)

        for i in range(n_prob):
            q = qp.q[:, i]

            # Update linear cost
            m.update(q=q)

            # Solve
            results = m.solve()
            x = results.x
            y = results.y
            status = results.info.status_val
            niter[i] = results.info.iter
            time[i] = results.info.run_time

            # Check if status correct
            if status != m.constant('OSQP_SOLVED'):
                import ipdb
                ipdb.set_trace()
                raise ValueError('OSQP did not solve the problem!')

            # DEBUG
            # solve with gurobi
            # prob = mpbpy.QuadprogProblem(qp.P, q, Aosqp, losqp, uosqp)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=False)
            # print('Norm difference OSQP-GUROBI %.3e' %
            #       np.linalg.norm(x - res.x))
            # import ipdb; ipdb.set_trace()

        # DEBUG print iterations per value of gamma
        # gamma_vals = np.logspace(-2, 2, 101)[::-1]
        #
        # import matplotlib.pylab as plt
        # plt.figure()
        # ax = plt.gca()
        # plt.plot(gamma_vals, niter)
        # ax.set_xlabel(r'$\gamma$')
        # ax.set_ylabel(r'iter')
        # plt.show(block=False)
        # import ipdb; ipdb.set_trace()

    elif solver == 'osqp_no_caching':
        # Construct qp matrices
        Aosqp = spa.vstack(
            (qp.A,
             spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        losqp = np.append(qp.l, qp.lx)
        uosqp = np.append(qp.u, qp.ux)

        for i in range(n_prob):
            # Setup OSQP
            m = osqp.OSQP()
            m.setup(qp.P, qp.q[:, i], Aosqp, losqp, uosqp,
                    warm_start=False,
                    auto_rho=True,
                    polish=False,
                    verbose=False)

            # Solve
            results = m.solve()
            x = results.x
            y = results.y
            status = results.info.status_val
            niter[i] = results.info.iter
            time[i] = results.info.run_time

            # Check if status correct
            if status != m.constant('OSQP_SOLVED'):
                import ipdb
                ipdb.set_trace()
                raise ValueError('OSQP did not solve the problem!')

            # DEBUG
            # solve with gurobi
            # prob = mpbpy.QuadprogProblem(qp.P, q, Aosqp, losqp, uosqp)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=False)
            # print('Norm difference OSQP-GUROBI %.3e' %
            #       np.linalg.norm(x - res.x))
            # import ipdb; ipdb.set_trace()

        # DEBUG print iterations per value of gamma
        # gamma_vals = np.logspace(-2, 2, 101)[::-1]
        #
        # import matplotlib.pylab as plt
        # plt.figure()
        # ax = plt.gca()
        # plt.plot(gamma_vals, niter)
        # ax.set_xlabel(r'$\gamma$')
        # ax.set_ylabel(r'iter')
        # plt.show(block=False)
        # import ipdb; ipdb.set_trace()

    elif solver == 'qpoases':
        n_dim = qp.P.shape[0]  # Number of variables
        m_dim = qp.A.shape[0]  # Number of constraints without bounds

        # Initialize qpoases and set options
        qpoases_m = qpoases.PyQProblem(n_dim, m_dim)
        options = qpoases.PyOptions()
        options.printLevel = qpoases.PyPrintLevel.NONE
        qpoases_m.setOptions(options)

        # Construct bounds for qpoases
        lx = np.append(qp.lx, -np.inf * np.ones(k))
        ux = np.append(qp.ux, np.inf * np.ones(k))

        # Setup matrix P and A
        P = np.ascontiguousarray(qp.P.todense())
        A = np.ascontiguousarray(qp.A.todense())

        for i in range(n_prob):
            # Get linear cost as contiguous array
            q = np.ascontiguousarray(qp.q[:, i])

            # Reset cpu time
            qpoases_cpu_time = np.array([20.])

            # Reset number of working set recalculations
            nWSR = np.array([1000])

            if i == 0:
                # First iteration
                res_qpoases = qpoases_m.init(P, q, A,
                                             np.ascontiguousarray(lx),
                                             np.ascontiguousarray(ux),
                                             np.ascontiguousarray(qp.l),
                                             np.ascontiguousarray(qp.u),
                                             nWSR, qpoases_cpu_time)
            else:
                # Solve new hot started problem
                res_qpoases = qpoases_m.hotstart(q,
                                                 np.ascontiguousarray(lx),
                                                 np.ascontiguousarray(ux),
                                                 np.ascontiguousarray(qp.l),
                                                 np.ascontiguousarray(qp.u),
                                                 nWSR,
                                                 qpoases_cpu_time)

            # # DEBUG Solve with gurobi
            # # qpoases solution
            # sol_qpoases = np.zeros(n + k)
            # qpoases_m.getPrimalSolution(sol_qpoases)
            # import mathprogbasepy as mpbpy
            # Agrb = spa.vstack((qp.A,
            #                    spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))
            #                    ))).tocsc()
            # lgrb = np.append(qp.l, qp.lx)
            # ugrb = np.append(qp.u, qp.ux)
            # prob = mpbpy.QuadprogProblem(spa.csc_matrix(qp.P), q,
            #                              Agrb, lgrb, ugrb)
            # res = prob.solve(solver=mpbpy.GUROBI, verbose=True)
            # print("Norm difference x qpoases - GUROBI = %.4f" %
            #       np.linalg.norm(sol_qpoases - res.x))
            # print("Norm difference objval qpoases - GUROBI = %.4f" %
            #       abs(qpoases_m.getObjVal() - res.obj_val))
            # import ipdb; ipdb.set_trace()

            if res_qpoases != 0:
                raise ValueError('qpoases did not solve the problem!')

            # Save time
            time[i] = qpoases_cpu_time[0]

            # Save number of iterations
            niter[i] = nWSR[0]

    elif solver == 'gurobi':
        # Construct qp matrices
        Agurobi = spa.vstack(
            (qp.A,
             spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        lgurobi = np.append(qp.l, qp.lx)
        ugurobi = np.append(qp.u, qp.ux)

        for i in range(n_prob):
            # Get linear cost as contiguous array
            q = qp.q[:, i]

            # Solve with gurobi
            prob = mpbpy.QuadprogProblem(qp.P, q, Agurobi, lgurobi, ugurobi)
            res = prob.solve(solver=mpbpy.GUROBI, verbose=False)

            # Save time
            time[i] = res.cputime

            # Save number of iterations
            niter[i] = res.total_iter

    elif solver == 'mosek':
        # Construct qp matrices
        Amosek = spa.vstack(
            (qp.A,
             spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))))).tocsc()
        lmosek = np.append(qp.l, qp.lx)
        umosek = np.append(qp.u, qp.ux)

        for i in range(n_prob):
            # Get linear cost as contiguous array
            q = qp.q[:, i]

            # Solve with mosek
            prob = mpbpy.QuadprogProblem(qp.P, q, Amosek, lmosek, umosek)
            res = prob.solve(solver=mpbpy.MOSEK, verbose=False)

            # Save time
            time[i] = res.cputime

            # Save number of iterations
            niter[i] = res.total_iter

    elif solver == 'ecos':
        for i in range(n_prob):
            # Construct the problem
            #   minimize     x' D x + y' I y - (1/gamma) * mu' x
            #   subject to   1' x = 1
            #                F' x = y
            #                0 <= x <= 1
            n_var = qp.F.shape[0]
            m_var = qp.F.shape[1]
            x = cvxpy.Variable(n_var)
            y = cvxpy.Variable(m_var)

            objective = cvxpy.Minimize(cvxpy.quad_form(x, qp.D) +
                                       cvxpy.quad_form(y, spa.eye(m_var)) +
                                       -1 / qp.gammas[i] * qp.mu * x)
            constraints = [np.ones(n_var) * x == 1,
                           qp.F.T * x == y,
                           0 <= x, x <= 1]
            problem = cvxpy.Problem(objective, constraints)
            problem.solve(solver=cvxpy.ECOS, verbose=False)

            # Obtain time and number of iterations
            time[i] = problem.solver_stats.setup_time + \
                problem.solver_stats.solve_time
            niter[i] = problem.solver_stats.num_iters

            # # DEBUG: Solve with MOSEK
            # Amosek = spa.vstack((qp.A,
            #                      spa.hstack((spa.eye(n), spa.csc_matrix((n, k)))
            #                      ))).tocsc()
            # lmosek = np.append(qp.l, qp.lx)
            # umosek = np.append(qp.u, qp.ux)
            # prob = mpbpy.QuadprogProblem(qp.P, qp.q[:, i],
            #                              Amosek, lmosek, umosek)
            # res = prob.solve(solver=mpbpy.MOSEK, verbose=False)
            # x_mosek = res.x[:n_var]
            # import ipdb; ipdb.set_trace()

    else:
        raise ValueError('Solver not understood')

    # Return statistics
    return utils.Statistics(time), utils.Statistics(niter)
def _ht_2d(
        true_corr,  # list of correlations for each group
        cells,  # list of Nx2 sparse matrices
        approx_sf,
        design_matrix,
        Nc_list,
        num_boot,
        treatment_idx,
        q,
        _estimator_1d,
        _estimator_cov,
        resampling,
        **kwargs):
    good_idxs = np.zeros(design_matrix.shape[0], dtype=bool)

    # the bootstrap arrays
    boot_corr = np.zeros((design_matrix.shape[0], num_boot + 1)) * np.nan

    # Get strata-specific pooled information
    if resampling == 'permutation':
        uniq_strata, strata_indicator = np.unique(
            np.delete(design_matrix, treatment_idx, axis=1), axis=0, return_inverse=True)
        resampling_info = {}
        for k in range(uniq_strata.shape[0]):
            strata_idx = np.where(strata_indicator == 0)[0]
            data_list = [cells[i] for i in strata_idx]
            sf_list = [approx_sf[i] for i in strata_idx]
            resampling_info[k] = bootstrap._unique_expr(
                sparse.vstack(data_list, format='csc'),
                np.concatenate(sf_list))

    for group_idx in range(design_matrix.shape[0]):
        # Skip if any of the 2d moments are NaNs
        if np.isnan(true_corr[group_idx]) or (np.abs(true_corr[group_idx]) == 1):
            continue

        # Fill in the true value
        boot_corr[group_idx, 0] = true_corr[group_idx]

        # Generate the bootstrap values
        cov, var_1, var_2 = bootstrap._bootstrap_2d(
            data=cells[group_idx],
            size_factor=approx_sf[group_idx],
            num_boot=int(num_boot),
            q=q[group_idx],
            _estimator_1d=_estimator_1d,
            _estimator_cov=_estimator_cov,
            precomputed=(None if resampling == 'bootstrap'
                         else resampling_info[strata_indicator[group_idx]]))
        corr = estimator._corr_from_cov(cov, var_1, var_2, boot=True)

        # This replicate is good
        boot_corr[group_idx, 1:] = corr  # [:num_boot]
        vals = _fill_corr(boot_corr[group_idx, :])

        # Skip if all NaNs
        if np.all(np.isnan(vals)):
            continue

        good_idxs[group_idx] = True
        boot_corr[group_idx, :] = vals

    # Skip this gene
    if good_idxs.sum() == 0:
        return np.nan, np.nan, np.nan

    vals = _regress_2d(
        design_matrix=design_matrix[good_idxs, :],
        boot_corr=boot_corr[good_idxs, :],
        Nc_list=Nc_list[good_idxs],
        treatment_idx=treatment_idx,
        resampling=resampling,
        **kwargs)

    return vals
def train_gen(batch_size=1000, mwr=0.3, distil_temp=1.0, ret_hashes=False):
    if distil_temp != 1.0:
        with open(processed_dir +
                  "distil/y_good_distill.mwr-{}.temp-{}.pickle".format(mwr, distil_temp),
                  "rb") as f:
            y_good_distill = pickle.load(f)
        with open(processed_dir +
                  "distil/y_mal_distill.mwr-{}.temp-{}.pickle".format(mwr, distil_temp),
                  "rb") as f:
            y_mal_distill = pickle.load(f)
    while True:
        perm_good = np.random.permutation(train_good)
        perm_mal = np.random.permutation(train_mal)
        y = np.zeros((batch_size, ), dtype=np.int8)
        mal_batch = int(batch_size * mwr)
        i = 0
        j = 0
        while i < train_mal:
            if train_mal - i < mal_batch:
                mal_batch = train_mal - i
            good_batch = int((mal_batch / mwr) * (1 - mwr))
            full_batch = mal_batch + good_batch
            if full_batch != batch_size:
                y = np.zeros((full_batch, ), dtype=np.int8)
            good_idx = roll(perm_good, j, (j + good_batch) % train_good)
            mal_idx = perm_mal[i:i + mal_batch]
            x_m = vstack([x_manifest[good_idx], x_manifest_mal[mal_idx]])
            x_c = vstack([x_code[good_idx], x_code_mal[mal_idx]])
            if distil_temp != 1.0:
                y[:good_batch] = y_good_distill[good_idx]
                y[good_batch:] = y_mal_distill[mal_idx]
            else:
                y[:good_batch] = 0
                y[good_batch:] = 1
            if ret_hashes:
                hashes = []
                for idx in good_idx:
                    hashes.append(good_hashes[idx])
                for idx in mal_idx:
                    hashes.append(mal_hashes[idx])
                yield ([x_m, x_c], y, hashes)
            else:
                yield ([x_m, x_c], y)
            i = i + mal_batch
            j = (j + good_batch) % train_good
def _cross_prod(self):
    s = self.ent_table
    r = self.att_table
    k = self.kfkds
    ns = k[0].shape[0]
    ds = s.shape[1]
    nr = [t.shape[0] for t in self.att_table]
    dr = [t.shape[1] for t in self.att_table]
    if not self.trans:
        if s.size > 0:
            res = self._t_cross(s)
        else:
            res = np.zeros((ns, ns), dtype=float, order='C')
        if all(map(sp.issparse, r)):
            cross_r = [self._t_cross(t).toarray() for t in r]
        else:
            cross_r = [self._t_cross(t) for t in r]
        comp.expand_add(ns, len(k), k, cross_r, nr, res)
        return res
    else:
        if all(map(sp.issparse, self.att_table)):
            other = np.ones((1, ns))
            v = [np.zeros((1, t.shape[0]), dtype=float) for t in self.att_table]
            comp.group(ns, len(k), 1, k, nr, other, v)
            size = self.att_table[0].size
            data = np.empty(size)
            # part 2 and 3 are p.T and p
            comp.multiply_sparse(size, self.att_table[0].row,
                                 self.att_table[0].data, np.sqrt(v[0]), data)
            diag_part = self._cross(
                sp.coo_matrix((data, (self.att_table[0].row, self.att_table[0].col))))
            if ds > 0:
                m = np.zeros((nr[0], ds))
                comp.group_left(ns, ds, s, k[0], m)
                p = self._cross(self.att_table[0], m)
                s_part = self._cross(self.ent_table)
                res = sp.vstack((np.hstack((s_part, p.T)),
                                 sp.hstack((p, diag_part))))
            else:
                res = diag_part
            # multi-table join
            for i in range(1, len(k)):
                ps = []
                if ds > 0:
                    m = np.zeros((nr[i], ds))
                    comp.group_left(ns, ds, s, k[i], m)
                    ps += [self._cross(self.att_table[i], m)]
                # cp (KRi)
                size = self.att_table[i].size
                data = np.empty(size)
                comp.multiply_sparse(size, self.att_table[i].row,
                                     self.att_table[i].data, np.sqrt(v[i]), data)
                diag_part = self._cross(
                    sp.coo_matrix((data, (self.att_table[i].row, self.att_table[i].col))))
                for j in range(i):
                    ps += [r[i].tocsr()[k[i]].T.dot(r[j].tocsr()[k[j]])]
                res = sp.vstack((sp.hstack((res, sp.vstack([p.T for p in ps]))),
                                 sp.hstack(ps + [diag_part])))
        else:
            nt = self.ent_table.shape[1] + sum(
                [att.shape[1] for att in self.att_table])
            other = np.ones((1, ns))
            v = [np.zeros((1, t.shape[0]), dtype=float) for t in self.att_table]
            res = np.empty((nt, nt))
            data = np.empty(self.att_table[0].shape, order='C')
            comp.group(ns, len(k), 1, k, nr, other, v)
            comp.multiply(self.att_table[0].shape[0], self.att_table[0].shape[1],
                          self.att_table[0], v[0], data)
            res[ds:ds + dr[0], ds:ds + dr[0]] = self._cross(data)
            if ds > 0:
                m = np.zeros((nr[0], ds))
                comp.group_left(ns, ds, s, k[0], m)
                res[ds:ds + dr[0], :ds] = self._cross(self.att_table[0], m)
                res[:ds, ds:ds + dr[0]] = res[ds:ds + dr[0], :ds].T
                res[:ds, :ds] = self._cross(self.ent_table)
            # multi-table join
            for i in range(1, len(self.kfkds)):
                if ds > 0:
                    m = np.zeros((nr[i], ds))
                    comp.group_left(ns, ds, s, k[i], m)
                ni1 = ds + sum([t.shape[1] for t in self.att_table[:i]])
                ni2 = ni1 + self.att_table[i].shape[1]
                res[ni1:ni2, :ds] = self._cross(self.att_table[i], m)
                res[:ds, ni1:ni2] = res[ni1:ni2, :ds].T
                # cp(KRi)
                data = np.empty(self.att_table[i].shape, order='C')
                comp.multiply(self.att_table[i].shape[0], self.att_table[i].shape[1],
                              self.att_table[i], v[i], data)
                res[ni1:ni2, ni1:ni2] = self._cross(data)
                for j in range(i):
                    dj1 = ds + sum([t.shape[1] for t in self.att_table[:j]])
                    dj2 = dj1 + self.att_table[j].shape[1]
                    if (ns * 1.0 / nr[j]) > (1 + nr[j] * 1.0 / dr[j]):
                        m = np.zeros((nr[i], nr[j]), order='C')
                        comp.group_k_by_k(nr[i], nr[j], ns, k[i], k[j], m)
                        res[ni1:ni2, dj1:dj2] = r[i].T.dot(m.T.dot(r[j]))
                        res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T
                    else:
                        res[ni1:ni2, dj1:dj2] = r[i][k[i]].T.dot(r[j][k[j]])
                        res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T
        return res
def simulate_dataset_with_ambient_rna( n_cells: int = 150, n_empty: int = 300, clusters: int = 3, n_genes: int = 10000, d_cell: int = 5000, d_empty: int = 100, cells_in_clusters: Union[List[int], None] = None, ambient_different: bool = False, chi_input: Union[np.ndarray, None] = None) \ -> Tuple[sp.csr.csr_matrix, np.ndarray, np.ndarray, np.ndarray]: """Simulate a dataset with ambient background RNA counts. Empty drops have ambient RNA only, while barcodes with cells have cell RNA plus some amount of ambient background RNA (in proportion to the sizes of cell and droplet). Args: n_cells: Number of cells. n_empty: Number of empty droplets with only ambient RNA. clusters: Number of distinct cell types to simulate. n_genes: Number of genes. d_cell: Cell size scale factor. d_empty: Empty droplet size scale factor. cells_in_clusters: Number of cells of each cell type. If specified, the number of ints in this list must be equal to clusters. ambient_different: If False, the gene expression profile of ambient RNA is drawn from the sum of cellular gene expression. If True, the ambient RNA expression is completely different from cellular gene expression. chi_input: Gene expression arrays in a matrix, with rows as clusters and columns as genes. Expression should add to one for each row. Setting chi=None will generate new chi randomly according to a Dirichlet distribution. Returns: csr_barcode_gene_synthetic: The simulated barcode by gene matrix of UMI counts, as a scipy.sparse.csr.csr_matrix. z: The simulated cell type identities. A numpy array of integers, one for each barcode. The number 0 is used to denote barcodes without a cell present. chi: The simulated gene expression, one corresponding to each z. Access the vector of gene expression for a given z using chi[z, :]. d: The simulated size scale factors, one for each barcode. """ assert d_cell > 0, "Location parameter, d_cell, of LogNormal " \ "distribution must be greater than zero." assert d_empty > 0, "Location parameter, d_cell, of LogNormal " \ "distribution must be greater than zero." assert clusters > 0, "clusters must be a positive integer." assert n_cells > 0, "n_cells must be a positive integer." assert n_empty > 0, "n_empty must be a positive integer." assert n_genes > 0, "n_genes must be a positive integer." if chi_input is not None: assert chi_input.shape[0] == clusters, "Chi was specified, but the " \ "number of rows must match " \ "the number of clusters." assert chi_input.shape[1] == n_genes, "Chi was specified, but the " \ "number of columns must match " \ "the number of genes." # Figure out how many cells are in each cell cluster. if cells_in_clusters is None: # No user input: make equal numbers of each cell type cells_in_clusters = (np.ones(clusters, dtype=int) * int(n_cells / clusters)) else: assert len(cells_in_clusters) == clusters, "len(cells_in_clusters) " \ "must equal clusters." assert sum(cells_in_clusters) == n_cells, "sum(cells_in_clusters) " \ "must equal n_cells." # Initialize arrays and lists. chi = np.zeros((clusters + 1, n_genes)) csr_list = [] z = [] d = [] if chi_input is not None: # Go with the chi that was input. chi[1:, :] = chi_input else: # Get chi for cell expression. for i in range(1, clusters + 1): chi[i, :] = generate_chi(alpha=0.01, n_genes=n_genes) # Get chi for ambient expression. This becomes chi[0, :]. if ambient_different: # Ambient expression is unrelated to cells, and is itself random. 
chi[0, :] = generate_chi(alpha=0.001, n_genes=n_genes) # Sparse else: # Ambient gene expression comes from the sum of cell expression. for i in range(1, clusters + 1): chi[0, :] += cells_in_clusters[i - 1] * chi[i, :] # Weighted sum chi[0, :] = chi[0, :] / np.sum(chi[0, :]) # Normalize # Sample gene expression for ambient. csr, d_n = sample_expression_from(chi[0, :], n=n_empty, d_mu=np.log(d_empty).item()) # Add data to lists. csr_list.append(csr) z = z + [0 for _ in range(csr.shape[0])] d = d + [i for i in d_n] # Sample gene expression for cells. for i in range(1, clusters + 1): # Get chi for cells once ambient expression is added. chi_tilde = chi[i, :] * d_cell + chi[0, :] * d_empty chi_tilde = chi_tilde / np.sum(chi_tilde) # Normalize csr, d_n = sample_expression_from(chi_tilde, n=cells_in_clusters[i - 1], d_mu=np.log(d_cell).item()) # Add data to lists. csr_list.append(csr) z = z + [i for _ in range(csr.shape[0])] d = d + [j for j in d_n] # Package the results. csr_barcode_gene_synthetic = sp.vstack(csr_list) z = np.array(z) d = np.array(d) # Permute the barcode order and return results. order = np.random.permutation(z.size) csr_barcode_gene_synthetic = csr_barcode_gene_synthetic[order, ...] z = z[order] d = d[order] return csr_barcode_gene_synthetic, z, chi, d
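A brief usage sketch of the simulator above may help; the argument values are illustrative (not taken from the source) and the printed interpretations follow the docstring.

# Hypothetical usage of simulate_dataset_with_ambient_rna; values are illustrative.
csr_counts, z, chi, d = simulate_dataset_with_ambient_rna(
    n_cells=150, n_empty=300, clusters=3, n_genes=10000,
    d_cell=5000, d_empty=100, ambient_different=False)

print(csr_counts.shape)   # (n_cells + n_empty, n_genes), rows in permuted order
print((z == 0).sum())     # number of empty droplets (z == 0 means no cell present)
print(chi.shape)          # (clusters + 1, n_genes); row 0 is the ambient profile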
def simulate_dataset_without_ambient_rna(
        n_cells: int = 100,
        clusters: int = 1,
        n_genes: int = 10000,
        cells_in_clusters: Union[List[int], None] = None,
        d_cell: int = 5000
) -> Tuple[sp.csr.csr_matrix, np.ndarray, np.ndarray, np.ndarray]:
    """Simulate a dataset without ambient background RNA counts.

    Every barcode contains a cell, and each cell's counts are drawn purely
    from its cluster's gene expression profile: no ambient RNA is added and
    no empty droplets are generated.

    Args:
        n_cells: Number of cells.
        clusters: Number of distinct cell types to simulate.
        n_genes: Number of genes.
        d_cell: Cell size scale factor.
        cells_in_clusters: Number of cells of each cell type.  If specified,
            the number of ints in this list must be equal to clusters.

    Returns:
        csr_barcode_gene_synthetic: The simulated barcode by gene matrix of
            UMI counts, as a scipy.sparse.csr.csr_matrix.
        z: The simulated cell type identities.  A numpy array of integers,
            one for each barcode.  Since every barcode contains a cell, z
            ranges over the cluster indices 0 .. clusters - 1.
        chi: The simulated gene expression, one corresponding to each z.
            Access the vector of gene expression for a given z using
            chi[z, :].
        d: The simulated size scale factors, one for each barcode.

    """

    assert d_cell > 0, "Location parameter, d_cell, of LogNormal " \
                       "distribution must be greater than zero."
    assert clusters > 0, "clusters must be a positive integer."
    assert n_cells > 0, "n_cells must be a positive integer."
    assert n_genes > 0, "n_genes must be a positive integer."

    # Figure out how many cells are in each cell cluster.
    if cells_in_clusters is None:
        # No user input: make equal numbers of each cell type
        cells_in_clusters = np.ones(clusters) * int(n_cells / clusters)
    else:
        assert len(cells_in_clusters) == clusters, "len(cells_in_clusters) " \
                                                   "must equal clusters."
        assert sum(cells_in_clusters) == n_cells, "sum(cells_in_clusters) " \
                                                  "must equal n_cells."

    # Initialize arrays and lists.
    chi = np.zeros((clusters + 1, n_genes))
    csr_list = []
    z = []
    d = []

    # Get chi for cell expression and sample counts for each cluster.
    for i in range(clusters):
        chi[i, :] = generate_chi(alpha=1.0, n_genes=n_genes)
        csr, d_n = sample_expression_from(chi[i, :],
                                          n=int(cells_in_clusters[i]),
                                          d_mu=np.log(d_cell).item())
        csr_list.append(csr)
        z = z + [i for _ in range(csr.shape[0])]
        d = d + [j for j in d_n]

    # Package the results.
    csr_barcode_gene_synthetic = sp.vstack(csr_list)
    z = np.array(z)
    d = np.array(d)

    # Permute the barcode order and return results.
    order = np.random.permutation(z.size)
    csr_barcode_gene_synthetic = csr_barcode_gene_synthetic[order, ...]
    z = z[order]
    d = d[order]

    return csr_barcode_gene_synthetic, z, chi, d
def _one_fit(self): if self.verbose: print("\nCreating synthetic doublets...") self._createDoublets() # Normalize combined augmented set if self.verbose: print("Normalizing...") if self.normalizer is not None: aug_counts = self.normalizer( sp_sparse.vstack((self._raw_counts, self._raw_synthetics))) else: # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts synth_lib_size = np.sum(self._raw_synthetics, axis=1).A1 aug_lib_size = np.concatenate([self._lib_size, synth_lib_size]) normed_synths = self._raw_synthetics.copy() inplace_csr_row_normalize_l1(normed_synths) aug_counts = sp_sparse.vstack( (self._normed_raw_counts, normed_synths)) aug_counts = np.log(aug_counts.A * np.median(aug_lib_size) + 0.1) self._norm_counts = aug_counts[:self._num_cells] self._synthetics = aug_counts[self._num_cells:] aug_counts = anndata.AnnData(aug_counts) aug_counts.obs["n_counts"] = aug_lib_size if self.standard_scaling is True: sc.pp.scale(aug_counts, max_value=15) if self.verbose: print("Running PCA...") sc.tl.pca(aug_counts, n_comps=self.n_components, random_state=self.random_state) if self.verbose: print("Clustering augmented data set...\n") sc.pp.neighbors(aug_counts, random_state=self.random_state, method="umap", n_neighbors=10) if self.use_phenograph: fullcommunities, _, _ = phenograph.cluster( aug_counts.obsm["X_pca"], **self.phenograph_parameters) else: sc.tl.louvain(aug_counts, random_state=self.random_state, resolution=4, directed=False) fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int) min_ID = min(fullcommunities) self.communities_ = fullcommunities[:self._num_cells] self.synth_communities_ = fullcommunities[self._num_cells:] community_sizes = [ np.count_nonzero(fullcommunities == i) for i in np.unique(fullcommunities) ] if self.verbose: print("Found clusters [{0}, ... {2}], with sizes: {1}\n".format( min(fullcommunities), community_sizes, max(fullcommunities))) # Count number of fake doublets in each community and assign score # Number of synth/orig cells in each cluster. synth_cells_per_comm = collections.Counter(self.synth_communities_) orig_cells_per_comm = collections.Counter(self.communities_) community_IDs = orig_cells_per_comm.keys() community_scores = { i: float(synth_cells_per_comm[i]) / (synth_cells_per_comm[i] + orig_cells_per_comm[i]) for i in community_IDs } scores = np.array([community_scores[i] for i in self.communities_]) community_log_p_values = { i: hypergeom.logsf( synth_cells_per_comm[i], aug_counts.shape[0], self._synthetics.shape[0], synth_cells_per_comm[i] + orig_cells_per_comm[i], ) for i in community_IDs } log_p_values = np.array( [community_log_p_values[i] for i in self.communities_]) if min_ID < 0: scores[self.communities_ == -1] = np.nan log_p_values[self.communities_ == -1] = np.nan return scores, log_p_values
with open(os.path.join(".", "data", vectorizer_name), "rb") as handle: Vectorizer = pickle.load(handle) feature_list = [] #Hold feature arrays (sparse numpy arrays) meta_list = [] #Hold meta-data (Python list) #For each line, get text features and meta-data for line in Texts: text_id = line[0] speaker_id = line[1] line = line[2] features = Vectorizer.transform(line) try: meta_data = speakers[speaker_id][class_name] feature_list.append(features) meta_list.append(meta_data) except: print("Missing meta-data for " + str(speaker_id)) #Now merge into dataframe features = vstack(feature_list) meta = np.array(meta_list) filename = "Senate." + vectorizer_name + ".Features" save_npz(os.path.join(in_dir, filename), features, compressed = True) filename = "Senate." + vectorizer_name + ".Classes" np.save(os.path.join(in_dir, filename), meta, allow_pickle = True)
def _ht_1d(
        true_mean,       # list of means
        true_res_var,    # list of residual variances
        cells,           # list of sparse vectors/matrices
        approx_sf,       # list of dense arrays
        design_matrix,
        Nc_list,
        num_boot,
        treatment_idx,
        mv_fit,          # list of tuples
        q,               # list of numbers
        _estimator_1d,
        resampling,
        **kwargs):

    good_idxs = np.zeros(design_matrix.shape[0], dtype=bool)

    # the resampled arrays
    boot_mean = np.zeros((design_matrix.shape[0], num_boot + 1)) * np.nan
    boot_var = np.zeros((design_matrix.shape[0], num_boot + 1)) * np.nan

    # Get strata-specific pooled information
    if resampling == 'permutation':
        uniq_strata, strata_indicator = np.unique(
            np.delete(design_matrix, treatment_idx, axis=1),
            axis=0, return_inverse=True)
        resampling_info = {}
        for k in range(uniq_strata.shape[0]):
            # Pool the cells belonging to stratum k
            strata_idx = np.where(strata_indicator == k)[0]
            data_list = [cells[i] for i in strata_idx]
            sf_list = [approx_sf[i] for i in strata_idx]
            resampling_info[k] = bootstrap._unique_expr(
                sparse.vstack(data_list, format='csc'),
                np.concatenate(sf_list))

    for group_idx in range(len(true_mean)):

        # Skip if any of the 1d moments are NaNs
        if np.isnan(true_mean[group_idx]) or \
           np.isnan(true_res_var[group_idx]) or \
           true_mean[group_idx] == 0 or \
           true_res_var[group_idx] < 0:
            continue

        # Fill in the true value
        boot_mean[group_idx, 0], boot_var[group_idx, 0] = \
            np.log(true_mean[group_idx]), np.log(true_res_var[group_idx])

        # Generate the bootstrap values
        mean, var = bootstrap._bootstrap_1d(
            data=cells[group_idx],
            size_factor=approx_sf[group_idx],
            num_boot=num_boot,
            q=q[group_idx],
            _estimator_1d=_estimator_1d,
            precomputed=(None if resampling == 'bootstrap'
                         else resampling_info[strata_indicator[group_idx]]))

        # Compute the residual variance
        res_var = estimator._residual_variance(mean, var, mv_fit[group_idx])

        # Minimize invalid values
        filled_mean = _fill(mean)     # _push_nan(mean)  # [:num_boot]
        filled_var = _fill(res_var)   # _push_nan(res_var)  # [:num_boot]

        # Make sure it is a valid replicate
        if filled_mean is None or filled_var is None:
            continue

        boot_mean[group_idx, 1:] = np.log(filled_mean)
        boot_var[group_idx, 1:] = np.log(filled_var)

        # This replicate is good
        good_idxs[group_idx] = True

    # Skip this gene
    if good_idxs.sum() == 0:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    vals = _regress_1d(
        design_matrix=design_matrix[good_idxs, :],
        boot_mean=boot_mean[good_idxs, :],
        boot_var=boot_var[good_idxs, :],
        Nc_list=Nc_list[good_idxs],
        treatment_idx=treatment_idx,
        resampling=resampling,
        **kwargs)

    return vals
def run(fold):
    # load the training data with folds
    df = pd.read_csv("../inputs/cat-in-the-dat-train-folds.csv")

    # extracting the categorical features
    features = [
        x for x in df.columns if x not in ("id", "target", "kfold")]

    # Handling NaN values by replacing them with the string "NONE"
    # (fill first, then cast, so real NaNs are not turned into the string "nan")
    for col in features:
        df.loc[:, col] = df[col].fillna("NONE").astype(str)

    # training dataset
    df_train = df[df["kfold"] != fold].reset_index(drop=True)

    # validation dataset
    df_valid = df[df["kfold"] == fold].reset_index(drop=True)

    # full_data = pd.concat(
    #     [df_train[features], df_valid[features]],
    #     axis=0
    # )
    full_data = df[features]

    # initialize the OneHotEncoder() from sklearn
    ohe = preprocessing.OneHotEncoder()

    # fit to the data
    ohe.fit(full_data[features])

    # transform training dataset
    x_train = ohe.transform(df_train[features])

    # transform validation dataset
    x_valid = ohe.transform(df_valid[features])

    # initialize Truncated SVD
    # we are reducing the data to 120 components
    svd = decomposition.TruncatedSVD(n_components=120)

    # fit svd on the full sparse data
    full_sparse = sparse.vstack((x_train, x_valid))
    svd.fit(full_sparse)

    # transform the training sparse data
    x_train = svd.transform(x_train)

    # transform the validation sparse data
    x_valid = svd.transform(x_valid)

    # initialize the RandomForestClassifier
    model = ensemble.RandomForestClassifier(n_jobs=-1)

    # fit the data to the model
    model.fit(x_train, df_train.target.values)

    # predict the probability of the positive class for x_valid;
    # [:, 1] selects the second column (class 1), the first column is class 0
    yhat_ones = model.predict_proba(x_valid)[:, 1]

    # evaluate the auc score
    auc = metrics.roc_auc_score(df_valid.target.values, yhat_ones)

    print(f"Fold: {fold}, AUC Score: {auc}")
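For completeness, a hedged sketch of a driver for run(), assuming the kfold column was populated with folds 0 through 4; this loop is not part of the original snippet.

if __name__ == "__main__":
    # Assumed driver: iterate over the five folds created beforehand.
    for fold_ in range(5):
        run(fold_)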
def estimate(self, ppci):
    # state vector built from delta, |V| and zero injections
    # Find pq buses with zero p, q and shunt admittance
    zero_injection_bus = np.argwhere(
        ppci["bus"][:, bus_cols + ZERO_INJ_FLAG] == True).ravel()
    ppci["bus"][zero_injection_bus,
                [bus_cols + P, bus_cols + P_STD,
                 bus_cols + Q, bus_cols + Q_STD]] = np.NaN
    # Within the pq buses with zero injection, identify those that also
    # have no p or q measurement
    p_zero_injections = zero_injection_bus
    q_zero_injections = zero_injection_bus
    new_states = np.zeros(len(p_zero_injections) + len(q_zero_injections))

    slack_buses, non_slack_buses, n_active, r_inv, v_m, delta_masked, delta, z = \
        self.wls_preprocessing(ppci)

    E = np.concatenate((delta_masked.compressed(), v_m, new_states))

    # matrix calculation object
    sem = WLSAlgebraZeroInjectionConstraints(ppci, slack_buses, non_slack_buses)

    current_error = 100.
    cur_it = 0
    G_m, r, H, h_x = None, None, None, None

    while current_error > self.tolerance and cur_it < self.max_iterations:
        self.logger.debug("Starting iteration {:d}".format(1 + cur_it))
        try:
            # create h(x) for the current iteration
            h_x, c_x = sem.create_hx_cx(v_m, delta,
                                        p_zero_injections, q_zero_injections)

            # residual r
            r = csr_matrix(z - h_x).T
            c_rxh = csr_matrix(c_x).T

            # jacobian matrix H
            H_temp, C_temp = sem.create_jacobian(v_m, delta,
                                                 p_zero_injections,
                                                 q_zero_injections)
            H = csr_matrix(H_temp)
            C = csr_matrix(C_temp)

            # gain matrix G_m
            # G_m = H^t * R^-1 * H
            G_m = H.T * (r_inv * H)

            # building a new gain matrix for the new constraints
            A_1 = vstack([G_m, C])
            c_ax = hstack([C, np.zeros((C.shape[0], C.shape[0]))])
            c_xT = c_ax.T
            M_tx = csr_matrix(hstack((A_1, c_xT)))  # extended gain matrix

            rhs = H.T * (r_inv * r)  # original right hand side
            C_rhs = vstack((rhs, -c_rxh))  # right hand side with new constraints

            # state vector difference d_E
            d_E = spsolve(M_tx, C_rhs)
            E += d_E

            # update V/delta
            delta[non_slack_buses] = E[:len(non_slack_buses)]
            v_m = np.squeeze(E[len(non_slack_buses):len(non_slack_buses) + n_active])

            # prepare next iteration
            cur_it += 1
            current_error = np.max(
                np.abs(d_E[:len(non_slack_buses) + n_active]))
            self.logger.debug("Current error: {:.7f}".format(current_error))

        except np.linalg.linalg.LinAlgError:
            self.logger.error(
                "A problem appeared while using the linear algebra methods. "
                "Check and change the measurement set.")
            return False

    # check if the estimation is successful
    self.check_result(current_error, cur_it)

    return delta, v_m
# large as the order of the PDE being solved (2 in this # case). Larger values may improve accuracy # generate nodes nodes, smpid = menodes(N, vert, smp) edge_idx, = (smpid >= 0).nonzero() interior_idx, = (smpid == -1).nonzero() # create "left hand side" matrix A_int = weight_matrix(nodes[interior_idx], nodes, diffs=[[2, 0], [0, 2]], n=n, basis=basis, order=order) A_edg = weight_matrix(nodes[edge_idx], nodes, diffs=[0, 0]) A = vstack((A_int, A_edg)) # create "right hand side" vector d_int = -1 * np.ones_like(interior_idx) d_edg = np.zeros_like(edge_idx) d = np.hstack((d_int, d_edg)) # find the solution at the nodes u_soln = spsolve(A, d) # interpolate the solution on a grid xg, yg = np.meshgrid(np.linspace(-0.05, 2.05, 400), np.linspace(-0.05, 2.05, 400)) points = np.array([xg.flatten(), yg.flatten()]).T u_itp = LinearNDInterpolator(nodes, u_soln)(points) # mask points outside of the domain u_itp[~contains(points, vert, smp)] = np.nan ug = u_itp.reshape((400, 400)) # fold back into a grid # make a contour plot of the solution
print("Categories:", np.unique(train_labels)) print("Number of unique words:", len(np.unique(np.hstack(train_data)))) # 将word_index反转,实现将整数索引到单词的映射 ''' # Simple Vectoring data print('Vectoring data') X_train = pre.vectorize_sequences(train_data) X_test = pre.vectorize_sequences(test_data) ''' # TF-IDF Vectoring data print('\nVectoring train data') X_train, train_labels = tfidf.tf_idf_2doc(train_data, train_labels, feat=10000) print('\nVectoring test data') X_test, test_labels = tfidf.tf_idf_2doc(test_data, test_labels, feat=10000) data = sp.vstack((X_train, X_test)) # Vectoring label print('\nVectoring labels') y_train = np.asarray(train_labels).astype('float32') y_test = np.asarray(test_labels).astype('float32') target = np.append(y_train, y_test) ''' X_val = X_test[: 10000] partial_x_train = X_test[10000:] y_val = y_test[: 10000] partial_y_train = y_test[10000:] ''' train_x = data[10000:] train_y = target[10000:]
IDs_arestas_0_locais = np.subtract(IDs_arestas_0, ni + nf) IDs_faces_0_locais = np.subtract(IDs_faces_0, ni) IDs_internos_0_locais = IDs_internos_0 IDs_arestas_1_locais = np.setdiff1d(range(na), IDs_arestas_0_locais) ids_arestas_slin_m0 = np.nonzero(As['Aev'].sum(axis=1))[0] Aev = As['Aev'] Ivv = As['Ivv'] Aif = As['Aif'] Afe = As['Afe'] invAee = lu_inv4(As['Aee'].tocsc(), ids_arestas_slin_m0) M2 = -invAee * Aev PAD = vstack([M2, Ivv]) invAff = invbAff M3 = -invAff * (Afe * M2) PAD = vstack([M3, PAD]) invAii = invbAii PAD = vstack([-invAii * (Aif * M3), PAD]) print("get_OP_AMS", time.time() - ta1) del M3 ids_1 = mb.tag_get_data(L1_ID_tag, vertices, flat=True) fine_to_primal1_classic_tag = mb.tag_get_handle('FINE_TO_PRIMAL1_CLASSIC') ids_class = mb.tag_get_data(fine_to_primal1_classic_tag, vertices, flat=True) t0 = time.time()
G_zz, parallels_x.dot(out['xz']) + parallels_y.dot(out['yz']) + parallels_z.dot(out['zz']), idx['boundary:roller']) # stack the components together. take care to delete matrices when # we do not need them anymore del (out, normals_x, normals_y, normals_z, parallels_1, parallels_2, parallels_x, parallels_y, parallels_z) G_x = sp.hstack((G_xx, G_xy, G_xz)) del G_xx, G_xy, G_xz G_y = sp.hstack((G_yx, G_yy, G_yz)) del G_yx, G_yy, G_yz G_z = sp.hstack((G_zx, G_zy, G_zz)) del G_zx, G_zy, G_zz G = sp.vstack((G_x, G_y, G_z)) del G_x, G_y, G_z G = G.tocsc() G.eliminate_zeros() # create the right-hand-side vector d_x = np.zeros((N, )) d_y = np.zeros((N, )) d_z = np.zeros((N, )) d_x[idx['interior']] = 0.0 d_x[idx['ghosts:free']] = 0.0 d_x[idx['ghosts:roller']] = 0.0 d_x[idx['boundary:free']] = 0.0 d_x[idx['boundary:roller']] = 0.0
def _non_rigid_icp_iter(self, source, target, closest_points_on_target, M_kron_G, alpha, gamma): """ Non-rigid icp for each iteration. Parameters: source (menpo.shape.mesh.base.TriMesh): original source mesh to be transformed target (menpo.shape.mesh.base.TriMesh): target mesh as the base closest_points_on_target (menpo3d.vtkutils.VTKClosestPointLocator): octree for finding nearest neighbor M_kron_G (scipy.sparse.coo.coo_matrix): matrix M kron matrix G alpha (float): stiffness weight gamma (float): data weight Returns: current_instance (menpo.shape.mesh.base.TriMesh): transformed source mesh training_info (dict): containing 3 lists of loss/regularized_err/err while training """ # init transformation n_dims = source.n_dims h_dims = n_dims + 1 n = source.points.shape[0] v_i = source.points # we need to prepare some indices for efficient construction of the D sparse matrix. row = np.hstack((np.repeat(np.arange(n)[:, None], n_dims, axis=1).ravel(), np.arange(n))) x = np.arange(n * h_dims).reshape((n, h_dims)) col = np.hstack((x[:, :n_dims].ravel(), x[:, n_dims])) ones = np.ones(n) alpha_M_kron_G = alpha * M_kron_G # start iteration training_info = {'loss': [], 'regularized_loss': []} iter_ = 0 while iter_ < self.max_iter: iter_ += 1 NonRigidIcp._iter_counter += 1 # find nearest neighbour and the normals U, tri_indices = closest_points_on_target(v_i) data = np.hstack((v_i.ravel(), ones)) D = sp.coo_matrix((data, (row, col))) to_stack_A = [alpha_M_kron_G, D] to_stack_B = [np.zeros((alpha_M_kron_G.shape[0], n_dims)), U] A = sp.vstack(to_stack_A).tocsr() B = sp.vstack(to_stack_B).tocsr() X = math_helper.Solver.linear_solver(A, B, self.solver) # deform template v_i = np.array(D.dot(X)) loss = np.linalg.norm(A @ X - B, ord='fro') regularized_loss = loss / len(source.points) training_info['loss'].append(loss) training_info['regularized_loss'].append(regularized_loss) NonRigidIcp._average_regularized_loss = (NonRigidIcp._iter_counter - 1) * \ NonRigidIcp._average_regularized_loss / NonRigidIcp._iter_counter if self.verbose: info = ' - {} loss: {:.3f} regularized_loss: {:.3f} '.format(iter_, loss, regularized_loss) print(info) else: progress_bar = "[" if NonRigidIcp._num_of_meshes is not None: progress = int(10.0 * NonRigidIcp._mesh_counter / NonRigidIcp._num_of_meshes) for _ in range(progress-1): progress_bar += "=" progress_bar += ">" for _ in range(10 - progress - 1): progress_bar += "." progress_bar += "] " + str(NonRigidIcp._mesh_counter) + "/" + str(NonRigidIcp._num_of_meshes) else: progress_bar += str(NonRigidIcp._num_of_meshes) + "]" if self._expected_remaining_time is not None: progress_bar += " | remaining time: " + self._expected_remaining_time print(("loss @ this iter: {:.3f} | " "loss/iter: {:.3f} | " + progress_bar) .format(regularized_loss, NonRigidIcp._average_regularized_loss ), end="\r", flush=True) if regularized_loss < self.eps: break current_instance = source.copy() current_instance.points = v_i.copy() return current_instance, training_info
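A short orientation note, inferred from the to_stack_A/to_stack_B construction above rather than taken from the source: each iteration solves, in the least-squares sense, the sparse stacked system

X = \arg\min_X \left\| \begin{bmatrix} \alpha\,(M \otimes G) \\ D \end{bmatrix} X - \begin{bmatrix} 0 \\ U \end{bmatrix} \right\|_F^2

where D holds the homogeneous source vertices and U the corresponding closest points on the target, so the bottom block pulls the deformed vertices D X toward U while the alpha-weighted M kron G block penalizes non-smooth per-vertex transforms.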
def perform_EM(self, X_l, y_l, X_u): nb_clf = MultinomialNB(alpha=0.01) nb_clf.fit(X_l, y_l) # calculate log likelihood class_log_prior = (nb_clf.class_log_prior_).tolist() word_given_class = nb_clf.feature_log_prob_ class_size = len(nb_clf.class_count_) un_sum_outer = 0 for doc in X_u: sum_inner = 0 for index in range(class_size): sum_inner += (class_log_prior[index] * np.sum(word_given_class[index, :])) un_sum_outer += sum_inner lb_sum = 0 for index, doc in enumerate(X_l): sum_inner = 0 given_label = y_l[index] sum_inner = (class_log_prior[given_label] * np.sum(word_given_class[given_label, :])) lb_sum += sum_inner log_likelihood = (-1 * (lb_sum + un_sum_outer)) prev_log = float("-inf") current_log = log_likelihood count = 0 # remove this line while (abs(current_log - prev_log) > 1e-6): # Estimation step Y_u = nb_clf.predict(X_u) # Maximize step X_new = vstack([X_l, X_u]) Y_new = np.concatenate((y_l, Y_u), axis=0) nb_clf.fit(X_new, Y_new) # calculate log likelihood class_log_prior = (nb_clf.class_log_prior_).tolist() word_given_class = nb_clf.feature_log_prob_ class_size = len(nb_clf.class_count_) count += 1 un_sum_outer = 0 for doc in X_u: sum_inner = 0 for index in range(class_size): sum_inner += (class_log_prior[index] * np.sum(word_given_class[index, :])) un_sum_outer += sum_inner lb_sum = 0 for index, doc in enumerate(X_l): sum_inner = 0 given_label = y_l[index] sum_inner = (class_log_prior[given_label] * np.sum(word_given_class[given_label, :])) lb_sum += sum_inner log_likelihood = (-1 * (lb_sum + un_sum_outer)) prev_log = current_log current_log = log_likelihood print("log_likelihood ", log_likelihood) return nb_clf
def load_gcn_data(dataset_str): npz_file = 'data/{}_{}.npz'.format(dataset_str, FLAGS.normalization) if os.path.exists(npz_file): start_time = time() print('Found preprocessed dataset {}, loading...'.format(npz_file)) data = np.load(npz_file) num_data = data['num_data'] labels = data['labels'] train_data = data['train_data'] val_data = data['val_data'] test_data = data['test_data'] train_adj = sp.csr_matrix((data['train_adj_data'], data['train_adj_indices'], data['train_adj_indptr']), shape=data['train_adj_shape']) full_adj = sp.csr_matrix((data['full_adj_data'], data['full_adj_indices'], data['full_adj_indptr']), shape=data['full_adj_shape']) feats = sp.csr_matrix((data['feats_data'], data['feats_indices'], data['feats_indptr']), shape=data['feats_shape']) train_feats = sp.csr_matrix((data['train_feats_data'], data['train_feats_indices'], data['train_feats_indptr']), shape=data['train_feats_shape']) test_feats = sp.csr_matrix((data['test_feats_data'], data['test_feats_indices'], data['test_feats_indptr']), shape=data['test_feats_shape']) print('Finished in {} seconds.'.format(time() - start_time)) else: """Load data.""" names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) if dataset_str != 'nell': test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range-min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range-min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() # idx_train = range(len(y)) idx_train = range(18217) idx_val = range(len(y), len(y)+500) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] else: test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) features = allx.tocsr() adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = ally idx_test = test_idx_reorder idx_train = range(len(y)) idx_val = range(len(y), len(y)+969) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] # num_data, (v, 
coords), feats, labels, train_d, val_d, test_d num_data = features.shape[0] def _normalize_adj(adj): rowsum = np.array(adj.sum(1)).flatten() d_inv = 1.0 / (rowsum+1e-20) d_mat_inv = sp.diags(d_inv, 0) adj = d_mat_inv.dot(adj).tocoo() coords = np.array((adj.row, adj.col)).astype(np.int32) return adj.data.astype(np.float32), coords def gcn_normalize_adj(adj): adj = adj + sp.eye(adj.shape[0]) rowsum = np.array(adj.sum(1)) + 1e-20 d_inv_sqrt = np.power(rowsum, -0.5).flatten() d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. d_mat_inv_sqrt = sp.diags(d_inv_sqrt, 0) adj = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt) adj = adj.tocoo() coords = np.array((adj.row, adj.col)).astype(np.int32) return adj.data.astype(np.float32), coords # Normalize features rowsum = np.array(features.sum(1)) + 1e-9 r_inv = np.power(rowsum, -1).flatten() r_inv[np.isinf(r_inv)] = 0. r_mat_inv = sp.diags(r_inv, 0) features = r_mat_inv.dot(features) if FLAGS.normalization == 'gcn': full_v, full_coords = gcn_normalize_adj(adj) else: full_v, full_coords = _normalize_adj(adj) full_v = full_v.astype(np.float32) full_coords = full_coords.astype(np.int32) train_v, train_coords = full_v, full_coords labels = (y_train + y_val + y_test).astype(np.float32) train_data = np.nonzero(train_mask)[0].astype(np.int32) val_data = np.nonzero(val_mask)[0].astype(np.int32) test_data = np.nonzero(test_mask)[0].astype(np.int32) feats = (features.data, features.indices, features.indptr, features.shape) def _get_adj(data, coords): adj = sp.csr_matrix((data, (coords[0,:], coords[1,:])), shape=(num_data, num_data)) return adj train_adj = _get_adj(train_v, train_coords) full_adj = _get_adj(full_v, full_coords) feats = sp.csr_matrix((feats[0], feats[1], feats[2]), shape=feats[-1], dtype=np.float32) train_feats = train_adj.dot(feats) test_feats = full_adj.dot(feats) with open(npz_file, 'wb') as fwrite: np.savez(fwrite, num_data=num_data, train_adj_data=train_adj.data, train_adj_indices=train_adj.indices, train_adj_indptr=train_adj.indptr, train_adj_shape=train_adj.shape, full_adj_data=full_adj.data, full_adj_indices=full_adj.indices, full_adj_indptr=full_adj.indptr, full_adj_shape=full_adj.shape, feats_data=feats.data, feats_indices=feats.indices, feats_indptr=feats.indptr, feats_shape=feats.shape, train_feats_data=train_feats.data, train_feats_indices=train_feats.indices, train_feats_indptr=train_feats.indptr, train_feats_shape=train_feats.shape, test_feats_data=test_feats.data, test_feats_indices=test_feats.indices, test_feats_indptr=test_feats.indptr, test_feats_shape=test_feats.shape, labels=labels, train_data=train_data, val_data=val_data, test_data=test_data) return num_data, train_adj, full_adj, feats, train_feats, test_feats, labels, train_data, val_data, test_data
def main(args): start_time = time.time() print("Running XGBoost Classifier") print("Reading blacklist words file") load_blacklist_words("../data/blacklist.txt") print("Reading raw gender-comment data") with open("../data/male-comments.json", "r") as f: male_comment = json.load(f) with open("../data/female-comments.json", "r") as f: female_comment = json.load(f) # Lower case all comments male_comment = [[x[0], x[1].lower()] for x in male_comment] female_comment = [[x[0], x[1].lower()] for x in female_comment] # Filter blacklisted words in comments male_comment = [[x[0], x[1]] for x in male_comment if all(c not in BLACKLIST_WORDS for c in x[1].split(" "))] female_comment = [[x[0], x[1]] for x in female_comment if all( c not in BLACKLIST_WORDS for c in x[1].split(" "))] random.shuffle(male_comment) random.shuffle(female_comment) print("Loaded {} male and {} female comments".format( len(male_comment), len(female_comment))) female_ratio = 1.0 - args.male_female_ratio if args.limit != -1: print( "Limiting male and female comments to {} male and {} female ({} total)" .format(int(args.limit * args.male_female_ratio), int(args.limit * female_ratio), args.limit)) try: del male_comment[int(args.limit * args.male_female_ratio):] del female_comment[int(args.limit * female_ratio):] except: print("Not enough male/female comments data") sys.exit(1) gender_comment = [] for idx, data in enumerate(male_comment): data[1] = data[1].lower() gender_comment.append(data) for idx, data in enumerate(female_comment): data[1] = data[1].lower() gender_comment.append(data) random.shuffle(gender_comment) list_of_words = set() for data in gender_comment: list_of_words.update(data[1].split(" ")) list_of_words = list(list_of_words) word_count = len(list_of_words) if args.cache: cache.cache_list_of_words(list_of_words) print("Total of {} words found\n".format(word_count)) data = coo_matrix((1, 1)) label = [] total = len(gender_comment) start_progress("Processing {} raw gender-comment data".format(total)) for i, j in enumerate(gender_comment): if j[0] == "female": # Label for female = 0, and male = 1 label.append(0) else: label.append(1) wc = {} for word in j[1].split(): if word in wc: wc[word] += 1 else: wc[word] = 1 d = [] for idx in range(word_count): count = 0 if list_of_words[idx] in wc: count = wc[list_of_words[idx]] d.append(count) if i == 0: data = coo_matrix(d) else: data = vstack((data, coo_matrix(d))) progress((i + 1) / total * 100) if i == total: break end_progress() if args.cache: cache.cache_data_and_label(data, label, word_count) run_tests(data, label, total, args.split, args.gamma, args.learning_rate, args.n_estimators) print("Elapsed time: {0:.2f}s".format(time.time() - start_time))
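The counting loop above builds one dense row per comment and vstacks a growing COO matrix, which becomes slow for large corpora. A possible alternative, not part of the original code, is scikit-learn's CountVectorizer with a fixed vocabulary and a whitespace analyzer; the sketch below assumes gender_comment and list_of_words are built exactly as in main().

from sklearn.feature_extraction.text import CountVectorizer

# Sketch of an alternative to the manual counting loop above (assumed names).
vectorizer = CountVectorizer(analyzer=str.split, vocabulary=list_of_words)
data = vectorizer.transform(comment for _, comment in gender_comment)  # csr matrix
label = [0 if gender == "female" else 1 for gender, _ in gender_comment]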
def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) support_index = self.svm_estimator_.support_[y[ self.svm_estimator_.support_] == class_sample] support_vector = _safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise(self.nn_m_, support_vector, class_sample, y, kind="noise") support_vector = _safe_indexing( support_vector, np.flatnonzero(np.logical_not(noise_bool))) danger_bool = self._in_danger_noise(self.nn_m_, support_vector, class_sample, y, kind="danger") safety_bool = np.logical_not(danger_bool) self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) n_generated_samples = int(fractions * (n_samples + 1)) if np.count_nonzero(danger_bool) > 0: nns = self.nn_k_.kneighbors( _safe_indexing(support_vector, np.flatnonzero(danger_bool)), return_distance=False, )[:, 1:] X_new_1, y_new_1 = self._make_samples( _safe_indexing(support_vector, np.flatnonzero(danger_bool)), y.dtype, class_sample, X_class, nns, n_generated_samples, step_size=1.0, ) if np.count_nonzero(safety_bool) > 0: nns = self.nn_k_.kneighbors( _safe_indexing(support_vector, np.flatnonzero(safety_bool)), return_distance=False, )[:, 1:] X_new_2, y_new_2 = self._make_samples( _safe_indexing(support_vector, np.flatnonzero(safety_bool)), y.dtype, class_sample, X_class, nns, n_samples - n_generated_samples, step_size=-self.out_step, ) if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): if sparse.issparse(X_resampled): X_resampled = sparse.vstack( [X_resampled, X_new_1, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), axis=0) elif np.count_nonzero(danger_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: if sparse.issparse(X_resampled): X_resampled = sparse.vstack([X_resampled, X_new_1]) else: X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) return X_resampled, y_resampled
data_all, _ = split_dataframe_partial_user_holdout( df_all, "userId", "movieId", test_user_ratio=0.2, val_user_ratio=0.2, heldout_ratio_test=0.5, heldout_ratio_val=0.5, ) data_train = data_all["train"] data_val = data_all["val"] data_test = data_all["test"] X_train_all: sps.csr_matrix = sps.vstack( [data_train.X_train, data_val.X_train, data_test.X_train], format="csr") X_train_val_all: sps.csr_matrix = sps.vstack( [data_train.X_all, data_val.X_all, data_test.X_train], format="csr") valid_evaluator = Evaluator( ground_truth=data_val.X_test, offset=data_train.n_users, cutoff=BASE_CUTOFF, ) test_evaluator = Evaluator( ground_truth=data_test.X_test, offset=data_train.n_users + data_val.n_users, cutoff=BASE_CUTOFF, ) test_results = []
def osqp_solve_qp(P, q, G=None, h=None, A=None, b=None, initvals=None,
                  verbose=False, eps_abs=1e-5, eps_rel=1e-5, polish=True):
    """
    Solve a Quadratic Program defined as:

    .. math::

        \\begin{split}\\begin{array}{ll}
        \\mbox{minimize} &
            \\frac{1}{2} x^T P x + q^T x \\\\
        \\mbox{subject to}
            & G x \\leq h                \\\\
            & A x = b
        \\end{array}\\end{split}

    using `OSQP <https://github.com/oxfordcontrol/osqp>`_.

    Parameters
    ----------
    P : scipy.sparse.csc_matrix
        Symmetric quadratic-cost matrix.
    q : numpy.array
        Quadratic cost vector.
    G : scipy.sparse.csc_matrix
        Linear inequality constraint matrix.
    h : numpy.array
        Linear inequality constraint vector.
    A : scipy.sparse.csc_matrix, optional
        Linear equality constraint matrix.
    b : numpy.array, optional
        Linear equality constraint vector.
    initvals : numpy.array, optional
        Warm-start guess vector.
    verbose : bool, optional
        Set to `True` to print out extra information.
    eps_abs : scalar, optional
        Absolute convergence tolerance of the solver. Lower values yield more
        precise solutions at the cost of computation time.
    eps_rel : scalar, optional
        Relative convergence tolerance of the solver. Lower values yield more
        precise solutions at the cost of computation time.
    polish : bool, optional
        Perform `polishing <https://osqp.org/docs/solver/#polishing>`_, an
        additional step where the solver tries to improve the accuracy of the
        solution. Default is ``True``.

    Returns
    -------
    x : array, shape=(n,)
        Solution to the QP, if found, otherwise ``None``.

    Note
    ----
    OSQP requires `P` to be symmetric, and won't check for errors otherwise.
    Watch out for this point if you e.g. `get nan values
    <https://github.com/oxfordcontrol/osqp/issues/10>`_ in your solutions.

    Note
    ----
    As of OSQP v0.6.1, the default values for both absolute and relative
    tolerances are set to ``1e-3``, which results in low solver times but
    imprecise solutions compared to the other QP solvers. We lower them to
    ``1e-5`` so that OSQP behaves closer to the norm in terms of numerical
    accuracy.
    """
    if type(P) is ndarray:
        warn(conversion_warning("P"))
        P = csc_matrix(P)
    solver = OSQP()
    kwargs = {
        'eps_abs': eps_abs,
        'eps_rel': eps_rel,
        'polish': polish,
        'verbose': verbose
    }
    if A is None and G is None:
        solver.setup(P=P, q=q, **kwargs)
    elif A is not None:
        if type(A) is ndarray:
            warn(conversion_warning("A"))
            A = csc_matrix(A)
        if G is None:
            solver.setup(P=P, q=q, A=A, l=b, u=b, **kwargs)
        else:  # G is not None
            l = -inf * ones(len(h))
            qp_A = vstack([G, A]).tocsc()
            qp_l = hstack([l, b])
            qp_u = hstack([h, b])
            solver.setup(P=P, q=q, A=qp_A, l=qp_l, u=qp_u, **kwargs)
    else:  # A is None
        if type(G) is ndarray:
            warn(conversion_warning("G"))
            G = csc_matrix(G)
        l = -inf * ones(len(h))
        solver.setup(P=P, q=q, A=G, l=l, u=h, **kwargs)
    if initvals is not None:
        solver.warm_start(x=initvals)
    res = solver.solve()
    if hasattr(solver, 'constant'):
        success_status = solver.constant('OSQP_SOLVED')
    else:  # more recent versions of OSQP
        success_status = osqp.constant('OSQP_SOLVED')
    if res.info.status_val != success_status:
        print("OSQP exited with status '%s'" % res.info.status)
    return res.x
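A small usage sketch of the wrapper above; the problem data are illustrative and assume the function and its imports are already in scope. It minimizes 1/2 x^T P x + q^T x subject to x >= 0, encoded as G x <= h with G = -I and h = 0.

import numpy as np
from scipy.sparse import csc_matrix

# Illustrative problem data for osqp_solve_qp (defined above).
P = csc_matrix(np.array([[4.0, 1.0], [1.0, 2.0]]))  # symmetric positive definite
q = np.array([1.0, 1.0])
G = csc_matrix(-np.eye(2))                          # -x <= 0  is  x >= 0
h = np.zeros(2)

x = osqp_solve_qp(P, q, G=G, h=h, verbose=False)
print("optimal x:", x)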
def _fit_resample(self, X, y): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue target_class_indices = np.flatnonzero(y == class_sample) X_class = _safe_indexing(X, target_class_indices) self.nn_m_.fit(X) danger_index = self._in_danger_noise(self.nn_m_, X_class, class_sample, y, kind="danger") if not any(danger_index): continue self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(_safe_indexing(X_class, danger_index), return_distance=False)[:, 1:] # divergence between borderline-1 and borderline-2 if self.kind == "borderline-1": # Create synthetic samples for borderline points. X_new, y_new = self._make_samples( _safe_indexing(X_class, danger_index), y.dtype, class_sample, X_class, nns, n_samples, ) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) else: X_resampled = np.vstack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) elif self.kind == "borderline-2": random_state = check_random_state(self.random_state) fractions = random_state.beta(10, 10) # only minority X_new_1, y_new_1 = self._make_samples( _safe_indexing(X_class, danger_index), y.dtype, class_sample, X_class, nns, int(fractions * (n_samples + 1)), step_size=1.0, ) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority # class but all over classes. X_new_2, y_new_2 = self._make_samples( _safe_indexing(X_class, danger_index), y.dtype, class_sample, _safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), step_size=0.5, ) if sparse.issparse(X_resampled): X_resampled = sparse.vstack( [X_resampled, X_new_1, X_new_2]) else: X_resampled = np.vstack((X_resampled, X_new_1, X_new_2)) y_resampled = np.hstack((y_resampled, y_new_1, y_new_2)) return X_resampled, y_resampled
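The method above appears to be the internal _fit_resample of imblearn's BorderlineSMOTE; normal callers go through the public fit_resample API instead. A minimal sketch on synthetic data (all values illustrative):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

# Illustrative imbalanced dataset.
X, y = make_classification(n_samples=1000, n_features=10, weights=[0.9, 0.1],
                           random_state=0)
sampler = BorderlineSMOTE(kind="borderline-1", random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))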
n_ds = float(sum(ds_rep == ds)) if n_ds == 0: # 0 log 0 = 0 continue H += (n_ds / n_cluster) * np.log(n_ds / n_cluster) H *= -1 H /= np.log(len(datasets)) Hs.append(H) return np.mean(Hs) if __name__ == '__main__': datasets, genes_list, n_cells = load_names(data_names, norm=False) datasets, genes = merge_datasets(datasets, genes_list) X = vstack(datasets) gt_idx = [i for i, s in enumerate(np.sum(X != 0, axis=1)) if s >= 500] X = X[gt_idx] if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)): log('Dimension reduction with {}...'.format(METHOD)) X_dimred = reduce_dimensionality(normalize(X), method=METHOD, dimred=DIMRED) log('Dimensionality = {}'.format(X_dimred.shape[1])) np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred) else: X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format( METHOD, NAMESPACE))
    log1p=True)
datasets.append(train_X)
genes_list.append(tms_genes_list)
data_names_all.append('TMS')

## embed the cell ontology
unseen_l, l2i, i2l, onto_net, Y_emb, cls2cls = ParseCLOnto(train_Y_str)
train_Y = MapLabel2CL(train_Y_str, l2i)

## use Scanorama to correct batch effects
datasets, genes = merge_datasets(datasets, genes_list)
datasets_dimred, genes = process_data(datasets, genes, dimred=100)
expr_datasets = my_assemble(datasets_dimred, ds_names=data_names_all,
                            expr_datasets=datasets, sigma=150)[1]
expr_corrected = sparse.vstack(expr_datasets)
expr_corrected = np.log2(expr_corrected.toarray() + 1)

## annotate the 26-datasets collection, training on TMS
ntrain, ngene = np.shape(train_X)
nsample = np.shape(expr_corrected)[0]
train_X_corrected = expr_corrected[nsample - ntrain:, :]
test_X_corrected = expr_corrected[:nsample - ntrain, :]
OnClass_obj = OnClassPred()
OnClass_obj.train(train_X_corrected, train_Y, Y_emb, log_transform=False)
test_Y_pred = OnClass_obj.predict(test_X_corrected, log_transform=False)

## save the prediction matrix, nsample (number of samples in 26-datasets) by nlabels
np.save(output_dir + '26_datasets_predicted_score_matrix.npy', test_Y_pred)
def _cross_prod_w(self, w): # Calculate X * A * X.T. A is a diagnalized matrix, and w is the array of diagnal of A. w = w.astype(float) s = self.ent_table r = self.att_table k = self.kfkds ns = k[0].shape[0] ds = s.shape[1] nr = [t.shape[0] for t in r] dr = [t.shape[1] for t in r] if not self.trans: if s.size > 0: res = self._t_cross_w(s, w[0:ds]) else: res = np.zeros((ns, ns), dtype=float, order='C') count = ds cross_r = [] for t in r: if all(map(sp.issparse, r)): cross_r.append( self._t_cross_w(t, w[count:count + t.shape[1]]).toarray()) else: cross_r.append( self._t_cross_w(t, w[count:count + t.shape[1]])) count += t.shape[1] comp.expand_add(ns, len(k), k, cross_r, nr, res) else: if all(map(sp.issparse, r)): # change the 'other' as weight to group other = w.reshape((1, -1)).astype(float) s2 = w.reshape(-1, 1) * np.array(s) v = [np.zeros((1, t.shape[0]), dtype=float) for t in r] comp.group(ns, len(k), 1, k, nr, other, v) size = r[0].size data = np.empty(size) # part 2 and 3 are p.T and p comp.multiply_sparse(size, r[0].row, r[0].data, np.sqrt(v[0]), data) diag_part = self._cross( sp.coo_matrix((data, (r[0].row, r[0].col)))) if ds > 0: m = np.zeros((nr[0], ds)) comp.group_left(ns, ds, s2, k[0], m) p = self._cross(r[0], m) s_part = self._cross(s, s2) res = sp.vstack((np.hstack( (s_part, p.T)), sp.hstack((p, diag_part)))) else: res = diag_part # multi-table join for i in range(1, len(k)): ps = [] if ds > 0: m = np.zeros((nr[i], ds)) comp.group_left(ns, ds, s2, k[i], m) ps += [self._cross(r[i], m)] # cp (KRi) size = r[i].size data = np.empty(size) comp.multiply_sparse(size, r[i].row, r[i].data, np.sqrt(v[i]), data) diag_part = self._cross( sp.coo_matrix((data, (r[i].row, r[i].col)))) for j in range(i): ps += [ r[i].tocsr()[k[i]].T.dot( r[j].tocsr()[k[j]].multiply(w.reshape(-1, 1))) ] res = sp.vstack((sp.hstack( (res, sp.vstack([p.T for p in ps]))), sp.hstack(ps + [diag_part]))) else: nt = s.shape[1] + sum([att.shape[1] for att in r]) other = w.reshape((1, -1)).astype(float) s2 = w.reshape(-1, 1) * np.array(s) v = [np.zeros((1, t.shape[0]), dtype=float) for t in r] res = np.empty((nt, nt)) data = np.empty(r[0].shape, order='C') comp.group(ns, len(k), 1, k, nr, other, v) comp.multiply(r[0].shape[0], r[0].shape[1], r[0], v[0], data) res[ds:ds + dr[0], ds:ds + dr[0]] = self._cross(data) if ds > 0: m = np.zeros((nr[0], ds)) comp.group_left(ns, ds, s2, k[0], m) res[ds:ds + dr[0], :ds] = self._cross(r[0], m) res[:ds, ds:ds + dr[0]] = res[ds:ds + dr[0], :ds].T res[:ds, :ds] = self._cross(s, s2) # multi-table join for i in range(1, len(k)): if ds > 0: m = np.zeros((nr[i], ds)) comp.group_left(ns, ds, s2, k[i], m) ni1 = ds + sum([t.shape[1] for t in r[:i]]) ni2 = ni1 + r[i].shape[1] res[ni1:ni2, :ds] = self._cross(r[i], m) res[:ds, ni1:ni2] = res[ni1:ni2, :ds].T # cp(KRi) data = np.empty(r[i].shape, order='C') comp.multiply(r[i].shape[0], r[i].shape[1], r[i], v[i], data) res[ni1:ni2, ni1:ni2] = self._cross(data) for j in range(i): dj1 = ds + sum([t.shape[1] for t in r[:j]]) dj2 = dj1 + r[j].shape[1] if (ns * 1.0 / nr[j]) > (1 + nr[j] * 1.0 / dr[j]): m = np.zeros((nr[i], nr[j]), order='C') # Update in comp.cpp. When count the number in each group, add w instead of 1. comp.group_k_by_k_w(nr[i], nr[j], ns, w, k[i], k[j], m) res[ni1:ni2, dj1:dj2] = r[i].T.dot(m.T.dot(r[j])) res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T else: res[ni1:ni2, dj1:dj2] = (w.reshape(-1, 1) * np.array(r[i][k[i]])).T.dot( r[j][k[j]]) res[dj1:dj2, ni1:ni2] = res[ni1:ni2, dj1:dj2].T return res
def qrfullsps(K): """ Full QR-factorization for a sparse potentially singular mxn matrix, with m>=n. Parameters ---------- K : ndarray matrix, that is to be inverted Returns ------- Kinv : ndarray pseudoinverse of K R : ndarray left nullspace of K """ R = copy(K) Q = csr_matrix(eye(K.shape[0])) for j in range(K.shape[1]): G = None for k in range(K.shape[0] - 1, j, -1): vec_norm = np.linalg.norm(np.array([R[j, j], R[k, j]])) if not np.isclose(vec_norm, 0): c_j = R[j, j] / vec_norm s_j = R[k, j] / vec_norm G_help = csr_matrix(eye(K.shape[0])) G_help[j, j] = c_j G_help[k, j] = -s_j G_help[j, k] = s_j G_help[k, k] = c_j R = G_help @ R if G is None: G = copy(G_help) else: G = G_help @ G if G is not None: Q = Q @ G.T Q = -Q R = -R r_row_sum = np.sum(np.abs(R), axis=1) tol = 1.0e-12 rank = K.shape[0] - len(np.where(r_row_sum < tol)[0]) Q1 = Q[:, :rank] Q2 = Q[:, rank:] R1 = R[:rank, :rank] R2 = R[:rank, :rank] # Inverting R1 by back substitution if R1.shape[0] == R1.shape[1]: R_inv = csr_matrix(R1.shape) backward_substitution_failed = False for row in np.arange(R1.shape[0] - 1, -1, -1): if row is not R1.shape[0] - 1: for col in np.arange(row + 1, R1.shape[1]): R_inv[row, :] -= R1[row, col] * R_inv[col, :] R_inv[row, row] = 1 if np.isclose(R1[row, row], 0): backward_substitution_failed = True break else: R_inv[row, :] = R_inv[row, :] / R1[row, row] else: backward_substitution_failed = True if backward_substitution_failed: R_inv = csr_matrix(np.linalg.pinv(R.todense())) K_inv = R_inv @ Q1.T if K_inv.shape[0] < K.shape[1]: K_inv = vstack( (K_inv, lil_matrix((K.shape[1] - K_inv.shape[0], K.shape[1])))) if K_inv.shape[1] < K.shape[0]: K_inv = hstack( (K_inv, lil_matrix((K_inv.shape[0], K.shape[0] - K_inv.shape[1])))) return K_inv, Q2