def predict_proba(self, X):
    # Return probabilities predicted by the model
    if self.modelType == "SV1":
        return self.model01.predict_proba(X)
    if self.modelType == "SV2":
        mprob1 = self.model01.predict_proba(X)
        mprob2 = self.model02.predict_proba(X)
        return np.array(mprob1[:, 1]), np.array(mprob2[:, 1])
    if self.modelType == "LR2":
        if self.nComp == 0 or min(len(self.featSel1), len(self.featSel2)) <= self.nComp:
            mprob1 = self.model01.predict_proba(X)
            mprob2 = self.model02.predict_proba(X)
        else:
            newX1, newX2 = X[:, self.featSel1], X[:, self.featSel2]
            X1_1, X1_2 = self.concepts1.transform(newX1), self.concepts2.transform(newX2)
            X_1, X_2 = hstack((X, csr_matrix(X1_1))), hstack((X, csr_matrix(X1_2)))
            mprob1 = self.model01.predict_proba(X_1)
            mprob2 = self.model02.predict_proba(X_2)
        return np.array(mprob1[:, 1]), np.array(mprob2[:, 1])
    if self.modelType == "LR1":
        if self.nComp == 0 or len(self.featSel) <= self.nComp:
            return self.model01.predict_proba(X)
        else:
            newX = X[:, self.featSel]
            X1 = self.concepts.transform(newX)
            combined_X = hstack((X, csr_matrix(X1)))
            return self.model01.predict_proba(combined_X)
def makePropertyTensor(M, tensor):
    if tensor is None:  # default is ones
        tensor = np.ones(M.nC)
    if isScalar(tensor):
        tensor = tensor * np.ones(M.nC)

    propType = TensorType(M, tensor)
    if propType == 1:  # Isotropic!
        Sigma = sp.kron(sp.identity(M.dim), sdiag(mkvc(tensor)))
    elif propType == 2:  # Diagonal tensor
        Sigma = sdiag(mkvc(tensor))
    elif M.dim == 2 and tensor.size == M.nC * 3:  # Fully anisotropic, 2D
        tensor = tensor.reshape((M.nC, 3), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 2])))
        row2 = sp.hstack((sdiag(tensor[:, 2]), sdiag(tensor[:, 1])))
        Sigma = sp.vstack((row1, row2))
    elif M.dim == 3 and tensor.size == M.nC * 6:  # Fully anisotropic, 3D
        tensor = tensor.reshape((M.nC, 6), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 3]), sdiag(tensor[:, 4])))
        row2 = sp.hstack((sdiag(tensor[:, 3]), sdiag(tensor[:, 1]), sdiag(tensor[:, 5])))
        row3 = sp.hstack((sdiag(tensor[:, 4]), sdiag(tensor[:, 5]), sdiag(tensor[:, 2])))
        Sigma = sp.vstack((row1, row2, row3))
    else:
        raise Exception('Unexpected shape of tensor')

    return Sigma
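# A standalone sketch of the fully anisotropic 2D branch above, using plain
# scipy (sp.diags stands in for the sdiag helper; nC and the three tensor
# columns sxx, syy, sxy are made-up illustration values):
import numpy as np
import scipy.sparse as sp

nC = 4
sxx, syy, sxy = np.ones(nC), 2 * np.ones(nC), 0.1 * np.ones(nC)
row1 = sp.hstack((sp.diags(sxx), sp.diags(sxy)))
row2 = sp.hstack((sp.diags(sxy), sp.diags(syy)))
Sigma_2d = sp.vstack((row1, row2))  # (2*nC) x (2*nC) symmetric block operator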
def train_model(self):
    '''
    calls the computation of each feature in self.feature_list
    builds self.X_train, self.X_test matrices and fits classifier on the training data
    :return: None
    '''
    self.train_raw = self.train
    self.train = self._filter(self.train)
    self.test_raw = self.test
    self.test = self._filter(self.test)
    self.train_unified = self._unify_data(self.train)
    self.test_unified = self._unify_data(self.test)
    for feature in self.feature_list:
        model, train, test = self._get_model(feature)
        self.feature_models[feature] = model
        if type(self.X_train) == int:
            self.X_train = train
        else:
            self.X_train = sp.hstack((self.X_train, train), format="csr")
        if type(self.X_test) == int:
            self.X_test = test
        else:
            self.X_test = sp.hstack((self.X_test, test), format="csr")
    self.classifier.fit(self.X_train, self.y_train)
def fitLR2(self, X, y):
    # Fit Logistic Regression when the possible scores may vary from 0 to 2
    C1, C2, self.nComp = self.param1, self.param2, self.param3
    y1 = (y > 0) * 1
    y2 = (y > 1) * 1
    # Train two Logistic Regression models and repeat the process described in fitLR1 with both of them
    # (solver='liblinear' is needed for the L1 penalty on current scikit-learn)
    self.model01 = LogisticRegression(penalty='l1', C=C1, random_state=2512, solver='liblinear')
    self.model01.fit(X, y1)
    self.model02 = LogisticRegression(penalty='l1', C=C1, random_state=2512, solver='liblinear')
    self.model02.fit(X, y2)
    self.featSel1 = [i for i in range(len(self.model01.coef_[0])) if self.model01.coef_[0][i] > 0]
    self.featSel2 = [i for i in range(len(self.model02.coef_[0])) if self.model02.coef_[0][i] > 0]
    if self.nComp == 0 or min(len(self.featSel1), len(self.featSel2)) <= self.nComp:
        return
    newX1, newX2 = X[:, self.featSel1], X[:, self.featSel2]
    self.concepts1 = TruncatedSVD(n_components=self.nComp, random_state=2512)  # test with RBM
    self.concepts2 = TruncatedSVD(n_components=self.nComp, random_state=2512)  # test with RBM
    self.concepts1.fit(newX1)
    self.concepts2.fit(newX2)
    X1_1, X1_2 = self.concepts1.transform(newX1), self.concepts2.transform(newX2)
    X_1, X_2 = hstack((X, csr_matrix(X1_1))), hstack((X, csr_matrix(X1_2)))
    self.model01 = LogisticRegression(penalty='l1', C=C2, random_state=2512, solver='liblinear')
    self.model02 = LogisticRegression(penalty='l1', C=C2, random_state=2512, solver='liblinear')
    self.model01.fit(X_1, y1)
    self.model02.fit(X_2, y2)
    return
def inv2X2BlockDiagonal(a11, a12, a21, a22, returnMatrix=True):
    """
    B = inv2X2BlockDiagonal(a11, a12, a21, a22)

    Inverts a stack of 2x2 matrices by using the inversion formula

        inv(A) = (1/det(A)) * cof(A)^T

    Input:
        A - a11, a12, a21, a22

    Output:
        B - inverse
    """
    a11 = mkvc(a11)
    a12 = mkvc(a12)
    a21 = mkvc(a21)
    a22 = mkvc(a22)

    # compute inverse of the determinant.
    detAinv = 1. / (a11 * a22 - a21 * a12)

    b11 = +detAinv * a22
    b12 = -detAinv * a12
    b21 = -detAinv * a21
    b22 = +detAinv * a11

    if not returnMatrix:
        return b11, b12, b21, b22

    return sp.vstack((sp.hstack((sdiag(b11), sdiag(b12))),
                      sp.hstack((sdiag(b21), sdiag(b22)))))
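# A quick numerical check of the cofactor formula on a stack of random 2x2
# blocks (a sketch: assumes the mkvc helper used above passes 1-D arrays
# through unchanged; the seed and sizes are arbitrary):
import numpy as np

rng = np.random.default_rng(0)
a11, a12, a21, a22 = rng.standard_normal((4, 5))
b11, b12, b21, b22 = inv2X2BlockDiagonal(a11, a12, a21, a22, returnMatrix=False)
# Block-wise product A @ inv(A) should reproduce the 2x2 identity:
assert np.allclose(a11 * b11 + a12 * b21, 1)
assert np.allclose(a11 * b12 + a12 * b22, 0)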
def clf_event_running_wordVec(path, event, name_clf, X, X_vec, Y, clf, K, command, call):
    if command == "StratifiedKFold":
        cv = StratifiedKFold(Y, K)
    else:
        print("Need a correct command")
        quit()

    X_vec_norm = preprocessing.normalize(X_vec, norm="l2")
    for traincv, testcv in cv:
        X_train, X_test = X[traincv], X[testcv]
        X_vec_train, X_vec_test = X_vec_norm[traincv], X_vec_norm[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        MIN_DF = 2
        vec = CountVectorizer(lowercase=True, min_df=MIN_DF)
        vec = vec.fit(X_train)
        X_train_trans, X_test_trans = vec.transform(X_train), vec.transform(X_test)
        X_train_trans_all = hstack([X_train_trans, X_vec_train])
        X_test_trans_all = hstack([X_test_trans, X_vec_test])
        # print(X_vec_train.shape, X_vec_test.shape)
        # print(X_train_trans.shape, X_test_trans.shape)
        # print(X_train_trans_all.shape, X_test_trans_all.shape)

        clf.fit(X_train_trans_all, y_train)  # training model
        y_test_pred = clf.predict(X_test_trans_all)
        matrix = confusion_matrix(y_test_pred, y_test)
        for value in matrix:
            line = ""
            for each in value:
                line = line + str(each) + "\t"
            print(line.strip())
        print("----------------")
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    nsample = sparseArrayRowNorm[0].shape[0]
    scaler = StandardScaler(with_mean=False)
    # scaler = MinMaxScaler()
    testsize = int(nsample / nfold)
    cvIdx = [1] * (nsample - testsize) + [2] * testsize
    random.seed(100)
    aucRes = []
    for nn in range(nrep):
        # print(nn)
        random.shuffle(cvIdx)
        Y_train = y_all[np.where(np.array(cvIdx) == 1)[0]]
        Y_test = y_all[np.where(np.array(cvIdx) == 2)[0]]
        X_train_all = []
        X_test_all = []
        for ii in range(len(sparseArrayRowNorm)):
            varSelector = SelectKBest(f_classif, k=min(int(nsample * 0.7), sparseArrayRowNorm[ii].shape[1]))
            X_train = sparseArrayRowNorm[ii][np.where(np.array(cvIdx) == 1)[0], :]
            X_train = varSelector.fit_transform(X_train, Y_train)
            X_train_all = X_train_all + [X_train]
            X_test = sparseArrayRowNorm[ii][np.where(np.array(cvIdx) == 2)[0], :]
            X_test = varSelector.transform(X_test)
            X_test_all = X_test_all + [X_test]
        X_train = hstack(X_train_all, format='csr')
        X_test = hstack(X_test_all, format='csr')
        del X_train_all
        del X_test_all
        aucRes.append(sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))
    print(np.array(aucRes).mean())
    return np.array(aucRes).mean()
def _df_one_hot_encode(self, dtype=np.float64):
    if self.categoricals():
        self.to_indexes(drop_origianls=True)
    start('one_hot_encoding data frame with ' + repr(self.shape[1]) +
          ' columns. \n\tNOTE: this returns a sparse array and empties' +
          ' the initial array.')
    debug('separating categoricals from others')
    indexes = self.indexes()
    if not indexes:
        return self
    others = [c for c in self.columns if c not in indexes]

    categorical_df = self[indexes]
    others_df = sparse.coo_matrix(self[others].values)

    # Destroy original as it now just takes up memory
    self.drop(self.columns, 1, inplace=True)
    gc.collect()

    ohe_sparse = None
    for i, c in enumerate(indexes):
        debug('one hot encoding column: ' + repr(c))
        col_ohe = OneHotEncoder(categorical_features=[0], dtype=dtype).\
            fit_transform(categorical_df[[c]])
        if ohe_sparse is None:
            ohe_sparse = col_ohe
        else:
            ohe_sparse = sparse.hstack((ohe_sparse, col_ohe))
        categorical_df.drop(c, axis=1, inplace=True)
        gc.collect()

    matrix = ohe_sparse if not others else sparse.hstack((ohe_sparse, others_df))
    stop('done one_hot_encoding')
    return matrix.tocsr()
import numpy as np
from scipy import sparse
from scipy.sparse import linalg


def tvdiplmax(y):
    """Calculate the value of lambda so that if lambda >= lambdamax, the
    TVD functional solved by TVDIP is minimized by the trivial constant
    solution x = mean(y). This can then be used to determine a useful range
    of values of lambda, for example.

    Args:
        y: Original signal to denoise, size N x 1.

    Returns:
        lambdamax: Value of lambda at which x = mean(y) is the output of
            the TVDIP function.
    """
    N = y.size
    M = N - 1

    # Construct sparse operator matrices
    I1 = sparse.eye(M)
    O1 = sparse.dia_matrix((M, 1))
    D = sparse.hstack([I1, O1]) - sparse.hstack([O1, I1])

    DDT = D.dot(D.conj().T)
    Dy = D.dot(y)

    lambdamax = np.absolute(linalg.spsolve(DDT, Dy)).max(0)

    return lambdamax
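# A usage sketch (the signal here is made up; it relies only on the imports above):
rng = np.random.default_rng(0)
y_noisy = np.cumsum(rng.standard_normal(200))  # random-walk test signal
lambdamax = tvdiplmax(y_noisy)
# Any lambda >= lambdamax gives the trivial constant solution mean(y), so a
# useful regularization sweep stays below it:
lambdas = np.logspace(np.log10(lambdamax) - 3, np.log10(lambdamax), 10)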
def load_lda_dataset(uid):
    fname = join(DATASETS_FOLDER, 'es_twlda25ds_%d.npz' % uid)
    z = np.load(open(fname, 'rb'))
    X_train = z['arr_0'].item()
    X_valid = z['arr_1'].item()
    X_test = z['arr_2'].item()

    # X_train = csc.csc_matrix(X_train.tolist())
    # X_valid = csc.csc_matrix(X_train.tolist())
    # X_test = csc.csc_matrix(X_test.tolist())

    cols_train = X_train.shape[1]
    cols_valid = X_valid.shape[1]
    cols_test = X_test.shape[1]
    maxcols = max(cols_train, cols_valid, cols_test)

    if cols_train < maxcols:
        missing_cols = csc_matrix((X_train.shape[0], maxcols - cols_train), dtype=np.float64)
        X_train = sp.hstack((X_train, missing_cols))
    if cols_valid < maxcols:
        missing_cols = csc_matrix((X_valid.shape[0], maxcols - cols_valid), dtype=np.float64)
        X_valid = sp.hstack((X_valid, missing_cols))
    if cols_test < maxcols:
        missing_cols = csc_matrix((X_test.shape[0], maxcols - cols_test), dtype=np.float64)
        X_test = sp.hstack((X_test, missing_cols))

    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)
    y_train, y_valid, y_test = pickle.load(open(ys_fname, 'rb'))

    return X_train, X_valid, X_test, y_train, y_valid, y_test
def newtonDirection(Rb, Rc, Rxs, A, m, n, x, s, lu, errorCheck=0):
    rhs = np.hstack((-Rb, -Rc + Rxs / x))
    D_2 = -np.minimum(1e+16, s / x)
    B = sparse.vstack((sparse.hstack((sparse.coo_matrix((m, m)), A)),
                       sparse.hstack((A.T, sparse.diags([D_2], [0])))))

    # ldl' factorization
    # if L and D are not provided, we calc new factorization; otherwise, reuse them
    useLu = True
    if useLu:
        if lu is None:
            lu = sparse.linalg.splu(B.tocsc())
            # wikipedia says it uses Mehrotra cholesky, but the matrix I'm
            # getting is not positive definite;
            # scikits.sparse.cholmod.cholesky fails without a warning
        sol = lu.solve(rhs)
    else:
        sol = sparse.linalg.cg(B, rhs, tol=1e-5)[0]
        # assert(np.max(np.abs(B * sol - rhs)) < 1e-5)

    dy = sol[:m]
    dx = sol[m:m + n]
    ds = -(Rxs + s * dx) / x

    if errorCheck == 1:
        print('error = %6.2e' % (norm(A.T * dy + ds + Rc) + norm(A * dx + Rb) + norm(s * dx + x * ds + Rxs)))
        print('\t + err_d = %6.2e' % (norm(A.T * dy + ds + Rc)))
        print('\t + err_p = %6.2e' % (norm(A * dx + Rb)))
        print('\t + err_gap = %6.2e\n' % (norm(s * dx + x * ds + Rxs)))

    return dx, dy, ds, lu
def cernikov_filter(wts, fts=None):
    """
    Remove any of the working transversals that are minimal versions
    of each other
    """
    wt_i = 0
    wts = sparse.csc_matrix(wts)
    if fts is not None:
        fts = sparse.csc_matrix(fts)
    while wt_i < wts.shape[1]:
        target_t = wts[:, wt_i]
        left_t = wts[:, :max(wt_i, 0)]
        right_t = wts[:, min(wt_i + 1, wts.shape[1]):]
        assert (left_t.shape[1] + right_t.shape[1] + target_t.shape[1]
                == wts.shape[1]), "Left/Right split failed."
        left_right_t = sparse.hstack((left_t, right_t))
        if fts is not None:
            check_ts = sparse.hstack((left_right_t, fts))
        else:
            check_ts = left_right_t
        if is_minimal_present(target_t, check_ts):
            wts = left_right_t  # The new wts to loop over
            # [logic] wt_i = wt_i  # Don't increase
        else:
            # [logic] wts = wts  # Keep target
            wt_i += 1
    return wts
def __init__(self, X_l, L_l, X_u, random_generator, **kw):
    """
    Initializes the S3VM optimizer.
    """
    self.__random_generator = random_generator
    # This is a nuisance, but we may need to pad extra dimensions to either X_l or X_u
    # in case the highest feature indices appear only in one of the two data matrices
    if X_l.shape[1] > X_u.shape[1]:
        X_u = sparse.hstack([X_u, sparse.coo_matrix((X_u.shape[0], X_l.shape[1] - X_u.shape[1]))])
    elif X_l.shape[1] < X_u.shape[1]:
        X_l = sparse.hstack([X_l, sparse.coo_matrix((X_l.shape[0], X_u.shape[1] - X_l.shape[1]))])
    # We vertically stack the data matrices into one big matrix
    X = sparse.vstack([X_l, X_u])
    self.__size_l, self.__size_u, self.__size_n = X_l.shape[0], X_u.shape[0], X_l.shape[0] + X_u.shape[0]
    x = arr.array('i')
    for l in L_l:
        x.append(int(l))
    self.__YL = mat(x, dtype=np.float64)
    self.__YL = self.__YL.transpose()
    self.__setParameters(**kw)
    self.__kw = kw
    self.X_l = X_l.tocsr()
    self.X_u = X_u.tocsr()
    self.X = X.tocsr()
    # compute mean of unlabeled patterns
    self.__mean_u = self.X_u.mean(axis=0)
    self.X_u_T = X_u.tocsc().T
    self.X_l_T = X_l.tocsc().T
    self.X_T = X.tocsc().T
def get_data():
    tickets_file = csv.reader(open('2012-10-09.close.csv'))
    time_format = '%Y-%m-%d %H:%M:%S'
    tickets = []
    times = []
    reporters = []
    subjects = []
    for number, created, changetime, closetime, reporter, summary, status, \
            owner, tkt_type, component, description in tickets_file:
        row = []
        created = dt.datetime.strptime(created, time_format)
        closetime = dt.datetime.strptime(closetime, time_format)
        changetime = dt.datetime.strptime(changetime, time_format)
        time_to_fix = closetime - created
        row.append(float(number))
        row.append(float(time.mktime(created.timetuple())))
        tickets.append(row)
        times.append(total_seconds(time_to_fix))
        reporters.append(reporter)
        subjects.append(summary)
    # preprocessing.Scaler was renamed StandardScaler in later scikit-learn
    scaler = preprocessing.StandardScaler().fit(np.array(tickets))
    tickets = sp.csr_matrix(scaler.transform(tickets))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
        CountVectorizer().fit_transform(reporters))))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
        CountVectorizer(ngram_range=(1, 3)).fit_transform(subjects))))
    scaler = preprocessing.StandardScaler(with_mean=False).fit(tickets)
    tickets = scaler.transform(tickets)
    return tickets, times
def _get_aug_mat(self, k, j):
    """
    Generate the matrix [[A, E], [0, A]] where
        A is the overall dynamics generator
        E is the control dynamics generator
    for a given timeslot and control
    returns this augmented matrix
    """
    dyn = self.parent
    dg = dyn._get_phased_dyn_gen(k)

    if dyn.oper_dtype == Qobj:
        A = dg.data * dyn.tau[k]
        E = dyn._get_phased_ctrl_dyn_gen(k, j).data * dyn.tau[k]
        Z = sp.csr_matrix(dg.data.shape)
        aug = Qobj(sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])]))
    elif dyn.oper_dtype == np.ndarray:
        A = dg * dyn.tau[k]
        E = dyn._get_phased_ctrl_dyn_gen(k, j) * dyn.tau[k]
        Z = np.zeros(dg.shape)
        aug = np.vstack([np.hstack([A, E]), np.hstack([Z, A])])
    else:
        A = dg * dyn.tau[k]
        E = dyn._get_phased_ctrl_dyn_gen(k, j) * dyn.tau[k]
        Z = dg * 0.0
        aug = sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])])
    return aug
def pywfmLocalModel(trainFeature, testFeature, trainLabel, testLabel, trainIndex, testIndex, fm, cvIndex):
    print('run local: folds: ' + str(cvIndex))
    trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
    encoder = OneHotEncoder(n_values=[value1, value2])
    trainIndex_encode = encoder.fit_transform(trainIndex)
    testIndex_encode = encoder.transform(testIndex)
    trainFeature = hstack((trainIndex_encode, trainFeature))
    testFeature = hstack((testIndex_encode, testFeature))
    '''
    for i in range(len(trainLabel)):
        if i == 0:
            trainLabel[i] = -1
    for i in range(len(testLabel)):
        if i == 0:
            testLabel[i] = -1
    '''
    model = fm.run(trainIndex_encode, trainLabel, testIndex_encode, testLabel)
    predict = model.predictions
    predict = np.array(predict, np.float64)
    predict = (predict - np.min(predict)) / (np.max(predict) - np.min(predict))
    return predict
def LSS_KKT(R, D):
    R, D = array(R), array(D)
    assert R.ndim == 3
    assert R.shape[1] == R.shape[2]
    N, m = R.shape[:2]

    bigR = sparse.bsr_matrix((R, r_[:N], r_[:N + 1]),
                             shape=(N * m, (N + 1) * m))
    I = array([eye(m)] * N)
    bigI = sparse.bsr_matrix((I, r_[1:N + 1], r_[:N + 1]),
                             shape=(N * m, (N + 1) * m))
    bigL = bigI - bigR

    assert D.shape == (N + 1, m, m)
    bigD = sparse.bsr_matrix((D, r_[:N + 1], r_[:N + 2]),
                             shape=((N + 1) * m, (N + 1) * m))

    O = zeros([N, m, m])
    bigO = sparse.bsr_matrix((O, r_[:N], r_[:N + 1]),
                             shape=(N * m, N * m))

    return sparse.vstack([sparse.hstack([bigD, bigL.T]),
                          sparse.hstack([bigL, bigO])])
def _df_append_right(self, df_or_s):
    start('appending to the right. note, this is a destructive operation')
    if type(df_or_s) is sparse.coo.coo_matrix:
        self_sparse = None
        for c in self.columns:
            debug('\tappending column: ' + c)
            c_coo = sparse.coo_matrix(self[[c]])
            self.drop([c], 1, inplace=True)
            gc.collect()
            if self_sparse is None:
                self_sparse = c_coo
            else:
                self_sparse = sparse.hstack((self_sparse, c_coo))
        self_sparse = sparse.hstack((self_sparse, df_or_s))
        stop('done appending to the right')
        return self_sparse
    elif _is_sparse(df_or_s) and not _is_sparse(self):
        debug('converting data frame to a sparse frame')
        self = self.to_sparse(fill_value=0)
    if type(df_or_s) is pd.Series:
        self[df_or_s.name] = df_or_s.values
    else:
        if type(df_or_s) is pd.DataFrame:
            columns = df_or_s.columns
            right = df_or_s.values
        else:
            columns = [repr(i) + '_2' for i in range(df_or_s.shape[1])]
            right = df_or_s
def pywfmPredictModel(trainFeature, testFeature, trainLabel, trainIndex, testIndex, fm):
    print('run online!')
    trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
    encoder = OneHotEncoder(n_values=[value1, value2])
    trainIndex_encode = encoder.fit_transform(trainIndex)
    testIndex_encode = encoder.transform(testIndex)
    trainFeature = hstack((trainIndex_encode, trainFeature))
    testFeature = hstack((testIndex_encode, testFeature))
    # print(trainFeature)
    '''
    for i in range(len(trainLabel)):
        if i == 0:
            trainLabel[i] = -1
    for i in range(len(testLabel)):
        if i == 0:
            testLabel[i] = -1
    '''
    testLabel = np.zeros((testFeature.shape[0]))
    model = fm.run(trainFeature, trainLabel, testFeature, testLabel)
    predict = model.predictions
    predict = np.array(predict, np.float64)
    print(np.max(predict), np.min(predict))
    # predict = (predict - np.min(predict)) / (np.max(predict) - np.min(predict))
    return predict
def extract_features(data, train_size, with_stemmer, tfidf):
    text_vector, post_time_vector, posting_user_vector = preprocess_posts(data, with_stemmer=with_stemmer)
    label_vector = extract_labels(data)

    text_vector_train, text_vector_test, posting_user_vector_train, posting_user_vector_test, \
        post_time_vector_train, post_time_vector_test, label_vector_train, label_vector_test = \
        train_test_split(text_vector, posting_user_vector, post_time_vector, label_vector,
                         train_size=train_size)

    print("Extracting features...")
    vectorizer = _vectorizer(tfidf)
    le = preprocessing.LabelEncoder()
    le.fit(posting_user_vector)

    from scipy.sparse import csr_matrix, hstack

    def prepare_matrix(vector):
        return np.transpose(csr_matrix(vector))

    features_vector_train = hstack((prepare_matrix(le.transform(posting_user_vector_train)),
                                    prepare_matrix(post_time_vector_train),
                                    vectorizer.fit_transform(text_vector_train)))
    print(vectorizer.get_feature_names())
    features_vector_test = hstack((prepare_matrix(le.transform(posting_user_vector_test)),
                                   prepare_matrix(post_time_vector_test),
                                   vectorizer.transform(text_vector_test)))

    scaler = preprocessing.StandardScaler(with_mean=False).fit(features_vector_train)
    features_vector_train = scaler.transform(features_vector_train)
    features_vector_test = scaler.transform(features_vector_test)
    return features_vector_train, features_vector_test, label_vector_train, label_vector_test
def plain_impeuler(Mc, Ac, BTc, Bc, fvbc, fpbc, vp_init, PrP, TsP):
    Nts, t0, tE, dt, Nv, Np = init_time_stepping(PrP, TsP)

    v, p = expand_vp_dolfunc(PrP, vp=vp_init, vc=None, pc=None)

    tcur = t0

    TsP.UpFiles.u_file << v, tcur
    TsP.UpFiles.p_file << p, tcur

    IterAv = sps.hstack([Mc + dt * Ac, -dt * BTc])
    IterAp = sps.hstack([-dt * Bc, sps.csr_matrix((Np, Np))])
    IterA = sps.vstack([IterAv, IterAp]).todense()[:-1, :-1]

    vp_old = vp_init
    for etap in range(1, 11):
        for i in range(Nts // 10):
            tcur = tcur + dt
            Iterrhs = np.vstack([Mc * vp_old[:Nv, ], np.zeros((Np - 1, 1))]) \
                + dt * np.vstack([fvbc, fpbc[:-1, ]])
            vp_new = np.linalg.solve(IterA, Iterrhs)
            vp_old = vp_new
        print('%d of %d time steps completed ' % (etap * Nts // 10, Nts))

        v, p = expand_vp_dolfunc(PrP, vp=vp_new, vc=None, pc=None)
        TsP.UpFiles.u_file << v, tcur
        TsP.UpFiles.p_file << p, tcur

    return
def checkZeroEig(self, X, vecList):
    # eigenvalues and eigenvectors
    LDAeigens, LDAeigenvecs = np.linalg.eig(X)
    LDAeigens, LDAeigenvecs = abs(LDAeigens), abs(LDAeigenvecs)
    # remove zero eigenvalues, corresponding eigenvectors, vecList-cols and X-cols
    marked = []
    for i in range(self.m):
        if LDAeigens[i] == 0:
            marked.append(i)
    LDAeigens = np.delete(LDAeigens, marked, 0)
    LDAeigenvecs = np.delete(LDAeigenvecs, marked, 0)
    if len(marked) > 0:
        print(' empty eigenvalues:')
        print(marked)
    stackvec = sparse.csc_matrix((vecList.shape[0], 1))
    stackX = sparse.csc_matrix((vecList.shape[0], 1))
    for col in range(vecList.shape[1]):
        if col in marked:
            pass
        else:
            # sparse.hstack expects a single sequence of blocks
            stackvec = sparse.hstack((stackvec, vecList.getcol(col)))
            stackX = sparse.hstack((stackX, X.getcol(col)))
    vecList, X = stackvec, stackX
    return vecList, X
def run(input_train, input_test, output_name):
    """
    Takes a file path as input, a file path as output, and produces a sorted csv of
    item IDs for Kaggle submission

    -------
    input_train : 'full path of the training file'
    input_test : 'full path of the testing file'
    output_name : 'full path of the output file'
    """
    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    testdummies = sparse.csc_matrix(pretestdummies.drop(['Растения', 'Товары для компьютера'], axis=1))
    words = np.array(data.description, str)
    testwords = np.array(test.description, str)
    del data, test
    vect = text.CountVectorizer(decode_error=u'ignore', strip_accents='unicode', ngram_range=(1, 2))
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)
    counts = vect.transform(words)
    features = sparse.hstack((dummies, counts))
    clf = LinearSVC()
    clf.fit(features, response)
    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies, testcounts))
    # LinearSVC has no predict_proba; rank items by the decision function instead
    predicted_scores = clf.decision_function(testFeatures)
    f = open(output_name, 'w')
    f.write("id\n")
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
        f.write("%d\n" % (item_id))
    f.close()
def predict_proba(self, X):
    """Predict probabilities of label assignments for X

    Parameters
    ----------
    X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
        input feature matrix

    Returns
    -------
    :mod:`scipy.sparse` matrix of `float in [0.0, 1.0]`, shape=(n_samples, n_labels)
        matrix with label assignment probabilities
    """
    X_extended = self._ensure_input_format(
        X, sparse_format='csc', enforce_sparse=True)

    results = []
    for label in self._order():
        prediction = self.classifiers_[label].predict(
            self._ensure_input_format(X_extended))

        prediction = self._ensure_output_format(
            prediction, sparse_format='csc', enforce_sparse=True)

        prediction_proba = self.classifiers_[label].predict_proba(
            self._ensure_input_format(X_extended))

        prediction_proba = self._ensure_output_format(
            prediction_proba, sparse_format='csc', enforce_sparse=True)[:, 1]

        X_extended = hstack([X_extended, prediction]).tocsc()
        results.append(prediction_proba)

    return hstack(results)
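# A usage sketch, assuming this method lives on a scikit-multilearn-style
# ClassifierChain (the class path and constructor keyword below follow
# skmultilearn's API and are an assumption here, as is the toy dataset):
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import ClassifierChain

X, y = make_multilabel_classification(n_samples=100, n_labels=3, random_state=0)
chain = ClassifierChain(classifier=LogisticRegression(max_iter=1000))
chain.fit(X, y)
proba = chain.predict_proba(X)  # sparse (n_samples, n_labels) matrix of probabilities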
def go(self, K=100, Y=6, DI=500, minFreq=5):
    print(self._sourceDomain + " -> " + self._targetDomain)
    domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI, minFreq)
    numDomainIndep = len(domainIndependentFeatures)
    numDomainDep = len(domainDependentFeatures)
    # print("number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep))
    # print("creating cooccurrenceMatrix...")
    a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
    # print("creating SquareAffinityMatrix...")
    a = self._createSquareAffinityMatrix(a)
    # print("creating DiagonalMatrix...")
    b = self._createDiagonalMatrix(a)
    # print("multiplying...")
    c = b.dot(a)
    del a
    c = c.dot(b)
    del b
    # print("calculating eigenvalues and eigenvectors")
    eigenValues, eigenVectors = eigsh(c, k=K, which="LA")
    del c
    # print("building document vectors...")
    documentVectorsTraining, classificationsTraining = self._createDocumentVectors(
        domainDependentFeatures, domainIndependentFeatures, self._sourceDomain)
    documentVectorsTesting, classificationsTesting = self._createDocumentVectors(
        domainDependentFeatures, domainIndependentFeatures, self._targetDomain)
    # print("training and testing...")
    U = [eigenVectors[:, x].reshape(np.size(eigenVectors, 0), 1) for x in eigenValues.argsort()[::-1]]
    U = np.concatenate(U, axis=1)[:numDomainDep]
    U = sparse.csr_matrix(U)
    clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
    trainingVectors = [sparse.hstack((documentVectorsTraining[x][0], documentVectorsTraining[x][1], clustering[x]))
                       for x in range(np.size(documentVectorsTraining, axis=0))]
    clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
    testVectors = [sparse.hstack((documentVectorsTesting[x][0], documentVectorsTesting[x][1], clustering[x]))
                   for x in range(np.size(documentVectorsTesting, axis=0))]
    self._trainClassifier(trainingVectors, classificationsTraining)
    print("accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" %
          (self._testClassifier(testVectors, classificationsTesting) * 100, K, DI, Y, minFreq))
def solve_system(self, rhs, factor, u0, t):
    """
    Simple linear solver for (I-dtA)u = rhs

    Args:
        rhs: right-hand side for the nonlinear system
        factor: abbrev. for the node-to-node stepsize (or any other factor required)
        u0: initial guess for the iterative solver (not used here so far)
        t: current time (e.g. for time-dependent BCs)

    Returns:
        solution as mesh
    """
    M1 = sp.hstack((sp.eye(self.nvars[1]), -factor * self.A))
    M2 = sp.hstack((-factor * self.A, sp.eye(self.nvars[1])))
    M = sp.vstack((M1, M2))

    b = np.concatenate((rhs.values[0, :], rhs.values[1, :]))

    sol = LA.spsolve(M, b)

    me = mesh(self.nvars)
    me.values[0, :], me.values[1, :] = np.split(sol, 2)

    return me
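# A standalone sketch of the same coupled block solve, with a toy tridiagonal
# matrix standing in for self.A (the size n and the factor are made up):
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as LA

n, factor = 8, 0.1
A = sp.diags([1.0, -2.0, 1.0], [-1, 0, 1], shape=(n, n), format='csr')
M = sp.vstack((sp.hstack((sp.eye(n), -factor * A)),
               sp.hstack((-factor * A, sp.eye(n))))).tocsr()
b = np.ones(2 * n)
u0, u1 = np.split(LA.spsolve(M, b), 2)  # the two solution components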
def dSdm(self):
    if getattr(self, '_dSdm', None) is None:
        if self.model is None:
            raise Exception('Requires a chi')

        nC = int(len(self.model) / 3)

        m_xyz = self.chiMap * matutils.spherical2cartesian(self.model.reshape((nC, 3), order='F'))

        nC = int(m_xyz.shape[0] / 3.)
        m_atp = matutils.cartesian2spherical(m_xyz.reshape((nC, 3), order='F'))

        a = m_atp[:nC]
        t = m_atp[nC:2 * nC]
        p = m_atp[2 * nC:]

        Sx = sp.hstack([sp.diags(np.cos(t) * np.cos(p), 0),
                        sp.diags(-a * np.sin(t) * np.cos(p), 0),
                        sp.diags(-a * np.cos(t) * np.sin(p), 0)])

        Sy = sp.hstack([sp.diags(np.cos(t) * np.sin(p), 0),
                        sp.diags(-a * np.sin(t) * np.sin(p), 0),
                        sp.diags(a * np.cos(t) * np.cos(p), 0)])

        Sz = sp.hstack([sp.diags(np.sin(t), 0),
                        sp.diags(a * np.cos(t), 0),
                        sp.csr_matrix((nC, nC))])

        self._dSdm = sp.vstack([Sx, Sy, Sz])

    return self._dSdm
def combine_matrix():
    X000 = [sio.loadmat(filein_name[:-4] + '0X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '1X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '2X000.mat')['X000']]
    X001 = [sio.loadmat(filein_name[:-4] + '0X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '1X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '2X001.mat')['X001']]
    X010 = [sio.loadmat(filein_name[:-4] + '0X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '1X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '2X010.mat')['X010']]
    X100 = [sio.loadmat(filein_name[:-4] + '0X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '1X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '2X100.mat')['X100']]

    X_000 = sp.vstack([X000[0], X000[1], X000[2]])
    X_001 = sp.vstack([X001[0], X001[1], X001[2]])
    X_010 = sp.vstack([X010[0], X010[1], X010[2]])
    X_100 = sp.vstack([X100[0], X100[1], X100[2]])
    print(X_000.shape)

    X_model_100 = sp.hstack([X_000, X_100])
    sio.savemat(filein_name[:-4] + 'X100-model.mat', {'X100': X_model_100})
    X_model_010 = sp.hstack([X_000, X_010])
    sio.savemat(filein_name[:-4] + 'X010-model.mat', {'X010': X_model_010})
    X_model_001 = sp.hstack([X_000, X_001])
    sio.savemat(filein_name[:-4] + 'X001-model.mat', {'X001': X_model_001})
def sentiment_kaggle_dataset():
    train_x_1, test_x_1 = senti_lexicon_vectorizor(data=SST_KAGGLE, tfidf=True)
    train_x_2, test_x_2 = senti_wordnet_vectorizer(data=SST_KAGGLE, tfidf=True)

    train_x = sparse.hstack((train_x_1, train_x_2))
    test_x = sparse.hstack((test_x_1, test_x_2))

    _, train_y, _ = read_sst_kaggle_pickle()

    return train_x, train_y, test_x
def test_sadpnt_smw_krypy(self):
    """check the sadpnt solver with krypy"""
    umat, vmat, k = self.U, self.V, self.k
    # self.Jt = self.J.T

    # check the formula
    AuvInvZ = lau.solve_sadpnt_smw(amat=self.A, jmat=self.J, rhsv=self.Z,
                                   jmatT=self.Jt, umat=self.U, vmat=self.V,
                                   krylov=True, krpslvprms=self.krpslvprms)

    sysm1 = sps.hstack([self.A, self.Jt], format='csr')
    sysm2 = sps.hstack([self.J, sps.csr_matrix((k, k))], format='csr')
    mata = sps.vstack([sysm1, sysm2], format='csr')

    umate = np.vstack([umat, np.zeros((k, umat.shape[1]))])
    vmate = np.hstack([vmat, np.zeros((vmat.shape[0], k))])

    ze = np.vstack([self.Z, np.zeros((k, self.Z.shape[1]))])

    AAinvZ = mata * AuvInvZ - np.dot(umate, np.dot(vmate, AuvInvZ))

    # likely to fail because of ill conditioned rand mats
    print(np.linalg.norm(AAinvZ - ze))

    self.assertTrue(np.allclose(AAinvZ, ze),
                    msg='likely to fail because of ill cond')
# same concept as above, but at the character level
char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                  strip_accents='unicode',
                                  analyzer='char',
                                  stop_words='english',
                                  ngram_range=(1, 6),
                                  max_features=100000)

# fit the vectorizer to all text (so that all ngrams are observed)
# generate testing and training features using the fitted vectorizer
char_vectorizer.fit(comb_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# generate training and testing features using word and char features
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

# empty scores list and predictions dataframe
scores = []
pred = pd.DataFrame.from_dict({'id': test['id']})

# loop through each class, train the ridge model, and make predictions
for class_name in classes:
    train_target = train[class_name]
    classifier = Ridge(alpha=20, fit_intercept=True, solver='auto',
                       max_iter=100, random_state=0, tol=0.0025)
def run(self):
    """ Solves a state estimation problem.
    """
    case = self.case
    baseMVA = case.base_mva
    buses = self.case.connected_buses
    branches = case.online_branches
    generators = case.online_generators
    meas = self.measurements

    # Update indices.
    self.case.index_buses()
    self.case.index_branches()

    # Index buses.
    # ref = [b._i for b in buses if b.type == REFERENCE]
    pv = [b._i for b in buses if b.type == PV]
    pq = [b._i for b in buses if b.type == PQ]

    # Build admittance matrices.
    Ybus, Yf, Yt = case.Y

    # Prepare initial guess.
    V0 = self.getV0(self.v_mag_guess, buses, generators)

    # Start the clock.
    t0 = time()

    # Initialise SE.
    converged = False
    i = 0
    V = V0
    Va = angle(V0)
    Vm = abs(V0)

    nb = Ybus.shape[0]
    f = [b.from_bus._i for b in branches]
    t = [b.to_bus._i for b in branches]
    nonref = pv + pq

    # Form measurement vector.
    z = array([m.value for m in meas])

    # Form measurement index vectors.
    idx_zPf = [m.b_or_l._i for m in meas if m.type == PF]
    idx_zPt = [m.b_or_l._i for m in meas if m.type == PT]
    idx_zQf = [m.b_or_l._i for m in meas if m.type == QF]
    idx_zQt = [m.b_or_l._i for m in meas if m.type == QT]
    idx_zPg = [m.b_or_l._i for m in meas if m.type == PG]
    idx_zQg = [m.b_or_l._i for m in meas if m.type == QG]
    idx_zVm = [m.b_or_l._i for m in meas if m.type == VM]
    idx_zVa = [m.b_or_l._i for m in meas if m.type == VA]

    def col(seq):
        return [[k] for k in seq]

    # Create inverse of covariance matrix with all measurements.
    # full_scale = 30
    # sigma = [
    #     0.02 * abs(Sf) + 0.0052 * full_scale * ones(nbr,1),
    #     0.02 * abs(St) + 0.0052 * full_scale * ones(nbr,1),
    #     0.02 * abs(Sbus) + 0.0052 * full_scale * ones(nb,1),
    #     0.2 * pi/180 * 3*ones(nb,1),
    #     0.02 * abs(Sf) + 0.0052 * full_scale * ones(nbr,1),
    #     0.02 * abs(St) + 0.0052 * full_scale * ones(nbr,1),
    #     0.02 * abs(Sbus) + 0.0052 * full_scale * ones(nb,1),
    #     0.02 * abs(V0) + 0.0052 * 1.1 * ones(nb,1),
    # ] ./ 3

    # Get R inverse matrix.
    sigma_vector = r_[self.sigma[0] * ones(len(idx_zPf)),
                      self.sigma[1] * ones(len(idx_zPt)),
                      self.sigma[2] * ones(len(idx_zQf)),
                      self.sigma[3] * ones(len(idx_zQt)),
                      self.sigma[4] * ones(len(idx_zPg)),
                      self.sigma[5] * ones(len(idx_zQg)),
                      self.sigma[6] * ones(len(idx_zVm)),
                      self.sigma[7] * ones(len(idx_zVa))]
    sigma_squared = sigma_vector**2

    rsig = range(len(sigma_squared))
    Rinv = csr_matrix((1.0 / sigma_squared, (rsig, rsig)))

    # Do Newton iterations.
    while (not converged) and (i < self.max_iter):
        i += 1

        # Compute estimated measurement.
        Sfe = V[f] * conj(Yf * V)
        Ste = V[t] * conj(Yt * V)

        # Compute net injection at generator buses.
        gbus = [g.bus._i for g in generators]
        Sgbus = V[gbus] * conj(Ybus[gbus, :] * V)
        # inj S + local Sd
        Sd = array([complex(b.p_demand, b.q_demand) for b in buses])
        Sgen = (Sgbus * baseMVA + Sd) / baseMVA

        z_est = r_[Sfe[idx_zPf].real,
                   Ste[idx_zPt].real,
                   Sfe[idx_zQf].imag,
                   Ste[idx_zQt].imag,
                   Sgen[idx_zPg].real,
                   Sgen[idx_zQg].imag,
                   abs(V[idx_zVm]),
                   angle(V[idx_zVa])]

        # Get H matrix.
        dSbus_dVm, dSbus_dVa = case.dSbus_dV(Ybus, V)
        dSf_dVa, dSf_dVm, dSt_dVa, dSt_dVm, _, _ = case.dSbr_dV(Yf, Yt, V)

        # Get sub-matrix of H relating to line flow.
        dPF_dVa = dSf_dVa.real  # from end
        dQF_dVa = dSf_dVa.imag
        dPF_dVm = dSf_dVm.real
        dQF_dVm = dSf_dVm.imag
        dPT_dVa = dSt_dVa.real  # to end
        dQT_dVa = dSt_dVa.imag
        dPT_dVm = dSt_dVm.real
        dQT_dVm = dSt_dVm.imag
        # Get sub-matrix of H relating to generator output.
        dPG_dVa = dSbus_dVa[gbus, :].real
        dQG_dVa = dSbus_dVa[gbus, :].imag
        dPG_dVm = dSbus_dVm[gbus, :].real
        dQG_dVm = dSbus_dVm[gbus, :].imag
        # Get sub-matrix of H relating to voltage angle.
        dVa_dVa = csr_matrix((ones(nb), (range(nb), range(nb))))
        dVa_dVm = csr_matrix((nb, nb))
        # Get sub-matrix of H relating to voltage magnitude.
        dVm_dVa = csr_matrix((nb, nb))
        dVm_dVm = csr_matrix((ones(nb), (range(nb), range(nb))))

        h = [(col(idx_zPf), dPF_dVa, dPF_dVm),
             (col(idx_zQf), dQF_dVa, dQF_dVm),
             (col(idx_zPt), dPT_dVa, dPT_dVm),
             (col(idx_zQt), dQT_dVa, dQT_dVm),
             (col(idx_zPg), dPG_dVa, dPG_dVm),
             (col(idx_zQg), dQG_dVa, dQG_dVm),
             (col(idx_zVm), dVm_dVa, dVm_dVm),
             (col(idx_zVa), dVa_dVa, dVa_dVm)]

        H = vstack([hstack([dVa[idx, nonref], dVm[idx, nonref]])
                    for idx, dVa, dVm in h if len(idx) > 0])

        # Compute update step.
        J = H.T * Rinv * H
        F = H.T * Rinv * (z - z_est)  # evaluate F(x)
        dx = spsolve(J, F)

        # Check for convergence.
        normF = linalg.norm(F, Inf)

        if self.verbose:
            logger.info("Iteration [%d]: Norm of mismatch: %.3f" % (i, normF))
        if normF < self.tolerance:
            converged = True

        # Update voltage.
        npvpq = len(nonref)
        Va[nonref] = Va[nonref] + dx[:npvpq]
        Vm[nonref] = Vm[nonref] + dx[npvpq:2 * npvpq]
        V = Vm * exp(1j * Va)
        Va = angle(V)
        Vm = abs(V)

    # Weighted sum squares of error.
    error_sqrsum = sum((z - z_est)**2 / sigma_squared)

    # Update case with solution.
    case.pf_solution(Ybus, Yf, Yt, V)

    # Stop the clock.
    elapsed = time() - t0

    if self.verbose and converged:
        print("State estimation converged in: %.3fs (%d iterations)" %
              (elapsed, i))

    # self.output_solution(sys.stdout, z, z_est)

    solution = {"V": V, "converged": converged, "iterations": i,
                "z": z, "z_est": z_est, "error_sqrsum": error_sqrsum,
                "elapsed": elapsed}

    return solution
#%%
scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1, 1))

new_feat_train['year_month_scaled'] = scaler.transform(
    new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_test['year_month_scaled'] = scaler.transform(
    new_feat_test['year_month'].values.reshape(-1, 1))

#%%
# Compute ROC AUC on the hold-out set
X_train_sparse_new = csr_matrix(
    hstack([
        X_train_sparse,
        new_feat_train['year_month_scaled'].values.reshape(-1, 1)
    ]))

get_auc_lr_valid(X_train_sparse_new, y_train)

#%%
# Month the session started
new_feat_train['start_month'] = train_df['time1'].apply(lambda x: x.month)
new_feat_test['start_month'] = test_df['time1'].apply(lambda x: x.month)

# Hour the session started
new_feat_train['start_hour'] = train_df['time1'].apply(lambda x: x.hour)
new_feat_test['start_hour'] = test_df['time1'].apply(lambda x: x.hour)

# Morning: the session starts at hour 11 or earlier
new_feat_train['morning'] = new_feat_train['start_hour'] <= 11
new_feat_test['morning'] = new_feat_test['start_hour'] <= 11
train_x = np.hstack((train_x, ct_trains))
test_x = np.hstack((test_x, ct_tests))

# One-hot encode the categorical features
enc = OneHotEncoder()
oc_encoder = OneHotEncoder()
print("onehot start")
f.write("onehot start")
f.flush()
for feature in tqdm(one_hot_feature):
    oc_encoder.fit(data[feature].values.reshape(-1, 1))
    train_a = oc_encoder.transform(train[feature].values.reshape(-1, 1))
    test_a = oc_encoder.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')
f.write("one-hot prepared !")
f.flush()

# Vectorize the count features
ct_encoder = CountVectorizer(min_df=0.001, tokenizer=str.split)  # pass a tokenizer function
print("CV start")
f.write("CV start")
f.flush()
for feature in tqdm(vector_feature):
    ct_encoder.fit(data[feature])
    train_a = ct_encoder.transform(train[feature])
    test_a = ct_encoder.transform(test[feature])
def undirected_grid_2d_bipartie_graph(m, n, d=1, r=1, centers_d=None,
                                      centers_s=None, num_of_centers_supply=None,
                                      num_of_centers_demand=None, l_norm=np.inf,
                                      plot=False, alpha=1.0):
    g = nx.empty_graph(0, None)
    rows = range(m)
    columns = range(n)
    # adding all the nodes
    g.add_nodes_from((i, j) for i in rows for j in columns)
    # adding all the edges
    for k in range(d + 1):
        for l in range(d + 1):
            g.add_edges_from(((i, j), (i + l, j + k))
                             for i in rows for j in columns
                             if i + l <= m - 1 and j + k <= n - 1
                             and norm(np.array([l, k]), l_norm) <= r)
    nodes = list(g.nodes())
    if centers_d is None:
        if num_of_centers_demand is None:
            num_of_centers_demand = int(0.5 * ((m * n)**0.5))
        centers_d = [((uniform(0.2 * m, 0.8 * m), uniform(0.2 * n, 0.8 * n)), uniform(0, 1))
                     for _ in range(num_of_centers_demand)]
    if centers_s is None:
        if num_of_centers_supply is None:
            num_of_centers_supply = int(0.5 * ((m * n)**0.5))
        centers_s = [((uniform(0.2 * m, 0.8 * m), uniform(0.2 * n, 0.8 * n)), uniform(0, 1))
                     for _ in range(num_of_centers_supply)]
    lamda_d_pdf = gaussian_pdf_2d(m, n, centers_d, normalize=True)
    lamda_d = np.array([lamda_d_pdf[node] for node in nodes])
    lamda_s_pdf = alpha * gaussian_pdf_2d(m, n, centers_s, normalize=True) + (1 - alpha) * lamda_d_pdf
    lamda_s = np.array([lamda_s_pdf[node] for node in nodes])
    lamda = np.concatenate((lamda_d, lamda_s))
    grid_adj_mat = nx.adjacency_matrix(g)
    layered_grid_adj_mat = sps.vstack(
        (sps.hstack((0 * sps.eye(m * n), grid_adj_mat)),
         sps.hstack((grid_adj_mat, 0 * sps.eye(m * n)))))
    nodes = dict(
        enumerate(
            list(zip(nodes, ['d'] * len(nodes))) +
            list(zip(nodes, ['s'] * len(nodes)))))
    workload_decomp = grid_workload_decomposition(lamda, layered_grid_adj_mat)
    max_workload = workload_decomp[0]['workload']
    lamda_s_pdf = lamda_s_pdf * max_workload
    lamda_s = lamda_s * max_workload
    for s in workload_decomp:
        workload_decomp[s]['workload'] = workload_decomp[s]['workload'] / max_workload
    supply_decomp = np.zeros((m, n))
    demand_decomp = np.zeros((m, n))
    for st in workload_decomp:
        wl = workload_decomp[st]['workload']
        for d in workload_decomp[st]['demnand_nodes']:
            demand_decomp[nodes[d][0]] = wl
        for s in workload_decomp[st]['supply_nodes']:
            supply_decomp[nodes[s][0]] = wl
    return dict(
        zip([
            'lamda_s', 'lamda_d', 'grid_adj_mat', 'nodes', 'lamda_d_pdf',
            'lamda_s_pdf', 'demand_decomp', 'supply_decomp', 'fifo_ct',
            'max_ent_workload'
        ], [
            lamda_s, lamda_d, grid_adj_mat, nodes, lamda_d_pdf,
            lamda_s_pdf, demand_decomp, supply_decomp
        ]))
    left_index=True, right_index=True).merge(
        gatest[['testrow']], how='left',
        left_index=True, right_index=True).reset_index())

d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
                       shape=(gatrain.shape[0], nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
                       shape=(gatest.shape[0], nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))

# concatenate all features
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')

scipy.io.mmwrite('Xtrain.mtx', Xtrain)
Xtrain = scipy.io.mmread("Xtrain.mtx")
scipy.io.mmwrite('Xtest.mtx', Xtest)
Xtest = scipy.io.mmread("Xtest.mtx")
# print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))

# cross validation
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)  # encode the group label for every row of gatrain
nclasses = len(targetencoder.classes_)
del data_postive
del data
gc.collect()

data_negative_x = data_negative[['creativeSize']]

# Sparse-encode the negative samples
mprint('onehot_trans begin')
for feature in one_hot_feature:
    enc = OneHotEncoder()
    tmp_enc = enc.fit_transform(data_negative[feature].values.reshape(-1, 1))
    # enc.fit(data_negative[feature].values.reshape(-1, 1))
    mprint(enc.n_values_, 'feature:%s enc.n_values_' % (feature))
    # tmp_enc = enc.transform(data_negative[feature].values.reshape(-1, 1))
    data_negative_x = sparse.hstack((data_negative_x, tmp_enc))
    del tmp_enc
    data_negative = data_negative.drop(feature, axis=1)
    gc.collect()
    mprint(mem_usage(data_negative), 'mem_usage(data_negative) after onehot_trans %s' % (feature))
    mprint('feature:%s one-hot finished!' % (feature))
mprint('onehot_trans prepared !')

mprint('countvec_trans begin')
for feature in vector_feature:
def get_Le_Ln(self, S=1, return_dxi_deta=False, return_sparse=False):
    """
    Calculate the matrix that produces the derivative in the
    eastward and northward directions of a scalar field
    defined on self

    Set return_dxi_deta to True to return the matrices that
    differentiate in cubed sphere coordinates instead of geo

    Parameters:
    -----------
    S: int, optional
        Stencil size. Default is 1, in which case derivatives will be
        calculated with a 3-point stencil. With S = 2, a 5-point
        stencil will be used. etc.
    return_dxi_deta: bool, optional
        Set to True if you want matrices that differentiate in the
        xi / eta directions instead of east / north
    return_sparse: bool, optional
        Set to True if you want scipy.sparse matrices instead of
        dense numpy arrays
    """
    dxi = self.dxi
    det = self.deta
    N = self.NL
    M = self.NW

    D_xi = {'rows': [], 'cols': [], 'elements': []}
    D_et = {'rows': [], 'cols': [], 'elements': []}

    # index arrays (0 to N, M)
    i_arr = np.arange(N)
    j_arr = np.arange(M)

    # meshgrid versions:
    ii, jj = np.meshgrid(i_arr, j_arr, indexing='xy')

    # inner grid points:
    points = np.r_[-S:S + 1:1]
    coefficients = diffutils.stencil(points, order=1)

    i_dx, j_dx = ii[:, S:-S], jj[:, S:-S]
    i_dy, j_dy = ii.T[:, S:-S], jj.T[:, S:-S]
    for ll in range(len(points)):
        D_et['rows'].append(self._index(i_dx, j_dx))
        D_et['cols'].append(self._index(i_dx + points[ll], j_dx))
        D_et['elements'].append(np.full(i_dx.size, coefficients[ll] / det))

        D_xi['rows'].append(self._index(i_dy, j_dy))
        D_xi['cols'].append(self._index(i_dy, j_dy + points[ll]))
        D_xi['elements'].append(np.full(i_dy.size, coefficients[ll] / dxi))

    # boundaries
    for kk in np.arange(0, S)[::-1]:
        # LEFT
        points = np.r_[-kk:S + 1:1]
        coefficients = diffutils.stencil(points, order=1)
        i_dx, j_dx = ii[:, kk], jj[:, kk]
        i_dy, j_dy = ii.T[:, kk], jj.T[:, kk]
        for ll in range(len(points)):
            D_et['rows'].append(self._index(i_dx, j_dx))
            D_et['cols'].append(self._index(i_dx + points[ll], j_dx))
            D_et['elements'].append(np.full(i_dx.size, coefficients[ll] / det))

            D_xi['rows'].append(self._index(i_dy, j_dy))
            D_xi['cols'].append(self._index(i_dy, j_dy + points[ll]))
            D_xi['elements'].append(np.full(i_dy.size, coefficients[ll] / dxi))

        # RIGHT
        points = np.r_[-S:kk + 1:1]
        coefficients = diffutils.stencil(points, order=1)
        i_dx, j_dx = ii[:, -(kk + 1)], jj[:, -(kk + 1)]
        i_dy, j_dy = ii.T[:, -(kk + 1)], jj.T[:, -(kk + 1)]
        for ll in range(len(points)):
            D_et['rows'].append(self._index(i_dx, j_dx))
            D_et['cols'].append(self._index(i_dx + points[ll], j_dx))
            D_et['elements'].append(np.full(i_dx.size, coefficients[ll] / det))

            D_xi['rows'].append(self._index(i_dy, j_dy))
            D_xi['cols'].append(self._index(i_dy, j_dy + points[ll]))
            D_xi['elements'].append(np.full(i_dy.size, coefficients[ll] / dxi))

    D_xi = {key: np.hstack(D_xi[key]) for key in D_xi.keys()}
    D_et = {key: np.hstack(D_et[key]) for key in D_et.keys()}

    D_xi = sparse.csc_matrix((D_xi['elements'], (D_xi['rows'], D_xi['cols'])), shape=(N * M, N * M))
    D_et = sparse.csc_matrix((D_et['elements'], (D_et['rows'], D_et['cols'])), shape=(N * M, N * M))

    if return_dxi_deta:
        if return_sparse:
            return D_xi, D_et
        else:
            return np.array(D_xi.todense()), np.array(D_et.todense())

    # convert to gradient components
    X = self.X.flatten().reshape((1, -1))
    Y = self.Y.flatten().reshape((1, -1))
    D = self.D.flatten().reshape((1, -1))
    C = self.C.flatten().reshape((1, -1))
    d = self.delta.flatten().reshape((1, -1))
    I = sparse.eye(self.size)

    # equation 21 of Ronchi et al.
    L_xi = (D_xi.multiply(D) + D_et.multiply(X * Y / D)) / self.R
    L_et = (D_xi.multiply(X * Y / C) + D_et.multiply(C)) / self.R

    dd = np.sqrt(d - 1)

    # conversion from xi/eta to geocentric east/west is accomplished through the
    # matrix in equation 14 of Ronchi et al.
    # The elements of this matrix are:
    a00 = D * X / dd
    a01 = -D * Y / dd / np.sqrt(d)
    a10 = C * Y / dd
    a11 = C * X / dd / np.sqrt(d)

    # The a matrix converts from local theta/phi to xi/eta. The elements of
    # the inverse are:
    det = a00 * a11 - a01 * a10
    b00 = a11 / det
    b01 = -a01 / det
    b10 = -a10 / det
    b11 = a00 / det

    # matrix that converts from xi/eta to local east/north
    Be_ = sparse.hstack((I.multiply(b00), I.multiply(b01)))
    Bn_ = sparse.hstack((I.multiply(b10), I.multiply(b11)))

    # Make rotation matrix from local east/north to geocentric east/south:
    R_l2g = self.projection.local2geo_enu_rotation(self.local_lon.flatten(), self.local_lat.flatten())
    r10 = -R_l2g[:, 0, 0].reshape((1, -1))
    r11 = -R_l2g[:, 0, 1].reshape((1, -1))
    r00 = R_l2g[:, 1, 0].reshape((1, -1))
    r01 = R_l2g[:, 1, 1].reshape((1, -1))
    Re = sparse.hstack((I.multiply(r00), I.multiply(r01)))
    Rn = sparse.hstack((I.multiply(r10), I.multiply(r11)))
    # where I switched the order of the rows and multiplied the first row by -1
    # so that R acts on (south/east) instead of (east/north).

    # combine all three operations: differentiation of xi/eta, conversion to
    # local, conversion to global
    L = sparse.vstack((Re, Rn)).dot(sparse.vstack((Be_, Bn_))).dot(sparse.vstack((L_xi, L_et)))

    # and return the upper and lower parts of L:
    Le, Ln = L[:self.size], L[self.size:]
    if return_sparse:
        return Le, Ln
    else:
        return np.array(Le.todense()), np.array(Ln.todense())
# Xts holds one hot encodings for each individual feature in memory
# speeding up feature selection
Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

print("Performing greedy feature selection...")
score_hist = []
N = 10
good_features = set([])
# Greedy feature selection loop
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print("Feature: %i Mean AUC: %f" % (f, score))
    good_features.add(sorted(scores)[-1][1])
    score_hist.append(sorted(scores)[-1])
    print("Current features: %s" % sorted(list(good_features)))

# Remove last added feature from good_features
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print("Selected features %s" % good_features)
gf = open("feats" + submit, 'w')
print(good_features, file=gf)
gf.close()

print(len(good_features), " features")
def divergence(self, S=1, return_sparse=False):
    """
    Calculate the matrix that produces the divergence of a vector field

    The returned N x 2N matrix operates on a 1D array that represents a
    vector field. The array must be of length 2N, where N is the number
    of grid cells. The first N elements are the eastward components and
    the last N are the northward components.

    Note - this code is based on equations (12) and (23) of Ronchi.
    The 'matrification' is explained in my regional data analysis
    document; it is not super easy to understand it from the code alone.

    Parameters:
    -----------
    S: int, optional
        Stencil size. Default is 1, in which case derivatives will be
        calculated with a 3-point stencil. With S = 2, a 5-point
        stencil will be used. etc.
    return_sparse: bool, optional
        Set to True if you want scipy.sparse matrices instead of
        dense numpy arrays
    """
    # 1) construct matrix that operates on [[Vxi], [Veta]] to produce
    #    the divergence of the vector field V according to equation (23)
    #    of Ronchi et al.
    # 2) construct matrix that converts from east/north to xi/eta
    #    in local coords
    # 3) construct matrix that rotates from global to local coords
    # 4) combine all three matrices and return

    # matrices that calculate differentials
    L_xi, L_eta = self.get_Le_Ln(S=S, return_dxi_deta=True, return_sparse=True)

    # define some parameters that are needed
    d = self.delta.flatten().reshape((-1, 1))
    X = self.X.flatten().reshape((-1, 1))
    Y = self.Y.flatten().reshape((-1, 1))
    D = self.D.flatten().reshape((-1, 1))
    C = self.C.flatten().reshape((-1, 1))
    xi = self.xi.flatten().reshape((-1, 1))
    eta = self.eta.flatten().reshape((-1, 1))
    R = self.R
    I = sparse.eye(xi.size)

    q1 = d / (R * D * C**2)
    q2 = -np.tan(xi) / (R * D * C**2 * np.cos(xi)**2)
    p1 = d / (R * C * D**2)
    p2 = -np.tan(eta) / (R * C * D**2 * np.cos(eta)**2)

    # matrix that calculates the divergence with xi/eta components:
    L = sparse.hstack((L_xi.multiply(q1) + I.multiply(q2),
                       L_eta.multiply(p1) + I.multiply(p2)))

    dd = np.sqrt(d - 1)
    aa = -D * Y / dd / np.sqrt(d)
    bb = -D * X / dd
    cc = C * X / dd / np.sqrt(d)
    dd = -C * Y / dd

    # matrix that rotates from east/north to xi/eta:
    R = sparse.vstack((sparse.hstack((I.multiply(aa), I.multiply(bb))),
                       sparse.hstack((I.multiply(cc), I.multiply(dd)))))

    # Combine this with the rotation matrix from geocentric east/north to local east/north:
    R_l2g = self.projection.local2geo_enu_rotation(self.local_lon.flatten(), self.local_lat.flatten())
    R_g2l = np.swapaxes(R_l2g, 1, 2)  # transpose to get rotation from geo to local

    r00 = R_g2l[:, 0, 0].reshape((1, -1))
    r01 = R_g2l[:, 0, 1].reshape((1, -1))
    r10 = R_g2l[:, 1, 0].reshape((1, -1))
    r11 = R_g2l[:, 1, 1].reshape((1, -1))
    RR = sparse.vstack((sparse.hstack((I.multiply(r00), I.multiply(r01))),
                        sparse.hstack((I.multiply(r10), I.multiply(r11)))))

    # combine the matrices so we get divergence of east/north
    D = L.dot(R.dot(RR))

    return D if return_sparse else np.array(D.todense())
xx.append(np.std(X.todense(), axis=1))
xx.append(np.std(X1.todense(), axis=1))
xx.append(np.std(X2.todense(), axis=1))
xx.append(np.std(X3.todense(), axis=1))
xx.append(np.std(X4.todense(), axis=1))
# xx.append(np.sum(sparse.hstack([X, X1, X2, X3, X4], format='csr').todense(), axis=1))
# xx.append(np.max(X.todense(), axis=1) - np.min(X.todense(), axis=1))
# xx.append(np.max(X1.todense(), axis=1) - np.min(X1.todense(), axis=1))
# xx.append(np.max(X2.todense(), axis=1) - np.min(X2.todense(), axis=1))
# xx.append(np.max(X3.todense(), axis=1) - np.min(X3.todense(), axis=1))
# xx.append(np.max(X4.todense(), axis=1) - np.min(X4.todense(), axis=1))
xx = np.hstack(xx)

import pickle

X = sparse.hstack(
    [X, X1, X2, X3, X4, xx, pickle.load(open('../explore/X2.p', 'rb'))],
    format='csr').todense()

train = pd.read_csv('../explore/train1.csv')
idname = 'id'
label = 'fault_severity'
idx = train[idname].as_matrix()
y = np.array(train[label])

X = np.hstack([X, train.drop([label, idname], axis=1).as_matrix()])
# X = np.hstack([X, train[['location', 'volume']].as_matrix()])
print(X.shape, y.shape)
yp, score = kfold_cv(X, y, 4)
print(X.shape, y.shape)
print(yp.shape)
model = load_model('best_model.hdf5')
# model = load_model('MyBidirLSTM2Layer100Dim.h5')
X = np.concatenate((leftX, rightX), axis=1)
o = model.evaluate(X, Y)[1]
print("Test Accuracy by my deep model")
print(o)

transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace",
                             vocabulary=pickle.load(open("feature.pkl", "rb")))
features = transformer.fit_transform(loaded_vec.fit_transform(np.array(doc)))

i = 0
X_test = []
while i <= (2 * len(Y) - 2):
    arr = np.zeros((3))
    l = vstack([csr_matrix(features[i, :]), csr_matrix(features[i + 1, :])])
    l1 = hstack([csr_matrix(features[i, :]), csr_matrix(features[i + 1, :])])
    # print(l.shape)
    l = l.todense()
    l1 = l1.todense()
    arr[0] = np.dot(l[0, :], l[1, :].T)
    arr[1] = sklearn.metrics.pairwise.euclidean_distances(l[0, :], l[1, :])
    arr[2] = sklearn.metrics.pairwise.manhattan_distances(l[0, :], l[1, :])
    # X.append(arr)
    X_test.append(l1)
    i = i + 2

joblib_model = joblib.load('My_TFIDF_Modelnew1.pkl')
X_test = np.array(X_test)
# print(X_test.shape)  # shape is (9824, 1, 2398) if l1 was added else (9824, 2398) if arr was added
# add the constraint that displacement at the pinned node is zero.
out = elastic3d_displacement(nodes[groups['pinned']], nodes,
                             lamb=lamb, mu=mu, n=1,
                             basis=basis, order=0)
G_xx = add_rows(G_xx, out['xx'], groups['pinned'])
G_yy = add_rows(G_yy, out['yy'], groups['pinned'])
G_zz = add_rows(G_zz, out['zz'], groups['pinned'])

# stack it all together, removing unneeded matrices as soon as possible
G_x = sp.hstack((G_xx, G_xy, G_xz))
del G_xx, G_xy, G_xz
G_y = sp.hstack((G_yx, G_yy, G_yz))
del G_yx, G_yy, G_yz
G_z = sp.hstack((G_zx, G_zy, G_zz))
del G_zx, G_zy, G_zz
G = sp.vstack((G_x, G_y, G_z))
del G_x, G_y, G_z
G = G.tocsc()
G.eliminate_zeros()

# form the right-hand-side vector
def __constrStateTransitionMatrix(self):
    k = self.__class__.k
    A = None  # state transition block matrix
    # loop over all rows (i.e. individual objs) and find all connections with other objs
    for row, i, obj in zip(self.massMatrix, range(len(self.objs)), self.objs):
        fac = 1. / (1. + obj.b1 * k)
        fac /= obj.h**2 if isinstance(obj, Resonator2D) else obj.h
        C1_total = csc_matrix((obj.Nm, obj.Nm))
        C2_total = csc_matrix((obj.Nm, obj.Nm))
        C3_total = {}
        C4_total = {}
        A_row = None
        colInds = np.nonzero(row)[0]

        # for every connection between obj q and r other objects, construct inter-connection matrices
        for j in colInds:
            cpoint_q = self.connPointMatrix[i][j]
            e_q = spdistr2D(1., cpoint_q[0], cpoint_q[1], obj.Nx - 1, obj.Ny - 1, flatten=True) \
                if isinstance(obj, Resonator2D) else spdistr1D(1., cpoint_q, obj.Nm, 'lin')

            # return the row indices of the nonzero entries in the current col we are looking in
            row_r = [ind for ind, item in
                     enumerate([self.massMatrix[q][j] for q in range(len(self.massMatrix))])
                     if item > 0]
            # remove row index of current object and, since the list must now be of size 1,
            # simply return the remaining row index
            row_r.remove(i)
            row_r = row_r[0]

            M = float(self.massMatrix[i][j]) / self.massMatrix[row_r][j]  # mass ratio: Mq/Mr
            cpoint_r = self.connPointMatrix[row_r][j]
            e_r = spdistr2D(1., cpoint_r[0], cpoint_r[1], self.objs[row_r].Nx - 1, self.objs[row_r].Ny - 1, flatten=True) \
                if isinstance(self.objs[row_r], Resonator2D) \
                else spdistr1D(1., cpoint_r, self.objs[row_r].B.shape[0], 'lin')

            c1 = fac / (e_q.T.dot(e_q)[0, 0] + M * e_r.T.dot(e_r)[0, 0])
            e_qCre_q = e_q * e_q.T
            e_qCre_r = e_q * e_r.T
            C1_total = C1_total + c1 * e_qCre_q * obj.C1
            C2_total = C2_total + c1 * e_qCre_q * obj.C2
            if row_r in C3_total:  # save to assert that when C3[row_r] is empty, C4[row_r] is empty also
                C3_total[row_r] = C3_total[row_r] - c1 * e_qCre_r * self.objs[row_r].C1
                C4_total[row_r] = C4_total[row_r] - c1 * e_qCre_r * self.objs[row_r].C2
            else:
                C3_total[row_r] = -c1 * e_qCre_r * self.objs[row_r].C1
                C4_total[row_r] = -c1 * e_qCre_r * self.objs[row_r].C2

        # construct row of A for u[n]
        for j in range(0, len(self.objs)):
            if i == j:  # we're on the diagonal
                A_row = hstack((obj.B + C1_total, obj.C + C2_total), format="lil") if A_row is None else \
                    hstack((A_row, obj.B + C1_total, obj.C + C2_total), format="lil")
            elif j in C3_total:
                A_row = hstack((C3_total[j], C4_total[j]), format="lil") if A_row is None else \
                    hstack((A_row, C3_total[j], C4_total[j]), format="lil")
            else:
                Nm2 = self.objs[j].Nm * 2
                A_row = lil_matrix((obj.Nm, Nm2)) if A_row is None else \
                    hstack((A_row, lil_matrix((obj.Nm, Nm2))))

        # construct row of A for u[n - 1]
        if i == 0:  # first object, so identity matrix is first in row
            I = hstack((identity(obj.Nm, format="lil"),
                        lil_matrix((obj.Nm, A_row.shape[1] - obj.Nm))))
        elif i == len(self.objs) - 1:  # last object, so identity matrix is penultimate to last col
            I = hstack((lil_matrix((obj.Nm, A_row.shape[1] - 2 * self.objs[-1].Nm)),
                        identity(obj.Nm, format="lil"), lil_matrix((obj.Nm, obj.Nm))))
        else:  # if any other object, calc pos of identity matrix based on grid size N of each obj
            I = hstack((lil_matrix((obj.Nm, 2 * np.sum(self.Nt[:i]))), identity(obj.Nm),
                        lil_matrix((obj.Nm, obj.Nm + 2 * np.sum(self.Nt[-(len(self.Nt) - 1 - i):])))))

        # append row to block state transition matrix A
        A = vstack((A_row, I)) if A is None else vstack((A, A_row, I))

    return A.tocsc()
#test_word_features = word_vectorizer.transform(test_text)

# char n-gram features are left commented out; only word features are used below
# (note: sublinear_tf is a TfidfVectorizer option and is ignored by CountVectorizer)
#char_vectorizer = CountVectorizer(
#    sublinear_tf=True,
#    strip_accents='unicode',
#    analyzer='char',
#    stop_words='english',
#    ngram_range=(2, 6),
#    max_features=50000)
#char_vectorizer.fit(all_text)
#train_char_features = char_vectorizer.transform(train_text)
#test_char_features = char_vectorizer.transform(test_text)

#train_features = hstack([train_char_features, train_word_features])
#train_features = hstack([train_char_features])
train_features = hstack([train_word_features])
#test_features = hstack([test_char_features, test_word_features])

scores = []
#submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='liblinear')

    # 5-fold cross-validated ROC AUC for this class
    cv_score = np.mean(
        cross_val_score(classifier,
                        train_features,
                        train_target,
                        cv=5,
                        scoring='roc_auc'))
    scores.append(cv_score)
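# The per-class AUCs above are usually condensed into one headline number;
# a minimal follow-up (assuming the loop has finished populating `scores`):
print('Total CV score is {}'.format(np.mean(scores)))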
def model(munged_train_filepath, munged_test_filepath, save_predictions_path):
    train_df = pd.read_json(munged_train_filepath)
    test_df = pd.read_json(munged_test_filepath)

    # Join multi-word feature names with underscores so the vectorizer below
    # treats each listing feature (e.g. "Hardwood Floors") as a single token
    train_df['features'] = train_df["features"].apply(
        lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
    test_df['features'] = test_df["features"].apply(
        lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
    # note: despite the variable name, this is a plain CountVectorizer, not tf-idf
    tfidf = CountVectorizer(stop_words='english', max_features=200)
    tr_sparse = tfidf.fit_transform(train_df["features"])
    te_sparse = tfidf.transform(test_df["features"])

    # which columns are currently numeric?
    numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_idx = [
        column for column in train_df.columns
        if train_df[column].dtype in numeric
    ]
    non_numeric_idx = [
        column for column in train_df.columns if column not in numeric_idx
    ]

    # separate train and test into X and y
    train_X = sparse.hstack([train_df[numeric_idx], tr_sparse]).tocsr()
    test_X = sparse.hstack([test_df[numeric_idx], te_sparse]).tocsr()

    target_num_map = {'high': 0, 'medium': 1, 'low': 2}
    train_y = np.array(
        train_df['interest_level'].apply(lambda x: target_num_map[x]))

    # function to create and run model
    def runXGB(train_X,
               train_y,
               test_X,
               test_y=None,
               feature_names=None,
               seed_val=0,
               num_rounds=1000):
        param = {}
        param['objective'] = 'multi:softprob'
        param['eta'] = 0.1
        param['max_depth'] = 6
        param['silent'] = 1
        param['num_class'] = 3
        param['eval_metric'] = "mlogloss"
        param['min_child_weight'] = 1
        param['subsample'] = 0.7
        param['colsample_bytree'] = 0.7
        param['nthread'] = 4
        param['seed'] = seed_val
        plst = list(param.items())
        xgtrain = xgb.DMatrix(train_X, label=train_y)

        if test_y is not None:
            xgtest = xgb.DMatrix(test_X, label=test_y)
            watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
            model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                              early_stopping_rounds=50)
        else:
            xgtest = xgb.DMatrix(test_X)
            model = xgb.train(plst, xgtrain, num_rounds)

        pred_test_y = model.predict(xgtest)
        return pred_test_y, model

    # Run model and export to specified filepath
    preds, model = runXGB(train_X, train_y, test_X, num_rounds=300)
    out_df = pd.DataFrame(preds)
    out_df.columns = ["high", "medium", "low"]
    out_df["listing_id"] = test_df.listing_id.values
    out_df.to_csv(save_predictions_path, index=False)
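# Quick illustration (hypothetical data) of the underscore-join above: each
# multi-word feature becomes a single token, so CountVectorizer counts
# "Hardwood Floors" as one feature instead of two unrelated words.
feats = ["Hardwood Floors", "Cats Allowed"]
print(" ".join(["_".join(f.split(" ")) for f in feats]))
# -> Hardwood_Floors Cats_Allowed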
def fit_transform(self, raw_documents, y=None): x = [vect.fit_transform(raw_documents, y) for vect in self.vectorizers] x = sparse.hstack(x) return x
def append_ones(X): if sp.issparse(X): return sp.hstack((np.ones((X.shape[0], 1)), X)).tocsr() else: return np.hstack((np.ones((X.shape[0], 1)), X))
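# Usage sketch for append_ones: the bias column of ones is prepended, and the
# sparse branch stays sparse (CSR) while the dense branch stays dense.
import numpy as np
import scipy.sparse as sp

X_dense = np.arange(6.0).reshape(3, 2)
print(append_ones(X_dense))                           # column of ones comes first
print(append_ones(sp.csr_matrix(X_dense)).toarray())  # same values, via CSR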
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    # rows where brand_name is present but does not already occur in the name
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    # prepend the brand name to the item name for those rows
    # (.loc avoids pandas' chained-assignment pitfall)
    merge.loc[ix, 'name'] = merge.loc[ix, 'brand_name'] + ' ' + merge.loc[ix, 'name']

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    '''
    Crossed columns
    '''
    # my understanding of how to replicate what layers.crossed_column does; one
    # can read more here: https://www.tensorflow.org/tutorials/linear.
def cross_columns(x_cols): """simple helper to build the crossed columns in a pandas dataframe """ crossed_columns = dict() colnames = ['_'.join(x_c) for x_c in x_cols] for cname, x_c in zip(colnames, x_cols): crossed_columns[cname] = x_c return crossed_columns merge['item_condition_id_str'] = merge['item_condition_id'].astype(str) merge['shipping_str'] = merge['shipping'].astype(str) x_cols = ( ['brand_name', 'item_condition_id_str'], ['brand_name', 'subcat_1'], ['brand_name', 'subcat_2'], ['brand_name', 'general_cat'], #['brand_name', 'subcat_1', 'item_condition_id_str'], #['brand_name', 'subcat_2', 'item_condition_id_str'], #['brand_name', 'general_cat', 'item_condition_id_str'], ['brand_name', 'shipping_str'], ['shipping_str', 'item_condition_id_str'], ['shipping_str', 'subcat_2'], ['item_condition_id_str', 'subcat_2']) crossed_columns_d = cross_columns(x_cols) categorical_columns = list(merge.select_dtypes(include=['object']).columns) D = 2**30 for k, v in crossed_columns_d.items(): print('Crossed column ', k) outls_ = [] indicator = 0 for col in v: outls_.append((np.array(merge[col].apply(hash))) % D + indicator) indicator += 10**6 merge[k] = sum(outls_).tolist() ''' Count crossed cols ''' cross_nm = [k for k in crossed_columns_d.keys()] lb = LabelBinarizer(sparse_output=True) x_col = lb.fit_transform(merge[cross_nm[0]]) for i in range(1, len(cross_nm)): x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]]))) del (lb) gc.collect() cpuStats() ''' Hash name ''' wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_name = wb.fit_transform(merge['name']) del (wb) X_name = X_name[:, np. 
array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) ''' Hash category ''' wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**20, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True cat = merge["category_name"].str.replace('/', ' ') X_cat = wb.fit_transform(cat) del (wb) X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `category` completed.'.format(time.time() - start_time)) ''' Count category ''' wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**28, "norm": "l2", "tf": 1.0, "idf": None }), procs=8) wb.dictionary_freeze = True X_description = wb.fit_transform(merge['item_description']) del (wb) X_description = X_description[:, np.array(np.clip( X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) X_dummies = csr_matrix( pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values) print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'. format(time.time() - start_time)) ''' print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape, X_orig.shape) sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat, x_col, X_orig)).tocsr() ''' print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape) sparse_merge = hstack( (X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat, x_col)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) # Remove features with document frequency <=1 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) gc.collect() if develop: #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233) train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[ trnidx], y.values[validx] model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1, inv_link="identity", threads=threads) #iters=15 baseline = 1. 
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        # stop as soon as the dev score no longer improves meaningfully
        if score_ < baseline - 0.0002:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    from wordbatch.models import nn_relu_h1, nn_relu_h2
    modelnn = nn_relu_h1.NN_ReLU_H1(alpha=0.05, L2=0.00001, D_nn=60,
                                    D=sparse_merge.shape[1],
                                    iters=1, inv_link="identity",
                                    threads=threads)

    baseline = 1.
    print('[{}] Epoch time '.format(time.time() - start_time))
    for i in range(3):
        modelnn.fit(train_X, train_y, verbose=1)
        predsnn = modelnn.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsnn))
        print("NN_ReLU_H1 dev RMSLE:", score_)
        print('[{}] Epoch time '.format(time.time() - start_time))
        if score_ < baseline - 0.0002:
            baseline = score_
        else:
            break

    pd.Series((np.expm1(predsnn) - np.expm1(predsfm))).hist()

    # blend 10% NN with 90% FM predictions on the dev split
    print("Blend dev RMSLE:",
          rmsle(np.expm1(valid_y),
                0.1 * (np.expm1(predsnn)) + 0.9 * (np.expm1(predsfm))))

    tpoint2 = time.time()
    print("Time for Training: {}".format(hms_string(tpoint2 - tpoint1)))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
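# Toy illustration (hypothetical frame) of the hashed cross-column trick used
# in getFMFTRL above: each source column is hashed into a shared space of size
# D with a per-column offset, and the element-wise sums identify unique value
# combinations across the crossed columns.
import numpy as np
import pandas as pd

D = 2 ** 30
df = pd.DataFrame({'brand_name': ['nike', 'apple', 'nike'],
                   'shipping_str': ['0', '1', '0']})
outls_, indicator = [], 0
for col in ['brand_name', 'shipping_str']:
    outls_.append((np.array(df[col].apply(hash))) % D + indicator)
    indicator += 10 ** 6
df['brand_name_shipping_str'] = sum(outls_).tolist()
print(df)  # rows 0 and 2 share the same crossed id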
def transform(self, raw_documents): x = [vect.transform(raw_documents) for vect in self.vectorizers] x = sparse.hstack(x) return x
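# Sketch of how fit_transform and transform above might sit together on a
# small wrapper (the class name and constructor are assumptions; only the two
# methods appear in the source):
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class VectorizerUnion:
    def __init__(self, vectorizers):
        self.vectorizers = vectorizers

    def fit_transform(self, raw_documents, y=None):
        return sparse.hstack(
            [vect.fit_transform(raw_documents, y) for vect in self.vectorizers])

    def transform(self, raw_documents):
        return sparse.hstack(
            [vect.transform(raw_documents) for vect in self.vectorizers])

union = VectorizerUnion([CountVectorizer(), TfidfVectorizer()])
X = union.fit_transform(["sparse rows", "stack features side by side"])
print(X.shape)  # (2, n_count_cols + n_tfidf_cols)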
def _fit_resample(self, X, y): self.n_features_ = X.shape[1] self._validate_estimator() # compute the median of the standard deviation of the minority class target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) X_continuous = X[:, self.continuous_features_] X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"]) X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority)) if sparse.issparse(X): if X.format == "csr": _, var = csr_mean_variance_axis0(X_minority) else: _, var = csc_mean_variance_axis0(X_minority) else: var = X_minority.var(axis=0) self.median_std_ = np.median(np.sqrt(var)) X_categorical = X[:, self.categorical_features_] if X_continuous.dtype.name != "object": dtype_ohe = X_continuous.dtype else: dtype_ohe = np.float64 self.ohe_ = OneHotEncoder(sparse=True, handle_unknown="ignore", dtype=dtype_ohe) # the input of the OneHotEncoder needs to be dense X_ohe = self.ohe_.fit_transform(X_categorical.toarray( ) if sparse.issparse(X_categorical) else X_categorical) # we can replace the 1 entries of the categorical features with the # median of the standard deviation. It will ensure that whenever # distance is computed between 2 samples, the difference will be equal # to the median of the standard deviation as in the original paper. # In the edge case where the median of the std is equal to 0, the 1s # entries will be also nullified. In this case, we store the original # categorical encoding which will be later used for inversing the OHE if math.isclose(self.median_std_, 0): self._X_categorical_minority_encoded = _safe_indexing( X_ohe.toarray(), np.flatnonzero(y == class_minority)) X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2) X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr") X_resampled, y_resampled = super()._fit_resample(X_encoded, y) # reverse the encoding of the categorical features X_res_cat = X_resampled[:, self.continuous_features_.size:] X_res_cat.data = np.ones_like(X_res_cat.data) X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) if sparse.issparse(X): X_resampled = sparse.hstack( ( X_resampled[:, :self.continuous_features_.size], X_res_cat_dec, ), format="csr", ) else: X_resampled = np.hstack(( X_resampled[:, :self.continuous_features_.size].toarray(), X_res_cat_dec, )) indices_reordered = np.argsort( np.hstack((self.continuous_features_, self.categorical_features_))) if sparse.issparse(X_resampled): # the matrix is supposed to be in the CSR format after the stacking col_indices = X_resampled.indices.copy() for idx, col_idx in enumerate(indices_reordered): mask = X_resampled.indices == col_idx col_indices[mask] = idx X_resampled.indices = col_indices else: X_resampled = X_resampled[:, indices_reordered] return X_resampled, y_resampled
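# Numeric check (illustrative only) of the median-std rescaling above: two
# samples that disagree on one categorical feature differ in exactly two
# one-hot columns, so the mismatch adds 2 * (median_std / 2) ** 2
# = median_std ** 2 / 2 to the squared Euclidean distance.
import numpy as np

median_std = 0.8
a = np.array([median_std / 2, 0.0])  # category A's one-hot columns
b = np.array([0.0, median_std / 2])  # category B's one-hot columns
print(np.sum((a - b) ** 2), median_std ** 2 / 2)  # both 0.32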
ridge = SklearnWrapper(clf=Ridge, seed=SEED, params=ridge_params) ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:ntrain], y, ready_df[ntrain:]) rms = sqrt(mean_squared_error(y, ridge_oof_train)) print('Ridge OOF RMSE: {}'.format(rms)) print("Modeling Stage") ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test]) df['ridge_preds'] = ridge_preds # Combine Dense Features with Sparse Text Bag of Words Features X = hstack( [csr_matrix(df.loc[traindex, :].values), ready_df[0:traindex.shape[0]]]) # Sparse Matrix testing = hstack( [csr_matrix(df.loc[testdex, :].values), ready_df[traindex.shape[0]:]]) tfvocab = df.columns.tolist() + tfvocab for shape in [X, testing]: print("{} Rows and {} Cols".format(*shape.shape)) print("Feature Names Length: ", len(tfvocab)) del df gc.collect() print("\nModeling Stage") X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=23)
def append_biases(X): return sparse.hstack((X, np.ones(X.shape[0])[:, np.newaxis])).tocsr()
])
start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles'], axis=1, inplace=True)
gc.collect()
print('[{}] Modeling Stage'.format(time.time() - start_time))
# Combine Dense Features with Sparse Text Bag of Words Features
X_train = hstack([csr_matrix(df.loc[traindex, :][trnidx].values),
                  ready_df[0:traindex.shape[0]][trnidx],
                  dnimgsvd[0:traindex.shape[0]][trnidx]])
X_valid = hstack([csr_matrix(df.loc[traindex, :][validx].values),
                  ready_df[0:traindex.shape[0]][validx],
                  dnimgsvd[0:traindex.shape[0]][validx]])
y_train = y[trnidx]
y_valid = y[validx]
testing = hstack([csr_matrix(df.loc[testdex, :].values),
                  ready_df[traindex.shape[0]:],
                  dnimgsvd[traindex.shape[0]:]])
tfvocab = df.columns.tolist() + tfvocab + [
    'imgsvdcomp%s' % (i) for i in range(dnimgsvd.shape[1])
]
for shape in [X_train, X_valid, testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ", len(tfvocab))
def map( self, lens, X=None, clusterer=None, cover=None, nerve=None, precomputed=False, remove_duplicate_nodes=False, ): """Apply Mapper algorithm on this projection and build a simplicial complex. Returns a dictionary with nodes and links. Parameters ---------- lens: Numpy Array Lower dimensional representation of data. In general will be output of `fit_transform`. X: Numpy Array Original data or data to run clustering on. If `None`, then use `lens` as default. X can be a SciPy sparse matrix. clusterer: Default: DBSCAN Scikit-learn API compatible clustering algorithm. Must provide `fit` and `predict`. cover: kmapper.Cover Cover scheme for lens. Instance of kmapper.cover providing methods `fit` and `transform`. nerve: kmapper.Nerve Nerve builder implementing `__call__(nodes)` API precomputed : Boolean Tell Mapper whether the data that you are clustering on is a precomputed distance matrix. If set to `True`, the assumption is that you are also telling your `clusterer` that `metric='precomputed'` (which is an argument for DBSCAN among others), which will then cause the clusterer to expect a square distance matrix for each hypercube. `precomputed=True` will give a square matrix to the clusterer to fit on for each hypercube. remove_duplicate_nodes: Boolean Removes duplicate nodes before edges are determined. A node is considered to be duplicate if it has exactly the same set of points as another node. nr_cubes: Int .. deprecated:: 1.1.6 define Cover explicitly in future versions The number of intervals/hypercubes to create. Default = 10. overlap_perc: Float .. deprecated:: 1.1.6 define Cover explicitly in future versions The percentage of overlap "between" the intervals/hypercubes. Default = 0.1. Returns ======= simplicial_complex : dict A dictionary with "nodes", "links" and "meta" information. Examples ======== >>> # Default mapping. 
        >>> graph = mapper.map(X_projected, X_inverse)

        >>> # Apply clustering on the projection instead of on inverse X
        >>> graph = mapper.map(X_projected)

        >>> # Use 20 cubes/intervals per projection dimension, with a 50% overlap
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    cover=kmapper.Cover(n_cubes=20, perc_overlap=0.5))

        >>> # Use multiple different cubes/intervals per projection dimension,
        >>> # and vary the overlap
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    cover=km.Cover(n_cubes=[10, 20, 5],
        >>>                                   perc_overlap=[0.1, 0.2, 0.5]))

        >>> # Use KMeans with 2 clusters
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    clusterer=sklearn.cluster.KMeans(2))

        >>> # Use DBSCAN with "cosine"-distance
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    clusterer=sklearn.cluster.DBSCAN(metric="cosine"))

        >>> # Use HDBSCAN as the clusterer
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    clusterer=hdbscan.HDBSCAN())

        >>> # Parametrize the nerve of the covering
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    nerve=km.GraphNerve(min_intersection=3))

        """

        start = datetime.now()

        clusterer = clusterer or cluster.DBSCAN(eps=0.5, min_samples=3)
        self.cover = cover or Cover(n_cubes=10, perc_overlap=0.1)
        nerve = nerve or GraphNerve()

        nodes = defaultdict(list)
        meta = defaultdict(list)
        graph = {}

        # If the inverse image is not provided, use the projection as the
        # inverse image (suffering the projection loss)
        if X is None:
            X = lens

        if self.verbose > 0:
            print("Mapping on data shaped %s using lens shaped %s\n" %
                  (str(X.shape), str(lens.shape)))

        # Prefix the data with an ID column
        ids = np.arange(lens.shape[0])
        lens = np.c_[ids, lens]

        if issparse(X):
            X = hstack([ids[np.newaxis].T, X], format='csr')
        else:
            X = np.c_[ids, X]

        # The cover scheme defines a list of elements
        bins = self.cover.fit(lens)

        # Algorithms like K-Means have a set number of clusters. We need this
        # number to adjust for the minimal number of samples inside an interval
        # before we consider clustering or skipping it.
        cluster_params = clusterer.get_params()
        min_cluster_samples = cluster_params.get(
            "n_clusters",
            cluster_params.get("min_cluster_size",
                               cluster_params.get("min_samples", 1)),
        )

        if self.verbose > 1:
            print("Minimal points in hypercube before clustering: %d" %
                  (min_cluster_samples))

        # Subdivide the projected data X in intervals/hypercubes with overlap
        if self.verbose > 0:
            bins = list(bins)  # extract list from generator
            total_bins = len(bins)
            print("Creating %s hypercubes." % total_bins)

        for i, hypercube in enumerate(self.cover.transform(lens)):
            # If there are at least min_cluster_samples samples inside the hypercube
            if hypercube.shape[0] >= min_cluster_samples:
                # Cluster the data point(s) in the cube, skipping the id-column.
                # Note that we apply clustering on the inverse image (original
                # data samples) that fall inside the cube.
                ids = [int(nn) for nn in hypercube[:, 0]]
                X_cube = X[ids]

                fit_data = X_cube[:, 1:]
                if precomputed:
                    fit_data = fit_data[:, ids]

                cluster_predictions = clusterer.fit_predict(fit_data)

                if self.verbose > 1:
                    print(" > Found %s clusters in hypercube %s."
% (np.unique(cluster_predictions[ cluster_predictions > -1]).shape[0], i)) for pred in np.unique(cluster_predictions): # if not predicted as noise if pred != -1 and not np.isnan(pred): cluster_id = "cube{}_cluster{}".format(i, int(pred)) nodes[cluster_id] = hypercube[:, 0][ cluster_predictions == pred].astype(int).tolist() elif self.verbose > 1: print("Cube_%s is empty.\n" % (i)) if remove_duplicate_nodes: nodes = self._remove_duplicate_nodes(nodes) links, simplices = nerve.compute(nodes) graph["nodes"] = nodes graph["links"] = links graph["simplices"] = simplices graph["meta_data"] = { "projection": self.projection if self.projection else "custom", "n_cubes": self.cover.n_cubes, "perc_overlap": self.cover.perc_overlap, "clusterer": str(clusterer), "scaler": str(self.scaler), } graph["meta_nodes"] = meta if self.verbose > 0: self._summary(graph, str(datetime.now() - start)) return graph
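# Minimal sketch of the id-column trick used in map() above: the row index is
# prepended to X (and lens), so each hypercube's members can be traced back to
# rows of the original data.
import numpy as np
from scipy.sparse import csr_matrix, hstack, issparse

X = csr_matrix(np.arange(12.0).reshape(4, 3))
ids = np.arange(X.shape[0])
X = hstack([ids[np.newaxis].T, X], format='csr') if issparse(X) else np.c_[ids, X]
member_ids = [int(n) for n in X[[0, 2], 0].toarray().ravel()]
print(member_ids)  # [0, 2]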
def join(self, other, axis=1, how='outer', level=None):
    """
    Join two tables along their indices.

    Parameters
    ----------
    other: sparsity.SparseFrame
        another SparseFrame
    axis: int
        along which axis to join
    how: str
        one of 'inner', 'outer', 'left', 'right'
    level: int
        if MultiIndex, join using this level

    Returns
    -------
    joined: sparsity.SparseFrame
    """
    if isinstance(self._index, pd.MultiIndex) \
            or isinstance(other._index, pd.MultiIndex):
        raise NotImplementedError()
    if not isinstance(other, SparseFrame):
        other = SparseFrame(other)
    if axis not in set([0, 1]):
        raise ValueError("axis must be either 0 or 1")
    if axis == 0:
        if np.all(other._columns.values == self._columns.values):
            # take short path if join axes are identical
            data = sparse.vstack([self.data, other.data])
            index = np.hstack([self.index, other.index])
            res = SparseFrame(data, index=index, columns=self._columns)
        else:
            raise NotImplementedError(
                "Joining along axis 0 fails when column names differ. "
                "This is probably caused by adding an all-zeros row.")
            data, new_index = _matrix_join(self._data.T.tocsr(),
                                           other._data.T.tocsr(),
                                           self._columns,
                                           other._columns,
                                           how=how)
            res = SparseFrame(data.T.tocsr(),
                              index=np.concatenate([self.index, other.index]),
                              columns=new_index)
    elif axis == 1:
        if np.all(self.index.values == other.index.values):
            # take short path if join axes are identical
            data = sparse.hstack([self.data, other.data])
            columns = np.hstack([self._columns, other._columns])
            res = SparseFrame(data, index=self.index, columns=columns)
        else:
            data, new_index = _matrix_join(self._data,
                                           other._data,
                                           self.index,
                                           other.index,
                                           how=how)
            res = SparseFrame(data,
                              index=new_index,
                              columns=np.concatenate(
                                  [self._columns, other._columns]))
    return res
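# Usage sketch (assumes the sparsity package's SparseFrame constructor accepts
# dense arrays, as it does for the join fallback above; illustrative only):
import numpy as np
from sparsity import SparseFrame

left = SparseFrame(np.eye(3), index=[0, 1, 2], columns=['a', 'b', 'c'])
right = SparseFrame(np.eye(3), index=[0, 1, 2], columns=['d', 'e', 'f'])
wide = left.join(right, axis=1)  # identical indices -> fast hstack path
print(wide.data.shape)           # (3, 6)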
def calibration_single_ended_solver(ds, st_label, ast_label, st_var=None, ast_var=None, calc_cov=True, solver='sparse', verbose=False): """ Parameters ---------- ds : DataStore st_label : str ast_label : str st_var : float, array-like, optional If `None` use ols calibration. If `float` the variance of the noise from the Stokes detector is described with a single value. Or when the variance is a function of the intensity (Poisson distributed) define an array with shape (nx, nt), where nx are the number of calibration locations. ast_var : float, array-like, optional If `None` use ols calibration. If `float` the variance of the noise from the Stokes detector is described with a single value. Or when the variance is a function of the intensity (Poisson distributed) define an array with shape (nx, nt), where nx are the number of calibration locations. calc_cov : bool whether to calculate the covariance matrix. Required for calculation of confidence boundaries. But uses a lot of memory. solver : {'sparse', 'stats', 'external', 'external_split'} Always use sparse to save memory. The statsmodel can be used to validate sparse solver. `external` returns the matrices that would enter the matrix solver (Eq.37). `external_split` returns a dictionary with matrix X split in the coefficients per parameter. The use case for the latter is when certain parameters are fixed/combined. verbose : bool Returns ------- """ ix_sec = ds.ufunc_per_section(x_indices=True, calc_per='all') ds_sec = ds.isel(x=ix_sec) x_sec = ds_sec['x'].values nx = x_sec.size nt = ds.time.size p0_est = np.asarray([485., 0.1] + nt * [1.4]) # X \gamma # Eq.34 cal_ref = ds.ufunc_per_section(label=st_label, ref_temp_broadcasted=True, calc_per='all') data_gamma = 1 / (cal_ref.ravel() + 273.15) # gamma coord_gamma_row = np.arange(nt * nx, dtype=int) coord_gamma_col = np.zeros(nt * nx, dtype=int) X_gamma = sp.coo_matrix((data_gamma, (coord_gamma_row, coord_gamma_col)), shape=(nt * nx, 1), copy=False) # X \Delta\alpha # Eq.34 data_dalpha = np.repeat(-x_sec, nt) # dalpha coord_dalpha_row = np.arange(nt * nx, dtype=int) coord_dalpha_col = np.zeros(nt * nx, dtype=int) X_dalpha = sp.coo_matrix( (data_dalpha, (coord_dalpha_row, coord_dalpha_col)), shape=(nt * nx, 1), copy=False) # X C # Eq.34 data_c = -np.ones(nt * nx, dtype=int) coord_c_row = np.arange(nt * nx, dtype=int) coord_c_col = np.tile(np.arange(nt, dtype=int), nx) X_c = sp.coo_matrix((data_c, (coord_c_row, coord_c_col)), shape=(nt * nx, nt), copy=False) # Stack all X's X = sp.hstack((X_gamma, X_dalpha, X_c)) # y y = np.log(ds_sec[st_label] / ds_sec[ast_label]).values.ravel() # w if st_var is not None: w = 1 / (ds_sec[st_label]**-2 * st_var + ds_sec[ast_label]**-2 * ast_var).values.ravel() else: w = 1. # unweighted if solver == 'sparse': if calc_cov: p_sol, p_var, p_cov = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=calc_cov, verbose=verbose) else: p_sol, p_var = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=calc_cov, verbose=verbose) elif solver == 'stats': if calc_cov: p_sol, p_var, p_cov = wls_stats(X, y, w=w, calc_cov=calc_cov, verbose=verbose) else: p_sol, p_var = wls_stats(X, y, w=w, calc_cov=calc_cov, verbose=verbose) elif solver == 'external': return X, y, w, p0_est elif solver == 'external_split': return dict(y=y, w=w, X_gamma=X_gamma, X_dalpha=X_dalpha, X_c=X_c, p0_est=p0_est) else: raise ValueError("Choose a valid solver") if calc_cov: return p_sol, p_var, p_cov else: return p_sol, p_var
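# Shape sketch (toy sizes) of the design matrix assembled above: one gamma
# column, one dalpha column and nt C columns, hstacked into Eq.37's X.
import numpy as np
import scipy.sparse as sp

nt, nx = 3, 5
rows = np.arange(nt * nx)
X_gamma = sp.coo_matrix((np.ones(nt * nx), (rows, np.zeros(nt * nx, int))),
                        shape=(nt * nx, 1))
X_dalpha = sp.coo_matrix((-np.ones(nt * nx), (rows, np.zeros(nt * nx, int))),
                         shape=(nt * nx, 1))
X_c = sp.coo_matrix((-np.ones(nt * nx), (rows, np.tile(np.arange(nt), nx))),
                    shape=(nt * nx, nt))
X = sp.hstack((X_gamma, X_dalpha, X_c))
print(X.shape)  # (15, 5) == (nt * nx, 2 + nt)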
X_train_bow = vectorizer.fit_transform(clean_train_reviews)
X_test_bow = vectorizer.transform(clean_test_reviews)

model_final = createModel_word2Vec(clean_train_reviews)

print('Loading word2vec model..\n')
model = Word2Vec.load(model_final)

print("Creating the w2v vectors...\n")
X_train_w2v = scale(getAvgFeatureVecs(clean_train_reviews, model, 5000))
X_test_w2v = scale(getAvgFeatureVecs(clean_test_reviews, model, 5000))

print("Combining the bag of words and the w2v vectors...\n")
X_train_bwv = hstack([X_train_bow, X_train_w2v])
X_test_bwv = hstack([X_test_bow, X_test_w2v])

print("Checking the dimensions of the training vectors")
print('W2V', X_train_w2v.shape)
print('BoW-W2V', X_train_bwv.shape)

y_train = Review_train['Rating']
# class_weight="auto" was removed from scikit-learn; "balanced" is the
# current equivalent
clf = LogisticRegression(class_weight="balanced")

print("Fitting on the combined Bag-of-Words and Word2Vec features...\n")
clf.fit(X_train_bwv, y_train)
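# Minimal check of the combine step above: scipy's hstack accepts a sparse
# block next to a dense block and returns a single COO matrix, which
# LogisticRegression.fit consumes directly.
import numpy as np
from scipy.sparse import csr_matrix, hstack

bow = csr_matrix(np.eye(2))                     # stand-in bag-of-words block
w2v = np.array([[0.1, -0.2], [0.3, 0.4]])       # stand-in embedding block
combined = hstack([bow, w2v])
print(combined.shape, type(combined).__name__)  # (2, 4) coo_matrix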
def construct_submatrices(nt, nx, st_label, ds, transient_asym_att_x, x_sec): """Wrapped in a function to reduce memory usage. Constructing: Z_gamma (nt * nx, 1). Data: positive 1/temp Z_D (nt * nx, nt). Data: ones E (nt * nx, nx). Data: ones Zero_gamma (nt * nx, 1) zero_d (nt * nx, nt) Z_TA_fw (nt * nx, nta * 2 * nt) minus ones Z_TA_bw (nt * nx, nta * 2 * nt) minus ones Z_TA_E (nt * nx, nta * 2 * nt) I_fw = 1/Tref*gamma - D_fw - E - TA_fw I_bw = 1/Tref*gamma - D_bw + E - TA_bw (I_bw - I_fw) / 2 = D_fw/2 - D_bw/2 + E + TA_fw/2 - TA_bw/2 Eq42 """ # Z \gamma # Eq.47 cal_ref = np.array( ds.ufunc_per_section(label=st_label, ref_temp_broadcasted=True, calc_per='all')) data_gamma = 1 / (cal_ref.ravel() + 273.15) # gamma coord_gamma_row = np.arange(nt * nx, dtype=int) coord_gamma_col = np.zeros(nt * nx, dtype=int) Z_gamma = sp.coo_matrix( (data_gamma, (coord_gamma_row, coord_gamma_col)), shape=(nt * nx, 1), copy=False) # Z D # Eq.47 data_c = np.ones(nt * nx, dtype=float) coord_c_row = np.arange(nt * nx, dtype=int) coord_c_col = np.tile(np.arange(nt, dtype=int), nx) Z_D = sp.coo_matrix((data_c, (coord_c_row, coord_c_col)), shape=(nt * nx, nt), copy=False) Z_D_att = sp.eye(nt, format='coo') # E # Eq.47 data_c = np.ones(nt * nx, dtype=float) coord_c_row = np.arange(nt * nx, dtype=int) coord_c_col = np.repeat(np.arange(nx, dtype=int), nt) E = sp.coo_matrix((data_c, (coord_c_row, coord_c_col)), shape=(nt * nx, nx), copy=False) # Zero # Eq.45 Zero_gamma = sp.coo_matrix(([], ([], [])), shape=(nt * nx, 1)) Zero_d = sp.coo_matrix(([], ([], [])), shape=(nt * nx, nt)) Zero_E = sp.coo_matrix(([], ([], [])), shape=(nt * nx, nx)) Zero_gamma_att = sp.coo_matrix(([], ([], [])), shape=(nt, 1)) Zero_E_att = sp.coo_matrix(([], ([], [])), shape=(nt, nx)) if transient_asym_att_x: # unpublished BdT TA_fw_list = list() TA_bw_list = list() for transient_asym_att_xi in transient_asym_att_x: """For forward direction. """ # first index on the right hand side a the difficult splice # Deal with connector outside of fiber if transient_asym_att_xi >= x_sec[-1]: ix_sec_ta_ix0 = nx elif transient_asym_att_xi <= x_sec[0]: ix_sec_ta_ix0 = 0 else: ix_sec_ta_ix0 = np.flatnonzero( x_sec >= transient_asym_att_xi)[0] # Data is -1 for both forward and backward # I_fw = 1/Tref*gamma - D_fw - E - TA_fw. Eq40 data_ta_fw = -np.ones(nt * (nx - ix_sec_ta_ix0), dtype=float) # skip ix_sec_ta_ix0 locations, because they are upstream of # the connector. coord_ta_fw_row = np.arange(nt * ix_sec_ta_ix0, nt * nx, dtype=int) # nt parameters coord_ta_fw_col = np.tile(np.arange(nt, dtype=int), nx - ix_sec_ta_ix0) TA_fw_list.append( sp.coo_matrix( # TA_fw (data_ta_fw, (coord_ta_fw_row, coord_ta_fw_col)), shape=(nt * nx, 2 * nt), copy=False)) # I_bw = 1/Tref*gamma - D_bw + E - TA_bw. 
            # Eq41
            data_ta_bw = -np.ones(nt * ix_sec_ta_ix0, dtype=float)
            coord_ta_bw_row = np.arange(nt * ix_sec_ta_ix0, dtype=int)
            coord_ta_bw_col = np.tile(np.arange(nt, 2 * nt, dtype=int),
                                      ix_sec_ta_ix0)
            TA_bw_list.append(
                sp.coo_matrix(  # TA_bw
                    (data_ta_bw, (coord_ta_bw_row, coord_ta_bw_col)),
                    shape=(nt * nx, 2 * nt),
                    copy=False))

        Z_TA_fw = sp.hstack(TA_fw_list)
        Z_TA_bw = sp.hstack(TA_bw_list)

    else:
        Z_TA_fw = sp.coo_matrix(([], ([], [])), shape=(nt * nx, 0))
        Z_TA_bw = sp.coo_matrix(([], ([], [])), shape=(nt * nx, 0))
        Z_TA_att = sp.coo_matrix(([], ([], [])), shape=(nt, 0))

    # (I_bw - I_fw) / 2 = D_fw/2 - D_bw/2 + E + TA_fw/2 - TA_bw/2  Eq42
    Z_TA_E = (Z_TA_bw - Z_TA_fw) / 2

    return E, Z_D, Z_gamma, Zero_d, Zero_gamma, Z_TA_fw, Z_TA_bw, Z_TA_E,\
        Zero_E, Z_TA_att, Z_D_att, Zero_gamma_att, Zero_E_att
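# Toy sketch of the forward attenuation block built above: rows belonging to
# x locations before the connector stay zero; rows after it get -1 in the
# first nt (forward) columns of the 2*nt-wide block.
import numpy as np
import scipy.sparse as sp

nt, nx, ix0 = 2, 4, 2  # connector sits after the second x location
data = -np.ones(nt * (nx - ix0))
rows = np.arange(nt * ix0, nt * nx)
cols = np.tile(np.arange(nt), nx - ix0)
Z_TA_fw = sp.coo_matrix((data, (rows, cols)), shape=(nt * nx, 2 * nt))
print(Z_TA_fw.toarray())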
#test test_new = train_test.iloc[ntrain:, :] test_new_cat = me.transform(test_new) train_test = pd.concat((train_new_cat, test_new_cat), axis=0).reset_index(drop=True) train_test.drop(categoricals, axis=1, inplace=True) train_test['features_count'] = train_test['features'].apply(lambda x: len(x)) train_test['features2'] = train_test['features'] train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x)) c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1)) c_vect_sparse = c_vect.fit_transform(train_test['features2']) c_vect_sparse_cols = c_vect.get_feature_names() train_test.drop(['features', 'features2'], axis=1, inplace=True) train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr() train_test_new = pd.DataFrame(train_test_sparse.toarray()) X_train = train_test_new.iloc[:ntrain, :] X_test = train_test_new.iloc[ntrain:, :] train_new = pd.concat((X_train, y_train), axis=1).reset_index(drop=True) train_new.to_csv(out + 'RentListingInquries_FE_train.csv', index=False) X_test.to_csv(out + 'RentListingInquries_FE_test.csv', index=False) X_train_sparse = train_test_sparse[:ntrain, :] X_test_sparse = train_test_sparse[ntrain:, :] train_sparse = sparse.hstack([X_train_sparse, sparse.csr_matrix(y_train).T]).tocsr() mmwrite(out + 'RentListingInquries_FE_train.txt', train_sparse) mmwrite(out + 'RentListingInquries_FE_test.txt', X_test_sparse)
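# Round-trip sketch: matrices written with mmwrite above can be reloaded with
# scipy.io.mmread, which returns COO, so convert back to CSR before slicing.
# ('example.mtx' is a hypothetical path.)
import numpy as np
from scipy.io import mmread, mmwrite
from scipy.sparse import csr_matrix

m = csr_matrix(np.eye(3))
mmwrite('example.mtx', m)
m2 = mmread('example.mtx').tocsr()
print((m != m2).nnz)  # 0 -> round trip preserved every entry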