Example #1
  def predict_proba( self, X ):    
    #Return probabilities predicted by the model
    if self.modelType == "SV1":
      return self.model01.predict_proba ( X )

    if self.modelType == "SV2":
      mprob1 = self.model01.predict_proba( X )
      mprob2 = self.model02.predict_proba( X )
      return np.array( mprob1[:,1] ), np.array( mprob2[:,1] )

    if self.modelType == "LR2":
      if self.nComp == 0 or min( len( self.featSel1) , len( self.featSel2 ) ) <= self.nComp:
        mprob1 = self.model01.predict_proba( X )
        mprob2 = self.model02.predict_proba( X )      
      else:
        newX1 , newX2 = X[ :, self.featSel1 ], X[ :, self.featSel2 ]  
        X1_1, X1_2 = self.concepts1.transform( newX1 ), self.concepts2.transform( newX2 )
        X_1 , X_2  = hstack( ( X, csr_matrix( X1_1 ) ) ) , hstack( ( X, csr_matrix( X1_2 ) ) )
        mprob1 = self.model01.predict_proba( X_1 )
        mprob2 = self.model02.predict_proba( X_2 )
      return np.array( mprob1[:,1] ), np.array( mprob2[:,1] )

    if self.modelType == "LR1":
      if self.nComp == 0 or len( self.featSel) <= self.nComp:
        return self.model01.predict_proba ( X )
      else:
        newX = X[ :, self.featSel ]
        X1  = self.concepts.transform( newX )
        combined_X = hstack( ( X, csr_matrix( X1 ) ) ) 
        return self.model01.predict_proba( combined_X )
Example #2
def makePropertyTensor(M, tensor):
    if tensor is None:  # default is ones
        tensor = np.ones(M.nC)

    if isScalar(tensor):
        tensor = tensor * np.ones(M.nC)

    propType = TensorType(M, tensor)
    if propType == 1: # Isotropic!
        Sigma = sp.kron(sp.identity(M.dim), sdiag(mkvc(tensor)))
    elif propType == 2: # Diagonal tensor
        Sigma = sdiag(mkvc(tensor))
    elif M.dim == 2 and tensor.size == M.nC*3:  # Fully anisotropic, 2D
        tensor = tensor.reshape((M.nC,3), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 2])))
        row2 = sp.hstack((sdiag(tensor[:, 2]), sdiag(tensor[:, 1])))
        Sigma = sp.vstack((row1, row2))
    elif M.dim == 3 and tensor.size == M.nC*6:  # Fully anisotropic, 3D
        tensor = tensor.reshape((M.nC,6), order='F')
        row1 = sp.hstack((sdiag(tensor[:, 0]), sdiag(tensor[:, 3]), sdiag(tensor[:, 4])))
        row2 = sp.hstack((sdiag(tensor[:, 3]), sdiag(tensor[:, 1]), sdiag(tensor[:, 5])))
        row3 = sp.hstack((sdiag(tensor[:, 4]), sdiag(tensor[:, 5]), sdiag(tensor[:, 2])))
        Sigma = sp.vstack((row1, row2, row3))
    else:
        raise Exception('Unexpected shape of tensor')

    return Sigma
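A minimal, self-contained sketch of the 2D fully anisotropic branch above, assuming sdiag simply builds a sparse diagonal matrix from a vector (a stand-in for the helper makePropertyTensor relies on); the cell count and tensor values are made up.

import numpy as np
import scipy.sparse as sp

def sdiag(v):
    # stand-in for the sdiag helper assumed above: sparse diagonal matrix from a vector
    return sp.diags(np.asarray(v).ravel())

nC = 4                                    # number of cells (made up)
tensor = np.random.rand(nC * 3)           # [sigma_xx, sigma_yy, sigma_xy] stacked column-wise
t = tensor.reshape((nC, 3), order='F')

row1 = sp.hstack((sdiag(t[:, 0]), sdiag(t[:, 2])))
row2 = sp.hstack((sdiag(t[:, 2]), sdiag(t[:, 1])))
Sigma = sp.vstack((row1, row2))
print(Sigma.shape)                        # (2*nC, 2*nC)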
Example #3
    def train_model(self):
        '''
        computes each feature in self.feature_list,
        builds the self.X_train and self.X_test matrices, and fits the classifier on the training data
        :return: None
        '''

        self.train_raw = self.train
        self.train = self._filter(self.train)
        self.test_raw = self.test
        self.test = self._filter(self.test)

        self.train_unified = self._unify_data(self.train)
        self.test_unified = self._unify_data(self.test)

        for feature in self.feature_list:
            model, train, test = self._get_model(feature)
            self.feature_models[feature] = model

            if type(self.X_train) == int:
                self.X_train = train
            else:
                self.X_train = sp.hstack((self.X_train, train), format="csr")

            if type(self.X_test) == int:
                self.X_test = test
            else:
                self.X_test = sp.hstack((self.X_test, test), format="csr")

        self.classifier.fit(self.X_train, self.y_train)
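The incremental sparse concatenation used to build self.X_train above can be sketched on its own; the feature blocks here are random stand-ins and the integer sentinel mirrors the snippet's initial value of self.X_train.

import scipy.sparse as sp

blocks = [sp.random(5, 3, density=0.5, format='csr') for _ in range(3)]  # stand-in feature blocks

X_train = 0                               # integer sentinel, as in train_model above
for block in blocks:
    if isinstance(X_train, int):
        X_train = block
    else:
        X_train = sp.hstack((X_train, block), format='csr')

print(X_train.shape)                      # (5, 9)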
Example #4
  def fitLR2( self, X, y):  
    #Fit Logistic Regression when the possible scores may vary from 0 to 2
    C1, C2, self.nComp = self.param1, self.param2, self.param3
    y1 = ( y > 0 ) * 1
    y2 = ( y > 1 ) * 1
    #Train two Logistic Regression models and repeat the process described in fitLR1 with both of them
    self.model01 = LogisticRegression( penalty= 'l1' , C = C1, random_state=2512 )
    self.model01.fit( X , y1 )
    self.model02 = LogisticRegression( penalty= 'l1' , C = C1, random_state=2512 )
    self.model02.fit( X , y2 )
    self.featSel1 = [ i for i in range(len( self.model01.coef_[0])) if self.model01.coef_[0][i] > 0 ]
    self.featSel2 = [ i for i in range(len( self.model02.coef_[0])) if self.model02.coef_[0][i] > 0 ]        

    if self.nComp == 0 or min( len( self.featSel1) , len( self.featSel2 ) ) <= self.nComp:
      return

    newX1, newX2 = X[:,self.featSel1]  , X[:,self.featSel2]
    self.concepts1 = TruncatedSVD( n_components = self.nComp, random_state = 2512 ) ## test with RBM
    self.concepts2 = TruncatedSVD( n_components = self.nComp, random_state = 2512 ) ## test with RBM
    self.concepts1.fit( newX1 )
    self.concepts2.fit( newX2 )
    X1_1, X1_2 = self.concepts1.transform( newX1 ), self.concepts2.transform( newX2 )
    X_1 , X_2  = hstack( ( X, csr_matrix( X1_1 ) ) ) , hstack( ( X, csr_matrix( X1_2 ) ) )
    self.model01 = LogisticRegression( penalty= 'l1' , C = C2 , random_state=2512 )
    self.model02 = LogisticRegression( penalty= 'l1' , C = C2 , random_state=2512 )
    self.model01.fit( X_1, y1 )
    self.model02.fit( X_2, y2 )     
    return
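A runnable sketch of the augmentation idea used in fitLR1/fitLR2: keep the columns with positive L1 coefficients, compress them with TruncatedSVD, and hstack the components back onto X. The data is random, and solver='liblinear' is added only because recent scikit-learn requires an explicit L1-capable solver.

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(2512)
X = csr_matrix(rng.rand(50, 20))
y = np.arange(50) % 2                                        # two balanced classes

lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=2512)
lr.fit(X, y)
featSel = [i for i, c in enumerate(lr.coef_[0]) if c > 0]    # features kept by the L1 penalty

if len(featSel) > 2:                                         # only augment when enough features survive
    svd = TruncatedSVD(n_components=2, random_state=2512)
    comps = svd.fit_transform(X[:, featSel])                 # low-rank "concepts" of the selected columns
    X_aug = hstack((X, csr_matrix(comps)))                   # original features plus SVD components
    lr = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=2512)
    lr.fit(X_aug, y)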
Example #5
def inv2X2BlockDiagonal(a11, a12, a21, a22, returnMatrix=True):
    """ B = inv2X2BlockDiagonal(a11, a12, a21, a22)

    Inverts a stack of 2x2 matrices by using the inversion formula

    inv(A) = (1/det(A)) * cof(A)^T

    Input:
    A   - a11, a12, a21, a22

    Output:
    B   - inverse
    """

    a11 = mkvc(a11)
    a12 = mkvc(a12)
    a21 = mkvc(a21)
    a22 = mkvc(a22)

    # compute inverse of the determinant.
    detAinv = 1./(a11*a22 - a21*a12)

    b11 = +detAinv*a22
    b12 = -detAinv*a12
    b21 = -detAinv*a21
    b22 = +detAinv*a11

    if not returnMatrix:
        return b11, b12, b21, b22

    return sp.vstack((sp.hstack((sdiag(b11), sdiag(b12))),
                      sp.hstack((sdiag(b21), sdiag(b22)))))
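The cofactor formula in the docstring can be checked directly on one dense 2x2 block; the numbers are arbitrary.

import numpy as np

a11, a12, a21, a22 = 2.0, 1.0, 0.5, 3.0         # one 2x2 block, arbitrary values
det = a11 * a22 - a21 * a12

B = (1.0 / det) * np.array([[ a22, -a12],
                            [-a21,  a11]])      # inv(A) = (1/det(A)) * cof(A)^T
A = np.array([[a11, a12],
              [a21, a22]])
print(np.allclose(A @ B, np.eye(2)))            # True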
Example #6
def clf_event_running_wordVec(path, event, name_clf, X, X_vec, Y, clf, K, command, call):
    if command == "StratifiedKFold":
        cv = StratifiedKFold(Y, K)
    else:
        print "Need a correct command"
        quit()

    X_vec_norm = preprocessing.normalize(X_vec, norm="l2")
    for traincv, testcv in cv:
        X_train, X_test = X[traincv], X[testcv]
        X_vec_train, X_vec_test = X_vec_norm[traincv], X_vec_norm[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        MIN_DF = 2
        vec = CountVectorizer(lowercase=True, min_df=MIN_DF)
        vec = vec.fit(X_train)

        X_train_trans, X_test_trans = vec.transform(X_train), vec.transform(X_test)
        X_train_trans_all, X_test_trans_all = hstack([X_train_trans, X_vec_train]), hstack([X_test_trans, X_vec_test])

        # print X_vec_train.shape, X_vec_test.shape
        # print X_train_trans.shape, X_test_trans.shape
        # print X_train_trans_all.shape, X_test_trans_all.shape

        clf.fit(X_train_trans_all, y_train)  # training model
        y_test_pred = clf.predict(X_test_trans_all)

        matrix = confusion_matrix(y_test_pred, y_test)
        for value in matrix:
            line = ""
            for each in value:
                line = line + str(each) + "\t"
            print line.strip()
        print "----------------"
Example #7
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    nsample=sparseArrayRowNorm[0].shape[0]
    scaler = StandardScaler(with_mean=False)
    #scaler = MinMaxScaler()
    testsize=int(nsample/nfold)
    cvIdx=[1]*(nsample-testsize)+[2]*testsize
    random.seed(100)
    aucRes=[]
    for nn in range(nrep):
        #print nn
        random.shuffle(cvIdx)
        Y_train=y_all[np.where(np.array(cvIdx)==1)[0]]
        Y_test=y_all[np.where(np.array(cvIdx)==2)[0]]
        X_train_all=[]
        X_test_all=[]
        for ii in xrange(len(sparseArrayRowNorm)):
            varSelector = SelectKBest(f_classif, k=min(int(nsample*0.7), sparseArrayRowNorm[ii].shape[1]))
            X_train=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==1)[0],:]
            X_train =varSelector.fit_transform(X_train, Y_train)
            X_train_all=X_train_all+[X_train]
            X_test=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==2)[0],:]
            X_test= varSelector.transform(X_test)
            X_test_all=X_test_all+[X_test]
        X_train=hstack(X_train_all,format='csr')
        X_test=hstack(X_test_all,format='csr')
        del X_train_all
        del X_test_all
        aucRes.append(sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))
    print np.array(aucRes).mean()
    return np.array(aucRes).mean()
Example #8
def _df_one_hot_encode(self, dtype=np.float):
  if self.categoricals(): self.to_indexes(drop_origianls=True)    

  start('one_hot_encoding data frame with ' + `self.shape[1]` + \
    ' columns. \n\tNOTE: this returns a sparse array and empties' + \
    ' the initial array.')  

  debug('separating categoricals from others')
  indexes = self.indexes()
  if not indexes: return self
  others = filter(lambda c: not c in indexes, self.columns)

  categorical_df = self[indexes]    
  others_df = sparse.coo_matrix(self[others].values)

  # Destroy original as it now just takes up memory
  self.drop(self.columns, 1, inplace=True) 
  gc.collect()

  ohe_sparse = None
  for i, c in enumerate(indexes):
    debug('one hot encoding column: ' + `c`)
    col_ohe = OneHotEncoder(categorical_features=[0], dtype=dtype).\
      fit_transform(categorical_df[[c]])
    if ohe_sparse is None: ohe_sparse = col_ohe
    else: ohe_sparse = sparse.hstack((ohe_sparse, col_ohe))
    categorical_df.drop(c, axis=1, inplace=True)
    gc.collect()
  
  matrix = ohe_sparse if not others else sparse.hstack((ohe_sparse, others_df))
  stop('done one_hot_encoding')
  return matrix.tocsr()
Example #9
def tvdiplmax(y):
    """Calculate the value of lambda so that if lambda >= lambdamax, the TVD
    functional solved by TVDIP is minimized by the trivial constant solution
    x = mean(y). This can then be used to determine a useful range of values
    of lambda, for example.
    Args:
        y: Original signal to denoise, size N x 1.
    Returns:
        lambdamax: Value of lambda at which x = mean(y) is the output of the
            TVDIP function.
    """

    N = y.size
    M = N - 1

    # Construct sparse operator matrices
    I1 = sparse.eye(M)
    O1 = sparse.dia_matrix((M, 1))
    D = sparse.hstack([I1, O1]) - sparse.hstack([O1, I1])

    DDT = D.dot(D.conj().T)
    Dy = D.dot(y)

    lambdamax = np.absolute(linalg.spsolve(DDT, Dy)).max(0)

    return lambdamax
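The matrix D built above from two shifted hstacks is a first-difference operator; a small sketch with made-up data makes that explicit.

import numpy as np
from scipy import sparse

N = 5
M = N - 1
I1 = sparse.eye(M)
O1 = sparse.dia_matrix((M, 1))                  # an M x 1 block of zeros
D = sparse.hstack([I1, O1]) - sparse.hstack([O1, I1])

y = np.arange(N, dtype=float) ** 2
print(D.dot(y))                                 # differences y[i] - y[i+1]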
Example #10
def load_lda_dataset(uid):
    fname = join(DATASETS_FOLDER, 'es_twlda25ds_%d.npz' % uid)

    z = np.load(open(fname,'rb'))
    X_train = z['arr_0'].item()
    X_valid = z['arr_1'].item()
    X_test = z['arr_2'].item()

    # X_train = csc.csc_matrix(X_train.tolist())
    # X_valid = csc.csc_matrix(X_train.tolist())
    # X_test = csc.csc_matrix(X_test.tolist())

    cols_train = X_train.shape[1]
    cols_valid = X_valid.shape[1]
    cols_test = X_test.shape[1]

    maxcols = max(cols_train, cols_valid, cols_test)

    if cols_train < maxcols:
        missing_cols = csc_matrix((X_train.shape[0], maxcols - cols_train), dtype=np.float64)
        X_train = sp.hstack((X_train, missing_cols))

    if cols_valid < maxcols:
        missing_cols = csc_matrix((X_valid.shape[0], maxcols - cols_valid), dtype=np.float64)
        X_valid = sp.hstack((X_valid, missing_cols))

    if cols_test < maxcols:
        missing_cols = csc_matrix((X_test.shape[0], maxcols - cols_test), dtype=np.float64)
        X_test = sp.hstack((X_test, missing_cols))

    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)
    y_train, y_valid, y_test = pickle.load(open(ys_fname, 'rb'))

    return X_train, X_valid, X_test, y_train, y_valid, y_test
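The column-padding trick above generalizes to any pair of sparse matrices; a minimal sketch with random matrices:

import numpy as np
import scipy.sparse as sp
from scipy.sparse import csc_matrix

A = sp.random(4, 3, density=0.5, format='csc')
B = sp.random(4, 5, density=0.5, format='csc')

maxcols = max(A.shape[1], B.shape[1])
if A.shape[1] < maxcols:
    # append zero columns so both matrices end up with the same width
    A = sp.hstack((A, csc_matrix((A.shape[0], maxcols - A.shape[1]), dtype=np.float64)))

print(A.shape, B.shape)                         # both (4, 5)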
Example #11
def newtonDirection(Rb, Rc, Rxs, A, m, n, x, s, lu, errorCheck=0):
    
    rhs =np.hstack((-Rb,-Rc+Rxs/x))
    D_2 = -np.minimum(1e+16, s/x)
    B = sparse.vstack ((sparse.hstack((sparse.coo_matrix((m,m)), A)), sparse.hstack((A.T, sparse.diags([D_2], [0])))))
    
    # ldl' factorization
    # if L and D are not provided, we calc new factorization; otherwise,
    # reuse them
    useLu=True
    if useLu:
        if (lu is None)  :
            lu = sparse.linalg.splu(B.tocsc())
            # wikipedia says it uses Mehrotra cholesky but the matrix i'm getting is not definite positive
            # scikits.sparse.cholmod.cholesky fails without a warning 
    
        sol=lu.solve(rhs)
    else:
         sol=sparse.linalg.cg(B,rhs,tol=1e-5)[0]
         #assert(np.max(np.abs(B*sol-rhs))<1e-5)
      
        
        
    dy = sol[:m]
    dx = sol[m:m+n];
    ds = -(Rxs+s*dx)/x;
    
    if errorCheck == 1:
        print ('error = %6.2e'%(norm(A.T*dy + ds + Rc)+ norm(A*dx + Rb) + norm(s*dx + x*ds + Rxs)),)
        print ('\t + err_d = %6.2e'%(norm(A.T*dy + ds + Rc)),)
        print ('\t + err_p = %6.2e'%(norm(A*dx + Rb)),)
        print ('\t + err_gap = %6.2e\n'%(norm(s*dx + x*ds + Rxs)),)
      
    return dx, dy, ds, lu
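A self-contained sketch of the saddle-point system assembled above, [[0, A], [A^T, D]], factored once with splu and reused; A and the diagonal are made-up values.

import numpy as np
from scipy import sparse
import scipy.sparse.linalg as spla

m, n = 2, 3
A = sparse.csr_matrix(np.array([[1.0, 2.0, 0.0],
                                [0.0, 1.0, 3.0]]))
D_2 = -np.array([1.0, 2.0, 4.0])                 # negative diagonal block, as in newtonDirection

B = sparse.vstack((sparse.hstack((sparse.coo_matrix((m, m)), A)),
                   sparse.hstack((A.T, sparse.diags([D_2], [0])))))

lu = spla.splu(B.tocsc())                        # factor once, reuse for several right-hand sides
sol = lu.solve(np.ones(m + n))
print(sol.shape)                                 # (5,)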
Example #12
def cernikov_filter(wts, fts=None):
    """ Remove any of the working transversals that are minimal versions of each other """
    wt_i = 0
    wts = sparse.csc_matrix(wts)
    if fts is not None:
        fts = sparse.csc_matrix(fts)

    while wt_i < wts.shape[1]:
        target_t = wts[:, wt_i]
        left_t = wts[:, :max(wt_i, 0)]
        right_t = wts[:, min(wt_i+1, wts.shape[1]):]

        assert(left_t.shape[1] + right_t.shape[1] + target_t.shape[1] == wts.shape[1]), "Left/Right split failed."

        left_right_t = sparse.hstack((left_t, right_t))
        if fts is not None:
            check_ts = sparse.hstack((left_right_t, fts))
        else:
            check_ts = left_right_t

        if is_minimal_present(target_t, check_ts):
            wts = left_right_t # The new wts to loop over
            # [logic] wt_i = wt_i # Don't increase
        else:
            # [logic] wts = wts # Keep target
            wt_i += 1
    return wts
Example #13
 def __init__(self, X_l, L_l, X_u, random_generator, ** kw):
     """
     Intializes the S3VM optimizer.
     """
     self.__random_generator = random_generator
     # This is a nuisance, but we may need to pad extra dimensions to either X_l or X_u
     # in case the highest feature indices appear only in one of the two data matrices
     if X_l.shape[1] > X_u.shape[1]:
         X_u = sparse.hstack([X_u, sparse.coo_matrix((X_u.shape[0], X_l.shape[1] - X_u.shape[1]))])
     elif X_l.shape[1] < X_u.shape[1]:
         X_l = sparse.hstack([X_l, sparse.coo_matrix((X_l.shape[0], X_u.shape[1] - X_l.shape[1]))])
     # We vertically stack the data matrices into one big matrix
     X = sparse.vstack([X_l, X_u])
     self.__size_l, self.__size_u, self.__size_n = X_l.shape[0], X_u.shape[0], X_l.shape[0]+ X_u.shape[0]
     x = arr.array('i')
     for l in L_l:
         x.append(int(l))
     self.__YL = mat(x, dtype=np.float64)
     self.__YL = self.__YL.transpose()
     self.__setParameters( ** kw)
     self.__kw = kw
     self.X_l = X_l.tocsr()
     self.X_u = X_u.tocsr()
     self.X = X.tocsr()
     # compute mean of unlabeled patterns
     self.__mean_u = self.X_u.mean(axis=0)
     self.X_u_T = X_u.tocsc().T
     self.X_l_T = X_l.tocsc().T
     self.X_T = X.tocsc().T
Example #14
def get_data():
    tickets_file = csv.reader(open('2012-10-09.close.csv'))

    time_format = '%Y-%m-%d %H:%M:%S'
    tickets = []
    times = []
    reporters = []
    subjects = []

    for number, created, changetime, closetime, reporter, summary, status, \
            owner, tkt_type, component, description in tickets_file:
        row = []
        created = dt.datetime.strptime(created, time_format)
        closetime = dt.datetime.strptime(closetime, time_format)
        changetime = dt.datetime.strptime(changetime, time_format)
        time_to_fix = closetime - created

        row.append(float(number))
        row.append(float(time.mktime(created.timetuple())))

        tickets.append(row)
        times.append(total_seconds(time_to_fix))
        reporters.append(reporter)
        subjects.append(summary)

    scaler = preprocessing.Scaler().fit(np.array(tickets))
    tickets = sp.csr_matrix(scaler.transform(tickets))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
                CountVectorizer().fit_transform(reporters))))
    tickets = sp.hstack((tickets, TfidfTransformer().fit_transform(
                CountVectorizer(ngram_range=(1,3)).fit_transform(subjects))))

    scaler = preprocessing.Scaler(with_mean=False).fit(tickets)
    tickets = scaler.transform(tickets)
    return tickets, times
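A minimal runnable version of the feature assembly above: scale the numeric columns, tf-idf the text, and hstack the two sparse blocks. It uses the current StandardScaler name rather than the old preprocessing.Scaler, and the ticket data is invented.

import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler

numeric = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])   # e.g. ticket number, creation time
subjects = ["fix the build", "update the docs", "fix flaky tests"]

num_csr = sp.csr_matrix(StandardScaler().fit_transform(numeric))
txt_csr = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(subjects))
features = sp.hstack((num_csr, txt_csr))
print(features.shape)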
Example #15
    def _get_aug_mat(self, k, j):
        """
        Generate the matrix [[A, E], [0, A]] where
            A is the overall dynamics generator
            E is the control dynamics generator
        for a given timeslot and control
        returns this augmented matrix
        """
        dyn = self.parent
        dg = dyn._get_phased_dyn_gen(k)

        if dyn.oper_dtype == Qobj:
            A = dg.data*dyn.tau[k]
            E = dyn._get_phased_ctrl_dyn_gen(k, j).data*dyn.tau[k]
            Z = sp.csr_matrix(dg.data.shape)
            aug = Qobj(sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])]))
        elif dyn.oper_dtype == np.ndarray:
            A = dg*dyn.tau[k]
            E = dyn._get_phased_ctrl_dyn_gen(k, j)*dyn.tau[k]
            Z = np.zeros(dg.shape)
            aug = np.vstack([np.hstack([A, E]), np.hstack([Z, A])])
        else:
            A = dg*dyn.tau[k]
            E = dyn._get_phased_ctrl_dyn_gen(k, j)*dyn.tau[k]
            Z = dg*0.0
            aug = sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])])
        return aug
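The augmented block matrix [[A, E], [0, A]] from the docstring can be assembled directly with scipy.sparse; A and E below are small made-up generators.

import numpy as np
import scipy.sparse as sp

A = sp.csr_matrix(np.array([[0.0, 1.0], [-1.0, 0.0]]))   # overall dynamics generator (example values)
E = sp.csr_matrix(np.array([[0.0, 0.5], [0.5, 0.0]]))    # control dynamics generator (example values)
Z = sp.csr_matrix(A.shape)                               # zero block

aug = sp.vstack([sp.hstack([A, E]), sp.hstack([Z, A])])
print(aug.toarray())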
Example #16
def pywfmLocalModel(trainFeature, testFeature, trainLabel, testLabel, trainIndex, testIndex, fm, cvIndex):


	print 'run local: folds: ' + str(cvIndex) 

	trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
	encoder = OneHotEncoder(n_values=[value1, value2])
	trainIndex_encode = encoder.fit_transform(trainIndex)
	testIndex_encode = encoder.transform(testIndex)

	trainFeature = hstack((trainIndex_encode, trainFeature))
	testFeature = hstack((testIndex_encode, testFeature))

	'''
	for i in range(len(trainLabel)):
		if i == 0:
			trainLabel[i] = -1
	for i in range(len(testLabel)):
		if i == 0:
			testLabel[i] = -1
	'''
	model = fm.run(trainIndex_encode, trainLabel, testIndex_encode, testLabel)

	predict = model.predictions

	predict = np.array(predict, np.float)

	predict = (predict - np.min(predict))/(np.max(predict) - np.min(predict))


	return predict
Example #17
File: nilss.py Project: qiqi/nilss
def LSS_KKT(R, D):
    R, D = array(R), array(D)
    assert R.ndim == 3
    assert R.shape[1] == R.shape[2]
    N, m = R.shape[:2]

    bigR = sparse.bsr_matrix((R, r_[:N], r_[:N+1]), \
                          shape=(N*m, (N+1)*m))

    I = array([eye(m)] * N)
    bigI = sparse.bsr_matrix((I, r_[1:N+1], r_[:N+1]), \
                          shape=(N*m, (N+1)*m))

    bigL = bigI - bigR

    assert D.shape == (N+1, m, m)
    bigD = sparse.bsr_matrix((D, r_[:N+1], r_[:N+2]), \
                          shape=((N+1)*m, (N+1)*m))

    O = zeros([N, m, m])
    bigO = sparse.bsr_matrix((O, r_[:N], r_[:N+1]), \
                          shape=(N*m, N*m))

    return sparse.vstack([sparse.hstack([bigD, bigL.T]),
                          sparse.hstack([bigL, bigO])])
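The bsr_matrix calls above place one m x m block per block-row at a chosen block-column; a small sketch of that (data, indices, indptr) layout with made-up blocks:

import numpy as np
from numpy import r_
from scipy import sparse

N, m = 3, 2
R = np.stack([(i + 1) * np.eye(m) for i in range(N)])        # N blocks of size m x m
bigR = sparse.bsr_matrix((R, r_[:N], r_[:N + 1]), shape=(N * m, (N + 1) * m))
print(bigR.toarray())                                        # block i sits in block-column i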
Example #18
def _df_append_right(self, df_or_s):  
  start('appending to the right.  note, this is a destructive operation')
  if (type(df_or_s) is sparse.coo.coo_matrix):
    self_sparse = None
    for c in self.columns:
      debug('\tappending column: ' + c)
      c_coo = sparse.coo_matrix(self[[c]])
      self.drop([c], 1, inplace=True)
      gc.collect()
      if self_sparse is None: self_sparse = c_coo
      else: self_sparse = sparse.hstack((self_sparse, c_coo)) 
    self_sparse = sparse.hstack((self_sparse, df_or_s))
    stop('done appending to the right')
    return self_sparse
  elif _is_sparse(df_or_s) and not _is_sparse(self):
    debug('converting data frame to a sparse frame')
    self = self.to_sparse(fill_value=0)
  if type(df_or_s) is pd.Series: self[df_or_s.name] = df_or_s.values
  else: 
    if type(df_or_s) is pd.DataFrame:
      columns = df_or_s.columns
      right = df_or_s.values
    else:
      columns = [`i` + '_2' for i in range(df_or_s.shape[1])]
      right = df_or_s
Example #19
def pywfmPredictModel(trainFeature, testFeature, trainLabel, trainIndex, testIndex, fm):


	print 'run online!'

	trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
	encoder = OneHotEncoder(n_values=[value1, value2])
	trainIndex_encode = encoder.fit_transform(trainIndex)
	testIndex_encode = encoder.transform(testIndex)

	trainFeature = hstack((trainIndex_encode, trainFeature))
	testFeature = hstack((testIndex_encode, testFeature))

	#print trainFeature

	'''
	for i in range(len(trainLabel)):
		if i == 0:
			trainLabel[i] = -1
	for i in range(len(testLabel)):
		if i == 0:
			testLabel[i] = -1
	'''
	testLabel = np.zeros((testFeature.shape[0]))
	model = fm.run(trainFeature, trainLabel, testFeature, testLabel)

	predict = model.predictions

	predict = np.array(predict, np.float)
	print np.max(predict), np.min(predict)

	#predict = (predict - np.min(predict))/(np.max(predict) - np.min(predict))


	return predict
Example #20
def extract_features(data, train_size, with_stemmer, tfidf):
    text_vector, post_time_vector, posting_user_vector = preprocess_posts(data, with_stemmer=with_stemmer)
    label_vector = extract_labels(data)

    text_vector_train, text_vector_test, posting_user_vector_train, posting_user_vector_test, post_time_vector_train, post_time_vector_test, label_vector_train, label_vector_test = \
        train_test_split(text_vector, posting_user_vector, post_time_vector, label_vector, train_size=train_size)

    print "Extracting features..."
    vectorizer = _vectorizer(tfidf)
    le = preprocessing.LabelEncoder()
    le.fit(posting_user_vector)
    from scipy.sparse import csr_matrix, hstack

    def prepare_matrix(vector):
        return np.transpose(csr_matrix(vector))
    features_vector_train = hstack((prepare_matrix(le.transform(posting_user_vector_train)),
                             prepare_matrix(post_time_vector_train),
                             vectorizer.fit_transform(text_vector_train)))
    print vectorizer.get_feature_names()
    features_vector_test = hstack((prepare_matrix(le.transform(posting_user_vector_test)),
                            prepare_matrix(post_time_vector_test),
                            vectorizer.transform(text_vector_test)))

    scaler = preprocessing.StandardScaler(with_mean=False).fit(features_vector_train)
    features_vector_train = scaler.transform(features_vector_train)
    features_vector_test = scaler.transform(features_vector_test)
    return features_vector_train, features_vector_test, label_vector_train, label_vector_test
Example #21
def plain_impeuler(Mc,Ac,BTc,Bc,fvbc,fpbc,vp_init,PrP,TsP):

	Nts, t0, tE, dt, Nv, Np = init_time_stepping(PrP,TsP)

	v, p   = expand_vp_dolfunc(PrP, vp=vp_init, vc=None, pc=None)

	tcur = t0

	TsP.UpFiles.u_file << v, tcur
	TsP.UpFiles.p_file << p, tcur

	IterAv = sps.hstack([Mc+dt*Ac,-dt*BTc])
	IterAp = sps.hstack([-dt*Bc,sps.csr_matrix((Np,Np))])
	IterA  = sps.vstack([IterAv,IterAp]).todense()[:-1,:-1]

	vp_old = vp_init
	for etap in range(1,11):
		for i in range(Nts/10):
			tcur = tcur + dt

			Iterrhs = np.vstack([Mc*vp_old[:Nv,],np.zeros((Np-1,1))]) \
					+ dt*np.vstack([fvbc,fpbc[:-1,]])
			vp_new = np.linalg.solve(IterA,Iterrhs)
			vp_old = vp_new


		print '%d of %d time steps completed ' % (etap*Nts/10,Nts) 
		v, p = expand_vp_dolfunc(PrP, vp=vp_new, vc=None, pc=None)

		TsP.UpFiles.u_file << v, tcur
		TsP.UpFiles.p_file << p, tcur
		
	return
Example #22
    def checkZeroEig(self,X,vecList):
        
        # eigenvalues and eigenvectors
        LDAeigens, LDAeigenvecs = np.linalg.eig(X)
        LDAeigens, LDAeigenvecs = abs(LDAeigens), abs(LDAeigenvecs)

        # remove zero eigenvalues, corresponding eigenvectors, vecList-cols and X-cols
        marked = []
        for i in range(self.m):
            if LDAeigens[i] == 0:
                marked.append(i)
        LDAeigens = np.delete(LDAeigens,marked,0)
        LDAeigenvecs = np.delete(LDAeigenvecs,marked,0)
        if len(marked)>0:
            print '    empty eigenvalues:'
            print marked
            stackvec = sparse.csc_matrix((vecList.shape[0],1))
            stackX = sparse.csc_matrix((vecList.shape[0],1))
            for col in range(vecList.shape[1]):
                if col in marked:
                    pass
                else:
                    stackvec = sparse.hstack((stackvec, vecList.getcol(col)))
                    stackX = sparse.hstack((stackX, X.getcol(col)))
            vecList,X = stackvec,stackX
        
        return  vecList,X
Example #23
def run(input_train, input_test, output_name):
    """
    Takes a file path as input, a file path as output, and produces a sorted csv of
    item IDs for Kaggle submission
    -------
    input_train : 'full path of the training file'
    input_test : 'full path of the testing file'
    output_name : 'full path of the output file'
    """

    data = pd.read_table(input_train)
    test = pd.read_table(input_test)
    testItemIds = test.itemid
    response = data.is_blocked
    dummies = sparse.csc_matrix(pd.get_dummies(data.subcategory))
    pretestdummies = pd.get_dummies(test.subcategory)
    testdummies = sparse.csc_matrix(pretestdummies.drop(['Растения', 'Товары для компьютера'],axis=1))
    words = np.array(data.description,str)
    testwords = np.array(test.description,str)
    del data, test
    vect = text.CountVectorizer(decode_error = u'ignore', strip_accents='unicode', ngram_range=(1,2))
    corpus = np.concatenate((words, testwords))
    vect.fit(corpus)
    counts = vect.transform(words)
    features = sparse.hstack((dummies,counts))
    clf = LinearSVC()
    clf.fit(features, response)
    testcounts = vect.transform(testwords)
    testFeatures = sparse.hstack((testdummies,testcounts))
    predicted_scores = clf.decision_function(testFeatures)  # LinearSVC has no predict_proba; use the decision margin as the ranking score
    f = open(output_name,'w')
    f.write("id\n") 
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
Example #24
    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix

        Returns
        -------
        :mod:`scipy.sparse` matrix of `float in [0.0, 1.0]`, shape=(n_samples, n_labels)
            matrix with label assignment probabilities
        """
        X_extended = self._ensure_input_format(
            X, sparse_format='csc', enforce_sparse=True)

        results = []
        for label in self._order():
            prediction = self.classifiers_[label].predict(
                self._ensure_input_format(X_extended))

            prediction = self._ensure_output_format(
                prediction, sparse_format='csc', enforce_sparse=True)

            prediction_proba = self.classifiers_[label].predict_proba(
                self._ensure_input_format(X_extended))

            prediction_proba = self._ensure_output_format(
                prediction_proba, sparse_format='csc', enforce_sparse=True)[:, 1]

            X_extended = hstack([X_extended, prediction]).tocsc()
            results.append(prediction_proba)

        return hstack(results)
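A stripped-down sketch of the classifier-chain pattern implemented above: each classifier is trained on X plus the earlier label columns, and at prediction time each predicted column is hstacked onto the input before the next classifier runs. The data and base estimator here are invented.

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = csr_matrix(rng.rand(40, 5))
Y = np.column_stack([np.arange(40) % 2, (np.arange(40) // 2) % 2])   # two binary labels

# training: each classifier in the chain sees X plus the true earlier label columns
classifiers, X_ext = [], X
for j in range(Y.shape[1]):
    clf = LogisticRegression().fit(X_ext, Y[:, j])
    classifiers.append(clf)
    X_ext = hstack([X_ext, csr_matrix(Y[:, [j]])]).tocsc()

# prediction: append each predicted label column before calling the next classifier
X_ext, probabilities = X, []
for clf in classifiers:
    probabilities.append(clf.predict_proba(X_ext)[:, 1])
    pred = clf.predict(X_ext).reshape(-1, 1)
    X_ext = hstack([X_ext, csr_matrix(pred)]).tocsc()

print(np.column_stack(probabilities).shape)                          # (40, 2)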
Example #25
 def go(self,K=100, Y=6, DI=500, minFreq=5):
     print self._sourceDomain + " -> " + self._targetDomain
     domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI,minFreq)
     numDomainIndep = len(domainIndependentFeatures)
     numDomainDep = len(domainDependentFeatures)
     #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
     #print "creating cooccurrenceMatrix..."
     a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
     #print "creating SquareAffinityMatrix..."
     a = self._createSquareAffinityMatrix(a)
     #print "creating DiagonalMatrix..."
     b = self._createDiagonalMatrix(a)
     #print "multiplying..." 
     c = b.dot(a)
     del a
     c = c.dot(b)
     del b
     #print "calculating eigenvalues and eigenvectors"
     eigenValues,eigenVectors = eigsh(c, k=K, which="LA")
     del c
     #print "building document vectors..."
     documentVectorsTraining,classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._sourceDomain)
     documentVectorsTesting,classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures,self._targetDomain)
     #print "training and testing..."
     U  = [eigenVectors[:,x].reshape(np.size(eigenVectors,0),1) for x in eigenValues.argsort()[::-1]]
     U = np.concatenate(U,axis=1)[:numDomainDep]
     U = sparse.csr_matrix(U)
     clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
     trainingVectors = [sparse.hstack((documentVectorsTraining[x][0],documentVectorsTraining[x][1],clustering[x])) for x in range(np.size(documentVectorsTraining,axis=0))]
     clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
     testVectors = [sparse.hstack((documentVectorsTesting[x][0],documentVectorsTesting[x][1],clustering[x])) for x in range(np.size(documentVectorsTesting,axis=0))]
     self._trainClassifier(trainingVectors, classificationsTraining)
     print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" % (self._testClassifier(testVectors,classificationsTesting)*100,K,DI,Y,minFreq)
Example #26
    def solve_system(self, rhs, factor, u0, t):
        """
        Simple linear solver for (I-dtA)u = rhs

        Args:
            rhs: right-hand side for the nonlinear system
            factor: abbrev. for the node-to-node stepsize (or any other factor required)
            u0: initial guess for the iterative solver (not used here so far)
            t: current time (e.g. for time-dependent BCs)

        Returns:
            solution as mesh
        """

        M1 = sp.hstack((sp.eye(self.nvars[1]), -factor * self.A))
        M2 = sp.hstack((-factor * self.A, sp.eye(self.nvars[1])))
        M = sp.vstack((M1, M2))

        b = np.concatenate((rhs.values[0, :], rhs.values[1, :]))

        sol = LA.spsolve(M, b)

        me = mesh(self.nvars)
        me.values[0, :], me.values[1, :] = np.split(sol, 2)

        return me
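The 2x2 block system assembled above can be reproduced on a small made-up operator A; spsolve then returns the stacked solution, which np.split separates back into the two components.

import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as LA

n = 4
A = sp.diags([1.0, -2.0, 1.0], [-1, 0, 1], shape=(n, n))   # small example operator
factor = 0.1

M1 = sp.hstack((sp.eye(n), -factor * A))
M2 = sp.hstack((-factor * A, sp.eye(n)))
M = sp.vstack((M1, M2)).tocsc()

sol = LA.spsolve(M, np.ones(2 * n))
u1, u2 = np.split(sol, 2)
print(u1.shape, u2.shape)                                   # (4,) (4,)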
Example #27
    def dSdm(self):

        if getattr(self, '_dSdm', None) is None:

            if self.model is None:
                raise Exception('Requires a chi')

            nC = int(len(self.model)/3)

            m_xyz = self.chiMap * matutils.spherical2cartesian(self.model.reshape((nC, 3), order='F'))

            nC = int(m_xyz.shape[0]/3.)
            m_atp = matutils.cartesian2spherical(m_xyz.reshape((nC, 3), order='F'))

            a = m_atp[:nC]
            t = m_atp[nC:2*nC]
            p = m_atp[2*nC:]

            Sx = sp.hstack([sp.diags(np.cos(t)*np.cos(p), 0),
                            sp.diags(-a*np.sin(t)*np.cos(p), 0),
                            sp.diags(-a*np.cos(t)*np.sin(p), 0)])

            Sy = sp.hstack([sp.diags(np.cos(t)*np.sin(p), 0),
                            sp.diags(-a*np.sin(t)*np.sin(p), 0),
                            sp.diags(a*np.cos(t)*np.cos(p), 0)])

            Sz = sp.hstack([sp.diags(np.sin(t), 0),
                            sp.diags(a*np.cos(t), 0),
                            sp.csr_matrix((nC, nC))])

            self._dSdm = sp.vstack([Sx, Sy, Sz])

        return self._dSdm
Example #28
def combine_matrix():
    
    X000 = [sio.loadmat(filein_name[:-4] + '0X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '1X000.mat')['X000'],
            sio.loadmat(filein_name[:-4] + '2X000.mat')['X000']]

    X001 = [sio.loadmat(filein_name[:-4] + '0X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '1X001.mat')['X001'],
            sio.loadmat(filein_name[:-4] + '2X001.mat')['X001']]

    X010 = [sio.loadmat(filein_name[:-4] + '0X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '1X010.mat')['X010'],
            sio.loadmat(filein_name[:-4] + '2X010.mat')['X010']]

    X100 = [sio.loadmat(filein_name[:-4] + '0X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '1X100.mat')['X100'],
            sio.loadmat(filein_name[:-4] + '2X100.mat')['X100']]
    

    X_000 = sp.vstack([X000[0],X000[1],X000[2]])
    X_001 = sp.vstack([X001[0],X001[1],X001[2]])
    X_010 = sp.vstack([X010[0],X010[1],X010[2]])
    X_100 = sp.vstack([X100[0],X100[1],X100[2]])
    print(X_000.shape)

    
    X_model_100 = sp.hstack([X_000,X_100])
    sio.savemat(filein_name[:-4] + 'X100-model.mat', {'X100':X_model_100})

    X_model_010 = sp.hstack([X_000,X_010])
    sio.savemat(filein_name[:-4] + 'X010-model.mat', {'X010':X_model_010})

    X_model_001 = sp.hstack([X_000,X_001])
    sio.savemat(filein_name[:-4] + 'X001-model.mat', {'X001':X_model_001})
Example #29
def sentiment_kaggle_dataset():
    train_x_1, test_x_1 = senti_lexicon_vectorizor(data=SST_KAGGLE, tfidf=True)
    train_x_2, test_x_2 = senti_wordnet_vectorizer(data=SST_KAGGLE, tfidf=True)
    train_x = sparse.hstack((train_x_1, train_x_2))
    test_x = sparse.hstack((test_x_1, test_x_2))
    _, train_y, _ = read_sst_kaggle_pickle()
    return train_x, train_y, test_x
Example #30
    def test_sadpnt_smw_krypy(self):
        """check the sadpnt solver with krypy"""

        umat, vmat, k, = self.U, self.V, self.k

        # self.Jt = self.J.T
        # check the formula
        AuvInvZ = lau.solve_sadpnt_smw(amat=self.A, jmat=self.J, rhsv=self.Z,
                                       jmatT=self.Jt, umat=self.U, vmat=self.V,
                                       krylov=True, krpslvprms=self.krpslvprms)

        sysm1 = sps.hstack([self.A, self.Jt], format='csr')
        sysm2 = sps.hstack([self.J, sps.csr_matrix((k, k))], format='csr')
        mata = sps.vstack([sysm1, sysm2], format='csr')

        umate = np.vstack([umat, np.zeros((k, umat.shape[1]))])
        vmate = np.hstack([vmat, np.zeros((vmat.shape[0], k))])
        ze = np.vstack([self.Z, np.zeros((k, self.Z.shape[1]))])

        AAinvZ = mata * AuvInvZ - np.dot(umate, np.dot(vmate, AuvInvZ))

        # likely to fail because of ill conditioned rand mats
        print np.linalg.norm(AAinvZ - ze)
        self.assertTrue(np.allclose(AAinvZ, ze),
                        msg='likely to fail because of ill cond')
Example #31
# same concept as above, but at the character level
char_vectorizer = TfidfVectorizer(sublinear_tf=True,
                                  strip_accents='unicode',
                                  analyzer='char',
                                  stop_words='english',
                                  ngram_range=(1, 6),
                                  max_features=100000)

# fit the vectorizer to all text (so that all ngrams are observed)
# generate testing and training features using the fitted vectorizer
char_vectorizer.fit(comb_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# generate training and testing features using word and char features
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

# empty scores list and predictions dataframe
scores = []
pred = pd.DataFrame.from_dict({'id': test['id']})

# loop through each class, train the ridge model, and make predictions
for class_name in classes:
    train_target = train[class_name]
    classifier = Ridge(alpha=20,
                       fit_intercept=True,
                       solver='auto',
                       max_iter=100,
                       random_state=0,
                       tol=0.0025)
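A compact, runnable version of the word-plus-character feature construction used above, with invented text; both vectorizers are fit on the combined corpus so every n-gram is observed before transforming train and test.

import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

train_text = ["the build is broken", "great release notes"]
test_text = ["broken tests again"]

word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1))
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

comb_text = train_text + test_text
word_vectorizer.fit(comb_text)
char_vectorizer.fit(comb_text)

train_features = sp.hstack([char_vectorizer.transform(train_text),
                            word_vectorizer.transform(train_text)])
test_features = sp.hstack([char_vectorizer.transform(test_text),
                           word_vectorizer.transform(test_text)])
print(train_features.shape, test_features.shape)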
Example #32
    def run(self):
        """ Solves a state estimation problem.
        """
        case = self.case
        baseMVA = case.base_mva
        buses = self.case.connected_buses
        branches = case.online_branches
        generators = case.online_generators
        meas = self.measurements
        # Update indices.
        self.case.index_buses()
        self.case.index_branches()

        # Index buses.
        #        ref = [b._i for b in buses if b.type == REFERENCE]
        pv = [b._i for b in buses if b.type == PV]
        pq = [b._i for b in buses if b.type == PQ]

        # Build admittance matrices.
        Ybus, Yf, Yt = case.Y

        # Prepare initial guess.
        V0 = self.getV0(self.v_mag_guess, buses, generators)

        # Start the clock.
        t0 = time()

        # Initialise SE.
        converged = False
        i = 0
        V = V0
        Va = angle(V0)
        Vm = abs(V0)

        nb = Ybus.shape[0]
        f = [b.from_bus._i for b in branches]
        t = [b.to_bus._i for b in branches]
        nonref = pv + pq

        # Form measurement vector.
        z = array([m.value for m in meas])

        # Form measurement index vectors.
        idx_zPf = [m.b_or_l._i for m in meas if m.type == PF]
        idx_zPt = [m.b_or_l._i for m in meas if m.type == PT]
        idx_zQf = [m.b_or_l._i for m in meas if m.type == QF]
        idx_zQt = [m.b_or_l._i for m in meas if m.type == QT]
        idx_zPg = [m.b_or_l._i for m in meas if m.type == PG]
        idx_zQg = [m.b_or_l._i for m in meas if m.type == QG]
        idx_zVm = [m.b_or_l._i for m in meas if m.type == VM]
        idx_zVa = [m.b_or_l._i for m in meas if m.type == VA]

        def col(seq):
            return [[k] for k in seq]

        # Create inverse of covariance matrix with all measurements.
#        full_scale = 30
#        sigma = [
#            0.02 * abs(Sf)      + 0.0052 * full_scale * ones(nbr,1),
#            0.02 * abs(St)      + 0.0052 * full_scale * ones(nbr,1),
#            0.02 * abs(Sbus)    + 0.0052 * full_scale * ones(nb,1),
#            0.2 * pi/180 * 3*ones(nb,1),
#            0.02 * abs(Sf)      + 0.0052 * full_scale * ones(nbr,1),
#            0.02 * abs(St)      + 0.0052 * full_scale * ones(nbr,1),
#            0.02 * abs(Sbus)    + 0.0052 * full_scale * ones(nb,1),
#            0.02 * abs(V0)      + 0.0052 * 1.1 * ones(nb,1),
#        ] ./ 3

# Get R inverse matrix.
        sigma_vector = r_[self.sigma[0] * ones(len(idx_zPf)),
                          self.sigma[1] * ones(len(idx_zPt)),
                          self.sigma[2] * ones(len(idx_zQf)),
                          self.sigma[3] * ones(len(idx_zQt)),
                          self.sigma[4] * ones(len(idx_zPg)),
                          self.sigma[5] * ones(len(idx_zQg)),
                          self.sigma[6] * ones(len(idx_zVm)),
                          self.sigma[7] * ones(len(idx_zVa))]
        sigma_squared = sigma_vector**2

        rsig = range(len(sigma_squared))
        Rinv = csr_matrix((1.0 / sigma_squared, (rsig, rsig)))

        # Do Newton iterations.
        while (not converged) and (i < self.max_iter):
            i += 1

            # Compute estimated measurement.
            Sfe = V[f] * conj(Yf * V)
            Ste = V[t] * conj(Yt * V)
            # Compute net injection at generator buses.
            gbus = [g.bus._i for g in generators]
            Sgbus = V[gbus] * conj(Ybus[gbus, :] * V)
            # inj S + local Sd
            Sd = array([complex(b.p_demand, b.q_demand) for b in buses])
            Sgen = (Sgbus * baseMVA + Sd) / baseMVA

            z_est = r_[Sfe[idx_zPf].real, Ste[idx_zPt].real, Sfe[idx_zQf].imag,
                       Ste[idx_zQt].imag, Sgen[idx_zPg].real,
                       Sgen[idx_zQg].imag,
                       abs(V[idx_zVm]),
                       angle(V[idx_zVa])]

            # Get H matrix.
            dSbus_dVm, dSbus_dVa = case.dSbus_dV(Ybus, V)
            dSf_dVa, dSf_dVm, dSt_dVa, dSt_dVm, _, _ = case.dSbr_dV(Yf, Yt, V)

            # Get sub-matrix of H relating to line flow.
            dPF_dVa = dSf_dVa.real  # from end
            dQF_dVa = dSf_dVa.imag
            dPF_dVm = dSf_dVm.real
            dQF_dVm = dSf_dVm.imag
            dPT_dVa = dSt_dVa.real  # to end
            dQT_dVa = dSt_dVa.imag
            dPT_dVm = dSt_dVm.real
            dQT_dVm = dSt_dVm.imag
            # Get sub-matrix of H relating to generator output.
            dPG_dVa = dSbus_dVa[gbus, :].real
            dQG_dVa = dSbus_dVa[gbus, :].imag
            dPG_dVm = dSbus_dVm[gbus, :].real
            dQG_dVm = dSbus_dVm[gbus, :].imag
            # Get sub-matrix of H relating to voltage angle.
            dVa_dVa = csr_matrix((ones(nb), (range(nb), range(nb))))
            dVa_dVm = csr_matrix((nb, nb))
            # Get sub-matrix of H relating to voltage magnitude.
            dVm_dVa = csr_matrix((nb, nb))
            dVm_dVm = csr_matrix((ones(nb), (range(nb), range(nb))))

            h = [(col(idx_zPf), dPF_dVa, dPF_dVm),
                 (col(idx_zQf), dQF_dVa, dQF_dVm),
                 (col(idx_zPt), dPT_dVa, dPT_dVm),
                 (col(idx_zQt), dQT_dVa, dQT_dVm),
                 (col(idx_zPg), dPG_dVa, dPG_dVm),
                 (col(idx_zQg), dQG_dVa, dQG_dVm),
                 (col(idx_zVm), dVm_dVa, dVm_dVm),
                 (col(idx_zVa), dVa_dVa, dVa_dVm)]

            H = vstack([
                hstack([dVa[idx, nonref], dVm[idx, nonref]])
                for idx, dVa, dVm in h if len(idx) > 0
            ])

            # Compute update step.
            J = H.T * Rinv * H
            F = H.T * Rinv * (z - z_est)  # evalute F(x)
            dx = spsolve(J, F)

            # Check for convergence.
            normF = linalg.norm(F, Inf)

            if self.verbose:
                logger.info("Iteration [%d]: Norm of mismatch: %.3f" %
                            (i, normF))
            if normF < self.tolerance:
                converged = True

            # Update voltage.
            npvpq = len(nonref)

            Va[nonref] = Va[nonref] + dx[:npvpq]
            Vm[nonref] = Vm[nonref] + dx[npvpq:2 * npvpq]

            V = Vm * exp(1j * Va)
            Va = angle(V)
            Vm = abs(V)

        # Weighted sum squares of error.
        error_sqrsum = sum((z - z_est)**2 / sigma_squared)

        # Update case with solution.
        case.pf_solution(Ybus, Yf, Yt, V)

        # Stop the clock.
        elapsed = time() - t0

        if self.verbose and converged:
            print "State estimation converged in: %.3fs (%d iterations)" % \
            (elapsed, i)
#            self.output_solution(sys.stdout, z, z_est)

        solution = {
            "V": V,
            "converged": converged,
            "iterations": i,
            "z": z,
            "z_est": z_est,
            "error_sqrsum": error_sqrsum,
            "elapsed": elapsed
        }

        return solution
Example #33
#%%

scaler = StandardScaler()
scaler.fit(new_feat_train['year_month'].values.reshape(-1, 1))

new_feat_train['year_month_scaled'] = scaler.transform(
    new_feat_train['year_month'].values.reshape(-1, 1))
new_feat_test['year_month_scaled'] = scaler.transform(
    new_feat_test['year_month'].values.reshape(-1, 1))

#%%
# Compute ROC_AUC on the hold-out set

X_train_sparse_new = csr_matrix(
    hstack([
        X_train_sparse,
        new_feat_train['year_month_scaled'].values.reshape(-1, 1)
    ]))
get_auc_lr_valid(X_train_sparse_new, y_train)

#%%
# Month the session started
new_feat_train['start_month'] = train_df['time1'].apply(lambda x: x.month)
new_feat_test['start_month'] = test_df['time1'].apply(lambda x: x.month)

# Hour the session started
new_feat_train['start_hour'] = train_df['time1'].apply(lambda x: x.hour)
new_feat_test['start_hour'] = test_df['time1'].apply(lambda x: x.hour)

# Morning flag: start hour of 11 or earlier
new_feat_train['morning'] = new_feat_train['start_hour'] <= 11
new_feat_test['morning'] = new_feat_test['start_hour'] <= 11
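The pattern above, appending a scaled numeric column to an existing sparse matrix, as a self-contained sketch with made-up data:

import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler

X_train_sparse = sp.random(6, 4, density=0.5, format='csr', random_state=0)
year_month = np.array([201801, 201802, 201803, 201804, 201805, 201806], dtype=float)

scaled = StandardScaler().fit_transform(year_month.reshape(-1, 1))
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, scaled]))
print(X_train_sparse_new.shape)                              # (6, 5)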
Example #34
train_x = np.hstack((train_x, ct_trains))
test_x = np.hstack((test_x, ct_tests))


# One-hot encode the categorical features
enc = OneHotEncoder()

oc_encoder = OneHotEncoder()
print("onehot start")
f.write("onehot start")
f.flush()
for feature in tqdm(one_hot_feature):
    oc_encoder.fit(data[feature].values.reshape(-1, 1))
    train_a=oc_encoder.transform(train[feature].values.reshape(-1, 1))
    test_a = oc_encoder.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')
f.write("one-hot prepared !")
f.flush()

# Build the count feature vectors

ct_encoder = CountVectorizer(min_df=0.001, tokenizer = str.split)  # pass a function as the tokenizer
print("CV start")
f.write("CV start")
f.flush()
for feature in tqdm(vector_feature):
    ct_encoder.fit(data[feature])
    train_a = ct_encoder.transform(train[feature])
    test_a = ct_encoder.transform(test[feature])
Example #35
def undirected_grid_2d_bipartie_graph(m,
                                      n,
                                      d=1,
                                      r=1,
                                      centers_d=None,
                                      centers_s=None,
                                      num_of_centers_supply=None,
                                      num_of_centers_demand=None,
                                      l_norm=np.inf,
                                      plot=False,
                                      alpha=1.0):

    g = nx.empty_graph(0, None)
    rows = range(m)
    columns = range(n)

    # adding all the nodes
    g.add_nodes_from((i, j) for i in rows for j in columns)
    #
    # adding all the edges

    for k in range(d + 1):
        for l in range(d + 1):
            g.add_edges_from(((i, j), (i + l, j + k)) for i in rows
                             for j in columns
                             if i + l <= m - 1 and j + k <= n -
                             1 and norm(np.array([l, k]), l_norm) <= r)

    nodes = list(g.nodes())

    if centers_d is None:
        if num_of_centers_demand is None:
            num_of_centers_demand = int(0.5 * ((m * n)**0.5))
        centers_d = [((uniform(0.2 * m,
                               0.8 * m), uniform(0.2 * n,
                                                 0.8 * n)), uniform(0, 1))
                     for _ in range(num_of_centers_demand)]
    if centers_s is None:
        if num_of_centers_supply is None:
            num_of_centers_supply = int(0.5 * ((m * n)**0.5))
        centers_s = [((uniform(0.2 * m,
                               0.8 * m), uniform(0.2 * n,
                                                 0.8 * n)), uniform(0, 1))
                     for _ in range(num_of_centers_supply)]

    lamda_d_pdf = gaussian_pdf_2d(m, n, centers_d, normalize=True)
    lamda_d = np.array([lamda_d_pdf[node] for node in nodes])

    lamda_s_pdf = alpha * gaussian_pdf_2d(
        m, n, centers_s, normalize=True) + (1 - alpha) * lamda_d_pdf
    lamda_s = np.array([lamda_s_pdf[node] for node in nodes])

    lamda = np.concatenate((lamda_d, lamda_s))
    grid_adj_mat = nx.adjacency_matrix(g)
    layered_grid_adj_mat = sps.vstack(
        (sps.hstack((0 * sps.eye(m * n), grid_adj_mat)),
         sps.hstack((grid_adj_mat, 0 * sps.eye(m * n)))))
    nodes = dict(
        enumerate(
            list(zip(nodes, ['d'] * len(nodes))) +
            list(zip(nodes, ['s'] * len(nodes)))))

    workload_decomp = grid_workload_decomposition(lamda, layered_grid_adj_mat)
    max_workload = workload_decomp[0]['workload']

    lamda_s_pdf = lamda_s_pdf * max_workload
    lamda_s = lamda_s * max_workload

    for s in workload_decomp:
        workload_decomp[s][
            'workload'] = workload_decomp[s]['workload'] / max_workload

    supply_decomp = np.zeros((m, n))
    demand_decomp = np.zeros((m, n))
    for st in workload_decomp:
        wl = workload_decomp[st]['workload']
        for d in workload_decomp[st]['demnand_nodes']:
            demand_decomp[nodes[d][0]] = wl
        for s in workload_decomp[st]['supply_nodes']:
            supply_decomp[nodes[s][0]] = wl

    return dict(
        zip([
            'lamda_s', 'lamda_d', 'grid_adj_mat', 'nodes', 'lamda_d_pdf',
            'lamda_s_pdf', 'demand_decomp', 'supply_decomp', 'fifo_ct',
            'max_ent_workload'
        ], [
            lamda_s, lamda_d, grid_adj_mat, nodes, lamda_d_pdf, lamda_s_pdf,
            demand_decomp, supply_decomp
        ]))
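The layered bipartite adjacency built above has the block form [[0, A], [A, 0]]; a tiny sketch with a 3-node path graph's adjacency matrix:

import numpy as np
import scipy.sparse as sps

A = sps.csr_matrix(np.array([[0, 1, 0],
                             [1, 0, 1],
                             [0, 1, 0]], dtype=float))       # adjacency of a 3-node path graph

# demand nodes connect only to supply nodes and vice versa
layered = sps.vstack((sps.hstack((0 * sps.eye(3), A)),
                      sps.hstack((A, 0 * sps.eye(3)))))
print(layered.toarray())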
Example #36
             left_index=True,
             right_index=True).merge(gatest[['testrow']],
                                     how='left',
                                     left_index=True,
                                     right_index=True).reset_index())
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
                       shape=(gatrain.shape[0], nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
                       shape=(gatest.shape[0], nlabels))
print('Labels data:train shape{},test shape{}'.format(Xtr_label.shape,
                                                      Xte_label.shape))

#concatenate all features
Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr')
Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr')

scipy.io.mmwrite('Xtrain.mtx', Xtrain)
Xtrain = scipy.io.mmread("Xtrain.mtx")
scipy.io.mmwrite('Xtest.mtx', Xtest)
Xtest = scipy.io.mmread("Xtest.mtx")

#print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))
#
#cross validation
targetencoder = LabelEncoder().fit(gatrain.group)
y = targetencoder.transform(gatrain.group)  # encode the group label for every row of gatrain
nclasses = len(targetencoder.classes_)

Example #37
del data_postive
del data
gc.collect()
data_negative_x = data_negative[['creativeSize']]

## Sparse encoding of the negative samples
mprint('onehot_trans begin')
for feature in one_hot_feature:
    #for feature in one_hot_feature:

    enc = OneHotEncoder()
    tmp_enc = enc.fit_transform(data_negative[feature].values.reshape(-1, 1))
    #    enc.fit(data_negative[feature].values.reshape(-1, 1))
    mprint(enc.n_values_, 'feature:%s enc.n_values_' % (feature))
    #    tmp_enc=enc.transform(data_negative[feature].values.reshape(-1, 1))
    data_negative_x = sparse.hstack((data_negative_x, tmp_enc))

    del tmp_enc
    data_negative = data_negative.drop(feature, axis=1)

    gc.collect()
    mprint(mem_usage(data_negative),
           'mem_usage(data_negative) after onehot_trans %s' % (feature))

    mprint('feature:%s one-hot finished!' % (feature))

mprint('onehot_trans prepared !')

mprint('countvec_trans begin')

for feature in vector_feature:
Example #38
    def get_Le_Ln(self, S = 1, return_dxi_deta = False, return_sparse = False):
        """ 
        Calculate the matrix that produces the derivative in the 
        eastward and northward directions of a scalar field 
        defined on self

        set return_dxi_deta to True to return the matrices that 
        differentiate in cubed sphere coordinates instead of geo

        Parameters:
        -----------
        S: int, optional
            Stencil size. Default is 1, in which case derivatives will be calculated
            with a 3-point stencil. With S = 2, a 5-point stencil will be used. etc.
        return_dxi_deta: bool, optional
            Set to True if you want matrices that differentiate in the xi / eta 
            directions instead of east /  north
        return_sparse: bool, optional
            Set to True if you want scipy.sparse matrices instead of dense numpy arrays
        """


        dxi = self.dxi
        det = self.deta
        N = self.NL
        M = self.NW

        D_xi = {'rows':[], 'cols':[], 'elements':[]}
        D_et = {'rows':[], 'cols':[], 'elements':[]}

        # index arrays (0 to N, M)
        i_arr = np.arange(N)
        j_arr = np.arange(M)

        # meshgrid versions:
        ii, jj = np.meshgrid(i_arr, j_arr, indexing = 'xy')

        # inner grid points:
        points = np.r_[-S:S+1:1]
        coefficients = diffutils.stencil(points, order = 1)
        i_dx, j_dx = ii  [:, S:-S], jj  [:, S:-S]
        i_dy, j_dy = ii.T[:, S:-S], jj.T[:, S:-S]

        for ll in range(len(points)):
            D_et['rows']    .append(self._index(i_dx, j_dx             ))
            D_et['cols']    .append(self._index(i_dx + points[ll], j_dx))
            D_et['elements'].append(np.full(i_dx.size, coefficients[ll] / det))

            D_xi['rows']    .append(self._index(i_dy, j_dy             ))
            D_xi['cols']    .append(self._index(i_dy, j_dy + points[ll]))
            D_xi['elements'].append(np.full(i_dy.size, coefficients[ll] / dxi))

        # boundaries
        for kk in np.arange(0, S)[::-1]:

            # LEFT
            points = np.r_[-kk:S+1:1] 
            coefficients = diffutils.stencil(points, order = 1)
            i_dx, j_dx = ii  [:, kk], jj  [:, kk]
            i_dy, j_dy = ii.T[:, kk], jj.T[:, kk]

            for ll in range(len(points)):
                D_et['rows']    .append(self._index(i_dx, j_dx             ))
                D_et['cols']    .append(self._index(i_dx + points[ll], j_dx))
                D_et['elements'].append(np.full(i_dx.size, coefficients[ll] / det))

                D_xi['rows']    .append(self._index(i_dy, j_dy             ))
                D_xi['cols']    .append(self._index(i_dy, j_dy + points[ll]))
                D_xi['elements'].append(np.full(i_dy.size, coefficients[ll] / dxi))

            # RIGHT
            points = np.r_[-S:kk+1:1] 
            coefficients = diffutils.stencil(points, order = 1)
            i_dx, j_dx = ii  [:, -(kk + 1)], jj  [:, -(kk + 1)]
            i_dy, j_dy = ii.T[:, -(kk + 1)], jj.T[:, -(kk + 1)]

            for ll in range(len(points)):
                D_et['rows']    .append(self._index(i_dx, j_dx             ))
                D_et['cols']    .append(self._index(i_dx + points[ll], j_dx))
                D_et['elements'].append(np.full(i_dx.size, coefficients[ll] / det))

                D_xi['rows']    .append(self._index(i_dy, j_dy             ))
                D_xi['cols']    .append(self._index(i_dy, j_dy + points[ll]))
                D_xi['elements'].append(np.full(i_dy.size, coefficients[ll] / dxi))


        D_xi = {key:np.hstack(D_xi[key]) for key in D_xi.keys()}
        D_et = {key:np.hstack(D_et[key]) for key in D_et.keys()}

        D_xi = sparse.csc_matrix((D_xi['elements'], (D_xi['rows'], D_xi['cols'])), shape = (N * M, N * M))
        D_et = sparse.csc_matrix((D_et['elements'], (D_et['rows'], D_et['cols'])), shape = (N * M, N * M))

        if return_dxi_deta:
            if return_sparse:
                return D_xi, D_et
            else:
                return np.array(D_xi.todense()), np.array(D_et.todense())

        # convert to gradient components
        X = self.X.flatten().reshape((1, -1))
        Y = self.Y.flatten().reshape((1, -1))
        D = self.D.flatten().reshape((1, -1))
        C = self.C.flatten().reshape((1, -1))
        d = self.delta.flatten().reshape((1, -1))

        I = sparse.eye(self.size)

        # equation 21 of Ronchi et al.
        L_xi = (D_xi.multiply(D        ) + D_et.multiply(X * Y / D)) / self.R
        L_et = (D_xi.multiply(X * Y / C) + D_et.multiply(    C    )) / self.R
        dd = np.sqrt(d - 1)

        # conversion from xi/eta to geocentric east/west is accomplished through the
        # matrix in equation 14 of Ronchi et al. 
        # The elements of this matrix are:
        a00 =  D * X / dd 
        a01 = -D * Y / dd / np.sqrt(d) 
        a10 =  C * Y / dd 
        a11 =  C * X / dd / np.sqrt(d)        

        # The a matrix converts from local theta/phi to xi/eta. The elements of
        # the inverse are:
        det = a00*a11 - a01*a10
        b00 =  a11 /det 
        b01 = -a01 /det 
        b10 = -a10 /det 
        b11 =  a00 /det 

        # matrix that converts from xi/eta to local east/north
        Be_ = sparse.hstack((I.multiply(b00), I.multiply(b01)))
        Bn_ = sparse.hstack((I.multiply(b10), I.multiply(b11)))

        # Make rotation matrix from local east/north to geocentric east/south:
        R_l2g = self.projection.local2geo_enu_rotation(self.local_lon.flatten(), self.local_lat.flatten())
        r10 =  -R_l2g[:, 0, 0].reshape((1, -1))
        r11 =  -R_l2g[:, 0, 1].reshape((1, -1))
        r00 =   R_l2g[:, 1, 0].reshape((1, -1))
        r01 =   R_l2g[:, 1, 1].reshape((1, -1))
        Re = sparse.hstack((I.multiply(r00), I.multiply(r01)))
        Rn = sparse.hstack((I.multiply(r10), I.multiply(r11)))
        # where I switched the order of the rows and multiplied first row by -1
        # so that R acts on (south/east) instead of (east/north). 

        # combine all three operations: Differentiation of xi/eta, conversion to local, conversion to global
        L = sparse.vstack((Re, Rn)).dot(sparse.vstack((Be_, Bn_))).dot(sparse.vstack((L_xi, L_et)))

        # and return the upper and lower parts of L:
        Le, Ln = L[:self.size], L[self.size:]
        if return_sparse:
            return Le, Ln
        else:
            return np.array(Le.todense()), np.array(Ln.todense())
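The pattern above (I.multiply(b00), I.multiply(r00), etc.) stores a separate 2x2 matrix at every grid point as four diagonal blocks, so one sparse operator applies all the point-wise transforms at once. A minimal sketch of that idea, with made-up per-point coefficients standing in for the Ronchi et al. quantities (the names b00..b11 are reused only for readability):

import numpy as np
from scipy import sparse

n = 5                                       # number of grid points (hypothetical)
b00, b01 = np.random.rand(1, n), np.random.rand(1, n)
b10, b11 = np.random.rand(1, n), np.random.rand(1, n)

I = sparse.eye(n)
# stack the four diagonal blocks into one (2n x 2n) operator
B = sparse.vstack((sparse.hstack((I.multiply(b00), I.multiply(b01))),
                   sparse.hstack((I.multiply(b10), I.multiply(b11))))).tocsr()

# applying B to a stacked vector [u; v] applies each point's 2x2 matrix
# to (u_i, v_i) individually
u, v = np.random.rand(n), np.random.rand(n)
w = B.dot(np.concatenate((u, v)))
assert np.allclose(w[:n], b00.ravel() * u + b01.ravel() * v)
assert np.allclose(w[n:], b10.ravel() * u + b11.ravel() * v)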
Example #39
0
# Xts holds one hot encodings for each individual feature in memory
# speeding up feature selection
Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

print "Performing greedy feature selection..."
score_hist = []
N = 10
good_features = set([])
# Greedy feature selection loop
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    for f in range(len(Xts)):
        if f not in good_features:
            feats = list(good_features) + [f]
            Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
            score = cv_loop(Xt, y, model, N)
            scores.append((score, f))
            print "Feature: %i Mean AUC: %f" % (f, score)
    good_features.add(sorted(scores)[-1][1])
    score_hist.append(sorted(scores)[-1])
    print "Current features: %s" % sorted(list(good_features))

# Remove last added feature from good_features
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print "Selected features %s" % good_features
gf = open("feats" + submit, 'w')
print >> gf, good_features
gf.close()
print len(good_features), " features"
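The comments above describe greedy forward selection: each pass adds the single one-hot block that most improves cross-validated AUC and the loop stops when the score no longer increases. A compressed, self-contained sketch of the same idea, with cv_loop and the competition data replaced by scikit-learn equivalents and random toy data:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
X_all = rng.randint(0, 5, size=(200, 6))          # toy categorical data
y = rng.randint(0, 2, size=200)

# one-hot encode each column separately so blocks can be recombined cheaply
Xts = [OneHotEncoder().fit_transform(X_all[:, [i]]) for i in range(X_all.shape[1])]
model = LogisticRegression(max_iter=1000)

good, history = [], []
while len(history) < 2 or history[-1][0] > history[-2][0]:
    scores = []
    for f in range(len(Xts)):
        if f in good:
            continue
        Xt = sparse.hstack([Xts[j] for j in good + [f]]).tocsr()
        auc = cross_val_score(model, Xt, y, cv=5, scoring='roc_auc').mean()
        scores.append((auc, f))
    if not scores:                      # all features already selected
        break
    best = max(scores)
    good.append(best[1])
    history.append(best)

good.remove(history[-1][1])             # drop the feature that failed to improve the score
print(sorted(good))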
Example #40
0
    def divergence(self, S = 1, return_sparse = False):
        """ 
        Calculate the matrix that produces the divergence of a vector field

        The returned N x 2N matrix operates on a 1D array that represents a 
        vector field. The array must be of length 2N, where N is the number 
        of grid cells. The first N elements are the eastward components and 
        the last N are the northward components. 

        Note - this code is based on equations (12) and (23) of Ronchi et al. The
        'matrification' is explained in my regional data analysis document;
        it is not easy to understand from the code alone.

        Parameters:
        -----------
        S: int, optional
            Stencil size. Default is 1, in which case derivatives will be calculated
            with a 3-point stencil. With S = 2, a 5-point stencil will be used, and so on.
        return_sparse: bool, optional
            Set to True if you want scipy.sparse matrices instead of dense numpy arrays
        """


        # 1) construct matrix that operates on [[Vxi], [Veta]] to produce
        #    the divergence of the vector field V according to equation (23)
        #    of Ronchi et al. 
        # 2) construct matrix that converts from east/north to xi/eta 
        #    in local coords
        # 3) construct matrix that rotates from global to local coords
        # 4) combine all three matrices and return


        # matrices that calculate differentials
        L_xi, L_eta = self.get_Le_Ln(S = S, return_dxi_deta = True, return_sparse = True)

        # define some parameters that are needed 
        d   = self.delta.flatten().reshape((-1, 1))
        X   = self.X.flatten().reshape(    (-1, 1))
        Y   = self.Y.flatten().reshape(    (-1, 1))
        D   = self.D.flatten().reshape(    (-1, 1))
        C   = self.C.flatten().reshape(    (-1, 1))
        xi  = self.xi.flatten().reshape(   (-1, 1))
        eta = self.eta.flatten().reshape(  (-1, 1))
        R = self.R

        I = sparse.eye(xi.size)

        q1 = d / (R * D * C**2)
        q2 = -np.tan(xi ) / (R * D * C**2 * np.cos(xi )**2)
        p1 = d / (R * C * D**2)
        p2 = -np.tan(eta) / (R * C * D**2 * np.cos(eta)**2)

        # matrix that calculates the divergence with xi/eta components:
        L = sparse.hstack((L_xi.multiply(q1) + I.multiply(q2), L_eta.multiply(p1) + I.multiply(p2)))

        dd = np.sqrt(d - 1)
        aa = -D * Y / dd / np.sqrt(d)
        bb = -D * X / dd
        cc =  C * X / dd / np.sqrt(d)
        dd = -C * Y / dd

        # matrix that rotates from east/north to xi/eta:
        R = sparse.vstack((sparse.hstack((I.multiply(aa), I.multiply(bb))), 
                           sparse.hstack((I.multiply(cc), I.multiply(dd))))) 

        # Combine this with the rotation matrix from geocentric east/north to local east/north:
        R_l2g = self.projection.local2geo_enu_rotation(self.local_lon.flatten(), self.local_lat.flatten())
        R_g2l = np.swapaxes(R_l2g, 1, 2) # transpose to get rotation from geo 2 local

        r00 =  R_g2l[:, 0, 0].reshape((1, -1))
        r01 =  R_g2l[:, 0, 1].reshape((1, -1))
        r10 =  R_g2l[:, 1, 0].reshape((1, -1))
        r11 =  R_g2l[:, 1, 1].reshape((1, -1))

        RR = sparse.vstack((sparse.hstack((I.multiply(r00), I.multiply(r01))),
                            sparse.hstack((I.multiply(r10), I.multiply(r11)))))

        # combine the matrices so we get divergence of east/north
        D = L.dot(R.dot(RR) )
        return D if return_sparse else np.array(D.todense())
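Per the docstring, the returned operator is N x 2N and acts on a vector field stacked as [east; north]. A runnable sketch of that stacking and application convention, with toy difference matrices standing in for the real L_xi/L_eta blocks (only the shapes and the hstack pattern are the point):

import numpy as np
from scipy import sparse

N = 6
d_east  = sparse.diags([-1.0, 1.0], [0, 1], shape=(N, N))   # stand-in d/d(east)
d_north = sparse.diags([-1.0, 1.0], [0, 1], shape=(N, N))   # stand-in d/d(north)

# divergence operator acting on a stacked [east; north] field
D = sparse.hstack((d_east, d_north)).tocsr()                 # shape (N, 2N)

Ve, Vn = np.random.rand(N), np.random.rand(N)
div = D.dot(np.concatenate((Ve, Vn)))                         # length-N divergence estimate
assert div.shape == (N,)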
Example #41
0
xx.append(np.std(X.todense(), axis=1))
xx.append(np.std(X1.todense(), axis=1))
xx.append(np.std(X2.todense(), axis=1))
xx.append(np.std(X3.todense(), axis=1))
xx.append(np.std(X4.todense(), axis=1))
#xx.append(np.sum(sparse.hstack([X,X1,X2,X3,X4],format='csr').todense(),axis=1))
#xx.append(np.max(X.todense(),axis=1)-np.min(X.todense(),axis=1))
#xx.append(np.max(X1.todense(),axis=1)-np.min(X1.todense(),axis=1))
#xx.append(np.max(X2.todense(),axis=1)-np.min(X2.todense(),axis=1))
#xx.append(np.max(X3.todense(),axis=1)-np.min(X3.todense(),axis=1))
#xx.append(np.max(X4.todense(),axis=1)-np.min(X4.todense(),axis=1))

xx = np.hstack(xx)

X = sparse.hstack(
    [X, X1, X2, X3, X4, xx,
     pickle.load(open('../explore/X2.p'))],
    format='csr').todense()
train = pd.read_csv('../explore/train1.csv')
idname = 'id'
label = 'fault_severity'
idx = train[idname].as_matrix()
y = np.array(train[label])
import pickle
X = np.hstack([X, train.drop([label, idname], axis=1).as_matrix()])
#X=np.hstack([X,train[['location','volume']].as_matrix()])

print X.shape, y.shape
yp, score = kfold_cv(X, y, 4)
print X.shape, y.shape
print yp.shape
Example #42
0
model=load_model('best_model.hdf5')
#model=load_model('MyBidirLSTM2Layer100Dim.h5')
X=np.concatenate((leftX,rightX),axis=1)
o=model.evaluate(X,Y)[1]
print("Test Accuracy by my deep model")
print(o)

transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("feature.pkl", "rb")))
features = transformer.fit_transform(loaded_vec.fit_transform(np.array(doc)))
i=0
X_test=[]
while i<=(2*len(Y)-2): 
    arr=np.zeros((3))
    l=vstack([csr_matrix(features[i,:]),csr_matrix(features[i+1,:])])
    l1=hstack([csr_matrix(features[i,:]),csr_matrix(features[i+1,:])])
    #print(l.shape)
    l=l.todense()
    l1=l1.todense()
    
    arr[0]=np.dot(l[0,:],l[1,:].T)
    arr[1]=sklearn.metrics.pairwise.euclidean_distances(l[0,:],l[1,:])
    arr[2]=sklearn.metrics.pairwise.manhattan_distances(l[0,:],l[1,:])
    #X.append(arr)
    X_test.append(l1)
    i=i+2
joblib_model = joblib.load('My_TFIDF_Modelnew1.pkl')
X_test=np.array(X_test)
#print(X_test.shape)    # shape is (9824, 1, 2398) if l1 was added else (9824, 2398) if arr was added

Example #43
0
File: fd.m.py Project: cossatot/RBF
# add the constraint that displacement at the pinned node is zero.
out = elastic3d_displacement(
    nodes[groups['pinned']], 
    nodes, 
    lamb=lamb, 
    mu=mu, 
    n=1,
    basis=basis,
    order=0)
G_xx = add_rows(G_xx, out['xx'], groups['pinned'])
G_yy = add_rows(G_yy, out['yy'], groups['pinned'])
G_zz = add_rows(G_zz, out['zz'], groups['pinned'])

# stack it all together, removing unneeded matrices as soon as
# possible
G_x = sp.hstack((G_xx, G_xy, G_xz))
del G_xx, G_xy, G_xz

G_y = sp.hstack((G_yx, G_yy, G_yz))
del G_yx, G_yy, G_yz

G_z = sp.hstack((G_zx, G_zy, G_zz))
del G_zx, G_zy, G_zz

G = sp.vstack((G_x, G_y, G_z))
del G_x, G_y, G_z

G = G.tocsc()
G.eliminate_zeros()

# form the right-hand-side vector
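The "stack it all together" comments describe assembling the full 3N x 3N system from nine N x N blocks, freeing each block as soon as it has been consumed to cap peak memory. A stripped-down sketch of that assembly order, with random sparse blocks standing in for G_xx .. G_zz:

import scipy.sparse as sp

n = 100                                    # stand-in for the number of nodes
blocks = {name: sp.random(n, n, density=0.01, format='csr')
          for name in ('xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz')}

# build one block-row at a time and drop its inputs immediately
G_x = sp.hstack((blocks.pop('xx'), blocks.pop('xy'), blocks.pop('xz')))
G_y = sp.hstack((blocks.pop('yx'), blocks.pop('yy'), blocks.pop('yz')))
G_z = sp.hstack((blocks.pop('zx'), blocks.pop('zy'), blocks.pop('zz')))

G = sp.vstack((G_x, G_y, G_z)).tocsc()
del G_x, G_y, G_z
G.eliminate_zeros()
print(G.shape)                             # (3n, 3n)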
Example #44
0
    def __constrStateTransitionMatrix(self):
        k = self.__class__.k
        A = None    # state transition block matrix
        # loop over all rows (i.e. individual objs) and find all connections with other objs
        for row,i,obj in zip(self.massMatrix,xrange(len(self.objs)),self.objs):
            fac = 1./(1. + obj.b1*k)
            fac /= obj.h**2 if isinstance(obj,Resonator2D) else obj.h
            C1_total = csc_matrix((obj.Nm,obj.Nm)); C2_total = csc_matrix((obj.Nm,obj.Nm))
            C3_total = {}; C4_total = {}; A_row = None
            colInds = np.nonzero(row)[0]
            # for every connection between obj q and r other objects, construct inter-connection matrices
            for j in colInds:
                cpoint_q = self.connPointMatrix[i][j]
                e_q = spdistr2D(1.,cpoint_q[0],cpoint_q[1],obj.Nx - 1,obj.Ny - 1,flatten=True)\
                if isinstance(obj,Resonator2D) else spdistr1D(1.,cpoint_q,obj.Nm,'lin')
                # return the row indices of the nonzero entries in the current column we are looking in
                row_r = [ind for ind,item in enumerate([self.massMatrix[q][j] for q in xrange(len(self.massMatrix))]) if item > 0]
                # remove row index of current object and since list must now be of size 1, simply return row index
                row_r.remove(i); row_r = row_r[0]
                M = float(self.massMatrix[i][j])/self.massMatrix[row_r][j]    # mass ratio: Mq/Mr
                cpoint_r = self.connPointMatrix[row_r][j]
                e_r = spdistr2D(1.,cpoint_r[0],cpoint_r[1],self.objs[row_r].Nx - 1,self.objs[row_r].Ny - 1,flatten=True) \
                if isinstance(self.objs[row_r],Resonator2D) \
                else spdistr1D(1.,cpoint_r,self.objs[row_r].B.shape[0],'lin')
                c1 = fac/(e_q.T.dot(e_q)[0,0] + M*e_r.T.dot(e_r)[0,0])
                e_qCre_q = e_q*e_q.T; e_qCre_r = e_q*e_r.T
                C1_total = C1_total + c1*e_qCre_q*obj.C1
                C2_total = C2_total + c1*e_qCre_q*obj.C2
                if row_r in C3_total:   # save to assert that when C3[row_r] is empty, C4[row_r] is empty also
                    C3_total[row_r] = C3_total[row_r] - c1*e_qCre_r*self.objs[row_r].C1
                    C4_total[row_r] = C4_total[row_r] - c1*e_qCre_r*self.objs[row_r].C2
                else:
                    C3_total[row_r] = -c1*e_qCre_r*self.objs[row_r].C1
                    C4_total[row_r] = -c1*e_qCre_r*self.objs[row_r].C2

            # construct row of A for u[n]
            for j in xrange(0,len(self.objs)):
                if i == j:       # we're on the diagonal
                    A_row = hstack((obj.B + C1_total,obj.C + C2_total),format="lil") if A_row is None else \
                    hstack((A_row,obj.B + C1_total,obj.C + C2_total),format="lil")
                elif j in C3_total:
                    A_row = hstack((C3_total[j],C4_total[j]),format="lil") if A_row is None else \
                    hstack((A_row,C3_total[j],C4_total[j]),format="lil")
                else:
                    Nm2 = self.objs[j].Nm*2
                    A_row = lil_matrix((obj.Nm,Nm2)) if A_row is None else \
                    hstack((A_row,lil_matrix((obj.Nm,Nm2))))

            # construct row of A for u[n - 1]
            if i == 0:   # first object, so identity matrix is first in row
                I = hstack((identity(obj.Nm,format="lil"),lil_matrix((obj.Nm,A_row.shape[1] - obj.Nm))))
            elif i == len(self.objs) - 1:   # last object, so identity matrix is penultimate to last col
                I = hstack((lil_matrix((obj.Nm,A_row.shape[1] - 2*self.objs[-1].Nm)),\
                identity(obj.Nm,format="lil"),lil_matrix((obj.Nm,obj.Nm))))
            else:   # if any other object, calc pos of identity matrix based on grid size N of each obj
                I = hstack((lil_matrix((obj.Nm,2*np.sum(self.Nt[:i]))),identity(obj.Nm),\
                lil_matrix((obj.Nm,obj.Nm + 2*np.sum(self.Nt[-(len(self.Nt) - 1 - i):])))))
            # append row to block state transition matrix A
            A = vstack((A_row,I)) if A is None else vstack((A,A_row,I))

        return A.tocsc()
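The comments above say each loop iteration appends one block row for u[n] (the B and C blocks plus coupling terms) and one row that only places an identity to shift u[n] into the u[n-1] slot. For a single uncoupled object that is the usual block-companion form of a two-step recursion, sketched here with toy matrices standing in for obj.B and obj.C (coupling blocks omitted):

import numpy as np
from scipy.sparse import identity, lil_matrix, hstack, vstack, random as sprandom

Nm = 50                                    # modes / grid points of one toy object
B = sprandom(Nm, Nm, density=0.05, format='csc')   # stands in for obj.B
C = sprandom(Nm, Nm, density=0.05, format='csc')   # stands in for obj.C

# state is w[n] = [u[n], u[n-1]]; the update u[n+1] = B u[n] + C u[n-1]
# becomes w[n+1] = A w[n] with the block-companion matrix A
A_row = hstack((B, C), format='lil')                       # produces u[n+1]
I_row = hstack((identity(Nm, format='lil'),
                lil_matrix((Nm, Nm))))                     # copies u[n] down
A = vstack((A_row, I_row)).tocsc()

w = np.random.rand(2 * Nm)
w_next = A.dot(w)
assert np.allclose(w_next[Nm:], w[:Nm])    # the shift row just forwards u[n]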
Example #45
0
#test_word_features = word_vectorizer.transform(test_text)

#char_vectorizer = CountVectorizer(
#   sublinear_tf=True,
#    strip_accents='unicode',
#    analyzer='char',
#    stop_words='english',
#    ngram_range=(2, 6),
#    max_features=50000)
#char_vectorizer.fit(all_text)
#train_char_features = char_vectorizer.transform(train_text)
#test_char_features = char_vectorizer.transform(test_text)

#train_features = hstack([train_char_features, train_word_features])
#train_features=hstack([train_char_features])
train_features = hstack([train_word_features])
#test_features = hstack([test_char_features, test_word_features])

scores = []
#submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='liblinear')

    cv_score = np.mean(
        cross_val_score(classifier,
                        train_features,
                        train_target,
                        cv=5,
                        scoring='roc_auc'))
    scores.append(cv_score)
Example #46
0
def model(munged_train_filepath, munged_test_filepath, save_predictions_path):
    train_df = pd.read_json(munged_train_filepath)
    test_df = pd.read_json(munged_test_filepath)

    # Join multi-word feature names with underscores so each feature becomes a single token for the vectorizer
    train_df['features'] = train_df["features"].apply(
        lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
    test_df['features'] = test_df["features"].apply(
        lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
    tfidf = CountVectorizer(stop_words='english', max_features=200)
    tr_sparse = tfidf.fit_transform(train_df["features"])
    te_sparse = tfidf.transform(test_df["features"])

    # which columns are currently numeric?
    numeric = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_idx = [
        column for column in train_df.columns
        if train_df[column].dtype in numeric
    ]
    non_numeric_idx = [
        column for column in train_df.columns if column not in numeric_idx
    ]
    train_df[numeric_idx].head()

    # separate train and test into X and y
    train_X = sparse.hstack([train_df[numeric_idx], tr_sparse]).tocsr()
    test_X = sparse.hstack([test_df[numeric_idx], te_sparse]).tocsr()

    target_num_map = {'high': 0, 'medium': 1, 'low': 2}
    train_y = np.array(
        train_df['interest_level'].apply(lambda x: target_num_map[x]))

    # function to create and run model
    def runXGB(train_X,
               train_y,
               test_X,
               test_y=None,
               feature_names=None,
               seed_val=0,
               num_rounds=1000):
        param = {}
        param['objective'] = 'multi:softprob'
        param['eta'] = 0.1
        param['max_depth'] = 6
        param['silent'] = 1
        param['num_class'] = 3
        param['eval_metric'] = "mlogloss"
        param['min_child_weight'] = 1
        param['subsample'] = 0.7
        param['colsample_bytree'] = 0.7
        param['nthread'] = 4
        param['seed'] = seed_val
        num_rounds = num_rounds

        plst = list(param.items())
        xgtrain = xgb.DMatrix(train_X, label=train_y)

        if test_y is not None:
            xgtest = xgb.DMatrix(test_X, label=test_y)
            watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
            model = xgb.train(plst,
                              xgtrain,
                              num_rounds,
                              watchlist,
                              early_stopping_rounds=50)
        else:
            xgtest = xgb.DMatrix(test_X)
            model = xgb.train(plst, xgtrain, num_rounds)

        pred_test_y = model.predict(xgtest)
        return pred_test_y, model

    # Run model and export to specified filepath
    preds, model = runXGB(train_X, train_y, test_X, num_rounds=300)
    out_df = pd.DataFrame(preds)
    out_df.columns = ["high", "medium", "low"]
    out_df["listing_id"] = test_df.listing_id.values
    out_df.to_csv(save_predictions_path, index=False)
Example #47
0
    def fit_transform(self, raw_documents, y=None):
        x = [vect.fit_transform(raw_documents, y) for vect in self.vectorizers]

        x = sparse.hstack(x)

        return x
Example #48
0
def append_ones(X):
    if sp.issparse(X):
        return sp.hstack((np.ones((X.shape[0], 1)), X)).tocsr()
    else:
        return np.hstack((np.ones((X.shape[0], 1)), X))
Example #49
0
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    #EXTRACT DEVELOPTMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    '''
    Crossed columns
    '''

    # my understanding of how to replicate what layers.crossed_column does. One
    # can read more here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)

    gc.collect()
    cpuStats()
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:,
                    np.
                    array(np.clip(X_name.getnnz(axis=0) -
                                  1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:,
                  np.array(np.clip(X_cat.getnnz(axis=0) -
                                   1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_orig)).tocsr()
    '''

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0002:
            baseline = score_
        else:
            break

    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    from wordbatch.models import nn_relu_h1, nn_relu_h2

    modelnn = nn_relu_h1.NN_ReLU_H1(alpha=0.05, L2=0.00001, D_nn=60, D=sparse_merge.shape[1], \
                                  iters=1, inv_link="identity", threads=threads)

    baseline = 1.
    print('[{}] Epoch time '.format(time.time() - start_time))
    for i in range(3):
        modelnn.fit(train_X, train_y, verbose=1)
        predsnn = modelnn.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsnn))
        print("FM_FTRL dev RMSLE:", score_)
        print('[{}] Epoch time '.format(time.time() - start_time))
        if score_ < baseline - 0.0002:
            baseline = score_
        else:
            break

    pd.Series((np.expm1(predsnn) - np.expm1(predsfm))).hist()

    print(
        "FM_FTRL dev RMSLE:",
        rmsle(np.expm1(valid_y),
              0.1 * (np.expm1(predsnn)) + 0.9 * (np.expm1(predsfm))))

    tpoint2 = time.time()
    print("Time for Training: {}".format(hms_string(tpoint2 - tpoint1)))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
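One small trick in the kernel above is explained by the comment "Remove features with document frequency <=1": getnnz(axis=0) counts how many rows touch each column, and a boolean mask drops columns that appear in at most one document. The masking step in isolation, on a toy matrix rather than the Mercari features:

import numpy as np
from scipy import sparse

X = sparse.random(8, 10, density=0.15, format='csr', random_state=0)

# document frequency of each column = number of rows with a non-zero entry
df = X.getnnz(axis=0)

# keep only columns that appear in at least two rows
mask = np.array(np.clip(df - 1, 0, 1), dtype=bool)
X_reduced = X[:, mask]
print(X.shape, '->', X_reduced.shape)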
Example #50
0
    def transform(self, raw_documents):
        x = [vect.transform(raw_documents) for vect in self.vectorizers]

        x = sparse.hstack(x)

        return x
Example #51
0
    def _fit_resample(self, X, y):
        self.n_features_ = X.shape[1]
        self._validate_estimator()

        # compute the median of the standard deviation of the minority class
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        X_continuous = X[:, self.continuous_features_]
        X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
        X_minority = _safe_indexing(X_continuous,
                                    np.flatnonzero(y == class_minority))

        if sparse.issparse(X):
            if X.format == "csr":
                _, var = csr_mean_variance_axis0(X_minority)
            else:
                _, var = csc_mean_variance_axis0(X_minority)
        else:
            var = X_minority.var(axis=0)
        self.median_std_ = np.median(np.sqrt(var))

        X_categorical = X[:, self.categorical_features_]
        if X_continuous.dtype.name != "object":
            dtype_ohe = X_continuous.dtype
        else:
            dtype_ohe = np.float64
        self.ohe_ = OneHotEncoder(sparse=True,
                                  handle_unknown="ignore",
                                  dtype=dtype_ohe)

        # the input of the OneHotEncoder needs to be dense
        X_ohe = self.ohe_.fit_transform(X_categorical.toarray(
        ) if sparse.issparse(X_categorical) else X_categorical)

        # we can replace the 1 entries of the categorical features with the
        # median of the standard deviation. It will ensure that whenever
        # distance is computed between 2 samples, the difference will be equal
        # to the median of the standard deviation as in the original paper.

        # In the edge case where the median of the std is equal to 0, the 1s
        # entries will be also nullified. In this case, we store the original
        # categorical encoding which will be later used for inversing the OHE
        if math.isclose(self.median_std_, 0):
            self._X_categorical_minority_encoded = _safe_indexing(
                X_ohe.toarray(), np.flatnonzero(y == class_minority))

        X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) *
                      self.median_std_ / 2)
        X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")

        X_resampled, y_resampled = super()._fit_resample(X_encoded, y)

        # reverse the encoding of the categorical features
        X_res_cat = X_resampled[:, self.continuous_features_.size:]
        X_res_cat.data = np.ones_like(X_res_cat.data)
        X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

        if sparse.issparse(X):
            X_resampled = sparse.hstack(
                (
                    X_resampled[:, :self.continuous_features_.size],
                    X_res_cat_dec,
                ),
                format="csr",
            )
        else:
            X_resampled = np.hstack((
                X_resampled[:, :self.continuous_features_.size].toarray(),
                X_res_cat_dec,
            ))

        indices_reordered = np.argsort(
            np.hstack((self.continuous_features_, self.categorical_features_)))
        if sparse.issparse(X_resampled):
            # the matrix is supposed to be in the CSR format after the stacking
            col_indices = X_resampled.indices.copy()
            for idx, col_idx in enumerate(indices_reordered):
                mask = X_resampled.indices == col_idx
                col_indices[mask] = idx
            X_resampled.indices = col_indices
        else:
            X_resampled = X_resampled[:, indices_reordered]

        return X_resampled, y_resampled
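The comments above explain the key encoding step: the one-hot 1s are rescaled to median_std / 2 so that a category mismatch between two samples adds a fixed, comparable amount (two differing one-hot entries) to the Euclidean distance used by the resampler. A hedged sketch of just that encoding, outside the resampler, with toy arrays and hypothetical names X_cont / X_cat:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder

rng = np.random.RandomState(0)
X_cont = rng.normal(size=(6, 3))                      # continuous part (toy)
X_cat = rng.choice(['a', 'b', 'c'], size=(6, 2))      # categorical part (toy)

median_std = np.median(X_cont.std(axis=0))

ohe = OneHotEncoder(handle_unknown='ignore')
X_ohe = ohe.fit_transform(X_cat)                      # sparse one-hot matrix

# rescale the one-hot 1s so a category mismatch contributes a fixed amount
# (median_std / 2 per differing one-hot column) to the distance computation
X_ohe.data = np.ones_like(X_ohe.data) * median_std / 2

X_encoded = sparse.hstack((X_cont, X_ohe), format='csr')
print(X_encoded.shape)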
Example #52
0
ridge = SklearnWrapper(clf=Ridge, seed=SEED, params=ridge_params)
ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:ntrain], y,
                                          ready_df[ntrain:])

rms = sqrt(mean_squared_error(y, ridge_oof_train))
print('Ridge OOF RMSE: {}'.format(rms))

print("Modeling Stage")

ridge_preds = np.concatenate([ridge_oof_train, ridge_oof_test])

df['ridge_preds'] = ridge_preds

# Combine Dense Features with Sparse Text Bag of Words Features
X = hstack(
    [csr_matrix(df.loc[traindex, :].values),
     ready_df[0:traindex.shape[0]]])  # Sparse Matrix
testing = hstack(
    [csr_matrix(df.loc[testdex, :].values), ready_df[traindex.shape[0]:]])
tfvocab = df.columns.tolist() + tfvocab
for shape in [X, testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ", len(tfvocab))
del df
gc.collect()

print("\nModeling Stage")
X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                      y,
                                                      test_size=0.10,
                                                      random_state=23)
Example #53
0
def append_biases(X):
    return sparse.hstack((X, np.ones(X.shape[0])[:, np.newaxis])).tocsr()
Example #54
0
])

start_vect = time.time()
vectorizer.fit(df.loc[traindex, :].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
tfvocab[:50]
print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats + ['text', 'all_titles'], axis=1, inplace=True)
gc.collect()

print('[{}] Modeling Stage'.format(time.time() - start_time))
# Combine Dense Features with Sparse Text Bag of Words Features
X_train = hstack([csr_matrix(df.loc[traindex,:][trnidx].values),\
                        ready_df[0:traindex.shape[0]][trnidx],\
                        dnimgsvd[0:traindex.shape[0]][trnidx]])
X_valid = hstack([csr_matrix(df.loc[traindex,:][validx].values),\
                        ready_df[0:traindex.shape[0]][validx],\
                        dnimgsvd[0:traindex.shape[0]][validx]])
y_train = y[trnidx]
y_valid = y[validx]
testing = hstack([csr_matrix(df.loc[testdex,:].values),\
                  ready_df[traindex.shape[0]:],\
                  dnimgsvd[traindex.shape[0]:]])
tfvocab = df.columns.tolist() + tfvocab + [
    'imgsvdcomp%s' % (i) for i in range(dnimgsvd.shape[1])
]
for shape in [X_train, X_valid, testing]:
    print("{} Rows and {} Cols".format(*shape.shape))
print("Feature Names Length: ", len(tfvocab))
Example #55
0
    def map(
        self,
        lens,
        X=None,
        clusterer=None,
        cover=None,
        nerve=None,
        precomputed=False,
        remove_duplicate_nodes=False,
    ):
        """Apply Mapper algorithm on this projection and build a simplicial complex. Returns a dictionary with nodes and links.

        Parameters
        ----------
        lens: Numpy Array
            Lower dimensional representation of data. In general will be output of `fit_transform`.

        X: Numpy Array
            Original data or data to run clustering on. If `None`, then use `lens` as default. X can be a SciPy sparse matrix.

        clusterer: Default: DBSCAN
            Scikit-learn API compatible clustering algorithm. Must provide `fit` and `predict`.

        cover: kmapper.Cover
            Cover scheme for lens. Instance of kmapper.cover providing methods `fit` and `transform`.

        nerve: kmapper.Nerve
            Nerve builder implementing `__call__(nodes)` API

        precomputed : Boolean
            Tell Mapper whether the data that you are clustering on is a precomputed distance matrix. If set to
            `True`, the assumption is that you are also telling your `clusterer` that `metric='precomputed'` (which
            is an argument for DBSCAN among others), which 
            will then cause the clusterer to expect a square distance matrix for each hypercube. `precomputed=True` will give a square matrix
            to the clusterer to fit on for each hypercube.
            
        remove_duplicate_nodes: Boolean
            Removes duplicate nodes before edges are determined. A node is considered to be duplicate
            if it has exactly the same set of points as another node.

        nr_cubes: Int
            
            .. deprecated:: 1.1.6

                define Cover explicitly in future versions

            The number of intervals/hypercubes to create. Default = 10.
            
        overlap_perc: Float
            .. deprecated:: 1.1.6

                define Cover explicitly in future versions    

            The percentage of overlap "between" the intervals/hypercubes. Default = 0.1. 
            


        Returns
        =======
        simplicial_complex : dict
            A dictionary with "nodes", "links" and "meta" information.

        Examples
        ========

        >>> # Default mapping.
        >>> graph = mapper.map(X_projected, X_inverse)

        >>> # Apply clustering on the projection instead of on inverse X
        >>> graph = mapper.map(X_projected)

        >>> # Use 20 cubes/intervals per projection dimension, with a 50% overlap
        >>> graph = mapper.map(X_projected, X_inverse, 
        >>>                    cover=kmapper.Cover(n_cubes=20, perc_overlap=0.5))

        >>> # Use multiple different cubes/intervals per projection dimension, 
        >>> # And vary the overlap
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>                    cover=km.Cover(n_cubes=[10,20,5],
        >>>                                         perc_overlap=[0.1,0.2,0.5]))

        >>> # Use KMeans with 2 clusters
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>     clusterer=sklearn.cluster.KMeans(2))

        >>> # Use DBSCAN with "cosine"-distance
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>     clusterer=sklearn.cluster.DBSCAN(metric="cosine"))

        >>> # Use HDBSCAN as the clusterer
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>     clusterer=hdbscan.HDBSCAN())

        >>> # Parametrize the nerve of the covering
        >>> graph = mapper.map(X_projected, X_inverse,
        >>>     nerve=km.GraphNerve(min_intersection=3))


        """

        start = datetime.now()

        clusterer = clusterer or cluster.DBSCAN(eps=0.5, min_samples=3)
        self.cover = cover or Cover(n_cubes=10, perc_overlap=0.1)
        nerve = nerve or GraphNerve()

        nodes = defaultdict(list)
        meta = defaultdict(list)
        graph = {}

        # If inverse image is not provided, we use the projection as the inverse image (suffer projection loss)
        if X is None:
            X = lens

        if self.verbose > 0:
            print("Mapping on data shaped %s using lens shaped %s\n" %
                  (str(X.shape), str(lens.shape)))

        # Prefixing the data with an ID column
        ids = np.array([x for x in range(lens.shape[0])])
        lens = np.c_[ids, lens]
        if issparse(X):
            X = hstack([ids[np.newaxis].T, X], format='csr')
        else:
            X = np.c_[ids, X]

        # Cover scheme defines a list of elements
        bins = self.cover.fit(lens)

        # Algorithms like K-Means have a set number of clusters. We need this
        # number to adjust for the minimal number of samples inside an interval
        # before we consider clustering or skipping it.
        cluster_params = clusterer.get_params()

        min_cluster_samples = cluster_params.get(
            "n_clusters",
            cluster_params.get("min_cluster_size",
                               cluster_params.get("min_samples", 1)),
        )

        if self.verbose > 1:
            print("Minimal points in hypercube before clustering: %d" %
                  (min_cluster_samples))

        # Subdivide the projected data X in intervals/hypercubes with overlap
        if self.verbose > 0:
            bins = list(bins)  # extract list from generator
            total_bins = len(bins)
            print("Creating %s hypercubes." % total_bins)

        for i, hypercube in enumerate(self.cover.transform(lens)):

            # If at least min_cluster_samples samples inside the hypercube
            if hypercube.shape[0] >= min_cluster_samples:
                # Cluster the data point(s) in the cube, skipping the id-column
                # Note that we apply clustering on the inverse image (original data samples) that fall inside the cube.
                ids = [int(nn) for nn in hypercube[:, 0]]
                X_cube = X[ids]

                fit_data = X_cube[:, 1:]
                if precomputed:
                    fit_data = fit_data[:, ids]

                cluster_predictions = clusterer.fit_predict(fit_data)

                if self.verbose > 1:
                    print("   > Found %s clusters in hypercube %s." %
                          (np.unique(cluster_predictions[
                              cluster_predictions > -1]).shape[0], i))

                for pred in np.unique(cluster_predictions):
                    # if not predicted as noise
                    if pred != -1 and not np.isnan(pred):
                        cluster_id = "cube{}_cluster{}".format(i, int(pred))

                        nodes[cluster_id] = hypercube[:, 0][
                            cluster_predictions == pred].astype(int).tolist()
            elif self.verbose > 1:
                print("Cube_%s is empty.\n" % (i))

        if remove_duplicate_nodes:
            nodes = self._remove_duplicate_nodes(nodes)

        links, simplices = nerve.compute(nodes)

        graph["nodes"] = nodes
        graph["links"] = links
        graph["simplices"] = simplices
        graph["meta_data"] = {
            "projection": self.projection if self.projection else "custom",
            "n_cubes": self.cover.n_cubes,
            "perc_overlap": self.cover.perc_overlap,
            "clusterer": str(clusterer),
            "scaler": str(self.scaler),
        }
        graph["meta_nodes"] = meta

        if self.verbose > 0:
            self._summary(graph, str(datetime.now() - start))

        return graph
Example #56
0
    def join(self, other, axis=1, how='outer', level=None):
        """
        Join two tables along their indices

        Parameters
        ----------
        other: sparsity.SparseTable
            another SparseFrame
        axis: int
            along which axis to join
        how: str
            one of 'inner', 'outer', 'left', 'right'
        level: int
            if Multiindex join using this level

        Returns
        -------
            joined: sparsity.SparseFrame
        """
        if isinstance(self._index, pd.MultiIndex)\
            or isinstance(other._index, pd.MultiIndex):
            raise NotImplementedError()
        if not isinstance(other, SparseFrame):
            other = SparseFrame(other)
        if axis not in set([0, 1]):
            raise ValueError("axis mut be either 0 or 1")
        if axis == 0:
            if np.all(other._columns.values == self._columns.values):
                # take short path if join axes are identical
                data = sparse.vstack([self.data, other.data])
                index = np.hstack([self.index, other.index])
                res = SparseFrame(data, index=index, columns=self._columns)
            else:
                raise NotImplementedError(
                    "Joining along axis 0 fails when column names differ."
                    "This is probably caused by adding all-zeros row.")
                data, new_index = _matrix_join(self._data.T.tocsr(),
                                               other._data.T.tocsr(),
                                               self._columns,
                                               other._columns,
                                               how=how)
                res = SparseFrame(data.T.tocsr(),
                                  index=np.concatenate(
                                      [self.index, other.index]),
                                  columns=new_index)
        elif axis == 1:
            if np.all(self.index.values == other.index.values):
                # take short path if join axes are identical
                data = sparse.hstack([self.data, other.data])
                columns = np.hstack([self._columns, other._columns])
                res = SparseFrame(data, index=self.index, columns=columns)
            else:
                data, new_index = _matrix_join(self._data,
                                               other._data,
                                               self.index,
                                               other.index,
                                               how=how)
                res = SparseFrame(data,
                                  index=new_index,
                                  columns=np.concatenate(
                                      [self._columns, other._columns]))
        return res
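The docstring's "short path" for axis=1 amounts to: when both frames share an identical index, the underlying sparse data can simply be hstacked and the column labels concatenated, with no alignment step. Roughly, in plain scipy/pandas terms (hypothetical data, not the sparsity API itself):

import numpy as np
import pandas as pd
from scipy import sparse

index = pd.Index([10, 11, 12])
left  = sparse.random(3, 2, density=0.5, format='csr', random_state=1)
right = sparse.random(3, 4, density=0.5, format='csr', random_state=2)
left_cols, right_cols = np.array(['a', 'b']), np.array(['c', 'd', 'e', 'f'])

# identical indices -> no alignment needed, just stack column-wise
data = sparse.hstack([left, right]).tocsr()
columns = np.hstack([left_cols, right_cols])
# a SparseFrame-like result would be built from (data, index, columns)
print(data.shape, list(columns))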
Example #57
0
def calibration_single_ended_solver(ds,
                                    st_label,
                                    ast_label,
                                    st_var=None,
                                    ast_var=None,
                                    calc_cov=True,
                                    solver='sparse',
                                    verbose=False):
    """
    Parameters
    ----------
    ds : DataStore
    st_label : str
    ast_label : str
    st_var : float, array-like, optional
        If `None` use ols calibration. If `float` the variance of the noise
        from the Stokes detector is described with a single value. Or when the
        variance is a function of the intensity (Poisson distributed) define an
        array with shape (nx, nt), where nx are the number of calibration
        locations.
    ast_var : float, array-like, optional
        If `None` use ols calibration. If `float` the variance of the noise
        from the Stokes detector is described with a single value. Or when the
        variance is a function of the intensity (Poisson distributed) define an
        array with shape (nx, nt), where nx are the number of calibration
        locations.
    calc_cov : bool
        whether to calculate the covariance matrix. Required for calculation
        of confidence boundaries. But uses a lot of memory.
    solver : {'sparse', 'stats', 'external', 'external_split'}
        Always use sparse to save memory. The statsmodel can be used to validate
        sparse solver. `external` returns the matrices that would enter the
        matrix solver (Eq.37). `external_split` returns a dictionary with
        matrix X split in the coefficients per parameter. The use case for
        the latter is when certain parameters are fixed/combined.

    verbose : bool

    Returns
    -------

    """
    ix_sec = ds.ufunc_per_section(x_indices=True, calc_per='all')
    ds_sec = ds.isel(x=ix_sec)

    x_sec = ds_sec['x'].values
    nx = x_sec.size

    nt = ds.time.size
    p0_est = np.asarray([485., 0.1] + nt * [1.4])

    # X \gamma  # Eq.34
    cal_ref = ds.ufunc_per_section(label=st_label,
                                   ref_temp_broadcasted=True,
                                   calc_per='all')

    data_gamma = 1 / (cal_ref.ravel() + 273.15)  # gamma
    coord_gamma_row = np.arange(nt * nx, dtype=int)
    coord_gamma_col = np.zeros(nt * nx, dtype=int)
    X_gamma = sp.coo_matrix((data_gamma, (coord_gamma_row, coord_gamma_col)),
                            shape=(nt * nx, 1),
                            copy=False)

    # X \Delta\alpha  # Eq.34
    data_dalpha = np.repeat(-x_sec, nt)  # dalpha
    coord_dalpha_row = np.arange(nt * nx, dtype=int)
    coord_dalpha_col = np.zeros(nt * nx, dtype=int)
    X_dalpha = sp.coo_matrix(
        (data_dalpha, (coord_dalpha_row, coord_dalpha_col)),
        shape=(nt * nx, 1),
        copy=False)

    # X C  # Eq.34
    data_c = -np.ones(nt * nx, dtype=int)
    coord_c_row = np.arange(nt * nx, dtype=int)
    coord_c_col = np.tile(np.arange(nt, dtype=int), nx)
    X_c = sp.coo_matrix((data_c, (coord_c_row, coord_c_col)),
                        shape=(nt * nx, nt),
                        copy=False)

    # Stack all X's
    X = sp.hstack((X_gamma, X_dalpha, X_c))

    # y
    y = np.log(ds_sec[st_label] / ds_sec[ast_label]).values.ravel()

    # w
    if st_var is not None:
        w = 1 / (ds_sec[st_label]**-2 * st_var +
                 ds_sec[ast_label]**-2 * ast_var).values.ravel()

    else:
        w = 1.  # unweighted

    if solver == 'sparse':
        if calc_cov:
            p_sol, p_var, p_cov = wls_sparse(X,
                                             y,
                                             w=w,
                                             x0=p0_est,
                                             calc_cov=calc_cov,
                                             verbose=verbose)
        else:
            p_sol, p_var = wls_sparse(X,
                                      y,
                                      w=w,
                                      x0=p0_est,
                                      calc_cov=calc_cov,
                                      verbose=verbose)

    elif solver == 'stats':
        if calc_cov:
            p_sol, p_var, p_cov = wls_stats(X,
                                            y,
                                            w=w,
                                            calc_cov=calc_cov,
                                            verbose=verbose)
        else:
            p_sol, p_var = wls_stats(X,
                                     y,
                                     w=w,
                                     calc_cov=calc_cov,
                                     verbose=verbose)

    elif solver == 'external':
        return X, y, w, p0_est

    elif solver == 'external_split':
        return dict(y=y,
                    w=w,
                    X_gamma=X_gamma,
                    X_dalpha=X_dalpha,
                    X_c=X_c,
                    p0_est=p0_est)

    else:
        raise ValueError("Choose a valid solver")

    if calc_cov:
        return p_sol, p_var, p_cov
    else:
        return p_sol, p_var
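The solver above builds the Eq.34 design matrix by hstacking three COO blocks (one column for gamma, one for dalpha, nt columns for C) and hands it to wls_sparse / wls_stats. A self-contained sketch of the same assembly with toy sizes and random data, using scipy.sparse.linalg.lsqr as an unweighted stand-in for the wls_* helpers:

import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import lsqr

nt, nx = 4, 6                                      # toy sizes
rows = np.arange(nt * nx)

# one block per parameter group, mirroring X_gamma / X_dalpha / X_c above
X_gamma  = sp.coo_matrix((np.random.rand(nt * nx),
                          (rows, np.zeros(nt * nx, dtype=int))), shape=(nt * nx, 1))
X_dalpha = sp.coo_matrix((np.repeat(-np.linspace(0, 1, nx), nt),
                          (rows, np.zeros(nt * nx, dtype=int))), shape=(nt * nx, 1))
X_c      = sp.coo_matrix((-np.ones(nt * nx),
                          (rows, np.tile(np.arange(nt), nx))), shape=(nt * nx, nt))

X = sp.hstack((X_gamma, X_dalpha, X_c)).tocsr()    # (nt*nx, 2 + nt)

y = np.random.rand(nt * nx)                        # stand-in for log(st/ast)
p_sol = lsqr(X, y)[0]                              # ordinary least squares
print(p_sol.shape)                                 # (2 + nt,)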
Example #58
0
    X_train_bow = vectorizer.fit_transform(clean_train_reviews)
    X_test_bow = vectorizer.transform(clean_test_reviews)

    model_final = createModel_word2Vec(clean_train_reviews)
    print('Loading word2vec model..\n')

    model = Word2Vec.load(model_final)

    print("Creating the w2v vectors...\n")

    X_train_w2v = scale(getAvgFeatureVecs(clean_train_reviews, model, 5000))
    X_test_w2v = scale(getAvgFeatureVecs(clean_test_reviews, model, 5000))

    print("Combing the bag of words and the w2v vectors...\n")

    X_train_bwv = hstack([X_train_bow, X_train_w2v])
    X_test_bwv = hstack([X_test_bow, X_test_w2v])

    print("Checking the dimension of training vectors")

    print('W2V', X_train_w2v.shape)

    print('BoW-W2V', X_train_bwv.shape)

    y_train = Review_train['Rating']

    clf = LogisticRegression(class_weight="auto")

    print("Predicting with Bag-of-words model and Word2Vec model...\n")

    clf.fit(X_train_bwv, y_train)
Example #59
0
    def construct_submatrices(nt, nx, st_label, ds, transient_asym_att_x,
                              x_sec):
        """Wrapped in a function to reduce memory usage.
        Constructing:
        Z_gamma (nt * nx, 1). Data: positive 1/temp
        Z_D (nt * nx, nt). Data: ones
        E (nt * nx, nx). Data: ones
        Zero_gamma (nt * nx, 1)
        zero_d (nt * nx, nt)
        Z_TA_fw (nt * nx, nta * 2 * nt) minus ones
        Z_TA_bw (nt * nx, nta * 2 * nt) minus ones
        Z_TA_E (nt * nx, nta * 2 * nt)

        I_fw = 1/Tref*gamma - D_fw - E - TA_fw
        I_bw = 1/Tref*gamma - D_bw + E - TA_bw
        (I_bw - I_fw) / 2 = D_fw/2 - D_bw/2 + E + TA_fw/2 - TA_bw/2 Eq42
        """

        # Z \gamma  # Eq.47
        cal_ref = np.array(
            ds.ufunc_per_section(label=st_label,
                                 ref_temp_broadcasted=True,
                                 calc_per='all'))
        data_gamma = 1 / (cal_ref.ravel() + 273.15)  # gamma
        coord_gamma_row = np.arange(nt * nx, dtype=int)
        coord_gamma_col = np.zeros(nt * nx, dtype=int)
        Z_gamma = sp.coo_matrix(
            (data_gamma, (coord_gamma_row, coord_gamma_col)),
            shape=(nt * nx, 1),
            copy=False)
        # Z D  # Eq.47
        data_c = np.ones(nt * nx, dtype=float)
        coord_c_row = np.arange(nt * nx, dtype=int)
        coord_c_col = np.tile(np.arange(nt, dtype=int), nx)
        Z_D = sp.coo_matrix((data_c, (coord_c_row, coord_c_col)),
                            shape=(nt * nx, nt),
                            copy=False)
        Z_D_att = sp.eye(nt, format='coo')
        # E  # Eq.47
        data_c = np.ones(nt * nx, dtype=float)
        coord_c_row = np.arange(nt * nx, dtype=int)
        coord_c_col = np.repeat(np.arange(nx, dtype=int), nt)
        E = sp.coo_matrix((data_c, (coord_c_row, coord_c_col)),
                          shape=(nt * nx, nx),
                          copy=False)
        # Zero  # Eq.45
        Zero_gamma = sp.coo_matrix(([], ([], [])), shape=(nt * nx, 1))
        Zero_d = sp.coo_matrix(([], ([], [])), shape=(nt * nx, nt))
        Zero_E = sp.coo_matrix(([], ([], [])), shape=(nt * nx, nx))
        Zero_gamma_att = sp.coo_matrix(([], ([], [])), shape=(nt, 1))
        Zero_E_att = sp.coo_matrix(([], ([], [])), shape=(nt, nx))
        if transient_asym_att_x:
            # unpublished BdT

            TA_fw_list = list()
            TA_bw_list = list()

            for transient_asym_att_xi in transient_asym_att_x:
                """For forward direction. """
                # first index on the right-hand side of the difficult splice
                # Deal with connector outside of fiber
                if transient_asym_att_xi >= x_sec[-1]:
                    ix_sec_ta_ix0 = nx
                elif transient_asym_att_xi <= x_sec[0]:
                    ix_sec_ta_ix0 = 0
                else:
                    ix_sec_ta_ix0 = np.flatnonzero(
                        x_sec >= transient_asym_att_xi)[0]

                # Data is -1 for both forward and backward
                # I_fw = 1/Tref*gamma - D_fw - E - TA_fw. Eq40
                data_ta_fw = -np.ones(nt * (nx - ix_sec_ta_ix0), dtype=float)
                # skip ix_sec_ta_ix0 locations, because they are upstream of
                # the connector.
                coord_ta_fw_row = np.arange(nt * ix_sec_ta_ix0,
                                            nt * nx,
                                            dtype=int)
                # nt parameters
                coord_ta_fw_col = np.tile(np.arange(nt, dtype=int),
                                          nx - ix_sec_ta_ix0)
                TA_fw_list.append(
                    sp.coo_matrix(  # TA_fw
                        (data_ta_fw, (coord_ta_fw_row, coord_ta_fw_col)),
                        shape=(nt * nx, 2 * nt),
                        copy=False))

                # I_bw = 1/Tref*gamma - D_bw + E - TA_bw. Eq41
                data_ta_bw = -np.ones(nt * ix_sec_ta_ix0, dtype=float)
                coord_ta_bw_row = np.arange(nt * ix_sec_ta_ix0, dtype=int)
                coord_ta_bw_col = np.tile(np.arange(nt, 2 * nt, dtype=int),
                                          ix_sec_ta_ix0)
                TA_bw_list.append(
                    sp.coo_matrix(  # TA_bw
                        (data_ta_bw, (coord_ta_bw_row, coord_ta_bw_col)),
                        shape=(nt * nx, 2 * nt),
                        copy=False))
            Z_TA_fw = sp.hstack(TA_fw_list)
            Z_TA_bw = sp.hstack(TA_bw_list)

        else:
            Z_TA_fw = sp.coo_matrix(([], ([], [])), shape=(nt * nx, 0))
            Z_TA_bw = sp.coo_matrix(([], ([], [])), shape=(nt * nx, 0))

            Z_TA_att = sp.coo_matrix(([], ([], [])), shape=(nt, 0))

        # (I_bw - I_fw) / 2 = D_fw/2 - D_bw/2 + E + TA_fw/2 - TA_bw/2 Eq42
        Z_TA_E = (Z_TA_bw - Z_TA_fw) / 2

        return E, Z_D, Z_gamma, Zero_d, Zero_gamma, Z_TA_fw, Z_TA_bw, Z_TA_E,\
            Zero_E, Z_TA_att, Z_D_att, Zero_gamma_att, Zero_E_att
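Every submatrix listed in the docstring is built the same way: a coo_matrix from explicit (data, (row, col)) triplets with the documented shape. For instance, a Z_D-like block of ones with one column per time step, repeated over the calibration locations (toy sizes, outside the DataStore context):

import numpy as np
import scipy.sparse as sp

nt, nx = 3, 4
data = np.ones(nt * nx)
row  = np.arange(nt * nx)
col  = np.tile(np.arange(nt), nx)       # time index cycles fastest

Z_D = sp.coo_matrix((data, (row, col)), shape=(nt * nx, nt))
print(Z_D.toarray())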
Example #60
0
#test
test_new = train_test.iloc[ntrain:, :]
test_new_cat = me.transform(test_new)
train_test = pd.concat((train_new_cat, test_new_cat),
                       axis=0).reset_index(drop=True)
train_test.drop(categoricals, axis=1, inplace=True)

train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x))
c_vect = CountVectorizer(stop_words='english',
                         max_features=200,
                         ngram_range=(1, 1))
c_vect_sparse = c_vect.fit_transform(train_test['features2'])
c_vect_sparse_cols = c_vect.get_feature_names()
train_test.drop(['features', 'features2'], axis=1, inplace=True)
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()

train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]
train_new = pd.concat((X_train, y_train), axis=1).reset_index(drop=True)
train_new.to_csv(out + 'RentListingInquries_FE_train.csv', index=False)
X_test.to_csv(out + 'RentListingInquries_FE_test.csv', index=False)

X_train_sparse = train_test_sparse[:ntrain, :]
X_test_sparse = train_test_sparse[ntrain:, :]
train_sparse = sparse.hstack([X_train_sparse,
                              sparse.csr_matrix(y_train).T]).tocsr()
mmwrite(out + 'RentListingInquries_FE_train.txt', train_sparse)
mmwrite(out + 'RentListingInquries_FE_test.txt', X_test_sparse)