def predict(self, X, Y, T, wv_X, wv_T, kind):
    """Vectorize the train/test text with TF-IDF and predict ``kind``.

    A single vectorizer is fitted on the training text and then used to
    transform both the training and the test text. The resulting matrices,
    the labels and the ``wv`` features are handed to ``self.stacking``,
    whose result is returned unchanged.

    :param X: train text
    :param Y: train label
    :param T: test text
    :param wv_X: train wv
    :param wv_T: test wv
    :param kind: age/gender/education
    :return: array like, predict of "kind"
    """
    print('predicting..向量化中...')

    tfidf = TfidfVectorizer(
        use_idf=True,
        sublinear_tf=False,
        max_features=60000,
        binary=True,
    )
    tfidf.fit(X, Y)

    # The test text is only transformed; it never takes part in fitting.
    train_matrix = tfidf.transform(X)
    test_matrix = tfidf.transform(T)
    print('train size', train_matrix.shape, test_matrix.shape)

    return self.stacking(train_matrix, Y, test_matrix, wv_X, wv_T, kind)
def predict(self, X, Y, T, wv_X, wv_T, kind):
    """Train on the labelled text and predict ``kind`` for the test text.

    The text is turned into TF-IDF features by a vectorizer fitted on the
    training text only; the test text is transformed with that same fitted
    vectorizer. Training and prediction are delegated to ``self.stacking``.

    :param X: train text
    :param Y: train label
    :param T: test text
    :param wv_X: train wv
    :param wv_T: test wv
    :param kind: age/gender/education
    :return: array like, predict of "kind" (whatever ``self.stacking`` returns)
    """
    # print() calls instead of print statements: the statement form is a
    # syntax error on Python 3.
    print('predicting..向量化中...')
    vec = TfidfVectorizer(use_idf=True, sublinear_tf=False,
                          max_features=60000, binary=True)
    vec.fit(X, Y)
    X = vec.transform(X)
    T = vec.transform(T)
    print('train size', X.shape, T.shape)
    return self.stacking(X, Y, T, wv_X, wv_T, kind)
def validation(self, X, Y, wv_X, kind):
    """Estimate accuracy with 2-fold stratified cross-validation.

    For each fold a fresh TF-IDF vectorizer is fitted on the fold's training
    text, ``self.stacking`` predicts the held-out part, and the fraction of
    correct predictions is recorded.

    ``Y`` and ``wv_X`` are indexed with the integer index arrays produced by
    the splitter, so they must support that kind of indexing (e.g. NumPy
    arrays).

    :param X: train text
    :param Y: train label
    :param wv_X: train wv_vec
    :param kind: age/gender/education
    :return: mean score of 2-fold validation
    """
    # print() calls instead of print statements so the method parses on
    # Python 3.
    print('向量化中...')
    X = np.array(X)
    fold_n = 2
    # Uses the n_splits / split(X, y) form of StratifiedKFold; the older
    # StratifiedKFold(y, n_folds=...) constructor is no longer available.
    # random_state is left out because it has no effect without shuffling.
    splitter = StratifiedKFold(n_splits=fold_n, shuffle=False)
    score = np.zeros(fold_n)
    for j, (train_idx, test_idx) in enumerate(splitter.split(X, Y)):
        print(j + 1, '-fold')

        X_train = X[train_idx]
        y_train = Y[train_idx]
        X_test = X[test_idx]
        y_test = Y[test_idx]

        wv_X_train = wv_X[train_idx]
        wv_X_test = wv_X[test_idx]

        # Fit the vectorizer inside the fold so the held-out text does not
        # take part in fitting.
        vec = TfidfVectorizer(use_idf=True, sublinear_tf=False,
                              max_features=50000, binary=True)
        vec.fit(X_train, y_train)
        X_train = vec.transform(X_train)
        X_test = vec.transform(X_test)

        print('shape', X_train.shape)

        ypre = self.stacking(X_train, y_train, X_test,
                             wv_X_train, wv_X_test, kind)
        # Accuracy of this fold: share of predictions equal to the labels.
        cur = sum(y_test == ypre) * 1.0 / len(ypre)
        score[j] = cur

    print(score)
    print(score.mean(), kind)
    return score.mean()
def validation(self, X, Y, wv_X, kind):
    """Run 2-fold stratified cross-validation and return the mean accuracy.

    Each fold fits its own TF-IDF vectorizer on the fold's training text,
    lets ``self.stacking`` predict the held-out samples and stores the
    fraction of predictions that match the held-out labels.

    ``Y`` and ``wv_X`` are indexed with the integer index arrays produced by
    the splitter, so they must support that kind of indexing (e.g. NumPy
    arrays).

    :param X: train text
    :param Y: train label
    :param wv_X: train wv_vec
    :param kind: age/gender/education
    :return: mean score of 2-fold validation
    """
    print('向量化中...')
    X = np.array(X)
    fold_n = 2
    # No random_state here: with shuffle=False it has no effect, and recent
    # scikit-learn releases raise ValueError when both are given.
    folder = StratifiedKFold(n_splits=fold_n, shuffle=False)
    score = np.zeros(fold_n)

    # split() is iterated directly; there is no need to materialize it.
    for j, (train_idx, test_idx) in enumerate(folder.split(X, Y)):
        print(j + 1, '-fold')

        X_train = X[train_idx]
        y_train = Y[train_idx]
        X_test = X[test_idx]
        y_test = Y[test_idx]

        wv_X_train = wv_X[train_idx]
        wv_X_test = wv_X[test_idx]

        # The vectorizer is fitted per fold, on that fold's training text
        # only; the held-out text is merely transformed.
        vec = TfidfVectorizer(use_idf=True, sublinear_tf=False,
                              max_features=50000, binary=True)
        vec.fit(X_train, y_train)
        X_train = vec.transform(X_train)
        X_test = vec.transform(X_test)

        print('shape', X_train.shape)

        ypre = self.stacking(X_train, y_train, X_test,
                             wv_X_train, wv_X_test, kind)
        # Accuracy of this fold: share of predictions equal to the labels.
        score[j] = sum(y_test == ypre) * 1.0 / len(ypre)

    print(score)
    mean_score = score.mean()
    print(mean_score, kind)
    return mean_score