Ejemplo n.º 1
0
    def validation(self, X, Y, wv_X, kind):
        """
        Score the stacked model with 2-fold stratified cross-validation.

        For every fold a fresh TF-IDF vectorizer is fitted on the training
        split only; the held-out split is transformed with that vocabulary
        and ``self.stacking`` predicts its labels.

        :param X: train text (sequence of documents)
        :param Y: train label, aligned with ``X``
        :param wv_X: train wv_vec, aligned with ``X``; must support indexing
            with an integer index array
        :param kind: age/gender/education, forwarded to ``self.stacking``
        :return: mean score of 2-fold validation (fraction of held-out
            labels predicted correctly, averaged over the folds)
        """
        print('向量化中...')
        X = np.array(X)
        # The fold indices are positional, so normalise list-like (or
        # label-indexed) targets to a plain array, same as X.
        Y = np.asarray(Y)
        fold_n = 2
        # No random_state here: it has no effect when shuffle=False, and
        # newer scikit-learn releases reject the combination with ValueError.
        folder = StratifiedKFold(n_splits=fold_n, shuffle=False)
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folder.split(X, Y)):
            print(j + 1, '-fold')
            y_train = Y[train_idx]
            y_test = Y[test_idx]

            wv_X_train = wv_X[train_idx]
            wv_X_test = wv_X[test_idx]

            vec = TfidfVectorizer(use_idf=True,
                                  sublinear_tf=False,
                                  max_features=50000,
                                  binary=True)
            # fit_transform is the single-pass equivalent of fit + transform.
            X_train = vec.fit_transform(X[train_idx])
            X_test = vec.transform(X[test_idx])

            print('shape', X_train.shape)

            ypre = self.stacking(X_train, y_train, X_test, wv_X_train,
                                 wv_X_test, kind)
            # Accuracy on the held-out split.
            score[j] = np.mean(y_test == ypre)

        print(score)
        print(score.mean(), kind)
        return score.mean()
Ejemplo n.º 2
0
    def predict(self,X,Y,T,wv_X,wv_T,kind):
        """
        Vectorize the text with TF-IDF, then train and predict via stacking.

        The vectorizer is fitted on the training text only and then applied
        to both the training and the test text before everything is handed
        to ``self.stacking``.

        :param X: train text
        :param Y: train label
        :param T: test text
        :param wv_X: train wv
        :param wv_T: test wv
        :param kind: age/gender/education
        :return: array like, the ``self.stacking`` prediction of "kind"
        """
        print('predicting..向量化中...')

        tfidf_options = dict(use_idf=True,
                             sublinear_tf=False,
                             max_features=60000,
                             binary=True)
        tfidf = TfidfVectorizer(**tfidf_options)
        tfidf.fit(X, Y)

        # Project train and test text onto the vocabulary learnt from X.
        train_matrix = tfidf.transform(X)
        test_matrix = tfidf.transform(T)

        print('train size', train_matrix.shape, test_matrix.shape)
        return self.stacking(train_matrix, Y, test_matrix, wv_X, wv_T, kind)
Ejemplo n.º 3
0
    def predict(self,X,Y,T,wv_X,wv_T,kind):
        """
        Vectorize the text with TF-IDF, then train and predict via stacking.

        The vectorizer is fitted on the training text only; the test text is
        transformed with the vocabulary learnt from it.

        :param X: train text
        :param Y: train label
        :param T: test text
        :param wv_X: train wv
        :param wv_T: test wv
        :param kind: age/gender/education
        :return: array like, the ``self.stacking`` prediction of "kind"
        """
        # Single-argument print(...) emits the same text on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print('predicting..向量化中...')
        vec = TfidfVectorizer(use_idf=True, sublinear_tf=False, max_features=60000, binary=True)

        # fit_transform is the single-pass equivalent of fit + transform.
        X = vec.fit_transform(X)
        T = vec.transform(T)

        print('train size %s %s' % (X.shape, T.shape))
        res = self.stacking(X, Y, T, wv_X, wv_T, kind)
        return res
Ejemplo n.º 4
0
    def validation(self, X, Y, wv_X, kind):
        """
        Score the stacked model with 2-fold stratified cross-validation.

        For every fold a fresh TF-IDF vectorizer is fitted on the training
        split only; the held-out split is transformed with that vocabulary
        and ``self.stacking`` predicts its labels.

        :param X: train text (sequence of documents)
        :param Y: train label, aligned with ``X``
        :param wv_X: train wv_vec, aligned with ``X``; must support indexing
            with an integer index array
        :param kind: age/gender/education, forwarded to ``self.stacking``
        :return: mean score of 2-fold validation (fraction of held-out
            labels predicted correctly, averaged over the folds)
        """
        # Single-argument print(...) emits the same text on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print('向量化中...')
        X = np.array(X)
        # The fold indices are positional, so normalise list-like (or
        # label-indexed) targets to a plain array, same as X.
        Y = np.asarray(Y)
        fold_n = 2
        # NOTE(review): this is the legacy StratifiedKFold(y, n_folds=...)
        # call style; it is kept as-is because the import is not visible here.
        folds = StratifiedKFold(Y, n_folds=fold_n, shuffle=False, random_state=0)
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(folds):
            print('%d -fold' % (j + 1))

            y_train = Y[train_idx]
            y_test = Y[test_idx]

            wv_X_train = wv_X[train_idx]
            wv_X_test = wv_X[test_idx]

            vec = TfidfVectorizer(use_idf=True, sublinear_tf=False, max_features=50000, binary=True)
            # fit_transform is the single-pass equivalent of fit + transform.
            X_train = vec.fit_transform(X[train_idx])
            X_test = vec.transform(X[test_idx])

            # The shape is a tuple, so wrap it to format it as one value.
            print('shape %s' % (X_train.shape,))

            ypre = self.stacking(X_train, y_train, X_test, wv_X_train, wv_X_test, kind)
            # Accuracy on the held-out split.
            score[j] = np.mean(y_test == ypre)

        print(score)
        print('%s %s' % (score.mean(), kind))
        return score.mean()