Python term_weightingの例、termweight.term_weighting Pythonの例

コード例 #1

0

ファイルを表示

ファイル: domain_adaptation_baseline.py プロジェクト: lelegan/xrce_msda_da_regularization

def sfa_transfer(Xs,
                 Ys,
                 Xt,
                 Yt,
                 clf_class=LogisticRegression,
                 feat_type=2,
                 nclusters=100,
                 num_di_words=1000):
    """Score a baseline against spectral-feature-alignment (SFA) features.

    Both domains are term-weighted and handed to
    ``spectral_feature_alignement``; the aligned features it returns are
    appended (``gamma=0.5``) to a freshly term-weighted copy of each domain.

    Args:
        Xs, Ys: source-domain features and labels.
        Xt, Yt: target-domain features and labels.
        clf_class: classifier class forwarded to ``no_transfer``.
        feat_type: weighting scheme forwarded to ``termweight.term_weighting``.
        nclusters: forwarded to the alignment as ``nk``.
        num_di_words: forwarded to the alignment as its third positional
            argument (presumably the number of domain-independent words --
            confirm against ``spectral_feature_alignement``).

    Returns:
        ``(acc, acc_sfa)``: the ``no_transfer`` result on the raw inputs and
        on the SFA-augmented inputs, in that order.
    """
    # Weighted inputs consumed by the alignment step.
    weighted_source = termweight.term_weighting(Xs, feat_type)
    weighted_target = termweight.term_weighting(Xt, feat_type)
    aligned_source, aligned_target = spectral_feature_alignement(
        weighted_source,
        weighted_target,
        num_di_words,
        nk=nclusters,
        normalize=True)

    # The weighting is recomputed (source first, then target) for the base
    # features rather than reusing the matrices passed to the alignment.
    base_source, base_target = (termweight.term_weighting(X, feat_type)
                                for X in (Xs, Xt))

    augmented_source = append_features(base_source, aligned_source, gamma=0.5)
    augmented_target = append_features(base_target, aligned_target, gamma=0.5)

    # Tuple elements evaluate left to right: baseline first, then SFA.
    return (no_transfer(Xs, Ys, Xt, Yt, clf_class=clf_class),
            no_transfer(augmented_source,
                        Ys,
                        augmented_target,
                        Yt,
                        clf_class=clf_class))

コード例 #2

0

ファイルを表示

def mda_exp(Xs,
            Ys,
            Xt,
            Yt,
            clf_class=LogisticRegression,
            noise=0.9,
            feat_type=2,
            layer_func=lambda x: layer_function(x, 3),
            filter_W_option=0,
            topk=50,
            cross_valid=True,
            use_Xr=True,
            use_bias=True):
    """Run an mDA experiment on the stacked source+target documents.

    The two domains are stacked, optionally term-weighted, scored with a
    bag-of-words baseline, then passed (transposed) through one of the
    ``denoising_autoencoders`` routines before the learned features are
    evaluated.

    Args:
        Xs, Ys: source-domain sparse features and labels.
        Xt, Yt: target-domain sparse features and labels.
        clf_class: classifier class given to ``evaluate_mda_features``.
        noise: second positional argument of the mDA call.
        feat_type: term weighting is applied only when this is > 0.
        layer_func: forwarded to the mDA call.
        filter_W_option, topk: forwarded to ``mDA`` (not to
            ``mDA_without_bias``).
        cross_valid: forwarded to ``evaluate_mda_features``.
        use_Xr: when true, the 5000 most frequent features are passed as
            ``Xr``; ``use_bias`` is then ignored.
        use_bias: when ``use_Xr`` is false, selects ``mDA`` (true) or
            ``mDA_without_bias`` (false).

    Returns:
        ``(acc_bow, accuracy)``: baseline result and mDA-feature result.
    """
    n_source = Xs.shape[0]
    n_target = Xt.shape[0]  # evaluated as in the original; not used below

    # Stack both domains; frequent words are picked before any weighting.
    stacked = sp.vstack([Xs, Xt])
    frequent_idx = get_most_frequent_features(stacked, 5000)

    if feat_type > 0:
        stacked = term_weighting(stacked, feat_type=feat_type)

    frequent_columns = stacked[:, frequent_idx]

    # Bag-of-words baseline on the (possibly weighted) stacked matrix.
    acc_bow = domain_adaptation_baseline.no_transfer(stacked[:n_source, :],
                                                     Ys,
                                                     stacked[n_source:, :],
                                                     Yt)

    # 1e-2 is the third positional argument of the mDA routines
    # (presumably the regularisation weight -- TODO confirm).
    if use_Xr:
        hidden, _ = denoising_autoencoders.mDA(
            stacked.T,
            noise,
            1e-2,
            layer_func=layer_func,
            Xr=frequent_columns.T,
            filter_W_option=filter_W_option,
            topk=topk)
    elif use_bias:
        hidden, _ = denoising_autoencoders.mDA(
            stacked.T,
            noise,
            1e-2,
            layer_func=layer_func,
            filter_W_option=filter_W_option,
            topk=topk)
    else:
        print("Without Bias ....")
        hidden, _ = denoising_autoencoders.mDA_without_bias(
            stacked.T, noise, 1e-2, layer_func=layer_func)

    mda_accuracy = evaluate_mda_features(hidden,
                                         Ys,
                                         Yt,
                                         n_source,
                                         clf_class,
                                         cross_valid=cross_valid)
    return acc_bow, mda_accuracy

コード例 #3

0

ファイルを表示

def make_matlab_dataset(type='small',
                        outname="amazon_small_10p4_features.mat",
                        feat_type=0,
                        max_words=10000):
    """Export the four Amazon review domains to a single MATLAB file.

    Args:
        type: dataset variant forwarded to ``get_dataset_path``.
        outname: path of the ``.mat`` file written with ``scipy.io.savemat``.
        feat_type: when > 0, each matrix is passed through
            ``termweight.term_weighting`` with this value before saving.
        max_words: forwarded to
            ``dataset_utils.parse_processed_amazon_dataset``.

    The saved dictionary holds ``X_<dom>`` and ``Y_<dom>`` entries for
    ``dvd``, ``boo``, ``ele`` and ``kit``.
    """
    # (key suffix used in the .mat file, domain name used for path lookup)
    domains = (('dvd', 'dvd'),
               ('boo', 'books'),
               ('ele', 'electronics'),
               ('kit', 'kitchen'))

    paths = [get_dataset_path(domain, type) for _, domain in domains]
    datasets, dico = dataset_utils.parse_processed_amazon_dataset(
        paths, max_words=max_words)

    # Split every parsed entry into its count list and its labels.
    count_lists = []
    labels = []
    for path in paths:
        counts, y = datasets[path]
        count_lists.append(counts)
        labels.append(y)

    matrices = [count_list_to_sparse_matrix(counts, dico)
                for counts in count_lists]

    if feat_type > 0:
        matrices = [termweight.term_weighting(matrix, feat_type)
                    for matrix in matrices]

    # All feature matrices are inserted first, then all label vectors.
    payload = {}
    for (suffix, _), matrix in zip(domains, matrices):
        payload["X_" + suffix] = matrix
    for (suffix, _), y in zip(domains, labels):
        payload["Y_" + suffix] = y

    scipy.io.savemat(outname, payload)
コード例 #4

0

ファイルを表示

ファイル: domain_adaptation_baseline.py プロジェクト: lelegan/xrce_msda_da_regularization

def lsi_transfer(Xs,
                 Ys,
                 Xt,
                 Yt,
                 clf_class=LogisticRegression,
                 feat_type=2,
                 lsi_rank=100,
                 use_original_features=False,
                 use_singular_values=True,
                 l2normalization=False):
    """Evaluate ``no_transfer`` on a low-rank projection of both domains.

    Source and target rows are stacked, term-weighted and reduced to
    ``lsi_rank`` dimensions: via ``LSI`` when the weighted matrix is sparse,
    via ``TruncatedSVD`` otherwise.

    Args:
        Xs, Ys: source-domain features and labels.
        Xt, Yt: target-domain features and labels.
        clf_class: classifier class forwarded to ``no_transfer``.
        feat_type: weighting scheme for ``termweight.term_weighting``.
        lsi_rank: number of components kept.
        use_original_features: only the literal ``True`` enables appending
            the stacked (unweighted) input rows with ``gamma=0.5``.
        use_singular_values: sparse path only -- scale ``U`` by
            ``sqrt(S)``; has no effect on the dense path.
        l2normalization: normalise the projected rows with
            ``sklearn.preprocessing.normalize``.

    Returns:
        Whatever ``no_transfer`` returns for the projected features.
    """
    n_source = Xs.shape[0]
    n_target = Xt.shape[0]  # evaluated as in the original; not used below

    # Pick the stacking routine that matches the input representation.
    stack = sp.vstack if sp.issparse(Xs) else np.vstack
    X_all = stack([Xs, Xt])

    weighted = termweight.term_weighting(X_all, feat_type)
    if not sp.issparse(weighted):
        # Dense input: randomized SVD.
        reducer = TruncatedSVD(n_components=lsi_rank)
        embedded = reducer.fit_transform(weighted)
    else:
        U, S, V = LSI(weighted, lsi_rank)
        embedded = (np.dot(U, np.diag(np.sqrt(S)))
                    if use_singular_values else U)

    if l2normalization:
        embedded = sklearn.preprocessing.normalize(embedded)

    source_repr = embedded[:n_source, :]
    target_repr = embedded[n_source:, :]

    if use_original_features is True:
        source_repr = append_features(X_all[:n_source, :],
                                      source_repr,
                                      gamma=0.5)
        target_repr = append_features(X_all[n_source:, :],
                                      target_repr,
                                      gamma=0.5)

    return no_transfer(source_repr, Ys, target_repr, Yt, clf_class)

コード例 #5

0

ファイルを表示

def lsi_mda(Xs,
            Ys,
            Xt,
            Yt,
            clf_class=LogisticRegression,
            noise=0.9,
            feat_type=2,
            layer_func=lambda x: layer_function(x, 1),
            lsi_rank=100):
    """Apply mDA to the stacked domains, then run ``lsi_transfer`` on it.

    Args:
        Xs, Ys: source-domain sparse features and labels.
        Xt, Yt: target-domain sparse features and labels.
        clf_class: classifier class forwarded to ``lsi_transfer``.
        noise: second positional argument of ``denoising_autoencoders.mDA``.
        feat_type: weighting scheme applied before mDA; ``lsi_transfer`` is
            then called with ``feat_type=0``.
        layer_func: forwarded to ``mDA``.
        lsi_rank: forwarded to ``lsi_transfer``.

    Returns:
        The result of ``domain_adaptation_baseline.lsi_transfer`` on the
        mDA features.
    """
    n_source = Xs.shape[0]
    n_target = Xt.shape[0]  # evaluated as in the original; not used below

    stacked = sp.vstack([Xs, Xt])
    weighted = term_weighting(stacked, feat_type)

    # Frequent words are selected from, and sliced out of, the unweighted
    # stacked matrix.
    frequent_idx = get_most_frequent_features(stacked, 5000)
    frequent_columns = stacked[:, frequent_idx]

    hidden, _ = denoising_autoencoders.mDA(weighted.T,
                                           noise,
                                           layer_func=layer_func,
                                           Xr=frequent_columns.T,
                                           reg_lambda=1e-2)
    # Transpose back so the first axis can be split at the source count.
    da_features = hidden.T

    return domain_adaptation_baseline.lsi_transfer(
        da_features[:n_source, :],
        Ys,
        da_features[n_source:, :],
        Yt,
        clf_class,
        feat_type=0,
        lsi_rank=lsi_rank)

コード例 #6

0

ファイルを表示

ファイル: domain_adaptation_baseline.py プロジェクト: lelegan/xrce_msda_da_regularization

    def fit(self):
        """Fit ``self.clf`` on a low-rank projection of the source rows.

        ``self.Xs`` and ``self.Xt`` are stacked, term-weighted with
        ``self.feat_type`` and reduced to ``self.k`` components (``LSI`` for
        sparse input, ``TruncatedSVD`` otherwise). On the sparse path the
        factors are stored as ``self.V`` and ``self.S``. The classifier is
        then trained on the source rows and ``self.Ys``.

        Returns:
            None. The fitted state lives in ``self.clf`` (and ``self.V`` /
            ``self.S`` on the sparse path).
        """
        ndocs_source = self.Xs.shape[0]

        if sp.issparse(self.Xs):
            X_all = sp.vstack([self.Xs, self.Xt])
        else:
            X_all = np.vstack([self.Xs, self.Xt])

        X = termweight.term_weighting(X_all, self.feat_type)
        if sp.issparse(X):
            U, S, V = LSI(X, self.k)
            self.V = V
            self.S = S
            if self.use_singular_values:
                Un = np.dot(U, np.diag(np.sqrt(S)))
            else:
                Un = U

        else:
            # Dense input: randomized SVD.
            svd = TruncatedSVD(n_components=self.k)
            Un = svd.fit_transform(X)

        if self.l2norm:
            # copy=False only *tries* to work in place; when scikit-learn has
            # to convert the input a new array is returned. Rebind the result
            # so the normalization is never silently dropped.
            Un = sklearn.preprocessing.normalize(Un, 'l2', copy=False)

        Us = Un[:ndocs_source, :]

        # Optionally append the stacked (unweighted) source rows to the
        # projected features; only the literal True enables this.
        if self.use_original_features is True:
            Xs_n = X_all[:ndocs_source, :]
            Us = append_features(Xs_n, Us, gamma=0.5)

        self.clf.fit(Us, self.Ys)

コード例 #7

0

ファイルを表示

def msda_classifier(Xs,
                    Ys,
                    Xt,
                    Yt,
                    noise=0.9,
                    feat_type=0,
                    score='AUC',
                    clf=None,
                    layer_func=np.tanh,
                    self_learning=False):
    """Refine source-classifier predictions on the target with mDA.

    A classifier trained on the source domain predicts class probabilities
    for the target documents. Those probabilities are appended to the target
    features, the augmented matrix is passed through
    ``denoising_autoencoders.mDA``, and the reconstructed probability
    column(s) are scored against ``Yt``.

    Args:
        Xs, Ys: source-domain features and labels.
        Xt, Yt: target-domain features and labels.
        noise: second positional argument of ``mDA``.
        feat_type: ``8`` applies ``term_weighting`` to each domain; ``2``
            fits a ``TfidfTransformer`` on the source and applies it to the
            target; any other value uses the inputs unchanged.
        score: ``'AUC'`` returns AUC values (binary case only); any other
            value returns accuracies.
        clf: classifier to fit on the source; when ``None`` a
            ``LogisticRegression`` is cross-validated instead.
        layer_func: forwarded to ``mDA``.
        self_learning: when true, a new classifier is cross-validated on the
            target using the predicted labels, and the baseline accuracy is
            recomputed with it.

    Returns:
        ``(baseline_AUC, model_AUC)`` for a binary problem with
        ``score == 'AUC'``; otherwise ``(no_transfer_acc, m_score)``.

    Raises:
        NotImplementedError: for a multiclass problem with
            ``score == 'AUC'``.
    """
    tfidf_trans = TfidfTransformer()

    if feat_type == 8:
        Xsn = term_weighting(Xs, feat_type)
        Xtn = term_weighting(Xt, feat_type)
    elif feat_type == 2:
        Xsn = tfidf_trans.fit_transform(Xs)
        Xtn = tfidf_trans.transform(Xt)
    else:
        Xsn = Xs
        Xtn = Xt

    # Compare against None explicitly: truth-testing an estimator may invoke
    # its __len__, which can raise or be 0 for a perfectly valid object.
    if clf is None:
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xsn, Ys, sklearn.linear_model.LogisticRegression, n_jobs=5)
    else:
        clf_cv = clf.fit(Xsn, Ys)

    no_transfer_acc = clf_cv.score(Xtn, Yt)

    proba = clf_cv.predict_proba(Xtn)

    nclasses = proba.shape[1]
    multiclass = nclasses > 2

    if not multiclass:
        # Binary case: append only the probability of the second class as a
        # single float32 column.
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, vect_prob)
    else:
        # TODO: try to do it per class.
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, proba)

    if self_learning:
        Ytpred = clf_cv.predict(Xtn)
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xtn, Ytpred, sklearn.linear_model.LogisticRegression)
        no_transfer_acc = clf_cv.score(Xtn, Yt)

    # NOTE: an earlier, disabled variant appended
    # log(predict_proba + 1e-7)[:, 1] - log(0.5) instead of the raw
    # probability column.

    hw, W = denoising_autoencoders.mDA(Xt_augment.T,
                                       noise,
                                       0.05,
                                       layer_func=layer_func)
    h = hw.T

    if not multiclass:
        # TODO: this is fragile if labels 0 and 1 are swapped -- the last
        # column is thresholded at 0.5 and compared directly with Yt.
        m_score = sklearn.metrics.accuracy_score(Yt, h[:, -1] > 0.5)

        model_AUC = sklearn.metrics.roc_auc_score(Yt, h[:, -1])
        baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
        # Single pre-formatted argument: prints the same text under both
        # Python 2 and Python 3 (the old print statement is a SyntaxError
        # on Python 3).
        print("AUC %s %s" % (baseline_AUC, model_AUC))

        if score == 'AUC':
            return (baseline_AUC, model_AUC)
        else:
            return (no_transfer_acc, m_score)
    else:
        # The last `nclasses` columns are the reconstructed probabilities.
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.argmax(hy_reconstruction, axis=1)
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        if score == 'AUC':
            raise NotImplementedError
        else:
            return (no_transfer_acc, m_score)