def fit(self):
    """Fit the mDA domain-adaptation model.

    Stacks the source (``self.Xs``) and target (``self.Xt``) documents,
    learns a marginalized denoising autoencoder over the combined corpus,
    then cross-validates a LogisticRegression on the denoised source
    features.  Stores the classifier in ``self.clf`` and the learned mDA
    mapping in ``self.W``.
    """
    ndocs_source = self.Xs.shape[0]

    # Stack both domains so the autoencoder is trained on source + target.
    if sp.issparse(self.Xs):
        X_all = sp.vstack([self.Xs, self.Xt])
    else:
        X_all = np.vstack([self.Xs, self.Xt])

    if self.use_Xr:
        # Reconstruct only the 5000 most frequent features (cheaper target).
        word_selected = get_most_frequent_features(X_all, 5000)
        Xdw_most_frequent = X_all[:, word_selected]
        hw, W = denoising_autoencoders.mDA(X_all.T,
                                           self.noise,
                                           1e-2,
                                           layer_func=self.layer_func,
                                           Xr=Xdw_most_frequent.T)
    else:
        if self.use_bias:
            hw, W = denoising_autoencoders.mDA(X_all.T,
                                               self.noise,
                                               1e-2,
                                               layer_func=self.layer_func)
        else:
            # Was a Python 2 print statement; print() works on both 2 and 3.
            print("Without Bias ....")
            hw, W = denoising_autoencoders.mDA_without_bias(
                X_all.T, self.noise, 1e-2, layer_func=self.layer_func)

    # Rows are documents again after transposing the hidden representation.
    X_all_dafeatures = hw.T
    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    # Train on the denoised source documents only.
    self.clf = domain_adaptation_baseline.cross_validate_classifier(
        Xs_mda, self.Ys, LogisticRegression)
    self.W = W
# Exemple #2
def mda_exp(Xs,
            Ys,
            Xt,
            Yt,
            clf_class=LogisticRegression,
            noise=0.9,
            feat_type=2,
            layer_func=lambda x: layer_function(x, 3),
            filter_W_option=0,
            topk=50,
            cross_valid=True,
            use_Xr=True,
            use_bias=True):
    """Run one mDA domain-adaptation experiment.

    Stacks source and target corpora, optionally applies term weighting,
    learns a marginalized denoising autoencoder on the combined matrix and
    evaluates a classifier on the denoised features.

    Returns:
        (acc_bow, accuracy): bag-of-words baseline accuracy and the
        accuracy obtained with the mDA features.
    """
    # Stack the two domains so the autoencoder sees both.
    ndocs_source = Xs.shape[0]
    X_all = sp.vstack([Xs, Xt])

    # Pick reconstruction targets BEFORE weighting: frequencies must be
    # computed on the raw counts.
    word_selected = get_most_frequent_features(X_all, 5000)

    if feat_type > 0:
        X_all = term_weighting(X_all, feat_type=feat_type)

    Xdw_most_frequent = X_all[:, word_selected]

    # Bag-of-words baseline: train on source, test on target, no transfer.
    acc_bow = domain_adaptation_baseline.no_transfer(X_all[:ndocs_source, :],
                                                     Ys,
                                                     X_all[ndocs_source:, :],
                                                     Yt)
    if use_Xr:
        # Reconstruct only the most frequent words.
        hw, W = denoising_autoencoders.mDA(X_all.T,
                                           noise,
                                           1e-2,
                                           layer_func=layer_func,
                                           Xr=Xdw_most_frequent.T,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    else:
        if use_bias:
            hw, W = denoising_autoencoders.mDA(X_all.T,
                                               noise,
                                               1e-2,
                                               layer_func=layer_func,
                                               filter_W_option=filter_W_option,
                                               topk=topk)
        else:
            print("Without Bias ....")
            hw, W = denoising_autoencoders.mDA_without_bias(
                X_all.T, noise, 1e-2, layer_func=layer_func)

    accuracy = evaluate_mda_features(hw,
                                     Ys,
                                     Yt,
                                     ndocs_source,
                                     clf_class,
                                     cross_valid=cross_valid)

    return acc_bow, accuracy
# Exemple #3
def lsi_mda(Xs,
            Ys,
            Xt,
            Yt,
            clf_class=LogisticRegression,
            noise=0.9,
            feat_type=2,
            layer_func=lambda x: layer_function(x, 1),
            lsi_rank=100):
    """Apply mDA first, then an LSI-based transfer classifier.

    Learns the mDA representation on term-weighted, stacked source+target
    documents and hands the denoised source/target splits to
    ``domain_adaptation_baseline.lsi_transfer``.

    Returns:
        Whatever ``lsi_transfer`` returns for the denoised features.
    """
    ndocs_source = Xs.shape[0]

    X_all = sp.vstack([Xs, Xt])
    X = term_weighting(X_all, feat_type=feat_type)

    # Reconstruction targets come from the raw (unweighted) counts.
    word_selected = get_most_frequent_features(X_all, 5000)
    Xdw_most_frequent = X_all[:, word_selected]

    hx, _ = denoising_autoencoders.mDA(X.T,
                                       noise,
                                       layer_func=layer_func,
                                       Xr=Xdw_most_frequent.T,
                                       reg_lambda=1e-2)
    X_all_dafeatures = hx.T

    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    Xt_mda = X_all_dafeatures[ndocs_source:, :]

    # feat_type=0: the features were already weighted above.
    return domain_adaptation_baseline.lsi_transfer(Xs_mda,
                                                   Ys,
                                                   Xt_mda,
                                                   Yt,
                                                   clf_class,
                                                   feat_type=0,
                                                   lsi_rank=lsi_rank)
# Exemple #4
def msda_ssl(X, Y, noise=0.9, layer_func=lambda x: x):
    """Semi-supervised experiment: mDA reconstruction of predicted labels.

    Trains a logistic regression on a random 100-example split, appends its
    predicted P(y=1) as an extra feature column on the held-out split,
    denoises the augmented matrix with mDA and reads the reconstructed
    probability column back as the model's prediction.

    Returns:
        (baseline_score, m_score): accuracy of the plain LR probabilities
        and of the mDA-reconstructed probabilities on the held-out split.
    """
    lr = sklearn.linear_model.LogisticRegression()

    # Single random split: 100 training docs, everything else held out.
    rs = sklearn.cross_validation.ShuffleSplit(X.shape[0],
                                               train_size=100,
                                               test_size=X.shape[0] - 100)
    splits = [(ta_idx, te_idx) for ta_idx, te_idx in rs]
    train_idx, test_idx = splits[0]

    lr.fit(X[train_idx], Y[train_idx])

    proba = lr.predict_proba(X[test_idx])
    Py_d = proba[:, 1]
    vect_prob = np.zeros((X[test_idx].shape[0], 1), dtype='f')
    vect_prob[:, 0] = Py_d[:]
    # Append P(y=1) as a last feature column, then denoise the whole matrix.
    Xt_augment = domain_adaptation_baseline.append_features(
        X[test_idx], vect_prob)
    hw, W = denoising_autoencoders.mDA(Xt_augment.T,
                                       noise,
                                       0.05,
                                       layer_func=layer_func)
    h = hw.T
    # The last column of the reconstruction is the denoised probability.
    m_score = sklearn.metrics.accuracy_score(Y[test_idx], h[:, -1] > 0.5)
    # Fixed typo: was "basleine_score".
    baseline_score = sklearn.metrics.accuracy_score(Y[test_idx],
                                                    vect_prob > 0.5)

    model_AUC = sklearn.metrics.roc_auc_score(Y[test_idx], h[:, -1])
    baseline_AUC = sklearn.metrics.roc_auc_score(Y[test_idx], Py_d)

    print("AUC", model_AUC, baseline_AUC)
    print("ACC", m_score, baseline_score)

    return baseline_score, m_score
def msda_exp_testset(Xs,
                     Ys,
                     Xt,
                     Xtest,
                     Ytest,
                     clf_class=LogisticRegression,
                     noise=0.9,
                     feat_type=0,
                     layer_func=np.tanh,
                     filter_W_option=0,
                     topk=50,
                     cross_valid=True,
                     use_Xr=True,
                     use_bias=True):
    """mDA experiment evaluated on a held-out test set.

    Learns the mDA mapping W on stacked source+target documents, trains a
    classifier on the denoised source features and evaluates it on
    ``Xtest`` transformed through W.

    Returns:
        (acc_bow, accuracy): the BOW baseline (disabled, hard-coded to -1)
        and the test-set accuracy of the mDA pipeline.
    """
    ndocs_source = Xs.shape[0]

    X_all = sp.vstack([Xs, Xt])
    #TODO RLU
    word_selected = get_most_frequent_features(X_all, 5000)
    Xdw_most_frequent = X_all[:, word_selected]

    # BOW baseline disabled here; -1 kept so the return shape matches
    # mda_exp.
    acc_bow = -1
    print("BOW Baseline", acc_bow)
    if use_Xr:
        hw, W = denoising_autoencoders.mDA(X_all.T,
                                           noise,
                                           1e-2,
                                           layer_func=layer_func,
                                           Xr=Xdw_most_frequent.T,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    else:
        if use_bias:
            hw, W = denoising_autoencoders.mDA(X_all.T,
                                               noise,
                                               1e-2,
                                               layer_func=layer_func,
                                               filter_W_option=filter_W_option,
                                               topk=topk)
        else:
            print("Without Bias ....")
            hw, W = denoising_autoencoders.mDA_without_bias(
                X_all.T, noise, 1e-2, layer_func=layer_func)

    X_all_dafeatures = hw.T

    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    # Project the unseen test set through the learned mapping.
    Xtest_msda = denoising_autoencoders.transform_test(Xtest.T,
                                                       W,
                                                       layer_func=layer_func,
                                                       use_bias=use_bias).T

    # Train on denoised source features, evaluate on the projected test set.
    clf = domain_adaptation_baseline.cross_validate_classifier(
        Xs_mda, Ys, clf_class)
    Y_pred = clf.predict(Xtest_msda)

    print(classification_report(Ytest, Y_pred))

    accuracy = sklearn.metrics.accuracy_score(Ytest, Y_pred)

    return acc_bow, accuracy
# Exemple #6
def msda_classifier(Xs,
                    Ys,
                    Xt,
                    Yt,
                    noise=0.9,
                    feat_type=0,
                    score='AUC',
                    clf=None,
                    layer_func=np.tanh,
                    self_learning=False):
    """mDA on target features augmented with classifier probabilities.

    Trains (or reuses) a classifier on the source, appends its predicted
    class probabilities to the target features, denoises the augmented
    matrix with mDA and reads the reconstructed probability columns back
    as predictions.

    Returns:
        (baseline, model) scores: AUC pair when ``score == 'AUC'`` (binary
        only), otherwise an accuracy pair.

    Raises:
        NotImplementedError: AUC requested in the multiclass case.
    """

    tfidf_trans = TfidfTransformer()

    # Feature weighting: 8 -> custom term weighting, 2 -> tf-idf fitted on
    # the source and applied to the target, otherwise raw features.
    if feat_type == 8:
        Xsn = term_weighting(Xs, feat_type)
        Xtn = term_weighting(Xt, feat_type)
    elif feat_type == 2:
        Xsn = tfidf_trans.fit_transform(Xs)
        Xtn = tfidf_trans.transform(Xt)
    else:
        Xsn = Xs
        Xtn = Xt

    # If no classifier is given, cross-validate one on the source domain.
    if clf is None:
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xsn, Ys, sklearn.linear_model.LogisticRegression, n_jobs=5)
    else:
        clf_cv = clf.fit(Xsn, Ys)

    no_transfer_acc = clf_cv.score(Xtn, Yt)

    proba = clf_cv.predict_proba(Xtn)

    nclasses = proba.shape[1]
    multiclass = nclasses > 2

    if not multiclass:
        # Binary: append the single P(y=1) column.
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, vect_prob)
    else:
        #TODO Try to do it Per Class
        # Multiclass: append the whole probability matrix.
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, proba)

    if self_learning:
        # Self-training: refit on the target using its own predictions.
        Ytpred = clf_cv.predict(Xtn)
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xtn, Ytpred, sklearn.linear_model.LogisticRegression)
        no_transfer_acc = clf_cv.score(Xtn, Yt)

    hw, W = denoising_autoencoders.mDA(Xt_augment.T,
                                       noise,
                                       0.05,
                                       layer_func=layer_func)
    h = hw.T

    if not multiclass:
        # NOTE(review): thresholding the reconstructed column assumes
        # label 1 is the positive class -- fragile if labels are swapped.
        m_score = sklearn.metrics.accuracy_score(Yt, h[:, -1] > 0.5)

        model_AUC = sklearn.metrics.roc_auc_score(Yt, h[:, -1])
        baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
        print("AUC", baseline_AUC, model_AUC)

        if score == 'AUC':
            return (baseline_AUC, model_AUC)
        else:
            return (no_transfer_acc, m_score)
    else:
        # Last nclasses columns are the reconstructed probability block.
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.argmax(hy_reconstruction, axis=1)
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        if score == 'AUC':
            raise NotImplementedError
        else:
            return (no_transfer_acc, m_score)
# Exemple #7
def msda_classifier_with_scores(Xt,
                                Yt,
                                St,
                                use_pred=False,
                                noise=0.9,
                                score='AUC',
                                clf=None,
                                layer_func=np.tanh):
    """mDA reconstruction on top of pre-computed class scores.

    ``St`` holds per-document class scores (one column per class).  The
    scores (or, with ``use_pred``, their one-hot argmax) are appended to
    the target features, the augmented matrix is denoised with mDA, and
    the reconstructed score columns are read back as predictions.

    Returns:
        (baseline, model) scores: AUC pair when ``score == 'AUC'`` (binary
        only), otherwise an accuracy pair.

    Raises:
        Exception: ``use_pred`` in the binary case (not implemented).
        NotImplementedError: AUC requested in the multiclass case.
    """
    Xtn = Xt

    # Baseline: argmax over the provided scores, no transfer.
    Y_pred = np.argmax(St, axis=1)
    no_transfer_acc = sklearn.metrics.accuracy_score(Yt, Y_pred)

    proba = St
    # One-hot encoding of the hard predictions.
    pred_features = np.zeros(proba.shape, dtype='f')
    print(Y_pred[:10])

    for i, cindx in enumerate(Y_pred):
        pred_features[i, cindx] = 1.0

    print(pred_features[:10, :])

    nclasses = proba.shape[1]
    multiclass = nclasses > 2

    if not multiclass:
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        if use_pred:
            raise Exception('Use Pred not implemented for binary cases')
        else:
            Xt_augment = domain_adaptation_baseline.append_features(
                Xtn, vect_prob)
    else:
        #TODO Try to do it Per Class
        # Append either the one-hot predictions or the raw score matrix.
        if use_pred:
            Xt_augment = domain_adaptation_baseline.append_features(
                Xtn, pred_features)
        else:
            Xt_augment = domain_adaptation_baseline.append_features(Xtn, proba)

    hw, W = denoising_autoencoders.mDA(Xt_augment.T,
                                       noise,
                                       0.05,
                                       layer_func=layer_func)
    h = hw.T

    if not multiclass:
        # NOTE(review): thresholding the reconstructed column assumes
        # label 1 is the positive class -- fragile if labels are swapped.
        m_score = sklearn.metrics.accuracy_score(Yt, h[:, -1] > 0.5)

        model_AUC = sklearn.metrics.roc_auc_score(Yt, h[:, -1])
        baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
        print("AUC", baseline_AUC, model_AUC)

        if score == 'AUC':
            return (baseline_AUC, model_AUC)
        else:
            return (no_transfer_acc, m_score)
    else:
        # Last nclasses columns are the reconstructed score block.
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.argmax(hy_reconstruction, axis=1)
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        if score == 'AUC':
            raise NotImplementedError
        else:
            return (no_transfer_acc, m_score)