    def fit(self):
        ndocs_source = self.Xs.shape[0]

        if sp.issparse(self.Xs):
            X_all = sp.vstack([self.Xs, self.Xt])
        else:
            X_all = np.vstack([self.Xs, self.Xt])
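        # The stacked source+target matrix is fed to mDA (marginalized denoising
        # autoencoder, cf. Chen et al., 2012), which learns a linear denoising mapping W
        # in closed form by marginalizing out the corruption noise; hw holds the denoised
        # representations of the stacked rows.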

        if self.use_Xr:
            word_selected = get_most_frequent_features(X_all, 5000)
            Xdw_most_frequent = X_all[:, word_selected]
            hw, W = denoising_autoencoders.mDA(X_all.T,
                                               self.noise,
                                               1e-2,
                                               layer_func=self.layer_func,
                                               Xr=Xdw_most_frequent.T)
        else:
            if self.use_bias:
                hw, W = denoising_autoencoders.mDA(X_all.T,
                                                   self.noise,
                                                   1e-2,
                                                   layer_func=self.layer_func)
            else:
                print "Without Bias ...."
                hw, W = denoising_autoencoders.mDA_without_bias(
                    X_all.T, self.noise, 1e-2, layer_func=self.layer_func)

        X_all_dafeatures = hw.T
        Xs_mda = X_all_dafeatures[:ndocs_source, :]
        #Train
        self.clf = domain_adaptation_baseline.cross_validate_classifier(
            Xs_mda, self.Ys, LogisticRegression)
        self.W = W
    def fit_with_param(self, alpha, eta):
        ns = self.Xs.shape[0]

        if sp.issparse(self.Xs):
            X_all = sp.vstack([self.Xs, self.Xt])
        else:
            X_all = np.vstack([self.Xs, self.Xt])

        self.fit_domain_classifier(alpha, X_all)
        Z = np.dot(self.C, self.C.T)
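        # self.C is set by fit_domain_classifier above and is expected to hold the
        # domain-classifier weights (one column per classifier), so Z = C C^T is an
        # n_features x n_features matrix feeding the domain regularizer below.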

        Dvector = np.array(self.D_vector)
        if self.orthogonal_reg:
            eta = -eta
            #Change Dvector only in msda, else C becomes 0
            Dvector[:] = 0.0

        if self.target_reg:
            eta = -eta
            #Change Dvector only in msda, else C becomes 0
            Dvector[:] = 1.0

        if self.source_reg:
            Dvector[:] = -1.0

        Icc = np.linalg.inv(np.eye(X_all.shape[1]) - eta * Z)
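        # Precompute (I - eta * Z)^{-1}; presumably this inverse enters the closed-form
        # solution of the domain-regularized reconstruction objective inside
        # mDA_domain_regularization (whose implementation is not shown in this listing).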

        print("Computing msda with Domain Regularizer")
        hx, W = denoising_autoencoders.mDA_domain_regularization(
            X_all.T, self.noise, eta, self.C, Dvector, Icc, reg_lambda=0.0)
        #Check it is the same link function ....
        Xs_mda = hx.T[:ns, :]
        print("CrossValidating Source Classifier")

        da_corpus = domain_adaptation_baseline.DACorpus(
            self.Xs, self.Ys, self.Xt, self.Yt)
        Ytindex = da_corpus.get_labelled_target_index()

        if len(Ytindex) > 0:
            Ytlabels = self.Yt[Ytindex]
            Yclf = np.hstack([self.Ys, Ytlabels])
            #sample_weight = np.hstack([np.ones(self.Ys.shape[0]),2*np.ones(Ytlabels.shape[0])])
            Xs_mda = hx.T[:ns, :]
            Xt_mda = hx.T[ns:, :]
            Xtrain = np.vstack([Xs_mda, Xt_mda[Ytindex]])
        else:
            Yclf = self.Ys
            Xtrain = hx.T[:ns, :]

        self.clf = domain_adaptation_baseline.cross_validate_classifier(
            Xtrain,
            Yclf,
            LogisticRegression,
            score='accuracy',
            ncv=3,
            n_jobs=3,
            verbose=1)
        #self.clf = exp_run.cross_validate_classifier(Xs_mda,self.Ys,LogisticRegression,score='accuracy',ncv=3,n_jobs=3)
        self.W = W
Example 3
def evaluate_mda_features(hw,
                          Ys,
                          Yt,
                          ndocs_source,
                          clf,
                          cross_valid=True,
                          n_jobs=1):
    X_all_dafeatures = hw.T

    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    Xt_mda = X_all_dafeatures[ndocs_source:, :]
    if cross_valid:
        clf = domain_adaptation_baseline.cross_validate_classifier(
            Xs_mda, Ys, clf, n_jobs=n_jobs)
    else:
        clf.fit(Xs_mda, Ys)

    Y_pred = clf.predict(Xt_mda)
    print(classification_report(Yt, Y_pred))

    accuracy = sklearn.metrics.accuracy_score(Yt, Y_pred)

    return accuracy
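
# A minimal usage sketch for evaluate_mda_features (assumptions: Xs/Ys and Xt/Yt are
# feature matrices and label arrays prepared elsewhere; noise=0.9 and the 1e-2
# regularizer simply mirror the values used in the surrounding examples):
#
#   X_all = sp.vstack([Xs, Xt]) if sp.issparse(Xs) else np.vstack([Xs, Xt])
#   hw, W = denoising_autoencoders.mDA(X_all.T, 0.9, 1e-2, layer_func=np.tanh)
#   acc = evaluate_mda_features(hw, Ys, Yt, Xs.shape[0],
#                               LogisticRegression(), cross_valid=False)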
    def cross_val_source(self):
        ns = self.Xs.shape[0]
        if sp.issparse(self.Xs):
            X_all = sp.vstack([self.Xs, self.Xt])
        else:
            X_all = np.vstack([self.Xs, self.Xt])

        ACC = []
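        # Grid search over (alpha, eta): alpha parametrizes the domain-classifier fit,
        # eta scales (and, via the flags below, flips the sign of) the domain regularizer.
        # The pair with the best source-side CV accuracy is refit at the end.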

        for alpha in self.alphas:
            self.fit_domain_classifier(alpha, X_all)
            #print "Normalizing C" #
            #self.C = sklearn.preprocessing.normalize(self.C)
            Z = np.dot(self.C, self.C.T)
            for eta in self.etas:
                Dvector = np.array(self.D_vector)
                if self.orthogonal_reg:
                    eta = -eta
                    #Change Dvector only in msda, else C becomes 0
                    Dvector[:] = 0.0

                if self.target_reg:
                    eta = -eta
                    #Change Dvector only in msda, else C becomes 0
                    Dvector[:] = 1.0

                Icc = np.linalg.inv(np.eye(X_all.shape[1]) - eta * Z)
                print("Computing msda with Domain Regularizer")
                hx, W = denoising_autoencoders.mDA_domain_regularization(
                    X_all.T,
                    self.noise,
                    eta,
                    self.C,
                    Dvector,
                    Icc,
                    reg_lambda=0.0)
                #Check it is the same link function ....
                Xs_mda = hx.T[:ns, :]

                print("CV Source Classifier")
                if self.cross_valid:
                    self.clf = domain_adaptation_baseline.cross_validate_classifier(
                        Xs_mda,
                        self.Ys,
                        LogisticRegression,
                        score='accuracy',
                        ncv=3,
                        n_jobs=3,
                        verbose=0)
                else:
                    self.clf = self.default_clf.fit(Xs_mda, self.Ys)
                #print clf.grid_scores_
                #max_score= clf.best_score_
                #clf.fit(Xs_mda,Ys)
                source_score = sklearn.model_selection.cross_val_score(
                    self.clf, Xs_mda, y=self.Ys, cv=3, n_jobs=3).mean()
                ACC.append((source_score, (alpha, eta)))

        #Then find max
        sorted_ACC = sorted(ACC)
        self.source_cv_accuracy = sorted_ACC
        print(sorted_ACC)

        best_alpha = sorted_ACC[-1][1][0]
        best_eta = sorted_ACC[-1][1][1]

        Dvector = np.array(self.D_vector)
        if self.target_reg:
            #Change Dvector only in msda, else C becomes 0
            Dvector[:] = 1.0
        #The best eta should be a negative value ...
        # and Dvector already set

        #Refit the model with the selected (alpha, eta)
        self.fit_domain_classifier(best_alpha, X_all)
        Z = np.dot(self.C, self.C.T)
        Icc = np.linalg.inv(np.eye(X_all.shape[1]) - best_eta * Z)
        print "Computing msda with Domain Regularizer"
        hx, W = denoising_autoencoders.mDA_domain_regularization(
            X_all.T,
            self.noise,
            best_eta,
            self.C,
            Dvector,
            Icc,
            reg_lambda=0.0)
        Xs_mda = hx.T[:ns, :]
        self.W = W

        if self.cross_valid:
            self.clf = domain_adaptation_baseline.cross_validate_classifier(
                Xs_mda,
                self.Ys,
                LogisticRegression,
                score='accuracy',
                ncv=3,
                n_jobs=3,
                verbose=0)
        else:
            self.clf = self.default_clf.fit(Xs_mda, self.Ys)
def msda_exp_testset(Xs,
                     Ys,
                     Xt,
                     Xtest,
                     Ytest,
                     clf_class=LogisticRegression,
                     noise=0.9,
                     feat_type=0,
                     layer_func=np.tanh,
                     filter_W_option=0,
                     topk=50,
                     cross_valid=True,
                     use_Xr=True,
                     use_bias=True):
    #Stack Dataset Together
    ndocs_source = Xs.shape[0]
    ndocs_target = Xt.shape[0]

    X_all = sp.vstack([Xs, Xt])
    #TODO ReLU
    word_selected = get_most_frequent_features(X_all, 5000)

    Xdw_most_frequent = X_all[:, word_selected]

    #acc_bow = domain_adaptation_baseline.no_transfer(X_all[:ndocs_source,:],Ys,X_all[ndocs_source:,:],Yt)
    acc_bow = -1
    print "BOW Baseline", acc_bow
    if use_Xr:
        hw, W = denoising_autoencoders.mDA(X_all.T,
                                           noise,
                                           1e-2,
                                           layer_func=layer_func,
                                           Xr=Xdw_most_frequent.T,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    else:
        if use_bias:
            hw, W = denoising_autoencoders.mDA(X_all.T,
                                               noise,
                                               1e-2,
                                               layer_func=layer_func,
                                               filter_W_option=filter_W_option,
                                               topk=topk)
        else:
            print "Without Bias ...."
            hw, W = denoising_autoencoders.mDA_without_bias(
                X_all.T, noise, 1e-2, layer_func=layer_func)

    X_all_dafeatures = hw.T

    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    Xtest_msda = denoising_autoencoders.transform_test(Xtest.T,
                                                       W,
                                                       layer_func=layer_func,
                                                       use_bias=use_bias).T
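    # Test documents are mapped through the same learned W via transform_test so that
    # the training features (Xs_mda) and the test features (Xtest_msda) live in the same
    # denoised space.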

    #Train
    clf = domain_adaptation_baseline.cross_validate_classifier(
        Xs_mda, Ys, clf_class)
    Y_pred = clf.predict(Xtest_msda)

    print(classification_report(Ytest, Y_pred))

    accuracy = sklearn.metrics.accuracy_score(Ytest, Y_pred)

    return acc_bow, accuracy
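
# A minimal usage sketch for msda_exp_testset (assumption: the count matrices and labels
# come from a vectorization pipeline defined elsewhere; parameter values are illustrative).
def _example_msda_exp_testset(Xs, Ys, Xt, Xtest, Ytest):
    # Learn mSDA features on source+target, train on the source, score on the test set.
    acc_bow, acc_msda = msda_exp_testset(Xs, Ys, Xt, Xtest, Ytest,
                                         noise=0.9, layer_func=np.tanh,
                                         use_Xr=True, use_bias=True)
    print("mSDA test accuracy:", acc_msda)
    return acc_msda
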
Example 6
def msda_classifier(Xs,
                    Ys,
                    Xt,
                    Yt,
                    noise=0.9,
                    feat_type=0,
                    score='AUC',
                    clf=None,
                    layer_func=np.tanh,
                    self_learning=False):

    tfidf_trans = TfidfTransformer()

    if feat_type == 8:
        Xsn = term_weighting(Xs, feat_type)
        Xtn = term_weighting(Xt, feat_type)
    elif feat_type == 2:
        Xsn = tfidf_trans.fit_transform(Xs)
        Xtn = tfidf_trans.transform(Xt)
    else:
        Xsn = Xs
        Xtn = Xt

    #If no classifier is given, we cross-validate one
    if clf is None:
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xsn, Ys, sklearn.linear_model.LogisticRegression, n_jobs=5)
    else:
        clf_cv = clf.fit(Xsn, Ys)

    no_transfer_acc = clf_cv.score(Xtn, Yt)

    proba = clf_cv.predict_proba(Xtn)

    nclasses = proba.shape[1]
    multiclass = nclasses > 2

    if not multiclass:
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        #Xt_augment=domain_adaptation_baseline.append_features(Xt,vect_prob)
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, vect_prob)
    else:
        #TODO Try to do it Per Class
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, proba)

    if self_learning:
        Ytpred = clf_cv.predict(Xtn)
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xtn, Ytpred, sklearn.linear_model.LogisticRegression)
        no_transfer_acc = clf_cv.score(Xtn, Yt)
    '''
    log_proba = np.log( clf_cv.predict_proba(Xtn) +0.0000001)
    #log_proba = clf_cv.predict_log_proba(Xtn)
    Py_d = log_proba[:,1] -np.log(0.5)
    vect_prob=np.zeros((Xt.shape[0],1),dtype='f')
    vect_prob[:,0]=Py_d[:]
    Xt_augment=domain_adaptation_baseline.append_features(Xtn,vect_prob)
    '''
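    # Xt_augment now holds the target features augmented with the source classifier's
    # predicted probabilities; mDA below denoises the augmented matrix, and the
    # reconstructed probability column(s) at the end of h are read back as the adapted
    # predictions.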

    hw, W = denoising_autoencoders.mDA(Xt_augment.T,
                                       noise,
                                       0.05,
                                       layer_func=layer_func)
    h = hw.T

    if not multiclass:
        #TODO This is dangerous if labels 0 and 1 get swapped in the decision, no?
        m_score = sklearn.metrics.accuracy_score(Yt, h[:, -1] > 0.5)
        #m_score = sklearn.metrics.accuracy_score(Yt,(h[:,-1]-np.log(0.5))>0)

        model_AUC = sklearn.metrics.roc_auc_score(Yt, h[:, -1])
        baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
        print "AUC", baseline_AUC, model_AUC

        if score == 'AUC':
            return (baseline_AUC, model_AUC)
        else:
            return (no_transfer_acc, m_score)
    else:
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.argmax(hy_reconstruction, axis=1)
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        if score == 'AUC':
            raise NotImplementedError
        else:
            return (no_transfer_acc, m_score)
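
# A minimal usage sketch for msda_classifier (assumption: Xs/Ys and Xt/Yt are raw count
# matrices with labels; feat_type=2 selects the TF-IDF branch above).
def _example_msda_classifier(Xs, Ys, Xt, Yt):
    # Compare the no-transfer baseline with the prediction-augmented mSDA score.
    baseline_acc, msda_acc = msda_classifier(Xs, Ys, Xt, Yt,
                                             noise=0.9, feat_type=2,
                                             score='accuracy')
    print("no-transfer vs mSDA:", baseline_acc, msda_acc)
    return baseline_acc, msda_acc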