    def fit(self):
        # Stack source and target documents so the denoising autoencoder
        # learns a feature representation shared by both domains.
        ndocs_source = self.Xs.shape[0]
        if sp.issparse(self.Xs):
            X_all = sp.vstack([self.Xs, self.Xt])
        else:
            X_all = np.vstack([self.Xs, self.Xt])
        if self.use_Xr:
            # Reconstruct only the 5000 most frequent words instead of the
            # full vocabulary (the reduced-reconstruction mDA variant).
            word_selected = get_most_frequent_features(X_all, 5000)
            Xdw_most_frequent = X_all[:, word_selected]
            hw, W = denoising_autoencoders.mDA(X_all.T, self.noise, 1e-2,
                                               layer_func=self.layer_func,
                                               Xr=Xdw_most_frequent.T)
        elif self.use_bias:
            hw, W = denoising_autoencoders.mDA(X_all.T, self.noise, 1e-2,
                                               layer_func=self.layer_func)
        else:
            print("Without bias ...")
            hw, W = denoising_autoencoders.mDA_without_bias(
                X_all.T, self.noise, 1e-2, layer_func=self.layer_func)
        X_all_dafeatures = hw.T
        Xs_mda = X_all_dafeatures[:ndocs_source, :]
        # Train the final classifier on the denoised source features.
        self.clf = domain_adaptation_baseline.cross_validate_classifier(
            Xs_mda, self.Ys, LogisticRegression)
        self.W = W

def mda_exp(Xs, Ys, Xt, Yt, clf_class=LogisticRegression, noise=0.9,
            feat_type=2, layer_func=lambda x: layer_function(x, 3),
            filter_W_option=0, topk=50, cross_valid=True, use_Xr=True,
            use_bias=True):
    # Stack the source and target datasets together.
    ndocs_source = Xs.shape[0]
    X_all = sp.vstack([Xs, Xt])
    # Pick the reconstruction vocabulary on the raw counts, then apply
    # term weighting to the autoencoder inputs.
    word_selected = get_most_frequent_features(X_all, 5000)
    if feat_type > 0:
        X_all = term_weighting(X_all, feat_type=feat_type)
    Xdw_most_frequent = X_all[:, word_selected]
    # No-transfer bag-of-words baseline: train on source, test on target.
    acc_bow = domain_adaptation_baseline.no_transfer(
        X_all[:ndocs_source, :], Ys, X_all[ndocs_source:, :], Yt)
    if use_Xr:
        hw, W = denoising_autoencoders.mDA(X_all.T, noise, 1e-2,
                                           layer_func=layer_func,
                                           Xr=Xdw_most_frequent.T,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    elif use_bias:
        hw, W = denoising_autoencoders.mDA(X_all.T, noise, 1e-2,
                                           layer_func=layer_func,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    else:
        print("Without bias ...")
        hw, W = denoising_autoencoders.mDA_without_bias(
            X_all.T, noise, 1e-2, layer_func=layer_func)
    accuracy = evaluate_mda_features(hw, Ys, Yt, ndocs_source, clf_class,
                                     cross_valid=cross_valid)
    return acc_bow, accuracy

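
# Hedged usage sketch for mda_exp: the helper below fabricates small random
# "source" and "target" bag-of-words matrices so the entry points in this
# module can be smoke-tested. The shapes, label values, and 6000-word
# vocabulary are illustrative assumptions, not settings from the original
# experiments.
def _make_synthetic_domains(ndocs=200, nwords=6000, seed=0):
    rng = np.random.RandomState(seed)
    Xs = sp.csr_matrix(rng.poisson(0.05, size=(ndocs, nwords)).astype('f'))
    Xt = sp.csr_matrix(rng.poisson(0.05, size=(ndocs, nwords)).astype('f'))
    Ys = rng.randint(0, 2, size=ndocs)
    Yt = rng.randint(0, 2, size=ndocs)
    return Xs, Ys, Xt, Yt


def _example_mda_exp():
    # Returns (BOW baseline accuracy, mSDA accuracy); with random labels
    # both should hover around chance.
    Xs, Ys, Xt, Yt = _make_synthetic_domains()
    return mda_exp(Xs, Ys, Xt, Yt, noise=0.9, feat_type=2)
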
def lsi_mda(Xs, Ys, Xt, Yt, clf_class=LogisticRegression, noise=0.9,
            feat_type=2, layer_func=lambda x: layer_function(x, 1),
            lsi_rank=100):
    # First mDA, then LSI on the denoised features.
    # Stack the source and target datasets together.
    ndocs_source = Xs.shape[0]
    X_all = sp.vstack([Xs, Xt])
    X = term_weighting(X_all, feat_type)
    word_selected = get_most_frequent_features(X_all, 5000)
    Xdw_most_frequent = X_all[:, word_selected]
    hx, _ = denoising_autoencoders.mDA(X.T, noise, layer_func=layer_func,
                                       Xr=Xdw_most_frequent.T,
                                       reg_lambda=1e-2)
    X_all_dafeatures = hx.T
    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    Xt_mda = X_all_dafeatures[ndocs_source:, :]
    # feat_type=0: the features are already weighted, so lsi_transfer
    # should not re-weight them.
    return domain_adaptation_baseline.lsi_transfer(Xs_mda, Ys, Xt_mda, Yt,
                                                   clf_class, feat_type=0,
                                                   lsi_rank=lsi_rank)

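
# Hedged example for the mDA-then-LSI pipeline, reusing the synthetic data
# helper defined above; lsi_rank=100 mirrors the function's default.
def _example_lsi_mda():
    Xs, Ys, Xt, Yt = _make_synthetic_domains()
    return lsi_mda(Xs, Ys, Xt, Yt, lsi_rank=100)
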
def msda_ssl(X, Y, noise=0.9, layer_func=lambda x: x):
    # Semi-supervised setting: train a classifier on 100 labeled examples,
    # append its predicted probability P(y=1|d) to the remaining documents
    # as an extra feature, and denoise the augmented matrix with mDA.
    # NOTE: uses sklearn.model_selection (the old sklearn.cross_validation
    # module has been removed from scikit-learn).
    lr = sklearn.linear_model.LogisticRegression()
    rs = sklearn.model_selection.ShuffleSplit(
        n_splits=1, train_size=100, test_size=X.shape[0] - 100)
    train_idx, test_idx = next(rs.split(X))
    lr.fit(X[train_idx], Y[train_idx])
    proba = lr.predict_proba(X[test_idx])
    Py_d = proba[:, 1]
    vect_prob = np.zeros((X[test_idx].shape[0], 1), dtype='f')
    vect_prob[:, 0] = Py_d[:]
    Xt_augment = domain_adaptation_baseline.append_features(
        X[test_idx], vect_prob)
    hw, W = denoising_autoencoders.mDA(Xt_augment.T, noise, 0.05,
                                       layer_func=layer_func)
    h = hw.T
    # The last column of h is the denoised reconstruction of the appended
    # probability feature; threshold it at 0.5 to predict labels.
    m_score = sklearn.metrics.accuracy_score(Y[test_idx], h[:, -1] > 0.5)
    baseline_score = sklearn.metrics.accuracy_score(Y[test_idx],
                                                    vect_prob[:, 0] > 0.5)
    model_AUC = sklearn.metrics.roc_auc_score(Y[test_idx], h[:, -1])
    baseline_AUC = sklearn.metrics.roc_auc_score(Y[test_idx], Py_d)
    print("AUC", model_AUC, baseline_AUC)
    print("ACC", m_score, baseline_score)
    return baseline_score, m_score

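
# Hedged smoke test for msda_ssl on dense synthetic data with a linearly
# separable label; all shapes and the seed are illustrative assumptions.
def _example_msda_ssl():
    rng = np.random.RandomState(0)
    X = rng.randn(400, 50).astype('f')
    Y = (X.dot(rng.randn(50)) > 0).astype(int)
    return msda_ssl(X, Y, noise=0.9)
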
def msda_exp_testset(Xs, Ys, Xt, Xtest, Ytest, clf_class=LogisticRegression,
                     noise=0.9, feat_type=0, layer_func=np.tanh,
                     filter_W_option=0, topk=50, cross_valid=True,
                     use_Xr=True, use_bias=True):
    # Stack the source and target datasets together.
    ndocs_source = Xs.shape[0]
    X_all = sp.vstack([Xs, Xt])
    # TODO: try a ReLU layer function.
    word_selected = get_most_frequent_features(X_all, 5000)
    Xdw_most_frequent = X_all[:, word_selected]
    # The BOW baseline is skipped here; -1 is a placeholder value.
    acc_bow = -1
    print("BOW Baseline", acc_bow)
    if use_Xr:
        hw, W = denoising_autoencoders.mDA(X_all.T, noise, 1e-2,
                                           layer_func=layer_func,
                                           Xr=Xdw_most_frequent.T,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    elif use_bias:
        hw, W = denoising_autoencoders.mDA(X_all.T, noise, 1e-2,
                                           layer_func=layer_func,
                                           filter_W_option=filter_W_option,
                                           topk=topk)
    else:
        print("Without bias ...")
        hw, W = denoising_autoencoders.mDA_without_bias(
            X_all.T, noise, 1e-2, layer_func=layer_func)
    X_all_dafeatures = hw.T
    Xs_mda = X_all_dafeatures[:ndocs_source, :]
    # Project the held-out test set through the learned mapping W.
    Xtest_msda = denoising_autoencoders.transform_test(
        Xtest.T, W, layer_func=layer_func, use_bias=use_bias).T
    # Train on the denoised source features, evaluate on the test set.
    clf = domain_adaptation_baseline.cross_validate_classifier(
        Xs_mda, Ys, clf_class)
    Y_pred = clf.predict(Xtest_msda)
    print(classification_report(Ytest, Y_pred))
    accuracy = sklearn.metrics.accuracy_score(Ytest, Y_pred)
    return acc_bow, accuracy

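
# Hedged example for the held-out-test-set variant, reusing the synthetic
# helper; here the unlabeled target documents double as the test set, which
# is illustrative only.
def _example_msda_exp_testset():
    Xs, Ys, Xt, Yt = _make_synthetic_domains()
    return msda_exp_testset(Xs, Ys, Xt, Xtest=Xt, Ytest=Yt, noise=0.9)
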
def msda_classifier(Xs, Ys, Xt, Yt, noise=0.9, feat_type=0, score='AUC',
                    clf=None, layer_func=np.tanh, self_learning=False):
    tfidf_trans = TfidfTransformer()
    if feat_type == 8:
        Xsn = term_weighting(Xs, feat_type)
        Xtn = term_weighting(Xt, feat_type)
    elif feat_type == 2:
        Xsn = tfidf_trans.fit_transform(Xs)
        Xtn = tfidf_trans.transform(Xt)
    else:
        Xsn = Xs
        Xtn = Xt
    # If no classifier is given, cross-validate one on the source domain.
    if clf is None:
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xsn, Ys, sklearn.linear_model.LogisticRegression, n_jobs=5)
    else:
        clf_cv = clf.fit(Xsn, Ys)
    no_transfer_acc = clf_cv.score(Xtn, Yt)
    proba = clf_cv.predict_proba(Xtn)
    nclasses = proba.shape[1]
    multiclass = nclasses > 2
    if not multiclass:
        # Binary case: append P(y=1|d) as a single extra feature column.
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        Xt_augment = domain_adaptation_baseline.append_features(Xtn,
                                                                vect_prob)
    else:
        # TODO: try doing this per class.
        Xt_augment = domain_adaptation_baseline.append_features(Xtn, proba)
    if self_learning:
        # Self-learning baseline: re-train on the target domain using the
        # source classifier's predictions as pseudo-labels.
        Ytpred = clf_cv.predict(Xtn)
        clf_cv = domain_adaptation_baseline.cross_validate_classifier(
            Xtn, Ytpred, sklearn.linear_model.LogisticRegression)
        no_transfer_acc = clf_cv.score(Xtn, Yt)
    # Disabled variant: augment with log-probabilities centred on log(0.5).
    #log_proba = np.log(clf_cv.predict_proba(Xtn) + 0.0000001)
    #Py_d = log_proba[:, 1] - np.log(0.5)
    #vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
    #vect_prob[:, 0] = Py_d[:]
    #Xt_augment = domain_adaptation_baseline.append_features(Xtn, vect_prob)
    hw, W = denoising_autoencoders.mDA(Xt_augment.T, noise, 0.05,
                                       layer_func=layer_func)
    h = hw.T
    if not multiclass:
        # TODO: this assumes label 1 is the positive class; swapping labels
        # 0 and 1 would flip the thresholded decision.
        m_score = sklearn.metrics.accuracy_score(Yt, h[:, -1] > 0.5)
        model_AUC = sklearn.metrics.roc_auc_score(Yt, h[:, -1])
        baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
        print("AUC", baseline_AUC, model_AUC)
        if score == 'AUC':
            return (baseline_AUC, model_AUC)
        else:
            return (no_transfer_acc, m_score)
    else:
        # Multiclass case: the last nclasses columns of h are the denoised
        # reconstructions of the appended class probabilities.
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.argmax(hy_reconstruction, axis=1)
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        if score == 'AUC':
            raise NotImplementedError
        else:
            return (no_transfer_acc, m_score)

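
# Hedged smoke test for msda_classifier in the binary case, reusing the
# synthetic helper; feat_type=2 exercises the TF-IDF branch and score='ACC'
# returns (no-transfer accuracy, mDA accuracy).
def _example_msda_classifier():
    Xs, Ys, Xt, Yt = _make_synthetic_domains()
    return msda_classifier(Xs, Ys, Xt, Yt, noise=0.9, feat_type=2,
                           score='ACC')
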
def msda_classifier_with_scores(Xt, Yt, St, use_pred=False, noise=0.9,
                                score='AUC', clf=None, layer_func=np.tanh):
    # Same idea as msda_classifier, but the per-class scores St come from
    # an external classifier instead of being computed here.
    Xtn = Xt
    Y_pred = np.argmax(St, axis=1)
    no_transfer_acc = sklearn.metrics.accuracy_score(Yt, Y_pred)
    proba = St
    # One-hot encode the predicted classes (used when use_pred is set).
    pred_features = np.zeros(proba.shape, dtype='f')
    print(Y_pred[:10])  # debug output
    for i, cindx in enumerate(Y_pred):
        pred_features[i, cindx] = 1.0
    print(pred_features[:10, :])  # debug output
    nclasses = proba.shape[1]
    multiclass = nclasses > 2
    if not multiclass:
        # Binary case: append P(y=1|d) as a single extra feature column.
        Py_d = proba[:, 1]
        vect_prob = np.zeros((Xt.shape[0], 1), dtype='f')
        vect_prob[:, 0] = Py_d[:]
        if use_pred:
            raise Exception('use_pred not implemented for the binary case')
        Xt_augment = domain_adaptation_baseline.append_features(Xtn,
                                                                vect_prob)
    else:
        # TODO: try doing this per class.
        if use_pred:
            Xt_augment = domain_adaptation_baseline.append_features(
                Xtn, pred_features)
        else:
            Xt_augment = domain_adaptation_baseline.append_features(Xtn,
                                                                    proba)
    hw, W = denoising_autoencoders.mDA(Xt_augment.T, noise, 0.05,
                                       layer_func=layer_func)
    h = hw.T
    if not multiclass:
        # TODO: this assumes label 1 is the positive class; swapping labels
        # 0 and 1 would flip the thresholded decision.
        m_score = sklearn.metrics.accuracy_score(Yt, h[:, -1] > 0.5)
        model_AUC = sklearn.metrics.roc_auc_score(Yt, h[:, -1])
        baseline_AUC = sklearn.metrics.roc_auc_score(Yt, Py_d)
        print("AUC", baseline_AUC, model_AUC)
        if score == 'AUC':
            return (baseline_AUC, model_AUC)
        else:
            return (no_transfer_acc, m_score)
    else:
        hy_reconstruction = h[:, -nclasses:]
        y_pred = np.argmax(hy_reconstruction, axis=1)
        m_score = sklearn.metrics.accuracy_score(Yt, y_pred)
        if score == 'AUC':
            raise NotImplementedError
        else:
            return (no_transfer_acc, m_score)

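
# Hedged example for the externally-scored variant: St is fabricated as a
# random two-column probability matrix, standing in for scores produced by
# a real upstream classifier.
def _example_msda_classifier_with_scores():
    Xt = _make_synthetic_domains()[2]
    rng = np.random.RandomState(1)
    p = rng.rand(Xt.shape[0])
    St = np.column_stack([1.0 - p, p]).astype('f')
    Yt = rng.randint(0, 2, size=Xt.shape[0])
    return msda_classifier_with_scores(Xt, Yt, St, score='ACC')
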