def lr_with_scale2():
    """
    Fit a cross-validated logistic regression on standardized features and
    dump a submission.

    The training set returned by ``dataset.load_train()`` is standardized,
    a ``LogisticRegressionCV`` (50 candidate C values, 5-fold CV, ROC-AUC
    scoring) is fitted on it, and scaler + classifier are handed to
    ``IO.dump_submission`` as one pipeline.

    Submission: lr_with_scale2_0704_03.csv
    E_val:
    E_in: 0.878996
    E_out: 0.8768131004917349
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    # NOTE(review): class_weight='auto' is only accepted by old scikit-learn
    # releases (newer ones spell it 'balanced'). It is left untouched because
    # the scores recorded in the docstring were obtained with it -- confirm
    # the pinned scikit-learn version before changing it.
    clf = LogisticRegressionCV(Cs=50, cv=5, scoring='roc_auc', n_jobs=-1,
                               class_weight='auto')
    clf.fit(X_scaled, y)

    logger.debug('Best C: %f', clf.C_[0])
    logger.debug('Cs: %s', clf.Cs_)
    # scores_ is a collection of per-fold score grids, not a scalar, so it
    # has to be rendered with %s; %f makes the logging call itself fail.
    logger.debug('Grid scores: %s', clf.scores_)
    logger.debug('Ein: %f', Util.auc_score(clf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('lr', clf)]), 'lr_with_scale2_0704_03')
def logistic_test_using_cosine(score_feature=False):
    """Train and evaluate a logistic regression on SNLI cosine features.

    The first third of the (filtered) SNLI training file and the whole
    (filtered) test file are converted to features by
    ``SNLI2Cosine.calculate_cosine_features``. A ``LogisticRegressionCV``
    is fitted over a grid of powers of 2 and 3 for C, and the test accuracy
    is logged.

    :param score_feature: when true, the probabilities stored in
        ``./snli/logistic_score_snli.pkl`` are appended to the train and
        test matrices as one extra column each.
    """
    logger.info('using cosine features in logistic regression')
    if score_feature:
        logger.info('also use score feature')

    # Candidate regularization values: 2**0..2**9 followed by 3**1..3**9.
    Cs = [2**t for t in range(0, 10, 1)]
    Cs.extend([3**t for t in range(1, 10, 1)])

    snli2cosine = SNLI2Cosine('/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin')

    logger.info('loading snli data ...')
    train_df = pd.read_csv('./snli/snli_1.0/snli_1.0_train.txt', delimiter='\t')
    train_df = train_df[pd.notnull(train_df.sentence2)]
    train_df = train_df[train_df.gold_label != '-']
    # Floor division keeps the slice bound an integer on Python 3 as well
    # (a float bound raises TypeError there).
    train_df = train_df[:(len(train_df) // 3)]
    train_df.reset_index(inplace=True)

    test_df = pd.read_csv('./snli/snli_1.0/snli_1.0_test.txt', delimiter='\t')
    test_df = test_df[pd.notnull(test_df.sentence2)]
    test_df = test_df[test_df.gold_label != '-']
    test_df.reset_index(inplace=True)

    X_train, train_labels, X_test, test_labels = \
        snli2cosine.calculate_cosine_features(train_df, test_df)

    if score_feature:
        y_train_proba, y_test_proba = joblib.load('./snli/logistic_score_snli.pkl')
        # Each score vector becomes one additional feature column.
        X_train = np.concatenate([X_train, y_train_proba.reshape((-1, 1))], axis=1)
        X_test = np.concatenate([X_test, y_test_proba.reshape((-1, 1))], axis=1)

    logger.info('X_train.shape: {0}'.format(X_train.shape))
    logger.info('X_test.shape: {0}'.format(X_test.shape))

    logreg = LogisticRegressionCV(Cs=Cs, cv=3, n_jobs=10, random_state=919)
    logreg.fit(X_train, train_labels)
    logger.info('best C is {0}'.format(logreg.C_))

    y_test_predicted = logreg.predict(X_test)
    acc = accuracy_score(test_labels, y_test_predicted)
    logger.info('test data predicted accuracy: {0}'.format(acc))
def logistic_test(train_data, train_labels, test_data, test_labels, cv=False):
    """Fit a logistic regression and report binary classification metrics.

    :param train_data: training feature matrix.
    :param train_labels: training labels.
    :param test_data: test feature matrix.
    :param test_labels: true test labels; only pairs where both the
        prediction and the truth are 0 or 1 are counted.
    :param cv: use ``LogisticRegressionCV`` instead of
        ``LogisticRegression`` when true.
    :return: accuracy over the counted test samples (0.0 if none counted).
    """
    # Perform logistic regression.
    clf = LogisticRegressionCV() if cv else LogisticRegression()
    clf.fit(train_data, train_labels)
    predicted_labels = clf.predict(test_data)

    # Count true positives, true negatives, false positives, false negatives.
    tp, tn, fp, fn = 0, 0, 0, 0
    for predicted, actual in zip(predicted_labels, test_labels):
        if predicted == 1 and actual == 1:
            tp += 1
        elif predicted == 0 and actual == 0:
            tn += 1
        elif predicted == 1 and actual == 0:
            fp += 1
        elif predicted == 0 and actual == 1:
            fn += 1

    # Compute statistics. float() forces true division: with plain ints,
    # Python 2 would truncate every ratio to 0 or 1.
    total = tp + tn + fp + fn
    accuracy = float(tp + tn) / total if total else 0.0
    precision = 0 if (tp + fp) == 0 else float(tp) / (tp + fp)
    recall = 0 if (tp + fn) == 0 else float(tp) / (tp + fn)

    # Print report. Single pre-formatted strings print identically under
    # Python 2 and Python 3.
    print("Correctly classified {}/{}".format(tp + tn, total))
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("tp: {}; tn: {}; fp: {}; fn {}".format(tp, tn, fp, fn))
    return accuracy
def lr_with_fs():
    """
    Logistic regression on RFE-selected features, scaled before and after
    the selection step.

    Cached training data is standardized, pruned with the cached
    ``feature_selection.RFE.21`` selector, standardized again, and used to
    fit a 10-fold ``LogisticRegressionCV`` scored by ROC-AUC. The in-sample
    score is printed and the four-step pipeline is passed to
    ``to_submission``.

    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    features = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    targets = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    # First scaling pass on the raw features.
    raw_scaler = StandardScaler()
    scaled = raw_scaler.fit_transform(features)

    # Feature selection with the cached selector.
    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    pruned = rfe.transform(scaled)

    # Second scaling pass on the surviving features.
    new_scaler = StandardScaler()
    rescaled = new_scaler.fit_transform(pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(rescaled, targets)
    print(auc_score(clf, rescaled, targets))

    steps = [
        ('scale_raw', raw_scaler),
        ('rfe', rfe),
        ('scale_new', new_scaler),
        ('lr', clf),
    ]
    to_submission(Pipeline(steps), 'lr_with_fs_0620_02')
def classify(_char):
    """Fit a cross-validated logistic regression for one character.

    Returns early (``None``) when there are fewer than 10 correct samples
    in the database, when either the training or the test labels are
    empty, when the training labels contain a single class, or when there
    are too few training samples (fewer than 10 before, fewer than 50
    after ``fetch_negative_samples``). Progress, timing and the training
    score are printed; any exception raised while fitting is printed with
    its traceback and swallowed.

    :param _char: value used to filter ``Character`` rows by ``char``.
    """
    print('to fetch data')
    start_time = time.time()

    char_count = Character.objects.filter(char=_char, is_correct=1).count()
    if char_count < 10:
        return

    char_lst = Character.objects.filter(char=_char)
    y, X, ty, tX, t_charid_lst, test_accuracy_lst = prepare_data_with_database(char_lst)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 10:
        return

    # The return value is ignored, so this presumably extends X and y in
    # place -- verify against fetch_negative_samples.
    fetch_negative_samples(_char, X, y)
    if len(y) == 0 or len(ty) == 0:
        return
    if 1 == len(set(y)) or len(y) < 50:
        return

    print("fetch data done, spent %s seconds." % int(time.time() - start_time))
    start_time = time.time()
    print("traning: data size: %d" % len(y))

    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        print("training done, spent %s seconds." % int(time.time() - start_time))
        # Two spaces keep the output of the former `print 'score: ', value`.
        print('score:  %s' % model.score(X, y))
    except Exception as e:
        # Deliberate best-effort: report the failure and give up on this
        # character without propagating.
        print('except:  %s' % (e,))
        traceback.print_exc()
        return
def optimal_l2(X, y):
    '''
    Pick the regularization level for logistic regression by
    cross-validation.

    Fits a ``LogisticRegressionCV`` with ``Cs=50`` and ``cv=10`` on
    ``(X, y)`` and returns its ``C_`` attribute.
    '''
    searcher = LogisticRegressionCV(Cs=50, cv=10)
    return searcher.fit(X, y).C_
def LogitSelector(x, y, cv, niter, njob):
    """Select features with an L1 logistic regression and score the result.

    An L1-penalized ``LogisticRegressionCV`` is fitted on all columns; the
    columns with a non-zero coefficient are kept. A plain
    ``LogisticRegression`` restricted to those columns is then evaluated
    on ``niter`` shuffle splits and finally refitted on the full data.

    :param x: feature matrix (indexed as ``x[rows][:, columns]``).
    :param y: labels; binarized with ``LabelBinarizer`` before use.
    :param cv: number of folds for the selector; each evaluation split
        holds out a ``1 / cv`` fraction.
    :param niter: number of shuffle splits used for evaluation.
    :param njob: ``n_jobs`` passed to ``LogisticRegressionCV``.
    :return: ``Mdc`` holding the refitted model, the selected column
        indices and the mean accuracy, precision, recall, F1 and AUC.
    """
    # 1.0 forces true division: under Python 2, `1 / cv` truncates to 0
    # for any cv > 1, which would request an empty test split.
    t_size = 1.0 / cv

    lb = prep.LabelBinarizer()
    y = lb.fit_transform(y).ravel()

    model = LogisticRegressionCV(penalty='l1', solver='liblinear', refit=False,
                                 cv=cv, n_jobs=njob)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        model.fit(x, y)
    columns = np.arange(x.shape[1])[model.coef_.ravel() != 0]

    accu = []
    prec = []
    rec = []
    f1 = []
    au = []
    cls = LogisticRegression()
    gn_cvset = (Cvset(x[i][:, columns], y[i], x[j][:, columns], y[j])
                for (i, j) in ShuffleSplit(len(y), n_iter=niter, test_size=t_size))
    for cvt in gn_cvset:
        cls.fit(cvt.xtr, cvt.ytr)
        # Predict once per split and reuse the result for every metric.
        y_hat = cls.predict(cvt.xte)
        accu.append(accuracy_score(cvt.yte, y_hat))
        prec.append(precision_score(cvt.yte, y_hat))
        rec.append(recall_score(cvt.yte, y_hat))
        f1.append(f1_score(cvt.yte, y_hat))
        au.append(__Auc(cls, cvt.xte, cvt.yte))

    # Final model: refit on every sample, selected columns only.
    cls.fit(x[:, columns], y)
    return Mdc(model=cls, idx=columns, accu=np.mean(accu), prec=np.mean(prec),
               rec=np.mean(rec), f1=np.mean(f1), au=np.mean(au))
def make_predictions():
    """Fit a logistic regression, report validation loss, write predictions.

    Reads the module-level ``trainX``/``train`` and ``valX``/``val`` data,
    prints the validation log loss, predicts class probabilities for the
    test images and writes them to ``kaggle_430_2pm.csv``. Test features
    are read from ``test_pca.csv`` when that file exists, otherwise built
    with ``prepare_test_data``. Ids without a prediction row get the
    uniform probability 1/8 in every column.
    """
    # Fit Logistic Regression Model
    logreg = LogisticRegressionCV(scoring='log_loss', n_jobs=-1, verbose=1,
                                  random_state=6156)
    logreg.fit(X=trainX, y=train['y'].values)

    # Validate
    pred_pr = logreg.predict_proba(valX)
    loss = log_loss(y_true=val['y'].values, y_pred=pred_pr)
    print("Validation log loss: %s" % loss)

    # Get Test predictions
    img_files = [os.path.join(IMG_DIR, f) for f in os.listdir(IMG_DIR)]
    if os.path.isfile('test_pca.csv'):
        test_pca = pd.read_csv('test_pca.csv', dtype={'id': str})
    else:
        test_pca = prepare_test_data(img_files, STD_SIZE)
    # Column 0 is skipped when predicting (the 'id' column is read from
    # test_pca below) -- presumably it is that id column; verify.
    test_predictions = logreg.predict_proba(test_pca.values[:, 1:])

    # Every non-digit is stripped from the joined path, so digits inside
    # IMG_DIR itself would end up in the id as well.
    id_s = [re.sub(r'\D', '', f) for f in img_files]
    df_id = pd.DataFrame({'id': id_s})

    col_names = ['col' + str(i) for i in range(1, 9)]
    df_yhat = pd.DataFrame(data=test_predictions, columns=col_names)
    df_id_yhat = pd.concat([test_pca['id'], df_yhat], axis=1)

    # Left merge keeps one row per image file; missing rows become NaN and
    # are replaced by the uniform prior.
    yhat = df_id.merge(df_id_yhat, on='id', how='left')
    yhat.fillna(1. / 8, inplace=True)
    yhat.to_csv('kaggle_430_2pm.csv', index=False)
class Fraud(object):
    """Wrapper around a recall-scored ``LogisticRegressionCV`` that can be
    fitted from a JSON file and saved to / restored from a pickle file."""

    def __init__(self):
        # Underlying estimator; None until fit() or load_model() is called.
        self.model = None
        # Set to True by fit() and load_model().
        self.fitted = False

    def fit(self, jsonfile, target=0.3):
        """Featurize ``jsonfile``, oversample and fit the model.

        :param jsonfile: input passed to ``featurize_data``.
        :param target: third argument passed to ``oversample``.
        """
        self.model = LogisticRegressionCV(cv=15, scoring='recall')
        X, y = featurize_data(jsonfile)

        # Balance the classes
        X_oversample, y_oversample = oversample(X, y, target)
        print("%s %s" % (X_oversample, y_oversample))

        # Fit the model
        self.model.fit(X_oversample, y_oversample)
        self.fitted = True

    def predict(self, X_test):
        """Return the prediction for the FIRST row of ``X_test`` only."""
        return self.model.predict(X_test)[0]

    def save_model(self, picklefile):
        """Pickle the underlying model to ``picklefile``."""
        # Pickle data is bytes: the file must be opened in binary mode.
        with open(picklefile, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self, picklefile):
        """Restore the underlying model from ``picklefile``.

        WARNING: unpickling executes arbitrary code; only load files from
        a trusted source.
        """
        with open(picklefile, 'rb') as f:
            self.model = pickle.load(f)
        self.fitted = True
def train(trainingData, pklFile):
    """Fit a cross-validated logistic regression and persist it.

    :param trainingData: 2-D array; the last column holds the targets and
        all preceding columns are features.
    :param pklFile: output path for the fitted model. An empty string
        selects ``learntModel/learntModel.pkl`` and recreates the
        ``learntModel`` directory from scratch.
    :return: dict with the fitted ``intercept``, ``coef``, the selected
        regularization value under ``alpha`` (taken from ``clf.C_``) and
        the training-set ``accuracy``.
    """
    import shutil

    # STEP 1. Define the output file for the learnt model.
    if pklFile == '':
        # Remove and recreate sequentially. The former shell one-liner used
        # a single '&', which backgrounds `rm` and races it with `mkdir`.
        shutil.rmtree('learntModel', ignore_errors=True)
        os.mkdir('learntModel')
        pklFile = 'learntModel/learntModel.pkl'

    # STEP 2. Split the data into features and target.
    n_features = len(trainingData[0]) - 1
    x = trainingData[:, :n_features]
    y = trainingData[:, n_features]

    # STEP 3. Candidate regularization values, handed to the estimator as
    # its `Cs` grid.
    alphas = np.logspace(-10, -2, 500)

    # STEP 4. Fit with cross-validated selection and save the model.
    clf = LogisticRegressionCV(Cs=alphas)
    clf.fit(x, y)
    joblib.dump(clf, pklFile)

    return {"intercept": clf.intercept_, "coef": clf.coef_,
            "alpha": clf.C_, "accuracy": clf.score(x, y)}
def LogitSelector(x, y, cv, njob):
    """Select features with an L1 logistic regression and score the result.

    An L1-penalized ``LogisticRegressionCV`` (no intercept) is fitted on
    all columns; the columns with a non-zero coefficient are kept. A plain
    ``LogisticRegression`` restricted to those columns is evaluated on
    ``cv`` stratified folds and finally refitted on the full data.

    :param x: feature matrix (indexed as ``x[rows][:, columns]``).
    :param y: labels; binarized with ``LabelBinarizer`` before use.
    :param cv: number of folds for both selection and evaluation.
    :param njob: ``n_jobs`` passed to ``LogisticRegressionCV``.
    :return: ``Mdc`` holding the refitted model, the selected column
        indices and the fold-averaged accuracy, precision, recall, F1 and
        AUC.
    """
    lb = prep.LabelBinarizer()
    y = lb.fit_transform(y).ravel()
    cls = LogisticRegression()

    skf = StratifiedKFold(y, n_folds=cv)
    model = LogisticRegressionCV(penalty='l1', solver='liblinear',
                                 fit_intercept=False, cv=cv, n_jobs=njob)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        model.fit(x, y)
    columns = np.arange(x.shape[1])[model.coef_.ravel() != 0]

    # One fit per fold feeds every metric (instead of refitting the same
    # model once per metric), and plain lists keep np.average working on
    # Python 3, where map() returns an iterator.
    accu_scores, prec_scores, rec_scores, f1_scores, auc_scores = [], [], [], [], []
    for idx_tr, idx_te in skf:
        cls.fit(x[idx_tr][:, columns], y[idx_tr])
        x_te = x[idx_te][:, columns]
        y_te = y[idx_te]
        y_hat = cls.predict(x_te)
        accu_scores.append(accuracy_score(y_te, y_hat))
        prec_scores.append(precision_score(y_te, y_hat))
        rec_scores.append(recall_score(y_te, y_hat))
        f1_scores.append(f1_score(y_te, y_hat))
        auc_scores.append(roc_auc_score(y_te, cls.predict_proba(x_te)[:, 1]))

    accu = np.average(accu_scores)
    prec = np.average(prec_scores)
    rec = np.average(rec_scores)
    f1 = np.average(f1_scores)
    au = np.average(auc_scores)

    # Final model: refit on every sample, selected columns only.
    cls.fit(x[:, columns], y)
    return Mdc(model=cls, idx=columns, accu=accu, prec=prec, rec=rec, f1=f1, au=au)
def compute_roc_auc(test_sa, adv_sa, split=1000):
    """Score how well a 1-D logistic regression separates two value sets.

    The first ``split`` entries of each sequence train a
    ``LogisticRegressionCV`` (label 0 for ``test_sa``, label 1 for
    ``adv_sa``); the remaining entries are scored and the resulting
    probabilities are passed to ``compute_roc``.

    :param test_sa: values labelled 0.
    :param adv_sa: values labelled 1.
    :param split: number of leading entries of each sequence used for
        training.
    :return: the third element returned by ``compute_roc``.
    """
    tr_test_sa = np.array(test_sa[:split])
    tr_adv_sa = np.array(adv_sa[:split])
    tr_values = np.concatenate(
        (tr_test_sa.reshape(-1, 1), tr_adv_sa.reshape(-1, 1)), axis=0
    )
    tr_labels = np.concatenate(
        (np.zeros_like(tr_test_sa), np.ones_like(tr_adv_sa)), axis=0
    )

    lr = LogisticRegressionCV(cv=5, n_jobs=-1).fit(tr_values, tr_labels)

    ts_test_sa = np.array(test_sa[split:])
    ts_adv_sa = np.array(adv_sa[split:])
    values = np.concatenate(
        (ts_test_sa.reshape(-1, 1), ts_adv_sa.reshape(-1, 1)), axis=0
    )

    probs = lr.predict_proba(values)[:, 1]

    # Boundary between held-out negatives and positives inside `probs`.
    # Using the real held-out length (rather than len(test_sa) - split)
    # keeps the slices correct when test_sa has fewer than `split` entries,
    # where the subtraction would go negative.
    n_neg = len(ts_test_sa)
    _, _, auc_score = compute_roc(
        probs_neg=probs[:n_neg],
        probs_pos=probs[n_neg:],
    )
    return auc_score
def mdl_1d_cat(x, y):
    """builds univariate model to calculate AUC

    ``x`` is binned with ``sb_cutz`` when it is numeric with more than 10
    distinct values, then one-hot encoded (NaN gets its own column) and
    used to fit a ROC-AUC-scored ``LogisticRegressionCV``.

    Returns a tuple ``(auc, image)`` where ``auc`` is the in-sample ROC
    AUC and ``image`` is the ``plot_cat`` figure as a
    ``data:image/png;base64,...`` string.
    """
    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x = sb_cutz(x)

    series = pd.get_dummies(x, dummy_na=True)
    lr = LogisticRegressionCV(scoring='roc_auc')
    lr.fit(series, y)

    try:
        preds = lr.predict_proba(series)[:, -1]
    except ValueError:
        # Drop into the debugger, then propagate the real error; without
        # the re-raise execution would continue to a NameError on `preds`.
        Tracer()()
        raise

    plot = plot_cat(x, y)
    imgdata = BytesIO()
    plot.savefig(imgdata)
    imgdata.seek(0)

    aucz = roc_auc_score(y, preds)
    cmatrix = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()
    return aucz, cmatrix
def fit_logistic_regression(y, X):
    """
    Fit a cross-validated, L2-penalized logistic regression.

    Note the argument order: targets ``y`` first, features ``X`` second.
    Returns the fitted ``LogisticRegressionCV`` (5 folds, ``max_iter=1000``,
    ``verbose=1``).
    """
    estimator = LogisticRegressionCV(cv=5, penalty='l2', verbose=1, max_iter=1000)
    return estimator.fit(X, y)
def classify_maxEnt(train_X, train_Y, test_X):
    """Fit a default ``LogisticRegressionCV`` on the training data and
    return its predictions for ``test_X``."""
    print("Classifying using Maximum Entropy ...")
    model = LogisticRegressionCV().fit(train_X, train_Y)
    return model.predict(test_X)
def build_classifier_lr(data, labels, regularization='l2', **kwargs):
    """Build and fit a class-balanced logistic regression.

    With ``regularization`` equal to ``'l1'`` or ``'l2'`` a
    ``LogisticRegressionCV`` searches 100 C values with 10-fold CV. Any
    other value yields a plain ``LogisticRegression`` with a very large C.
    Extra keyword arguments are forwarded to whichever estimator is built.

    Returns the fitted estimator.
    """
    penalized = regularization in ('l1', 'l2')
    if not penalized:
        # lambda = 1/C, so C -> inf means lambda -> 0: a huge C effectively
        # switches regularization off.
        estimator = LogisticRegression(C=100000000., class_weight='balanced',
                                       solver='liblinear', n_jobs=10, verbose=1,
                                       **kwargs)
    else:
        estimator = LogisticRegressionCV(penalty=regularization, Cs=100, cv=10,
                                         solver='liblinear', refit=False, n_jobs=10,
                                         verbose=1, class_weight='balanced',
                                         **kwargs)
    estimator.fit(data, labels)
    return estimator
def fitModels(training_data, training_labels, test_data, test_labels):
    """Fit a ``LogisticRegressionCV`` and a ``LinearSVC`` and report on both.

    The logistic model is reported through ``getReport``; the SVC's test
    predictions are reported through ``getPRF``.

    Returns the tuple ``(lr, linSVC)`` of fitted models.
    """
    separator = '=' * 50
    print('=========fitModels========:')

    # Other classifiers (RandomForest, GradientBoosting, MultinomialNB,
    # GaussianNB, svm.SVC, DecisionTree) were tried here in the past and
    # are currently disabled.

    print('LogisticRegression: ')
    lr = LogisticRegressionCV()
    lr.fit(training_data, training_labels)  # train the model
    print(lr)
    getReport(lr, test_data, test_labels)
    print(separator)

    print('LinearSVC: ')
    linSVC = LinearSVC()
    linSVC.fit(training_data, training_labels)  # train the model
    svc_predictions = linSVC.predict(test_data)  # predict on the test set
    getPRF(svc_predictions, test_labels)
    print(separator)

    return lr, linSVC
def classify_maxEnt_twitter(train_X, train_Y, test_X, test_Y):
    """Fit a default ``LogisticRegressionCV`` and print its test metrics.

    Prints the confusion matrix and the accuracy derived from it, then
    hands the matrix to ``evaluate_classifier``. Returns nothing.
    """
    print("Classifying using Maximum Entropy ...")
    model = LogisticRegressionCV().fit(train_X, train_Y)
    predictions = model.predict(test_X)

    conf_mat = confusion_matrix(test_Y, predictions)
    print(conf_mat)

    # Correct predictions sit on the diagonal of the confusion matrix.
    correct = np.trace(conf_mat)
    Accuracy = correct / np.sum(conf_mat)
    print("Accuracy: ", Accuracy)
    evaluate_classifier(conf_mat)
def compute_classifier(pow_mat, recalls):
    """Fit an L1-penalized ``LogisticRegressionCV`` and print its AUC.

    The AUC is computed on the same data the model was fitted on, so it
    is an in-sample figure.

    :param pow_mat: 2-D feature matrix (samples x features).
    :param recalls: binary target per sample.
    :return: the fitted classifier.
    """
    n_samples, n_features = pow_mat.shape[0], pow_mat.shape[1]
    # Single pre-formatted strings print identically on Python 2 and 3.
    print('Computing logistic regression: %s samples %s features'
          % (n_samples, n_features))

    lr_classifier = LogisticRegressionCV(penalty='l1', solver='liblinear')
    lr_classifier.fit(pow_mat, recalls)

    probs = lr_classifier.predict_proba(pow_mat)[:, 1]
    auc = roc_auc_score(recalls, probs)
    print('AUC = %s' % auc)
    return lr_classifier
def doLearn(xtrain, xtest, ytrain, ytest):
    """Fit three classifiers on the training split and print each one's
    ``score`` on the test split: a random forest, a cross-validated
    logistic regression and an SVC, in that order."""
    forest = RandomForestClassifier()
    forest.fit(xtrain, ytrain)
    rf_score = forest.score(xtest, ytest)
    print('rf acc' , rf_score)

    logistic = LogisticRegressionCV(verbose=6)
    logistic.fit(xtrain, ytrain)
    logistic_score = logistic.score(xtest, ytest)
    print("logistic acc" , logistic_score)

    support_vectors = SVC()
    support_vectors.fit(xtrain, ytrain)
    svc_score = support_vectors.score(xtest, ytest)
    print("svc acc",svc_score)
def lr_with_fs():
    """
    Logistic regression on features selected by cross-validated RFE.

    The training set is standardized, pruned by an ``RFECV`` selector
    (loaded from cache, or fitted and cached when missing), standardized
    again and used to fit a 10-fold ``LogisticRegressionCV`` scored by
    ROC-AUC. The selection curve is saved as an image and the four-step
    pipeline is dumped as a submission.

    Submission: lr_with_fs_0703_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit_transform(X)

    pkl_path = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(pkl_path)
    if rfe is None:
        # NOTE(review): class_weight='auto' is only accepted by old
        # scikit-learn releases (newer ones spell it 'balanced').
        rfe = RFECV(estimator=LogisticRegression(class_weight='auto'),
                    cv=StratifiedKFold(y, 5), scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, pkl_path)

    # NOTE(review): reporting and plotting are done for cached selectors
    # too; confirm whether they were meant to run only after a fresh fit.
    print("Optimal number of features : %d" % rfe.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (AUC)")
    pl.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
    # The file name has no image extension, so the format must be given
    # explicitly; otherwise savefig tries to infer the format "refcv".
    pl.savefig('lr_with_fs.refcv', format='png')

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    X_new = new_scaler.fit_transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rfe', rfe),
                                 ('scale_new', new_scaler),
                                 ('lr', clf)]), 'lr_with_fs_0703_01')
def classify(self, mp, x_train, y_train, x_test):
    """Fit a 5-fold ``LogisticRegressionCV`` and predict the test rows.

    Both matrices are passed through ``sm.add_constant`` first. The
    estimator parameters and the training score (as a percentage) are
    logged through ``log_to_info``. ``mp`` is accepted but not used here.

    Returns the predictions for ``x_test``.
    """
    x_train = sm.add_constant(x_train)
    x_test = sm.add_constant(x_test)

    model = LogisticRegressionCV(verbose=1, cv=5)
    log_to_info('Fitting a Logistic Regression to labeled training data...')
    model = model.fit(x_train, y_train)

    log_to_info('Training details')
    log_to_info('Classifier parameters: {}'.format(model.get_params()))
    training_pct = model.score(x_train, y_train) * 100.0
    log_to_info('On training: {}'.format(training_pct))

    log_to_info('Predicting test value')
    predictions = model.predict(x_test)
    log_to_info('Done!')
    return predictions
def lr():
    """
    Baseline: 10-fold ``LogisticRegressionCV`` (ROC-AUC scoring) on the
    cached training data, without any preprocessing. Prints the in-sample
    score and passes the model to ``to_submission``.

    Submission: lr_0618.csv
    E_val: <missing>
    E_in: <missing>
    E_out: 0.8119110960575004
    """
    from sklearn.linear_model import LogisticRegressionCV

    train_X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    train_y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    model = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    model.fit(train_X, train_y)

    in_sample = auc_score(model, train_X, train_y)
    print(in_sample)
    to_submission(model, 'lr_0618_xxx')
class LogisticModelCombination(ClassifierMixin):
    """
    Combine multiple models using a Logistic Regression

    The predictions of the given (already fitted) classifiers are stacked
    column-wise, optionally next to the original features, and a
    ``LogisticRegressionCV`` is trained on that matrix.
    """

    def __init__(self, classifiers, cv_folds=1, use_original_features=False,
                 random_state=None, verbose=0):
        """
        :param classifiers: objects exposing ``predict(X)``.
        :param cv_folds: number of stratified shuffle splits drawn in
            ``fit``.
        :param use_original_features: prepend ``X`` to the stacked
            predictions when true.
        :param random_state: seed for the splits; a random integer in
            [0, 10000] is drawn when ``None``.
        :param verbose: stored on the instance; not used by this class.
        """
        self.classifiers = classifiers
        self.cv_folds = cv_folds
        self.use_original_features = use_original_features
        self.verbose = verbose
        self.logistic = LogisticRegressionCV(
            Cs=[10, 1, 0.1, 0.01, 0.001], refit=True)
        if random_state is None:
            self.random_state = random.randint(0, 10000)
        else:
            self.random_state = random_state

    def fit(self, X, y):
        """Fit the combiner on the training part of each shuffle split.

        The same estimator is refitted on every split, so with
        ``cv_folds > 1`` only the last split's fit is kept.

        :return: ``self``.
        """
        sss = StratifiedShuffleSplit(
            y, n_iter=self.cv_folds, random_state=self.random_state)
        for train_index, _ in sss:
            self._fit_logistic(X[train_index], y[train_index])
        return self

    def _fit_logistic(self, X, y):
        """Fit the logistic combiner on the stacked predictions for ``X``."""
        pred_X = self.convert_data(X)
        self.logistic.fit(pred_X, y)
        return self

    def convert_data(self, X):
        """Build the combiner's input: one column per base classifier's
        ``predict(X)`` output, preceded by ``X`` itself when
        ``use_original_features`` is set."""
        preds = [clf.predict(X) for clf in self.classifiers]
        pred_X = np.vstack(preds).T
        if self.use_original_features:
            pred_X = np.concatenate([X, pred_X], axis=1)
        return pred_X

    def predict(self, X):
        """Return the combiner's class predictions for ``X`` (this is the
        method ``ClassifierMixin.score`` calls)."""
        return self.logistic.predict(self.convert_data(X))

    def predict_proba(self, X):
        """Return the combiner's class probabilities for ``X``."""
        pred_X = self.convert_data(X)
        return self.logistic.predict_proba(pred_X)
def classify_with_random_samples(char, positive_sample_count, auto_apply=False, random_sample=0):
    """Fit a logistic regression for one character on a capped sample set.

    Positive and negative samples are capped at ``positive_sample_count``
    each: drawn at random when ``random_sample`` is non-zero (and the
    count is positive), otherwise taken from the top of a sort on the
    third element of each sample (descending for positives, ascending for
    negatives). Returns early (``None``) when the training labels contain
    a single class, when there are fewer than 10 training samples, or when
    there is no test data. Any exception raised while fitting is printed
    with its traceback and swallowed.

    :param char: value used to filter ``Character`` rows by ``char``.
    :param positive_sample_count: cap applied to both sample lists.
    :param auto_apply: accepted but not used in this function.
    :param random_sample: non-zero selects random sampling.
    """
    print("%s %s" % (char, positive_sample_count))
    start_time = time.time()

    query = Character.objects.filter(char=char)
    positive_samples, negative_samples, test_X, test_y, test_char_id_lst, test_accuracy_lst = \
        prepare_data_with_database2(query)

    if random_sample != 0:
        if positive_sample_count > 0:
            if len(positive_samples) > positive_sample_count:
                positive_samples = random.sample(positive_samples, positive_sample_count)
            if len(negative_samples) > positive_sample_count:
                negative_samples = random.sample(negative_samples, positive_sample_count)
    else:
        # sample[2] is the sort key -- presumably a confidence/accuracy
        # value; verify against prepare_data_with_database2.
        if len(positive_samples) > positive_sample_count:
            positive_samples.sort(key=itemgetter(2), reverse=True)
            positive_samples = positive_samples[:positive_sample_count]
        if len(negative_samples) > positive_sample_count:
            negative_samples.sort(key=itemgetter(2))
            negative_samples = negative_samples[:positive_sample_count]

    # Positives first, then negatives: element 0 is the feature vector,
    # element 1 the label.
    X = []
    y = []
    for sample in positive_samples:
        X.append(sample[0])
        y.append(sample[1])
    for sample in negative_samples:
        X.append(sample[0])
        y.append(sample[1])

    train_count = len(y)
    predict_count = len(test_y)
    if 1 == len(set(y)) or train_count < 10 or predict_count == 0:
        return

    fetch_spent = int(time.time() - start_time)
    print("fetch data done, spent %s seconds." % fetch_spent)
    start_time = time.time()
    print("traning: data size: %d" % len(y))

    model = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=1)
    try:
        model.fit(X, y)
        training_spent = int(time.time() - start_time)
        print("training done, spent %s seconds." % training_spent)
        # Two spaces keep the output of the former `print 'score: ', value`.
        print('score:  %s' % model.score(X, y))
    except Exception as e:
        # Deliberate best-effort: report the failure and give up on this
        # character without propagating.
        print('except:  %s' % (e,))
        traceback.print_exc()
        return
class SentenceClassifier(BaseEstimator, ClassifierMixin):
    """Sentence classifier: doc2vec vectors fed to a logistic regression.

    ``fit`` only stores the training sentences and labels. All the work
    happens in ``predict``, which trains the doc2vec model on training and
    test sentences together, infers a vector for every sentence and fits a
    class-balanced ``LogisticRegressionCV`` on the training vectors.
    """

    def __init__(self, sents_shuffle=False, doc2vec=None):
        """
        :param sents_shuffle: shuffle the combined sentence list before it
            is used to train the doc2vec model.
        :param doc2vec: doc2vec model to (re)train; a fresh
            ``gensim.models.doc2vec.Doc2Vec()`` is created per instance
            when omitted, so instances never share one default model.
        """
        self.sents_shuffle = sents_shuffle
        self.doc2vec = gensim.models.doc2vec.Doc2Vec() if doc2vec is None else doc2vec

    def fit(self, X, y):
        """Remember the training sentences ``X`` and labels ``y``."""
        self.sents_train = X
        self.Y_train = y
        return self

    def doc2vec_set(self, all_docs):
        """Reset the doc2vec model if it was trained before, then build
        its vocabulary from ``all_docs`` and train it on them."""
        if hasattr(self.doc2vec, 'syn0'):
            self.doc2vec.reset_weights()
            delattr(self.doc2vec, 'syn0')
        self.doc2vec.build_vocab(all_docs)
        self.doc2vec.train(all_docs)

    def predict(self, X):
        """Train doc2vec and the classifier, then predict labels for the
        sentences in ``X``.

        ``fit`` must have been called first. ``X`` must support ``+`` with
        the stored training sentences.
        """
        self.sents_test = X
        self.sents_all = self.sents_train + self.sents_test

        if self.sents_shuffle:
            # list(...) so the index sequence can be shuffled in place on
            # Python 3 as well.
            order = list(range(len(self.sents_all)))
            random.shuffle(order)
            sents_all = [self.sents_all[n] for n in order]
        else:
            sents_all = self.sents_all

        # Train on the (possibly shuffled) list; the unshuffled attribute
        # used to be passed here, which made sents_shuffle a no-op.
        all_docs = list(LabeledListSentence(sents_all))
        self.doc2vec_set(all_docs)

        self.X_train = [self.doc2vec.infer_vector(s) for s in self.sents_train]
        self.X_test = [self.doc2vec.infer_vector(s) for s in self.sents_test]

        self.logistic = LogisticRegressionCV(class_weight='balanced')
        self.logistic.fit(self.X_train, self.Y_train)
        Y_test_predict = self.logistic.predict(self.X_test)
        return Y_test_predict
def logit(filepath_and_pathway_ids):
    """Fit an L1 logistic regression separating two pathways from the rest.

    :param filepath_and_pathway_ids: tuple ``(filepath,
        first_pathway_id, second_pathway_id)``. The CSV at ``filepath`` is
        read with its first column as index; the index values serve as
        labels after both pathway ids are replaced by ``"positive"``.
    :return: ``(first_pathway_id, second_pathway_id, scores, features)``
        where ``scores`` is the flattened CV score grid of the first class
        entry in ``scores_`` and ``features`` lists the columns whose
        coefficient in the first row of ``coef_`` is non-zero.
    """
    filepath, first_pathway_id, second_pathway_id = filepath_and_pathway_ids

    # we had done dataset.to_csv(filename, index=True, header=True)
    dataset = pd.read_csv(filepath, index_col=0)
    labels = dataset.index.str.replace(first_pathway_id, "positive") \
        .str.replace(second_pathway_id, "positive").tolist()

    classifier = LogisticRegressionCV(solver='liblinear', penalty='l1', Cs=[5], cv=10)
    classifier.fit(dataset.values, labels)

    # Column names with a non-zero coefficient in the first coefficient
    # row. Done with a boolean mask because `.ix` and `Series.nonzero()`
    # no longer exist in current pandas.
    coefficients = pd.Series(classifier.coef_[0], index=dataset.columns)
    features = coefficients[coefficients != 0].index.tolist()

    scores = list(classifier.scores_.values())[0].flatten().tolist()
    return first_pathway_id, second_pathway_id, scores, features
def logit(pathway_id_and_filepath):
    """Fit an L1 logistic regression on one pathway dataset.

    :param pathway_id_and_filepath: tuple ``(pathway_id, filepath)``. The
        CSV at ``filepath`` is read with its first column as index, and
        the index values are used as class labels.
    :return: ``(pathway_id, scores, features)`` where ``scores`` is the
        list of per-class CV score grids from ``scores_`` and ``features``
        lists the columns whose coefficient in the first row of ``coef_``
        is non-zero.
    """
    pathway_id, filepath = pathway_id_and_filepath

    # we had done dataset.to_csv(filename, index=True, header=True)
    dataset = pd.read_csv(filepath, index_col=0)
    labels = dataset.index.tolist()

    classifier = LogisticRegressionCV(solver='liblinear', penalty='l1', Cs=[5], cv=10)
    classifier.fit(dataset.values, labels)

    # Column names with a non-zero coefficient in the first coefficient
    # row. Done with a boolean mask because `.ix` and `Series.nonzero()`
    # no longer exist in current pandas.
    coefficients = pd.Series(classifier.coef_[0], index=dataset.columns)
    features = coefficients[coefficients != 0].index.tolist()

    scores = list(classifier.scores_.values())
    return pathway_id, scores, features
def mdl_1d(x, y):
    """builds univariate model to calculate AUC

    ``x`` is binned with ``sb_cutz`` when it is numeric with more than 10
    distinct values, then one-hot encoded (NaN gets its own column) and
    used to fit a ROC-AUC-scored ``LogisticRegressionCV``.

    Returns a tuple ``(auc, nplot, bplot)``: the in-sample ROC AUC and
    two ``data:image/png;base64,...`` strings holding the ``plot_num`` and
    ``plot_bubble`` figures built from ``num_bin_stats(x, y)``.
    """
    lr = LogisticRegressionCV(scoring='roc_auc')

    if x.nunique() > 10 and com.is_numeric_dtype(x):
        x2 = sb_cutz(x)
        series = pd.get_dummies(x2, dummy_na=True)
    else:
        series = pd.get_dummies(x, dummy_na=True)

    lr.fit(series, y)

    try:
        preds = lr.predict_proba(series)[:, -1]
    except ValueError:
        # Drop into the debugger, then propagate the real error; without
        # the re-raise execution would continue to a NameError on `preds`.
        Tracer()()
        raise

    aucz = roc_auc_score(y, preds)

    # The statistics and plots use the original (unbinned) x.
    ns = num_bin_stats(x, y)

    nplot = plot_num(ns)
    imgdata = BytesIO()
    nplot.savefig(imgdata)
    imgdata.seek(0)
    nplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdata.getvalue()))
    plt.close()

    bplot = plot_bubble(ns)
    imgdatab = BytesIO()
    bplot.savefig(imgdatab)
    imgdatab.seek(0)
    bplot = 'data:image/png;base64,' + \
        quote(base64.b64encode(imgdatab.getvalue()))
    plt.close()

    return aucz, nplot, bplot
def try_all_k_best(max=13):
    """Evaluate ``SelectKBest`` + ``LogisticRegressionCV`` for k = 1..max.

    For every k, the selected feature names, the test predictions and the
    accuracy, precision and recall are appended to the module-level lists
    ``choices``, ``pred``, ``acc``, ``prec`` and ``reca`` respectively.

    :param max: largest k to try (the name shadows the builtin but is
        kept because it is part of the call signature).
    """
    # The data preparation does not depend on k, so it is done once.
    # Assumes featureFormat/targetFeatureSplit are deterministic for fixed
    # inputs; the split itself is pinned by random_state -- TODO confirm.
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    all_features_train, all_features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42)

    # Feature names as a single row, so the fitted selector can pick out
    # the names of the columns it kept (features_list[0] is skipped).
    feature_names = np.array(features_list[1:]).reshape(1, -1)

    for k in range(1, max + 1):
        selector = SelectKBest(k=k)
        features_train = selector.fit_transform(all_features_train, labels_train)
        features_test = selector.transform(all_features_test)
        choices.append(selector.transform(feature_names))

        lr_cv = LogisticRegressionCV()
        lr_cv.fit(features_train, labels_train)

        # Score this iteration's own predictions. Indexing the global list
        # with pred[k-1] is only right when that list is empty on entry.
        predictions = lr_cv.predict(features_test)
        pred.append(predictions)
        acc.append(accuracy_score(labels_test, predictions))
        prec.append(precision_score(labels_test, predictions))
        reca.append(recall_score(labels_test, predictions))
# Outer 5-fold split of the training data (no shuffling); each fold fits an
# L1-penalized LogisticRegressionCV (itself 5-fold over 10 C values) on the
# training part and records its score on the held-out part.
kf = KFold(n_splits=5)
y = []
my_score_arr = []
for k, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
    X_train_K, X_test_K = x_train[train_index], x_train[test_index]
    y_train_K, y_test_K = y_train[train_index], y_train[test_index]

    model = LogisticRegressionCV(penalty='l1', Cs=10, cv=5, solver='liblinear')
    model.fit(X_train_K, y_train_K)
    preds = model.predict(X_test_K)

    # Score of this fold's model on its held-out part.
    my_score = model.score(X_test_K, y_test_K)
    my_score_arr.append(my_score)
    # print("[fold {0}] score: {1:.5f}".format(k, my_score))
    # print('regression coef-values ')
    # print(model.coef_)
    # print('C',model.C_)
    # print('CS_',model.Cs_)
    # scores, pvalues = chi2(X_train_K, y_train_K)
# Fit the classifier configured above and evaluate it on the test split.
# The accuracy_score(...) results in this section are not stored; they are
# only displayed when the code runs cell by cell in an interactive session.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# LogisticRegression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV
clf = LogisticRegressionCV(cv=5, random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# SGDClassifier (features are standardized first, inside a pipeline)
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3))
y_pred = clf.fit(X_train, y_train).predict(X_test).reshape(-1, 1)
accuracy_score(y_test, y_pred)

# Perceptron
def fit(self, X, y=None, feature_names=None):
    """Fit and estimate linear combination of rule ensemble

    Depending on ``self.model_type`` ('r' for rules, 'l' for linear
    terms), a tree ensemble is fitted and converted into rules, the raw
    features are winsorized and optionally scaled, and an L1-penalized
    model is cross-validated on the concatenation of both: ``LassoCV``
    when ``self.rfmode == 'regress'``, ``LogisticRegressionCV`` otherwise.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : targets, passed to the tree generator and to the final model.
    feature_names : optional list of names; ``feature_<i>`` is used when
        omitted.

    Returns
    -------
    self, with ``lscv``, ``coef_`` and ``intercept_`` set (plus
    ``rule_ensemble`` when rules are used and ``stddev``/``mean`` when
    linear terms are used).

    Raises
    ------
    ValueError
        If ``self.tree_generator`` is not one of the supported gradient
        boosting / random forest types for the current ``rfmode``.
    """
    ## Enumerate features if feature names not provided
    N = X.shape[0]
    if feature_names is None:
        self.feature_names = ['feature_' + str(x) for x in range(0, X.shape[1])]
    else:
        self.feature_names = feature_names

    if 'r' in self.model_type:
        ## initialise tree generator
        if self.tree_generator is None:
            n_estimators_default = int(np.ceil(self.max_rules / self.tree_size))
            self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
            if self.rfmode == 'regress':
                self.tree_generator = GradientBoostingRegressor(
                    n_estimators=n_estimators_default,
                    max_leaf_nodes=self.tree_size,
                    learning_rate=self.memory_par,
                    subsample=self.sample_fract_,
                    random_state=self.random_state,
                    max_depth=100)
            else:
                self.tree_generator = GradientBoostingClassifier(
                    n_estimators=n_estimators_default,
                    max_leaf_nodes=self.tree_size,
                    learning_rate=self.memory_par,
                    subsample=self.sample_fract_,
                    random_state=self.random_state,
                    max_depth=100)

        if self.rfmode == 'regress':
            if type(self.tree_generator) not in [GradientBoostingRegressor,
                                                 RandomForestRegressor]:
                raise ValueError(
                    "RuleFit only works with RandomForest and BoostingRegressor")
        else:
            if type(self.tree_generator) not in [GradientBoostingClassifier,
                                                 RandomForestClassifier]:
                raise ValueError(
                    "RuleFit only works with RandomForest and BoostingClassifier")

        ## fit tree generator
        if not self.exp_rand_tree_size:
            # simply fit with constant tree size
            self.tree_generator.fit(X, y)
        else:
            # randomise tree size as per Friedman 2005 Sec 3.3
            np.random.seed(self.random_state)
            tree_sizes = np.random.exponential(
                scale=self.tree_size - 2,
                size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
            tree_sizes = np.asarray(
                [2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))],
                dtype=int)
            # Keep the shortest prefix of sizes (at least a quarter of
            # them) whose sum reaches max_rules.
            i = int(len(tree_sizes) / 4)
            while np.sum(tree_sizes[0:i]) < self.max_rules:
                i = i + 1
            tree_sizes = tree_sizes[0:i]

            # Grow the ensemble one tree at a time, each with its own
            # size. This needs warm_start=True so every fit() call ADDS a
            # tree to the ones already built; with warm_start=False each
            # call would rebuild the whole ensemble at the current size.
            self.tree_generator.set_params(warm_start=True)
            curr_est_ = 0
            for i_size in np.arange(len(tree_sizes)):
                size = tree_sizes[i_size]
                self.tree_generator.set_params(n_estimators=curr_est_ + 1)
                self.tree_generator.set_params(max_leaf_nodes=size)
                random_state_add = self.random_state if self.random_state else 0
                # warm_start=True seems to reset random_state, such that
                # the trees are highly correlated, unless we manually
                # change the random_state here.
                self.tree_generator.set_params(random_state=i_size + random_state_add)
                self.tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C'))
                curr_est_ = curr_est_ + 1
            self.tree_generator.set_params(warm_start=False)

        tree_list = self.tree_generator.estimators_
        if isinstance(self.tree_generator, RandomForestRegressor) or isinstance(
                self.tree_generator, RandomForestClassifier):
            # Wrap each forest tree in a list so both ensemble types are
            # handed over with the same nesting.
            tree_list = [[x] for x in self.tree_generator.estimators_]

        ## extract rules
        self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                          feature_names=self.feature_names)

        ## concatenate original features and rules
        X_rules = self.rule_ensemble.transform(X)

    ## standardise linear variables if requested (for regression model only)
    if 'l' in self.model_type:
        ## standard deviation and mean of winsorized features
        self.winsorizer.train(X)
        winsorized_X = self.winsorizer.trim(X)
        self.stddev = np.std(winsorized_X, axis=0)
        self.mean = np.mean(winsorized_X, axis=0)

        if self.lin_standardise:
            self.friedscale.train(X)
            X_regn = self.friedscale.scale(X)
        else:
            X_regn = X.copy()

    ## Compile Training data
    X_concat = np.zeros([X.shape[0], 0])
    if 'l' in self.model_type:
        X_concat = np.concatenate((X_concat, X_regn), axis=1)
    if 'r' in self.model_type:
        if X_rules.shape[0] > 0:
            X_concat = np.concatenate((X_concat, X_rules), axis=1)

    ## fit Lasso
    if self.rfmode == 'regress':
        if self.Cs is None:
            # use defaults
            n_alphas = 100
            alphas = None
        elif hasattr(self.Cs, "__len__"):
            # Explicit grid: alpha is the inverse of C. asarray lets a
            # plain list of C values be inverted element-wise too.
            n_alphas = None
            alphas = 1. / np.asarray(self.Cs, dtype=float)
        else:
            # A scalar Cs is the number of alphas to try.
            n_alphas = self.Cs
            alphas = None
        self.lscv = LassoCV(n_alphas=n_alphas, alphas=alphas, cv=self.cv,
                            random_state=self.random_state)
        self.lscv.fit(X_concat, y)
        self.coef_ = self.lscv.coef_
        self.intercept_ = self.lscv.intercept_
    else:
        Cs = 10 if self.Cs is None else self.Cs
        self.lscv = LogisticRegressionCV(Cs=Cs, cv=self.cv, penalty='l1',
                                         random_state=self.random_state,
                                         solver='liblinear')
        self.lscv.fit(X_concat, y)
        # Only the first row of coefficients / first intercept is kept.
        self.coef_ = self.lscv.coef_[0]
        self.intercept_ = self.lscv.intercept_[0]

    return self
def main(data_dir, models_dir):
    """Fit and save one logistic-regression combiner per condition.

    For each condition (abnormal, ACL, meniscus) one CNN checkpoint per
    plane is loaded from ``models_dir``, the three CNNs are run over the
    training loaders, and a ``LogisticRegressionCV`` is fitted on the three
    per-plane predictions.  Each fitted classifier is written to
    ``<models_dir>/lr_<condition>.pkl``.

    Args:
        data_dir: Dataset location, forwarded to ``make_data_loader``.
        models_dir: Directory containing the ``*.pt`` checkpoints; the
            pickled classifiers are written here as well.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    planes = ['axial', 'coronal', 'sagittal']
    conditions = ['abnormal', 'acl', 'meniscus']

    models = []

    print(f'Loading best CNN models from {models_dir}...')

    for condition in conditions:
        models_per_condition = []
        for plane in planes:
            checkpoint_pattern = glob(f'{models_dir}/*{plane}*{condition}*.pt')
            # The lexicographically last matching file is the one loaded.
            checkpoint_path = sorted(checkpoint_pattern)[-1]
            checkpoint = torch.load(checkpoint_path, map_location=device)

            model = MRNet().to(device)
            model.load_state_dict(checkpoint['state_dict'])
            # NOTE(review): the models are never switched to eval mode here;
            # confirm MRNet has no train-time-only layers (e.g. dropout).
            models_per_condition.append(model)

        models.append(models_per_condition)

    print(f'Creating data loaders...')

    axial_loader = make_data_loader(data_dir, 'train', 'axial')
    coronal_loader = make_data_loader(data_dir, 'train', 'coronal')
    sagittal_loader = make_data_loader(data_dir, 'train', 'sagittal')

    print(f'Collecting predictions on train dataset from the models...')

    ys = []
    Xs = [[], [], []]  # Abnormal, ACL, Meniscus

    with tqdm(total=len(axial_loader)) as pbar:
        # The three loaders are consumed in lockstep; labels are taken from
        # the axial loader only.
        for (axial_inputs, labels), (coronal_inputs, _), (sagittal_inputs, _) in \
                zip(axial_loader, coronal_loader, sagittal_loader):

            axial_inputs, coronal_inputs, sagittal_inputs = \
                axial_inputs.to(device), coronal_inputs.to(device), sagittal_inputs.to(device)

            ys.append(labels[0].cpu().tolist())

            for i, model in enumerate(models):
                axial_pred = model[0](axial_inputs).detach().cpu().item()
                coronal_pred = model[1](coronal_inputs).detach().cpu().item()
                sagittal_pred = model[2](sagittal_inputs).detach().cpu().item()

                X = [axial_pred, coronal_pred, sagittal_pred]
                Xs[i].append(X)

            pbar.update(1)

    # After the transpose, ys[i] holds the labels of condition i, matching
    # Xs[i] (the per-plane predictions for condition i).
    ys = np.asarray(ys).transpose()
    Xs = np.asarray(Xs)

    print(f'Training logistic regression models for each condition...')

    clfs = []

    for X, y in zip(Xs, ys):
        clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)
        clfs.append(clf)

    # Score every classifier on its OWN condition's data.  Previously the
    # leftover X, y of the loop above (the last condition) were used for all
    # three conditions.  The value is the accuracy on the training samples.
    for condition, clf, X, y in zip(conditions, clfs, Xs, ys):
        print(
            f'Cross validation score for {condition}: {clf.score(X, y):.3f}'
        )
        clf_path = f'{models_dir}/lr_{condition}.pkl'
        joblib.dump(clf, clf_path)

    print(f'Logistic regression models saved to {models_dir}')
get_all = lambda fw: get_all_features(fw.pos) | get_all_features(fw.neg) all_features = get_all(res.targets[0].feature_weights) if len(all_features) > 1: f = list(all_features - {'<BIAS>'})[0] flt_res = get_res(x, feature_filter=lambda name, _: name != f) flt_features = get_all(flt_res.targets[0].feature_weights) assert flt_features == (all_features - {f}) return True return False @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')], [LogisticRegression(random_state=42, fit_intercept=False)], [LogisticRegressionCV(random_state=42)], [SGDClassifier(**SGD_KWARGS)], [SGDClassifier(loss='log', **SGD_KWARGS)], [PassiveAggressiveClassifier(random_state=42)], [Perceptron(random_state=42)], [RidgeClassifier(random_state=42)], [RidgeClassifierCV()], [LinearSVC(random_state=42)], [OneVsRestClassifier(LogisticRegression(random_state=42))], ]) def test_explain_linear(newsgroups_train, clf): assert_multiclass_linear_classifier_explained(newsgroups_train, clf, explain_prediction) if isinstance(clf, OneVsRestClassifier): assert_multiclass_linear_classifier_explained( newsgroups_train, clf, explain_prediction_sklearn)
# Fit the scaler on the training data only, then apply those same statistics
# to every test set.  Calling fit_transform on each test set (as before)
# re-fitted the scaler, so each test set was standardised with its own mean
# and variance instead of the training ones the model was fitted on.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_test_1 = sc.transform(x_test_1)
x_test_2 = sc.transform(x_test_2)
x_test_3 = sc.transform(x_test_3)
x_test_4 = sc.transform(x_test_4)
x_test_5 = sc.transform(x_test_5)
x_test_6 = sc.transform(x_test_6)

##### Data Prep ####### End ########

# *******Logistic Regression,RF***********
# https://www.edureka.co/blog/logistic-regression-in-python/
clf = LogisticRegressionCV(cv=10, random_state=0).fit(x_train, y_train)  # for logistic

################# for t zero ######################################
predictions = clf.predict(x_test)
probabilities = clf.predict_proba(x_test)[:, 1]
print(classification_report(y_test, predictions))

df_confusion = pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'], margins=False)
print(df_confusion)

import matplotlib.pyplot as plt

print('accuracy:' + str(round(accuracy_score(y_test, predictions) * 100, 2)) + "%")
print(clf.coef_)
print("-feature coefficients-")
# One line per feature: column name followed by its coefficient(s).
for i, j in enumerate(list(x.columns)):
    print(str(j) + " :" + str(round(clf.coef_[:, i], 3)))
# Labels are the last column of the user frame.
y = np.ndarray.astype(user_df.values[:, -1], int)
user_df = user_df.drop([1, user_df.columns[-1]], axis=1)  # drop time and y column
article_df = pd.read_csv(af_name, header=None)

# process joined data
# NOTE(review): y was extracted before this merge; this assumes the merge
# keeps the row count and order of user_df -- confirm.
X_df = user_df.merge(article_df, on=0)
# DataFrame.as_matrix() has been removed from pandas; .values is the
# equivalent already used above.
X = X_df.values
X = np.ndarray.astype(X[:, 1:], float)  # remove user_id
X[np.isnan(X)] = 0  # clear NaNs

# min-max scaling
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler(feature_range=(-1, 1))
scalar_fit = scalar.fit(X)
dmin = scalar.data_min_
dmax = scalar.data_max_
Xnorm = scalar.transform(X)

# sample weights: each class is weighted by the frequency of the other one
yrat = np.sum(y == 1) / len(y)
xrat = 1 - yrat
s_weights = np.zeros(len(y))
s_weights[y == 0] = yrat
s_weights[y == 1] = xrat

# Logistic Regression
clf = LR(penalty='l2', class_weight='balanced').fit(Xnorm, y)
preds = clf.predict_proba(Xnorm)[:, 1]
# The weights must be passed by keyword: the third positional parameter of
# log_loss is not sample_weight, so the weights were previously not applied.
ll = log_loss(y, preds, sample_weight=s_weights)
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic -- XGBoost')
plt.legend(loc="lower right")
plt.savefig('xgboost_roc.pdf', format='pdf')
#plt.show()

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import KFold

kFold = 3
# The folds are not shuffled, so a random_state has no effect on them;
# passing one without shuffle=True is rejected by current scikit-learn.
# Dropping it keeps the same (deterministic, contiguous) folds.
cv = KFold(n_splits=kFold)

# Default is accuracy_score.
clf = LogisticRegressionCV(penalty='l2', cv=cv, random_state=seed)
clf.fit(X_train, y_train)

# for the ith class
C_optimal = clf.C_[0]
# This is the best model
best_model_lr = C_optimal
print(best_model_lr)

# Refit a plain LogisticRegression with the selected regularisation strength.
clf = LogisticRegression(penalty='l2', random_state=seed, C=C_optimal)
clf.fit(X_train, y_train)

y_score_logistic = clf.predict_proba(X_test)
y_hat_logistic = clf.predict(X_test)

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# coding=utf-8
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(facecolor='w')
plt.scatter(x[:, 0], x[:, 1], s=30, c=y, marker='o', cmap=cm_dark)
# Pass the on/off flag positionally: the keyword was renamed from ``b`` to
# ``visible`` across matplotlib versions, the position was not.
plt.grid(True, ls=':')
plt.xlabel(u'组份1', fontsize=14)
plt.ylabel(u'组份2', fontsize=14)
plt.title(u'鸢尾花数据PCA降维', fontsize=18)
# plt.savefig('1.png')
plt.show()

x, x_test, y, y_test = train_test_split(x, y, train_size=0.7)
# Degree-2 polynomial features (bias column included) feed a logistic
# regression without its own intercept; C is chosen by 5-fold CV.
model = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=True)),
                  ('lr', LogisticRegressionCV(Cs=np.logspace(-3, 4, 8), cv=5, fit_intercept=False))])
model.fit(x, y)
# Read the fitted step directly; get_params('lr') only worked because the
# string was interpreted as a truthy ``deep`` flag.
print('最优参数:', model.named_steps['lr'].C_)
y_hat = model.predict(x)
print('训练集精确度:', metrics.accuracy_score(y, y_hat))
y_test_hat = model.predict(x_test)
print('测试集精确度:', metrics.accuracy_score(y_test, y_test_hat))

N, M = 500, 500  # number of sample points along each axis
x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())  # range of column 0
x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())  # range of column 1
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)  # build the sampling grid
x_show = np.stack((x1.flat, x2.flat), axis=1)  # grid points to predict on
def logistic(self):
    """Fit a cross-validated logistic regression on ``self.X`` / ``self.y``.

    The classifier uses ``self.kfolds`` as its ``cv`` setting and is wrapped
    in a single-step pipeline, which is returned after fitting.
    """
    classifier = LogisticRegressionCV(cv=self.kfolds)
    pipeline = make_pipeline(classifier)
    pipeline.fit(self.X, self.y)
    return pipeline
# Report for the plain logistic-regression predictions.
print(metrics.confusion_matrix(y_test, lr_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
print(metrics.recall_score(y_test, lr_predict_test))

# ### LogisticRegressionCV

# In[37]:

from sklearn.linear_model import LogisticRegressionCV

# n_jobs=-1 uses all cores to parallelize the cross-validation.
lr_cv_model = LogisticRegressionCV(
    Cs=3,
    cv=10,
    refit=False,
    class_weight="balanced",
    max_iter=500,
    n_jobs=-1,
    random_state=42,
)
lr_cv_model.fit(X_train, y_train.ravel())

# ### Predict on Test data

# In[38]:

lr_cv_predict_test = lr_cv_model.predict(X_test)

# Accuracy of the cross-validated model on the test split.
print("Accuracy: {0:.4f}".format(
    metrics.accuracy_score(y_test, lr_cv_predict_test)))
def performance_analysis(self):
    """
    Analyze and print to stdout the performances of a big list of classifiers, in order to
    include only the best ones in the final version of RiskInDroid.

    Every classifier is evaluated with stratified 10-fold cross validation on each of the
    training sets returned by ``get_training_vectors_3_sets``. For each set, and for all
    sets together, the accuracy and the mean / standard deviation of the malware
    probability over the correctly classified malware and goodware samples are printed.

    :return: None.
    """

    # Category of permissions for which to calculate the performances.
    _cat = 'declared'

    _k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.seed)

    # The original list of classifiers taken into consideration, before selecting
    # only the best ones for RiskInDroid.
    _all_models = (SVC(kernel='linear', probability=True, random_state=self.seed),
                   GaussianNB(),
                   MultinomialNB(),
                   BernoulliNB(),
                   DecisionTreeClassifier(random_state=self.seed),
                   RandomForestClassifier(random_state=self.seed),
                   AdaBoostClassifier(random_state=self.seed),
                   GradientBoostingClassifier(random_state=self.seed),
                   SGDClassifier(loss='log', random_state=self.seed),
                   LogisticRegression(random_state=self.seed),
                   LogisticRegressionCV(random_state=self.seed),
                   KNeighborsClassifier(),
                   LinearDiscriminantAnalysis(),
                   QuadraticDiscriminantAnalysis(),
                   MLPClassifier(random_state=self.seed))

    _training_sets = list(self.get_training_vectors_3_sets())

    for model in _all_models:

        print('\n\n\nAnalysis of ' + model.__class__.__name__ + ':')

        # Goodware and malware scores for the current model.
        _malware_scores = numpy.array([])
        _goodware_scores = numpy.array([])

        # Correctly predicted targets for the current model.
        _ok_targets = numpy.array([])

        # We analyze the 3 training sets for each model.
        for (index, current_set) in enumerate(_training_sets):
            # current_set[0] = application set
            # current_set[1] = application targets

            # Goodware and malware scores for the current set.
            _loc_m_scores = numpy.array([])
            _loc_g_scores = numpy.array([])

            # Correctly predicted targets for the current set.
            _loc_ok_targets = numpy.array([])

            # The arrays do not depend on the fold, so they are built once per set.
            _train_data = numpy.array(current_set[0][_cat])
            _train_targets = numpy.array(current_set[1])

            # The analysis is done using 10-cross fold validation.
            for train_index, test_index in _k_fold.split(
                    current_set[0][_cat], current_set[1]):

                model.fit(_train_data[train_index], _train_targets[train_index])

                # The malware probability is considered as the risk value: pick the
                # probability column of the malware class (the second column when the
                # first class is not malware).
                _malware_col = 0 if model.classes_[0] == b'malware' else 1
                _malware_label = model.classes_[_malware_col]

                # A single predict_proba call scores the whole fold.
                _risk = model.predict_proba(
                    _train_data[test_index])[:, _malware_col]

                _is_malware = _train_targets[test_index] == _malware_label
                _flagged = _risk >= 0.5

                # We consider only correct predictions for calculating the mean
                # and the standard deviation.
                _ok_malware = _flagged & _is_malware
                _ok_goodware = ~_flagged & ~_is_malware

                _loc_m_scores = numpy.append(_loc_m_scores, _risk[_ok_malware])
                _loc_g_scores = numpy.append(_loc_g_scores, _risk[_ok_goodware])

                # Correctly predicted targets for the current fold.
                _fold_ok_targets = int(_ok_malware.sum()) + int(_ok_goodware.sum())

                _loc_ok_targets = numpy.append(
                    _loc_ok_targets, _fold_ok_targets / len(test_index))

            print(' set_{0}:'.format(index + 1))
            print(' accuracy: {0:.2f}'.format(
                _loc_ok_targets.mean() * 100))
            print(' malware mean: {0:.2f}'.format(
                _loc_m_scores.mean() * 100))
            print(' malware std_dev: {0:.2f}'.format(
                _loc_m_scores.std() * 100))
            print(' goodware mean: {0:.2f}'.format(
                _loc_g_scores.mean() * 100))
            print(' goodware std_dev: {0:.2f}'.format(
                _loc_g_scores.std() * 100))

            _ok_targets = numpy.append(_ok_targets, _loc_ok_targets)
            _malware_scores = numpy.append(_malware_scores, _loc_m_scores)
            _goodware_scores = numpy.append(_goodware_scores, _loc_g_scores)

        print(' total:')
        print(' accuracy: {0:.2f}'.format(_ok_targets.mean() * 100))
        print(' malware mean: {0:.2f}'.format(
            _malware_scores.mean() * 100))
        print(' malware std_dev: {0:.2f}'.format(
            _malware_scores.std() * 100))
        print(' goodware mean: {0:.2f}'.format(
            _goodware_scores.mean() * 100))
        print(' goodware std_dev: {0:.2f}'.format(
            _goodware_scores.std() * 100))
{ 'Reservoir': [ 'Artiodactyl', 'Carnivore', 'Fish', 'Galloanserae', 'Insect', 'Neoaves', 'Plant', 'Primate', 'Pterobat', 'Rodent', 'Vespbat' ] }, {'Reservoir': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) data_array = experiment_data.iloc[:, 6:].to_numpy() label_array = experiment_data['Reservoir'].to_numpy() train_set, test_set, train_labels, test_labels = train_test_split( data_array, label_array, test_size=0.20, random_state=314, stratify=label_array) # train_labels = preprocessing.label_binarize(train_labels, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) # test_labels = preprocessing.label_binarize(test_labels, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) lr = LogisticRegressionCV() parameters = {'Cs': [1, 5, 10, 20, 50], 'cv': [5], 'penalty': ['l2']} clf = GridSearchCV(lr, parameters, cv=(test_set, test_labels)) clf.fit(train_set, train_labels) print(accuracy_score(clf.predict(train_set), train_labels)) print(accuracy_score(clf.predict(test_set), test_labels))
def __init__(self):
    """Start with empty data containers, a fresh vectorizer and model."""
    # Filled later with sentences, per-token features and their labels.
    self.sentences = []
    self.features = []
    self.pos_labels = []

    self.vectorizer = DictVectorizer()
    self.model = LogisticRegressionCV(random_state=123)
plt.show()


def plot_decision_boundary(pred_func):
    """Draw the regions predicted by ``pred_func`` with the data on top.

    Adapted from a decision-boundary helper published on CSDN.  Reads the
    module-level ``X`` (two feature columns are used) and ``y``.

    ``pred_func`` receives an array of grid points, one row per point, and
    must return one prediction per row.
    """
    # Plot limits: the data range padded by a small margin on every side.
    margin = .5
    step = 0.01
    x_lo, x_hi = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_lo, y_hi = X[:, 1].min() - margin, X[:, 1].max() + margin

    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))

    # Predict every grid point, then fold the result back into grid shape.
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    regions = pred_func(grid_points).reshape(grid_x.shape)

    # Filled contours for the regions, scatter for the samples.
    plt.contourf(grid_x, grid_y, regions, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)


from sklearn.linear_model import LogisticRegressionCV

# Build and fit the linear logistic-regression classifier.
clf = LogisticRegressionCV()
clf.fit(X, y)

# Draw its decision boundary.
plot_decision_boundary(lambda pts: clf.predict(pts))
plt.title("Logistic Regression")
plt.show()
#
#
#
# =============================================================================

# In[51]:

# Run a K-fold cross-validated logistic regression on the data set and
# predict y for the test set.
from sklearn.linear_model import LogisticRegressionCV
# sklearn.cross_validation has been removed; KFold lives in model_selection
# and takes the number of splits instead of the sample count.
from sklearn.model_selection import KFold

fold = KFold(n_splits=10, shuffle=True)
classifier = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),
                                  penalty='l2',
                                  scoring='roc_auc',
                                  cv=fold,
                                  max_iter=4000,
                                  fit_intercept=True,
                                  solver='newton-cg')

# First fit/predict on the raw features ...
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
#print(classifier.scores_[1].max())

# ... then re-fit the same estimator on the scaled features.
classifier.fit(X_train_scale, y_train)
y_pred_scale = classifier.predict(X_test_scale)
#print(classifier.scores_[1].max())

# In[52]:
elif preprocess == "scaler": scaler = StandardScaler() else: ValueError("Unknown preprocessing option") X_train, X_test, y_train, y_test = train_test_split(XX, Y) ros = RandomOverSampler() #%% algorithms = { "lr": LogisticRegressionCV(n_jobs=-1, penalty="l2", solver="saga", verbose=True), "svc": SVC(C=10.0, kernel="rbf", gamma="auto", verbose=True), "rf": RandomForestClassifier(n_estimators=5000, n_jobs=-1), "mlp": MLPClassifier(hidden_layer_sizes=(100, 100)), "grb": GradientBoostingClassifier(n_estimators=1000), "auto": autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=10 * 3600) } # Resampling instead of using "class_weight" produces better results (empiricaly) X_train, y_train = ros.fit_sample(X_train, y_train)
'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'], 'dataset': 'classifier', }, { 'model': LogisticRegression(max_iter=100, multi_class='multinomial'), 'methods': [ 'decision_function', 'predict', 'predict_proba', 'predict_log_proba', 'score' ], 'dataset': 'classifier', }, { 'model': LogisticRegressionCV(max_iter=100), 'methods': [ 'decision_function', 'predict', 'predict_proba', 'predict_log_proba', 'score' ], 'dataset': 'classifier', }, { 'model': RandomForestRegressor(n_estimators=10), 'methods': ['predict', 'score'], 'dataset': 'regression', }, { 'model': LinearRegression(), 'methods': ['predict', 'score'],
def _reset_classifier(self) -> None:
    """Replace ``self.classifier`` with a newly constructed ``LogRegCV``."""
    fresh_classifier = LogRegCV()
    self.classifier = fresh_classifier
def tune_mahalanobis_hyperparams():
    """Select the perturbation magnitude for the Mahalanobis OOD detector.

    Collects ``m`` in-distribution training images and ``m`` TinyImages
    samples, and for every candidate magnitude fits a logistic regression on
    their Mahalanobis scores, writes the resulting confidences to
    ``confidence_mahalanobis_In.txt`` / ``confidence_mahalanobis_Out.txt``
    under the save directory and evaluates them with ``metric``.  The
    magnitude with the lowest FPR is kept.

    Reads the module-level ``args`` (``name``, ``in_dataset``,
    ``batch_size``, ``layers``, ``epochs``).

    Returns:
        Tuple ``(sample_mean, precision, best_regressor, best_magnitude)``.

    Raises:
        ValueError: If ``args.in_dataset`` is neither "CIFAR-10" nor
            "CIFAR-100".
    """

    def print_tuning_results(results, stypes):
        """Print one header row and one row of percentages per method."""
        mtypes = ['FPR', 'DTERR', 'AUROC', 'AUIN', 'AUOUT']

        for stype in stypes:
            print(' OOD detection method: ' + stype)
            for mtype in mtypes:
                print(' {mtype:6s}'.format(mtype=mtype), end='')
            print('\n{val:6.2f}'.format(val=100. * results[stype]['FPR']), end='')
            print(' {val:6.2f}'.format(val=100. * results[stype]['DTERR']), end='')
            print(' {val:6.2f}'.format(val=100. * results[stype]['AUROC']), end='')
            print(' {val:6.2f}'.format(val=100. * results[stype]['AUIN']), end='')
            print(' {val:6.2f}\n'.format(val=100. * results[stype]['AUOUT']), end='')
            print('')

    def write_confidences(samples, out_path, banner, regressor, magnitude):
        """Score ``samples`` batch by batch and write one value per line.

        The value written is the negated probability of the positive
        (out-of-distribution) class.
        """
        print(banner)
        t0 = time.time()
        count = 0
        with open(out_path, 'w') as out_file:
            for start in range(0, m, args.batch_size):
                images = torch.tensor(
                    samples[start:min(start + args.batch_size, m)])
                batch_size = images.shape[0]

                Mahalanobis_scores = get_Mahalanobis_score(
                    model, images, num_classes, sample_mean, precision,
                    num_output, magnitude)
                confidence_scores = regressor.predict_proba(Mahalanobis_scores)[:, 1]

                for k in range(batch_size):
                    out_file.write("{}\n".format(-confidence_scores[k]))

                count += batch_size
                print("{:4}/{:4} images processed, {:.1f} seconds used.".format(
                    count, m, time.time() - t0))
                t0 = time.time()

    print('Tuning hyper-parameters...')
    stypes = ['mahalanobis']

    save_dir = os.path.join('output/hyperparams/', args.name, 'tmp')

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    normalizer = transforms.Normalize((125.3 / 255, 123.0 / 255, 113.9 / 255),
                                      (63.0 / 255, 62.1 / 255.0, 66.7 / 255.0))

    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    if args.in_dataset == "CIFAR-10":
        trainset = torchvision.datasets.CIFAR10('../../data', train=True, download=True, transform=transform)
        trainloaderIn = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=2)

        testset = torchvision.datasets.CIFAR10(root='../../data', train=False, download=True, transform=transform)
        testloaderIn = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=True, num_workers=2)

        num_classes = 10

    elif args.in_dataset == "CIFAR-100":
        # NOTE(review): the train split is stored under 'cifar10' while the
        # test split uses 'cifar100' -- confirm whether that is intended.
        trainset = torchvision.datasets.CIFAR100('./datasets/cifar10', train=True, download=True, transform=transform)
        trainloaderIn = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=2)

        testset = torchvision.datasets.CIFAR100(root='./datasets/cifar100', train=False, download=True, transform=transform)
        testloaderIn = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=True, num_workers=2)

        num_classes = 100

    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError(
            'Unsupported in_dataset: {}'.format(args.in_dataset))

    valloaderOut = torch.utils.data.DataLoader(
        TinyImages(transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.ToPILImage(),
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])),
        batch_size=args.batch_size, shuffle=True, num_workers=2)

    model = dn.DenseNet3(args.layers, num_classes, normalizer=normalizer)

    checkpoint = torch.load(
        "./checkpoints/{name}/checkpoint_{epochs}.pth.tar".format(
            name=args.name, epochs=args.epochs))
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    model.cuda()

    # set information about feature extraction: run a dummy batch through
    # the network and record the channel count of every returned feature map
    temp_x = torch.rand(2, 3, 32, 32)
    temp_x = Variable(temp_x)
    temp_list = model.feature_list(temp_x)[1]
    num_output = len(temp_list)
    feature_list = np.empty(num_output)
    for count, out in enumerate(temp_list):
        feature_list[count] = out.size(1)

    print('get sample mean and covariance')
    sample_mean, precision = sample_estimator(model, num_classes, feature_list, trainloaderIn)

    print('train logistic regression model')
    m = 1000
    val_in = []
    val_out = []

    # First m in-distribution images.
    cnt = 0
    for data, target in trainloaderIn:
        for x in data:
            val_in.append(x.numpy())
            cnt += 1
            if cnt == m:
                break
        if cnt == m:
            break

    # First m out-of-distribution images.
    cnt = 0
    for data, target in valloaderOut:
        for x in data:
            # Append the current image; previously data[0] was appended, so
            # every image of a batch was a copy of the batch's first one.
            val_out.append(x.numpy())
            cnt += 1
            if cnt == m:
                break
        if cnt == m:
            break

    # Label 0 = in-distribution, label 1 = out-of-distribution.
    train_lr_data = []
    train_lr_label = []
    train_lr_data.extend(val_in)
    train_lr_label.extend(np.zeros(m))
    train_lr_data.extend(val_out)
    train_lr_label.extend(np.ones(m))
    train_lr_data = torch.tensor(train_lr_data)
    train_lr_label = torch.tensor(train_lr_label)

    best_fpr = 1.1
    best_magnitude = 0.0

    for magnitude in np.arange(0, 0.0041, 0.004 / 20):
        train_lr_Mahalanobis = []
        # Step through ALL samples, including a final partial batch, so the
        # number of score rows always matches the number of labels.
        for start in range(0, train_lr_data.size(0), args.batch_size):
            data = train_lr_data[start:start + args.batch_size]
            Mahalanobis_scores = get_Mahalanobis_score(
                model, data, num_classes, sample_mean, precision,
                num_output, magnitude)
            train_lr_Mahalanobis.extend(Mahalanobis_scores)

        train_lr_Mahalanobis = np.asarray(train_lr_Mahalanobis, dtype=np.float32)
        regressor = LogisticRegressionCV().fit(train_lr_Mahalanobis, train_lr_label)

        print('Logistic Regressor params:', regressor.coef_, regressor.intercept_)

        ########################################In-distribution###########################################
        write_confidences(val_in,
                          os.path.join(save_dir, "confidence_mahalanobis_In.txt"),
                          "Processing in-distribution images",
                          regressor, magnitude)

        ###################################Out-of-Distributions#####################################
        write_confidences(val_out,
                          os.path.join(save_dir, "confidence_mahalanobis_Out.txt"),
                          "Processing out-of-distribution images",
                          regressor, magnitude)

        results = metric(save_dir, stypes)
        print_tuning_results(results, stypes)
        fpr = results['mahalanobis']['FPR']
        if fpr < best_fpr:
            best_fpr = fpr
            best_magnitude = magnitude
            best_regressor = regressor

    print('Best Logistic Regressor params:', best_regressor.coef_, best_regressor.intercept_)
    print('Best magnitude', best_magnitude)

    return sample_mean, precision, best_regressor, best_magnitude
class PosTagger():
    """Part-of-speech(pos) tagger class for the English language"""

    def __init__(self):
        self.sentences = list()
        self.features = list()
        self.pos_labels = list()
        self.vectorizer = DictVectorizer()
        self.model = LogisticRegressionCV(random_state=123)

    def read_data(self, train_datapath):
        """Read sentences from given corpus data

        Every line with exactly three tab-separated fields contributes a
        ``(token, tag)`` pair built from the first and third field. Any other
        line closes the sentence collected so far. A sentence still open at
        the end of the file is kept as well.
        """
        self.sentences = []
        with open(train_datapath, 'r') as infile:
            sent = []
            for line in infile:
                line = str.split(str.strip(line), '\t')
                if len(line) == 3:
                    token, tag_label = line[0], line[2]
                    sent.append((token, tag_label))
                    continue
                self.sentences.append(sent)
                sent = []
            # Without a trailing separator line the last sentence used to be
            # dropped silently.
            if sent:
                self.sentences.append(sent)
        print("-> %d sentences are read from '%s'." % (len(self.sentences), train_datapath))
        return

    def get_feature(self, token, token_index, sent):
        """Extract features of given word(token)

        ``sent`` is the sentence containing the token; the first element of
        each of its items is used as the neighbouring token text.
        """
        # Slices instead of single-index access keep an empty token from
        # raising IndexError; for non-empty tokens the values are unchanged.
        first_char = token[:1]
        token_feature = {
            'token': token,
            'is_first': token_index == 0,
            'is_last': token_index == len(sent) - 1,

            'is_capitalized': first_char.upper() == first_char,
            'is_all_capitalized': token.upper() == token,
            'is_capitals_inside': token[1:].lower() != token[1:],
            'is_numeric': token.isdigit(),

            'prefix-1': first_char,
            # Two leading characters; this used to be token[:1], which only
            # duplicated 'prefix-1'.
            'prefix-2': '' if len(token) < 2 else token[:2],

            'suffix-1': token[-1:],
            'suffix-2': '' if len(token) < 2 else token[-2:],

            'prev-token': '' if token_index == 0 else sent[token_index - 1][0],
            '2-prev-token': '' if token_index <= 1 else sent[token_index - 2][0],

            'next-token': '' if token_index == len(sent) - 1 else sent[token_index + 1][0],
            '2-next-token': '' if token_index >= len(sent) - 2 else sent[token_index + 2][0]
        }
        return token_feature

    def form_data(self):
        """Create datasets for training/evaluation/testing

        Rebuilds ``self.features`` and ``self.pos_labels`` from
        ``self.sentences``. Items without a second element contribute a
        feature dict but no label.
        """
        self.features = []
        self.pos_labels = []
        for sent in self.sentences:
            for token_index, token_pair in enumerate(sent):
                token = token_pair[0]
                self.features.append(self.get_feature(token, token_index, sent))
                try:
                    pos_label = token_pair[1]
                except IndexError:
                    # Unlabelled item (e.g. when tagging): features only.
                    continue
                self.pos_labels.append(pos_label)
        return

    def train(self, train_datapath):
        """Train part-of-speech(pos) tagger model"""
        self.read_data(train_datapath)
        self.form_data()
        print("-> Training phase is started.")
        t0 = time.time()
        self.model.fit(self.vectorizer.fit_transform(self.features), self.pos_labels)
        print("-> Training is completed in %s secs." % (str(round(time.time() - t0, 3))))
        # Accuracy on the very data the model was just fitted on.
        preds = self.model.predict(self.vectorizer.transform(self.features))
        acc_score = accuracy_score(self.pos_labels, preds)
        print("## Evaluation accuracy is %.2f on '%s'" % (acc_score, train_datapath))
        print()
        return

    def evaluate(self, datapath):
        """Evaluate the accuracy of trained part-of-speech(pos) tagger on given development/test corpus data"""
        self.read_data(datapath)
        self.form_data()
        preds = self.model.predict(self.vectorizer.transform(self.features))
        acc_score = accuracy_score(self.pos_labels, preds)
        print("## Evaluation accuracy is %.2f on '%s'" % (acc_score, datapath))
        print()
        return acc_score

    def test(self, datapath):
        """Measure various score values of part-of-speech(pos) tagger on given development/test corpus data

        Returns micro-averaged precision, recall and F1, the accuracy and the
        confusion matrix, in that order.
        """
        self.read_data(datapath)
        self.form_data()
        preds = self.model.predict(self.vectorizer.transform(self.features))
        precision = precision_score(self.pos_labels, preds, average='micro')
        recall = recall_score(self.pos_labels, preds, average='micro')
        f1 = f1_score(self.pos_labels, preds, average='micro')
        accuracy = accuracy_score(self.pos_labels, preds)
        conf_matrix = confusion_matrix(self.pos_labels, preds)
        return precision, recall, f1, accuracy, conf_matrix

    def tag(self, sentence):
        """Tag single sentence

        NOTE(review): each item of ``sentence`` is indexed with ``[0]`` to
        obtain the token text, so items are presumably sequences such as
        ``(token,)`` rather than bare strings -- confirm against callers.
        """
        self.sentences = list([sentence])
        self.form_data()
        preds = (self.model.predict(self.vectorizer.transform(self.features)))
        tagged_sent = list(zip(sentence, preds))
        return tagged_sent

    def tag_sents(self, sentences):
        """Tag multiple sentences"""
        return [self.tag(sent) for sent in sentences]

    def save(self, save_path):
        """Save part-of-speech(pos) tagger

        The vectorizer and the model are dumped together into one gzip file.
        """
        with gzip.GzipFile(save_path, 'wb') as outfile:
            joblib.dump((self.vectorizer, self.model), outfile, compress=('gzip', 9))
        print("-> POS tagger is saved to '%s'" % save_path)
        return

    def load(self, load_path):
        """Load part-of-speech(pos) tagger"""
        with gzip.GzipFile(load_path, 'rb') as infile:
            self.vectorizer, self.model = joblib.load(infile)
        print("-> POS tagger is loaded from '%s'" % load_path)
        return
def _interleaved_split(X_val, Y_val):
    """Split validation data into six blocks; even blocks train, odd blocks test."""
    pivot = int(X_val.shape[0] / 6)
    train_blocks = (slice(0, pivot), slice(2 * pivot, 3 * pivot), slice(4 * pivot, 5 * pivot))
    held_blocks = (slice(pivot, 2 * pivot), slice(3 * pivot, 4 * pivot), slice(5 * pivot, None))
    X_train = np.concatenate([X_val[block] for block in train_blocks])
    Y_train = np.concatenate([Y_val[block] for block in train_blocks])
    X_val_for_test = np.concatenate([X_val[block] for block in held_blocks])
    Y_val_for_test = np.concatenate([Y_val[block] for block in held_blocks])
    return X_train, Y_train, X_val_for_test, Y_val_for_test


def _evaluate_score_family(score_list, dataset_list, adv_test_list):
    """Pick, per dataset and attack, the score with the best held-out AUROC.

    Returns two nested lists indexed ``[dataset][attack]``: the test-set
    detection results of the selected score, and the selected score's name.
    """
    best_results_all, best_index_all = [], []
    for dataset in dataset_list:
        print('load train data: ', dataset)
        outf = './adv_output/' + args.net_type + '_' + dataset + '/'

        best_results_out, best_index_out = [], []
        for out in adv_test_list:
            # Start below any real AUROC so the first score is always taken;
            # with a start of 0 a run where no AUROC exceeded 0 left the
            # placeholder 0 in the result lists and crashed the report.
            best_auroc, best_result, best_index = float('-inf'), 0, 0
            for score in score_list:
                print('load train data: ', out, ' of ', score)
                total_X, total_Y = lib_regression.load_characteristics(
                    score, dataset, out, outf)
                X_val, Y_val, X_test, Y_test = lib_regression.block_split_adv(
                    total_X, total_Y)
                X_train, Y_train, X_val_for_test, Y_val_for_test = \
                    _interleaved_split(X_val, Y_val)

                lr = LogisticRegressionCV(n_jobs=-1).fit(X_train, Y_train)
                results = lib_regression.detection_performance(
                    lr, X_val_for_test, Y_val_for_test, outf)
                if best_auroc < results['TMP']['AUROC']:
                    best_auroc = results['TMP']['AUROC']
                    best_index = score
                    best_result = lib_regression.detection_performance(
                        lr, X_test, Y_test, outf)
            best_results_out.append(best_result)
            best_index_out.append(best_index)
        best_results_all.append(best_results_out)
        best_index_all.append(best_index_out)
    return best_results_all, best_index_all


def _print_detection_results(title, best_results, best_indices,
                             dataset_list, adv_test_list):
    """Print one metric table per (dataset, attack) pair under ``title``."""
    mtypes = ['TNR', 'AUROC', 'DTACC', 'AUIN', 'AUOUT']
    print(title)
    for dataset, in_list, index_list in zip(dataset_list, best_results, best_indices):
        print('in_distribution: ' + dataset + '==========')
        for attack, results, index in zip(adv_test_list, in_list, index_list):
            print('out_distribution: ' + attack)
            # Header row, then the values as percentages on the next line.
            print(''.join(' {mtype:6s}'.format(mtype=mtype) for mtype in mtypes), end='')
            values = ' '.join('{val:6.2f}'.format(val=100. * results['TMP'][mtype])
                              for mtype in mtypes)
            print('\n' + values + '\n', end='')
            print('Input noise: ' + index)
            print('')


def main():
    """Evaluate the LID and Mahalanobis adversarial detectors and print both reports.

    For every dataset and attack, a logistic regression is fitted per
    candidate score on part of the validation characteristics; the score
    with the best AUROC on the remaining validation part is then evaluated
    on the test split.  Reads the module-level ``args.net_type``.
    """
    # initial setup
    dataset_list = ['cifar10', 'cifar100', 'svhn']
    adv_test_list = ['FGSM', 'BIM', 'DeepFool', 'CWL2', 'PGD100']

    print('evaluate the LID estimator')
    score_list = [
        'LID_10', 'LID_20', 'LID_30', 'LID_40', 'LID_50', 'LID_60', 'LID_70',
        'LID_80', 'LID_90'
    ]
    list_best_results, list_best_results_index = _evaluate_score_family(
        score_list, dataset_list, adv_test_list)

    print('evaluate the Mahalanobis estimator')
    score_list = ['Mahalanobis_0.0', 'Mahalanobis_0.01', 'Mahalanobis_0.005',
                  'Mahalanobis_0.002', 'Mahalanobis_0.0014', 'Mahalanobis_0.001', 'Mahalanobis_0.0005']
    list_best_results_ours, list_best_results_index_ours = _evaluate_score_family(
        score_list, dataset_list, adv_test_list)

    _print_detection_results("results of LID", list_best_results,
                             list_best_results_index, dataset_list, adv_test_list)
    _print_detection_results("results of Mahalanobis", list_best_results_ours,
                             list_best_results_index_ours, dataset_list, adv_test_list)
def QuickML_Ensembling(X_train, y_train, X_test, y_test='', modeltype='Regression',
                       Boosting_Flag=False, scoring='', verbose=0):
    """
    Quickly builds and runs multiple models for a clean data set(only numerics).

    Four models are fitted on ``(X_train, y_train)`` and used to predict
    ``X_test``; the four prediction vectors are stacked column-wise.

    Parameters
    ----------
    X_train, y_train : training features and target. In the classification
        branch ``X_train`` must expose ``.columns`` and ``.dtypes``
        (i.e. behave like a pandas DataFrame).
    X_test : features to predict.
    y_test : true targets for ``X_test``. The default ``''`` (any ``str``)
        means "no ground truth": every metric is reported as 0.
    modeltype : ``'Regression'`` selects the regression models; anything else
        selects classifiers, with ``'Binary_Classification'`` choosing
        GaussianNB instead of MultinomialNB as the Naive Bayes model.
    Boosting_Flag : ``None`` makes the first model a bagged/extra-trees
        ensemble instead of the linear one; a truthy value makes the fourth
        model a bagging ensemble instead of AdaBoost.
    scoring : scorer name handed to ``LogisticRegressionCV``; ``''`` picks
        ``'neg_mean_squared_error'`` (regression) or ``'accuracy'``.
    verbose : 0 is silent, 1 prints the elapsed time, >= 2 prints metrics.

    Returns
    -------
    (estimator_names, stacks) : list of the four model names and an array
        with one column of test predictions per model, in the same order.
    """
    start_time = time.time()
    seed = 99
    if len(X_train) <= 100000 or X_train.shape[1] < 50:
        NUMS = 100
        FOLDS = 5
    else:
        NUMS = 200
        FOLDS = 10
    is_regression = modeltype == 'Regression'
    has_truth = not isinstance(y_test, str)
    estimator_names, predictions, scores = [], [], []

    def _run(name, model):
        # Fit one model, predict the test set and record its name/score.
        preds = model.fit(X_train, y_train).predict(X_test)
        if has_truth:
            metric = rmse if is_regression else accu
            score = metric(preds, y_test).mean()
        else:
            score = 0
        estimator_names.append(name)
        predictions.append(preds)
        scores.append(score)

    def _report():
        print('QuickML_Ensembling Model results:')
        print(
            ' %s = %0.4f \n %s = %0.4f\n %s = %0.4f \n %s = %0.4f' %
            (estimator_names[0], scores[0], estimator_names[1], scores[1],
             estimator_names[2], scores[2], estimator_names[3], scores[3]))

    if is_regression:
        if scoring == '':
            scoring = 'neg_mean_squared_error'
        scv = ShuffleSplit(n_splits=FOLDS, random_state=seed)
        if Boosting_Flag is None:
            _run('Bagging1',
                 BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                  n_estimators=NUMS, random_state=seed))
        else:
            _run('LassoLarsCV', LassoLarsCV(cv=scv))
        _run('LassoCV',
             LassoCV(alphas=np.logspace(-10, -1, 50), cv=scv,
                     random_state=seed))
        _run('RidgeCV', RidgeCV(alphas=np.logspace(-10, -1, 50), cv=scv))
        ## Create an ensemble model ####
        if Boosting_Flag:
            _run('Bagging2',
                 BaggingRegressor(DecisionTreeRegressor(random_state=seed),
                                  n_estimators=NUMS, random_state=seed))
        else:
            # The base learner is passed positionally: the keyword was
            # renamed from ``base_estimator`` to ``estimator`` across
            # scikit-learn releases, the position was not.
            _run('Boosting',
                 AdaBoostRegressor(
                     DecisionTreeRegressor(min_samples_leaf=2, max_depth=1,
                                           random_state=seed),
                     n_estimators=NUMS, random_state=seed))
        if verbose >= 2:
            _report()
    else:
        if scoring == '':
            scoring = 'accuracy'
        # ``random_state`` is only legal together with ``shuffle=True``;
        # recent scikit-learn raises ValueError otherwise.
        scv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        if Boosting_Flag is None:
            _run('Bagging',
                 ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2,
                                      random_state=seed))
        else:
            _run('Logistic Regression',
                 LogisticRegressionCV(Cs=np.linspace(0.01, 100, 20), cv=scv,
                                      scoring=scoring, random_state=seed))
        _run('Linear Discriminant', LinearDiscriminantAnalysis())
        # Naive Bayes is replaced by a shallow tree as soon as any numeric
        # column contains a negative value.
        float_cols = X_train.columns[(X_train.dtypes == float).values].tolist()
        int_cols = X_train.columns[(X_train.dtypes == int).values].tolist()
        has_negatives = (X_train[float_cols + int_cols] <
                         0).astype(int).sum().sum() > 0
        if has_negatives:
            model7 = DecisionTreeClassifier(max_depth=5)
        elif modeltype == 'Binary_Classification':
            model7 = GaussianNB()
        else:
            model7 = MultinomialNB()
        _run('Naive Bayes', model7)
        if Boosting_Flag:
            #### If the Boosting_Flag is True, it means Boosting model is present. So choose a Bagging here.
            _run('Bagging',
                 ExtraTreesClassifier(n_estimators=NUMS, min_samples_leaf=2,
                                      random_state=seed))
        else:
            ## Create an ensemble model ####
            _run('Boosting',
                 AdaBoostClassifier(
                     DecisionTreeClassifier(random_state=seed, max_depth=1,
                                            min_samples_leaf=2),
                     n_estimators=NUMS, random_state=seed))
        if has_truth:
            if verbose >= 2:
                _report()
        else:
            if verbose >= 1:
                print('QuickML_Ensembling completed.')
    stacks = np.c_[tuple(predictions)]
    if verbose == 1:
        print(' Time taken for Ensembling: %0.1f seconds' %
              (time.time() - start_time))
    return estimator_names, stacks
#########################################################
# Encode the class column as integer labels and store it under 'A16'.
datas['A16'] = label.fit_transform(df[classification])
# print(datas.info())

# All columns but the last are features; the last column is the target.
x, y = datas.iloc[:, :-1], datas.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=0)

# Logistic model training: standardise, then cross-validated one-vs-rest LR.
lr_cv = LogisticRegressionCV(
    multi_class='ovr',
    fit_intercept=True,
    Cs=np.logspace(-4, 1, 50),
    penalty='l2',
    solver='lbfgs',
    tol=0.01,
)
model = Pipeline([('ss', StandardScaler()), ('lr', lr_cv)])
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

# Fetch the fitted LR step back from the pipeline to inspect its parameters.
result = model.named_steps['lr']
print('r:', model.score(x_train, y_train))
print('参数:', result.coef_)
print('截距:', result.intercept_)

# KNN model training
knn = KNeighborsClassifier(
    n_neighbors=20, algorithm='kd_tree', weights='distance')
# # Use cross-validation to choose the regularisation coefficient C
# LR_model_2 = LogisticRegressionCV(Cs=[C_penalty], penalty='l2', solver='lbfgs', class_weight={1:bad_weight, 0:1})
# LR_model_2_fit = LR_model_2.fit(X_train,y_train)
# y_pred = LR_model_2_fit.predict_proba(X_test)[:,1]
# scorecard_result = pd.DataFrame({'prob':y_pred, 'target':y_test})
# performance = KS(scorecard_result,'prob','target')
# # KS = performance['KS']
# # KS = performance
# model_parameter[(C_penalty, bad_weight)] = performance #KS
# sortparam = sorted(model_parameter,key=lambda x:x[1],reverse=True)
# print('sortedparam --> ',sortparam[0],model_parameter[sortparam[0]],sortparam[1],model_parameter[sortparam[0]])
# penalty,badWeight = sortparam[0]

# Cross-validated L2 logistic regression; class 1 is weighted 10x class 0.
LR_model_2 = LogisticRegressionCV(
    penalty='l2',
    solver='lbfgs',
    scoring='roc_auc',
    cv=3,
    class_weight={1: 10, 0: 1},
)
LR_model_2_fit = LR_model_2.fit(X_train, y_train)

# Probability of the second class (column 1) for every test row.
y_prob = LR_model_2_fit.predict_proba(X_test)[:, 1]
print('y_prob --> ', y_prob.shape)
y_pred = LR_model_2_fit.predict(X_test)
print('y_pred --> ', y_pred.shape)

# Collect scores, true targets and hard predictions, then compute KS on them.
scorecard_columns = {'prob': y_prob, 'target': y_test, 'pred': y_pred}
scorecard_result = pd.DataFrame(scorecard_columns)
performance = KS(scorecard_result, 'prob', 'target')
print('ks --> ', performance)
# WOE encoding of the categorical variables
woe = rpt.preprocessing.WeightOfEvidence(
    categorical_features=categorical_var, encoder_na=False)
X = woe.fit_transform(X, y)

# Discretisation (disabled)
#dis=rpt.preprocessing.Discretization(continous_features=continuous_var)
#X2=dis.fit_transform(X,y)

# Fill missing values with a sentinel, then min-max scale continuous columns
X = X.fillna(-99)
scaler = preprocessing.MinMaxScaler()
X[continuous_var] = scaler.fit_transform(X[continuous_var])

clfs = {
    'LogisticRegression': LogisticRegressionCV(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
}
y_preds, y_probas = {}, {}
# Fit every classifier on the full data and keep its in-sample predictions.
for clf, estimator in clfs.items():
    estimator.fit(X, y)
    y_preds[clf] = estimator.predict(X)
    y_probas[clf] = estimator.predict_proba(X)[:, 1]

models_report, conf_matrix = rpt.ClassifierReport(y, y_preds, y_probas)
print(models_report)

# Information-theoretic measure: KL divergence between the LR scores of the
# y == 1 rows and those of the y == 0 rows.
lr_scores = y_probas['LogisticRegression']
p = lr_scores[y == 1]
q = lr_scores[y == 0]
print(rpt.metrics.entropyc.kl_div(p, q))
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from dataprocess import get_data_two
from sklearn.metrics import precision_recall_curve,precision_score,recall_score,f1_score
import numpy as np

# Load the data set and standardise every feature.
stand = StandardScaler()
x, y = get_data_two('/home/cooper/PycharmProjects/sxyl/Assignments/iris2.txt')
x = stand.fit_transform(X=x)
print(x.shape)

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

model = LogisticRegressionCV(multi_class="ovr", fit_intercept=True,
                             Cs=np.logspace(-2, 2, 20), cv=2, penalty="l2",
                             solver="lbfgs", tol=0.01)

# The model must be fitted before it is scored or used for prediction;
# ``s`` (the training-set score) is printed right below.
result = model.fit(x_train, y_train)
s = result.score(x_train, y_train)
print(s)

# Evaluate on the held-out rows.
y_pre = model.predict(x_test)
print('recall: ', recall_score(y_test, y_pre))
print('precision', precision_score(y_test, y_pre))
print('f1_score', f1_score(y_test, y_pre))
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree.
                        FP 2004 (Sec. 2). 'default' uses min(0.5, (100 + 6 * sqrt(N)) / N).
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of
                        terminal nodes based on an exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        Cs:             Regularisation grid. In 'regress' mode: None uses 100 automatically chosen
                        alphas, a sequence is converted to alphas = 1 / Cs, an integer is the
                        number of alphas. In 'classify' mode it is passed to LogisticRegressionCV
                        (10 when None).
        cv:             Cross-validation setting passed to LassoCV / LogisticRegressionCV.
        random_state:   Integer to initialise random objects and provide repeatability.
        tree_generator: Optional: this object will be used as provided to generate the rules.
                        This will override almost all the other properties above.
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """

    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 random_state=None):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.winsorizer = Winsorizer(trim_quantile=lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        self.stddev = None
        self.mean = None
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.Cs = Cs

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        Generates rules from a tree ensemble (when 'r' is in model_type),
        prepares the linear terms (when 'l' is in model_type) and fits an
        L1-penalised model on the concatenation. Returns self.
        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                # Honour an explicit sample_fract; only 'default' triggers
                # the data-size based heuristic.
                if self.sample_fract == 'default':
                    self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                else:
                    self.sample_fract_ = self.sample_fract
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if not isinstance(
                        self.tree_generator,
                    (GradientBoostingRegressor, RandomForestRegressor)):
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if not isinstance(
                        self.tree_generator,
                    (GradientBoostingClassifier, RandomForestClassifier)):
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = (2 + np.floor(tree_sizes)).astype(int)
                # Keep just enough trees to reach max_rules terminal nodes
                # (at least a quarter of the draws); the bound on i stops the
                # loop when the draws never add up to max_rules.
                i = int(len(tree_sizes) / 4)
                while (i < len(tree_sizes)
                       and np.sum(tree_sizes[0:i]) < self.max_rules):
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                random_state_add = self.random_state if self.random_state else 0
                X_c = np.copy(X, order='C')
                y_c = np.copy(y, order='C')
                for i_size, size in enumerate(tree_sizes):
                    # Grow the ensemble one tree at a time so that every tree
                    # gets its own max_leaf_nodes. The first fit starts from
                    # scratch (so refitting works); later fits warm-start and
                    # only add the new tree.
                    # warm_start=True seems to reset random_state, such that
                    # the trees are highly correlated, unless we manually
                    # change the random_state here.
                    self.tree_generator.set_params(
                        warm_start=i_size > 0,
                        n_estimators=i_size + 1,
                        max_leaf_nodes=int(size),
                        random_state=i_size + random_state_add)
                    self.tree_generator.fit(X_c, y_c)
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          (RandomForestRegressor, RandomForestClassifier)):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                # Explicit grid: n_alphas is ignored by LassoCV when alphas
                # is given, but it still has to be a valid integer.
                n_alphas = 100
                alphas = 1. / np.asarray(self.Cs, dtype=float)
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def _design_matrix(self, X):
        """Build the [linear terms | rule features] matrix used for prediction."""
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            n_rules = len(self.rule_ensemble.rules)
            # Guard the empty case: coef_[-0:] would select every coefficient.
            if n_rules > 0:
                rule_coefs = self.coef_[-n_rules:]
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return X_concat

    def predict(self, X):
        """Predict outcome for X

        """
        return self.lscv.predict(self._design_matrix(X))

    def predict_proba(self, X):
        """Predict probability of outcome for X

        Raises ValueError when the model was built with rfmode='regress'.
        """
        if self.rfmode == 'regress':
            raise ValueError(
                "Probaility prediction only works for classification tasks.")
        return self.lscv.predict_proba(self._design_matrix(X))

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero (default=False).

        subregion: If None (default) returns global importances (FP 2004 eq. 28/29), else returns importance over
                           subregion of inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
               the coefficients and 'support' the support of the rule in the training
               data set (X)
        """
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                subregion = np.array(subregion)
                importance = sum(
                    abs(coef) *
                    abs([x[i] for x in self.winsorizer.trim(subregion)] -
                        self.mean[i])) / len(subregion)
            output_rules += [(self.feature_names[i], 'linear', coef, 1,
                              importance)]

        ## Add rules
        for i, rule in enumerate(rule_ensemble):
            coef = self.coef_[i + n_features]

            if subregion is None:
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support))**(1 / 2)
            else:
                rkx = rule.transform(subregion)
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)

            output_rules += [(str(rule), 'rule', coef, rule.support,
                              importance)]
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            # DataFrame.ix no longer exists in pandas; .loc takes the mask.
            rules = rules.loc[rules.coef != 0]
        return rules
class DCTrainer:
    """ Trains Diagnostic Classifiers (DC) on extracted activation data.

    For each activation that is part of the provided activation_names
    argument a different classifier will be trained.

    Parameters
    ----------
    save_dir : str, optional
        Directory to which trained models will be saved, if provided.
        It is created (including missing parents) when it does not exist.
        It is still required whenever new activations have to be extracted,
        because they are written below it.
    corpus : Corpus
        Corpus containing the token labels for each sentence.
    activation_names : List[ActivationName]
        List of activation names on which classifiers will be trained.
    activations_dir : str, optional
        Path to folder containing the activations to train on. If not
        provided newly extracted activations will be saved to `save_dir`.
    test_activations_dir : str, optional
        Directory containing the extracted test activations. If not
        provided the train activation set will be split and partially
        used as test set.
    test_corpus : Corpus, optional
        Corpus containing the test labels for each sentence. If
        provided without `test_activations_dir` newly extracted
        activations will be saved to `save_dir`.
    model : LanguageModel, optional
        LanguageModel that should be provided if new activations need
        to be extracted prior to training the classifiers.
    selection_func : SelectFunc, optional
        Selection function that determines whether a corpus item should
        be taken into account for training. If such a function has been
        used during extraction, make sure to pass it along here as well.

    Attributes
    ----------
    data_loader : DataLoader
        Class that reads and preprocesses activation data.
    classifier : Classifier
        Current classifier that is being trained.
    """

    def __init__(
        self,
        save_dir: Optional[str],
        corpus: Corpus,
        activation_names: ActivationNames,
        activations_dir: Optional[str] = None,
        test_activations_dir: Optional[str] = None,
        test_corpus: Optional[Corpus] = None,
        model: Optional[LanguageModel] = None,
        selection_func: SelectFunc = lambda sen_id, pos, example: True,
    ) -> None:
        self.save_dir = save_dir
        # save_dir is optional (see _train); only touch the file system when
        # one is given. makedirs also creates missing parent directories.
        if save_dir is not None:
            os.makedirs(save_dir, exist_ok=True)

        activations_dir, test_activations_dir = self._extract_activations(
            save_dir,
            corpus,
            activation_names,
            selection_func,
            activations_dir,
            test_activations_dir,
            test_corpus,
            model,
        )

        self.activation_names = activation_names
        self.data_loader = DataLoader(
            activations_dir,
            corpus,
            test_activations_dir=test_activations_dir,
            test_corpus=test_corpus,
            selection_func=selection_func,
        )
        self.classifier = LogRegCV()

    def train(
        self,
        calc_class_weights: bool = False,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> None:
        """ Trains DCs on multiple activation names.

        Parameters
        ----------
        calc_class_weights : bool, optional
            Set to True to calculate the classifier class weights based
            on the corpus class frequencies. Defaults to False.
        data_subset_size : int, optional
            Size of the subset on which training will be performed.
            Defaults to the full set of activations.
        train_test_split : float, optional
            Percentage of the train/test split. If separate test
            activations are provided this split won't be used.
            Defaults to 0.9/0.1.
        """
        for activation_name in self.activation_names:
            self._train(
                activation_name,
                calc_class_weights=calc_class_weights,
                data_subset_size=data_subset_size,
                train_test_split=train_test_split,
            )

    def _train(
        self,
        activation_name: ActivationName,
        calc_class_weights: bool = False,
        data_subset_size: int = -1,
        train_test_split: float = 0.9,
    ) -> None:
        """ Initiates training the DC on 1 activation type.

        A fresh classifier is created, fitted on the train split, evaluated
        on the test split and, when `save_dir` is set, written to disk.
        """
        self._reset_classifier()

        data_dict = self.data_loader.create_data_split(
            activation_name, data_subset_size, train_test_split
        )

        # Calculate class weights
        if calc_class_weights:
            self._set_class_weights(data_dict["train_y"])

        # Train
        self._fit(data_dict["train_x"], data_dict["train_y"], activation_name)
        results = self._eval(data_dict["test_x"], data_dict["test_y"])

        if self.save_dir is not None:
            self._save(results, activation_name)

    def _fit(
        self, train_x: Tensor, train_y: Tensor, activation_name: ActivationName
    ) -> None:
        """ Fits the current classifier and reports the elapsed time. """
        start_time = time()
        print(f"\nStarting fitting model on {activation_name}...")

        self.classifier.fit(train_x, train_y)

        print(f"Fitting done in {time() - start_time:.2f}s")

    def _eval(self, test_x: Tensor, test_y: Tensor) -> Dict[str, Any]:
        """ Predicts the test set; prints and returns accuracy, confusion
        matrix and the raw predictions (under "pred_y"). """
        pred_y = self.classifier.predict(test_x)

        acc = accuracy_score(test_y, pred_y)
        cm = confusion_matrix(test_y, pred_y)

        results = {"accuracy": acc, "confusion matrix": cm}
        for k, v in results.items():
            print(k, v, "", sep="\n")
        # Added after printing so the full prediction vector is not dumped.
        results["pred_y"] = pred_y
        return results

    def _save(self, results: Dict[str, Any], activation_name: ActivationName) -> None:
        """ Writes the results and the fitted classifier to `save_dir`. """
        layer, name = activation_name
        preds_path = os.path.join(self.save_dir, f"{name}_l{layer}_results.pickle")
        model_path = os.path.join(self.save_dir, f"{name}_l{layer}.joblib")
        dump_pickle(results, preds_path)
        joblib.dump(self.classifier, model_path)

    def _reset_classifier(self) -> None:
        """ Replaces the classifier with a new, unfitted one. """
        self.classifier = LogRegCV()

    def _set_class_weights(self, train_y: Tensor) -> None:
        """ Sets each class weight to that class' relative frequency in
        `train_y`. """
        # NOTE(review): weights grow with class frequency; if the goal is to
        # compensate for imbalance an inverse weighting may be intended —
        # confirm before relying on this.
        classes, class_freqs = torch.unique(train_y, return_counts=True)
        norm = class_freqs.sum().item()
        class_weight = {
            cls.item(): freq.item() / norm
            for cls, freq in zip(classes, class_freqs)
        }
        self.classifier.class_weight = class_weight

    @staticmethod
    def _extract_activations(
        save_dir: str,
        corpus: Corpus,
        activation_names: ActivationNames,
        selection_func: SelectFunc,
        activations_dir: Optional[str],
        test_activations_dir: Optional[str],
        test_corpus: Optional[Corpus],
        model: Optional[LanguageModel],
    ) -> Tuple[str, Optional[str]]:
        """ Extracts train and/or test activations that were not supplied.

        Missing train activations go to `<save_dir>/activations`; missing
        test activations (only when a test corpus is given) go to
        `<save_dir>/test_activations`. Returns both directories.
        """
        if activations_dir is None:
            activations_dir = os.path.join(save_dir, "activations")
            simple_extract(
                model, activations_dir, corpus, activation_names, selection_func
            )

        if test_corpus is not None and test_activations_dir is None:
            test_activations_dir = os.path.join(save_dir, "test_activations")
            simple_extract(
                model,
                test_activations_dir,
                test_corpus,
                activation_names,
                selection_func,
            )

        return activations_dir, test_activations_dir