def validate(self):
    """Evaluate the model over ten stratified shuffle splits.

    For every split a new model is obtained from ``self.create_model()``,
    fitted on the training rows of ``self.data`` / ``self.labels`` and
    used to predict the held-out rows.  Accuracy, precision, recall and
    F1 are collected per split and their means are printed.  Nothing is
    returned.
    """
    print('Validating new model: {}()'.format(self.__class__.__name__))

    # One entry per reported metric:
    # (report template, scoring function, per-split results).
    report = [
        ('Accuracy: {}', accuracy_score, []),
        ('Precision: {}', precision_score, []),
        ('Recall: {}', recall_score, []),
        ('F1-measure: {}', f1_score, []),
    ]

    splitter = StratifiedShuffleSplit(n_splits=10)
    for train_index, test_index in splitter.split(self.data, self.labels):
        x_train = self.data[train_index]
        x_test = self.data[test_index]
        y_train = self.labels[train_index]
        y_test = self.labels[test_index]

        # A fresh model per split so that no state leaks between splits.
        model = self.create_model()
        model.fit(x_train, y_train,
                  epochs=100,
                  batch_size=128,
                  class_weight=self.class_weight)
        y_pred = model.predict_classes(x_test, batch_size=128)

        for _, metric, results in report:
            results.append(metric(y_test, y_pred))

    print('')
    for template, _, results in report:
        print(template.format(np.mean(results)))
def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Build a synthetic classification dataset and split it once.

    Parameters
    ----------
    n_samples : int
        Number of samples in the generated dataset.
    n_features : int
        Number of features for each sample.
    random_state : int
        Seed passed to both the data generator and the splitter so the
        result is reproducible.

    Returns
    -------
    tuple
        Four arrays, in this order: input train, input test,
        target train, target test.
    """
    features, target = datasets.make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state,
    )

    # A single stratified split with 60% of the samples used for training.
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        train_size=0.6,
        random_state=random_state,
    )
    first_split = next(splitter.split(features, target))
    train_index, test_index = first_split

    return (
        features[train_index],
        features[test_index],
        target[train_index],
        target[test_index],
    )
def fit_model(self, X, y):
    """
    Fit an L1 logistic-regression pipeline on 400 stratified splits.

    X::pd.DataFrame: Input data
    y::np.ndarray: response for input data

    The rows of ``X`` and ``y`` are shuffled together, then a
    scaler + ``LogisticRegressionCV`` pipeline is refitted on the training
    part of every split.  Per split, the coefficients and the ROC AUC of
    the predicted labels are appended to ``self.res``; the fitted
    pipeline is stored under ``self.res['model']``.  The result of
    ``self.save_pickle(self.res, self.out)`` is returned.
    """
    # Stack the response as a last column so one shuffle keeps rows aligned.
    stacked = np.hstack((X.values, y[:, None]))
    np.random.shuffle(stacked)
    X, y = stacked[:, :-1], stacked[:, -1]

    outer_cv = StratifiedShuffleSplit(n_splits=400)
    inner_cv = StratifiedKFold(n_splits=5)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lg', linear_model.LogisticRegressionCV(
            penalty='l1', solver='liblinear', cv=inner_cv)),
    ])

    self.res = {'coef': [], 'auc': [], 'model': 0}
    split_no = 0
    for train_rows, test_rows in outer_cv.split(X, y):
        pipeline.fit(X[train_rows], y[train_rows])
        predicted = pipeline.predict(X[test_rows])
        coefficients = pipeline.named_steps['lg'].coef_[0]
        self.res['coef'].append((split_no, coefficients))
        self.res['auc'].append(
            (split_no, roc_auc_score(y[test_rows], predicted)))
        self.res['model'] = pipeline
        split_no += 1

    return self.save_pickle(self.res, self.out)
def fit(self, X, y, X_test=None, y_test=None):
    """Train ``self.model`` and keep the training history.

    When both ``X_test`` and ``y_test`` are given, a ``TestLossHistory``
    callback is created, stored as ``self.test_loss`` and passed to the
    training call.  With ``self.early_stop`` set, 10% of the data is held
    out by a stratified split (fixed seed 0) as validation data and an
    ``EarlyStopping`` callback monitoring ``val_loss`` is added;
    otherwise the model is trained on all of ``X`` / ``y``.

    The history dictionary of the training call is stored in
    ``self.history``.  Returns ``self``.
    """
    super(MLP, self).fit(X, y)

    callbacks = []
    if X_test is not None and y_test is not None:
        self.test_loss = TestLossHistory(X_test, y_test)
        callbacks.append(self.test_loss)

    if self.n_class == 1 and self.n_label > 2:
        # The unrolled labels are computed but not used below.
        yr = unroll(y)

    if not self.early_stop:
        history = self.model.fit(
            X,
            y,
            nb_epoch=self.max_epoch,
            verbose=self.verbose,
            callbacks=callbacks,
        )
    else:
        splitter = StratifiedShuffleSplit(
            n_splits=1, test_size=0.1, random_state=0)
        train_index, val_index = next(iter(splitter.split(X, y)))
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        callbacks.append(EarlyStopping(
            monitor="val_loss",
            patience=self.patience,
            verbose=self.verbose,
        ))
        history = self.model.fit(
            x_train,
            y_train,
            nb_epoch=self.max_epoch,
            verbose=self.verbose,
            callbacks=callbacks,
            validation_data=(x_val, y_val),
        )

    self.history = history.history
    return self
def split(data, test_size):
    """Split a dataset into one stratified train/test partition.

    Parameters
    ----------
    data : object
        Must expose ``data`` (samples) and ``target`` (labels) attributes;
        both are converted to numpy arrays.
    test_size : float or int
        Forwarded unchanged to ``StratifiedShuffleSplit``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    X, y = np.array(data.data), np.array(data.target)
    # The splitter that offers ``.split(X, y)`` takes the number of
    # re-shuffles as ``n_splits``; the former ``n_iter`` keyword is rejected
    # by its constructor.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train, test = next(splitter.split(X, y))
    return X[train], y[train], X[test], y[test]
def _get_validation_split(self):
    """Derive train/validation row indices from the multi-label tag file.

    Reads ``self.train_csv_file``, one-hot encodes the space-separated
    ``tags`` column and, for every label in turn, draws two stratified
    shuffle splits on that label's column (seeded with the label's
    integer id).  Rows are added to the training or validation list the
    first time they show up; a row already assigned to either list is
    never added again.

    Returns
    -------
    tuple of np.ndarray
        ``(train_indices, validation_indices)`` as positions into the CSV.
    """
    train = pd.read_csv(self.train_csv_file)

    # Map every distinct tag to an integer class.  Sorting fixes the
    # mapping: a bare ``set`` of strings iterates in an order that varies
    # with hash randomisation, which would attach each per-label seed
    # below to a different label on every run.
    labels = sorted(
        {tag for tags in train['tags'].values for tag in tags.split(' ')})
    label_map = {label: i for i, label in enumerate(labels)}

    # One-hot target matrix, one row per CSV row.
    # NOTE(review): the unpacking below assumes the CSV has exactly two
    # columns (a name and the tag string) - confirm.
    y_train = np.zeros((len(train), len(label_map)), np.uint8)
    for row, (f, tags) in enumerate(train.values):
        for t in tags.split(' '):
            y_train[row, label_map[t]] = 1

    trn_index = []
    val_index = []
    # Running sets mirror the two lists so membership does not have to be
    # rebuilt from the lists on every split.
    trn_seen = set()
    val_seen = set()
    index = np.arange(len(train))
    for i in range(len(label_map)):
        sss = StratifiedShuffleSplit(n_splits=2,
                                     test_size=self.validation_split,
                                     random_state=i)
        for train_index, test_index in sss.split(index, y_train[:, i]):
            X_train, X_test = index[train_index], index[test_index]
            # Ensure there is no repetition within each split and between
            # the splits.
            new_trn = list(set(X_train) - trn_seen - val_seen)
            trn_index.extend(new_trn)
            trn_seen.update(new_trn)
            new_val = list(set(X_test) - val_seen - trn_seen)
            val_index.extend(new_val)
            val_seen.update(new_val)
    return np.array(trn_index), np.array(val_index)
def outer_cv_loop(Xdata,Ydata,clf,parameters=None,
                n_splits=10,test_size=0.25):
    """
    Score ``clf`` by ROC AUC over repeated stratified shuffle splits.

    For every split the train and test rows are imputed separately with
    ``fancyimpute.SoftImpute``.  When the mean of the training labels is
    more than 0.2 away from 0.5, the training data is resampled with
    ``SMOTETomek``.  ``clf`` is then fitted and used to predict the test
    rows.

    Parameters
    ----------
    Xdata : array
        Sample matrix, indexed as ``Xdata[rows, :]``.
    Ydata : array
        Labels, indexed as ``Ydata[rows]``.
    clf : estimator
        Must provide ``fit``, ``predict`` and ``feature_importances_``.
    parameters : optional
        Not used by this function; accepted for call compatibility.
        (The default is ``None`` rather than a shared mutable list.)
    n_splits : int
        Number of shuffle splits.
    test_size : float
        Test fraction of every split.

    Returns
    -------
    rocscores : list
        One entry per split; ``nan`` when the test labels or the
        predictions have zero variance.
    importances : list
        ``clf.feature_importances_`` for every split that was fitted.
    """
    importances=[]
    kf=StratifiedShuffleSplit(n_splits=n_splits,test_size=test_size)
    rocscores=[]
    for train,test in kf.split(Xdata,Ydata):
        if numpy.var(Ydata[test])==0:
            # NOTE(review): ``varname`` is not defined inside this
            # function; unless it exists at module level this branch
            # raises NameError.
            print('zero variance',varname)
            rocscores.append(numpy.nan)
            continue
        Ytrain=Ydata[train]
        Xtrain=fancyimpute.SoftImpute(verbose=False).complete(Xdata[train,:])
        Xtest=fancyimpute.SoftImpute(verbose=False).complete(Xdata[test,:])
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
            smt = SMOTETomek()
            Xtrain,Ytrain=smt.fit_sample(Xtrain.copy(),Ytrain)
        clf.fit(Xtrain,Ytrain)
        pred=clf.predict(Xtest)
        # filter out bad folds: AUC is undefined for constant predictions
        if numpy.var(pred)>0:
            rocscores.append(roc_auc_score(Ydata[test],pred))
        else:
            rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)
    return rocscores,importances
def main():
    """Create one stratified train/test split of class-labelled UUIDs.

    Command-line arguments come from ``cli_parser()``.  For every class
    the associated CSV file is read and the second column of each row is
    collected as that class's UUIDs (de-duplicated and sorted).  A single
    stratified shuffle split is drawn over all (uuid, label) pairs,
    per-class counts are printed for both parts, and three CSV files are
    written: ``<base>.all_uuids.csv``, ``<base>.train_uuids.csv`` and
    ``<base>.test_uuids.csv``, each holding ``uuid,label`` rows.
    """
    args = cli_parser().parse_args()

    TEST_PERCENT = args.test_percent
    RAND_STATE = args.rand_state
    OUTPUT_BASE = args.output_base
    CLS_TO_FILEPATH = args.cls_to_cmdProcessedCsv

    # Parse CSV files associated to classes
    cls_uuids = {}
    for cls, filepath in six.iteritems(CLS_TO_FILEPATH):
        # Close every input file as soon as it has been read.
        with open(filepath) as f:
            cls_uuids[cls] = sorted({r[1] for r in csv.reader(f)})

    cls_list = sorted(cls_uuids)
    all_label, all_uuids = \
        zip(*[(cls_name, uuid)
              for cls_name in cls_list
              for uuid in cls_uuids[cls_name]])
    # Transform into numpy array for multi-index access later
    all_label = numpy.array(all_label)
    all_uuids = numpy.array(all_uuids)

    # ``n_splits=1`` -- Only make one train/test split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_PERCENT,
                                 random_state=RAND_STATE)

    # Get array of index position values of ``all_uuids`` of uuids to use for
    # train and test sets, respectively.  The builtin ``next`` works on
    # both Python 2 and 3, unlike the iterator's ``.next()`` method.
    train_index, test_index = \
        next(iter(sss.split(numpy.zeros(len(all_label)), all_label)))
    uuids_train, uuids_test = all_uuids[train_index], all_uuids[test_index]
    label_train, label_test = all_label[train_index], all_label[test_index]

    for heading, part_labels in (("Train:", label_train),
                                 ("Test:", label_test)):
        print(heading)
        part_label_list = part_labels.tolist()
        for cls_label in cls_list:
            cnt = part_label_list.count(cls_label)
            print("- %s:\t%d\t(~%.2f %% of total class examples)"
                  % (cls_label, cnt,
                     float(cnt) / len(cls_uuids[cls_label]) * 100))

    # Save out files for use with ``classifier_model_validation``.
    # ``zip`` replaces ``itertools.izip``, which does not exist on Python 3.
    outputs = (
        ('%s.all_uuids.csv' % OUTPUT_BASE, all_uuids, all_label),
        ('%s.train_uuids.csv' % OUTPUT_BASE, uuids_train, label_train),
        ('%s.test_uuids.csv' % OUTPUT_BASE, uuids_test, label_test),
    )
    for out_path, uuids, labels in outputs:
        with open(out_path, 'w') as f:
            w = csv.writer(f)
            for uuid, label in zip(uuids, labels):
                w.writerow([uuid, label])
def __init__(self, fm_decoder, n_iter=5, test_size=0.2, train_size=None,
             random_state=None):
    """Store the decoder and initialise the stratified shuffle splitter.

    Parameters
    ----------
    fm_decoder : object
        Kept on the instance as ``self.fm_decoder``.
    n_iter : int
        Number of re-shuffling iterations.  Forwarded to the base class
        as ``n_splits``, the name its initializer expects.
    test_size, train_size, random_state
        Forwarded unchanged to ``StratifiedShuffleSplit.__init__``.
    """
    self.fm_decoder = fm_decoder
    # Also keep the value under this constructor's own parameter name.
    self.n_iter = n_iter
    StratifiedShuffleSplit.__init__(
        self, n_splits=n_iter, test_size=test_size, train_size=train_size,
        random_state=random_state)
def robust_coef(self, xwl2, hm_y, n_iter=100):
    """Average ``self.clf2``'s linear parameters over repeated refits.

    ``self.clf2`` is fitted on the training part of ``n_iter`` stratified
    shuffle splits (20% held out, seed 1).  Afterwards its ``coef_`` and
    ``intercept_`` are overwritten with the mean over all fits.  Nothing
    is returned.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=n_iter, test_size=.2, random_state=1)

    all_coefs = []
    all_intercepts = []
    for train_rows, _held_out in splitter.split(xwl2, hm_y):
        estimator = self.clf2
        estimator.fit(xwl2[train_rows, :], hm_y[train_rows])
        all_coefs.append(estimator.coef_)
        all_intercepts.append(estimator.intercept_)

    mean_coef = np.stack(all_coefs).mean(0)
    self.clf2.coef_ = mean_coef
    mean_intercept = np.stack(all_intercepts).mean(0)
    self.clf2.intercept_ = mean_intercept
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """Load the Titanic CSV and return one stratified train/test split.

    The file ``datasets/titanic/titanic3.csv`` is parsed line by line
    with ``process_titanic_line``.  Numeric fields are used as they are;
    string fields are one-hot encoded with a ``DictVectorizer`` fitted on
    the training rows only.

    Parameters
    ----------
    test_size : float
        Test fraction passed to ``StratifiedShuffleSplit``.
    feature_skip_tuple : tuple of str
        Feature names to leave out.
    random_state : int
        Accepted but not used; the split is seeded with the constant 12.

    Returns
    -------
    tuple
        ``(keys, train_data, test_data, train_labels, test_labels)`` where
        ``keys`` is the list of numeric feature names followed by the
        string feature names.
    """
    path = os.path.join('datasets', 'titanic', 'titanic3.csv')
    with open(path) as f:
        f.readline()  # skip the header row
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    string_rows = []
    # Parse every line once, keeping all per-row pieces in file order.
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        string_rows.append({k: line_dict[k] for k in string_keys})
        numeric_data[n] = np.asarray([line_dict[k] for k in numeric_keys])
        numeric_labels[n] = line_dict["survived"]

    # ``n_splits`` is the keyword of the splitter that provides
    # ``.split(X, y)``; ``n_iter`` is not accepted by its constructor.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=12)
    train_idx, test_idx = next(sss.split(numeric_data, numeric_labels))

    # Select the categorical rows with the same index arrays as the numeric
    # rows and labels, so that row i of every returned array describes the
    # same passenger.
    train_vectorizer_list = [string_rows[i] for i in train_idx]
    test_vectorizer_list = [string_rows[i] for i in test_idx]

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
def shuffled_split(housing):
    """Return a stratified 80/20 train/test split of ``housing``.

    ``add_income_category(housing)`` is called first and the split is
    stratified on the resulting ``income_cat`` column (seed 42).  That
    helper column is not part of the two returned frames.

    Returns
    -------
    tuple of pd.DataFrame
        ``(strat_train_set, strat_test_set)``.
    """
    add_income_category(housing)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(
        split.split(housing, housing["income_cat"]))
    # ``split`` yields positional indices, so rows are selected by position;
    # label-based selection is only equivalent for a default integer index.
    # ``drop`` returns new frames instead of mutating slices of ``housing``.
    strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
    strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)
    return strat_train_set, strat_test_set
def test_stratified_shuffle_split_overlap_train_test_bug():
    # Regression test for overlapping train and test indices; the original
    # report is https://github.com/scikit-learn/scikit-learn/issues/6121
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
    samples = np.ones_like(labels)

    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=0.5,
                                      random_state=0)
    first_split = next(iter(splitter.split(X=samples, y=labels)))
    train, test = first_split

    # No index may be part of both sides of the split.
    shared = np.intersect1d(train, test)
    assert_array_equal(shared, [])
def test_stratifiedshufflesplit_list_input():
    # The splits must not depend on whether y is given as a list of
    # strings, a list of floats or a numpy array.
    splitter = StratifiedShuffleSplit(test_size=2, random_state=42)
    samples = np.ones(7)

    y_array = np.hstack((np.ones(4), np.zeros(3)))
    y_strings = ['1'] * 4 + ['0'] * 3
    y_floats = y_array.tolist()

    for y_variant in (y_strings, y_floats):
        actual = list(splitter.split(samples, y_variant))
        desired = list(splitter.split(samples, y_array))
        np.testing.assert_equal(actual, desired)
def _split_data(X, y, p_train=0.5, seed=None):
    """
    Make one stratified train/test split of the data.

    X contains the data and y contains the labels.  ``p_train`` is passed
    to the splitter as the training size and ``seed`` as its random
    state.

    Returns ``((X_train, y_train), (X_test, y_test))``.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=None,
        train_size=p_train,
        random_state=seed,
    )
    train_rows, test_rows = next(iter(splitter.split(X, y)))

    train_part = (X[train_rows], y[train_rows])
    test_part = (X[test_rows], y[test_rows])
    return train_part, test_part
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with a
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial.  The number of trials
        # is the configured ``n_iter``; the helper no longer depends on a
        # loop counter of the enclosing function.
        threshold = 0.05 / n_iter
        bf = stats.binom(n_iter, p)
        for count in idx_counts:
            prob = bf.pmf(count)
            assert_true(prob > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        # The splitter used through ``.split(X, y)`` takes the number of
        # re-shuffles as ``n_splits``.
        splits = StratifiedShuffleSplit(n_splits=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits_actual = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits_actual += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for idx in ids:
                    counter[idx] += 1
        assert_equal(n_splits_actual, n_iter)

        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  test_size=1./n_folds,
                                                  train_size=1.-(1./n_folds))

        # ``train`` / ``test`` still hold the last generated split here.
        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
def gen_sample_array(self):
    """Return sample indices taken from one stratified 50/50 split.

    A stratified shuffle split is drawn on ``self.class_vector``; the
    train indices of its first split followed by the test indices are
    returned as one array.

    Raises
    ------
    ImportError
        If scikit-learn cannot be imported.
    """
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
    except ImportError:
        # Fail here with a clear message; merely printing would let the
        # code below fail with a NameError on the missing class.
        raise ImportError('Need scikit-learn for this functionality')
    import numpy as np

    s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
    # Random placeholder features with one row per entry of the class
    # vector; the split is stratified on ``y``.
    X = th.randn(self.class_vector.size(0), 2).numpy()
    y = self.class_vector.numpy()

    train_index, test_index = next(s.split(X, y))
    return np.hstack([train_index, test_index])
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Evaluate ``clf`` over stratified shuffle splits and print metrics.

    ``dataset`` is converted with ``featureFormat`` and
    ``targetFeatureSplit``.  For each of ``folds`` splits (seed 42) the
    classifier is fitted on the training part and its 0/1 predictions on
    the test part are tallied into a confusion matrix that accumulates
    over all splits.  Accuracy, precision, recall, F1 and F2 are printed
    from the totals.

    If a prediction/label pair is not made of 0 and 1, a warning is
    printed and the rest of that split's predictions is skipped.  If a
    metric's denominator is zero, a message is printed instead of the
    metrics.  Nothing is returned.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    # Only the divisions can fail in the expected way, so only they are
    # guarded, and only against ZeroDivisionError; any other error (for
    # example in the report formatting) surfaces normally.
    try:
        total_predictions = (true_negatives + false_negatives
                             + false_positives + true_positives)
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
def main_cv_loop(Xdata,Ydata,clf,parameters,
                n_folds=4,oversample_thresh=0.1,verbose=False):
    """
    Outer evaluation loop around ``inner_cv_loop``.

    For each of ``n_folds`` stratified shuffle splits (20% test) the
    training data is resampled with ``SMOTETomek`` when the mean of its
    labels is more than 0.2 away from 0.5.  ``inner_cv_loop`` then
    returns the estimator used to predict the test rows.  If it also
    returns a transformer (``fa``), the test rows are transformed before
    prediction.

    Parameters
    ----------
    Xdata, Ydata : arrays
        Samples (indexed ``Xdata[rows, :]``) and labels.
    clf, parameters
        Forwarded to ``inner_cv_loop``.
    n_folds : int
        Number of shuffle splits.  This value is now passed to the
        splitter; before, the count was fixed at 4 regardless of the
        argument (4 is also the default, so default calls are unchanged).
    oversample_thresh : float
        Not used by this function; kept for call compatibility.
    verbose : bool
        Print progress messages.

    Returns
    -------
    tuple
        ``(roc_auc, Ydata, pred, pred_proba)``.  Entries of ``pred`` and
        ``pred_proba`` for rows that never fall into a test part stay 0.
    """
    kf=StratifiedShuffleSplit(n_splits=n_folds,test_size=0.2)

    # variables to store outputs
    pred=numpy.zeros(len(Ydata))  # predicted values
    pred_proba=numpy.zeros(len(Ydata))  # predicted probabilities

    for train,test in kf.split(Xdata,Ydata):
        Xtrain=Xdata[train,:]
        Xtest=Xdata[test,:]
        Ytrain=Ydata[train]
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_,bestroc,fa=inner_cv_loop(Xtrain,Ytrain,clf,
                    parameters,verbose=True)
        if fa is not None:
            if verbose:
                print('transforming using fa')
                print(fa)
            Xtest=fa.transform(Xtest)
        # NOTE(review): ``predict_proba`` normally returns one column per
        # class, while only ``len(test)`` slots are assigned here -
        # confirm which values are meant to be stored.
        pred_proba.flat[test]=best_estimator_.predict_proba(Xtest)
        pred.flat[test]=best_estimator_.predict(Xtest)
    return roc_auc_score(Ydata,pred,average='weighted'),Ydata,pred,pred_proba
def start_to_fit(X, y):
    """Compare a fixed set of classifiers by mean accuracy.

    Every classifier is fitted on the training part of ten stratified
    shuffle splits (70/30, seed 0) and scored on the test part.  A table
    with one row per classifier class name and its accuracy averaged
    over the splits is printed.  Nothing is returned.

    Parameters
    ----------
    X, y : arrays
        Samples and labels, indexed with integer index arrays.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression()]

    res_cols = ['Classifier','Accuracy']

    # Single source for the split count, also used as the averaging divisor.
    n_splits = 10
    data_set = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.3,
                                      train_size=0.7, random_state=0)

    accuracy_dic = {}
    for train_index, test_index in data_set.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for clf in classifiers:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, clf.predict(X_test))
            accuracy_dic[name] = accuracy_dic.get(name, 0.0) + accuracy

    # ``DataFrame.append`` no longer exists in pandas 2; concatenating the
    # same frames in the same order gives the same table.
    frames = [pd.DataFrame(columns=res_cols)]
    for name in accuracy_dic:
        accuracy_dic[name] = accuracy_dic[name] / float(n_splits)
        frames.append(
            pd.DataFrame([[name, accuracy_dic[name]]], columns=res_cols))
    res = pd.concat(frames)
    # Function-call form is valid on both Python 2 and Python 3.
    print(res)
def splitTrainTest(inputDF,random_state):
    """Return a stratified 80/20 train/test split of ``inputDF``.

    An ``income_category`` column (``ceil(median_income / 1.5)``, capped
    at 5.0) is added to ``inputDF`` and used for stratification (seed
    19).  The relative category sizes of the whole data set are printed.
    The two returned frames do not contain the helper column; ``inputDF``
    itself keeps it.

    Parameters
    ----------
    inputDF : pd.DataFrame
        Must contain a ``median_income`` column.
    random_state
        Not used by the stratified split; kept for call compatibility.

    Returns
    -------
    tuple of pd.DataFrame
        ``(stratifiedTrainSet, stratifiedTestSet)``.
    """
    income_category = np.ceil(inputDF["median_income"] / 1.5)
    # Assign the capped values back explicitly instead of relying on an
    # in-place ``where`` on a column selection.
    inputDF["income_category"] = income_category.where(
        income_category < 5.0, 5.0)

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=19)
    trainIndices, testIndices = next(
        split.split(inputDF, inputDF["income_category"]))
    # ``split`` yields positional indices, so rows are selected by position.
    stratifiedTrainSet = inputDF.iloc[trainIndices]
    stratifiedTestSet = inputDF.iloc[testIndices]

    print('\ninputDF["income_category"].value_counts() / len(inputDF)')
    print( inputDF["income_category"].value_counts() / len(inputDF) )

    # ``drop`` returns new frames; slices of ``inputDF`` are not mutated.
    stratifiedTrainSet = stratifiedTrainSet.drop(["income_category"], axis=1)
    stratifiedTestSet = stratifiedTestSet.drop(["income_category"], axis=1)

    return( stratifiedTrainSet , stratifiedTestSet )
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """Split ``raw_data`` once and return transformed train/test arrays.

    The split is stratified on the ``isGood`` column (20% test, seed
    142).  Both parts go through ``my_transform(part, label, degree)``;
    the last column of each result is returned as the target and the
    remaining columns as the features.  ``p`` is not used.

    Returns ``(train_set, train_y, test_set, test_y)``.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=0.2, random_state=142)
    train_index, test_index = next(
        splitter.split(raw_data, raw_data["isGood"]))
    strat_train_set = raw_data.iloc[train_index]
    strat_test_set = raw_data.iloc[test_index]

    transformed_train = my_transform(strat_train_set, label, degree)
    transformed_test = my_transform(strat_test_set, label, degree)

    # The target is the last column, the features are everything before it.
    train_set, train_y = transformed_train[:, :-1], transformed_train[:, -1]
    test_set, test_y = transformed_test[:, :-1], transformed_test[:, -1]
    return (train_set, train_y, test_set, test_y)
def splitTrainTest(inputDF,random_state):
    """Split ``inputDF`` 80/20, stratified by income where possible.

    When ``sklearn.model_selection`` can be located, an
    ``income_category`` column (``ceil(median_income / 1.5)``, capped at
    5.0) is added to ``inputDF`` and used for a stratified split (seed
    19).  The relative category sizes are printed and the helper column
    is left out of the returned frames.  Otherwise a plain
    ``train_test_split`` with ``random_state`` is used.

    Parameters
    ----------
    inputDF : pd.DataFrame
        Must contain ``median_income`` for the stratified path.
    random_state
        Seed of the non-stratified fallback only.

    Returns
    -------
    tuple of pd.DataFrame
        ``(trainSet, testSet)``.
    """
    # For a dotted name ``find_spec`` imports the parent package first and
    # raises when that package is missing, instead of returning None.
    try:
        ms_spec = importlib.util.find_spec(name="sklearn.model_selection")
    except ImportError:
        ms_spec = None

    if ms_spec is None:
        trainSet, testSet = train_test_split(inputDF, test_size=0.2, random_state=random_state)
    else:
        income_category = np.ceil(inputDF["median_income"] / 1.5)
        # Assign the capped values back explicitly instead of relying on
        # an in-place ``where`` on a column selection.
        inputDF["income_category"] = income_category.where(
            income_category < 5.0, 5.0)

        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=19)
        trainIndices, testIndices = next(
            split.split(inputDF, inputDF["income_category"]))
        # ``split`` yields positional indices: select rows by position.
        trainSet = inputDF.iloc[trainIndices]
        testSet = inputDF.iloc[testIndices]

        print('\nincome category relative sizes (whole data set)')
        print( inputDF["income_category"].value_counts() / len(inputDF) )

        # ``drop`` returns new frames; slices of ``inputDF`` stay untouched.
        trainSet = trainSet.drop(["income_category"], axis=1)
        testSet = testSet.drop(["income_category"], axis=1)

    return( trainSet , testSet )
def suffle_hm(self, x, y, gamma=0.5, n_iter=50):
    """Estimate, per sample, how often ``self.clf1`` predicts it correctly.

    ``self.clf1`` is fitted on the training part of ``n_iter`` stratified
    shuffle splits (25% test, seed 1).  For every sample the method
    counts how often it was in a test part and how often it was then
    predicted correctly; the ratio of the two is the returned ``proba``.
    Finally ``self.clf1`` is refitted on all of ``x`` / ``y``.

    Returns
    -------
    tuple
        ``((proba >= gamma) as int array, proba)``.
    """
    times_tested = np.zeros_like(y).astype(float)
    times_correct = np.zeros_like(y).astype(float)

    splitter = StratifiedShuffleSplit(
        n_splits=n_iter, test_size=.25, random_state=1)
    for train_rows, test_rows in splitter.split(x, y):
        self.clf1.fit(x[train_rows, :], y[train_rows])
        times_tested[test_rows] += 1.
        is_hit = self.clf1.predict(x[test_rows, :]) == y[test_rows]
        times_correct[test_rows] += is_hit.astype(float)

    proba = times_correct / times_tested
    if self.verbose:
        print(times_tested)
        print(proba)

    # Leave the classifier fitted on the complete data.
    self.clf1.fit(x, y)
    easy_mask = (proba >= gamma).astype(int)
    return easy_mask, proba
def fit_model(self, X, y):
    """
    Fit a feature-selection + extra-trees pipeline on 400 stratified splits.

    X::pd.DataFrame: Input data
    y::np.ndarray: response for input data

    Per split, the selection mask of the ``fs`` step, the feature
    importances of the ``et`` step and the ROC AUC of the predicted
    labels are appended to ``self.res``; the fitted pipeline is stored
    under ``self.res['model']``.  The result of
    ``self.save_pickle(self.res, self.out)`` is returned.
    """
    # Row selection below (``X[train]``) needs an array: on a DataFrame
    # the same expression selects columns.  Objects without ``.values``
    # (plain arrays) pass through unchanged.
    X = getattr(X, 'values', X)

    cv_out = StratifiedShuffleSplit(n_splits=400)
    clf = Pipeline([('scaler', StandardScaler()),
                    ('fs', CustFsNoiseWinnow()),
                    ('et', ExtraTreesClassifier(n_estimators=2000))])
    self.res = {'mask':[], 'fimp':[], 'auc':[], 'model':0}
    for idx, (train, test) in enumerate(cv_out.split(X, y)):
        clf.fit(X[train], y[train])
        prediction = clf.predict(X[test])
        self.res['mask'].append((idx, clf.named_steps['fs'].mask_))
        self.res['fimp'].append((idx, clf.named_steps['et'].feature_importances_))
        self.res['auc'].append((idx, roc_auc_score(y[test], prediction)))
        self.res['model'] = clf
    output_saved = self.save_pickle(self.res, self.out)
    return output_saved
def train_age(kfold, batchsize, lr_age, lr_gender, num_epochs, p_augment,
              device, num_age_classes, num_gender_classes, test_fold,
              train_fold, random_seed):
    """Train and evaluate one age classifier per cross-validation fold.

    For every fold index in ``range(kfold)``:

    * image paths and age/gender labels are read from
      ``train_fold[fold]`` and ``test_fold[fold]`` (columns
      ``image_path``, ``age``, ``gender``);
    * the training rows are divided into train and validation parts with
      the first split produced by a ``StratifiedShuffleSplit`` on the age
      labels;
    * an ``InceptionResnetV1`` with ``num_age_classes`` outputs is trained
      for ``num_epochs`` epochs with AdamW (``lr_age``), a ``MultiStepLR``
      schedule with milestones 5 and 10, and cross-entropy loss;
    * whenever the validation accuracy improves, the weights are saved to
      ``models/age_model{fold}.pth``; that file is reloaded afterwards to
      score the fold's test set.

    Progress and results are printed.  ``lr_gender`` and
    ``num_gender_classes`` are not used in this function, and the function
    has no return statement.

    NOTE(review): the nesting of the statements below was reconstructed
    from a source without line breaks - confirm against the original.
    """
    all_accuracy_age = []
    all_val_loss_age = []
    all_stat_fold = []
    for fold in range(kfold):
        # Per-fold statistics, keyed by name.
        all_stat = defaultdict(list)
        # image paths
        train_data = train_fold[fold]['image_path'].copy().reset_index(
            drop=True).to_list()
        test_data = test_fold[fold]['image_path'].copy().reset_index(
            drop=True).to_list()
        # labels
        train_age_label = train_fold[fold]['age'].copy().reset_index(
            drop=True).to_list()
        train_gender_label = train_fold[fold]['gender'].copy().reset_index(
            drop=True).to_list()
        test_age_label = test_fold[fold]['age'].copy().reset_index(
            drop=True).to_list()
        test_gender_label = test_fold[fold]['gender'].copy().reset_index(
            drop=True).to_list()
        # create train-validation stratified split
        sss = StratifiedShuffleSplit(n_splits=10, random_state=random_seed)
        # split based on age, more balanced for both age and gender;
        # all ten splits are generated but only the first one is used
        train_idx, val_idx = list(sss.split(train_data, train_age_label))[0]
        train_idx = list(train_idx)
        val_idx = list(val_idx)
        # create datasets and dataloaders
        train_dataset = AgeDataset(
            '', list(np.array(train_data)[train_idx]),
            list(np.array(train_age_label)[train_idx]),
            list(np.array(train_gender_label)[train_idx]),
            p_augment=p_augment)
        val_dataset = AgeDataset('', list(np.array(train_data)[val_idx]),
                                 list(np.array(train_age_label)[val_idx]),
                                 list(np.array(train_gender_label)[val_idx]),
                                 validation=True)
        test_dataset = AgeDataset('', test_data, test_age_label,
                                  test_gender_label, validation=True)
        train_loader = DataLoader(train_dataset, batch_size=batchsize,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batchsize,
                                shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=batchsize,
                                 shuffle=False)
        # Validation targets in loader order (val_loader is not shuffled).
        # val_gender_label is not read again in this function.
        val_gender_label = list(np.array(train_gender_label)[val_idx])
        val_age_label = list(np.array(train_age_label)[val_idx])
        model = InceptionResnetV1(classify=True, pretrained='vggface2',
                                  num_classes=num_age_classes)
        model = model.to(device)
        # optimizer and learning-rate schedule
        optimizer = optim.AdamW(model.parameters(), lr=lr_age)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [5, 10])
        # loss
        criterion = nn.CrossEntropyLoss()
        best_acc_age = 0
        best_val_loss_age = 999
        print(f'Fold {fold+1}\n')
        for epoch in range(num_epochs):
            print(f'epoch: {epoch}\n')
            train_loss_age = 0
            val_loss_age = 0
            # Training
            model.train()
            iterat = 0  # batches processed so far in this epoch
            vsego = len(train_loader)  # total number of training batches
            for batch in train_loader:
                print(f'batch_num: {100*(iterat/vsego)}%\n')
                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)
                # Clear gradients
                optimizer.zero_grad()
                with torch.set_grad_enabled(True):
                    pred_age = model(batch_data)
                    loss_age = criterion(pred_age, batch_age_label)
                    train_loss_age += loss_age.detach().item()
                    loss_age.backward()
                    optimizer.step()
                iterat = iterat + 1
            # Validation
            model.eval()
            # Softmax outputs of all validation batches, concatenated.
            all_pred_age = torch.empty(0).to(device)
            for batch in val_loader:
                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)
                with torch.set_grad_enabled(False):
                    pred_age = model(batch_data)
                    loss_age = criterion(pred_age, batch_age_label)
                    val_loss_age += loss_age.detach().item()
                    all_pred_age = torch.cat(
                        (all_pred_age,
                         nn.functional.softmax(pred_age.detach(), dim=1)), 0)
            # Turn the summed batch losses into per-batch averages.
            train_loss_age /= len(train_loader)
            val_loss_age /= len(val_loader)
            all_pred_age = all_pred_age.cpu().numpy()
            pred_label_age = list(np.argmax(all_pred_age, axis=1))
            acc_age = accuracy_score(val_age_label, pred_label_age)
            # Checkpoint only on a strict improvement of validation accuracy.
            # NOTE(review): if accuracy never exceeds 0, no file is written
            # and the load in the inference step below has nothing from
            # this run to read.
            if acc_age > best_acc_age:
                best_acc_age = acc_age
                best_val_loss_age = val_loss_age
                torch.save(model.state_dict(),
                           f'models/age_model{fold}.pth')
            all_stat['train_loss'].append(train_loss_age)
            all_stat['val_loss'].append(val_loss_age)
            all_stat['val_acc'].append(acc_age)
            print(
                f'Epoch {epoch} | train loss: {train_loss_age} | val loss: {val_loss_age} | accuracy: {round(acc_age*100, 2)}%'
            )
            scheduler.step()
        # INFERENCE: score the test set with the best checkpoint of the fold
        with torch.no_grad():
            model.load_state_dict(torch.load(f'models/age_model{fold}.pth'))
            model.eval()
            test_pred_age = torch.empty(0).to(device)
            for batch in test_loader:
                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)
                with torch.set_grad_enabled(False):
                    pred_age = model(batch_data)
                    test_pred_age = torch.cat(
                        (test_pred_age,
                         nn.functional.softmax(pred_age.detach(), dim=1)), 0)
            test_pred_age = test_pred_age.cpu().numpy()
            pred_label_age = list(np.argmax(test_pred_age, axis=1))
            acc_age = accuracy_score(test_age_label, pred_label_age)
            all_stat['test_acc'].append(acc_age)
            all_stat['conf'].append(
                confusion_matrix(test_age_label, pred_label_age,
                                 labels=list(range(num_age_classes))))
            all_stat['conf_norm'].append(
                confusion_matrix(test_age_label, pred_label_age,
                                 normalize='true',
                                 labels=list(range(num_age_classes))))
            all_stat['test_pred'].append(pred_label_age)
            all_stat['test_target'].append(test_age_label)
            all_accuracy_age.append(acc_age)
            all_val_loss_age.append(best_val_loss_age)
            print(
                f'TEST ACCURACY: {round(acc_age*100,2)}% | Val. Accuracy: {round(best_acc_age*100,2)}% | Val. Loss.: {best_val_loss_age}\n'
            )
        all_stat_fold.append(all_stat)
    all_accuracy_age = np.array(all_accuracy_age)
    all_val_loss_age = np.array(all_val_loss_age)
    # Mean test accuracy over all folds, in percent.
    mean_accuracy_age = round(all_accuracy_age.mean() * 100, 2)
    print(f'\nOverall Accuracy: {mean_accuracy_age} p/m')
def rodar_experimento(dir_experimento, documentos_validos, freq_min,
                      op_stopwords, op_ica, op_tesauro, op_tam_vec, lista_k,
                      rnd, exp, w2v_geral, ftt_geral, glv_geral):
    """Run one experiment end to end: split, train embeddings, cluster, compare.

    Steps performed, in order:
      1. Make a single stratified 80/20 split of ``documentos_validos``
         (``id`` as samples, ``Assunto`` as the stratification label).
      2. Build the training corpus and vocabulary, train the domain models and
         create the summed-vector representations through the project helpers.
      3. Train a Doc2Vec model on ``base_treino_glv.txt`` and store an inferred,
         normalized vector per test document in ``vetores_teste.csv``.
      4. For every column from the 4th onward in that CSV, compute clustering
         scores (saved as ``.npy`` under ``resultados/``) and a similarity
         matrix.

    Parameters
    ----------
    dir_experimento : str
        Experiment directory name, used under ``dados/`` and ``resultados/``.
    documentos_validos : object with ``id`` and ``Assunto`` attributes
        Presumably a pandas DataFrame -- TODO confirm against the caller.
    freq_min, op_stopwords, op_ica, op_tesauro
        Forwarded unchanged to ``extrair_vocabulario``.
    op_tam_vec
        Forwarded to the model-training / representation helpers.
    lista_k
        Forwarded to ``computar_scores_agrupamento``.
    rnd
        ``random_state`` of the stratified split.
    exp
        Experiment identifier; used in log messages and in one output path.
    w2v_geral, ftt_geral, glv_geral
        Forwarded to ``criar_representacoes_soma_ger``.

    Returns
    -------
    None
        All results are written to disk.
    """
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=rnd)
    X = documentos_validos.id
    y = documentos_validos.Assunto
    stopwords = nltk.corpus.stopwords.words('portuguese')
    diretorio = "dados/corpus_tratado/"
    le = LabelEncoder()
    # index[0] holds the training indices and index[1] the test indices.
    # n_splits=1, so this loop body runs exactly once.
    # NOTE(review): X[...] / y[...] index by label if these are pandas Series;
    # that only matches the positional split indices for a default index.
    for index in sss.split(X, y):
        X_treino, X_teste = X[index[0]], X[index[1]]
        y_treino, y_teste = y[index[0]], y[index[1]]
    # build the corpus of the training set
    base_treino = criar_base_treino(exp, X_treino, y_treino, diretorio, stopwords)
    # build the vocabulary
    vocab = extrair_vocabulario(base_treino, freq_min, stopwords, op_stopwords, op_ica, op_tesauro)
    # train the legal-domain ("jur") models
    w2v_jur, ftt_jur, glv_jur = treinar_modelos_jur(
        X_treino, X_teste, y_treino, y_teste, vocab, diretorio, exp, op_tam_vec)
    # create document representations by summing word vectors
    bs = criar_representacoes_soma_jur(X_teste, y_teste, vocab, diretorio, w2v_jur, ftt_jur, glv_jur, exp, op_tam_vec)
    criar_representacoes_soma_ger(vocab, diretorio, w2v_geral, ftt_geral, glv_geral, exp, op_tam_vec, bs)
    ###### DOC2VEC ####
    print('--------- Treinando doc2vec do experimento ' + str(exp) + ' ---------')
    # os.mkdir raises if the directory already exists or 'resultados/' is missing.
    os.mkdir('resultados/' + dir_experimento)
    corpus = "dados/" + dir_experimento + "/base_treino_glv.txt"
    model = Doc2Vec(corpus_file=corpus, vector_size=100, window=5, min_count=1, workers=8)
    model.save("dados/" + dir_experimento + "/doc2vec_jur.model")
    print(
        '--------- Inferindo vetores para docs de teste do experimento ' + str(exp) + ' ---------')
    base_teste = pd.read_csv("dados/" + dir_experimento + "/vetores_teste.csv")
    # One normalized (1, vector_size) Doc2Vec vector per test document.
    base_teste['doc2vec_jur'] = [
        normalize(model.infer_vector(x[0].split(' ')).reshape(1, -1)) for x in base_teste.teores
    ]
    # NOTE(review): this writes to 'dados/experimento_<exp>/' while the file is
    # read back from 'dados/<dir_experimento>/' -- presumably the same
    # directory; verify.
    base_teste.to_csv('dados/experimento_' + str(exp) + '/vetores_teste.csv', index=False)
    df = pd.read_csv('dados/' + dir_experimento + '/vetores_teste.csv')
    # NOTE(review): str + Index concatenates element-wise, so this prints an
    # Index whose entries are each prefixed with the banner text.
    print('++++++ modelos ++++++ ' + df.iloc[:, 3:].columns)
    # Every column from the 4th onward is treated as one model's vectors.
    for modelo in df.iloc[:, 3:].columns:
        ##### CLUSTERING ###############
        print('--------- Agrupando dados para o modelo ' + modelo + ' no experimento' + str(exp) + ' ---------')
        # The CSV stores each vector as text; convert it back to an array.
        df[modelo] = df[modelo].apply(lambda x: converter_string_array(x))
        X_kmeans = np.stack(df[modelo])
        # Drop the middle axis of the stacked array (keeps axes 0 and 2).
        X_kmeans = X_kmeans.reshape(X_kmeans.shape[0], X_kmeans.shape[2])
        y_kmeans = df['assunto']
        le.fit(y_kmeans)
        y_kmeans = le.transform(y_kmeans)
        lista_scores_k = computar_scores_agrupamento(
            X_kmeans, y_kmeans, dir_experimento, modelo, lista_k)
        #gerar_graficos_kmeans(lista_scores_k, dir_experimento, modelo)
        np.save(
            'resultados/' + dir_experimento + '/' + modelo + '_lista_scores_k.npy', lista_scores_k)
        print('****** dados de agrupamento do modelo ' + modelo + 'salvos.')
        ##### SIMILARITY MATRICES ##############
        print('--------- executando analyzer para experimento ' + str(exp) + ' ---------')
        sim_m = calc_matriz_sim(df[modelo], dir_experimento)
        calcular_sim_assuntos(df['assunto'], sim_m, df[modelo].name, dir_experimento)
        # NOTE(review): assumed to close the current matplotlib figure once per
        # model; the original nesting of this call could not be confirmed.
        plt.close()
df = test_data.fillna(np.mean(train_data['Age'])) scaled_data = scaler.transform(df[['Age', 'Fare']]) df[['Age', 'Fare']] = scaled_data for var in categorical: df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], axis=1) del df[var] testdf = df test_data = df.to_numpy() train_labels = train_data_dropped[:, 0] train_data_dropped = train_data_dropped[:, 1:] ### Running classification acc, val_acc, loss, val_loss = [], [], [], [] ## Running k-folds classification to improve generalization and reduce overfitting K = StratifiedShuffleSplit(10, train_size=0.6) for train_index, test_index in K.split(train_data_dropped, train_labels): x_train, y_train = train_data_dropped[train_index], train_labels[ train_index] x_valid, y_valid = train_data_dropped[test_index], train_labels[ test_index] # ##Only need to balance the training data, not the validation data. # x_train, x_valid, y_train, y_valid = train_test_split(train_data_dropped, # train_labels, test_size=0.2, # shuffle= True) y_train = pd.get_dummies(y_train).to_numpy() y_valid = pd.get_dummies(y_valid).to_numpy() history = model.fit( x_train, y_train,
######################### Creating a Training + Test Set #########################
# Method 4: scikit-learn's train_test_split (recommended in general).
# A one-liner that accepts several datasets at once and takes a random_state,
# so the same rows end up in the training set on every run.
train_set_04, test_set_04 = train_test_split(housing, test_size=0.2, random_state=42)

######################### Creating a Training + Test Set #########################
# Method 5: scikit-learn with stratification.
# Method 4 is fine for large data; with a small dataset a purely random split
# can introduce sampling bias, so here the split preserves the proportions of
# an important feature that has been bucketed into categories.

# Bucket the samples by income: ceil(median_income / 1.5), capped at 5.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the capped column back explicitly. `inplace=True` on a column selected
# with housing[...] is chained mutation, which pandas warns about and which has
# no effect on the frame under copy-on-write.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

# Single stratified split (n_splits=1), so take the only pair directly.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))
# split() yields positional indices, so select rows by position (.iloc);
# label-based .loc is only equivalent when the index is the default 0..n-1.
train_set_05 = housing.iloc[train_index]
test_set_05 = housing.iloc[test_index]

# Distribution of the income categories (shown when run interactively).
housing["income_cat"].value_counts() / len(housing)
print()
# Feature columns are the first `num_data_col` entries of every survey row;
# the choice (target) columns follow immediately after them.
x_data = np.array([row[:num_data_col] for row in survey_data])
y_data = np.array([
    row[num_data_col:num_data_col + num_choice_col] for row in survey_data
])
x_headers = list(survey_headers[1:6])
print('x-shape: ' + str(x_data.shape))
print('y-shape: ' + str(y_data.shape))

# ---------------------------------------------
#%%
# Stratified random partition into training and dev sets with scikit-learn.
# Stratification matters because the vehicle-choice dataset is very unbalanced.
trainPerc = 0.95  # deep learning uses much higher percentages for training
devePerc = 0.05
sss = StratifiedShuffleSplit(n_splits=1, train_size=trainPerc, test_size=devePerc)
train_indices, deve_indices = next(sss.split(x_data, y_data))
num_train_rows = len(train_indices)  # needed later on

# Create the partitions by row selection.
x_vals_train = x_data[train_indices]
y_vals_train = y_data[train_indices]
x_vals_deve = x_data[deve_indices]
y_vals_deve = y_data[deve_indices]
print("num_train_rows: %u, num_deve_rows: %u" % (num_train_rows, len(deve_indices)))

# ---------------------------------------------
#%%
# Training setup
a_stdv = 0.1  # standard deviation used to initialize the node weights
# Masker shared by the analysis: grey-matter mask, 6 mm smoothing,
# standardized and detrended signals, results cached through `mem`.
masker = MultiNiftiMasker(
    mask_img=gm_mask,
    target_shape=shape,
    target_affine=affine,
    smoothing_fwhm=6.,
    standardize=True,
    detrend=True,
    mask_strategy='epi',
    memory=mem,
    memory_level=2,
    n_jobs=2,
    verbose=10,
)

##############################################################################
# Cross Validator
# ---------------
from sklearn.model_selection import StratifiedShuffleSplit

n_iter = 100
classes = phenotypic
# Integer-encode the class labels (index into the sorted unique values).
_, labels = np.unique(classes, return_inverse=True)
cv = StratifiedShuffleSplit(n_splits=n_iter, test_size=0.25, random_state=0)

##############################################################################
# Functional Connectivity Analysis model
# ---------------------------------------
from model import LearnBrainRegions

connectomes = ['correlation', 'partial correlation', 'tangent']

############################################################################
# Gather results - Data structure
columns = [
    'atlas',
    'measure',
    'classifier',
    'scores',
    'iter_shuffle_split',
    'n_regions',
    'smoothing_fwhm',
    'dataset',
    'compcor_10',
    'motion_regress',
    'dimensionality',
    'connectome_regress',
    'scoring',
    'region_extraction',
    'multi_pca_reduction',
    'reduction_n_components',
    'covariance_estimator',
    'min_region_size_in_mm3',
]
50) # 返回在对数刻度上均匀间隔的数字 for i in gamma_range: clf = SVC(kernel="rbf", gamma=i, cache_size=5000).fit(Xtrain, Ytrain) score.append(clf.score(Xtest, Ytest)) print(max(score), gamma_range[score.index(max(score))]) plt.plot(gamma_range, score) plt.show() from sklearn.model_selection import StratifiedShuffleSplit from sklearn.model_selection import GridSearchCV time0 = time() gamma_range = np.logspace(-10, 1, 20) coef0_range = np.linspace(0, 5, 10) param_grid = dict(gamma=gamma_range, coef0=coef0_range) cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=420) grid = GridSearchCV(SVC(kernel="poly", degree=1, cache_size=5000), param_grid=param_grid, cv=cv) grid.fit(X, y) print("The best parameters are %s, score = %0.5f" % (grid.best_params_, grid.best_score_)) print(datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f")) # 调参C score = [] C_range = np.linspace(0.01, 30, 50) for i in C_range: clf = SVC(kernel="linear", C=i, cache_size=5000).fit(Xtrain, Ytrain) score.append(clf.score(Xtest, Ytest)) print(max(score), C_range[score.index(max(score))]) plt.plot(C_range, score) plt.show()
test_predictions, average='weighted') recall = recall_score(test_labels, test_predictions, average='weighted') f1 = 2.0 * (precision * recall) / (precision + recall) print("Test Precision: %.4f" % (precision)) print("Test Recall: %.4f" % (recall)) print("Test f1_score: %.4f" % (f1)) return accuracy, precision, recall, f1 filename = sys.argv[1] X_data, Y_data = load_csv(filename) sss = StratifiedShuffleSplit(n_splits=5, test_size=0.125) metrics = [] fold = 1 for train_indices, test_indices in sss.split(X_data, Y_data): train_data, test_data = X_data[train_indices], X_data[test_indices] train_labels, test_labels = Y_data[train_indices], Y_data[test_indices] metrics.append(SVM(train_data, train_labels, test_data, test_labels)) fold += 1 accuracy = 0.00 precision = 0.00 recall = 0.00 fi = 0.00 for i in metrics: accuracy += i[0] precision += i[1]
avgFlakyTest /= successFold avgNonFlakyTest /= successFold return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest, avgP, avgR, storage, avgTPrep, avgTPred) if __name__ == "__main__": projectBasePath = "dataset" projectName = "pinto-ds" outDir = "results/" os.makedirs(outDir, exist_ok=True) numSplit = 30 testSetSize = 0.2 kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize) # DISTANCE outFile = "params-distance.csv" with open(os.path.join(outDir, outFile), "w") as fo: fo.write( "distance,k,sigma,eps,precision,recall,storage,preparationTime,predictionTime\n" ) k = 7 sigma = 0.5 dim = 0 # number of dimensions (0: JL with error eps) eps = 0.3 # JL eps params = {"algorithm": "brute", "metric": "cosine", "weights": "uniform"} for metric in ["cosine", "euclidean"]: for k in [3, 7]:
def stratify(housing):
    """Split *housing* 80/20 into train and test sets, stratified by income.

    Parameters
    ----------
    housing : pandas.DataFrame
        Must contain an ``"income_cat"`` column; its class proportions are
        preserved in both resulting sets.

    Returns
    -------
    tuple
        ``(strat_train_set, strat_test_set)``, both row subsets of *housing*.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1: take the single (train, test) index pair directly.
    train_index, test_index = next(split.split(housing, housing["income_cat"]))
    # split() yields positional indices, so rows must be selected by position.
    # Label-based .loc picks wrong rows (or raises KeyError) whenever the frame
    # does not carry the default 0..n-1 index, e.g. after filtering.
    return housing.iloc[train_index], housing.iloc[test_index]
df1['label'] = 'BENIGN' df2 = pd.read_csv('../results/dataset_dos.csv') df2['label'] = 'dos' df3 = pd.read_csv('../results/dataset_hb.csv') df3['label'] = 'heartbleed' frames = [df1, df2, df3] print('join datasets') df = pd.concat(frames) print('separate y') X = df.drop(columns=['label']) y = df['label'].values print('StratifiedShuffleSplit') sss = StratifiedShuffleSplit(n_splits=1, test_size=110000, random_state=1) print('split') print(sss.get_n_splits(X, y)) list = [] for train_index, test_index in sss.split(X, y): for index in test_index: list.append(df.iloc[index].values) dts = pd.DataFrame(list, columns=df.columns) dts = df.drop(columns=['ipsrc', 'ipdst']) print('saving') dts.to_csv("../results/dataset_110000.csv", sep=',', encoding='utf-8',
def prepareData():
    """Build train/validation/test arrays from the tweet CSV and pickle them.

    Loads the word2vec model, maps every tweet to a padded sequence of
    vocabulary indices (0 for unknown words), one-hot encodes ``Rating_m`` and
    performs a stratified 80/10/10 train/validation/test split. The resulting
    arrays, the original row indices of each partition and the size metadata
    are pickled to ``saved_data_filename``.

    Relies on module-level configuration: ``wiki_model_path``,
    ``wiki_model_name``, ``seed`` and ``saved_data_filename``.

    Raises
    ------
    IOError
        If the word2vec model file is not present in ``wiki_model_path``.
        (Previously this case only printed a message and then crashed on the
        unbound ``model`` variable.)
    """
    if wiki_model_name not in os.listdir(wiki_model_path):
        raise IOError(
            "Word2vec model not found in {}".format(wiki_model_path))
    model = gensim.models.KeyedVectors.load(
        os.path.join(wiki_model_path, wiki_model_name))
    vec_len = len(model['a'])
    print("Word2vec Vector length {}".format(vec_len))

    df = pd.read_csv(
        "/datadrive/Sahil/code/GL/fewTrails/twitter/twitter_training.csv")
    sentences_len = [len(str(s).split()) for s in df['text']]
    max_len = max(sentences_len) + 20  # 20 margin
    print("Max Sentence length {}".format(max_len))

    V_index_dict = getIndexedDict(model)
    vocab_size = len(V_index_dict)
    # NOTE(review): the result is unused here; the call is kept in case
    # getEmbeddings has side effects (e.g. caching) -- confirm and drop if not.
    embedding_weights = getEmbeddings(vocab_size, vec_len)

    # Token -> vocabulary index, 0 for out-of-vocabulary words.
    data_X = [
        [V_index_dict[word] if word in V_index_dict else 0
         for word in word_tokenize(str(sen))[:max_len]]
        for sen in df.text
    ]
    # pad_sequences accepts the ragged list of lists directly; wrapping it in
    # np.array first fails on NumPy versions that reject ragged input.
    data_X = sequence.pad_sequences(data_X, maxlen=max_len)

    y = df.Rating_m
    y = to_categorical(y, num_classes=None)
    print(y)
    print("Shape of Y{}".format(y.shape))

    # First split: 80% train / 20% held out.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    train_index, test_index1 = next(sss.split(data_X, y))
    print("TRAIN:", train_index, "TEST:", test_index1)
    print("TRAIN:", len(train_index), "TEST:", len(test_index1))
    X_train, X_test = data_X[train_index], data_X[test_index1]
    y_train, y_test = y[train_index], y[test_index1]

    # Second split: the held-out 20% is halved into validation and test.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)
    val_index, test_index2 = next(sss.split(X_test, y_test))
    print("TRAIN:", val_index, "TEST:", test_index2)
    print("TRAIN:", len(val_index), "TEST:", len(test_index2))
    X_val, X_test = X_test[val_index], X_test[test_index2]
    y_val, y_test = y_test[val_index], y_test[test_index2]

    data = {
        "X_train": X_train,
        "X_test": X_test,
        "X_val": X_val,
        "y_train": y_train,
        "y_test": y_test,
        "y_val": y_val,
        "train_index": train_index,
        # Map second-level positions back to row positions in the full data.
        "test_index": test_index1[test_index2],
        "val_index": test_index1[val_index],
        "max_len": max_len,
        "vec_len": vec_len,
        "vocab_size": vocab_size,
    }
    # Context manager so the file is flushed and closed deterministically.
    with open(saved_data_filename, 'wb') as fh:
        pickle.dump(data, fh)
# Balance the classes: keep only as many negative samples as there are
# positive ones.
neg_data = neg_data[:len(pos_data)]
neg_label = neg_label[:len(pos_data)]
#trace_data, trace_label = load_data("data/relevant_documents/english", 1)
#trace_data = np.array(trace_data)
#trace_label = np.array(trace_label)
#
print('split')
# Concatenate positives followed by negatives (data and labels in the same
# order, so they stay aligned).
all_data = []
all_data.extend(pos_data + neg_data)
all_labels = []
all_labels.extend(pos_label + neg_label)
# Python 2 print statement: shows the label count and the sample count.
print len(all_labels), len(all_data)
all_data = np.array(all_data)
all_labels = np.array(all_labels)
print('split')
# Five stratified 80/20 shuffles with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
print('split')
idx = 0
# Training hyper-parameters and checkpoint path; not used within this span.
batch_size = 64
num_classes = 2
epochs = 5
filepath = "uk_best.hdf5"
#filepath="weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
for train_index, test_index in sss.split(all_data, all_labels):
    X_train, X_test = all_data[train_index], all_data[test_index]
    y_train, y_test = all_labels[train_index], all_labels[test_index]
    # Second reference to the test labels under another name.
    y_f1_test = y_test
model_results = []
iterations = 100

# Random forest evaluated over repeated stratified train/test splits.
# - oob_score must be a real boolean: the string 'TRUE' only worked by being
#   truthy and is rejected by scikit-learn's parameter validation.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself
#   was deprecated and later removed from scikit-learn.
model = RandomForestClassifier(n_jobs=-1,
                               random_state=55,
                               min_samples_split=20,
                               n_estimators=500,
                               max_features='sqrt',
                               min_samples_leaf=20,
                               oob_score=True)
modelname = 'RF'

# Make 'iterations' index vectors for the train-test split.
# random_state=None: the splits differ from run to run.
sss = StratifiedShuffleSplit(n_splits=iterations,
                             test_size=0.33,
                             random_state=None)

# Per-iteration metrics, in-sample ("is") and out-of-sample ("oos").
accuracy_scores_is = []
accuracy_scores_oos = []
precision_scores_is = []
precision_scores_oos = []
recall_scores_is = []
recall_scores_oos = []
f1_scores_is = []
f1_scores_oos = []

# Initialize the (binary, 2x2) confusion matrices accumulated over iterations
cm_sum_is = np.zeros((2, 2))
cm_sum_oos = np.zeros((2, 2))
def classify(
    X,
    y,
    verbose=False,
    nfolds=2,
    dim_red=None,
    n_components=[5, 10, 20],
    scale=True,
    fs=None,
    njobs=1,
    LR_C=[0.01, 0.1, 1, 10, 100],
    LR_class_weight=[None, "balanced"],
    SVC_C=[0.01, 0.1, 1, 10, 100],
    SVC_class_weight=[None, "balanced"],
    SVC_kernels=["rbf", "linear", "poly"],
    n_estimators=[10, 20, 30],
    max_features=["auto", "log2", None],
    **kwargs
):
    """Compare several classifiers with nested shuffle-split cross-validation.

    For every model, an outer StratifiedShuffleSplit produces train/test
    partitions; on each training partition a GridSearchCV (using an inner
    StratifiedShuffleSplit) tunes a pipeline made of optional scaling,
    optional dimensionality reduction, a feature-selection step and the
    classifier. Accuracy on the outer test partition is recorded.

    Parameters
    ----------
    X, y : indexable by integer arrays (e.g. numpy arrays)
        Samples and labels.
    verbose : bool
        Print the call arguments, the parameter grids, the best parameters per
        outer fold and a final summary.
    nfolds : int
        Number of splits for both the inner and the outer splitter.
    dim_red : None, "pca" or "ica"
        Dimensionality-reduction step to prepend.
    n_components : list
        Grid for the reduction's ``n_components`` and RFE's
        ``n_features_to_select``.
    scale : bool
        Prepend a RobustScaler.
    fs : None, "l1" or "rfe"
        Feature-selection method.
    njobs : int
        ``n_jobs`` for GridSearchCV.
    LR_C, LR_class_weight, SVC_C, SVC_class_weight, SVC_kernels,
    n_estimators, max_features : list
        Hyper-parameter grids of the respective classifiers.
    **kwargs
        Accepted but not used in the body.

    Returns
    -------
    (results, models)
        ``results`` holds one list of outer-fold accuracies per model, in the
        order of ``models`` (a list of ``(name, estimator, param_grid)``).

    NOTE(review): the list defaults are shared between calls (mutable default
    arguments); they are only read here, never mutated.
    """
    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect

        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        print 'function name "%s"' % inspect.getframeinfo(frame)[2]
        # args[2:] skips X and y so the data itself is not printed
        for i in args[2:]:
            print "    %s = %s" % (i, values[i])

    # prepare configuration for cross validation test harness
    seed = 8
    # prepare models
    models = []
    # all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models.append(
        (
            "LR",
            LogisticRegression(multi_class="multinomial", solver="newton-cg"),
            {"C": LR_C, "class_weight": LR_class_weight},
        )
    )
    models.append(("LDA", LinearDiscriminantAnalysis(), {}))
    models.append(("RndFor", RandomForestClassifier(), {"n_estimators": n_estimators, "max_features": max_features}))
    models.append(("NB", GaussianNB(), {}))
    models.append(("SVC", SVC(), {"C": SVC_C, "class_weight": SVC_class_weight, "kernel": SVC_kernels}))
    # Two dummy baselines for reference
    models.append(("Most frequent", DummyClassifier(strategy="most_frequent"), {}))
    models.append(("Stratified", DummyClassifier(strategy="stratified"), {}))

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print "Trying these parameters:"
        for m in models:
            print m[0], ":", m[2]

    # evaluate each model in turn
    results = []
    names = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)
        outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)
        # # do this if no shuffling is wanted
        # inner_cv = StratifiedKFold(n_splits=num_folds, random_state=seed)
        # outer_cv = StratifiedKFold(n_splits=num_folds, random_state=seed)
        steps = [("clf", model)]
        # Grid keys must be prefixed with the pipeline step name ("clf__C", ...)
        pipe_params = {}
        for key, val in params.iteritems():
            key_name = "clf__%s" % key
            pipe_params[key_name] = val
        # NOTE(review): `fs` is rebound from the option string to an estimator
        # object here, so from the second model onward neither branch matches
        # and the selector built for the first model is reused -- confirm this
        # is intended.
        if fs == "l1":
            lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)
            fs = feature_selection.SelectFromModel(lsvc)
        elif fs == "rfe":
            fs = feature_selection.RFE(estimator=model)
            pipe_params["feat_sel__n_features_to_select"] = n_components
        # NOTE(review): assumed to run unconditionally (the original nesting
        # could not be confirmed); with fs=None the step is ("feat_sel", None).
        steps = [("feat_sel", fs)] + steps
        if dim_red is not None:
            if dim_red == "pca":
                dr = decomposition.PCA()
                pipe_params["dim_red__n_components"] = n_components
            elif dim_red == "ica":
                dr = decomposition.FastICA()
                pipe_params["dim_red__n_components"] = n_components
            steps = [("dim_red", dr)] + steps
        if scale:
            steps = [("scale", preprocessing.RobustScaler())] + steps
        # Steps are prepended, so the final order is:
        # scale -> dim_red -> feat_sel -> clf
        pipe = Pipeline(steps)
        cv_results = []
        cnt = 0
        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            # Hyper-parameters are tuned on the training partition only
            opt_model = GridSearchCV(estimator=pipe, param_grid=pipe_params, verbose=0, n_jobs=njobs, cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose:
                if len(params.keys()) > 0:
                    print "Best paramaters for", name, " (%d/%d):" % (cnt + 1, outer_cv.n_splits)
                    print opt_model.best_params_
            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
            cnt += 1
        results.append(cv_results)
        names.append(name)
    if verbose:
        print "\n======"
        for model, res in zip(models, results):
            msg = "%s: %f (%f)" % (model[0], np.mean(res), np.std(res))
            print (msg)
        # Chance level assuming equally likely classes
        print "Chance: %f" % (1 / float(len(np.unique(y))))
        print "======\n"
    return results, models
HOUSING_PATH = "datasets/housing" HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz" def load_housing_data(housing_path=HOUSING_PATH): csv_path = os.path.join(housing_path, "housing.csv") return pd.read_csv(csv_path) housing = load_housing_data() from sklearn.model_selection import StratifiedShuffleSplit housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) for train_index, test_index in split.split(housing, housing["income_cat"]): strat_train_set = housing.loc[train_index] strat_test_set = housing.loc[test_index] housing = strat_train_set.drop("median_house_value", axis=1) housing_labels = strat_train_set["median_house_value"].copy() housing_num = housing.drop("ocean_proximity", axis=1) rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6 class CombinedAttributesAdder(BaseEstimator, TransformerMixin): def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs self.add_bedrooms_per_room = add_bedrooms_per_room def fit(self, X, y=None):
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Evaluate *clf* with stratified shuffle-split CV and print the metrics.

    The confusion-matrix counts are accumulated over all folds; accuracy,
    precision, recall, F1 and F2 are then computed from the totals and printed
    using ``PERF_FORMAT_STRING`` / ``RESULTS_FORMAT_STRING``.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``
        Refitted on every fold.
    dataset, feature_list
        Passed to ``featureFormat``; the first feature is split off as the
        label by ``targetFeatureSplit``.
    folds : int
        Number of shuffle-split iterations.

    Returns
    -------
    None
        Results are printed. If a metric is undefined (zero denominator) an
        explanatory message is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        # features / labels are plain lists, so gather the rows by index.
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                # Non-binary prediction or label: warn once for this fold and
                # skip the remainder of its predictions.
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Only a zero denominator is expected here; any other error (e.g. a bad
        # format string below) must surface instead of being reported as a
        # divide-by-zero.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
    else:
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
# Earlier per-word counting approach, kept for reference:
#for word in dictionary:
#    def countTokens(tokens):
#        return tokens.count(word)
#    data[word] = data['tokens'].apply(countTokens)
#data.drop("tokens", axis = 1, inplace = True)

print('counting tokens by file')
data['tok_array'] = data['tokens'].apply(createTokenArray)
print('saving data')
# The CSV is written before the raw "tokens" column is dropped, so the saved
# file still contains it.
data.to_csv('data.csv', sep=',', encoding='utf-8')
data.drop("tokens", axis = 1, inplace = True)

# Single stratified 80/20 split on the "type" column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(data, data["type"]))
# split() yields positional indices, so select rows by position (.iloc);
# label-based .loc is only equivalent for a default 0..n-1 index.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]


def type_proportions(data):
    """Return the relative frequency of each value of the "type" column."""
    return data["type"].value_counts() / len(data)


# Compare class proportions of the full data with both stratified subsets.
compare_props = pd.DataFrame({
    "Overall": type_proportions(data),
    "Stratified": type_proportions(strat_train_set),
    "Stratified-test": type_proportions(strat_test_set),
}).sort_index()
# Relative deviation (in percent) of each subset from the overall proportions.
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props["Strat. test %error"] = 100 * compare_props["Stratified-test"] / compare_props["Overall"] - 100
def processtarget(inp):
    """Benchmark RF, probabilistic RF and RF regression for one target.

    ``inp`` is a ``(uniprot, infile)`` pair. The function builds an
    active/inactive dataset for the target, pads it with inactives from
    archive files and, if still short, with compounds from ``getfp``. For each
    stratified split it then trains three models and collects their
    predictions for every combination of training-label and test-label
    uncertainty level.

    Reads the module-level globals ``thresh``, ``uniprot_egid`` and
    ``path_to_pidgin_inactives``.

    Returns
    -------
    None
        If the input cannot be processed (``TypeError`` from ``processfile``),
        there are fewer than 100 rows or fewer than 100 actives, or a
        ``ValueError`` occurs during evaluation.
    tuple
        Otherwise ``([uniprot, nact, ninact, nse, nscaf],
        [y_binary, y_lab_raw, y_lab, base_predicted1, base_predicted2,
        base_predicted3], per_fold)``.
    """
    global thresh
    activity_threshold = thresh
    # Index -> uncertainty level: 0.0, 0.1, ..., 0.8
    sdict = {idx:i for idx, i in enumerate([round(float(i),2) for i in np.arange(0,.9,0.1)])}
    uniprot,infile = inp
    # Average duplicate measurements per SMILES before featurizing.
    try: matrix,active_scaf,pactivity = processfile(infile.groupby('smiles').mean().reset_index()[['smiles','pchembl_value']].values,file=True)
    except TypeError: return
    if len(matrix) < 100: return
    # Binary activity label per compound.
    vector = [1 if x >= activity_threshold else 0 for x in pactivity]
    sfvector = []
    #set up cdf for bioactivity scale
    # One label vector per uncertainty level; level 0.0 uses the hard labels.
    for standard_deviation_threshold in sorted(sdict.values()):
        if standard_deviation_threshold == 0.0: sfvector.append(vector)
        else:
            reweighted = convertPvalue(pactivity,activity_threshold,standard_deviation_threshold)
            sfvector.append(reweighted)
    #process the inactive set
    if sum(vector) < 100: return
    print(uniprot)
    nact = sum(vector)
    ninact = len(vector)-sum(vector)
    conf_smiles = []
    egids = uniprot_egid.get(uniprot)
    if egids != None:
        for egid in egids:
            # Best effort: any failure to read an archive is silently skipped.
            try:
                with zipfile.ZipFile(path_to_pidgin_inactives + egid + '.smi.zip') as z:
                    conf_smiles += [i.split(' ')[0] for i in z.open(egid + '.smi').read().decode('UTF-8').splitlines()]
            except: pass
    # Number of additional inactives required: twice the actives, clamped to
    # [1000, 2000], minus the inactives already present (never negative).
    req = nact * 2
    if req < 1000: req = 1000
    if req > 2000: req = 2000
    req -= ninact
    if req < 0: req = 0
    conf_inactives, inactive_scaf = [], []
    #sample inactives if necessary
    if len(conf_smiles) > 0:
        random.seed(2)
        random.shuffle(conf_smiles)
        try:
            random.seed(2)
            conf_inactives,inactive_scaf = calcFingerprints_array(random.sample(conf_smiles,req))
        except ValueError:
            # random.sample raises ValueError when req exceeds the population
            # size; fall back to using every available SMILES.
            conf_inactives,inactive_scaf = calcFingerprints_array(conf_smiles)
    conf_smiles = []
    vector2 = []
    # Append sampled inactives until the requirement is met.
    for i in conf_inactives:
        if req > 0:
            matrix.append(i)
            vector2.append(0)
            req-=1
    conf_inactives = None
    ninact += len(vector2)
    nse = 0
    # Still short: fill the remainder with compounds from getfp.
    if req > 0:
        vector2 += [0] * req
        random_bg, random_scaf = getfp(req)
        nse = len(random_bg)
        matrix += random_bg
        inactive_scaf += random_scaf
        del random_bg, random_scaf
    all_scafs = active_scaf+inactive_scaf
    del active_scaf, inactive_scaf
    # Map each distinct scaffold to an integer group id.
    scaf_dict = {s[0]:s[1] for s in zip(set(all_scafs),range(0,len(set(all_scafs)),1))}
    all_scafs = [scaf_dict[sca] for sca in all_scafs]
    nscaf = len(scaf_dict.keys())
    # NOTE(review): `vector` is the same list object as sfvector[0], so this
    # in-place extension also extends sfvector[0], and the comprehension two
    # lines below appends vector2 to it a second time.
    vector += vector2
    pactivity = np.array(pactivity + [0] * len(vector2), dtype=np.float32)
    sfvector = [s+vector2 for s in sfvector]
    vector2 = None
    matrix = np.array(matrix, dtype=np.uint8)
    vector = np.array(vector, dtype=np.uint8)
    sfvector = [np.array(s) for s in sfvector]
    # Both splitters train on 25% and test on 75% of the data.
    skf = StratifiedShuffleSplit(n_splits=3, random_state=2, test_size=0.75, train_size=0.25)
    lso = GroupShuffleSplit(n_splits=3, random_state=2, test_size=0.75, train_size=0.25)
    base_predicted1, base_predicted2, base_predicted3 = [], [], []
    y_lab, y_lab_raw, y_binary = [], [], []
    per_fold=[]
    try:
        #remove '[:1]' to enable scaffold splitting
        for split_method, split_name in [(skf,0),(lso,1)][:1]:
            #for each splitting method, perform the evaluation
            for train, test in split_method.split(matrix,vector,groups=all_scafs):
                x, y, X_test,Y_binary, Y_raw = matrix[train], vector[train], matrix[test], vector[test], pactivity[test]
                class_weights = class_weight.compute_class_weight('balanced',np.unique(y),y)
                # Per-sample weight taken from the sample's class weight.
                sw = np.array([class_weights[1] if i == 1 else class_weights[0] for i in y])
                rfc = RandomForestClassifier(n_jobs = 1, n_estimators=200, class_weight='balanced', random_state=2)
                ###### ###### ###### ###### ###### ###### ###### ###### ######
                # Baseline RF: trained once per fold on the hard labels.
                brfc=sklearn.base.clone(rfc)
                brfc.fit(x,y,sample_weight=sw)
                #for each emulated experimental error, generate predictions
                for sidx,ystrain in enumerate(sfvector):
                    sw2 = ystrain[train]
                    # Two-column label probabilities: [1 - value, value].
                    py=np.zeros([len(sw2),2])
                    py[:,1] = sw2
                    py[:,0] = 1-py[:,1]
                    prfc = prf(n_estimators=200, bootstrap=True, keep_proba=0.05)
                    prfc.fit(X=x.astype(float), py=py.astype(float))
                    # Regressor fitted on the same label values as targets.
                    rfr = RandomForestRegressor(n_jobs = 1, n_estimators=200, random_state=2)
                    rfr.fit(x,sw2)
                    p_prfc = [round(pr,3) for pr in list(np.array(prfc.predict_proba(X=X_test.astype(float)))[:,1])]
                    p_brfc = [round(pr,3) for pr in list(brfc.predict_proba(X_test)[:,1])]
                    p_rfr = [round(pr,3) for pr in list(np.array(rfr.predict(X_test)))]
                    # Pair the predictions with the test labels of every
                    # uncertainty level.
                    for sidx2, ystest in enumerate(sfvector):
                        y_test=list(ystest[test])
                        # baseline RF classifier output
                        base_predicted1 += p_brfc
                        # probabilistic RF (prf) output
                        base_predicted2 += p_prfc
                        # RF regressor output
                        base_predicted3 += p_rfr
                        y_lab_raw += list(Y_raw)
                        y_lab += list(y_test)
                        y_binary += list(Y_binary)
                        # Record the block size plus (split, train level, test
                        # level) so callers can slice the flat lists per fold.
                        per_fold.append([len(y_test),[split_name,sdict[sidx],sdict[sidx2]]])
    except ValueError: return
    return [uniprot,nact,ninact,nse,nscaf], [y_binary,y_lab_raw,y_lab,base_predicted1,base_predicted2,base_predicted3], per_fold
return img_data, np.array(_2d_images) train, labels, test, classes = encode(train, test) train = train.values img_data, _2d_images = load_image_data() ##plt.imshow(_2d_images[0])#, interpolation='nearest') ##plt.show() #img_data = np.array(img_data) ##img_data = img_data.reshape(1584, rows,cols) ##print("data loaded") ##input() # splittrain data into train and validation sss = StratifiedShuffleSplit(test_size=0.2, random_state=23) for train_index, valid_index in sss.split(train, labels): X_train, X_valid = train[train_index], train[valid_index] y_train, y_valid = labels[train_index], labels[valid_index] X_train_img, X_valid_img = img_data[train_index], img_data[valid_index] X_train_2dimg, X_valid_2dimg = _2d_images[train_index], _2d_images[ valid_index] X_test = test.values print("Done") import keras from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense, Dropout, Flatten
# %%
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from utils import get_data, plot_prediction_samples
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics

# %%
imgs, labels = get_data(as_gray=False)

# Test-side indices of the first stratified 80/20 shuffle.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
super_idx = next(sss.split(imgs, labels))[1]

# Stratified 80/20 hold-out; the held-out images themselves are discarded
# and only the held-out labels are kept.
X, _, y, y_super_test = train_test_split(
    imgs,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Saved predictions of each model on the training portion ...
canny_svm_preds = np.load('data/canny_svm_train_preds.npy', allow_pickle=True)
hog_svm_preds = np.load('data/hog_svm_train_preds.npy', allow_pickle=True)
cnn_preds = np.load('data/cv_cnn_train_preds.npy', allow_pickle=True)
transfer_preds = np.load('data/transfer_cnn_train_preds.npy', allow_pickle=True)
# ... and the CV-CNN predictions saved under the "super" file name.
bagged_cnn_preds = np.load('data/cv_cnn_super_preds.npy', allow_pickle=True)
array([ 0.938..., 0.963..., 0.944...]) """ testing = 1 fileDir = os.path.join(os.getcwd(), 'MicroMaster', 'AI', 'week7ML', 'input3.csv') input_data = np.genfromtxt(fileDir, delimiter=',', skip_header=1) X = input_data[:, :2] y = input_data[:, 2] if testing: print(X) test_size = 0.4 random_state = 0 n_splits = 5 cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state) # SVM with Linear Kernel # https://stats.stackexchange.com/questions/31066/what-is-the-influence-of-c-in-svms-with-linear-kernel # https://stats.stackexchange.com/questions/73032/linear-kernel-and-non-linear-kernel-for-support-vector-machine """kernel : string, optional (default=’rbf’) Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples).""" kernel = 'linear' C = [0.1, 0.5, 1, 5, 10, 50, 100] param_grid = dict(C=C) grid = GridSearchCV(SVC(kernel=kernel), param_grid=param_grid, cv=cv) grid.fit(X, y)
# All the numerical features in the dataset
# print(df.describe())

# plotting histogram for features
df.hist()
plt.tight_layout()
plt.show()

# splitting data into train-test -- normal (purely random) split
train_set, test_set = train_test_split(df, test_size=0.25, random_state=42)

# stratified split
# adding income category feature for stratified splitting:
# ceil(median_income / 1.5), capped at 5
df['income_category'] = np.ceil(df['median_income'] / 1.5)
# Assign the capped column back explicitly; `inplace=True` on a column selected
# with df[...] is chained mutation, which pandas warns about and which has no
# effect on the frame under copy-on-write.
df['income_category'] = df['income_category'].where(df['income_category'] < 5, 5.0)

# stratified splitting (single split, so take the only index pair directly)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_index, test_index = next(split.split(df, df['income_category']))

# split() yields positional indices -> select by position (.iloc). The helper
# column is removed from the two subsets only (df itself keeps it); drop()
# returns new frames, avoiding in-place edits on slices of df.
strat_train_set = df.iloc[train_index].drop(columns=['income_category'])
strat_test_set = df.iloc[test_index].drop(columns=['income_category'])

# plotting a scatter graph longitude vs latitude
# the more dense places are more populated
df.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)
plt.show()

# getting correlation matrix for df
# Restrict to numeric columns explicitly: recent pandas no longer drops
# non-numeric columns silently in corr() and raises on them instead.
# NOTE(review): this is a no-op if df has only numeric columns.
corr_mat = df.select_dtypes(include='number').corr()
# print(corr_mat)
def train(self, datasets):
    ''' Train the pipeline on the train set and predict the test set.

    Steps, in order:
      1. Impute missing values in train and test x.
      2. Build the pipeline with ``self.get_pipeline()``.
      3. Fit it, either through a randomized hyperparameter search
         (when ``self.model_args['grid_search']`` is truthy) or directly.
      4. Predict probabilities for the imputed test set.
      5. Replace train/test x by the output of every pipeline step except
         the last one, restoring column names and row indices.
      6. Compute SHAP values for the 'XGB' step on the transformed test x.

    Side effects on the instance: sets ``self.pipeline``; appends to
    ``self.learn_size`` and ``self.coefs``; appends to
    ``self.trained_classifiers`` only when the search is used, and to
    ``self.n_best_features`` only when the pipeline has a
    'feature_selection' step.

    Input:
        datasets:: dict
            Contains train and test x, y under the keys
            'train_x', 'test_x', 'train_y' and 'test_y'.

    Output:
        clf::
            The fitted pipeline (best estimator of the search, or
            ``self.pipeline`` itself).
        datasets:: dict
            Dictionary containing the UPDATED (transformed) train and
            test x together with the unchanged train and test y.
        test_y_hat::
            Result of ``clf.predict_proba`` on the imputed test x.
        shap_values::
            Result of ``shap.TreeExplainer(clf['XGB']).shap_values`` on
            the transformed test x.
        test_x::
            The transformed test x (same object as datasets['test_x']).
    '''
    train_x = datasets['train_x']
    test_x = datasets['test_x']
    train_y = datasets['train_y']
    test_y = datasets['test_y']

    # Keep track of the set sizes seen in this call.
    self.learn_size += [{
        'tr_x': train_x.shape,
        'tr_y': train_y.shape,
        'te_x': test_x.shape,
        'te_y': test_y.shape
    }]

    train_x = self.impute_missing_values(train_x)
    test_x = self.impute_missing_values(test_x)

    # Define pipeline
    self.pipeline = self.get_pipeline()

    # Model and feature selection
    # TODO ideally also the feature selection would take place within a CV pipeline
    if self.model_args['grid_search']:
        # Randomized search over self.grid, scored by ROC AUC.
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2,
                                    random_state=self.random_state)
        grid = RandomizedSearchCV(self.pipeline,
                                  param_distributions=self.grid,
                                  cv=cv, scoring='roc_auc',
                                  n_jobs=-2, n_iter=50)
        grid.fit(train_x, train_y)
        clf = grid.best_estimator_
        self.trained_classifiers += [clf]
    else:
        # Train classifier without optimization.
        clf = self.pipeline
        clf.fit(train_x, train_y)

    # NOTE(review): assumed to run for both branches above (the original
    # indentation was not recoverable) -- confirm.
    self.coefs.append(clf['XGB'].feature_importances_)
    test_y_hat = clf.predict_proba(test_x)  # Predict

    if 'feature_selection' in clf.named_steps:
        selector = clf['feature_selection']
        # Features ordered by ascending p-value; keep the first n_features.
        idx_sorted = np.argsort(selector.pvalues_)
        f_values = selector.scores_
        p_values = selector.pvalues_
        columns = train_x.columns[
            idx_sorted[0:self.model_args['n_features']]].to_list()
        self.n_best_features += [[columns, f_values, p_values]]
        print(columns)
    else:
        columns = train_x.columns

    idx_train = train_x.index
    idx_test = test_x.index

    # Column names for the transformed frames.  They must be computed
    # BEFORE train_x is overwritten below, because the '<col>_nan' names
    # come from the columns of the untransformed train_x that contain NaN.
    if self.model_args['add_missing_indicator']:
        # `columns` is a plain list after feature selection and a pandas
        # Index otherwise; list() handles both, whereas calling
        # `.to_list()` on the list raised AttributeError.
        new_columns = list(columns) + [
            '{}_nan'.format(c)
            for c in train_x.loc[:, train_x.isna().any()]
        ]
    else:
        new_columns = columns

    # Run the data through every step except the final estimator.
    train_x = pd.DataFrame(clf[:-1].transform(train_x))
    test_x = pd.DataFrame(clf[:-1].transform(test_x))

    train_x.columns = new_columns
    test_x.columns = new_columns

    train_x.index = idx_train
    test_x.index = idx_test

    datasets = {
        "train_x": train_x,
        "test_x": test_x,
        "train_y": train_y,
        "test_y": test_y
    }

    explainer = shap.TreeExplainer(clf['XGB'])
    shap_values = explainer.shap_values(test_x)

    return clf, datasets, test_y_hat, shap_values, test_x
features_list_selected.append(features_list[index + 1]) features_list = features_list_selected data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) #### Using GridSearchCV with a Stratified Shuffle Split to find best parameters from sklearn.model_selection import GridSearchCV parameters = { 'criterion': ('gini', 'entropy'), 'max_depth': [1, 2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3, 4, 5], 'min_samples_split': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60] } tree = DecisionTreeClassifier() sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42) clf = GridSearchCV(tree, parameters, cv=sss) clf.fit(features, labels) print clf.best_params_ best_params = clf.best_params_ precision_list = [] recall_list = [] for count_fit in range(1, 100): features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42, stratify = labels) clf = DecisionTreeClassifier( min_samples_split=best_params['min_samples_split'], criterion=best_params['criterion'], max_depth=best_params['max_depth'],