def test_stratifiedshufflesplit_list_input():
    """Check that list-style targets split exactly like an ndarray target.

    The same splitter is run on a list of string labels, a list of floats
    and the equivalent ndarray; all three must yield identical splits.
    """
    splitter = StratifiedShuffleSplit(test_size=2, random_state=42)
    X = np.ones(7)
    y_array = np.hstack((np.ones(4), np.zeros(3)))
    y_strings = ['1'] * 4 + ['0'] * 3
    y_floats = y_array.tolist()

    # The ndarray target is the reference each list-style target is
    # compared against.
    for y_listlike in (y_strings, y_floats):
        np.testing.assert_equal(list(splitter.split(X, y_listlike)),
                                list(splitter.split(X, y_array)))
def validate(self):
    '''
    Evaluate the model on ten stratified shuffle splits.

    For every split a fresh model is built with ``self.create_model()``,
    trained on the training part of ``self.data`` / ``self.labels`` and
    scored on the held-out part. The mean accuracy, precision, recall and
    F1 over all splits are printed; nothing is returned.
    '''
    print('Validating new model: {}()'.format(self.__class__.__name__))

    # One list of per-split values for each metric.
    history = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    metric_fns = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    splitter = StratifiedShuffleSplit(n_splits=10)
    for fit_idx, eval_idx in splitter.split(self.data, self.labels):
        x_fit, x_eval = self.data[fit_idx], self.data[eval_idx]
        y_fit, y_eval = self.labels[fit_idx], self.labels[eval_idx]

        model = self.create_model()
        model.fit(x_fit, y_fit, epochs=100, batch_size=128,
                  class_weight=self.class_weight)
        predicted = model.predict_classes(x_eval, batch_size=128)

        for key, metric in metric_fns:
            history[key].append(metric(y_eval, predicted))

    print('')
    print('Accuracy: {}'.format(np.mean(history['accuracy'])))
    print('Precision: {}'.format(np.mean(history['precision'])))
    print('Recall: {}'.format(np.mean(history['recall'])))
    print('F1-measure: {}'.format(np.mean(history['f1'])))
def fit_model(self, X, y):
    """
    Fit an L1 logistic-regression pipeline on 400 stratified shuffle splits.

    X::pd.DataFrame: Input data
    y::np.ndarray: response for input data

    Rows of X and y are shuffled together first. For every split the
    coefficients and the ROC AUC of the hard predictions are stored in
    ``self.res``, which is finally passed to ``self.save_pickle``; the
    value returned by ``save_pickle`` is returned.
    """
    # Glue the response on as the last column so one shuffle keeps rows
    # and labels aligned.
    stacked = np.hstack((X.values, y[:, None]))
    np.random.shuffle(stacked)
    X, y = stacked[:, :-1], stacked[:, -1]

    outer_cv = StratifiedShuffleSplit(n_splits=400)
    inner_cv = StratifiedKFold(n_splits=5)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lg', linear_model.LogisticRegressionCV(
            penalty='l1', solver='liblinear', cv=inner_cv)),
    ])

    coefs, aucs = [], []
    self.res = {'coef': coefs, 'auc': aucs, 'model': 0}
    for split_no, (train, test) in enumerate(outer_cv.split(X, y)):
        pipeline.fit(X[train], y[train])
        predicted = pipeline.predict(X[test])
        coefs.append((split_no, pipeline.named_steps['lg'].coef_[0]))
        aucs.append((split_no, roc_auc_score(y[test], predicted)))

    # The stored model is the pipeline as fitted on the last split.
    self.res['model'] = pipeline
    return self.save_pickle(self.res, self.out)
def outer_cv_loop(Xdata,Ydata,clf,parameters=[],
                  n_splits=10,test_size=0.25):
    """Score ``clf`` on stratified shuffle splits with imputation and resampling.

    For each split the train and test features are imputed separately with
    SoftImpute. If the training labels' mean is more than 0.2 away from 0.5,
    the training set is resampled with SMOTETomek. ``clf`` is then fitted
    and its predictions scored with ROC AUC.

    ``parameters`` is accepted but not used by this function.

    Returns
    -------
    (rocscores, importances)
        ``rocscores`` has one entry per split (``nan`` when the test labels
        or the predictions have zero variance); ``importances`` holds
        ``clf.feature_importances_`` for every split that was fitted.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    scores = []
    importances = []
    for train_idx, test_idx in splitter.split(Xdata, Ydata):
        y_test = Ydata[test_idx]
        if numpy.var(y_test) == 0:
            # NOTE(review): `varname` is not defined in this function;
            # presumably a module-level global - confirm.
            print('zero variance',varname)
            scores.append(numpy.nan)
            continue

        y_fit = Ydata[train_idx]
        x_fit = fancyimpute.SoftImpute(verbose=False).complete(Xdata[train_idx, :])
        x_test = fancyimpute.SoftImpute(verbose=False).complete(Xdata[test_idx, :])

        if numpy.abs(numpy.mean(y_fit) - 0.5) > 0.2:
            resampler = SMOTETomek()
            x_fit, y_fit = resampler.fit_sample(x_fit.copy(), Ydata[train_idx])

        clf.fit(x_fit, y_fit)
        predicted = clf.predict(x_test)

        # filter out bad folds: constant predictions give no usable AUC
        if numpy.var(predicted) > 0:
            fold_score = roc_auc_score(y_test, predicted)
        else:
            fold_score = numpy.nan
        scores.append(fold_score)
        importances.append(clf.feature_importances_)
    return scores, importances
def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Build a small synthetic classification dataset split into train and test.

    Parameters
    ----------
    n_samples : int
        Number of samples in the dataset.
    n_features : int
        Number of features for each sample.
    random_state : int
        Seed used both for data generation and for the split, so results
        are reproducible.

    Returns
    -------
    tuple
        Four arrays, in this order: input train, input test, target train,
        target test.
    """
    X, y = datasets.make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state,
    )

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        train_size=0.6,
        random_state=random_state,
    )
    # Only one split is configured, so take the first (and only) one.
    fit_idx, holdout_idx = next(splitter.split(X, y))

    return X[fit_idx], X[holdout_idx], y[fit_idx], y[holdout_idx]
def split(data, test_size):
    """Split ``data`` into one stratified train/test partition.

    Parameters
    ----------
    data : object with ``data`` and ``target`` attributes
        Both attributes are converted to NumPy arrays.
    test_size : float or int
        Passed straight to ``StratifiedShuffleSplit``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    X, y = np.array(data.data), np.array(data.target)
    # The splitter that exposes ``.split(X, y)`` names its iteration count
    # ``n_splits``; the former ``n_iter`` keyword raises TypeError there.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train, test = next(splitter.split(X, y))
    return X[train], y[train], X[test], y[test]
def fit(self, X, y, X_test=None, y_test=None):
    """Train ``self.model`` on ``X`` / ``y`` and record the training history.

    When both ``X_test`` and ``y_test`` are given, a ``TestLossHistory``
    callback is attached and kept on ``self.test_loss``. With
    ``self.early_stop`` set, 10% of the data is held out (stratified, fixed
    seed) as validation data and an ``EarlyStopping`` callback on
    ``val_loss`` is added; otherwise the model is trained on all data.

    Returns ``self``.
    """
    super(MLP, self).fit(X, y)

    callbacks = []
    if X_test is not None and y_test is not None:
        self.test_loss = TestLossHistory(X_test, y_test)
        callbacks.append(self.test_loss)

    if self.n_class == 1 and self.n_label > 2:
        # NOTE(review): the unrolled targets are not used below - confirm
        # whether training was meant to use them.
        yr = unroll(y)

    if not self.early_stop:
        history = self.model.fit(X, y, nb_epoch=self.max_epoch,
                                 verbose=self.verbose, callbacks=callbacks)
    else:
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1,
                                          random_state=0)
        fit_idx, val_idx = next(iter(splitter.split(X, y)))
        x_train, y_train = X[fit_idx], y[fit_idx]
        x_val, y_val = X[val_idx], y[val_idx]

        callbacks.append(EarlyStopping(monitor="val_loss",
                                       patience=self.patience,
                                       verbose=self.verbose))
        history = self.model.fit(
            x_train,
            y_train,
            nb_epoch=self.max_epoch,
            verbose=self.verbose,
            callbacks=callbacks,
            validation_data=(x_val, y_val),
        )

    self.history = history.history
    return self
def _get_validation_split(self):
    """Build train/validation row indices for a multi-label CSV.

    The CSV at ``self.train_csv_file`` is read, the space-separated ``tags``
    column is turned into a binary indicator matrix, and for every label two
    stratified shuffle splits are drawn. Rows are accumulated into the train
    and validation index lists, skipping rows already placed in either list.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Train indices and validation indices.
    """
    train = pd.read_csv(self.train_csv_file)

    def flatten(nested):
        return [item for sublist in nested for item in sublist]

    # mapping labels to integer classes
    tag_lists = [tags.split(' ') for tags in train['tags'].values]
    labels = list(set(flatten(tag_lists)))
    label_map = dict((label, pos) for pos, label in enumerate(labels))

    # One indicator row per CSV row; each CSV row must unpack into exactly
    # two fields, the second being the tag string.
    indicator_rows = []
    for _, tags in train.values:
        indicator = np.zeros(len(label_map))
        for tag in tags.split(' '):
            indicator[label_map[tag]] = 1
        indicator_rows.append(indicator)
    y_train = np.array(indicator_rows, np.uint8)

    trn_index = []
    val_index = []
    index = np.arange(len(train))
    for class_no in range(len(label_map)):
        splitter = StratifiedShuffleSplit(n_splits=2,
                                          test_size=self.validation_split,
                                          random_state=class_no)
        for train_pos, test_pos in splitter.split(index, y_train[:, class_no]):
            fold_train, fold_val = index[train_pos], index[test_pos]
            # to ensure there is no repetition within each split and
            # between the splits
            trn_index = trn_index + list(
                set(fold_train) - set(trn_index) - set(val_index))
            val_index = val_index + list(
                set(fold_val) - set(val_index) - set(trn_index))
    return np.array(trn_index), np.array(val_index)
def main():
    """Create one stratified train/test split of per-class UUID lists.

    Reads one CSV per class (the UUID is taken from column 1 of each row),
    draws a single stratified shuffle split over all UUIDs, prints the
    per-class counts for both sides and writes three ``uuid,label`` CSV
    files: ``<output_base>.all_uuids.csv``, ``.train_uuids.csv`` and
    ``.test_uuids.csv``.
    """
    args = cli_parser().parse_args()
    test_percent = args.test_percent
    rand_state = args.rand_state
    output_base = args.output_base
    cls_to_filepath = args.cls_to_cmdProcessedCsv

    def read_uuids(filepath):
        # Sorted, de-duplicated UUIDs; the file is closed deterministically.
        with open(filepath) as f:
            return sorted({row[1] for row in csv.reader(f)})

    def report(title, labels):
        # Print, per class, how many examples landed in this side.
        print(title)
        label_list = labels.tolist()
        for cls_label in cls_list:
            cnt = label_list.count(cls_label)
            print("- %s:\t%d\t(~%.2f %% of total class examples)"
                  % (cls_label, cnt,
                     float(cnt) / len(cls_uuids[cls_label]) * 100))

    def write_pairs(path, uuids, labels):
        # One ``uuid,label`` row per example.
        with open(path, 'w') as f:
            w = csv.writer(f)
            for uuid, label in zip(uuids, labels):
                w.writerow([uuid, label])

    # Parse CSV files associated to classes
    cls_uuids = {}
    for cls, filepath in six.iteritems(cls_to_filepath):
        cls_uuids[cls] = read_uuids(filepath)

    cls_list = sorted(cls_uuids)
    all_label, all_uuids = \
        zip(*[(cls_name, uuid)
              for cls_name in cls_list
              for uuid in cls_uuids[cls_name]])
    # Transform into numpy array for multi-index access later
    all_label = numpy.array(all_label)
    all_uuids = numpy.array(all_uuids)

    # ``n_splits=1`` -- Only make one train/test split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_percent,
                                 random_state=rand_state)

    # Index positions into ``all_uuids`` for the train and test sets.
    # The builtin ``next`` works on Python 2 and 3, unlike ``.next()``.
    train_index, test_index = \
        next(iter(sss.split(numpy.zeros(len(all_label)), all_label)))
    uuids_train, uuids_test = all_uuids[train_index], all_uuids[test_index]
    label_train, label_test = all_label[train_index], all_label[test_index]

    report("Train:", label_train)
    report("Test:", label_test)

    # Save out files for use with ``classifier_model_validation``
    write_pairs('%s.all_uuids.csv' % output_base, all_uuids, all_label)
    write_pairs('%s.train_uuids.csv' % output_base, uuids_train, label_train)
    write_pairs('%s.test_uuids.csv' % output_base, uuids_test, label_test)
def robust_coef(self,xwl2,hm_y,n_iter=100):
    """Average ``self.clf2``'s linear coefficients over shuffled refits.

    ``self.clf2`` is refitted on the training part of ``n_iter`` stratified
    shuffle splits (20% held out, fixed seed). Its ``coef_`` and
    ``intercept_`` are then overwritten with the mean across all fits.
    Nothing is returned.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=.2,
                                      random_state=1)
    coef_runs = []
    intercept_runs = []
    for train, _ in splitter.split(xwl2, hm_y):
        self.clf2.fit(xwl2[train, :], hm_y[train])
        coef_runs.append(self.clf2.coef_)
        intercept_runs.append(self.clf2.intercept_)

    mean_coef = np.stack(coef_runs).mean(0)
    mean_intercept = np.stack(intercept_runs).mean(0)
    self.clf2.coef_ = mean_coef
    self.clf2.intercept_ = mean_intercept
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """Load the Titanic CSV and return a stratified train/test split.

    Parameters
    ----------
    test_size : float
        Test fraction handed to ``StratifiedShuffleSplit``.
    feature_skip_tuple : tuple of str
        Feature names to leave out of the returned data.
    random_state : int
        NOTE(review): accepted but not used; the split seed is fixed at 12.
        Left unchanged so existing results stay reproducible.

    Returns
    -------
    tuple
        ``(keys, train_data, test_data, train_labels, test_labels)`` where
        the data arrays hold the numeric columns followed by the
        one-hot-encoded string columns.
    """
    with open(os.path.join('datasets', 'titanic', 'titanic3.csv')) as f:
        f.readline()  # skip the header row
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    # Parse every line once and reuse the result for labels and features.
    line_dicts = [process_titanic_line(l) for l in lines]
    n_samples = len(line_dicts)

    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    for n, line_dict in enumerate(line_dicts):
        numeric_labels[n] = line_dict["survived"]

    # ``n_splits`` is the keyword of the splitter that exposes ``.split``;
    # ``n_iter`` raises TypeError there.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=12)
    train_idx, test_idx = next(sss.split(numeric_data, numeric_labels))

    string_rows = []
    for n, line_dict in enumerate(line_dicts):
        string_rows.append({k: line_dict[k] for k in string_keys})
        numeric_data[n] = np.asarray([line_dict[k] for k in numeric_keys])

    # The split indices come back shuffled. Select the categorical rows in
    # that same order so they line up row-for-row with the numeric rows and
    # the labels selected below.
    train_vectorizer_list = [string_rows[i] for i in train_idx]
    test_vectorizer_list = [string_rows[i] for i in test_idx]

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
def shuffled_split(housing):
    """Return a stratified 80/20 train/test split of ``housing``.

    ``add_income_category(housing)`` is called first; the split is
    stratified on the ``income_cat`` column of ``housing``, and that column
    is dropped from both returned frames.

    Returns
    -------
    (strat_train_set, strat_test_set)
    """
    add_income_category(housing)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(housing, housing["income_cat"]))

    # split() yields positional row numbers, so rows are picked with .iloc;
    # .loc would look them up as index labels and break on any frame that
    # does not carry a default 0..n-1 index.
    strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
    strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)
    return strat_train_set, strat_test_set
def test_stratified_shuffle_split_overlap_train_test_bug():
    """Train and test indices of a split must never overlap.

    See https://github.com/scikit-learn/scikit-learn/issues/6121 for the
    original bug report.
    """
    y = [0, 1, 2, 3] * 3 + [4, 5] * 5
    X = np.ones_like(y)

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5,
                                      random_state=0)
    train, test = next(iter(splitter.split(X=X, y=y)))

    shared = np.intersect1d(train, test)
    assert_array_equal(shared, [])
def _split_data(X, y, p_train=0.5, seed=None):
    """
    Split data and labels into a stratified train part and test part.

    X holds the data, y the labels; ``p_train`` is passed as ``train_size``
    and ``seed`` as ``random_state`` of the splitter.

    Returns ``((X_train, y_train), (X_test, y_test))``.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=None,
                                      train_size=p_train, random_state=seed)
    fit_idx, holdout_idx = next(iter(splitter.split(X, y)))

    train_part = (X[fit_idx], y[fit_idx])
    test_part = (X[holdout_idx], y[holdout_idx])
    return train_part, test_part
def test_stratified_shuffle_split_even():
    """Check that StratifiedShuffleSplit draws every index with equal chance.

    For two sample sizes, 1000 splits are drawn and the number of times each
    index lands in train and in test is compared against a binomial
    distribution.
    """
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial
        threshold = 0.05 / n_iter
        bf = stats.binom(n_iter, p)
        for count in idx_counts:
            prob = bf.pmf(count)
            assert_true(prob > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        # The iteration count keyword is ``n_splits``; ``n_iter`` raises
        # TypeError on the splitter that exposes ``.split``.
        splits = StratifiedShuffleSplit(n_splits=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits_actual = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits_actual += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for idx in ids:
                    counter[idx] += 1
        assert_equal(n_splits_actual, n_iter)

        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  test_size=1./n_folds,
                                                  train_size=1.-(1./n_folds))

        # ``train`` / ``test`` still hold the last split drawn above.
        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Evaluate ``clf`` with stratified shuffle splits and print its metrics.

    The dataset is converted with ``featureFormat`` / ``targetFeatureSplit``,
    ``clf`` is refitted on every split, and the confusion-matrix counts are
    pooled over all splits. Accuracy, precision, recall, F1 and F2 are then
    printed. If a metric's denominator is zero, a diagnostic message is
    printed instead. Nothing is returned.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop counting this split only; later splits still run.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        # Only the division failure the message describes is handled here;
        # any other error is a real bug and is allowed to propagate.
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
def gen_sample_array(self):
    """Return sample indices from one stratified 50/50 shuffle split.

    The split is stratified on ``self.class_vector``; the returned 1-D array
    is the train indices followed by the test indices of the first split.

    Raises
    ------
    ImportError
        If scikit-learn is not installed (a hint is printed first).
    """
    try:
        from sklearn.model_selection import StratifiedShuffleSplit
    except ImportError:
        # Fail here with the real cause instead of continuing into a
        # NameError on the missing class.
        print('Need scikit-learn for this functionality')
        raise
    import numpy as np

    s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
    # Only the number of rows and the labels drive the split; X is a
    # placeholder with one row per sample.
    X = th.randn(self.class_vector.size(0),2).numpy()
    y = self.class_vector.numpy()

    train_index, test_index = next(s.split(X, y))
    return np.hstack([train_index, test_index])
def main_cv_loop(Xdata,Ydata,clf,parameters,
                 n_folds=4,oversample_thresh=0.1,verbose=False):
    """Run the outer cross-validation loop and score pooled predictions.

    ``n_folds`` stratified shuffle splits (20% test) are drawn. In each
    split the training data is resampled with SMOTETomek when the label
    mean is more than 0.2 away from 0.5, the best estimator is selected by
    ``inner_cv_loop``, and its predictions are written into the test
    positions of the output vectors.

    Parameters
    ----------
    Xdata, Ydata : array-like
        Features (2-D) and labels (1-D); indexed with integer arrays.
    clf, parameters
        Passed through to ``inner_cv_loop``.
    n_folds : int
        Number of shuffle splits.
    oversample_thresh : float
        NOTE(review): currently unused; the in-loop threshold is fixed at
        0.2. Kept for interface compatibility.
    verbose : bool
        Print progress messages from this function.

    Returns
    -------
    tuple
        ``(roc_auc, Ydata, pred, pred_proba)``. Positions that never fall
        into a test split keep their initial value 0.
    """
    kf = StratifiedShuffleSplit(n_splits=n_folds, test_size=0.2)

    # variables to store outputs
    pred = numpy.zeros(len(Ydata))        # predicted labels
    pred_proba = numpy.zeros(len(Ydata))  # predicted probabilities

    for train, test in kf.split(Xdata, Ydata):
        Xtrain = Xdata[train, :]
        Xtest = Xdata[test, :]
        Ytrain = Ydata[train]
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_, bestroc, fa = inner_cv_loop(Xtrain, Ytrain, clf,
                                                     parameters, verbose=True)
        if fa is not None:
            if verbose:
                print('transforming using fa')
                print(fa)
            Xtest = fa.transform(Xtest)

        # predict_proba returns one column per class. Keep the last column
        # (the positive class for binary labels) so exactly one value is
        # stored per test sample.
        proba = numpy.asarray(best_estimator_.predict_proba(Xtest))
        pred_proba[test] = proba[:, -1] if proba.ndim == 2 else proba
        pred[test] = best_estimator_.predict(Xtest)

    return roc_auc_score(Ydata,pred,average='weighted'),Ydata,pred,pred_proba
def start_to_fit(X, y):
    """Compare ten classifiers by mean accuracy over stratified shuffle splits.

    Every classifier is refitted on each of the splits (70% train / 30%
    test, fixed seed); the per-classifier accuracy is averaged over the
    splits and the resulting table is printed. Nothing is returned.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression()]

    res_cols = ['Classifier','Accuracy']
    n_splits = 10
    data_set = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.3,
                                      train_size=0.7, random_state=0)

    # Sum of accuracies per classifier name, averaged afterwards.
    accuracy_dic = {}
    for train_index, test_index in data_set.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for clf in classifiers:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, clf.predict(X_test))
            accuracy_dic[name] = accuracy_dic.get(name, 0) + accuracy

    entries = []
    for name in accuracy_dic:
        # Divide by the actual number of splits, not a separate constant.
        accuracy_dic[name] = accuracy_dic[name] / float(n_splits)
        entries.append(pd.DataFrame([[name, accuracy_dic[name]]],
                                    columns=res_cols))

    # DataFrame.append was removed in pandas 2; concat builds the same table.
    res = pd.concat(entries) if entries else pd.DataFrame(columns=res_cols)
    print(res)
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """Split ``raw_data`` 80/20, transform both parts and separate the target.

    The split is stratified on the ``isGood`` column with a fixed seed. Both
    parts are passed through ``my_transform(part, label, degree)``; the last
    column of each transformed array is taken as the target and the rest as
    features. ``p`` is not used by this function.

    Returns
    -------
    tuple
        ``(train_set, train_y, test_set, test_y)``.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                      random_state=142)
    # A single split is configured, so take it directly.
    fit_rows, holdout_rows = next(
        splitter.split(raw_data, raw_data["isGood"]))

    transformed_train = my_transform(raw_data.iloc[fit_rows], label, degree)
    transformed_test = my_transform(raw_data.iloc[holdout_rows], label, degree)

    train_set, train_y = transformed_train[:, :-1], transformed_train[:, -1]
    test_set, test_y = transformed_test[:, :-1], transformed_test[:, -1]
    return (train_set, train_y, test_set, test_y)
def splitTrainTest(inputDF,random_state):
    """Return a train/test split of ``inputDF`` stratified by income category.

    An ``income_category`` column (``ceil(median_income / 1.5)``, with every
    value not below 5 set to 5.0) is added to ``inputDF`` and stays on it.
    A single 80/20 stratified shuffle split with fixed seed 19 is drawn, and
    the helper column is removed from both returned frames.

    Parameters
    ----------
    inputDF : pandas.DataFrame
        Must contain a ``median_income`` column; modified in place.
    random_state
        NOTE(review): not used by the stratified split, whose seed is fixed.
        Kept for interface compatibility.

    Returns
    -------
    (stratifiedTrainSet, stratifiedTestSet)
    """
    # Assign the capped column in one step rather than through an in-place
    # ``where`` on a column selection, which is not guaranteed to write
    # back into the frame.
    income_category = np.ceil(inputDF["median_income"] / 1.5)
    inputDF["income_category"] = income_category.where(income_category < 5.0, 5.0)

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=19)
    trainIndices, testIndices = next(
        split.split(inputDF, inputDF["income_category"]))

    print('\ninputDF["income_category"].value_counts() / len(inputDF)')
    print( inputDF["income_category"].value_counts() / len(inputDF) )

    # split() yields positional row numbers, hence .iloc rather than .loc.
    stratifiedTrainSet = inputDF.iloc[trainIndices].drop(["income_category"], axis=1)
    stratifiedTestSet = inputDF.iloc[testIndices].drop(["income_category"], axis=1)

    return( stratifiedTrainSet , stratifiedTestSet )
def suffle_hm(self,x,y,gamma=0.5,n_iter=50):
    """Estimate per-sample hit rates of ``self.clf1`` over shuffled splits.

    ``self.clf1`` is fitted on the training part of ``n_iter`` stratified
    shuffle splits (25% test, fixed seed). For every sample, the share of
    its test appearances in which it was predicted correctly is computed.
    Finally ``self.clf1`` is refitted on all data.

    Returns
    -------
    (np.ndarray, np.ndarray)
        An int mask that is 1 where the hit rate is at least ``gamma``, and
        the hit rates themselves.
    """
    appearances = np.zeros_like(y).astype(float)
    hits = np.zeros_like(y).astype(float)

    splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=.25,
                                      random_state=1)
    for train, test in splitter.split(x, y):
        self.clf1.fit(x[train, :], y[train])
        correct = self.clf1.predict(x[test, :]) == y[test]
        appearances[test] += 1.
        hits[test] += correct.astype(float)

    # Samples that never appeared in a test split give 0/0 here.
    proba = hits / appearances
    if self.verbose:
        print(appearances)
        print(proba)

    self.clf1.fit(x, y)
    mask = (proba >= gamma).astype(int)
    return mask, proba
def splitTrainTest(inputDF,random_state):
    """Split ``inputDF`` 80/20, stratified by income when possible.

    If ``sklearn.model_selection`` cannot be located, a plain
    ``train_test_split`` seeded with ``random_state`` is used. Otherwise an
    ``income_category`` column (``ceil(median_income / 1.5)``, with every
    value not below 5 set to 5.0) is added to ``inputDF`` and a single
    stratified shuffle split with fixed seed 19 is drawn. The helper column
    is removed from the returned frames but stays on ``inputDF``.

    Returns
    -------
    (trainSet, testSet)
    """
    ms_spec = importlib.util.find_spec(name="sklearn.model_selection")
    if ms_spec is None:
        trainSet, testSet = train_test_split(inputDF, test_size=0.2, random_state=random_state)
    else:
        # Assign the capped column in one step rather than through an
        # in-place ``where`` on a column selection, which is not guaranteed
        # to write back into the frame.
        income_category = np.ceil(inputDF["median_income"] / 1.5)
        inputDF["income_category"] = income_category.where(income_category < 5.0, 5.0)

        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=19)
        trainIndices, testIndices = next(
            split.split(inputDF, inputDF["income_category"]))

        print('\nincome category relative sizes (whole data set)')
        print( inputDF["income_category"].value_counts() / len(inputDF) )

        # split() yields positional row numbers, hence .iloc rather than
        # .loc.
        trainSet = inputDF.iloc[trainIndices].drop(["income_category"], axis=1)
        testSet = inputDF.iloc[testIndices].drop(["income_category"], axis=1)

    return( trainSet , testSet )
def fit_model(self, X, y):
    """
    Fit a feature-selecting ExtraTrees pipeline on 400 stratified shuffle splits.

    X::pd.DataFrame: Input data
    y::np.ndarray: response for input data

    For every split the feature mask, the feature importances and the ROC
    AUC of the hard predictions are stored in ``self.res``, which is finally
    passed to ``self.save_pickle``; the value returned by ``save_pickle`` is
    returned.
    """
    # The splits yield positional row numbers. A DataFrame indexed as
    # ``X[rows]`` would select columns, so unwrap it to its array; plain
    # arrays pass through unchanged.
    X = getattr(X, 'values', X)

    cv_out = StratifiedShuffleSplit(n_splits=400)
    clf = Pipeline([('scaler', StandardScaler()),
                    ('fs', CustFsNoiseWinnow()),
                    ('et', ExtraTreesClassifier(n_estimators=2000))])
    self.res = {'mask':[], 'fimp':[], 'auc':[], 'model':0}
    for idx, (train, test) in enumerate(cv_out.split(X, y)):
        clf.fit(X[train], y[train])
        prediction = clf.predict(X[test])
        self.res['mask'].append((idx, clf.named_steps['fs'].mask_))
        self.res['fimp'].append((idx, clf.named_steps['et'].feature_importances_))
        self.res['auc'].append((idx, roc_auc_score(y[test], prediction)))
    # The stored model is the pipeline as fitted on the last split.
    self.res['model'] = clf
    output_saved = self.save_pickle(self.res, self.out)
    return output_saved
'covariance_estimator', 'min_region_size_in_mm3'] results = dict() for column_name in columns: results.setdefault(column_name, []) print(results) ############################################################################## # Run the analysis now # -------------------- import pandas as pd dimensions = [40, 60, 80, 100, 120, 150, 200, 300] folder_name = name + str(n_iter) + '_kmeans_list_dim_graphlasso' for model in ['kmeans']: for dim in dimensions: iter_for_prediction = cv.split(func_imgs, classes) for index, (train_index, test_index) in enumerate(iter_for_prediction): all_results = draw_predictions( imgs=func_imgs, labels=labels, groups=classes, index=index, dimensionality=dim, train_index=train_index, test_index=test_index, scoring='roc_auc', models=model, atlases=None, masker=masker, connectomes=connectomes, confounds=motion_confounds, confounds_mask_img=gm_mask, connectome_regress_confounds=connectome_regress_confounds) print(index) # Dump the results for model_ in all_results.models_: save_path = os.path.join(folder_name, model_, str(dim), str(index))
# **Exercise for part 3**: Use the code below to test what value of # `n_neighbors` works best for the given data. *Note: do NOT change the metric # to be anything other than `'euclidean'`. Other distance functions are not # optimized for the amount of data we are working with.* # # **Question for part 3**: What is the accuracy of the best classifier you can # create for this data (by changing only the `n_neighbors` parameter)? #%% from sklearn.model_selection import StratifiedShuffleSplit from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score from IPython.html import widgets sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8) cv = sss.split(X=ds.data, y=ds.target) # fill in the training and testing data and save as separate variables for trainidx, testidx in cv: # note that these are sparse matrices X_train = ds.data[trainidx] X_test = ds.data[testidx] y_train = ds.target[trainidx] y_test = ds.target[testidx] # fill in your code here to train and test # calculate the accuracy and print it for various values of K clf = KNeighborsClassifier(weights='uniform', metric='euclidean') accuracies = [] for k in range(1, 10): clf.n_neighbors = k
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz" def load_housing_data(housing_path=HOUSING_PATH): csv_path = os.path.join(housing_path, "housing.csv") return pd.read_csv(csv_path) housing = load_housing_data() from sklearn.model_selection import StratifiedShuffleSplit housing["income_cat"] = np.ceil(housing["median_income"] / 1.5) housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) for train_index, test_index in split.split(housing, housing["income_cat"]): strat_train_set = housing.loc[train_index] strat_test_set = housing.loc[test_index] housing = strat_train_set.drop("median_house_value", axis=1) housing_labels = strat_train_set["median_house_value"].copy() housing_num = housing.drop("ocean_proximity", axis=1) rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6 class CombinedAttributesAdder(BaseEstimator, TransformerMixin): def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs self.add_bedrooms_per_room = add_bedrooms_per_room def fit(self, X, y=None): return self # nothing else to do
train_labels[list(label_mapping.keys())] = pd.DataFrame(
    train_labels['target_v'].values.tolist(), index=train_labels.index)
train_labels['Id'] = train_labels['Id'] + '_green.png'
class_count = train_labels['target_v'].sum()

# Split
from sklearn.model_selection import StratifiedShuffleSplit


def coalesce(arr):
    """Return the element of ``arr`` whose ``class_count`` entry is smallest."""
    return arr[np.argmin(np.array(class_count)[arr])]


# Reduce each row's targets to a single label to stratify on.
train_labels['y_coal'] = train_labels['Target'].apply(coalesce)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
# Only one split is configured, so take it directly.
train_index, val_index = next(
    sss.split(np.zeros(len(train_labels)), train_labels['y_coal']))

# Re-number both parts from 0. Assigning the result avoids an in-place
# change on frames derived from ``train_labels``.
df_train = train_labels.iloc[train_index].reset_index(drop=True)
df_val = train_labels.iloc[val_index].reset_index(drop=True)

# some gc collection
del train_labels
gc.collect()

##################
### Model
##################

# constants and utils
def read_data(filename): data = pd.read_csv(filename) y_array = data['Survived'].values X_df = data.drop(['Survived', 'PassengerId'], axis=1) return X_df, y_array if __name__ == '__main__': print("Reading file ...") X_df, y_array = read_data(train_filename) skf = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=57) print("Training file ...") scores = [] for train_is, test_is in skf.split(X_df, y_array): print('--------------------------') X_train_df = X_df.iloc[train_is] y_train_array = y_array[train_is] X_test_df = X_df.iloc[test_is] y_test_array = y_array[test_is] fe = feature_extractor.FeatureExtractor() fe.fit(X_train_df, y_train_array) X_train_array = fe.transform(X_train_df) X_test_array = fe.transform(X_test_df) clf = classifier.Classifier() clf.fit(X_train_array, y_train_array) y_proba = clf.predict_proba(X_test_array)
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
#print(f"rows in train set: {len(train_set)}\n rows in test set: {len(test_set)} ")


# In[9]:


print(len(train_set))


# In[10]:


print(len(test_set))


# In[11]:


from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so rows are picked with .iloc;
# .loc would look them up as index labels.
for train_index, test_index in split.split(housing, housing['CHAS']):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]


# In[12]:


strat_test_set['CHAS'].value_counts()


# In[13]:


# Continue with a copy of the stratified training set only.
housing = strat_train_set.copy()


# ## Next heading:
# ## correlation matrix
#
## Exercises
### 1. clustering
from sklearn.datasets import fetch_olivetti_faces

olivetti = fetch_olivetti_faces()
print(olivetti.DESCR)
olivetti.target

from sklearn.model_selection import StratifiedShuffleSplit

# First split: hold out 40 samples as the test set, stratified on
# ``olivetti.target``.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=40, random_state=42)
train_valid_idx, test_idx = next(strat_split.split(olivetti.data, olivetti.target))
X_train_valid, y_train_valid = (olivetti.data[train_valid_idx],
                                olivetti.target[train_valid_idx])
X_test, y_test = olivetti.data[test_idx], olivetti.target[test_idx]

# Second split: carve 80 validation samples out of the remaining data.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=80, random_state=43)
train_idx, valid_idx = next(strat_split.split(X_train_valid, y_train_valid))
X_train, y_train = X_train_valid[train_idx], y_train_valid[train_idx]
X_valid, y_valid = X_train_valid[valid_idx], y_train_valid[valid_idx]

print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)
'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat' ]) mushroom_targets = mushroom_data[['E/P']] #Complete a Stratified Shuffle Split to dataset in 80/20 ratio from sklearn.model_selection import StratifiedShuffleSplit sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=43) sss.get_n_splits(mushroom_data) for train_index, test_index in sss.split(mushroom_data, mushroom_targets): mushroom_train_set = mushroom_data.loc[train_index] mushroom_test_set = mushroom_data.loc[test_index] #Split the training and test datasets into inputs and targets dataframes mushroom_train_inputs = mushroom_train_set.drop(['E/P'], axis=1) mushroom_train_targets = mushroom_train_set[['E/P']] mushroom_test_inputs = mushroom_test_set.drop(['E/P'], axis=1) mushroom_test_targets = mushroom_test_set[['E/P']] #Encode the categorical input columns and print the total number of columns from each encoding train_inputs = mushroom_train_set.drop('E/P', axis=1) #One-Hot Encoding (scikit func) from sklearn.preprocessing import OneHotEncoder
def main():
    """Run the housing workflow end to end.

    Steps, in order: fetch and load the data, build a stratified train/test
    split on binned median income, draw exploratory plots, build the
    preprocessing pipeline, train linear / tree / random-forest regressors,
    grid-search the forest, and evaluate the best model on the test set.
    Results are printed; nothing is returned.
    """
    fetch_housing_data()  # Pull data
    housing_df = load_housing_data()
    #print(housing_df["ocean_proximity"].value_counts())
    #print(housing_df.describe())

    # ---- EXPLORE DATA ----
    #housing_df.hist(bins=50, figsize=(20, 15))  # Plot histogram of features
    #plt.show()
    housing_with_id = housing_df.reset_index()  # adds an `index` column
    # Create id using lat and lon
    housing_with_id["id"] = (housing_df["longitude"] * 1000 +
                             housing_df["latitude"])

    # Income bins, so that a stratified training/test split can be made.
    income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
    income_labels = [1, 2, 3, 4, 5]
    housing_with_id["income_cat"] = pd.cut(housing_with_id["median_income"],
                                           bins=income_bins,
                                           labels=income_labels)
    housing_df["income_cat"] = pd.cut(housing_df["median_income"],
                                      bins=income_bins,
                                      labels=income_labels)
    housing_df["income_cat"].hist()
    #plt.show()

    # Train and test set that is NOT stratified (kept for comparison).
    train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

    # Train and test set that IS stratified on the income category.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing_df,
                                               housing_df["income_cat"]):
        # The splitter yields positional indices, so select with .iloc
        # (.loc is only equivalent when the frame has a default RangeIndex).
        strat_train_set = housing_df.iloc[train_index]
        strat_test_set = housing_df.iloc[test_index]

    # Print distributions to determine how the test sets compare to overall data
    #print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
    #print(test_set["income_cat"].value_counts() / len(test_set))
    #print(housing_df["income_cat"].value_counts() / len(housing_df))
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    # ---- ADDITIONAL VISUALIZATION ----
    housing_copy = strat_train_set.copy()
    #housing_copy.plot(kind="scatter", x="longitude", y="latitude")
    #housing_copy.plot(kind="scatter", x="longitude", y="latitude", alpha=.1)  # alpha defines density
    housing_copy.plot(
        kind="scatter",
        x="longitude",
        y="latitude",
        alpha=0.4,
        s=housing_copy['population'] / 100,
        label='population',
        figsize=(10, 7),
        c='median_house_value',
        cmap=plt.get_cmap('jet'),
        colorbar=True,
    )
    #plt.legend()
    #plt.show()

    # Correlate numeric columns only: the frame still holds the string
    # column "ocean_proximity", which DataFrame.corr() rejects on pandas >= 2.
    corr_matrix = housing_copy.select_dtypes(include=[np.number]).corr()
    #print(corr_matrix)

    from pandas.plotting import scatter_matrix
    attributes = [
        'median_house_value', 'median_income', 'total_rooms',
        'housing_median_age'
    ]
    # plot matrix scatter plots for several attributes
    scatter_matrix(housing_copy[attributes], figsize=(12, 8))
    #plt.show()
    housing_copy.plot(kind="scatter",
                      x="median_income",
                      y="median_house_value",
                      alpha=0.1)
    #plt.show()

    # ---- CREATE ADDITIONAL ATTRIBUTES ----
    housing_copy['rooms_per_household'] = (housing_copy['total_rooms'] /
                                           housing_copy['households'])
    housing_copy['bedrooms_per_room'] = (housing_copy['total_bedrooms'] /
                                         housing_copy['total_rooms'])
    housing_copy['population_per_households'] = (housing_copy['population'] /
                                                 housing_copy['households'])
    corr_matrix = housing_copy.select_dtypes(include=[np.number]).corr()
    corr_matrix['median_house_value'].sort_values(ascending=False)
    #print(corr_matrix['median_house_value'].sort_values(ascending=False))

    # ---- DATA CLEANING ----
    housing_copy = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set['median_house_value'].copy()
    #housing_copy.dropna(subset=['total_bedrooms'])  # drops records that don't contain a value for total_bedrooms
    #housing_copy.drop('total_bedrooms', axis=1)  # drops entire attribute
    #median = housing_copy['total_bedrooms'].median()  # median total bedrooms
    #housing_copy['total_bedrooms'].fillna(median, inplace=True)  # populate total bedroom NAs with median

    # imputer can be used to track stats on all numerical fields
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="median")
    housing_num = housing_copy.drop("ocean_proximity",
                                    axis=1)  # remove non-numerical field
    imputer.fit(housing_num)  # estimate using fit()
    #print(imputer.statistics_)
    # transform housing_num using imputer median, filling in NAs
    X = imputer.transform(housing_num)
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)
    #print(housing_num['total_bedrooms'].count())
    #print(housing_tr['total_bedrooms'].count())
    housing_cat = housing_copy[['ocean_proximity']]
    housing_cat.head(10)

    # One method to create numerical representation. We do not want proximity between values
    #from sklearn.preprocessing import OrdinalEncoder
    #ordinal_encoder = OrdinalEncoder()
    #housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
    #housing_cat_encoded[:10]
    #print(ordinal_encoder.categories_)

    # Create a custom transformer to add extra attributes
    from sklearn.base import BaseEstimator, TransformerMixin
    # Column positions of the source attributes in the numeric array.
    rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

    class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        """Append ratio features computed from fixed column positions."""

        def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
            self.add_bedrooms_per_room = add_bedrooms_per_room

        def fit(self, X, y=None):
            return self  # nothing to learn

        def transform(self, X):
            rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
            population_per_household = (X[:, population_ix] /
                                        X[:, households_ix])
            if self.add_bedrooms_per_room:
                # Bedrooms relative to ROOMS (the original divided by
                # households, contradicting the feature's name).
                bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
                return np.c_[X, rooms_per_household,
                             population_per_household, bedrooms_per_room]
            return np.c_[X, rooms_per_household, population_per_household]

    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing_copy.values)
    housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing_copy.columns) +
        ["rooms_per_household", "population_per_household"],
        index=housing_copy.index)
    print(housing_extra_attribs.head())

    # ---- CREATE A PIPELINE FOR NUMERICAL AND CATEGORICAL ATTRIBUTES ----
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    housing_prepared = full_pipeline.fit_transform(housing_copy)
    print(housing_prepared.shape)

    # ---- LINEAR REGRESSION ----
    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    some_data = housing_copy.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("predictions", lin_reg.predict(some_data_prepared))
    print("Labels: ", list(some_labels))

    from sklearn.metrics import mean_squared_error
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # ---- DECISION TREE ----
    from sklearn.tree import DecisionTreeRegressor
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)  # train model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)

    # ---- CROSS VALIDATION ----
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    # The scorer returns negated MSE, hence the sign flip before the root.
    tree_rmse_scores = np.sqrt(-scores)

    def display_scores(scores):
        """Print the scores with their mean and standard deviation."""
        print("Scores:", scores)
        print("Mean: ", scores.mean())
        print("Standard Deviation: ", scores.std())

    #display_scores(tree_rmse_scores)
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    #display_scores(lin_rmse_scores)

    # ---- RANDOM FOREST ----
    from sklearn.ensemble import RandomForestRegressor
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    print("Training set: ")
    # forest_rmse is already a positive RMSE; the original negated it here.
    display_scores(forest_rmse)
    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    print("Validation set: ")
    display_scores(forest_rmse_scores)

    # ---- SAVE MODELS ----
    #import joblib
    #joblib.dump(my_model, "my_model.pkl")
    #my_model_loaded = joblib.load("my_model.pkl")

    # ---- GRID SEARCH ----
    # fiddles with hyperparameters for me
    from sklearn.model_selection import GridSearchCV
    param_grid = [
        {
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)
    print(grid_search.best_params_)  # Print best combination of parameters
    print(grid_search.best_estimator_)  # Print best estimator

    # Print evaluation scores
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Print feature importances
    feature_importances = grid_search.best_estimator_.feature_importances_
    #print(feature_importances)
    # Labels for the three columns appended by CombinedAttributesAdder.
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))

    # ---- EVALUATE ON TEST SET ----
    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    # transform only: the pipeline was fitted on the training data
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(final_rmse)

    # Compute a confidence interval for the test RMSE
    from scipy import stats
    confidence = .95
    squared_errors = (final_predictions - y_test)**2
    print(
        np.sqrt(
            stats.t.interval(confidence,
                             len(squared_errors) - 1,
                             loc=squared_errors.mean(),
                             scale=stats.sem(squared_errors))))
    print("complete")
def run_classifier(set_size):
    """Train or load a fastText classifier over 5 stratified splits and score it.

    For every split the train/validation frames are written to the paths in
    the module-level ``data_train`` / ``data_val``, a classifier is loaded or
    trained (depending on ``TRAIN_MODEL``), and precision/recall, a confusion
    table, Cohen's kappa, a classification report and macro F1 are printed.

    Args:
        set_size: Passed to ``u.generate_small_set`` together with
            ``fname_data`` when truthy; otherwise the helper is called with
            ``(None, None)``.

    Returns:
        Tuple ``(scoresTrain, scoresVal)`` of numpy arrays holding the macro
        F1 score of each split on the training and validation data.
    """
    # NOTE(review): this block was re-indented from whitespace-collapsed text;
    # the extent of the split loop and of the else-branch should be confirmed.
    import util as u
    #load data balanced by class labels limited to SET_SIZE
    if set_size:
        df = u.generate_small_set(set_size, fname_data)
    else:
        #load whole data limited by balanced undersampling
        df = u.generate_small_set(None, None)
    # split the data into a training set and a validation set
    from sklearn.model_selection import StratifiedShuffleSplit
    sss = StratifiedShuffleSplit(n_splits=5,
                                 test_size=VALIDATION_SPLIT,
                                 random_state=0)
    X = df.eligibility
    y = df.eligible
    scoresTrain = []  # macro F1 on the training part, one entry per split
    scoresVal = []  # macro F1 on the validation part, one entry per split
    for train_index, test_index in sss.split(X, y):
        # The splitter yields positional indices, hence .iloc.
        df_val, df_train = df.iloc[test_index, :], df.iloc[train_index, :]
        print("training sample after stratified sampling: ")
        print(df_train.describe())
        print("validation sample after after stratified sampling: ")
        print(df_val.describe())
        # Written tab-separated; the same paths are overwritten on every split.
        df_train.to_csv(sep='\t', path_or_buf=data_train)
        df_val.to_csv(sep='\t', path_or_buf=data_val)
        classifier = None
        if TRAIN_MODEL == False:
            print("starting to load model")
            classifier = fasttext.load_model(classifier_fname + '.bin')
        else:
            print("start to train classifier model")
            #classifier = fasttext.supervised(data_train, classifier_fname, pretrained_vectors = './wordEmbeddings/vectorsFastText.vec', epoch= 100)
            #classifier = fasttext.supervised(data_train, classifier_fname, epoch= 100, silent = 0, thread=4, pretrained_vectors = './wordEmbeddings/vectorsFastText_skipgram.vec', )
            classifier = fasttext.supervised(data_train,
                                             classifier_fname,
                                             epoch=100,
                                             silent=0,
                                             thread=4,
                                             lr=0.1)
            print("end")
        result = classifier.test(data_val)
        print('P@1:', result.precision)
        print('R@1:', result.recall)
        print('Number of examples:', result.nexamples)
        # Hand-written sample queries used as a smoke test of the predictions.
        texts = [
            'neuropsychiatric history or altered mental status',
            'pembrolizumab and corticosteroids',
            'trastuzumab and breast cancer and heart insufficiency and dyspnea',
            'trastuzumab and breast cancer',
            'trastuzumab and breast cancer and invasive cancer',
            'nivolumab and hiv',
            'CAR and lymphoma',
            'TCR and breast cancer',
            'in situ breast cancer and pemetrexed',
            'bevacizumab and patients who has had any event of thrombosis',
            'capecitabine and breast cancer and brain metastasis',
            'capecitabine and colon cancer',
            'lapatinib and breast cancer and brain metastasis',
            'pertuzumab and breast cancer and brain metastasis',
        ]
        # predict with the probability
        labels = classifier.predict_proba(texts)
        print(labels)
        # NOTE(review): `data_test` is not defined in this function —
        # presumably a module-level path; confirm it exists.
        result = classifier.test(data_test)
        print(result.precision)  # Precision at one
        print(result.recall)  # Recall at one
        print(result.nexamples)  # Number of test examples
        #k = 1
        # print(classifier.labels) # List of labels
        # print(classifier.label_prefix) # Prefix of the label
        # print(classifier.dim) # Size of word vector
        # print(classifier.ws) # Size of context window
        # print(classifier.epoch) # Number of epochs
        # print(classifier.min_count) # Minimal number of word occurences
        # print(classifier.neg) # Number of negative sampled
        # print(classifier.word_ngrams) # Max length of word ngram
        # print(classifier.loss_name) # Loss function name
        # print(classifier.bucket) # Number of buckets
        # print(classifier.minn) # Min length of char ngram
        # print(classifier.maxn) # Max length of char ngram
        # print(classifier.lr_update_rate) # Rate of updates for the learning rate
        # print(classifier.t) # Value of sampling threshold
        # print(classifier.encoding) # Encoding that used by classifier
        # print(classifier.test(data_val, k)) # Test the classifier
        # print(classifier.predict(texts, k)) # Predict the most likely label
        #print(classifier.predict_proba(texts, k)) # Predict the most likely label include their probability

        #Confusion matrix
        # The model is re-loaded from disk and the validation file read back.
        classifier = fasttext.load_model(classifier_fname + '.bin')
        df_val = pd.read_csv(data_val,
                             sep='\t',
                             header=0,
                             names=["index", "y", "x"])
        predicted = pd.Series(np.array(classifier.predict(df_val.x)).flatten())
        predictedTrain = pd.Series(
            np.array(classifier.predict(df_train.eligibility)).flatten())
        d = {"y_true": df_val.y, "y_pred": predicted}
        df_confVal = pd.DataFrame(d)
        # "__label__0" is treated as the positive class in the four selections.
        truePos = df_confVal.loc[lambda df: (df.y_true == "__label__0") &
                                 (df.y_true == df.y_pred), :]
        FalseNeg = df_confVal.loc[lambda df: (df.y_true == "__label__0") &
                                  (df.y_true != df.y_pred), :]
        trueNeg = df_confVal.loc[lambda df: (df.y_true == "__label__1") &
                                 (df.y_true == df.y_pred), :]
        FalsePos = df_confVal.loc[lambda df: (df.y_true == "__label__1") &
                                  (df.y_true != df.y_pred), :]
        # Columns hold the actual class, rows the predicted class.
        confusion_table = pd.DataFrame(
            {
                "True Positives": [truePos.y_true.size, FalseNeg.y_true.size],
                "True Negatives": [FalsePos.y_true.size, trueNeg.y_true.size]
            },
            index=["Predicted Positives", "Predicted Negatives"])
        print(confusion_table)
        #cohen's Kappa agreement
        from sklearn.metrics import cohen_kappa_score
        kappa = cohen_kappa_score(df_confVal.y_true, df_confVal.y_pred)
        print("kappa =" + str(kappa))
        #classification report
        from sklearn.metrics import classification_report, f1_score
        target_names = ['Eligible', 'Not elegible']
        report = classification_report(df_confVal.y_true,
                                       df_confVal.y_pred,
                                       target_names=target_names)
        print(report)
        f1Val = f1_score(df_confVal.y_true,
                         df_confVal.y_pred,
                         pos_label='__label__0',
                         average='macro')
        scoresVal.append(f1Val)
        # NOTE(review): assumes df_train.eligible already holds "__label__*"
        # strings comparable to the fastText predictions — TODO confirm.
        f1Train = f1_score(df_train.eligible,
                           predictedTrain,
                           pos_label='__label__0',
                           average='macro')
        scoresTrain.append(f1Train)
    scoresTrain = np.array(scoresTrain)
    scoresVal = np.array(scoresVal)
    # The printed label says "Accuracy", but the values are macro F1 scores.
    print("Accuracy " + str(y.size) + ": %0.2f (+/- %0.2f)" %
          (scoresVal.mean(), scoresVal.std() * 2))
    return scoresTrain, scoresVal
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Evaluate ``clf`` over stratified shuffle splits and print pooled metrics.

    The confusion counts are accumulated over all splits, then accuracy,
    precision, recall, F1 and F2 are computed from the pooled counts and
    printed with the module-level format strings.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        dataset: Raw data passed to ``featureFormat``.
        feature_list: Feature names; the label must come first because
            ``targetFeatureSplit`` treats the first column as the target.
        folds: Number of stratified shuffle splits to run.

    Returns:
        ``clf`` (fitted on the last split) when the metrics could be
        computed; ``None`` when a metric denominator was zero.
    """
    # extract the features specified in feature_list
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # Honour the `folds` argument; it was previously ignored, so the
    # splitter silently fell back to its default number of splits.
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        # fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stop scoring this split; counts gathered so far are kept.
                break

    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Only a zero denominator is expected here; any other error should
        # surface instead of being reported as a divide-by-zero.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
        return None

    print(
        PERF_FORMAT_STRING.format(accuracy,
                                  precision,
                                  recall,
                                  f1,
                                  f2,
                                  display_precision=5))
    print(
        RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                     false_positives, false_negatives,
                                     true_negatives))
    print("")
    return clf
# Plain random split, kept for comparison with the stratified split below.
train_set, test_set = train_test_split(housing,
                                       test_size=0.2,
                                       random_state=42)
housing["median_income"].hist()

import numpy as np

# Create a new column: median income scaled down and rounded up to a category.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Values below 5 are kept; everything from 5 upwards is replaced by 5.
# Assign the result back instead of where(..., inplace=True): an in-place
# update on the selected column is a chained assignment and does not
# reliably modify `housing` (never under pandas Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0)
housing["income_cat"].hist()
plt.show()

# A stratified split is used so the sets represent the actual population.
from sklearn.model_selection import StratifiedShuffleSplit

# Keep the same proportion of each income category as in the population.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # The splitter yields positional indices, so select with .iloc
    # (.loc is only equivalent when the frame has a default RangeIndex).
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

strat_train_set.describe()
strat_test_set.describe()
strat_train_set.hist(bins=50, figsize=(12, 7))
strat_test_set.hist(bins=50, figsize=(12, 7))
plt.show()

# Remove the helper column again; reassigning avoids mutating a slice.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)
housing = strat_train_set.copy()
#scatter plot
def main():
    """Run the credit-card default workflow end to end.

    Downloads and loads the data, builds a random split and an age-stratified
    split, prepares a training sample with ``prep_pipeline``, then fits and
    scores logistic regression, decision tree, k-nearest-neighbours, random
    forest and a soft-voting ensemble. Everything is printed; the function
    returns ``None``.
    """
    print("hello world")
    #Downloading data
    print("Downloading data")
    download_credit_card_data()
    #Reading in data to pandas dataframe
    print("Reading in data to pandas dataframe")
    credit_card_df = load_credit_card_data()
    print(f"Columns: \n{credit_card_df.columns}\n")
    print(f"Summary stats:\n{credit_card_df.describe()}\n")
    print(f"Head:\n{credit_card_df.head()}\n")
    #Look at histograms of data
    #Choosing interesting features to look at
    print("Choosing interesting features to look at")
    # NOTE(review): `features` is not used again in this function.
    features = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'AGE']
    #Create test and train sets with random number generator seed set
    print("Create test and train sets with random number generator seed set")
    np.random.seed(42)
    train_set, test_set = split_train_test(credit_card_df, 0.2)
    print(f"Train set length = {len(train_set)}")
    print(f"Test set length = {len(test_set)}")
    #Test data set is being set to the side for the time being
    #...but I also need to transform it at some point as well
    #Seems to be some strangely popular ages in the data
    #University seems to be the most popular level of education
    #BILL_AMT features are very tail-heavy
    #Check for missing data values
    # print("Check for missing data values")
    # print(strat_train_set_sample.info())
    # print(strat_train_set_sample.describe())
    # #12000 values in each - no missing values, but shall add
    # print("Adding imputer")
    # imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    # imp_median.fit(strat_train_set_sample)
    # imp_median.transform(strat_train_set_sample)
    # #Any values need to be encoded?
    #
    # #Feature scaling using StandardScaler
    # print("Feature scaling using StandardScaler")
    # scaler = StandardScaler().fit(strat_train_set)
    # print(scaler.mean_)
    # print(scaler.scale_)
    #Shall sample from age strata to ensure groups are representative of age groups
    print(
        "Shall sample from age strata to ensure groups are representative of age groups"
    )
    #Dividing by 10 gives 6 age categories, rounding up to five groups
    print("Dividing by 10 gives 6 age categories, rounding up to five groups")
    credit_card_df = add_age_category(credit_card_df)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(credit_card_df,
                                               credit_card_df["AGE_cat"]):
        # NOTE(review): the splitter yields positional indices; .loc matches
        # them only if the frame has a default RangeIndex — confirm.
        strat_train_set = credit_card_df.loc[train_index]
        strat_test_set = credit_card_df.loc[test_index]
    print(f"Strat train set length = {len(strat_train_set)}")
    print(f"Strat test set length = {len(strat_test_set)}")
    #Checking how age proportionalities match up with random or stratified sampling
    print(
        "Checking how age proportionalities match up with random or stratified sampling"
    )
    # train_set was split off before AGE_cat was added, so it is added here.
    train_set = add_age_category(train_set)
    strat_train_set = add_age_category(strat_train_set)
    print("Overall:")
    print(credit_card_df["AGE_cat"].value_counts() / len(credit_card_df))
    print("Random:")
    print(train_set["AGE_cat"].value_counts() / len(train_set))
    print("Stratified:")
    print(strat_train_set["AGE_cat"].value_counts() / len(strat_train_set))
    #Stratified sampling does give a better representation of the overall data
    #Removing AGE_cat variable from data frames
    print("Removing AGE_cat variable from data frames")
    for set_ in (strat_test_set, strat_train_set):
        set_.drop("AGE_cat", axis=1, inplace=True)
    # print("Overall\tRandom\tStrat\tRand. Error\tStrat. Error\n")
    # for i, cat in enumerate(credit_card_df["AGE_cat"].value_counts()/len(credit_card_df)):
    #     print(cat,
    #           train_set["AGE_cat"].value_counts()[i]/len(train_set),
    #           strat_train_set["AGE_cat"].value_counts()[i]/len(strat_train_set))
    #     print((train_set["AGE_cat"].value_counts()[i]/len(train_set) - cat) * 100/cat, (strat_train_set["AGE_cat"].value_counts()[i]/len(strat_train_set) - cat) * 100/cat)
    #Splitting test set in to target and feature variables
    strat_test_set_X = strat_test_set.drop(
        columns=["default payment next month"])
    strat_test_set_y = strat_test_set["default payment next month"]
    #Adding ratio variable to test set
    #Instance of attribute adder
    attr_adder = RatioAttributesAdder(add_payment_ratios=True)
    #Returns 2D numpy array
    extra_attribs = attr_adder.transform(strat_test_set_X.values)
    #Adding new data to dataframe
    # Only the last column of the transformer output is kept, as "Ratio".
    # NOTE(review): the test features get this manual Ratio column but are
    # never passed through prep_pipeline, unlike the training features below —
    # confirm both end up with the same preprocessing.
    strat_test_set_X = strat_test_set_X.assign(Ratio=extra_attribs[:, -1])
    #Making a sample of the training set to experiment with
    print("Making a sample of the training set to experiment with")
    strat_train_set_sample = strat_train_set.sample(frac=0.9, random_state=42)
    #Adding another attribute to data frame
    # print("Adding another attribute to data frame")
    # #Instance of attribute adder
    # attr_adder = RatioAttributesAdder(add_payment_ratios = True)
    # #Returns 2D numpy array
    # extra_attribs = attr_adder.transform(strat_train_set_sample.values)
    # #Adding new data to dataframe
    # strat_train_set_sample = strat_train_set_sample.assign(Ratio = extra_attribs[:,-1])
    #
    # print("Shapes:")
    # print(extra_attribs.shape)
    # print(strat_train_set_sample.shape)
    #
    # extra_attribs_columns = (strat_train_set_sample.columns)
    # print(extra_attribs_columns)
    # print(type(extra_attribs_columns))
    #
    # strat_train_set_sample = pd.DataFrame(data = extra_attribs, columns = strat_train_set_sample.columns)
    #
    # print(type(extra_attribs))
    # print(type(strat_train_set_sample))
    # print(len(extra_attribs))
    # print(len(strat_train_set_sample))
    print(strat_train_set_sample.head())
    #Using preparation pipeline
    print("Using preparation pipeline")
    print(strat_train_set_sample.columns)
    strat_train_set_sample_X = strat_train_set_sample.drop(
        columns=["default payment next month"])
    strat_train_set_sample_y = strat_train_set_sample[
        "default payment next month"]
    print(strat_train_set_sample_X.head())
    print(strat_train_set_sample_y.head())
    strat_train_set_sample_X_array = prep_pipeline.fit_transform(
        strat_train_set_sample_X)
    print(strat_train_set_sample_X_array)
    print(strat_train_set_sample_X_array.shape)
    #Putting it back in to pandas df
    # Assumes the pipeline output is the input columns plus one trailing
    # "Ratio" column — TODO confirm against prep_pipeline.
    new_columns = list(strat_train_set_sample_X.columns)
    new_columns.append("Ratio")
    print(new_columns)
    strat_train_set_sample_X = pd.DataFrame(
        columns=new_columns, data=strat_train_set_sample_X_array)
    # strat_train_set_sample.assign(Ratio = [])
    # strat_train_set_sample.append(strat_train_set_sample_array)
    print(strat_train_set_sample_X)
    print(strat_train_set_sample_X.describe())
    #Selecting correlated features
    print("Selecting correlated features using earlier data ")
    # The selection result is only printed; the models below are fitted on
    # strat_train_set_sample_X.
    strat_train_set_sample = select_correlated_features(
        strat_train_set_sample,
        threshold=0.08,
        plot_boolean=False,
        target="default payment next month")
    print(strat_train_set_sample)
    #Now trying some models for the data
    print("Now trying some models for the data")
    print(strat_train_set_sample["PAY_0"])
    #Logistic regression
    print("\n\n\nLogistic regression")
    #Instance of logistic regression model
    log_reg = LogisticRegression()  #penalty = 'l2', C = 0.1,random_state = 0)
    # log_reg.fit(X, y)
    # print(f"Score: {log_reg.score(X, y)}")
    #Using GridSearchCV to find optimum parameters
    #Making parameter grid
    param_grid = [{
        'C': np.logspace(-5, 1, 4),
        'penalty': ['l2'],
        'solver': ['sag']
    }]
    #Making grid search object
    grid_clf = GridSearchCV(log_reg,
                            param_grid=param_grid,
                            cv=3,
                            scoring='neg_mean_squared_error')
    grid_clf.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(grid_clf.best_params_)
    cvres = grid_clf.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)
    # The final model uses a hard-coded C rather than grid_clf.best_params_.
    log_reg = LogisticRegression(penalty='l2', C=0.001, random_state=0)
    log_reg.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {log_reg.score(strat_test_set_X, strat_test_set_y)}")
    #Decision trees
    print("Decision tree")
    #Instance of decision tree classifier
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {dt_clf.score(strat_test_set_X, strat_test_set_y)}")
    #Support Vector Machine - is taking a very very long time
    # X_svm = np.array(strat_train_set_sample["PAY_0"]).reshape(-1, 1)#.reshape(-1, 1)
    # print("Support Vector Machine")
    # svm_clf = svm.SVC(kernel='linear')
    # svm_clf.fit(X_svm[:1000], y[:1000])
    # print(f"Score: {svm_clf.score(X_svm, y)}")
    #K-nearest neighbours model
    print("K-nearest neighbours")
    knn_model = KNeighborsClassifier(n_neighbors=4)
    knn_model.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {knn_model.score(strat_test_set_X, strat_test_set_y)}")
    #Random forest (the estimator is a classifier, despite the printed label)
    print("Random forest regressor")
    param_grid = [
        {
            'bootstrap': [True],
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]
    forest_reg = RandomForestClassifier()
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error')
    grid_search.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        # The scorer is negated MSE, so the sign is flipped before the root.
        print(np.sqrt(-mean_score), params)
    print("Checking that the lower the gridsearchcv score, the better")
    forest_reg_1 = RandomForestClassifier(max_features=8, n_estimators=30)
    forest_reg_1.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {forest_reg_1.score(strat_test_set_X, strat_test_set_y)}")
    forest_reg_2 = RandomForestClassifier(max_features=8, n_estimators=3)
    forest_reg_2.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {forest_reg_2.score(strat_test_set_X, strat_test_set_y)}")
    forest_reg_3 = RandomForestClassifier(max_features=2, n_estimators=3)
    forest_reg_3.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {forest_reg_3.score(strat_test_set_X, strat_test_set_y)}")
    #Making an ensemble model
    print(f"Making an ensemble model")
    voting_clf = VotingClassifier(
        estimators=[('lr', log_reg), ('knn', knn_model),
                    ('rndm_for', forest_reg_3)],  # ('svm', svm_clf)],
        voting='soft'  #Soft currently doing better than hard
    )
    voting_clf.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {voting_clf.score(strat_test_set_X, strat_test_set_y)}")
    return
def main():
    """Run the cognitive-score classification experiment and save the results.

    Reads the isoform expression table and the sample labels from CSV,
    min-max scales the expression values, and evaluates the model returned by
    ``create_model`` on five stratified shuffle splits (75% train / 25% test)
    of the cognitive labels. The per-sample predictions and the elapsed time
    are written as CSV files under the hard-coded result directory.
    """
    # All input files are read relative to this directory.
    os.chdir('/Users/xuejiang/PycharmProjects/isofom_/data/')

    # ========= Step 1. Load the data ===========
    isoform_expression_df = pd.read_csv('select_isoform_express.csv')
    # ``DataFrame.as_matrix`` was removed in pandas 1.0; ``.values`` is the
    # equivalent that exists in every pandas version.
    isoform_table = isoform_expression_df.values
    # Take the two leading identifier columns BEFORE slicing them off;
    # previously they were read from the already-sliced matrix and therefore
    # held expression values. (Currently not used further below.)
    isoform_name = isoform_table[:, :2]
    isoform_expression = isoform_table[:, 2:]

    # Scale every column to the [0, 1] range.
    scaler = MinMaxScaler()
    isoform_express_scaled = scaler.fit_transform(isoform_expression)

    sample_label_df = pd.read_csv('sample_label.csv')
    sample_label = sample_label_df.values
    sample_name = sample_label[:, 0]
    sample_label_state = sample_label[:, 1]
    sample_label_cognitive = sample_label[:, 2]

    # Inputs of the binary AD / non-AD ("state") experiment. That experiment
    # is currently disabled; only its inputs are still prepared here.
    isoform_express_scaled_state, sample_label_state = trasform_data_format(
        isoform_express_scaled, sample_label_state)
    # One-hot encode the labels.
    sample_label_state_onehot = tf.keras.utils.to_categorical(sample_label_state)

    isoform_express_scaled_cognitive, sample_label_cognitive = trasform_data_format(
        isoform_express_scaled, sample_label_cognitive)
    # One-hot encode the labels.
    sample_label_cognitive_onehot = tf.keras.utils.to_categorical(sample_label_cognitive)

    # ========= Experiment on the cognitive assessment score ===========
    true_sample_name_c = []
    true_label_c = []
    predict_label_c = []

    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, train_size=0.75, random_state=0)

    # ``time.clock`` was removed in Python 3.8; ``perf_counter`` is the
    # recommended replacement for measuring elapsed time.
    start_time_all = time.perf_counter()
    for train_index, test_index in ss.split(isoform_express_scaled_cognitive,
                                            sample_label_cognitive_onehot):
        # Train and test features must come from the SAME (cognitive) matrix;
        # the training rows used to be taken from the state matrix by mistake.
        X_train = isoform_express_scaled_cognitive[train_index]
        X_test = isoform_express_scaled_cognitive[test_index]
        y_train = sample_label_cognitive_onehot[train_index]
        y_test_true = sample_label_cognitive[test_index]
        sample_test_name = sample_name[test_index]

        model = create_model()
        model.fit(X_train, y_train, validation_split=0.1, epochs=20, batch_size=20, verbose=1)
        predict = model.predict(X_test)

        true_sample_name_c.append(sample_test_name)
        true_label_c.append(y_test_true)
        predict_label_c.append(predict)
    stop_time_all = time.perf_counter()
    cost_all_c = stop_time_all - start_time_all

    # Stack the per-split results into column vectors / a probability matrix.
    true_sample_name_c = np.array(true_sample_name_c).reshape(-1, 1)
    print(true_sample_name_c.shape)
    true_label_c = np.array(true_label_c).reshape(-1, 1)
    print(true_label_c.shape)
    # One row per test sample, one column per class (6 classes).
    predict_label_c = np.reshape(np.array(predict_label_c), (-1, 6))
    print(predict_label_c.shape)
    final_pre_c = predict_label_c.argmax(axis=1).reshape(-1, 1)

    label_all_c = np.hstack((true_sample_name_c, true_label_c, predict_label_c, final_pre_c))
    col_names = np.array(['sample name', 'true_label', 'predict 0', 'predict 1', 'predict 2',
                          'predict 3', 'predict 4', 'predict 5', 'predict'])
    label_all_c = np.vstack((col_names, label_all_c))

    # Save the results.
    cost_df = pd.DataFrame(data=np.array([cost_all_c]))
    cost_df.to_csv('/Users/xuejiang/PycharmProjects/isofom_/result/cnn/5/time_cost_c.csv')
    label_all_c_df = pd.DataFrame(data=label_all_c)
    label_all_c_df.to_csv('/Users/xuejiang/PycharmProjects/isofom_/result/cnn/5/label_all_c.csv')
Y[i] -= 2 if Y[i] == 6: Y[i] -= 3 if Y[i] == 8 or Y[i] == 9: Y[i] -= 4 #calculate class weights for trainer class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y) class_weights = dict(enumerate(class_weights)) #convert to one hot encoding Y = keras.utils.to_categorical(Y, dtype='float32') #split out the test se sss = StratifiedShuffleSplit(n_splits=2, test_size=500) tt_val_index, _ = sss.split(X, Y) X_train_val = X[tt_val_index[0]] Y_train_val = Y[tt_val_index[0]] X_test = X[tt_val_index[1]] Y_test = Y[tt_val_index[1]] #split out the validation set sss = StratifiedShuffleSplit(n_splits=2, test_size=100) tt_index, _ = sss.split(X_train_val, Y_train_val) X_train = X_train_val[tt_index[0]] Y_train = Y_train_val[tt_index[0]] X_val = X_train_val[tt_index[1]] Y_val = Y_train_val[tt_index[1]] #define model input = layers.Input(shape=(183, 183, 6))
# In[21]:

datSet

# In[22]:

from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split that preserves the "income_cat" proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(datSet, datSet["income_cat"]):
    # split() yields POSITIONAL indices, so rows must be selected with .iloc.
    # .loc only gives the same rows when the index is the default 0..n-1
    # range, and silently picks wrong rows (or raises) otherwise.
    strat_train_set = datSet.iloc[train_index]
    strat_test_set = datSet.iloc[test_index]

# In[23]:

strat_train_set

# In[24]:

test_index
# In[10]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Report the class balance of the full dataset.
print('No Frauds', round(df['Class'].value_counts()[0] / len(df) * 100, 2), '% of the dataset')
print('Frauds', round(df['Class'].value_counts()[1] / len(df) * 100, 2), '% of the dataset')

y = df['Class']
X = df.drop('Class', axis=1)

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Every split is printed, but each iteration overwrites the previous one, so
# the "original_*" variables end up holding the LAST of the five splits.
# The "original" prefix keeps them apart from the X_train / y_train of the
# undersampled data so those are not overwritten.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    # Convert straight to NumPy arrays.
    original_Xtrain = X.iloc[train_index].values
    original_Xtest = X.iloc[test_index].values
    original_ytrain = y.iloc[train_index].values
    original_ytest = y.iloc[test_index].values

# original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(X, y, test_size=0.2, random_state=42)
train_dataset = Caltech(DATA_DIR, split='train', transform=train_transform)
test_dataset = Caltech(DATA_DIR, split='test', transform=eval_transform)

classes = train_dataset.classes
class_to_idx = train_dataset.class_to_idx

# Collect every (image, label) pair so the split can be stratified on labels.
X, y = [], []
for image, label in train_dataset:
    X.append(image)
    y.append(label)

# A single stratified 50/50 split of the training data into train / validation.
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, random_state=0)
train_index, val_index = next(sss.split(X, y))
train_indexes = train_index  # indices of the train split
val_indexes = val_index  # indices of the validation split

# Both subsets are built from the full training set; the right-hand side is
# evaluated before ``train_dataset`` is rebound.
train_dataset, val_dataset = (
    Subset(train_dataset, train_indexes),
    Subset(train_dataset, val_indexes),
)

# Check dataset sizes
n_train, n_val, n_test = len(train_dataset), len(val_dataset), len(test_dataset)
print('Train Dataset: {}'.format(n_train))
print('Valid Dataset: {}'.format(n_val))
print('Test Dataset: {}'.format(n_test))
print('Dataset size: {}'.format(n_train + n_val + n_test))

"""**Images distribution among classes**"""

count_train_items = {}
# Bin GrLivArea so that the training and hold-out sets can be sampled with
# matching proportions per bin.
# Only useful for the linear model; the boosted/bagged tree-based models
# should do fine with whatever we give them.
train_data['living_area_cat'] = pd.cut(
    train_data['GrLivArea'],
    bins=[0, 500, 1000, 1500, 2000, 2500, np.inf],
    labels=list(range(1, 7)),
)

# One stratified split on the living-area bin. The returned indices are
# positional, hence .iloc (an earlier version used .loc here).
split = StratifiedShuffleSplit(n_splits=1, test_size=my_test_size, random_state=9261774)
train_index, test_index = next(split.split(train_data, train_data['living_area_cat']))
X_train = train_data.iloc[train_index].copy()  # this is the training data
X_test = train_data.iloc[test_index].copy()  # hold-out: the portion of the training data used for testing

# Set up y (the label): pop() returns the SalePrice column and removes it
# from the x variables in one step.
y_train = X_train.pop('SalePrice')
y_test = X_test.pop('SalePrice')

# Start of the submission data frame: sub_data is already loaded, so store
# the Id now; the y predictions are added later.
submission_id = sub_data['Id']
# import model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

# Ten stratified 90/10 shuffle splits with a fixed seed. The splits are
# generated by the loop below; a stand-alone ``sss.split(...)`` call whose
# result was discarded has been removed.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=10)

# Candidate models, all with their default hyper-parameters.
classifiers = [
    SVC(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    GaussianNB(),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
    MLPClassifier(),
    DecisionTreeClassifier(),
]
acc_table = {}  # a dictionary that stores the predictions

for train_index, test_index in sss.split(train_X, train_y):
    train_X_cv, test_X_cv = train_X[train_index], train_X[test_index]
    train_y_cv, test_y_cv = train_y[train_index], train_y[test_index]
#Loading Data and splitting data into train, validation and test set
import idx2numpy
import numpy as np

file = "t10k-images-idx3-ubyte"
x_test = idx2numpy.convert_from_file(file)
file = "t10k-labels.idx1-ubyte"
y_test = idx2numpy.convert_from_file(file)
file = "train-images-idx3-ubyte"
x_train_val = idx2numpy.convert_from_file(file)
file = "train-labels-idx1-ubyte"
y_train_val = idx2numpy.convert_from_file(file)

# One marker per train+validation sample: -1 for training rows, 0 for
# validation rows (presumably consumed by a PredefinedSplit later -- confirm).
# Sized from the data instead of a hard-coded 60000.
test_fold = np.zeros((len(x_train_val), 1))

from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 10000 samples, stratified by label, as the validation set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=10000)
for train_index, test_index in sss.split(x_train_val, y_train_val):
    x_train, y_train = x_train_val[train_index], y_train_val[train_index]
    x_val, y_val = x_train_val[test_index], y_train_val[test_index]
    test_fold[train_index] = -1
    test_fold[test_index] = 0

print("Training Set ", x_train.shape, y_train.shape)
print("Validation Set ", x_val.shape, y_val.shape)
print("Test Set ", x_test.shape, y_test.shape)

# Flatten every image into one feature row. The row count is taken from each
# array and the feature count is inferred, so the code no longer depends on
# the exact 50000/10000/60000 x 784 sizes.
x_train = x_train.reshape(len(x_train), -1)
x_val = x_val.reshape(len(x_val), -1)
x_test = x_test.reshape(len(x_test), -1)
x_train_val = x_train_val.reshape(len(x_train_val), -1)

#Renormalizing the features of the data
scal = StandardScaler()
def classify(
    X,
    y,
    verbose=False,
    nfolds=2,
    dim_red=None,
    n_components=[5, 10, 20],
    scale=True,
    fs=None,
    njobs=1,
    LR_C=[0.01, 0.1, 1, 10, 100],
    LR_class_weight=[None, "balanced"],
    SVC_C=[0.01, 0.1, 1, 10, 100],
    SVC_class_weight=[None, "balanced"],
    SVC_kernels=["rbf", "linear", "poly"],
    n_estimators=[10, 20, 30],
    max_features=["auto", "log2", None],
    **kwargs
):
    """Compare several classifiers with nested stratified cross-validation.

    For every model an outer ``StratifiedShuffleSplit`` provides train/test
    partitions; on each training partition a ``GridSearchCV`` (using an inner
    ``StratifiedShuffleSplit``) tunes the pipeline, and the accuracy of the
    tuned pipeline on the held-out partition is recorded.

    Parameters
    ----------
    X, y : indexable with integer index arrays (``X[train_idx]``).
    verbose : print the call arguments, the grids and the results.
    nfolds : number of splits of both the inner and the outer CV.
    dim_red : None, "pca" or "ica"; optional dimensionality reduction step.
    n_components : grid for the reduction / RFE feature count.
    scale : prepend a ``RobustScaler`` step.
    fs : None, "l1" (``SelectFromModel`` on an L1 ``LinearSVC``), "rfe"
        (``RFE`` around the current model) or a ready-made transformer.
    njobs : ``n_jobs`` for the grid search.
    LR_C, LR_class_weight, SVC_C, SVC_class_weight, SVC_kernels,
    n_estimators, max_features : hyper-parameter grids. The list defaults are
        only read, never mutated.
    **kwargs : accepted and ignored.

    Returns
    -------
    (results, models) : ``results[i]`` is the list of outer-fold accuracies of
        ``models[i]``, a ``(name, estimator, param_grid)`` tuple.

    Raises
    ------
    ValueError : if ``dim_red`` is neither None, "pca" nor "ica".
    """
    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect

        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print(" %s = %s" % (i, values[i]))

    # prepare configuration for cross validation test harness
    seed = 8

    # prepare models; all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models = [
        (
            "LR",
            LogisticRegression(multi_class="multinomial", solver="newton-cg"),
            {"C": LR_C, "class_weight": LR_class_weight},
        ),
        ("LDA", LinearDiscriminantAnalysis(), {}),
        ("RndFor", RandomForestClassifier(), {"n_estimators": n_estimators, "max_features": max_features}),
        ("NB", GaussianNB(), {}),
        ("SVC", SVC(), {"C": SVC_C, "class_weight": SVC_class_weight, "kernel": SVC_kernels}),
        ("Most frequent", DummyClassifier(strategy="most_frequent"), {}),
        ("Stratified", DummyClassifier(strategy="stratified"), {}),
    ]

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print("Trying these parameters:")
        for m in models:
            print("%s : %s" % (m[0], m[2]))

    # evaluate each model in turn
    results = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)
        outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)

        steps = [("clf", model)]
        pipe_params = {"clf__%s" % key: val for key, val in params.items()}

        # Build the feature selector in a LOCAL variable. The ``fs`` argument
        # used to be overwritten with the estimator here, so for every model
        # after the first the string comparison failed and the first model's
        # selector (and, for "rfe", no feature-count grid) was reused.
        selector = None
        if fs == "l1":
            selector = feature_selection.SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
        elif fs == "rfe":
            selector = feature_selection.RFE(estimator=model)
            pipe_params["feat_sel__n_features_to_select"] = n_components
        elif fs is not None:
            # caller-supplied transformer
            selector = fs
        if selector is not None:
            steps = [("feat_sel", selector)] + steps

        if dim_red is not None:
            if dim_red == "pca":
                reducer = decomposition.PCA()
            elif dim_red == "ica":
                reducer = decomposition.FastICA()
            else:
                raise ValueError("Unknown dim_red option: %r" % (dim_red,))
            pipe_params["dim_red__n_components"] = n_components
            steps = [("dim_red", reducer)] + steps

        if scale:
            steps = [("scale", preprocessing.RobustScaler())] + steps

        pipe = Pipeline(steps)

        cv_results = []
        for cnt, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            opt_model = GridSearchCV(estimator=pipe, param_grid=pipe_params, verbose=0, n_jobs=njobs, cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and params:
                print("Best paramaters for %s  (%d/%d):" % (name, cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)

            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
        results.append(cv_results)

    if verbose:
        print("\n======")
        for model, res in zip(models, results):
            print("%s: %f (%f)" % (model[0], np.mean(res), np.std(res)))
        print("Chance: %f" % (1 / float(len(np.unique(y)))))
        print("======\n")

    return results, models
if __name__ == "__main__":
    # Command line: <data file> <test_size> <n_splits>
    n_splits = int(sys.argv[3])
    test_size = float(sys.argv[2])
    filename = sys.argv[1]
    # Known class labels per supported data file (used to binarize labels).
    classes = {"iris.data.txt": ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
               "scale1.data.txt": [0, 1, 2]}
    # filename = "scale1.data.txt"
    data = pd.read_csv(filename, header=None)
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    # Last column is the label, everything before it the features.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1:]
    total_pre = 0.0
    total_acc = 0.0
    for train_indices, test_indices in sss.split(X, y):
        # split() yields positional indices, so use .iloc rather than the
        # label-based .loc.
        train_f = X.iloc[train_indices]
        train_l = y.iloc[train_indices]
        test_f = X.iloc[test_indices]
        test_l = y.iloc[test_indices]
        # Re-number the training rows from 0; drop=True discards the old
        # index instead of adding it as an "index" column to be removed.
        train_set = data.iloc[train_indices].reset_index(drop=True)
        test_set = data.iloc[test_indices]
        decision_tree = dt.Decision_tree(filename)
        decision_tree.reload_data(train_set)
        decision_tree.run()
        pre_l = decision_tree.predict(test_set.values.tolist())
        pre_l_binarized = label_binarize(pre_l, classes=classes[filename])
        test_l_binarized = label_binarize(test_l, classes=classes[filename])
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from multiprocessing import Pool, cpu_count
import pickle

dataset = pd.read_csv('../data/data.csv')

# Encode the colour names as integer class labels.
le = LabelEncoder()
labels = le.fit_transform(dataset['color'])
train = dataset[['r', 'g', 'b']]

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)

# All ten splits are generated, but only the LAST one is kept as the
# train/test partition.
train_index, test_index = list(sss.split(train, labels))[-1]
X_train = train.values[train_index]
X_test = train.values[test_index]
y_train = labels[train_index]
y_test = labels[test_index]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]
def classify(X, y, verbose=False, nfolds=5, dim_red=None,
             n_components=[5, 10, 20], scale=True, fs=None,
             njobs=1,
             LR_C=[.01, .1, 1, 10, 100], LR_class_weight=[None, 'balanced'],
             SVC_C=[.01, .1, 1, 10, 100], SVC_class_weight=[None, 'balanced'],
             SVC_kernels=['rbf', 'linear', 'poly'],
             n_estimators=[10, 20, 30], max_features=['auto', 'log2', None],
             shuffle=False,
             **kwargs):
    """Compare several classifiers with nested stratified cross-validation.

    For every model an outer CV provides train/test partitions; on each
    training partition a ``GridSearchCV`` (using an inner CV of the same kind)
    tunes the pipeline, and the accuracy of the tuned pipeline on the held-out
    partition is recorded.

    Parameters
    ----------
    X, y : indexable with integer index arrays (``X[train_idx]``).
    verbose : print the call arguments, the grids and the results.
    nfolds : number of splits of both the inner and the outer CV.
    dim_red : None, 'pca' or 'ica'; optional dimensionality reduction step.
    n_components : grid for the reduction / RFE feature count.
    scale : prepend a ``RobustScaler`` step.
    fs : None, 'l1' (``SelectFromModel`` on an L1 ``LinearSVC``), 'rfe'
        (``RFE`` around the current model) or a ready-made transformer.
    njobs : ``n_jobs`` for the grid search.
    LR_C, LR_class_weight, SVC_C, SVC_class_weight, SVC_kernels,
    n_estimators, max_features : hyper-parameter grids. The list defaults are
        only read, never mutated.
    shuffle : use ``StratifiedShuffleSplit`` (10% test) when True, plain
        ``StratifiedKFold`` otherwise.
    **kwargs : accepted and ignored.

    Returns
    -------
    (results, models) : ``results[i]`` is the list of outer-fold accuracies of
        ``models[i]``, a ``(name, estimator, param_grid)`` tuple.

    Raises
    ------
    ValueError : if ``dim_red`` is neither None, 'pca' nor 'ica'.
    """
    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print(" %s = %s" % (i, values[i]))

    # prepare configuration for cross validation test harness
    seed = 8

    # prepare models; all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models = [
        ('LR', LogisticRegression(multi_class='multinomial', solver='newton-cg'),
         {"C": LR_C, "class_weight": LR_class_weight}),
        ('LDA', LinearDiscriminantAnalysis(), {}),
        ('RndFor', RandomForestClassifier(),
         {'n_estimators': n_estimators, 'max_features': max_features}),
        ('NB', GaussianNB(), {}),
        ('SVC', SVC(),
         {"C": SVC_C, "class_weight": SVC_class_weight, 'kernel': SVC_kernels}),
        ('Most frequent', DummyClassifier(strategy='most_frequent'), {}),
        ('Stratified', DummyClassifier(strategy='stratified'), {}),
    ]

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print('Trying these parameters:')
        for m in models:
            print('%s : %s' % (m[0], m[2]))

    # evaluate each model in turn
    results = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        if shuffle:
            inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.1, random_state=seed)
            outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.1, random_state=seed)
        else:
            # do this if no shuffling is wanted. random_state has no effect
            # without shuffle=True and is rejected by current scikit-learn,
            # so it is not passed.
            inner_cv = StratifiedKFold(n_splits=nfolds)
            outer_cv = StratifiedKFold(n_splits=nfolds)

        steps = [('clf', model)]
        pipe_params = {'clf__%s' % key: val for key, val in params.items()}

        # Build the feature selector in a LOCAL variable. The ``fs`` argument
        # used to be overwritten with the estimator here, so for every model
        # after the first the string comparison failed and the first model's
        # selector (and, for 'rfe', no feature-count grid) was reused.
        selector = None
        if fs == 'l1':
            selector = feature_selection.SelectFromModel(
                LinearSVC(C=0.1, penalty="l1", dual=False))
        elif fs == 'rfe':
            selector = feature_selection.RFE(estimator=model)
            pipe_params['feat_sel__n_features_to_select'] = n_components
        elif fs is not None:
            # caller-supplied transformer
            selector = fs
        if selector is not None:
            steps = [('feat_sel', selector)] + steps

        if dim_red is not None:
            if dim_red == 'pca':
                reducer = decomposition.PCA()
            elif dim_red == 'ica':
                reducer = decomposition.FastICA()
            else:
                raise ValueError('Unknown dim_red option: %r' % (dim_red,))
            pipe_params['dim_red__n_components'] = n_components
            steps = [('dim_red', reducer)] + steps

        if scale:
            steps = [('scale', preprocessing.RobustScaler())] + steps

        pipe = Pipeline(steps)

        cv_results = []
        for cnt, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            opt_model = GridSearchCV(estimator=pipe, param_grid=pipe_params,
                                     verbose=0, n_jobs=njobs, cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and params:
                print('Best paramaters for %s  (%d/%d):'
                      % (name, cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)

            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
        results.append(cv_results)

    if verbose:
        print('\n======')
        for model, res in zip(models, results):
            print("%s: %f (%f)" % (model[0], np.mean(res), np.std(res)))
        print('Chance: %f' % (1 / float(len(np.unique(y)))))
        print('======\n')

    return results, models
# Search for a test split, stratified on acquisition site, whose age and sex
# distributions do not differ too much from the remaining (train) samples.
test_path = path.join(results_path, filename + '_' + args.subset_name + '.tsv')
flag_selection = True  # stays True until an acceptable split has been found
sex = list(merged_df.gender.values)
site = list(merged_df.site.values)
age = list(merged_df.age.values)
train_index, test_index = None, None
# No random_state is given, so every pass of the loop draws a new split.
# NOTE(review): the loop has no iteration limit; it never ends if no split
# satisfies both thresholds.
while flag_selection:
    splits = StratifiedShuffleSplit(n_splits=1, test_size=args.test_size)
    # Only the labels (site) matter for stratification; the zeros are a
    # placeholder feature array of the right length.
    for train_index, test_index in splits.split(np.zeros(len(site)), site):
        age_test = [float(age[idx]) for idx in test_index]
        age_train = [float(age[idx]) for idx in train_index]
        # sex_dict maps the gender values to the codes passed to chi2
        # (defined outside this fragment).
        sex_test = [sex_dict[sex[idx]] for idx in test_index]
        sex_train = [sex_dict[sex[idx]] for idx in train_index]
        # Compare the age samples and the sex samples of the two sets.
        # presumably ttest_ind is scipy.stats.ttest_ind and chi2 a helper
        # returning a test statistic -- confirm against the imports.
        t_age, p_age = ttest_ind(age_test, age_train)
        T_sex = chi2(sex_test, sex_train)
        print(p_age, T_sex)
        # Accept the split when the age p-value is above its threshold and
        # the sex statistic is below its threshold.
        if p_age > args.p_val_threshold and T_sex < args.t_val_threshold:
            flag_selection = False
            test_df = merged_df.iloc[test_index]
print()

# Split every survey row into its data columns (x) and choice columns (y).
choice_end = num_data_col + num_choice_col
x_data = np.array([row[:num_data_col] for row in survey_data])  # [0,6) == [0,5]
y_data = np.array([row[num_data_col:choice_end] for row in survey_data])
x_headers = list(survey_headers[1:6])

print('x-shape: ' + str(x_data.shape))
print('y-shape: ' + str(y_data.shape))

# ---------------------------------------------
#%%
# use sklearn to perform stratified randomized partitioning into training and dev sets
# this is necessary because the vehicle choice dataset is very unbalanced
trainPerc, devePerc = 0.95, 0.05  # deep learning uses much higher %'s for training
sss = StratifiedShuffleSplit(n_splits=1, train_size=trainPerc, test_size=devePerc)
train_indices, deve_indices = next(sss.split(x_data, y_data))
num_train_rows = len(train_indices)  # need this later on

# create the partitions
x_vals_train, y_vals_train = x_data[train_indices, :], y_data[train_indices, :]
x_vals_deve, y_vals_deve = x_data[deve_indices, :], y_data[deve_indices, :]
print("num_train_rows: %u, num_deve_rows: %u" % (num_train_rows, len(deve_indices)))

# ---------------------------------------------
#%%
# setup training
a_stdv = 0.1  # standard dev. for initialization of node weights
learn_rate = 1.0  # gradient descent learning rate
def main():
    """Train (or load) a random-forest credit-scoring model and validate it.

    Loads the training and test CSV files, holds out one third of the
    training data for validation, imputes missing values with the column
    mean, and then either runs a grid search over a random forest (when no
    saved model exists) or loads the saved model. The model is scored on the
    held-out third via ``computeAUC`` and written to ``rfc_model.m``.
    """
    # 1. Load the data (training and test) and pre-process it:
    #    - the 96 / 98 codes in the NOTime30-59, NOTime60-89 and NOTimes90
    #      columns are read as NaN
    #    - an age of 0 is read as NaN
    colnames = [
        'ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 'DebtRatio',
        'Income', 'NOCredit', 'NOTimes90', 'NORealEstate', 'NOTime60-89',
        'NODependents'
    ]
    col_nas = [
        '', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA',
        [98, 96], 'NA'
    ]
    col_na_values = creatDictKV(colnames, col_nas)

    dftrain = pd.read_csv("./data/cs-training.csv", names=colnames,
                          na_values=col_na_values, skiprows=[0])
    # pop() removes the column from the frame; NOCredit and ID are not used
    # as features.
    dftrain.pop("NOCredit")
    dftrain.pop("ID")
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that exists in every pandas version.
    x_train = dftrain.values

    dftest = pd.read_csv("./data/cs-test.csv", names=colnames,
                         na_values=col_na_values, skiprows=[0])
    dftest.pop("NOCredit")
    dftest.pop("ID")
    dftest.pop("label")
    x_test = dftest.values

    # 2. Use StratifiedShuffleSplit to split the training data into
    #    training_new and test_new (the latter is used to validate the model).
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]
    y_train = y_train_new
    x_train = x_train_new

    # 3. Replace NaN with the column mean, fitted on the training part only.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)
    #x_train = np.delete(x_train, 5, axis=1)
    #x_test_new = np.delete(x_test_new, 5, axis=1)

    if not os.path.isfile("rfc_model.m"):
        clf = RandomForestClassifier(n_estimators=100,
                                     oob_score=True,
                                     min_samples_split=2,
                                     min_samples_leaf=50,
                                     n_jobs=-1,
                                     class_weight='balanced_subsample',
                                     bootstrap=True)
        # Fit once so the feature importances can be reported below.
        clf.fit(x_train, y_train)
        param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]}
        grid_search = GridSearchCV(clf, cv=10, scoring='roc_auc',
                                   param_grid=param_grid, iid=False, n_jobs=-1)
        # Search for, save and report the best model.
        grid_search.fit(x_train, y_train)
        joblib.dump(grid_search, "rfc_model.m")
        print("the best parameter:", grid_search.best_params_)
        print("the best score:", grid_search.best_score_)
        predicted_probs_train = grid_search.predict_proba(x_train)
        # Keep the probability of the positive class (second column).
        predicted_probs_train = [x[1] for x in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
        print(
            sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_),
                       dftrain.columns),
                   reverse=True))
    else:
        clf = joblib.load("rfc_model.m")

    # Score the model on the held-out validation part.
    # NOTE(review): on the training path ``clf`` is the plain forest, not the
    # tuned ``grid_search`` that was just saved -- confirm this is intended.
    predicted_probs_test_new = clf.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)

    # NOTE(review): this refits the model on the validation data only and
    # overwrites the model file written above -- confirm this is intended.
    clf.fit(x_test_new, y_test_new)
    joblib.dump(clf, "rfc_model.m")
# def countTokens(tokens):
#     return tokens.count(word)
# data[word] = data['tokens'].apply(countTokens)
#data.drop("tokens", axis = 1, inplace = True)

print('counting tokens by file')
data['tok_array'] = data['tokens'].apply(createTokenArray)

print('saving data')
# The CSV is written while the raw "tokens" column is still present; the
# column is dropped from the in-memory frame afterwards.
data.to_csv('data.csv', sep=',', encoding='utf-8')
data.drop("tokens", axis=1, inplace=True)

# One stratified 80/20 split that preserves the "type" proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["type"]):
    # split() yields POSITIONAL indices, so rows must be selected with .iloc;
    # .loc is label-based and only agrees for a default 0..n-1 index.
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]


def type_proportions(data):
    """Return the share of rows per value of the "type" column."""
    return data["type"].value_counts() / len(data)


# Compare the class proportions of the full data with both split parts.
compare_props = pd.DataFrame({
    "Overall": type_proportions(data),
    "Stratified": type_proportions(strat_train_set),
    "Stratified-test": type_proportions(strat_test_set),
}).sort_index()
# Relative deviation from the overall proportion, in percent.
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props["Strat. test %error"] = 100 * compare_props["Stratified-test"] / compare_props["Overall"] - 100
compare_props
def lab():
    """Handle the prediction form: preprocess the input and query the model.

    On a valid submission the form values are turned into a one-row
    DataFrame, transformed with a preprocessing pipeline fitted on the
    stratified training part of ``datasets/sanbul-5.csv``, and sent to the
    hosted model. The first predicted value is rendered with ``result.html``;
    otherwise the form page ``prediction.html`` is rendered.

    Raises:
        RuntimeError: if the prediction service reports an error.
    """
    form = LabForm()
    if form.validate_on_submit():
        feature_names = [
            'latitude', 'longitude', 'month', 'day', 'avg_temp', 'max_temp',
            'max_wind_speed', 'avg_wind'
        ]
        # Build the request row with a proper type per column. Going through
        # a single NumPy array turned every value, the numeric ones included,
        # into a string.
        X_test = pd.DataFrame([[
            float(form.latitude.data),
            float(form.longitude.data),
            str(form.month.data),
            str(form.day.data),
            float(form.avg.data),
            float(form.max.data),
            float(form.wind_s.data),
            float(form.wind_avg.data)
        ]], columns=feature_names)
        print(X_test.shape)
        print(X_test)

        fires = pd.read_csv('datasets/sanbul-5.csv', sep=',')

        # Use only the stratified (by month) training part to fit the
        # preprocessing. split() yields positional indices, hence .iloc.
        from sklearn.model_selection import StratifiedShuffleSplit
        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_index, _ = next(split.split(fires, fires["month"]))
        strat_train_set = fires.iloc[train_index]
        fires = strat_train_set.drop(["burned_area"], axis=1)  # drop labels for training set

        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import OneHotEncoder
        from sklearn.preprocessing import StandardScaler

        # Numeric columns are standardized, the two categorical ones one-hot
        # encoded.
        num_attribs = list(fires.drop(["month", "day"], axis=1))
        cat_attribs = ["month", "day"]
        num_pipeline = Pipeline([
            ('std_scaler', StandardScaler()),
        ])
        # sparse_threshold=0 always yields a dense array, which the
        # ``.tolist()`` call below requires.
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ], sparse_threshold=0)
        full_pipeline.fit(fires)
        X_test = full_pipeline.transform(X_test)

        MODEL_NAME = "my_sanbul_model"
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "term-224506-9bc8286b5d7b.json"
        project_id = 'term-224506'
        model_id = MODEL_NAME
        model_path = "projects/{}/models/{}".format(project_id, model_id)
        model_path += "/versions/v0001/"
        ml_resource = googleapiclient.discovery.build("ml", "v1").projects()

        input_data_json = {
            "signature_name": "serving_default",
            "instances": X_test.tolist()
        }
        request = ml_resource.predict(name=model_path, body=input_data_json)
        response = request.execute()
        print("\nresponse:\n", response)

        if "error" in response:
            raise RuntimeError(response["error"])

        # Each prediction carries the output of the 'dense_1' layer; show the
        # first value of the first (only) prediction.
        predD = np.array([pred['dense_1'] for pred in response["predictions"]])
        print(predD[0][0])
        res = predD[0][0]
        return render_template('result.html', res=res)
    return render_template('prediction.html', form=form)
def run(argv=None):
    """Run the feature selection using ANOVA on the chosen task.

    Parses ``RS``, ``T``, ``TMAX`` and ``task_name`` from ``argv``, takes the
    ``T``-th shuffle split of the task's ``y`` (1/3 kept, 2/3 dropped),
    restricts the task to the kept rows, then computes one p-value per
    feature (one per one-hot column for categorical/binary features).

    Outputs written:
      * ``pvals/<tag>/RS<RS>-T<T>-used_idx.csv``: index labels that were kept
      * ``selected/<tag>/temp/RS<RS>-T<T>-X_transposed.csv``: transposed X
      * ``pvals/<tag>/RS<RS>-T<T>-pvals.csv``: the p-values

    Raises:
        AssertionError: if ``T`` is negative.
        ValueError: if ``T`` is not smaller than ``TMAX`` (the splitter only
            yields ``TMAX`` splits).
    """
    import itertools  # local import: only this function needs it

    args = parser.parse_args(argv)

    print('Retrieving task')
    RS = int(args.RS)
    T = int(args.T)
    TMAX = int(args.TMAX)
    print(f'RS {RS} T {T} TMAX {TMAX}')
    task_name = args.task_name
    task = tasks.get(task_name, n_top_pvals=None)

    temp_dir = f'selected/{task.meta.tag}/temp/'

    print('Retreiving db')
    db = dbs[task.meta.db]

    print('Retrieving y')
    y = task.y
    print(f'y loaded with shape {y.shape}')

    if task.is_classif():
        logger.info('Classification, using f_classif')
        f_callable = f_classif
        ss = StratifiedShuffleSplit(n_splits=TMAX, test_size=2 / 3,
                                    random_state=RS)
    else:
        logger.info('Regression, using f_regression')
        f_callable = f_regression
        ss = ShuffleSplit(n_splits=TMAX, test_size=2 / 3, random_state=RS)

    index = y.index

    assert T >= 0
    if T >= TMAX:
        # Previously this surfaced as a bare StopIteration from next().
        raise ValueError(f'T ({T}) must be smaller than TMAX ({TMAX})')

    # Alter the task to select only 1/3 for selection: skip the first T
    # splits and use the (T + 1)-th one.
    keep_idx, drop_idx = next(itertools.islice(ss.split(y, y), T, None))

    # Convert positions to index labels
    keep_index = [index[i] for i in keep_idx]
    drop_index = [index[i] for i in drop_idx]

    def select_idx(df):
        """Define the idx to keep from the database."""
        return df.drop(drop_index, axis=0)

    task.meta.idx_selection = Transform(
        input_features=[],
        transform=select_idx,
    )

    series = pd.Series(keep_index)
    dump_path = f'pvals/{task.meta.tag}/RS{RS}-T{T}-used_idx.csv'
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    series.to_csv(dump_path, header=None, index=False)
    print(f'Idx used of shape {series.size}')

    # Ignore existing pvals selection
    task.meta.select = None
    task.meta.encode_select = 'ordinal'

    # Force reload y to take into account previous change
    task._load_y()
    y = task.y
    print(f'y reloaded with shape {y.shape}')
    # Rebound on purpose: pval_one_feature below reads this reloaded index.
    index = y.index

    temp_df_transposed_path = temp_dir + f'RS{RS}-T{T}-X_transposed.csv'

    print('Retrieving X')
    X = task.X
    print(f'X loaded with shape {X.shape}')

    os.makedirs(temp_dir, exist_ok=True)

    # Little trick here: to iterate efficiently over the features, data is
    # transposed so that features are now in the place of rows.
    # This is useful because it is less memory expensive to iterate over
    # rows than features (rows are loaded on the fly from the file).
    # Particularly useful with big datasets that don't fit in memory.
    X_t = X.transpose()
    X_t.to_csv(temp_df_transposed_path, quoting=csv.QUOTE_ALL)

    # Here we create an iterator over the rows (features, since it is
    # transposed). Data is loaded row by row (since chunksize=1) when the
    # iterator is consumed.
    X_t = pd.read_csv(temp_df_transposed_path, iterator=True, chunksize=1,
                      index_col=0)

    # Load types
    print('Loading types')
    db._load_feature_types(task.meta)
    types = db.feature_types[task.meta.tag]

    def pval_one_feature(x, y):
        """Return the p-value of one feature against y, or None if too sparse.

        `x` must be positional (array-like without its own labels): it is
        labelled with `index` here so rows missing in x can be dropped from
        y as well.
        """
        # Drop rows with missing values both in x and y
        x = pd.Series(x, index=index)
        x.replace(to_replace='', value=np.nan, inplace=True)
        x = x.astype(float)
        idx_to_drop = set(x.index[x.isna()])
        x = x.drop(idx_to_drop, axis=0)
        y_dropped = y.drop(idx_to_drop, axis=0)

        x = x.to_numpy().reshape(-1, 1)
        y_dropped = y_dropped.to_numpy().reshape(-1)

        assert x.shape[0] == y_dropped.shape[0]

        if x.shape[0] < 0.01 * index.size:
            # Not enough samples, skipping
            return None

        _, pval = f_callable(x, y_dropped)

        # Keep only 6 significant digits (not the same as keeping 6 digits)
        # eg 1.23456789e-10 -> 1.234567e-10
        return float(f'{pval[0]:.6g}')

    def handler(row, y):
        """Compute the (name, p-value) pairs for one feature.

        `row` is a one-row chunk of the transposed CSV whose index label is
        the feature name. Returns a list of pairs, or None when the feature
        has an empty name or a type that is not handled.
        """
        name = row.index[0]
        x = np.squeeze(np.transpose(row.to_numpy()))
        print(name)

        if name == '':
            return

        t = types[name]

        if t == CATEGORICAL or t == BINARY:
            # Categorical encode.
            # Cast to str to prevent the "argument must be a string or
            # number" error which occurs with mixed float and str values.
            df = pd.DataFrame({name: x})
            df = df.astype(str)
            df.replace(to_replace='', value=np.nan, inplace=True)

            # NOTE(review): `sparse=` and `get_feature_names` are the legacy
            # scikit-learn spellings; newer releases use `sparse_output=` and
            # `get_feature_names_out` - confirm the pinned version.
            enc = OneHotEncoder(sparse=False)

            # Fill missing values with a placeholder
            df.fillna('MISSING_VALUE', inplace=True)

            # Fit transform the encoder
            data_encoded = enc.fit_transform(df)

            feature_names = list(enc.get_feature_names(list(df.columns)))

            df_encoded = pd.DataFrame(data_encoded, index=df.index,
                                      columns=feature_names)

            L = []
            for f in df_encoded:
                print(f'\t{f}')
                # Pass raw values: a Series carrying df's 0..n-1 labels would
                # be label-aligned (not positionally assigned) against
                # `index` inside pval_one_feature, scrambling or dropping
                # rows whenever y's index is not 0..n-1.
                L.append((f, pval_one_feature(df_encoded[f].to_numpy(), y)))

            return L

        elif t == CONTINUE_R or t == CONTINUE_I or t == ORDINAL:
            return [(name, pval_one_feature(x, y))]

        print(f'"{name}" ignored ')

    res = Parallel(n_jobs=-1, require='sharedmem')(
        delayed(handler)(row, y) for row in X_t)

    # Flatten the per-feature lists in linear time, skipping ignored features.
    res = list(itertools.chain.from_iterable(
        r for r in res if r is not None))
    print(res)

    if res:
        names, pvals = zip(*res)
        pvals = pd.Series(pvals, index=names)
    else:
        # No usable feature: dump an empty file instead of crashing.
        pvals = pd.Series([], index=[], dtype=float)
    print(pvals)

    dump_path = f'pvals/{task.meta.tag}/RS{RS}-T{T}-pvals.csv'
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    pvals.to_csv(dump_path, header=False)
# NOTE(review): the statements down to the "test accuracy" print are the tail
# of a definition whose header is not visible in this chunk; their nesting
# relative to the enclosing loop/function is reconstructed - confirm against
# the full file.
if step % 10 == 0:
    # Evaluate accuracy on the training arrays with keep_prob fed as 1.0.
    train_accuracy = accuracy.eval(feed_dict={
        x: X_train,
        y_: y_train,
        keep_prob: 1.0
    })
    print("step %d, training accuracy %g" % (step, train_accuracy))
# Run one training step with keep_prob fed as 0.5.
train_step.run(feed_dict={x: X_train, y_: y_train, keep_prob: 0.5})
# Report accuracy on the validation arrays.
print("test accuracy:%g" % accuracy.eval(feed_dict={
    x: X_valid,
    y_: y_valid,
    keep_prob: 1.0
}))


# Script entry point: prepares the module-level arrays read above
# (X_train, X_valid, y_train, y_valid) and then calls main().
if __name__ == '__main__':
    # Suppress the "Allocation of ... exceeds 10% of system memory" message.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    train, labels, test, classes = get_data()

    # Scale the values in train. (The original comment says "normalize to
    # values between -1 and 1"; the call used here is StandardScaler.)
    train_scaled = StandardScaler().fit_transform(train.values)

    # n_splits is not passed, so the splitter's default number of splits is
    # generated; each iteration overwrites the arrays, so only the last
    # split is actually used afterwards.
    sss = StratifiedShuffleSplit(test_size=0.1, random_state=23)
    for train_index, valid_index in sss.split(train_scaled, labels):
        X_train, X_valid = train_scaled[train_index], train_scaled[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]

    # One-hot encode the labels.
    # NOTE(review): y_valid is encoded with a second, separately fitted
    # encoder rather than with `OneHot`; the two encodings only line up if
    # both label sets contain the same classes - confirm.
    OneHot = OneHotEncoder().fit(y_train.reshape(-1, 1))
    y_train = OneHot.transform(y_train.reshape(-1, 1)).toarray()
    y_valid = OneHotEncoder().fit_transform(y_valid.reshape(-1, 1)).toarray()

    main()
# NOTE(review): this is the interior of a definition whose header and end are
# not visible in this chunk; nesting is reconstructed - confirm against the
# full file.
print("Number of samples: " + str(nb_samples))

# Collect the first element of every (x, _) pair in ds.data as the inputs and
# convert the targets with to_categorical.
x = np.array([x for x, _ in ds.data])
y = np.array(ds.targets)
y = to_categorical(y)

if speaker_independence:
    # Use the train/test sets provided by the dataset object, pairwise.
    k_folds = len(ds.test_sets)
    splits = zip(ds.train_sets, ds.test_sets)
    print("Using speaker independence %s-fold cross validation" % k_folds)
else:
    # Otherwise draw 5 stratified shuffle splits with a fixed seed.
    k_folds = 5
    sss = StratifiedShuffleSplit(n_splits=k_folds, test_size=0.2, random_state=1)
    splits = sss.split(x, y)
    print("Using %s-fold cross validation by StratifiedShuffleSplit" % k_folds)

cvscores = []

# A fresh model is built and compiled for every (train, test) pair; the rest
# of the loop body lies beyond this chunk.
for (train, test) in splits:
    # create network
    model = networks.create_softmax_la_network(
        input_shape=(globalvars.max_len, globalvars.nb_features),
        nb_classes=nb_classes)

    # compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
import numpy as np

# # StratifiedShuffleSplit
# * Arguments:
#     1. n_splits: number of splits (how many times the split is performed)
#     2. test_size: proportion of the whole data used as test data
#     3. random_state: value used to initialise the random numbers
# ## StratifiedShuffleSplit.split
# * Arguments
#     1. The data to split
#     2. The group of each datum (group proportions are kept when splitting)
# * Return value
#     * An Iterator-like object (in Java terms) is returned
#     * A for statement is needed to get the values (other ways may exist?)

# In[8]:

from sklearn.model_selection import StratifiedShuffleSplit

# In[23]:

data = np.array(['A', 'B', 'a', 'b'])
group = np.array([0, 0, 1, 1])
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.5,
    random_state=0,
)

# In[24]:

# Print, for every split, the train indices and items followed by the test
# indices and items.
for train_index, test_index in sss.split(data, group):
    fields = (train_index, data[train_index], test_index, data[test_index])
    print('%s %s %s %s' % fields)
'''
# NOTE(review): the triple quote above appears to close a string literal that
# was opened before this chunk (this script uses bare ''' ... ''' strings as
# section banners) - confirm against the full file.
# import matplotlib.pyplot as plt
# housing_df.hist(bins=50, figsize=(20,20))
# plt.show()

''' split train & test '''
import numpy as np

# Derive an income category column used only for stratification.
housing_df["income_cat"] = np.ceil(housing_df["median_income_value"] / 1.5)
# where() keeps the entries for which `cond` holds and replaces all others
# with `other`, so every category >= 0.5 is replaced by 0.5 here.
# NOTE(review): the 0.5 thresholds collapse almost everything into a single
# category; this looks like a typo for 5 / 5.0 - confirm.
housing_df["income_cat"].where(cond=housing_df["income_cat"] < 0.5, other=0.5, inplace=True)

from sklearn.model_selection import StratifiedShuffleSplit

# One 80/20 split stratified on the income category, with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_indices, test_indices in split.split(X=housing_df, y=housing_df["income_cat"]):
    train_set = housing_df.loc[train_indices]
    test_set = housing_df.loc[test_indices]

# Feature frames without the "median_house_value" column.
housing_train = train_set.drop("median_house_value", axis=1)
housing_test = test_set.drop("median_house_value", axis=1)

''' selector for number and text attributes '''
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin


# The class body continues beyond this chunk.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
def get_cv(X, y):
    """Return the train/test index generator used for cross-validation.

    A StratifiedShuffleSplit with 8 splits, a 20% test share and the fixed
    seed 57 is built, and the result of its ``split(X, y)`` is returned.
    """
    splitter_options = {"n_splits": 8, "test_size": 0.2, "random_state": 57}
    splitter = StratifiedShuffleSplit(**splitter_options)
    folds = splitter.split(X, y)
    return folds