def get_train_val_test_indexes_no_cv(X, Y, stratus_list=None, test_val_size=0.15):
    """Outputs the train, validation and test split indexes."""
    from sklearn.model_selection import StratifiedShuffleSplit as SSS
    import numpy as np

    samples = X.shape[0]
    testn = int(round(samples * test_val_size))
    testn = testn + 1 if testn % 2 == 1 else testn  # always use an even number
    sss = SSS(n_splits=1, test_size=testn, train_size=None)  # random_state=0

    # Check format
    if type(stratus_list) == list:
        stratus_list = np.array(stratus_list)
    if stratus_list is None:
        stratus_list = Y.copy()

    x = X.copy()
    y = Y.copy()
    if len(X.shape) > 2:
        x = np.random.random((samples, 1))
    if len(Y.shape) > 1:
        y = Y[:, 0]

    # First split: separate the test set from the train/validation pool
    TrainVal_Index, Test_Index = next(sss.split(x, stratus_list))
    x1 = np.arange(samples)
    x2 = x1[TrainVal_Index]
    y2 = stratus_list[TrainVal_Index]

    # Second split: separate train from validation within the pool
    Train_Index, Val_Index = next(sss.split(x2, y2))
    TrainIndex, ValIndex, TestIndex = x2[Train_Index], x2[Val_Index], x1[Test_Index]
    return TrainIndex.copy(), ValIndex.copy(), TestIndex.copy()
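# Usage sketch for the helper above, on hypothetical synthetic data (the array
# shapes and sizes are illustrative, not from the original).
import numpy as np

X = np.random.random((200, 5))
Y = np.random.randint(0, 2, 200)

tr_idx, val_idx, te_idx = get_train_val_test_indexes_no_cv(X, Y, test_val_size=0.15)
print(len(tr_idx), len(val_idx), len(te_idx))  # 140 30 30
assert not set(te_idx) & (set(tr_idx) | set(val_idx))  # test indices stay disjoint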
def DTpredictor(X_train, y_train, X_test):
    """Decision Tree Classifier.
    Input: training data, target, and test data.
    Output: predicted labels for the test data."""
    from sklearn import metrics
    from sklearn.tree import DecisionTreeClassifier as DT
    from sklearn.model_selection import StratifiedShuffleSplit as SSS

    # Cross validation using StratifiedShuffleSplit
    sss = SSS(n_splits=5, test_size=0.2, random_state=0)
    sss.get_n_splits(X_train, y_train)
    accuracy, logLoss, count = 0, 0, 0
    for train_ind, test_ind in sss.split(X_train, y_train):
        Xtrain, Xtest = X_train.iloc[train_ind], X_train.iloc[test_ind]
        ytrain, ytest = y_train[train_ind], y_train[test_ind]
        model = DT(random_state=1)
        model.fit(Xtrain, ytrain)
        y_pred = model.predict(Xtest)
        accuracy += metrics.accuracy_score(ytest, y_pred)
        logLoss += metrics.log_loss(ytest, y_pred)  # log loss from hard predictions
        count += 1

    # Test predictions come from the model trained on the last fold
    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy / count  # module-level dict of mean CV accuracies
    predictions[modelName] = y_pred          # module-level dict of test predictions
    return y_pred, accuracy
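# Usage sketch, assuming the module-level ``accModels`` and ``predictions``
# dictionaries the function writes into; the small DataFrame below is
# hypothetical and only illustrates the expected input types.
import numpy as np
import pandas as pd

accModels, predictions = {}, {}

rng = np.random.RandomState(0)
X_train = pd.DataFrame(rng.rand(100, 4), columns=list("abcd"))
y_train = rng.randint(0, 2, 100)
X_test = pd.DataFrame(rng.rand(10, 4), columns=list("abcd"))

y_pred, acc_sum = DTpredictor(X_train, y_train, X_test)
print(accModels["DecisionTreeClassifier"])  # mean CV accuracy
print(y_pred)                               # predicted labels for X_test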
def split(X, y, split_size, random_state=1212):
    """Split the data into two parts: a training set and a test set.

    :param X: data
    :param y: labels
    :param split_size: fraction (or count) of samples held out for the test set
    :return: training set indices and test set indices
    """
    from sklearn.model_selection import StratifiedShuffleSplit as SSS

    sss = SSS(n_splits=1, test_size=split_size, random_state=random_state)
    return next(sss.split(X, y))
def __process_data(data_folder: str, data_set: str):
    """
    Generate the manifest files.

    Args:
        data_folder: source directory with wav files
        data_set: name of the split being processed (e.g. 'train' or 'test')

    Returns:
    """
    fullpath = os.path.abspath(data_folder)
    scp = [(path, data_set) for path in glob(fullpath + '/**/*.wav', recursive=True)]
    out = os.path.join(fullpath, data_set + '_all.json')
    utt2spk = os.path.join(fullpath, 'utt2spk')
    if os.path.exists(out):
        logging.warning(
            "%s already exists and is assumed to be processed. If not, please delete %s and rerun this script",
            out,
            out,
        )
        return
    # Open utt2spk only after the early-return check so it is not truncated needlessly
    utt2spk_file = open(utt2spk, 'w')

    speakers = []
    lines = []
    num_processes = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=num_processes)
    with open(out, 'w') as outfile:
        for meta in tqdm(pool.imap(process_single_line, scp), total=len(scp)):
            speaker = meta["label"]
            speakers.append(speaker)
            lines.append(meta)
            json.dump(meta, outfile)
            outfile.write("\n")
            line = meta["audio_filepath"]
            utt2spk_file.write(line.split('/')[-1] + "\t" + speaker + "\n")

    utt2spk_file.close()
    pool.close()
    pool.join()

    if data_set != 'test':
        # Hold out 10% of the utterances, stratified by speaker, as a dev set
        sss = SSS(n_splits=1, test_size=0.1, random_state=42)
        for train_idx, test_idx in sss.split(speakers, speakers):
            print(len(train_idx))

        out = os.path.join(fullpath, 'train.json')
        write_file(out, lines, train_idx)
        out = os.path.join(fullpath, 'dev.json')
        write_file(out, lines, test_idx)
def _split_train_valid(self, valid_split=0.1, stratified=True, seed=None, **kwargs):
    """Split the training-set indices into train and validation indices."""
    paths, labels = zip(*self._train_set.samples)
    if stratified:
        from sklearn.model_selection import StratifiedShuffleSplit as SSS
        splitter = SSS(n_splits=1, test_size=valid_split, random_state=seed)
    else:
        from sklearn.model_selection import ShuffleSplit as SS
        splitter = SS(n_splits=1, test_size=valid_split, random_state=seed)
    idx_train, idx_test = next(splitter.split(paths, labels))
    return idx_train, idx_test
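# Sketch of the input this method expects: ``self._train_set.samples`` is
# assumed to be a list of (path, label) pairs, as in
# torchvision.datasets.ImageFolder; the tiny stand-in dataset below is
# hypothetical.
from types import SimpleNamespace

fake_train_set = SimpleNamespace(samples=[
    ("img_0.png", 0), ("img_1.png", 0), ("img_2.png", 0), ("img_3.png", 0), ("img_4.png", 0),
    ("img_5.png", 1), ("img_6.png", 1), ("img_7.png", 1), ("img_8.png", 1), ("img_9.png", 1),
])
owner = SimpleNamespace(_train_set=fake_train_set)

idx_train, idx_valid = _split_train_valid(owner, valid_split=0.2, stratified=True, seed=0)
print(len(idx_train), len(idx_valid))  # 8 2, one validation sample per class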
def get_train_val_test_indexes(X, Y, stratus_list=None, test_size=0.2, kfold=5, shuffle=True):
    """
    Example:
        X = np.random.random((100, 2))
        y = np.random.randint(0, 2, 100)
        Tr, Vl, TrVal, Ts = get_train_val_test_indexes(X, y, stratus_list=None,
                                                       test_size=0.2, kfold=5, shuffle=True)
    """
    from sklearn.model_selection import StratifiedShuffleSplit as SSS
    from sklearn.model_selection import StratifiedKFold
    import numpy as np

    # Check format
    if type(stratus_list) == list:
        stratus_list = np.array(stratus_list)
    if stratus_list is None:
        stratus_list = Y.copy()
    if type(test_size) != float:
        test_size = 0.2
    elif (test_size <= 0) or (test_size >= 1):
        test_size = 0.2

    # Init splitters
    sss = SSS(n_splits=1, test_size=test_size, train_size=None)  # random_state=0
    skf = StratifiedKFold(n_splits=kfold, shuffle=shuffle)

    x = X.copy()
    y = Y.copy()
    if len(X.shape) > 2:
        x = np.random.random((X.shape[0], 1))
    if len(Y.shape) > 1:
        y = Y[:, 0]

    # Separate the test set from the train/validation pool
    TrainVal_Index, Test_Index = zip(*sss.split(x, stratus_list))
    x_tr_val = TrainVal_Index[0]
    y_tr_val = stratus_list[TrainVal_Index]

    # Split the train/validation pool into k folds
    skf.get_n_splits(x_tr_val, y_tr_val)
    train_indexes, val_indexes = [], []
    for train_index, val_index in skf.split(x_tr_val, y_tr_val):
        tr_i = x_tr_val[train_index]
        val_i = x_tr_val[val_index]
        train_indexes.append(tr_i.copy())
        val_indexes.append(val_i.copy())
    return train_indexes[:], val_indexes[:], TrainVal_Index[0].copy(), Test_Index[0].copy()
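# The docstring example made concrete: a usage sketch on synthetic data showing
# the per-fold structure of the returned index lists (sizes are illustrative).
import numpy as np

X = np.random.random((100, 2))
y = np.random.randint(0, 2, 100)

train_folds, val_folds, trainval_idx, test_idx = get_train_val_test_indexes(
    X, y, stratus_list=None, test_size=0.2, kfold=5, shuffle=True)

print(len(trainval_idx), len(test_idx))       # 80 20
print(len(train_folds), len(train_folds[0]))  # 5 folds, 64 train indices each
print(len(val_folds[0]))                      # 16 validation indices per fold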
def classif_subcosp(state, freq, elec, n_jobs=-1):
    global CHANGES
    print(state, freq)
    if SUBSAMPLE or ADAPT:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        if SUBSAMPLE:
            n_trials = info_data.min().min()
            n_trials = 61
        elif ADAPT:
            n_trials = info_data.min()[state]
    elif FULL_TRIAL:
        groups = range(36)

    labels_og = INIT_LABELS
    file_path = (SAVE_PATH / "results" / PREFIX + NAME +
                 "_{}_{}_{}_{}_{:.2f}.npy".format(state, freq, elec, WINDOW, OVERLAP))
    if not file_path.isfile():
        n_rep = 0
    else:
        final_save = np.load(file_path)
        n_rep = int(final_save["n_rep"])
        n_splits = int(final_save["n_splits"])
    print("Starting from i={}".format(n_rep))

    file_name = NAME + "_{}_{}_{}_{}_{:.2f}.npy".format(state, freq, elec, WINDOW, OVERLAP)
    data_file_path = SAVE_PATH / file_name
    data_og = np.load(data_file_path)

    if FULL_TRIAL:
        cv = SSS(9)
    else:
        cv = StratifiedShuffleGroupSplit(2)
    lda = LDA()
    clf = TSclassifier(clf=lda)

    for i in range(n_rep, N_BOOTSTRAPS):
        CHANGES = True
        if FULL_TRIAL:
            data = data_og["data"]
        elif SUBSAMPLE or ADAPT:
            data, labels, groups = prepare_data(data_og, labels_og, n_trials=n_trials, random_state=i)
        else:
            data, labels, groups = prepare_data(data_og, labels_og)
        n_splits = cv.get_n_splits(None, labels, groups)
        save = classification(clf, cv, data, labels, groups, N_PERM, n_jobs=n_jobs)

        if i == 0:
            final_save = save
        elif BOOTSTRAP:
            for key, value in save.items():
                if key != "n_splits":
                    final_save[key] += value

        final_save["n_rep"] = i + 1
        np.save(file_path, final_save)

    final_save["auc_score"] = np.mean(final_save.get("auc_score", 0))
    final_save["acc_score"] = np.mean(final_save["acc_score"])
    if CHANGES:
        np.save(file_path, final_save)

    to_print = "accuracy for {} {} : {:.2f}".format(state, freq, final_save["acc_score"])
    if BOOTSTRAP:
        standev = np.std([
            np.mean(final_save["acc"][i * n_splits:(i + 1) * n_splits])
            for i in range(N_BOOTSTRAPS)
        ])
        to_print += " (+/- {:.2f})".format(standev)
    print(to_print)
    if PERM:
        print("pval = {}".format(final_save["acc_pvalue"]))
def classif_cov(state):
    """Where the magic happens"""
    print(state)
    if FULL_TRIAL:
        labels = np.concatenate((np.ones(18), np.zeros(18)))
        groups = range(36)
    elif SUBSAMPLE:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        n_trials = info_data.min().min()
        n_subs = len(info_data) - 1
        groups = [i for i in range(n_subs) for _ in range(n_trials)]
        n_total = n_trials * n_subs
        labels = [0 if i < n_total / 2 else 1 for i in range(n_total)]
    else:
        labels = loadmat(LABEL_PATH / state + "_labels.mat")["y"].ravel()
        labels, groups = create_groups(labels)

    file_path = SAVE_PATH / "results" / PREFIX + NAME + "_{}.mat".format(state)
    if not file_path.isfile():
        n_rep = 0
    else:
        final_save = proper_loadmat(file_path)
        n_rep = final_save["n_rep"]
    print("starting from i={}".format(n_rep))

    file_name = NAME + "_{}.mat".format(state)
    data_file_path = SAVE_PATH / file_name
    if data_file_path.isfile():
        data_og = loadmat(data_file_path)
        for i in range(n_rep, N_BOOTSTRAPS):
            if FULL_TRIAL:
                data = data_og["data"]
            elif SUBSAMPLE:
                data = prepare_data(data_og, n_trials=n_trials, random_state=i)
            else:
                data = prepare_data(data_og)

            if REDUCED:
                reduced_data = []
                for submat in data:
                    temp_a = np.delete(submat, i, 0)
                    temp_b = np.delete(temp_a, i, 1)
                    reduced_data.append(temp_b)
                data = np.asarray(reduced_data)

            if FULL_TRIAL:
                crossval = SSS(9)
            else:
                crossval = StratifiedLeave2GroupsOut()
            lda = LDA()
            clf = TSclassifier(clf=lda)
            save = classification(clf, crossval, data, labels, groups, N_PERM, n_jobs=-1)

            print(save["acc_score"])
            if i == 0:
                final_save = save
            elif BOOTSTRAP or REDUCED:
                for key, value in save.items():
                    final_save[key] += value

            final_save["n_rep"] = i + 1
            savemat(file_path, final_save)

        final_save["n_rep"] = N_BOOTSTRAPS
        if BOOTSTRAP:
            final_save["auc_score"] = np.mean(final_save["auc_score"])
            final_save["acc_score"] = np.mean(final_save["acc_score"])
        savemat(file_path, final_save)

        print("accuracy for %s : %0.2f (+/- %0.2f)"
              % (state, np.mean(save["acc_score"]), np.std(save["acc"])))
        if PERM:
            print("pval = {}".format(save["acc_pvalue"]))
    else:
        print(data_file_path.name + " Not found")
def create_testdata(data, test_size=0.2, random_state=42):
    train_set, test_set = tts(data, test_size=test_size, random_state=random_state)
    return train_set, test_set

# train_set, test_set = create_testdata(housing)

# Following the exercise, look at the histogram of the median income
# housing["median_income"].hist(bins=50)
# plt.show()

# Every stratum in the data set must contain enough instances, otherwise strata
# with too little data are likely to be misestimated.
# Bucket the data (round up with ceil, then cap everything > 5 at category 5)
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

# After bucketing, do stratified sampling by income category with
# Scikit-Learn's StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit as SSS

split = SSS(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Check that the split looks the way we need it to
# print(housing["income_cat"].value_counts() / len(housing))

# Note: comparing the sampling bias of stratified vs. purely random sampling
# shows that the stratified split is almost identical to the distribution of
# the full data set, whereas random sampling is significantly skewed.

# Now the data can be restored to its original form: drop the income_cat attribute
for sets in (strat_train_set, strat_test_set):
    sets.drop(["income_cat"], axis=1, inplace=True)

# Generating the data sets took a while, but it is a crucially important part
# of machine learning. On to the next step.
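# The note above about stratified vs. purely random sampling can be checked
# directly. A short sketch, assuming ``housing``, ``strat_test_set`` and ``tts``
# (train_test_split) from the surrounding code; run it before income_cat is
# dropped above.
import pandas as pd

random_test_set = tts(housing, test_size=0.2, random_state=42)[1]

compare = pd.DataFrame({
    "overall": housing["income_cat"].value_counts(normalize=True),
    "stratified": strat_test_set["income_cat"].value_counts(normalize=True),
    "random": random_test_set["income_cat"].value_counts(normalize=True),
}).sort_index()
compare["random_error_%"] = 100 * compare["random"] / compare["overall"] - 100
compare["strat_error_%"] = 100 * compare["stratified"] / compare["overall"] - 100
print(compare)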
from sklearn.datasets import fetch_olivetti_faces

olivetti = fetch_olivetti_faces()
# print(olivetti.DESCR)  # description of the olivetti data set

from sklearn.model_selection import StratifiedShuffleSplit as SSS

# Split into a train+validation set and a test set
data_split = SSS(n_splits=1, test_size=40, random_state=42)
train_valid_idx, test_idx = next(data_split.split(olivetti.data, olivetti.target))
X_train_valid, y_train_valid = olivetti.data[train_valid_idx], olivetti.target[train_valid_idx]
X_test, y_test = olivetti.data[test_idx], olivetti.target[test_idx]

# Split the train+validation set into a training set and a validation set
data_split = SSS(n_splits=1, test_size=80, random_state=43)
train_idx, valid_idx = next(data_split.split(X_train_valid, y_train_valid))
X_train, y_train = X_train_valid[train_idx], y_train_valid[train_idx]
X_valid, y_valid = X_train_valid[valid_idx], y_train_valid[valid_idx]

# print(X_train.shape, y_train.shape)  # (280, 4096) (280,)
# print(X_valid.shape, y_valid.shape)  # (80, 4096) (80,)
# print(X_test.shape, y_test.shape)    # (40, 4096) (40,)

from sklearn.decomposition import PCA

pca = PCA(0.99)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)
X_test_pca = pca.transform(X_test)
# print(pca.n_components_)  # 199

from sklearn.cluster import KMeans
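# The snippet ends right after importing KMeans. A minimal sketch of how the
# PCA-reduced training set might be clustered, choosing k by silhouette score;
# the candidate k values are an assumption, not from the original code.
from sklearn.metrics import silhouette_score

best_model, best_score = None, -1.0
for k in (5, 10, 20, 40, 60):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_train_pca)
    score = silhouette_score(X_train_pca, kmeans.labels_)
    if score > best_score:
        best_model, best_score = kmeans, score

print(best_model.n_clusters, best_score)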
]]
TestData = TestData.apply(lambda x: x.fillna(x.mean()), axis=0)

################################### pre sampling ++++++++++++++++++++++++++++++++++++++++++
##################### model bau
ann = MLP(hidden_layer_sizes=(10, 4), activation='relu', solver='sgd', alpha=0.0001,
          learning_rate='constant', learning_rate_init=0.00001, max_iter=10000,
          tol=0.00000000005)

cv = SSS(n_splits=20, test_size=0.3, random_state=42)

selector = SelectKBest(chi2, k=5)
TrainData_new = selector.fit_transform(TrainData, Train_Target)
selector.get_support(indices=True)
TrainData_new = TrainData_new
Train_Target = Train_Target.values

networks = {}
f1_scores = {}
i = 0
for train_index, test_index in cv.split(TrainData_new, Train_Target):
    X_train, X_test = TrainData_new[train_index], TrainData_new[test_index]
    y_train, y_test = Train_Target[train_index], Train_Target[test_index]
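# The cross-validation loop above is cut off after the fold data is selected.
# A possible completion, assuming each of the 20 stratified splits trains a
# fresh clone of ``ann`` and records its F1 score in the ``networks`` and
# ``f1_scores`` dicts defined above (a sketch of intent, not the original code;
# a binary target is assumed for the default f1_score averaging):
from sklearn.base import clone
from sklearn.metrics import f1_score

for train_index, test_index in cv.split(TrainData_new, Train_Target):
    X_train, X_test = TrainData_new[train_index], TrainData_new[test_index]
    y_train, y_test = Train_Target[train_index], Train_Target[test_index]

    model = clone(ann)            # unfitted copy of the MLP for this split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    networks[i] = model
    f1_scores[i] = f1_score(y_test, y_pred)
    i += 1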
def classif_cosp(state, n_jobs=-1):
    global CHANGES
    print(state, "multif")
    if SUBSAMPLE or ADAPT:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        if SUBSAMPLE:
            n_trials = info_data.min().min()
            # n_trials = 30
        elif ADAPT:
            n_trials = info_data.min()[state]
    elif FULL_TRIAL:
        groups = range(36)

    labels_og = INIT_LABELS
    file_path = (SAVE_PATH / "results" / PREFIX + NAME +
                 "_{}_{}_{:.2f}.mat".format(state, WINDOW, OVERLAP))
    if not file_path.isfile():
        n_rep = 0
    else:
        final_save = proper_loadmat(file_path)
        n_rep = int(final_save["n_rep"])
        n_splits = int(final_save["n_splits"])
    print("Starting from i={}".format(n_rep))

    if FULL_TRIAL:
        crossval = SSS(9)
    else:
        crossval = StratifiedShuffleGroupSplit(2)
    lda = LDA()
    clf = TSclassifier(clf=lda)

    for i in range(n_rep, N_BOOTSTRAPS):
        CHANGES = True
        data_freqs = []
        for freq in FREQ_DICT:
            file_name = NAME + "_{}_{}_{}_{:.2f}.mat".format(state, freq, WINDOW, OVERLAP)
            data_file_path = SAVE_PATH / file_name
            data_og = loadmat(data_file_path)["data"].ravel()
            data_og = np.asarray([sub.squeeze() for sub in data_og])
            if SUBSAMPLE or ADAPT:
                data, labels, groups = prepare_data(data_og, labels_og, n_trials=n_trials, random_state=i)
            else:
                data, labels, groups = prepare_data(data_og, labels_og)
            data_freqs.append(data)

        n_splits = crossval.get_n_splits(None, labels, groups)
        # Stack the per-frequency data and classify on the multi-frequency array
        data_freqs = np.asarray(data_freqs).swapaxes(0, 1).swapaxes(1, 3).swapaxes(1, 2)
        save = classification(clf, crossval, data_freqs, labels, groups, N_PERM, n_jobs=n_jobs)

        if i == 0:
            final_save = save
        elif BOOTSTRAP:
            for key, value in save.items():
                if key != "n_splits":
                    final_save[key] += value

        final_save["n_rep"] = i + 1
        if n_jobs == -1:
            savemat(file_path, final_save)

    final_save["auc_score"] = np.mean(final_save.get("auc_score", 0))
    final_save["acc_score"] = np.mean(final_save["acc_score"])
    if CHANGES:
        savemat(file_path, final_save)

    to_print = "accuracy for {} {} : {:.2f}".format(state, freq, final_save["acc_score"])
    if BOOTSTRAP:
        standev = np.std([
            np.mean(final_save["acc"][i * n_splits:(i + 1) * n_splits])
            for i in range(N_BOOTSTRAPS)
        ])
        to_print += " (+/- {:.2f})".format(standev)
    print(to_print)
    if PERM:
        print("pval = {}".format(final_save["acc_pvalue"]))
n_train_subs = int(0.6 * len(data_df))
train_df = data_df[:n_train_subs]
test_df = data_df[n_train_subs:]
X_og, y = load_freq_data(train_df)
X_test_og, y_test = load_freq_data(test_df)

# Shuffle the train and test sets with a fixed seed
idx = np.random.RandomState(0).permutation(range(len(X_og)))
X_og = X_og[idx]
y = y[idx]
print(X_og[[0, 33, 166]], y[[0, 33, 166]])

idx = np.random.RandomState(0).permutation(range(len(X_test_og)))
X_test_og = X_test_og[idx]
y_test = y_test[idx]

cv = SSS(5)
all_scores = []
# for C in [0.1, 1.0, 10.0, 100.0]:
param_distributions = {
    "C": sp.stats.expon(scale=10),
    "gamma": sp.stats.expon(scale=0.1),
}
for elec in range(N_ELEC):
    # X = X_og
    # X_test = X_test_og
    X = X_og[:, elec]
    X_test = X_test_og[:, elec]
    # if len(X.shape) < 2:
    #     X = X[..., None]
    #     X_test = X_test[..., None]
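# The per-electrode loop above stops before any estimator is fit. A minimal
# sketch of how the pieces defined above (cv, param_distributions, all_scores)
# are typically combined, assuming an RBF-kernel SVC is the model being tuned
# with the C/gamma distributions (a guess at intent, not the original code):
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

for elec in range(N_ELEC):
    X = X_og[:, elec]
    X_test = X_test_og[:, elec]
    if len(X.shape) < 2:          # single-electrode slices are 1-D
        X = X[..., None]
        X_test = X_test[..., None]

    search = RandomizedSearchCV(
        SVC(kernel="rbf"),
        param_distributions=param_distributions,
        n_iter=20,
        cv=cv,
        random_state=0,
    )
    search.fit(X, y)
    score = search.score(X_test, y_test)
    all_scores.append(score)
    print("elec {}: test accuracy {:.3f}".format(elec, score))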
def create_crossval(label, y):
    if label != "subject":
        return SSGS(len(np.unique(y)) * 1, args.n_crossval)
    return SSS(10)