Code Example #1
def get_train_val_test_indexes_no_cv(X,
                                     Y,
                                     stratus_list=None,
                                     test_val_size=0.15):
    """
    Outputs the train, validation and test split indexes.
    """
    from sklearn.model_selection import StratifiedShuffleSplit as SSS
    import numpy as np
    samples = X.shape[0]
    testn = int(round(samples * test_val_size))
    testn = testn + 1 if testn % 2 == 1 else testn  # always round the hold-out count up to an even number
    sss = SSS(n_splits=1, test_size=testn, train_size=None)  # random_state=0
    # Check format
    if isinstance(stratus_list, list):
        stratus_list = np.array(stratus_list)
    if stratus_list is None:
        stratus_list = Y.copy()
    x = X.copy()
    y = Y.copy()
    if len(X.shape) > 2:
        # the split is driven by stratus_list, so a 2-D placeholder suffices
        x = np.random.random((samples, 1))
    if len(Y.shape) > 1:
        y = Y[:, 0]
    TrainVal_Index, Test_Index = next(sss.split(x, stratus_list))
    x1 = np.arange(samples)
    x2 = x1[TrainVal_Index]
    y2 = stratus_list[TrainVal_Index]
    Train_Index, Val_Index = next(sss.split(x2, y2))
    TrainIndex, ValIndex = x2[Train_Index], x2[Val_Index]
    TestIndex = x1[Test_Index]
    return TrainIndex.copy(), ValIndex.copy(), TestIndex.copy()
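
A minimal usage sketch on synthetic data (with the 0.15 default, 100 samples give 16 test and 16 validation indexes, since the hold-out count is rounded up to an even number):

import numpy as np
X = np.random.random((100, 2))
y = np.random.randint(0, 2, 100)
tr_idx, val_idx, ts_idx = get_train_val_test_indexes_no_cv(X, y)
print(len(tr_idx), len(val_idx), len(ts_idx))  # 68 16 16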
Code Example #2
File: titanic1.py  Project: kemin-li/MachineLearning
def DTpredictor(X_train, y_train, X_test):
    '''Decision Tree Classifier.
    Input: training data, target, and test data.
    Output: predicted labels for the test data and the mean CV accuracy.'''
    from sklearn import metrics
    from sklearn.tree import DecisionTreeClassifier as DT
    from sklearn.model_selection import StratifiedShuffleSplit as SSS

    # cross validation using StratifiedShuffleSplit
    sss = SSS(n_splits=5, test_size=0.2, random_state=0)
    accuracy, logLoss, count = 0, 0, 0
    for train_ind, test_ind in sss.split(X_train, y_train):
        Xtrain, Xtest = X_train.iloc[train_ind], X_train.iloc[test_ind]
        ytrain, ytest = y_train[train_ind], y_train[test_ind]
        model = DT(random_state=1)
        model.fit(Xtrain, ytrain)
        y_pred = model.predict(Xtest)
        accuracy += metrics.accuracy_score(ytest, y_pred)
        # log_loss expects class probabilities rather than hard labels
        logLoss += metrics.log_loss(ytest, model.predict_proba(Xtest))
        count += 1

    # predict the held-out test data with the model from the last CV fold
    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy / count  # module-level result stores
    predictions[modelName] = y_pred

    return y_pred, accuracy / count
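
The function fills two module-level dicts that this snippet does not define; a minimal hedged setup for calling it standalone:

accModels, predictions = {}, {}  # result stores assumed by DTpredictor
# y_pred, acc = DTpredictor(X_train, y_train, X_test)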
Code Example #3
def split(X, y, split_size, random_state=1212):
    """
    Split the data into two parts, a training set and a test set, by index.
    :param X: data
    :param y: labels
    :param split_size: fraction (or count) of samples held out for the test set
    :return: training set indexes and test set indexes
    """
    from sklearn.model_selection import StratifiedShuffleSplit as SSS
    sss = SSS(n_splits=1, test_size=split_size, random_state=random_state)
    return next(sss.split(X, y))
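
A short usage sketch on synthetic data (names are illustrative):

import numpy as np
X = np.random.random((50, 3))
y = np.array([0, 1] * 25)
train_idx, test_idx = split(X, y, split_size=0.2)
print(len(train_idx), len(test_idx))  # 40 10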
Code Example #4
def __process_data(data_folder: str, data_set: str):
    """
    Generate a manifest.
    Args:
        data_folder: source directory with wav files
        data_set: name of the split being processed
    """
    fullpath = os.path.abspath(data_folder)
    scp = [(path, data_set) for path in glob(fullpath + '/**/*.wav', recursive=True)]
    out = os.path.join(fullpath, data_set + '_all.json')
    utt2spk = os.path.join(fullpath, 'utt2spk')

    if os.path.exists(out):
        logging.warning(
            "%s already exists and is assumed to be processed. If not, please delete %s and rerun this script",
            out,
            out,
        )
        return

    # Open utt2spk only after the early-return check, so rerunning on already
    # processed data does not truncate an existing utt2spk file
    utt2spk_file = open(utt2spk, 'w')

    speakers = []
    lines = []
    num_processes = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=num_processes)
    with open(out, 'w') as outfile:
        for meta in tqdm(pool.imap(process_single_line, scp), total=len(scp)):
            speaker = meta["label"]
            speakers.append(speaker)
            lines.append(meta)
            json.dump(meta, outfile)
            outfile.write("\n")
            line = meta["audio_filepath"]
            utt2spk_file.write(line.split('/')[-1] + "\t" + speaker + "\n")
    utt2spk_file.close()
    pool.close()
    pool.join()

    if data_set != 'test':
        sss = SSS(n_splits=1, test_size=0.1, random_state=42)
        train_idx, test_idx = next(sss.split(speakers, speakers))
        print(len(train_idx))

        out = os.path.join(fullpath, 'train.json')
        write_file(out, lines, train_idx)
        out = os.path.join(fullpath, 'dev.json')
        write_file(out, lines, test_idx)
Code Example #5
File: dataset.py  Project: Gandor26/invoice
def _split_train_valid(self,
                       valid_split=0.1,
                       stratified=True,
                       seed=None,
                       **kwargs):
    paths, labels = zip(*self._train_set.samples)
    if stratified:
        from sklearn.model_selection import StratifiedShuffleSplit as SSS
        splitter = SSS(n_splits=1,
                       test_size=valid_split,
                       random_state=seed)
    else:
        from sklearn.model_selection import ShuffleSplit as SS
        splitter = SS(n_splits=1, test_size=valid_split, random_state=seed)
    idx_train, idx_test = next(splitter.split(paths, labels))
    return idx_train, idx_test
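
Assuming self._train_set is a torchvision-style dataset whose .samples attribute is a list of (path, label) pairs, a typical call might be:

# idx_train, idx_valid = dataset._split_train_valid(valid_split=0.1, seed=42)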
Code Example #6
def get_train_val_test_indexes(X,
                               Y,
                               stratus_list=None,
                               test_size=0.2,
                               kfold=5,
                               shuffle=True):
    """
    X = np.random.random((100,2))
    y = np.random.randint(0,2,(100))
    Tr, Vl, Ts = get_train_val_test_indexes(X,y,stratus_list=None,test_size=0.2,kfold=5,shuffle=True)
    """
    from sklearn.model_selection import StratifiedShuffleSplit as SSS
    from sklearn.model_selection import StratifiedKFold
    import numpy as np
    # Check format
    if isinstance(stratus_list, list):
        stratus_list = np.array(stratus_list)
    if stratus_list is None:
        stratus_list = Y.copy()
    if not isinstance(test_size, float) or not 0 < test_size < 1:
        test_size = 0.2
    # init funcs
    sss = SSS(n_splits=1, test_size=test_size,
              train_size=None)  # random_state=0
    skf = StratifiedKFold(n_splits=kfold, shuffle=shuffle)
    x = X.copy()
    y = Y.copy()
    if len(X.shape) > 2:
        x = np.random.random((X.shape[0], 1))
    if len(Y.shape) > 1:
        y = Y[:, 0]
    TrainVal_Index, Test_Index = next(sss.split(x, stratus_list))
    x_tr_val = TrainVal_Index
    y_tr_val = stratus_list[TrainVal_Index]
    # Separate train and validation folds within the non-test portion
    train_indexes, val_indexes = [], []
    for train_index, val_index in skf.split(x_tr_val, y_tr_val):
        tr_i = x_tr_val[train_index]
        val_i = x_tr_val[val_index]
        train_indexes.append(tr_i.copy())
        val_indexes.append(val_i.copy())
    return train_indexes, val_indexes, TrainVal_Index.copy(), Test_Index.copy()
Code Example #7
def classif_subcosp(state, freq, elec, n_jobs=-1):
    global CHANGES
    print(state, freq)
    if SUBSAMPLE or ADAPT:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        if SUBSAMPLE:
            n_trials = info_data.min().min()
            n_trials = 61  # hard-coded override of the subsampled trial count
        elif ADAPT:
            n_trials = info_data.min()[state]
    elif FULL_TRIAL:
        groups = range(36)
    labels_og = INIT_LABELS

    file_path = (
        SAVE_PATH / "results" / PREFIX + NAME +
        "_{}_{}_{}_{}_{:.2f}.npy".format(state, freq, elec, WINDOW, OVERLAP))

    if not file_path.isfile():
        n_rep = 0
    else:
        # dicts saved with np.save need allow_pickle and .item() to load back
        final_save = np.load(file_path, allow_pickle=True).item()
        n_rep = int(final_save["n_rep"])
        n_splits = int(final_save["n_splits"])
    print("Starting from i={}".format(n_rep))

    file_name = NAME + "_{}_{}_{}_{}_{:.2f}.npy".format(
        state, freq, elec, WINDOW, OVERLAP)
    data_file_path = SAVE_PATH / file_name

    data_og = np.load(data_file_path)
    if FULL_TRIAL:
        cv = SSS(9)
    else:
        cv = StratifiedShuffleGroupSplit(2)
    lda = LDA()
    clf = TSclassifier(clf=lda)

    for i in range(n_rep, N_BOOTSTRAPS):
        CHANGES = True
        if FULL_TRIAL:
            data = data_og["data"]
        elif SUBSAMPLE or ADAPT:
            data, labels, groups = prepare_data(data_og,
                                                labels_og,
                                                n_trials=n_trials,
                                                random_state=i)
        else:
            data, labels, groups = prepare_data(data_og, labels_og)
        n_splits = cv.get_n_splits(None, labels, groups)

        save = classification(clf,
                              cv,
                              data,
                              labels,
                              groups,
                              N_PERM,
                              n_jobs=n_jobs)

        if i == 0:
            final_save = save
        elif BOOTSTRAP:
            for key, value in save.items():
                if key != "n_splits":
                    final_save[key] += value

        final_save["n_rep"] = i + 1
        np.save(file_path, final_save)

    final_save["auc_score"] = np.mean(final_save.get("auc_score", 0))
    final_save["acc_score"] = np.mean(final_save["acc_score"])
    if CHANGES:
        np.save(file_path, final_save)

    to_print = "accuracy for {} {} : {:.2f}".format(state, freq,
                                                    final_save["acc_score"])
    if BOOTSTRAP:
        standev = np.std([
            np.mean(final_save["acc"][i * n_splits:(i + 1) * n_splits])
            for i in range(N_BOOTSTRAPS)
        ])
        to_print += " (+/- {:.2f})".format(standev)
    print(to_print)
    if PERM:
        print("pval = {}".format(final_save["acc_pvalue"]))
Code Example #8
def classif_cov(state):
    """Where the magic happens"""
    print(state)
    if FULL_TRIAL:
        labels = np.concatenate((np.ones(18), np.zeros(18)))
        groups = range(36)
    elif SUBSAMPLE:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        n_trials = info_data.min().min()
        n_subs = len(info_data) - 1
        groups = [i for i in range(n_subs) for _ in range(n_trials)]
        n_total = n_trials * n_subs
        labels = [0 if i < n_total / 2 else 1 for i in range(n_total)]
    else:
        labels = loadmat(LABEL_PATH / state + "_labels.mat")["y"].ravel()
        labels, groups = create_groups(labels)

    file_path = SAVE_PATH / "results" / PREFIX + NAME + "_{}.mat".format(state)
    if not file_path.isfile():
        n_rep = 0
    else:
        final_save = proper_loadmat(file_path)
        n_rep = final_save["n_rep"]
    print("starting from i={}".format(n_rep))

    file_name = NAME + "_{}.mat".format(state)
    data_file_path = SAVE_PATH / file_name

    if data_file_path.isfile():
        data_og = loadmat(data_file_path)
        for i in range(n_rep, N_BOOTSTRAPS):
            if FULL_TRIAL:
                data = data_og["data"]
            elif SUBSAMPLE:
                data = prepare_data(data_og, n_trials=n_trials, random_state=i)
            else:
                data = prepare_data(data_og)

            if REDUCED:
                reduced_data = []
                for submat in data:
                    temp_a = np.delete(submat, i, 0)
                    temp_b = np.delete(temp_a, i, 1)
                    reduced_data.append(temp_b)
                data = np.asarray(reduced_data)

            if FULL_TRIAL:
                crossval = SSS(9)
            else:
                crossval = StratifiedLeave2GroupsOut()
            lda = LDA()
            clf = TSclassifier(clf=lda)
            save = classification(clf,
                                  crossval,
                                  data,
                                  labels,
                                  groups,
                                  N_PERM,
                                  n_jobs=-1)

            print(save["acc_score"])
            if i == 0:
                final_save = save
            elif BOOTSTRAP or REDUCED:
                for key, value in save.items():
                    final_save[key] += value

            final_save["n_rep"] = i + 1
            savemat(file_path, final_save)

        final_save["n_rep"] = N_BOOTSTRAPS
        if BOOTSTRAP:
            final_save["auc_score"] = np.mean(final_save["auc_score"])
            final_save["acc_score"] = np.mean(final_save["acc_score"])
        savemat(file_path, final_save)

        print("accuracy for %s %s : %0.2f (+/- %0.2f)" %
              (state, np.mean(save["acc_score"]), np.std(save["acc"])))
        if PERM:
            print("pval = {}".format(save["acc_pvalue"]))

    else:
        print(data_file_path.name + " Not found")
Code Example #9
def create_testdata(data, test_size = 0.2, random_state = 42):
    train_set, test_set = tts(data, test_size = test_size, random_state = random_state)
    return train_set, test_set

# train_set, test_set = create_testdata(housing)
# Per the exercise, look at the histogram of median income
# housing["median_income"].hist(bins = 50)
# plt.show()

# Each stratum must have enough instances, otherwise under-represented strata are likely to be misestimated
# Bucket the data (round up with ceil, then group everything > 5 into category 5)
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace = True)
# After bucketing, we can sample stratified by income category, using Scikit-Learn's StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit as SSS
split = SSS(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Check that the split is what we need
# print(housing["income_cat"].value_counts() / len(housing))
# Note: comparing the sampling bias of stratified vs. purely random splits shows that the stratified split matches the distribution of the original data almost exactly, while the random split is significantly skewed
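# A hedged sketch of that comparison (assumes pandas as pd and the purely
# random test_set from create_testdata above, for contrast):
# compare_props = pd.DataFrame({
#     "overall": housing["income_cat"].value_counts() / len(housing),
#     "stratified": strat_test_set["income_cat"].value_counts() / len(strat_test_set),
#     "random": test_set["income_cat"].value_counts() / len(test_set),
# }).sort_index()
# print(compare_props)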

# Now the data can be restored: drop the income_cat attribute
for sets in (strat_train_set, strat_test_set):
    sets.drop(["income_cat"], axis = 1, inplace = True)

# Generating the data sets took a while, but it is a critically important part of machine learning. On to the next step

Code Example #10
from sklearn.datasets import fetch_olivetti_faces
olivetti = fetch_olivetti_faces()
# print(olivetti.DESCR) # description of the olivetti dataset

from sklearn.model_selection import StratifiedShuffleSplit as SSS
# train+validation set and test set
data_split = SSS(n_splits=1, test_size=40, random_state=42)
train_valid_idx, test_idx = next(
    data_split.split(olivetti.data, olivetti.target))
X_train_valid = olivetti.data[train_valid_idx]
y_train_valid = olivetti.target[train_valid_idx]
X_test, y_test = olivetti.data[test_idx], olivetti.target[test_idx]

# split the train+validation set into training and validation sets
data_split = SSS(n_splits=1, test_size=80, random_state=43)
train_idx, valid_idx = next(data_split.split(X_train_valid, y_train_valid))
X_train, y_train = X_train_valid[train_idx], y_train_valid[train_idx]
X_valid, y_valid = X_train_valid[valid_idx], y_train_valid[valid_idx]

# print(X_train.shape, y_train.shape) # (280, 4096) (280,)
# print(X_valid.shape, y_valid.shape) # (80, 4096) (80,)
# print(X_test.shape, y_test.shape) # (40, 4096) (40,)

from sklearn.decomposition import PCA
pca = PCA(0.99)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)
X_test_pca = pca.transform(X_test)
# print(pca.n_components_) # 199

from sklearn.cluster import KMeans
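
The snippet is cut off at the KMeans import; a plausible continuation sketch (the cluster count is an assumption, not from the source):

kmeans = KMeans(n_clusters=40, random_state=42)  # assumed k: olivetti has 40 subjects
kmeans.fit(X_train_pca)
# print(kmeans.inertia_)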
Code Example #11
]]
TestData = TestData.apply(lambda x: x.fillna(x.mean()), axis=0)

################################### pre sampling ++++++++++++++++++++++++++++++++++++++++++
##################### model bau

ann = MLP(hidden_layer_sizes=(10, 4),
          activation='relu',
          solver='sgd',
          alpha=0.0001,
          learning_rate='constant',
          learning_rate_init=0.00001,
          max_iter=10000,
          tol=0.00000000005)

cv = SSS(n_splits=20, test_size=0.3, random_state=42)
selector = SelectKBest(chi2, k=5)
TrainData_new = selector.fit_transform(TrainData, Train_Target)
selected_cols = selector.get_support(indices=True)  # indexes of the kept features
Train_Target = Train_Target.values

networks = {}
f1_scores = {}

i = 0
for train_index, test_index in cv.split(TrainData_new, Train_Target):

    X_train, X_test = TrainData_new[train_index], TrainData_new[test_index]
    y_train, y_test = Train_Target[train_index], Train_Target[test_index]
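
The loop body is truncated here; a hedged sketch of how it plausibly continues, training a fresh copy of ann per split and filling the dicts declared above (the f1_score import is an assumption):

    from sklearn.base import clone
    from sklearn.metrics import f1_score
    model = clone(ann)  # fresh, unfitted copy for this split
    model.fit(X_train, y_train)
    networks[i] = model
    f1_scores[i] = f1_score(y_test, model.predict(X_test))
    i += 1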
Code Example #12
def classif_cosp(state, n_jobs=-1):
    global CHANGES
    print(state, "multif")
    if SUBSAMPLE or ADAPT:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        if SUBSAMPLE:
            n_trials = info_data.min().min()
            # n_trials = 30
        elif ADAPT:
            n_trials = info_data.min()[state]
    elif FULL_TRIAL:
        groups = range(36)
    labels_og = INIT_LABELS

    file_path = (SAVE_PATH / "results" / PREFIX + NAME +
                 "_{}_{}_{:.2f}.mat".format(state, WINDOW, OVERLAP))

    if not file_path.isfile():
        n_rep = 0
    else:
        final_save = proper_loadmat(file_path)
        n_rep = int(final_save["n_rep"])
        n_splits = int(final_save["n_splits"])
    print("Starting from i={}".format(n_rep))

    if FULL_TRIAL:
        crossval = SSS(9)
    else:
        crossval = StratifiedShuffleGroupSplit(2)
    lda = LDA()
    clf = TSclassifier(clf=lda)

    for i in range(n_rep, N_BOOTSTRAPS):
        CHANGES = True
        data_freqs = []
        for freq in FREQ_DICT:
            file_name = NAME + "_{}_{}_{}_{:.2f}.mat".format(
                state, freq, WINDOW, OVERLAP)
            data_file_path = SAVE_PATH / file_name
            data_og = loadmat(data_file_path)["data"].ravel()
            data_og = np.asarray([sub.squeeze() for sub in data_og])
            if SUBSAMPLE or ADAPT:
                data, labels, groups = prepare_data(data_og,
                                                    labels_og,
                                                    n_trials=n_trials,
                                                    random_state=i)
            else:
                data, labels, groups = prepare_data(data_og, labels_og)
            data_freqs.append(data)
            n_splits = crossval.get_n_splits(None, labels, groups)

        # stack per-frequency data: (freq, sub, a, b) -> (sub, a, b, freq)
        # (equivalent to the original chained swapaxes)
        data_freqs = np.asarray(data_freqs).transpose(1, 2, 3, 0)
        save = classification(clf,
                              crossval,
                              data_freqs,  # the stacked multi-frequency data
                              labels,
                              groups,
                              N_PERM,
                              n_jobs=n_jobs)

        if i == 0:
            final_save = save
        elif BOOTSTRAP:
            for key, value in save.items():
                if key != "n_splits":
                    final_save[key] += value

        final_save["n_rep"] = i + 1
        if n_jobs == -1:
            savemat(file_path, final_save)

    final_save["auc_score"] = np.mean(final_save.get("auc_score", 0))
    final_save["acc_score"] = np.mean(final_save["acc_score"])
    if CHANGES:
        savemat(file_path, final_save)

    to_print = "accuracy for {} {} : {:.2f}".format(state, freq,
                                                    final_save["acc_score"])
    if BOOTSTRAP:
        standev = np.std([
            np.mean(final_save["acc"][i * n_splits:(i + 1) * n_splits])
            for i in range(N_BOOTSTRAPS)
        ])
        to_print += " (+/- {:.2f})".format(standev)
    print(to_print)
    if PERM:
        print("pval = {}".format(final_save["acc_pvalue"]))
Code Example #13
    n_train_subs = int(0.6 * len(data_df))
    train_df = data_df[:n_train_subs]
    test_df = data_df[n_train_subs:]
    X_og, y = load_freq_data(train_df)
    X_test_og, y_test = load_freq_data(test_df)

    idx = np.random.RandomState(0).permutation(range(len(X_og)))
    X_og = X_og[idx]
    y = y[idx]
    print(X_og[[0, 33, 166]], y[[0, 33, 166]])

    idx = np.random.RandomState(0).permutation(range(len(X_test_og)))
    X_test_og = X_test_og[idx]
    y_test = y_test[idx]

    cv = SSS(5)

    all_scores = []
    # for C in [0.1, 1.0, 10.0, 100.0]:
    param_distributions = {
        "C": sp.stats.expon(scale=10),
        "gamma": sp.stats.expon(scale=0.1),
    }
    for elec in range(N_ELEC):
        # X = X_og
        # X_test = X_test_og
        X = X_og[:, elec]
        X_test = X_test_og[:, elec]
        # if len(X.shape) < 2:
        #     X = X[..., None]
        #     X_test = X_test[..., None]
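
The per-electrode loop is cut off here; a hedged sketch of the search it appears to set up (SVC with RandomizedSearchCV is an assumption inferred from the C/gamma distributions above):

        from sklearn.model_selection import RandomizedSearchCV
        from sklearn.svm import SVC
        if len(X.shape) < 2:
            X, X_test = X[..., None], X_test[..., None]
        search = RandomizedSearchCV(SVC(), param_distributions, n_iter=20, cv=cv)
        search.fit(X, y)
        all_scores.append(search.score(X_test, y_test))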
Code Example #14
File: ml_pipeline.py  Project: arthurdehgan/camcan
def create_crossval(label, y):
    if label != "subject":
        return SSGS(len(np.unique(y)) * 1, args.n_crossval)
    return SSS(10)