import os
from glob import glob

import numpy as np
from sklearn.utils import shuffle

import utils  # project-local helper module

for word2vec_vec, word2vec_name in zip(word2vec_vecs, word2vec_names):
    csv_filename = os.path.join(saving_dir, '{} {} {} {} {} {}.csv'.format(
        experiment,
        here,
        sub_name,
        roi_name,
        condition,
        word2vec_name))
    processed = glob(os.path.join(saving_dir, '*.csv'))
    if csv_filename in processed:  # don't repeat what has already been done
        print(csv_filename)
        continue

    if n_splits >= 100:
        # custom partitioning: hold out whole (id, words) groups in each split
        idxs_test = utils.customized_partition(df_data, ['id', 'words'], n_splits)
        while utils.check_train_test_splits(idxs_test):  # re-draw if any splits overlap
            idxs_test = utils.customized_partition(df_data, ['id', 'words'], n_splits=n_splits)
        idxs_train = [shuffle(np.array([idx for idx in np.arange(df_data.shape[0])
                                        if idx not in idx_test]))
                      for idx_test in idxs_test]
        # idxs_train = [utils.check_train_balance(df_data, idx_train, list(label_map.keys()))
        #               for idx_train in tqdm(idxs_train)]
        cv = zip(idxs_train, idxs_test)
    else:
        # few splits: a grouped shuffle split keeps samples from the same
        # group out of both sides of any single split
        from sklearn.model_selection import GroupShuffleSplit
        cv = GroupShuffleSplit(n_splits=n_splits,
                               test_size=0.2,
                               random_state=12345)
        idxs_train, idxs_test = [], []
        for idx_train, idx_test in cv.split(BOLD, targets, groups=groups):
            idxs_train.append(idx_train)
            idxs_test.append(idx_test)

    # look up the word2vec embedding of every stimulus word
    embedding_features = np.array([word2vec_vec[word.lower()] for word in df_data['words']])
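    # --- Hedged sketch (not in the original listing): one way the splits
    # and embeddings built above could feed a cross-validated encoding model.
    # Assumes BOLD is an (n_samples, n_voxels) array aligned row-for-row
    # with df_data; Ridge and r2_score are illustrative choices only.
    from sklearn.linear_model import Ridge
    from sklearn.metrics import r2_score

    fold_scores = []
    for idx_train, idx_test in zip(idxs_train, idxs_test):
        encoder = Ridge(alpha=100.0)  # illustrative regularization strength
        encoder.fit(embedding_features[idx_train], BOLD[idx_train])
        preds = encoder.predict(embedding_features[idx_test])
        # one R^2 per voxel, averaged into a single fold-level score
        fold_scores.append(
            r2_score(BOLD[idx_test], preds, multioutput='raw_values').mean())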
Code Example #2
        # The top of this listing is truncated; the imports and the
        # VarianceThreshold instantiation below are assumptions filled in
        # from how the names are used further down.
        import os
        from glob import glob

        import numpy as np
        from mvpa2.mappers.fx import mean_group_sample
        from sklearn.feature_selection import VarianceThreshold
        from sklearn.preprocessing import StandardScaler
        from tqdm import tqdm

        # drop zero-variance voxels, then standardize the rest
        variance_threshold = VarianceThreshold()
        variance_threshold.fit(BOLD)
        scaler = StandardScaler()
        scaler.fit(variance_threshold.transform(BOLD))
        csv_filename = os.path.join(
            saving_dir, '{} {} {} {} {}.csv'.format(experiment, here, sub_name,
                                                    roi_name, condition))
        processed = glob(os.path.join(saving_dir, '*.csv'))
        if csv_filename in processed:  # don't repeat what has already been done
            print(csv_filename)
        else:
            print('partitioning ...')
            idxs_train, idxs_test = utils.get_train_test_splits(
                dataset, label_map, n_splits)

            while utils.check_train_test_splits(idxs_test):  # re-draw until the splits pass the check
                idxs_train, idxs_test = utils.get_train_test_splits(
                    dataset, label_map, n_splits)
            for word2vec_name, word2vec_features in zip(
                    word2vec_names, word2vec_vecs):
                r_squares, scores = [], []
                for fold, (idx_train, idx_test) in tqdm(
                        enumerate(zip(idxs_train, idxs_test))):
                    if average:
                        tr = dataset[idx_train].get_mapped(
                            mean_group_sample(['chunks', 'id'],
                                              order='occurrence'))
                    else:
                        tr = dataset[idx_train]
                    te = dataset[idx_test].get_mapped(
                        mean_group_sample(['chunks', 'id'],
                                          order='occurrence'))
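                    # --- Hedged sketch (the original listing is truncated here):
                    # one way each fold might be scored. Assumes the datasets
                    # expose the stimulus word via a sample attribute named
                    # 'words' (a guess), that word2vec_features maps a word to
                    # its vector as in the first example, and that Ridge and
                    # r2_score stand in for whatever model the pipeline used.
                    from sklearn.linear_model import Ridge
                    from sklearn.metrics import r2_score

                    X_train = np.array([word2vec_features[w.lower()] for w in tr.sa.words])
                    X_test = np.array([word2vec_features[w.lower()] for w in te.sa.words])
                    y_train = scaler.transform(variance_threshold.transform(tr.samples))
                    y_test = scaler.transform(variance_threshold.transform(te.samples))

                    encoder = Ridge(alpha=100.0)  # illustrative regularization strength
                    encoder.fit(X_train, y_train)
                    preds = encoder.predict(X_test)
                    # one R^2 per voxel; keep the mean as the fold-level score
                    r_squares.append(r2_score(y_test, preds, multioutput='raw_values').mean())
                    scores.append(r2_score(y_test, preds))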