def test_split_kfold_classwise(self):
    """Two independent classwise k-fold splits must each partition the data
    and must differ from one another (i.e. splitting is randomised)."""
    nclasses = 10
    k = 10
    max_ninstances = 100
    min_ninstances = k

    # Build a shuffled label vector with a random count (>= k) per class so
    # every class has enough instances for k folds.
    labels = []
    for cls in range(nclasses):
        count = min_ninstances + one_randint(max_ninstances - min_ninstances)
        labels.extend([cls] * count)
    labels = np.array(labels, dtype=int)
    np.random.shuffle(labels)

    folds_iter1 = split_classwise(labels, k)
    folds_iter2 = split_classwise(labels, k)
    sorted_indices = np.arange(len(labels))

    self.assertEqual(len(folds_iter1), k)
    self.assertEqual(len(folds_iter2), k)

    for fold1, fold2 in zip(folds_iter1, folds_iter2):
        test1, train1 = fold1['test'], fold1['train']
        test2, train2 = fold2['test'], fold2['train']

        all1 = np.sort(np.concatenate((test1, train1)))
        all2 = np.sort(np.concatenate((test2, train2)))

        # Each fold must partition the full index set: train/test disjoint,
        # and together covering every index exactly once.
        self.assertEqual(len(np.intersect1d(test1, train1)), 0)
        self.assertTrue(np.all(sorted_indices == all1))
        self.assertEqual(len(np.intersect1d(test2, train2)), 0)
        self.assertTrue(np.all(sorted_indices == all2))
        self.assertTrue(np.all(all1 == all2))

        # The two runs must be randomised: equal fold sizes but different
        # membership between iterations.
        self.assertFalse(
            len(test1) != len(test2) or np.all(test1 == test2))
        self.assertFalse(
            len(train1) != len(train2) or np.all(train1 == train2))
def run_nfolds(data, nsyls, nfolds, niters, enum_labels, nlabels, classifier, bar):
    """Run `niters` repetitions of `nfolds`-fold classwise cross-validation.

    :param data: 2-D feature matrix, one row per instance
    :param nsyls: unused here; kept for interface compatibility with callers
    :param nfolds: number of folds per iteration
    :param niters: number of independent cross-validation iterations
    :param enum_labels: integer-encoded label per instance (indexable array)
    :param nlabels: number of distinct labels
    :param classifier: callable(train_x, train_y, test_x, test_y, nlabels)
        returning (score, label_hits, label_misses, importances)
    :param bar: optional progress bar with .max, .next(), .finish()
    :return: (label_prediction_scores, label_hitrates, importancess)
    """
    ntrials = nfolds * niters
    if bar:
        bar.max = ntrials

    label_prediction_scores = [0] * ntrials
    # Collected per trial but not returned — preserved for parity with the
    # original behaviour.
    label_hitss = [0] * ntrials
    label_missess = [0] * ntrials
    label_hitrates = np.empty((ntrials, nlabels))
    label_hitrates[:] = np.nan
    importancess = np.empty((ntrials, data.shape[1]))

    ind = 0
    for i in range(niters):
        folds = split_classwise(enum_labels, nfolds)
        for fold in folds:
            test_syl_idx = fold['test']
            train_syl_idx = fold['train']
            train_y = enum_labels[train_syl_idx]
            test_y = enum_labels[test_syl_idx]
            train_x = data[train_syl_idx, :]
            test_x = data[test_syl_idx, :]

            score, label_hits, label_misses, importances = classifier(
                train_x, train_y, test_x, test_y, nlabels)

            label_prediction_scores[ind] = score
            label_hitss[ind] = label_hits
            label_missess[ind] = label_misses
            # Bugfix: np.float was deprecated in NumPy 1.20 and removed in
            # 1.24 — use the builtin float to force true division.
            label_hitrate = label_hits / (label_hits + label_misses).astype(float)
            label_hitrates[ind, :] = label_hitrate
            importancess[ind, :] = importances
            ind += 1
            if bar:
                bar.next()
    if bar:
        bar.finish()
    return label_prediction_scores, label_hitrates, importancess
def split(self, ratio, limits=None):
    """Split this provider's data into a trainable train set and a test set.

    :param ratio: test-set ratio passed through to split_classwise
    :param limits: optional (min, max) instances-per-class bounds
    :return: (trainset, testset)
    """
    folds = split_classwise(self.enum_labels, ratio, nfolds=1,
                            balanced=self.balanced, limits=limits)
    train_inds = folds[0]['train']
    test_inds = folds[0]['test']

    # Only the train set is made trainable (it carries its enum labels).
    train_enum_labels = self.enum_labels[train_inds]
    trainset = inds2dataset(train_inds, self.data, self.labels, self.lens) \
        .make_trainable(train_enum_labels)
    testset = inds2dataset(test_inds, self.data, self.labels, self.lens)
    return trainset, testset
def handle(self, *args, **options):
    """Hyper-parameter search (via hyperopt) over MFCC extraction settings
    (ncep, fmin, fmax) for a given classifier type, writing the trials and a
    TSV summary to files named after the --profile option.

    NOTE(review): `profile` defaults to None via options.get, so a missing
    --profile would crash on `profile + '.tsv'` — verify callers always
    supply it.
    """
    # --- Unpack command-line options -------------------------------------
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options.get('profile', None)
    load_dir = options['load_dir']
    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    # Instances-per-class bounds: fixed when --ipc is given, otherwise a
    # [min_occur, 1.5*min_occur] range.
    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed as min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio = get_ratios(ratio_, 2)
    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
        clsf_type)
    classifier = classifiers[clsf_type]

    # --- Load database entities and labels -------------------------------
    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    _sids, _tids = get_sids_tids(database)
    _labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator,
                                               min_occur)
    # Drop instances that have no label at the requested level.
    if len(no_label_ids) > 0:
        _sids, _tids, _labels = exclude_no_labels(_sids, _tids, _labels,
                                                  no_label_ids)
    unique_labels, enum_labels = np.unique(_labels, return_inverse=True)

    # Single balanced classwise split; train+test together form the working
    # subset of instances used for the search.
    fold = split_classwise(enum_labels, ratio=valid_ratio,
                           limits=(min_occur, int(np.floor(min_occur * 1.5))),
                           nfolds=1, balanced=True)
    train = fold[0]['train']
    test = fold[0]['test']
    all_indices = np.concatenate((train, test))
    tids = _tids[all_indices]
    labels = _labels[all_indices]

    # --- Pull best classifier params from a previous hyperopt run --------
    # NOTE(review): hard-coded pickle path; assumes an earlier command left
    # its results at /tmp/hyperopt.pkl — confirm the producing command.
    with open('/tmp/hyperopt.pkl', 'rb') as f:
        saved = pickle.load(f)
    performance_data = saved[clsf_type]
    accuracies = performance_data['accuracies']
    groups = performance_data['groups']
    params = performance_data['params']
    group_name = '{}-{}'.format('mfcc', source)
    group_member_inds = np.where(groups == group_name)
    group_accuracies = accuracies[group_member_inds]
    best_acc_idx = np.argmax(group_accuracies)
    group_params = {}
    best_params = {}
    for param_name in params:
        param_values = np.array(params[param_name])
        group_param_values = param_values[group_member_inds]
        group_params[param_name] = group_param_values
        converter = converters[clsf_type][param_name]
        best_params[param_name] = converter(
            group_param_values[best_acc_idx])

    # Filled in after `loss` is defined; `loss` reads them via closure.
    params_names = []
    params_converters = []
    params_count = 0

    # valid-to-train ratio and the corresponding number of folds for k-fold
    # evaluation inside the objective.
    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    def loss(params):
        """Hyperopt objective: extract MFCC features with the candidate
        (ncep, fmin, fmax), aggregate, then k-fold-evaluate the classifier.
        Returns 1 - accuracy (hyperopt minimises)."""
        mfcc_args = {}
        for i in range(params_count):
            param_name = params_names[i]
            param_converter = params_converters[i]
            param_value = params[i]
            mfcc_args[param_name] = param_converter(param_value)

        _fmin = mfcc_args['fmin']
        _fmax = mfcc_args['fmax']
        _ncep = mfcc_args['ncep']

        extract_mfcc_multiparams(database_name, load_dir, _ncep, _fmin, _fmax)

        data = []
        tid2rows = {tid: [] for tid in tids}
        # Collect each enabled aggregator's saved values per tid.
        for aggregator in aggregators:
            agg_saved_file = 'database={}-feature=mfcc-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                .format(database_name, aggregator.get_name(), _fmin, _fmax, _ncep)
            agg_saved_file_loc = os.path.join(load_dir, agg_saved_file)
            with open(agg_saved_file_loc, 'rb') as f:
                tid2aval = pickle.load(f)
            for tid in tids:
                val = tid2aval[tid]
                row = tid2rows[tid]
                row.append(val)

        # Stack each tid's aggregated values into one feature row.
        for tid in tids:
            row = tid2rows[tid]
            row = np.hstack(row).T
            data.append(row)
        data = np.array(data)

        # Normalise and sanitise the feature matrix.
        data = zscore(data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))
        score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio,
                               nlabels, **best_params)
        return 1. - score

    # --- Search space: uniform draws, converted to ints/Hz by lambdas ----
    ncep_choices = hp.uniform('ncep', 13, 48)
    fmin_choices = hp.uniform('fmin', 0, 5)
    fmax_choices = hp.uniform('fmax', 8, 24)
    mfcc_params = {
        'ncep': (lambda x: int(np.round(x)), ncep_choices),
        'fmin': (lambda x: int(np.round(x) * 100), fmin_choices),
        'fmax': (lambda x: int(np.round(x) * 1000), fmax_choices),
    }
    space = []
    for arg_name, (converter, arg_values) in mfcc_params.items():
        space.append(arg_values)
        params_names.append(arg_name)
        params_converters.append(converter)
        params_count += 1

    # --- Run the optimisation and persist results ------------------------
    trials = Trials()
    best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=100,
                trials=trials)
    print(best)
    with open(trials_file, 'wb') as f:
        pickle.dump(trials, f)

    best_trial = trials.best_trial
    best_trial_args_values_ = best_trial['misc']['vals']
    best_trial_args_values = {}
    for arg_name, arg_values in best_trial_args_values_.items():
        converter = mfcc_params[arg_name][0]
        arg_value = converter(arg_values[0])
        best_trial_args_values[arg_name] = arg_value

    # Columns for the TSV summary: trial id, each tuned arg, accuracy.
    model_args = ['id'] + list(
        best_trial_args_values.keys()) + ['accuracy']
    model_args_values = {x: [] for x in model_args}
    for idx, trial in enumerate(trials.trials):
        if trial == best_trial:
            idx = 'Best'
        trial_args_values = trial['misc']['vals']
        for arg_name in model_args:
            if arg_name == 'id':
                model_args_values['id'].append(idx)
            elif arg_name == 'accuracy':
                trial_accuracy = 1. - trial['result']['loss']
                model_args_values['accuracy'].append(trial_accuracy)
            else:
                converter = mfcc_params[arg_name][0]
                val = converter(trial_args_values[arg_name][0])
                model_args_values[arg_name].append(val)

    # Write one row per column name, values tab-separated.
    with open(tsv_file, open_mode, encoding='utf-8') as f:
        for arg in model_args:
            values = model_args_values[arg]
            f.write('{}\t'.format(arg))
            f.write('\t'.join(map(str, values)))
            f.write('\n')
    open_mode = 'a'
def make_folds(self, nfolds, ratio=None):
    """Create and store classwise folds over this provider's labels.

    :param nfolds: number of folds
    :param ratio: test ratio per fold; defaults to 1/nfolds when omitted
    :return: the list of folds (also cached on self.folds)
    """
    effective_ratio = 1. / nfolds if ratio is None else ratio
    self.folds = split_classwise(self.enum_labels, effective_ratio, nfolds)
    return self.folds