def handle(self, *args, **options):
    database_name = options['database_name']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    niters = options['niters']
    profile = options.get('profile', None)
    tsv_file = profile + '.tsv'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)
    open_mode = 'w'

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))
    features = Feature.objects.all().order_by('id').filter(name='spectrum')

    sids, tids = get_sids_tids(database)
    labels, no_label_ids = get_labels_by_sids(sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(sids, tids, labels, no_label_ids)

    full_data = extract_rawdata(tids, features)
    data = [x[0].T for x in full_data]

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    dp = OneHotSequenceProvider(data, labels, balanced=True)
    trainvalidset, testset = dp.split(test_ratio, limits=(min_occur, int(np.floor(min_occur * 1.5))))

    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    hidden_layer_sizes_choices = [
        (100, ), (200, ), (400, ),
        (100, 100), (100, 200), (100, 400),
        (200, 100), (200, 200), (200, 400),
        (400, 100), (400, 200), (400, 400),
    ]

    choices = {'cnn': {'hidden_layer_sizes': hidden_layer_sizes_choices}}

    best_trial_args_values = {}
    for arg_name, arg_values in choices['cnn'].items():
        losses = []
        ids = []

        def loss_func(params):
            arg_value = params[0]
            classifier_args = best_trial_args_values.copy()
            classifier_args[arg_name] = arg_value
            print('classifier_args = {}'.format(classifier_args))
            score = perform_k_fold(cnn, trainvalidset, nfolds, v2t_ratio, nlabels, **classifier_args)
            return 1. - score

        for idx, arg_value in enumerate(arg_values):
            loss = loss_func((arg_value, ))
            ids.append(idx)
            losses.append(loss)

        best_loss_idx = np.argmin(losses)
        best_arg_value = arg_values[best_loss_idx]
        best_trial_args_values[arg_name] = best_arg_value

        model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
        model_args_values = {x: [] for x in model_args}

        for idx, loss in enumerate(losses):
            if idx == best_loss_idx:
                idx_str = 'Best'
            else:
                idx_str = str(idx)

            for arg_name_ in model_args:
                if arg_name_ == 'id':
                    model_args_values['id'].append(idx_str)
                elif arg_name_ == 'accuracy':
                    trial_accuracy = 1. - loss
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    if arg_name_ == arg_name:
                        val = arg_values[idx]
                    else:
                        val = best_trial_args_values[arg_name_]
                    model_args_values[arg_name_].append(val)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')
        open_mode = 'a'

    # Perform classification on the test set
    nfolds = int(np.floor(1 / test_ratio + 0.01))
    ntrials = nfolds * niters
    label_prediction_scores = [0] * ntrials
    label_hitss = [0] * ntrials
    label_missess = [0] * ntrials
    label_hitrates = np.empty((ntrials, nlabels))
    label_hitrates[:] = np.nan
    importancess = np.empty((ntrials, data.shape[1]))
    cfmats = np.ndarray((ntrials, nlabels, nlabels))

    ind = 0
    bar = Bar('Running CNN', max=ntrials)
    for iter in range(niters):
        traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
        traintetset.make_folds(nfolds, test_ratio)
        for k in range(nfolds):
            trainset, testset = traintetset.get_fold(k)
            train_x = np.array(trainset.data)
            train_y = np.array(trainset.labels, dtype=np.int32)
            test_x = np.array(testset.data)
            test_y = np.array(testset.labels, dtype=np.int32)

            score, label_hits, label_misses, cfmat, importances =\
                cnn(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)

            label_prediction_scores[ind] = score
            label_hitss[ind] = label_hits
            label_missess[ind] = label_misses

            label_hitrate = label_hits / (label_hits + label_misses).astype(np.float)
            label_hitrates[ind, :] = label_hitrate
            importancess[ind, :] = importances
            cfmats[ind, :, :] = cfmat

            bar.next()
            ind += 1
    bar.finish()

    mean_label_prediction_scores = np.nanmean(label_prediction_scores)
    std_label_prediction_scores = np.nanstd(label_prediction_scores)
    sum_cfmat = np.nansum(cfmats, axis=0)

    with open(tsv_file, open_mode, encoding='utf-8') as f:
        f.write('Results using best-model\'s parameters on testset\n')
        f.write('Feature group\tLabel prediction mean\tstdev\t{}\n'.format('\t'.join(unique_labels)))
        f.write('{}\t{}\t{}\t{}\n'.format('Spectrum', mean_label_prediction_scores, std_label_prediction_scores,
                                          '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

        f.write('\t')
        f.write('\t'.join(unique_labels))
        f.write('\n')
        for i in range(nlabels):
            label = unique_labels[i]
            cfrow = sum_cfmat[:, i]
            f.write(label)
            f.write('\t')
            f.write('\t'.join(map(str, cfrow)))
            f.write('\n')
        f.write('\n')
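# ---------------------------------------------------------------------------
# `perform_k_fold` is imported from elsewhere in this repository; the grid
# search above and the hyperopt loss functions further down only call it.
# The sketch below is a hypothetical minimal version, inferred from those call
# sites: build `nfolds` folds over the train+validation set, train the given
# classifier on each fold and return the mean validation score. The dataset
# API (make_folds/get_fold, .data, .labels) and the classifier return value
# (score, hits, misses, cfmat, importances) mirror their usage in this file;
# the actual implementation may differ.
# ---------------------------------------------------------------------------
import numpy as np


def perform_k_fold_sketch(classifier, trainvalidset, nfolds, v2t_ratio, nlabels, **classifier_args):
    """Hypothetical: mean k-fold validation score of `classifier`."""
    trainvalidset.make_folds(nfolds, v2t_ratio)
    scores = []
    for k in range(nfolds):
        trainset, validset = trainvalidset.get_fold(k)
        train_x = np.array(trainset.data)
        train_y = np.array(trainset.labels, dtype=np.int32)
        valid_x = np.array(validset.data)
        valid_y = np.array(validset.labels, dtype=np.int32)
        # Classifiers in this file return (score, label_hits, label_misses, cfmat, importances)
        score = classifier(train_x, train_y, valid_x, valid_y, nlabels, True, **classifier_args)[0]
        scores.append(score)
    return float(np.mean(scores))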
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    niters = options['niters']
    profile = options.get('profile', None)
    tsv_file = profile + '.tsv'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio = get_ratios(ratio_, 2)
    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    enabled_features = []
    for f in features:
        if f.name in feature_map:
            enabled_features.append(f)

    features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                   aggregations_hash=aggregations_hash).last()
    if dm is None:
        raise Exception('No full data matrix for database {}'.format(database_name))

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()
    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        group_name = '{}-{}'.format(ftgroup_name, source)
        group_member_inds = np.where(groups == group_name)
        group_accuracies = accuracies[group_member_inds]
        best_acc_idx = np.argmax(group_accuracies)

        group_params = {}
        best_params = {}
        for param_name in params:
            param_values = np.array(params[param_name])
            group_param_values = param_values[group_member_inds]
            group_params[param_name] = group_param_values

            converter = converters[clsf_type][param_name]
            best_params[param_name] = converter(group_param_values[best_acc_idx])

        dp = EnumDataProvider(data, labels, balanced=True)

        nfolds = int(np.floor(1 / valid_ratio + 0.01))
        ntrials = nfolds * niters
        label_prediction_scores = [0] * ntrials
        label_hitss = [0] * ntrials
        label_missess = [0] * ntrials
        label_hitrates = np.empty((ntrials, nlabels))
        label_hitrates[:] = np.nan
        importancess = np.empty((ntrials, data.shape[1]))
        cfmats = np.ndarray((ntrials, nlabels, nlabels))

        ind = 0
        bar = Bar('Features: {}. Classifier: {} Data type: {}...'.format(ftgroup_name, clsf_type, source),
                  max=ntrials)
        for iter in range(niters):
            traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
            traintetset.make_folds(nfolds, valid_ratio)
            for k in range(nfolds):
                trainset, testset = traintetset.get_fold(k)
                train_x = np.array(trainset.data)
                train_y = np.array(trainset.labels, dtype=np.int32)
                test_x = np.array(testset.data)
                test_y = np.array(testset.labels, dtype=np.int32)

                score, label_hits, label_misses, cfmat, importances = \
                    classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_params)

                label_prediction_scores[ind] = score
                label_hitss[ind] = label_hits
                label_missess[ind] = label_misses

                label_hitrate = label_hits / (label_hits + label_misses).astype(np.float)
                label_hitrates[ind, :] = label_hitrate
                importancess[ind, :] = importances
                cfmats[ind, :, :] = cfmat

                bar.next()
                ind += 1
        bar.finish()

        mean_label_prediction_scores = np.nanmean(label_prediction_scores)
        std_label_prediction_scores = np.nanstd(label_prediction_scores)
        sum_cfmat = np.nansum(cfmats, axis=0)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            if source == 'full':
                f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, mean_label_prediction_scores, std_label_prediction_scores,
                    '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
            else:
                f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, explained, pca_dims, mean_label_prediction_scores,
                    std_label_prediction_scores, '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

            f.write('Accuracy: \n')
            f.write('\t'.join(list(map(str, label_prediction_scores))))
            f.write('\n')

            f.write('\t')
            f.write('\t'.join(unique_labels))
            f.write('\n')
            for i in range(nlabels):
                label = unique_labels[i]
                cfrow = sum_cfmat[:, i]
                f.write(label)
                f.write('\t')
                f.write('\t'.join(map(str, cfrow)))
                f.write('\n')
            f.write('\n')

        open_mode = 'a'
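# ---------------------------------------------------------------------------
# `get_ratios` is imported from a shared utility module. Judging from its call
# sites (get_ratios(ratio_) -> train/valid/test, get_ratios(ratio_, 2) ->
# train/valid), it is assumed to parse a colon-separated ratio string such as
# '80:10:10' into fractions that sum to 1. Both the string format and the
# normalisation are assumptions; the sketch below is illustrative only.
# ---------------------------------------------------------------------------
def get_ratios_sketch(ratio_str, nparts=3):
    """Hypothetical: parse e.g. '80:10:10' into (0.8, 0.1, 0.1)."""
    parts = [float(x) for x in ratio_str.split(':')]
    assert len(parts) == nparts, 'Expected {} ratio components, got {}'.format(nparts, len(parts))
    total = sum(parts)
    return tuple(p / total for p in parts)


# Assumed usage: train_ratio, valid_ratio, test_ratio = get_ratios_sketch('80:10:10')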
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options.get('profile', None)
    load_dir = options['load_dir']
    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio = get_ratios(ratio_, 2)
    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    _sids, _tids = get_sids_tids(database)
    _labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        _sids, _tids, _labels = exclude_no_labels(_sids, _tids, _labels, no_label_ids)

    unique_labels, enum_labels = np.unique(_labels, return_inverse=True)
    fold = split_classwise(enum_labels, ratio=valid_ratio,
                           limits=(min_occur, int(np.floor(min_occur * 1.5))),
                           nfolds=1, balanced=True)
    train = fold[0]['train']
    test = fold[0]['test']
    all_indices = np.concatenate((train, test))

    tids = _tids[all_indices]
    labels = _labels[all_indices]

    with open('/tmp/hyperopt.pkl', 'rb') as f:
        saved = pickle.load(f)

    performance_data = saved[clsf_type]
    accuracies = performance_data['accuracies']
    groups = performance_data['groups']
    params = performance_data['params']

    group_name = '{}-{}'.format('mfcc', source)
    group_member_inds = np.where(groups == group_name)
    group_accuracies = accuracies[group_member_inds]
    best_acc_idx = np.argmax(group_accuracies)

    group_params = {}
    best_params = {}
    for param_name in params:
        param_values = np.array(params[param_name])
        group_param_values = param_values[group_member_inds]
        group_params[param_name] = group_param_values

        converter = converters[clsf_type][param_name]
        best_params[param_name] = converter(group_param_values[best_acc_idx])

    params_names = []
    params_converters = []
    params_count = 0

    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    def loss(params):
        mfcc_args = {}
        for i in range(params_count):
            param_name = params_names[i]
            param_converter = params_converters[i]
            param_value = params[i]
            mfcc_args[param_name] = param_converter(param_value)

        _fmin = mfcc_args['fmin']
        _fmax = mfcc_args['fmax']
        _ncep = mfcc_args['ncep']

        extract_mfcc_multiparams(database_name, load_dir, _ncep, _fmin, _fmax)

        data = []
        tid2rows = {tid: [] for tid in tids}

        for aggregator in aggregators:
            agg_saved_file = 'database={}-feature=mfcc-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                .format(database_name, aggregator.get_name(), _fmin, _fmax, _ncep)
            agg_saved_file_loc = os.path.join(load_dir, agg_saved_file)
            with open(agg_saved_file_loc, 'rb') as f:
                tid2aval = pickle.load(f)
                for tid in tids:
                    val = tid2aval[tid]
                    row = tid2rows[tid]
                    row.append(val)

        for tid in tids:
            row = tid2rows[tid]
            row = np.hstack(row).T
            data.append(row)

        data = np.array(data)
        data = zscore(data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))

        score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels, **best_params)
        return 1. - score

    ncep_choices = hp.uniform('ncep', 13, 48)
    fmin_choices = hp.uniform('fmin', 0, 5)
    fmax_choices = hp.uniform('fmax', 8, 24)

    mfcc_params = {
        'ncep': (lambda x: int(np.round(x)), ncep_choices),
        'fmin': (lambda x: int(np.round(x) * 100), fmin_choices),
        'fmax': (lambda x: int(np.round(x) * 1000), fmax_choices),
    }

    space = []
    for arg_name, (converter, arg_values) in mfcc_params.items():
        space.append(arg_values)
        params_names.append(arg_name)
        params_converters.append(converter)
        params_count += 1

    trials = Trials()
    best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
    print(best)

    with open(trials_file, 'wb') as f:
        pickle.dump(trials, f)

    best_trial = trials.best_trial
    best_trial_args_values_ = best_trial['misc']['vals']
    best_trial_args_values = {}
    for arg_name, arg_values in best_trial_args_values_.items():
        converter = mfcc_params[arg_name][0]
        arg_value = converter(arg_values[0])
        best_trial_args_values[arg_name] = arg_value

    model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
    model_args_values = {x: [] for x in model_args}
    for idx, trial in enumerate(trials.trials):
        if trial == best_trial:
            idx = 'Best'
        trial_args_values = trial['misc']['vals']
        for arg_name in model_args:
            if arg_name == 'id':
                model_args_values['id'].append(idx)
            elif arg_name == 'accuracy':
                trial_accuracy = 1. - trial['result']['loss']
                model_args_values['accuracy'].append(trial_accuracy)
            else:
                converter = mfcc_params[arg_name][0]
                val = converter(trial_args_values[arg_name][0])
                model_args_values[arg_name].append(val)

    with open(tsv_file, open_mode, encoding='utf-8') as f:
        for arg in model_args:
            values = model_args_values[arg]
            f.write('{}\t'.format(arg))
            f.write('\t'.join(map(str, values)))
            f.write('\n')
    open_mode = 'a'
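# ---------------------------------------------------------------------------
# The MFCC search space above is continuous on purpose: hyperopt samples
# ncep in [13, 48], fmin in [0, 5] and fmax in [8, 24], and the converters in
# `mfcc_params` map those samples to extraction parameters (ncep rounded to an
# integer, fmin scaled to 0-500 Hz in 100 Hz steps, fmax to 8-24 kHz in 1 kHz
# steps). A small self-contained check of that mapping, using the same
# converter expressions with a made-up sample:
# ---------------------------------------------------------------------------
import numpy as np

mfcc_converters = {
    'ncep': lambda x: int(np.round(x)),
    'fmin': lambda x: int(np.round(x) * 100),
    'fmax': lambda x: int(np.round(x) * 1000),
}

sampled = {'ncep': 26.7, 'fmin': 3.2, 'fmax': 21.9}  # hypothetical hyperopt sample
converted = {name: mfcc_converters[name](value) for name, value in sampled.items()}
assert converted == {'ncep': 27, 'fmin': 300, 'fmax': 22000}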
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options['profile']
    agg = options['agg']
    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)
    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = list(feature_map.values())
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    if agg == 'all':
        aggregators = [aggregator_map[x.name] for x in aggregations]
    else:
        aggregators = enabled_aggregators[agg]

    _sids, _tids = get_sids_tids(database)
    full_data, col_inds = extract_rawdata(_tids, features, aggregators)

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, testset = dp.split(test_ratio, limits=(ipc_min, ipc_max))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        params_names = []
        params_converters = []
        params_count = 0

        def loss(params):
            classifier_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                classifier_args[param_name] = param_converter(param_value)

            print(classifier_args)
            score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels, **classifier_args)
            return 1. - score

        n_estimators_choices = hp.uniform('n_estimators', 40, 100)
        min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
        min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)
        n_features = data.shape[1]
        auto_gamma = 1 / n_features
        gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
        c_choices = hp.uniform('C', -1, 2)
        hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
        n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

        choices = {
            'rf': {
                'n_estimators': (lambda x: int(np.round(x)), n_estimators_choices),
                'min_samples_split': (lambda x: int(np.round(x)), min_samples_split_choices),
                'min_samples_leaf': (lambda x: int(np.round(x)), min_samples_leaf_choices),
            },
            'svm_rbf': {
                'gamma': (float, gamma_choices),
                'C': (lambda x: 10 ** x, c_choices),
            },
            'svm_linear': {
                'C': (lambda x: 10 ** x, c_choices),
            },
            'nnet': {
                'hidden_layer_sizes': (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
            },
            'knn': {
                'n_neighbors': (lambda x: int(np.round(x)), n_neighbors_choices)
            }
        }

        space = []
        for arg_name, (converter, arg_values) in choices[clsf_type].items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        max_evals = params_count * 10
        best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = choices[clsf_type][arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    converter = choices[clsf_type][arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    model_args_values[arg_name].append(val)

        # Perform classification on the test set
        train_x = np.array(trainvalidset.data)
        train_y = np.array(trainvalidset.labels, dtype=np.int32)
        test_x = np.array(testset.data)
        test_y = np.array(testset.labels, dtype=np.int32)

        score, label_hits, label_misses, cfmat, importances =\
            classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
        lb_hitrates = label_hits / (label_hits + label_misses).astype(np.float)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')

            f.write('Results using best-model\'s parameters on testset\n')
            if source == 'full':
                f.write('Feature group\tNdims\tLabel prediction score\t{}\n'.format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\n'.format(ftgroup_name, ndims, score, '\t'.join(map(str, lb_hitrates))))
            else:
                f.write('Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(ftgroup_name, ndims, explained, pca_dims, score,
                                                          '\t'.join(map(str, lb_hitrates))))
            f.write('\n')
        open_mode = 'a'
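# ---------------------------------------------------------------------------
# Note on the classifier search spaces above: for the SVMs, hyperopt samples
# the exponent of C uniformly in [-1, 2] and the converter maps it through
# 10**x, i.e. a log-uniform search over C in [0.1, 100]; integer-valued
# parameters (n_estimators, n_neighbors, ...) are sampled continuously and
# rounded by their converters. A tiny self-contained illustration using the
# same converter expressions:
# ---------------------------------------------------------------------------
import numpy as np


def c_converter(x):
    return 10 ** x


def n_estimators_converter(x):
    return int(np.round(x))


assert c_converter(0.0) == 1.0 and c_converter(2.0) == 100.0
assert abs(c_converter(-1.0) - 0.1) < 1e-12
assert n_estimators_converter(72.6) == 73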
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ratio_ = options['ratio']
    niters = options['niters']
    csv_filename = options.get('csv_filename', None)

    train_ratio, valid_ratio = get_ratios(ratio_, 2)

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    enabled_features = []
    for f in features:
        if f.name in feature_map:
            enabled_features.append(f)

    features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                   aggregations_hash=aggregations_hash).last()
    if dm is None:
        raise Exception('No full data matrix for database {}'.format(database_name))

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()
    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    if csv_filename:
        with open(csv_filename, 'w', encoding='utf-8') as f:
            if source == 'pca':
                f.write('Feature group\tAggregators\tNdims\tPCA explained\tPCA Dims\tLabel prediction mean\tstdev'
                        '\t{}\n'.format('\t'.join(unique_labels)))
            else:
                f.write('Feature group\tAggregators\tNdims\tLabel prediction mean\tstdev\t{}\n'
                        .format('\t'.join(unique_labels)))

    for ftgroup_name, feature_names in ftgroup_names.items():
        for agggroup_name, aggs in list(enabled_aggregators.items()) + [('all', None)]:
            if agggroup_name == 'all':
                aggs = [aggregator_map[x.name] for x in aggregations]

            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]

            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggs:
                        col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)

            nfolds = int(np.floor(1 / valid_ratio + 0.01))
            ntrials = nfolds * niters
            label_prediction_scores = [0] * ntrials
            label_hitss = [0] * ntrials
            label_missess = [0] * ntrials
            label_hitrates = np.empty((ntrials, nlabels))
            label_hitrates[:] = np.nan
            importancess = np.empty((ntrials, data.shape[1]))
            cfmats = np.ndarray((ntrials, nlabels, nlabels))

            ind = 0
            bar = Bar('Features: {}. Aggregator: {}. Classifier: {} Data type: {}...'
                      .format(ftgroup_name, agggroup_name, clsf_type, source), max=ntrials)

            for iter in range(niters):
                traintetset, _ = dp.split(0, limits=(min_occur, int(np.floor(min_occur * 1.5))))
                traintetset.make_folds(nfolds, valid_ratio)
                for k in range(nfolds):
                    trainset, testset = traintetset.get_fold(k)
                    train_x = np.array(trainset.data)
                    train_y = np.array(trainset.labels, dtype=np.int32)
                    test_x = np.array(testset.data)
                    test_y = np.array(testset.labels, dtype=np.int32)

                    score, label_hits, label_misses, cfmat, importances = \
                        classifier(train_x, train_y, test_x, test_y, nlabels, True)

                    label_prediction_scores[ind] = score
                    label_hitss[ind] = label_hits
                    label_missess[ind] = label_misses

                    label_hitrate = label_hits / (label_hits + label_misses).astype(np.float)
                    label_hitrates[ind, :] = label_hitrate
                    importancess[ind, :] = importances
                    cfmats[ind, :, :] = cfmat

                    bar.next()
                    ind += 1
            bar.finish()

            mean_label_prediction_scores = np.nanmean(label_prediction_scores)
            std_label_prediction_scores = np.nanstd(label_prediction_scores)
            sum_cfmat = np.nansum(cfmats, axis=0)

            if csv_filename:
                with open(csv_filename, 'a', encoding='utf-8') as f:
                    if source == 'full':
                        f.write('{}\t{}\t{}\t{}\t{}\t{}\n'
                                .format(ftgroup_name, agggroup_name, ndims,
                                        mean_label_prediction_scores, std_label_prediction_scores,
                                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                    else:
                        f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'
                                .format(ftgroup_name, agggroup_name, ndims, explained, pca_dims,
                                        mean_label_prediction_scores, std_label_prediction_scores,
                                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

                    f.write('\t')
                    f.write('\t'.join(unique_labels))
                    f.write('\n')
                    for i in range(nlabels):
                        label = unique_labels[i]
                        cfrow = sum_cfmat[:, i]
                        f.write(label)
                        f.write('\t')
                        f.write('\t'.join(map(str, cfrow)))
                        f.write('\n')
                    f.write('\n')
            else:
                print('{}/{}: {} by {}: mean = {} std = {}'
                      .format(ftgroup_name, agggroup_name, clsf_type, source,
                              mean_label_prediction_scores, std_label_prediction_scores))
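# ---------------------------------------------------------------------------
# `pca_optimal` is imported from the repository's feature utilities. Based on
# the call sites above (explained, data = pca_optimal(data, ndims, 0.9)), it
# is assumed to project the data onto the smallest number of principal
# components that explains at least the requested fraction of variance and to
# return that fraction together with the transformed data. A hypothetical
# sketch using scikit-learn; the real implementation may differ.
# ---------------------------------------------------------------------------
import numpy as np
from sklearn.decomposition import PCA


def pca_optimal_sketch(data, max_dims, min_explained=0.9):
    """Hypothetical: keep the fewest PCA components covering `min_explained` variance."""
    n_components = min(max_dims, data.shape[0], data.shape[1])
    pca = PCA(n_components=n_components)
    transformed = pca.fit_transform(data)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # First index where the cumulative explained variance reaches the threshold
    n_keep = min(int(np.searchsorted(cumulative, min_explained)) + 1, len(cumulative))
    return cumulative[n_keep - 1], transformed[:, :n_keep]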
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options['profile']
    load_from = options['load_from']
    format = options['format']
    min_max_loc = options['min_max_loc']
    denormalised = options['denormalised']
    kernel_only = options['kernel_only']

    extractor = extractors[format]

    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)
    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    if not load_from.lower().endswith('.zip'):
        load_from += '.zip'

    variables = read_variables(load_from)
    variables['extractor'] = extractor
    variables['denormalised'] = denormalised

    if denormalised:
        global_min, global_max = load_global_min_max(min_max_loc)
        variables['global_min'] = global_min
        variables['global_max'] = global_max

    variables['is_log_psd'] = format.startswith('log_')

    factory = NDS2SAEFactory()
    factory.set_output(load_from)
    factory.learning_rate = None
    factory.learning_rate_func = None
    encoder = factory.build()
    session = encoder.recreate_session()

    _sids, full_data = encode_into_data(variables, encoder, session, database_name, kernel_only)

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, _, labels = exclude_no_labels(_sids, None, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0
    ndims = full_data.shape[1]

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    dp = EnumDataProvider(full_data, labels, balanced=True)
    trainvalidset, testset = dp.split(test_ratio, limits=(ipc_min, ipc_max))

    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    params_names = []
    params_converters = []
    params_count = 0

    def loss(params):
        classifier_args = {}
        for i in range(params_count):
            param_name = params_names[i]
            param_converter = params_converters[i]
            param_value = params[i]
            classifier_args[param_name] = param_converter(param_value)

        print(classifier_args)
        score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels, **classifier_args)
        return 1. - score

    n_estimators_choices = hp.uniform('n_estimators', 40, 100)
    min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
    min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)
    n_features = full_data.shape[1]
    auto_gamma = 1 / n_features
    gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
    c_choices = hp.uniform('C', -1, 2)
    hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
    n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

    choices = {
        'rf': {
            'n_estimators': (lambda x: int(np.round(x)), n_estimators_choices),
            'min_samples_split': (lambda x: int(np.round(x)), min_samples_split_choices),
            'min_samples_leaf': (lambda x: int(np.round(x)), min_samples_leaf_choices),
        },
        'svm_rbf': {
            'gamma': (float, gamma_choices),
            'C': (lambda x: 10 ** x, c_choices),
        },
        'svm_linear': {
            'C': (lambda x: 10 ** x, c_choices),
        },
        'nnet': {
            'hidden_layer_sizes': (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
        },
        'knn': {
            'n_neighbors': (lambda x: int(np.round(x)), n_neighbors_choices)
        }
    }

    space = []
    for arg_name, (converter, arg_values) in choices[clsf_type].items():
        space.append(arg_values)
        params_names.append(arg_name)
        params_converters.append(converter)
        params_count += 1

    trials = Trials()
    max_evals = params_count * 10
    best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    print(best)

    with open(trials_file, 'wb') as f:
        pickle.dump(trials, f)

    best_trial = trials.best_trial
    best_trial_args_values_ = best_trial['misc']['vals']
    best_trial_args_values = {}
    for arg_name, arg_values in best_trial_args_values_.items():
        converter = choices[clsf_type][arg_name][0]
        arg_value = converter(arg_values[0])
        best_trial_args_values[arg_name] = arg_value

    model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
    model_args_values = {x: [] for x in model_args}
    for idx, trial in enumerate(trials.trials):
        if trial == best_trial:
            idx = 'Best'
        trial_args_values = trial['misc']['vals']
        for arg_name in model_args:
            if arg_name == 'id':
                model_args_values['id'].append(idx)
            elif arg_name == 'accuracy':
                trial_accuracy = 1. - trial['result']['loss']
                model_args_values['accuracy'].append(trial_accuracy)
            else:
                converter = choices[clsf_type][arg_name][0]
                val = converter(trial_args_values[arg_name][0])
                model_args_values[arg_name].append(val)

    # Perform classification on the test set
    train_x = np.array(trainvalidset.data)
    train_y = np.array(trainvalidset.labels, dtype=np.int32)
    test_x = np.array(testset.data)
    test_y = np.array(testset.labels, dtype=np.int32)

    score, label_hits, label_misses, cfmat, importances =\
        classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
    lb_hitrates = label_hits / (label_hits + label_misses).astype(np.float)

    with open(tsv_file, open_mode, encoding='utf-8') as f:
        for arg in model_args:
            values = model_args_values[arg]
            f.write('{}\t'.format(arg))
            f.write('\t'.join(map(str, values)))
            f.write('\n')

        f.write('Results using best-model\'s parameters on testset\n')
        f.write('Feature group\tNdims\tLabel prediction score\t{}\n'.format('\t'.join(unique_labels)))
        f.write('{}\t{}\t{}\t{}\n'.format('s2senc', ndims, score, '\t'.join(map(str, lb_hitrates))))
        f.write('\n')
    open_mode = 'a'
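# ---------------------------------------------------------------------------
# `exclude_no_labels` is shared by all of the commands above. Judging from its
# call sites (it receives the sid/tid/label arrays plus the ids reported by
# get_labels_by_sids as having no usable label, and tids may be None), it is
# assumed to drop those entries from all three arrays. Whether `no_label_ids`
# holds sids or positional indices is not visible here; the sketch below
# assumes sids and is illustrative only.
# ---------------------------------------------------------------------------
import numpy as np


def exclude_no_labels_sketch(sids, tids, labels, no_label_ids):
    """Hypothetical: remove entries whose sid appears in `no_label_ids`."""
    keep = ~np.isin(sids, no_label_ids)
    return sids[keep], (tids[keep] if tids is not None else None), labels[keep]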