def process_class_measures(self, original_measures):
    if self.feature_grouper == 'pca':
        # Reduce dimensionality first, then z-score the projected measures
        explained, pcaed_measures = pca_optimal(original_measures, self.pca_dimension * 2,
                                                self.pca_explained, self.pca_dimension)
        pcaed_measures = zscore(pcaed_measures)
        print('explained = {}, data.shape = {}'.format(explained, pcaed_measures.shape))
        dist_triu = pdist(pcaed_measures)
        return dist_triu, pcaed_measures
    else:
        # No grouping: pairwise distances on the raw measures
        dist_triu = pdist(original_measures)
        return dist_triu, original_measures
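# A minimal sketch of the pca_optimal helper assumed above (the real one is
# imported elsewhere in this repo): fit a PCA and keep the smallest number of
# components, bounded by min_dim and the fitted dimensionality, whose
# cumulative explained variance reaches the requested threshold. It is assumed
# to return (explained variance achieved, projected data), matching both call
# sites in this file. The name pca_optimal_sketch is hypothetical.
import numpy as np
from sklearn.decomposition import PCA


def pca_optimal_sketch(data, max_dim, min_explained, min_dim=1):
    pca = PCA(n_components=min(max_dim, data.shape[0], data.shape[1]))
    projected = pca.fit_transform(data)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # Smallest dimensionality reaching the threshold, clipped to a valid range
    ndims = int(np.searchsorted(cumulative, min_explained) + 1)
    ndims = int(np.clip(ndims, min_dim, len(cumulative)))
    return cumulative[ndims - 1], projected[:, :ndims]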
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    niters = options['niters']
    profile = options.get('profile', None)
    tsv_file = profile + '.tsv'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio = get_ratios(ratio_, 2)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier type: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    enabled_features = [f for f in features if f.name in feature_map]

    features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    # Select the most recent full data matrix computed for this database
    dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                   aggregations_hash=aggregations_hash).last()
    if dm is None:
        raise Exception('No full data matrix for database {}'.format(database_name))

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()

    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    # Deduplicate segment ids and reorder the corresponding rows to match
    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    # z-score normalise each column; constant columns yield NaN/inf, so zero them out
    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        # Collect the column indices belonging to this feature group
        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        # Load the saved hyperopt results and pick the best parameters for this group
        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        group_name = '{}-{}'.format(ftgroup_name, source)
        group_member_inds = np.where(groups == group_name)
        group_accuracies = accuracies[group_member_inds]
        best_acc_idx = np.argmax(group_accuracies)

        group_params = {}
        best_params = {}
        for param_name in params:
            param_values = np.array(params[param_name])
            group_param_values = param_values[group_member_inds]
            group_params[param_name] = group_param_values

            converter = converters[clsf_type][param_name]
            best_params[param_name] = converter(group_param_values[best_acc_idx])

        dp = EnumDataProvider(data, labels, balanced=True)

        nfolds = int(np.floor(1 / valid_ratio + 0.01))
        ntrials = nfolds * niters

        label_prediction_scores = [0] * ntrials
        label_hitss = [0] * ntrials
        label_missess = [0] * ntrials

        label_hitrates = np.empty((ntrials, nlabels))
        label_hitrates[:] = np.nan
        importancess = np.empty((ntrials, data.shape[1]))
        cfmats = np.empty((ntrials, nlabels, nlabels))

        ind = 0
        bar = Bar('Features: {}. Classifier: {}. Data type: {}...'
                  .format(ftgroup_name, clsf_type, source), max=ntrials)

        for iteration in range(niters):
            trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))
            trainvalidset.make_folds(nfolds, valid_ratio)

            for k in range(nfolds):
                trainset, testset = trainvalidset.get_fold(k)
                train_x = np.array(trainset.data)
                train_y = np.array(trainset.labels, dtype=np.int32)
                test_x = np.array(testset.data)
                test_y = np.array(testset.labels, dtype=np.int32)

                score, label_hits, label_misses, cfmat, importances = \
                    classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_params)

                label_prediction_scores[ind] = score
                label_hitss[ind] = label_hits
                label_missess[ind] = label_misses

                label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                label_hitrates[ind, :] = label_hitrate
                importancess[ind, :] = importances
                cfmats[ind, :, :] = cfmat

                bar.next()
                ind += 1
        bar.finish()

        mean_label_prediction_scores = np.nanmean(label_prediction_scores)
        std_label_prediction_scores = np.nanstd(label_prediction_scores)
        sum_cfmat = np.nansum(cfmats, axis=0)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            if source == 'full':
                f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, mean_label_prediction_scores,
                    std_label_prediction_scores,
                    '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
            else:
                f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, explained, pca_dims,
                    mean_label_prediction_scores, std_label_prediction_scores,
                    '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

            f.write('Accuracy:\n')
            f.write('\t'.join(list(map(str, label_prediction_scores))))
            f.write('\n')

            # Summed confusion matrix over all trials, one row per true label
            f.write('\t')
            f.write('\t'.join(unique_labels))
            f.write('\n')
            for i in range(nlabels):
                label = unique_labels[i]
                cfrow = sum_cfmat[:, i]
                f.write(label)
                f.write('\t')
                f.write('\t'.join(map(str, cfrow)))
                f.write('\n')
            f.write('\n')

        open_mode = 'a'
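# A minimal sketch of the assumed get_ratios contract (the real helper is
# imported elsewhere): parse a colon-separated ratio string such as '80:10:10'
# into floats that sum to one, optionally enforcing the number of parts. The
# name get_ratios_sketch is hypothetical.
def get_ratios_sketch(ratio_str, nparts=3):
    parts = [float(x) for x in ratio_str.split(':')]
    assert len(parts) == nparts, 'Expected {} ratio parts, got {}'.format(nparts, len(parts))
    total = sum(parts)
    return tuple(p / total for p in parts)

# e.g. get_ratios_sketch('80:20', 2) -> (0.8, 0.2)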
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options['profile']
    agg = options['agg']
    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier type: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = list(feature_map.values())
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    if agg == 'all':
        aggregators = [aggregator_map[x.name] for x in aggregations]
    else:
        aggregators = enabled_aggregators[agg]

    _sids, _tids = get_sids_tids(database)
    full_data, col_inds = extract_rawdata(_tids, features, aggregators)

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    # z-score normalise each column; constant columns yield NaN/inf, so zero them out
    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        # Collect the column indices belonging to this feature group
        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, testset = dp.split(test_ratio, limits=(ipc_min, ipc_max))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        params_names = []
        params_converters = []
        params_count = 0

        def loss(params):
            # Hyperopt objective: minimise 1 - mean validation score over the folds
            classifier_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                classifier_args[param_name] = param_converter(param_value)

            print(classifier_args)
            score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels,
                                   **classifier_args)
            return 1. - score

        n_estimators_choices = hp.uniform('n_estimators', 40, 100)
        min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
        min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)
        n_features = data.shape[1]
        auto_gamma = 1 / n_features
        gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
        c_choices = hp.uniform('C', -1, 2)
        hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
        n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

        # Per-classifier search spaces as (converter, hyperopt space) pairs
        choices = {
            'rf': {
                'n_estimators': (lambda x: int(np.round(x)), n_estimators_choices),
                'min_samples_split': (lambda x: int(np.round(x)), min_samples_split_choices),
                'min_samples_leaf': (lambda x: int(np.round(x)), min_samples_leaf_choices),
            },
            'svm_rbf': {
                'gamma': (float, gamma_choices),
                'C': (lambda x: 10 ** x, c_choices),
            },
            'svm_linear': {
                'C': (lambda x: 10 ** x, c_choices),
            },
            'nnet': {
                'hidden_layer_sizes': (lambda x: (int(np.round(x)),), hidden_layer_size_choices)
            },
            'knn': {
                'n_neighbors': (lambda x: int(np.round(x)), n_neighbors_choices)
            }
        }

        space = []
        for arg_name, (converter, arg_values) in choices[clsf_type].items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        max_evals = params_count * 10
        best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = choices[clsf_type][arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']

        # Tabulate each trial's parameters and accuracy, marking the best one
        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    converter = choices[clsf_type][arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    model_args_values[arg_name].append(val)

        # Perform classification on the test set using the best parameters
        train_x = np.array(trainvalidset.data)
        train_y = np.array(trainvalidset.labels, dtype=np.int32)
        test_x = np.array(testset.data)
        test_y = np.array(testset.labels, dtype=np.int32)

        score, label_hits, label_misses, cfmat, importances = \
            classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
        lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')

            f.write('Results using best model\'s parameters on test set\n')
            if source == 'full':
                f.write('Feature group\tNdims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, score, '\t'.join(map(str, lb_hitrates))))
            else:
                f.write('Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, explained, pca_dims, score,
                    '\t'.join(map(str, lb_hitrates))))
            f.write('\n')
        open_mode = 'a'
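# A minimal sketch of the assumed perform_k_fold contract used as the hyperopt
# objective above (the real implementation is imported elsewhere): evaluate the
# classifier on every fold of the train/validation set and return the mean
# validation score. It assumes the classifier callable has the same signature
# and 5-tuple return as the calls shown in handle(); the name
# perform_k_fold_sketch is hypothetical.
import numpy as np


def perform_k_fold_sketch(classifier, trainvalidset, nfolds, v2t_ratio, nlabels, **classifier_args):
    trainvalidset.make_folds(nfolds, v2t_ratio)
    scores = []
    for k in range(nfolds):
        trainset, validset = trainvalidset.get_fold(k)
        train_x = np.array(trainset.data)
        train_y = np.array(trainset.labels, dtype=np.int32)
        valid_x = np.array(validset.data)
        valid_y = np.array(validset.labels, dtype=np.int32)
        # Only the first element (the overall score) is needed for tuning
        score = classifier(train_x, train_y, valid_x, valid_y, nlabels, True, **classifier_args)[0]
        scores.append(score)
    return np.mean(scores)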
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ratio_ = options['ratio']
    niters = options['niters']
    csv_filename = options.get('csv_filename', None)

    train_ratio, valid_ratio = get_ratios(ratio_, 2)

    assert clsf_type in classifiers.keys(), 'Unknown classifier type: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    enabled_features = [f for f in features if f.name in feature_map]

    features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    # Select the most recent full data matrix computed for this database
    dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                   aggregations_hash=aggregations_hash).last()
    if dm is None:
        raise Exception('No full data matrix for database {}'.format(database_name))

    dm_sids_path = dm.get_sids_path()
    dm_tids_path = dm.get_tids_path()
    dm_bytes_path = dm.get_bytes_path()
    feature_cols = dm.get_cols_path()

    with open(feature_cols, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)

    # Deduplicate segment ids and reorder the corresponding rows to match
    _sids = bytes_to_ndarray(dm_sids_path, np.int32)
    _sids, sort_order = np.unique(_sids, return_index=True)

    try:
        _tids = bytes_to_ndarray(dm_tids_path, np.int32)
        _tids = _tids[sort_order]
    except FileNotFoundError:
        _tids = get_tids(_sids)

    full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
    full_data = full_data[sort_order, :]

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    # z-score normalise each column; constant columns yield NaN/inf, so zero them out
    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    if csv_filename:
        with open(csv_filename, 'w', encoding='utf-8') as f:
            if source == 'pca':
                f.write('Feature group\tAggregators\tNdims\tPCA explained\tPCA Dims'
                        '\tLabel prediction mean\tstdev\t{}\n'.format('\t'.join(unique_labels)))
            else:
                f.write('Feature group\tAggregators\tNdims\tLabel prediction mean\tstdev\t{}\n'
                        .format('\t'.join(unique_labels)))

    for ftgroup_name, feature_names in ftgroup_names.items():
        for agggroup_name, aggs in list(enabled_aggregators.items()) + [('all', None)]:
            if agggroup_name == 'all':
                aggs = [aggregator_map[x.name] for x in aggregations]
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]

            # Collect the column indices belonging to this feature/aggregator group
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggs:
                        col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)

            nfolds = int(np.floor(1 / valid_ratio + 0.01))
            ntrials = nfolds * niters

            label_prediction_scores = [0] * ntrials
            label_hitss = [0] * ntrials
            label_missess = [0] * ntrials

            label_hitrates = np.empty((ntrials, nlabels))
            label_hitrates[:] = np.nan
            importancess = np.empty((ntrials, data.shape[1]))
            cfmats = np.empty((ntrials, nlabels, nlabels))

            ind = 0
            bar = Bar('Features: {}. Aggregator: {}. Classifier: {}. Data type: {}...'
                      .format(ftgroup_name, agggroup_name, clsf_type, source), max=ntrials)

            for iteration in range(niters):
                trainvalidset, _ = dp.split(0, limits=(min_occur, int(np.floor(min_occur * 1.5))))
                trainvalidset.make_folds(nfolds, valid_ratio)

                for k in range(nfolds):
                    trainset, testset = trainvalidset.get_fold(k)
                    train_x = np.array(trainset.data)
                    train_y = np.array(trainset.labels, dtype=np.int32)
                    test_x = np.array(testset.data)
                    test_y = np.array(testset.labels, dtype=np.int32)

                    score, label_hits, label_misses, cfmat, importances = \
                        classifier(train_x, train_y, test_x, test_y, nlabels, True)

                    label_prediction_scores[ind] = score
                    label_hitss[ind] = label_hits
                    label_missess[ind] = label_misses

                    label_hitrate = label_hits / (label_hits + label_misses).astype(float)

                    label_hitrates[ind, :] = label_hitrate
                    importancess[ind, :] = importances
                    cfmats[ind, :, :] = cfmat

                    bar.next()
                    ind += 1
            bar.finish()

            mean_label_prediction_scores = np.nanmean(label_prediction_scores)
            std_label_prediction_scores = np.nanstd(label_prediction_scores)
            sum_cfmat = np.nansum(cfmats, axis=0)

            if csv_filename:
                with open(csv_filename, 'a', encoding='utf-8') as f:
                    if source == 'full':
                        f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            ftgroup_name, agggroup_name, ndims,
                            mean_label_prediction_scores, std_label_prediction_scores,
                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                    else:
                        f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                            ftgroup_name, agggroup_name, ndims, explained, pca_dims,
                            mean_label_prediction_scores, std_label_prediction_scores,
                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

                    # Summed confusion matrix over all trials, one row per true label
                    f.write('\t')
                    f.write('\t'.join(unique_labels))
                    f.write('\n')
                    for i in range(nlabels):
                        label = unique_labels[i]
                        cfrow = sum_cfmat[:, i]
                        f.write(label)
                        f.write('\t')
                        f.write('\t'.join(map(str, cfrow)))
                        f.write('\n')
                    f.write('\n')
            else:
                print('{}/{}: {} by {}: mean = {} std = {}'.format(
                    ftgroup_name, agggroup_name, clsf_type, source,
                    mean_label_prediction_scores, std_label_prediction_scores))
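# Hypothetical invocation of this command (the actual command name and
# argument spellings come from add_arguments, which is not shown here):
#   python manage.py <command_name> --clsf-type=rf --database-name=<db> \
#       --annotator-name=<user> --source=full --label-level=label \
#       --min-occur=10 --ratio=80:20 --niters=5 --csv-filename=results.tsv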