def get_syntactically_similar_pairs(request):
    # json.loads needs a string default, not a dict
    extra_args = json.loads(request.POST.get('extras', '{}'))
    granularity = extra_args['granularity']
    user = request.user

    database = get_user_databases(user)
    permission = database.get_assigned_permission(user)
    if permission < DatabasePermission.ANNOTATE:
        raise CustomAssertionError('You don\'t have permission to annotate this database')

    sids, tids = get_sids_tids(database)
    label_arr = get_syllable_labels(user, granularity, sids, on_no_label='set_blank')
    cls_labels, syl_label_enum_arr = np.unique(label_arr, return_inverse=True)

    enum2label = {enum: label for enum, label in enumerate(cls_labels)}
    sid2enumlabel = {sid: enum_label for sid, enum_label in zip(sids, syl_label_enum_arr)}

    adjacency_mat, classes_info = calc_class_ajacency(
        database, syl_label_enum_arr, enum2label, sid2enumlabel,
        count_style='forward', self_count='append')

    counter = Counter(syl_label_enum_arr)
    nlabels = len(counter)
    frequencies = np.array([counter[i] for i in range(nlabels)])

    return adjacency_mat.tolist(), frequencies.tolist(), cls_labels.tolist()
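# Hedged sketch (not part of the view above): how np.unique(..., return_inverse=True)
# yields the enum encoding and per-class frequencies used in
# get_syntactically_similar_pairs. All names and data below are illustrative only.
def _demo_label_enumeration():
    import numpy as np
    from collections import Counter

    label_arr = np.array(['b', 'a', 'b', 'c', 'a'])
    cls_labels, enum_arr = np.unique(label_arr, return_inverse=True)
    # cls_labels -> ['a' 'b' 'c'], enum_arr -> [1 0 1 2 0]

    enum2label = dict(enumerate(cls_labels))
    assert [enum2label[e] for e in enum_arr] == list(label_arr)

    counter = Counter(enum_arr)
    frequencies = np.array([counter[i] for i in range(len(cls_labels))])
    # frequencies -> [2 2 1]: one count per class, in cls_labels order
    return frequencies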
def create_full_tensor(database, recreate):
    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')
    features_hash = '-'.join(list(map(str, features.values_list('id', flat=True))))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))
    aggregators = [aggregator_map[x.name] for x in aggregations]

    full_tensor = FullTensorData.objects.filter(
        database=database, features_hash=features_hash,
        aggregations_hash=aggregations_hash).first()

    if full_tensor and not recreate:
        print('Full tensor {} already exists. If you want to recreate, turn on flag --recreate'
              .format(full_tensor.name))
        return full_tensor, False

    if full_tensor is None:
        full_tensors_name = uuid.uuid4().hex
        full_tensor = FullTensorData(name=full_tensors_name, database=database,
                                     features_hash=features_hash,
                                     aggregations_hash=aggregations_hash)

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()
    full_cols_path = full_tensor.get_cols_path()

    sids, tids = get_sids_tids(database)
    f2bs, fa2bs = get_binstorage_locations(features, aggregators)
    data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features, aggregators)

    ndarray_to_bytes(data, full_bytes_path)
    ndarray_to_bytes(sids, full_sids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)

    full_tensor.save()
    return full_tensor, True
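# Hedged sketch: the features_hash/aggregations_hash strings above are simply the
# ordered primary keys joined with '-'. A minimal stand-in, assuming the ids come
# from an ordered queryset:
def _demo_ids_hash():
    ids = [1, 3, 7]  # e.g. values_list('id', flat=True) of an ordered queryset
    return '-'.join(map(str, ids))  # -> '1-3-7'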
def prepare_data_for_analysis(self, pkl_filename, options):
    label_level = options['label_level']
    dbid = options['dbid']
    annotator_name = options['annotator_name']

    database = get_database(dbid)
    sids, tids = get_sids_tids(database)
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    label_arr = get_syllable_labels(annotator, label_level, sids)
    cls_labels, syl_label_enum_arr = np.unique(label_arr, return_inverse=True)

    enum2label = {enum: label for enum, label in enumerate(cls_labels)}
    sid2enumlabel = {sid: enum_label for sid, enum_label in zip(sids, syl_label_enum_arr)}

    adjacency_mat, classes_info = calc_class_ajacency(
        database, syl_label_enum_arr, enum2label, sid2enumlabel,
        count_style='symmetric', count_circular=False)

    dist_triu = calc_class_dist_by_adjacency(adjacency_mat, syl_label_enum_arr,
                                             return_triu=True)
    tree = linkage(dist_triu, method='average')

    saved_dict = dict(tree=tree, dbid=database.id, sids=sids,
                      unique_labels=label_arr, classes_info=classes_info)

    with open(pkl_filename, 'wb') as f:
        pickle.dump(saved_dict, f)

    return saved_dict
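# Hedged sketch: scipy's linkage() consumes a condensed (upper-triangular)
# distance vector like the dist_triu built above. Illustrative data only.
def _demo_average_linkage():
    import numpy as np
    from scipy.spatial.distance import pdist
    from scipy.cluster.hierarchy import linkage

    points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
    dist_triu = pdist(points, 'euclidean')  # condensed form, length n*(n-1)/2
    tree = linkage(dist_triu, method='average')
    # tree has n-1 rows: (cluster_a, cluster_b, distance, member_count)
    return tree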
def handle(self, *args, **options):
    database_name = options['database_name']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    niters = options['niters']
    profile = options.get('profile', None)

    tsv_file = profile + '.tsv'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

    open_mode = 'w'

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))
    features = Feature.objects.all().order_by('id').filter(name='spectrum')

    sids, tids = get_sids_tids(database)
    labels, no_label_ids = get_labels_by_sids(sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(sids, tids, labels, no_label_ids)

    full_data = extract_rawdata(tids, features)
    data = [x[0].T for x in full_data]

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    dp = OneHotSequenceProvider(data, labels, balanced=True)
    trainvalidset, testset = dp.split(test_ratio,
                                      limits=(min_occur, int(np.floor(min_occur * 1.5))))

    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    hidden_layer_sizes_choices = [
        (100,), (200,), (400,),
        (100, 100), (100, 200), (100, 400),
        (200, 100), (200, 200), (200, 400),
        (400, 100), (400, 200), (400, 400),
    ]

    choices = {'cnn': {'hidden_layer_sizes': hidden_layer_sizes_choices}}

    best_trial_args_values = {}
    for arg_name, arg_values in choices['cnn'].items():
        losses = []
        ids = []

        def loss_func(params):
            arg_value = params[0]
            classifier_args = best_trial_args_values.copy()
            classifier_args[arg_name] = arg_value
            print('classifier_args = {}'.format(classifier_args))
            score = perform_k_fold(cnn, trainvalidset, nfolds, v2t_ratio, nlabels,
                                   **classifier_args)
            return 1. - score

        for idx, arg_value in enumerate(arg_values):
            loss = loss_func((arg_value,))
            ids.append(idx)
            losses.append(loss)

        best_loss_idx = np.argmin(losses)
        best_arg_value = arg_values[best_loss_idx]
        best_trial_args_values[arg_name] = best_arg_value

        model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
        model_args_values = {x: [] for x in model_args}
        for idx, loss in enumerate(losses):
            if idx == best_loss_idx:
                idx_str = 'Best'
            else:
                idx_str = str(idx)
            # trial_args_values = trial['misc']['vals']
            for arg_name_ in model_args:
                if arg_name_ == 'id':
                    model_args_values['id'].append(idx_str)
                elif arg_name_ == 'accuracy':
                    trial_accuracy = 1. - loss
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    if arg_name_ == arg_name:
                        val = arg_values[idx]
                    else:
                        val = best_trial_args_values[arg_name_]
                    model_args_values[arg_name_].append(val)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')
        open_mode = 'a'

    # Perform classification on the test set
    nfolds = int(np.floor(1 / test_ratio + 0.01))
    ntrials = nfolds * niters
    label_prediction_scores = [0] * ntrials
    label_hitss = [0] * ntrials
    label_missess = [0] * ntrials

    label_hitrates = np.empty((ntrials, nlabels))
    label_hitrates[:] = np.nan
    # data is a list of arrays, so take the column count from its first element
    importancess = np.empty((ntrials, data[0].shape[1]))
    cfmats = np.ndarray((ntrials, nlabels, nlabels))

    ind = 0
    bar = Bar('Running CNN', max=ntrials)
    for _ in range(niters):
        traintestset, _unused = dp.split(0, limits=(ipc_min, ipc_max))
        traintestset.make_folds(nfolds, test_ratio)
        for k in range(nfolds):
            trainset, testset = traintestset.get_fold(k)
            train_x = np.array(trainset.data)
            train_y = np.array(trainset.labels, dtype=np.int32)
            test_x = np.array(testset.data)
            test_y = np.array(testset.labels, dtype=np.int32)

            score, label_hits, label_misses, cfmat, importances = \
                cnn(train_x, train_y, test_x, test_y, nlabels, True,
                    **best_trial_args_values)

            label_prediction_scores[ind] = score
            label_hitss[ind] = label_hits
            label_missess[ind] = label_misses

            label_hitrate = label_hits / (label_hits + label_misses).astype(float)

            label_hitrates[ind, :] = label_hitrate
            importancess[ind, :] = importances
            cfmats[ind, :, :] = cfmat

            bar.next()
            ind += 1
    bar.finish()

    mean_label_prediction_scores = np.nanmean(label_prediction_scores)
    std_label_prediction_scores = np.nanstd(label_prediction_scores)
    sum_cfmat = np.nansum(cfmats, axis=0)

    with open(tsv_file, open_mode, encoding='utf-8') as f:
        f.write('Results using best-model\'s parameters on testset\n')
        f.write('Feature group\tLabel prediction mean\tstdev\t{}\n'
                .format('\t'.join(unique_labels)))
        f.write('{}\t{}\t{}\t{}\n'.format(
            'Spectrum', mean_label_prediction_scores, std_label_prediction_scores,
            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
        f.write('\t')
        f.write('\t'.join(unique_labels))
        f.write('\n')
        for i in range(nlabels):
            label = unique_labels[i]
            cfrow = sum_cfmat[:, i]
            f.write(label)
            f.write('\t')
            f.write('\t'.join(map(str, cfrow)))
            f.write('\n')
        f.write('\n')
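# Hedged sketch: how the k-fold count above is derived from a train:valid:test
# split. get_ratios is project-specific; the ratios here are hard-coded purely
# to show the arithmetic.
def _demo_nfolds_from_ratios():
    import numpy as np
    train_ratio, valid_ratio, test_ratio = 0.8, 0.1, 0.1
    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)  # validation share of train+valid
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))  # -> 9; +0.01 guards against float error
    return nfolds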
def handle(self, *args, **options):
    database_name = options['database_name']
    annotator_name = options['annotator_name']
    # population_name = options['population_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    no_gpu = options['no_gpu']

    # feature_group = options['feature_group']
    # if feature_group:
    #     feature_names = feature_names.split(',')
    #     features = Feature.objects.filter(name__in=feature_names).order_by('id')
    # else:
    #     features = Feature.objects.all().order_by('id')
    #
    # features = features.exclude(is_fixed_length=True)

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = Feature.objects.all().order_by('id')
    # features = list(features)[:4]
    enabled_features = []
    for f in features:
        if f.name in feature_map:
            enabled_features.append(f)

    sids, tids = get_sids_tids(database)
    labels, no_label_ids = get_labels_by_sids(sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(sids, tids, labels, no_label_ids)

    full_data = extract_rawdata(tids, enabled_features)
    feature_inds = {x.name: idx for idx, x in enumerate(enabled_features)}

    for ftgroup_name in ftgroup_names + ['all']:
        data = []
        if ftgroup_name == 'all':
            features = feature_whereabout_flat
        else:
            features = feature_whereabout[ftgroup_name]

        ftgroup_col_inds = []
        for feature_name, is_fixed_length, _ in features:
            col_name = feature_name
            feature_ind = feature_inds.get(col_name, None)
            if feature_ind is not None:
                ftgroup_col_inds.append(feature_ind)

        for full_row, sid in zip(full_data, sids):
            row = [full_row[x] for x in ftgroup_col_inds]
            try:
                row = np.vstack(row).T
            except ValueError:
                print('Encountered error at id={}'.format(sid))
                for idx, (feature_name, is_fixed_length, _) in enumerate(features):
                    print('{} - {}'.format(feature_name, row[idx].shape))
            data.append(row)

        data_provider = OneHotSequenceProvider(data, labels, balanced=True)
        model_name = '{}_{}_{}'.format(database_name, label_level, ftgroup_name)
        print('Training for: {}'.format(model_name))
        train(data_provider, name=model_name, disable_gpu=no_gpu)
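# Hedged sketch: why np.vstack(row).T can raise ValueError above -- every
# feature's frame series must have the same length before they can be stacked
# into a (time, nfeatures) matrix. Shapes below are illustrative only.
def _demo_stack_feature_rows():
    import numpy as np

    frame_count = 50
    pitch = np.random.rand(frame_count)    # one value per frame
    entropy = np.random.rand(frame_count)  # must match pitch's frame count
    row = np.vstack([pitch, entropy]).T    # shape (50, 2): time x features
    assert row.shape == (frame_count, 2)
    return row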
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options.get('profile', None)
    load_dir = options['load_dir']

    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio = get_ratios(ratio_, 2)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    _sids, _tids = get_sids_tids(database)
    _labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        _sids, _tids, _labels = exclude_no_labels(_sids, _tids, _labels, no_label_ids)

    unique_labels, enum_labels = np.unique(_labels, return_inverse=True)
    fold = split_classwise(enum_labels, ratio=valid_ratio,
                           limits=(min_occur, int(np.floor(min_occur * 1.5))),
                           nfolds=1, balanced=True)
    train = fold[0]['train']
    test = fold[0]['test']
    all_indices = np.concatenate((train, test))

    tids = _tids[all_indices]
    labels = _labels[all_indices]

    with open('/tmp/hyperopt.pkl', 'rb') as f:
        saved = pickle.load(f)

    performance_data = saved[clsf_type]
    accuracies = performance_data['accuracies']
    groups = performance_data['groups']
    params = performance_data['params']

    group_name = '{}-{}'.format('mfcc', source)
    group_member_inds = np.where(groups == group_name)
    group_accuracies = accuracies[group_member_inds]
    best_acc_idx = np.argmax(group_accuracies)

    group_params = {}
    best_params = {}
    for param_name in params:
        param_values = np.array(params[param_name])
        group_param_values = param_values[group_member_inds]
        group_params[param_name] = group_param_values
        converter = converters[clsf_type][param_name]
        best_params[param_name] = converter(group_param_values[best_acc_idx])

    params_names = []
    params_converters = []
    params_count = 0

    v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
    nfolds = int(np.floor(1. / v2t_ratio + 0.01))

    def loss(params):
        mfcc_args = {}
        for i in range(params_count):
            param_name = params_names[i]
            param_converter = params_converters[i]
            param_value = params[i]
            mfcc_args[param_name] = param_converter(param_value)

        _fmin = mfcc_args['fmin']
        _fmax = mfcc_args['fmax']
        _ncep = mfcc_args['ncep']

        extract_mfcc_multiparams(database_name, load_dir, _ncep, _fmin, _fmax)

        data = []
        tid2rows = {tid: [] for tid in tids}

        for aggregator in aggregators:
            agg_saved_file = 'database={}-feature=mfcc-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                .format(database_name, aggregator.get_name(), _fmin, _fmax, _ncep)
            agg_saved_file_loc = os.path.join(load_dir, agg_saved_file)
            with open(agg_saved_file_loc, 'rb') as f:
                tid2aval = pickle.load(f)

            for tid in tids:
                val = tid2aval[tid]
                row = tid2rows[tid]
                row.append(val)

        for tid in tids:
            row = tid2rows[tid]
            row = np.hstack(row).T
            data.append(row)

        data = np.array(data)
        data = zscore(data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))

        score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio, nlabels,
                               **best_params)
        return 1. - score

    ncep_choices = hp.uniform('ncep', 13, 48)
    fmin_choices = hp.uniform('fmin', 0, 5)
    fmax_choices = hp.uniform('fmax', 8, 24)

    mfcc_params = {
        'ncep': (lambda x: int(np.round(x)), ncep_choices),
        'fmin': (lambda x: int(np.round(x) * 100), fmin_choices),
        'fmax': (lambda x: int(np.round(x) * 1000), fmax_choices),
    }

    space = []
    for arg_name, (converter, arg_values) in mfcc_params.items():
        space.append(arg_values)
        params_names.append(arg_name)
        params_converters.append(converter)
        params_count += 1

    trials = Trials()
    best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
    print(best)

    with open(trials_file, 'wb') as f:
        pickle.dump(trials, f)

    best_trial = trials.best_trial
    best_trial_args_values_ = best_trial['misc']['vals']
    best_trial_args_values = {}
    for arg_name, arg_values in best_trial_args_values_.items():
        converter = mfcc_params[arg_name][0]
        arg_value = converter(arg_values[0])
        best_trial_args_values[arg_name] = arg_value

    model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
    model_args_values = {x: [] for x in model_args}
    for idx, trial in enumerate(trials.trials):
        if trial == best_trial:
            idx = 'Best'
        trial_args_values = trial['misc']['vals']
        for arg_name in model_args:
            if arg_name == 'id':
                model_args_values['id'].append(idx)
            elif arg_name == 'accuracy':
                trial_accuracy = 1. - trial['result']['loss']
                model_args_values['accuracy'].append(trial_accuracy)
            else:
                converter = mfcc_params[arg_name][0]
                val = converter(trial_args_values[arg_name][0])
                model_args_values[arg_name].append(val)

    with open(tsv_file, open_mode, encoding='utf-8') as f:
        for arg in model_args:
            values = model_args_values[arg]
            f.write('{}\t'.format(arg))
            f.write('\t'.join(map(str, values)))
            f.write('\n')
    open_mode = 'a'
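# Hedged sketch: the hyperopt pattern used above (a uniform search space plus a
# converter applied inside the loss), on a toy objective. Requires the hyperopt
# package; the objective and names are illustrative only.
def _demo_hyperopt_fmin():
    import numpy as np
    from hyperopt import fmin, tpe, hp, Trials

    converter = lambda x: int(np.round(x))  # search over floats, consume ints

    def loss(params):
        ncep = converter(params[0])
        return (ncep - 26) ** 2  # toy objective, minimised at ncep=26

    trials = Trials()
    best = fmin(fn=loss, space=[hp.uniform('ncep', 13, 48)],
                algo=tpe.suggest, max_evals=30, trials=trials)
    return best, trials.best_trial['result']['loss']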
def handle(self, *args, **options):
    clsf_type = options['clsf_type']
    database_name = options['database_name']
    source = options['source']
    annotator_name = options['annotator_name']
    label_level = options['label_level']
    min_occur = options['min_occur']
    ipc = options['ipc']
    ratio_ = options['ratio']
    profile = options['profile']
    agg = options['agg']

    tsv_file = profile + '.tsv'
    trials_file = profile + '.trials'

    if ipc is not None:
        assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
        ipc_min = ipc
        ipc_max = ipc
    else:
        ipc_min = min_occur
        ipc_max = int(np.floor(min_occur * 1.5))

    train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

    open_mode = 'w'

    assert clsf_type in classifiers.keys(), 'Unknown classifier: {}'.format(clsf_type)
    classifier = classifiers[clsf_type]

    database = get_or_error(Database, dict(name__iexact=database_name))
    annotator = get_or_error(User, dict(username__iexact=annotator_name))

    features = list(feature_map.values())
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

    if agg == 'all':
        aggregators = [aggregator_map[x.name] for x in aggregations]
    else:
        aggregators = enabled_aggregators[agg]

    _sids, _tids = get_sids_tids(database)
    full_data, col_inds = extract_rawdata(_tids, features, aggregators)

    labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

    if len(no_label_ids) > 0:
        sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
        lookup_ids_rows = np.searchsorted(_sids, sids)
        full_data = full_data[lookup_ids_rows, :]

    full_data = zscore(full_data)
    full_data[np.where(np.isnan(full_data))] = 0
    full_data[np.where(np.isinf(full_data))] = 0

    unique_labels = np.unique(labels)
    nlabels = len(unique_labels)

    for ftgroup_name, feature_names in ftgroup_names.items():
        if ftgroup_name == 'all':
            features = list(feature_map.values())
        else:
            features = [feature_map[x] for x in feature_names]

        ft_col_inds = []
        for feature in features:
            if feature.is_fixed_length:
                col_name = feature.name
                col_range = col_inds[col_name]
                ft_col_inds += range(col_range[0], col_range[1])
            else:
                for aggregator in aggregators:
                    col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])

        ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
        ndims = len(ft_col_inds)
        data = full_data[:, ft_col_inds]

        if source == 'pca':
            explained, data = pca_optimal(data, ndims, 0.9)
            pca_dims = data.shape[1]

        dp = EnumDataProvider(data, labels, balanced=True)
        trainvalidset, testset = dp.split(test_ratio, limits=(ipc_min, ipc_max))

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        params_names = []
        params_converters = []
        params_count = 0

        def loss(params):
            classifier_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                classifier_args[param_name] = param_converter(param_value)

            print(classifier_args)
            score = perform_k_fold(classifier, trainvalidset, nfolds, v2t_ratio,
                                   nlabels, **classifier_args)
            return 1. - score

        n_estimators_choices = hp.uniform('n_estimators', 40, 100)
        min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
        min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)
        n_features = data.shape[1]
        auto_gamma = 1 / n_features
        gamma_choices = hp.uniform('gamma', auto_gamma / 10, auto_gamma * 10)
        c_choices = hp.uniform('C', -1, 2)
        hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100, 5000)
        n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

        choices = {
            'rf': {
                'n_estimators': (lambda x: int(np.round(x)), n_estimators_choices),
                'min_samples_split': (lambda x: int(np.round(x)), min_samples_split_choices),
                'min_samples_leaf': (lambda x: int(np.round(x)), min_samples_leaf_choices),
            },
            'svm_rbf': {
                'gamma': (float, gamma_choices),
                'C': (lambda x: 10 ** x, c_choices),
            },
            'svm_linear': {
                'C': (lambda x: 10 ** x, c_choices),
            },
            'nnet': {
                'hidden_layer_sizes': (lambda x: (int(np.round(x)),), hidden_layer_size_choices)
            },
            'knn': {
                'n_neighbors': (lambda x: int(np.round(x)), n_neighbors_choices)
            }
        }

        space = []
        for arg_name, (converter, arg_values) in choices[clsf_type].items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        max_evals = params_count * 10
        best = fmin(fn=loss, space=space, algo=tpe.suggest, max_evals=max_evals,
                    trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = choices[clsf_type][arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        model_args = ['id'] + list(best_trial_args_values.keys()) + ['accuracy']
        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    # choice = choices[clsf_type][arg_name]
                    converter = choices[clsf_type][arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    # val = choice[choice_idx]
                    model_args_values[arg_name].append(val)

        # Perform classification on the test set
        train_x = np.array(trainvalidset.data)
        train_y = np.array(trainvalidset.labels, dtype=np.int32)
        test_x = np.array(testset.data)
        test_y = np.array(testset.labels, dtype=np.int32)

        score, label_hits, label_misses, cfmat, importances = \
            classifier(train_x, train_y, test_x, test_y, nlabels, True,
                       **best_trial_args_values)
        lb_hitrates = label_hits / (label_hits + label_misses).astype(float)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')

            f.write('Results using best-model\'s parameters on testset\n')
            if source == 'full':
                f.write('Feature group\tNdims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, score, '\t'.join(map(str, lb_hitrates))))
            else:
                f.write('Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t'.join(unique_labels)))
                f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    ftgroup_name, ndims, explained, pca_dims, score,
                    '\t'.join(map(str, lb_hitrates))))
            f.write('\n')

        open_mode = 'a'
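# Hedged sketch: the z-score normalisation and NaN/Inf scrub applied to
# full_data above. A constant column has zero variance, so zscore produces NaN
# there; the scrub replaces those entries with 0. Illustrative data only.
def _demo_zscore_cleanup():
    import numpy as np
    from scipy.stats import zscore

    data = np.array([[1.0, 5.0], [2.0, 5.0], [3.0, 5.0]])  # 2nd column constant
    data = zscore(data)
    data[np.where(np.isnan(data))] = 0
    data[np.where(np.isinf(data))] = 0
    return data  # column 0 standardised, column 1 all zeros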
def handle(self, database_name, population_name, type, perplexity, normalised, *args, **kwargs):
    database = get_or_error(Database, dict(name__iexact=database_name))
    assert type in ['tsne2', 'tsne3', 'mds', 'mdspca']

    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')
    features_hash = '-'.join(list(map(str, features.values_list('id', flat=True))))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

    full_tensor = FullTensorData.objects.filter(
        database=database, features_hash=features_hash,
        aggregations_hash=aggregations_hash).first()

    if full_tensor is None:
        raise Exception('Full feature matrix not found. Need to create FullTensor first.')

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    full_sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(full_sids))

    sids, tids = get_sids_tids(database, population_name)

    normalised_str = 'normed' if normalised else 'raw'
    if type.startswith('tsne'):
        file_name = '{}_{}_{}_{}_{}.pkl'.format(database_name, population_name, type,
                                                perplexity, normalised_str)
    else:
        file_name = '{}_{}_{}_{}.pkl'.format(database_name, population_name, type,
                                             normalised_str)

    if os.path.isfile(file_name):
        with open(file_name, 'rb') as f:
            saved = pickle.load(f)
        coordinate = saved['coordinate']
        stress = saved['stress']
    else:
        population_data = cherrypick_tensor_data_by_sids(full_data, full_sids,
                                                         sids).astype(np.float64)

        if normalised:
            population_data = zscore(population_data)
            population_data[np.where(np.isnan(population_data))] = 0
            population_data[np.where(np.isinf(population_data))] = 0

        if type.startswith('mds'):
            if type == 'mdspca':
                dim_reduce_func = PCA(n_components=50)
                population_data = dim_reduce_func.fit_transform(population_data, y=None)
                if hasattr(dim_reduce_func, 'explained_variance_ratio_'):
                    print('Cumulative explained variation for {} principal components: {}'
                          .format(50, np.sum(dim_reduce_func.explained_variance_ratio_)))

            similarities = squareform(pdist(population_data, 'euclidean'))
            model = MDS(n_components=3, dissimilarity='precomputed', random_state=7,
                        verbose=1, max_iter=1000)
            coordinate = model.fit_transform(similarities)
            stress = model.stress_
        else:
            ntsne_dims = int(type[4:])
            dim_reduce_func = PCA(n_components=50)
            population_data = dim_reduce_func.fit_transform(population_data, y=None)
            print('Cumulative explained variation: {}'
                  .format(np.sum(dim_reduce_func.explained_variance_ratio_)))

            time_start = time.time()
            tsne = TSNE(n_components=ntsne_dims, verbose=1, perplexity=perplexity,
                        n_iter=4000)
            coordinate = tsne.fit_transform(population_data)
            print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
            stress = None

        with open(file_name, 'wb') as f:
            pickle.dump(dict(coordinate=coordinate, stress=stress, sids=sids, tids=tids),
                        f, protocol=pickle.HIGHEST_PROTOCOL)
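# Hedged sketch: the PCA -> t-SNE pipeline used in the 'tsne*' branch above, on
# random data. Note perplexity must be smaller than the number of samples.
# Shapes and parameter values are illustrative only.
def _demo_pca_tsne():
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    population_data = np.random.rand(100, 60)
    reduced = PCA(n_components=50).fit_transform(population_data)
    coordinate = TSNE(n_components=2, perplexity=10.0).fit_transform(reduced)
    return coordinate  # shape (100, 2)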
def extract_mfcc_multiparams(database_name, save_dir, ncep, fmin, fmax):
    xtra_args = dict(ncep=ncep, fmin=fmin, fmax=fmax)
    features = Feature.objects.filter(name='mfcc')

    database = get_or_error(Database, dict(name__iexact=database_name))
    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    sids, tids = get_sids_tids(database)

    segments = Segment.objects.filter(id__in=sids)
    vals = list(segments.order_by('audio_file', 'start_time_ms')
                .values_list('audio_file__name', 'tid', 'start_time_ms', 'end_time_ms'))

    af_to_segments = {}
    for afname, tid, start, end in vals:
        if afname not in af_to_segments:
            af_to_segments[afname] = []
        segs_info = af_to_segments[afname]
        segs_info.append((tid, start, end))

    for feature in features:
        tid2fval = {}
        saved_file = 'database={}-feature={}-fmin={}-fmax={}-ncep={}.pkl'\
            .format(database_name, feature.name, fmin, fmax, ncep)
        saved_file_loc = os.path.join(save_dir, saved_file)

        if os.path.isfile(saved_file_loc):
            print('{} already exists. Skip'.format(saved_file_loc))
            continue

        bar = Bar('Extracting to {}'.format(saved_file_loc), max=len(af_to_segments))
        for song_name, segs_info in af_to_segments.items():
            wav_file_path = wav_path(song_name)
            __tids, __fvals = extract_segment_feature_for_audio_file(
                wav_file_path, segs_info, feature, **xtra_args)
            bar.next()
            for tid, fval in zip(__tids, __fvals):
                tid2fval[tid] = fval
        bar.finish()

        with open(saved_file_loc, 'wb') as f:
            pickle.dump(tid2fval, f)

        bar = Bar('Aggregating...', max=len(aggregators))
        for aggregator in aggregators:
            tid2aval = {}
            agg_saved_file = 'database={}-feature={}-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                .format(database_name, feature.name, aggregator.get_name(), fmin, fmax, ncep)
            agg_saved_file_loc = os.path.join(save_dir, agg_saved_file)

            if os.path.isfile(agg_saved_file_loc):
                print('{} already exists. Skip'.format(agg_saved_file_loc))
                continue

            for tid, fval in tid2fval.items():
                aggregated = aggregator.process(fval)
                tid2aval[tid] = aggregated
            bar.next()

            with open(agg_saved_file_loc, 'wb') as f:
                pickle.dump(tid2aval, f)
        bar.finish()
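# Hedged sketch: the per-audio-file grouping built in extract_mfcc_multiparams,
# written with defaultdict instead of the explicit membership check. The rows
# below stand in for the (audio_file__name, tid, start, end) tuples from the ORM.
def _demo_group_segments_by_file():
    from collections import defaultdict

    vals = [('song_a.wav', 11, 0, 500), ('song_a.wav', 12, 500, 900),
            ('song_b.wav', 21, 0, 300)]
    af_to_segments = defaultdict(list)
    for afname, tid, start, end in vals:
        af_to_segments[afname].append((tid, start, end))
    return dict(af_to_segments)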