Example #1
0
def get_syntactically_similar_pairs(request):
    """Return class-adjacency data for the requesting user's syllable labels.

    Reads an optional JSON object from POST['extras'] for the label
    granularity, checks the user has at least ANNOTATE permission on their
    current database, then enumerates the syllable labels and computes a
    class adjacency matrix (forward counting, self-counts appended).

    :param request: Django request; POST may carry an 'extras' JSON string
                    containing the key 'granularity'.
    :return: tuple of (adjacency matrix as nested lists,
                       per-class frequencies as a list,
                       unique class labels as a list)
    :raises CustomAssertionError: if the user lacks ANNOTATE permission.
    """
    # BUG FIX: the fallback must be a JSON *string* — json.loads({}) raises
    # TypeError whenever 'extras' is absent from the POST body.
    extra_args = json.loads(request.POST.get('extras', '{}'))
    granularity = extra_args['granularity']
    user = request.user
    database = get_user_databases(user)
    permission = database.get_assigned_permission(user)
    if permission < DatabasePermission.ANNOTATE:
        raise CustomAssertionError(
            'You don\'t have permission to annotate this database')

    sids, tids = get_sids_tids(database)

    # Unlabelled syllables get a blank label rather than being dropped.
    label_arr = get_syllable_labels(user,
                                    granularity,
                                    sids,
                                    on_no_label='set_blank')
    cls_labels, syl_label_enum_arr = np.unique(label_arr, return_inverse=True)

    enum2label = {enum: label for enum, label in enumerate(cls_labels)}
    sid2enumlabel = {
        sid: enum_label
        for sid, enum_label in zip(sids, syl_label_enum_arr)
    }

    adjacency_mat, classes_info = calc_class_ajacency(database,
                                                      syl_label_enum_arr,
                                                      enum2label,
                                                      sid2enumlabel,
                                                      count_style='forward',
                                                      self_count='append')
    # Frequencies indexed by enum value 0..nlabels-1 (np.unique's ordering).
    counter = Counter(syl_label_enum_arr)
    nlabels = len(counter)
    frequencies = np.array([counter[i] for i in range(nlabels)])

    return adjacency_mat.tolist(), frequencies.tolist(), cls_labels.tolist()
def create_full_tensor(database, recreate):
    """Fetch or build the FullTensorData record for *database*.

    The tensor is keyed by hashes over all Feature and Aggregation ids.  If a
    matching tensor already exists and ``recreate`` is falsy, it is returned
    untouched.  Otherwise the raw feature data is extracted and written to the
    tensor's bytes/sids/cols files and the record is saved.

    :return: tuple (full_tensor, created) where ``created`` is True only when
             the tensor's data files were (re)written.
    """
    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')

    # Identity of the tensor = the exact set of features/aggregations used.
    features_hash = '-'.join(
        str(pk) for pk in features.values_list('id', flat=True))
    aggregations_hash = '-'.join(
        str(pk) for pk in aggregations.values_list('id', flat=True))
    aggregators = [aggregator_map[agg.name] for agg in aggregations]

    full_tensor = FullTensorData.objects.filter(
        database=database,
        features_hash=features_hash,
        aggregations_hash=aggregations_hash).first()

    if full_tensor is not None and not recreate:
        print(
            'Full tensor {} already exists. If you want to recreate, turn on flag --recreate'
            .format(full_tensor.name))
        return full_tensor, False

    if full_tensor is None:
        # Brand-new tensor gets a random hex name.
        full_tensor = FullTensorData(name=uuid.uuid4().hex,
                                     database=database,
                                     features_hash=features_hash,
                                     aggregations_hash=aggregations_hash)

    sids, tids = get_sids_tids(database)
    f2bs, fa2bs = get_binstorage_locations(features, aggregators)
    data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features, aggregators)

    # Persist the matrix, the syllable ids, and the column index map.
    ndarray_to_bytes(data, full_tensor.get_bytes_path())
    ndarray_to_bytes(sids, full_tensor.get_sids_path())

    with open(full_tensor.get_cols_path(), 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)

    full_tensor.save()
    return full_tensor, True
    def prepare_data_for_analysis(self, pkl_filename, options):
        """Build a label-class distance hierarchy and pickle it.

        Looks up the database and annotator from ``options``, enumerates the
        annotator's syllable labels, computes a symmetric class adjacency
        matrix, converts it into a condensed distance matrix and runs
        average-linkage hierarchical clustering.

        :param pkl_filename: path the result dict is pickled to.
        :param options: dict with keys 'label_level', 'dbid', 'annotator_name'.
        :return: the saved dict (tree, dbid, sids, unique_labels, classes_info).
        """
        database = get_database(options['dbid'])
        annotator = get_or_error(
            User, dict(username__iexact=options['annotator_name']))
        sids, tids = get_sids_tids(database)

        label_arr = get_syllable_labels(annotator, options['label_level'],
                                        sids)
        cls_labels, enum_arr = np.unique(label_arr, return_inverse=True)

        # Map enum value -> label string, and syllable id -> enum value.
        enum2label = dict(enumerate(cls_labels))
        sid2enumlabel = dict(zip(sids, enum_arr))

        adjacency_mat, classes_info = calc_class_ajacency(
            database,
            enum_arr,
            enum2label,
            sid2enumlabel,
            count_style='symmetric',
            count_circular=False)

        # Condensed (upper-triangular) distance form, as linkage() expects.
        dist_triu = calc_class_dist_by_adjacency(adjacency_mat,
                                                 enum_arr,
                                                 return_triu=True)
        tree = linkage(dist_triu, method='average')

        saved_dict = dict(tree=tree,
                          dbid=database.id,
                          sids=sids,
                          unique_labels=label_arr,
                          classes_info=classes_info)

        with open(pkl_filename, 'wb') as f:
            pickle.dump(saved_dict, f)

        return saved_dict
Example #4
0
    def handle(self, *args, **options):
        """Exhaustively search CNN hyper-parameters, then evaluate the best.

        Phase 1: for each searchable argument (here only hidden layer
        topology), try every candidate value via k-fold cross validation and
        keep the best; each sweep's results are written to <profile>.tsv.
        Phase 2: retrain/evaluate the best configuration over
        niters x nfolds train/test splits and append aggregate scores and the
        summed confusion matrix to the same file.
        """
        database_name = options['database_name']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        niters = options['niters']
        profile = options.get('profile', None)
        # NOTE(review): profile defaults to None, which would make this
        # concatenation raise TypeError — presumably --profile is mandatory;
        # confirm against the command's argument parser.
        tsv_file = profile + '.tsv'

        # Instances-per-class bounds: fixed when --ipc is given, otherwise
        # the range [min_occur, floor(1.5 * min_occur)].
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed as min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        # First TSV write truncates; subsequent writes append (see below).
        open_mode = 'w'

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))
        features = Feature.objects.all().order_by('id').filter(name='spectrum')

        sids, tids = get_sids_tids(database)
        labels, no_label_ids = get_labels_by_sids(sids, label_level, annotator,
                                                  min_occur)

        # Drop syllables the annotator has not labelled (below min_occur).
        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(sids, tids, labels,
                                                   no_label_ids)

        full_data = extract_rawdata(tids, features)
        # One (time x bins) matrix per syllable — `data` is a Python list.
        data = [x[0].T for x in full_data]

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        dp = OneHotSequenceProvider(data, labels, balanced=True)
        trainvalidset, testset = dp.split(
            test_ratio, limits=(min_occur, int(np.floor(min_occur * 1.5))))

        # nfolds derived from the valid:(train+valid) ratio.
        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        hidden_layer_sizes_choices = [
            (100, ),
            (200, ),
            (400, ),
            (100, 100),
            (100, 200),
            (100, 400),
            (200, 100),
            (200, 200),
            (200, 400),
            (400, 100),
            (400, 200),
            (400, 400),
        ]

        # Only one hyper-parameter is swept for the CNN.
        choices = {'cnn': {'hidden_layer_sizes': hidden_layer_sizes_choices}}

        best_trial_args_values = {}

        for arg_name, arg_values in choices['cnn'].items():
            losses = []
            ids = []

            # Loss = 1 - k-fold accuracy of the candidate value merged with
            # the best values found in earlier sweeps.
            def loss_func(params):
                arg_value = params[0]
                classifier_args = best_trial_args_values.copy()
                classifier_args[arg_name] = arg_value
                print('classifier_args = {}'.format(classifier_args))
                score = perform_k_fold(cnn, trainvalidset, nfolds, v2t_ratio,
                                       nlabels, **classifier_args)
                return 1. - score

            for idx, arg_value in enumerate(arg_values):
                loss = loss_func((arg_value, ))
                ids.append(idx)
                losses.append(loss)

            best_loss_idx = np.argmin(losses)
            best_arg_value = arg_values[best_loss_idx]
            best_trial_args_values[arg_name] = best_arg_value

            # TSV columns: trial id, each searched/chosen arg, accuracy.
            model_args = ['id'] + list(
                best_trial_args_values.keys()) + ['accuracy']

            model_args_values = {x: [] for x in model_args}
            for idx, loss in enumerate(losses):
                if idx == best_loss_idx:
                    idx_str = 'Best'
                else:
                    idx_str = str(idx)
                # trial_args_values = trial['misc']['vals']
                for arg_name_ in model_args:
                    if arg_name_ == 'id':
                        model_args_values['id'].append(idx_str)
                    elif arg_name_ == 'accuracy':
                        trial_accuracy = 1. - loss
                        model_args_values['accuracy'].append(trial_accuracy)
                    else:
                        if arg_name_ == arg_name:
                            val = arg_values[idx]
                        else:
                            val = best_trial_args_values[arg_name_]
                        model_args_values[arg_name_].append(val)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                for arg in model_args:
                    values = model_args_values[arg]
                    f.write('{}\t'.format(arg))
                    f.write('\t'.join(map(str, values)))
                    f.write('\n')
                # After the first sweep, subsequent writes append.
                open_mode = 'a'

        # Perform classification on the test set
        nfolds = int(np.floor(1 / test_ratio + 0.01))
        ntrials = nfolds * niters
        label_prediction_scores = [0] * ntrials
        label_hitss = [0] * ntrials
        label_missess = [0] * ntrials
        label_hitrates = np.empty((ntrials, nlabels))
        label_hitrates[:] = np.nan
        # NOTE(review): `data` is a plain list here (built above from
        # full_data), and lists have no .shape — this line likely raises
        # AttributeError. Verify the intended source of this dimension.
        importancess = np.empty((ntrials, data.shape[1]))
        cfmats = np.ndarray((ntrials, nlabels, nlabels))
        ind = 0

        bar = Bar('Running CNN', max=ntrials)

        # NOTE: `iter` shadows the builtin of the same name within this loop.
        for iter in range(niters):
            traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
            traintetset.make_folds(nfolds, test_ratio)

            for k in range(nfolds):
                trainset, testset = traintetset.get_fold(k)
                train_x = np.array(trainset.data)
                train_y = np.array(trainset.labels, dtype=np.int32)
                test_x = np.array(testset.data)
                test_y = np.array(testset.labels, dtype=np.int32)

                score, label_hits, label_misses, cfmat, importances =\
                    cnn(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)

                label_prediction_scores[ind] = score
                label_hitss[ind] = label_hits
                label_missess[ind] = label_misses

                # NOTE(review): np.float was removed in NumPy 1.20+ — this
                # needs `float` (or np.float64) on modern NumPy; confirm the
                # pinned NumPy version.
                label_hitrate = label_hits / (label_hits +
                                              label_misses).astype(np.float)

                label_hitrates[ind, :] = label_hitrate
                importancess[ind, :] = importances
                cfmats[ind, :, :] = cfmat

                bar.next()
                ind += 1
        bar.finish()

        mean_label_prediction_scores = np.nanmean(label_prediction_scores)
        std_label_prediction_scores = np.nanstd(label_prediction_scores)
        sum_cfmat = np.nansum(cfmats, axis=0)

        # Append the test-set summary and the summed confusion matrix.
        with open(tsv_file, open_mode, encoding='utf-8') as f:
            f.write('Results using best-model\'s paramaters on testset\n')
            f.write('Feature group\tLabel prediction mean\tstdev\t{}\n'.format(
                '\t '.join(unique_labels)))
            f.write('{}\t{}\t{}\t{}\n'.format(
                'Spectrum', mean_label_prediction_scores,
                std_label_prediction_scores,
                '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

            f.write('\t')
            f.write('\t'.join(unique_labels))
            f.write('\n')
            for i in range(nlabels):
                label = unique_labels[i]
                cfrow = sum_cfmat[:, i]
                f.write(label)
                f.write('\t')
                f.write('\t'.join(map(str, cfrow)))
                f.write('\n')
            f.write('\n')
Example #5
0
File: lstm.py Project: jren2019/koe
    def handle(self, *args, **options):
        """Train one sequence model per feature group (plus 'all').

        For each group in ``ftgroup_names`` the relevant feature columns are
        gathered per syllable, stacked into (time x features) rows, wrapped in
        a OneHotSequenceProvider and passed to train().
        """
        database_name = options['database_name']
        annotator_name = options['annotator_name']
        # population_name = options['population_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        no_gpu = options['no_gpu']
        # feature_group = options['feature_group']

        # if feature_group:
        #     feature_names = feature_names.split(',')
        #     features = Feature.objects.filter(name__in=feature_names).order_by('id')
        # else:
        #     features = Feature.objects.all().order_by('id')
        #
        # features = features.exclude(is_fixed_length=True)

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        # features = list(features)[:4]

        # Keep only features that have a registered extractor in feature_map.
        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        sids, tids = get_sids_tids(database)
        labels, no_label_ids = get_labels_by_sids(sids, label_level, annotator,
                                                  min_occur)

        # Drop syllables without a usable label from this annotator.
        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(sids, tids, labels,
                                                   no_label_ids)

        full_data = extract_rawdata(tids, enabled_features)
        # feature name -> column index within each full_data row.
        feature_inds = {x.name: idx for idx, x in enumerate(enabled_features)}

        for ftgroup_name in ftgroup_names + ['all']:
            data = []
            if ftgroup_name == 'all':
                features = feature_whereabout_flat
            else:
                features = feature_whereabout[ftgroup_name]
            ftgroup_col_inds = []
            for feature_name, is_fixed_length, _ in features:
                col_name = feature_name
                feature_ind = feature_inds.get(col_name, None)
                if feature_ind is not None:
                    ftgroup_col_inds.append(feature_ind)

            for full_row, sid in zip(full_data, sids):
                row = [full_row[x] for x in ftgroup_col_inds]
                try:
                    # Stack the selected features into a (time x features) matrix.
                    row = np.vstack(row).T
                except ValueError:
                    # NOTE(review): after printing the per-feature shapes for
                    # diagnosis, the un-stacked list is still appended below —
                    # verify downstream consumers tolerate a non-array row.
                    print('Encounter error at id={}'.format(sid))
                    for idx, (feature_name, is_fixed_length,
                              _) in enumerate(features):
                        print('{} - {}'.format(feature_name, row[idx].shape))
                data.append(row)

            data_provider = OneHotSequenceProvider(data, labels, balanced=True)
            model_name = '{}_{}_{}'.format(database_name, label_level,
                                           ftgroup_name)
            print('Training for: {}'.format(model_name))
            train(data_provider, name=model_name, disable_gpu=no_gpu)
Example #6
0
    def handle(self, *args, **options):
        """Hyperopt-search MFCC extraction parameters for a fixed classifier.

        Classifier hyper-parameters are taken from the best previously saved
        trial (loaded from /tmp/hyperopt.pkl for the 'mfcc-<source>' group);
        this command then searches ncep/fmin/fmax of the MFCC extraction,
        re-extracting features for each candidate, and writes the trials and
        a per-trial TSV summary to <profile>.trials / <profile>.tsv.
        """
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options.get('profile', None)
        load_dir = options['load_dir']

        # NOTE(review): profile may be None (options.get default) — this
        # would raise TypeError; presumably --profile is mandatory. Confirm.
        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'

        # Instances-per-class bounds: fixed when --ipc is given, otherwise
        # the range [min_occur, floor(1.5 * min_occur)].
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed as min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        _sids, _tids = get_sids_tids(database)
        _labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                   annotator, min_occur)
        # Drop syllables without a usable label from this annotator.
        if len(no_label_ids) > 0:
            _sids, _tids, _labels = exclude_no_labels(_sids, _tids, _labels,
                                                      no_label_ids)

        # Balanced class-wise split; train+test together give the working set.
        unique_labels, enum_labels = np.unique(_labels, return_inverse=True)
        fold = split_classwise(enum_labels,
                               ratio=valid_ratio,
                               limits=(min_occur,
                                       int(np.floor(min_occur * 1.5))),
                               nfolds=1,
                               balanced=True)
        train = fold[0]['train']
        test = fold[0]['test']
        all_indices = np.concatenate((train, test))

        tids = _tids[all_indices]
        labels = _labels[all_indices]

        # NOTE(review): hard-coded temp path — the saved hyperopt results
        # must have been produced by a prior run; confirm the producer.
        with open('/tmp/hyperopt.pkl', 'rb') as f:
            saved = pickle.load(f)

        performance_data = saved[clsf_type]
        accuracies = performance_data['accuracies']
        groups = performance_data['groups']
        params = performance_data['params']

        # Pick the best saved trial within the 'mfcc-<source>' group.
        group_name = '{}-{}'.format('mfcc', source)
        group_member_inds = np.where(groups == group_name)
        group_accuracies = accuracies[group_member_inds]

        best_acc_idx = np.argmax(group_accuracies)

        group_params = {}
        best_params = {}
        for param_name in params:
            param_values = np.array(params[param_name])
            group_param_values = param_values[group_member_inds]
            group_params[param_name] = group_param_values

            converter = converters[clsf_type][param_name]
            best_params[param_name] = converter(
                group_param_values[best_acc_idx])

        # Populated below (before fmin runs); the loss closure reads them.
        params_names = []
        params_converters = []
        params_count = 0

        v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
        nfolds = int(np.floor(1. / v2t_ratio + 0.01))

        def loss(params):
            """Hyperopt objective: 1 - k-fold accuracy for one MFCC config."""
            mfcc_args = {}
            for i in range(params_count):
                param_name = params_names[i]
                param_converter = params_converters[i]
                param_value = params[i]
                mfcc_args[param_name] = param_converter(param_value)

            _fmin = mfcc_args['fmin']
            _fmax = mfcc_args['fmax']
            _ncep = mfcc_args['ncep']

            # Re-extract MFCC features for this parameter combination.
            extract_mfcc_multiparams(database_name, load_dir, _ncep, _fmin,
                                     _fmax)

            data = []
            tid2rows = {tid: [] for tid in tids}

            # Collect one aggregated value per (tid, aggregator) from the
            # pickle files the extraction step just produced.
            for aggregator in aggregators:
                agg_saved_file = 'database={}-feature=mfcc-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                    .format(database_name, aggregator.get_name(), _fmin, _fmax, _ncep)
                agg_saved_file_loc = os.path.join(load_dir, agg_saved_file)

                with open(agg_saved_file_loc, 'rb') as f:
                    tid2aval = pickle.load(f)
                    for tid in tids:
                        val = tid2aval[tid]
                        row = tid2rows[tid]
                        row.append(val)

            for tid in tids:
                row = tid2rows[tid]
                row = np.hstack(row).T
                data.append(row)
            # Normalise and scrub non-finite values.
            data = np.array(data)
            data = zscore(data)
            data[np.where(np.isnan(data))] = 0
            data[np.where(np.isinf(data))] = 0

            unique_labels = np.unique(labels)
            nlabels = len(unique_labels)

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, _ = dp.split(0, limits=(ipc_min, ipc_max))

            score = perform_k_fold(classifier, trainvalidset, nfolds,
                                   v2t_ratio, nlabels, **best_params)
            return 1. - score

        # Search space: ncep rounded to int; fmin scaled x100 and fmax x1000
        # by their converters (presumably Hz — confirm downstream units).
        ncep_choices = hp.uniform('ncep', 13, 48)
        fmin_choices = hp.uniform('fmin', 0, 5)
        fmax_choices = hp.uniform('fmax', 8, 24)
        mfcc_params = {
            'ncep': (lambda x: int(np.round(x)), ncep_choices),
            'fmin': (lambda x: int(np.round(x) * 100), fmin_choices),
            'fmax': (lambda x: int(np.round(x) * 1000), fmax_choices),
        }

        space = []

        for arg_name, (converter, arg_values) in mfcc_params.items():
            space.append(arg_values)
            params_names.append(arg_name)
            params_converters.append(converter)
            params_count += 1

        trials = Trials()
        best = fmin(fn=loss,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=100,
                    trials=trials)
        print(best)

        with open(trials_file, 'wb') as f:
            pickle.dump(trials, f)

        # Convert the raw best-trial values back through their converters.
        best_trial = trials.best_trial
        best_trial_args_values_ = best_trial['misc']['vals']
        best_trial_args_values = {}
        for arg_name, arg_values in best_trial_args_values_.items():
            converter = mfcc_params[arg_name][0]
            arg_value = converter(arg_values[0])
            best_trial_args_values[arg_name] = arg_value

        # TSV columns: trial id, each searched arg, accuracy.
        model_args = ['id'] + list(
            best_trial_args_values.keys()) + ['accuracy']

        model_args_values = {x: [] for x in model_args}
        for idx, trial in enumerate(trials.trials):
            if trial == best_trial:
                idx = 'Best'
            trial_args_values = trial['misc']['vals']
            for arg_name in model_args:
                if arg_name == 'id':
                    model_args_values['id'].append(idx)
                elif arg_name == 'accuracy':
                    trial_accuracy = 1. - trial['result']['loss']
                    model_args_values['accuracy'].append(trial_accuracy)
                else:
                    converter = mfcc_params[arg_name][0]
                    val = converter(trial_args_values[arg_name][0])
                    model_args_values[arg_name].append(val)

        with open(tsv_file, open_mode, encoding='utf-8') as f:
            for arg in model_args:
                values = model_args_values[arg]
                f.write('{}\t'.format(arg))
                f.write('\t'.join(map(str, values)))
                f.write('\n')
            # Leftover from a looped variant: would make later writes append.
            open_mode = 'a'
Example #7
0
    def handle(self, *args, **options):
        """Hyperopt-search classifier hyper-parameters per feature group.

        For each feature group (and 'all'), selects the relevant columns of
        the full feature matrix (optionally PCA-reduced when source='pca'),
        runs a hyperopt search of the chosen classifier's hyper-parameters
        via k-fold cross validation, then evaluates the best configuration on
        a held-out test set.  Trials go to <profile>.trials and per-trial and
        test-set results to <profile>.tsv (appending after the first group).
        """
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options['profile']
        agg = options['agg']

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'
        # Instances-per-class bounds: fixed when --ipc is given, otherwise
        # the range [min_occur, floor(1.5 * min_occur)].
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed as min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        # First TSV write truncates; set to 'a' after the first group.
        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = list(feature_map.values())
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        if agg == 'all':
            aggregators = [aggregator_map[x.name] for x in aggregations]
        else:
            aggregators = enabled_aggregators[agg]

        _sids, _tids = get_sids_tids(database)

        full_data, col_inds = extract_rawdata(_tids, features, aggregators)

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        # Drop unlabelled syllables and the matching rows of the matrix.
        # (`sids`/`tids` are only bound inside this branch and unused below.)
        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        # Normalise and scrub non-finite values.
        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        # NOTE(review): here ftgroup_names is treated as a dict
        # (name -> feature names), unlike some sibling commands — confirm.
        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]
            # Column indices of this group within the full matrix: fixed
            # length features map directly, variable-length ones have one
            # column range per aggregator.
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            # `explained`/`pca_dims` exist only when source == 'pca'; the
            # TSV branch below is guarded by the same condition.
            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, testset = dp.split(test_ratio,
                                              limits=(ipc_min, ipc_max))

            v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
            nfolds = int(np.floor(1. / v2t_ratio + 0.01))

            # Populated below (before fmin runs); the loss closure reads them.
            params_names = []
            params_converters = []
            params_count = 0

            def loss(params):
                """Hyperopt objective: 1 - k-fold accuracy for one config."""
                classifier_args = {}
                for i in range(params_count):
                    param_name = params_names[i]
                    param_converter = params_converters[i]
                    param_value = params[i]
                    classifier_args[param_name] = param_converter(param_value)

                print(classifier_args)
                score = perform_k_fold(classifier, trainvalidset, nfolds,
                                       v2t_ratio, nlabels, **classifier_args)
                return 1. - score

            n_estimators_choices = hp.uniform('n_estimators', 40, 100)
            min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
            min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)

            # SVM gamma searched around 1/n_features; C in log10 space
            # (its converter is 10**x).
            n_features = data.shape[1]
            auto_gamma = 1 / n_features
            gamma_choices = hp.uniform('gamma', auto_gamma / 10,
                                       auto_gamma * 10)
            c_choices = hp.uniform('C', -1, 2)
            hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100,
                                                   5000)
            n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

            # Per-classifier search space: name -> (converter, hp expression).
            choices = {
                'rf': {
                    'n_estimators':
                    (lambda x: int(np.round(x)), n_estimators_choices),
                    'min_samples_split':
                    (lambda x: int(np.round(x)), min_samples_split_choices),
                    'min_samples_leaf':
                    (lambda x: int(np.round(x)), min_samples_leaf_choices),
                },
                'svm_rbf': {
                    'gamma': (float, gamma_choices),
                    'C': (lambda x: 10**x, c_choices),
                },
                'svm_linear': {
                    'C': (lambda x: 10**x, c_choices),
                },
                'nnet': {
                    'hidden_layer_sizes':
                    (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
                },
                'knn': {
                    'n_neighbors':
                    (lambda x: int(np.round(x)), n_neighbors_choices)
                }
            }

            space = []
            for arg_name, (converter,
                           arg_values) in choices[clsf_type].items():
                space.append(arg_values)
                params_names.append(arg_name)
                params_converters.append(converter)
                params_count += 1

            trials = Trials()
            max_evals = params_count * 10
            best = fmin(fn=loss,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=max_evals,
                        trials=trials)
            print(best)

            with open(trials_file, 'wb') as f:
                pickle.dump(trials, f)

            # Convert the raw best-trial values back through their converters.
            best_trial = trials.best_trial
            best_trial_args_values_ = best_trial['misc']['vals']
            best_trial_args_values = {}
            for arg_name, arg_values in best_trial_args_values_.items():
                converter = choices[clsf_type][arg_name][0]
                arg_value = converter(arg_values[0])
                best_trial_args_values[arg_name] = arg_value

            # TSV columns: trial id, each searched arg, accuracy.
            model_args = ['id'] + list(
                best_trial_args_values.keys()) + ['accuracy']

            model_args_values = {x: [] for x in model_args}
            for idx, trial in enumerate(trials.trials):
                if trial == best_trial:
                    idx = 'Best'
                trial_args_values = trial['misc']['vals']
                for arg_name in model_args:
                    if arg_name == 'id':
                        model_args_values['id'].append(idx)
                    elif arg_name == 'accuracy':
                        trial_accuracy = 1. - trial['result']['loss']
                        model_args_values['accuracy'].append(trial_accuracy)
                    else:
                        # choice = choices[clsf_type][arg_name]
                        converter = choices[clsf_type][arg_name][0]
                        val = converter(trial_args_values[arg_name][0])
                        # val = choice[choice_idx]
                        model_args_values[arg_name].append(val)

            # Perform classification on the test set
            train_x = np.array(trainvalidset.data)
            train_y = np.array(trainvalidset.labels, dtype=np.int32)
            test_x = np.array(testset.data)
            test_y = np.array(testset.labels, dtype=np.int32)

            score, label_hits, label_misses, cfmat, importances =\
                classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
            # NOTE(review): np.float was removed in NumPy 1.20+ — needs
            # `float` (or np.float64) on modern NumPy; confirm pinned version.
            lb_hitrates = label_hits / (label_hits + label_misses).astype(
                np.float)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                for arg in model_args:
                    values = model_args_values[arg]
                    f.write('{}\t'.format(arg))
                    f.write('\t'.join(map(str, values)))
                    f.write('\n')

                f.write('Results using best-model\'s paramaters on testset\n')

                if source == 'full':
                    f.write(
                        'Feature group\tNdims\tLabel prediction score\t{}\n'.
                        format('\t '.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, score,
                        '\t'.join(map(str, lb_hitrates))))
                else:
                    f.write(
                        'Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t '.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims, score,
                        '\t'.join(map(str, lb_hitrates))))
                f.write('\n')
                # Subsequent feature groups append to the same TSV.
                open_mode = 'a'
# Example #8
# 0
    def handle(self, database_name, population_name, type, perplexity,
               normalised, *args, **kwargs):
        """Embed a population's feature vectors in 2/3-D and pickle the result.

        Loads the database's precomputed full feature tensor, selects the
        rows belonging to ``population_name`` and reduces them with t-SNE
        ('tsne2'/'tsne3'), MDS on a euclidean distance matrix ('mds'), or
        PCA followed by MDS ('mdspca').  Coordinates (plus the MDS stress,
        where applicable) are cached in a .pkl file named after the
        arguments; if that file already exists it is loaded instead of
        recomputed.

        :param database_name: Database name, matched case-insensitively
        :param population_name: population whose segments are embedded
        :param type: one of 'tsne2', 'tsne3', 'mds', 'mdspca'
            (NOTE: parameter shadows the builtin ``type``; kept as-is for
            backward compatibility with callers)
        :param perplexity: t-SNE perplexity (unused by the MDS variants)
        :param normalised: if truthy, z-score each feature column first
        """
        database = get_or_error(Database, dict(name__iexact=database_name))
        assert type in ['tsne2', 'tsne3', 'mds', 'mdspca']

        # The stored full tensor is identified by the exact ordered set of
        # features and aggregations it was built from, encoded as id-hashes.
        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.all().order_by('id')

        features_hash = '-'.join(
            list(map(str, features.values_list('id', flat=True))))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        full_tensor = FullTensorData.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).first()

        if full_tensor is None:
            raise Exception(
                'Full feature matrix not found. Need to create FullTensor first.'
            )

        full_sids_path = full_tensor.get_sids_path()
        full_bytes_path = full_tensor.get_bytes_path()

        # Segment ids are stored as int32; the raw feature matrix has one
        # row per sid.
        full_sids = bytes_to_ndarray(full_sids_path, np.int32)
        full_data = get_rawdata_from_binary(full_bytes_path, len(full_sids))

        sids, tids = get_sids_tids(database, population_name)

        # The cache file name encodes every argument that affects the
        # result; the t-SNE variants additionally depend on perplexity.
        normalised_str = 'normed' if normalised else 'raw'
        if type.startswith('tsne'):
            file_name = '{}_{}_{}_{}_{}.pkl'.format(database_name,
                                                    population_name, type,
                                                    perplexity, normalised_str)
        else:
            file_name = '{}_{}_{}_{}.pkl'.format(database_name,
                                                 population_name, type,
                                                 normalised_str)
        if os.path.isfile(file_name):
            # Cache hit: reuse previously computed coordinates.
            with open(file_name, 'rb') as f:
                saved = pickle.load(f)
                coordinate = saved['coordinate']
                stress = saved['stress']
        else:
            # Restrict the full matrix to this population's segments.
            population_data = cherrypick_tensor_data_by_sids(
                full_data, full_sids, sids).astype(np.float64)

            if normalised:
                population_data = zscore(population_data)

            # zscore yields NaN for zero-variance columns and the raw data
            # may contain inf; zero both so the solvers don't blow up.
            population_data[np.where(np.isnan(population_data))] = 0
            population_data[np.where(np.isinf(population_data))] = 0

            if type.startswith('mds'):
                if type == 'mdspca':
                    # 'mdspca': reduce to 50 principal components first.
                    dim_reduce_func = PCA(n_components=50)
                    population_data = dim_reduce_func.fit_transform(
                        population_data, y=None)
                    if hasattr(dim_reduce_func, 'explained_variance_ratio_'):
                        print(
                            'Cumulative explained variation for {} principal components: {}'
                            .format(
                                50,
                                np.sum(dim_reduce_func.
                                       explained_variance_ratio_)))

                # MDS on a precomputed euclidean distance matrix.
                similarities = squareform(pdist(population_data, 'euclidean'))

                model = MDS(n_components=3,
                            dissimilarity='precomputed',
                            random_state=7,
                            verbose=1,
                            max_iter=1000)
                coordinate = model.fit_transform(similarities)
                stress = model.stress_
            else:
                # 'tsne2'/'tsne3' -> number of output dimensions.
                ntsne_dims = int(type[4:])
                # PCA to 50 dims first: common speed-up before t-SNE.
                dim_reduce_func = PCA(n_components=50)
                population_data = dim_reduce_func.fit_transform(
                    population_data, y=None)

                print('Cumulative explained variation: {}'.format(
                    np.sum(dim_reduce_func.explained_variance_ratio_)))

                time_start = time.time()
                tsne = TSNE(n_components=ntsne_dims,
                            verbose=1,
                            perplexity=perplexity,
                            n_iter=4000)
                coordinate = tsne.fit_transform(population_data)
                print(
                    't-SNE done! Time elapsed: {} seconds'.format(time.time() -
                                                                  time_start))
                stress = None

        # NOTE(review): this dump also runs on a cache hit, rewriting the
        # file with the freshly computed sids/tids — presumably intentional
        # (keeps the stored ids current); confirm before changing.
        with open(file_name, 'wb') as f:
            pickle.dump(dict(coordinate=coordinate,
                             stress=stress,
                             sids=sids,
                             tids=tids),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
def extract_mfcc_multiparams(database_name, save_dir, ncep, fmin, fmax):
    """Extract per-segment MFCC features and their aggregations for a database.

    For every 'mfcc' Feature, raw per-segment feature values are computed
    audio-file by audio-file and pickled into *save_dir* as
    ``database=<db>-feature=<name>-fmin=<fmin>-fmax=<fmax>-ncep=<ncep>.pkl``
    (a dict mapping tid -> feature value).  Every enabled Aggregation is then
    applied to those values and pickled alongside.  Output files that already
    exist are reused rather than recomputed.

    :param database_name: Database name, matched case-insensitively
    :param save_dir: directory where the .pkl files are written
    :param ncep: number of cepstral coefficients to extract
    :param fmin: lower frequency bound, passed through to the extractor
    :param fmax: upper frequency bound, passed through to the extractor
    """
    xtra_args = dict(ncep=ncep, fmin=fmin, fmax=fmax)
    features = Feature.objects.filter(name='mfcc')

    database = get_or_error(Database, dict(name__iexact=database_name))

    aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
    aggregators = [aggregator_map[x.name] for x in aggregations]

    sids, tids = get_sids_tids(database)
    segments = Segment.objects.filter(id__in=sids)
    vals = list(segments.order_by('audio_file', 'start_time_ms')
                .values_list('audio_file__name', 'tid', 'start_time_ms', 'end_time_ms'))

    # Group segments by their owning audio file so each file is read once.
    af_to_segments = {}
    for afname, tid, start, end in vals:
        af_to_segments.setdefault(afname, []).append((tid, start, end))

    for feature in features:
        saved_file = 'database={}-feature={}-fmin={}-fmax={}-ncep={}.pkl'\
            .format(database_name, feature.name, fmin, fmax, ncep)
        saved_file_loc = os.path.join(save_dir, saved_file)

        if os.path.isfile(saved_file_loc):
            # Bug fix: previously an existing raw-feature file caused the
            # whole iteration (aggregation included) to be skipped with
            # `continue`, so missing aggregation files could never be
            # produced.  Load the cached values instead and let the
            # per-aggregator existence checks below decide what to compute.
            print('{} already exists. Reusing it'.format(saved_file_loc))
            with open(saved_file_loc, 'rb') as f:
                tid2fval = pickle.load(f)
        else:
            tid2fval = {}
            bar = Bar('Extracting to {}'.format(saved_file_loc), max=len(af_to_segments))
            for song_name, segs_info in af_to_segments.items():
                wav_file_path = wav_path(song_name)
                __tids, __fvals = extract_segment_feature_for_audio_file(wav_file_path, segs_info, feature, **xtra_args)
                bar.next()
                for tid, fval in zip(__tids, __fvals):
                    tid2fval[tid] = fval
            bar.finish()

            with open(saved_file_loc, 'wb') as f:
                pickle.dump(tid2fval, f)

        bar = Bar('Aggregating...', max=len(aggregators))
        for aggregator in aggregators:
            agg_saved_file = 'database={}-feature={}-aggregator={}-fmin={}-fmax={}-ncep={}.pkl'\
                .format(database_name, feature.name, aggregator.get_name(), fmin, fmax, ncep)
            agg_saved_file_loc = os.path.join(save_dir, agg_saved_file)

            if os.path.isfile(agg_saved_file_loc):
                print('{} already exists. Skip'.format(agg_saved_file_loc))
                # Bug fix: advance the bar on the skip path too, so the
                # progress bar still completes.
                bar.next()
                continue

            # Apply this aggregator to every segment's feature value.
            tid2aval = {tid: aggregator.process(fval)
                        for tid, fval in tid2fval.items()}
            bar.next()

            with open(agg_saved_file_loc, 'wb') as f:
                pickle.dump(tid2aval, f)
        bar.finish()