Code example #1
    def test_n_queries(self):
        annotator = StandardAnnot(self.X, self.Y, self.C)

        # test querying class labels of selected annotators
        ids = [0]
        annotator.class_labels(self.X[0:2], ids, query_value=3)

        # test number of queries
        np.testing.assert_array_equal([3, 0], annotator.n_queries())
Code example #2
    def test_queried_samples(self):
        annotator = StandardAnnot(self.X, self.Y, self.C)

        # test querying class labels of selected annotators
        ids = [0]
        annotator.class_labels(self.X[0:2], ids)

        # test queried samples
        np.testing.assert_array_equal(self.X[0:2],
                                      annotator.queried_samples()[0])
        np.testing.assert_array_equal(
            np.array([]).reshape(0, 2),
            annotator.queried_samples()[1])
Code example #3
    def test_class_labels(self):
        annotator = StandardAnnot(self.X, self.Y, self.C)

        # test querying class labels
        ids = [0, 2, 3]
        X = self.X[ids]
        Y = annotator.class_labels(X)
        np.testing.assert_array_equal(self.Y[ids], Y)

        # test querying class labels of missing samples
        X = np.array([[-1, -1], [-2, -3]])
        Y = annotator.class_labels(X)
        np.testing.assert_array_equal(
            np.array([[np.nan, np.nan], [np.nan, np.nan]]), Y)

        # test querying class labels of selected annotators
        ids = [0]
        Y = annotator.class_labels(self.X[0:2], ids)
        np.testing.assert_array_equal(
            np.array([[self.Y[0, 0], np.nan], [self.Y[1, 0], np.nan]]), Y)
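The three tests above rely on a fixture that the snippets do not show. Below is a minimal sketch of a plausible setUp; the concrete values of self.X, self.Y, and self.C are assumptions (any set of unique samples, one label column per annotator, and matching confidence scores would work):

import unittest
import numpy as np

class TestStandardAnnot(unittest.TestCase):

    def setUp(self):
        # hypothetical fixture: four 2-d samples labeled by two annotators
        self.X = np.array([[0, 0], [1, 1], [2, 2], [3, 3]])
        # class labels, one column per annotator
        self.Y = np.array([[0, 1], [1, 0], [0, 0], [1, 1]])
        # confidence scores, one per label
        self.C = np.ones_like(self.Y, dtype=float)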
Code example #4
File: experimental_setup.py    Project: timosturm/mapal
def run(results_path, data_set, query_strategy, budget, test_ratio, seed):
    """
    Run experiments to compare query selection strategies.
    Experimental results are stored in a .csv-file.

    Parameters
    ----------
    results_path: str
        Absolute path to store results.
    data_set: str
        Name of the data set.
    query_strategy: str
        Determines query strategy.
    budget: int
        Maximal number of labeled samples.
    test_ratio: float in (0, 1)
        Ratio of test samples.
    seed: int
        Random seed.
    """
    # --------------------------------------------- LOAD DATA ----------------------------------------------------------
    is_cosine = 'reports' in data_set
    X, y_true, y = load_data(data_set_name=data_set)
    n_features = np.size(X, axis=1)
    n_classes = len(np.unique(y))
    n_annotators = np.size(y, axis=1)
    print(data_set + ': ' + str(investigate_data_set(data_set)))
    budget_str = str(budget)
    if budget > len(X) * n_annotators * (1 - test_ratio):
        budget = int(math.floor(len(X) * n_annotators * (1 - test_ratio)))
    elif budget > 1:
        budget = int(budget)
    elif 0 < budget <= 1:
        budget = int(
            math.floor(len(X) * n_annotators * (1 - test_ratio) * budget))
    else:
        raise ValueError(
            "'budget' must be a float in (0, 1] or a positive integer.")
    budget = np.min((budget, 1000))
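    # example: with len(X)=500, n_annotators=3, and test_ratio=0.2, at most
    # floor(500 * 3 * 0.8) = 1200 labels are affordable; a fractional budget
    # of 0.25 then becomes floor(1200 * 0.25) = 300, and the final min()
    # caps every budget at 1000 queries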

    # --------------------------------------------- STATISTICS ---------------------------------------------------------
    # define storage for performances
    results = {}

    # define performance functions
    C = 1 - np.eye(n_classes)
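    # zero-one cost matrix: no cost on the diagonal (correct prediction),
    # unit cost for every misclassification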

    perf_funcs = {
        'micro-misclf-rate':
        [partial(misclassification_costs, C=C, average='micro'), {}],
        'macro-misclf-rate':
        [partial(misclassification_costs, C=C, average='macro'), {}]
    }

    # ------------------------------------------- SPLIT DATA ---------------------------------------------------
    print('seed: {}'.format(str(seed)))
    X_train, X_test, y_true_train, y_true_test, y_train, y_test = train_test_split(
        X, y_true, y, test_size=test_ratio, random_state=seed)
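    # re-split with a shifted seed until every class occurs in both partitions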
    while not np.array_equal(np.unique(y_true_train), np.unique(y_true_test)):
        X_train, X_test, y_true_train, y_true_test, y_train, y_test = train_test_split(
            X, y_true, y, random_state=seed, test_size=test_ratio)
        seed += 1000
        print('new seed: {}'.format(seed))
    n_samples = len(X_train)

    # --------------------------------------------- CSV NAMES ----------------------------------------------------------
    csv_name = '{}_{}_{}_{}_{}.csv'.format(data_set, query_strategy,
                                           budget_str, test_ratio, seed)

    # ------------------------------------------ PREPROCESS DATA -------------------------------------------------------
    # choose the pairwise metric; standardize features only in the RBF case
    if is_cosine:
        kwargs = {'metric': 'cosine'}
    else:
        # standardize data
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # compute bandwidth
        bandwidth = estimate_bandwidth(n_samples=n_samples,
                                       n_features=n_features)
        print('bandwidth: {}'.format(str(bandwidth)))
        gamma = 0.5 * (bandwidth**(-2))
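        # gamma = 1 / (2 * bandwidth^2), so the RBF kernel
        # exp(-gamma * ||x - x'||^2) is a Gaussian with standard deviation
        # equal to the estimated bandwidth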

        kwargs = {'metric': 'rbf', 'gamma': gamma}

    # setup classifiers
    pwc_train = PWC(n_classes=n_classes,
                    combine_labels=False,
                    random_state=seed,
                    **kwargs)
    S_train = pairwise_kernels(X_train, X_train, **kwargs)
    pwc_test = PWC(n_classes=n_classes,
                   metric='precomputed',
                   combine_labels=False,
                   probabilistic=False,
                   random_state=seed)
    S_test = pairwise_kernels(X_test, X_train, **kwargs)
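    # S_train and S_test are precomputed similarity (kernel) matrices;
    # pwc_test consumes them directly via metric='precomputed'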

    # set up data set
    data_set = DataSet(X_train, n_annotators=n_annotators)
    annotators = StandardAnnot(X=X_train, Y=y_train)

    # create query strategy
    if query_strategy == 'ceal':
        query_strategy = CEAL(data_set=data_set,
                              n_classes=n_classes,
                              clf=pwc_train,
                              n_neighbors=10,
                              label_proportion=0.2 * budget / n_annotators,
                              random_state=seed,
                              **kwargs)
    elif query_strategy == 'alio':
        query_strategy = ALIO(data_set=data_set,
                              n_classes=n_classes,
                              clf=pwc_train,
                              label_proportion=0.2 * budget / n_annotators,
                              random_state=seed)
    elif query_strategy == 'proactive':
        query_strategy = Proactive(data_set=data_set,
                                   n_classes=n_classes,
                                   clf=pwc_train,
                                   n_components=20,
                                   label_proportion=0.2 * budget / n_annotators,
                                   random_state=seed)
    elif 'mapal' in query_strategy:
        params = query_strategy.split('-')
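        # expected format:
        # 'mapal-<mean_prior>-<sum_prior or "mean">-<m_max>-<alpha>-<weights_type>'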
        mean_prior = float(params[1])
        if params[2] == 'mean':
            # mean off-diagonal similarity in the training kernel matrix
            sum_prior = (np.sum(S_train) - n_samples) / (n_samples**2 - n_samples)
        else:
            sum_prior = float(params[2])
        prior = np.array([mean_prior, 1 - mean_prior])
        prior /= np.sum(prior)
        prior *= sum_prior
        print('prior = {}'.format(prior))
        m_max = int(params[3])
        alpha = float(params[4])
        weights_type = str(params[5])
        bam = BAM(n_classes=n_classes,
                  weights_type=weights_type,
                  prior=prior,
                  random_state=seed,
                  **kwargs)
        query_strategy = MAPAL(data_set=data_set,
                               m_max=m_max,
                               n_classes=n_classes,
                               S=S_train,
                               bam=bam,
                               alpha_x=alpha,
                               alpha_c=alpha,
                               random_state=seed)
    elif query_strategy == 'ie-adj-cost':
        query_strategy = IEAdjCost(data_set=data_set,
                                   clf=pwc_train,
                                   n_classes=n_classes,
                                   delta=0.4,
                                   lmbda=0.4,
                                   alpha=0.05,
                                   epsilon=0.8,
                                   random_state=seed)
    elif query_strategy == 'ie-thresh':
        query_strategy = IEThresh(data_set=data_set,
                                  clf=pwc_train,
                                  n_classes=n_classes,
                                  epsilon=0.8,
                                  alpha=0.05,
                                  random_state=seed)
    elif query_strategy == 'random':
        query_strategy = RS(data_set=data_set, random_state=seed)
    else:
        raise ValueError(
            "query_strategy must be in ['ceal', 'alio', 'proactive', "
            "'mapal-...', 'ie-adj-cost', 'ie-thresh', 'random']")

    # ----------------------------------------- ACTIVE LEARNING CYCLE --------------------------------------------------
    times = [0]
    for b in range(budget):
        print("budget: {}".format(b))
        # evaluate results
        eval_perfs(clf=pwc_test,
                   X_train=S_train,
                   y_train=y_true_train,
                   X_test=S_test,
                   y_test=y_true_test,
                   perf_results=results,
                   perf_funcs=perf_funcs)
        eval_annot_stats(y=data_set.y_, y_true=y_true_train, results=results)

        # select sample and annotator
        t = time()
        selection = query_strategy.make_query()
        times.append(time() - t)
        sample_id = selection[0, 0]
        annotator_id = [selection[0, 1]]
        print("selected sample: {}".format(sample_id))
        print("selected annotator: {}".format(annotator_id))

        # query selected annotator for labeling selected sample
        X_query = [X_train[sample_id]]
        y_query = annotators.class_labels(X_query, annotator_ids=annotator_id)
        print('class label: {}'.format(y_query[0, annotator_id[0]]))

        # update training data
        data_set.update_entries(sample_id, y_query)
        print(data_set.len_labeled(per_annotator=True))

        # retrain classifier
        pwc_test.fit(X=data_set.X_, y=data_set.y_, c=data_set.c_)

    # evaluate results
    eval_perfs(clf=pwc_test,
               X_train=S_train,
               y_train=y_true_train,
               X_test=S_test,
               y_test=y_true_test,
               perf_results=results,
               perf_funcs=perf_funcs)
    eval_annot_stats(y=data_set.y_, y_true=y_true_train, results=results)

    # store performance results
    results['times'] = times
    df = pd.DataFrame(results)
    df.to_csv('{}/{}'.format(results_path, csv_name), index_label='index')
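A call of run might look as follows; the path, data-set name, and parameter values are purely illustrative and depend on what load_data supports in this project:

run(results_path='/tmp/results',
    data_set='iris',
    query_strategy='random',
    budget=0.5,  # spend 50% of the affordable annotator queries
    test_ratio=0.25,
    seed=42)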