Beispiel #1
0
def main():
    """Interactively compare uncertainty sampling against random sampling.

    Both strategies start from copies of the same training pool.  In each of
    ``quota`` rounds every strategy picks one sample, the human labels it
    through ``InteractiveLabeler``, the shared model is retrained on that
    strategy's pool, and the test error (``1 - model.score(tst_ds)``) is
    appended to the strategy's curve.  The upper subplot shows both curves and
    is refreshed after every round so the chart follows the experiment.
    """
    quota = 10  # number of samples the human is asked to label per strategy
    n_classes = 5
    E_out1, E_out2 = [], []

    trn_ds, tst_ds, _ = split_train_test(n_classes)
    # Independent copy so the two strategies never see each other's labels.
    trn_ds2 = copy.deepcopy(trn_ds)

    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    # One model instance is reused; it is retrained before every evaluation.
    model = LogisticRegression()

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Error before any query has been answered (x = 0 on the chart).
    model.train(trn_ds)
    E_out1 = np.append(E_out1, 1 - model.score(tst_ds))
    model.train(trn_ds2)
    E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, E_out1, 'g', label='qs Eout')
    p2, = ax.plot(query_num, E_out2, 'k', label='random Eout')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True,
               shadow=True, ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk and shifted down slightly.
    # NOTE(review): presumably the labeler draws the queried sample here --
    # confirm against InteractiveLabeler.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position([box.x0, box.y0 - box.height * 0.1, box.width,
                         box.height * 0.9])
    # Give each label its name (labels are from 0 to n_classes-1)
    lbr = InteractiveLabeler(label_name=[str(lbl) for lbl in range(n_classes)])

    for i in range(quota):
        ask_id = qs.make_query()
        print("asking sample from Uncertainty Sampling")
        # reshape the image to its width and height
        lb = lbr.label(trn_ds.data[ask_id][0].reshape(8, 8))
        trn_ds.update(ask_id, lb)
        model.train(trn_ds)
        E_out1 = np.append(E_out1, 1 - model.score(tst_ds))

        ask_id = qs2.make_query()
        print("asking sample from Random Sample")
        lb = lbr.label(trn_ds2.data[ask_id][0].reshape(8, 8))
        trn_ds2.update(ask_id, lb)
        model.train(trn_ds2)
        E_out2 = np.append(E_out2, 1 - model.score(tst_ds))

        # Push the extended curves into the existing line objects; without
        # this the chart would stay frozen at the single initial point.
        ax.set_xlim((0, i + 1))
        ax.set_ylim((0, max(max(E_out1), max(E_out2)) + 0.2))
        query_num = np.arange(0, i + 2)
        p1.set_xdata(query_num)
        p1.set_ydata(E_out1)
        p2.set_xdata(query_num)
        p2.set_ydata(E_out2)

        plt.draw()
Beispiel #2
0
def train_for_user(user_id=None, device_type=None, n_class=None):
    """Benchmark three query strategies on one user's data and plot the result.

    The per-user data is loaded once, then 20 independent experiments are
    run.  Each experiment draws a fresh train/test split and evaluates
    uncertainty sampling, random sampling and ALCE on separate copies of the
    training pool.  The raw per-experiment curves are written with
    ``save_file`` and the curves averaged over experiments are printed
    (every fifth point) and plotted.
    """
    user_data = waterloo_iv_processing.get_per_user_data(
        user_id=user_id,
        device=device_type,
        video_name=['sports', 'document', 'nature', 'game', 'movie'])
    X, y = processing_training_data(n_class=n_class, train_data=user_data)
    test_size = 0.2  # value handed to split_train_test as its test_size
    quota = 350  # number of samples to query

    keys = ('E1', 'E2', 'E3')
    result = {key: [] for key in keys}
    for exp_idx in range(20):
        print('exp:', exp_idx)
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = split_train_test(
            X=X, y=y, test_size=test_size, n_class=n_class)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        # (result key, pool, strategy factory).  Factories are called lazily
        # so each strategy is built right before its own run, as before.
        plan = (
            ('E1', trn_ds,
             lambda: UncertaintySampling(
                 trn_ds,
                 method='sm',
                 model=SVM(decision_function_shape='ovr'))),
            ('E2', trn_ds2, lambda: RandomSampling(trn_ds2)),
            ('E3', trn_ds3, lambda: ALCE(trn_ds3, cost_matrix, SVR())),
        )
        for key, pool, build_strategy in plan:
            strategy = build_strategy()
            _, e_out = run(pool, tst_ds, lbr, model, strategy, quota,
                           cost_matrix)
            result[key].append(e_out)

    # Average every strategy's curve over the experiments.
    mean_curve = {key: np.mean(result[key], axis=0) for key in keys}

    # Persist the un-averaged curves, one file per strategy.
    for key in keys:
        save_file(
            'results/' + device_type + '_user_' + str(user_id) + '_' + key +
            '_class_' + str(n_class) + '.txt', result[key])

    # (result key, console prefix, line style, legend label)
    presentation = (
        ('E1', "Uncertainty: ", 'g', 'Uncertainty sampling'),
        ('E2', "Random: ", 'k', 'Random'),
        ('E3', "ALCE: ", 'r', 'ALCE'),
    )
    for key, prefix, _, _ in presentation:
        print(prefix, mean_curve[key][::5].tolist())

    query_num = np.arange(0, quota + 1)
    handles = []
    for key, _, line_style, legend_label in presentation:
        line, = plt.plot(query_num, mean_curve[key], line_style,
                         label=legend_label)
        handles.append(line)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result (user ' + str(user_id) + ')')
    plt.legend(handles=handles, loc=3)
    plt.show()
 def test_RandomSampling(self):
     """RandomSampling seeded with 1126 must reproduce a fixed query order."""
     n_known = 5  # only the first five labels are visible to the strategy
     hidden = [None] * (len(self.y) - n_known)
     labels = np.concatenate([self.y[:n_known], hidden])
     pool = Dataset(self.X, labels)
     sampler = RandomSampling(pool, random_state=1126)
     expected = np.array([150, 16, 122, 157, 233, 160, 114, 163, 155, 56])
     assert_array_equal(run_qs(pool, sampler, self.y, self.quota), expected)
Beispiel #4
0
    def test_RandomSampling(self):
        """Check the query sequence RandomSampling produces for seed 1126."""
        # First five samples keep their label, every other one is unlabeled.
        masked_y = np.concatenate(
            [self.y[:5], [None] * (len(self.y) - 5)])
        pool = Dataset(self.X, masked_y)
        strategy = RandomSampling(pool, random_state=1126)

        observed = run_qs(pool, strategy, self.y, self.quota)

        expected = np.array([33, 143, 198, 29, 248, 92, 236, 212, 185, 163])
        assert_array_equal(observed, expected)
Beispiel #5
0
def main():
    """Compare uncertainty sampling, random sampling and ALCE over two runs.

    Every run draws a new split, evaluates each strategy on its own copy of
    the training pool with ``run`` and stores the returned out-of-sample
    curve.  The curves are averaged over the runs and plotted against the
    number of queries.
    """
    test_size = 0.25  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    quota = 100  # number of samples to query

    result = {'E1': [], 'E2': [], 'E3': []}
    for _ in range(2):
        trn_ds, tst_ds, fully_labeled_trn_ds, cost_matrix = \
            split_train_test(test_size)
        trn_ds2 = copy.deepcopy(trn_ds)
        trn_ds3 = copy.deepcopy(trn_ds)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = SVM(kernel='rbf', decision_function_shape='ovr')

        def out_of_sample_curve(pool, strategy):
            """Run one strategy on ``pool`` and keep only the second result."""
            _, e_out = run(pool, tst_ds, lbr, model, strategy, quota,
                           cost_matrix)
            return e_out

        uncertainty = UncertaintySampling(
            trn_ds,
            method='sm',
            model=SVM(decision_function_shape='ovr'))
        result['E1'].append(out_of_sample_curve(trn_ds, uncertainty))

        random_pick = RandomSampling(trn_ds2)
        result['E2'].append(out_of_sample_curve(trn_ds2, random_pick))

        cost_sensitive = ALCE(trn_ds3, cost_matrix, SVR())
        result['E3'].append(out_of_sample_curve(trn_ds3, cost_sensitive))

    # Mean curve per strategy across the runs.
    averaged = [np.mean(result[key], axis=0) for key in ('E1', 'E2', 'E3')]

    query_num = np.arange(0, quota + 1)
    plt.figure(figsize=(10, 8))
    line_specs = (
        ('g', 'Uncertainty sampling'),
        ('k', 'Random'),
        ('r', 'ALCE'),
    )
    for curve, (line_style, legend_label) in zip(averaged, line_specs):
        plt.plot(query_num, curve, line_style, label=legend_label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               ncol=5)
    plt.show()
    def heuristic_score_fun(inst_idx, ss_type):
        """Return the heuristic score of instance ``inst_idx`` for ``ss_type``.

        ``ss_type`` is either the string ``"Random"`` or a state class.  For
        ``"Random"`` the score comes from a ``RandomSampling`` strategy; for a
        class, the class is instantiated with a prepared "previous state" and
        its ``get_state_score()`` is returned.  Expensive helper objects are
        built once and kept in ``shared_variables`` (taken from the enclosing
        scope, like ``enriched_train_df``, ``col_names`` and ``labeled_df``).
        """
        if ss_type == "Random":
            if "qs2" not in shared_variables:
                extractor = SynStateALHeuristic.build_feature_extractor(enriched_train_df, col_names)
                qs2 = RandomSampling(TextDataset(enriched_train_df, col_names, extractor))
                shared_variables["qs2"] = qs2
            qs2 = shared_variables["qs2"]
            return qs2.get_score(inst_idx)

        class Object(object):
            pass
        PS_type = type(ss_type.__name__, (object,), dict(orig_state=Object()))  # python hack for naming a type

        def prepare_prev_state(ss_type, prev_state=None):
            """Attach to ``prev_state`` the builder hook matching ``ss_type``."""
            if prev_state is None:
                prev_state = PS_type()

            if issubclass(ss_type, SynStateALHeuristic):
                # One cached query strategy per heuristic class.
                qs_key = str(ss_type) + "qs"
                if qs_key not in shared_variables:
                    shared_variables[qs_key] = ss_type.build_query_strategy(enriched_train_df, col_names)
                qs = shared_variables[qs_key]
                prev_state.build_next_states_qs = lambda _: qs
            elif ss_type == SynStateTestDataGain:
                # This branch was indented one column too far, which made the
                # whole function fail to compile.
                if "en_labeled_train_df" not in shared_variables:
                    enriched_labeled_train_df = SynStateTestDataGain. \
                        label_dataframe_with_expert(enriched_train_df, col_names, labeled_df)
                    shared_variables["en_labeled_train_df"] = enriched_labeled_train_df
                enriched_labeled_train_df = shared_variables["en_labeled_train_df"]
                prev_state.build_next_states_labeled_df = lambda _: enriched_labeled_train_df
            elif ss_type == SynStateRandom:
                pass  # return prev_state as it is
            return prev_state

        ss_prev_state = prepare_prev_state(ss_type)
        ss = ss_type(inst_idx, enriched_train_df, col_names, ss_prev_state)
        return ss.get_state_score()
Beispiel #7
0
def main():
    """Plot learning curves of uncertainty sampling versus random sampling.

    The dataset next to this script (``diabetes.txt``) is split, both
    strategies are run until the whole training pool has been queried, and
    the two error curves returned by each ``run`` call are drawn against the
    number of queries.
    """
    # Specify the parameters here:
    # path to your binary classification dataset
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(script_dir, 'diabetes.txt')
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled

    # Load dataset
    trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
        split_train_test(dataset_filepath, test_size, n_labeled)
    trn_ds2 = copy.deepcopy(trn_ds)
    lbr = IdealLabeler(fully_labeled_trn_ds)

    # Query until every initially unlabeled training sample has been asked.
    quota = len(y_train) - n_labeled

    # Each strategy gets its own pool and a fresh base learner.
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, LogisticRegression(), qs,
                          quota)

    qs2 = RandomSampling(trn_ds2)
    E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, LogisticRegression(), qs2,
                          quota)

    # x-axis: number of queries, y-axis: corresponding error rate.
    query_num = np.arange(1, quota + 1)
    curves = (
        (E_in_1, 'b', 'qs Ein'),
        (E_in_2, 'r', 'random Ein'),
        (E_out_1, 'g', 'qs Eout'),
        (E_out_2, 'k', 'random Eout'),
    )
    for errors, line_style, legend_label in curves:
        plt.plot(query_num, errors, line_style, label=legend_label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
Beispiel #8
0
    def test_ALBLTestCase(self):
        """ActiveLearningByLearning with seed 1126 yields a fixed sequence."""
        # Ten labeled samples up front, the remainder unlabeled.
        partial_y = np.concatenate(
            [self.y[:10], [None] * (len(self.y) - 10)])
        trn_ds = Dataset(self.X, partial_y)

        # Sub-strategies handed to ALBL, all sharing the same pool.
        candidates = [
            UncertaintySampling(
                trn_ds,
                model=SVM(kernel="linear", decision_function_shape="ovr")),
            QUIRE(trn_ds),
            RandomSampling(trn_ds),
        ]
        albl = ActiveLearningByLearning(
            trn_ds,
            T=self.quota,
            query_strategies=candidates,
            model=SVM(kernel="linear", decision_function_shape="ovr"),
            random_state=1126)

        observed = run_qs(trn_ds, albl, self.y, self.quota)

        expected = np.array([173, 103, 133, 184, 187, 147, 251, 83, 93, 33])
        assert_array_equal(observed, expected)
Beispiel #9
0
def initialQuerySetup(train_dataset,
                      queryStrategyID,
                      queryParams=None,
                      fixRandomState=False):
    """Build the query strategy selected by ``queryStrategyID``.

    Args:
        train_dataset: dataset object handed to the strategy constructor.
        queryStrategyID: selector, compared with ``==``:
            0 RandomSampling, 1 UncertaintySampling, 2 QueryByCommittee,
            3 RandomBatchQuery, 4 LeastCertainBatchQuery,
            5 SemiSupervisedBatchQuery.
        queryParams: indexable strategy arguments; required for IDs 1-5.
            ``queryParams[0]`` is the model (IDs 1, 4, 5), the committee
            models (ID 2) or the batch size (ID 3); ``queryParams[1]`` is
            the batch size for IDs 4 and 5.
        fixRandomState: when true, strategies that accept ``random_state``
            get a fixed, strategy-specific seed; otherwise they get ``None``.

    Returns:
        The constructed query strategy.

    Raises:
        ValueError: if ``queryStrategyID`` matches none of the known IDs
            (previously this surfaced as an UnboundLocalError at ``return``).
    """

    def seed(fixed_value):
        # Only pin the seed when the caller asked for reproducibility.
        return fixed_value if fixRandomState else None

    if queryStrategyID == 0:
        queryStrategy = RandomSampling(train_dataset, random_state=seed(137))

    elif queryStrategyID == 1:
        queryStrategy = UncertaintySampling(train_dataset,
                                            method='sm',
                                            model=queryParams[0])

    elif queryStrategyID == 2:
        queryStrategy = QueryByCommittee(train_dataset,
                                         models=queryParams[0],
                                         disagreement='vote',
                                         random_state=seed(23))

    elif queryStrategyID == 3:
        queryStrategy = RandomBatchQuery(train_dataset,
                                         batch_size=queryParams[0],
                                         random_state=seed(2311))

    elif queryStrategyID == 4:
        queryStrategy = LeastCertainBatchQuery(train_dataset,
                                               model=queryParams[0],
                                               batch_size=queryParams[1],
                                               random_state=seed(2317))

    elif queryStrategyID == 5:
        queryStrategy = SemiSupervisedBatchQuery(train_dataset,
                                                 model=queryParams[0],
                                                 batch_size=queryParams[1],
                                                 random_state=seed(3112))

    else:
        raise ValueError(
            "Unknown queryStrategyID %r; expected an integer from 0 to 5"
            % (queryStrategyID,))

    return queryStrategy
Beispiel #10
0
 def test_quire(self):
     """RandomSampling seeded with 2019 must yield a fixed query order."""
     # Ten known labels followed by exactly ten unlabeled entries.
     labels = np.concatenate([self.y[:10], [None] * 10])
     pool = Dataset(self.X, labels)
     sampler = RandomSampling(pool, random_state=2019)
     observed = run_qs(pool, sampler, self.y, self.quota)
     expected = np.array([18, 12, 19, 16, 10, 11, 14, 13, 15, 17])
     assert_array_equal(observed, expected)
Beispiel #11
0
def main():
    """Average the learning curves of five query strategies over 20 runs.

    Each run draws a fresh split and evaluates uncertainty sampling, random
    sampling, QUIRE, HintSVM and ActiveLearningByLearning, every one on its
    own copy of the training pool with a new linear SVM as the evaluated
    model.  The out-of-sample curves are averaged per strategy and plotted.
    """
    # Specify the parameters here:
    # path to your binary classification dataset
    ds_name = 'australian'
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(script_dir, '%s.txt' % ds_name)
    test_size = 0.33  # the percentage of samples in the dataset that will be
    # randomly selected and assigned to the test set
    n_labeled = 10  # number of samples that are initially labeled

    # curves[k] collects one error curve per experiment for strategy k.
    curves = [[] for _ in range(5)]

    def linear_svm():
        """Fresh linear SVM, so no state is shared between runs."""
        return SVM(kernel='linear', decision_function_shape='ovr')

    for T in range(20):  # repeat the experiment 20 times
        print("%dth experiment" % (T + 1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        trn_ds2, trn_ds3, trn_ds4, trn_ds5 = [
            copy.deepcopy(trn_ds) for _ in range(4)
        ]
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled  # number of samples to query

        def record(slot, pool, strategy):
            """Run ``strategy`` on ``pool`` and store its out-of-sample curve."""
            _, e_out = run(pool, tst_ds, lbr, linear_svm(), strategy, quota)
            curves[slot].append(e_out.tolist())

        record(0, trn_ds,
               UncertaintySampling(trn_ds,
                                   model=SVM(decision_function_shape='ovr')))
        record(1, trn_ds2, RandomSampling(trn_ds2))
        record(2, trn_ds3, QUIRE(trn_ds3))
        record(3, trn_ds4, HintSVM(trn_ds4, cl=1.0, ch=1.0))
        record(4, trn_ds5,
               ActiveLearningByLearning(
                   trn_ds5,
                   query_strategies=[
                       UncertaintySampling(trn_ds5, model=linear_svm()),
                       QUIRE(trn_ds5),
                       HintSVM(trn_ds5, cl=1.0, ch=1.0),
                   ],
                   T=quota,
                   uniform_sampler=True,
                   model=linear_svm()))

    # Mean curve of every strategy across all experiments.
    result = [np.mean(strategy_curves, axis=0) for strategy_curves in curves]

    # x-axis: number of queries, y-axis: corresponding error rate.
    query_num = np.arange(1, quota + 1)
    line_specs = (
        ('g', 'uncertainty sampling'),
        ('k', 'random'),
        ('r', 'QUIRE'),
        ('b', 'HintSVM'),
        ('c', 'ALBL'),
    )
    for mean_errors, (line_style, legend_label) in zip(result, line_specs):
        plt.plot(query_num, mean_errors, line_style, label=legend_label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show()
Beispiel #12
0
def main():
    """Compare six query strategies with simulated human answers.

    Every strategy works on its own copy of the training pool.  For each of
    ``quota`` rounds every strategy picks one sample, the answer comes from
    ``simulate_human_decision``, the shared SVM is retrained on that
    strategy's pool and the test error (``1 - model.score(tst_ds)``) is
    appended to the strategy's curve.  Progress is echoed to the console and
    to a timestamped ``task_<time>.txt`` log, the chart is refreshed after
    every round and finally saved as ``task_<time>.png``.

    Changes from the previous revision:
      * the y-axis is labelled 'Error' (the plotted values are 1 - score);
      * the log-file headings for strategies 2 and 3 now name the strategy
        actually used, matching the console output;
      * the tweet text and the simulated answer are fetched once per query,
        so the logged answer is the one applied to the dataset;
      * the log file is closed even when a query raises.
    """
    global pos_filepath, dataset_filepath, csv_filepath, vectors_list, ids_list
    dataset_filepath = "/Users/dndesign/Desktop/active_learning/vecteurs_et_infos/vectors_2015.txt"
    csv_filepath = "/Users/dndesign/Desktop/active_learning/donnees/corpus_2015_id-time-text.csv"
    pos_filepath = "/Users/dndesign/Desktop/active_learning/donnees/oriane_pos_id-time-text.csv"
    vectors_list, ids_list = get_vectors_list(dataset_filepath)

    timestr = time.strftime("%Y%m%d_%H%M%S")

    with codecs.open("task_" + str(timestr) + ".txt", "w", "utf-8") as text_file:
        print("Loading data...")
        text_file.write("Loading data...\n")
        t0 = time.time()
        # NOTE(review): whatever openfile_txt returns is only iterated here
        # and never closed -- confirm whether it needs closing.
        dataset_lines = openfile_txt(dataset_filepath)
        num_lines = sum(1 for _ in dataset_lines)
        print("Treating " + str(num_lines) + " entries...")
        text_file.write("Treating : %s entries...\n" % str(num_lines))

        # Number of queries to ask human to label
        quota = 10
        trn_ds, tst_ds = split_train_test(csv_filepath)

        # One evaluation model, retrained before every measurement.
        model = SVM(kernel='linear')

        strategies = []

        def add_strategy(title, legend, color, dataset, query_strategy):
            """Register a strategy and record its error before any query."""
            model.train(dataset)
            strategies.append({
                'title': title,
                'legend': legend,
                'color': color,
                'dataset': dataset,
                'qs': query_strategy,
                'errors': [1 - model.score(tst_ds)],
                'line': None,
            })

        # Uncertainty sampling, least confident: queries the instance whose
        # posterior probability of being positive is nearest 0.5.
        add_strategy(
            "Using Uncertainty Sampling (Least confident) :",
            'Least Confident', 'red', trn_ds,
            UncertaintySampling(trn_ds, method='lc',
                                model=LogisticRegression(C=.01)))

        # Uncertainty sampling, max margin.
        trn_ds2 = copy.deepcopy(trn_ds)
        add_strategy(
            "Using Uncertainty Sampling (Max Margin) :",
            'Max Margin', 'blue', trn_ds2,
            USampling(trn_ds2, method='mm', model=SVM(kernel='linear')))

        # CMB sampling: combination of distance-based (DIST) and
        # diversity-based (DIV) active learning.
        trn_ds3 = copy.deepcopy(trn_ds)
        add_strategy(
            "Using CMB Distance-Diversity Sampling :",
            'Distance Diversity CMB', 'green', trn_ds3,
            CMBSampling(trn_ds3, model=SVM(kernel='linear')))

        # Random sampling: picks the next query at random.
        trn_ds4 = copy.deepcopy(trn_ds)
        add_strategy(
            "Using Random Sampling :",
            'Random Sampling', 'orange', trn_ds4,
            RandomSampling(trn_ds4, random_state=1126))

        # Query by committee, vote entropy: queries the instance the
        # committee members disagree on; does not consider the members'
        # class distributions.
        trn_ds6 = copy.deepcopy(trn_ds)
        add_strategy(
            "Using QueryByCommittee (Vote Entropy) :",
            'Vote Entropy', 'black', trn_ds6,
            QueryByCommittee(trn_ds6, disagreement='vote',
                             models=[LogisticRegression(C=1.0),
                                     LogisticRegression(C=0.01),
                                     LogisticRegression(C=100)],
                             random_state=1126))

        # Query by committee, Kullback-Leibler divergence.
        trn_ds7 = copy.deepcopy(trn_ds)
        add_strategy(
            "Using QueryByCommittee (KL Divergence) :",
            'KL Divergence', 'purple', trn_ds7,
            QueryByCommittee(trn_ds7, disagreement='kl_divergence',
                             models=[LogisticRegression(C=1.0),
                                     LogisticRegression(C=0.01),
                                     LogisticRegression(C=100)],
                             random_state=1126))

        with sns.axes_style("darkgrid"):
            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)

        query_num = np.arange(0, 1)
        for entry in strategies:
            (line,) = ax.plot(query_num, entry['errors'], entry['color'])
            entry['line'] = line
        plt.legend(tuple(entry['legend'] for entry in strategies), loc=1)
        # The curves hold 1 - score, i.e. the error rate, not the accuracy.
        plt.ylabel('Error')
        plt.xlabel('Number of Queries')
        plt.title('Active Learning - Query choice strategies')
        plt.ylim([0, 1])
        plt.show(block=False)

        for i in range(quota):
            print("\n#################################################")
            print("Query number " + str(i) + " : ")
            print("#################################################\n")
            text_file.write("\n#################################################\n")
            text_file.write("Query number %s : " % str(i))
            text_file.write("\n#################################################\n")

            for entry in strategies:
                ask_id = entry['qs'].make_query()
                # Fetch once so console, log and dataset agree.
                tweet = define_tweet_by_id(ask_id)
                decision = simulate_human_decision(ask_id)

                # ANSI escape codes underline the heading on the console.
                print("\033[4m" + entry['title'] + "\033[0m")
                print("Tweet :" + tweet, end='', flush=True)
                print("Simulating human response : " + str(decision) + " \n")
                text_file.write(entry['title'] + "\n")
                text_file.write("Tweet : %s \n" % str(tweet))
                text_file.write("Simulating human response : %s \n\n" % str(decision))

                entry['dataset'].update(ask_id, decision)
                model.train(entry['dataset'])
                entry['errors'].append(1 - model.score(tst_ds))

            # Stretch the axes and push the longer curves into the lines.
            ax.set_xlim((0, i + 1))
            ax.set_ylim((0, max(max(entry['errors']) for entry in strategies) + 0.2))
            query_num = np.arange(0, i + 2)
            for entry in strategies:
                entry['line'].set_xdata(query_num)
                entry['line'].set_ydata(entry['errors'])

            plt.draw()

        time_total = time.time() - t0
        print("\n\n\n#################################################\n")
        print("Execution time : %fs \n\n" % time_total)
        text_file.write("\n\n\n#################################################\n")
        text_file.write("Execution time : %fs \n" % time_total)

    input("Press any key to save the plot...")
    plt.savefig('task_' + str(timestr) + '.png')

    print("Done")
0
def main():
    """Interactively compare uncertainty sampling against random sampling.

    A human labels ``quota`` samples for each strategy through
    ``InteractiveLabeler``.  After every answer the shared model is retrained
    on that strategy's pool and its test error is appended to the strategy's
    curve; the upper subplot is redrawn once per round.
    """
    quota = 10  # ask human to label 10 samples
    n_classes = 5

    trn_ds, tst_ds, ds = split_train_test(n_classes)
    trn_ds2 = copy.deepcopy(trn_ds)
    qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression())
    qs2 = RandomSampling(trn_ds2)

    model = LogisticRegression()

    def test_error(pool):
        """Retrain the shared model on ``pool`` and return 1 - test score."""
        model.train(pool)
        return 1 - model.score(tst_ds)

    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    ax.set_xlabel('Number of Queries')
    ax.set_ylabel('Error')

    # Error curves, seeded with the error before any query is answered.
    error_curves = [
        np.append([], test_error(trn_ds)),
        np.append([], test_error(trn_ds2)),
    ]

    query_num = np.arange(0, 1)
    p1, = ax.plot(query_num, error_curves[0], 'g', label='qs Eout')
    p2, = ax.plot(query_num, error_curves[1], 'k', label='random Eout')
    plt.legend(loc='upper center',
               bbox_to_anchor=(0.5, -0.05),
               fancybox=True,
               shadow=True,
               ncol=5)
    plt.show(block=False)

    # Second subplot, shrunk and shifted down slightly.
    img_ax = fig.add_subplot(2, 1, 2)
    box = img_ax.get_position()
    img_ax.set_position(
        [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9])
    # Give each label its name (labels are from 0 to n_classes-1)
    label_names = [str(lbl) for lbl in range(n_classes)]
    lbr = InteractiveLabeler(label_name=label_names)

    # (console message, strategy, pool, plotted line) in the order queried.
    arms = (
        ("asking sample from Uncertainty Sampling", qs, trn_ds, p1),
        ("asking sample from Random Sample", qs2, trn_ds2, p2),
    )

    for step in range(quota):
        for slot, (message, strategy, pool, _) in enumerate(arms):
            ask_id = strategy.make_query()
            print(message)
            # reshape the image to its width and height
            lb = lbr.label(pool.data[ask_id][0].reshape(8, 8))
            pool.update(ask_id, lb)
            error_curves[slot] = np.append(error_curves[slot],
                                           test_error(pool))

        # Grow the axes with the data and refresh both lines.
        ax.set_xlim((0, step + 1))
        highest = max(max(error_curves[0]), max(error_curves[1]))
        ax.set_ylim((0, highest + 0.2))
        query_num = np.arange(0, step + 2)
        for slot, (_, _, _, line) in enumerate(arms):
            line.set_xdata(query_num)
            line.set_ydata(error_curves[slot])

        plt.draw()

    input("Press any key to continue...")
    L_test = get_label(D_test, landmark, threshold)
    testset = Dataset(D_test, L_test)

    sigma = np.mean(pairwise_distances(D0))

    qs = EpsilonMarginSampling(
        dataset,  # Dataset object
        model=GPC(RBF(1), optimizer=None),
        margin=margin)

    qs1 = UncertSampling(
        dataset,  # Dataset object
        model=GPC(RBF(1), optimizer=None),
        method='sm')

    qs2 = RandomSampling(dataset)

    center0 = np.mean(D[L == 1], axis=0)
    center = center0
    bounds_old = np.vstack((np.min(D0, axis=0), np.max(D0, axis=0)))
    i = 0
    clf = GPC(RBF(1), optimizer=None)

    while i < n_iter + 1:

        print 'Iteration: %d/%d' % (i, n_iter)

        # Generate a pool and expand dataset
        pool, bounds_new = expand_pool(D, bounds_old, expansion_rate)
        for entry in pool:
            dataset.append(entry)
Beispiel #15
0
def getQueryStrategy(query_strategy,
                     train_ds,
                     disagreement,
                     estimator_name=None):
    """Build the query strategy selected by ``query_strategy``.

    Args:
        query_strategy: Name of the strategy to build. Supported names:
            ``'uncertainty'`` (UncertaintySampling with method ``'lc'``),
            ``'random'`` (RandomSampling), ``'homogeneous_committee'``
            (QueryByCommittee over ``CommitteeModels(estimator_name)``),
            and the heterogeneous committees ``'lr_lsvc_rf_dt'``,
            ``'lr_svc_rf_dt'``, ``'lr_svc_dt_xgb'``, ``'lr_svc_dt_xgb_rf'``,
            ``'lr_lsvc_dt_gpc'`` and ``'lr_lsvc_dt_xgb'``.
        train_ds: Dataset object handed to the strategy constructor.
        disagreement: Forwarded as ``disagreement=`` to QueryByCommittee
            for the heterogeneous committees. It is NOT forwarded for
            ``'homogeneous_committee'``, which therefore uses whatever
            default QueryByCommittee applies.
        estimator_name: Passed to ``CommitteeModels``; only read for
            ``'homogeneous_committee'``.

    Returns:
        The constructed query strategy, or ``None`` (after printing a
        message) when ``query_strategy`` is not one of the names above.

    Raises:
        ValueError: If ``disagreement`` is ``'kl_divergence'`` and the
            requested committee contains LinearSVC.
    """
    print('Initialize Query Strategy')

    # Committees that include LinearSVC: per the error message below, that
    # model has no predict_proba(), so KL divergence cannot be used with them.
    lsvc_committees = ('lr_lsvc_rf_dt', 'lr_lsvc_dt_gpc', 'lr_lsvc_dt_xgb')
    if query_strategy in lsvc_committees and disagreement == 'kl_divergence':
        # Built with implicit literal concatenation; a backslash continuation
        # inside the literal would embed the source indentation in the text.
        raise ValueError(
            'when using kl_divergence lsvc cannot be in the committee as '
            'linearSVC does not provide predict_proba(). '
            'Use svc instead or change disagreement to vote!')

    # Baseline strategies without a committee.
    if query_strategy == 'uncertainty':
        return UncertaintySampling(train_ds,
                                   method='lc',
                                   model=la.LogisticRegression_())
    if query_strategy == 'random':
        return RandomSampling(train_ds)

    if query_strategy == 'homogeneous_committee':
        committee = CommitteeModels(estimator_name)
        return QueryByCommittee(train_ds, models=committee.committee['models'])

    def _logreg():
        return la.LogisticRegression_(solver='liblinear', max_iter=1000)

    def _svc():
        # SVC with probability=True is used where a probabilistic
        # replacement for LinearSVC is needed.
        return la.SVC_(kernel='linear', probability=True)

    def _xgb():
        return la.XGBClassifier_(objective="binary:logistic")

    # Heterogeneous committees. Each entry is a factory so that models are
    # only instantiated for the committee that was actually requested.
    committees = {
        'lr_lsvc_rf_dt': lambda: [
            la.RandomForest_(),
            la.DecisionTree_(),
            _logreg(),
            la.LinearSVC_(),
        ],
        'lr_svc_rf_dt': lambda: [
            la.RandomForest_(),
            la.DecisionTree_(),
            _logreg(),
            _svc(),
        ],
        'lr_svc_dt_xgb': lambda: [
            _logreg(),
            _svc(),
            la.DecisionTree_(),
            _xgb(),
        ],
        # committee of five
        'lr_svc_dt_xgb_rf': lambda: [
            _logreg(),
            _svc(),
            la.DecisionTree_(),
            _xgb(),
            la.RandomForest_(),
        ],
        'lr_lsvc_dt_gpc': lambda: [
            _logreg(),
            la.LinearSVC_(),
            la.DecisionTree_(),
            la.GaussianProcess_(),
        ],
        'lr_lsvc_dt_xgb': lambda: [
            _logreg(),
            la.LinearSVC_(),
            la.DecisionTree_(),
            _xgb(),
        ],
    }

    build_models = committees.get(query_strategy)
    if build_models is None:
        print("Query strategy not defined!")
        return None
    return QueryByCommittee(train_ds,
                            models=build_models(),
                            disagreement=disagreement)
Beispiel #16
0
def run_featureselection(trn_dss,
                         tst_ds,
                         y_train,
                         model,
                         method_,
                         qs,
                         X_test,
                         y_test,
                         all_cols,
                         save_name,
                         save,
                         type_,
                         part=20):
    """
    Batch active learning algorithm with feature selection.

    Each round re-selects features from the currently labeled entries,
    rebuilds a column-reduced copy of the training dataset, creates a fresh
    query strategy on it, labels up to ``part`` queried samples from
    ``y_train`` and evaluates the retrained model on the test data. Rounds
    repeat until every entry of ``trn_dss`` is labeled.

    Args:
        trn_dss: Training dataset; queried labels are written into it.
        tst_ds: Not used by this function; kept for interface compatibility.
        y_train: Ground-truth labels, indexed by the queried entry id.
        model: Model given to the query strategy. For ``type_ == 'qbc'`` it
            is passed as ``models=`` to QueryByCommittee.
        method_: ``method=`` argument for the ``'random'``/``'unc'``
            strategies. For ``type_ == 'qbc'`` it is instead the model that
            is trained and evaluated each round.
        qs: Not used; a new query strategy is built every round because the
            selected columns change.
        X_test: Test feature rows; each row is indexed with the selected
            column index.
        y_test: Test labels used for the F1 score and confusion matrix.
        all_cols: Forwarded to ``feature_selection``.
        save_name: CSV path written when ``save`` is truthy.
        save: Whether to dump the per-round results to ``save_name``.
        type_: One of ``'random'``, ``'unc'``, ``'qbc'``, ``'dens'``.
        part: Maximum number of samples labeled per round (>= 1).

    Returns:
        Tuple ``(q, iterations, f1score, tn, fp, fn, tp, k, trn_dss.data,
        label_holder, asked_id, features_ls)`` where the list-valued items
        hold one element per round.

    Raises:
        ValueError: If ``type_`` is not a supported strategy name or
            ``part`` is smaller than 1.
    """
    # Fail before touching any dataset: an unknown type_ used to surface as
    # an UnboundLocalError after samples had already been labeled, and a
    # non-positive part made the loop below spin forever.
    if type_ not in ('random', 'unc', 'qbc', 'dens'):
        raise ValueError(
            "type_ must be one of 'random', 'unc', 'qbc', 'dens'; got %r" %
            (type_, ))
    if part < 1:
        raise ValueError('part must be at least 1; got %r' % (part, ))

    f1score = []
    features_ls = []
    label_holder, asked_id = [], []
    tn, fp, fn, tp = [], [], [], []

    k = trn_dss.len_labeled()
    k_beg = k
    quota = len(trn_dss.data)
    iter_ = 0

    while k < quota:
        clear_output(wait=True)

        # Never ask for more samples than are still unlabeled.
        batch_size = min(part, trn_dss.len_unlabeled())

        # -------------------> Feature Selection
        # select features using only the entries labeled so far
        labeled_entries = trn_dss.get_labeled_entries()
        X_train_feature = [entry[0] for entry in labeled_entries]
        y_train_feature = [entry[1] for entry in labeled_entries]
        col_index, features_f = feature_selection(X_train_feature,
                                                  y_train_feature,
                                                  all_cols,
                                                  f_class=True)
        features_ls.append(features_f)

        # rebuild the training set and test rows restricted to the
        # currently selected columns
        trn_dss_updated = Dataset(
            [entry[0][col_index] for entry in trn_dss.data],
            [entry[1] for entry in trn_dss.data])
        X_test_feature = [row[col_index] for row in X_test]

        # The strategy is bound to the reduced dataset, so it has to be
        # recreated every round.
        if type_ == 'random':
            qs = RandomSampling(trn_dss_updated, method=method_, model=model)
            model1 = model
        elif type_ == 'unc':
            qs = UncertaintySampling(trn_dss_updated,
                                     method=method_,
                                     model=model)
            model1 = model
        elif type_ == 'qbc':
            # For a committee, `model` holds the committee members and
            # `method_` is the model that gets trained and scored.
            qs = QueryByCommittee(trn_dss_updated, models=model)
            model1 = method_
        else:  # 'dens' (validated above)
            qs = DWUS(trn_dss_updated, model=model)
            model1 = model

        lbls, asks = [], []
        for _ in range(batch_size):
            # make_query returns the id of the sample the strategy wants
            # labeled next
            ask_id = qs.make_query()
            lb = y_train[ask_id]
            asks.append(ask_id)
            lbls.append(lb)
            # keep the full and the reduced dataset in sync
            trn_dss.update(ask_id, lb)
            trn_dss_updated.update(ask_id, lb)

        label_holder.append(lbls)
        asked_id.append(asks)

        # train on the labeled examples with the chosen columns, then predict
        model1.train(trn_dss_updated)
        pred_y = model1.predict(X_test_feature)

        # compute each metric once per round
        current_f1 = f1_score(y_test, pred_y)
        cm = confusion_matrix(y_test, pred_y)
        f1score.append(current_f1)
        tn.append(cm[0][0])
        fp.append(cm[0][1])
        fn.append(cm[1][0])
        tp.append(cm[1][1])

        k = trn_dss_updated.len_labeled()
        labeled_xy = trn_dss.format_sklearn()
        print(k)
        print(quota)
        print('iteration:', iter_)
        print(len(f1score))
        print('train dataset labeled:', trn_dss.len_labeled())
        print('train dataset shape:', labeled_xy[0].shape)
        print('train dataset sum:', labeled_xy[1].sum())
        print('Current f1 score:', current_f1)
        print('Current progress:', np.round(k / quota * 100, 2), '%')
        print('Chosen_features:', features_f)

        # number of iterations
        iter_ += 1

    # labeled-count at the start of each round and the round indices
    q = list(range(k_beg, quota, part))
    iterations = list(range(len(f1score)))

    if save:
        saved_file = pd.DataFrame({
            'iter': iterations,
            'quota': q,
            'f1_score': f1score,
            'tn': tn,
            'fp': fp,
            'fn': fn,
            'tp': tp,
            'id_index': asked_id,
            'label': label_holder,
            'features': features_ls
        })
        saved_file.to_csv(save_name)

    return (q, iterations, f1score, tn, fp, fn, tp, k, trn_dss.data,
            label_holder, asked_id, features_ls)
Beispiel #17
0
def main():
    """Compare six multilabel query strategies and plot their mean loss.

    The experiment is repeated ten times. In every repetition each strategy
    starts from its own copy of the same initial training pool, is run for
    ``quota`` queries, and the second value returned by ``run`` is recorded.
    The recorded curves are averaged over the repetitions, printed (every
    fifth point) and plotted against the number of queries.
    """
    # the percentage of samples in the dataset that will be randomly
    # selected and assigned to the test set
    test_size = 0.25
    quota = 150  # number of samples to query
    n_repeats = 10

    def _aux_learner(criterion):
        # Factory for MultilabelWithAuxiliaryLearner with the given criterion.
        def build(dataset):
            return MultilabelWithAuxiliaryLearner(
                dataset,
                BinaryRelevance(LogisticRegression()),
                BinaryRelevance(SVM()),
                criterion=criterion)

        return build

    # (result key, query-strategy factory, printed name, line style, legend)
    strategies = [
        ('E1', lambda dataset: MMC(dataset, br_base=LogisticRegression()),
         "MMC: ", 'g', 'MMC'),
        ('E2', RandomSampling, "Random: ", 'k', 'Random'),
        ('E3', _aux_learner('hlr'), "MultilabelWithAuxiliaryLearner_hlr: ",
         'r', 'AuxiliaryLearner_hlr'),
        ('E4', _aux_learner('shlr'), "MultilabelWithAuxiliaryLearner_shlr: ",
         'b', 'AuxiliaryLearner_shlr'),
        ('E5', _aux_learner('mmr'), "MultilabelWithAuxiliaryLearner_mmr: ",
         'c', 'AuxiliaryLearner_mmr'),
        ('E6',
         lambda dataset: BinaryMinimization(dataset, LogisticRegression()),
         "BinaryMinimization: ", 'm', 'BinaryMinimization'),
    ]
    result = {spec[0]: [] for spec in strategies}

    for _ in range(n_repeats):  # repeat experiment
        trn_ds, tst_ds, fully_labeled_trn_ds = split_train_test(test_size)
        # The first strategy works on the original pool, every other one on
        # a deep copy taken before any labels are added.
        pools = [trn_ds]
        pools.extend(copy.deepcopy(trn_ds) for _spec in strategies[1:])
        lbr = IdealLabeler(fully_labeled_trn_ds)
        model = BinaryRelevance(LogisticRegression())

        for spec, pool in zip(strategies, pools):
            key, make_qs = spec[0], spec[1]
            # Build the strategy right before its run, one after another.
            query_strategy = make_qs(pool)
            _, e_out = run(pool, tst_ds, lbr, model, query_strategy, quota)
            result[key].append(e_out)

    # Average each strategy's curve over the repetitions.
    mean_curves = [np.mean(result[spec[0]], axis=0) for spec in strategies]

    for spec, curve in zip(strategies, mean_curves):
        print(spec[2], curve[::5].tolist())

    query_num = np.arange(1, quota + 1)
    plt.figure(figsize=(9, 6))
    ax = plt.subplot(111)
    for spec, curve in zip(strategies, mean_curves):
        ax.plot(query_num, curve, spec[3], label=spec[4])

    # Shrink the axes horizontally to make room for the legend on the right.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.75, box.height])
    plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
    plt.xlabel('Number of Queries')
    plt.ylabel('Loss')
    plt.title('Experiment Result (Hamming Loss)')
    plt.show()