def svm_learner(budget, margin=0.78, candidate_limit=2543):
    """Actively train a linear SVM on the compound pool and plot its score history.

    Phase 1 (seeding): unseen compounds returned by ``compound.next_compound``
    are labelled through ``oracle.oracle2`` until the labelled set holds at
    least one positive (label 1) and at least one other example, which is the
    minimum a two-class SVM needs.

    Phase 2 (margin sampling): further unseen compounds are drawn; only those
    whose decision-function value lies within ``margin`` of the separating
    hyperplane are sent to the oracle, and the model is refit after each query.

    Args:
        budget: Upper bound on the total number of oracle queries checked in
            phase 2. Seed queries count toward it, but the seed phase itself
            is not cut short by it.
        margin: Largest absolute decision-function value for which a compound
            is still queried.
        candidate_limit: Number of unseen compounds phase 2 may examine is
            ``candidate_limit`` minus the compounds already labelled while
            seeding.

    Returns:
        None. The score history is plotted and the F1 score on the test set
        is printed.

    NOTE(review): labels are assumed to be 0/1 -- confirm against the oracle.
    NOTE(review): a second ``svm_learner`` defined later in this file shadows
    this definition when both live in the same module.
    """
    def compound_key(compound_row):
        # Text form of the integer features with the enclosing brackets
        # stripped; used as the dictionary key for "already labelled".
        text = np.array_str(np.char.mod('%d', compound_row))
        return text[1:-1]

    accuracy = []
    data = csv_reader('resources/pool.csv')
    testset = csv_reader('resources/testSet.csv')
    true_labels = oracle.read_mat()
    used = {}

    # No model exists during seeding, so the all-zero prediction is scored.
    preds = np.zeros(data.shape[0])
    selected = []
    labels = []
    query = 0
    while True:
        r = compound.next_compound(data)
        key = compound_key(r)
        if key in used:
            continue
        r_label = oracle.oracle2(r, data)
        query += 1
        used[key] = r_label
        selected.append(r.tolist())
        labels.append(r_label)
        # Stop as soon as both classes are present.  Testing for exactly one
        # positive would never terminate when the first two draws are both
        # positive.
        positives = np.sum(labels)
        if 0 < positives < len(labels):
            break
        accuracy.append(error.generalization_error(preds, true_labels))

    x = np.array(selected)
    y = np.array(labels)
    clf = SVC(kernel='linear')
    clf.fit(x, y)
    preds = clf.predict(data)
    accuracy.append(error.generalization_error(preds, true_labels))

    num = candidate_limit - len(used)
    i = 0
    while i < num and query < budget:
        r = compound.next_compound(data)
        key = compound_key(r)
        if key in used:
            continue
        i += 1
        # decision_function expects a 2-D (n_samples, n_features) array.
        distance = clf.decision_function(np.atleast_2d(r))
        if np.abs(distance[0]) > margin:
            continue
        r_label = oracle.oracle2(r, data)
        # Remember the compound so it is never queried (and paid for) twice.
        used[key] = r_label
        query += 1
        x = np.vstack([x, r])
        y = np.hstack([y.tolist(), r_label])
        clf.fit(x, y)
        preds = clf.predict(testset)
        accuracy.append(error.test_error(preds, true_labels))
    plt.plot(accuracy)
    plt.show()
    # Always score test-set predictions: if phase 2 made no query, ``preds``
    # still holds pool predictions whose length does not match the slice.
    # NOTE(review): the slice assumes the test set has 250 rows -- confirm.
    print(f1_score(clf.predict(testset), true_labels[0:250]))
    return
def svm_learner(option, budget=256):
    """Train a linear SVM by pool-based active learning and return its score history.

    Random pool rows are labelled until the labelled set contains at least one
    positive (label 1) and at least one other example.  A linear SVM is then
    fit and refit after every additional labelled row until ``budget`` rows
    have been labelled in total.

    Args:
        option: ``'rand'`` picks each additional row uniformly among the rows
            not yet used; any other value delegates the choice to
            ``get_next``.
        budget: Total number of pool rows to label, seed rows included.

    Returns:
        list: one ``err.generalization_error`` value per labelled row, except
        that the seed row completing the two-class set and the first model
        fit share a single entry.

    NOTE(review): labels are assumed to be 0/1 -- confirm against
    ``oracle.read_mat``.
    """
    accuracy = []
    data = pool_reader()
    row = data.shape[0]
    true_labels = oracle.read_mat()

    # No model exists during seeding, so the all-zero prediction is scored.
    preds = np.zeros(row)
    used = set()
    selected = []
    labels = []
    while True:
        r = random.randint(0, row - 1)
        if r in used:
            continue
        used.add(r)
        selected.append(data[r].tolist())
        labels.append(true_labels[r])
        # Stop as soon as both classes are present.  Testing for exactly one
        # positive would never terminate when the first two draws are both
        # positive.
        positives = np.sum(labels)
        if 0 < positives < len(labels):
            break
        accuracy.append(err.generalization_error(preds, true_labels))

    X = np.array(selected)
    y = np.array(labels)
    clf = SVC(kernel='linear')
    clf.fit(X, y)
    preds = clf.predict(data)
    accuracy.append(err.generalization_error(preds, true_labels))
    for _ in range(budget - len(used)):
        if option == 'rand':
            # Random selection strategy: first unused row drawn.
            while True:
                cur = random.randint(0, row - 1)
                if cur not in used:
                    break
        else:
            # Positions (within the labelled set) of the examples labelled 0.
            # NOTE(review): the original comment spoke of the previous
            # "1 active" examples while the code selects label 0 -- confirm
            # which one get_next expects.
            active = np.where(y == 0)[0].tolist()
            cur = get_next(data, active, used)
            print('oracle %s' % (true_labels[cur],))
        used.add(cur)
        X = np.vstack([X, data[cur]])
        y = np.hstack([y.tolist(), [true_labels[cur]]])
        clf.fit(X, y)
        preds = clf.predict(data)
        accuracy.append(err.generalization_error(preds, true_labels))

    print(f1_score(preds, true_labels))
    return accuracy
def svm_margin_learner(budget=256):
    """Train a linear SVM with margin-based active learning over the pool.

    Random pool rows are labelled until the labelled set contains at least one
    positive (label 1) and at least one other example.  After that, each step
    labels the unused pool row whose decision-function value is closest to
    zero and refits the model, until ``budget`` rows have been labelled.

    Args:
        budget: Total number of pool rows to label, seed rows included.

    Returns:
        list: one ``err.generalization_error`` value per labelled row, except
        that the seed row completing the two-class set and the first model
        fit share a single entry.

    NOTE(review): labels are assumed to be 0/1 -- confirm against
    ``oracle.read_mat``.
    """
    accuracy = []
    data = pool_reader()
    row = data.shape[0]
    true_labels = oracle.read_mat()

    # No model exists during seeding, so the all-zero prediction is scored.
    preds = np.zeros(row)
    used = set()
    selected = []
    labels = []
    while True:
        r = random.randint(0, row - 1)
        if r in used:
            continue
        used.add(r)
        selected.append(data[r].tolist())
        labels.append(true_labels[r])
        # Stop as soon as both classes are present.  Testing for exactly one
        # positive would never terminate when the first two draws are both
        # positive.
        positives = np.sum(labels)
        if 0 < positives < len(labels):
            break
        accuracy.append(err.generalization_error(preds, true_labels))

    X = np.array(selected)
    y = np.array(labels)

    clf = SVC(kernel='linear')
    clf.fit(X, y)
    preds = clf.predict(data)
    accuracy.append(err.generalization_error(preds, true_labels))
    for _ in range(budget - len(used)):
        # Rank every pool row by its absolute decision value and take the
        # first one that has not been labelled yet.
        distance = clf.decision_function(data)
        for candidate in np.argsort(np.abs(distance)):
            if candidate not in used:
                cur = candidate
                break
        print('oracle %s' % (true_labels[cur],))
        used.add(cur)
        X = np.vstack([X, data[cur]])
        y = np.hstack([y.tolist(), [true_labels[cur]]])
        clf.fit(X, y)
        preds = clf.predict(data)
        accuracy.append(err.generalization_error(preds, true_labels))
    print(f1_score(preds, true_labels))
    return accuracy
def svm_learner_all():
    """Fit a linear SVM on the whole labelled pool and report how it scores.

    The model is trained on every pool row with the labels returned by
    ``oracle.read_mat`` and then predicts that same pool.  The value of
    ``err.generalization_error`` and the F1 score are printed.

    Returns:
        The value returned by ``err.generalization_error`` for the
        predictions on the pool.
    """
    pool = pool_reader()
    truth = oracle.read_mat()

    model = SVC(kernel='linear')
    model.fit(np.array(pool), np.array(truth))

    # Evaluation happens on the training data itself.
    pool_preds = model.predict(pool)
    accuracy = err.generalization_error(pool_preds, truth)

    print(accuracy)
    print(f1_score(pool_preds, truth))
    return accuracy
def rfc_learner(option, budget=256):
    """Train a random forest by pool-based active learning and plot its score history.

    Each step labels one pool row through ``oracle.oracle1``, refits a
    10-tree entropy forest on all labelled rows and records the value of
    ``err.generalization_error`` for its predictions on the whole pool.

    Args:
        option: ``'select'`` samples unused rows at random until a row with
            label 1 has been seen, then picks the unused row whose
            first-column class probability is closest to 0.5.  Any other
            value samples unused rows at random throughout.
        budget: Number of rows to label.

    Returns:
        list: one score per labelled row.
    """
    accuracy = []
    data = pool_reader()
    true_labels = oracle.read_mat()
    row_size, col_size = data.shape
    points = np.empty([0, col_size])
    labels = []
    used = set()
    seeding = True  # stays True until a row labelled 1 has been seen
    predictions = np.zeros(row_size)
    for _ in range(budget):
        if option == 'select' and not seeding:
            # Uncertainty sampling: a fresh forest ranks the pool by how
            # close the first-column class probability is to 0.5.
            clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
            clf.fit(points, np.array(labels))
            prob = clf.predict_proba(data)
            weight = np.abs(prob[:, 0] - 0.5)
            for candidate in np.argsort(weight):
                if candidate not in used:
                    pick = candidate
                    break
        else:
            # Random pick.  Rows already labelled are rejected in every mode
            # so the training set never contains the same row twice.
            while True:
                pick = random.sample(range(row_size), 1)[0]
                if pick not in used:
                    break

        used.add(pick)
        points = np.vstack([points, data[pick]])
        # Ask the oracle exactly once per labelled row.
        label = oracle.oracle1(true_labels, pick)
        if label == 1:
            seeding = False
        labels.append(label)
        clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
        clf.fit(points, np.array(labels))
        predictions = clf.predict(data)
        accuracy.append(err.generalization_error(predictions, true_labels))
    plt.plot(accuracy)
    plt.show()
    print('f1  %s' % (f1_score(predictions, true_labels),))
    return accuracy
def lrc_learner(option, budget=256):
    """Train a logistic regression by pool-based active learning and plot its scores.

    Each step labels one pool row through ``oracle.oracle1``.  Rows are drawn
    at random among the unused ones until the labelled set contains two
    distinct classes, because ``LogisticRegression.fit`` rejects single-class
    data.

    Args:
        option: ``"select"`` switches, once both classes are present, to
            picking the unused row whose first-column class probability is
            closest to 0.5; rows labelled before that point produce no score
            entry.  Any other value keeps sampling at random and records a
            score from the step on which the second class first appears.
        budget: Number of rows to label.

    Returns:
        list: the ``err.generalization_error`` values recorded as described
        above.
    """
    accuracy = []
    data = pool_reader()
    true_labels = oracle.read_mat()
    row_size, col_size = data.shape
    predictions = np.zeros(row_size)
    points = np.empty([0, col_size])
    labels = []
    used = set()
    trainable = False  # True once two distinct classes have been labelled
    for _ in range(budget):
        if option == "select" and trainable:
            # Uncertainty sampling on the model fit to the current labels.
            clf = LogisticRegression()
            clf.fit(points, np.array(labels))
            prob = clf.predict_proba(data)
            weight = np.abs(prob[:, 0] - 0.5)
            pick = -1  # fallback kept from the original when no row is unused
            for candidate in np.argsort(weight):
                if candidate not in used:
                    pick = candidate
                    break
        else:
            while True:
                pick = random.sample(range(row_size), 1)[0]
                if pick not in used:
                    break

        used.add(pick)
        points = np.vstack([points, data[pick]])
        labels.append(oracle.oracle1(true_labels, pick))

        if not trainable:
            # Waiting only for the first label 1 is not enough: if no other
            # class has been drawn yet, fitting would raise ValueError.
            trainable = np.unique(labels).size > 1
            if option == "select" or not trainable:
                # "select" never scores its seed picks; random mode simply
                # cannot fit a model yet.
                continue

        clf = LogisticRegression()
        clf.fit(points, np.array(labels))
        predictions = clf.predict(data)
        accuracy.append(err.generalization_error(predictions, true_labels))
    plt.plot(accuracy)
    plt.show()
    print('f1  %s' % (f1_score(predictions, true_labels),))
    return accuracy