def svm_learner(budget):
    """Stream-based active learner that queries compounds near a linear SVM margin.

    Phase 1 (warm-up) labels every previously unseen compound returned by
    ``compound.next_compound`` until the labelled set holds two distinct
    labels, then fits a linear SVM on it.  Phase 2 keeps drawing compounds and
    asks the oracle only for those whose absolute decision value is at most
    0.78, refitting and recording ``error.test_error`` after every query.
    The recorded curve is plotted and the F1 score of the last predictions
    against ``true_labels[0:250]`` is printed.

    NOTE(review): a second ``svm_learner`` definition appears later in this
    source; if both live in the same module the later one shadows this one.

    Args:
        budget: Upper bound on the oracle-query counter, enforced in phase 2
            only.  Warm-up queries increment the counter but are not capped.

    Returns:
        None.
    """
    accuracy = []
    data = csv_reader('resources/pool.csv')
    testset = csv_reader('resources/testSet.csv')
    true_labels = oracle.read_mat()
    used = {}

    # Until a model exists, every pool row is "predicted" as 0.
    preds = np.zeros(data.shape[0])
    selected = []
    labels = []
    query = 0

    # Warm-up: label unseen compounds until two distinct labels are present.
    # The earlier test (sum of labels == 1) could never become true again
    # once the first two labels were both 1, leaving the loop spinning.
    while True:
        r = compound.next_compound(data)
        key = np.array_str(np.char.mod('%d', r))[1:-1]
        if key in used:
            continue
        r_label = oracle.oracle2(r, data)
        query += 1
        used[key] = r_label
        selected.append(r.tolist())
        labels.append(r_label)
        baseline = error.generalization_error(preds, true_labels)
        if np.unique(labels).size > 1:
            # The point that completes the warm-up is scored after the fit.
            break
        accuracy.append(baseline)

    x = np.array(selected)
    y = np.array(labels)
    clf = SVC(kernel='linear')
    clf.fit(x, y)
    preds = clf.predict(data)
    accuracy.append(error.generalization_error(preds, true_labels))

    # 2543 is hard-coded; presumably the pool size -- TODO confirm against
    # data.shape[0].
    num = 2543 - len(used)
    i = 0  # number of not-yet-queried compounds inspected in this phase
    while i < num and query < budget:
        r = compound.next_compound(data)
        key = np.array_str(np.char.mod('%d', r))[1:-1]
        if key in used:
            continue
        i += 1
        # decision_function expects a 2-D (n_samples, n_features) array.
        distance = clf.decision_function(np.atleast_2d(r))
        if np.abs(distance[0]) <= 0.78:
            x = np.vstack([x, r])
            r_label = oracle.oracle2(r, data)
            # Remember the compound so a repeat draw is not queried again.
            used[key] = r_label
            y = np.hstack([y.tolist(), r_label])
            query += 1
            clf.fit(x, y)
            preds = clf.predict(testset)
            accuracy.append(error.test_error(preds, true_labels))
    plt.plot(accuracy)
    plt.show()
    # NOTE(review): if phase 2 made no query, preds still holds the pool
    # predictions rather than test-set predictions -- confirm lengths match.
    print(f1_score(preds, true_labels[0:250]))
    return
def svm_learner_all():
    """Fit a linear SVM on the entire pool and score it on that same pool.

    Prints the value returned by ``err.generalization_error`` followed by the
    F1 score of the pool predictions.

    Returns:
        The value returned by ``err.generalization_error``.
    """
    pool = pool_reader()
    pool_labels = oracle.read_mat()

    model = SVC(kernel='linear')
    model.fit(np.array(pool), np.array(pool_labels))
    pool_preds = model.predict(pool)

    score = err.generalization_error(pool_preds, pool_labels)
    print(score)
    print(f1_score(pool_preds, pool_labels))
    return score
def svm_learner(option):
    """Pool-based active learning with a linear SVM.

    Random pool rows are labelled until two distinct labels are present, a
    linear SVM is fit, and then rows are added one at a time (refitting and
    scoring on the whole pool after each) until 256 rows have been used.

    Args:
        option: ``'rand'`` selects the next row uniformly at random among
            unused rows; any other value delegates the choice to
            ``get_next``.

    Returns:
        list: the values returned by ``err.generalization_error``, one per
        warm-up sample (except the one that ends the warm-up) and one per fit.
    """
    accuracy = []
    data = pool_reader()
    row = data.shape[0]
    true_labels = oracle.read_mat()

    # Until a model exists, every pool row is "predicted" as 0.
    preds = np.zeros(row)
    used = set()
    selected = []
    labels = []

    # Warm-up: sample unused rows until two distinct labels are present.
    # The earlier test (sum of labels == 1) could never become true again
    # once the first two labels were both 1, leaving the loop spinning.
    while True:
        r = random.randint(0, row - 1)
        if r in used:
            continue
        used.add(r)
        selected.append(data[r].tolist())
        labels.append(true_labels[r])
        baseline = err.generalization_error(preds, true_labels)
        if np.unique(labels).size > 1:
            # The point that completes the warm-up is scored after the fit.
            break
        accuracy.append(baseline)

    X = np.array(selected)
    y = np.array(labels)
    clf = SVC(kernel='linear')
    clf.fit(X, y)
    preds = clf.predict(data)
    accuracy.append(err.generalization_error(preds, true_labels))

    # Total labelling budget (warm-up included) is hard-coded to 256 rows.
    for _ in range(256 - len(used)):
        if option == 'rand':
            # Random selection strategy: rejection-sample an unused row.
            while True:
                cur = random.randint(0, row - 1)
                if cur not in used:
                    break
        else:
            # Positions in y whose label equals 0, handed to get_next as-is.
            active = np.where(y == 0)[0].tolist()
            cur = get_next(data, active, used)
            # Single-argument print: same text as "print 'oracle', label".
            print(" ".join(map(str, ('oracle', true_labels[cur]))))
        used.add(cur)
        X = np.vstack([X, data[cur]])
        y = np.hstack([y.tolist(), [true_labels[cur]]])
        clf.fit(X, y)
        preds = clf.predict(data)
        accuracy.append(err.generalization_error(preds, true_labels))

    print(f1_score(preds, true_labels))
    return accuracy
def svm_margin_learner():
    """Pool-based active learning that queries the row nearest the SVM boundary.

    Random pool rows are labelled until two distinct labels are present and a
    linear SVM is fit.  Each further step picks the unused row with the
    smallest absolute decision value, adds it with its true label, refits and
    scores on the whole pool, until 256 rows have been used.

    Returns:
        list: the values returned by ``err.generalization_error``, one per
        warm-up sample (except the one that ends the warm-up) and one per fit.
    """
    accuracy = []
    data = pool_reader()
    row = data.shape[0]
    true_labels = oracle.read_mat()

    # Until a model exists, every pool row is "predicted" as 0.
    preds = np.zeros(row)
    used = set()
    selected = []
    labels = []

    # Warm-up: sample unused rows until two distinct labels are present.
    # The earlier test (sum of labels == 1) could never become true again
    # once the first two labels were both 1, leaving the loop spinning.
    while True:
        r = random.randint(0, row - 1)
        if r in used:
            continue
        used.add(r)
        selected.append(data[r].tolist())
        labels.append(true_labels[r])
        baseline = err.generalization_error(preds, true_labels)
        if np.unique(labels).size > 1:
            # The point that completes the warm-up is scored after the fit.
            break
        accuracy.append(baseline)

    X = np.array(selected)
    y = np.array(labels)

    clf = SVC(kernel='linear')
    clf.fit(X, y)
    preds = clf.predict(data)
    accuracy.append(err.generalization_error(preds, true_labels))

    # Total labelling budget (warm-up included) is hard-coded to 256 rows.
    for _ in range(256 - len(used)):
        # Rank rows by |decision value| and take the closest unused one.
        distance = clf.decision_function(data)
        for candidate in np.argsort(np.abs(distance)):
            if candidate not in used:
                cur = candidate
                break
        # Single-argument print: same text as "print 'oracle', label".
        print(" ".join(map(str, ('oracle', true_labels[cur]))))
        used.add(cur)
        X = np.vstack([X, data[cur]])
        y = np.hstack([y.tolist(), [true_labels[cur]]])
        clf.fit(X, y)
        preds = clf.predict(data)
        accuracy.append(err.generalization_error(preds, true_labels))
    print(f1_score(preds, true_labels))
    return accuracy
def rfc_learner(option):
    """Pool-based learning curve for a random forest over 256 labelled rows.

    Each of the 256 iterations picks one pool row, asks ``oracle.oracle1``
    for its label, refits a 10-tree entropy random forest on everything
    labelled so far and records ``err.generalization_error`` on the whole
    pool.  The curve is plotted and the final F1 score printed.

    Args:
        option: ``'select'`` picks rows at random until a label 1 has been
            seen and afterwards picks the unused row whose first-column
            class probability is closest to 0.5; any other value always
            picks an unused row at random.

    Returns:
        list: one ``err.generalization_error`` value per iteration.
    """
    accuracy = []
    data = pool_reader()
    true_labels = oracle.read_mat()
    row_size, col_size = data.shape
    points = np.empty([0, col_size])
    labels = []
    used = set()
    flag = True  # True until a label equal to 1 has been observed
    predictions = np.zeros(row_size)
    for _ in range(256):
        if option == 'select' and not flag:
            # Uncertainty sampling on a forest fit to the current labels.
            clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
            clf.fit(points, np.array(labels))
            prob = clf.predict_proba(data)
            weight = np.abs(prob[:, 0] - 0.5)
            for candidate in np.argsort(weight):
                if candidate not in used:
                    pick = candidate
                    break
        else:
            # Random pick.  The 'select' warm-up used to skip the `used`
            # check, so the same row could be added to the training set twice.
            while True:
                pick = random.sample(range(row_size), 1)[0]
                if pick not in used:
                    break

        used.add(pick)
        points = np.vstack([points, data[pick]])
        # Ask the oracle once per pick and reuse the answer.
        label = oracle.oracle1(true_labels, pick)
        if label == 1:
            flag = False
        labels.append(label)
        clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
        clf.fit(points, np.array(labels))
        predictions = clf.predict(data)
        cur_acc = err.generalization_error(predictions, true_labels)
        accuracy.append(cur_acc)
    plt.plot(accuracy)
    plt.show()
    # Single-argument print: same text as the two-item print statement.
    print(" ".join(map(str, ("f1 ", f1_score(predictions, true_labels)))))
    return accuracy
def stream_learner(method, option, budget):
    """Stream-based learner over compounds from ``compound.next_compound``.

    With ``option == "select"`` every unseen compound is labelled until a
    label 1 is observed; afterwards a compound is sent to the oracle only
    when the model's first-column class probability lies in [0.1, 0.9], and
    the model is refit and scored on the test set after each query.  With any
    other ``option`` every unseen compound is labelled, and the model is fit
    and scored after each one once a label 1 has been seen.  The recorded
    curve is plotted and the F1 score of the last test predictions against
    ``true_labels[0:250]`` is printed.

    Args:
        method: ``"rf"`` for a 10-tree entropy random forest or ``"lr"`` for
            L2-penalised logistic regression.
        option: ``"select"`` for the active strategy, anything else for the
            label-everything strategy.
        budget: Maximum number of oracle queries.

    Returns:
        None.

    Raises:
        ValueError: if ``method`` is neither ``"rf"`` nor ``"lr"``.
    """
    # Fail fast instead of hitting an unbound `clf` deep inside the loop.
    if method == "rf":
        clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
    elif method == "lr":
        clf = LogisticRegression(penalty='l2')
    else:
        raise ValueError("unknown method %r; expected 'rf' or 'lr'" % (method,))

    features = csv_reader('resources/pool.csv')
    testset = csv_reader('resources/testSet.csv')
    true_labels = oracle.read_mat()
    accuracy = []
    points = []
    labels = []
    used = {}
    flag = True  # True until a label equal to 1 has been observed
    query_count = 0
    i = 0
    # 250 is hard-coded; presumably the test-set size -- TODO confirm.
    pred = np.zeros(250)
    if option == "select":
        # Active learner.  2543 is hard-coded; presumably the pool size --
        # TODO confirm.
        # NOTE(review): warm-up ends on the first label 1 even if no other
        # label has been seen yet -- confirm the chosen classifier copes
        # with a single-class fit.
        while i < 2543 and query_count < budget:
            if not flag:
                # Refit on everything labelled so far before scoring the
                # next candidate.
                clf.fit(np.asarray(points), np.array(labels))
            cur_point = compound.next_compound(features)
            key = np.array_str(np.char.mod('%d', cur_point))[1:-1]
            if key in used:
                continue
            i += 1
            if flag:
                # Warm-up: label everything until a point with label 1 shows up.
                points.append(cur_point)
                cur_label = oracle.oracle2(cur_point, features)
                labels.append(cur_label)
                used[key] = cur_label
                query_count += 1
                if cur_label == 1:
                    flag = False
            else:
                # Decide whether to ask the oracle for help.  predict_proba
                # expects a 2-D (n_samples, n_features) array.
                prob = clf.predict_proba(np.atleast_2d(cur_point))
                if 0.1 <= prob[0][0] <= 0.9:
                    points.append(cur_point)
                    cur_label = oracle.oracle2(cur_point, features)
                    labels.append(cur_label)
                    query_count += 1
                    used[key] = cur_label
                    clf.fit(np.asarray(points), np.array(labels))
                    pred = clf.predict(testset)
                    cur_acc = error.test_error(pred, true_labels)
                    # Single-argument print: same text as the print statement.
                    print(" ".join(map(str, (
                        cur_acc, " ", query_count, " ", cur_label, " ",
                        prob[0][0], " ", prob[0][1]))))
                    accuracy.append(cur_acc)
    else:
        # Random learner: label every unseen compound.
        while i < budget:
            cur_point = compound.next_compound(features)
            key = np.array_str(np.char.mod('%d', cur_point))[1:-1]
            if key in used:
                continue
            points.append(cur_point)
            cur_label = oracle.oracle2(cur_point, features)
            if cur_label == 1:
                flag = False
            labels.append(cur_label)
            used[key] = cur_label
            query_count += 1
            i += 1
            if not flag:
                clf.fit(np.asarray(points), np.array(labels))
                pred = clf.predict(testset)
                cur_acc = error.test_error(pred, true_labels)
                print(" ".join(map(str, (
                    cur_acc, " ", query_count, " ", cur_label))))
                accuracy.append(cur_acc)
    plt.plot(accuracy)
    plt.show()
    print(" ".join(map(str, ("f1", f1_score(pred, true_labels[0:250])))))
    return
def lrc_learner(option):
    """Pool-based learning curve for logistic regression over 256 labelled rows.

    Each of the 256 iterations labels one pool row via ``oracle.oracle1``.
    No model is fit until the labelled set holds two distinct labels; the
    previous rule (stop warming up on the first label 1) handed a
    single-class training set to ``LogisticRegression.fit`` whenever the very
    first label was 1.  The curve is plotted and the final F1 score printed.

    Args:
        option: ``"select"`` samples at random during warm-up and afterwards
            picks the unused row whose first-column class probability is
            closest to 0.5; any other value always picks an unused row at
            random.

    Returns:
        list: one ``err.generalization_error`` value per fitted model.
    """
    accuracy = []
    data = pool_reader()
    true_labels = oracle.read_mat()
    row_size, col_size = data.shape
    predictions = np.zeros(row_size)
    points = np.empty([0, col_size])
    labels = []
    used = set()
    flag = True  # True while fewer than two distinct labels have been seen
    for _ in range(256):
        if option == "select":
            if flag:
                # Warm-up: random unused row, no model fit yet.
                while True:
                    pick = random.sample(range(row_size), 1)[0]
                    if pick not in used:
                        break
                used.add(pick)
                points = np.vstack([points, data[pick]])
                label = oracle.oracle1(true_labels, pick)
                labels.append(label)
                if np.unique(labels).size > 1:
                    flag = False
            else:
                # Uncertainty sampling on a model fit to the current labels.
                clf = LogisticRegression()
                clf.fit(points, np.array(labels))
                prob = clf.predict_proba(data)
                weight = np.abs(prob[:, 0] - 0.5)
                # pick stays -1 (i.e. the last row) if every row is used.
                pick = -1
                for candidate in np.argsort(weight):
                    if candidate not in used:
                        pick = candidate
                        break
                used.add(pick)
                points = np.vstack([points, data[pick]])
                label = oracle.oracle1(true_labels, pick)
                labels.append(label)
                clf.fit(points, np.array(labels))
                predictions = clf.predict(data)
                cur_acc = err.generalization_error(predictions, true_labels)
                accuracy.append(cur_acc)
        else:
            while True:
                pick = random.sample(range(row_size), 1)[0]
                if pick not in used:
                    break
            used.add(pick)
            points = np.vstack([points, data[pick]])
            label = oracle.oracle1(true_labels, pick)
            labels.append(label)
            if flag and np.unique(labels).size > 1:
                flag = False
            if not flag:
                clf = LogisticRegression()
                clf.fit(points, np.array(labels))
                predictions = clf.predict(data)
                cur_acc = err.generalization_error(predictions, true_labels)
                accuracy.append(cur_acc)
    plt.plot(accuracy)
    plt.show()
    # Single-argument print: same text as the two-item print statement.
    print(" ".join(map(str, ("f1 ", f1_score(predictions, true_labels)))))
    return accuracy