def svm_learner(budget, pool_size=2543, test_size=250, margin=0.78):
    """Stream-based active learning with a linear SVM.

    The pool is read from ``resources/pool.csv`` and the test set from
    ``resources/testSet.csv``.  Points are drawn one at a time with
    ``compound.next_compound``; labels come from ``oracle.oracle2``.

    Phase 1 (warm-up) labels every new point until the labelled set holds
    at least one label equal to 1 *and* at least one other label, which is
    the minimum an SVM needs to be fitted.  Warm-up queries are counted in
    the query total but the warm-up itself is not limited by ``budget``.

    Phase 2 asks the oracle only for points whose absolute decision value
    is at most ``margin``, refits after every query and records
    ``error.test_error`` on the test set.

    Finally the collected error values are plotted and the F1 score of the
    last test-set predictions is printed.

    Args:
        budget: Upper bound on the number of oracle queries after which
            phase 2 stops.
        pool_size: Number of points assumed to be in the pool; bounds how
            many unseen points phase 2 will examine.  Defaults to the
            previously hard-coded 2543.
        test_size: Number of leading entries of the label vector scored
            against the test-set predictions.  Defaults to the previously
            hard-coded 250.
        margin: Largest absolute decision-function value for which a point
            is still sent to the oracle.  Defaults to 0.78.

    Returns:
        None.
    """
    data = csv_reader('resources/pool.csv')
    testset = csv_reader('resources/testSet.csv')
    true_labels = oracle.read_mat()

    # Before any model exists the error is measured with all-zero
    # predictions over the pool.
    preds = np.zeros(data.shape[0])

    accuracy = []
    used = {}       # point key -> label obtained during warm-up
    selected = []
    labels = []
    query = 0

    # ---- Phase 1: label points until the set is trainable ---------------
    while True:
        r = compound.next_compound(data)
        # Joining the per-element strings gives a complete key;
        # np.array_str would elide the middle of long vectors and make
        # different points look identical.
        key = ' '.join(np.char.mod('%d', r))
        if key in used:
            continue

        r_label = oracle.oracle2(r, data)
        query += 1
        used[key] = r_label
        selected.append(r.tolist())
        labels.append(r_label)

        seen = np.asarray(labels)
        # Stop as soon as label 1 and at least one other label are present.
        # (Requiring the sum of labels to be exactly 1 never terminated
        # when the first two labels were both 1.)
        if np.any(seen == 1) and np.unique(seen).size > 1:
            break
        accuracy.append(error.generalization_error(preds, true_labels))

    x = np.array(selected)
    y = np.array(labels)
    clf = SVC(kernel='linear')
    clf.fit(x, y)
    preds = clf.predict(data)
    accuracy.append(error.generalization_error(preds, true_labels))

    # ---- Phase 2: query only points close to the decision boundary ------
    remaining = pool_size - len(used)
    examined = 0
    asked = set()        # keys already sent to the oracle in this phase
    test_preds = None
    while examined < remaining and query < budget:
        r = compound.next_compound(data)
        key = ' '.join(np.char.mod('%d', r))
        if key in used:
            continue
        # Every draw outside the warm-up set counts towards `remaining`,
        # so the loop is guaranteed to end even if draws repeat.
        examined += 1
        if key in asked:
            # Already labelled in this phase: do not spend budget twice.
            continue

        # Estimators expect a 2-D (n_samples, n_features) input.
        distance = clf.decision_function(np.atleast_2d(r))
        if np.abs(distance[0]) > margin:
            continue

        r_label = oracle.oracle2(r, data)
        asked.add(key)
        query += 1
        x = np.vstack([x, r])
        y = np.append(y, r_label)
        clf.fit(x, y)
        test_preds = clf.predict(testset)
        accuracy.append(error.test_error(test_preds, true_labels))

    if test_preds is None:
        # No query happened in phase 2; score the warm-up model on the
        # test set rather than comparing pool-sized predictions with
        # test-sized labels.
        test_preds = clf.predict(testset)

    plt.plot(accuracy)
    plt.show()
    # f1_score takes (y_true, y_pred).  A single-argument print(...) gives
    # the same output whether or not print is a function.
    print(f1_score(true_labels[0:test_size], test_preds))
    return
def stream_learner(method, option, budget, pool_size=2543, test_size=250,
                   query_low=0.1, query_high=0.9):
    """Stream-based learner using a random forest or logistic regression.

    The pool is read from ``resources/pool.csv`` and the test set from
    ``resources/testSet.csv``.  Points are drawn one at a time with
    ``compound.next_compound``; labels come from ``oracle.oracle2``.

    With ``option == "select"`` (active learner) every new point is
    labelled until the labelled set holds label 1 and at least one other
    label; afterwards a point is sent to the oracle only when the first
    column of ``predict_proba`` lies in ``[query_low, query_high]``.  With
    any other ``option`` (random learner) every new point is labelled
    until ``budget`` points have been queried.

    After each query made with a trainable labelled set the model is
    refitted, ``error.test_error`` on the test set is recorded and a
    progress line is printed.  At the end the recorded values are plotted
    and the F1 score of the last test-set predictions is printed.

    Args:
        method: ``"rf"`` for a random forest or ``"lr"`` for logistic
            regression.
        option: ``"select"`` for the active learner; anything else selects
            the random learner.
        budget: Maximum number of oracle queries.
        pool_size: Maximum number of new points the active learner will
            examine.  Defaults to the previously hard-coded 2543.
        test_size: Length of the initial all-zero prediction vector and
            number of leading labels used for the F1 score.  Defaults to
            the previously hard-coded 250.
        query_low: Lower end of the probability band that triggers a
            query.
        query_high: Upper end of the probability band that triggers a
            query.

    Returns:
        None.

    Raises:
        ValueError: If ``method`` is neither ``"rf"`` nor ``"lr"``.
    """
    if method == "rf":
        clf = RandomForestClassifier(n_estimators=10, criterion='entropy')
    elif method == "lr":
        clf = LogisticRegression(penalty='l2')
    else:
        # Fail immediately instead of hitting an unbound `clf` later on.
        raise ValueError(
            "unknown method %r; expected 'rf' or 'lr'" % (method,))

    features = csv_reader('resources/pool.csv')
    testset = csv_reader('resources/testSet.csv')
    true_labels = oracle.read_mat()

    accuracy = []
    points = []
    labels = []
    used = {}            # point key -> label for every queried point
    warming_up = True    # True until the labelled set is trainable
    query_count = 0
    i = 0
    pred = np.zeros(test_size)

    if option == "select":
        # ---- active learner ---------------------------------------------
        while i < pool_size and query_count < budget:
            cur_point = compound.next_compound(features)
            # Joining the per-element strings gives a complete key;
            # np.array_str would elide the middle of long vectors.
            key = ' '.join(np.char.mod('%d', cur_point))
            if key in used:
                continue
            i += 1

            if warming_up:
                cur_label = oracle.oracle2(cur_point, features)
                points.append(cur_point)
                labels.append(cur_label)
                used[key] = cur_label
                query_count += 1
                seen = np.asarray(labels)
                # A single class cannot train logistic regression and
                # yields a one-column predict_proba, so wait until label 1
                # and at least one other label are both present.
                if np.any(seen == 1) and np.unique(seen).size > 1:
                    # Fit once here; the model only changes after a query,
                    # so refitting on every iteration is unnecessary.
                    clf.fit(np.asarray(points), seen)
                    warming_up = False
                continue

            # Estimators expect a 2-D (n_samples, n_features) input.
            prob = clf.predict_proba(np.atleast_2d(cur_point))
            if not query_low <= prob[0][0] <= query_high:
                # Outside the query band: skip without asking the oracle.
                continue

            cur_label = oracle.oracle2(cur_point, features)
            points.append(cur_point)
            labels.append(cur_label)
            used[key] = cur_label
            query_count += 1

            clf.fit(np.asarray(points), np.array(labels))
            pred = clf.predict(testset)
            cur_acc = error.test_error(pred, true_labels)
            # Three spaces between fields reproduce the layout of the
            # former comma-separated print statement.
            print("%s   %s   %s   %s   %s" % (
                cur_acc, query_count, cur_label, prob[0][0], prob[0][1]))
            accuracy.append(cur_acc)
    else:
        # ---- random learner ---------------------------------------------
        while i < budget:
            cur_point = compound.next_compound(features)
            key = ' '.join(np.char.mod('%d', cur_point))
            if key in used:
                continue

            cur_label = oracle.oracle2(cur_point, features)
            points.append(cur_point)
            labels.append(cur_label)
            used[key] = cur_label
            query_count += 1
            i += 1

            if warming_up:
                seen = np.asarray(labels)
                if np.any(seen == 1) and np.unique(seen).size > 1:
                    warming_up = False
            if warming_up:
                # Not trainable yet: keep collecting labels.
                continue

            clf.fit(np.asarray(points), np.array(labels))
            pred = clf.predict(testset)
            cur_acc = error.test_error(pred, true_labels)
            print("%s   %s   %s" % (cur_acc, query_count, cur_label))
            accuracy.append(cur_acc)

    plt.plot(accuracy)
    plt.show()
    # f1_score takes (y_true, y_pred).
    print("f1 %s" % (f1_score(true_labels[0:test_size], pred),))
    return