def find_automatic_alignment_stats(ave_stats_writer):
    """Write the mean precision, recall and F1 of the automatic alignments.

    Reads the CSV file named by ``sys.argv[5]``. For every data row, column 4
    is split on whitespace into the submitted alignment set and column 8 into
    the answer-key set. When column 12 is non-empty it is treated as a second
    answer key and the key that yields the higher F1 is used. A running mean
    of ``[precision, recall, f1]`` is maintained and finally written to
    ``ave_stats_writer`` as ``["automatic_alignments", <rows scored>, ...]``.

    Rows whose first cell is ``"source"`` (the header) are skipped. A row that
    cannot be scored is reported and skipped; it does not abort the run.

    Args:
        ave_stats_writer: Object exposing ``writerow(list)``.
    """
    hits_num = 0
    ave_stats = []

    # newline="" is what the csv module expects on Python 3 (the old 'rU' mode
    # was removed in Python 3.11); the with-block guarantees the file is closed.
    with open(sys.argv[5], newline="") as hits_file:
        for row in csv.reader(hits_file, delimiter=","):
            try:
                # skip first line
                if row[0] == "source":
                    continue

                print(row[0])
                print(hits_num + 1)

                # convert all alignment results to sets
                sure_sub = set(row[4].split())
                sure_ans = set(row[8].split())

                # stats_list[0] = precision, [1] = recall, [2] = f1
                stats_list = [precision(sure_sub, sure_ans),
                              recall(sure_sub, sure_ans)]
                stats_list.append(f1(stats_list[0], stats_list[1]))

                # if answer key has two possible alignments, take the one
                # with the higher f1
                if row[12]:
                    print("two answers exists")
                    sure_ans_2 = set(row[12].split())
                    stats_list_2 = [precision(sure_sub, sure_ans_2),
                                    recall(sure_sub, sure_ans_2)]
                    stats_list_2.append(f1(stats_list_2[0], stats_list_2[1]))
                    print("1st F1 value: " + str(stats_list[2]))
                    print("2nd F1 value: " + str(stats_list_2[2]))
                    if stats_list_2[2] > stats_list[2]:
                        print("selected answer B")
                        stats_list = stats_list_2
                    else:
                        print("selected answer A")

                # Only count the row once it has been scored successfully.
                hits_num += 1
                if hits_num == 1:
                    # first HIT: the mean is the value itself
                    ave_stats = stats_list
                else:
                    # incremental mean: m_n = m_{n-1} + (x_n - m_{n-1}) / n
                    for i in range(3):
                        ave_stats[i] = float(ave_stats[i]) + (
                            (stats_list[i] - ave_stats[i]) / float(hits_num))

                print(ave_stats)
                print("")
            except Exception as err:
                # Best effort: a malformed row is skipped, but visibly, and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("skipping row: " + repr(err))

    # print averages
    ave_stats_writer.writerow(["automatic_alignments", hits_num] + ave_stats)
    return
def scan_results(qual_type, results, worker_dict):
    """Accumulate per-worker mean precision/recall/F1 over result rows.

    For every row, column 50 holds the worker's sure alignments (the literal
    ``"unchanged"`` is replaced in place by column 31), column 35 the answer
    key and column 39 an optional second answer key; when the second key is
    present the one giving the higher F1 is used. ``worker_dict`` is updated
    in place: ``worker_dict[workerId] = [count, qual_type, precision, recall,
    f1]`` where the three scores are running means over that worker's rows.

    Rows whose first cell is ``"HITId"`` (the header) are skipped. A row that
    cannot be scored is reported and skipped; it does not abort the run.

    Args:
        qual_type: Label stored with a worker the first time it is seen.
        results: Iterable of rows (mutable, index-addressable sequences).
        worker_dict: Dict updated in place, keyed by column 15 (worker id).
    """
    for row in results:
        try:
            workerId = row[15]

            # skip first line
            if row[0] == "HITId":
                continue

            print(row[0])
            print("WORKERID: " + workerId)

            # if an "unchanged" value is encountered, replace with the
            # proper value
            if row[50] == "unchanged":  # sureAlignments
                row[50] = row[31]

            # convert all alignment results to sets
            sure_sub = set(row[50].split())
            sure_ans = set(row[35].split())

            # stats_list[0] = precision, [1] = recall, [2] = f1
            stats_list = [precision(sure_sub, sure_ans),
                          recall(sure_sub, sure_ans)]
            stats_list.append(f1(stats_list[0], stats_list[1]))

            # if answer key has two possible alignments, take the one with
            # the higher f1
            if row[39]:
                sure_ans_2 = set(row[39].split())
                stats_list_2 = [precision(sure_sub, sure_ans_2),
                                recall(sure_sub, sure_ans_2)]
                stats_list_2.append(f1(stats_list_2[0], stats_list_2[1]))
                print("1st F1 value: " + str(stats_list[2]))
                print("2nd F1 value: " + str(stats_list_2[2]))
                if stats_list_2[2] > stats_list[2]:
                    print("selected answer 2")
                    stats_list = stats_list_2
                else:
                    print("selected answer 1")

            if workerId not in worker_dict:
                # first row for this worker: the means are the values
                worker_dict[workerId] = [1, qual_type] + stats_list
            else:
                worker_list = worker_dict[workerId]
                worker_list[0] = worker_list[0] + 1
                # incremental mean: m_n = m_{n-1} + (x_n - m_{n-1}) / n
                for i in range(3):
                    worker_list[i + 2] = float(worker_list[i + 2]) + (
                        (stats_list[i] - worker_list[i + 2])
                        / float(worker_list[0]))
        except Exception as err:
            # Best effort: a malformed row is skipped, but visibly, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("skipping row: " + repr(err))
        # NOTE(review): the original indentation was lost; this blank-line
        # separator is assumed to be printed once per row - confirm.
        print("")
    return
def find_automatic_alignment_stats(ave_stats_writer):
    """Write the mean precision, recall and F1 of the automatic alignments.

    Reads the CSV file named by ``sys.argv[5]``. For every data row, column 4
    is the submitted alignment. The answer key is column 8, unless columns 8
    and 12 are both non-empty, in which case one of the two is picked with
    ``random.choice``. A running mean of ``[precision, recall, f1]`` is
    maintained and finally written to ``ave_stats_writer`` as
    ``["automatic_alignments", <rows scored>, ...]``.

    Rows whose first cell is ``"source"`` (the header) are skipped. A row that
    cannot be scored is reported and skipped; it does not abort the run.

    Args:
        ave_stats_writer: Object exposing ``writerow(list)``.
    """
    hits_num = 0
    ave_stats = []

    # newline="" is what the csv module expects on Python 3 (the old 'rU' mode
    # was removed in Python 3.11); the with-block guarantees the file is closed.
    with open(sys.argv[5], newline="") as hits_file:
        for row in csv.reader(hits_file, delimiter=","):
            try:
                # skip first line
                if row[0] == "source":
                    continue

                print(row[0])

                # if answer key has two possible alignments, choose one at
                # random
                if row[8] and row[12]:
                    print("selecting random answer")
                    ans_string = random.choice([row[8], row[12]])
                else:
                    ans_string = row[8]

                # Compared by value, so identical keys report "answer 1".
                if ans_string == row[8]:
                    print("selected answer 1")
                else:
                    print("selected answer 2")

                # convert all alignment results to sets
                sure_sub = set(row[4].split())
                sure_ans = set(ans_string.split())
                print(sure_sub)
                print(sure_ans)

                # stats_list[0] = precision, [1] = recall, [2] = f1
                stats_list = [precision(sure_sub, sure_ans),
                              recall(sure_sub, sure_ans)]
                stats_list.append(f1(stats_list[0], stats_list[1]))
                print("selected F1 value: " + str(stats_list[2]))

                # Only count the row once it has been scored successfully.
                hits_num += 1
                if hits_num == 1:
                    # first HIT: the mean is the value itself
                    ave_stats = stats_list
                else:
                    # incremental mean: m_n = m_{n-1} + (x_n - m_{n-1}) / n
                    for i in range(3):
                        ave_stats[i] = float(ave_stats[i]) + (
                            (stats_list[i] - ave_stats[i]) / float(hits_num))

                print("")
            except Exception as err:
                # Best effort: a malformed row is skipped, but visibly, and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("skipping row: " + repr(err))

    # print averages
    ave_stats_writer.writerow(["automatic_alignments", hits_num] + ave_stats)
    return
def main():
    """Score every HIT in the input CSV and write per-HIT and mean results.

    ``sys.argv[1]`` is the input CSV and ``sys.argv[2]`` the output CSV. For
    each data row, column 4 (submitted sure alignments) is scored against
    column 8 (answer key); precision, recall and F1 are written per row, and a
    final ``"average"`` row holds their running means. If no row could be
    scored the averages keep their initial value of ``-1``.

    Rows whose first cell is ``"instructions"`` (the header) are skipped. A
    row that cannot be scored is reported on stderr and skipped, and it does
    not count towards the averages.
    """
    # initial values for average precision, recall and f1
    prec_ave = -1
    rec_ave = -1
    f1_ave = -1
    num_hits = 0

    # Text mode with newline="" is what the csv module requires on Python 3
    # ('wb' makes csv.writer fail there; 'rU' was removed in Python 3.11).
    # The with-block closes both files, flushing the output.
    with open(sys.argv[1], newline="") as in_file, \
            open(sys.argv[2], "w", newline="") as out_file:
        train_input = csv.reader(in_file, delimiter=",")
        output_writer = csv.writer(out_file, delimiter=",")
        output_writer.writerow(["hitId", "inputSureAlignments",
                                "answerSureAlignments", "precision",
                                "recall", "f1"])

        for row in train_input:
            try:
                if row[0] == "instructions":
                    continue
                sure_in = set(row[4].split())
                sure_ans = set(row[8].split())
                prec = precision(sure_in, sure_ans)
                rec = recall(sure_in, sure_ans)
            except Exception as err:
                # Best effort: skip the malformed row, but say so.
                sys.stderr.write("skipping row: %r\n" % (err,))
                continue

            # F1 is defined as 0 when both precision and recall are 0
            if prec == 0 and rec == 0:
                f1_val = 0
            else:
                f1_val = float(2 * prec * rec) / float(prec + rec)

            output_writer.writerow([row[0], row[4], row[8],
                                    prec, rec, f1_val])

            # Count the row only after it was scored, so a skipped row cannot
            # inflate the divisor of the running means below.
            num_hits += 1
            if num_hits == 1:
                prec_ave = prec
                rec_ave = rec
                f1_ave = f1_val
            else:
                # incremental mean: m_n = m_{n-1} + (x_n - m_{n-1}) / n
                prec_ave = float(prec_ave) + ((prec - prec_ave) / float(num_hits))
                rec_ave = float(rec_ave) + ((rec - rec_ave) / float(num_hits))
                f1_ave = float(f1_ave) + ((f1_val - f1_ave) / float(num_hits))

        # write average values
        output_writer.writerow(["average", None, None,
                                prec_ave, rec_ave, f1_ave])
def print_matrix(matrix, classes):
    """Print a precision/recall/F1 table, one row per class, and the mean F1.

    ``precision`` and ``recall`` are computed from ``matrix`` and combined by
    ``f1``; row ``i`` of the table shows ``classes[i][0]`` next to the i-th
    entry of each score.

    Returns:
        ``np.sum(f1 scores) / len(f1 scores)``.
    """
    prec = precision(matrix)
    rec = recall(matrix)
    scores = f1(prec, rec)

    report = BeautifulTable()
    report.column_headers = ["Class Name", "Precision", "Recall", "F1 Score"]
    for idx, entry in enumerate(classes):
        report.append_row([entry[0], prec[idx], rec[idx], scores[idx]])
    print(report)

    mean_f1 = np.sum(scores) / len(scores)
    print("Total F1 score: " + str(mean_f1))
    return mean_f1
def print_score(matrix, classes, points=8):
    """Print a fixed-precision precision/recall/F1 table and the mean F1.

    Same table as a plain per-class report, but every score is rendered with
    exactly ``points`` decimals. Row ``i`` shows ``classes[i][0]`` next to the
    i-th precision, recall and F1 entry.

    Returns:
        The unformatted ``np.sum(f1 scores) / len(f1 scores)``.
    """
    def fmt(value):
        # "%.*f" takes the number of decimals from the argument tuple.
        return "%.*f" % (points, round(float(value), points))

    prec = precision(matrix)
    rec = recall(matrix)
    scores = f1(prec, rec)

    report = BeautifulTable()
    report.column_headers = ["Class Name", "Precision", "Recall", "F1 Score"]
    for idx, entry in enumerate(classes):
        cells = [entry[0], fmt(prec[idx]), fmt(rec[idx]), fmt(scores[idx])]
        report.append_row(cells)
    print(report)

    mean_f1 = np.sum(scores) / len(scores)
    print("Total F1 score: " + fmt(mean_f1))
    return mean_f1
# Calculating ytilde and the model of logistic regression z = X_test @ betas_train # choosing best beta here? model = func.logistic_function(z) model = func.IndicatorFunc(model, threshold=0.44) # Get AUC score and predict_proba_scikit. Used for plots and terminal print acc_scikit, TPR_scikit, precision_scikit, f1_score_scikit, AUC_scikit, predict_proba_scikit \ = func.scikit(X_train, X_test, y_train, y_test, model) # Calculating the different metrics: print('\n-------------------------------------------') print('The accuracy is : %.3f' % func.accuracy(model, y_test)) print('The F1 score is : %.3f' % func.F1_score(y_test, model)) print('The precision is : %.3f' % func.precision(y_test, model)) print('The recall is : %.3f' % func.recall(y_test, model)) print('The AUC is : %.3f' % AUC_scikit) print('-------------------------------------------') # Make Cumulative gain and ROC plot P.Cumulative_gain_plot(y_test, model) P.ROC_plot(y_test, predict_proba_scikit) # Creating a Confusion matrix using pandas and pandas dataframe P.Confusion_matrix(y_test, model) elif arg == "NN": X_train_sc = X_train X_test_sc = X_test
def scan_results(qual_type, filename, worker_dict):
    """Accumulate per-worker mean precision/recall/F1 from a results CSV.

    Each data row of ``filename`` is scored as follows: precision compares the
    worker's sure alignments (column 46) with the union of the sure and
    possible answer keys (columns 35 and 36); recall compares the union of the
    worker's sure and possible alignments (columns 46 and 43) with the sure
    answer key. ``"unchanged"`` in column 46/43 is replaced by column 31/32
    and ``"{}"`` in columns 35, 36, 43 and 46 by the empty string.

    ``worker_dict`` is updated in place: ``worker_dict[worker_id] = [count,
    qual_type, precision, recall, f1]`` with the scores kept as running means.

    The header row (first cell ``"HITId"``) is skipped, as is worker
    ``AURYD2FH3FUOQ`` when ``qual_type == "all"``. A row that cannot be scored
    is reported and skipped; it does not abort the run.

    Args:
        qual_type: Label stored with a worker the first time it is seen.
        filename: Path of the results CSV.
        worker_dict: Dict updated in place, keyed by column 15 (worker id).
    """
    print("****************************************************")
    print(qual_type)
    print(filename)
    print("****************************************************")

    # newline="" is what the csv module expects on Python 3 (the old 'rU' mode
    # was removed in Python 3.11); the with-block guarantees the file is closed.
    with open(filename, newline="") as results_file:
        for row in csv.reader(results_file, delimiter=","):
            try:
                print("")
                hit_id = row[0]
                worker_id = row[15]

                # skip first line and a particular user who completed both
                # "all" and "participated" HITs
                if hit_id == "HITId" or (worker_id == "AURYD2FH3FUOQ"
                                         and qual_type == "all"):
                    continue

                print("")
                print(row)
                print("HITID: " + str(hit_id))
                print("WORKERID: " + str(worker_id))
                print("QUAL TYPE: " + qual_type)

                # make corrections to fields labeled "unchanged" or "{}"
                if row[46] == "unchanged":
                    row[46] = row[31]
                if row[43] == "unchanged":
                    row[43] = row[32]
                for i in [35, 36, 43, 46]:
                    if row[i] == "{}":
                        row[i] = ""

                sure_submission = row[46]
                poss_submission = row[43]
                sure_control = row[35]
                poss_control = row[36]
                print("defined rows")

                # convert all alignments to sets
                sure_submission_set = set(sure_submission.split())
                print("SURE_SUBMISSION_STRING: " + str(sure_submission))
                print("SURE_SUBMISSION_SET: " + str(sure_submission_set))
                all_submission_set = (set(poss_submission.split())
                                      | sure_submission_set)
                print("ALL_SUBMISSION_SET: " + str(all_submission_set))
                sure_control_set = set(sure_control.split())
                print("SURE_CONTROL_SET: " + str(sure_control_set))
                all_control_set = set(poss_control.split()) | sure_control_set
                print("ALL_CONTROL_SET: " + str(all_control_set))

                # stats_list[0] = precision, [1] = recall, [2] = f1
                stats_list = [precision(sure_submission_set, all_control_set),
                              recall(all_submission_set, sure_control_set)]
                print("STATS_LIST_INITIAL: " + str(stats_list))
                stats_list.append(f1(stats_list[0], stats_list[1]))
                print("STATS_LIST_FINAL: " + str(stats_list))

                if worker_id not in worker_dict:
                    # first row for this worker: the means are the values
                    worker_dict[worker_id] = [1, qual_type] + stats_list
                else:
                    worker_list = worker_dict[worker_id]
                    worker_list[0] = worker_list[0] + 1
                    # incremental mean: m_n = m_{n-1} + (x_n - m_{n-1}) / n
                    for i in range(3):
                        worker_list[i + 2] = float(worker_list[i + 2]) + (
                            (stats_list[i] - worker_list[i + 2])
                            / float(worker_list[0]))
            except Exception as err:
                # Best effort: a malformed row is skipped, but visibly, and
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print("skipping row: " + repr(err))
def Random_Forest(X_train, X_test, y_train, y_test, candidates, GoldiLock, \ feature_list, header_names, seed=0, threshold=0.5, \ plot_confuse_matrix=False, plot_feauture_importance=False, Goldilock_zone=False): """ Ha en input oversikt...? threshold | 0.5 == RF.predict 0.7 --> Need 70$%$ probability to be an exoplanet, to be calssified as an exoplanet """ print("Exoplanet threshold = %g" % threshold) # Print best parameters, this takes time! Parameters set in Best_params() #Best_params(seed, X_train, y_train) # Plot error against number of trees? RF = RandomForestClassifier( n_estimators=300, max_features='auto', max_depth=8, min_samples_leaf=1, random_state=seed, criterion='gini', # 'entropy' bootstrap=True) RF.fit(X_train, y_train) # https://github.com/erykml/medium_articles/blob/master/Machine%20Learning/feature_importance.ipynb header_names = np.load('feature_names.npy', allow_pickle=True) # function for creating a feature importance dataframe def feature_importance(column_names, importances): df = pd.DataFrame({'feature': column_names,'feature_importance': importances}) \ .sort_values('feature_importance', ascending = False) \ .reset_index(drop = True) return df # plotting a feature importance dataframe (horizontal barchart) def feature_importance_plot(feature_importances, title): feature_importances.columns = ['feature', 'feature_importance'] sns.barplot(x = 'feature_importance', y = 'feature', data = feature_importances, orient = 'h') \ .set_title(title, fontsize = 15) plt.ylabel('Feature', fontsize=15) plt.xlabel('Feature importance', fontsize=15) plt.show() if plot_feauture_importance == True: feature_imp = feature_importance(header_names[1:], RF.feature_importances_) feature_importance_plot(feature_imp[:11], "Feature Importance (Random Forest)") # Calculating different metrics predict = RF.predict(X_test) accuracy = RF.score(X_test, y_test) precision = func.precision(y_test, predict) recall = func.recall(y_test, predict) F1_score = func.F1_score(y_test, 
predict) # Calculate the absolute errors errors = abs(predict - y_test) # Printing the different metrics: func.Print_parameters(accuracy, F1_score, precision, recall, errors, name='Random Forest') if plot_confuse_matrix == True: func.ConfusionMatrix_Plot(y_test, predict, 'Random Forest (Candidates)', threshold) #print(RF.decision_path(X_test)) # Pull out one tree from the forest tree_nr = 5 tree = RF.estimators_[tree_nr] func.PlotOneTree(tree, feature_list) # header_names? predict_candidates = np.array(RF.predict_proba(candidates)) # Prediction with threshold predict_candidates[:, 0] = (predict_candidates[:, 0] < threshold).astype('int') predict_candidates[:, 1] = (predict_candidates[:, 1] >= threshold).astype('int') predicted_false_positive = (predict_candidates[:, 1] == 0).sum() predicted_exoplanets = (predict_candidates[:, 1] == 1).sum() # Information print to terminal print('\nThe Random Forest Classifier predicted') print('--------------------------------------') print('%-5g exoplanets of %g candidates' % (predicted_exoplanets, len(predict_candidates))) print('%-5g false positives of %g candidates' % (predicted_false_positive, len(predict_candidates))) # Plotting a bar plot of candidates predicted as confirmed and false positives func.Histogram2(predict_candidates[:, 1], 'Random Forest (Candidates)', threshold) #func.Histogram2(g=df.loc[:, (df.columns == 'koi_disposition')].values) if Goldilock_zone: print("") print("Goldilock zone calculations") predict_goldilocks = np.array(RF.predict_proba(GoldiLock)) predict_goldilocks[:, 0] = (predict_goldilocks[:, 0] < threshold).astype('int') predict_goldilocks[:, 1] = (predict_goldilocks[:, 1] >= threshold).astype('int') predicted_false_positive_goldilocs = ( predict_goldilocks[:, 1] == 0).sum() predicted_exoplanets_goldilocks = (predict_goldilocks[:, 1] == 1).sum() # Information print to terminal print('\nThe Random Forest Classifier predicted') print('--------------------------------------') print('%-3g exoplanets of 
%g GL candidates' % (predicted_exoplanets_goldilocks, len(predict_goldilocks))) print('%-3g false positives of %g GL candidates' % (predicted_false_positive_goldilocs, len(predict_goldilocks))) # Plotting a bar plot of candidates predicted as confirmed and false positives func.Histogram2(predict_goldilocks[:, 1], 'Random Forest (Goldilock)', threshold) GL.GoldilocksZone(predict_goldilocks[:, 1], 'RandomForest', threshold) ''' feature_importance = RF.feature_importances_ print(feature_importance) print(len(feature_importance)) #for i in range(len(feature_importance)): # Check the i in feature_importance # assign corresponding header name plt.hist(feature_importance, align='left', histtype='bar', orientation='horizontal', rwidth=0.3) plt.title('Feature Importance') plt.xlabel('--') plt.ylabel('--') #plt.xlim([lb-width/2, ub-width/2]) plt.show() ''' '''
def _final_submission_is_correct(submitted, expected):
    """Print the verdict on a final submission and return whether it matches.

    ``submitted`` and ``expected`` are 4-tuples of sets ordered as (sure
    alignments, possible alignments, source highlights, target highlights).
    """
    labels = ("Sure alignments", "Possible alignments",
              "Source highlights", "Target highlights")
    if submitted == expected:
        print("Correct submission")
        return True
    # print out errors if encountered
    for label, want, got in zip(labels, expected, submitted):
        if got != want:
            print(label + " incorrect, expected " + str(want)
                  + ", got " + str(got))
    return False


def scan_csv(results, worker_dict, hits_result_writer):
    """Collect per-worker and per-HIT statistics from result rows.

    For every row the worker's initial (columns 49/43/45/51) and final
    (columns 48/42/44/50) submissions are split into sets and compared with
    the answer key (columns 35-38). ``worker_dict`` is updated in place with
    ``worker_dict[workerId] = [completed, correct, correctness rate,
    prec_i, rec_i, f1_i, prec_f, rec_f, f1_f, completed hit ids]`` where the
    six scores are running means of the sure-alignment precision/recall/F1 of
    the initial (``_i``) and final (``_f``) submissions. A final submission is
    correct when all four of its sets equal the answer key.

    After all rows are read, one row per HIT is written to
    ``hits_result_writer``: ``[hitId, completed, correct, correct/completed]``.

    The header row and two HITs with strange answer values are skipped. A row
    that cannot be processed is reported and skipped; it does not abort the
    run.

    Args:
        results: Iterable of rows (mutable, index-addressable sequences).
        worker_dict: Dict updated in place, keyed by column 15 (worker id).
        hits_result_writer: Object exposing ``writerow(list)``.
    """
    # Dictionary indexes hits by hitId, each hitId maps to a list
    # where l[0] = # hits completed and l[1] = # hits correct
    hits_dict = {}
    num_rows = 0

    for row in results:
        print(num_rows)
        num_rows += 1
        try:
            hitId = row[0]
            workerId = row[15]
            print("hitId: " + hitId)
            print("workerId: " + workerId)

            # skip first line
            if hitId == "HITId":
                continue

            ### skip HITs 311HQEI8RS1Q91M7H2OGRN5V4US7ZI and 37Y5RYYI0PQNN46K4NY6PTLGJI8SXE
            ### This is because those HITs have strange answer values
            if hitId in ("311HQEI8RS1Q91M7H2OGRN5V4US7ZI",
                         "37Y5RYYI0PQNN46K4NY6PTLGJI8SXE"):
                continue

            # if an "unchanged" value is encountered, replace with the
            # proper value
            if row[48] == "unchanged":  # sureAlignments
                row[48] = row[31]
            if row[42] == "unchanged":  # possAlignments
                row[42] = "{}"
            if row[44] == "unchanged":  # sourceHighlights
                row[44] = "{}"
            if row[50] == "unchanged":  # targetHighlights
                row[50] = "{}"

            # convert all alignment results to sets
            # (_f = final submission, _i = initial submission, _ans = key)
            sure_sub_f = set(row[48].split())
            sure_sub_i = set(row[49].split())
            sure_ans = set(row[35].split())
            pos_sub_f = set(row[42].split())
            pos_sub_i = set(row[43].split())
            pos_ans = set(row[36].split())
            src_sub_f = set(row[44].split())
            src_sub_i = set(row[45].split())
            src_ans = set(row[37].split())
            tgt_sub_f = set(row[50].split())
            tgt_sub_i = set(row[51].split())
            tgt_ans = set(row[38].split())

            prec_i = precision(sure_sub_i, sure_ans)
            print("prec_i" + str(prec_i))
            rec_i = recall(sure_sub_i, sure_ans)
            print("rec_i" + str(rec_i))
            f1_i = f1(prec_i, rec_i)
            print("f1_i" + str(f1_i))
            prec_f = precision(sure_sub_f, sure_ans)
            print("prec_f" + str(prec_f))
            rec_f = recall(sure_sub_f, sure_ans)
            print("rec_f" + str(rec_f))
            f1_f = f1(prec_f, rec_f)
            print("f1_f" + str(f1_f))

            # create dictionary of HITs data
            # NOTE(review): the completion count is bumped before the
            # duplicate-submission check below, so a repeated (worker, HIT)
            # row still counts as a completion of the HIT - confirm intended.
            if hitId not in hits_dict:
                hits_list = [1, 0]
                hits_dict[hitId] = hits_list
            else:
                hits_list = hits_dict[hitId]
                hits_list[0] = hits_list[0] + 1

            submitted = (sure_sub_f, pos_sub_f, src_sub_f, tgt_sub_f)
            expected = (sure_ans, pos_ans, src_ans, tgt_ans)

            if workerId not in worker_dict:
                # first row of this worker: the means are the values
                worker_list = [1, 0, 0, prec_i, rec_i, f1_i,
                               prec_f, rec_f, f1_f, [hitId]]
                worker_dict[workerId] = worker_list

                # check if user's final submission is correct
                if _final_submission_is_correct(submitted, expected):
                    worker_list[1] = worker_list[1] + 1
                    hits_list[1] = hits_list[1] + 1

                # with a single completed HIT the rate equals the correct
                # count (0 or 1)
                worker_list[2] = worker_list[1]
            else:
                worker_list = worker_dict[workerId]

                # if user has already completed this HIT, skip
                if hitId in worker_list[9]:
                    continue

                # mark the worker as having completed this HIT
                worker_list[9].append(hitId)
                worker_list[0] = worker_list[0] + 1

                # check if user's final submission is correct
                if _final_submission_is_correct(submitted, expected):
                    worker_list[1] = worker_list[1] + 1
                    hits_list[1] = hits_list[1] + 1

                # update correctness rate
                worker_list[2] = float(worker_list[1]) / float(worker_list[0])

                # update the running means (slots 3-5: initial guesses,
                # slots 6-8: after seeing the answer key)
                latest = (prec_i, rec_i, f1_i, prec_f, rec_f, f1_f)
                for offset, value in enumerate(latest):
                    slot = offset + 3
                    worker_list[slot] = float(worker_list[slot]) + (
                        (value - worker_list[slot]) / float(worker_list[0]))

            print("")
        except Exception as err:
            # Best effort: a malformed row is skipped, but visibly, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("skipping row: " + repr(err))

    # write HITs statistics
    for hitId in hits_dict:
        hits_list = hits_dict[hitId]
        rate = float(hits_list[1]) / float(hits_list[0])
        hits_result_writer.writerow([hitId, hits_list[0], hits_list[1], rate])
        print(str(hitId) + ": [" + str(hits_list[0]) + ", "
              + str(hits_list[1]) + ", " + str(rate) + "]")
    print("")
    return
# Fit the logistic-regression coefficients on the training split.
betas_train = func.steepest(X_train, y_train, gamma)
#betas_train = func.SGD_beta(X_train, y_train, eta, gamma)

# Calculating ytilde and the model of logistic regression
z = X_test @ betas_train   # choosing best beta here?
model = func.logistic_function(z)
model = func.IndicatorFunc(model)

# Reference values returned by the project's scikit-learn wrapper.
acc_scikit, TPR_scikit, precision_scikit, f1_score_scikit, AUC_scikit, predict_proba_scikit = func.scikit(
    X_train, X_test, y_train, y_test, model)

# Own implementation next to the value it is compared against.
Acc = func.accuracy(model, y_test)
Acc_sklearn = acc_scikit
F1 = func.F1_score(y_test, model)
F1_sklearn = f1_score_scikit
Rec = func.recall(y_test, model)
Rec_sklearn = TPR_scikit
#precision = func.precision(y_test, model)

#------------------------------------------------------------------------------
# We can test Accuracy score against scikit learn:
#------------------------------------------------------------------------------
def test_Accuracy():
    """Check that our accuracy equals the one reported via scikit-learn."""
    # The assert message must be a string: the original passed print(...),
    # which evaluates to None, so a failure carried no message at all.
    # NOTE(review): exact float equality; consider math.isclose if the two
    # implementations may differ by rounding.
    assert Acc == Acc_sklearn, (
        "Our Accuracy score is not equal to the scikit learn Accuracy score. "
        "Accuracy = %s, Accuracy (sklearn) = %s" % (Acc, Acc_sklearn))
    print("Our Accuracy score is equal to the scikit learn Accuracy score")