def getF1_SAF_allrows(allEntries):
    """Score every algorithm under each named SAF threshold set.

    For each threshold set and each algorithm, looks up that algorithm's
    threshold, delegates precision/recall/F1 computation to getF1_SAF,
    and writes all score rows to a timestamped "rq2_1_*.csv".

    Args:
        allEntries: rows fetched from the near-duplicates DB; passed
            through unchanged to getF1_SAF.
    """
    print(len(allEntries))
    # Named threshold sets: statistical (St_*) and optimal (O_*) variants,
    # all resolved against the GT10 data.
    threshold_sets = {
        'St_c_DS': getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3,
                                DB_SETS.GT10_DB_DATA, 'all'),
        'St_n_DS': getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN,
                                DB_SETS.GT10_DB_DATA, 'all'),
        'O_s_DS': getThreshold(THRESHOLD_SETS.OPTIMAL,
                               DB_SETS.GT10_DB_DATA, 'all'),
        'O_c_DS': getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                               DB_SETS.GT10_DB_DATA, 'all'),
        'O_n_DS': getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                               DB_SETS.GT10_DB_DATA, 'all')
    }
    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'threshold', 'precision', 'recall', 'f1'
    ]
    print(threshold_sets)
    for threshold_set_name, threshold_set in threshold_sets.items():
        for algo in ALGOS:
            # Algo enum -> bare upper-case name, e.g. "X.foo" -> "FOO".
            algoStr = str(algo).split('.')[1].upper()
            threshold = threshold_set[algoStr]
            precision, recall, f1 = getF1_SAF(threshold, allEntries, algoStr)
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'threshold': threshold,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })
    writeCSV(fieldNames, algoScoreRows,
             "rq2_1_" + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
def getOptimalThresholds_SAF():
    """Search, per app and per algorithm, for the SAF threshold that
    minimises getLoss_SAF (hyperopt TPE, 1000 evaluations) and write
    every result row to "optimalThresholds_SAF.csv".

    Writes the module globals `algoStr` and `allEntries`, which the
    loss-side helpers (getDistances / get_y_actual_SAF / getLoss_SAF)
    read implicitly.
    """
    global algoStr
    global allEntries
    rows = []
    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    skipAlgos = []  # algorithm names to leave out of the search
    for appName in APPS:
        print(appName)
        # Pull the labelled entries for this app only.
        connectToDB(dbName)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(appName))
        closeDBConnection()
        print(len(allEntries))
        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            if algoStr in skipAlgos:
                continue
            print(algoStr)
            getDistances(algoStr)
            get_y_actual_SAF()
            searchSpace = {
                't': hp.uniform('t', 0, getMax(algoStr)),
            }
            try:
                best = fmin(fn=getLoss_SAF,
                            space=searchSpace,
                            algo=tpe.suggest,
                            max_evals=1000)
                rows.append({
                    'thresholdSet': "optimal",
                    'appName': appName,
                    'algoName': algoStr,
                    'thre': best['t']
                })
                print(best)
            except Exception as ex:
                # Best-effort: report the failure and move on to the
                # next algorithm instead of aborting the whole sweep.
                print(ex)
                print(
                    "Error getting optimal threshold for {0}".format(algoStr))
    writeCSV(['thresholdSet', 'appName', 'algoName', 'thre'], rows,
             "optimalThresholds_SAF.csv")
def getOptimalThresholds_Classification(iterations=10000):
    """Jointly optimise the clone ('tc') and near-duplicate ('tn')
    thresholds for every algorithm via hyperopt TPE, then write the
    results to a timestamped "optimalThresholds_Classification*.csv".

    Args:
        iterations: number of fmin evaluations per algorithm.

    Writes the module globals `algoStr` and `allEntries`, which the
    loss-side helpers read implicitly.
    """
    global algoStr
    global allEntries
    results = []
    # Load every labelled entry from the GT10 database once.
    connectToDB("/Test/gt10/DS.db")
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    get_y_actual_Classification()
    for algo in ALGOS:
        algoStr = str(algo).split('.')[1].upper()
        print(algoStr)
        getDistances(algoStr)
        searchSpace = {
            'tc': hp.uniform('tc', 0, getMax(algoStr)),
            'tn': hp.uniform('tn', 0, getMax(algoStr))
        }
        best = fmin(fn=getLoss_Classification,
                    space=searchSpace,
                    algo=tpe.suggest,
                    max_evals=iterations)
        results.append({
            'thresholdSet': "optimal",
            'algoName': algoStr,
            'c-thre': best['tc'],
            'n-thre': best['tn']
        })
        print(best)
    writeCSV(
        ['thresholdSet', 'algoName', 'c-thre', 'n-thre'], results,
        "optimalThresholds_Classification"
        + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
def getOptimalThreshold_SAF(algoString, iterations=10000):
    """Find the per-app optimal SAF threshold for one algorithm.

    Runs a hyperopt TPE search (minimising getLoss_SAF) separately for
    each app in APPS, writes all rows to a timestamped CSV, and returns
    the list of result rows.

    Args:
        algoString: upper-case algorithm name to optimise for.
        iterations: number of fmin evaluations per app.

    Returns:
        List of dicts with keys thresholdSet/appName/algoName/thre.

    Writes the module globals `algoStr` and `allEntries`, read by the
    loss-side helpers.
    """
    global algoStr
    global allEntries
    results = []
    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    algoStr = algoString
    print(algoStr)
    for appName in APPS:
        # Restrict the labelled entries to the current app.
        connectToDB(dbName)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(appName))
        closeDBConnection()
        print(len(allEntries))
        getDistances(algoStr)
        get_y_actual_SAF()
        best = fmin(fn=getLoss_SAF,
                    space={'t': hp.uniform('t', 0, getMax(algoStr))},
                    algo=tpe.suggest,
                    max_evals=iterations)
        results.append({
            'thresholdSet': "optimal",
            'appName': appName,
            'algoName': algoStr,
            'thre': best['t']
        })
        print(best)
    writeCSV(
        ['thresholdSet', 'appName', 'algoName', 'thre'], results,
        "optimalThresholds_SAF_" + algoStr + "_" + str(iterations)
        + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
    return results
def getOptimalThreshold_SAF_Universal(algoString, iterations=10000):
    """Find one app-independent ("Universal") optimal SAF threshold for
    a single algorithm over the whole GT10 database.

    Args:
        algoString: upper-case algorithm name to optimise for.
        iterations: number of fmin evaluations.

    Returns:
        Single-element list holding the result row dict.

    Writes the module globals `algoStr` and `allEntries`, read by the
    loss-side helpers.
    """
    global algoStr
    global allEntries
    algoStr = algoString
    print(algoStr)
    # One search over all labelled entries, not split per app.
    connectToDB("/Test/gt10/DS.db")
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    getDistances(algoStr)
    get_y_actual_SAF()
    best = fmin(fn=getLoss_SAF,
                space={'t': hp.uniform('t', 0, getMax(algoStr))},
                algo=tpe.suggest,
                max_evals=iterations)
    results = [{
        'thresholdSet': "optimal",
        'appName': "Universal",
        'algoName': algoStr,
        'thre': best['t']
    }]
    print(best)
    writeCSV(
        ['thresholdSet', 'appName', 'algoName', 'thre'], results,
        "optimalThresholds_SAF_Universal" + algoStr + "_" + str(iterations)
        + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
    return results
def testDummyClassifier():
    """Score a stratified DummyClassifier baseline on both ground-truth
    databases and write the two result rows to "rq1_dummy.csv".

    The classifier is fitted once on a fixed toy label distribution
    (equal-ish mix of classes 0/1/2); a stratified dummy predicts from
    that distribution and ignores the input features entirely, so the
    scores are a chance baseline for the three-way classification task.
    """
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    X = [[0]] * 10
    y = [0, 1, 2, 0, 1, 2, 0, 0, 1, 2]
    dummyClassifier.fit(X, y)

    def _scoreDB(dbName):
        # Fetch the labelled entries from one DB, pull the human
        # classification column, and score the dummy predictions
        # against it (macro-averaged).
        connectToDB(dbName)
        entries = fetchAllNearDuplicates("where human_classification>=0")
        closeDBConnection()
        # Per the row layout used elsewhere in this file, the human
        # label sits just past the per-algorithm columns, i.e. at
        # index 4 + len(ALGOS).
        labelIndex = 4 + len(ALGOS)
        y_actual = [entry[labelIndex] for entry in entries]
        y_pred = dummyClassifier.predict(y_actual)
        precision, recall, f1, support = metrics.precision_recall_fscore_support(
            y_actual, y_pred, average="macro")
        return {
            'thresholdSet': None,
            'algoName': "dummy",
            'c-thre': None,
            'n-thre': None,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision',
        'recall', 'f1'
    ]
    row1 = _scoreDB("/Test/gt10/gt10_last500Responses.db")
    print(row1)
    row2 = _scoreDB("/comparator/src/main/resources/GoldStandards/SS.db")
    print(row2)
    writeCSV(fieldNames, [row1, row2], "rq1_dummy.csv")
def getF1_Classifier(allEntries):
    """Evaluate three-way classification (0 = clone, 1 = near-duplicate,
    2 = distinct) for every algorithm under each threshold set, plus a
    stratified dummy baseline, and write all score rows to a
    timestamped "rq1_*.csv" under RESULTS_FOLDER.

    Args:
        allEntries: DB rows. Per the indexing below, the per-algorithm
            distance values occupy columns 4 .. 3+len(ALGOS) and the
            human classification label the column after them.

    Each threshold set is a [cloneThresholds, ndThresholds] pair of
    per-algorithm threshold dicts; `algo.value[2] == "lt"` marks
    algorithms where a LOWER value means more similar.
    """
    print(len(allEntries))
    threshold_sets = {}
    threshold_sets["statistical"] = [
        getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3,
                     DB_SETS.GT10_DB_DATA, 'all'),
        getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN,
                     DB_SETS.GT10_DB_DATA, 'all')
    ]
    threshold_sets["optimal"] = [
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                     DB_SETS.GT10_DB_DATA, 'all'),
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                     DB_SETS.GT10_DB_DATA, 'all')
    ]
    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision',
        'recall', 'f1'
    ]
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    print(threshold_sets)
    y_actual = []
    for threshold_set_name in threshold_sets:
        threshold_set = threshold_sets[threshold_set_name]
        cloneThresholds = threshold_set[0]
        ndThresholds = threshold_set[1]
        # BUG FIX: reset the accumulators for every threshold set.
        # Previously y_pred/y_actual were created once before this loop
        # and kept growing, so the second set's metrics were computed
        # over a mix of both sets' predictions and duplicated labels.
        y_pred = {}
        for algo in ALGOS:
            y_pred[str(algo).split('.')[1].upper()] = []
        y_actual = []
        for entry in allEntries:
            index = 4
            for algo in ALGOS:
                algoStr = str(algo).split('.')[1].upper()
                value = float(entry[index])
                pred = -1
                if algo.value[2] == "lt":
                    # Lower value = more similar.
                    if value <= cloneThresholds[algoStr]:
                        pred = 0
                    if value > cloneThresholds[algoStr]:
                        if value <= ndThresholds[algoStr]:
                            pred = 1
                        else:
                            pred = 2
                else:
                    # Higher value = more similar.
                    if value >= cloneThresholds[algoStr]:
                        pred = 0
                    if value < cloneThresholds[algoStr]:
                        if value >= ndThresholds[algoStr]:
                            pred = 1
                        else:
                            pred = 2
                y_pred[algoStr].append(pred)
                index = index + 1
            # After the algo loop, index points at the human label column.
            y_actual.append(entry[index])
        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            precision, recall, f1, support = metrics.precision_recall_fscore_support(
                y_actual, y_pred[algoStr], average="macro")
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'c-thre': cloneThresholds[algoStr],
                'n-thre': ndThresholds[algoStr],
                'precision': precision,
                'recall': recall,
                'f1': f1
            })
    # Dummy baseline: fitted on the actual label distribution and scored
    # once (it ignores features, so one row suffices).
    X = [[0]] * len(y_actual)
    dummyClassifier.fit(X, y_actual)
    y_pred_dummy = dummyClassifier.predict(y_actual)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred_dummy, average="macro")
    algoScoreRows.append({
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })
    writeCSV(
        fieldNames, algoScoreRows,
        os.path.join(
            os.path.abspath(".."), RESULTS_FOLDER,
            "rq1_" + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv"))