def getOptimalThresholds_SAF():
    """Tune a per-app threshold for every algorithm (SAF setting) with
    hyperopt's TPE search and write all optima to optimalThresholds_SAF.csv."""
    global algoStr
    global allEntries
    optimalThresholds = []
    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    # dbName = "/Test/gt10/DS.db"
    excludeAlgos = []
    for appName in APPS:
        print(appName)
        connectToDB(dbName)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(appName))
        closeDBConnection()
        print(len(allEntries))
        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            if algoStr in excludeAlgos:
                continue
            print(algoStr)
            getDistances(algoStr)
            get_y_actual_SAF()
            space = {
                't': hp.uniform('t', 0, getMax(algoStr)),
            }
            try:
                best = fmin(fn=getLoss_SAF,
                            space=space,
                            algo=tpe.suggest,
                            max_evals=1000)
                row = {
                    'thresholdSet': "optimal",
                    'appName': appName,
                    'algoName': algoStr,
                    'thre': best['t']
                }
                optimalThresholds.append(row)
                print(best)
            except Exception as ex:
                print(ex)
                print("Error getting optimal threshold for {0}".format(algoStr))
    fieldNames = ['thresholdSet', 'appName', 'algoName', 'thre']
    writeCSV(fieldNames, optimalThresholds, "optimalThresholds_SAF.csv")
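
# --- Illustrative sketch only -------------------------------------------------
# getLoss_SAF (the fmin objective above) is defined elsewhere in this module.
# The helper below is NOT that implementation; it is a hypothetical sketch of
# the shape such an objective can take: hyperopt minimizes the return value,
# so returning negative macro F1 of the thresholded predictions rewards better
# thresholds. The explicit distances/y_true parameters stand in for whatever
# globals getDistances()/get_y_actual_SAF() actually populate, and the sketch
# reuses the sklearn `metrics` alias this script already imports for its other
# metric calls (an assumption about the top-of-file imports).
def _sketch_loss_SAF(params, distances, y_true):
    # Pairs at or below the threshold are predicted 1 ("same"), others 0;
    # this label encoding is an assumption made purely for illustration.
    y_pred = [1 if d <= params['t'] else 0 for d in distances]
    return -metrics.f1_score(y_true, y_pred, average="macro")
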
def getOptimalThresholds_Classification(iterations=10000):
    """Tune a pair of thresholds ('tc', 'tn') per algorithm for the
    classification setting over the whole dataset and write the optima to a
    timestamped CSV."""
    global algoStr
    global allEntries
    optimalThresholds = []
    dbName = "/Test/gt10/DS.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    get_y_actual_Classification()
    for algo in ALGOS:
        algoStr = str(algo).split('.')[1].upper()
        print(algoStr)
        getDistances(algoStr)
        space = {
            'tc': hp.uniform('tc', 0, getMax(algoStr)),
            'tn': hp.uniform('tn', 0, getMax(algoStr))
        }
        best = fmin(fn=getLoss_Classification,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=iterations)
        row = {
            'thresholdSet': "optimal",
            'algoName': algoStr,
            'c-thre': best['tc'],
            'n-thre': best['tn']
        }
        optimalThresholds.append(row)
        print(best)
    fieldNames = ['thresholdSet', 'algoName', 'c-thre', 'n-thre']
    writeCSV(
        fieldNames, optimalThresholds,
        "optimalThresholds_Classification" +
        str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
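
# --- Illustrative sketch only -------------------------------------------------
# Likewise, getLoss_Classification is defined elsewhere. A hypothetical
# two-threshold objective matching the 'tc'/'tn' search space above could map
# each distance to one of three classes and again return negative macro F1.
# The class ordering (0 below the lower threshold, 1 between, 2 above) is an
# assumption made purely for illustration, not the module's actual encoding.
def _sketch_loss_classification(params, distances, y_true):
    # The two thresholds may be sampled in either order, so sort them first.
    lo, hi = sorted((params['tc'], params['tn']))
    y_pred = [0 if d <= lo else (1 if d <= hi else 2) for d in distances]
    return -metrics.f1_score(y_true, y_pred, average="macro")
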
def getOptimalThreshold_SAF(algoString, iterations=10000):
    """Tune a per-app SAF threshold for a single algorithm; returns the optima
    and also writes them to a timestamped CSV."""
    global algoStr
    global allEntries
    optimalThresholds = []
    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    algoStr = algoString
    print(algoStr)
    for appName in APPS:
        connectToDB(dbName)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(appName))
        closeDBConnection()
        print(len(allEntries))
        getDistances(algoStr)
        get_y_actual_SAF()
        space = {
            't': hp.uniform('t', 0, getMax(algoStr)),
        }
        best = fmin(fn=getLoss_SAF,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=iterations)
        row = {
            'thresholdSet': "optimal",
            'appName': appName,
            'algoName': algoStr,
            'thre': best['t']
        }
        optimalThresholds.append(row)
        print(best)
    fieldNames = ['thresholdSet', 'appName', 'algoName', 'thre']
    writeCSV(
        fieldNames, optimalThresholds,
        "optimalThresholds_SAF_" + algoStr + "_" + str(iterations) +
        str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
    return optimalThresholds
def getOptimalThreshold_SAF_Universal(algoString, iterations=10000):
    """Tune a single, app-agnostic ("Universal") SAF threshold for one
    algorithm over the whole dataset; returns the optima and writes them to a
    timestamped CSV."""
    global algoStr
    global allEntries
    optimalThresholds = []
    algoStr = algoString
    print(algoStr)
    dbName = "/Test/gt10/DS.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    getDistances(algoStr)
    get_y_actual_SAF()
    space = {
        't': hp.uniform('t', 0, getMax(algoStr)),
    }
    best = fmin(fn=getLoss_SAF,
                space=space,
                algo=tpe.suggest,
                max_evals=iterations)
    row = {
        'thresholdSet': "optimal",
        'appName': "Universal",
        'algoName': algoStr,
        'thre': best['t']
    }
    optimalThresholds.append(row)
    print(best)
    fieldNames = ['thresholdSet', 'appName', 'algoName', 'thre']
    writeCSV(
        fieldNames, optimalThresholds,
        "optimalThresholds_SAF_Universal" + algoStr + "_" + str(iterations) +
        str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
    return optimalThresholds
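
# Hypothetical usage (not part of the original driver code): tune a universal
# and a per-app threshold for one algorithm with a reduced evaluation budget.
# The algorithm string below is only a placeholder; real values come from the
# ALGOS enum used above.
#
#     universalRows = getOptimalThreshold_SAF_Universal("RTED", iterations=500)
#     perAppRows = getOptimalThreshold_SAF("RTED", iterations=500)
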
def testDummyClassifier():
    """Baseline for RQ1: fit a stratified DummyClassifier and report macro
    precision/recall/F1 against the human labels in two ground-truth
    databases, writing both rows to rq1_dummy.csv."""
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    X = [[0]] * 10
    y = [0, 1, 2, 0, 1, 2, 0, 0, 1, 2]
    dummyClassifier.fit(X, y)

    dbName = "/Test/gt10/gt10_last500Responses.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    y_actual = []
    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]
    for entry in allEntries:
        # advance the column index past the per-algorithm columns
        index = 4
        for algo in ALGOS:
            index = index + 1
        y_actual.append(entry[index])
    y_pred = dummyClassifier.predict(y_actual)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred, average="macro")
    # precision = metrics.precision(y_actual, y_pred[algoStr], average="macro")
    # recall = metrics.recall(y_actual, y_pred[algoStr], average="macro")
    # f1 = metrics.f1_score(y_actual, y_pred[algoStr], average="macro")
    row1 = {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    print(row1)

    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    y_actual = []
    for entry in allEntries:
        index = 4
        for algo in ALGOS:
            index = index + 1
        y_actual.append(entry[index])
    y_pred = dummyClassifier.predict(y_actual)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred, average="macro")
    row2 = {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    print(row2)
    writeCSV(fieldNames, [row1, row2], "rq1_dummy.csv")
        tags = entry[15]
        if classification == 1:
            total += 1
            if "dditional" in tags:
                nd3count += 1
            else:
                nd2count += 1
    print(total)
    print(nd2count)
    print(nd3count)


if __name__ == "__main__":
    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    # testDummyClassifier()
    # getNdCategories(allEntries)

    # dbName = "/Test/gt10/gt10_last500Responses.db"
    # connectToDB(dbName)
    # allEntries = fetchAllNearDuplicates("where human_classification>=0")
    # closeDBConnection()
    # getF1_Classifier(allEntries)

    getF1_SAF_allrows(allEntries)
    # y_pred = {}
def randomPairOuput():
    """Sample unclassified state pairs from the DB, combine their screenshots
    into pair images, and render them into an HTML page (plus a JSON dump)
    for manual review."""
    DB_PATH = "/gt10_Doms/"
    DB_NAME = "DS.db"
    CRAWL_PATH = "/gt10_Doms/"
    RESOURCES = os.path.join(
        os.path.abspath("/state-abstraction-study/HTMLStuff/resources/"), '')
    TEMPLATE_DIR = os.path.join(
        os.path.abspath("/state-abstraction-study/HTMLStuff/"), '')
    TEMPLATE_HTML = "jinjaTemplate.html"
    timeString = str(datetime.now().strftime("%Y%m%d-%H%M%S"))
    NUMBER = 500
    OUTPUT_PATH = os.path.join(os.path.abspath("/htmloutputs/"),
                               "htmloutput_" + str(NUMBER) + "_" + timeString)
    OUTPUT_HTML_NAME = "randomPairOutput.html"
    IMAGES = "images"
    saveJsonName = "responseResults_" + str(NUMBER) + "_" + timeString + ".json"

    if len(sys.argv) <= 6:
        print("You have not provided ENOUGH arguments. ")
        print(
            "USAGE : program <DB_PATH> <DB_NAME> <CRAWL_PATH> <OUTPUT_PATH> <OUTPUT_HTML_NAME> <NUMBEROFPAIRS>"
        )
        print(
            "Do you want to use defaults DB : {0}, CRAWLS : {1}, OutputPath: {2}, OutputName : {3} and Number : {4} ?"
            .format(DB_PATH + DB_NAME, CRAWL_PATH, OUTPUT_PATH,
                    OUTPUT_HTML_NAME, NUMBER))
        response = input("Y/N : ").strip().lower()
        if response == 'y':
            print(
                'Okay. Continuing with Defaults : Your output will be available at : '
                + OUTPUT_PATH)
        else:
            DB_PATH = input("DB_PATH:").strip()
            DB_NAME = input("DB_NAME:").strip()
            CRAWL_PATH = input("CRAWL_PATH:").strip()
            OUTPUT_PATH = input("OUTPUT_PATH:").strip()
            OUTPUT_HTML_NAME = input("OUTPUT_HTML_NAME:").strip()
            MAX_RETRY_NUM = 3
            numberNotConfirmed = True
            retryNum = 0
            while numberNotConfirmed:
                try:
                    NUMBER = int(input("NUMBER_OF_PAIRS:").strip())
                    break
                except Exception as e:
                    print(e)
                    if retryNum >= MAX_RETRY_NUM:
                        print("EXCEEDED MAX RETRY. ABORTING!! ")
                        print(
                            "USAGE : program <DB_PATH> <DB_NAME> <CRAWL_PATH> <OUTPUT_PATH> <OUTPUT_HTML_NAME> <NUMBEROFPAIRS>"
                        )
                        sys.exit()
                    retryNum += 1
                    print("Please provide a valid number.")
    elif len(sys.argv) == 7:
        DB_PATH = sys.argv[1]
        DB_NAME = sys.argv[2]
        CRAWL_PATH = sys.argv[3]
        OUTPUT_PATH = sys.argv[4]
        OUTPUT_HTML_NAME = sys.argv[5]
        NUMBER = int(sys.argv[6])

    DB_PATH = os.path.join(os.path.abspath(DB_PATH.strip()), '')
    CRAWL_PATH = os.path.join(os.path.abspath(CRAWL_PATH.strip()), '')
    OUTPUT_PATH = os.path.join(os.path.abspath(OUTPUT_PATH.strip()), '')
    print(
        "USING THESE VALUES FOR PROGRAM: DB : {0}, CRAWLS : {1}, OutputPath: {2}, OutputName : {3} and Number : {4} ?"
        .format(DB_PATH + DB_NAME, CRAWL_PATH, OUTPUT_PATH, OUTPUT_HTML_NAME,
                NUMBER))
    response = input("Continue ? Y/N : ").strip().lower()
    if response == 'y':
        print("Okay. Continuing!!")
    else:
        print("ABORTING!! TRY AGAIN")
        print(
            "USAGE : program <DB_PATH> <DB_NAME> <CRAWL_PATH> <OUTPUT_PATH> <OUTPUT_HTML_NAME> <NUMBEROFPAIRS>"
        )
        sys.exit()

    if not os.path.exists(DB_PATH + DB_NAME):
        print("DB DOES NOT EXIST. ABORTING!!")
        sys.exit()
    if not os.path.exists(CRAWL_PATH):
        print("CRAWL PATH DOES NOT EXIST. ABORTING")
        sys.exit()
    if not os.path.exists(RESOURCES):
        print("RESOURCES NOT FOUND AT : {0} ".format(RESOURCES))
        RESOURCES = os.path.join(
            os.path.abspath(input("Provide RESOURCES FOLDER.").strip()), '')
        if not os.path.exists(RESOURCES):
            print("PROVIDED RESOURCES PATH DOES NOT EXIST : {0}. ABORTING!!".
                  format(RESOURCES))
            sys.exit()
    if not os.path.exists(TEMPLATE_DIR + TEMPLATE_HTML):
        print("TEMPLATE {0} NOT FOUND AT : {1} ".format(TEMPLATE_HTML,
                                                        TEMPLATE_DIR))
        TEMPLATE_DIR = os.path.join(
            os.path.abspath(
                input("Provide FOLDER where TEMPLATE HTML IS.").strip()), '')
        if not os.path.exists(TEMPLATE_DIR + TEMPLATE_HTML):
            print("TEMPLATE {0} NOT FOUND AT : {1} ".format(TEMPLATE_HTML,
                                                            TEMPLATE_DIR))
            TEMPLATE_HTML = input("Provide HTML Template to use.").strip()
            if not os.path.exists(RESOURCES + TEMPLATE_HTML):
                print("TEMPLATE {0} NOT FOUND AT : {1} . ABORTING !!".format(
                    TEMPLATE_HTML, TEMPLATE_DIR))
                sys.exit()
    else:
        print("USING TEMPLATE {0} at {1}".format(TEMPLATE_HTML, TEMPLATE_DIR))

    outputPathConfirmed, OUTPUT_PATH = confirmOutputPath(OUTPUT_PATH, RESOURCES)
    if not outputPathConfirmed:
        print(
            "USAGE : program <DB_PATH> <DB_NAME> <CRAWL_PATH> <OUTPUT_PATH> <OUTPUT_HTML_NAME> <NUMBEROFPAIRS>"
        )
        sys.exit()

    connectToDB(DB_PATH + DB_NAME)
    # randomNDs = fetchRandomNearDuplicates(NUMBER*2)
    # randomNDs = fetchRandomNearDuplicates(NUMBER*2, "appname in (select name from apps where numAddedStates>=10) AND ")
    randomNDs = fetchRandomNearDuplicates(NUMBER * 2,
                                          "HUMAN_CLASSIFICATION==-1 AND ")
    closeDBConnection()

    data = []
    numCreated = 0
    missing = 0
    for randND in randomNDs:
        jsonRecord = {}
        appName = randND[0]
        crawl = randND[1]
        state1 = randND[2]
        state2 = randND[3]
        jsonRecord['appname'] = appName
        jsonRecord['crawl'] = crawl
        jsonRecord['state1'] = state1
        jsonRecord['state2'] = state2
        image1Name = state1 + ".png"
        image2Name = state2 + ".png"
        image1 = searchImageInCrawlPath(appName, crawl, image1Name, CRAWL_PATH)
        image2 = searchImageInCrawlPath(appName, crawl, image2Name, CRAWL_PATH)
        if (image1 is None) or (image2 is None):
            print("IMAGES NOT FOUND ABORTING THIS PAIR : " + str(jsonRecord))
            missing += 1
            continue
        ndImageName = appName + "_" + crawl + "_" + state1 + "_" + state2 + ".jpg"
        destination = OUTPUT_PATH + IMAGES + "/" + ndImageName
        if createNDImage(image1, image2, destination):
            numCreated += 1
        else:
            print("COULD NOT CREATE ND IMAGE FOR RECORD : " + str(jsonRecord))
        jsonRecord['image'] = ndImageName
        jsonRecord['response'] = randND[14]
        jsonRecord['tags'] = ""
        jsonRecord['comments'] = ""
        data.append(jsonRecord)
        if numCreated >= NUMBER:
            break

    jsonData = json.dumps(data)
    TEMPLATE_HTML_PATH = os.path.abspath(
        os.path.join(TEMPLATE_DIR, TEMPLATE_HTML))
    print(jsonData)
    output_html_doc(TEMPLATE_DIR,
                    TEMPLATE_HTML,
                    OUTPUT_PATH + OUTPUT_HTML_NAME,
                    jsonData,
                    saveJsonName=saveJsonName)
    print("FAILED CREATING {0} PAIRS BECAUSE IMAGES WERE MISSING".format(
        missing))
    print("OUTPUT HTML CREATED WITH : {0} IMAGE PAIRS".format(numCreated))
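
# Hypothetical invocation of randomPairOuput(), assuming it is wired into a
# __main__ entry point. With fewer than seven argv entries it prompts to
# confirm the defaults; with exactly seven it runs non-interactively, e.g.
# (script name and paths below are placeholders):
#
#     python randomPairOutput.py /gt10_Doms/ DS.db /gt10_Doms/ /htmloutputs/run1/ randomPairOutput.html 500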