Exemple #1
0
def getOptimalThresholds_SAF(iterations=1000, excludeAlgos=()):
    """Search the best single threshold per (app, algorithm) pair.

    For every app in ``APPS`` the rows with ``human_classification>=0`` for
    that app are fetched from the SS gold-standard DB into the module-level
    ``allEntries``. For every algorithm in ``ALGOS`` the module-level
    ``algoStr`` is set, ``getDistances`` and ``get_y_actual_SAF`` are run,
    and hyperopt's TPE minimises ``getLoss_SAF`` over one parameter ``t``
    drawn uniformly from ``[0, getMax(algoStr)]``.

    The results are written to ``optimalThresholds_SAF.csv`` via ``writeCSV``.

    Args:
        iterations: Number of hyperopt evaluations per (app, algorithm)
            search. Defaults to 1000, the value that used to be hard-coded.
        excludeAlgos: Upper-cased algorithm names to skip. Empty by default,
            which matches the previous behaviour.

    Returns:
        The list of result rows (dicts keyed by ``thresholdSet``,
        ``appName``, ``algoName``, ``thre``) that was written to the CSV.
    """
    # NOTE(review): algoStr / allEntries are module globals; the helpers
    # called below take no data arguments, so they presumably read them.
    global algoStr
    global allEntries
    optimalThresholds = []

    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"

    # dbName = "/Test/gt10/DS.db"
    for appName in APPS:
        print(appName)
        connectToDB(dbName)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(appName))
        closeDBConnection()
        print(len(allEntries))
        for algo in ALGOS:
            # str(algo) is expected to look like "<prefix>.<name>"; keep the
            # part after the first dot, upper-cased.
            algoStr = str(algo).split('.')[1].upper()
            if algoStr in excludeAlgos:
                continue
            print(algoStr)
            getDistances(algoStr)
            get_y_actual_SAF()
            space = {
                't': hp.uniform('t', 0, getMax(algoStr)),
            }
            try:
                best = fmin(fn=getLoss_SAF,
                            space=space,
                            algo=tpe.suggest,
                            max_evals=iterations)
            except Exception as ex:
                # Best effort: a failed search for one algorithm must not
                # abort the remaining (app, algorithm) combinations.
                print(ex)
                print(
                    "Error getting optimal threshold for {0}".format(algoStr))
            else:
                optimalThresholds.append({
                    'thresholdSet': "optimal",
                    'appName': appName,
                    'algoName': algoStr,
                    'thre': best['t'],
                })
                print(best)

    fieldNames = ['thresholdSet', 'appName', 'algoName', 'thre']
    writeCSV(fieldNames, optimalThresholds, "optimalThresholds_SAF.csv")
    return optimalThresholds
Exemple #2
0
def getOptimalThresholds_Classification(iterations=10000):
    """Search the best pair of classification thresholds per algorithm.

    Loads all rows with ``human_classification>=0`` from the DS database into
    the module-level ``allEntries`` and runs ``get_y_actual_Classification``
    once. Then, for each algorithm in ``ALGOS``, sets the module-level
    ``algoStr``, runs ``getDistances`` and lets hyperopt's TPE minimise
    ``getLoss_Classification`` over ``tc`` and ``tn``, each drawn uniformly
    from ``[0, getMax(algoStr)]``.

    The rows are written through ``writeCSV`` to a file named
    ``optimalThresholds_Classification<timestamp>.csv``.

    Args:
        iterations: Number of hyperopt evaluations per algorithm.
    """
    global algoStr, allEntries

    connectToDB("/Test/gt10/DS.db")
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    get_y_actual_Classification()

    resultRows = []
    for algo in ALGOS:
        # Keep the part of str(algo) after the first dot, upper-cased.
        algoStr = str(algo).split('.')[1].upper()
        print(algoStr)
        getDistances(algoStr)

        # Both thresholds share the same range; 'tc' is built before 'tn'.
        searchSpace = {
            label: hp.uniform(label, 0, getMax(algoStr))
            for label in ('tc', 'tn')
        }

        bestParams = fmin(fn=getLoss_Classification,
                          space=searchSpace,
                          algo=tpe.suggest,
                          max_evals=iterations)

        resultRows.append({
            'thresholdSet': "optimal",
            'algoName': algoStr,
            'c-thre': bestParams['tc'],
            'n-thre': bestParams['tn'],
        })

        print(bestParams)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    csvName = "optimalThresholds_Classification" + timestamp + ".csv"
    writeCSV(['thresholdSet', 'algoName', 'c-thre', 'n-thre'], resultRows,
             csvName)
Exemple #3
0
def getOptimalThreshold_SAF(algoString, iterations=10000):
    """Search the best threshold of one algorithm separately for each app.

    Sets the module-level ``algoStr`` to ``algoString``. For every app in
    ``APPS`` the rows with ``human_classification>=0`` for that app are
    loaded from the SS gold-standard DB into the module-level ``allEntries``,
    ``getDistances`` and ``get_y_actual_SAF`` are run, and hyperopt's TPE
    minimises ``getLoss_SAF`` over ``t`` in ``[0, getMax(algoStr)]``.

    Args:
        algoString: Algorithm name, stored as-is in ``algoStr``.
        iterations: Number of hyperopt evaluations per app.

    Returns:
        The list of per-app result rows, which is also written via
        ``writeCSV``.
    """
    global algoStr, allEntries

    algoStr = algoString
    print(algoStr)

    goldStandardDb = "/comparator/src/main/resources/GoldStandards/SS.db"
    appQuery = 'where human_classification>=0 and appname="{0}"'
    perAppRows = []

    for appName in APPS:
        connectToDB(goldStandardDb)
        allEntries = fetchAllNearDuplicates(appQuery.format(appName))
        closeDBConnection()
        print(len(allEntries))

        getDistances(algoStr)
        get_y_actual_SAF()

        bestParams = fmin(fn=getLoss_SAF,
                          space={'t': hp.uniform('t', 0, getMax(algoStr))},
                          algo=tpe.suggest,
                          max_evals=iterations)

        perAppRows.append({
            'thresholdSet': "optimal",
            'appName': appName,
            'algoName': algoStr,
            'thre': bestParams['t'],
        })
        print(bestParams)

    # NOTE(review): there is no separator between the iteration count and
    # the timestamp in the file name; kept as-is in case something matches
    # on the existing names.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    csvName = ("optimalThresholds_SAF_" + algoStr + "_" + str(iterations) +
               timestamp + ".csv")
    writeCSV(['thresholdSet', 'appName', 'algoName', 'thre'], perAppRows,
             csvName)
    return perAppRows
Exemple #4
0
def getOptimalThreshold_SAF_Universal(algoString, iterations=10000):
    """Search one threshold for an algorithm across all apps at once.

    Sets the module-level ``algoStr`` to ``algoString``, loads every row with
    ``human_classification>=0`` from the DS database into the module-level
    ``allEntries``, runs ``getDistances`` and ``get_y_actual_SAF``, and lets
    hyperopt's TPE minimise ``getLoss_SAF`` over ``t`` in
    ``[0, getMax(algoStr)]``.

    Args:
        algoString: Algorithm name, stored as-is in ``algoStr``.
        iterations: Number of hyperopt evaluations.

    Returns:
        A one-element list holding the result row (``appName`` is the
        literal ``"Universal"``); the same list is written via ``writeCSV``.
    """
    global algoStr, allEntries

    algoStr = algoString
    print(algoStr)

    connectToDB("/Test/gt10/DS.db")
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    getDistances(algoStr)
    get_y_actual_SAF()

    bestParams = fmin(fn=getLoss_SAF,
                      space={'t': hp.uniform('t', 0, getMax(algoStr))},
                      algo=tpe.suggest,
                      max_evals=iterations)

    universalRows = [{
        'thresholdSet': "optimal",
        'appName': "Universal",
        'algoName': algoStr,
        'thre': bestParams['t'],
    }]
    print(bestParams)

    # NOTE(review): "Universal" runs straight into the algorithm name and
    # the iteration count into the timestamp; kept as-is in case something
    # matches on the existing names.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    csvName = ("optimalThresholds_SAF_Universal" + algoStr + "_" +
               str(iterations) + timestamp + ".csv")
    writeCSV(['thresholdSet', 'appName', 'algoName', 'thre'], universalRows,
             csvName)
    return universalRows
Exemple #5
0
def _dummyMetricsRow(dummyClassifier, dbName):
    """Score ``dummyClassifier`` against the labels stored in ``dbName``.

    Fetches every row with ``human_classification>=0``, takes one column per
    row as the actual label, asks the classifier for one prediction per row
    and returns a result row with macro-averaged precision, recall and F1.
    """
    connectToDB(dbName)
    entries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    # Column 4 + <number of algorithms>, computed once instead of per row.
    # NOTE(review): presumably the human_classification column - confirm
    # against the table layout.
    labelIndex = 4 + sum(1 for _ in ALGOS)
    y_actual = [entry[labelIndex] for entry in entries]

    # The labels are passed as the feature argument only to convey the
    # number of samples; the classifier was fitted on constant features.
    y_pred = dummyClassifier.predict(y_actual)

    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        y_actual, y_pred, average="macro")
    return {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def testDummyClassifier():
    """Write baseline scores of a stratified dummy classifier to a CSV.

    A ``DummyClassifier(strategy="stratified")`` is fitted on a fixed
    ten-sample label list over the classes 0, 1 and 2, then scored on the
    ``gt10_last500Responses`` DB and on the SS gold-standard DB. Each result
    row is printed and both are written to ``rq1_dummy.csv``.

    NOTE(review): no ``random_state`` is passed to the classifier, so the
    scores are expected to differ between runs.
    """
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")

    X = [[0]] * 10
    y = [0, 1, 2, 0, 1, 2, 0, 0, 1, 2]
    dummyClassifier.fit(X, y)

    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]

    row1 = _dummyMetricsRow(dummyClassifier,
                            "/Test/gt10/gt10_last500Responses.db")
    print(row1)

    row2 = _dummyMetricsRow(
        dummyClassifier,
        "/comparator/src/main/resources/GoldStandards/SS.db")
    print(row2)

    writeCSV(fieldNames, [row1, row2], "rq1_dummy.csv")
Exemple #6
0
        tags = entry[15]

        if classification == 1:
            total += 1
            if "dditional" in tags:
                nd3count += 1
            else:
                nd2count += 1
    print(total)
    print(nd2count)
    print(nd3count)


if __name__ == "__main__":
    # Script entry point: fetch every row with human_classification >= 0
    # from the SS gold-standard DB and pass them to getF1_SAF_allrows.
    # dbName and allEntries are bound at module scope here.
    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    # Other entry points, currently switched off by being commented out.
    # testDummyClassifier()
    # getNdCategories(allEntries)

    # Switched-off reload of allEntries from the gt10_last500Responses DB.
    # dbName = "/Test/gt10/gt10_last500Responses.db"
    # connectToDB(dbName)
    # allEntries = fetchAllNearDuplicates("where human_classification>=0")
    # closeDBConnection()

    # getF1_Classifier(allEntries)
    getF1_SAF_allrows(allEntries)

    # y_pred = {}
Exemple #7
0
def randomPairOuput():
    """Build an HTML page showing random, not yet classified state pairs.

    Steps, in order:

    1. Resolve DB, crawl and output locations plus the number of pairs,
       either from exactly six command-line arguments or interactively
       (defaults can be accepted with ``y``), then ask for confirmation.
    2. Verify that the DB, crawl folder, resources folder and HTML template
       exist, prompting once for a replacement of the latter two.
    3. Confirm the output path through ``confirmOutputPath``.
    4. Fetch ``2 * NUMBER`` random rows with ``HUMAN_CLASSIFICATION==-1``,
       look up both screenshots of each pair, combine them with
       ``createNDImage`` and stop once ``NUMBER`` images were created.
    5. Render the collected records into the template via
       ``output_html_doc``.

    Exits via ``sys.exit()`` when the user declines, a required path is
    missing, or the number of pairs cannot be parsed after the allowed
    retries.
    """
    USAGE = ("USAGE : program <DB_PATH> <DB_NAME> <CRAWL_PATH> <OUTPUT_PATH> "
             "<OUTPUT_HTML_NAME> <NUMBEROFPAIRS>")

    DB_PATH = "/gt10_Doms/"
    DB_NAME = "DS.db"
    CRAWL_PATH = "/gt10_Doms/"
    # os.path.join(..., '') appends a trailing separator so file names can
    # be concatenated directly onto these folders below.
    RESOURCES = os.path.join(
        os.path.abspath("/state-abstraction-study/HTMLStuff/resources/"), '')
    TEMPLATE_DIR = os.path.join(
        os.path.abspath("/state-abstraction-study/HTMLStuff/"), '')
    TEMPLATE_HTML = "jinjaTemplate.html"
    timeString = str(datetime.now().strftime("%Y%m%d-%H%M%S"))
    NUMBER = 500
    OUTPUT_PATH = os.path.join(os.path.abspath("/htmloutputs/"),
                               "htmloutput_" + str(NUMBER) + "_" + timeString)
    OUTPUT_HTML_NAME = "randomPairOutput.html"
    IMAGES = "images"

    # NOTE(review): this name is built from the default NUMBER and is not
    # refreshed when NUMBER is overridden below; left unchanged so existing
    # file names stay stable.
    saveJsonName = "responseResults_" + str(
        NUMBER) + "_" + timeString + ".json"

    if len(sys.argv) <= 6:
        print("You have not provided ENOUGH arguments. ")
        print(USAGE)
        print(
            "DO you want to use defaults DB : {0}, CRAWLS : {1}, OutputPath: {2}, OutputName : {3} and Number : {4} ?"
            .format(DB_PATH + DB_NAME, CRAWL_PATH, OUTPUT_PATH,
                    OUTPUT_HTML_NAME, NUMBER))
        response = input("Y/N : ").strip().lower()
        if response == 'y':
            print(
                'Okay. Continuing with Defaults : Your output will be available at : '
                + OUTPUT_PATH)
        else:
            DB_PATH = input("DB_PATH:").strip()
            DB_NAME = input("DB_NAME:").strip()
            CRAWL_PATH = input("CRAWL_PATH:").strip()
            OUTPUT_PATH = input("OUTPUT_PATH:").strip()
            OUTPUT_HTML_NAME = input("OUTPUT_HTML_NAME:").strip()
            MAX_RETRY_NUM = 3
            retryNum = 0
            # Keep asking until the value parses as an int; give up after
            # MAX_RETRY_NUM retries on top of the first attempt.
            while True:
                try:
                    NUMBER = int(input("NUMBER_OF_PAIRS:").strip())
                    break
                except (ValueError, EOFError) as e:
                    print(e)
                    if retryNum >= MAX_RETRY_NUM:
                        print("EXCEEDED MAX RETRY. ABORTING!! ")
                        print(USAGE)
                        sys.exit()
                    retryNum += 1
                    print("Please provide a valid number.")

    elif len(sys.argv) == 7:
        DB_PATH = sys.argv[1]
        DB_NAME = sys.argv[2]
        CRAWL_PATH = sys.argv[3]
        OUTPUT_PATH = sys.argv[4]
        OUTPUT_HTML_NAME = sys.argv[5]
        NUMBER = int(sys.argv[6])

    DB_PATH = os.path.join(os.path.abspath(DB_PATH.strip()), '')
    CRAWL_PATH = os.path.join(os.path.abspath(CRAWL_PATH.strip()), '')
    OUTPUT_PATH = os.path.join(os.path.abspath(OUTPUT_PATH.strip()), '')

    print(
        "USING THESE VALUES FOR PROGRAM: DB : {0}, CRAWLS : {1}, OutputPath: {2}, OutputName : {3} and Number : {4} ?"
        .format(DB_PATH + DB_NAME, CRAWL_PATH, OUTPUT_PATH, OUTPUT_HTML_NAME,
                NUMBER))
    response = input("Continue ? Y/N : ").strip().lower()
    if response == 'y':
        print("Okay. Continuing!!")
    else:
        print("ABORTING!! TRY AGAIN")
        print(USAGE)
        sys.exit()

    if not os.path.exists(DB_PATH + DB_NAME):
        print("DB DOES NOT EXIST. ABORTING!!")
        sys.exit()

    if not os.path.exists(CRAWL_PATH):
        print("CRAWL PATH DOES NOT EXIST. ABORTING")
        sys.exit()

    if not os.path.exists(RESOURCES):
        print("RESOURCES NOT FOUND AT : {0} ".format(RESOURCES))
        RESOURCES = os.path.join(
            os.path.abspath(input("Provide RESOURCES FOLDER.").strip()), '')
        if not os.path.exists(RESOURCES):
            print("PROVIDED RESOURCES PATH DOES NOT EXIST : {0}. ABORTING!!".
                  format(RESOURCES))
            sys.exit()

    if not os.path.exists(TEMPLATE_DIR + TEMPLATE_HTML):
        print("TEMPALTE {0} NOT FOUND AT : {1} ".format(
            TEMPLATE_HTML, TEMPLATE_DIR))
        TEMPLATE_DIR = os.path.join(
            os.path.abspath(
                input("Provide FOLDER where TEMPLATE HTML IS.").strip()), '')
        if not os.path.exists(TEMPLATE_DIR + TEMPLATE_HTML):
            print("TEMPALTE {0} NOT FOUND AT : {1} ".format(
                TEMPLATE_HTML, TEMPLATE_DIR))
            TEMPLATE_HTML = input("Provide HTML Template to use.").strip()
            # The template is rendered from TEMPLATE_DIR, so that is the
            # folder to check (this used to look in RESOURCES by mistake).
            if not os.path.exists(TEMPLATE_DIR + TEMPLATE_HTML):
                print("TEMPALTE {0} NOT FOUND AT : {1} . ABORTING !!".format(
                    TEMPLATE_HTML, TEMPLATE_DIR))
                sys.exit()
    else:
        print("USING TEMPLATE {0} at {1}".format(TEMPLATE_HTML, TEMPLATE_DIR))

    outputPathConfirmed, OUTPUT_PATH = confirmOutputPath(
        OUTPUT_PATH, RESOURCES)
    if not outputPathConfirmed:
        print(USAGE)
        sys.exit()

    connectToDB(DB_PATH + DB_NAME)
    #randomNDs = fetchRandomNearDuplicates(NUMBER*2)
    # randomNDs = fetchRandomNearDuplicates(NUMBER*2, "appname in (select name from apps where numAddedStates>=10) AND ")
    # Twice as many rows as needed are fetched so that pairs with missing
    # screenshots can be skipped and the loop can still reach NUMBER.
    randomNDs = fetchRandomNearDuplicates(NUMBER * 2,
                                          "HUMAN_CLASSIFICATION==-1 AND ")
    closeDBConnection()
    data = []
    numCreated = 0
    missing = 0
    for randND in randomNDs:
        jsonRecord = {}
        appName = randND[0]
        crawl = randND[1]
        state1 = randND[2]
        state2 = randND[3]

        jsonRecord['appname'] = appName
        jsonRecord['crawl'] = crawl
        jsonRecord['state1'] = state1
        jsonRecord['state2'] = state2
        image1Name = state1 + ".png"
        image2Name = state2 + ".png"
        image1 = searchImageInCrawlPath(appName, crawl, image1Name, CRAWL_PATH)
        image2 = searchImageInCrawlPath(appName, crawl, image2Name, CRAWL_PATH)
        if image1 is None or image2 is None:
            print("IMAGES NOT FOUND ABORTING THIS PAIR :  " + str(jsonRecord))
            missing += 1
            continue
        ndImageName = appName + "_" + crawl + "_" + state1 + "_" + state2 + ".jpg"
        destination = OUTPUT_PATH + IMAGES + "/" + ndImageName
        if createNDImage(image1, image2, destination):
            numCreated += 1
        else:
            # NOTE(review): the record is still added to the output below
            # even though its image was not created - confirm this is
            # intended.
            print("COULD NOT CREATE ND IMAGE FOR RECORD : " + str(jsonRecord))

        jsonRecord['image'] = ndImageName
        jsonRecord['response'] = randND[14]
        jsonRecord['tags'] = ""
        jsonRecord['comments'] = ""
        data.append(jsonRecord)

        if numCreated >= NUMBER:
            break

    jsonData = json.dumps(data)

    print(jsonData)
    output_html_doc(TEMPLATE_DIR,
                    TEMPLATE_HTML,
                    OUTPUT_PATH + OUTPUT_HTML_NAME,
                    jsonData,
                    saveJsonName=saveJsonName)
    print("FAILED CREATING {0} PAIRS BECAUSE IMAGES WERE MISSING".format(
        missing))
    print("OUTPUT HTML CREATED WITH : {0} IMAGE PAIRS".format(numCreated))