Ejemplo n.º 1
0
def getF1_SAF_allrows(allEntries):
    """Score every algorithm under several named threshold sets (SAF
    setting) and write precision/recall/F1 per (set, algorithm) to a
    timestamped "rq2_1_*.csv".

    Args:
        allEntries: DB rows of labelled pairs, forwarded to getF1_SAF.
    """
    # Dead code removed: the original built a y_pred dict, a y_actual list
    # and a DummyClassifier that were never read anywhere in the function.
    print(len(allEntries))

    # Named threshold sets: statistical (St_*) and optimal (O_*) variants,
    # each mapping algorithm name -> threshold.
    threshold_sets = {
        'St_c_DS':
        getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3, DB_SETS.GT10_DB_DATA,
                     'all'),
        'St_n_DS':
        getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN, DB_SETS.GT10_DB_DATA,
                     'all'),
        'O_s_DS':
        getThreshold(THRESHOLD_SETS.OPTIMAL, DB_SETS.GT10_DB_DATA, 'all'),
        'O_c_DS':
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                     DB_SETS.GT10_DB_DATA, 'all'),
        'O_n_DS':
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                     DB_SETS.GT10_DB_DATA, 'all')
    }

    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'threshold', 'precision', 'recall', 'f1'
    ]

    print(threshold_sets)

    for threshold_set_name, threshold_set in threshold_sets.items():
        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            threshold = threshold_set[algoStr]
            precision, recall, f1 = getF1_SAF(threshold, allEntries, algoStr)
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'threshold': threshold,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

    writeCSV(fieldNames, algoScoreRows,
             "rq2_1_" + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
Ejemplo n.º 2
0
def getOptimalThresholds_SAF():
    """Run a per-app TPE threshold search (SAF setting) for every algorithm.

    For each app, loads that app's human-classified pairs, refreshes the
    module-level distance/label state, then runs hyperopt's fmin for 1000
    evaluations per algorithm. Results are written to
    "optimalThresholds_SAF.csv". Failures for individual algorithms are
    logged and skipped.
    """
    # These module globals are set before each fmin call; getLoss_SAF takes
    # no arguments, so it presumably reads them — TODO confirm.
    global algoStr
    global allEntries
    optimalThresholds = []

    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"

    # dbName = "/Test/gt10/DS.db"
    excludeAlgos = []  # algorithm names to skip; currently none
    for appName in APPS:
        print(appName)
        connectToDB(dbName)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(appName))
        closeDBConnection()
        print(len(allEntries))
        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            if (algoStr in excludeAlgos):
                continue
            print(algoStr)
            # Refresh module-level state consumed by the loss function.
            getDistances(algoStr)
            get_y_actual_SAF()
            # Search the single threshold 't' in [0, max observed value].
            space = {
                't': hp.uniform('t', 0, getMax(algoStr)),
            }
            try:
                best = fmin(fn=getLoss_SAF,
                            space=space,
                            algo=tpe.suggest,
                            max_evals=1000)

                row = {
                    'thresholdSet': "optimal",
                    'appName': appName,
                    'algoName': algoStr,
                    'thre': best['t']
                }
                optimalThresholds.append(row)
                print(best)
            except Exception as ex:
                # Best-effort: log the failure and continue with the next
                # algorithm rather than aborting the whole sweep.
                print(ex)
                print(
                    "Error getting optimal threshold for {0}".format(algoStr))

    fieldNames = ['thresholdSet', 'appName', 'algoName', 'thre']
    writeCSV(fieldNames, optimalThresholds, "optimalThresholds_SAF.csv")
Ejemplo n.º 3
0
def getOptimalThresholds_Classification(iterations=10000):
    """Search optimal (clone, near-duplicate) threshold pairs per algorithm.

    Loads all human-classified pairs once, then runs a TPE search of
    `iterations` evaluations for each algorithm and writes the best
    threshold pairs to a timestamped CSV.
    """
    # Module globals consumed by the loss function during fmin.
    global algoStr
    global allEntries

    results = []

    # Load every labelled pair once up front.
    connectToDB("/Test/gt10/DS.db")
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()
    get_y_actual_Classification()

    for algo in ALGOS:
        algoStr = str(algo).split('.')[1].upper()
        print(algoStr)
        getDistances(algoStr)

        # Two thresholds: 'tc' for clones, 'tn' for near-duplicates.
        search_space = {
            'tc': hp.uniform('tc', 0, getMax(algoStr)),
            'tn': hp.uniform('tn', 0, getMax(algoStr))
        }

        best = fmin(fn=getLoss_Classification,
                    space=search_space,
                    algo=tpe.suggest,
                    max_evals=iterations)

        results.append({
            'thresholdSet': "optimal",
            'algoName': algoStr,
            'c-thre': best['tc'],
            'n-thre': best['tn']
        })
        print(best)

    columns = ['thresholdSet', 'algoName', 'c-thre', 'n-thre']
    stamp = str(datetime.now().strftime("%Y%m%d-%H%M%S"))
    writeCSV(columns, results,
             "optimalThresholds_Classification" + stamp + ".csv")
Ejemplo n.º 4
0
def getOptimalThreshold_SAF(algoString, iterations=10000):
    """Per-app TPE threshold search for one algorithm (SAF setting).

    Returns the list of per-app optimal-threshold rows and also writes
    them to a timestamped CSV.
    """
    # Module globals consumed by the loss function during fmin.
    global algoStr
    global allEntries

    rows = []
    db_path = "/comparator/src/main/resources/GoldStandards/SS.db"
    algoStr = algoString
    print(algoStr)

    for app in APPS:
        # Load only this app's human-classified pairs.
        connectToDB(db_path)
        allEntries = fetchAllNearDuplicates(
            'where human_classification>=0 and appname="{0}"'.format(app))
        closeDBConnection()
        print(len(allEntries))

        # Refresh module-level state read by getLoss_SAF.
        getDistances(algoStr)
        get_y_actual_SAF()

        best = fmin(fn=getLoss_SAF,
                    space={'t': hp.uniform('t', 0, getMax(algoStr))},
                    algo=tpe.suggest,
                    max_evals=iterations)

        rows.append({
            'thresholdSet': "optimal",
            'appName': app,
            'algoName': algoStr,
            'thre': best['t']
        })
        print(best)

    writeCSV(
        ['thresholdSet', 'appName', 'algoName', 'thre'], rows,
        "optimalThresholds_SAF_" + algoStr + "_" + str(iterations) +
        str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
    return rows
Ejemplo n.º 5
0
def getOptimalThreshold_SAF_Universal(algoString, iterations=10000):
    """TPE threshold search for one algorithm over the whole dataset
    (no per-app split).

    Returns the single-row result list and writes it to a timestamped CSV.
    """
    # Module globals consumed by the loss function during fmin.
    global algoStr
    global allEntries

    algoStr = algoString
    print(algoStr)

    # Load every human-classified pair from the universal DB.
    connectToDB("/Test/gt10/DS.db")
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    # Refresh module-level state read by getLoss_SAF.
    getDistances(algoStr)
    get_y_actual_SAF()

    best = fmin(fn=getLoss_SAF,
                space={'t': hp.uniform('t', 0, getMax(algoStr))},
                algo=tpe.suggest,
                max_evals=iterations)

    result_rows = [{
        'thresholdSet': "optimal",
        'appName': "Universal",
        'algoName': algoStr,
        'thre': best['t']
    }]
    print(best)

    writeCSV(
        ['thresholdSet', 'appName', 'algoName', 'thre'], result_rows,
        "optimalThresholds_SAF_Universal" + algoStr + "_" + str(iterations) +
        str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
    return result_rows
Ejemplo n.º 6
0
def _dummyScoreRow(classifier, dbName):
    """Load labelled pairs from dbName and score `classifier` on them,
    returning one result row with macro precision/recall/F1."""
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    # The human label sits immediately after the per-algorithm columns,
    # i.e. at index 4 + len(ALGOS) (the original counted this with a loop).
    labelIndex = 4 + len(ALGOS)
    y_actual = [entry[labelIndex] for entry in allEntries]

    # BUG FIX: the original called classifier.predict(y_actual), passing
    # the 1-D label list as the feature matrix — a shape error in
    # scikit-learn. A stratified dummy ignores feature values and only
    # needs the sample count, so a constant 2-D X of the right length
    # is sufficient.
    y_pred = classifier.predict([[0]] * len(y_actual))

    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred, average="macro")
    return {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def testDummyClassifier():
    """Baseline check: fit a stratified DummyClassifier on a toy 3-class
    sample, score it on two gold-standard databases, and write both rows
    to "rq1_dummy.csv"."""
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    dummyClassifier.fit([[0]] * 10, [0, 1, 2, 0, 1, 2, 0, 0, 1, 2])

    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]

    # The two halves of the original were duplicated verbatim; factored
    # into _dummyScoreRow.
    row1 = _dummyScoreRow(dummyClassifier,
                          "/Test/gt10/gt10_last500Responses.db")
    print(row1)

    row2 = _dummyScoreRow(dummyClassifier,
                          "/comparator/src/main/resources/GoldStandards/SS.db")
    print(row2)

    writeCSV(fieldNames, [row1, row2], "rq1_dummy.csv")
Ejemplo n.º 7
0
def getF1_Classifier(allEntries):
    """Evaluate three-way classification (0 = clone, 1 = near-duplicate,
    2 = distinct) of every algorithm under two threshold sets, plus a
    stratified dummy baseline, and write the scores to a timestamped
    "rq1_*.csv" in the results folder.

    Args:
        allEntries: DB rows; columns 4 .. 4+len(ALGOS)-1 hold per-algorithm
            distance values and the following column holds the human label.
    """
    print(len(allEntries))

    threshold_sets = {}
    threshold_sets["statistical"] = [
        getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3, DB_SETS.GT10_DB_DATA,
                     'all'),
        getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN, DB_SETS.GT10_DB_DATA,
                     'all')
    ]
    threshold_sets["optimal"] = [
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                     DB_SETS.GT10_DB_DATA, 'all'),
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                     DB_SETS.GT10_DB_DATA, 'all')
    ]

    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]

    dummyClassifier = dummy.DummyClassifier(strategy="stratified")

    print(threshold_sets)
    for threshold_set_name in threshold_sets:
        threshold_set = threshold_sets[threshold_set_name]
        cloneThresholds = threshold_set[0]  # algoName -> clone threshold
        ndThresholds = threshold_set[1]     # algoName -> near-dup threshold

        # BUG FIX: rebuild predictions and labels for every threshold set.
        # Previously both accumulated across the outer loop, so the second
        # set's metrics were computed over the first set's predictions and
        # a doubled y_actual list.
        y_pred = {str(algo).split('.')[1].upper(): [] for algo in ALGOS}
        y_actual = []

        for entry in allEntries:
            index = 4
            for algo in ALGOS:
                algoStr = str(algo).split('.')[1].upper()
                value = float(entry[index])
                if algo.value[2] == "lt":
                    # "lt" comparator: smaller value means more similar.
                    if value <= cloneThresholds[algoStr]:
                        pred = 0
                    elif value <= ndThresholds[algoStr]:
                        pred = 1
                    else:
                        pred = 2
                else:
                    # Otherwise: larger value means more similar.
                    if value >= cloneThresholds[algoStr]:
                        pred = 0
                    elif value >= ndThresholds[algoStr]:
                        pred = 1
                    else:
                        pred = 2

                y_pred[algoStr].append(pred)
                index = index + 1

            # The human label immediately follows the distance columns.
            y_actual.append(entry[index])

        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            precision, recall, f1, support = metrics.precision_recall_fscore_support(
                y_actual, y_pred[algoStr], average="macro")
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'c-thre': cloneThresholds[algoStr],
                'n-thre': ndThresholds[algoStr],
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

        # Stratified-dummy baseline. The classifier ignores feature values,
        # so a constant 2-D X of the right length suffices.
        # BUG FIX: the original predicted on y_actual (1-D labels as X),
        # a feature-shape error in scikit-learn; predict on X instead.
        X = [[0]] * len(y_actual)
        dummyClassifier.fit(X, y_actual)
        y_pred_dummy = dummyClassifier.predict(X)
        precision, recall, f1, support = metrics.precision_recall_fscore_support(
            y_actual, y_pred_dummy, average="macro")
        algoScoreRows.append({
            'thresholdSet': None,
            'algoName': "dummy",
            'c-thre': None,
            'n-thre': None,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    writeCSV(
        fieldNames, algoScoreRows,
        os.path.join(
            os.path.abspath(".."), RESULTS_FOLDER,
            "rq1_" + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv"))