Example 1
import numpy as np
import pandas as pd
import deviation  # project-local helper module providing hamming_distance()


def get_deviations_to_notoriginal(oPandasData, sReferenceCollectionName):
    """ Used for the FAR test: for every hash algorithm, compares each
    image's non-reference hashes against the hash of a randomly chosen
    reference image that does NOT belong to the same base image. """
    aResultFrames = []
    for sHashType in oPandasData.hashalgorithm.unique():
        for i, sImageBaseName in enumerate(
                oPandasData[oPandasData.collection ==
                            sReferenceCollectionName].image.unique()):
            # strip the file extension so every variant of the image matches
            sImageBaseName = sImageBaseName.split(".")[-2]
            oPandasHashData = oPandasData[
                (oPandasData.hashalgorithm == sHashType)
                & (oPandasData.image.str.contains(sImageBaseName))]
            # choose a deterministic ref image that is not the original
            np.random.seed(i)
            oRefHash = np.random.choice(oPandasData[
                (oPandasData.collection == sReferenceCollectionName)
                & (oPandasData.hashalgorithm == sHashType) &
                (~oPandasData.image.str.contains(sImageBaseName))].hash.values)
            # keep only the non-reference rows; copy so the column
            # assignment below does not raise SettingWithCopyWarning
            oPandasHashData = oPandasHashData[
                oPandasHashData.collection != sReferenceCollectionName].copy()
            oPandasHashData["deviation"] = oPandasHashData.apply(
                lambda row: deviation.hamming_distance(row["hash"], oRefHash),
                axis=1)
            aResultFrames.append(oPandasHashData)

    return pd.concat(aResultFrames) if aResultFrames else None
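
A minimal calling sketch. The DataFrame layout (columns hashalgorithm,
collection, image, hash) is inferred from the function body; the concrete
collection names, file names, and hash values below are hypothetical:

import pandas as pd

# hypothetical input: one row per (image, collection, hash algorithm);
# the hash format must match whatever deviation.hamming_distance expects
oPandasData = pd.DataFrame({
    "hashalgorithm": ["phash"] * 4,
    "collection": ["originals", "originals", "scaled", "scaled"],
    "image": ["cat.png", "dog.png", "cat_scaled.png", "dog_scaled.png"],
    "hash": [b"\x0f\xa1", b"\x3c\x11", b"\x0f\xa0", b"\x3c\x10"],
})

oDeviations = get_deviations_to_notoriginal(oPandasData, "originals")
print(oDeviations[["image", "deviation"]])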
Example 2
import numpy as np
from sklearn.model_selection import train_test_split
import deviation  # project-local helper module providing hamming_distance()

# aThresholdSteps (the FAR threshold sweep) is assumed to be defined at
# module level; it is referenced below but never passed in


def calculate_stats(oSensitivityTestData,
                    lTestDatasetSize,
                    lRandomDataSplitSeed=None):

    dicResult = {}

    # do it for every single hash algorithm
    for sHashAlgo in oSensitivityTestData.hashalgorithm.unique():
        # filter dataset by hashalgo
        oFilteredSensitivityTestData = oSensitivityTestData[
            oSensitivityTestData.hashalgorithm == sHashAlgo]

        # split the data into a small test set and a big reference set
        oBigDataset, oTestDataset = train_test_split(
            oFilteredSensitivityTestData,
            test_size=lTestDatasetSize,
            random_state=lRandomDataSplitSeed)

        # DEBUG: report the split sizes per hash algorithm
        print(sHashAlgo)
        print(len(oBigDataset.image.unique()))
        print(len(oTestDataset.image.unique()))

        # create a deviation array of size
        # nr. of test images * nr. of reference images
        lNumberOfTestReferenceRelationsTotal = oTestDataset.shape[0] * \
            oBigDataset.shape[0]
        aHashDeviations = np.zeros(lNumberOfTestReferenceRelationsTotal)
        # iterate over every image in the test dataset
        i = 0
        for aTestImageHash in oTestDataset["hash"].values:
            # iterate over all images in the big reference dataset
            for aReferenceImageHash in oBigDataset["hash"].values:
                aHashDeviations[i] = deviation.hamming_distance(
                    aTestImageHash, aReferenceImageHash)
                i += 1

        # calculate metrics
        dicMetrics = {
            "min": np.min(aHashDeviations),
            "p25": np.percentile(aHashDeviations, 25),
            "p75": np.percentile(aHashDeviations, 75),
            "max": np.max(aHashDeviations),
            "mean": np.mean(aHashDeviations)
        }

        # calculate FAR error rate for every threshold step
        aErrorRate = []
        for dThreshold in aThresholdSteps:
            lCountOfValuesSmallerThreshold = np.sum(
                aHashDeviations < dThreshold)
            aErrorRate.append(lCountOfValuesSmallerThreshold /
                              lNumberOfTestReferenceRelationsTotal)

        dicResult[sHashAlgo] = {"metrics": dicMetrics, "errorrate": aErrorRate}
    return dicResult
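
A minimal calling sketch, reusing the hypothetical oPandasData frame from
Example 1. aThresholdSteps must exist at module level; the sweep and the
split parameters below are illustrative, not values from the original
project:

import numpy as np

# hypothetical FAR threshold sweep from 0.0 to 1.0 in steps of 0.01
aThresholdSteps = np.linspace(0.0, 1.0, num=101)

dicStats = calculate_stats(oPandasData,
                           lTestDatasetSize=2,  # two rows go to the test
                           lRandomDataSplitSeed=42)  # set; a float works too
for sAlgo, dicEntry in dicStats.items():
    print(sAlgo, dicEntry["metrics"], dicEntry["errorrate"][:3])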
Example 3
import pandas as pd
import deviation  # project-local helper module providing hamming_distance()


def get_deviations_to_original(oPandasData, sReferenceCollectionName):
    """ For every hash algorithm, compares each image's non-reference
    hashes against the hash of that image's own original in the
    reference collection. """
    aResultFrames = []
    for sHashType in oPandasData.hashalgorithm.unique():
        for sImageBaseName in oPandasData[
                oPandasData.collection ==
                sReferenceCollectionName].image.unique():
            # strip the file extension so every variant of the image matches
            sImageBaseName = sImageBaseName.split(".")[-2]
            oPandasHashData = oPandasData[
                (oPandasData.hashalgorithm == sHashType)
                & (oPandasData.image.str.contains(sImageBaseName))]
            oRefHash = oPandasHashData[oPandasHashData.collection ==
                                       sReferenceCollectionName].hash.values[0]
            # keep only the non-reference rows; copy so the column
            # assignment below does not raise SettingWithCopyWarning
            oPandasHashData = oPandasHashData[
                oPandasHashData.collection != sReferenceCollectionName].copy()
            oPandasHashData["deviation"] = oPandasHashData.apply(
                lambda row: deviation.hamming_distance(row["hash"], oRefHash),
                axis=1)
            aResultFrames.append(oPandasHashData)

    return pd.concat(aResultFrames) if aResultFrames else None
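
Called the same way as the FAR variant in Example 1; the difference is
that each image is compared against the hash of its own original rather
than a randomly chosen other image. A short hypothetical usage:

oDeviations = get_deviations_to_original(oPandasData, "originals")
# one deviation value per modified image, relative to its original
print(oDeviations[["image", "deviation"]])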