import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import deviation  # project-local module providing hamming_distance()


def get_deviations_to_notoriginal(oPandasData, sReferenceCollectionName):
    """Used for the FAR test: deviation of each transformed image to a
    deterministically chosen reference image that is *not* its original."""
    oPandasFullHashTypeDataset = None
    for sHashType in oPandasData.hashalgorithm.unique():
        for i, sImageBaseName in enumerate(
                oPandasData[oPandasData.collection ==
                            sReferenceCollectionName].image.unique()):
            # strip the file extension, e.g. "lena.png" -> "lena"
            sImageBaseName = sImageBaseName.split(".")[-2]
            # all rows (original + transformed) belonging to this image;
            # regex=False so base names are matched literally
            oPandasHashData = oPandasData[
                (oPandasData.hashalgorithm == sHashType) &
                (oPandasData.image.str.contains(sImageBaseName, regex=False))]
            # choose a deterministic ref image that is not the original
            np.random.seed(i)
            oRefHash = np.random.choice(oPandasData[
                (oPandasData.collection == sReferenceCollectionName) &
                (oPandasData.hashalgorithm == sHashType) &
                (~oPandasData.image.str.contains(sImageBaseName,
                                                 regex=False))].hash.values)
            # drop the reference rows; only transformed images remain
            # (.copy() avoids pandas' SettingWithCopyWarning below)
            oPandasHashData = oPandasHashData[
                oPandasHashData.collection != sReferenceCollectionName].copy()
            oPandasHashData["deviation"] = oPandasHashData.apply(
                lambda row: deviation.hamming_distance(row["hash"], oRefHash),
                axis=1)
            if isinstance(oPandasFullHashTypeDataset, pd.DataFrame):
                oPandasFullHashTypeDataset = pd.concat(
                    [oPandasFullHashTypeDataset, oPandasHashData])
            else:
                oPandasFullHashTypeDataset = oPandasHashData
    return oPandasFullHashTypeDataset
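# deviation is a project-local module and is not shown in this snippet. A
# minimal sketch of what deviation.hamming_distance() is assumed to do, for
# equally long hex-string hashes (the real implementation may instead operate
# on bit arrays or normalize by hash length):
#
#     def hamming_distance(sHashA, sHashB):
#         """Number of differing bits between two hex-encoded hashes."""
#         return bin(int(sHashA, 16) ^ int(sHashB, 16)).count("1")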
# NOTE: aThresholdSteps was referenced but not defined in this snippet; a
# module-level grid is assumed here. Adjust the range to the scale of
# hamming_distance() (normalized [0, 1] vs. raw bit counts).
aThresholdSteps = np.arange(0.0, 1.01, 0.01)


def calculate_stats(oSensitivityTestData, lTestDatasetSize,
                    lRandomDataSplitSeed=None):
    dicResult = {}
    # do it for every single hash algorithm
    for sHashAlgo in oSensitivityTestData.hashalgorithm.unique():
        # filter dataset by hash algorithm
        oFilteredSensitivityTestData = oSensitivityTestData[
            oSensitivityTestData.hashalgorithm == sHashAlgo]
        # split the data into a test set and a big reference set
        oBigDataset, oTestDataset = train_test_split(
            oFilteredSensitivityTestData,
            test_size=lTestDatasetSize,
            random_state=lRandomDataSplitSeed)
        # DEBUG
        print(sHashAlgo)
        print(len(oBigDataset.image.unique()))
        print(len(oTestDataset.image.unique()))
        # create a deviation array of size
        # nr. of test images * nr. of reference images
        lNumberOfTestReferenceRelationsTotal = (oTestDataset.shape[0] *
                                                oBigDataset.shape[0])
        aHashDeviations = np.zeros(lNumberOfTestReferenceRelationsTotal)
        # compare every test image hash against every reference image hash
        i = 0
        for aTestImageHash in oTestDataset["hash"].values:
            for aReferenceImageHash in oBigDataset["hash"].values:
                aHashDeviations[i] = deviation.hamming_distance(
                    aTestImageHash, aReferenceImageHash)
                i += 1
        # calculate metrics
        dicMetrics = {
            "min": np.min(aHashDeviations),
            "p25": np.percentile(aHashDeviations, 25),
            "p75": np.percentile(aHashDeviations, 75),
            "max": np.max(aHashDeviations),
            "mean": np.mean(aHashDeviations)
        }
        # calculate the FAR for every threshold step: the fraction of
        # impostor comparisons whose deviation falls below the threshold
        aErrorRate = []
        for dThreshold in aThresholdSteps:
            lCountOfValuesSmallerThreshold = np.sum(
                aHashDeviations < dThreshold)
            aErrorRate.append(lCountOfValuesSmallerThreshold /
                              lNumberOfTestReferenceRelationsTotal)
        dicResult[sHashAlgo] = {"metrics": dicMetrics,
                                "errorrate": aErrorRate}
    return dicResult
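# Worked example for the FAR step above: with aHashDeviations = [2, 5, 9]
# (bit distances between unrelated images) and a threshold of 6, two of the
# three comparisons fall below the threshold, so the FAR at that threshold
# is 2/3. A sketch, assuming raw bit-count distances:
#
#     aHashDeviations = np.array([2, 5, 9])
#     print(np.sum(aHashDeviations < 6) / aHashDeviations.size)  # 0.666...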
def get_deviations_to_original(oPandasData, sReferenceCollectionName):
    """Deviation of each transformed image to the hash of its own original
    (counterpart of get_deviations_to_notoriginal)."""
    oPandasFullHashTypeDataset = None
    for sHashType in oPandasData.hashalgorithm.unique():
        for sImageBaseName in oPandasData[
                oPandasData.collection ==
                sReferenceCollectionName].image.unique():
            # strip the file extension, e.g. "lena.png" -> "lena"
            sImageBaseName = sImageBaseName.split(".")[-2]
            oPandasHashData = oPandasData[
                (oPandasData.hashalgorithm == sHashType) &
                (oPandasData.image.str.contains(sImageBaseName, regex=False))]
            # the hash of this image's original in the reference collection
            oRefHash = oPandasHashData[
                oPandasHashData.collection ==
                sReferenceCollectionName].hash.values[0]
            # drop the reference row; only transformed images remain
            # (.copy() avoids pandas' SettingWithCopyWarning below)
            oPandasHashData = oPandasHashData[
                oPandasHashData.collection != sReferenceCollectionName].copy()
            oPandasHashData["deviation"] = oPandasHashData.apply(
                lambda row: deviation.hamming_distance(row["hash"], oRefHash),
                axis=1)
            if isinstance(oPandasFullHashTypeDataset, pd.DataFrame):
                oPandasFullHashTypeDataset = pd.concat(
                    [oPandasFullHashTypeDataset, oPandasHashData])
            else:
                oPandasFullHashTypeDataset = oPandasHashData
    return oPandasFullHashTypeDataset
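if __name__ == "__main__":
    # Smoke test on synthetic data. The image names, collection names and
    # hex hashes below are made up for illustration and assume the
    # hex-string hamming_distance() sketched above; this is not part of the
    # original evaluation pipeline.
    oPandasData = pd.DataFrame({
        "image": ["lena.png", "baboon.png",
                  "lena_rot90.png", "baboon_rot90.png"],
        "collection": ["originals", "originals", "rotated", "rotated"],
        "hashalgorithm": ["phash"] * 4,
        "hash": ["ff00ff00", "0f0f0f0f", "ff00fe00", "0f1f0f0f"],
    })
    print(get_deviations_to_original(oPandasData, "originals"))
    print(get_deviations_to_notoriginal(oPandasData, "originals"))
    print(calculate_stats(oPandasData, lTestDatasetSize=2,
                          lRandomDataSplitSeed=42))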