Example #1
def find_repeat_detections(inputFilename, outputFilename=None, options=None):

    ##%% Input handling

    if options is None:
        options = RepeatDetectionOptions()

    toReturn = RepeatDetectionResults()

    # Check early to avoid problems with the output folder

    if options.bWriteFilteringFolder or options.bRenderHtml:
        assert options.outputBase is not None and len(options.outputBase) > 0
        os.makedirs(options.outputBase, exist_ok=True)

    # Load file

    detectionResults, otherFields = load_api_results(
        inputFilename,
        normalize_paths=True,
        filename_replacements=options.filenameReplacements)
    toReturn.detectionResults = detectionResults
    toReturn.otherFields = otherFields

    # Before doing any real work, make sure we can *probably* access images
    # This is just a cursory check on the first image, but it heads off most
    # problems related to incorrect mount points, etc.  Better to do this before
    # spending 20 minutes finding repeat detections.
    if options.bWriteFilteringFolder or options.bRenderHtml:
        if not is_sas_url(options.imageBase):
            row = detectionResults.iloc[0]
            relativePath = row['file']
            for s in options.filenameReplacements.keys():
                relativePath = relativePath.replace(
                    s, options.filenameReplacements[s])
            assert os.path.isfile(os.path.join(options.imageBase,
                                               relativePath))

    ##%% Separate files into directories

    # This will be a map from a directory name to smaller data frames
    rowsByDirectory = {}

    # This is a mapping back into the rows of the original table
    filenameToRow = {}

    # TODO: in the case where we're loading an existing set of FPs after manual filtering,
    # we should load these data frames too, rather than re-building them from the input.

    print('Separating files into directories...')

    # iRow = 0; row = detectionResults.iloc[0]
    for iRow, row in detectionResults.iterrows():
        relativePath = row['file']
        dirName = os.path.dirname(relativePath)

        if len(dirName) == 0:
            assert options.nDirLevelsFromLeaf == 0, \
                "Can't use the dirLevelsFromLeaf option with flat filenames"
        else:
            if options.nDirLevelsFromLeaf > 0:
                iLevel = 0
                while (iLevel < options.nDirLevelsFromLeaf):
                    iLevel += 1
                    dirName = os.path.dirname(dirName)
            assert len(dirName) > 0

        if dirName not in rowsByDirectory:
            # Create a new DataFrame with just this row
            # rowsByDirectory[dirName] = pd.DataFrame(row)
            rowsByDirectory[dirName] = []

        rowsByDirectory[dirName].append(row)

        assert relativePath not in filenameToRow
        filenameToRow[relativePath] = iRow

    # Convert lists of rows to proper DataFrames
    dirs = list(rowsByDirectory.keys())
    for d in dirs:
        rowsByDirectory[d] = pd.DataFrame(rowsByDirectory[d])

    toReturn.rowsByDirectory = rowsByDirectory
    toReturn.filenameToRow = filenameToRow

    print('Finished separating {} files into {} directories'.format(
        len(detectionResults), len(rowsByDirectory)))

    ##%% Look for matches (or load them from file)

    dirsToSearch = list(rowsByDirectory.keys())
    if options.debugMaxDir > 0:
        dirsToSearch = dirsToSearch[0:options.debugMaxDir]

    # length-nDirs list of lists of DetectionLocation objects
    suspiciousDetections = [None] * len(dirsToSearch)

    # Are we actually looking for matches, or just loading from a file?
    if len(options.filterFileToLoad) == 0:

        # We're actually looking for matches...
        print('Finding similar detections...')

        allCandidateDetections = [None] * len(dirsToSearch)

        if not options.bParallelizeComparisons:

            options.pbar = None
            # iDir = 0; dirName = dirsToSearch[iDir]
            for iDir, dirName in enumerate(tqdm(dirsToSearch)):
                allCandidateDetections[iDir] = find_matches_in_directory(
                    dirName, options, rowsByDirectory)

        else:

            options.pbar = tqdm(total=len(dirsToSearch))
            allCandidateDetections = Parallel(
                n_jobs=options.nWorkers,
                prefer='threads')(delayed(find_matches_in_directory)(
                    dirName, options, rowsByDirectory)
                                  for dirName in tqdm(dirsToSearch))

        print('\nFinished looking for similar bounding boxes')

        ##%% Find suspicious locations based on match results

        print('Filtering out repeat detections...')

        nImagesWithSuspiciousDetections = 0
        nSuspiciousDetections = 0

        # For each directory
        #
        # iDir = 51
        for iDir in range(len(dirsToSearch)):

            # A list of DetectionLocation objects
            suspiciousDetectionsThisDir = []

            # A list of DetectionLocation objects
            candidateDetectionsThisDir = allCandidateDetections[iDir]

            for iLocation, candidateLocation in enumerate(
                    candidateDetectionsThisDir):

                # candidateLocation.instances is a list of file/detection pairs
                nOccurrences = len(candidateLocation.instances)

                if nOccurrences < options.occurrenceThreshold:
                    continue

                nImagesWithSuspiciousDetections += nOccurrences
                nSuspiciousDetections += 1

                suspiciousDetectionsThisDir.append(candidateLocation)
                # Find the images corresponding to this bounding box, render boxes

            suspiciousDetections[iDir] = suspiciousDetectionsThisDir

        print(
            'Finished searching for repeat detections\nFound {} unique detections on {} images that are suspicious'
            .format(nSuspiciousDetections, nImagesWithSuspiciousDetections))

    else:

        print('Bypassing detection-finding, loading from {}'.format(
            options.filterFileToLoad))

        # Load the filtering file
        detectionIndexFileName = options.filterFileToLoad
        with open(detectionIndexFileName, 'r') as f:
            sIn = f.read()
        suspiciousDetections = jsonpickle.decode(sIn)
        filteringBaseDir = os.path.dirname(options.filterFileToLoad)
        assert len(suspiciousDetections) == len(dirsToSearch)

        nDetectionsRemoved = 0
        nDetectionsLoaded = 0

        # We're skipping detection-finding, but to determine which suspicious detections
        # were confirmed as false positives during manual review, we either check for the
        # physical sample images or load a list from a text file.
        fileList = None
        if options.filteredFileListToLoad is not None:
            with open(options.filteredFileListToLoad) as f:
                fileList = f.readlines()
                fileList = [x.strip() for x in fileList]
            nSuspiciousDetections = sum([len(x) for x in suspiciousDetections])
            print(
                'Loaded false positive list from file, will remove {} of {} suspicious detections'
                .format(len(fileList), nSuspiciousDetections))

        # For each directory
        # iDir = 0; detections = suspiciousDetections[0]
        #
        # suspiciousDetections is a list of lists of DetectionLocation objects,
        # one list per directory.
        for iDir, detections in enumerate(suspiciousDetections):

            bValidDetection = [True] * len(detections)
            nDetectionsLoaded += len(detections)

            # For each detection that was present before filtering
            # iDetection = 0; detection = detections[iDetection]
            for iDetection, detection in enumerate(detections):

                # Are we checking the directory to see whether detections were actually false
                # positives, or reading from a list?
                if fileList is None:

                    # Is the image still there?
                    imageFullPath = os.path.join(
                        filteringBaseDir,
                        detection.sampleImageRelativeFileName)

                    # If not, remove this from the list of suspicious detections
                    if not os.path.isfile(imageFullPath):
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

                else:

                    if detection.sampleImageRelativeFileName not in fileList:
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

            # ...for each detection

            nRemovedThisDir = len(bValidDetection) - sum(bValidDetection)
            if nRemovedThisDir > 0:
                print('Removed {} of {} detections from directory {}'.format(
                    nRemovedThisDir, len(detections), iDir))

            detectionsFiltered = list(compress(detections, bValidDetection))
            suspiciousDetections[iDir] = detectionsFiltered

        # ...for each directory

        print('Removed {} of {} total detections via manual filtering'.format(
            nDetectionsRemoved, nDetectionsLoaded))

    # ...if we are/aren't finding detections (vs. loading from file)

    toReturn.suspiciousDetections = suspiciousDetections

    if options.bRenderHtml:

        # Render problematic locations with html (loop)

        print('Rendering html')

        nDirs = len(dirsToSearch)
        directoryHtmlFiles = [None] * nDirs

        if options.bParallelizeRendering:

            # options.pbar = tqdm(total=nDirs)
            options.pbar = None

            directoryHtmlFiles = Parallel(
                n_jobs=options.nWorkers,
                prefer='threads')(delayed(render_images_for_directory)(
                    iDir, directoryHtmlFiles, suspiciousDetections, options)
                                  for iDir in tqdm(range(nDirs)))

        else:

            options.pbar = None

            # For each directory
            # iDir = 51
            for iDir in range(nDirs):
                # Add this directory to the master list of html files
                directoryHtmlFiles[iDir] = render_images_for_directory(
                    iDir, directoryHtmlFiles, suspiciousDetections, options)

            # ...for each directory

        # Write master html file

        masterHtmlFile = os.path.join(options.outputBase, 'index.html')
        os.makedirs(options.outputBase, exist_ok=True)
        toReturn.masterHtmlFile = masterHtmlFile

        with open(masterHtmlFile, 'w') as fHtml:

            fHtml.write('<html><body>\n')
            fHtml.write(
                '<h2><b>Repeat detections by directory</b></h2></br>\n')

            for iDir, dirHtmlFile in enumerate(directoryHtmlFiles):

                if dirHtmlFile is None:
                    continue

                relPath = os.path.relpath(dirHtmlFile, options.outputBase)
                dirName = dirsToSearch[iDir]

                # Remove non-ASCII characters before writing the link
                relPath = relPath.encode('ascii', 'ignore').decode('ascii')
                dirName = dirName.encode('ascii', 'ignore').decode('ascii')

                fHtml.write('<a href="{}">{}</a><br/>\n'.format(
                    relPath, dirName))

            fHtml.write('</body></html>\n')

    # ...if we're rendering html

    toReturn.allRowsFiltered = update_detection_table(toReturn, options,
                                                      outputFilename)

    # Create filtering directory
    if options.bWriteFilteringFolder:

        print('Creating filtering folder...')

        dateString = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
        filteringDir = os.path.join(options.outputBase,
                                    'filtering_' + dateString)
        os.makedirs(filteringDir, exist_ok=True)

        # iDir = 0; suspiciousDetectionsThisDir = suspiciousDetections[iDir]
        for iDir, suspiciousDetectionsThisDir in enumerate(
                tqdm(suspiciousDetections)):

            # suspiciousDetectionsThisDir is a list of DetectionLocation objects
            # iDetection = 0; detection = suspiciousDetectionsThisDir[0]
            for iDetection, detection in enumerate(
                    suspiciousDetectionsThisDir):

                instance = detection.instances[0]
                relativePath = instance.filename
                outputRelativePath = 'dir{:0>4d}_det{:0>4d}_n{:0>4d}.jpg'.format(
                    iDir, iDetection, len(detection.instances))
                outputFullPath = os.path.join(filteringDir, outputRelativePath)

                if is_sas_url(options.imageBase):
                    inputFullPath = relative_sas_url(options.imageBase,
                                                     relativePath)
                else:
                    inputFullPath = os.path.join(options.imageBase,
                                                 relativePath)
                    assert os.path.isfile(inputFullPath), \
                        'Not a file: {}'.format(inputFullPath)

                try:
                    render_bounding_box(detection,
                                        inputFullPath,
                                        outputFullPath,
                                        lineWidth=options.lineThickness,
                                        expansion=options.boxExpansion)
                except Exception as e:
                    print(
                        'Warning: error rendering bounding box from {} to {}: {}'
                        .format(inputFullPath, outputFullPath, e))
                    if options.bFailOnRenderError:
                        raise
                detection.sampleImageRelativeFileName = outputRelativePath

        # Write out the detection index
        detectionIndexFileName = os.path.join(filteringDir,
                                              DETECTION_INDEX_FILE_NAME)
        jsonpickle.set_encoder_options('json', sort_keys=True, indent=4)
        s = jsonpickle.encode(suspiciousDetections)
        with open(detectionIndexFileName, 'w') as f:
            f.write(s)
        toReturn.filterFile = detectionIndexFileName

        print('Done')

    # ...if we're writing filtering info

    return toReturn
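
A minimal usage sketch for find_repeat_detections, assuming the surrounding module's imports and classes (RepeatDetectionOptions, RepeatDetectionResults) are available and that RepeatDetectionOptions provides workable defaults for the fields not set below. The option fields are those referenced in the code above; the paths, filenames, and threshold value are hypothetical placeholders, not real defaults.

# Hypothetical usage sketch; paths, filenames, and values are placeholders.
options = RepeatDetectionOptions()
options.imageBase = '/datadrive/camera_trap_images'    # hypothetical image root
options.outputBase = '/datadrive/rde_output'           # hypothetical output folder
options.occurrenceThreshold = 10                       # assumed minimum repeat count
options.bWriteFilteringFolder = True
options.bRenderHtml = False

results = find_repeat_detections('detections.json',
                                 outputFilename='detections_filtered.json',
                                 options=options)
print('Flagged suspicious detections in {} directories'.format(
    len(results.suspiciousDetections)))
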
Example #2
def render_images_for_directory(iDir, directoryHtmlFiles, suspiciousDetections,
                                options):

    nDirs = len(directoryHtmlFiles)

    if options.pbar is not None:
        options.pbar.update()

    if options.debugMaxRenderDir > 0 and iDir > options.debugMaxRenderDir:
        return None

    dirName = 'dir{:0>4d}'.format(iDir)

    # suspiciousDetectionsThisDir is a list of DetectionLocation objects
    suspiciousDetectionsThisDir = suspiciousDetections[iDir]

    if len(suspiciousDetectionsThisDir) == 0:
        return None

    timeStr = datetime.now().strftime('%H:%M:%S')
    print('Processing directory {} of {} ({})'.format(iDir, nDirs, timeStr))

    dirBaseDir = os.path.join(options.outputBase, dirName)
    os.makedirs(dirBaseDir, exist_ok=True)

    directoryDetectionHtmlFiles = []
    directoryDetectionImageInfo = []

    # For each problematic detection in this directory
    #
    # iDetection = 0; detection = suspiciousDetectionsThisDir[iDetection];
    nDetections = len(suspiciousDetectionsThisDir)
    bPrintedMissingImageWarning = False

    # iDetection = 0; detection = suspiciousDetectionsThisDir[0]
    for iDetection, detection in enumerate(suspiciousDetectionsThisDir):

        if options.debugMaxRenderDetection > 0 and iDetection > options.debugMaxRenderDetection:
            break

        nInstances = len(detection.instances)
        print('Processing detection {} of {} ({} instances)'.format(
            iDetection, nDetections, nInstances))
        detectionName = 'detection{:0>4d}'.format(iDetection)
        detectionBaseDir = os.path.join(dirBaseDir, detectionName)
        os.makedirs(detectionBaseDir, exist_ok=True)

        # _ = pretty_print_object(detection)
        assert nInstances >= options.occurrenceThreshold

        imageInfo = []

        # Render images

        # iInstance = 0; instance = detection.instances[iInstance]
        for iInstance, instance in enumerate(detection.instances):

            if options.debugMaxRenderInstance >= 0 and iInstance >= options.debugMaxRenderInstance:
                break

            imageRelativeFilename = 'image{:0>4d}.jpg'.format(iInstance)
            imageOutputFilename = os.path.join(detectionBaseDir,
                                               imageRelativeFilename)
            thisImageInfo = {}
            thisImageInfo['filename'] = imageRelativeFilename
            confidence = instance.confidence
            confidenceStr = '{:.2f}'.format(confidence)
            t = confidenceStr + ' (' + instance.filename + ')'
            thisImageInfo['title'] = t
            imageInfo.append(thisImageInfo)

            if not is_sas_url(options.imageBase):
                inputFileName = os.path.join(options.imageBase,
                                             instance.filename)
                if not os.path.isfile(inputFileName):
                    if options.bPrintMissingImageWarnings:
                        if (options.missingImageWarningType
                                == 'all') or (not bPrintedMissingImageWarning):
                            print('Warning: could not find file {}'.format(
                                inputFileName))
                            bPrintedMissingImageWarning = True
                    continue
            else:
                inputFileName = relative_sas_url(options.imageBase,
                                                 instance.filename)
            render_bounding_box(detection,
                                inputFileName,
                                imageOutputFilename,
                                lineWidth=options.lineThickness,
                                expansion=options.boxExpansion)

        # ...for each instance

        # Write html for this detection
        detectionHtmlFile = os.path.join(detectionBaseDir, 'index.html')

        htmlOptions = write_html_image_list.write_html_image_list()
        htmlOptions['defaultImageStyle'] = 'max-width:650px;'
        write_html_image_list.write_html_image_list(detectionHtmlFile,
                                                    imageInfo, htmlOptions)

        thisDirectoryImageInfo = {}
        directoryDetectionHtmlFiles.append(detectionHtmlFile)

        # Use the first image from this detection (arbitrary) as the canonical example
        # that we'll render for the directory-level page.
        thisDirectoryImageInfo['filename'] = os.path.join(
            detectionName, imageInfo[0]['filename'])
        detectionHtmlFileRelative = os.path.relpath(detectionHtmlFile,
                                                    dirBaseDir)
        title = '<a href="{}">{}</a>'.format(detectionHtmlFileRelative,
                                             detectionName)
        thisDirectoryImageInfo['title'] = title
        directoryDetectionImageInfo.append(thisDirectoryImageInfo)

    # ...for each detection

    # Write the html file for this directory
    directoryHtmlFile = os.path.join(dirBaseDir, 'index.html')

    htmlOptions = write_html_image_list.write_html_image_list()
    htmlOptions['defaultImageStyle'] = 'max-width:650px;'
    write_html_image_list.write_html_image_list(directoryHtmlFile,
                                                directoryDetectionImageInfo,
                                                htmlOptions)

    return directoryHtmlFile
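
A minimal sketch of invoking render_images_for_directory for a single directory, outside the parallel loop in Example #1. The argument shapes (a pre-allocated list of per-directory HTML paths and one list of DetectionLocation objects per directory) are inferred from the call sites above; suspiciousDetections is assumed to come from find_repeat_detections, the paths are hypothetical, and RepeatDetectionOptions is assumed to supply defaults for the rendering-related fields (lineThickness, boxExpansion, debugMaxRenderDir, and so on).

# Hypothetical standalone invocation; assumes suspiciousDetections holds one
# list of DetectionLocation objects per directory, as built in Example #1.
options = RepeatDetectionOptions()
options.imageBase = '/datadrive/camera_trap_images'   # hypothetical image root
options.outputBase = '/datadrive/rde_output'          # hypothetical output folder
options.pbar = None                                   # no shared progress bar here

nDirs = len(suspiciousDetections)
directoryHtmlFiles = [None] * nDirs
directoryHtmlFiles[0] = render_images_for_directory(
    0, directoryHtmlFiles, suspiciousDetections, options)
if directoryHtmlFiles[0] is not None:
    print('Wrote {}'.format(directoryHtmlFiles[0]))
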