Example #1
0
    def create_descriptors(self):
        """
        Creates initial descriptors needed for the BoW by loading images in the trainSVM segment. 
        Returns: list of descriptors
        """
        # For testing purposes only load 15 classes and 20 images per class.
        descriptors = []
        totalNumberOfDescriptors = float(self.testData.numberOfClasses * self.testData.segmentSizeMean["trainSVM"])

        if Settings.G_DETAILED_CONSOLE_OUTPUT:
            print "\nCreating {0} {1} descriptors.".format(totalNumberOfDescriptors, str(self))
        for img, _ in self.testData.load_data("trainSVM", outputActions=False, resolutionSize=self.imageSize, transformation=self.transform):
            
            _, descriptor = self.compute_descriptor(img)
            # Add the descriptor to the other descriptors we have.
            if descriptor is None or len(descriptor) == 0:
                print "\n** Could not find descriptors and/or keypoints for Image. **\n"

                # save for failure analysis
                vPath = self.modelSaver.get_save_path_for_visualizations() + "/noKeyPoints/"
                utils.create_dir_if_necessary(vPath)
                fileName = utils.get_uuid() + ".jpg"
                cv.imwrite(vPath + fileName, img)
                continue
  
            descriptors.append(descriptor)   
            # clear to prevent messing up RAM.
            img = []

            # Print progress
            utils.show_progress(Settings.G_DETAILED_CONSOLE_OUTPUT, len(descriptors), totalNumberOfDescriptors, "Descriptor creation Progress:")

        return descriptors
Example #2
0
    def crop_bounding_boxes(self, boundingBoxFileName):        
        # segment data with only one segment
        self.segment_test_data({"all": 1})
        self.new_segmentation()

        numberOfImages = self.segmentSizeMean["all"] * self.numberOfClasses
        numberOfImagesDone = 0        
        currentClass = ""

        print "Starting bounding box cropping \n"
        for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
            path = self.get_root_path() + "/" + class_ + "/"
            if not currentClass == class_:
                currentClass = class_

                # contains all bounding boxes for a class. Key is the image id
                boundingBoxes = {}

                # find bounding box file
                try:
                    with open(path + boundingBoxFileName, "r") as f:
                        firstLine = True
                        for line in f:
                            # skip the first line because the first line only contains the header
                            if firstLine:
                                firstLine = False
                                continue
                            data = line.split()
                            boundingBoxes[data[0]] = ImageRegion(upperLeft=(int(data[1]), int(data[2])), lowerRight=(int(data[3]), int(data[4])))
                except:
                    logging.exception("Could not open bounding box file under path {0}.".format(str(path + boundingBoxFileName)))


            # crop bounding box
            fileId = osPath.splitext(fileName)[0]
            if not fileId in boundingBoxes:
                print "Could not find bounding box for image",fileName
                continue
            else:
                bb = boundingBoxes[fileId]
                try:
                    img = bb.crop_image_region(img)
                except:
                    logging.exception("Could not crop bounding box for image id {0}. Bounding Box: {1} - {2}".format(fileId, bb.upperLeft, bb.lowerRight))
              
            path += fileName    
            cv.imwrite(path, img)

            numberOfImagesDone += 1
            utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class {0}.\tTotal progress:", class_)
        print "Cropping finished."
        print "\n"

        if Settings.G_MAIL_REPORTS:
            MailServer.send_mail("", "cropping finished")

        raw_input("Press any key to continue.")
Example #3
0
    def normalize_test_data(self, size, newName="", forceOverwrite=False):
        normTestDataRootPath = utils.get_parent_dir(self.get_root_path()) + "/"
        if newName == "":
            if not forceOverwrite:
                overwrite = utils.radio_question("[?]", "Do you really wish to overwrite existing images?", None, ["Yes", "No"], [True, False])
            else:
                overwrite = True
            if not overwrite:
                normTestDataRootPath += utils.value_question("", "Provide new foldername:", "s")
            else:
                normTestDataRootPath = self.get_root_path()
        else:
            normTestDataRootPath += newName

        utils.create_dir_if_necessary(normTestDataRootPath)
        print "Saving equalized test data set in path",normTestDataRootPath

        # segment data with only one segment
        self.segment_test_data({"all": 1})
        self.new_segmentation()

        numberOfImages = self.testDataSize
        numberOfImagesDone = 0
        
        currentClass = ""

        print "Starting equalization.\n"
        for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):

            path = normTestDataRootPath + "/" + class_ + "/" 
            # reset counter if new class
            if not currentClass == class_:
                currentClass = class_                
                utils.create_dir_if_necessary(path)
                                
            resizedImg = utils.equalize_image_size(img, size)
            path += fileName
            cv.imwrite(path, resizedImg)
            numberOfImagesDone += 1
            utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class {0}.\tTotal progress:", currentClass)
        print "\nEqualization finished."
        print "\n"
Example #4
0
    def add_prefix_to_test_data(self, prefix):
        # terrible code. I know renaming is much easier but this was simple copy paste from normalize_test_data


        # segment data with only one segment
        self.segment_test_data({"all": 1})
        self.new_segmentation()

        numberOfImages = self.testDataSize
        numberOfImagesDone = 0        

        print "Starting renaming to {0}_XX.jpg.\n".format(prefix)
        for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
            oldPath = self.get_root_path() + "/" + class_ + "/" + fileName    
            path = self.get_root_path() + "/" + class_ + "/" + prefix + "_" + fileName    
            remove(oldPath)
            cv.imwrite(path, img)

            numberOfImagesDone += 1
            utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class {0}.\tTotal progress:", class_)
        print "Renaming finished."
Example #5
0
    def crop_test_data_to_square(self, manuallyDecideFolderName):
        self.segment_test_data({"all": 1})
        self.new_segmentation()

        numberOfImages = self.testDataSize
        numberOfImagesDone = 0        
        rejectedImages = 0

        manDir = utils.get_parent_dir(self.get_root_path()) + "/" + manuallyDecideFolderName

        print "Starting cropping to square aspect ratio. Files that can't be processed automatically will be saved in path {0}.\n".format(manDir)

        currentClass = ""
        for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
            currentFilePath = self.get_root_path() + "/" + class_ + "/" + fileName   

            if not currentClass == class_:
                currentClass = class_
                manDir = utils.get_parent_dir(self.get_root_path()) + "/" + manuallyDecideFolderName + "/" + class_ + "/" 
                utils.create_dir_if_necessary(manDir)

            croppedImg = utils.crop_to_square(img)
            if croppedImg is None:
                # could not crop image to square because aspect ration was to big / small
                # save to path were we have to decide manually and remove the other image
                cv.imwrite(manDir + fileName, img)
                remove(currentFilePath)
                rejectedImages += 1
            else:
                cv.imwrite(currentFilePath, croppedImg)
            numberOfImagesDone += 1
            utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class \t{0}.\tRejected images:{1}\tTotal progress:", class_, rejectedImages)
        print "\n\nCropping finished. Rejected images:{0}".format(rejectedImages)
        print "\n"

        if Settings.G_MAIL_REPORTS:
            MailServer.send_mail("Rejected images:{0}".format(rejectedImages), "cropping finished")

        raw_input("Press any key to continue.")
Example #6
0
    def __augment_test_data_iteration(self, normTestDataRootPath, iteration, iterationOps, numberOfOps, uniqueOps=True, saveOriginalImage=False):
        """
        normTestDataRootPath: root path for the dataset
        iteration: number of current iteration (only cosmetic)
        iterationOps: list of iteration operation tuples [(function, [possibleParams])]
        numberOfOps: number of ops to perform - format tuple range (min, max)
        uniqueOps: should operations be unique (only one op of this type per image)
        saveOriginalImage: should we save the original image
        """
        print "\n\nIteration {0}:\n".format(iteration)

         # segment test data
        self.segment_test_data({"all": 1})
        self.new_segmentation()

        numberOfImages = self.testDataSize
        numberOfImagesDone = 0
        numberOfnewImagesDone = 0
        
        currentClass = ""
        for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
            fileId = osPath.splitext(fileName)[0]
            path = normTestDataRootPath + "/" + class_ + "/" 
            # reset counter if new class
            if not currentClass == class_:
                currentClass = class_                
                utils.create_dir_if_necessary(path)
            path += str(numberOfImagesDone) + "_it" + str(iteration)
            
            # calculate the actual number of ops for this image using the numberOfOps range tuple
            numberOfOpsForImage = randint(numberOfOps[0], numberOfOps[1])

            # get a list of unique indices for operations to perform on images if uniqueOps requires it
            ops = []
            if uniqueOps:
                ops = sample(range(len(iterationOps)), numberOfOpsForImage)
            else:
                ops = [randint(0,len(iterationOps)-1) for _ in xrange(numberOfOpsForImage)]
            for op in ops:
                changedImg = None
                # check if op needs a parameter
                if iterationOps[op][1]:
                    parameterIndex = randint(0, len(iterationOps[op][1])-1)
                    changedImg = iterationOps[op][0](img, iterationOps[op][1][parameterIndex])
                else:
                    changedImg = iterationOps[op][0](img)
                changedImgPath = path + "_" + str(numberOfnewImagesDone) + "_OP" + str(op) + ".jpg"
                cv.imwrite(changedImgPath, changedImg)
                numberOfnewImagesDone += 1
            # save original image in new dataset if saveOriginalImage requires it
            if saveOriginalImage:
                cv.imwrite(path + ".jpg", img)
            numberOfImagesDone += 1
            utils.show_progress(True, numberOfImagesDone, numberOfImages, "Iteration {0} - New images: {1}\tProgress:", iteration, numberOfnewImagesDone)
        numberOfTotalImages = numberOfImagesDone + numberOfnewImagesDone
        print "\nIteration {0} done.\nCurrent number of images in data set: {1}\nReloading dataset.".format(iteration, numberOfTotalImages)

        self.reset_data_set()
        loadedSize = self.set_root_path(normTestDataRootPath)
        if not loadedSize >= numberOfTotalImages:
            print "Reloading was not successfull! Number of actual reloaded images: {0} - Expected number of images: {1}.".format(loadedSize, numberOfTotalImages)
            raw_input("Press any key to continue.")
            return None
        print "Reloading successfull."
        return numberOfnewImagesDone
Example #7
0
    def load_data(self, segmentIndex, numberOfClasses=None, classes=[], grayscale=True, resizeFactor=1, outputActions=True, maxNumberOfImagesPerClass=-1, yieldFilename=False, size=(-1,-1), transformation=None, resolutionSize=-1, forceNormalization=0, forceZNormalization=0, forceZcaWhitening=0):
        """Loads the images of the test data as a generator.

        Keyword argument 
        segmentIndex -- the segment index to load from. If default values were used in segment_test_data use "train" or "test"
        numberOfClasses -- the number of classes to load (default None (load all classes))
        classes -- explicitly specified classes (default [])
        grayscale -- should images be loaded in grayscale or not (default True)
        maxNumberOfImagesPerClass -- limits the loading of images (default -1 means no limit)
        """
        #Parameter validation
        if segmentIndex not in self.segmentRatio and segmentIndex != "all":
            raise AttributeError(segmentIndex + " is not in segments")

        # test if size is valid
        try:
            sizeLen = len(size)
            if sizeLen != 2:
                raise AttributeError("size has to be a 2-element tuple. Use (-1, -1) if you don't care.")
        except:
            raise AttributeError("size has to be a 2-element tuple. Use (-1, -1) if you don't care.")             

        desired_h = size[0]
        desired_w = size[1]  

        if (desired_h / desired_w) != 1.0:
            raise AttributeError("The current version does not support an aspect ratio other than 1.0.")
        
        if resizeFactor != 1 and resolutionSize != -1:
            raise AttributeError("You can't set resizeFactor and resolutionSize at the same time.")

        # if forceX parameter is 0 it means do not change. -1 -> force false, +1 -> force true
        forceNormalization = (forceNormalization == 0 and Settings.F_APPLY_NORMALIZATION) or forceNormalization == 1
        forceZNormalization = (forceZNormalization == 0 and Settings.F_APPLY_ZNORMALIZATION) or forceZNormalization == 1

        if (forceNormalization and forceZNormalization):
            raise AttributeError("You can't apply normalization (mean substraction) and z-normalization (normalization divided by std) at the same time.")


        # calculate mean and std if normalization enabled and needed
        if forceNormalization or forceZNormalization:
            if desired_h == -1:
                if resolutionSize == -1:
                    raise AttributeError("If F_APPLY_NORMALIZATION or F_APPLY_ZNORMALIZATION is enabled the image must have a fixed size or at least a fixed resolution size.")
                else:
                    desired_h = desired_w = int(sqrt(resolutionSize))
                    size = (desired_w, desired_h)
                

            if len(self.mean) == 0 or len(self.std) == 0:
                self.__calculate_mean_std(size, grayscale)

        preprocessImage = forceNormalization or forceZNormalization or Settings.F_APPLY_ZCA_WHITENING or Settings.F_APPLY_CLAHE or Settings.F_APPLY_HISTOGRAM_EQ

        if numberOfClasses is None:
            numberOfClasses = self.numberOfClasses
        else:
            numberOfClasses = min(numberOfClasses, self.numberOfClasses)

        if not classes:
            #Fill classes with numberOfClasses count classes
            classes = [key for key in self.__classDictionary]
            # only take numberOfClasses classes
            classes = classes[:numberOfClasses]

        limitImageLoading = True
        if maxNumberOfImagesPerClass == -1:
            limitImageLoading = False
        else: 
            maxNumberOfImagesPerClass = max(1, maxNumberOfImagesPerClass) # load at least one image


        #Load flag for cv.imread.
        loadFlag = cv.IMREAD_GRAYSCALE if grayscale else cv.IMREAD_UNCHANGED
        if outputActions:
            print "Loading dataset {0} {1}. |Classes: {2}| - number of images per segment: {3}".format(segmentIndex, self.__segmentSliceIndex[segmentIndex], numberOfClasses, self.segmentSizeMean[segmentIndex])
            print "Loading in grayscale",grayscale
        for class_ in classes:
            segmentedSamples = self.__classDictionary[class_]
            
            if segmentIndex != "all":
                samples = segmentedSamples[segmentIndex]
            else:
                samples = []
                for segId in self.__classDictionary[class_]:
                    samples.extend(self.__classDictionary[class_][segId])

            if limitImageLoading:
                samplesToLoad = min(maxNumberOfImagesPerClass, len(samples))
            else:
                samplesToLoad = len(samples)            
            if outputActions:
                print "\n"
            
            errors = 0
            lastError = ""
            for i in xrange(samplesToLoad):
                filename = self.get_root_path() + class_ + "/" + samples[i]
                img = load_image(filename, loadFlag, resizeFactor)
                # img is None if the image could not be read. imread cannot read .gif for example
                if img is None:
                    errors += 1
                    lastError = "Image {0} was None.\nSamples:{1}".format(filename, samples)
                    continue

                # if image has an alpha channel reduce
                if len(img.shape) > 2 and img.shape[2] == 4L:
                    img = cv.cvtColor(img, cv.COLOR_BGRA2BGR)

                # do we need to crop and adjust the size
                if desired_h != -1 and desired_w != -1:
                    img = utils.crop_to_square(img)
                    desiredArea = desired_h * desired_w
                    img = utils.equalize_image_size(img, desiredArea)
                    if img is None:
                        errors += 1
                        lastError = "Image {0} after eq-1 was None".format(filename)
                        continue
                    if desired_w != img.shape[1] or desired_h != img.shape[0]:
                        img = utils.crop_around_center(img, desired_w, desired_h)
                        if img is None:
                            errors += 1
                            lastError = "Image {0} after cropping was None".format(filename)
                            continue

                # resize image to set size
                if resolutionSize != -1 and size[0] == -1 and size[1] == -1:
                    try:
                        img = utils.equalize_image_size(img, resolutionSize)
                    except:
                        logging.exception("Could not eq image size: img.shape: {0} - new size: {1}".format(img.shape, resolutionSize))
                        lastError = "Image eq-2 exception".format(filename)
                        img = None
                    if img is None:
                        lastError = "Image {0} after eq-2 was None".format(filename)
                        errors += 1
                        continue


                if not transformation is None:
                    try:                        
                        img = transformation(img)
                    except Exception, e:
                        errors += 1
                        lastError = "Exception during image transformation:",e.message
                        continue

                if preprocessImage:
                    try:                        
                        img = self.__preprocess_image(img, forceNormalization, forceZNormalization, grayscale)
                    except Exception, e:
                        errors += 1
                        lastError = "Exception during image preprocessing:",e.message
                        continue


                utils.show_progress(outputActions, i+1, samplesToLoad, "Loading {0} images in class {1} (Loss: {2}):", samplesToLoad, class_, errors)
                if yieldFilename:
                    yield (img, class_, samples[i])
                else:
                    yield (img, class_)
Example #8
0
    def __create_ast_vectors(self):
        """
        Creates abstract syntax trees for each class in each project.
        Tokens will be reused throughout all projects.

        For every source file that can be parsed, the feature vector of its tree is
        appended to self.test_data_X and its number of bugs to self.test_data_Y.
        The entry in self.test_data is replaced by a tuple that additionally
        contains the tree and the feature vector. For every project the interval
        (start_index, end_index) of its rows in test_data_X / Y is appended to
        self.test_data_project_indices.
        """
        # get the total number of classes for all projects
        number_of_classes = sum(
            [len(self.test_data[i]) for i in range(self.num_projects)])
        # running index into test_data_X / Y across all projects
        current_data_set_index = 0
        for project_index in range(self.num_projects):
            project_test_data = self.test_data[project_index]

            # start index for the test_data_X / Y which contains all features for every project.
            start_index = current_data_set_index

            # counter for self.test_data index. This index starts at zero for each project because the projects are sep. in test_data.
            project_test_data_index = 0

            logger.debug(
                'Creating abstract syntax trees for project {0}. {1} classes.'.
                format(project_index, len(project_test_data)))
            for (class_info, path_to_class_file,
                 number_of_bugs) in project_test_data:
                # open file and read it
                source_code = ''
                with open(path_to_class_file, 'rb') as f:
                    source_code = f.read()

                try:
                    tree = javalang.parse.parse(source_code)
                except:
                    # files that can't be parsed are logged and left out of test_data_X / Y
                    # NOTE(review): project_test_data_index is not advanced here, so the
                    # next parsed class is written to this entry's position in
                    # self.test_data - confirm this is intended.
                    logger.exception(
                        'Could not parse sourcefile {0} (Path: {1}) (Project {2}). (Syntax errors)'
                        .format(class_info, path_to_class_file, project_index))
                    continue

                # try to generate feature vector
                tree_feature_vector = self.__convert_tree_to_feature_vector_unstructured(
                    tree)
                self.test_data_X.append(tree_feature_vector)
                self.test_data_Y.append(number_of_bugs)

                # replace existing test_data entry tuples with additional info
                self.test_data[project_index][project_test_data_index] = (
                    class_info, path_to_class_file, number_of_bugs, tree,
                    tree_feature_vector)

                project_test_data_index += 1
                current_data_set_index += 1
                utils.show_progress(
                    True, current_data_set_index, number_of_classes,
                    'AST creation for {0} classes.\tToken mappings: {1}:',
                    number_of_classes, len(self.token_mapping_names))
            print('\n')
            logger.debug(
                'AST creation for project {0} done. Progress: {1:.2f}%'.format(
                    project_index,
                    ((current_data_set_index / number_of_classes) * 100)))

            # end_index is inclusive: index of the last row that belongs to this project
            end_index = current_data_set_index - 1
            logger.debug(
                'Data set index interval for project {0}: {1} to {2}'.format(
                    project_index, start_index, end_index))
            self.test_data_project_indices.append((start_index, end_index))
            print('')
        print('\n**')