def create_descriptors(self):
    """
    Creates the initial descriptors needed for the BoW by loading the images
    of the trainSVM segment.

    Returns: list of descriptors
    """
    # For testing purposes only load 15 classes and 20 images per class.
    descriptors = []
    totalNumberOfDescriptors = float(self.testData.numberOfClasses * self.testData.segmentSizeMean["trainSVM"])
    if Settings.G_DETAILED_CONSOLE_OUTPUT:
        print "\nCreating {0} {1} descriptors.".format(totalNumberOfDescriptors, str(self))
    for img, _ in self.testData.load_data("trainSVM", outputActions=False, resolutionSize=self.imageSize, transformation=self.transform):
        _, descriptor = self.compute_descriptor(img)
        if descriptor is None or len(descriptor) == 0:
            print "\n** Could not find descriptors and/or keypoints for image. **\n"
            # Save the image for failure analysis.
            vPath = self.modelSaver.get_save_path_for_visualizations() + "/noKeyPoints/"
            utils.create_dir_if_necessary(vPath)
            fileName = utils.get_uuid() + ".jpg"
            cv.imwrite(vPath + fileName, img)
            continue
        # Add the descriptor to the other descriptors we have.
        descriptors.append(descriptor)
        # Clear the image reference to keep RAM usage low.
        img = []
        # Print progress.
        utils.show_progress(Settings.G_DETAILED_CONSOLE_OUTPUT, len(descriptors), totalNumberOfDescriptors, "Descriptor creation progress:")
    return descriptors
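# Illustrative sketch (not part of the original module): the descriptor list returned by
# create_descriptors is typically stacked and clustered into a BoW vocabulary. The names
# `extractor` and `vocabularySize` below are assumptions for illustration only.
#
#   import numpy as np
#   import cv2 as cv
#
#   descriptors = extractor.create_descriptors()
#   bowTrainer = cv.BOWKMeansTrainer(vocabularySize)   # e.g. vocabularySize = 1000 visual words
#   for descriptor in descriptors:
#       bowTrainer.add(np.float32(descriptor))         # BOWKMeansTrainer expects float32 descriptors
#   vocabulary = bowTrainer.cluster()                  # (vocabularySize x descriptorDim) matrix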
def crop_bounding_boxes(self, boundingBoxFileName):
    # Segment the data with only one segment.
    self.segment_test_data({"all": 1})
    self.new_segmentation()

    numberOfImages = self.segmentSizeMean["all"] * self.numberOfClasses
    numberOfImagesDone = 0
    currentClass = ""
    print "Starting bounding box cropping.\n"
    for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
        path = self.get_root_path() + "/" + class_ + "/"
        if not currentClass == class_:
            currentClass = class_
            # Contains all bounding boxes for a class. Key is the image id.
            boundingBoxes = {}
            # Find and parse the bounding box file for this class.
            try:
                with open(path + boundingBoxFileName, "r") as f:
                    firstLine = True
                    for line in f:
                        # Skip the first line because it only contains the header.
                        if firstLine:
                            firstLine = False
                            continue
                        data = line.split()
                        boundingBoxes[data[0]] = ImageRegion(upperLeft=(int(data[1]), int(data[2])), lowerRight=(int(data[3]), int(data[4])))
            except:
                logging.exception("Could not open bounding box file under path {0}.".format(str(path + boundingBoxFileName)))

        # Crop the bounding box.
        fileId = osPath.splitext(fileName)[0]
        if fileId not in boundingBoxes:
            print "Could not find bounding box for image", fileName
            continue
        else:
            bb = boundingBoxes[fileId]
            try:
                img = bb.crop_image_region(img)
            except:
                logging.exception("Could not crop bounding box for image id {0}. Bounding Box: {1} - {2}".format(fileId, bb.upperLeft, bb.lowerRight))

        path += fileName
        cv.imwrite(path, img)
        numberOfImagesDone += 1
        utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class {0}.\tTotal progress:", class_)

    print "Cropping finished."
    print "\n"
    if Settings.G_MAIL_REPORTS:
        MailServer.send_mail("", "cropping finished")
    raw_input("Press any key to continue.")
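# Sketch of the bounding box file format that crop_bounding_boxes expects. This is an
# assumption inferred from the parsing code above, not documented in the source: one header
# line, then whitespace-separated rows of "<imageId> <upperLeftX> <upperLeftY> <lowerRightX> <lowerRightY>".
# The imageId must match the image file name without its extension, e.g.:
#
#   imageId x1 y1 x2 y2
#   IMG_0001 34 12 410 388
#   IMG_0002 5 40 300 290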
def normalize_test_data(self, size, newName="", forceOverwrite=False):
    normTestDataRootPath = utils.get_parent_dir(self.get_root_path()) + "/"
    if newName == "":
        if not forceOverwrite:
            overwrite = utils.radio_question("[?]", "Do you really wish to overwrite existing images?", None, ["Yes", "No"], [True, False])
        else:
            overwrite = True
        if not overwrite:
            normTestDataRootPath += utils.value_question("", "Provide new foldername:", "s")
        else:
            normTestDataRootPath = self.get_root_path()
    else:
        normTestDataRootPath += newName
    utils.create_dir_if_necessary(normTestDataRootPath)
    print "Saving equalized test data set in path", normTestDataRootPath

    # Segment the data with only one segment.
    self.segment_test_data({"all": 1})
    self.new_segmentation()

    numberOfImages = self.testDataSize
    numberOfImagesDone = 0
    currentClass = ""
    print "Starting equalization.\n"
    for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
        path = normTestDataRootPath + "/" + class_ + "/"
        # Create the class directory if this is a new class.
        if not currentClass == class_:
            currentClass = class_
            utils.create_dir_if_necessary(path)
        resizedImg = utils.equalize_image_size(img, size)
        path += fileName
        cv.imwrite(path, resizedImg)
        numberOfImagesDone += 1
        utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class {0}.\tTotal progress:", currentClass)
    print "\nEqualization finished."
    print "\n"
def add_prefix_to_test_data(self, prefix):
    # Terrible code, I know. Renaming would be much easier, but this was a simple copy & paste from normalize_test_data.

    # Segment the data with only one segment.
    self.segment_test_data({"all": 1})
    self.new_segmentation()

    numberOfImages = self.testDataSize
    numberOfImagesDone = 0
    print "Starting renaming to {0}_XX.jpg.\n".format(prefix)
    for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
        oldPath = self.get_root_path() + "/" + class_ + "/" + fileName
        path = self.get_root_path() + "/" + class_ + "/" + prefix + "_" + fileName
        remove(oldPath)
        cv.imwrite(path, img)
        numberOfImagesDone += 1
        utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class {0}.\tTotal progress:", class_)
    print "Renaming finished."
def crop_test_data_to_square(self, manuallyDecideFolderName):
    self.segment_test_data({"all": 1})
    self.new_segmentation()

    numberOfImages = self.testDataSize
    numberOfImagesDone = 0
    rejectedImages = 0
    manDir = utils.get_parent_dir(self.get_root_path()) + "/" + manuallyDecideFolderName
    print "Starting cropping to square aspect ratio. Files that can't be processed automatically will be saved in path {0}.\n".format(manDir)
    currentClass = ""
    for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
        currentFilePath = self.get_root_path() + "/" + class_ + "/" + fileName
        if not currentClass == class_:
            currentClass = class_
            manDir = utils.get_parent_dir(self.get_root_path()) + "/" + manuallyDecideFolderName + "/" + class_ + "/"
            utils.create_dir_if_necessary(manDir)
        croppedImg = utils.crop_to_square(img)
        if croppedImg is None:
            # Could not crop the image to a square because the aspect ratio was too big / too small.
            # Save it to the path where we have to decide manually and remove the original image.
            cv.imwrite(manDir + fileName, img)
            remove(currentFilePath)
            rejectedImages += 1
        else:
            cv.imwrite(currentFilePath, croppedImg)
        numberOfImagesDone += 1
        utils.show_progress(True, numberOfImagesDone, numberOfImages, "Processing class \t{0}.\tRejected images: {1}\tTotal progress:", class_, rejectedImages)

    print "\n\nCropping finished. Rejected images: {0}".format(rejectedImages)
    print "\n"
    if Settings.G_MAIL_REPORTS:
        MailServer.send_mail("Rejected images: {0}".format(rejectedImages), "cropping finished")
    raw_input("Press any key to continue.")
def __augment_test_data_iteration(self, normTestDataRootPath, iteration, iterationOps, numberOfOps, uniqueOps=True, saveOriginalImage=False):
    """
    normTestDataRootPath: root path of the dataset
    iteration: number of the current iteration (only cosmetic)
    iterationOps: list of iteration operation tuples [(function, [possibleParams])] (see the sketch after this method)
    numberOfOps: number of ops to perform - given as a range tuple (min, max)
    uniqueOps: should operations be unique (only one op of each type per image)
    saveOriginalImage: should the original image be saved as well
    """
    print "\n\nIteration {0}:\n".format(iteration)

    # Segment the test data with only one segment.
    self.segment_test_data({"all": 1})
    self.new_segmentation()

    numberOfImages = self.testDataSize
    numberOfImagesDone = 0
    numberOfnewImagesDone = 0
    currentClass = ""
    for img, class_, fileName in self.load_data("all", grayscale=False, outputActions=False, yieldFilename=True):
        fileId = osPath.splitext(fileName)[0]
        path = normTestDataRootPath + "/" + class_ + "/"
        # Create the class directory if this is a new class.
        if not currentClass == class_:
            currentClass = class_
            utils.create_dir_if_necessary(path)
        path += str(numberOfImagesDone) + "_it" + str(iteration)

        # Calculate the actual number of ops for this image using the numberOfOps range tuple.
        numberOfOpsForImage = randint(numberOfOps[0], numberOfOps[1])

        # Get a list of indices of the operations to perform on the image (unique indices if uniqueOps requires it).
        ops = []
        if uniqueOps:
            ops = sample(range(len(iterationOps)), numberOfOpsForImage)
        else:
            ops = [randint(0, len(iterationOps) - 1) for _ in xrange(numberOfOpsForImage)]

        for op in ops:
            changedImg = None
            # Check if the op needs a parameter.
            if iterationOps[op][1]:
                parameterIndex = randint(0, len(iterationOps[op][1]) - 1)
                changedImg = iterationOps[op][0](img, iterationOps[op][1][parameterIndex])
            else:
                changedImg = iterationOps[op][0](img)
            changedImgPath = path + "_" + str(numberOfnewImagesDone) + "_OP" + str(op) + ".jpg"
            cv.imwrite(changedImgPath, changedImg)
            numberOfnewImagesDone += 1

        # Save the original image in the new dataset if saveOriginalImage requires it.
        if saveOriginalImage:
            cv.imwrite(path + ".jpg", img)
        numberOfImagesDone += 1
        utils.show_progress(True, numberOfImagesDone, numberOfImages, "Iteration {0} - New images: {1}\tProgress:", iteration, numberOfnewImagesDone)

    numberOfTotalImages = numberOfImagesDone + numberOfnewImagesDone
    print "\nIteration {0} done.\nCurrent number of images in data set: {1}\nReloading dataset.".format(iteration, numberOfTotalImages)
    self.reset_data_set()
    loadedSize = self.set_root_path(normTestDataRootPath)
    if not loadedSize >= numberOfTotalImages:
        print "Reloading was not successful! Number of actually reloaded images: {0} - Expected number of images: {1}.".format(loadedSize, numberOfTotalImages)
        raw_input("Press any key to continue.")
        return None
    print "Reloading successful."
    return numberOfnewImagesDone
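# Illustrative sketch (assumption, not part of the original module): how an iterationOps
# list for __augment_test_data_iteration could look. Each entry is a tuple of
# (operation function, list of possible parameters); an empty parameter list means the
# operation is called with the image only. The util function names below are hypothetical.
#
#   iterationOps = [
#       (utils.flip_image_horizontal, []),           # called as op(img)
#       (utils.rotate_image, [90, 180, 270]),        # called as op(img, randomly chosen angle)
#       (utils.adjust_gamma, [0.5, 0.8, 1.2, 1.5]),
#   ]
#   # Perform 1 to 2 unique operations per image and keep the originals:
#   self.__augment_test_data_iteration(normTestDataRootPath, iteration=1, iterationOps=iterationOps,
#                                      numberOfOps=(1, 2), uniqueOps=True, saveOriginalImage=True)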
def load_data(self, segmentIndex, numberOfClasses=None, classes=[], grayscale=True, resizeFactor=1, outputActions=True, maxNumberOfImagesPerClass=-1, yieldFilename=False, size=(-1, -1), transformation=None, resolutionSize=-1, forceNormalization=0, forceZNormalization=0, forceZcaWhitening=0):
    """Loads the images of the test data as a generator.

    Keyword arguments:
    segmentIndex -- the segment index to load from. If the default values were used in segment_test_data use "train" or "test"
    numberOfClasses -- the number of classes to load (default None (load all classes))
    classes -- explicitly specified classes (default [])
    grayscale -- should images be loaded in grayscale or not (default True)
    maxNumberOfImagesPerClass -- limits the loading of images (default -1 means no limit)
    """
    # Parameter validation
    if segmentIndex not in self.segmentRatio and segmentIndex != "all":
        raise AttributeError(segmentIndex + " is not in segments")

    # Test if size is valid.
    try:
        sizeLen = len(size)
        if sizeLen != 2:
            raise AttributeError("size has to be a 2-element tuple. Use (-1, -1) if you don't care.")
    except:
        raise AttributeError("size has to be a 2-element tuple. Use (-1, -1) if you don't care.")

    desired_h = size[0]
    desired_w = size[1]
    if desired_h != desired_w:
        raise AttributeError("The current version does not support an aspect ratio other than 1.0.")

    if resizeFactor != 1 and resolutionSize != -1:
        raise AttributeError("You can't set resizeFactor and resolutionSize at the same time.")

    # If a forceX parameter is 0 it means "do not change". -1 -> force false, +1 -> force true.
    forceNormalization = (forceNormalization == 0 and Settings.F_APPLY_NORMALIZATION) or forceNormalization == 1
    forceZNormalization = (forceZNormalization == 0 and Settings.F_APPLY_ZNORMALIZATION) or forceZNormalization == 1
    if forceNormalization and forceZNormalization:
        raise AttributeError("You can't apply normalization (mean subtraction) and z-normalization (normalization divided by std) at the same time.")

    # Calculate mean and std if normalization is enabled and needed.
    if forceNormalization or forceZNormalization:
        if desired_h == -1:
            if resolutionSize == -1:
                raise AttributeError("If F_APPLY_NORMALIZATION or F_APPLY_ZNORMALIZATION is enabled the image must have a fixed size or at least a fixed resolution size.")
            else:
                desired_h = desired_w = int(sqrt(resolutionSize))
                size = (desired_w, desired_h)
        if len(self.mean) == 0 or len(self.std) == 0:
            self.__calculate_mean_std(size, grayscale)

    preprocessImage = forceNormalization or forceZNormalization or Settings.F_APPLY_ZCA_WHITENING or Settings.F_APPLY_CLAHE or Settings.F_APPLY_HISTOGRAM_EQ

    if numberOfClasses is None:
        numberOfClasses = self.numberOfClasses
    else:
        numberOfClasses = min(numberOfClasses, self.numberOfClasses)

    if not classes:
        # Fill classes with numberOfClasses classes.
        classes = [key for key in self.__classDictionary]
        # Only take numberOfClasses classes.
        classes = classes[:numberOfClasses]

    limitImageLoading = True
    if maxNumberOfImagesPerClass == -1:
        limitImageLoading = False
    else:
        maxNumberOfImagesPerClass = max(1, maxNumberOfImagesPerClass)  # load at least one image

    # Load flag for cv.imread.
    loadFlag = cv.IMREAD_GRAYSCALE if grayscale else cv.IMREAD_UNCHANGED

    if outputActions:
        print "Loading dataset {0} {1}. |Classes: {2}| - number of images per segment: {3}".format(segmentIndex, self.__segmentSliceIndex[segmentIndex], numberOfClasses, self.segmentSizeMean[segmentIndex])
        print "Loading in grayscale", grayscale

    for class_ in classes:
        segmentedSamples = self.__classDictionary[class_]
        if segmentIndex != "all":
            samples = segmentedSamples[segmentIndex]
        else:
            samples = []
            for segId in self.__classDictionary[class_]:
                samples.extend(self.__classDictionary[class_][segId])

        if limitImageLoading:
            samplesToLoad = min(maxNumberOfImagesPerClass, len(samples))
        else:
            samplesToLoad = len(samples)

        if outputActions:
            print "\n"

        errors = 0
        lastError = ""
        for i in xrange(samplesToLoad):
            filename = self.get_root_path() + class_ + "/" + samples[i]
            img = load_image(filename, loadFlag, resizeFactor)

            # img is None if the image could not be read. imread cannot read .gif for example.
            if img is None:
                errors += 1
                lastError = "Image {0} was None.\nSamples: {1}".format(filename, samples)
                continue

            # If the image has an alpha channel reduce it to three channels.
            if len(img.shape) > 2 and img.shape[2] == 4L:
                img = cv.cvtColor(img, cv.COLOR_BGRA2BGR)

            # Do we need to crop and adjust the size?
            if desired_h != -1 and desired_w != -1:
                img = utils.crop_to_square(img)
                desiredArea = desired_h * desired_w
                img = utils.equalize_image_size(img, desiredArea)
                if img is None:
                    errors += 1
                    lastError = "Image {0} after eq-1 was None".format(filename)
                    continue
                if desired_w != img.shape[1] or desired_h != img.shape[0]:
                    img = utils.crop_around_center(img, desired_w, desired_h)
                    if img is None:
                        errors += 1
                        lastError = "Image {0} after cropping was None".format(filename)
                        continue

            # Resize the image to the given resolution size.
            if resolutionSize != -1 and size[0] == -1 and size[1] == -1:
                try:
                    img = utils.equalize_image_size(img, resolutionSize)
                except:
                    logging.exception("Could not eq image size: img.shape: {0} - new size: {1}".format(img.shape, resolutionSize))
                    lastError = "Image {0} eq-2 exception".format(filename)
                    img = None
                if img is None:
                    lastError = "Image {0} after eq-2 was None".format(filename)
                    errors += 1
                    continue

            if transformation is not None:
                try:
                    img = transformation(img)
                except Exception, e:
                    errors += 1
                    lastError = "Exception during image transformation: {0}".format(e.message)
                    continue

            if preprocessImage:
                try:
                    img = self.__preprocess_image(img, forceNormalization, forceZNormalization, grayscale)
                except Exception, e:
                    errors += 1
                    lastError = "Exception during image preprocessing: {0}".format(e.message)
                    continue

            utils.show_progress(outputActions, i + 1, samplesToLoad, "Loading {0} images in class {1} (Loss: {2}):", samplesToLoad, class_, errors)

            if yieldFilename:
                yield (img, class_, samples[i])
            else:
                yield (img, class_)
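# Illustrative usage sketch (assumption, not part of the original module): load_data is a
# generator, so images are produced lazily, one at a time. `testData` stands for an instance
# of this class and the segment ratios are only an example.
#
#   testData.segment_test_data({"train": 0.8, "test": 0.2})
#   testData.new_segmentation()
#   for img, class_ in testData.load_data("train", grayscale=True, size=(256, 256)):
#       # img is a preprocessed numpy array, class_ is the name of its class folder
#       process(img, class_)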
def __create_ast_vectors(self):
    """
    Creates abstract syntax trees for each class in each project.
    Tokens will be reused throughout all projects.
    """
    # Get the total number of classes over all projects.
    number_of_classes = sum(
        [len(self.test_data[i]) for i in range(self.num_projects)])
    current_data_set_index = 0

    for project_index in range(self.num_projects):
        project_test_data = self.test_data[project_index]

        # Start index into test_data_X / test_data_Y, which contain the features of every project.
        start_index = current_data_set_index

        # Counter for the self.test_data index. This index starts at zero for each project because the projects are separated in test_data.
        project_test_data_index = 0

        logger.debug(
            'Creating abstract syntax trees for project {0}. {1} classes.'.format(
                project_index, len(project_test_data)))

        for (class_info, path_to_class_file, number_of_bugs) in project_test_data:
            # Open the source file and read it.
            source_code = ''
            with open(path_to_class_file, 'rb') as f:
                source_code = f.read()

            try:
                tree = javalang.parse.parse(source_code)
            except:
                logger.exception(
                    'Could not parse sourcefile {0} (Path: {1}) (Project {2}). (Syntax errors)'.format(
                        class_info, path_to_class_file, project_index))
                continue

            # Try to generate the feature vector.
            tree_feature_vector = self.__convert_tree_to_feature_vector_unstructured(tree)
            self.test_data_X.append(tree_feature_vector)
            self.test_data_Y.append(number_of_bugs)

            # Replace the existing test_data entry tuple with a tuple containing the additional info.
            self.test_data[project_index][project_test_data_index] = (
                class_info, path_to_class_file, number_of_bugs, tree, tree_feature_vector)

            project_test_data_index += 1
            current_data_set_index += 1
            utils.show_progress(
                True, current_data_set_index, number_of_classes,
                'AST creation for {0} classes.\tToken mappings: {1}:',
                number_of_classes, len(self.token_mapping_names))

        print('\n')
        logger.debug(
            'AST creation for project {0} done. Progress: {1:.2f}%'.format(
                project_index, (float(current_data_set_index) / number_of_classes) * 100))

        end_index = current_data_set_index - 1
        logger.debug(
            'Data set index interval for project {0}: {1} to {2}'.format(
                project_index, start_index, end_index))
        self.test_data_project_indices.append((start_index, end_index))

    print('')
    print('\n**')
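# Illustrative sketch (assumption inferred from the indexing above, not part of the original
# module): the (start_index, end_index) tuples collected in test_data_project_indices are
# inclusive bounds into test_data_X / test_data_Y, so the feature matrix of a single project
# can be recovered like this:
#
#   start, end = self.test_data_project_indices[project_index]
#   project_X = self.test_data_X[start:end + 1]
#   project_Y = self.test_data_Y[start:end + 1]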