def check(data_dir):
    """Print how many jpg paths under *data_dir* have no xml counterpart.

    Paths returned by ``get_files`` are compared after stripping the
    ``".jpg"`` / ``".xml"`` text from them, so an image and an annotation
    match when the remainder of their paths is identical.

    Args:
        data_dir: Directory handed to ``get_files`` for both lookups.
    """
    jpg_paths = get_files(data_dir, 'jpg')
    xml_paths = get_files(data_dir, 'xml')

    annotated = {path.replace(".xml", "") for path in xml_paths}
    pictured = {path.replace(".jpg", "") for path in jpg_paths}

    # Images whose stem never shows up among the annotation stems.
    unannotated = pictured - annotated
    print(len(unannotated))
def find_invalid_jpgs(data_path):
    """Print the path of every jpg under *data_path* that fails validation.

    Progress messages are printed before scanning. Validation itself is
    delegated to ``is_valid_jpg``.

    NOTE(review): a second ``find_invalid_jpgs`` is defined later in this
    file; at import time that later definition replaces this one.

    Args:
        data_path: Directory handed to ``get_files`` to collect jpg paths.
    """
    print("start to check!")
    candidates = get_files(data_path, 'jpg')
    print("there are %d files !" % len(candidates))

    # Lazy generator: each file is validated right before it may be printed.
    broken = (path for path in candidates if not is_valid_jpg(path))
    for path in broken:
        print(path)
def rename_file(data_dir):
    """Make each annotation's ``<filename>`` match the xml file's own name.

    For every xml path under *data_dir*, the ``./filename`` element is set to
    the xml's base name with ``".xml"`` replaced by ``".jpg"``, and the file
    is written back in place. Each new name is printed as it is applied.

    Args:
        data_dir: Directory handed to ``get_files`` to collect xml paths.
    """
    annotation_paths = get_files(data_dir, 'xml')
    print("there are %d xmls" % len(annotation_paths))

    for annotation_path in annotation_paths:
        document = ET.ElementTree(file=annotation_path)
        jpg_name = os.path.basename(annotation_path).replace(".xml", '.jpg')
        print(jpg_name)

        filename_node = document.find('./filename')
        filename_node.text = jpg_name
        document.write(annotation_path)
def get_shape(img_path):
    """Write each image's real width/height into its xml annotation.

    For every jpg under *img_path*, the image is loaded with ``cv2.imread``
    and the ``./size/width`` and ``./size/height`` elements of the annotation
    next to it (same path, ``.xml`` extension) are overwritten with the
    measured values. The annotation file is rewritten in place.

    Images that OpenCV cannot decode are reported and skipped instead of
    aborting the whole run.

    Args:
        img_path: Directory handed to ``get_files`` to collect jpg paths.
    """
    imgs = get_files(img_path, 'jpg')
    print("there are %d imgages in total !" % len(imgs))
    for img in imgs:
        pixels = cv2.imread(img)
        if pixels is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # rather than raising, so guard before touching ``.shape``.
            print("cannot read image, skipped: %s" % img)
            continue
        height, width = pixels.shape[:2]
        print(img)

        # Swap only the extension; a blanket str.replace('jpg', 'xml') would
        # also rewrite 'jpg' inside directory names or the file stem.
        xml_path = os.path.splitext(img)[0] + '.xml'

        tree = ET.ElementTree(file=xml_path)
        tree.find('./size/width').text = str(int(width))
        tree.find('./size/height').text = str(int(height))
        tree.write(xml_path)
def find_invalid_xmls(data_dir):
    """Report every xml under *data_dir* that contains invalid bounding boxes.

    Each annotation is passed to ``check_bboxes``, which yields a collection
    of invalid boxes and an accompanying info value. For annotations with at
    least one invalid box, the path, the info value and each box (rendered
    through ``format_bbox``) are printed.

    Args:
        data_dir: Directory handed to ``get_files`` to collect xml paths.
    """
    annotation_paths = get_files(data_dir, suffix='xml')
    print("there are %d files need to check" % len(annotation_paths))

    for annotation_path in annotation_paths:
        bad_boxes, details = check_bboxes(annotation_path)
        if len(bad_boxes) == 0:
            continue  # nothing wrong with this annotation

        print(annotation_path)
        print(details)
        for box in bad_boxes:
            print(format_bbox(box))
def get_stats(path):
    """Print how many objects of each label appear in the annotations.

    Every file returned by ``get_files(path)`` is parsed as XML and the text
    of each ``object/name`` element is tallied. One ``label <tab> count``
    line is printed per label, in order of first appearance.

    NOTE(review): ``get_files`` is called without a suffix here, unlike the
    other helpers in this file — confirm its default selects xml files.

    Args:
        path: Directory handed to ``get_files``.
    """
    xmls = get_files(path)
    labels = {}
    for xml in xmls:
        tree = ET.ElementTree(file=xml)
        # Use a relative path: ElementTree.findall() on a path starting with
        # "/" emits a FutureWarning and is rewritten to "./..." internally.
        for node in tree.findall("./object/name"):
            labels[node.text] = labels.get(node.text, 0) + 1

    for key, value in labels.items():
        print(key, "\t", value)
def get_stats_of_bboxes(xml_dir, save_dir):
    """Collect bounding boxes per label and pickle them to *save_dir*.

    Every ``<object>`` of every xml under *xml_dir* contributes one entry
    ``[xmin, xmax, ymin, ymax]`` (as ints) to the list stored under the
    object's label text. The resulting mapping is pickled to
    ``<save_dir>/statistics.pickle``.

    Args:
        xml_dir: Directory handed to ``get_files`` to collect xml paths.
        save_dir: Existing directory that receives ``statistics.pickle``.
    """
    # Suffix spelled 'xml' (no dot) to match every other get_files call here.
    xmls = get_files(xml_dir, 'xml')
    statistics = defaultdict(list)
    for xml in xmls:
        tree = ET.ElementTree(file=xml)
        for obj in tree.findall("./object"):
            # Key on the label *text*: Element objects hash by identity, so
            # using the element itself would never group boxes by label.
            name = obj.find("./name").text
            corners = [
                obj.find('./bndbox/xmin').text,
                obj.find("./bndbox/xmax").text,
                obj.find("./bndbox/ymin").text,
                obj.find('./bndbox/ymax').text,
            ]
            statistics[name].append([int(value) for value in corners])

    # pickle.dump needs a binary file opened for writing ('wb', not 'rb').
    with open(os.path.join(save_dir, 'statistics.pickle'), 'wb') as f:
        pickle.dump(statistics, f)
def get_mean_of_rgb(path, njob=1):
    """Average the per-channel sums of all jpgs under *path* using a pool.

    The image list is cut into ``njob`` contiguous slices (the last slice
    absorbs any remainder) and each slice is handed to ``_sum_rgb`` in a
    worker process. The three partial sums are added up and divided by the
    number of images.

    NOTE(review): ``_sum_rgb`` is assumed to return an indexable triple of
    (r, g, b) sums for the paths it receives — confirm against its
    definition.

    Args:
        path: Directory handed to ``get_files`` to collect jpg paths.
        njob: Number of worker processes; must be at least 1.

    Returns:
        A tuple ``(r, g, b)`` of the summed channel values divided by the
        image count.

    Raises:
        ValueError: If ``njob`` is less than 1 or no images were found.
    """
    if njob < 1:
        raise ValueError("njob must be a positive integer")

    imgs = get_files(path, 'jpg')
    nums_img = len(imgs)
    if nums_img == 0:
        # Avoid a ZeroDivisionError when computing the means below.
        raise ValueError("no jpg images found under %r" % (path,))

    block = nums_img // njob
    pool = Pool(processes=njob)
    try:
        pending = []
        for k in range(njob):
            start = k * block
            # The final worker also takes whatever the integer division
            # left over.
            stop = nums_img if k == njob - 1 else start + block
            pending.append(pool.apply_async(_sum_rgb, (imgs[start:stop], )))
        # apply_async returns AsyncResult handles; .get() blocks for the
        # actual value and re-raises any exception from the worker.
        partial_sums = [task.get() for task in pending]
    finally:
        pool.close()
        pool.join()

    r = 0
    g = 0
    b = 0
    for part in partial_sums:
        r += part[0]
        g += part[1]
        b += part[2]
    return r / nums_img, g / nums_img, b / nums_img
def find_invalid_jpgs(data_path):
    """Print the path of every jpg under *data_path* that fails validation.

    Validation is delegated to ``is_valid_jpg``; nothing is printed for
    files that pass.

    NOTE(review): this file defines ``find_invalid_jpgs`` twice; this later
    definition is the one bound to the name after import.

    Args:
        data_path: Directory handed to ``get_files`` to collect jpg paths.
    """
    for candidate in get_files(data_path, 'jpg'):
        if is_valid_jpg(candidate):
            continue
        print(candidate)