def test_xml_parse(self):
    """Check what core.parse_xml collects for a manifest with one SMIL entry.

    The manifest iterator is fed a single 'application/smil+xml' item and the
    text iterator is fed an empty list. After parsing, only the fragment
    identifiers are expected to be populated; every other collection on the
    collector must be empty.
    """
    def manifest_entries():
        # A fresh list is built on every call, as the mocked iterator may be
        # invoked more than once.
        return bk_manifest_iter([
            ('media_overlays1', 'file_href1', 'application/smil+xml'),
        ])

    def text_entries():
        return bk_text_iter([])

    self.bk.manifest_iter.side_effect = manifest_entries
    self.bk.text_iter.side_effect = text_entries

    collector = core.XHTMLAttributes()
    core.parse_xml(self.bk, collector, self.prefs)

    expected_fragments = {
        'ch3_figure1',
        'ch3_figure1_title',
        'ch3_figure1_caption',
        'ch3_figure1_text1',
        'ch3_figure1_text2',
    }

    self.assertEqual(collector.class_names, set())
    self.assertEqual(collector.literal_class_values, set())
    self.assertEqual(collector.id_values, set())
    self.assertEqual(collector.fragment_identifier, expected_fragments)
    self.assertEqual(collector.info_class_names, {})
    self.assertEqual(collector.info_id_values, {})
def _load_pascal_annotations(_abs_xml_path, _args):
    """
    Collect the category and area of every object listed in an annotation XML.

    The file is read through ``parse_xml`` and the entries under its
    'annotation' key are walked once. A notice is written via ``tqdm.write``
    when the file contains no objects.

    :param _abs_xml_path: absolute path of xml file
    :param _args: input args (not read by this function)
    :return: dict holding two parallel lists, for example
        {'name': [class1, class2, ...], 'area': [area1, area2, ...]}
    """
    parsed = parse_xml(_abs_xml_path)
    entries = parsed['annotation']
    if len(entries) == 0:
        tqdm.write(" There is no objects in xml : %s" % os.path.basename(_abs_xml_path))

    names = []
    areas = []
    for entry in entries:
        names.append(entry['category'])
        areas.append(entry['area'])

    return {'name': names, 'area': areas}
def _load_pascal_annotations(_abs_xml_path, _args):
    """
    Collect the category and area of every object listed in an annotation XML.

    The file is read through ``parse_xml``. When that call returns ``None``
    this function returns ``None`` as well, so callers must handle that case.
    A notice is written via ``tqdm.write`` when the file contains no objects.

    :param _abs_xml_path: absolute path of xml file
    :param _args: input args (not read by this function)
    :return: ``None`` if the XML could not be parsed, otherwise a dict holding
        two parallel lists, for example
        {'name': [class1, class2, ...], 'area': [area1, area2, ...]}
    """
    parsed = parse_xml(_abs_xml_path)
    if parsed is None:
        return None

    entries = parsed['annotation']
    if len(entries) == 0:
        tqdm.write(" There is no objects in xml : %s" % os.path.basename(_abs_xml_path))

    names = []
    areas = []
    for entry in entries:
        # NOTE: a filter that rejected whole XML files containing incorrectly
        # labelled categories ('unknow', 'Copy of bus tail', 'Copy of car tail',
        # 'Copy of car head', 'high_60', 'high_40') used to sit here and is
        # currently disabled; every entry is accepted.
        names.append(entry['category'])
        areas.append(entry['area'])

    return {'name': names, 'area': areas}
if __name__ == '__main__':
    # For every XML file in the source directory, compare the image size
    # recorded in the annotation with the size of the matching PNG and correct
    # the in-memory description when they disagree.
    args = parse_args()
    src_path = args.org_path
    dst_path = args.save_path

    xmls = [item for item in os.listdir(src_path) if item.endswith('.xml')]
    for xml_name in tqdm(xmls, ncols=100):
        abs_src_xml_path = os.path.join(src_path, xml_name)
        # NOTE(review): the destination path is computed but not used in the
        # visible part of this script; presumably a save step consumes it.
        abs_dst_xml_path = os.path.join(dst_path, xml_name)
        abs_image_path = os.path.join(
            args.image_path, "%s.png" % os.path.splitext(xml_name)[0])

        annotation = parse_xml(abs_src_xml_path)
        image_des = annotation['image']

        if not os.path.exists(abs_image_path):
            tqdm.write("Non existed pic path: %s" % abs_image_path)
            continue

        cv_image = cv2.imread(abs_image_path)
        if cv_image is None:
            # cv2.imread signals an unreadable or corrupt file by returning
            # None instead of raising; skip it rather than crash on `.shape`
            # and abort the whole batch.
            tqdm.write("Unreadable pic path: %s" % abs_image_path)
            continue

        image_height, image_width = cv_image.shape[:2]
        if image_height != image_des['height'] or image_width != image_des['width']:
            # Trust the actual image over the size stored in the XML.
            image_des['height'] = image_height
            image_des['width'] = image_width
            tqdm.write("Invalid image width/height found in: %s" % xml_name)
def convert(self):
    """Convert the indexed annotation XML files into one COCO-style JSON file.

    Each name in ``self.m_xml_index`` is resolved to an XML file under
    ``self.m_xml_path`` and parsed with ``parse_xml``. Images whose picture
    file is missing under ``self.m_pics_path`` are reported and skipped.
    Image entries, object annotations and categories are appended to
    ``self.m_json_dict``, which is finally dumped to ``self.m_save_path``.

    Raises:
        ValueError: if an indexed XML file does not exist.
    """
    next_box_id = 1
    next_image_id = 1

    # Walk over every XML file in the index.
    for xml in tqdm(self.m_xml_index, ncols=100, desc="VOC2COCO"):
        abs_xml_path = os.path.join(self.m_xml_path, "%s.xml" % xml)
        if not os.path.exists(abs_xml_path):
            raise ValueError("Non existed xml path: %s" % abs_xml_path)

        parsed = parse_xml(abs_xml_path, "png")
        print(abs_xml_path + '\n')

        image_entry = parsed['image']
        image_entry['id'] = next_image_id
        abs_image_path = os.path.join(self.m_pics_path, image_entry['file_name'])
        if not os.path.exists(abs_image_path):
            # The image id is not advanced for skipped images, so the ids
            # that are written out stay contiguous.
            tqdm.write("Non existed pic path: %s" % abs_image_path)
            continue
        self.m_json_dict['images'].append(image_entry)

        # TODO: Support segmentation. Currently we do not support segmentation.
        boxes = parsed['annotation']
        if len(boxes) == 0:
            tqdm.write(" There is no objects in xml : %s" % os.path.basename(xml))

        for box in boxes:
            # Labels are stored in lower case so that XML files containing
            # upper-case letters are handled as well.
            label = box['category'].lower()
            if label == 'prohibit':
                label = 'limitspeed'

            # Drop labels that are not wanted.
            if label not in self.m_categories:
                continue

            # A filter skipping objects with area below 40*40 used to sit
            # here; it is disabled.

            label_id = self.m_categories[label]
            self.m_json_dict['annotations'].append({
                'area': box['area'],
                'iscrowd': 0,
                'bbox': box['bbox'],
                'category_id': label_id,
                'id': next_box_id,
                'ignore': 0,
                'segmentation': [],
                'image_id': next_image_id,
            })
            next_box_id += 1

        next_image_id += 1

    # Category table; the background entry is left out.
    for name, name_id in self.m_categories.items():
        if name != 'background':
            self.m_json_dict['categories'].append({
                'supercategory': name.split(' ')[0],
                'id': name_id,
                'name': name,
            })

    # Write the assembled dictionary to disk.
    with open(self.m_save_path, 'w') as json_file:
        json.dump(self.m_json_dict, json_file)
# Merge the object annotations of two annotation directories: for every name
# in the image set, the entries of the second XML are appended to those of the
# first and the result is written to the save directory.
xml_list = [stem + '.xml' for stem in mmcv.list_from_file(args.image_set)]

mismatched_nums = 0
# Iterate over the xml list.
for xml in tqdm(xml_list, ncols=100, desc='Merging '):
    abs_src_xml_path1 = os.path.join(args.anno_path1, xml)
    abs_src_xml_path2 = os.path.join(args.anno_path2, xml)

    # Both sources must provide the file; otherwise count it and move on.
    if not os.path.exists(abs_src_xml_path1) or not os.path.exists(abs_src_xml_path2):
        mismatched_nums += 1
        tqdm.write("**** Mismatched xml: %s" % xml)
        continue

    src_anno1 = parse_xml(abs_src_xml_path1)
    src_anno2 = parse_xml(abs_src_xml_path2)

    # Disabled consistency check: the two files are not verified to describe
    # the same image (file_name / width / height).
    # assert src_anno1['image']['file_name'] == src_anno2['image']['file_name'] \
    #     and src_anno1['image']['width'] == src_anno2['image']['width'] \
    #     and src_anno1['image']['height'] == src_anno2['image']['height'], "Mismatched xml %s" % xml

    # Shallow copy: the 'annotation' value is shared with src_anno1, so the
    # in-place += below also modifies src_anno1.
    dst_anno = src_anno1.copy()
    dst_anno['annotation'] += src_anno2['annotation']

    abs_dst_xml_path = os.path.join(args.save_path, xml)
    dump_xml(dst_anno, abs_dst_xml_path)

print("There are %d mismatched xmls in total!!!" % mismatched_nums)