Ejemplo n.º 1
0
    def process_folder(self, in_dir, out_dir, n_prc=40):
        """
        Process every file found under ``in_dir`` with a pool of workers.

        The files are listed with ``traverse_dir_files``, shuffled with a
        fixed seed, and each one is submitted to
        ``DataProcessor.process_img(path, name, out_dir)``.

        :param in_dir: input folder to traverse
        :param out_dir: output folder; created if it does not exist
        :param n_prc: number of pool workers (defaults to the previous
            hard-coded value of 40)
        :return: None
        """
        print('[Info] in_dir: {}'.format(in_dir))
        print('[Info] out_dir: {}'.format(out_dir))
        mkdir_if_not_exist(out_dir)

        paths_list, names_list = traverse_dir_files(in_dir)
        print('[Info] 待处理文件数量: {}'.format(len(paths_list)))

        # NOTE: this reseeds the global ``random`` module so that the shuffle
        # order is the same on every run.
        random.seed(47)
        paths_list, names_list = shuffle_two_list(paths_list, names_list)

        def report_error(exc):
            # Without an error callback, an exception raised inside a worker
            # is stored on the (discarded) async result and never surfaces.
            print('[Error] process_img failed: {!r}'.format(exc))

        pool = Pool(processes=n_prc)

        for idx, (path, name) in enumerate(zip(paths_list, names_list)):
            pool.apply_async(DataProcessor.process_img,
                             args=(path, name, out_dir),
                             error_callback=report_error)
            # Progress report on submission (not on completion).
            if (idx + 1) % 1000 == 0:
                print('[Info] num: {}'.format(idx + 1))

        # Stop accepting new tasks, then wait for all submitted ones.
        pool.close()
        pool.join()

        print('[Info] 处理完成! {}'.format(out_dir))
    def process(self):
        """
        Convert every file under ``DATA_DIR/biaozhu_csv``.

        For each input file, the output name is built from the first two
        space-separated parts of the file name joined by ``_``, with a
        ``.txt`` extension, inside ``DATA_DIR/biaozhu_csv_out``. The actual
        conversion is delegated to ``self.process_path_1``.
        """
        src_dir = os.path.join(DATA_DIR, 'biaozhu_csv')
        dst_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
        mkdir_if_not_exist(dst_dir)

        file_paths, file_names = traverse_dir_files(src_dir)
        for src_path, src_name in zip(file_paths, file_names):
            print('[Info] path: {}'.format(src_path))
            # Keep only the first two space-separated tokens of the name.
            prefix = "_".join(src_name.split(' ')[:2])
            dst_path = os.path.join(dst_dir, '{}.txt'.format(prefix))
            self.process_path_1(src_path, dst_path)
    def process_folder(self, img_dir, out_dir):
        """
        Process every image in a folder and write the resulting patches.

        Each input file is passed to ``self.process_img``; every patch it
        returns is written as ``<stem>.o<idx>.jpg`` in ``out_dir``, where
        ``<stem>`` is the part of the file name before the first ``.``.

        :param img_dir: input folder
        :param out_dir: output folder; created if it does not exist
        :return: None
        """
        print('[Info] 处理文件夹: {}'.format(img_dir))
        print('[Info] 输出文件夹: {}'.format(out_dir))
        mkdir_if_not_exist(out_dir)

        paths_list, names_list = traverse_dir_files(img_dir)

        for path, name in zip(paths_list, names_list):
            patch_list = self.process_img(path)
            stem = name.split('.')[0]
            for idx, img_p in enumerate(patch_list):
                # Format only the file name and pass the stem as an argument,
                # so that '{' or '}' in out_dir or in the stem is never
                # interpreted as a format placeholder.
                out_name = "{}.o{}.jpg".format(stem, idx)
                out_path = os.path.join(out_dir, out_name)
                cv2.imwrite(out_path, img_p)

        print('[Info] 处理完成: {}'.format(out_dir))
    def __init__(self):
        """
        Set up the input/output paths for the ``train_data_v3`` data set and
        create the output directories.
        """
        join = os.path.join

        # ---- input folders ----
        self.file_dir = join(DATA_DIR, 'train_data_v3')
        self.folder1_dir = join(self.file_dir, 'formula_dec')
        self.folder2_dir = join(self.file_dir, 'wrote_formula_dec')
        self.folder3_dir = join(self.file_dir, 'text_dec', 'csv')
        self.folder4_dir = join(self.file_dir, 'text_dec')
        self.folder5_dir = join(self.file_dir, 'formula_detection')

        # ---- input files, all located in folder4_dir ----
        text_dir = self.folder4_dir
        self.file1_path = join(text_dir, 'raw_ciyubuquan.txt')
        self.file2_path = join(text_dir, 'raw_shouxietouzi.txt')
        self.file3_path = join(text_dir, 'text_formula_biaozhu_1.txt')
        self.file4_path = join(
            text_dir, 'yuwen_text_detection_1105_zl_1w_clean.csv')
        self.file5_path = join(text_dir, 'wrote_touzi_all.txt')
        self.file6_path = join(text_dir, 'jingbiao.txt')

        # ---- output folders ----
        self.out_dir = join(DATA_DIR, 'train_data_v3_out')
        self.out1_dir = join(self.out_dir, 'formula_dec_out')
        self.out2_dir = join(self.out_dir, 'wrote_formula_dec_out')
        self.out3_up_dir = join(self.out_dir, 'text_dec_out')
        self.out3_dir = join(self.out3_up_dir, 'csv_out')
        self.out5_dir = join(self.out_dir, 'formula_detection_out')

        # Parents are created before their children (out_dir first,
        # out3_up_dir before out3_dir).
        for folder in (self.out_dir, self.out1_dir, self.out2_dir,
                       self.out3_up_dir, self.out3_dir, self.out5_dir):
            mkdir_if_not_exist(folder)

        # ---- output files, all located in out3_up_dir ----
        text_out_dir = self.out3_up_dir
        self.out1_path = join(text_out_dir, 'raw_ciyubuquan.out.txt')
        self.out2_path = join(text_out_dir, 'raw_shouxietouzi.out.txt')
        self.out3_path = join(text_out_dir, 'text_formula_biaozhu_1.out.txt')
        self.out4_path = join(
            text_out_dir, 'yuwen_text_detection_1105_zl_1w_clean.out.txt')
        self.out5_path = join(text_out_dir, 'wrote_touzi_all.out.txt')
        self.out6_path = join(text_out_dir, 'jingbiao.out.txt')
 def __init__(self):
     """
     Record the sample input file path and create the output folder.
     """
     # Output folder, created up front if it is missing.
     self.out_dir = os.path.join(DATA_DIR, 'sample_complex_formula_out')
     mkdir_if_not_exist(self.out_dir)

     # Input file; only the path is stored here, nothing is read.
     self.file_path = os.path.join(DATA_DIR, 'sample_complex_formula.txt')
Ejemplo n.º 6
0
 def __init__(self, model_path, out_dir):
     """
     Store the model path and output folder, creating the folder if needed.

     :param model_path: path of the model
     :param out_dir: output folder
     """
     self.model_path, self.out_dir = model_path, out_dir
     mkdir_if_not_exist(out_dir)

     # Echo the configuration for the run log.
     for template, value in (('[Info] 模型路径: {}', model_path),
                             ('[Info] 输出文件夹: {}', out_dir)):
         print(template.format(value))
    def generate_file(file_path, file_idx):
        """
        Build train/val image and label files from one annotation file.

        The first line of ``file_path`` is parsed as JSON with an ``images``
        list (``id``, ``file_name``, ``height``, ``width``) and an
        ``annotations`` list (``image_id``, ``bbox`` as ``[x, y, w, h]``).
        Each box is converted with ``DatasetGeneratorV2.convert`` and stored
        as one label line of class ``0``. The images that have at least one
        box are split 9:1 into train/val, downloaded, and written together
        with their label files under ``DATA_DIR/ps_datasets_v2``.

        NOTE(review): this function takes no ``self``; presumably it is
        declared as a static method outside the visible lines — confirm.

        :param file_path: path of the annotation file
        :param file_idx: index of the file, zero-padded to 4 digits and used
            in the output file names
        :return: None
        """
        file_idx = str(file_idx).zfill(4)
        print('[Info] file_path: {}, file_idx: {}'.format(file_path, file_idx))

        url_format = "http://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_segmentation/" \
                          "datasets/prelabeled-20201224/{}.jpg"
        out_dataset_dir = os.path.join(DATA_DIR, 'ps_datasets_v2')

        out_images_dir = os.path.join(out_dataset_dir, 'images')
        out_images_train_dir = os.path.join(out_images_dir, 'train')
        out_images_val_dir = os.path.join(out_images_dir, 'val')

        out_labels_dir = os.path.join(out_dataset_dir, 'labels')
        out_labels_train_dir = os.path.join(out_labels_dir, 'train')
        out_labels_val_dir = os.path.join(out_labels_dir, 'val')

        # Parents are created before their children.
        for folder in (out_dataset_dir,
                       out_images_dir, out_images_train_dir, out_images_val_dir,
                       out_labels_dir, out_labels_train_dir, out_labels_val_dir):
            mkdir_if_not_exist(folder)

        print('[Info] 处理数据开始: {}'.format(file_path))
        data_line = read_file(file_path)[0]
        data_dict = json.loads(data_line)
        print('[Info] keys: {}'.format(data_dict.keys()))

        # image id -> [name without extension, height, width]
        id_name_dict = {
            img['id']: [img['file_name'].split('.')[0], img['height'], img['width']]
            for img in data_dict['images']
        }

        # image name -> list of label lines
        image_dict = collections.defaultdict(list)
        for anno in data_dict["annotations"]:
            image_name, ih, iw = id_name_dict[anno['image_id']]
            x, y, w, h = anno['bbox'][:4]
            # [x, y, w, h] -> [x_min, y_min, x_max, y_max]
            bbox = [x, y, x + w, y + h]
            if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
                continue  # skip boxes with zero or negative size
            bbox_yolo = DatasetGeneratorV2.convert(iw, ih, bbox)
            bbox_yolo = [str(round(i, 6)) for i in bbox_yolo]
            image_dict[image_name].append(" ".join(["0", *bbox_yolo]))

        print('[Info] 样本数: {}'.format(len(image_dict)))

        # 9:1 split; with fewer than 10 images gap is 0 and all go to val.
        image_name_list = list(image_dict)
        gap = len(image_name_list) // 10
        image_train_list = image_name_list[:gap*9]
        image_val_list = image_name_list[gap*9:]
        print('[Info] 训练: {}, 验证: {}'.format(len(image_train_list), len(image_val_list)))

        def write_split(prefix, name_list, images_dir, labels_dir):
            # Download each image and write it together with its label file.
            for idx, image_name in enumerate(name_list):
                print('[Info] idx: {}'.format(idx))
                bbox_yolo_list = image_dict[image_name]

                image_url = url_format.format(image_name)
                is_ok, img_bgr = download_url_img(image_url)
                # Assumes is_ok is falsy when the download failed — confirm
                # against download_url_img. A failed sample is skipped so that
                # no label file is written without its image.
                if not is_ok or img_bgr is None:
                    print('[Error] download failed, skipped: {}'.format(image_url))
                    continue

                out_name = "{}_{}_{}".format(prefix, file_idx, str(idx).zfill(6))
                img_path = os.path.join(images_dir, '{}.jpg'.format(out_name))
                cv2.imwrite(img_path, img_bgr)  # write the image
                print('[Info] img_path: {}'.format(img_path))

                lbl_path = os.path.join(labels_dir, '{}.txt'.format(out_name))
                write_list_to_file(lbl_path, bbox_yolo_list)
                print('[Info] lbl_path: {}'.format(lbl_path))

                print('[Info] ' + "-" * 100)

        write_split("train", image_train_list, out_images_train_dir, out_labels_train_dir)
        write_split("val", image_val_list, out_images_val_dir, out_labels_val_dir)
        print('[Info] 处理完成! {}'.format(file_path))