コード例 #1
0
    def process_csv(self, file_path, out_path):
        """
        Collect URLs from a ';'-separated CSV file and write them to a file.

        The first line is treated as a header and skipped. For every other
        line, field 3 is the label and field 5 is the URL (double quotes are
        removed from it). A URL is kept only when
        ``self.check_label_num(label)`` is truthy.

        :param file_path: input CSV file
        :param out_path: output file that receives the kept URLs
        :return: None
        """
        rows = read_file(file_path)  # load the input lines

        kept_urls = []
        for row_no, row in enumerate(rows):
            if row_no == 0:
                continue  # header line
            fields = row.split(';')
            label, url = fields[3], fields[5].replace('"', '')
            if not self.check_label_num(label):
                continue
            kept_urls.append(url)

        print('[Info] 样本数量: {}'.format(len(kept_urls)))
        write_list_to_file(out_path, kept_urls)
        print('[Info] 写入数据完成: {}'.format(out_path))
コード例 #2
0
    def process_csv_v2(self, file_path, out_path):
        """
        Write field 1 of every ';'-separated input line to ``out_path``.

        Unlike ``process_csv`` style header handling, no line is skipped and
        no filtering is applied here.

        :param file_path: input file, one ';'-separated record per line
        :param out_path: output file that receives the extracted values
        :return: None
        """
        # Field 1 (zero-based) of each record is the URL.
        url_list = [record.split(';')[1] for record in read_file(file_path)]

        write_list_to_file(out_path, url_list)
        print('[Info] 写入完成: {}'.format(out_path))
コード例 #3
0
    def process_txt(self, file_path, out_path):
        """
        Write the first tab-separated field of every input line to ``out_path``.

        Each line and its field count are printed while processing.

        :param file_path: input file, one tab-separated record per line
        :param out_path: output file that receives the extracted values
        :return: None
        """
        url_list = []
        for record in read_file(file_path):
            print('[Info] data_line: {}'.format(record))
            fields = record.split('\t')
            print('[Info] items: {}'.format(len(fields)))
            url_list.append(fields[0])  # field 0 is the URL

        write_list_to_file(out_path, url_list)
        print('[Info] 写入完成: {}'.format(out_path))
コード例 #4
0
    def process_txt_v2(self, file_path, out_path):
        """
        Extract the ``url`` value from every dict-like input line.

        Single quotes in a line are replaced by double quotes before the line
        is parsed with ``json.loads``; the value under the ``url`` key is
        collected and the collected values are written to ``out_path``.

        :param file_path: input file, one dict-like record per line
        :param out_path: output file that receives the extracted values
        :return: None
        """
        url_list = [
            json.loads(record.replace("'", '"'))['url']
            for record in read_file(file_path)
        ]

        write_list_to_file(out_path, url_list)
        print('[Info] 写入完成: {}'.format(out_path))
コード例 #5
0
    def merge_files(self):
        """
        Merge every file under ``DATA_DIR/biaozhu_csv_out`` into one text file.

        The lines of all files returned by ``traverse_dir_files`` are
        concatenated in traversal order, double quotes are removed from each
        line, and the result is written to ``DATA_DIR/biaozhu_csv_out.txt``.

        :return: None
        """
        data_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
        paths_list, _ = traverse_dir_files(data_dir)  # file names are not needed
        out_path = os.path.join(DATA_DIR, 'biaozhu_csv_out.txt')

        all_data_lines = []
        for path in paths_list:
            # str.replace returns a new string, so its result must be kept;
            # calling it without using the return value removes nothing.
            all_data_lines.extend(
                data_line.replace('"', '') for data_line in read_file(path)
            )
        write_list_to_file(out_path, all_data_lines)
コード例 #6
0
    def merge(self):
        """
        Merge all files under ``self.out_dir`` into one de-duplicated file.

        The lines of every file are pooled, duplicates are dropped, and the
        remaining lines are sorted. The output path is
        ``DATA_DIR/train_data_v3_out.<count>.txt`` where ``<count>`` is the
        number of unique lines.

        :return: None
        """
        out_format = os.path.join(DATA_DIR, 'train_data_v3_out.{}.txt')
        paths_list, _ = traverse_dir_files(self.out_dir)
        print('[Info] 总文本数: {}'.format(len(paths_list)))

        # Pool the lines in a set so duplicates collapse as they are read.
        unique_lines = set()
        for path in paths_list:
            unique_lines.update(read_file(path))
        merged_lines = sorted(unique_lines)

        n_lines = len(merged_lines)
        out_path = out_format.format(n_lines)
        print('[Info] 总数据量: {}'.format(n_lines))
        write_list_to_file(out_path, merged_lines)
        print('[Info] 写入数据完成: {}'.format(out_path))
コード例 #7
0
ファイル: ps_evaluator.py プロジェクト: SpikeKing/essay-ocr
    def make_html_page(html_file, imgs_path, n=1):
        """
        Render the rows listed in ``imgs_path`` as an HTML table.

        Every input line is split on "," into cells. A cell that starts with
        "http" is emitted as an ``<img>`` tag, any other cell as plain text.
        The first column of each table row shows ``idx / n`` formatted as an
        integer, where ``idx`` is the zero-based row index.

        :param html_file: path of the HTML file to write
        :param imgs_path: text file with one comma-separated row per line
        :param n: divisor applied to the row index shown in the first column
        :return: None
        """
        page_head = """
        <!DOCTYPE html>
        <html>
        <head>
        <title>MathJax TeX Test Page</title>
        <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
        <script type="text/javascript" id="MathJax-script" async
          src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js">
        </script>
        <style>
        img{
            max-height:640px;
            max-width: 640px;
            vertical-align:middle;
        }
        </style>
        </head>
        <body>
        <table>
        """

        page_tail = """
        </table>
        </body>
        </html>
        """

        lines = read_file(imgs_path)
        print('[Info] 样本行数: {}'.format(len(lines)))
        table_rows = [line.split(",") for line in lines]  # one list of cells per row

        with open(html_file, 'w') as out:
            out.write(page_head)
            for row_no, cells in enumerate(table_rows):
                out.write('<tr>\n')
                out.write('<td>%d</td>\n' % (row_no / n))
                for cell in cells:
                    # URLs become images, everything else is written verbatim.
                    if cell.startswith("http"):
                        content = '<img src="%s" width="600">\n' % cell
                    else:
                        content = '%s' % cell
                    out.write('<td>\n' + content + '</td>\n')
                out.write('</tr>\n')
            out.write(page_tail)
コード例 #8
0
    def process(self):
        """
        Process every line of ``self.file_path`` with ``DataPrepareV4.process_url``.

        Line ``idx`` of the input is passed to ``process_url`` together with
        the target path ``<self.out_dir>/<idx>.jpg`` (presumably the line is
        an image URL that gets downloaded there - confirm against
        ``process_url``). Progress is printed every 100 lines.

        The work currently runs sequentially in this process. The worker pool
        is still created so the loop can be switched to
        ``pool.apply_async(DataPrepareV4.process_url, (data_line, out_path))``
        without further changes.

        :return: None
        """
        data_lines = read_file(self.file_path)
        pool = Pool(processes=40)
        try:
            for idx, data_line in enumerate(data_lines):
                out_path = os.path.join(self.out_dir, '{}.jpg'.format(idx))

                DataPrepareV4.process_url(data_line, out_path)
                if idx % 100 == 0:
                    print('[Info] idx: {}'.format(idx))
        finally:
            # Always shut the workers down, even when process_url raises;
            # otherwise the 40 worker processes are left behind.
            pool.close()
            pool.join()

        print('[Info] 下载完成')
コード例 #9
0
    def process_raw_data(self, path):
        """
        Convert raw ``<sep>``-separated records into JSON strings.

        Each line is expected to look like ``id<sep>url<sep>ocr_json``. The
        angle is read from the ``angel`` key of the OCR JSON (the key is
        spelled that way in the data) and converted to ``int``. Lines that
        cannot be parsed are skipped; the number of skipped lines is printed
        so data problems do not go unnoticed.

        :param path: input file; the part after the last "/" is printed as
            the file name
        :return: list of JSON strings, each with the keys "id", "url", "angle"
        """
        file_name = path.split('/')[-1]

        print('[Info] file_name: {}'.format(file_name))
        data_lines = read_file(path)

        res_list = []
        n_skipped = 0
        for data_line in data_lines:
            try:
                # Fewer than three fields raises ValueError on unpacking.
                item_id, url, ocr_json = data_line.split('<sep>')[:3]
                angle = int(json.loads(ocr_json)['angel'])
            except (IndexError, KeyError, TypeError, ValueError):
                # Malformed record: too few fields, invalid JSON, missing key
                # or a non-numeric angle. Unexpected errors still propagate.
                n_skipped += 1
                continue
            res_list.append(json.dumps({"id": item_id, "url": url, "angle": angle}))

        print('[Info] 样本数量: {}'.format(len(res_list)))
        if n_skipped:
            print('[Info] skipped malformed lines: {}'.format(n_skipped))
        return res_list
コード例 #10
0
 def process_path_1(self, path, out_path):
     """
     Write to ``out_path`` the URL of every CSV row whose first label list
     is non-empty.

     Sample input (task 61786, d7af1d07-4833-402d-b988-98277e997d51.csv)
     has the header:
     index;markTime;checkLabel;label;checker;url;marker;_id;taskId

     The first line is printed and otherwise ignored. For every other line,
     field 3 is parsed as JSON and its first element is taken as the label
     list; field 5 is the URL. Rows that raise any exception are skipped
     silently, and such rows also skip the progress print.

     :param path: input CSV file, ';'-separated
     :param out_path: file passed to ``write_line`` for every kept URL
     :return: None
     """
     rows = read_file(path)
     for row_no, row in enumerate(rows):
         if row_no == 0:
             print(row)  # header line
             continue

         try:
             fields = row.split(';')
             label_str, url = fields[3], fields[5]
             first_labels = json.loads(label_str)[0]
             if len(first_labels) > 0:
                 write_line(out_path, url)
         except Exception:
             continue

         if row_no % 1000 == 0:
             print('[Info] idx: {}'.format(row_no))
コード例 #11
0
    def generate_file(file_path, file_idx):
        """
        Build image and label files for one annotation file.

        The first line of ``file_path`` is parsed as JSON with an "images"
        list (``id``, ``file_name``, ``height``, ``width``) and an
        "annotations" list (``image_id``, ``bbox`` as ``[x, y, w, h]``).
        Every box is turned into ``[x1, y1, x2, y2]``, passed through
        ``DatasetGeneratorV2.convert`` and stored as a label line with the
        fixed class id "0". Boxes with non-positive width or height are
        dropped.

        Images are split into train and val: with ``gap = count // 10``, the
        first ``gap * 9`` images are train and the rest are val. Each image is
        downloaded and written below ``DATA_DIR/ps_datasets_v2/images/<split>``
        and its label lines below ``.../labels/<split>``.

        :param file_path: annotation file; only its first line is read
        :param file_idx: index of the file, zero-padded to 4 digits and used
            in the output file names
        :return: None
        """
        file_idx = str(file_idx).zfill(4)
        print('[Info] file_path: {}, file_idx: {}'.format(file_path, file_idx))

        url_format = "http://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_segmentation/" \
                     "datasets/prelabeled-20201224/{}.jpg"
        out_dataset_dir = os.path.join(DATA_DIR, 'ps_datasets_v2')

        out_images_dir = os.path.join(out_dataset_dir, 'images')
        out_images_train_dir = os.path.join(out_images_dir, 'train')
        out_images_val_dir = os.path.join(out_images_dir, 'val')

        out_labels_dir = os.path.join(out_dataset_dir, 'labels')
        out_labels_train_dir = os.path.join(out_labels_dir, 'train')
        out_labels_val_dir = os.path.join(out_labels_dir, 'val')

        # Parents are listed before their children.
        for out_dir in (out_dataset_dir,
                        out_images_dir, out_images_train_dir, out_images_val_dir,
                        out_labels_dir, out_labels_train_dir, out_labels_val_dir):
            mkdir_if_not_exist(out_dir)

        print('[Info] 处理数据开始: {}'.format(file_path))
        data_line = read_file(file_path)[0]
        data_dict = json.loads(data_line)
        print('[Info] keys: {}'.format(data_dict.keys()))

        # image id -> [name without extension, height, width]
        id_name_dict = {}
        for img in data_dict['images']:
            image_name = img['file_name'].split('.')[0]
            id_name_dict[img['id']] = [image_name, img['height'], img['width']]

        # image name -> list of label lines
        image_dict = collections.defaultdict(list)
        for anno in data_dict["annotations"]:
            image_name, ih, iw = id_name_dict[anno['image_id']]
            wh_box = anno['bbox']
            bbox = [wh_box[0], wh_box[1], wh_box[0] + wh_box[2], wh_box[1] + wh_box[3]]
            if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
                continue  # degenerate box
            bbox_yolo = DatasetGeneratorV2.convert(iw, ih, bbox)
            bbox_yolo = [str(round(i, 6)) for i in bbox_yolo]
            image_dict[image_name].append(" ".join(["0", *bbox_yolo]))

        print('[Info] 样本数: {}'.format(len(image_dict)))

        image_name_list = list(image_dict)
        gap = len(image_name_list) // 10
        image_train_list = image_name_list[:gap * 9]
        image_val_list = image_name_list[gap * 9:]
        print('[Info] 训练: {}, 验证: {}'.format(len(image_train_list), len(image_val_list)))

        def export_split(split_name, image_names, images_dir, labels_dir):
            """Download the images of one split and write image + label files."""
            for idx, image_name in enumerate(image_names):
                print('[Info] idx: {}'.format(idx))
                bbox_yolo_list = image_dict[image_name]

                image_url = url_format.format(image_name)
                is_ok, img_bgr = download_url_img(image_url)
                if not is_ok:
                    # Without an image there is nothing valid to hand to
                    # cv2.imwrite; skip the sample instead of failing the run.
                    # The index is still consumed so other names stay stable.
                    print('[Warning] download failed, skipped: {}'.format(image_url))
                    continue

                out_name = "{}_{}_{}".format(split_name, file_idx, str(idx).zfill(6))
                img_path = os.path.join(images_dir, '{}.jpg'.format(out_name))
                cv2.imwrite(img_path, img_bgr)  # write the image
                print('[Info] img_path: {}'.format(img_path))

                lbl_path = os.path.join(labels_dir, '{}.txt'.format(out_name))
                write_list_to_file(lbl_path, bbox_yolo_list)
                print('[Info] lbl_path: {}'.format(lbl_path))

                print('[Info] ' + "-" * 100)

        export_split("train", image_train_list, out_images_train_dir, out_labels_train_dir)
        export_split("val", image_val_list, out_images_val_dir, out_labels_val_dir)
        print('[Info] 处理完成! {}'.format(file_path))
コード例 #12
0
    tail = """
</table>
</body>
</html>
"""
    # image_dir = sys.argv[1]
    # image_paths = get_image_paths(image_dir)
    # html_file = sys.argv[2]
    # image_paths = ["https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg",
    #                "https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg"]
    # image_paths2 = ["https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg",
    #                "https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg"]

    image_file1 = os.path.join(DATA_DIR, "url_0108_高航.out1.txt")
    image_file2 = os.path.join(DATA_DIR, "url_0108_高航.out2.txt")
    image_paths = read_file(image_file1)
    image_paths2 = read_file(image_file2)
    html_file = "xxx.html"
    with open(html_file, 'w') as f:
        f.write(header)
        for index, (image_path,
                    image_path2) in enumerate(zip(image_paths, image_paths2)):
            f.write('<tr>\n')
            f.write('<td>%d</td>\n' % (index + 1))
            f.write('<td>\n')
            f.write('<img src="%s" width="400">\t' % (image_path))
            f.write('<img src="%s" width="400">\n' % (image_path2))
            f.write('</td>\n')
            f.write('</tr>\n')
        f.write(tail)