def process_csv(self, file_path, out_path):
        """
        Collect the URLs of accepted rows from a ';'-separated file.

        The first line is skipped (presumably a header row). For each
        remaining line, field 3 is the label and field 5, with double quotes
        removed, is the URL; the URL is kept when
        ``self.check_label_num(label)`` is truthy.

        :param file_path: input CSV file
        :param out_path: output file that receives the kept URLs
        :return: None
        """
        rows = iter(read_file(file_path))
        next(rows, None)  # drop the first line

        kept_urls = []
        for row in rows:
            fields = row.split(';')
            # Both fields are read before the label check, so a short row
            # fails here regardless of its label.
            label, quoted_url = fields[3], fields[5]
            url = quoted_url.replace("\"", "")
            if self.check_label_num(label):
                kept_urls.append(url)

        print('[Info] 样本数量: {}'.format(len(kept_urls)))
        write_list_to_file(out_path, kept_urls)
        print('[Info] 写入数据完成: {}'.format(out_path))
    def process_csv_v2(self, file_path, out_path):
        """
        Write the second ';'-separated field of every input line to a file.

        Every line is processed (no header is skipped); field 1 is treated
        as the URL.

        :param file_path: input file with ';'-separated lines
        :param out_path: output file that receives one URL per entry
        :return: None
        """
        second_fields = [line.split(';')[1] for line in read_file(file_path)]

        write_list_to_file(out_path, second_fields)
        print('[Info] 写入完成: {}'.format(out_path))
    def process_txt_v2(self, file_path, out_path):
        """
        Extract the ``url`` entry from every line of a file of dict records.

        Each line is parsed as JSON. A line that is not valid JSON as written
        (for example because it uses single-quoted keys and strings) is
        retried after replacing every single quote with a double quote.
        Parsing the raw line first keeps valid JSON that merely contains an
        apostrophe inside a value from being corrupted by the replacement.

        :param file_path: input file, one dict-like record per line
        :param out_path: output file that receives the extracted URLs
        :return: None
        :raises ValueError: if a line cannot be parsed even after the
            quote replacement
        :raises KeyError: if a parsed record has no ``url`` key
        """
        data_lines = read_file(file_path)

        url_list = []
        for data_line in data_lines:
            try:
                data_dict = json.loads(data_line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass; fall back to
                # the quote-replacement workaround for single-quoted records.
                data_dict = json.loads(data_line.replace("'", "\""))
            url_list.append(data_dict['url'])

        write_list_to_file(out_path, url_list)
        print('[Info] 写入完成: {}'.format(out_path))
    def process_txt(self, file_path, out_path):
        """
        Write the first tab-separated column of every input line to a file.

        Each line and its column count are printed while processing.

        :param file_path: input file with tab-separated lines
        :param out_path: output file that receives the first column values
        :return: None
        """
        first_columns = []
        for line in read_file(file_path):
            print('[Info] data_line: {}'.format(line))
            columns = line.split('\t')
            print('[Info] items: {}'.format(len(columns)))
            first_columns.append(columns[0])

        write_list_to_file(out_path, first_columns)
        print('[Info] 写入完成: {}'.format(out_path))
    def merge_files(self):
        """
        Merge every file under ``DATA_DIR/biaozhu_csv_out`` into one file.

        The lines of all files returned by ``traverse_dir_files`` are
        concatenated in the order returned, double quotes are removed from
        each line, and the result is written to
        ``DATA_DIR/biaozhu_csv_out.txt``.

        :return: None
        """
        data_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
        paths_list, _ = traverse_dir_files(data_dir)
        out_path = os.path.join(DATA_DIR, 'biaozhu_csv_out.txt')

        all_data_lines = []
        for path in paths_list:
            for data_line in read_file(path):
                # str.replace returns a new string; the result has to be
                # stored, otherwise the quotes are never actually removed.
                all_data_lines.append(data_line.replace("\"", ""))
        write_list_to_file(out_path, all_data_lines)
    def merge(self):
        """
        Merge all files under ``self.out_dir`` into one de-duplicated file.

        Lines from every file are pooled, duplicates are dropped, and the
        remaining lines are sorted. The output path is
        ``DATA_DIR/train_data_v3_out.<count>.txt``, where ``<count>`` is the
        number of unique lines.

        :return: None
        """
        path_template = os.path.join(DATA_DIR, 'train_data_v3_out.{}.txt')
        file_paths, _ = traverse_dir_files(self.out_dir)
        print('[Info] 总文本数: {}'.format(len(file_paths)))

        pooled_lines = []
        for file_path in file_paths:
            pooled_lines.extend(read_file(file_path))

        unique_lines = sorted(set(pooled_lines))
        line_count = len(unique_lines)
        target_path = path_template.format(line_count)
        print('[Info] 总数据量: {}'.format(line_count))
        write_list_to_file(target_path, unique_lines)
        print('[Info] 写入数据完成: {}'.format(target_path))
# Example #7
# 0
    def process(self):
        """
        Process every file under ``DATA_DIR/2020_11_12`` into one output file.

        Each file found by ``traverse_dir_files`` is passed to
        ``self.process_raw_data`` and the returned items are pooled in
        order. The pooled list is written to
        ``DATA_DIR/2020_11_12_out.<count>.txt``, where ``<count>`` is the
        number of pooled items.

        :return: None
        """
        day_folder = "2020_11_12"
        source_dir = os.path.join(DATA_DIR, day_folder)
        out_template = os.path.join(DATA_DIR, day_folder + "_out.{}.txt")

        file_paths, file_names = traverse_dir_files(source_dir)

        samples = []
        for file_path, _ in zip(file_paths, file_names):
            # Per the original comment, process_raw_data filters images by
            # width and height.
            samples.extend(self.process_raw_data(file_path))
        sample_count = len(samples)
        print('[Info] 样本数: {}'.format(sample_count))

        target_path = out_template.format(sample_count)
        write_list_to_file(target_path, samples)
        print('[Info] 写入文件完成! {}'.format(target_path))
    def generate_file(file_path, file_idx):
        """
        Build train/val image and label files from one annotation file.

        The first line of ``file_path`` is parsed as a JSON object with
        ``images`` and ``annotations`` lists (the layout resembles COCO:
        each annotation has an ``image_id`` and a ``bbox`` used here as
        ``[x, y, width, height]``). Every non-degenerate box is converted
        with ``DatasetGeneratorV2.convert`` and stored as a label line
        ``"0 <converted values>"`` for its image.

        Annotated images are split in order: the first ``9 * (n // 10)`` go
        to train and the rest to val, so with fewer than 10 images all of
        them land in val. Each image is downloaded and written together
        with its label file under ``DATA_DIR/ps_datasets_v2``, named
        ``<split>_<file_idx>_<idx>``. Images whose download fails are
        skipped instead of being handed to ``cv2.imwrite``.

        NOTE(review): declared without ``self``; presumably called through
        the class as a static method - confirm at the call site.

        :param file_path: annotation file; only its first line is read
        :param file_idx: file index, zero-padded to 4 digits in output names
        :return: None
        """
        file_idx = str(file_idx).zfill(4)
        print('[Info] file_path: {}, file_idx: {}'.format(file_path, file_idx))

        url_format = "http://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_segmentation/" \
                     "datasets/prelabeled-20201224/{}.jpg"
        out_dataset_dir = os.path.join(DATA_DIR, 'ps_datasets_v2')

        out_images_dir = os.path.join(out_dataset_dir, 'images')
        out_images_train_dir = os.path.join(out_images_dir, 'train')
        out_images_val_dir = os.path.join(out_images_dir, 'val')

        out_labels_dir = os.path.join(out_dataset_dir, 'labels')
        out_labels_train_dir = os.path.join(out_labels_dir, 'train')
        out_labels_val_dir = os.path.join(out_labels_dir, 'val')

        # Parents are listed before their children, as in the original order.
        for out_dir in (
                out_dataset_dir,
                out_images_dir,
                out_images_train_dir,
                out_images_val_dir,
                out_labels_dir,
                out_labels_train_dir,
                out_labels_val_dir,
        ):
            mkdir_if_not_exist(out_dir)

        def export_split(split_name, image_names, images_dir, labels_dir):
            # Download each image of one split and write its image/label pair.
            for idx, image_name in enumerate(image_names):
                print('[Info] idx: {}'.format(idx))
                bbox_yolo_list = image_dict[image_name]

                image_url = url_format.format(image_name)
                is_ok, img_bgr = download_url_img(image_url)
                if not is_ok or img_bgr is None:
                    # Presumably is_ok is False when the download failed;
                    # skip the sample rather than pass an invalid image on.
                    print('[Warn] download failed, skip: {}'.format(image_url))
                    continue

                # idx comes from enumerate, so skipped images leave gaps and
                # the names of the remaining files stay stable across runs.
                out_name = "{}_{}_{}".format(split_name, file_idx, str(idx).zfill(6))
                img_path = os.path.join(images_dir, '{}.jpg'.format(out_name))
                cv2.imwrite(img_path, img_bgr)  # write the image
                print('[Info] img_path: {}'.format(img_path))

                lbl_path = os.path.join(labels_dir, '{}.txt'.format(out_name))
                write_list_to_file(lbl_path, bbox_yolo_list)
                print('[Info] lbl_path: {}'.format(lbl_path))

                print('[Info] ' + "-" * 100)

        print('[Info] 处理数据开始: {}'.format(file_path))
        data_line = read_file(file_path)[0]
        data_dict = json.loads(data_line)
        print('[Info] keys: {}'.format(data_dict.keys()))
        images = data_dict['images']

        # image id -> [name without extension, height, width]
        id_name_dict = {}
        for img in images:
            image_name = img['file_name'].split('.')[0]
            id_name_dict[img['id']] = [image_name, img['height'], img['width']]

        annotations = data_dict["annotations"]

        # image name -> list of label lines
        image_dict = collections.defaultdict(list)
        for anno in annotations:
            image_name, ih, iw = id_name_dict[anno['image_id']]
            x, y, w, h = anno['bbox'][:4]
            bbox = [x, y, x + w, y + h]  # corner form: x_min, y_min, x_max, y_max
            if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
                continue  # drop boxes with non-positive width or height
            bbox_yolo = DatasetGeneratorV2.convert(iw, ih, bbox)
            bbox_yolo = [str(round(i, 6)) for i in bbox_yolo]

            # "0" is the only class id written by this method.
            image_dict[image_name].append(" ".join(["0", *bbox_yolo]))

        print('[Info] 样本数: {}'.format(len(image_dict)))

        image_name_list = list(image_dict)
        gap = len(image_name_list) // 10
        image_train_list = image_name_list[:gap * 9]
        image_val_list = image_name_list[gap * 9:]
        print('[Info] 训练: {}, 验证: {}'.format(len(image_train_list), len(image_val_list)))

        export_split("train", image_train_list, out_images_train_dir, out_labels_train_dir)
        export_split("val", image_val_list, out_images_val_dir, out_labels_val_dir)
        print('[Info] 处理完成! {}'.format(file_path))