Example #1
    def process_folder(self, in_dir, out_dir):
        """
        Process a folder of images.
        """
        print('[Info] in_dir: {}'.format(in_dir))
        print('[Info] out_dir: {}'.format(out_dir))
        mkdir_if_not_exist(out_dir)

        paths_list, names_list = traverse_dir_files(in_dir)
        print('[Info] files to process: {}'.format(len(paths_list)))

        random.seed(47)
        paths_list, names_list = shuffle_two_list(paths_list, names_list)

        n_prc = 40
        pool = Pool(processes=n_prc)  # process images in parallel (worker processes, not threads)

        for idx, (path, name) in enumerate(zip(paths_list, names_list)):
            pool.apply_async(DataProcessor.process_img,
                             args=(path, name, out_dir))
            # DataProcessor.process_img(path, name, out_dir)
            if (idx + 1) % 1000 == 0:
                print('[Info] num: {}'.format(idx + 1))

        # wait for all worker processes to finish
        pool.close()
        pool.join()

        print('[Info] done! {}'.format(out_dir))
        return
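All of these snippets lean on the same small set of project utilities, none of which are shown in the examples. A minimal sketch of what they might look like, assuming traverse_dir_files returns parallel lists of full paths and bare file names:

import os
import random


def traverse_dir_files(root_dir, ext=None, is_sorted=True):
    # hypothetical sketch: walk root_dir, return parallel (paths, names) lists
    paths, names = [], []
    for cur_dir, _, files in os.walk(root_dir):
        for f in files:
            if ext and not f.endswith(ext):
                continue
            paths.append(os.path.join(cur_dir, f))
            names.append(f)
    if is_sorted and paths:
        paths, names = (list(t) for t in zip(*sorted(zip(paths, names))))
    return paths, names


def mkdir_if_not_exist(dir_path):
    # hypothetical sketch: create dir_path (and parents) if missing
    os.makedirs(dir_path, exist_ok=True)


def shuffle_two_list(list_a, list_b):
    # hypothetical sketch: shuffle two lists with one shared permutation
    pairs = list(zip(list_a, list_b))
    random.shuffle(pairs)
    if not pairs:
        return [], []
    list_a, list_b = (list(t) for t in zip(*pairs))
    return list_a, list_b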
    def check(self):
        data_dir = os.path.join(ROOT_DIR, '..', 'datasets', 'datasets_v3')
        paths_list, names_list = traverse_dir_files(data_dir, is_sorted=False)
        print('[Info] file count: {}'.format(len(paths_list)))
        for path in paths_list:
            # strip anything after '?' (e.g. a leftover URL query suffix) from the filename
            x_path = path.split("?")[0]
            shutil.move(path, x_path)
        print('[Info] done!')
def get_problems_data(img_dir):
    image_paths, image_names = traverse_dir_files(img_dir, is_sorted=False)

    # 95% train images and 5% test images
    n_train_samples = int(len(image_paths) * 0.95)
    train_filenames = image_paths[:n_train_samples]
    test_filenames = image_paths[n_train_samples:]

    return train_filenames, test_filenames
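Note that get_problems_data splits the unsorted traversal order directly, so the train/test split is only as random as the filesystem order. A hypothetical variant that shuffles with a fixed seed first (reusing seed 47 from the first example) for a reproducible split:

import random


def get_problems_data_shuffled(img_dir, train_ratio=0.95, seed=47):
    # hypothetical variant: shuffle before splitting for a reproducible split
    image_paths, image_names = traverse_dir_files(img_dir, is_sorted=False)
    random.seed(seed)
    random.shuffle(image_paths)
    n_train = int(len(image_paths) * train_ratio)
    return image_paths[:n_train], image_paths[n_train:]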
    def process(self):
        data_dir = os.path.join(DATA_DIR, 'biaozhu_csv')
        out_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
        mkdir_if_not_exist(out_dir)

        paths_list, names_list = traverse_dir_files(data_dir)
        for path, name in zip(paths_list, names_list):
            print('[Info] path: {}'.format(path))
            name_items = name.split(' ')
            out_name = "_".join(name_items[0:2])
            out_path = os.path.join(out_dir, '{}.txt'.format(out_name))
            self.process_path_1(path, out_path)
Example #5
    def filter_folder(self, in_dir):
        paths_list, names_list = traverse_dir_files(in_dir)
        print('[Info] sample count: {}'.format(len(paths_list)))

        n_remove = 0
        count = 0
        for path, name in zip(paths_list, names_list):
            img_bgr = cv2.imread(path)
            if img_bgr is None:  # cv2.imread returns None for unreadable images
                continue
            h, w, _ = img_bgr.shape
            x = safe_div(h, w)  # height/width aspect ratio

            if x > 2:
                print('[Info] removing: {}'.format(path))
                os.remove(path)
                n_remove += 1
            count += 1
            if count % 100 == 0:
                print(count)

        print('[Info] removed: {}'.format(n_remove))
        paths_list, names_list = traverse_dir_files(in_dir)
        print('[Info] after filtering, sample count: {}'.format(len(paths_list)))
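safe_div is another project helper that is not shown; presumably it guards the height/width ratio against a zero denominator. A minimal sketch under that assumption:

def safe_div(a, b):
    # hypothetical sketch: return a / b, or 0 when the denominator is 0
    return a / b if b != 0 else 0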
    def merge_files(self):
        data_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
        paths_list, names_list = traverse_dir_files(data_dir)
        out_path = os.path.join(DATA_DIR, 'biaozhu_csv_out.txt')

        all_data_lines = []
        for path, name in zip(paths_list, names_list):
            data_lines = read_file(path)

            for data_line in data_lines:
                data_line = data_line.replace("\"", "")  # strip double quotes; str.replace returns a new string
                all_data_lines.append(data_line)
        write_list_to_file(out_path, all_data_lines)
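read_file and write_list_to_file are likewise assumed utilities. A minimal line-oriented sketch, assuming UTF-8 text and one list item per line:

def read_file(path, encoding='utf-8'):
    # hypothetical sketch: read a text file into a list of lines without newlines
    with open(path, 'r', encoding=encoding) as f:
        return [line.rstrip('\n') for line in f]


def write_list_to_file(path, lines, encoding='utf-8'):
    # hypothetical sketch: write one list item per line
    with open(path, 'w', encoding=encoding) as f:
        for line in lines:
            f.write('{}\n'.format(line))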
    def merge(self):
        out_format = os.path.join(DATA_DIR, 'train_data_v3_out.{}.txt')
        paths_list, names_list = traverse_dir_files(self.out_dir)
        print('[Info] total file count: {}'.format(len(paths_list)))
        all_data_lines = []
        for path in paths_list:
            data_lines = read_file(path)
            all_data_lines += data_lines

        all_data_lines = sorted(set(all_data_lines))  # deduplicate, then sort
        out_path = out_format.format(len(all_data_lines))
        print('[Info] total line count: {}'.format(len(all_data_lines)))
        write_list_to_file(out_path, all_data_lines)
        print('[Info] finished writing: {}'.format(out_path))
def process():
    dir_path = os.path.join(DATA_DIR, 'ps_datasets_v2_raw')
    paths_list, names_list = traverse_dir_files(dir_path)

    pool = Pool(processes=20)

    for file_idx, (path, name) in enumerate(zip(paths_list, names_list)):
        # DatasetGeneratorV2.generate_file(path)
        print('[Info] path: {}'.format(path))
        pool.apply_async(DatasetGeneratorV2.generate_file, (path, file_idx))

    pool.close()
    pool.join()
    print('[Info] all done: {}'.format(dir_path))
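In both Pool-based snippets the AsyncResult objects returned by apply_async are discarded, so a worker that raises an exception fails silently. A sketch of the same loop that keeps the results and re-raises worker errors, assuming the same DatasetGeneratorV2.generate_file task:

from multiprocessing import Pool


def process_checked():
    dir_path = os.path.join(DATA_DIR, 'ps_datasets_v2_raw')
    paths_list, names_list = traverse_dir_files(dir_path)

    pool = Pool(processes=20)
    results = [pool.apply_async(DatasetGeneratorV2.generate_file, (path, file_idx))
               for file_idx, path in enumerate(paths_list)]
    pool.close()
    pool.join()

    for res in results:
        res.get()  # get() re-raises any exception that occurred in the worker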
    def process_folder(self, folder_dir, out_folder):
        """
        处理CSV文件夹
        :param folder_dir: 文件夹
        :param out_folder: 输出文件夹
        :return: None
        """
        print('[Info] 待处理文件夹: {}'.format(folder_dir))
        paths_list, names_list = traverse_dir_files(folder_dir, ext='csv')
        print('[Info] 文件数量: {}'.format(len(paths_list)))

        for path, name in zip(paths_list, names_list):
            print('[Info] path: {}'.format(path))
            file_name = name.split('.')[0]
            out_path = os.path.join(out_folder, '{}.out.txt'.format(file_name))
            create_file(out_path)
            self.process_csv(path, out_path)
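create_file above is presumably a truncate-or-create helper; a one-line sketch under that assumption:

def create_file(path):
    # hypothetical sketch: create path as an empty file, truncating if it exists
    open(path, 'w').close()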
    def process_folder(self, img_dir, out_dir):
        """
        处理文件夹
        :param img_dir: 输入文件夹
        :param out_dir: 输出文件夹
        :return: None
        """
        print('[Info] 处理文件夹: {}'.format(img_dir))
        print('[Info] 输出文件夹: {}'.format(out_dir))
        mkdir_if_not_exist(out_dir)

        paths_list, names_list = traverse_dir_files(img_dir)

        for path, name in zip(paths_list, names_list):
            patch_list = self.process_img(path)
            out_name_f = name.split('.')[0] + ".o{}.jpg"
            out_path_f = os.path.join(out_dir, out_name_f)
            for idx, img_p in enumerate(patch_list):
                out_path = out_path_f.format(idx)
                cv2.imwrite(out_path, img_p)

        print('[Info] done: {}'.format(out_dir))
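One caveat on the filename handling here and in the CSV variant: name.split('.')[0] drops everything after the first dot, so a name like img.v2.jpg collapses to img. If the intent is only to strip the extension, os.path.splitext keeps interior dots:

import os

stem = os.path.splitext('img.v2.jpg')[0]  # 'img.v2' (split('.')[0] would give 'img')
out_name_f = stem + '.o{}.jpg'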