def process_csv(self, file_path, out_path):
    """
    Extract URLs from a ';'-separated CSV file and write them to a file.

    The first line is skipped. For every other line, field index 3 is taken
    as the label and field index 5 (with double quotes removed) as the URL.
    A URL is kept only when ``self.check_label_num(label)`` is truthy.

    :param file_path: input CSV file
    :param out_path: output file receiving the kept URLs
    :return: None
    """
    rows = read_file(file_path)  # load the raw lines
    kept_urls = []
    for row_no, row in enumerate(rows):
        if row_no > 0:  # row 0 is skipped
            fields = row.split(';')
            label, url = fields[3], fields[5].replace('"', '')
            if self.check_label_num(label):
                kept_urls.append(url)

    print('[Info] 样本数量: {}'.format(len(kept_urls)))
    write_list_to_file(out_path, kept_urls)
    print('[Info] 写入数据完成: {}'.format(out_path))
def process_csv_v2(self, file_path, out_path):
    """
    Write field index 1 of every ';'-separated line of a file to an output file.

    :param file_path: input file, split on ';' line by line
    :param out_path: output file receiving the extracted values
    :return: None
    """
    urls = [row.split(';')[1] for row in read_file(file_path)]
    write_list_to_file(out_path, urls)
    print('[Info] 写入完成: {}'.format(out_path))
def process_txt_v2(self, file_path, out_path):
    """
    Extract the ``url`` value from each dict-like line and write them out.

    Every line has its single quotes replaced by double quotes before being
    parsed with ``json.loads``; the ``'url'`` entry of the parsed object is
    collected.

    :param file_path: input text file with one record per line
    :param out_path: output file receiving the URLs
    :return: None
    """
    urls = [
        json.loads(raw.replace("'", '"'))['url']
        for raw in read_file(file_path)
    ]
    write_list_to_file(out_path, urls)
    print('[Info] 写入完成: {}'.format(out_path))
def process_txt(self, file_path, out_path):
    """
    Write the first tab-separated field of every line to an output file.

    Each line and its field count are printed while processing.

    :param file_path: input text file, split on tabs line by line
    :param out_path: output file receiving the first field of each line
    :return: None
    """
    first_fields = []
    for raw in read_file(file_path):
        print('[Info] data_line: {}'.format(raw))
        fields = raw.split('\t')
        print('[Info] items: {}'.format(len(fields)))
        first_fields.append(fields[0])

    write_list_to_file(out_path, first_fields)
    print('[Info] 写入完成: {}'.format(out_path))
def merge_files(self):
    """
    Merge all files under ``DATA_DIR/biaozhu_csv_out`` into one text file.

    The lines of every file returned by ``traverse_dir_files`` are
    concatenated in traversal order, double quotes are removed from each
    line, and the result is written to ``DATA_DIR/biaozhu_csv_out.txt``.

    :return: None
    """
    data_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
    out_path = os.path.join(DATA_DIR, 'biaozhu_csv_out.txt')

    # Only the paths are needed; the file names returned alongside are unused.
    paths_list, _ = traverse_dir_files(data_dir)

    all_data_lines = []
    for path in paths_list:
        # str.replace returns a new string (str is immutable), so the
        # stripped value has to be the one that is stored.
        all_data_lines.extend(
            data_line.replace('"', '') for data_line in read_file(path)
        )

    write_list_to_file(out_path, all_data_lines)
def merge(self):
    """
    Merge all files under ``self.out_dir`` into one de-duplicated, sorted file.

    Lines from every file are pooled, duplicates are dropped, and the sorted
    result is written to ``DATA_DIR/train_data_v3_out.<count>.txt`` where
    ``<count>`` is the number of unique lines.

    :return: None
    """
    out_format = os.path.join(DATA_DIR, 'train_data_v3_out.{}.txt')
    paths_list, _ = traverse_dir_files(self.out_dir)
    print('[Info] 总文本数: {}'.format(len(paths_list)))

    unique_lines = set()
    for path in paths_list:
        unique_lines.update(read_file(path))
    merged_lines = sorted(unique_lines)

    total = len(merged_lines)
    out_path = out_format.format(total)
    print('[Info] 总数据量: {}'.format(total))
    write_list_to_file(out_path, merged_lines)
    print('[Info] 写入数据完成: {}'.format(out_path))
def process(self):
    """
    Run ``self.process_raw_data`` over every file of the ``2020_11_12`` folder.

    The per-file results are concatenated and written to
    ``DATA_DIR/2020_11_12_out.<count>.txt`` where ``<count>`` is the total
    number of collected entries.

    :return: None
    """
    folder_name = "2020_11_12"
    out_format = os.path.join(DATA_DIR, folder_name + "_out.{}.txt")
    paths_list, names_list = traverse_dir_files(os.path.join(DATA_DIR, folder_name))

    samples = []
    for path, _name in zip(paths_list, names_list):
        # Per the original author's note, process_raw_data filters images
        # by width/height (not verifiable from this block).
        samples.extend(self.process_raw_data(path))

    total = len(samples)
    print('[Info] 样本数: {}'.format(total))
    out_file = out_format.format(total)
    write_list_to_file(out_file, samples)
    print('[Info] 写入文件完成! {}'.format(out_file))
def generate_file(file_path, file_idx):
    """
    Build train/val image and label files from one annotation file.

    The first line of ``file_path`` is parsed as JSON holding ``images`` and
    ``annotations`` lists (presumably COCO-style -- confirm against the data
    producer). Each annotation's ``bbox`` is read as ``[x, y, w, h]``,
    turned into corner form, converted with ``DatasetGeneratorV2.convert``
    and stored as a ``"0 <v1> <v2> ..."`` label line for its image.

    Images are split in encounter order: with ``gap = n // 10``, the first
    ``gap * 9`` images go to ``train`` and the rest to ``val`` (so fewer
    than 10 images all land in ``val``). Every image is downloaded and
    written under ``DATA_DIR/ps_datasets_v2/images/<split>`` with its labels
    under ``DATA_DIR/ps_datasets_v2/labels/<split>``. Images whose download
    fails are reported and skipped.

    :param file_path: annotation file; only its first line is read
    :param file_idx: shard index, zero-padded to 4 digits in output names
    :return: None
    """
    file_idx = str(file_idx).zfill(4)
    print('[Info] file_path: {}, file_idx: {}'.format(file_path, file_idx))
    url_format = "http://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_segmentation/" \
                 "datasets/prelabeled-20201224/{}.jpg"

    # Create <dataset>/{images,labels}/{train,val}, parents before children.
    out_dataset_dir = os.path.join(DATA_DIR, 'ps_datasets_v2')
    mkdir_if_not_exist(out_dataset_dir)
    out_dirs = {}
    for kind in ('images', 'labels'):
        kind_dir = os.path.join(out_dataset_dir, kind)
        mkdir_if_not_exist(kind_dir)
        for split in ('train', 'val'):
            split_dir = os.path.join(kind_dir, split)
            mkdir_if_not_exist(split_dir)
            out_dirs[kind, split] = split_dir

    print('[Info] 处理数据开始: {}'.format(file_path))
    data_dict = json.loads(read_file(file_path)[0])
    print('[Info] keys: {}'.format(data_dict.keys()))

    # image id -> [name without extension, height, width]
    id_name_dict = {}
    for img in data_dict['images']:
        image_name = img['file_name'].split('.')[0]
        id_name_dict[img['id']] = [image_name, img['height'], img['width']]

    # image name -> list of label lines
    image_dict = collections.defaultdict(list)
    for anno in data_dict["annotations"]:
        image_name, ih, iw = id_name_dict[anno['image_id']]
        wh_box = anno['bbox']
        # [x, y, w, h] -> [x_min, y_min, x_max, y_max]
        bbox = [wh_box[0], wh_box[1], wh_box[0] + wh_box[2], wh_box[1] + wh_box[3]]
        if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
            continue  # drop boxes with non-positive width or height
        bbox_yolo = DatasetGeneratorV2.convert(iw, ih, bbox)
        bbox_yolo = [str(round(value, 6)) for value in bbox_yolo]
        # Leading "0" is a fixed first token (presumably the single class id).
        image_dict[image_name].append(" ".join(["0", *bbox_yolo]))

    print('[Info] 样本数: {}'.format(len(image_dict.keys())))

    image_name_list = list(image_dict.keys())
    gap = len(image_name_list) // 10
    image_train_list = image_name_list[:gap * 9]
    image_val_list = image_name_list[gap * 9:]
    print('[Info] 训练: {}, 验证: {}'.format(len(image_train_list), len(image_val_list)))

    for split, split_names in (('train', image_train_list), ('val', image_val_list)):
        images_dir = out_dirs['images', split]
        labels_dir = out_dirs['labels', split]
        for idx, image_name in enumerate(split_names):
            print('[Info] idx: {}'.format(idx))
            image_url = url_format.format(image_name)
            is_ok, img_bgr = download_url_img(image_url)
            if not is_ok or img_bgr is None:
                # Assumes a falsy is_ok means the download failed -- confirm
                # against download_url_img. Writing a missing image would
                # fail or leave a label file without its image.
                print('[Warn] download failed, skipped: {}'.format(image_url))
                continue

            # idx is kept from the full split so names stay stable when
            # some downloads are skipped.
            out_name = "{}_{}_{}".format(split, file_idx, str(idx).zfill(6))
            img_path = os.path.join(images_dir, '{}.jpg'.format(out_name))
            cv2.imwrite(img_path, img_bgr)  # write the image
            print('[Info] img_path: {}'.format(img_path))

            lbl_path = os.path.join(labels_dir, '{}.txt'.format(out_name))
            write_list_to_file(lbl_path, image_dict[image_name])
            print('[Info] lbl_path: {}'.format(lbl_path))
            print('[Info] ' + "-" * 100)

    print('[Info] 处理完成! {}'.format(file_path))