def process_csv(self, file_path, out_path):
    """
    Process a CSV file.
    :param file_path: input CSV file
    :param out_path: output file
    :return: None
    """
    data_lines = read_file(file_path)  # read data
    url_list = []
    for idx, data_line in enumerate(data_lines):
        if idx == 0:  # skip the header row
            continue
        items = data_line.split(';')
        label = items[3]
        url = items[5].replace("\"", "")  # strip surrounding quotes
        if self.check_label_num(label):
            url_list.append(url)
    print('[Info] number of samples: {}'.format(len(url_list)))
    write_list_to_file(out_path, url_list)
    print('[Info] finished writing data: {}'.format(out_path))
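# The I/O helpers used throughout this module (read_file, write_list_to_file,
# write_line) come from a project utility package that is not shown here.
# A minimal sketch of the assumed behavior, for reference only (the "_sketch"
# names are hypothetical, not the project's real implementations):

def read_file_sketch(file_path):
    """Read a text file and return its lines without trailing newlines."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def write_list_to_file_sketch(file_path, data_list):
    """Write one item per line to a text file."""
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in data_list:
            f.write('{}\n'.format(item))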
def process_csv_v2(self, file_path, out_path):
    data_lines = read_file(file_path)  # read data
    url_list = []
    for data_line in data_lines:
        items = data_line.split(';')
        url = items[1]
        url_list.append(url)
    write_list_to_file(out_path, url_list)
    print('[Info] writing done: {}'.format(out_path))
def process_txt(self, file_path, out_path):
    data_lines = read_file(file_path)  # read data
    url_list = []
    for data_line in data_lines:
        items = data_line.split('\t')  # tab-separated; the URL is the first field
        url = items[0]
        url_list.append(url)
    write_list_to_file(out_path, url_list)
    print('[Info] writing done: {}'.format(out_path))
def process_txt_v2(self, file_path, out_path):
    data_lines = read_file(file_path)  # read data
    url_list = []
    for data_line in data_lines:
        # raw lines use single quotes; swap to double quotes so json.loads can parse them
        data_line = data_line.replace("\'", "\"")
        data_dict = json.loads(data_line)
        url = data_dict['url']
        url_list.append(url)
    write_list_to_file(out_path, url_list)
    print('[Info] writing done: {}'.format(out_path))
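# Note: the quote swap above breaks on values that legitimately contain an
# apostrophe. A hedged alternative (an assumption, not part of the original
# module): if the lines are Python dict literals such as {'url': '...'},
# ast.literal_eval parses them directly without any string surgery.
import ast

def parse_literal_line_sketch(data_line):
    """Sketch only: parse a line holding a Python dict literal and return the dict."""
    return ast.literal_eval(data_line)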
def merge_files(self):
    data_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
    paths_list, names_list = traverse_dir_files(data_dir)
    out_path = os.path.join(DATA_DIR, 'biaozhu_csv_out.txt')
    all_data_lines = []
    for path, name in zip(paths_list, names_list):
        data_lines = read_file(path)
        for data_line in data_lines:
            # str.replace returns a new string; the original discarded the result
            data_line = data_line.replace("\"", "")
            all_data_lines.append(data_line)
    write_list_to_file(out_path, all_data_lines)
def merge(self):
    out_format = os.path.join(DATA_DIR, 'train_data_v3_out.{}.txt')
    paths_list, names_list = traverse_dir_files(self.out_dir)
    print('[Info] total files: {}'.format(len(paths_list)))
    all_data_lines = []
    for path in paths_list:
        data_lines = read_file(path)
        all_data_lines += data_lines
    all_data_lines = sorted(set(all_data_lines))  # de-duplicate, stable order
    out_path = out_format.format(len(all_data_lines))
    print('[Info] total lines: {}'.format(len(all_data_lines)))
    write_list_to_file(out_path, all_data_lines)
    print('[Info] finished writing data: {}'.format(out_path))
def make_html_page(html_file, imgs_path, n=1):
    header = """
<!DOCTYPE html>
<html>
<head>
<title>MathJax TeX Test Page</title>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script type="text/javascript" id="MathJax-script" async
        src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml.js">
</script>
<style>
img{
    max-height:640px;
    max-width: 640px;
    vertical-align:middle;
}
</style>
</head>
<body>
<table>
"""
    tail = """
</table>
</body>
</html>
"""
    data_lines = read_file(imgs_path)
    print('[Info] number of lines: {}'.format(len(data_lines)))
    urls_list = []  # one URL group per input line
    for data_line in data_lines:
        # urls = data_line.split("<sep>")  # alternate separator
        urls = data_line.split(",")
        urls_list.append(urls)
    with open(html_file, 'w') as f:
        f.write(header)
        for idx, items in enumerate(urls_list):
            f.write('<tr>\n')
            f.write('<td>%d</td>\n' % (idx // n))  # integer row index (idx / n is a float in Python 3)
            for item in items:
                f.write('<td>\n')
                if item.startswith("http"):
                    f.write('<img src="%s" width="600">\n' % item)
                else:
                    f.write('%s' % item)
                f.write('</td>\n')
            f.write('</tr>\n')
        f.write(tail)
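# Example usage (file names here are illustrative, not from the original):
#   urls_path = os.path.join(DATA_DIR, 'url_list.txt')  # one comma-separated URL group per line
#   make_html_page(os.path.join(DATA_DIR, 'preview.html'), urls_path)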
def process(self):
    data_lines = read_file(self.file_path)
    pool = Pool(processes=40)
    for idx, data_line in enumerate(data_lines):
        out_path = os.path.join(self.out_dir, '{}.jpg'.format(idx))
        # submit to the pool; calling DataPrepareV4.process_url(data_line, out_path)
        # synchronously (as the original did) leaves the 40-process pool unused
        pool.apply_async(DataPrepareV4.process_url, (data_line, out_path))
        if idx % 100 == 0:
            print('[Info] idx: {}'.format(idx))
    pool.close()
    pool.join()
    print('[Info] download finished')
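# DataPrepareV4.process_url is defined elsewhere in the project; the assumed
# contract is "download the image at img_url and save it to out_path". A
# minimal sketch under that assumption, reusing the download_url_img helper
# that this module already calls elsewhere (sketch, not the original):
@staticmethod
def process_url_sketch(img_url, out_path):
    """Sketch only: download an image URL and write it to out_path."""
    is_ok, img_bgr = download_url_img(img_url)  # (success flag, BGR image)
    if is_ok:
        cv2.imwrite(out_path, img_bgr)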
def process_raw_data(self, path):
    file_name = path.split('/')[-1]
    print('[Info] file_name: {}'.format(file_name))
    data_lines = read_file(path)
    # Expected line layout: <item_id><sep><url><sep><ocr_json>, where the OCR
    # JSON stores the rotation under the (misspelled) source key 'angel'.
    res_list = []
    for data_line in data_lines:
        try:
            item_list = data_line.split('<sep>')
            item_id = item_list[0]
            url = item_list[1]
            ocr_json = item_list[2]
            ocr_dict = json.loads(ocr_json)
            angle = int(ocr_dict['angel'])  # 'angel' is the key used in the source data
            out_json_dict = {"id": item_id, "url": url, "angle": angle}
            res_list.append(json.dumps(out_json_dict))
        except Exception:
            continue  # skip malformed lines
    print('[Info] number of samples: {}'.format(len(res_list)))
    return res_list
def process_path_1(self, path, out_path):
    """
    Task 61786 d7af1d07-4833-402d-b988-98277e997d51.csv:
    index;markTime;checkLabel;label;checker;url;marker;_id;taskId
    """
    data_lines = read_file(path)
    for idx, data_line in enumerate(data_lines):
        if idx == 0:
            print(data_line)  # header row
        else:
            try:
                items = data_line.split(';')
                label_str = items[3]
                url = items[5]
                label_list = json.loads(label_str)[0]
                n_label = len(label_list)
                if n_label > 0:  # keep only lines that carry at least one label
                    write_line(out_path, url)
            except Exception:
                continue  # skip malformed lines
        if idx % 1000 == 0:
            print('[Info] idx: {}'.format(idx))
def generate_file(file_path, file_idx):
    file_idx = str(file_idx).zfill(4)
    print('[Info] file_path: {}, file_idx: {}'.format(file_path, file_idx))
    url_format = "http://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_segmentation/" \
                 "datasets/prelabeled-20201224/{}.jpg"
    out_dataset_dir = os.path.join(DATA_DIR, 'ps_datasets_v2')
    out_images_dir = os.path.join(out_dataset_dir, 'images')
    out_images_train_dir = os.path.join(out_images_dir, 'train')
    out_images_val_dir = os.path.join(out_images_dir, 'val')
    out_labels_dir = os.path.join(out_dataset_dir, 'labels')
    out_labels_train_dir = os.path.join(out_labels_dir, 'train')
    out_labels_val_dir = os.path.join(out_labels_dir, 'val')
    mkdir_if_not_exist(out_dataset_dir)
    mkdir_if_not_exist(out_images_dir)
    mkdir_if_not_exist(out_images_train_dir)
    mkdir_if_not_exist(out_images_val_dir)
    mkdir_if_not_exist(out_labels_dir)
    mkdir_if_not_exist(out_labels_train_dir)
    mkdir_if_not_exist(out_labels_val_dir)

    print('[Info] start processing: {}'.format(file_path))
    data_line = read_file(file_path)[0]  # one COCO-style JSON blob per file
    data_dict = json.loads(data_line)
    print('[Info] keys: {}'.format(data_dict.keys()))

    images = data_dict['images']
    id_name_dict = {}
    for img in images:
        img_id = img['id']
        image_name = img['file_name'].split('.')[0]
        height = img['height']
        width = img['width']
        id_name_dict[img_id] = [image_name, height, width]

    annotations = data_dict["annotations"]
    image_dict = collections.defaultdict(list)
    for anno in annotations:
        image_id = anno['image_id']
        image_name, ih, iw = id_name_dict[image_id]
        wh_box = anno['bbox']  # COCO [x, y, w, h]
        bbox = [wh_box[0], wh_box[1], wh_box[0] + wh_box[2], wh_box[1] + wh_box[3]]
        if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
            continue  # drop degenerate boxes
        bbox_yolo = DatasetGeneratorV2.convert(iw, ih, bbox)
        bbox_yolo = [str(round(i, 6)) for i in bbox_yolo]
        image_dict[image_name].append(" ".join(["0", *bbox_yolo]))  # single class "0"
    print('[Info] number of samples: {}'.format(len(image_dict.keys())))

    image_name_list = list(image_dict.keys())
    gap = len(image_name_list) // 10
    image_train_list = image_name_list[:gap * 9]  # 9:1 train/val split
    image_val_list = image_name_list[gap * 9:]
    print('[Info] train: {}, val: {}'.format(len(image_train_list), len(image_val_list)))

    for idx, image_name in enumerate(image_train_list):
        print('[Info] idx: {}'.format(idx))
        bbox_yolo_list = image_dict[image_name]
        image_url = url_format.format(image_name)
        is_ok, img_bgr = download_url_img(image_url)
        out_name = "train_{}_{}".format(file_idx, str(idx).zfill(6))
        img_path = os.path.join(out_images_train_dir, '{}.jpg'.format(out_name))
        cv2.imwrite(img_path, img_bgr)  # write the image
        print('[Info] img_path: {}'.format(img_path))
        lbl_path = os.path.join(out_labels_train_dir, '{}.txt'.format(out_name))
        write_list_to_file(lbl_path, bbox_yolo_list)
        print('[Info] lbl_path: {}'.format(lbl_path))
        print('[Info] ' + "-" * 100)

    for idx, image_name in enumerate(image_val_list):
        print('[Info] idx: {}'.format(idx))
        bbox_yolo_list = image_dict[image_name]
        image_url = url_format.format(image_name)
        is_ok, img_bgr = download_url_img(image_url)
        out_name = "val_{}_{}".format(file_idx, str(idx).zfill(6))
        img_path = os.path.join(out_images_val_dir, '{}.jpg'.format(out_name))
        cv2.imwrite(img_path, img_bgr)  # write the image
        print('[Info] img_path: {}'.format(img_path))
        lbl_path = os.path.join(out_labels_val_dir, '{}.txt'.format(out_name))
        write_list_to_file(lbl_path, bbox_yolo_list)
        print('[Info] lbl_path: {}'.format(lbl_path))
        print('[Info] ' + "-" * 100)

    print('[Info] done! {}'.format(file_path))
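# DatasetGeneratorV2.convert is assumed to map an absolute [x_min, y_min,
# x_max, y_max] box to the normalized YOLO [cx, cy, w, h] format. A sketch of
# the standard conversion under that assumption (not the original implementation):
def convert_sketch(img_w, img_h, bbox):
    """Sketch only: convert pixel [x_min, y_min, x_max, y_max] to YOLO [cx, cy, w, h] in [0, 1]."""
    x_min, y_min, x_max, y_max = bbox
    cx = (x_min + x_max) / 2.0 / img_w  # normalized box-center x
    cy = (y_min + y_max) / 2.0 / img_h  # normalized box-center y
    w = (x_max - x_min) / img_w         # normalized width
    h = (y_max - y_min) / img_h         # normalized height
    return [cx, cy, w, h]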
tail = """ </table> </body> </html> """ # image_dir = sys.argv[1] # image_paths = get_image_paths(image_dir) # html_file = sys.argv[2] # image_paths = ["https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg", # "https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg"] # image_paths2 = ["https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg", # "https://img.alicdn.com/imgextra/i2/6000000003335/O1CN01aNTJ931aVTVuytHXc_!!6000000003335-0-quark.jpg"] image_file1 = os.path.join(DATA_DIR, "url_0108_高航.out1.txt") image_file2 = os.path.join(DATA_DIR, "url_0108_高航.out2.txt") image_paths = read_file(image_file1) image_paths2 = read_file(image_file2) html_file = "xxx.html" with open(html_file, 'w') as f: f.write(header) for index, (image_path, image_path2) in enumerate(zip(image_paths, image_paths2)): f.write('<tr>\n') f.write('<td>%d</td>\n' % (index + 1)) f.write('<td>\n') f.write('<img src="%s" width="400">\t' % (image_path)) f.write('<img src="%s" width="400">\n' % (image_path2)) f.write('</td>\n') f.write('</tr>\n') f.write(tail)