def process_csv(file_name):
    """
    Dump the parsed CSV content into one combined file and one file per tag.

    Every record returned by ``read_csv`` is appended to ``all_raws`` as
    ``cid---tags---content``. Afterwards the ``raws`` folder is prepared with
    ``is_delete=True`` and, for each tag, a file named after the tag receives
    one ``feed_id,content`` line per distinct feed ID.

    :param file_name: CSV file handed to ``read_csv``
    :return: None
    """
    records, feeds_by_tag = read_csv(file_name)

    # Combined dump of every record.
    combined_path = os.path.join(TXT_DATA, 'all_raws')
    create_file(combined_path)
    for cid, tags, content in records:
        write_line(combined_path, u'---'.join((cid, tags, content)))

    # One file per tag.
    per_tag_dir = os.path.join(TXT_DATA, 'raws')
    mkdir_if_not_exist(per_tag_dir, is_delete=True)
    for tag, feeds in feeds_by_tag.items():
        tag_path = os.path.join(per_tag_dir, tag)

        # Collapse repeated feed IDs; a later entry replaces an earlier one
        # and each repetition is reported on stdout.
        content_by_id = {}
        for feed_id, content in feeds:
            if feed_id in content_by_id:
                print('重复 ID {}'.format(feed_id))
            content_by_id[feed_id] = content

        for feed_id, content in content_by_id.items():
            write_line(tag_path, feed_id + u',' + content)
def process_csv(file_path):
    """
    Convert a CSV export into a ``---``-separated text file.

    The output goes to ``DATA_DIR`` under the input's base name with ``.csv``
    replaced by ``.txt``. The first row is treated as a header: it is printed
    and skipped. From every other row four columns are extracted (unpacked
    below as ID, images, raw tags and content). A row is written as
    ``id---imgs---tag1,tag2---content`` when at least one of its filtered
    tags is contained in the result of ``traverse_tags()``.

    Rows that are empty or too short to contain every extracted column are
    skipped.

    :param file_path: path of the CSV file
    :return: None
    """
    file_name = os.path.basename(file_path)  # base name of the input file
    out_name = file_name.replace('.csv', '.txt')
    csv_rows = get_csv_reader(file_path)

    included_cols = [0, 1, 14, 18]  # unpacked as ID, images, tags, content
    # A row must reach the highest extracted index; the former fixed limit
    # of 13 let rows with 13-18 columns through and raised IndexError.
    min_cols = max(included_cols) + 1

    tags_all = traverse_tags()
    out_file = os.path.join(DATA_DIR, out_name)
    create_file(out_file)

    for row_no, row in enumerate(csv_rows, start=1):
        if row_no == 1:
            print(row)  # show the header, then drop it
            continue
        if not row or len(row) < min_cols:
            continue

        c_id, c_imgs, r_tag, c_content = [
            remove_slash(row[i]) for i in included_cols]
        # Original note: keep only the final tags.
        c_tags = filter_content_tags(r_tag.split(','))

        # Write the row once if any of its tags is a known tag.
        if any(c_tag in tags_all for c_tag in c_tags):
            write_line(
                out_file,
                c_id + u'---' + c_imgs + u'---' + ','.join(c_tags)
                + u'---' + c_content)

    print('CSV 处理完成!')
def process_csv(file_name):
    """
    Convert a CSV export into a ``---``-separated text file.

    The output is always written to
    ``DATA_DIR/hot_content-2018-08-08-17283268.txt``. ``SAMPLES_DIR`` is
    prepared with ``is_delete=True`` before processing. The first row is
    treated as a header and skipped. From every other row three columns are
    extracted (unpacked below as ID, raw tags and content). A row is written
    as ``id---tag1,tag2---content`` when at least one of its filtered tags is
    contained in the result of ``traverse_tags()``.

    Rows that are empty or too short to contain every extracted column are
    skipped.

    :param file_name: path of the CSV file
    :return: None
    """
    csv_rows = get_csv_reader(file_name)
    mkdir_if_not_exist(SAMPLES_DIR, is_delete=True)

    included_cols = [0, 9, 13]  # unpacked as ID, tags, content
    # A row must reach the highest extracted index; the former check
    # ``len(row) < 13`` accepted 13-column rows and row[13] then raised
    # IndexError.
    min_cols = max(included_cols) + 1

    tags_all = traverse_tags()
    out_file = os.path.join(DATA_DIR, 'hot_content-2018-08-08-17283268.txt')
    create_file(out_file)

    for row_no, row in enumerate(csv_rows, start=1):
        # Drop the header row and anything empty or truncated.
        if row_no == 1 or not row or len(row) < min_cols:
            continue

        c_id, r_tag, c_content = [remove_slash(row[i]) for i in included_cols]
        # Original note: keep only the final tags.
        c_tags = filter_content_tags(r_tag.split(','))

        # Write the row once if any of its tags is a known tag.
        if any(c_tag in tags_all for c_tag in c_tags):
            write_line(
                out_file,
                c_id + u'---' + ','.join(c_tags) + u'---' + c_content)

    try:
        print('CSV 处理!')
    except (UnicodeError, IOError, OSError):
        # Best effort only: the console may be unable to encode or emit the
        # message. Unlike a bare ``except``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        pass
def init_city_keywords():
    """
    Create one keyword file per city under ``TXT_DATA/res_kw/cities``.

    If the target directory already exists the function prints a notice and
    returns without touching it. Otherwise the directory is created and, for
    every city returned by ``get_all_cities()``, a file named after the city
    is written containing the city name.

    :return: None
    """
    kw_path = os.path.join(TXT_DATA, 'res_kw', 'cities')

    # The existence check has to run BEFORE the directory is created.
    # Previously ``mkdir_if_not_exist`` ran first, so (assuming it creates the
    # directory, as its name says) the check was always true and the city
    # files were never written.
    if os.path.exists(kw_path):
        print('文件已存在!')
        return

    mkdir_if_not_exist(kw_path)
    for city in get_all_cities():
        write_line(os.path.join(kw_path, city), city)
def write_tag_keywords():
    """
    Write one keyword file per tag.

    For every tag returned by ``traverse_tags()`` a file named after the tag
    is written into ``KEYWORDS_DIR``; the line written is the tag itself.

    :return: None
    """
    mkdir_if_not_exist(KEYWORDS_DIR)
    for tag_name in traverse_tags():
        # File name and file content are both the tag.
        write_line(os.path.join(KEYWORDS_DIR, tag_name), tag_name)
def reorder_boxes():
    """
    Convert the box log ``logAll_out.txt`` into one JSON object per line.

    Each input line is ``<image path> [<box> ...]`` separated by single
    spaces, where a box is five comma-separated fields. For every line the
    image is opened to read its width and height, and a JSON object with the
    keys ``tag``, ``name``, ``size`` and ``label`` is printed and appended to
    ``logAll_json.txt``. Box coordinates are copied through as strings.

    Blank lines and empty box tokens (e.g. from a trailing space) are skipped.

    :return: None
    :raises ValueError: if a box does not have exactly five fields
    """
    in_file = os.path.join(OUTPUT_DATA, 'logAll_out.txt')
    out_file = os.path.join(OUTPUT_DATA, 'logAll_json.txt')

    for data_line in read_file(in_file):
        if not data_line.strip():
            continue

        items = data_line.split(' ')
        img_name = items[0]

        # Only the dimensions are needed, so release the file handle at once
        # (the image was previously left open).
        with Image.open(os.path.join(IMG_DATA, img_name)) as image:
            width, height = image.width, image.height

        # Everything after the image path is a box. The former guard
        # ``items >= 2`` compared a list with an int (TypeError on Python 3);
        # slicing handles the "no boxes" case without a guard.
        labels = []
        for box in items[1:]:
            if not box:
                continue
            # NOTE(review): the first four fields are taken as
            # x_min, y_min, x_max, y_max - confirm this matches the order
            # used by whatever writes logAll_out.txt.
            x_min, y_min, x_max, y_max, _class_id = box.split(',')
            labels.append({
                'minX': x_min,
                'minY': y_min,
                'maxX': x_max,
                'maxY': y_max,
                'tag': 'face',
                'tagId': 0,
            })

        json_dict = {
            'tag': 'face',
            'name': img_name.split('/')[-1],  # file name without folders
            'size': {'width': width, 'height': height},
            'label': labels,
        }
        json_str = json.dumps(json_dict)
        print(json_str)
        write_line(out_file, json_str)
def detect_image(self, image, image_name=None, out_file=None):
    """
    Run the detector on an image and draw the resulting boxes onto it.

    The image is letterboxed to the model input size, scaled to 0..1 and fed
    through ``self.sess``. When both ``image_name`` and ``out_file`` are
    given, one line ``<image_name> <b0,b1,b2,b3,class> ...`` is appended to
    ``out_file`` (box values truncated to int). Every detection is then drawn
    on ``image`` in place: a rectangle plus a ``"<class> <score>"`` label.

    :param image: image to run detection on; it is modified in place
    :param image_name: name written at the start of the log line (optional)
    :param out_file: file receiving the log line (optional)
    :return: the same ``image`` object with boxes and labels drawn
    :raises AssertionError: if a fixed model input size is not a multiple
        of 32
    """
    if self.model_image_size != (None, None):
        # Fixed input size (original note: e.g. 416x416 = 32 * 13); both
        # sides must be multiples of 32.
        assert self.model_image_size[0] % 32 == 0, 'Multiples of 32 required'
        assert self.model_image_size[1] % 32 == 0, 'Multiples of 32 required'
        # Pad the image to the model input size.
        boxed_image = letterbox_image(
            image, tuple(reversed(self.model_image_size)))
    else:
        # No fixed size: round each side down to a multiple of 32.
        new_image_size = (image.width - (image.width % 32),
                          image.height - (image.height % 32))
        boxed_image = letterbox_image(image, new_image_size)

    image_data = np.array(boxed_image, dtype='float32')
    image_data /= 255.  # scale pixel values to 0..1
    image_data = np.expand_dims(image_data, 0)  # add the batch dimension

    # Fetch boxes, scores and classes; inputs are the normalised batch and
    # the original image size as (height, width).
    out_boxes, out_scores, out_classes = self.sess.run(
        [self.boxes, self.scores, self.classes],
        feed_dict={
            self.yolo_model.input: image_data,
            self.input_image_shape: [image.size[1], image.size[0]],
            K.learning_phase(): 0,  # learning-phase flag: 0 = test, 1 = train
        })

    if image_name and out_file:
        boxes_list = [image_name]
        for box, clazz in zip(out_boxes, out_classes):
            line_list = [str(int(x)) for x in box.tolist()]
            line_list.append(str(clazz))
            boxes_list.append(','.join(line_list))
        write_line(out_file, ' '.join(boxes_list))

    # NOTE: an earlier revision dropped boxes covering 0.4% or less of the
    # image area at this point; that filter is disabled.

    print('Found {} boxes for {}'.format(len(out_boxes), 'img'))

    # Font size and box thickness scale with the image dimensions.
    font = ImageFont.truetype(
        font='font/FiraMono-Medium.otf',
        size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
    thickness = (image.size[0] + image.size[1]) // 512

    for i, c in reversed(list(enumerate(out_classes))):
        predicted_class = self.class_names[c]
        box = out_boxes[i]
        score = out_scores[i]

        label = '{} {:.2f}'.format(predicted_class, score)
        draw = ImageDraw.Draw(image)
        if hasattr(draw, 'textsize'):
            label_size = draw.textsize(label, font)
        else:
            # Pillow >= 10 removed ImageDraw.textsize(); take the extent of
            # the text measured from the origin instead.
            text_box = draw.textbbox((0, 0), label, font=font)
            label_size = (text_box[2], text_box[3])

        # Round to the nearest pixel and clamp to the image bounds.
        top, left, bottom, right = box
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))

        # Put the label above the box when it fits, otherwise just inside.
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])

        # My kingdom for a good redistributable image drawing library.
        # A thick border is drawn as nested 1-pixel rectangles. The loop
        # variable no longer shadows the outer index ``i``.
        for offset in range(thickness):
            draw.rectangle(
                [left + offset, top + offset,
                 right - offset, bottom - offset],
                outline=self.colors[c])
        draw.rectangle(  # label background
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=self.colors[c])
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw

    return image