Ejemplo n.º 1
0
def process_csv(file_name):
    """
    Dump the records of a CSV file into plain-text files under TXT_DATA.

    Two outputs are produced:

    * ``TXT_DATA/all_raws`` - one line per record, formatted as
      ``cid---tags---content``.
    * ``TXT_DATA/raws/<tag>`` - one file per tag, one ``feed_id,content``
      line per distinct feed id.  When a feed id occurs more than once
      under the same tag a notice is printed and the last content wins.

    :param file_name: path of the CSV file handed to ``read_csv``
    :return: None
    """
    records, feeds_by_tag = read_csv(file_name)

    # Combined dump of every record.
    combined_path = os.path.join(TXT_DATA, 'all_raws')
    create_file(combined_path)
    for cid, tags, content in records:
        write_line(combined_path, u'---'.join([cid, tags, content]))

    # One file per tag; the folder is requested with is_delete=True.
    per_tag_dir = os.path.join(TXT_DATA, 'raws')
    mkdir_if_not_exist(per_tag_dir, is_delete=True)
    for tag, feeds in tag_dict_items(feeds_by_tag):
        tag_path = os.path.join(per_tag_dir, tag)

        # Collapse duplicate feed ids, keeping the most recent content.
        content_by_id = {}
        for feed_id, feed_content in feeds:
            if feed_id in content_by_id:
                print('重复 ID {}'.format(feed_id))
            content_by_id[feed_id] = feed_content

        for feed_id, feed_content in content_by_id.items():
            write_line(tag_path, feed_id + u',' + feed_content)


def tag_dict_items(tag_dict):
    """Yield ``(tag, feeds)`` pairs of *tag_dict* in key order."""
    for tag in tag_dict.keys():
        yield tag, tag_dict[tag]
Ejemplo n.º 2
0
def process_csv(file_path):
    """
    Convert a CSV file into a ``---``-separated text file inside DATA_DIR.

    The output file keeps the CSV's base name with ``.csv`` replaced by
    ``.txt``.  The first row is treated as the header: it is printed and
    skipped.  Empty rows and rows too short to contain every selected
    column are skipped as well.  A row is written (once) only when at least
    one of its filtered tags is part of ``traverse_tags()``; the line format
    is ``id---images---tag1,tag2,...---content``.

    :param file_path: path of the CSV file
    :return: None
    """
    file_name = os.path.basename(file_path)  # bare file name
    out_name = file_name.replace('.csv', '.txt')

    csv_rows = get_csv_reader(file_path)

    # Columns unpacked below as: id, images, raw tags, content.
    included_cols = [0, 1, 14, 18]
    # A row must be long enough for the highest selected index; the previous
    # fixed threshold (13) let rows with 13-18 columns through and crashed
    # with IndexError when column 18 was read.
    min_cols = max(included_cols) + 1
    tags_all = traverse_tags()

    out_file = os.path.join(DATA_DIR, out_name)
    create_file(out_file)

    for count, row in enumerate(csv_rows, start=1):
        if count == 1:
            print(row)  # show the header row, then drop it
            continue
        if not row or len(row) < min_cols:
            continue

        c_id, c_imgs, r_tag, c_content = [
            remove_slash(row[i]) for i in included_cols
        ]
        c_tags = filter_content_tags(r_tag.split(','))  # keep final tags only

        # Write the row once if any of its tags is a known tag.
        if any(c_tag in tags_all for c_tag in c_tags):
            write_line(
                out_file, c_id + u'---' + c_imgs + u'---' +
                ','.join(c_tags) + u'---' + c_content)

    print('CSV 处理完成!')
Ejemplo n.º 3
0
def process_csv(file_name):
    """
    Extract id, tags and description from a CSV file into a text file.

    SAMPLES_DIR is (re)initialised through ``mkdir_if_not_exist`` with
    ``is_delete=True`` and the output is written to
    ``DATA_DIR/hot_content-2018-08-08-17283268.txt``.  The first row is
    treated as the header and skipped, as are empty rows and rows too short
    to contain every selected column.  A row is written (once) only when at
    least one of its filtered tags is part of ``traverse_tags()``; the line
    format is ``id---tag1,tag2,...---content``.

    :param file_name: path of the CSV file
    :return: None
    """
    csv_rows = get_csv_reader(file_name)
    out_folder = SAMPLES_DIR
    mkdir_if_not_exist(out_folder, is_delete=True)

    included_cols = [0, 9, 13]  # ["ID", "tags", "description"]
    # A row needs max index + 1 columns.  The previous check
    # (len(row) < 13) accepted 13-column rows and then failed with
    # IndexError on row[13].
    min_cols = max(included_cols) + 1
    tags_all = traverse_tags()

    out_file = os.path.join(DATA_DIR, 'hot_content-2018-08-08-17283268.txt')
    create_file(out_file)

    for count, row in enumerate(csv_rows, start=1):
        if count == 1 or not row or len(row) < min_cols:  # drop the header
            continue

        c_id, r_tag, c_content = [remove_slash(row[i]) for i in included_cols]
        c_tags = filter_content_tags(r_tag.split(','))  # keep final tags only

        # Write the row once if any of its tags is a known tag.
        if any(c_tag in tags_all for c_tag in c_tags):
            write_line(
                out_file,
                c_id + u'---' + ','.join(c_tags) + u'---' + c_content)

    try:
        print('CSV 处理!')
    except (UnicodeError, IOError, OSError):
        # The status message is best-effort: a console that cannot encode or
        # accept the text must not fail the whole run.  Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        pass
Ejemplo n.º 4
0
def init_city_keywords():
    """
    Create one keyword file per city under ``TXT_DATA/res_kw/cities``.

    If the target folder already exists the function prints a notice and
    returns without touching it.  Otherwise the folder is created and, for
    every city returned by ``get_all_cities()``, a file named after the city
    is written containing the city name.

    :return: None
    """
    kw_path = os.path.join(TXT_DATA, 'res_kw', 'cities')

    # The existence check has to come BEFORE the folder is created.  The
    # previous order (create, then check) presumably made the check always
    # true - assuming mkdir_if_not_exist creates the folder - so no keyword
    # file was ever written.
    if os.path.exists(kw_path):
        print('文件已存在!')
        return
    mkdir_if_not_exist(kw_path)

    for city in get_all_cities():
        write_line(os.path.join(kw_path, city), city)
Ejemplo n.º 5
0
def write_tag_keywords():
    """
    Write one keyword file per tag into KEYWORDS_DIR.

    Every tag returned by ``traverse_tags()`` gets a file named after the
    tag, and the tag itself is written into that file.

    :return: None
    """
    target_dir = KEYWORDS_DIR
    mkdir_if_not_exist(target_dir)
    for keyword in traverse_tags():
        keyword_path = os.path.join(target_dir, keyword)
        write_line(keyword_path, keyword)  # the tag is its own keyword
Ejemplo n.º 6
0
def reorder_boxes():
    """
    Convert ``OUTPUT_DATA/logAll_out.txt`` into one JSON record per line.

    Each input line is ``<image path> <box> <box> ...`` separated by single
    spaces, where every box is ``x_min,y_min,x_max,y_max,label``.  For each
    line the image is opened (relative to IMG_DATA) to read its size, and a
    JSON object with the keys ``tag``, ``name``, ``size`` and ``label`` is
    printed and appended to ``OUTPUT_DATA/logAll_json.txt``.  Box
    coordinates are copied as the strings found in the input; the per-box
    label field is ignored and every box is tagged ``face`` / ``tagId`` 0.

    :return: None
    """
    in_file = os.path.join(OUTPUT_DATA, 'logAll_out.txt')
    out_file = os.path.join(OUTPUT_DATA, 'logAll_json.txt')

    for data_line in read_file(in_file):
        items = data_line.split(' ')
        img_name = items[0]

        # Only the dimensions are needed; close the file handle right away
        # instead of leaking one handle per processed line.
        with Image.open(os.path.join(IMG_DATA, img_name)) as image:
            width, height = image.width, image.height

        # The original guard compared the list itself with 2
        # (``items >= 2``), which raises TypeError on Python 3.  Iterating
        # over items[1:] is a no-op when the line carries no boxes.
        labels = []
        for box in items[1:]:
            if not box.strip():
                continue  # empty token from repeated or trailing spaces
            x_min, y_min, x_max, y_max, _label = box.split(',')
            labels.append({
                'minX': x_min,
                'minY': y_min,
                'maxX': x_max,
                'maxY': y_max,
                'tag': 'face',
                'tagId': 0,
            })

        json_dict = {
            'tag': 'face',
            'name': img_name.split('/')[-1],
            'size': {'width': width, 'height': height},
            'label': labels,
        }

        json_str = json.dumps(json_dict)
        print(json_str)
        write_line(out_file, json_str)
Ejemplo n.º 7
0
    def detect_image(self, image, image_name=None, out_file=None):
        """
        Run the detector on one image, draw the detections and return it.

        Steps, in order:

        1. Letterbox the image to ``self.model_image_size`` (reversed), or,
           when that is ``(None, None)``, to the image size rounded down to
           a multiple of 32.
        2. Scale pixel values by 1/255, add a leading batch axis and run
           ``self.sess`` to obtain boxes, scores and classes.
        3. If both ``image_name`` and ``out_file`` are truthy, append one
           line to ``out_file``: the image name followed by one
           ``a,b,c,d,class`` entry per box (coordinates truncated to int),
           separated by spaces.
        4. Draw a rectangle and a ``"<class> <score>"`` label for every
           detection directly onto ``image``.

        :param image: image to process; presumably a PIL image (the code
            uses ``.width``, ``.height``, ``.size`` and ``ImageDraw.Draw``)
            - TODO confirm
        :param image_name: optional name written as the first field of the
            log line
        :param out_file: optional path of the log file passed to
            ``write_line``
        :return: the same ``image`` object, with the detections drawn on it
        """
        start = timer()  # start time (only used by the disabled print below)

        if self.model_image_size != (
                None, None):  # e.g. 416x416 (416 = 32 * 13); must be a multiple of 32
            assert self.model_image_size[
                0] % 32 == 0, 'Multiples of 32 required'
            assert self.model_image_size[
                1] % 32 == 0, 'Multiples of 32 required'
            boxed_image = letterbox_image(image,
                                          tuple(reversed(
                                              self.model_image_size)))  # pad (letterbox) the image
        else:
            # No fixed model size: round each dimension down to a multiple of 32.
            new_image_size = (image.width - (image.width % 32),
                              image.height - (image.height % 32))
            boxed_image = letterbox_image(image, new_image_size)

        image_data = np.array(boxed_image, dtype='float32')
        # print('detector size {}'.format(image_data.shape))
        image_data /= 255.  # scale to 0~1
        image_data = np.expand_dims(image_data, 0)  # add the batch dimension (one extra leading axis)

        # Outputs: boxes, scores, classes.  Inputs: the batched image scaled
        # to 0~1 and the size of the original image.
        out_boxes, out_scores, out_classes = self.sess.run(
            [self.boxes, self.scores, self.classes],
            feed_dict={
                self.yolo_model.input: image_data,  # image to run detection on
                self.input_image_shape: [image.size[1],
                                         image.size[0]],  # original size as (size[1], size[0])
                K.learning_phase(): 0  # learning phase flag: 0 = test, 1 = train
            })

        if image_name and out_file:
            # Log line: "<image_name> <a,b,c,d,class> <a,b,c,d,class> ..."
            boxes_list = [image_name]
            for box, clazz in zip(out_boxes, out_classes):
                line_list = box.tolist()
                line_list = [str(int(x)) for x in line_list]
                line_list.append(str(clazz))
                boxes_list.append(','.join(line_list))
            write_line(out_file, ' '.join(boxes_list))

        # Disabled: drop boxes whose area is below 0.004 of the image area.
        # tmp_boxes, tmp_scores, tmp_classes = [], [], []
        # for out_box, out_score, out_class in zip(out_boxes, out_scores, out_classes):
        #     img_size = image.size[1] * image.size[0]
        #     # print_info('image size: %s' % img_size)
        #     top, left, bottom, right = out_box
        #     box_ratio = abs(top - bottom) * abs(left - right) / img_size
        #     if box_ratio > 0.004:
        #         tmp_boxes.append(out_box)
        #         tmp_scores.append(out_score)
        #         tmp_classes.append(out_class)
        # out_boxes, out_scores, out_classes = tmp_boxes, tmp_scores, tmp_classes

        print('Found {} boxes for {}'.format(len(out_boxes), 'img'))  # number of detected boxes

        # Font size and outline thickness both scale with the image size.
        # The font path is relative to the current working directory.
        font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
                                  size=np.floor(3e-2 * image.size[1] +
                                                0.5).astype('int32'))  # label font
        thickness = (image.size[0] + image.size[1]) // 512  # outline thickness

        for i, c in reversed(list(enumerate(out_classes))):
            predicted_class = self.class_names[c]  # class name
            box = out_boxes[i]  # box
            score = out_scores[i]  # confidence score

            label = '{} {:.2f}'.format(predicted_class, score)  # label text
            draw = ImageDraw.Draw(image)  # drawing context on the image itself
            # NOTE(review): ImageDraw.textsize was removed in Pillow 10 -
            # confirm the Pillow version this project pins.
            label_size = draw.textsize(label, font)  # size of the label text

            # Boxes are unpacked as (top, left, bottom, right); round to the
            # nearest integer and clamp to the image bounds.
            top, left, bottom, right = box
            top = max(0, np.floor(top + 0.5).astype('int32'))
            left = max(0, np.floor(left + 0.5).astype('int32'))
            bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
            right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
            # print(label, (left, top), (right, bottom))  # box corners

            # Put the label above the box when there is room, else just inside it.
            if top - label_size[1] >= 0:  # label position
                text_origin = np.array([left, top - label_size[1]])
            else:
                text_origin = np.array([left, top + 1])

            # My kingdom for a good redistributable image drawing library.
            # NOTE: this inner loop reuses the name `i` of the outer loop; it
            # is harmless here because the outer index is not read again
            # after this point in the iteration.
            for i in range(thickness):  # draw the box as nested 1-px rectangles
                draw.rectangle([left + i, top + i, right - i, bottom - i],
                               outline=self.colors[c])
            draw.rectangle(  # label background
                [tuple(text_origin),
                 tuple(text_origin + label_size)],
                fill=self.colors[c])
            draw.text(text_origin, label, fill=(0, 0, 0), font=font)  # label text
            del draw

        end = timer()
        # print(end - start)  # detection run time
        return image