Esempio n. 1
0
def baidu_qa_2019(code_type):
    """
    Convert one file of the baidu_qa_2019 corpus into the JSON-lines format needed by macadam.

    Reads ``<path_root>/data/corpus/text_classification/baidu_qa_2019/<code_type>.csv``
    through ``txt_read``, skips the first line (presumably the CSV header), and writes
    ``<code_type>.json`` into the same directory through ``txt_write``. Each output
    line has the shape ``{"x": {"text": <text>, "texts2": []}, "y": [<label>]}``.

    Args:
        code_type: base name (without extension) of the CSV file to convert.

    Returns:
        None. The converted lines are handed to ``txt_write``.
    """
    path_corpus_tc = os.path.join(path_root, "data", "corpus",
                                  "text_classification", "baidu_qa_2019")
    path_real = os.path.join(path_corpus_tc, "{}.csv".format(code_type))
    datas = txt_read(path_real)
    train_data = []

    for da in datas[1:]:
        # Strip the line terminator so it cannot leak into the text field, then
        # split on the FIRST comma only: the label comes first and the text
        # itself may contain further commas that must be preserved.
        y, sep, text = da.rstrip("\r\n").partition(",")
        if not sep:
            # Blank or malformed line without a label/text separator: skip it
            # instead of failing with IndexError.
            continue
        # The text is stored with its inner spaces removed.
        x = text.replace(" ", "")
        # texts2 is not really used for this corpus; the key is kept (as an
        # empty list) so the record has the same shape as two-text samples.
        xy = {"x": {"text": x, "texts2": []}, "y": [y]}
        train_data.append(json.dumps(xy, ensure_ascii=False) + "\n")

    txt_write(train_data, os.path.join(path_corpus_tc, "{}.json".format(code_type)))
Esempio n. 2
0
def thucnews(code_type):
    """
    Convert one file of the thucnews corpus into the JSON-lines format needed by macadam.

    Reads ``<path_root>/data/corpus/text_classification/thucnews/<code_type>.txt``
    through ``txt_read`` (one ``label<TAB>text`` record per line) and writes
    ``<code_type>.json`` into the same directory through ``txt_write``. Each output
    line has the shape ``{"x": {"text": <text>, "texts2": []}, "y": <label>}``.

    Args:
        code_type: base name (without extension) of the text file to convert.

    Returns:
        None. The converted lines are handed to ``txt_write``.
    """
    path_corpus_text_classification_thucnews = os.path.join(path_root, "data", "corpus",
                                                            "text_classification", "thucnews")
    datas = txt_read(os.path.join(path_corpus_text_classification_thucnews, "{}.txt".format(code_type)))
    train_data = []

    for da in datas:
        # Strip the line terminator so it cannot leak into the text field, then
        # split on the FIRST tab only so any later tab stays part of the text.
        y, sep, x = da.rstrip("\r\n").partition("\t")
        if not sep:
            # Blank or malformed line without a label/text separator: skip it
            # instead of failing with IndexError.
            continue
        # texts2 is not really used for this corpus; the key is kept (as an
        # empty list) so the record has the same shape as two-text samples.
        xy = {"x": {"text": x, "texts2": []}, "y": y}
        train_data.append(json.dumps(xy, ensure_ascii=False) + "\n")

    txt_write(train_data, os.path.join(path_corpus_text_classification_thucnews, "{}.json".format(code_type)))
Esempio n. 3
0
        for k, v in label.items():
            for k2,v2 in v.items():
                for v2_idx in v2:
                    start = v2_idx[0]
                    end = v2_idx[1]
                    if start==end:
                        y[start] = "S-{}".format(k)
                    else:
                        y[start:end] = ["I-{}".format(k)] * len(k2)
                        y[start] = "B-{}".format(k)
        data_json_save["y"] = y
        # res.append(data_json_save)
        line_save = json.dumps(data_json_save, ensure_ascii=False) + "\n"
        res.append(line_save)

    txt_write(res, path_save)
    # save_json(res, path_save, indent=4)

# NOTE(review): `mm` is not referenced anywhere in the visible code; presumably a
# leftover debugger breakpoint anchor -- confirm before removing.
mm = 0


# CLUENER 细粒度命名实体识别
#
# 数据分为10个标签类别,分别为:
# 地址(address),
# 书名(book),
# 公司(company),
# 游戏(game),
# 政府(government),
# 电影(movie),
# 姓名(name),