Esempio n. 1
0
def get_char_dict(file_name="E:/githubWorkSpace/KnowledgeAlignmentCode/knowledgeAlignment_merged/dataset/dbp_wd/mapping/0_3/attr/attr_triples_1"):
    """Read an attribute-triple file and return the cleaned, lower-cased triples.

    Every line is split on tabs into (entity, attribute, value). The value is
    stripped of a trailing ``@en`` tag, has its escape sequences decoded,
    is reduced to the characters found in the module-level ``alphabet`` and
    is lower-cased. Triples whose cleaned value passes ``is_valid`` are kept.
    Along the way the function prints progress/statistics and hands the
    cumulative value-length distribution to ``paint_xy``.

    ``alphabet``, ``is_valid`` and ``paint_xy`` are defined outside this
    block; their exact semantics should be confirmed there.

    Args:
        file_name: Path of the UTF-8 encoded, tab-separated triple file.

    Returns:
        Three parallel lists ``(ent_ids, att_ids, values)`` holding the
        entity field, attribute field and cleaned value of every kept triple.

    Raises:
        OSError: If the file cannot be opened.
        SyntaxError, ValueError: If a value cannot be parsed as a Python
            string literal while decoding its escape sequences.
    """
    # Local import: the file-level import block is not part of this block.
    import ast

    # One-pass replacement for the punctuation clean-up: unify the Unicode
    # minus / en dash with '-', and delete the remaining symbols.
    cleanup_table = str.maketrans({
        '−': '-',
        '–': '-',
        '\\': None,
        '*': None,
        '~': None,
        '#': None,
        '+': None,
    })

    ent_ids, att_ids, values = [], [], []
    # Frequency of each individual character over all kept values
    # (Counter.update() with a string counts its characters).
    att_val_count = Counter()
    # Number of kept values per value length.
    length_count = defaultdict(int)
    # num: kept triples; line_num: index of the last line read.
    num, line_num = 0, 0

    with open(file_name, 'r', encoding="utf-8") as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line_num, trip in enumerate(f):
            att_sp = trip.split("\t")
            if len(att_sp) < 3:
                # Blank or malformed line: there is no value field to clean.
                continue

            att_val = att_sp[2].strip()
            if att_val.endswith("@en"):
                att_val = att_val[:-3]
            print(att_val)

            # Decode backslash escapes, except for values containing \U or \N
            # which are left raw.
            if "\\U" not in att_val and "\\N" not in att_val:
                literal_body = (
                    att_val.strip().lstrip("\"").rstrip("\"")
                ).strip("\\").replace("'", "\\'")
                # SECURITY: this used to be eval() on text read from the data
                # file, which could execute arbitrary code. literal_eval
                # decodes the same string literal but refuses anything that
                # is not a plain literal.
                att_val = ast.literal_eval("u'" + literal_body + "'")

            # Keep only the characters listed in the alphabet.
            chars = [ch for ch in att_val if ch in alphabet]
            filtered = ''.join(chars).lower().strip()

            # Only when filtering did not shorten the value is the extra
            # punctuation clean-up applied.
            if len(filtered) == len(att_val):
                print(att_val, "----", filtered)
                filtered = filtered.translate(cleanup_table)

            if is_valid(filtered):
                ent_ids.append(att_sp[0])
                att_ids.append(att_sp[1])
                values.append(filtered)
                length_count[len(filtered)] += 1
                att_val_count.update(filtered)
                num += 1

    print(att_val_count.most_common(20))
    print(line_num, num)
    print(length_count)

    # x: sorted value lengths; y: cumulative number of values up to each length.
    x, y = [], []
    running_total = 0
    for k in sorted(length_count):
        running_total += length_count[k]
        x.append(k)
        y.append(running_total)
    paint_xy(x, y)

    # Number of distinct attributes, entities and cleaned values.
    print(len(set(att_ids)))
    print(len(set(ent_ids)))
    print(len(set(values)))
    return ent_ids, att_ids, values
Esempio n. 2
0
def _display_dict_dis(d):
    """Pass the cumulative distribution of ``d`` to ``paint_xy``.

    The x values are the keys of ``d`` in sorted order; each y value is the
    sum of ``d``'s values for all keys up to and including that key.
    For example ``{5: 1, 6: 10, 7: 7}`` gives x = [5, 6, 7], y = [1, 11, 18].

    ``paint_xy`` is defined outside this block (presumably it plots the
    two lists; confirm at its definition).
    """
    ordered_keys = sorted(d.keys())
    cumulative = []
    running = 0
    for key in ordered_keys:
        # Same operand order as the original "value + previous total".
        running = d[key] + running
        cumulative.append(running)
    paint_xy(ordered_keys, cumulative)
Esempio n. 3
0
 def pxy(d):
     """Print the sorted keys of ``d`` and pass its cumulative sums to ``paint_xy``.

     x holds the sorted keys; y holds, for each key, the sum of the values
     of that key and all smaller keys. ``paint_xy`` is defined outside this
     block.
     """
     sorted_keys = sorted(d.keys())
     print(sorted_keys)
     xs, ys = [], []
     for key in sorted_keys:
         # Previous cumulative total, or 0 for the first key.
         previous = ys[-1] if ys else 0
         xs.append(key)
         ys.append(d[key] + previous)
     paint_xy(xs, ys)
Esempio n. 4
0
def _display_dict_dis(d):
    """Take a dict and hand its x / y series to ``paint_xy``.

    x is the list of all keys of the dict, in sorted order.
    y is the running sum of the dict's values in that key order, e.g.:
        d = {5: 1, 6: 10, 7: 7}
        x = [5, 6, 7], y = [1, 11, 18]

    ``paint_xy`` is defined outside this block.
    """
    xs = sorted(d.keys())
    ys = []
    for key in xs:
        # Start from the last cumulative value (0 before the first key).
        base = ys[-1] if ys else 0
        ys.append(d[key] + base)
    paint_xy(xs, ys)
Esempio n. 5
0
def get_char_dict(
    file_name="E:/githubWorkSpace/KnowledgeAlignmentCode/knowledgeAlignment_merged/dataset/dbp_wd/mapping/0_3/attr/attr_triples_1"
):
    """Check and clean an attribute-triple data file, keeping valid lower-cased data.

    Every line is split on tabs into (entity, attribute, value), i.e.
    head / relation / tail. The value loses a trailing ``@en`` tag, has its
    escape sequences decoded, is reduced to the characters contained in the
    module-level ``alphabet`` and is lower-cased. A triple is kept when the
    cleaned value passes ``is_valid``. The function also prints statistics
    and passes the cumulative value-length distribution to ``paint_xy``.

    ``alphabet``, ``is_valid`` and ``paint_xy`` are defined outside this
    block; confirm their exact semantics at their definitions.

    :param file_name: path of the UTF-8, tab-separated triple file to open
    :return ent_ids: list of entities of the kept triples
    :return att_ids: list of attributes of the kept triples
    :return values: list of cleaned values of the kept triples
    :raises OSError: if the file cannot be opened
    :raises SyntaxError, ValueError: if a value cannot be parsed as a Python
        string literal while decoding its escape sequences
    """
    # Local import: the file-level import block is not part of this block.
    import ast

    # Single-pass clean-up table: map the Unicode minus / en dash to '-',
    # and delete the other symbols.
    cleanup_table = str.maketrans({
        '−': '-',
        '–': '-',
        '\\': None,
        '*': None,
        '~': None,
        '#': None,
        '+': None,
    })

    # entity / attr / value  =>  head / relation / tail
    ent_ids, att_ids, values = [], [], []
    # Counter.update() is fed a string below, so this tracks how often each
    # individual character occurs across all kept values.
    att_val_count = Counter()
    # Dict with default 0: number of kept values for each value length.
    length_count = defaultdict(int)
    # num is the number of kept triples (== len(ent_ids)); line_num ends up
    # as the index of the last line read.
    num, line_num = 0, 0

    with open(file_name, 'r', encoding="utf-8") as f:
        # enumerate yields (index, line) starting at 0; iterating the file
        # object avoids loading every line into memory with readlines().
        for line_num, trip in enumerate(f):
            # Split on tabs.
            att_sp = trip.split("\t")
            if len(att_sp) < 3:
                # Blank or malformed line: no tail / value field to process.
                continue

            # Take the tail / value.
            att_val = att_sp[2].strip()
            # Drop a trailing @en language tag.
            if att_val.endswith("@en"):
                att_val = att_val[:-3]
            print(att_val)

            # Decode backslash escapes for values that contain neither \U
            # nor \N; those are left raw.
            if "\\U" not in att_val and "\\N" not in att_val:
                literal_body = (
                    att_val.strip().lstrip("\"").rstrip("\"")
                ).strip("\\").replace("'", "\\'")
                # SECURITY: this used to be eval() on text read from the data
                # file, which could execute arbitrary code. literal_eval
                # decodes the same string literal but refuses anything that
                # is not a plain literal.
                att_val = ast.literal_eval("u'" + literal_body + "'")

            # Reduce the value to the characters present in the alphabet.
            chars = [ch for ch in att_val if ch in alphabet]
            # Join them, lower-case, and strip surrounding whitespace.
            filtered = ''.join(chars).lower().strip()

            # Only when filtering did not shorten the value: normalise the
            # unusual dash characters and remove leftover symbols.
            if len(filtered) == len(att_val):
                print(att_val, "----", filtered)
                filtered = filtered.translate(cleanup_table)

            # Values rejected by is_valid are dropped; the rest are stored.
            if is_valid(filtered):
                ent_ids.append(att_sp[0])
                att_ids.append(att_sp[1])
                values.append(filtered)
                # Count how many values have this length.
                length_count[len(filtered)] += 1
                # Count the characters of the kept value.
                att_val_count.update(filtered)
                # One more kept triple.
                num += 1

    # Print the 20 most frequent characters.
    print(att_val_count.most_common(20))
    print(line_num, num)
    # Print the length histogram.
    print(length_count)

    # x is every length in length_count, sorted.
    # y is the running sum of the counts, for example:
    # length_count = {5: 1, 6: 10, 7: 7}
    # x = [5, 6, 7], y = [1, 11, 18]
    x, y = [], []
    running_total = 0
    for k in sorted(length_count):
        running_total += length_count[k]
        x.append(k)
        y.append(running_total)
    paint_xy(x, y)

    # Print the number of distinct attributes, entities and values.
    print(len(set(att_ids)))
    print(len(set(ent_ids)))
    print(len(set(values)))
    return ent_ids, att_ids, values