Example #1
def get_disambi_title(self, infile):
    """Map each cleaned disambi name to its cleaned title."""
    disambi_title = {}
    for line in LoadFile.readline(infile):
        words = line.strip().split("\",\"")
        title_tmp = Clean.clean_word(words[1], clean_level="title")
        disambi_tmp = Clean.clean_word(words[0], clean_level="disambi")
        disambi_title[disambi_tmp] = title_tmp
    return disambi_title
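For context, a minimal sketch of how this method might be driven; the `LoadFile` reader and the two-column quoted layout are assumptions inferred from how the fields are indexed:

# Hypothetical stand-in for the project-local LoadFile helper.
class LoadFile:
    @staticmethod
    def readline(infile):
        # Assumed behaviour: yield the file line by line.
        with open(infile, encoding="utf-8") as f:
            for line in f:
                yield line

# Each input row is assumed to look like:  "disambi_name","title_name"
# Splitting on '","' yields ['"disambi_name', 'title_name"'], and
# clean_word is assumed to strip the residual outer quotes.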
Example #2
import json
import re

from clean import Clean
from tqdm import tqdm


# Requires the get_word_list helper defined elsewhere in this file.
def main():
    with open("./410_baidu/410_disambi_infobox.csv", 'r',
              encoding='UTF-8') as inf:
        lines = inf.readlines()
        f = open("./410_baidu/410_disambi_infobox_out.csv",
                 "w",
                 encoding='utf-8')
        list_attr = []
        title_list = get_word_list("./410_baidu/410_title.csv")
        err_count = 0
        counts = {}
        for line in tqdm(lines):
            words = line.strip().split(",")
            disambi = Clean.clean_word(words[0], clean_level='disambi')
            infobox = ",".join(words[1:])
            try:
                # The infobox cell is double-encoded JSON: decode twice.
                info_dict = json.loads(json.loads(infobox))
                # Copy the key list: the dict is mutated while iterating.
                for attr in list(info_dict.keys()):
                    clean_attr = Clean.clean_word(attr)
                    info_dict[clean_attr] = info_dict.pop(attr)
                    value = info_dict[clean_attr]
                    counts[clean_attr] = counts.setdefault(clean_attr, 0) + 1
                    list_attr.append(clean_attr)
                    value_split = re.split(u"[,。、,/]", value.strip())
                    for v in value_split:
                        v = Clean.clean_word(v).strip(u"等").strip(u"收起")
                        title_list.append(v)
                        f.write("\"" + disambi + "\",\"" + clean_attr +
                                "\",\"" + v + "\"" + "\r\n")
            except Exception as e:
                print(e)
                err_count += 1
        f.close()
        title_list = [t.strip(u"\\") for t in title_list]
        title_list = list(set(title_list))
        list_attr = list(set(list_attr))
        sort_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        with open("./sort_counts.txt", "w", encoding='utf-8') as ouf:
            for i in sort_counts:
                ouf.write(str(i) + "\n")
        with open("./all_attr.txt", "w", encoding='utf-8') as ouf:
            for word_counts in sort_counts:
                if word_counts[1] >= 10:
                    ouf.write(str(word_counts[0]) + "\n")
        with open("./410_baidu/410_title_new.csv", "w",
                  encoding='utf-8') as ouf:
            for i in title_list:
                ouf.write("\"" + i + "\"\r\n")
        with open("./410_baidu/all_attr.txt", "w", encoding='utf-8') as ouf:
            for i in list_attr:
                ouf.write(i + "\n")

        print("err_count: ", err_count)
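As an aside, the double `json.loads` call indicates the infobox column stores JSON that is itself serialized inside a JSON string. A small self-contained demonstration (the sample value is made up):

import json

raw = '"{\\"genre\\": [\\"pop\\"]}"'  # a double-encoded cell, illustrative only
inner = json.loads(raw)               # first pass yields a str: '{"genre": ["pop"]}'
info = json.loads(inner)              # second pass yields the dict itself
print(type(inner).__name__, type(info).__name__)  # str dict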
Example #3
def clean_title_disambi(infile="title_disambi.csv", outfile="title_disambi_out.csv"):
    with open(infile, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
        err_counts = 0
        with open(outfile, "w", encoding='utf-8') as ouf:
            for line in tqdm(lines):
                words = line.strip().split("\",\"")
                if len(words) != 2:
                    err_counts += 1
                    continue
                title = Clean.clean_word(words[0], clean_level='title')
                disambi = Clean.clean_word(words[1], clean_level='disambi')
                ouf.write("\"" + title + "\",\"" + disambi + "\"\r\n")
            print("err_counts for title_disambi: ", err_counts)
Example #4
def clean_disambi_redirect(infile="source", outfile="target"):
    with open(infile) as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                redirect = Clean.clean_word(line[1], clean_level='redirect')
                ouf.write("\"" + disambi + "\",\"" + redirect + "\"\n")
            print("err_counts for disambi_redirect:%d" % (err_counts))
Example #5
def get_word_list(filename):
    with open(filename, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
        # Text-mode reads already yield str, so no .decode() is needed.
        lines = [
            Clean.clean_word(line, clean_level='title')
            for line in lines
        ]
        return lines
Example #6
def get_title(infile):
    all_title = set()
    for line in LoadFile.readline(infile):
        title_tmp = Clean.clean_word(line.strip(), clean_level="title")
        title_tmp = title_tmp.strip().strip("\"")
        if title_tmp == "":
            continue
        all_title.add(title_tmp)
    return all_title
Example #7
def clean_disambi_subject(infile="disambi_subject.csv",
                          outfile="disambi_subject_out.csv"):
    with open(infile, encoding='utf-8') as inf:
        lines = inf.readlines()
        err_counts = 0
        with open(outfile, "w", encoding='utf-8') as ouf:
            for line in tqdm(lines):
                words = line.strip().split("\",\"")
                if len(words) != 2:
                    err_counts += 1
                    continue
                # Python 3 strings are already Unicode, so the old
                # decode('utf-8')/encode('utf-8') round-trip is unnecessary.
                disambi = Clean.clean_word(words[0], clean_level='disambi')
                subject = Clean.clean_word(words[1], clean_level='subject')
                ouf.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
            print("err_counts for disambi_subject: ", err_counts)
Example #8
def clean_disambi_literal(infile="source", outfile="target"):
    with open(infile) as inf:
        reader = csv.reader(inf)
        err_counts = 0
        with open(outfile, "w") as ouf:
            for line in tqdm(reader):
                if len(line) != 2:
                    err_counts += 1
                    continue
                disambi = Clean.clean_word(line[0], clean_level='disambi')
                literal = Clean.clean_word(line[1], clean_level='literal')
                if literal != '' and disambi != '':
                    # Hard-coded fix for one problem entry (a hibiscus
                    # cultivar page) that the generic cleaning mangles.
                    if '[朱槿品种]' in disambi:
                        literal = '快乐'
                        disambi = '快乐[[朱槿品种]]'
                    # Escape embedded quotes CSV-style and drop backslashes.
                    if '"' in literal:
                        literal = literal.replace('"', '""')
                    if '\\' in literal:
                        literal = literal.replace('\\', '')
                    if '"' in disambi:
                        disambi = disambi.replace('"', '""')
                    ouf.write("\"" + disambi + "\",\"" + literal + "\"\n")
            print("err_counts for disambi_literal:%d" % (err_counts))
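Several of these scripts double embedded quotes by hand before writing. As a hedged alternative sketch, `csv.writer` with `QUOTE_ALL` performs the same doubling automatically, which would remove the per-field replace calls:

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, quoting=csv.QUOTE_ALL, lineterminator="\n")
writer.writerow(['快乐[[朱槿品种]]', 'say "hi"'])  # illustrative values
print(buf.getvalue())  # "快乐[[朱槿品种]]","say ""hi"""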
Example #9
def main():
    with open("source/disambi.csv") as in_f_disambi, open("source/infobox.csv", "r") as in_f_infobox,\
            open('source/literal.csv') as in_f_literal, open("target/disambi_infobox.csv", "w") as out_f:
        literal_list = get_word_list(in_f_literal)
        disambi_reader = csv.reader(in_f_disambi)
        info_lines = in_f_infobox.readlines()
        list_attr = []
        list_value = []
        err_count = 0
        attr_counts = {}
        for (disambi, infobox) in tqdm(zip(disambi_reader, info_lines)):
            disambi = Clean.clean_word(disambi[0], clean_level='disambi')
            if '"' in disambi:
                disambi = disambi.replace('"', '""')
            if infobox != '{}':
                try:
                    # The infobox cell is double-encoded JSON; escape stray
                    # backslashes so the inner document parses.
                    info_dict = json.loads(json.loads(infobox).replace("\\", r"\\"))
                    clean_info_dict = {}
                    for attr in info_dict.keys():
                        clean_attr = Clean.clean_word(attr, clean_level='others')
                        if clean_attr not in clean_info_dict.keys():
                            clean_info_dict[clean_attr] = info_dict[attr]
                    for clean_attr in clean_info_dict.keys():
                        value = ','.join(clean_info_dict[clean_attr]) if clean_info_dict[clean_attr] != [] else None
                        if value:
                            value = value.replace('"', '""')
                            # Tally attribute frequencies for the reports below.
                            attr_counts[clean_attr] = attr_counts.setdefault(clean_attr, 0) + 1
                            list_attr.append(clean_attr)
                            list_value.append(value)
                            out_f.write("\"" + disambi + "\",\"" + clean_attr + "\",\"" + value + "\"\n")

                except Exception as e:
                    print(f'Error:{e},Disambi:{disambi},Infobox:{infobox}')
                    err_count += 1
        literal_list = [t.replace('\"','').replace("\\",'').replace('"','""') for t in literal_list]
        literal_list = list(set(literal_list))
        list_attr = list(set(list_attr))
        list_value = list(set(list_value))
        sort_counts = sorted(attr_counts.items(), key=lambda x: x[1], reverse=True)
        with open("target/sorted_all_attr.txt", "w") as ouf:
            for i in sort_counts:
                ouf.write(str(i) + "\n")
        with open("target/sorted_filerted_attr.txt", "w") as ouf:
            for word_counts in sort_counts:
                if word_counts[1] >= 10:
                    ouf.write(str(word_counts[0]) + "\n")
        with open("target/literal.csv", "w") as ouf:
            for i in literal_list:
                ouf.write("\"" + i + "\"\n")
        with open("target/attr.txt", "w") as ouf:
            for i in list_attr:
                ouf.write(i + "\n")
        with open("target/value.csv", "w") as ouf:
            for i in list_value:
                ouf.write("\"" + i + "\"\n")
            
        print("err_count: ", err_count)
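The `attr_counts.setdefault` tally plus the manual sort can also be expressed with `collections.Counter`; a sketch of the equivalent idiom with made-up attribute names:

from collections import Counter

attr_counts = Counter()
for attr in ["歌手", "专辑", "歌手"]:  # stand-in attribute stream
    attr_counts[attr] += 1

# most_common() returns pairs already sorted by descending frequency,
# matching sorted(attr_counts.items(), key=lambda x: x[1], reverse=True).
for attr, n in attr_counts.most_common():
    print(attr, n)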
Example #10
def get_word_list(in_f):
    reader = csv.reader(in_f)
    lines = [Clean.clean_word(line[0], clean_level='literal') for line in reader]
    # list.remove('') raises ValueError when no empty string is present,
    # and only removes one occurrence; filter instead.
    return [l for l in lines if l != '']
Example #11
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv", "r", encoding='utf-8') as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        if len(words) != 4:
            count += 1
            continue  # skip malformed rows instead of writing them out
        clean_disambi = Clean.clean_word(words[0], 'disambi')
        title_dict[clean_disambi] = words[1:]
    print("Error lines: ", count)
    with open("./410_baidu/410_disambi_new.csv", "w", encoding='utf-8') as ouf:
        for i in title_dict.keys():
            ouf.write("\"" + i + "\",\"" + "\",\"".join(title_dict[i]) +
                      "\r\n")
Example #12
'''
Clean the disambi names; leave everything else unchanged.
'''
import re
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_attrs.csv") as inf:
    title_dict = {}
    err_count = 0
    reader = csv.reader(inf)
    for line in tqdm(reader):
        curLink = line[-2]
        exterLink = line[-1]
        clean_disambi = Clean.clean_word(line[0], 'disambi')
        if '"' in clean_disambi:
            clean_disambi = clean_disambi.replace('"', '""')
        # The page for the quotation-mark character itself needs a
        # hand-written name; detect it by its URL.
        if curLink == 'http://www.baike.com/wiki/%22':
            clean_disambi = '""[标点符号]'
        if len(line) < 5:
            print(f'\n{line},{len(line)}')
            err_count += 1
            literal = '""'
            abstract = Clean.clean_word(line[1], 'others').strip()
        else:
            literal = Clean.clean_word(line[0], 'title')
            abstract = line[2] if len(line) == 5 else ''.join(line[2:-2])
            abstract = abstract.replace('编辑摘要 ', '').replace('"', "'").strip()
        title_dict[clean_disambi] = [literal, abstract, curLink, exterLink]
    print("Error count:%d" % (err_count))
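The `''.join(line[2:-2])` branch exists because an abstract containing commas arrives split across extra CSV fields; everything between the leading fields and the two trailing link columns is glued back together. A tiny illustration with invented field values:

line = ['名称', '字面量', '摘要前半', '摘要后半', 'http://cur', 'http://ext']
abstract = line[2] if len(line) == 5 else ''.join(line[2:-2])
print(abstract)  # 摘要前半摘要后半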
Example #13
#!/usr/bin/env python
# coding=utf-8

from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv", "r", encoding='utf-8') as inf:
    lines = inf.readlines()
    total_subject = []
    f = open("./410_baidu/disambi_subject.csv", "w", encoding='utf-8')
    for line in tqdm(lines):
        words = line.strip().split(",")
        disambi = Clean.clean_word(words[0], clean_level='disambi')
        subjects = words[1:]
        subjects = [
            Clean.clean_word(s, clean_level="subject") for s in subjects
        ]
        total_subject.extend(subjects)
        for subject in subjects:
            if subject == "":
                continue
            f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
    f.close()
    total_subject = list(set(total_subject))
    print("Total subjects: ", len(total_subject))
    with open("./410_baidu/all_subject.csv", "w", encoding='utf-8') as ouf:
        # The original snippet breaks off here; writing one quoted subject
        # per line follows the pattern used elsewhere in these scripts.
        for subject in total_subject:
            ouf.write("\"" + subject + "\"\r\n")
Example #14
#!/usr/bin/env python
# coding=utf-8
import re
from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi.csv", encoding='utf-8') as inf:
    title_dict = {}
    count = 0
    lines = inf.readlines()
    for line in tqdm(lines):
        words = line.strip().split("\",\"")
        if len(words) != 4:
            count += 1
            continue  # skip malformed rows instead of writing them out
        # Python 3: no decode/encode needed on str values.
        clean_disambi = Clean.clean_word(words[0], 'disambi')
        title_dict[clean_disambi] = words[1:]
    print("Error lines: ", count)
    with open("./410_baidu/410_disambi_new.csv", "w", encoding='utf-8') as ouf:
        for i in title_dict.keys():
            ouf.write("\"" + i + "\",\"" +
                      "\",\"".join(title_dict[i]) + "\r\n")
Example #15
def build_entity_relation(in_file,
                          train_file,
                          test_file,
                          rfile,
                          tfile=args.title_path):
    with open(in_file) as inf, open(train_file,
                                    "w") as trf, open(test_file, "w") as tef:
        disambi_dict, tt_pair_set = load_dict(rfile, regen=True)
        all_title = get_title(tfile)
        title_id = {k: n for n, k in enumerate(all_title)}
        disambi_id = {k: n for n, k in enumerate(disambi_dict.keys())}
        #        print("tt_pair_set: ", tt_pair_set)
        # Python 3: the old commands module is gone; use subprocess.
        total_lines = int(
            subprocess.getoutput("awk 'END{print NR}' %s" % (in_file)))
        error_counts = 0
        RESID = 0
        RES_list = []
        count_re = 0
        count_na = 0
        total_sentence_used = 0
        re_set = set()
        # each line here is all_text for lemma
        #        re_in_lemma = 0
        for line_num in tqdm(range(total_lines)):
            re_in_lemma = 0
            if count_na + count_re > args.max_sentence:
                #                re_in_lemma = 0
                continue
            all_info = inf.readline().strip()
            title_disambi_text = all_info.split(",")
            if len(title_disambi_text) != 3:
                error_counts += 1
                continue
            text_title = Clean.clean_word(title_disambi_text[0],
                                          clean_level="title")
            text_disambi = Clean.clean_word(title_disambi_text[1],
                                            clean_level="disambi")
            all_text = title_disambi_text[2].replace("\"", "")
            try:
                relation_tuple = disambi_dict[text_disambi]
            except KeyError:
                continue
            if relation_tuple == "":
                error_counts += 1
                continue
            lines = re.split(u"[。?;!!?]", all_text)
            #            relation_in_sentence = 0
            for r_tuple in relation_tuple:
                if un_ner(text_title) or un_ner(r_tuple[1]):
                    continue
                sentence_used = False
                for line in lines:
                    RES_sentence_dict = {}
                    RES_head_dict = {}
                    RES_tail_dict = {}
                    line = line.strip().replace(" ", "") + u"。"
                    seg_line = " ".join(jieba.cut(line))
                    # str.find replaces the removed Python 2 string.find helper.
                    pos1 = seg_line.find(text_title)
                    pos2 = seg_line.find(r_tuple[1])
                    if pos1 != -1 and pos2 != -1 and pos1 != pos2:
                        if count_re > args.max_sentence * 0.25:
                            continue
                        if r_tuple[0] != "":
                            count_re += 1
                            re_in_lemma += 1
                            sentence_used = True
                            RES_head_dict['word'] = text_title
                            RES_head_dict['id'] = str(disambi_id[text_disambi])
                            RES_head_dict['type'] = "None"
                            RES_tail_dict['word'] = r_tuple[1]
                            RES_tail_dict['id'] = str(title_id[r_tuple[1]])
                            RES_tail_dict['type'] = "None"
                            RES_sentence_dict['sentence'] = seg_line
                            RES_sentence_dict['head'] = RES_head_dict
                            RES_sentence_dict['tail'] = RES_tail_dict
                            RES_sentence_dict['relation'] = r_tuple[0]
                            RES_list.append(RES_sentence_dict)
                            RESID += 1
                            re_set.add(r_tuple[0])
            for line in lines:
                if count_na > args.max_sentence * 0.75 or re_in_lemma > args.max_NA_in_lemmas:
                    break
                sentence_used = False
                relation_in_sentence = 0
                line = line.strip().replace(" ", "") + u"。"
                seg_line = " ".join(jieba.cut(line))
                words = seg_line.split()
                words = [
                    Clean.clean_word(word, clean_level="others")
                    for word in words
                ]
                enti_pos = []
                for i in range(len(words)):
                    if words[i] != u'' and words[
                            i] in all_title and not un_ner(words[i]):
                        enti_pos.append(i)
                enti_pair_pos = []
                for i in enti_pos:
                    for j in enti_pos:
                        if i != j and not (
                            (words[i] + "#" + words[j]) in tt_pair_set):
                            enti_pair_pos.append((words[i], words[j]))
                if enti_pair_pos == []:
                    continue
                for enti_pair in enti_pair_pos:
                    if relation_in_sentence > args.max_relation_in_sentence or re_in_lemma > args.max_NA_in_lemmas:
                        break
                    count_na += 1
                    relation_in_sentence += 1
                    re_in_lemma += 1
                    sentence_used = True
                    # Fresh dicts per pair: reusing one dict across pairs
                    # would make every appended record alias the last pair.
                    RES_sentence_dict = {}
                    RES_head_dict = {}
                    RES_tail_dict = {}
                    RES_head_dict['word'] = enti_pair[0]
                    RES_head_dict['id'] = str(title_id[enti_pair[0]])
                    RES_head_dict['type'] = "None"
                    RES_tail_dict['word'] = enti_pair[1]
                    RES_tail_dict['id'] = str(title_id[enti_pair[1]])
                    RES_tail_dict['type'] = "None"
                    RES_sentence_dict['sentence'] = seg_line
                    RES_sentence_dict['head'] = RES_head_dict
                    RES_sentence_dict['tail'] = RES_tail_dict
                    RES_sentence_dict['relation'] = "NA"
                    RES_list.append(RES_sentence_dict)
                    RESID += 1
                if sentence_used:
                    total_sentence_used += 1
            print("Total Relation: ", count_re + count_na, "\t Current RE: ", count_re, "\t Current NA: ", count_na)
        with open("rel2id.json", "w") as rf:
            relation_id = {k: v + 1 for v, k in enumerate(re_set)}
            relation_id['NA'] = 0
            rf.write(json.dumps(relation_id))
        print("count_re: ", count_re, "\t count_na: ", count_na, "\t count_total: ", count_re + count_na)
        print("total_sentence_used: ", total_sentence_used)
        total_len = count_re + count_na if count_re + count_na < args.max_sentence else args.max_sentence
        train_list = RES_list[:int(0.8 * total_len)]
        # No +1 here: the old slice silently dropped one record at the split.
        test_list = RES_list[int(0.8 * total_len):]
        json.dump(train_list, trf)
        json.dump(test_list, tef)
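The shell-out to awk for the line count works, but a pure-Python sketch avoids the subprocess dependency entirely:

def count_lines(path):
    # Equivalent to: awk 'END{print NR}' path
    with open(path, "rb") as f:
        return sum(1 for _ in f)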
Example #16
from clean import Clean
from tqdm import tqdm
import csv

with open("source/disambi_topic.csv") as in_f, open("target/disambi_topic.csv",
                                                    "w") as out_f:
    reader = csv.reader(in_f)
    total_topic = []
    for line in tqdm(reader):
        disambi = line[0]
        topics = []
        for i in line[1].split(','):
            topics.extend(i.split())
        disambi = Clean.clean_word(disambi, clean_level='disambi')
        # Normalize disambi once, outside the topic loop; doing the quote
        # doubling per topic would re-escape the same quotes repeatedly.
        if '[朱槿品种]' in disambi:
            disambi = '快乐[[朱槿品种]]'
        if '"' in disambi:
            disambi = disambi.replace('"', '""')
        topics = [Clean.clean_word(s, clean_level="topic") for s in topics]
        total_topic.extend(topics)
        for topic in topics:
            if topic == "":
                continue
            if '"' in topic:
                topic = topic.replace('"', '""')
            out_f.write("\"" + disambi + "\",\"" + topic + "\"\n")
    total_topic = list(set(total_topic))
    print("Total topics:%d " % (len(total_topic)))
Example #17
#!/usr/bin/env python
# coding=utf-8

from clean import Clean
from tqdm import tqdm

with open("./410_baidu/410_disambi_subject.csv", encoding='utf-8') as inf:
    lines = inf.readlines()
    total_subject = []
    f = open("./410_baidu/disambi_subject.csv", "w", encoding='utf-8')
    for line in tqdm(lines):
        words = line.strip().split(",")
        # Python 3 strings are already Unicode; no decode/encode needed.
        disambi = Clean.clean_word(words[0], clean_level='disambi')
        subjects = [
            Clean.clean_word(s, clean_level="subject")
            for s in words[1:]
        ]
        total_subject.extend(subjects)
        for subject in subjects:
            if subject == "":
                continue
            f.write("\"" + disambi + "\",\"" + subject + "\"\r\n")
    f.close()
Example #18
import json
import re

from clean import Clean
from tqdm import tqdm


def get_word_list(filename):
    with open(filename, "r", encoding='utf-8') as inf:
        lines = inf.readlines()
        #        print "type line: ", type(lines[0].encode("utf-8"))
        lines = [Clean.clean_word(line, clean_level='title') for line in lines]
        return lines


# Quick module-level smoke test of the cleaner on a noisy string.
print(Clean.clean_word(u"\"你好   呀#\"$%^&*@!,。、;:‘’】季    候【"))


def main():
    with open("./410_baidu/410_disambi_infobox.csv", 'r',
              encoding='UTF-8') as inf:
        lines = inf.readlines()
        f = open("./410_baidu/410_disambi_infobox_out.csv",
                 "w",
                 encoding='utf-8')
        list_attr = []
        title_list = get_word_list("./410_baidu/410_title.csv")
        err_count = 0
        counts = {}
        for line in tqdm(lines):
            words = line.strip().split(",")