Example #1
def run_text(
        path="/mnt/data/dev/github/数据处理工具/tool_data_processing/data/text"):
    # Assumes tkitText, tfile, jiagu and the helpers ner(), get_w_v() and one()
    # are imported or defined elsewhere in the module.
    tt = tkitText.Text()
    for f in tfile.file_List(path):
        t = tfile.open_file(f)
        for s in tt.sentence_segmentation_v1(t):
            print("#################标记数据######")
            # Skip overly long sentences
            if len(s) > 50:
                continue
            ner_list, vs = get_w_v(s)
            ner_list = ner(s) + ner_list
            print("实体", ner_list)

            # Extract triples with both the HT extractor and jiagu, then merge
            ht_kg = tt.ht.triple_extraction(sent=s)
            jiagu_kg = jiagu.knowledge(s)
            all_kg = ht_kg + jiagu_kg
            end_kg = []

            # Keep a triple only if its subject is a recognized entity
            # and its predicate is one of the extracted verbs
            for k in all_kg:
                if k in end_kg:
                    continue
                if k[0] in ner_list and k[1] in vs:
                    new = {'sentence': s, 'kg': k}
                    one(new)
                    end_kg.append(k)
Example #2
    def auto_mark_wiki(self, text):
        """
        Automatically annotate wiki sentences.
        """
        kg = Kg()
        data = []
        self.kg_tmp = []
        self.tdb.load("mark")
        word = self.keyword
        tt = tkit.Text()
        knowledge = jiagu.knowledge(text)
        if len(knowledge) > 0:
            # Key the record on an md5 of the text plus the flattened triples
            key = tt.md5(text + "".join("".join(k) for k in knowledge))
            one_item = {"text": text, "data": knowledge}
            print("jiagu成功匹配知识", one_item)
            self.tdb.put(key, one_item)
            self.i = self.i + 1
        kg_one = kg.get(word)
        if kg_one is None or text.find(word) < 0:
            return data
        print("句子:", text)
        print("发现word", word)
        for b in kg_one.keys():
            one_p = [word]
            # Note: find() > 0 skips matches at the very start of the sentence
            if text.find(b) > 0 or b in ["是"]:
                one_p.append(b)
                if isinstance(kg_one.get(b), dict):
                    for c in kg_one.get(b).keys():
                        if text.find(c) > 0:
                            one_p.append(c)
                        else:
                            one_p = []
                        if len(one_p) == 3:
                            # Found one triple that can be annotated
                            data.append(one_p)
            self.kg_tmp.append((word, kg_one))
        return data
Example #3
def run_mark():
    tt = tkitText.Text()
    tt.load_ht()
    i = 0
    for key, item in kg.get_unmarked():
        print("#################标记数据######")
        print(item)
        s = item['sentence']
        print(key)
        # Skip overly long sentences
        if len(s) > 50:
            continue
        # Merge entities from both NER backends
        ner_list = ner(s)
        ner_s = tt.named_entity_recognition(s)
        print("提取实体")
        for ent in ner_s:
            ner_list.append(ent)

        # vs is left empty here, so the merged-triple filter below never fires;
        # only the pre-labelled item['kg'] check at the end can mark data.
        vs = []
        print('预测成功', i)
        print("句子", s)
        print("实体", ner_list)

        # Extract triples with both the HT extractor and jiagu, then merge
        ht_kg = tt.ht.triple_extraction(sent=s)
        jiagu_kg = jiagu.knowledge(s)
        all_kg = ht_kg + jiagu_kg
        end_kg = []

        for k in all_kg:
            if k in end_kg:
                continue
            if k[0] in ner_list and k[1] in vs:
                new = {'sentence': s, 'kg': k}
                one(new)
                end_kg.append(k)

        if item['kg'][0] in ner_list:
            one(item)
Example #4
def read_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        all_data = [line.strip('\n') for line in f]
    result = []
    for data in all_data:
        # Collect the non-'O' NER tags for each line
        one = []
        for tag in jiagu.ner(data):
            if tag != 'O':
                one.append(tag)
        if one:
            result.append(one)

    text = '''
4. 香农的信息定义
假定事物状态可以用一个以经典集合论为基础的概率模型来描述,则信息就是用来消除不确定性的东西,或信息是事物运动状态或存在方式的不确定性描述。
但在实际中要寻找一个合适的概率模型往往是非常困难的,有时是否存在这样一种模型还值得探讨。此外,信息有很强的主观性和实用性,但该定义没有考虑信息接收者的主观特性和主观意义,不顾信息的具体含义、具体用途、重要程度和可能引起的后果等因素,这就与实际情况不完全一致。
'''
    knowledge = jiagu.knowledge(text)  # knowledge (triple) extraction
    print(knowledge)
Example #5
    def nlp_jiagu(self, btn):

        text = str(self.lbl.text.strip()).replace("\n", "")
        if text[-2:] == "qg":
            # "qg" suffix: sentiment analysis
            sentiment = jiagu.sentiment(text[:-2])
            self.lbl.text = pprint.pformat(sentiment)
        elif text[-2:] == "cq":
            # "cq" suffix: keyword extraction
            keywords = jiagu.keywords(text, 5)
            self.lbl.text = pprint.pformat(keywords)
        elif text[-2:] == "jl":
            # "jl" suffix: text clustering; split on an ASCII comma if present,
            # otherwise on a full-width comma
            if "," in self.lbl.text:
                docs = self.lbl.text.split(",")
            else:
                docs = self.lbl.text.split(",")
            cluster = jiagu.text_cluster(docs)
            self.lbl.text = pprint.pformat(cluster)
        else:
            # Default: knowledge (triple) extraction
            knowledge = jiagu.knowledge(text)
            self.lbl.text = pprint.pformat(knowledge)
Example #6
import jiagu

text = '''
“MODIS的数据让我们能在非常小的尺度上理解这一现象,我们发现人类活动也作出了贡献。”
NASA文章介绍,在中国为全球绿化进程做出的贡献中,有42%来源于植树造林工程,对于减少土壤侵蚀、空气污染与气候变化发挥了作用。
据观察者网过往报道,2017年我国全国共完成造林736.2万公顷、森林抚育830.2万公顷。其中,天然林资源保护工程完成造林26万公顷,退耕还林工程完成造林91.2万公顷。京津风沙源治理工程完成造林18.5万公顷。三北及长江流域等重点防护林体系工程完成造林99.1万公顷。完成国家储备林建设任务68万公顷。
'''

keywords = jiagu.keywords(text, 5)  # keyword extraction
print(keywords)

summarize = jiagu.summarize(text, 3)  # text summarization
print(summarize)

# jiagu.findword('input.txt', 'output.txt')  # new-word discovery from a large corpus using information entropy

# Knowledge-graph relation (triple) extraction
text = '姚明1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
knowledge = jiagu.knowledge(text)
print(knowledge)

# Sentiment analysis
text = '很讨厌还是个懒鬼'
sentiment = jiagu.sentiment(text)
print(sentiment)

# Text clustering (parameters may need tuning)
docs = [
    "百度深度学习中文情感分析工具Senta试用及在线测试", "情感分析是自然语言处理里面一个热门话题",
    "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总", "深度学习实践:从零开始做电影评论文本情感分析",
    "BERT相关论文、文章和代码资源汇总", "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
    "自然语言处理工具包spaCy介绍", "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
]
cluster = jiagu.text_cluster(docs)
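
As the indexing in the other examples here implies, jiagu.knowledge() returns a list of triples. The following is a minimal sketch of unpacking them, assuming each item has exactly three elements; the sentence is the one from the snippet above, and the loop and print format are illustrative only.

import jiagu

text = '姚明1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'

# Each extracted item is assumed to be a (subject, predicate, object) triple
for subject, predicate, obj in jiagu.knowledge(text):
    print(subject, predicate, obj)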
Example #7
import csv
import os

import jiagu

# `dir` is assumed to be defined earlier as the directory containing the news files.
filelist = os.listdir(dir)

tot = 0
entities = []
tpcsv = open("news_tp/relationship.csv", "a+", newline='', encoding='utf-8')
encsv = open('news_tp/entities.csv', "a+", newline='', encoding='utf-8')
csvwriter = csv.writer(tpcsv)
tot_news = 0
for file in filelist:
    path = os.path.join(dir, file)

    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

        # One news item per line: extract triples and write them to the relationship CSV
        for l in lines:
            tot_news += 1
            knowledge = jiagu.knowledge(l)
            tot += len(knowledge)
            for tp in knowledge:
                csvwriter.writerow(tp)
                entities.append(tp[0])
    # Deduplicate the collected subjects
    entities = list(set(entities))

# Write the unique entities to their own CSV
csvwriter = csv.writer(encsv)
for en in entities:
    csvwriter.writerow([en])
print(len(entities))
print(tot)
print(tot_news)
tpcsv.close()
encsv.close()
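
If the script above has already produced news_tp/relationship.csv, the triples can be read back for further analysis. A minimal sketch, counting relations per subject; only the file name and row layout come from the script above, the counting logic is illustrative.

import csv
from collections import Counter

relation_counts = Counter()

# relationship.csv is written by the script above with one triple per row
with open("news_tp/relationship.csv", newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        if len(row) == 3:
            subject, predicate, obj = row
            relation_counts[subject] += 1

# Print the ten subjects with the most extracted relations
for entity, count in relation_counts.most_common(10):
    print(entity, count)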