sys.path.append("..")  # 先跳出当前目录
from core.nlp import NLP
from core.extractor import Extractor

if __name__ == '__main__':
    input_path = '../../data/input_text.txt'  # input text file
    output_path = '../../data/knowledge_triple.json'  # output JSON file with the results
    if os.path.isfile(output_path):
        os.remove(output_path)
    # os.mkdir(output_path)

    print('Start extracting...')

    # Instantiate NLP (segmentation, POS tagging, NER, dependency parsing)
    nlp = NLP()
    num = 1  # counter for knowledge triples

    with open(input_path, 'r', encoding='utf-8') as f_in:
        # Split into sentences, producing a sentence list
        origin_sentences = re.split('[。?!;]|\n', f_in.read())
        # Iterate over the sentences of each document
        for origin_sentence in origin_sentences:
            # Skip original sentences shorter than 6 characters
            if len(origin_sentence) < 6:
                continue
            print('*****')
            # print(origin_sentence)
            # Word segmentation
            lemmas = nlp.segment(origin_sentence)
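            # The fragment is cut off here. Since Extractor is imported above,
            # a plausible continuation (the method names below are assumptions,
            # not confirmed by this fragment) might be:
            #   words_postag = nlp.postag(lemmas)
            #   sentence = nlp.parse(words_postag)
            #   extractor = Extractor()
            #   num = extractor.extract(origin_sentence, sentence, output_path, num)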

import json

from py2neo import Graph, Node, Relationship


class WriteToNeo4j:
    """Import JSON knowledge triples into a Neo4j database."""

    def __init__(self, triple_path):
        self.entity_set = set()  # set of entity nodes
        self.nlp = NLP()
        # Connect to the Neo4j database (credentials masked in the source)
        self.graph = Graph(host='localhost',
                           http_port=7474,
                           user='******',
                           password='******')
        # Read the whole JSON file of triples
        with open(triple_path, 'r', encoding='utf-8') as f_in:
            self.triple = json.load(f_in)
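
    # Note: Graph(host=..., http_port=...) is the py2neo v3 connection
    # signature. On py2neo v4+ the equivalent would be, for example
    # (URI and credentials here are placeholders, not from this repo):
    #   graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))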

    def write_litigant(self, litigants, relation):
        """Process litigant information (plaintiffs and defendants).
        Args:
            litigants: list, litigant information
        """
        for litigant in litigants:
            node_litigant = Node(self.get_label(litigant['名字']),
                                 name=litigant['名字'],
                                 id=litigant['编号'])
            self.graph.create(node_litigant)
            self.entity_set.add(litigant['名字'])
            node_root = self.graph.find_one('判决书',
                                            property_key='name',
                                            property_value='判决书001')
            entity_relation = Relationship(node_root,
                                           relation,
                                           node_litigant,
                                           label='关系')
            self.graph.create(entity_relation)

            for item in litigant:
                if item != '名字' and item != '编号':
                    # Representative, authorized agent, etc.
                    node_repr = Node(self.get_label(litigant[item]),
                                     name=litigant[item])
                    self.graph.create(node_repr)
                    self.entity_set.add(litigant[item])
                    entity_relation = Relationship(node_litigant,
                                                   item,
                                                   node_repr,
                                                   label='关系')
                    self.graph.create(entity_relation)
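
    # For reference, each litigant dict is expected to look roughly like this
    # (shape inferred from the key accesses above; the values are invented):
    #   {'名字': '某某公司', '编号': 'P001', '委托代理人': '某某'}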

    def get_label(self, word):
        """Get a type label for a word.
        Args:
            word: str, the word
        Returns:
            label: str, type label
        """
        postag = self.nlp.get_postag(word)
        if postag == 'nh':
            label = '人'
        elif postag == 'ni':
            label = '组织'
        elif postag == 'ns':
            label = '地点'
        else:
            label = '其他'
        return label
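
    # The tags checked above match the LTP POS tag set ('nh' = person name,
    # 'ni' = organization name, 'ns' = geographic name), assuming this repo's
    # NLP.get_postag wraps LTP; e.g. get_label('上海') would return '地点'.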

    def write(self):
        """Write to the graph database."""
        # Root node.
        # A judgment document has the attributes "文书编号" (document ID),
        # "文书标题" (title), "文书类型" (document type), and "案件编号" (case ID).
        node_root = Node('判决书',
                         name='判决书001',
                         id=self.triple['文书编号'],
                         title=self.triple['文书标题'],
                         type=self.triple['文书类型'],
                         case=self.triple['案件编号'])
        self.graph.create(node_root)
        self.entity_set.add('判决书001')
        node_court = Node('组织', name=self.triple['受理法院'])
        self.graph.create(node_court)
        self.entity_set.add(self.triple['受理法院'])

        entity_relation = Relationship(node_root,
                                       '受理法院',
                                       node_court,
                                       label='关系')
        self.graph.create(entity_relation)
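
        # For reference, the two create() calls above correspond roughly to
        # this Cypher (py2neo generates the actual statements; properties
        # shortened with '...'):
        #   CREATE (root:判决书 {name: '判决书001', ...})
        #   CREATE (court:组织 {name: ...})
        #   CREATE (root)-[:受理法院 {label: '关系'}]->(court)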

        # Iterate over plaintiffs and defendants
        plaintiffs = self.triple['原告']
        self.write_litigant(plaintiffs, '原告')
        defendants = self.triple['被告']
        self.write_litigant(defendants, '被告')

        facts = self.triple['案情事实']
        for fact in facts:
            # Each fact carries a (subject, relation, object) triple
            tri = fact['知识']
            entity1 = tri[0]
            relation = tri[1]
            entity2 = tri[2]

            # Create each entity node only once; if the entity has been seen
            # before, look up the existing node instead
            node_list = []
            node1 = Node(self.get_label(entity1), name=entity1)
            if entity1 not in self.entity_set:
                self.graph.create(node1)
                node_list.append(node1)
                self.entity_set.add(entity1)
            else:
                node_list.append(
                    self.graph.find_one(self.get_label(entity1),
                                        property_key='name',
                                        property_value=entity1))

            node2 = Node(self.get_label(entity2), name=entity2)
            if entity2 not in self.entity_set:
                self.graph.create(node2)
                node_list.append(node2)
                self.entity_set.add(entity2)
            else:
                node_list.append(
                    self.graph.find_one(self.get_label(entity2),
                                        property_key='name',
                                        property_value=entity2))

            entity_relation = Relationship(node_list[0],
                                           relation,
                                           node_list[1],
                                           label='关系')
            self.graph.create(entity_relation)
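
# Minimal usage sketch for the class above (the path is an example; it must
# point at an existing knowledge-triple JSON file):
#   writer = WriteToNeo4j('../../data/knowledge_triple.json')
#   writer.write()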
Example #4
# filter_str('acbdDEF哈哈哈🍕')
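
# filter_str is not defined in this fragment. Judging from the commented test
# call above, a minimal sketch that keeps CJK characters, ASCII letters, and
# digits while dropping everything else (an assumption, not the original
# implementation) could be:
def filter_str(sentence):
    """Keep Chinese characters, ASCII letters, and digits; drop the rest."""
    return re.sub(r'[^\u4e00-\u9fa5A-Za-z0-9]', '', sentence)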

if __name__ == '__main__':
    # input_path = '../../data/input_text.txt'  # input text file
    input_path = '../../data/第三次减半后内容.txt'  # input text file
    output_path = '../../data/knowledge_triple6.json'  # output JSON file with the results

    if os.path.isfile(output_path):  # delete the file at the given path
        os.remove(output_path)
    # os.mkdir(output_path)

    print('Start extracting...')

    # Instantiate NLP (segmentation, POS tagging, NER, dependency parsing)
    nlp = NLP()
    num = 1  # counter for knowledge triples
    with open(input_path, 'r', encoding='utf-8') as f_in:
        # Split into sentences, producing a sentence list
        origin_sentences = re.split('[。?!;]|\n', f_in.read())
        # Iterate over the sentences of each document
        for origin_sentence in origin_sentences:
            # Skip original sentences shorter than 2 characters
            if len(origin_sentence) < 2:
                continue
            # print('原始句子:', origin_sentence)
            origin_sentence = filter_str(origin_sentence)
            # print('*****')
            # print('处理句子:', origin_sentence)
            # print('type:', type(origin_sentence))
            # Word segmentation with the jieba tokenizer
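            # (The fragment ends here; with jieba, the segmentation step would
            # plausibly be `lemmas = jieba.lcut(origin_sentence)` -- an
            # assumption, since the earlier fragment used nlp.segment instead.)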