Ejemplo n.º 1
0
# encoding:utf-8

# @Author: Rilzob
# @Time: 2019/3/31 下午10:16

from Spider.HtmlParser import HtmlParser
from Spider.DataOutput import DataOutput
from EventTriplesExtraction.triple_extraction import TripleExtractor

if __name__ == "__main__":
    # Example entry URL kept for reference:
    # url = 'https://baike.baidu.com/item/%E4%B8%9C%E5%8C%97%E5%A4%A7%E5%AD%A6/18014'

    # Parse the school information (the older entry point was
    # HtmlParser().main_parse()) and dump the raw result first.
    parser = HtmlParser()
    whole_info = parser.new_main_parse()
    DataOutput().output('output2.json', whole_info)

    # Turn every school's 'school_introduce' text into triples and store them
    # on the same record under 'school_introduce_triple'.
    extractor = TripleExtractor()
    for school_record in whole_info.values():
        school_record['school_introduce_triple'] = extractor.triples_main(
            school_record['school_introduce'])

    # Write the enriched records out.
    DataOutput().output('data.json', whole_info)
Ejemplo n.º 2
0
from database_interface import neo4j_select_triple_table
from EventTriplesExtraction.triple_extraction import TripleExtractor

# Print the entity annotation of the subject and object of every triple
# selected from the 'triple_domestic' table.
extractor = TripleExtractor()
attribute_columns = ('triple_subject', 'triple_object')  # columns to select
data = neo4j_select_triple_table('triple_domestic', attribute_columns)

# The labels are runtime output: '主语' = subject, '宾语' = object.
for record in data:
    for label, position in (('主语:', 0), ('宾语:', 1)):
        text = record[position]
        print(label, text, extractor.entity_annotation(text))
Ejemplo n.º 3
0
#!/usr/bin/env python 
# -*- coding:utf-8 -*-
from database_interface import test_retrieve
from EventTriplesExtraction.triple_extraction import TripleExtractor

# Fetch a few (news_title, news_summary) rows from sea_news_domestic.
sql = "select news_title,news_summary from sea_news_domestic limit 5  "
data = test_retrieve(sql)
extractor = TripleExtractor()

# Hand-written sample text run through the extractor before the fetched rows.
pig_text = '''
    猪爷爷的年龄是15岁。猪奶奶的年龄是14岁。猪爸爸的年龄是10岁。猪妈妈的年龄是9岁。乔治的年龄是2岁。佩奇的年龄是1岁。
    猪爸爸和猪妈妈和乔治是家人。猪爷爷和猪奶奶的关系是夫妻。乔治和佩奇是姐弟。猪爷爷和猪爸爸是父子。猪爸爸和佩奇是父子。
    '''
pig_svo_list = extractor.triples_main(pig_text)
for triple in pig_svo_list:
    print(triple)

# For each fetched row, extract triples from the title and the summary, then
# print each source text followed by what was extracted from it.
for record in data:
    title = record[0]
    svo_title = extractor.triples_main(title)
    summary = record[1]
    svo_summary = extractor.triples_main(summary)
    print('{}:\n'.format(title), svo_title)
    print('{}:\n'.format(summary), svo_summary)
Ejemplo n.º 4
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from database_interface import test_retrieve_neo4j
from EventTriplesExtraction.triple_extraction import TripleExtractor
from neo4j_interface import punctuation_remove

def _show_cleaned(raw, cleaned):
    """Print the cleaned text next to its raw form, or flag an empty result."""
    if cleaned:
        print('{}:'.format(raw), cleaned)
    else:
        # '空白' = blank: nothing was left after punctuation removal.
        print('空白:', raw)


# Select the subject/object columns of every row in triple_culture.
sql = "select triple_subject,triple_object from triple_culture   "
data = test_retrieve_neo4j(sql)
extractor = TripleExtractor()

# Two probe strings: one with an embedded single quote, one with a double quote.
sting = "fdsaf'sdf"
sting2 = 'fdsaf"sdf'

# NOTE(review): the first probe goes through the neo4j_interface function and
# the second through the extractor method -- confirm the difference is intended.
svo_subject = punctuation_remove(sting)
_show_cleaned(sting, svo_subject)

svo_subject = extractor.punctuation_remove(sting2)
_show_cleaned(sting2, svo_subject)
for record in data:
    svo_subject = extractor.punctuation_remove(record[0])
    if not svo_subject:
        print('空白:', record[0])
    else:
        print('{}:'.format(record[0]), svo_subject)
    svo_object = extractor.punctuation_remove(record[1])
    if not svo_object:
        print('空白:', record[1])
    else:
Ejemplo n.º 5
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from database_interface import select_triple_table, creat_triple_table, creat_table_neo4j, record_had_extracting, table_not_exists, drop_table_neo4j
from EventTriplesExtraction.triple_extraction import TripleExtractor
from neo4j_interface import duplicate_removal_svo, punctuation_remove

#MATCH (n {name:"杰克"}) set n:animal return n
#match (n) detach delete n

# Re-create one "triple_<category>_test" table per news category listed below.
extractor = TripleExtractor()
# Category suffixes; each is formatted into "sea_news_<category>" (source
# table name) and "triple_<category>_test" (triple table name) in the loop.
table_list = [
    'domestic', 'culture', 'cbhg', 'economics', 'edu', 'international', 'mil',
    'tech', 'trave'
]
# Single-category override, disabled:
#table_list=['trave']

# Truncating a table, disabled:
#truncate_table_neo4j("triple_{}".format('domestic'))

# Table creation
for table in table_list:  # original note: read the table and extract triples
    #truncate_table_neo4j("triple_{}".format(table))
    #sql = "select id,news_title,news_summary from sea_news_{}".format(table)
    # NOTE(review): attribute_columns and table_name are not used in the
    # visible part of this loop -- presumably consumed further down; confirm.
    attribute_columns = ('id', 'news_title', 'news_summary')  # attribute columns
    table_name = "sea_news_{}".format(table)
    table_name_triple = "triple_{}_test".format(table)
    # Double negative: the drop runs when table_not_exists() is falsy,
    # presumably meaning the table already exists -- confirm against
    # database_interface.
    if not table_not_exists(table_name_triple):
        drop_table_neo4j(table_name_triple)
        # Message text: "dropped table {} successfully".
        print('删除{}表成功'.format(table_name_triple))
    creat_table_neo4j(table_name_triple)
    # Message text: "created table {} successfully".
    print('创建{}表成功'.format(table_name_triple))