def service(i_json):
    o_json = []

    # --
    html_table = i_json['html_table']

    # -- table parsing & exclusion
    import class_table_processing
    TP = class_table_processing.table_processing()
    table = TP.parse(html_table)
    if TP.is_rejected(table):
        return []
    print('table', table)

    # -- entity norm
    import class_entity_norm
    EN = class_entity_norm.entity_norm()

    # --
    cell_list = []
    table_head, table_body = table
    for row in table_body:
        for cell in row:
            cell_list.append(cell)

    # --
    norm_dict, norm_list = EN.norm_entity(cell_list)
    print('norm_dict', norm_dict)

    # -- entity linking
    import class_entity_linking
    EL = class_entity_linking.entity_linking()

    # --
    link_dict = EL.link_entity(norm_list, KB)
    print('link_dict', link_dict)

    # -- table to text
    import class_table_to_text
    TT = class_table_to_text.table_to_text()

    # --
    text_list = []
    text_list += TT.table_to_text(table, norm_dict, link_dict, KB)
    print('text_list', text_list)

    # -- predicate linking
    import class_utility
    utility = class_utility.utility()
    L2K_input = []
    for e1, e2, p0, e1_type, e2_type, p, text in text_list:
        sentence = text.replace(utility.uri2name(e1), ' << {0} >> '.format(utility.uri2name(e1)))
        sentence = sentence.replace(utility.uri2name(e2), ' << {0} >> '.format(utility.uri2name(e2)))
        L2K_input.append(sentence)

    # --
    import json
    L2K_response = POST_request('http://qamel.kaist.ac.kr:60005/service', json.dumps(L2K_input))
    L2K_output = []
    for four_tuple in json.loads(L2K_response):
        L2K_output.append(tuple(four_tuple))
    L2K_output = list(set(L2K_output))
    print('L2K_output', L2K_output)

    # -- post-processing
    postprocessed_output = []
    for s, p, o, c in L2K_output:
        entity_set = set([])
        for surface in link_dict.keys():
            if link_dict[surface] != None:
                if not utility.is_literal(link_dict[surface]):
                    entity_set.add(link_dict[surface])

        # --
        if utility.is_literal(s):
            KB_s = s
        else:
            KB_s = utility.name2uri(s, entity_set)

        # --
        if utility.is_literal(o):
            KB_o = o
        else:
            KB_o = utility.name2uri(o, entity_set)

        # --
        KB_p = list(KB.p_name2uri[utility.uri2name(p)])[0]

        # --
        import re
        KB_p = re.sub(r'^dbo:', 'http://dbpedia.org/ontology/', KB_p)
        KB_p = re.sub(r'^dbp:', 'http://dbpedia.org/property/', KB_p)
        KB_p = re.sub(r'^sport:', 'http://www.bbc.co.uk/ontologies/sport/', KB_p)

        # --
        postprocessed_output.append([KB_s, KB_p, KB_o, c])
    print('postprocessed_output', postprocessed_output)

    # -- domain/range filtering
    for KB_s, KB_p, KB_o, c in postprocessed_output:
        if KB.check_domain_range(KB_s, KB_p, KB_o):
            o_json.append([KB_s, KB_p, KB_o, c])

    # --
    return o_json
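# NOTE (assumption): POST_request() is called above but is not defined in this file; it is
# assumed to be a small HTTP helper provided elsewhere in the project. A minimal sketch
# built only on the standard library, sufficient for the JSON string payloads used here,
# could look like the following.
import urllib.request

def POST_request(url, payload):
    # POST the given string as a JSON body and return the decoded response text
    request = urllib.request.Request(url,
                                     data=payload.encode('utf-8'),
                                     headers={'Content-Type': 'application/json'})
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')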
def main():
    # --
    # entity detection
    # --
    try:
        with open('data/surface-dict.json') as i_file:
            import json
            surface_dict = json.loads(i_file.read())
    except FileNotFoundError:
        try:
            with open('data/NLP/NLP-news-list-pyeongchang-olympic.json') as i_file:
                import json
                parsing = json.loads(i_file.read())
        except FileNotFoundError:
            with open('data/news/news-list-pyeongchang-olympic.json') as i_file:
                import json
                news_list = json.loads(i_file.read())

            # --
            import class_NLP
            NLP = class_NLP.NLP()

            # --
            parsing = {}
            P = 0
            for url, title, date, content, provider in news_list:
                P += 1
                if P % 100 == 0:
                    print('parsing', P, len(news_list))

                # --
                parsing[url] = NLP.parse(content)

            # --
            with open('data/NLP/NLP-news-list-pyeongchang-olympic.json', 'w+') as o_file:
                import json
                o_file.write(json.dumps(parsing, ensure_ascii=False))

        # --
        import class_entity_detection
        ED = class_entity_detection.entity_detection()

        # --
        surface_dict = {}
        P = 0
        for url in parsing.keys():
            P += 1
            if P % 1000 == 0:
                print('entity detection', P, len(parsing.keys()))

            # --
            surface_dict[url] = ED.detect_entity(parsing[url])

        # --
        with open('data/surface-dict.json', 'w+') as o_file:
            import json
            o_file.write(json.dumps(surface_dict, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))
    print('entity detected')

    # --
    url_set = set([])
    sentence_set = set([])
    for url in surface_dict.keys():
        url_set.add(url)
        for sentence in surface_dict[url].keys():
            sentence_set.add(sentence)
    print('url_set', len(url_set))
    print('sentence_set', len(sentence_set))

    # --
    # entity normalization
    # --
    with open('data/news/news-list-pyeongchang-olympic.json') as i_file:
        import json
        news_list = json.loads(i_file.read())
    try:
        with open('data/norm-dict.json') as i_file:
            import json
            norm_dict = json.loads(i_file.read())
        with open('data/norm-list.json') as i_file:
            import json
            norm_list = json.loads(i_file.read())
    except FileNotFoundError:
        import class_entity_norm
        EN = class_entity_norm.entity_norm()

        # --
        norm_dict = {}
        norm_list = []

        # --
        import class_utility
        utility = class_utility.utility()

        # --
        P = 0
        for url, title, date, content, provider in news_list:
            P += 1
            if P % 1000 == 0:
                print('entity normalization', P, len(news_list))

            # --
            norm_dict_url, norm_list_url = EN.norm_entity(surface_dict[url], {'date': utility.norm_article_date(date)})

            # --
            norm_dict[url] = norm_dict_url
            norm_list += norm_list_url

        # --
        norm_list = list(set(norm_list))

        # --
        with open('data/norm-dict.json', 'w+') as o_file:
            import json
            o_file.write(json.dumps(norm_dict, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))
        with open('data/norm-list.json', 'w+') as o_file:
            norm_list = list(norm_list)

            # --
            import json
            o_file.write(json.dumps(norm_list, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))
    print('entity normalized')

    # --
    # entity linking
    # --
    import class_KB
    KB = class_KB.KB()
    KB.load()
    print('KB loaded')

    # --
    try:
        with open('data/link-dict.json') as i_file:
            import json
            link_dict = json.loads(i_file.read())
    except FileNotFoundError:
        import class_entity_linking
        EL = class_entity_linking.entity_linking()

        # --
        import multiprocessing
        norm_queue = multiprocessing.Manager().Queue()
        for norm in norm_list:
            norm_queue.put(norm, False)

        # --
        link_dict = multiprocessing.Manager().dict()

        # -- each pool worker runs link_entity_multiprocess as its initializer,
        #    draining norm_queue and writing results into link_dict
        multiprocessing.Pool(12, EL.link_entity_multiprocess, (norm_queue, link_dict, KB))

        # --
        import time
        start_time = time.time()
        while not norm_queue.empty():
            print(['entity linking', len(norm_list) - norm_queue.qsize(), len(norm_list)])
            time.sleep(60)
        # busy-wait until every dequeued norm has a corresponding entry in link_dict
        while len(link_dict.keys()) != len(norm_list):
            pass
        print(['entity linking', len(norm_list) - norm_queue.qsize(), len(norm_list)])
        print(['elapsed time', time.time() - start_time])

        # ==
        with open('data/link-dict.json', 'w+') as o_file:
            o_data = {}
            for norm in link_dict.keys():
                o_data[norm] = link_dict[norm]

            # --
            import json
            o_file.write(json.dumps(o_data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))
    print('entity linked')

    # --
    # DS data extraction
    # --
    import class_DS_data_extraction
    DSDE = class_DS_data_extraction.DS_data_extraction()

    # --
    DS_data = []

    # --
    embedding_corpus = []

    # --
    P = 0
    for url in norm_dict.keys():
        P += 1
        if P % 1000 == 0:
            print('DS data extraction', P, len(norm_dict.keys()))

        # --
        pholded_sentence_list = DSDE.placehold_sentence(norm_dict[url], link_dict)

        # --
        DS_sentence_list = DSDE.extract_DS_sentence(pholded_sentence_list, KB)

        # --
        embedding_sentence_list = list(norm_dict[url].keys())
        embedding_sentence_list += DSDE.get_embedding_sentence(DS_sentence_list)

        # --
        DS_data += DS_sentence_list
        embedding_corpus += embedding_sentence_list

    # --
    p_list, p_cnt = DSDE.get_property_list(DS_data)

    # --
    with open('data/DS-data.tsv', 'w+') as o_file:
        for x in DS_data:
            o_file.write('\t'.join(x) + '\n')
    print('DS_data', len(DS_data))

    # --
    with open('data/DS-embedding-corpus.txt', 'w+') as o_file:
        for x in embedding_corpus:
            o_file.write(x + '\n')
    print('embedding_corpus', len(embedding_corpus))

    # --
    with open('data/DS-data-property-list.txt', 'w+') as o_file:
        trainable_p_list = []
        for p, cnt in sorted(p_cnt.items(), key=lambda x: x[1], reverse=True):
            if cnt >= 50:
                trainable_p_list.append(p)

        # --
        o_file.write(', '.join(trainable_p_list))
    print('trainable_p_list', len(trainable_p_list))

    # --
    with open('data/DS-data-property-count.tsv', 'w+') as o_file:
        for p, cnt in sorted(p_cnt.items(), key=lambda x: x[1], reverse=True):
            o_file.write('\t'.join([p, str(cnt)]) + '\n')

    # --
    for p, cnt in sorted(p_cnt.items(), key=lambda x: x[1], reverse=True):
        print(p, cnt)
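# Assumption: main() is intended to run as a standalone batch pipeline (it only reads and
# writes files under data/), so a conventional entry-point guard would be:
if __name__ == '__main__':
    main()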
def __init__(self):
    import class_utility
    self.utility = class_utility.utility()
def service(i_json):
    o_json = []

    # --
    date = i_json['date']
    content = i_json['content']

    # -- parsing
    import class_NLP
    NLP = class_NLP.NLP()
    parsing = NLP.parse(content)

    # -- entity detection
    import class_entity_detection
    ED = class_entity_detection.entity_detection()
    surface_dict = ED.detect_entity(parsing)
    import json
    print('surface_dict', json.dumps(surface_dict, indent=4, separators=(',', ': '), ensure_ascii=False))

    # -- entity norm
    import class_entity_norm
    EN = class_entity_norm.entity_norm()
    import class_utility
    utility = class_utility.utility()
    norm_dict, norm_list = EN.norm_entity(surface_dict, {'date': utility.norm_article_date(date)})
    norm_list = list(set(norm_list))
    print('norm_dict', json.dumps(norm_dict, indent=4, separators=(',', ': '), ensure_ascii=False))
    print('norm_list', norm_list)

    # -- entity linking
    import class_entity_linking
    EL = class_entity_linking.entity_linking()
    link_dict = EL.link_entity(norm_list, KB)
    print('link_dict', json.dumps(link_dict, indent=4, separators=(',', ': '), ensure_ascii=False))

    # -- placeholded sentence extraction
    import class_DS_data_extraction
    DSDE = class_DS_data_extraction.DS_data_extraction()
    # placehold_sentence() covers every sentence in norm_dict at once, so a single call
    # replaces the original per-sentence loop that recomputed the same result each time
    pholded_sentence_dict = DSDE.placehold_sentence(norm_dict, link_dict)

    # -- predicate linking
    L2K_input = []
    for e1, e2, pholded_sentence in pholded_sentence_dict:
        sentence = pholded_sentence.replace(' << _sbj_ >> ', ' << {0} >> '.format(utility.uri2name(e1)))
        sentence = sentence.replace(' << _obj_ >> ', ' << {0} >> '.format(utility.uri2name(e2)))
        L2K_input.append(sentence)
    L2K_input = list(set(L2K_input))

    # --
    L2K_response = POST_request('http://qamel.kaist.ac.kr:60002/service', json.dumps(L2K_input))
    L2K_output = []
    for four_tuple in json.loads(L2K_response):
        L2K_output.append(tuple(four_tuple))
    L2K_output = list(set(L2K_output))
    #print('L2K_output', json.dumps(L2K_output, indent=4, separators=(',', ': '), ensure_ascii=False))

    # -- post-processing
    postprocessed_output = []
    for s, p, o, c in L2K_output:
        entity_set = set([])
        for surface in link_dict.keys():
            if link_dict[surface] != None:
                if not utility.is_literal(link_dict[surface]):
                    entity_set.add(link_dict[surface])

        # --
        if utility.is_literal(s):
            KB_s = s
        else:
            KB_s = utility.name2uri(s, entity_set)

        # --
        if utility.is_literal(o):
            KB_o = o
        else:
            KB_o = utility.name2uri(o, entity_set)

        # --
        KB_p = list(KB.p_name2uri[utility.uri2name(p)])[0]

        # --
        import re
        KB_p = re.sub(r'^dbo:', 'http://dbpedia.org/ontology/', KB_p)
        KB_p = re.sub(r'^dbp:', 'http://dbpedia.org/property/', KB_p)
        KB_p = re.sub(r'^sport:', 'http://www.bbc.co.uk/ontologies/sport/', KB_p)

        # --
        postprocessed_output.append([KB_s, KB_p, KB_o, c])
    print('postprocessed_output', json.dumps(postprocessed_output, indent=4, separators=(',', ': '), ensure_ascii=False))
    '''
    with open('postprocessed_output', 'w+') as o_file:
        import json
        o_file.write(json.dumps(postprocessed_output, indent=4, separators=(',', ': '), ensure_ascii=False))
    '''

    # -- domain/range filtering
    for KB_s, KB_p, KB_o, c in postprocessed_output:
        if KB.check_domain_range(KB_s, KB_p, KB_o):
            o_json.append([KB_s, KB_p, KB_o, c])

    # --
    return o_json
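# Usage sketch (hypothetical payload; KB is assumed to be loaded at module level via
# class_KB.KB().load() before service() is invoked, as in main() above):
#
# i_json = {'date': '...article date string...', 'content': '...news article text...'}
# o_json = service(i_json)
# for s, p, o, c in o_json:
#     print(s, p, o, c)  # subject URI, predicate URI, object, value returned by the L2K service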