def service(i_json):
    o_json = []

    # --

    html_table = i_json['html_table']

    # -- table parsing & exclusion

    import class_table_processing

    TP = class_table_processing.table_processing()

    table = TP.parse(html_table)

    if TP.is_rejected(table):
        return []

    print('table', table)

    # -- entity norm

    import class_entity_norm

    EN = class_entity_norm.entity_norm()

    # --

    cell_list = []

    table_head, table_body = table

    for row in table_body:
        for cell in row:
            cell_list.append(cell)

    # --

    norm_dict, norm_list = EN.norm_entity(cell_list)

    print('norm_dict', norm_dict)

    # -- entity linking

    import class_entity_linking

    EL = class_entity_linking.entity_linking()

    # --

    link_dict = EL.link_entity(norm_list, KB)

    print('link_dict', link_dict)

    # -- table to text

    import class_table_to_text

    TT = class_table_to_text.table_to_text()

    # --

    text_list = []
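
    # table_to_text yields 7-tuples (e1, e2, p0, e1_type, e2_type, p, text), as unpacked below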

    text_list += TT.table_to_text(table, norm_dict, link_dict, KB)

    print('text_list', text_list)

    # -- predicate linking

    import class_utility

    utility = class_utility.utility()

    L2K_input = []

    for e1, e2, p0, e1_type, e2_type, p, text in text_list:
        sentence = text.replace(utility.uri2name(e1),
                                ' << {0} >> '.format(utility.uri2name(e1)))
        sentence = sentence.replace(utility.uri2name(e2),
                                    ' << {0} >> '.format(utility.uri2name(e2)))

        L2K_input.append(sentence)

    # --

    import json

    L2K_response = POST_request('http://qamel.kaist.ac.kr:60005/service',
                                json.dumps(L2K_input))

    L2K_output = []

    for four_tuple in json.loads(L2K_response):
        L2K_output.append(tuple(four_tuple))

    L2K_output = list(set(L2K_output))

    print('L2K_output', L2K_output)

    # -- post-processing

    postprocessed_output = []

    # The set of linked KB entities does not depend on the triple being
    # processed, so it is built once before the loop.
    entity_set = set()

    for surface in link_dict.keys():
        if link_dict[surface] is not None and not utility.is_literal(link_dict[surface]):
            entity_set.add(link_dict[surface])

    # --

    for s, p, o, c in L2K_output:

        if utility.is_literal(s):
            KB_s = s

        else:
            KB_s = utility.name2uri(s, entity_set)

        # --

        if utility.is_literal(o):
            KB_o = o

        else:
            KB_o = utility.name2uri(o, entity_set)

        # --

        KB_p = list(KB.p_name2uri[utility.uri2name(p)])[0]

        # --

        import re
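
        # the predicate looked up from KB.p_name2uri may be a prefixed name
        # (dbo:, dbp:, sport:); expand it to a full URI before the domain/range check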

        KB_p = re.sub(r'^dbo:', 'http://dbpedia.org/ontology/', KB_p)
        KB_p = re.sub(r'^dbp:', 'http://dbpedia.org/property/', KB_p)
        KB_p = re.sub(r'^sport:', 'http://www.bbc.co.uk/ontologies/sport/',
                      KB_p)

        # --

        postprocessed_output.append([KB_s, KB_p, KB_o, c])

    print('postprocessed_output', postprocessed_output)

    # -- domain/range filtering

    for KB_s, KB_p, KB_o, c in postprocessed_output:
        if KB.check_domain_range(KB_s, KB_p, KB_o):
            o_json.append([KB_s, KB_p, KB_o, c])

    # --

    return o_json
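
# The service() above relies on two names that are not defined in this snippet:
# POST_request() and the module-level KB object. A minimal sketch of POST_request,
# assuming the endpoint accepts a JSON-encoded request body and returns a JSON
# string (the project's actual helper may differ), could look like this:
def POST_request(url, payload):
    import urllib.request

    # payload is already a JSON-encoded string (see json.dumps above)
    request = urllib.request.Request(url,
                                     data=payload.encode('utf-8'),
                                     headers={'Content-Type': 'application/json'})

    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
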
def main():
	# --
	# entity detection
	# --

	try:
		with open('data/surface-dict.json') as i_file:
			import json

			surface_dict = json.loads(i_file.read())

	except FileNotFoundError:
		try:
			with open('data/NLP/NLP-news-list-pyeongchang-olympic.json') as i_file:
				import json

				parsing = json.loads(i_file.read())

		except FileNotFoundError:
			with open('data/news/news-list-pyeongchang-olympic.json') as i_file:
				import json

				news_list = json.loads(i_file.read())

			# --

			import class_NLP

			NLP = class_NLP.NLP()

			# --

			parsing = {}

			P = 0

			for url, title, date, content, provider in news_list:
				P += 1

				if P % 100 == 0:
					print('parsing', P, len(news_list))

				# --
				
				parsing[url] = NLP.parse(content)

			# --

			with open('data/NLP/NLP-news-list-pyeongchang-olympic.json', 'w+') as o_file:
				import json

				o_file.write(json.dumps(parsing, ensure_ascii=False))

		# --

		import class_entity_detection

		ED = class_entity_detection.entity_detection()

		# --

		surface_dict = {}

		P = 0

		for url in parsing.keys():
			P += 1

			if P % 1000 == 0:
				print('entity detection', P, len(parsing.keys()))

			# --

			surface_dict[url] = ED.detect_entity(parsing[url])

		# --

		with open('data/surface-dict.json', 'w+') as o_file:
			import json

			o_file.write(json.dumps(surface_dict, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))

	print('entity detected')

	# --

	url_set = set([])

	sentence_set = set([])

	for url in surface_dict.keys():
		url_set.add(url)

		for sentence in surface_dict[url].keys():
			sentence_set.add(sentence)

	print('url_set', len(url_set))
	
	print('sentence_set', len(sentence_set))

	# --
	# entity normalization
	# --

	with open('data/news/news-list-pyeongchang-olympic.json') as i_file:
		import json

		news_list = json.loads(i_file.read())

	try:
		with open('data/norm-dict.json') as i_file:
			import json

			norm_dict = json.loads(i_file.read())

		with open('data/norm-list.json') as i_file:
			import json

			norm_list = json.loads(i_file.read())

	except FileNotFoundError:
		import class_entity_norm

		EN = class_entity_norm.entity_norm()

		# --

		norm_dict = {}

		norm_list = []

		# --

		import class_utility

		utility = class_utility.utility()

		# --

		P = 0

		for url, title, date, content, provider in news_list:
			P += 1

			if P % 1000 == 0:
				print('entity normalization', P, len(news_list))

			# --

			norm_dict_url, norm_list_url = EN.norm_entity(surface_dict[url], {'date': utility.norm_article_date(date)})

			# --
			
			norm_dict[url] = norm_dict_url

			norm_list += norm_list_url

		# --

		norm_list = list(set(norm_list))

		# --

		with open('data/norm-dict.json', 'w+') as o_file:
			import json

			o_file.write(json.dumps(norm_dict, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))

		with open('data/norm-list.json', 'w+') as o_file:
			norm_list = list(norm_list)

			# --

			import json

			o_file.write(json.dumps(norm_list, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))

	print('entity normalized')

	# --
	# entity linking
	# --

	import class_KB

	KB = class_KB.KB()

	KB.load()

	print('KB loaded')

	# --

	try:
		with open('data/link-dict.json') as i_file:
			import json

			link_dict = json.loads(i_file.read())

	except FileNotFoundError:
		import class_entity_linking

		EL = class_entity_linking.entity_linking()

		# --

		import multiprocessing

		norm_queue = multiprocessing.Manager().Queue()

		for norm in norm_list:
			norm_queue.put(norm, False)

		# --

		link_dict = multiprocessing.Manager().dict()

		# --

		# keep a reference to the pool so the worker processes are not torn down early
		pool = multiprocessing.Pool(12, EL.link_entity_multiprocess, (norm_queue, link_dict, KB))

		# --

		import time

		start_time = time.time()

		while not norm_queue.empty():
			print(['entity linking', len(norm_list) - norm_queue.qsize(), len(norm_list)])

			time.sleep(60)

		while len(link_dict.keys()) != len(norm_list):
			time.sleep(1)  # avoid a busy-wait while the last workers finish writing results

		print(['entity linking', len(norm_list) - norm_queue.qsize(), len(norm_list)])

		print(['elapsed time', time.time() - start_time])

		# --

		with open('data/link-dict.json', 'w+') as o_file:
			o_data = {}

			for norm in link_dict.keys():
				o_data[norm] = link_dict[norm]

			# --

			import json

			o_file.write(json.dumps(o_data, indent=4, separators=(',', ': '), sort_keys=True, ensure_ascii=False))

	print('entity linked')

	# --
	# DS data extraction
	# --

	import class_DS_data_extraction

	DSDE = class_DS_data_extraction.DS_data_extraction()

	# --

	DS_data = []

	# --

	embedding_corpus = []

	# --

	P = 0

	for url in norm_dict.keys():
		P += 1

		if P % 1000 == 0:
			print('DS data extraction', P, len(norm_dict.keys()))

		# --

		pholded_sentence_list = DSDE.placehold_sentence(norm_dict[url], link_dict)

		# --

		DS_sentence_list = DSDE.extract_DS_sentence(pholded_sentence_list, KB)

		# --

		embedding_sentence_list = list(norm_dict[url].keys())

		embedding_sentence_list += DSDE.get_embedding_sentence(DS_sentence_list)

		# --

		DS_data += DS_sentence_list

		embedding_corpus += embedding_sentence_list

	# --

	p_list, p_cnt = DSDE.get_property_list(DS_data)

	# --

	with open('data/DS-data.tsv', 'w+') as o_file:
		for x in DS_data:
			o_file.write('\t'.join(x) + '\n')

	print('DS_data', len(DS_data))

	# --

	with open('data/DS-embedding-corpus.txt', 'w+') as o_file:
		for x in embedding_corpus:
			o_file.write(x + '\n')

	print('embedding_corpus', len(embedding_corpus))

	# --

	with open('data/DS-data-property-list.txt', 'w+') as o_file:
		trainable_p_list = []

		for p, cnt in sorted(p_cnt.items(), key=lambda x: x[1], reverse=True):
			if cnt >= 50:
				trainable_p_list.append(p)

		# --

		o_file.write(', '.join(trainable_p_list))

	print('trainable_p_list', len(trainable_p_list))

	# --

	with open('data/DS-data-property-count.tsv', 'w+') as o_file:
		for p, cnt in sorted(p_cnt.items(), key=lambda x: x[1], reverse=True):
			o_file.write('\t'.join([p, str(cnt)]) + '\n')

	# --

	for p, cnt in sorted(p_cnt.items(), key=lambda x: x[1], reverse=True):
		print(p, cnt)
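
# A minimal entry point, assuming this module is intended to be run directly
# (the original project may invoke main() elsewhere):
if __name__ == '__main__':
	main()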
Example #3
    def __init__(self):
        import class_utility

        self.utility = class_utility.utility()
def service(i_json):
	o_json = []

	# --

	date = i_json['date']
	content = i_json['content']

	# -- parsing

	import class_NLP

	NLP = class_NLP.NLP()

	parsing = NLP.parse(content)

	# -- entity detection

	import class_entity_detection

	ED = class_entity_detection.entity_detection()

	surface_dict = ED.detect_entity(parsing)

	import json

	print('surface_dict', json.dumps(surface_dict, indent=4, separators=(',', ': '), ensure_ascii=False))

	# -- entity norm

	import class_entity_norm

	EN = class_entity_norm.entity_norm()

	import class_utility

	utility = class_utility.utility()

	norm_dict, norm_list = EN.norm_entity(surface_dict, {'date': utility.norm_article_date(date)})

	norm_list = list(set(norm_list))

	import json

	print('norm_dict', json.dumps(norm_dict, indent=4, separators=(',', ': '), ensure_ascii=False))

	print('norm_list', norm_list)

	# -- entity linking

	import class_entity_linking

	EL = class_entity_linking.entity_linking()

	link_dict = EL.link_entity(norm_list, KB)

	import json

	print('link_dict', json.dumps(link_dict, indent=4, separators=(',', ': '), ensure_ascii=False))

	# -- placeholded sentence extraction

	import class_DS_data_extraction

	DSDE = class_DS_data_extraction.DS_data_extraction()

	# placehold_sentence already covers every sentence in norm_dict,
	# so a single call over the whole dict is sufficient
	pholded_sentence_dict = DSDE.placehold_sentence(norm_dict, link_dict)

	# -- predicate linking

	import class_utility

	utility = class_utility.utility()

	L2K_input = []

	for e1, e2, pholded_sentence in pholded_sentence_dict:
		sentence = pholded_sentence.replace(' << _sbj_ >> ', ' << {0} >> '.format(utility.uri2name(e1)))
		sentence = sentence.replace(' << _obj_ >> ', ' << {0} >> '.format(utility.uri2name(e2)))

		L2K_input.append(sentence)

	L2K_input = list(set(L2K_input))

	# --

	L2K_response = POST_request('http://qamel.kaist.ac.kr:60002/service', json.dumps(L2K_input))
	
	L2K_output = []

	import json

	for four_tuple in json.loads(L2K_response):
		L2K_output.append(tuple(four_tuple))

	L2K_output = list(set(L2K_output))

	#import json

	#print('L2K_output', json.dumps(L2K_output, indent=4, separators=(',', ': '), ensure_ascii=False))

	# -- post-processing

	postprocessed_output = []

	# The set of linked KB entities does not depend on the triple being
	# processed, so it is built once before the loop.
	entity_set = set()

	for surface in link_dict.keys():
		if link_dict[surface] is not None and not utility.is_literal(link_dict[surface]):
			entity_set.add(link_dict[surface])

	# --

	for s, p, o, c in L2K_output:

		if utility.is_literal(s):
			KB_s = s

		else:
			KB_s = utility.name2uri(s, entity_set)

		# --

		if utility.is_literal(o):
			KB_o = o

		else:
			KB_o = utility.name2uri(o, entity_set)

		# --

		KB_p = list(KB.p_name2uri[utility.uri2name(p)])[0]

		# --

		import re
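
		# the predicate looked up from KB.p_name2uri may be a prefixed name
		# (dbo:, dbp:, sport:); expand it to a full URI before the domain/range check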

		KB_p = re.sub(r'^dbo:', 'http://dbpedia.org/ontology/', KB_p)
		KB_p = re.sub(r'^dbp:', 'http://dbpedia.org/property/', KB_p)
		KB_p = re.sub(r'^sport:', 'http://www.bbc.co.uk/ontologies/sport/', KB_p)

		# --

		postprocessed_output.append([KB_s, KB_p, KB_o, c])

	import json

	print('postprocessed_output', json.dumps(postprocessed_output, indent=4, separators=(',', ': '), ensure_ascii=False))

	'''
	with open('postprocessed_output', 'w+') as o_file:
		import json

		o_file.write(json.dumps(postprocessed_output, indent=4, separators=(',', ': '), ensure_ascii=False))
	'''

	# -- domain/range filtering

	for KB_s, KB_p, KB_o, c in postprocessed_output:
		if KB.check_domain_range(KB_s, KB_p, KB_o):
			o_json.append([KB_s, KB_p, KB_o, c])

	# --

	return o_json
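
# This second service() also reads KB as a module-level global. A minimal sketch
# of how the module might be initialised and called, following the same KB
# loading steps used in main() above (the payload values below are illustrative
# placeholders, not real data):
import class_KB

KB = class_KB.KB()

KB.load()

sample_output = service({'date': '2018-02-09', 'content': '...'})

# each element of sample_output is a [subject, predicate, object, confidence]
# quadruple that passed the domain/range filter
print(sample_output)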