def show_instances(class_name=''):
    """Print every training instance whose triple set belongs to the
    requested overlap class.

    :param class_name: 'normal' selects normal (non-overlapping) triples,
        'single_entity_overlap' selects single-entity-overlap triples, and
        any other value selects multi-label (entity-pair-overlap) triples.
    """
    # NOTE(review): `name` is hard-coded to 'train', so the 'dev' branch
    # below is unreachable — kept as-is for parity with prepare(); confirm
    # whether `name` was meant to be a parameter.
    name = 'train'
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename, 'r') as f:
        content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    utils.triples_type(sentence_triples_id)
    # Pick the predicate that decides membership in the requested class.
    if class_name == 'normal':
        func = utils.is_normal_triple
    elif class_name == 'single_entity_overlap':
        func = utils.is_over_lapping
    else:
        func = utils.is_multi_label
    words2id = utils.load_words2id()
    id2words = {v: k for k, v in words2id.items()}
    for sent_words_id, triples_id in zip(sentences_word_id,
                                         sentence_triples_id):
        if func(triples_id, is_relation_first=False):
            # Python 3 print calls: the module already uses py3-only syntax
            # (open(..., encoding=...)), so py2 print statements were a
            # SyntaxError.
            print(' '.join([id2words[x] for x in sent_words_id]))
            print(triples_id)
            print('-----------------------------------')
def prepare(name):
    """Parse the raw data file selected by *name* ('train', 'dev' or
    'example'), convert sentences and triples to id form, and dump the
    results to the corresponding JSON / NYT-style output files.

    For 'train' the instances are additionally split into train and valid
    sets before dumping.
    """
    print(name)
    # if/elif chain instead of three independent ifs — at most one matches.
    # NOTE(review): an unknown name leaves `filename` unbound and raises
    # NameError at open(), same as the original.
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    elif name == 'example':
        filename = Const.origin_example_filename
    print(Const.triple_len)
    print(filename)
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename, 'r') as f:
        content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    if name == 'train':
        # Split the train file into train and valid sets.
        [valid_sentences_word_id, valid_sentence_triples_id], \
            [train_sentences_word_id, train_sentence_triples_id] = utils.split(
                sentences_word_id, sentence_triples_id)
        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)
        # `with` blocks close the output handles; the original passed bare
        # open(...) into json.dump and leaked them.
        with open(Const.train_filename, 'w') as out:
            json.dump([train_sentences_word_id, train_sentence_triples_id],
                      out)
        with open(Const.valid_filename, 'w') as out:
            json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                      out)
        utils.instances2nyt_style(
            [train_sentences_word_id, train_sentence_triples_id],
            Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style(
            [valid_sentences_word_id, valid_sentence_triples_id],
            Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        with open(Const.dev_filename, 'w') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        utils.triples_type(sentence_triples_id)
        with open(Const.example_filename, 'w') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)
def prepare(name):
    """Parse the raw data file selected by *name* ('train', 'dev' or
    'example'), convert sentences and triples to id form, and dump the
    results to the corresponding JSON / NYT-style output files.

    The expensive utils.parse() step is cached in '<name>_parse.json' so
    repeated runs reload the cached parse instead of re-parsing.
    """
    print(name)
    # if/elif chain instead of three independent ifs — at most one matches.
    # NOTE(review): an unknown name leaves `filename` unbound and raises
    # NameError at open(), same as the original.
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    elif name == 'example':
        filename = Const.origin_example_filename
    print(Const.triple_len)
    print(filename)
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.readlines()
    html_doc = ' '.join(content)
    cache_file = name + '_parse.json'
    if not os.path.isfile(cache_file):
        sentences_string, triples_string = utils.parse(html_doc)
        with open(cache_file, 'w', encoding='utf-8') as out:
            json.dump([sentences_string, triples_string], out)
    else:
        with open(cache_file, 'r', encoding='utf-8') as cached:
            sentences_string, triples_string = json.load(cached)
    sentences_words = utils.sentence_tokenize(sentences_string)
    # Record the index of each entity's last word: (e1_end, e2_end, relation).
    position_triples = utils.find_entity_position(sentences_words,
                                                  triples_string)
    # Convert sentences and relation types to ids.
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    if name == 'train':
        # Split the train file into train and valid sets.
        [valid_sentences_word_id, valid_sentence_triples_id], \
            [train_sentences_word_id, train_sentence_triples_id] = utils.split(
                sentences_word_id, sentence_triples_id)
        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)
        # `with` blocks close the output handles; the original passed bare
        # open(...) into json.dump and leaked them.
        with open(Const.train_filename, 'w', encoding='utf-8') as out:
            json.dump([train_sentences_word_id, train_sentence_triples_id],
                      out)
        with open(Const.valid_filename, 'w', encoding='utf-8') as out:
            json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                      out)
        utils.instances2nyt_style(
            [train_sentences_word_id, train_sentence_triples_id],
            Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style(
            [valid_sentences_word_id, valid_sentence_triples_id],
            Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        with open(Const.dev_filename, 'w', encoding='utf-8') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        utils.triples_type(sentence_triples_id)
        with open(Const.example_filename, 'w', encoding='utf-8') as out:
            json.dump([sentences_word_id, sentence_triples_id], out)