import json
import os

import torch

# Project-local dependencies (module paths are assumptions about this repo's
# layout): `utils`, `Const`, `read_txt`, `static_relations`,
# `create_examples_for_xlnet`, and the dataset path constants used below.


def run_static():
    """Build word->id and relation->id vocabularies from the filtered training data."""
    data = utils.read_json(Const.origin_train_filtered_filename)

    # Map every word seen in the data to a unique integer id.
    words = utils.static_words(data)
    words2id = {w: i for i, w in enumerate(words)}
    with open(Const.words2id_filename, 'w', encoding='utf-8') as f:
        # Note: json.dump treats indent=True as indent=1 (bool is an int subclass).
        json.dump(words2id, f, indent=True)

    # Do the same for the relation labels.
    relations = utils.static_relations(data)
    relations2id = {r: i for i, r in enumerate(relations)}
    with open(Const.relations2id_filename, 'w', encoding='utf-8') as f:
        json.dump(relations2id, f, indent=True)
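# Usage sketch for run_static(): reload the vocabularies it writes and map a
# tokenized sentence to ids. `load_vocab` is a hypothetical helper added here
# for illustration; only `json` and the Const filenames above are assumed.
def load_vocab(path):
    """Reload a token-to-id mapping dumped by run_static()."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Example: ids for a whitespace-tokenized sentence, with -1 for unknown words.
# words2id = load_vocab(Const.words2id_filename)
# ids = [words2id.get(w, -1) for w in "the quick brown fox".split()]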
def load_and_cache_examples(args, tokenizer, logger, mode="train"):
    """Load examples from cache or create them. SemEval2010Task8 does not have a dev set."""
    assert mode in ["train", "test", "dev"]
    if not os.path.exists(args.data_cache_dir):
        os.mkdir(args.data_cache_dir)
    cached_examples_file = os.path.join(
        args.data_cache_dir,
        "cached_{}_{}_{}_{}".format(
            args.dataset, mode, args.entity_position_encoding, str(args.max_seq_length)
        ),
    )
    if os.path.exists(cached_examples_file):
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        logger.info("Creating features for %s %s set", args.dataset, mode)
        if args.dataset == 'kbp37':
            _, train_sentences, train_relations = read_txt(os.path.join(KBP37RawPath, "train.txt"))
            _, dev_sentences, dev_relations = read_txt(os.path.join(KBP37RawPath, "dev.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(KBP37RawPath, "test.txt"))
            # Without a separate dev split, fold the dev data into training.
            if not args.kbp37_split_dev:
                train_sentences.extend(dev_sentences)
                train_relations.extend(dev_relations)
        else:
            _, train_sentences, train_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "train.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "test.txt"))

        # Load the relation->id mapping if it was built before; otherwise
        # build it from the training relations and persist it.
        relation2id_path = KBP37Relation2IdPath if args.dataset == "kbp37" else SemEval2010Relation2IdPath
        if os.path.exists(relation2id_path):
            with open(relation2id_path, 'r', encoding='utf8') as f:
                relation2id = json.load(f)
        else:
            relation2id, _ = static_relations(train_relations)
            with open(relation2id_path, 'w', encoding='utf8') as f:
                json.dump(relation2id, f)

        if mode == 'train':
            sentences, relations = train_sentences, train_relations
        elif mode == 'test':
            sentences, relations = test_sentences, test_relations
        else:
            if args.dataset == 'kbp37':
                sentences, relations = dev_sentences, dev_relations
            else:
                raise ValueError("SemEval2010Task8 does not have a dev set!")

        examples = create_examples_for_xlnet(
            sentences,
            relations,
            tokenizer,
            relation2id,
            args.entity_position_encoding == "entity_tag",
            args.entity_position_encoding == "token_type_ids",
            args.max_seq_length,
        )
        torch.save(examples, cached_examples_file)
    return examples
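# Usage sketch for load_and_cache_examples(). Every value below is an
# assumption for illustration: the real `args` comes from the project's
# argument parser, and the tokenizer choice follows create_examples_for_xlnet
# (an XLNet tokenizer from the `transformers` package).
def _demo_load_examples():
    import argparse
    import logging
    from transformers import XLNetTokenizer  # assumption: HF transformers is installed

    logging.basicConfig(level=logging.INFO)
    args = argparse.Namespace(
        data_cache_dir="cache",                 # hypothetical cache directory
        dataset="kbp37",                        # any non-"kbp37" value selects SemEval2010Task8
        kbp37_split_dev=True,                   # keep dev as a separate split
        entity_position_encoding="entity_tag",  # or "token_type_ids"
        max_seq_length=128,
    )
    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    return load_and_cache_examples(args, tokenizer, logging.getLogger(__name__), mode="train")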
def statics(name):
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    else:
        raise ValueError("name must be 'train' or 'dev', got %r" % name)
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)  # parse the raw content
    sentences_words = utils.sentence_tokenize(sentences_string)  # tokenize the sentences
    # Tokenize the relations and build the relation-to-id mapping.
    relations_words = utils.static_relations(triples_string)
    # Count not only the words in sentences, but also the words in relations.
    sentences_words.extend(relations_words)
    words2id = utils.static_words(sentences_words)  # build the word-to-id mapping
    # Index 0 is a placeholder, presumably so that relation ids start from 1.
    relations_words_id = [None]
    for r_words in relations_words:
        r_words_id = [utils.turn_word2id(w, words2id) for w in r_words]
        relations_words_id.append(r_words_id)
    with open(Const.relations_words_id_filename, 'w', encoding='utf-8') as f:
        json.dump(relations_words_id, f, indent=False)
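# Usage sketch: build the statistics for both provided splits. Assumes the raw
# files named by Const exist on disk; any other split name now raises.
if __name__ == '__main__':
    statics('train')
    statics('dev')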