Example #1
0
def run_static():
    """Build word->id and relation->id lookup tables from the filtered
    training data and persist each mapping as JSON.

    Side effects: writes ``Const.words2id_filename`` and
    ``Const.relations2id_filename``.
    """
    data = utils.read_json(Const.origin_train_filtered_filename)

    # Assign each distinct word a sequential integer id.
    words = utils.static_words(data)
    words2id = {w: i for i, w in enumerate(words)}
    # `with` guarantees the handle is closed and buffers are flushed even on
    # error (the original leaked the file object). indent=1 matches the old
    # indent=True output byte-for-byte, since bool True == int 1.
    with open(Const.words2id_filename, 'w', encoding='utf-8') as f:
        json.dump(words2id, f, indent=1)

    # Same treatment for relation labels.
    relations = utils.static_relations(data)
    relations2id = {r: i for i, r in enumerate(relations)}
    with open(Const.relations2id_filename, 'w', encoding='utf-8') as f:
        json.dump(relations2id, f, indent=1)
Example #2
0
def load_and_cache_examples(args, tokenizer, logger, mode="train"):
    """Load cached feature examples for a split, creating the cache on a miss.

    Args:
        args: namespace providing ``dataset`` ('kbp37' or SemEval),
            ``data_cache_dir``, ``entity_position_encoding``,
            ``max_seq_length`` and, for kbp37, ``kbp37_split_dev``.
        tokenizer: tokenizer forwarded to ``create_examples_for_xlnet``.
        logger: logger for progress messages.
        mode: one of "train", "test", "dev". SemEval2010Task8 has no dev set.

    Returns:
        The (possibly cached) list of feature examples for the split.

    Raises:
        ValueError: if mode == "dev" for SemEval2010Task8.
    """
    assert mode in ["train", "test", "dev"]

    # makedirs(exist_ok=True) replaces the racy exists()+mkdir pair and also
    # creates missing parent directories.
    os.makedirs(args.data_cache_dir, exist_ok=True)

    cached_examples_file = os.path.join(
        args.data_cache_dir,
        "cached_{}_{}_{}_{}".format(args.dataset, mode, args.entity_position_encoding, str(args.max_seq_length)),
    )
    if os.path.exists(cached_examples_file):
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        # Lazy %-args: the message is only formatted if the level is enabled.
        logger.info("Creating features for %s %s set", args.dataset, mode)
        if args.dataset == 'kbp37':
            _, train_sentences, train_relations = read_txt(os.path.join(KBP37RawPath, "train.txt"))
            _, dev_sentences, dev_relations = read_txt(os.path.join(KBP37RawPath, "dev.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(KBP37RawPath, "test.txt"))
            if not args.kbp37_split_dev:
                # No separate dev split requested: fold dev into train.
                train_sentences.extend(dev_sentences)
                train_relations.extend(dev_relations)
        else:
            _, train_sentences, train_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "train.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "test.txt"))
        # The relation->id map is built once from the training relations and
        # cached on disk so later runs reuse identical ids.
        relation2id_path = KBP37Relation2IdPath if args.dataset == "kbp37" else SemEval2010Relation2IdPath
        if os.path.exists(relation2id_path):
            with open(relation2id_path, 'r', encoding='utf8') as f:
                relation2id = json.load(f)
        else:
            relation2id, _ = static_relations(train_relations)
            with open(relation2id_path, 'w', encoding='utf8') as f:
                json.dump(relation2id, f)
        if mode == 'train':
            sentences, relations = train_sentences, train_relations
        elif mode == 'test':
            sentences, relations = test_sentences, test_relations
        else:
            # Guard clause: only kbp37 has a dev split.
            if args.dataset != 'kbp37':
                raise ValueError("SemEval2010Task8 does not have a dev set!")
            sentences, relations = dev_sentences, dev_relations
        examples = create_examples_for_xlnet(sentences, relations, tokenizer, relation2id,
                                             args.entity_position_encoding == "entity_tag",
                                             args.entity_position_encoding == "token_type_ids",
                                             args.max_seq_length)
        torch.save(examples, cached_examples_file)
    return examples
Example #3
0
def statics(name):
    """Tokenize a raw split file, build the word->id map, and dump the
    per-relation word-id sequences to ``Const.relations_words_id_filename``.

    Args:
        name: which split to process, "train" or "dev".

    Raises:
        ValueError: if ``name`` is neither "train" nor "dev" (previously this
            fell through to a confusing NameError on ``filename``).
    """
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    else:
        raise ValueError("name must be 'train' or 'dev', got %r" % (name,))

    # `with` closes the handle even on error (the original leaked it).
    with open(filename, 'r', encoding='utf-8') as f:
        html_doc = ' '.join(f.readlines())

    sentences_string, triples_string = utils.parse(html_doc)  # parse raw content
    sentences_words = utils.sentence_tokenize(sentences_string)  # tokenize sentences
    relations_words = utils.static_relations(triples_string)  # tokenize relations, build relation->id map
    # Count relation words as vocabulary too, not only sentence words.
    sentences_words.extend(relations_words)
    words2id = utils.static_words(sentences_words)  # word -> id mapping

    # Index 0 is a None placeholder so relation ids effectively start at 1.
    relations_words_id = [None]
    for r_words in relations_words:
        relations_words_id.append([utils.turn_word2id(w, words2id) for w in r_words])

    # indent=False is kept as-is to preserve the original on-disk format
    # (bool False == int 0 for json.dump's indent parameter).
    with open(Const.relations_words_id_filename, 'w', encoding='utf-8') as out:
        json.dump(relations_words_id, out, indent=False)