end_dic[(ent[1], ent[2])].append(ent)
                all_num += 1
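            # Count entities whose (start, label) or (end, label) key is
            # shared with at least one other entity, i.e. duplicated
            # boundary positions reported by the print below.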
            for k, v in start_dic.items():
                if len(v) > 1:
                    start_num += len(v)
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    reader = Reader()
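    # Load pretrained PubMed word2vec embeddings before reading the
    # GENIA splits.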
    reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path)
    reader.read_all_data("./data/genia/", "genia.train", "genia.dev",
                         "genia.test")

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch(
        config.batch_size)
    # Serialize each batched split so training can load it without
    # re-running preprocessing; `with` guarantees the files are closed.
    with open(config.train_data_path, 'wb') as f:
        pickle.dump(train_batches, f)

    with open(config.dev_data_path, 'wb') as f:
        pickle.dump(dev_batches, f)

    with open(config.test_data_path, 'wb') as f:
        pickle.dump(test_batches, f)
Example #2
                start_dic[(ent[0], ent[2])].append(ent)
                end_dic[(ent[1], ent[2])].append(ent)
                all_num += 1
            for k, v in start_dic.items():
                if len(v) > 1:
                    start_num += len(v)
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
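    # This variant builds the Reader on a pretrained BERT model and reads
    # the ACE 2005 splits.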
    reader = Reader(config.bert_model)
    reader.read_all_data("./data/ace2005/", "ace2005.train", "ace2005.dev",
                         "ace2005.test")

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch(
        config.batch_size)
    with open(config.train_data_path, 'wb') as f:
        pickle.dump(train_batches, f)

    with open(config.dev_data_path, 'wb') as f:
        pickle.dump(dev_batches, f)

    with open(config.test_data_path, 'wb') as f:
        pickle.dump(test_batches, f)
Example #3
                end_dic[(ent[1], ent[2])].append(ent)
                all_num += 1
            for k, v in start_dic.items():
                if len(v) > 1:
                    start_num += len(v)
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    reader = Reader()
    reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path)
    reader.read_all_data("./data/genia_sample/", "train.data", "dev.data", "test.data")

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch(config.batch_size)
    with open(config.train_data_path, 'wb') as f:
        pickle.dump(train_batches, f)

    with open(config.dev_data_path, 'wb') as f:
        pickle.dump(dev_batches, f)

    with open(config.test_data_path, 'wb') as f:
        pickle.dump(test_batches, f)
Example #4
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
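    # Build a SentencePiece-BPE tokenizer for Polish RoBERTa and attach a
    # RoBERTa-style post-processor so encodings are wrapped as <s> ... </s>.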
    tokenizer_dir = "tokenization/polish-roberta-large/"
    tokenizer = SentencePieceBPETokenizer(f"{tokenizer_dir}/vocab.json",
                                          f"{tokenizer_dir}/merges.txt")
    getattr(tokenizer,
            "_tokenizer").post_processor = RobertaProcessing(sep=("</s>", 2),
                                                             cls=("<s>", 0))
    reader = Reader("polish", tokenizer, cls="<s>", sep="</s>", threshold=8)
    reader.read_all_data("./data/poleval/", "poleval.train", "poleval.dev",
                         "poleval.test")

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch(
        config.batch_size)
    with open(config.train_data_path, 'wb') as f:
        pickle.dump(train_batches, f)

    with open(config.dev_data_path, 'wb') as f:
        pickle.dump(dev_batches, f)

    with open(config.test_data_path, 'wb') as f:
        pickle.dump(test_batches, f)
Example #5
                start_dic[(ent[0], ent[2])].append(ent)
                end_dic[(ent[1], ent[2])].append(ent)
                all_num += 1
            for k, v in start_dic.items():
                if len(v) > 1:
                    start_num += len(v)
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    reader = Reader(config)
    reader.read_all_data()

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch()
    with open(config.data_path + "_train.pkl", 'wb') as f:
        pickle.dump(train_batches, f)

    with open(config.data_path + "_dev.pkl", 'wb') as f:
        pickle.dump(dev_batches, f)

    with open(config.data_path + "_test.pkl", 'wb') as f:
        pickle.dump(test_batches, f)
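
A minimal usage sketch (not taken from the examples above) showing how the
pickled splits can be read back for training; it assumes the same `config`
module that was in scope when the batches were written:

import pickle

import config  # assumption: the examples' config module is importable


def load_batches(path):
    # Batches were written with pickle.dump, so pickle.load restores them.
    with open(path, 'rb') as f:
        return pickle.load(f)


train_batches = load_batches(config.train_data_path)
dev_batches = load_batches(config.dev_data_path)
test_batches = load_batches(config.test_data_path)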