start_num += len(v)
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    reader = Reader()
    reader.read_and_gen_vectors_pubmed_word2vec(config.embed_path)
    reader.read_all_data("./data/genia/", "genia.train", "genia.dev",
                         "genia.test")

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch(
        config.batch_size)
    f = open(config.train_data_path, 'wb')
    pickle.dump(train_batches, f)
    f.close()

    f = open(config.dev_data_path, 'wb')
    pickle.dump(dev_batches, f)
    f.close()

    f = open(config.test_data_path, 'wb')
    pickle.dump(test_batches, f)
    f.close()

    batch_stat(train_batches)
    batch_stat(dev_batches)
    batch_stat(test_batches)
Beispiel #2
0
            for k, v in start_dic.items():
                if len(v) > 1:
                    start_num += len(v)
            for k, v in end_dic.items():
                if len(v) > 1:
                    end_num += len(v)

    print("All {}, start {}, end {}".format(all_num, start_num, end_num))


if __name__ == "__main__":
    reader = Reader(config)
    reader.read_all_data()

    # print reader.train_sents[0]
    train_batches, dev_batches, test_batches = reader.to_batch()
    f = open(config.data_path + "_train.pkl", 'wb')
    pickle.dump(train_batches, f)
    f.close()

    f = open(config.data_path + "_dev.pkl", 'wb')
    pickle.dump(dev_batches, f)
    f.close()

    f = open(config.data_path + "_test.pkl", 'wb')
    pickle.dump(test_batches, f)
    f.close()

    #batch_stat(train_batches)
    #batch_stat(dev_batches)
    #batch_stat(test_batches)