Code Example #1
File: mytrain.py  Project: chenjiayu1502/NER
    crf = M.CRF(len(label_vocab))
    model = M.LSTMCRF(crf=crf,
                      vocab_sizes=[len(v) for v in input_vocabs],
                      word_dims=word_dim,
                      hidden_dim=lstm_dim,
                      layers=lstm_layers,
                      dropout_prob=dropout_prob,
                      bidirectional=bidirectional)
    model.reset_parameters()
    # The freshly initialized parameters are immediately replaced by a
    # previously saved checkpoint.
    model = torch.load('./pkl/0622_multi_long_model_lstmcrf29.pkl')
    params = sum(np.prod(p.size()) for p in model.parameters())
    print("Number of parameters: {}".format(params))

    print("Loading word embeddings...")

    dataset = D.MultiSentWordDataset(input_path, label_path)
    test_dataset = D.MultiSentWordDataset(input_test_path, label_test_path)
    train_dataset = dataset
    train_dataloader = create_dataloader(train_dataset)
    test_dataloader = create_dataloader(test_dataset)

    trainer = BaseLSTMCRFTrainer(model=model,
                                 epochs=epoch,
                                 input_vocabs=input_vocabs,
                                 label_vocab=label_vocab)

    trainer.train(train_dataloader, data_size=len(train_dataset))

    #trainer=torch.load('./pkl/1.pkl')

    trainer.test(test_dataloader, data_size=len(test_dataset))
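Code example #1 uses input_vocabs, label_vocab, and create_dataloader without defining them; mytrain.py presumably sets them up earlier in the file. As a minimal sketch, and assuming the vocabularies were pickled into a save directory as in code example #2 below, they could be restored like this (the "save" directory and the single-input assumption are mine, not the project's):

import pickle

# Hypothetical restore step: load the vocabularies that code example #2
# saves as vocab-input{i}.pkl and vocab-label.pkl. The "save" directory
# and the single input vocabulary are assumptions.
with open("save/vocab-input1.pkl", "rb") as f:
    input_vocabs = [pickle.load(f)]
with open("save/vocab-label.pkl", "rb") as f:
    label_vocab = pickle.load(f)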
Code Example #2
def main(args):
    logging.basicConfig(level=logging.INFO)
    check_arguments(args)

    logging.info("Creating vocabulary...")
    input_vocabs = []

    # Build one vocabulary per input file, reserving <pad> and <unk>
    # tokens before populating it from the corpus.
    for path in args.input_path:
        vocab = utils.Vocabulary()
        words = utils.FileReader(path).words()
        vocab.add("<pad>")
        vocab.add("<unk>")
        utils.populate_vocab(words, vocab)
        input_vocabs.append(vocab)
    # print(input_vocabs)  # output: [<utils.Vocabulary object at 0x7fa839f5a0b8>]

    # Build the label vocabulary, reserving START and END boundary labels
    # used by the CRF.
    label_vocab = utils.Vocabulary()
    words = utils.FileReader(args.label_path).words()
    label_vocab.add("START")
    label_vocab.add("END")
    utils.populate_vocab(words, label_vocab)

    # Persist the vocabularies so they can be reloaded for evaluation.
    for i, input_vocab in enumerate(input_vocabs):
        vocab_path = os.path.join(args.save_dir,
                                  "vocab-input{}.pkl".format(i + 1))
        with open(vocab_path, "wb") as f:
            pickle.dump(input_vocab, f)
    vocab_path = os.path.join(args.save_dir, "vocab-label.pkl")
    with open(vocab_path, "wb") as f:
        pickle.dump(label_vocab, f)

    logging.info("Initializing model...")
    crf = M.CRF(len(label_vocab))
    print('args.word_dim ==', args.word_dim, type(args.word_dim))  # debug output
    model = M.LSTMCRF(
        crf=crf,
        vocab_sizes=[len(v) for v in input_vocabs],
        word_dims=args.word_dim,
        hidden_dim=args.lstm_dim,
        layers=args.lstm_layers,
        dropout_prob=args.dropout_prob,
        bidirectional=args.bidirectional
    )
    model.reset_parameters()
    if args.gpu:
        gpu_main = args.gpu[0]
        model = model.cuda(gpu_main)
    params = sum(np.prod(p.size()) for p in model.parameters())
    logging.info("Number of parameters: {}".format(params))

    logging.info("Loading word embeddings...")
    # for vocab, we_type, we_path, we_freeze, emb in \
    #         zip(input_vocabs, args.wordembed_type, args.wordembed_path,
    #             args.wordembed_freeze, model.embeddings):
    #     if we_type == "glove":
    #         assert we_path is not None
    #         load_glove_embeddings(emb, vocab, we_path)
    #     elif we_type == "fasttext":
    #         assert we_path is not None
    #         assert args.fasttext_path is not None
    #         load_fasttext_embeddings(emb, vocab,
    #                                  fasttext_path=args.fasttext_path,
    #                                  embedding_path=we_path)
    #     elif we_type == "none":
    #         pass
    #     else:
    #         raise ValueError("Unrecognized word embedding "
    #                          "type: {}".format(we_type))
    #
    #     if we_freeze:
    #         emb.weight.requires_grad = False

    # Copy the configuration file to the save directory if one is specified.
    if args.config:
        config_path = os.path.join(args.save_dir, os.path.basename(args.config))
        shutil.copy(args.config, config_path)

    def create_dataloader(dataset):
        return D.MultiSentWordDataLoader(
            dataset=dataset,
            input_vocabs=input_vocabs,
            label_vocabs=label_vocab,
            batch_size=args.batch_size,
            shuffle=args.shuffle,
            tensor_lens=True,
            num_workers=len(args.gpu) if args.gpu is not None else 1,
            pin_memory=True
        )

    dataset = D.MultiSentWordDataset(*args.input_path, args.label_path)
    test_dataset = D.MultiSentWordDataset(*args.test_input_path, args.test_label_path)

    if args.val:
        vr = args.val_ratio
        # Hold out a fraction of the data for validation; note that
        # training below still uses the full dataset.
        val_dataset, _ = dataset.split(vr, 1 - vr, shuffle=args.shuffle)
    else:
        val_dataset = None

    train_dataset = dataset
    train_dataloader = create_dataloader(train_dataset)
    test_dataloader = create_dataloader(test_dataset)

    if val_dataset is not None:
        val_dataloader = create_dataloader(val_dataset)
    else:
        val_dataloader = None
    print(input_vocabs, type(input_vocabs))  # debug output

    logging.info("Beginning training...")
    trainer = LSTMCRFTrainer(
        sargs=args,
        input_vocabs=input_vocabs,
        label_vocab=label_vocab,
        val_data=val_dataloader,
        model=model,
        epochs=args.epochs,
        gpus=args.gpu
    )

    trainer.train(train_dataloader, data_size=len(train_dataset))
    # trainer.validate()
    logging.info("Beginning testing...")
    # trainer.test(train_dataloader, data_size=len(train_dataset))
    #trainer.test(test_dataloader, data_size=len(test_dataset))
    logging.info("Done!")