Example 1
def simple_run():
    """train without k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the GPU to the id passed on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data, separating text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off the dev set
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # simple run without k-fold
    run(word_train, label_train, word_dev, label_dev, vocab, device)
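
The dev_split helper is not shown in these snippets. Since k_fold_run (Example 2) reads the same config.train_dir as an .npz archive with "words" and "labels" arrays, a minimal sketch of what it might look like, using scikit-learn's train_test_split (the function body and the dev_ratio default are assumptions, not the repository's actual code):

import numpy as np
from sklearn.model_selection import train_test_split

def dev_split(dataset_dir, dev_ratio=0.1):
    """Hypothetical sketch: split the saved .npz dataset into train/dev sets."""
    data = np.load(dataset_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    # returns word_train, word_dev, label_train, label_dev, matching the caller
    return train_test_split(words, labels, test_size=dev_ratio, random_state=0)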
Example 2
def k_fold_run():
    """train with k-fold"""
    # set the logger
    utils.set_logger(config.log_dir)
    # set the GPU to the id passed on the command line
    if config.gpu != '':
        device = torch.device(f"cuda:{config.gpu}")
    else:
        device = torch.device("cpu")
    logging.info("device: {}".format(device))
    # process the data, separating text and labels
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # split off the dev set (here via k-fold)
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    kf = KFold(n_splits=config.n_split)
    kf_data = kf.split(words, labels)
    kf_index = 0
    total_test_loss = 0
    total_f1 = 0
    for train_index, dev_index in kf_data:
        kf_index += 1
        word_train = words[train_index]
        label_train = labels[train_index]
        word_dev = words[dev_index]
        label_dev = labels[dev_index]
        test_loss, f1 = run(word_train, label_train, word_dev, label_dev,
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))
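
One detail worth noting: plain KFold.split ignores its second argument, so the labels passed above do not influence how folds are drawn (StratifiedKFold would be needed for label-balanced folds). A tiny self-contained illustration of the index arrays KFold yields:

import numpy as np
from sklearn.model_selection import KFold

words = np.arange(6)              # stand-in for the real word array
for train_index, dev_index in KFold(n_splits=3).split(words):
    print(train_index, dev_index)
# [2 3 4 5] [0 1]
# [0 1 4 5] [2 3]
# [0 1 2 3] [4 5]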
Example 3
def embedding(vocab):
    """Build an embedding weight matrix from pretrained word2vec vectors.

    Note: the original snippet begins mid-call; the signature and the loader
    line are reconstructed from the call site below (w = embedding(vocab)),
    assuming gensim's KeyedVectors word2vec loader.
    """
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        config.embedding_dir, binary=False, encoding='utf-8')
    vocab_size = len(vocab) + 1
    embed_size = config.embedding_size
    weight = torch.zeros(vocab_size, embed_size)
    cnt = 0
    for word in word2vec_model.index_to_key:
        try:
            # skip words that are not in our vocabulary
            # (assumes vocab.word_id raises KeyError for OOV words)
            index = vocab.word_id(word)
        except KeyError:
            continue
        cnt += 1
        # copy the pretrained vector into the matching vocabulary row
        weight[index, :] = torch.from_numpy(word2vec_model.get_vector(word))
    logging.info("--------Pretrained Embedding Loaded ! ({}/{})--------".format(cnt, len(vocab)))
    return weight


if __name__ == "__main__":
    from data_process import Processor
    from Vocabulary import Vocabulary
    processor = Processor(config)
    processor.data_process()
    # build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    matrix, emb_vocab, size, l = load_embedding_manually(config.embedding_dir)
    print(emb_vocab['i2w'][4])  # 大
    print(vocab.word_id(emb_vocab['i2w'][4]))  # 15
    w = embedding(vocab)
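
Downstream, the returned weight matrix would typically initialize the model's embedding layer. A minimal sketch using PyTorch's standard from_pretrained constructor (freeze=False and padding_idx=0 are assumptions; padding_idx=0 corresponds to the extra row reserved by vocab_size = len(vocab) + 1 above):

import torch.nn as nn

# initialize an embedding layer from the pretrained matrix;
# freeze=False lets the vectors be fine-tuned during training (assumption)
embed_layer = nn.Embedding.from_pretrained(w, freeze=False, padding_idx=0)
print(embed_layer.weight.shape)   # (vocab_size, embedding_size)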