def simple_run():
    """Train a single model on one train/dev split (no cross-validation)."""
    utils.set_logger(config.log_dir)
    # Use the GPU id given on the command line; fall back to CPU when none is set.
    on_gpu = config.gpu != ''
    device = torch.device(f"cuda:{config.gpu}") if on_gpu else torch.device("cpu")
    logging.info("device: {}".format(device))
    # Preprocess the raw data, separating texts from labels.
    processor = Processor(config)
    processor.data_process()
    # Build the vocabulary from the processed data.
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # Carve a dev set out of the training data.
    word_train, word_dev, label_train, label_dev = dev_split(config.train_dir)
    # Single training run on the one split.
    run(word_train, label_train, word_dev, label_dev, vocab, device)
def k_fold_run():
    """Train with k-fold cross-validation and log the averaged metrics."""
    utils.set_logger(config.log_dir)
    # Use the GPU id given on the command line; fall back to CPU when none is set.
    on_gpu = config.gpu != ''
    device = torch.device(f"cuda:{config.gpu}") if on_gpu else torch.device("cpu")
    logging.info("device: {}".format(device))
    # Preprocess the raw data, separating texts from labels.
    processor = Processor(config)
    processor.data_process()
    # Build the vocabulary from the processed data.
    vocab = Vocabulary(config)
    vocab.get_vocab()
    # Load the full training set; folds are carved out of it below.
    data = np.load(config.train_dir, allow_pickle=True)
    words = data["words"]
    labels = data["labels"]
    folds = KFold(n_splits=config.n_split).split(words, labels)
    total_test_loss = 0
    total_f1 = 0
    # Train once per fold; kf_index counts folds starting from 1.
    for kf_index, (train_index, dev_index) in enumerate(folds, start=1):
        test_loss, f1 = run(words[train_index], labels[train_index],
                            words[dev_index], labels[dev_index],
                            vocab, device, kf_index)
        total_test_loss += test_loss
        total_f1 += f1
    # Average the per-fold metrics over all folds.
    average_test_loss = float(total_test_loss) / config.n_split
    average_f1 = float(total_f1) / config.n_split
    logging.info("Average test loss: {} , average f1 score: {}".format(
        average_test_loss, average_f1))
        # NOTE(review): this chunk begins mid-call — the enclosing function's
        # `def` and the opening of the load_word2vec_format(...) call are
        # above this view; the line below only closes that call.
        config.embedding_dir, binary=False, encoding='utf-8')
    # +1 row, presumably so index 0 stays reserved (e.g. padding) — TODO confirm.
    vocab_size = len(vocab) + 1
    embed_size = config.embedding_size
    # Embedding matrix; rows for words missing from the pretrained model stay zero.
    weight = torch.zeros(vocab_size, embed_size)
    cnt = 0  # how many vocabulary words were found in the pretrained embeddings
    for i in range(len(word2vec_model.index_to_key)):
        try:
            index = vocab.word_id(word2vec_model.index_to_key[i])
        except:  # word not in our vocab -> skip it (NOTE: bare except also hides other errors)
            continue
        cnt += 1
        # NOTE(review): id_word(word_id(w)) round-trips back to w itself before
        # the pretrained lookup — appears redundant; verify before simplifying.
        weight[index, :] = torch.from_numpy(word2vec_model.get_vector(
            vocab.id_word(vocab.word_id(word2vec_model.index_to_key[i]))))
    logging.info("--------Pretrained Embedding Loaded ! ({}/{})--------".format(cnt, len(vocab)))
    return weight


if __name__ == "__main__":
    # Smoke test: preprocess data, build the vocab, then load embeddings.
    from data_process import Processor
    from Vocabulary import Vocabulary
    processor = Processor(config)
    processor.data_process()
    # Build the vocabulary
    vocab = Vocabulary(config)
    vocab.get_vocab()
    matrix, emb_vocab, size, l = load_embedding_manually(config.embedding_dir)
    print(emb_vocab['i2w'][4])  # expected: 大
    print(vocab.word_id(emb_vocab['i2w'][4]))  # expected: 15
    w = embedding(vocab)