    train='train.json', validation='valid.json', test='test.json',
    format='json', fields=fields)

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device)

## Training the Seq2Seq Model

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
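# --- Hedged sketch: why the pad index matters. Padding positions should not
# contribute to the loss, so the target-side pad id is usually passed to the
# criterion. TRG_PAD_IDX is an illustrative name introduced here (looked up the
# same way as SRC_PAD_IDX above); it is not defined in the original snippet.
import torch.nn as nn

TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]                 # pad id in the target vocab
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)   # padded target positions are ignored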
def make_iter(self, train, validate, test, batch_size, device):
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, validate, test),
        batch_size=batch_size,
        device=device)
    print('dataset initializing done')
    return train_iterator, valid_iterator, test_iterator
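# --- Hedged usage sketch for make_iter: `loader` stands in for an instance of the
# (unnamed here) class this method belongs to, and the three datasets are assumed
# to have been built earlier, e.g. via Multi30k.splits or TabularDataset.splits.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = loader.make_iter(
    train_data, valid_data, test_data, batch_size=128, device=device)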
def data_preprocessing():
    SEED = 1234
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # import de_core_news_sm, en_core_web_sm
    # spacy_de = de_core_news_sm.load()
    # spacy_en = en_core_web_sm.load()
    # spacy_de = spacy.load('de_core_news_sm')
    # spacy_en = spacy.load('en_core_web_sm')

    # A Field object specifies how a column is processed: the tokenizer, lowercasing,
    # start/end tokens, padding token, vocabulary, and so on.
    # We create two Field objects, SRC and TRG, with tokenize set to the tokenizer
    # functions defined above; each sentence gets an <sos> token at the start and an
    # <eos> token at the end, and all words are lowercased.
    SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
    TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

    # The splits method loads the training, validation and test sets in one call.
    # exts selects which language is the source and which is the target, and fields
    # passes the Field objects defined above.
    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
    # print(f"Number of training examples: {len(train_data.examples)}")
    # print(f"Number of validation examples: {len(valid_data.examples)}")
    # print(f"Number of testing examples: {len(test_data.examples)}")

    # vars() returns the object's attributes and their values as a dictionary.
    # print(vars(train_data.examples[0]))

    # Build the vocabularies, i.e. map every word to an integer id so the data can be
    # fed into the model.
    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)
    # print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    # print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # print(device)
    BATCH_SIZE = 128

    # BucketIterator: unlike a standard iterator, it groups examples of similar length
    # into the same batch. Each batch has to be padded to the length of its longest
    # sequence, so when lengths vary a lot, BucketIterator makes padding much more
    # efficient. Alternatively, the fix_length argument of Field can truncate/pad every
    # example to a fixed length.
    # When an iterator produces a batch, all source sentences must be padded to the same
    # length, and likewise for the target sentences. torchtext handles this automatically
    # with dynamic padding: every sentence in a batch is padded to the length of the
    # longest sentence in that batch.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device)

    return SRC, TRG, device, train_iterator, valid_iterator, test_iterator
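# --- Hedged usage sketch: calling data_preprocessing() and peeking at one batch.
# The batch attributes .src and .trg come from the field names Multi30k assigns to
# the (SRC, TRG) pair above; everything else here is illustrative.
SRC, TRG, device, train_iterator, valid_iterator, test_iterator = data_preprocessing()

batch = next(iter(train_iterator))
print(batch.src.shape)  # [src_len, batch_size] -- sequence-first, since batch_first is not set
print(batch.trg.shape)  # [trg_len, batch_size]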
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(SRC, TRG)
)

print("Example from train data:")
print(vars(train_data.examples[0]))

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

# Building the Seq2Seq model

# ENCODER
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)  # no dropout, as only one layer
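# --- Hedged sketch: the Encoder definition above is cut off after the embedding
# layer. One plausible completion, assuming a single-layer GRU (suggested by the
# "only one layer" comment and the emb_dim/hid_dim arguments); this is an
# illustration, not the author's confirmed implementation.
import torch.nn as nn

class EncoderSketch(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim)     # single-layer, unidirectional GRU
        self.dropout = nn.Dropout(dropout)      # dropout applied to the embeddings only

    def forward(self, src):
        # src: [src_len, batch_size]
        embedded = self.dropout(self.embedding(src))  # [src_len, batch_size, emb_dim]
        outputs, hidden = self.rnn(embedded)          # hidden: [1, batch_size, hid_dim]
        return hidden                                 # context vector handed to the decoder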
Question.build_vocab(train_data, min_freq=2)
Answer.build_vocab(
    train_data,
    vectors=torchtext.vocab.Vectors("./python_code_glove_embedding_300.txt"),
    min_freq=2)

print(f"Unique tokens in Question vocabulary: {len(Question.vocab)}")
print(f"Unique tokens in Answer vocabulary: {len(Answer.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32
print('BATCH_SIZE:', BATCH_SIZE)

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort=False,
    device=device)

INPUT_DIM = len(Question.vocab)
OUTPUT_DIM = len(Answer.vocab)
HID_DIM = 300
ENC_LAYERS = 4
DEC_LAYERS = 4
ENC_HEADS = 5
DEC_HEADS = 5
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)
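# --- Hedged sanity-check sketch for the hyperparameters above: multi-head attention
# requires the hidden size to split evenly across the heads, and Answer.vocab.vectors
# holds the pretrained 300-d embeddings loaded by build_vocab. Nothing here is from
# the original snippet beyond the names it already defines.
assert HID_DIM % ENC_HEADS == 0 and HID_DIM % DEC_HEADS == 0, \
    "hidden size must be divisible by the number of heads (300 / 5 = 60 per head)"

pretrained = Answer.vocab.vectors                      # tensor of shape [len(Answer.vocab), 300]
print('pretrained embedding matrix:', tuple(pretrained.shape))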
            preprocessing=tokenizer.convert_tokens_to_ids,
            init_token=init_token_idx,
            eos_token=eos_token_idx,
            pad_token=pad_token_idx,
            unk_token=unk_token_idx)

LABEL = LabelField(dtype=torch.long, use_vocab=False)
fields = [('data', TEXT), ('label', LABEL)]

train, valid, test = TabularDataset.splits(path=SOURCE_FOLDER,
                                           train='train.csv',
                                           validation='validation.csv',
                                           test='test.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

train_generator, val_generator, test_generator = BucketIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False)

if not os.path.exists(log_dir):
    os.makedirs(log_dir)
writer = tensorboardX.SummaryWriter()

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

global_count = 0
for epoch in range(NUM_EPOCHS):
    logger.info('Training on epoch {}'.format(epoch + 1))
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in tqdm.tqdm(train_generator, desc='Training'):
        optimizer.zero_grad()
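        # --- Hedged sketch: one plausible continuation of this training step.
        # The model's call signature is an assumption; batch.data and batch.label
        # follow from the ('data', TEXT) / ('label', LABEL) fields defined above.
        predictions = model(batch.data)                                  # [batch_size, n_classes]
        loss = criterion(predictions, batch.label)
        acc = (predictions.argmax(dim=1) == batch.label).float().mean()

        loss.backward()
        optimizer.step()

        global_count += 1
        writer.add_scalar('train/loss', loss.item(), global_count)      # tensorboardX logging
        epoch_loss += loss.item()
        epoch_acc += acc.item()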