Example #1
def prepare_dataloaders_from_bpe_files(opt, device):
    batch_size = opt.batch_size
    MIN_FREQ = 2
    if not opt.embs_share_weight:
        raise Exception('prepare_dataloaders_from_bpe_files requires embs_share_weight (shared BPE vocab).')

    data = pickle.load(open(opt.data_pkl, 'rb'))
    MAX_LEN = data['settings'].max_len
    field = data['vocab']
    fields = (field, field)

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(
            vars(x)['trg']) <= MAX_LEN

    train = TranslationDataset(fields=fields,
                               path=opt.train_path,
                               exts=('.src', '.trg'),
                               filter_pred=filter_examples_with_length)
    val = TranslationDataset(fields=fields,
                             path=opt.val_path,
                             exts=('.src', '.trg'),
                             filter_pred=filter_examples_with_length)

    opt.max_token_seq_len = MAX_LEN + 2
    opt.src_pad_idx = opt.trg_pad_idx = field.vocab.stoi[Constants.PAD_WORD]
    opt.src_vocab_size = opt.trg_vocab_size = len(field.vocab)

    train_iterator = BucketIterator(train,
                                    batch_size=batch_size,
                                    device=device,
                                    train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)
    return train_iterator, val_iterator
Example #2
def prepare_dataloaders(opt, device):
    batch_size = opt.batch_size
    data = pickle.load(open(opt.data_pkl, 'rb'))

    opt.max_token_seq_len = data['settings'].max_len
    opt.src_pad_idx = data['vocab']['src'].vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]

    opt.src_vocab_size = len(data['vocab']['src'].vocab)
    opt.trg_vocab_size = len(data['vocab']['trg'].vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert data['vocab']['src'].vocab.stoi == data['vocab']['trg'].vocab.stoi, \
            'To share word embeddings, the src/trg word2idx tables must be the same.'

    fields = {'src': data['vocab']['src'], 'trg': data['vocab']['trg']}

    train = Dataset(examples=data['train'], fields=fields)
    val = Dataset(examples=data['valid'], fields=fields)

    train_iterator = BucketIterator(train,
                                    batch_size=batch_size,
                                    device=device,
                                    train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator
Example #3
def buildDataSets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Model parameter
    MAX_SEQ_LEN = 16
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields

    label_field = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        dtype=torch.int8)
    text_field = Field(use_vocab=False,
                       tokenize=tokenizer.encode,
                       lower=False,
                       include_lengths=False,
                       batch_first=True,
                       fix_length=MAX_SEQ_LEN,
                       pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)

    fields = {'label': ('label', label_field), 'text': ('text', text_field)}

    # TabularDataset

    train, valid, test = TabularDataset.splits(path='memesData/data',
                                               train='train.jsonl',
                                               validation='dev_unseen.jsonl',
                                               test='dev_seen.jsonl',
                                               format='JSON',
                                               fields=fields)

    # Iterators

    train_iter = BucketIterator(train,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    valid_iter = BucketIterator(valid,
                                batch_size=8,
                                sort_key=lambda x: len(x.text),
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test,
                         batch_size=8,
                         train=False,
                         shuffle=False,
                         sort=False)
    return train_iter, valid_iter, test_iter
Example #4
def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)
    SRC, TRG, dataset = get_dataset(config.dataset_path, False)
    train_data, valid_data, test_data = split_data(
        dataset, **config.split_ration.__dict__)
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)
    config_encoder = model.config.encoder
    config_decoder = model.config.decoder
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()

    model.save_pretrained("bert2bert")
Example #5
def data_iter(text_field, label_field, train_bs=32, eval_bs=128, is_char_token=False):
    train = MyDataset("data/tnews/train.json", text_field=text_field, label_field=label_field, test=False)
    valid = MyDataset("data/tnews/dev.json", text_field=text_field, label_field=label_field, test=False)
    test = MyDataset("data/tnews/test.json", text_field=text_field, label_field=None, test=True)
    # For char-level input there are no pretrained word vectors
    word_embeddings = None
    if not is_char_token:
        word_embeddings = text_field.vocab.vectors

    # device=-1 means iterate over the dataset on the CPU
    device = 0 if torch.cuda.is_available() else -1
    train_iter = BucketIterator(
        dataset=train, batch_size=train_bs, shuffle=True, train=True,
        sort_key=lambda x: len(x.text), sort=False, device=device)
    val_iter = BucketIterator(valid, eval_bs, train=False, sort_key=lambda x: len(x.text),
                              sort=False, device=device)
    test_iter = Iterator(test, 128, shuffle=False, train=False, sort=False, device=device)

    return train_iter, val_iter, test_iter, word_embeddings
def create_iterators(data_file_location, split_to_train_and_test=True):

    if split_to_train_and_test:

        train, test = TabularDataset(path=data_file_location,
                                     format="TSV",
                                     fields=fields,
                                     skip_header=True).split()

        train_iter = BucketIterator(train,
                                    batch_size=Parameters.BATCH_SIZE,
                                    device=Parameters.DEVICE,
                                    train=True,
                                    shuffle=True)

        test_iter = BucketIterator(test,
                                   batch_size=Parameters.BATCH_SIZE,
                                   device=Parameters.DEVICE,
                                   train=True,
                                   shuffle=True,
                                   sort=False)
        answer = train_iter, test_iter
    else:
        dataset = TabularDataset(path=data_file_location,
                                 format="TSV",
                                 fields=fields,
                                 skip_header=True)
        answer = BucketIterator(dataset,
                                batch_size=Parameters.BATCH_SIZE,
                                device=Parameters.DEVICE,
                                train=True,
                                shuffle=True,
                                sort=False)

    print("Finish dataset prepare")
    return answer
Example #7
def TokenBucket(
    train, batch_size, device="cuda:0", key=lambda x: max(len(x.word[0]), 5)
):
    def batch_size_fn(x, _, size):
        return size + key(x)

    return BucketIterator(
        train,
        train=True,
        sort=False,
        sort_within_batch=True,
        shuffle=True,
        batch_size=batch_size,
        sort_key=lambda x: key(x),
        repeat=True,
        batch_size_fn=batch_size_fn,
        device=device,
    )
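
# Note: TokenBucket above relies on BucketIterator's batch_size_fn hook, so batch_size becomes an
# approximate token budget per batch (each example contributes max(len(example.word[0]), 5))
# rather than a number of examples. A hedged usage sketch; `train_dataset` is an assumed name,
# and repeat=True means the iterator never stops on its own:
train_iter = TokenBucket(train_dataset, batch_size=3000, device="cuda:0")
batch = next(iter(train_iter))  # first token-budgeted batch
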
def make(config, device, train_data, valid_data, test_data, TTX, TRG, ASR):
    # Make the data
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        sort_key=lambda x: len(x.true_text),
        batch_size=config.batch_size,
        device=device)

    # Make the model
    model = make_model(config, device, TTX, TRG, ASR)
    print(f'The model has {count_parameters(model):,} trainable parameters')
    model.apply(initialize_weights)

    # Make the loss and optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    if config['decode_trg']:
        ignore_index = TRG.vocab.stoi[TRG.pad_token]
    else:
        ignore_index = TTX.vocab.stoi[TTX.pad_token]
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    return model, train_iterator, valid_iterator, test_iterator, criterion, optimizer
    TEXT = Field(batch_first=True,
                 use_vocab=False,
                 tokenize=tokenize_and_cut,
                 preprocessing=tokenizer.convert_tokens_to_ids,
                 init_token=init_token_idx,
                 eos_token=eos_token_idx,
                 pad_token=pad_token_idx,
                 unk_token=unk_token_idx)
    LABEL = LabelField(dtype=torch.long, use_vocab=False)
    fields = [('data', TEXT), ('label', LABEL)]
    train, valid, test = TabularDataset.splits(path=source_folder, train='train.csv', validation='validation.csv', test='test.csv',
                                               format='CSV', fields=fields, skip_header=True)

    train_generator, val_generator, test_generator = BucketIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        device=device, sort=False)

    criterion = nn.CrossEntropyLoss()

    criterion = criterion.to(device)
    all_statedict_path = glob.glob('/root/logs/*.pth')
    for state_dict_path in all_statedict_path:
        print(state_dict_path)
        epoch_loss = 0
        epoch_acc = 0
        model = phobert_lstm(phobert_path=phobert_path,
                             state_dict_path=state_dict_path,
                             hidden_dim=hidden_dim,
                             num_classes=num_classes,
                             
Example #10
                                        test='test.csv',
                                        format='csv',
                                        fields=fields)

len(train_data) , len(test_data)

print(vars(train_data.examples[0]))

Texto.build_vocab(train_data, max_size=10000, min_freq=1,vectors="glove.6B.100d")

Texto.vocab.freqs.most_common(25)

Texto.vocab.itos[:10]

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=2, device=device
)

class RNN_LSTM(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # Set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(device)
Example #11
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import flor
flor.flags.NAME = 'kaggle-nlp-disasters-rnn'
flor.flags.REPLAY = False
device = torch.device(('cuda:0' if torch.cuda.is_available() else 'cpu'))
device
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
fields = [('words', text_field), ('target', label_field)]
fields_test = [('words', text_field)]
(train, valid) = TabularDataset.splits(path='data', train='train_rnn.csv', validation='valid_rnn.csv', format='CSV', fields=fields, skip_header=True)
test = TabularDataset(path='data/test_rnn.csv', format='CSV', fields=fields_test, skip_header=True)
train_iter = BucketIterator(train, batch_size=200, sort_key=(lambda x: len(x.words)), device=device, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=200, sort_key=(lambda x: len(x.words)), device=device, sort=True, sort_within_batch=True)
test_iter = BucketIterator(test, batch_size=200, sort_key=(lambda x: len(x.words)), device=device, sort=True, sort_within_batch=True)
text_field.build_vocab(train, min_freq=5)

class LSTM(nn.Module):

    def __init__(self, dimension=128):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(text_field.vocab), 300)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=300, hidden_size=dimension, num_layers=1, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear((2 * dimension), 1)

    def forward(self, text, text_len):
Example #12
    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train, validate, test), batch_size=batch_size, device=device)

        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator
Example #13
def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)
    SRC, TRG, dataset = get_dataset(config.dataset_path,
                                    config.net_params.transformer)
    train_data, valid_data, test_data = split_data(
        dataset, **config.split_ration.__dict__)
    if config.net_params.pretrained_emb:
        src_vectors = torchtext.vocab.FastText(language='ru')
    SRC.build_vocab(train_data, min_freq=3)
    if config.net_params.pretrained_emb:
        SRC.vocab.load_vectors(src_vectors)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    if config.net_params.attention:
        Encoder = network_gru_attention.Encoder
        Decoder = network_gru_attention.Decoder
        Seq2Seq = network_gru_attention.Seq2Seq
        Attention = network_gru_attention.Attention
        attn = Attention(config.net_params.HID_DIM, config.net_params.HID_DIM)
        enc = Encoder(INPUT_DIM, config.net_params.ENC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.HID_DIM,
                      config.net_params.ENC_DROPOUT)
        dec = Decoder(OUTPUT_DIM, config.net_params.DEC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.HID_DIM,
                      config.net_params.DEC_DROPOUT, attn)

        model = Seq2Seq(enc, dec, device)
    if config.net_params.transformer:
        logger.info("Transformer lets go")
        Encoder = network_transformer.Encoder
        Decoder = network_transformer.Decoder
        Seq2Seq = network_transformer.Seq2Seq
        SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
        TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
        HID_DIM = 512
        ENC_LAYERS = 6
        DEC_LAYERS = 6
        ENC_HEADS = 8
        DEC_HEADS = 8
        ENC_PF_DIM = 2048
        DEC_PF_DIM = 2048
        ENC_DROPOUT = 0.1
        DEC_DROPOUT = 0.1

        enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
                      ENC_DROPOUT, device)

        dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
                      DEC_DROPOUT, device)
        model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device)
    if not config.net_params.attention and not config.net_params.transformer:
        Encoder = my_network.Encoder
        Decoder = my_network.Decoder
        Seq2Seq = my_network.Seq2Seq
        enc = Encoder(INPUT_DIM, config.net_params.ENC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.N_LAYERS,
                      config.net_params.ENC_DROPOUT)
        dec = Decoder(OUTPUT_DIM, config.net_params.DEC_EMB_DIM,
                      config.net_params.HID_DIM, config.net_params.N_LAYERS,
                      config.net_params.DEC_DROPOUT)
        model = Seq2Seq(enc, dec, device)

    model.apply(init_weights)
    if config.net_params.pretrained_emb:
        model.encoder.tok_embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(SRC.vocab.vectors))
    model.to(device)
    PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
    optimizer = optim.Adam(model.parameters(), config.lr)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, **config.lr_scheduler.__dict__)
    train_history = []
    valid_history = []
    best_valid_loss = float('inf')
    print("Let's go")
    # for p in model.encoder.parameters():
    #     p.requires_grad = True
    # for p in model.decoder.parameters():
    #     p.requires_grad = True

    for epoch in range(config.N_EPOCHS):

        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion,
                           config.CLIP, train_history, valid_history)
        valid_loss = evaluate(model, valid_iterator, criterion)
        lr_scheduler.step(valid_loss)
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), config.model_out_name)

        train_history.append(train_loss)
        valid_history.append(valid_loss)
        writer.add_scalar('train loss', train_history[-1], epoch)
        writer.add_scalar('valid loss', valid_history[-1], epoch)
        writer.add_scalar('learning rate', lr_scheduler._last_lr[0], epoch)
        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}'
        )
        for idx, batch in enumerate(valid_iterator):
            if idx > 3:
                break
            src = batch.src[:, idx:idx + 1]
            trg = batch.trg[:, idx:idx + 1]
            generate_translation(src, trg, model, TRG.vocab, SRC.vocab,
                                 config.net_params.transformer)

    get_bleu(model, test_iterator, TRG, config.net_params.transformer)
def data_preprocessing():
    SEED = 1234

    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # import de_core_news_sm, en_core_web_sm
    # spacy_de = de_core_news_sm.load()
    # spacy_en = en_core_web_sm.load()
    # spacy_de = spacy.load('de_core_news_sm')
    # spacy_en = spacy.load('en_core_web_sm')

    # A Field object specifies how a column is processed: the tokenization function,
    # lowercasing, start/end tokens, the padding token, the vocabulary, and so on.
    # We create two Field objects, SRC and TRG; tokenize is the tokenizer function defined above.
    # Each sentence gets <sos> prepended and <eos> appended, and all words are lowercased.
    SRC = Field(tokenize=tokenize_de,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)
    TRG = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    # The splits method loads the training, validation and test sets in one call.
    # exts selects the source and target languages; fields takes the Field objects defined above.
    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                        fields=(SRC, TRG))

    # print(f"Number of training examples: {len(train_data.examples)}")
    # print(f"Number of validation examples: {len(valid_data.examples)}")
    # print(f"Number of testing examples: {len(test_data.examples)}")

    # vars() returns an object's attributes and their values as a dict.
    # print(vars(train_data.examples[0]))

    # Build the vocabularies, i.e. map each word to an integer id so it can be fed to the model.
    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)

    # print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    # print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # print(device)

    BATCH_SIZE = 128

    # BucketIterator: unlike a standard iterator, it groups examples of similar length into a batch.
    # In text processing every batch is padded to the length of its longest sequence,
    # so when sequence lengths vary a lot, BucketIterator noticeably reduces the amount of padding.
    # Alternatively, the fix_length argument of Field truncates/pads every example to a fixed length.

    # When an iterator produces a batch, all source sentences must be padded to the same length,
    # and likewise for the target sentences. torchtext does this automatically using dynamic
    # padding: every sentence in a batch is padded to the longest sentence in that batch.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device)

    return SRC, TRG, device, train_iterator, valid_iterator, test_iterator
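
# A hedged usage sketch (not part of the original example): calling data_preprocessing() and
# inspecting a few batches shows the dynamic padding described above. With the default Fields,
# batch.src has shape [src_len, batch_size], where src_len is the longest source sentence in
# that particular batch and therefore changes from batch to batch.
SRC, TRG, device, train_iterator, valid_iterator, test_iterator = data_preprocessing()
for i, batch in enumerate(train_iterator):
    if i == 3:
        break
    print(batch.src.shape, batch.trg.shape)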
Example #15
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>")

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                        fields=(german,
                                                                english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get nice loss plot
    writer = SummaryWriter(f"runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)

    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab),
                    device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has {count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model,
                        optimizer)

    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        print(
            f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]"
        )

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        # save_checkpoint(checkpoint)

        model.eval()

        translated_sentence = translate_sentence(model,
                                                 sentence,
                                                 german,
                                                 english,
                                                 device,
                                                 max_length=50)

        print(f"Translated example sentence: \n {translated_sentence}")

        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and get to cuda
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)

            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
            # doesn't take input in that form. For example if we have MNIST we want to have
            # output to be: (N, 10) and targets just (N). Here we can view it in a similar
            # way: we have output_words * batch_size rows that we want to send into
            # our cost function, so we need to do some reshaping.
            # While we're at it, let's also remove the start token.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score*100:.2f}")
Example #16

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# In[ ]:


def _len_sort_key(x):
    return len(x.src)

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key=_len_sort_key
)


# ## Building the Model
# 
# Next, we'll build the model. Like previous notebooks, it is made up of an *encoder* and a *decoder*, with the encoder *encoding* the input/source sentence (in Russian) into a *context vector* and the decoder then *decoding* this context vector to produce our output/target sentence (in English).
# 
# ### Encoder
# 
# The Transformer's encoder does not attempt to compress the entire source sentence, $X = (x_1, ... ,x_n)$, into a single context vector, $z$. Instead it produces a sequence of context vectors, $Z = (z_1, ... , z_n)$. So, if our input sequence was 5 tokens long we would have $Z = (z_1, z_2, z_3, z_4, z_5)$. Why do we call this a sequence of context vectors and not a sequence of hidden states? A hidden state at time $t$ in an RNN has only seen tokens $x_t$ and all the tokens before it. However, each context vector here has seen all tokens at all positions within the input sequence.
# 
# ![](assets/transformer-encoder.png)
# 
# First, the tokens are passed through a standard embedding layer. Next, as the model has no recurrence, it has no idea about the order of the tokens within the sequence. We solve this by using a second embedding layer called a *positional embedding layer*. This is a standard embedding layer where the input is not the token itself but the position of the token within the sequence, starting with the first token, the `<sos>` (start of sequence) token, in position 0. The position embedding has a "vocabulary" size of 100, which means our model can accept sentences up to 100 tokens long. This can be increased if we want to handle longer sentences.
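# 
# Below is a minimal sketch of the token-plus-positional embedding described above. It is an
# illustration under assumed names and dimensions (`TokenAndPositionEmbedding`, `hid_dim`,
# `max_length=100`), not the notebook's actual Encoder.

import torch
import torch.nn as nn

class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, hid_dim, max_length=100):
        super().__init__()
        self.tok_embedding = nn.Embedding(vocab_size, hid_dim)
        # positions 0..max_length-1 form the "vocabulary" of the positional embedding
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim]))

    def forward(self, src):
        # src: [batch_size, src_len] of token ids
        batch_size, src_len = src.shape
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        # scaled token embedding plus positional embedding: the input to the encoder layers
        return self.tok_embedding(src) * self.scale.to(src.device) + self.pos_embedding(pos)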
rootPath = root_path
train_path = 'train.csv'
test_path = 'test.csv'

train_data_pt, devdata = data.TabularDataset.splits(path=rootPath,
                                                    train=train_path,
                                                    test=test_path,
                                                    fields=fieldsMy,
                                                    skip_header=True,
                                                    format='csv')
#print(train_data_pt.review, devdata)

text_field.build_vocab(train_data_pt.review, max_size=3000)

train_iter, dev_iter = BucketIterator.splits((train_data_pt, devdata),
                                             sort_key=lambda x: len(x.review),
                                             batch_sizes=(128, 256),
                                             sort=False,
                                             sort_within_batch=False,
                                             repeat=False)


def save_vocab(path):
    file = open(path, 'w+', encoding='utf-8')
    for i in range(len(text_field.vocab)):
        #print(text_field.vocab.itos[i]+'\n')
        file.write(text_field.vocab.itos[i] + '\n')
    file.close()


save_vocab(root_path + 'vocab.txt')
Example #18
    x = target[0]
    # autoregression / teacher forcing
    for t in range(1, seq_len):
        output, h, c = self.decoder(x, states, h, c)
        outputs[t] = output
        best_guess = output.argmax(1)
        x = target[t] if random.random() < ratio else best_guess
    return outputs

epochs = 60
encoder_vocab = len(german.vocab)
decoder_vocab = len(english.vocab)
embed_encoder = 300
embed_decoder = 300

TrainD, ValD, TestD = BucketIterator.splits((Train, Val, Test),
                                            batch_size=64,
                                            sort_within_batch=True,
                                            sort_key=lambda x: len(x.src),
                                            device=device)

for a, b in TrainD:
    print(a[0].shape, a[1].shape)

encoder_net = Encoder(encoder_vocab, embed_encoder, 256).to(device)
decoder_net = AttentionDecoder(decoder_vocab, embed_decoder, 256, decoder_vocab).to(device)
Sequence_net = Encoder_Decoder(encoder_net, decoder_net).to(device)

pad_index = english.vocab.stoi['<pad>']
optimizer = opt.Adam(Sequence_net.parameters())
loss_f = CrossEntropyLoss(ignore_index=pad_index)

for i in range(epochs):
                                      format='csv',
                                      skip_header=True,
                                      fields=datafields)

print(f"Number of {data_size} training examples: {len(trn.examples)}")
print(f"Number of {data_size} validation examples: {len(vld.examples)}")
print(f"Number of {data_size} test examples: {len(tst.examples)}")

INPUT.build_vocab(trn)
TARGET.build_vocab(trn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, val_iter, test_iter = BucketIterator.splits(
    (trn, vld, tst),
    sort_key=lambda x: len(x.input),
    sort_within_batch=False,
    batch_size=BATCH_SIZE,
    device=device)
"""
Build Transformer
"""


class TransformerModel(nn.Module):
    def __init__(self,
                 model_type,
                 intoken,
                 outtoken,
                 hidden,
                 enc_layers=3,
                 dec_layers=1,
Example #20
class DataFrameDataset(torchtext.legacy.data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [Example.fromlist(list(r), fields) for i, r in df.iterrows()],
            fields)


train_dataset, test_dataset = DataFrameDataset(df=df,
                                               fields=(('text', text_field),
                                                       ('label',
                                                        label_field))).split()
with open(data_dir + 'n_labels.pkl', 'rb') as f:
    n_classes = pkl.load(f)
train_iter, test_iter = BucketIterator.splits(datasets=(train_dataset,
                                                        test_dataset),
                                              batch_sizes=(32, n_classes),
                                              sort=False)


class ModelParam(object):
    def __init__(self, param_dict: dict = dict()):
        self.input_size = param_dict.get('input_size', 0)
        self.vocab_size = param_dict.get('vocab_size')
        self.embedding_dim = param_dict.get('embedding_dim', 300)
        self.target_dim = param_dict.get('target_dim', n_classes)


class MyModel(nn.Module):
    def __init__(self, model_param: ModelParam):
        super().__init__()
        self.embedding = nn.Embedding(model_param.vocab_size,
Example #21
# TabularDataset

train, valid, test = TabularDataset.splits(path='./data',
                                           train='IMDB_single.csv',
                                           validation='IMDBs.csv',
                                           test='IMDBs.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

# Iterators

train_iter = BucketIterator(train,
                            batch_size=16,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=True,
                            sort=True,
                            sort_within_batch=True)
valid_iter = BucketIterator(valid,
                            batch_size=16,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=True,
                            sort=True,
                            sort_within_batch=True)
test_iter = Iterator(test,
                     batch_size=16,
                     device=device,
                     train=False,
                     shuffle=False,
Example #22
            batch_first=True)

TGT = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TGT))

SRC.build_vocab(train_data, min_freq=2)
TGT.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 8
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE)

if __name__ == "__main__":
    syn_data = synthetic_data(8, 2, 1)
    for i, batch in enumerate(syn_data):
        logging.info("batch-src shape {}, batch-src: {}".format(
            batch.src.shape, batch.src))
        logging.info("batch-tgt shape {}, batch-tgt: {}".format(
            batch.tgt.shape, batch.tgt))
        logging.info("batch-src-mask shape {}, batch-src-mask: {}".format(
            batch.src_mask.shape, batch.src_mask))
        logging.info("batch-tgt-mask shape {}, batch-src-mask: {}".format(
            batch.tgt_mask.shape, batch.tgt_mask))
        logging.info("batch-tgt-y shape {}, trg_y: {}".format(
            batch.tgt_y.shape, batch.tgt_y))
Example #23
    def train_adv(self, train_loader, val_loader, test_loader):
        """ Adversarially training process"""
        ## Step1. Train detector and generator
        best_model = os.path.join(self.args.savepath, 'best_model.pt')
        if not os.path.exists(best_model):
            self.forward(train_loader, val_loader, test_loader, maxepoch=3)
        logger.info('Loading pre-trained generator and detector')
        self.model.load_state_dict(torch.load(best_model, map_location=lambda storage, loc: storage)['model'])


        ## Step2. Train adv generator and fix detector
        # Load checkpoint
        best_adv_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
        if os.path.exists(best_adv_model):
            logger.info('Loading pre-trained adv generator')
            self.model.load_state_dict(torch.load(best_adv_model, map_location=lambda storage, loc: storage)['model'])
        else:
            logger.info('Adversarially train generator -----------------------')
            train = CommentDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer, reverse_label=True)
            val = CommentDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer, reverse_label=True)
            train_iter, val_iter = BucketIterator.splits((train, val), sort_key=lambda x: len(x.src),
                sort_within_batch=False, batch_size=self.bs, device=self.device) # 3906, 977
            test = TreeDataset(self.args, test_loader, self.dataname, self.device, self.tokenizer)
            test_iter = Iterator(test, train=False, device=self.device, batch_size=self.bs,
                sort_key=lambda x: len(x.src), sort_within_batch=False)

            # Define trainer
            train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True,
                    self.args.train_gen ,self.model.generator, self.symbols,
                    self.model.vocab_size, self.args.label_smoothing)
            trainer = build_trainer(self.args, self.model, self.optim, train_loss, wandb=self.wandb)

            tot_train_steps = self.maxepoch * len(train_iter)
            test_stats = trainer.testing(test_iter, -1, gen_flag=True)
            test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'), 'Before', self.args.label_num)

            # Freeze the detector
            for param in self.model.parameters():
                param.requires_grad = False
            for param in self.model.decoder.parameters():
                param.requires_grad = True

            best_total_loss = 100
            lowest_acc = 1
            stop_count = 0
            for epoch in range(self.maxepoch):
                print('[Adv train generator - fold {}] {}/{}'.format(self.fold, epoch, self.maxepoch))

                message = '{}-{} epoch {}/{} adv gen'.format(self.dataname, self.fold, epoch, 4)
                trainer.train(train_iter, tot_train_steps, message)
                val_stats = trainer.validate(val_iter, epoch)
                test_stats = trainer.testing(test_iter, epoch, gen_flag=False)
                test_stats = trainer.testing(test_iter, epoch, gen_flag=True) # polluted data
                test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'), '{}-att'.format(epoch), self.args.label_num)
                #Save best model
                if test_stats.det_acc() < lowest_acc:
                    logger.info('Save Adv model at epoch {}'.format(epoch))
                    lowest_acc = test_stats.det_acc()
                    trainer._save('best_adv')
                    stop_count = 0
                else:
                    stop_count += 1

                if stop_count == 3:
                    break

        
        ## Step3. Train adv detector and fix generator
        logger.info('Adversarially train detector ---------------------------')
        train = CommentDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer)
        val = CommentDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer)
        train_iter, val_iter = BucketIterator.splits((train, val), sort_key=lambda x: len(x.src),
            sort_within_batch=False, batch_size=self.bs, device=self.device) # 3906, 977
        test = TreeDataset(self.args, test_loader, self.dataname, self.device, self.tokenizer)
        test_iter = Iterator(test, train=False, device=self.device, batch_size=self.bs,
            sort_key=lambda x: len(x.src), sort_within_batch=False)

        # Load checkpoint
        best_adv_model = os.path.join(self.args.savepath, 'best_adv_model.pt')
        self.model.load_state_dict(torch.load(best_adv_model, map_location=lambda storage, loc: storage)['model'])

        # Freeze the generator
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.classifier.parameters():
            param.requires_grad = True
        for param in self.model.bert.parameters():
            param.requires_grad = True

        # Define trainer
        optim  = [build_optim(self.args, self.model, None)]
        train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True,
                False ,self.model.generator, self.symbols,
                self.model.vocab_size, self.args.label_smoothing)

        trainer = build_trainer(self.args, self.model, optim, train_loss, wandb=self.wandb)
        test_stats = trainer.testing(test_iter, -1, gen_flag=False) # clean data
        test_stats = trainer.testing(test_iter, -1, gen_flag=True) # polluted data

        tot_train_steps = self.maxepoch * len(train_iter)
        best_xent = 10
        stop_count = 0
        for epoch in range(self.maxepoch):
            print('[Adv train detector] {}/{}'.format(epoch, self.maxepoch))
            # Freeze the detector
            message = '{}-{} epoch {}/{} adv det'.format(self.dataname, self.fold, epoch, 5)
            trainer.train(train_iter, tot_train_steps, message)
            val_stats = trainer.validate(val_iter, epoch)
            test_stats = trainer.testing(test_iter, epoch, gen_flag=False) # clean data
            test_stats = trainer.testing(test_iter, epoch, gen_flag=True) # polluted data
            test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'), '{}-adv'.format(epoch), self.args.label_num)

            # Save best model
            if val_stats.det_xent() < best_xent:
                print('Save model at epoch {}'.format(epoch))
                trainer._save('best_final')
                best_xent = val_stats.det_xent()
                stop_count = 0
            else:
                stop_count += 1

            if stop_count == 3:
                break
Example #24
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter(f'runs/loss_plot')
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src))

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size,
                      num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size,
                      output_size, num_layers, dec_dropout).to(device)

pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
## complete training loop
Example #25
    def forward(self, train_loader, val_loader, test_loader, maxepoch=None):
        """ Normal training process """

        # Build data iterator for generator (One node on the tree is converted to one data)
        #train = TreeDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer)
        train = CommentDataset(self.args, train_loader, self.dataname, self.device, self.tokenizer)
        
        sample_per_cls = None
        #sample_per_cls = train.sample_per_cls(self.args.label_num)

        train_iter = BucketIterator(train, sort_key=lambda x: len(x.src),
            sort_within_batch=False, batch_size=self.bs, device=self.device)

        #val = TreeDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer)
        val = CommentDataset(self.args, val_loader, self.dataname, self.device, self.tokenizer)
        val_iter = BucketIterator(val, sort_key=lambda x: len(x.src),
            sort_within_batch=False, batch_size=96, device=self.device)

        test = TreeDataset(self.args, test_loader, self.dataname, self.device, self.tokenizer)
        test_iter = Iterator(test, train=False, device=self.device, batch_size=96,
            sort_key=lambda x: len(x.src), sort_within_batch=False)

        # Define trainer
        if self.args.train_gen:
            train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True,
                self.args.train_gen ,self.model.generator, self.symbols,
                self.model.vocab_size, self.args.label_smoothing, sample_per_cls=sample_per_cls)
        else:
            train_loss = abs_loss(self.args.label_num, self.maxepoch, self.device, True, sample_per_cls = sample_per_cls)

        trainer = build_trainer(self.args, self.model, self.optim, train_loss, wandb=self.wandb)

        # Start training
        best_loss = 10
        stop_count = 0
        tot_train_steps = self.maxepoch * len(train_iter)
        gen_flag = self.args.train_gen or self.args.test_gen
        if not maxepoch:
            maxepoch = self.maxepoch
        logger.info('Start training')
        for epoch in range(maxepoch):
            print('[Training] {}/{}'.format(epoch, self.maxepoch))

            if self.args.train_gen:
                job = 'gen det'
            else:
                job = 'det'
            message = '{}-{} epoch {} {}/{} '.format(self.dataname, self.fold, job, epoch, self.maxepoch)
            trainer.train(train_iter, tot_train_steps, message)
            val_stats = trainer.validate(val_iter, epoch)
            test_stats = trainer.testing(test_iter, epoch, gen_flag=gen_flag)
            test_stats.write_results(os.path.join(self.args.savepath, 'result_test.csv'), str(epoch), self.args.label_num)

            val_det_loss = val_stats.det_xent()
            # Save best model
            if val_det_loss < best_loss:
                print('Save model at epoch {}'.format(epoch))
                trainer._save('best')
                best_loss = val_det_loss
                stop_count = 0
            else:
                stop_count += 1
Example #26
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
    fields=fields,
    skip_header=True,
)
test = TabularDataset(path="data/test_rnn.csv",
                      format="CSV",
                      fields=fields_test,
                      skip_header=True)

train_iter = BucketIterator(
    train,
    batch_size=flor.log("batch_size", 200),
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)
valid_iter = BucketIterator(
    valid,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)
test_iter = BucketIterator(
    test,
    batch_size=200,
    sort_key=lambda x: len(x.words),
Example #27
Question.build_vocab(train_data, min_freq=2)
Answer.build_vocab(
    train_data,
    vectors=torchtext.vocab.Vectors("./python_code_glove_embedding_300.txt"),
    min_freq=2)

print(f"Unique tokens in Question vocabulary: {len(Question.vocab)}")
print(f"Unique tokens in Answer vocabulary: {len(Answer.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32

print('BATCH_SIZE:', BATCH_SIZE)

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), batch_size=BATCH_SIZE, sort=False, device=device)

INPUT_DIM = len(Question.vocab)
OUTPUT_DIM = len(Answer.vocab)
HID_DIM = 300
ENC_LAYERS = 4
DEC_LAYERS = 4
ENC_HEADS = 5
DEC_HEADS = 5
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
              ENC_DROPOUT, device)