def _createFields(self, min_occurance_freq):
    self.CAPTION_FIELD = data.ReversibleField(
        tokenize='spacy', init_token=self.start_token,
        eos_token=self.end_token, pad_token=self.pad_token, lower=True,
        batch_first=True, is_target=True, unk_token=UNKNOWN_TOKEN)
    self.INDEX_FIELD = data.Field(
        sequential=False, use_vocab=False, batch_first=True)

    if self.use_yt_categories:
        # preprocessing: if there is no category, replace it with -1
        # (a unique number for the dummy category)
        self.CATEGORY_FIELD = data.Field(
            sequential=False, use_vocab=False, batch_first=True,
            preprocessing=data.Pipeline(
                lambda x: -1 if len(x) == 0 else int(float(x))))
        # filter the dataset if a category is missing (31 -> 41 (count = 1 :()))
        self.filter_callback = (
            lambda x: vars(x)['category_32'] != -1
            and vars(x)['category_32'] != 31)
    else:
        self.CATEGORY_FIELD = None
        self.filter_callback = None

    if self.use_asr_subtitles:
        self.ASR_SUBTITLES_FIELD = data.ReversibleField(
            tokenize='spacy', init_token=self.start_token,
            eos_token=self.end_token, pad_token=self.pad_token, lower=True,
            batch_first=True, unk_token=UNKNOWN_TOKEN)
    else:
        self.ASR_SUBTITLES_FIELD = None
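# Quick standalone check (my own illustration, legacy torchtext API assumed):
# the Pipeline above maps an empty category cell to -1 and parses
# "31.0"-style strings to ints.
from torchtext import data

to_category = data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x)))
print(to_category('31.0'), to_category(''))  # -> 31 -1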
def get_dataset_iter(args, data_name="MR"):
    print("Loading data...")
    TEXT = data.ReversibleField(lower=True, include_lengths=True,
                                batch_first=True)
    LABEL = data.Field(sequential=False)
    if data_name == "MR":
        train, test = MR.splits(TEXT, LABEL)
    else:
        train, test = myset.splits(TEXT, LABEL)

    print("Building vocabulary...")
    TEXT.build_vocab(train)
    LABEL.build_vocab(train)
    # print(type(TEXT.vocab.stoi))

    train_iter, test_iter = data.BucketIterator.splits(
        (train, test), sort_key=lambda x: len(x.text),
        sort_within_batch=True, batch_size=args.batch_size,
        device=-1,  # device=-1 keeps tensors on the CPU (legacy convention)
        repeat=False)
    args.embed_num = len(TEXT.vocab)
    # subtract 1 for the <unk> entry that a non-sequential Field still adds
    args.class_num = len(LABEL.vocab) - 1
    print("Finished loading data...")
    return train_iter, test_iter
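# Why `len(LABEL.vocab) - 1` above: a non-sequential Field still reserves an
# <unk> special, so the class count excludes it. A self-contained check
# (illustrative, legacy torchtext API assumed):
from torchtext import data

LABEL = data.Field(sequential=False)
fields = [('label', LABEL)]
ds = data.Dataset([data.Example.fromlist([c], fields) for c in ('pos', 'neg')],
                  fields)
LABEL.build_vocab(ds)
print(LABEL.vocab.itos)  # e.g. ['<unk>', 'neg', 'pos'] -> 3 entries, 2 classes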
def create_reversible_field(sql_vocab):
    sql_tokenizer = lambda x: x.split(Constants.SQL_SEPARATOR)
    sql = textdata.ReversibleField(tokenize=sql_tokenizer,
                                   init_token=Constants.BOS_WORD,
                                   eos_token=Constants.EOS_WORD,
                                   pad_token=Constants.PAD_WORD,
                                   unk_token=Constants.UNK_WORD)
    sql.vocab = sql_vocab
    return sql
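# A minimal round-trip sketch (my own illustration, not from the source):
# numericalize a toy corpus with a ReversibleField, then decode a batch of
# index tensors back to text with .reverse(). Assumes the legacy torchtext
# (<0.9) API; note that .reverse() imports the `revtok` package for
# detokenization whenever a non-list tokenizer is used.
from torchtext import data as textdata

FIELD = textdata.ReversibleField(tokenize=lambda s: s.split(),
                                 batch_first=True)
fields = [('text', FIELD)]
examples = [textdata.Example.fromlist([s], fields)
            for s in ('select col from tab', 'select a from b where c')]
dataset = textdata.Dataset(examples, fields)
FIELD.build_vocab(dataset)

it = textdata.BucketIterator(dataset, batch_size=2,
                             sort_key=lambda x: len(x.text), repeat=False)
batch = next(iter(it))
print(FIELD.reverse(batch.text))  # back to the original strings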
def main():
    tokenizer = MyTokenizer()
    TEXT = data.Field(sequential=True, use_vocab=False,
                      tokenize=tokenizer.numbericalized_tokenize, pad_token=0)
    SUMMARY = data.ReversibleField(sequential=True, init_token='<sos>',
                                   eos_token='<eos>')

    print('Data Loading...')
    train_data = data.TabularDataset(
        path='/home/yilin10945/summary/data/newsroom/train.200.json',
        format='json',
        fields={
            'text': ('text', TEXT),
            'summary': ('summary', SUMMARY)
        })
    SUMMARY.build_vocab(train_data, max_size=30000)
    # import pickle
    # pickle.dump((train_data, TEXT, SUMMARY), open('model/processed_data.pkl', 'wb'))
    print('Data Loaded!!!')

    hidden_size = 768
    vocab_size = len(SUMMARY.vocab)
    learning_rate = 0.0001
    n_epochs = 10
    batch_size = 16

    embedding = nn.Embedding(vocab_size, hidden_size)
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    bert_model.eval()
    attn_decoder = LuongAttnDecoderRNN('general', embedding, hidden_size,
                                       vocab_size, 1, 0.1).to(device)
    decoder_optimizer = optim.Adam(attn_decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    print('Start Training...')
    for epoch in range(n_epochs):
        running_loss = 0
        step = 0
        for batch in tqdm.tqdm(
                data.BucketIterator(dataset=train_data,
                                    batch_size=batch_size)):
            loss = train(batch.text, batch.summary.to(device), bert_model,
                         attn_decoder, decoder_optimizer, criterion)
            running_loss += loss
            step += batch_size
            if step % 128 == 0:
                print(f'Step: {step}, Training Loss: {running_loss/step}')
                torch.save(attn_decoder.state_dict(), f'model/{step}.pt')
        epoch_loss = running_loss / len(train_data)
        print(f'Epoch: {epoch}, Training Loss: {epoch_loss}')
def load_data(batch_size):
    # sentence_field = data.ReversibleField(lower=True)
    sentence_field = torchtextdata.ReversibleField(
        lower=False, sequential=True
    )  # we will take care of lowercasing after the character model
    labels_field = torchtextdata.Field(lower=False, sequential=True)
    [train_iter, dev_iter] = read_train_and_dev_splits(
        sentence_field, labels_field, batch_size)
    return [train_iter, dev_iter, sentence_field, labels_field]
def __init__(self, config):
    device = config.device

    # use the configured tokenizer, or none at all
    tokenize = config.tokenize if config.tokenize else None

    # fields
    self.TEXT = data.ReversibleField(batch_first=True, tokenize=tokenize,
                                     lower=True)
    self.LABEL = data.ReversibleField(sequential=False, unk_token=None)

    # data split
    self.train, self.dev, self.test = datasets.MultiNLI.splits(
        self.TEXT, self.LABEL)

    # build vocabs
    self.TEXT.build_vocab(self.train, self.dev, self.test)
    self.LABEL.build_vocab(self.train)

    # add word vectors
    add_vocab_vectors(self.TEXT, config)

    # create iterators
    self.train_iter, self.dev_iter, self.test_iter = \
        data.BucketIterator.splits(
            (self.train, self.dev, self.test),
            batch_sizes=(config.batch_size, config.batch_size,
                         config.batch_size),
            device=device)
    self.train_iter.repeat = False
    self.dev_iter.repeat = False
    self.test_iter.repeat = False

    self.max_word_len = max([len(w) for w in self.TEXT.vocab.itos])
    self.vocab = self.TEXT.vocab
    config.n_embed = len(self.TEXT.vocab)
    config.d_out = len(self.LABEL.vocab)  # output size, num of classes
def load_mentions_dataset(chat_dataset_path, csv_reader_params,
                          word_embedding_file, fix_len=None):
    word_embedding_vectors = None
    if word_embedding_file:
        if word_embedding_file.endswith(".pt"):
            word_embedding_file = word_embedding_file[:-3]
        word_embedding_vectors = vocab.Vectors(
            word_embedding_file, cache=path.dirname(word_embedding_file))

    message_field = data.ReversibleField(sequential=True,
                                         # tokenize=heb_tokenize,
                                         fix_length=fix_len,
                                         init_token=config['SOS_TOKEN'],
                                         eos_token=config['EOS_TOKEN'],
                                         pad_first=False,
                                         include_lengths=True)
    fields = {'message': ('message', message_field)}
    dataset = data.TabularDataset(
        path=chat_dataset_path,
        format='csv',
        csv_reader_params=csv_reader_params,
        skip_header=False,
        fields=fields)

    if word_embedding_file:
        message_field.build_vocab(dataset, vectors=word_embedding_vectors)
    else:
        message_field.build_vocab(dataset)

    corpus_size = len(message_field.vocab)
    return message_field.vocab.vectors, corpus_size, dataset
            vocab.itos[ix.item() if hasattr(ix, "item") else ix] for ix in ex
        ]))
    return textlist


if __name__ == "__main__":
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    config = load_model_config(sys.argv[1])
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)

    # Dataset format
    text_field = torch_data.ReversibleField(sequential=True, lower=True,
                                            use_vocab=True,
                                            include_lengths=True,
                                            fix_length=256, tokenize="spacy")
    label_field = torch_data.Field(sequential=False, use_vocab=False,
                                   is_target=True)
    example_template = [('document', text_field), ('question', text_field),
                        ('answer1', text_field), ('answer2', text_field),
                        ('correct', label_field)]

    if sys.argv[2] == "train":
        # Read dataset
        mcscript_train, mcscript_dev, mcscript_val = mcread.read_mcscript(
            config["dataset_dir"], example_template)
        # Construct vocabulary
def caption_iterator(cfg, batch_size, phase):
    print(f'Constructing caption_iterator for "{phase}" phase')
    spacy_en = spacy.load('en')

    # note: unused below, since the field passes tokenize='spacy' directly
    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(tokenize='spacy',
                                   init_token=cfg.start_token,
                                   eos_token=cfg.end_token,
                                   pad_token=cfg.pad_token, lower=True,
                                   batch_first=True, is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('phase', None),
        ('idx', INDEX),
    ]
    dataset = data.TabularDataset(
        path=cfg.train_meta_path, format='tsv', skip_header=True,
        fields=fields,
    )
    CAPTION.build_vocab(dataset.caption, min_freq=cfg.min_freq_caps,
                        vectors=cfg.word_emb_caps)
    train_vocab = CAPTION.vocab

    if phase == 'val_1':
        dataset = data.TabularDataset(path=cfg.val_1_meta_path, format='tsv',
                                      skip_header=True, fields=fields)
    elif phase == 'val_2':
        dataset = data.TabularDataset(path=cfg.val_2_meta_path, format='tsv',
                                      skip_header=True, fields=fields)
    elif phase == 'learned_props':
        dataset = data.TabularDataset(path=cfg.val_prop_meta_path,
                                      format='tsv', skip_header=True,
                                      fields=fields)

    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption))
    datasetloader = data.BucketIterator(dataset, batch_size,
                                        sort_key=lambda x: 0,
                                        device=torch.device(cfg.device),
                                        repeat=False, shuffle=True)
    return train_vocab, datasetloader
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 64
embedding_dim = 200
hidden_dim = 200
epochs = 5

# define Field
TEXT = data.ReversibleField(lower=True, include_lengths=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, test = datasets.IMDB.splits(TEXT, LABEL)

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_dim))
LABEL.build_vocab(train)

train_iter, test_iter = data.BucketIterator.splits(
    (train, test), sort_key=lambda x: len(x.text), sort_within_batch=True,
    batch_size=batch_size, device=device, repeat=False)
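# With include_lengths=True, batch.text is a (padded_tensor, lengths) pair.
# A hedged peek at one batch (my own illustration; decoding via .reverse()
# additionally needs the `revtok` package, which ReversibleField uses for
# its default tokenizer):
batch = next(iter(train_iter))
text, lengths = batch.text   # text: (seq_len, batch), lengths: (batch,)
print(text.shape, lengths[:5])
print(TEXT.reverse(text)[0][:80])  # first example decoded back to a string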
import torch
from torch import nn
from torchcrf import CRF
from torchtext import data, datasets

from NER_task.SequenceTaggingDataset import SequenceTaggingDataset

device = torch.device("cuda")


def light_tokenize(sequence: str):
    return [sequence]


TEXT = data.Field(sequential=True, tokenize=light_tokenize,
                  include_lengths=True)
LABEL = data.ReversibleField(sequential=True, tokenize=light_tokenize,
                             unk_token=None, is_target=True)
save_dir = 'save_models/model.pt'

train = SequenceTaggingDataset(
    path='../Datasets/NER_data/train.txt', separator=' ',
    fields=[('text', TEXT), ('label', LABEL)])
valid = SequenceTaggingDataset(
    path='../Datasets/NER_data/valid.txt', separator=' ',
    fields=[('text', TEXT), ('label', LABEL)])

TEXT.build_vocab(train)
LABEL.build_vocab(train)
train_iter, val_iter = data.BucketIterator.splits(
def caption_iterator(start_token, end_token, pad_token, train_meta_path,
                     val_1_meta_path, val_2_meta_path, min_freq, batch_size,
                     device, phase, use_categories, use_subs):
    spacy_en = spacy.load('en')
    print(f'Preparing dataset for {phase}')

    # note: unused below, since the fields pass tokenize='spacy' directly
    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(
        tokenize='spacy', init_token=start_token, eos_token=end_token,
        pad_token=pad_token, lower=True, batch_first=True, is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    if use_categories:
        # preprocessing: if there is no category, replace it with -1
        # (a unique number)
        CATEGORY = data.Field(
            sequential=False, use_vocab=False, batch_first=True,
            preprocessing=data.Pipeline(
                lambda x: -1 if len(x) == 0 else int(float(x))))
        # filter the dataset if a category is missing (31 -> 41 (count = 1 :()))
        filter_pred = (lambda x: vars(x)['category_32'] != -1
                       and vars(x)['category_32'] != 31)
    else:
        CATEGORY = None
        filter_pred = None

    if use_subs:
        SUBS = data.ReversibleField(
            tokenize='spacy', init_token=start_token, eos_token=end_token,
            pad_token=pad_token, lower=True, batch_first=True)
    else:
        SUBS = None

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('category_32', CATEGORY),
        ('subs', SUBS),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=train_meta_path, format='tsv', skip_header=True, fields=fields,
        filter_pred=filter_pred)
    CAPTION.build_vocab(dataset.caption, min_freq=min_freq)
    train_vocab = CAPTION.vocab

    train_subs_vocab = None
    if use_subs:
        SUBS.build_vocab(dataset.subs, min_freq=min_freq)
        train_subs_vocab = SUBS.vocab

    if phase == 'val_1':
        dataset = data.TabularDataset(
            path=val_1_meta_path, format='tsv', skip_header=True,
            fields=fields, filter_pred=filter_pred)
    elif phase == 'val_2':
        dataset = data.TabularDataset(
            path=val_2_meta_path, format='tsv', skip_header=True,
            fields=fields, filter_pred=filter_pred)

    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(x.caption))
    sort_key = lambda x: 0  # len(x.caption)
    datasetloader = data.BucketIterator(
        dataset, batch_size, sort_key=sort_key, device=device, repeat=False,
        shuffle=True)
    return train_vocab, train_subs_vocab, datasetloader
def main():
    ###############################
    # PREPROCESSING
    ###############################
    datasets = ["train", "val", "test"]
    for dataset in datasets:
        if not os.path.exists(os.path.join("data", dataset + ".tsv")):
            print("Creating TSV for " + dataset)
            convert_to_tsv(dataset)

    print("Creating datasets", end='', flush=True)
    curr_time = datetime.now()
    # tensor_type is the old (torchtext <= 0.2) Field argument
    article_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor,
                                         lower=True, tokenize=tokenizer_in)
    summary_field = data.ReversibleField(tensor_type=torch.cuda.LongTensor,
                                         lower=True, tokenize=tokenizer_out,
                                         init_token='<sos>')
    train_set = data.TabularDataset(
        path='./data/train.tsv', format='tsv',
        fields=[('article', article_field), ('summary', summary_field)])
    val_set = data.TabularDataset(
        path='./data/val.tsv', format='tsv',
        fields=[('article', article_field), ('summary', summary_field)])
    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    print("Building vocabulary and creating batches", end='', flush=True)
    article_field.build_vocab(train_set, vectors="glove.6B.100d",
                              max_size=encoder_vocab_size)
    summary_field.build_vocab(train_set, max_size=decoder_vocab_size)
    train_iter = data.BucketIterator(dataset=train_set, batch_size=batch_size,
                                     sort_key=lambda x: len(x.article),
                                     repeat=False, device=DEVICE)
    val_iter = data.BucketIterator(dataset=val_set, batch_size=batch_size,
                                   sort_key=lambda x: len(x.article),
                                   repeat=False, device=DEVICE)
    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    ###############################
    # MODEL CREATION
    ###############################
    print("Creating encoder and decoder models", end='', flush=True)
    encoder = EncoderLSTM(input_size=encoder_vocab_size, embed_size=embed_size,
                          hidden_size=encoder_hidden_size, use_gpu=True,
                          gpu_device=DEVICE, batch_size=batch_size)
    encoder.embedding.weight.data = article_field.vocab.vectors
    encoder.cuda(device=DEVICE)
    decoder = AttnDecoderLSTM(input_size=encoder_vocab_size,
                              embed_size=embed_size,
                              hidden_size=decoder_hidden_size,
                              output_size=decoder_vocab_size, use_gpu=True,
                              gpu_device=DEVICE, batch_size=batch_size)
    decoder.embedding.weight.data = article_field.vocab.vectors
    decoder.cuda(device=DEVICE)
    diff_time, curr_time = get_time_diff(curr_time)
    print(", took {} min".format(diff_time))

    # Loss and SGD optimizers
    loss_func = nn.NLLLoss(ignore_index=1)  # Ignore <pad> token
    encoder_opt = optim.Adam(encoder.parameters(), lr=lr)
    decoder_opt = optim.Adam(decoder.parameters(), lr=lr)

    ###############################
    # TRAINING
    ###############################
    print("Beginning training")
    tqdm_epoch = tqdm(range(num_epochs), desc="Epoch")
    for epoch in tqdm_epoch:
        train_iter.init_epoch()
        tqdm_batch = tqdm(train_iter, desc="Batch")
        for b_id, batch in enumerate(tqdm_batch):
            # the last batch of an epoch can be smaller than batch_size
            encoder.batch_size = batch.batch_size
            decoder.batch_size = batch.batch_size
            avg_loss = train(batch, encoder, decoder, encoder_opt,
                             decoder_opt, loss_func, teacher_forcing_ratio)

    ###############################
    # TESTING
    ###############################
    # Load test set
    print("Loading test set")
    test_set = data.TabularDataset(
        path='./data/test.tsv', format='tsv',
        fields=[('article', article_field), ('summary', summary_field)])
    test_iter = data.BucketIterator(dataset=test_set, batch_size=batch_size,
                                    sort_key=lambda x: len(x.article),
                                    repeat=False, device=DEVICE)
    print("Evaluating model")
    evaluate(encoder=encoder, decoder=decoder, dataset=test_iter,
             rev_field=article_field)
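# Why ignore_index=1 hits <pad>: legacy torchtext vocabs put the specials
# first, in the order ['<unk>', '<pad>', init_token, eos_token]. A standalone
# check (my own illustration):
from torchtext import data

f = data.Field(init_token='<sos>')
fields = [('t', f)]
ds = data.Dataset([data.Example.fromlist(['a b'], fields)], fields)
f.build_vocab(ds)
print(f.vocab.itos[:3])  # ['<unk>', '<pad>', '<sos>'] -> <pad> is index 1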
def tokenizer(text):
    return [tok.text for tok in nlp.tokenizer(text)]


def emb_tokenizer(l):
    r = [y for x in eval(l) for y in x]
    return r


def y_tokenize(y):
    return int(y)


TEXT = data.Field(sequential=True, tokenize=tokenizer, batch_first=True)
# LABEL = data.Field(sequential=False, use_vocab=True, batch_first=True)
LABEL = data.ReversibleField(sequential=False, unk_token='OTHER',
                             use_vocab=True, batch_first=True)
POS_EMB = data.Field(sequential=True, tokenize=emb_tokenizer,
                     batch_first=True)

print('loading data...')
train, valid, test = data.TabularDataset.splits(
    path='../data/SemEval2010_task8_all_data',
    train='SemEval2010_task8_training/TRAIN_FILE_SUB.CSV',
    validation='SemEval2010_task8_training/VALID_FILE.CSV',
    test='SemEval2010_task8_testing_keys/TEST_FILE_FULL.CSV',
    format='csv', skip_header=True, csv_reader_params={'delimiter': '\t'},
    fields=[('relation', LABEL), ('sentence', TEXT), ('pos_embed', POS_EMB)])
print('load data end')
# print(valid[0].__dict__)
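# Setting unk_token='OTHER' puts the fallback class in the unk slot, so
# relation strings unseen at build_vocab time map there instead of crashing.
# A standalone check (my own illustration, shown with a plain Field; vocab
# lookup behaves the same for ReversibleField, in the legacy releases whose
# stoi defaults missing keys to the unk index):
from torchtext import data

REL = data.Field(sequential=False, unk_token='OTHER')
fields = [('rel', REL)]
ds = data.Dataset([data.Example.fromlist([r], fields)
                   for r in ('Cause-Effect', 'Component-Whole')], fields)
REL.build_vocab(ds)
print(REL.vocab.itos[0])  # 'OTHER' occupies the unk slot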
def __init__(self, config, lm_config, device):
    # define all fields
    TEXT = data.ReversibleField(sequential=True, tokenize=self.tokenizer,
                                lower=False, include_lengths=False)
    POS = data.ReversibleField(sequential=True, lower=False,
                               include_lengths=True)
    NER = data.ReversibleField(sequential=True, lower=False,
                               include_lengths=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    IN_Q = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                      postprocessing=self.to_numeric)
    IN_C = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                      postprocessing=self.to_numeric)
    LEMMA_IN_Q = data.Field(sequential=True, use_vocab=False,
                            include_lengths=True,
                            postprocessing=self.to_numeric)
    LEMMA_IN_C = data.Field(sequential=True, use_vocab=False,
                            include_lengths=True,
                            postprocessing=self.to_numeric)
    TF = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                    postprocessing=self.to_numeric)
    REL = data.ReversibleField(sequential=True, lower=False,
                               include_lengths=True)

    # load lm data first
    lm_train = datasets.LanguageModelingDataset(
        os.path.join(lm_config.file_path, lm_config.train_f), TEXT,
        newline_eos=False)
    lm_dev = datasets.LanguageModelingDataset(
        os.path.join(lm_config.file_path, lm_config.dev_f), TEXT,
        newline_eos=False)

    # load actual data
    # we have keys: 'id', 'd_words', 'd_pos', 'd_ner', 'q_words', 'q_pos',
    # 'c_words', 'label', 'in_q', 'in_c', 'lemma_in_q', 'tf',
    # 'p_q_relation', 'p_c_relation'
    train, val, test = data.TabularDataset.splits(
        path=config.data_dir, train=config.train_fname,
        validation=config.dev_fname, test=config.test_fname, format='json',
        fields={'d_words': ('d_words', TEXT),
                'd_pos': ('d_pos', POS),
                'd_ner': ('d_ner', NER),
                'q_words': ('q_words', TEXT),
                'q_pos': ('q_pos', POS),
                'c_words': ('c_words', TEXT),
                'label': ('label', LABEL),
                'in_q': ('in_q', IN_Q),
                'in_c': ('in_c', IN_C),
                'lemma_in_q': ('lemma_in_q', LEMMA_IN_Q),
                'lemma_in_c': ('lemma_in_c', LEMMA_IN_C),
                'tf': ('tf', TF),
                'p_q_relation': ('p_q_relation', REL),
                'p_c_relation': ('p_c_relation', REL)})
    print('train: %d, val: %d, test: %d' % (len(train), len(val), len(test)))

    # construct vocabulary
    TEXT.build_vocab(train, val, test, lm_train, lm_dev,
                     vectors=config.vectors)
    POS.build_vocab(train, val, test)
    NER.build_vocab(train, val, test)
    REL.build_vocab(train, val, test)
    print('vocab size: %d' % len(TEXT.vocab))
    print('pos size: %d' % len(POS.vocab))
    print('ner size: %d' % len(NER.vocab))
    print('rel size: %d' % len(REL.vocab))
    self.TEXT = TEXT

    # iterators
    self.lm_train_iter = data.BPTTIterator(lm_train,
                                           batch_size=lm_config.batch_size,
                                           bptt_len=lm_config.bptt_len,
                                           repeat=False)
    self.lm_dev_iter = data.BPTTIterator(lm_dev,
                                         batch_size=lm_config.batch_size,
                                         bptt_len=lm_config.bptt_len,
                                         repeat=False)
    print('lm train batch num: %d, lm dev batch num: %d'
          % (len(self.lm_train_iter), len(self.lm_dev_iter)))
    self.train_iter = data.BucketIterator(
        dataset=train, batch_size=config.batch_size_train,
        sort_key=lambda x: len(x.d_words), device=device, shuffle=True,
        sort_within_batch=False, repeat=False)
    self.val_iter = data.Iterator(
        dataset=val, batch_size=config.batch_size_eval,
        sort_key=lambda x: len(x.d_words), train=False, shuffle=False,
        sort_within_batch=False, device=device, repeat=False)
    self.test_iter = data.Iterator(
        dataset=test, batch_size=config.batch_size_test,
        sort_key=lambda x: len(x.d_words), train=False, shuffle=False,
        sort_within_batch=False, device=device, repeat=False)
    print('train batch num: %d, dev batch num: %d'
          % (len(self.train_iter), len(self.val_iter)))

    # Create embeddings
    embedding = nn.Embedding(len(TEXT.vocab), config.embed_dim)
    embedding.weight.data.copy_(TEXT.vocab.vectors)
    embedding.weight.requires_grad = False
    self.embedding = embedding.to(device)

    embedding_pos = nn.Embedding(len(POS.vocab), config.embed_dim_pos)
    embedding_pos.weight.data.normal_(0, 0.1)
    self.embedding_pos = embedding_pos.to(device)

    embedding_ner = nn.Embedding(len(NER.vocab), config.embed_dim_ner)
    embedding_ner.weight.data.normal_(0, 0.1)
    self.embedding_ner = embedding_ner.to(device)

    embedding_rel = nn.Embedding(len(REL.vocab), config.embed_dim_rel)
    embedding_rel.weight.data.normal_(0, 0.1)
    self.embedding_rel = embedding_rel.to(device)

    print('embedding', self.embedding)
    print('embedding_pos', self.embedding_pos)
    print('embedding_ner', self.embedding_ner)
    print('embedding_rel', self.embedding_rel)

    self.vocab_size = len(TEXT.vocab)
    print('vocab_size is', self.vocab_size)
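# A self-contained sketch (my own, with made-up dimensions) of how such
# parallel word/POS/NER lookups are typically concatenated into one token
# representation downstream:
import torch
import torch.nn as nn

emb_word = nn.Embedding(100, 50)
emb_pos = nn.Embedding(20, 8)
emb_ner = nn.Embedding(10, 8)
words = torch.randint(0, 100, (4, 7))  # (batch, seq) index tensors
pos = torch.randint(0, 20, (4, 7))
ner = torch.randint(0, 10, (4, 7))
token_repr = torch.cat([emb_word(words), emb_pos(pos), emb_ner(ner)], dim=-1)
print(token_repr.shape)  # torch.Size([4, 7, 66])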
with open('../../data/clinvar/text_classification_db_labels.json', 'r') as f:
    labels = json.load(f)

# map labels to list
label_list = [None] * len(labels)
for k, v in labels.items():
    label_list[v] = k
labels = label_list

logger.info("available labels: ")
logger.info(labels)

TEXT = data.ReversibleField(sequential=True, tokenize=tokenizer, lower=True,
                            include_lengths=True)
LABEL = data.Field(sequential=False, use_vocab=False)
if args.dataset == 'merged':
    train, val, test = data.TabularDataset.splits(
        path='../../data/clinvar/',
        train='merged_text_classification_db_train.tsv',
        validation='merged_text_classification_db_valid.tsv',
        test='merged_text_classification_db_test.tsv',
        format='tsv',
        fields=[('Text', TEXT), ('Description', LABEL)])
else:
    train, val, test = data.TabularDataset.splits(
        path='../../data/clinvar/',
        train='text_classification_db_train.tsv',
        validation='text_classification_db_valid.tsv',
def main():
    parser = argparse.ArgumentParser()
    # parser.add_argument('-data', required=True)
    parser.add_argument('-max_len', '--max_word_seq_len', type=int, default=50)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'],
                        default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-beam_size', type=int, default=5, help='Beam size')
    parser.add_argument('-n_best', type=int, default=1,
                        help="""If verbose is set, will output the n_best
                        decoded sentences""")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # quick config overrides
    # opt.n_layers = 1
    # opt.batch_size = 4
    opt.cuda = torch.cuda.is_available()
    opt.epoch = 2000
    opt.save_model = 'trained'
    opt.model = 'trained.chkpt'
    opt.d_word_vec = 300
    opt.d_model = 300
    opt.d_inner_hid = 600
    opt.embs_share_weight = True
    opt.beam_size = 1
    opt.max_len = 50
    opt.max_token_seq_len = opt.max_len + 2  # includes <BOS> and <EOS>
    opt.device = None if torch.cuda.is_available() else -1

    # =========== prepare dataset ===========
    def len_filter(example):
        return (len(example.src) <= opt.max_len
                and len(example.tgt) <= opt.max_len)

    EN = data.ReversibleField(init_token=Constants.BOS_WORD,
                              eos_token=Constants.EOS_WORD, batch_first=True)
    train, val = Lang8.splits(
        exts=('.err.bpe', '.cor.bpe'), fields=[('src', EN), ('tgt', EN)],
        train='test', validation='test', test=None, filter_pred=len_filter)
    # adv_train, adv_dev, adv_test = Lang8.splits(
    #     exts=('.adv.cor', '.adv.err'), fields=[('src', src), ('tgt', tgt)],
    #     train='test', validation='test', test='test')
    # BD.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
    # GD.build_vocab(train, vectors=[GloVe(name='840B', dim='300'), CharNGram(), FastText()])
    EN.build_vocab(train, vectors=FastText())
    print('vocab len: %d' % len(EN.vocab))

    # sanity-check that the Constants indices match the vocab
    assert EN.vocab.stoi[EN.init_token] == Constants.BOS
    assert EN.vocab.stoi[EN.eos_token] == Constants.EOS
    assert EN.vocab.stoi[EN.pad_token] == Constants.PAD
    assert EN.vocab.stoi[EN.unk_token] == Constants.UNK

    # ---------- init model ----------
    # if opt.embs_share_weight and train.src_word2idx != train.tgt_word2idx:
    #     print('[Warning] The src/tgt word2idx table are different but asked to share word embedding.')
    print(opt)

    transformer = Transformer(
        len(EN.vocab), len(EN.vocab), opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model,
        d_word_vec=opt.d_word_vec, d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout,
        encoder_emb_weight=EN.vocab.vectors,
        decoder_emb_weight=EN.vocab.vectors,
    )
    discriminator = TestDiscriminator(
        len(EN.vocab), d_model=300,
        max_len=opt.max_token_seq_len,
    )
    print(transformer)
    print(discriminator)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)
    optimizer_G = optim.Adam(transformer.get_trainable_parameters(),
                             lr=1e-4, betas=(0.5, 0.9))
    optimizer_D = optim.Adam(discriminator.parameters(),
                             lr=1e-4, betas=(0.5, 0.9))

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(len(EN.vocab))

    if opt.cuda:
        transformer.cuda()
        discriminator.cuda()
        crit.cuda()

    # =========== training ===========
    supervised_trainer = trainers.TransformerTrainer()
    # trainer.train(transformer, train, val, crit, optimizer, opt, GD)

    # train_iter, val_iter = data.BucketIterator.splits(
    #     (train, val), batch_sizes=(4, 256), device=opt.device,
    #     sort_key=lambda x: len(x.src))
    # batch = next(iter(train_iter))
    # src_seq = batch.src
    # tgt_seq = batch.tgt
    # src_pos = transformer.get_position(src_seq.data)
    # tgt_pos = transformer.get_position(tgt_seq.data)
    # # print(tgt_seq)
    # # print(src_pos)
    # # print(tgt_pos)
    # # transformer(src_seq, src_pos, tgt_seq, tgt_pos)
    # output = transformer(src_seq, src_pos)
    # print(output)
    # # print(discriminator(output))

    # =========== WGAN training ===========
    wgan_trainer = WganTrainer(opt)
    train_iter, val_iter = data.BucketIterator.splits(
        (train, val), batch_sizes=(16, 64), device=opt.device,
        sort_key=lambda x: len(x.src), repeat=False)

    for epoch in range(opt.epoch):
        print('[Epoch %d]' % epoch)
        wgan_trainer.train_epoch(epoch, D=discriminator, G=transformer,
                                 optimizer_D=optimizer_D,
                                 optimizer_G=optimizer_G,
                                 train_iter=train_iter,
                                 n_tgt_vocab=len(EN.vocab))
        valid_loss, valid_accu, bleu = supervised_trainer.evaluate(
            transformer, val_iter, crit, EN)
        print('(Validation) ppl: %8.5f, accuracy: %3.3f%%, BLEU %2.2f'
              % (math.exp(min(valid_loss, 100)), 100 * valid_accu, bleu))