def vocab_builder(self):
    # self.eid_field = Field(sequential=False, tokenize)
    print('Build Vocabulary')
    tokenize = BiGraphTextDataset.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=35,
                 use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
    path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
    train_data = TabularDataset(path=path, format='tsv', skip_header=False,
                                fields=datafields)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    # train_iter = BucketIterator(train_data, batch_size=32,
    #                             sort_key=lambda x: len(x.text),
    #                             repeat=False, shuffle=True)
    self.stoi_dict = TEXT.vocab.stoi
    self.vocab_vectors = TEXT.vocab.vectors
def prepare_data(args):
    TEXT = Field(lower=True, include_lengths=True, batch_first=True,
                 tokenize='spacy', tokenizer_language="en_core_web_sm")
    LABEL = Field(sequential=False)

    # Make splits for the data.
    print("Creating splits")
    if args.subset:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./subdata')
    else:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./data')

    print("Loading GloVe")
    glove = torchtext.vocab.GloVe(name='840B', dim=300)

    print("Aligning GloVe vocab")
    TEXT.build_vocab(train, vectors=glove)
    LABEL.build_vocab(train, specials_first=False)
    n_vocab = len(TEXT.vocab.itos)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    print("Creating BucketIterator")
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_sizes=(args.batch, 256, 256),
        device=device, shuffle=False)

    return TEXT, train_iter, dev_iter, test_iter
def build(self):
    print('Build Vocabulary from ', self.path)
    tokenize = BuildVocab.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=35,
                 use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
    data = TabularDataset(path=self.path, format='tsv', skip_header=False,
                          fields=datafields)
    TEXT.build_vocab(data, vectors=GloVe(name='6B', dim=300), max_size=1000)
    # train_iter = BucketIterator(train_data, batch_size=32,
    #                             sort_key=lambda x: len(x.text),
    #                             repeat=False, shuffle=True)
    self.stoi = TEXT.vocab.stoi
    self.vectors = TEXT.vocab.vectors
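# A minimal sketch of how the stoi/vectors stored by build() might be consumed
# downstream. The BuildVocab constructor call and the example path are
# hypothetical (the constructor is not shown above); the '<unk>'/'<pad>'
# specials are torchtext's defaults for a sequential Field.
import torch

vb = BuildVocab(path='/path/to/data.text.txt')  # hypothetical constructor
vb.build()
# Initialise an embedding layer from the aligned GloVe vectors.
embedding = torch.nn.Embedding.from_pretrained(vb.vectors, freeze=False,
                                               padding_idx=vb.stoi['<pad>'])
# Map tokens to ids, falling back to the <unk> index for unseen tokens.
token_ids = [vb.stoi.get(tok, vb.stoi['<unk>']) for tok in ['rumour', 'claim']]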
class DataLoader:
    source: Field = None
    target: Field = None

    def __init__(self, ext, tokenize_en, tokenize_de, sos_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.sos_token = sos_token
        self.eos_token = eos_token
        print('data initializing start')

    # generate the fields and load the Multi30k splits
    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_en, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)
        elif self.ext == ('.en', '.de'):
            self.source = Field(tokenize=self.tokenize_en, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_de, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)

        train_data, valid_data, test_data = Multi30k.splits(
            exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    # build the vocabulary and the word-to-integer mapping
    def build_vocab(self, train_data, min_freq):
        # min_freq: lower bound on how often a word must appear to be kept
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train, validate, test), batch_size=batch_size, device=device)
        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator
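# A minimal usage sketch for the DataLoader class above. The tokenizer
# functions tokenize_en / tokenize_de are assumed to be defined elsewhere
# (e.g. spaCy-based callables); nothing else here is taken from the original.
import torch

loader = DataLoader(ext=('.de', '.en'),
                    tokenize_en=tokenize_en,
                    tokenize_de=tokenize_de,
                    sos_token='<sos>',
                    eos_token='<eos>')
train_data, valid_data, test_data = loader.make_dataset()
loader.build_vocab(train_data, min_freq=2)
train_iter, valid_iter, test_iter = loader.make_iter(
    train_data, valid_data, test_data,
    batch_size=128,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))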
def prepare(params, samples):
    # print(type(params))
    # print(type(samples))
    TEXT = Field(lower=True, include_lengths=True, batch_first=True,
                 tokenize='spacy', tokenizer_language="en_core_web_sm")
    # data = [' '.join(s) for s in samples],
    data = samples
    # print("data", len(data[0]))
    # print(data)
    TEXT.build_vocab(data, vectors=params.glove)
    params.model.emb_vec = torch.nn.Embedding.from_pretrained(
        TEXT.vocab.vectors, freeze=True).to(device=params.device)
    params["TEXT"] = TEXT
def __init__(self, device=None, jit=False):
    super().__init__()
    self.device = device
    self.jit = jit

    # Download and load the default data.
    WORD = Field(include_lengths=True)
    UD_TAG = Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)

    train, val, test = UDPOS.splits(
        fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
        filter_pred=lambda ex: 5 < len(ex.word) < 30,
    )
    WORD.build_vocab(train.word, min_freq=3)
    UD_TAG.build_vocab(train.udtag)
    self.train_iter = torch_struct.data.TokenBucket(train, batch_size=100,
                                                    device=device)

    H = 256
    T = 30
    NT = 30
    self.model = NeuralCFG(len(WORD.vocab), T, NT, H)
    if jit:
        self.model = torch.jit.script(self.model)
    self.model.to(device=device)
    self.opt = torch.optim.Adam(self.model.parameters(), lr=0.001,
                                betas=[0.75, 0.999])

    for i, ex in enumerate(self.train_iter):
        words, lengths = ex.word
        self.words = words.long().to(device).transpose(0, 1)
        self.lengths = lengths.to(device)
        break
def produce_iterators(train_filename, valid_filename, test_filename,
                      asr_tokenizer, ttx_tokenizer=None):
    """
    Produce datasets for the training, validation and test data, and build
    vocabularies for true text, tags, and ASR output.

    :param train_filename: location of the training data CSV
    :param valid_filename: location of the validation data CSV
    :param test_filename: location of the test data CSV
    :param asr_tokenizer: tokenizer applied to the ASR field
    :param ttx_tokenizer: optional tokenizer applied to the true-text field
    :return: train/valid/test datasets plus the TTX, TRG and ASR fields
    """
    TTX = Field(tokenize=lambda x: tokenize_TTX(x, ttx_tokenizer),
                init_token='<sos>', eos_token='<eos>', lower=False,
                batch_first=True)
    TRG = Field(tokenize=tokenize_TRG, init_token='<sos>', eos_token='<eos>',
                lower=False, batch_first=True)
    ASR = Field(tokenize=lambda x: tokenize_ASR(x, asr_tokenizer),
                init_token='<sos>', eos_token='<eos>', lower=False,
                batch_first=True)
    fields = {
        'true_text': ('true_text', TTX),
        'tags': ('tags', TRG),
        'asr': ('asr', ASR)
    }

    train_data, valid_data, test_data = TabularDataset.splits(
        path='.\\', train=train_filename, validation=valid_filename,
        test=test_filename, format='csv', fields=fields)

    # Use min_freq of 2 or higher for real data.
    TTX.build_vocab(train_data, min_freq=1)
    TRG.build_vocab(train_data, min_freq=1)
    ASR.build_vocab(train_data, min_freq=1)

    return train_data, valid_data, test_data, TTX, TRG, ASR
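# A minimal usage sketch for produce_iterators. The CSV filenames and the
# asr_tokenizer object are assumptions, and the iterator settings below are
# just one reasonable choice; the import path may be torchtext.data on older
# torchtext versions.
import torch
from torchtext.legacy.data import BucketIterator

train_data, valid_data, test_data, TTX, TRG, ASR = produce_iterators(
    'train.csv', 'valid.csv', 'test.csv', asr_tokenizer=asr_tokenizer)
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_key=lambda x: len(x.asr),
    sort_within_batch=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))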
import random
import time

import torch.optim as optim

tokenizer = word_tokenize

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = Field(tokenize=tokenizer, include_lengths=True)
LABEL = LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.300d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE,
    sort_within_batch=True, device=device)


class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
def test_xnli(self):
    batch_size = 4

    # create fields
    TEXT = Field()
    GENRE = LabelField()
    LABEL = LabelField()
    LANGUAGE = LabelField()

    # create val/test splits (XNLI does not have a training set)
    val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE)

    # check both are XNLI datasets
    assert type(val) == type(test) == XNLI

    # check both have the correct number of fields
    assert len(val.fields) == len(test.fields) == 5

    # check fields are the correct type
    assert type(val.fields['premise']) == Field
    assert type(val.fields['hypothesis']) == Field
    assert type(val.fields['label']) == LabelField
    assert type(val.fields['genre']) == LabelField
    assert type(val.fields['language']) == LabelField

    assert type(test.fields['premise']) == Field
    assert type(test.fields['hypothesis']) == Field
    assert type(test.fields['label']) == LabelField
    assert type(test.fields['genre']) == LabelField
    assert type(test.fields['language']) == LabelField

    # check each split has the correct length
    assert len(val) == 37350
    assert len(test) == 75150

    # build vocabulary
    TEXT.build_vocab(val)
    LABEL.build_vocab(val)
    GENRE.build_vocab(val)
    LANGUAGE.build_vocab(val)

    # ensure the vocabulary has been created
    assert hasattr(TEXT, 'vocab')
    assert hasattr(TEXT.vocab, 'itos')
    assert hasattr(TEXT.vocab, 'stoi')

    # create iterators
    val_iter, test_iter = Iterator.splits((val, test), batch_size=batch_size)

    # get a batch to test
    batch = next(iter(val_iter))

    # pull premise, hypothesis and label attributes off the batch
    premise = batch.premise
    hypothesis = batch.hypothesis
    label = batch.label
    genre = batch.genre
    language = batch.language

    # check each is actually a tensor
    assert type(premise) == torch.Tensor
    assert type(hypothesis) == torch.Tensor
    assert type(label) == torch.Tensor
    assert type(genre) == torch.Tensor
    assert type(language) == torch.Tensor

    # check each has the correct batch dimension
    assert premise.shape[-1] == batch_size
    assert hypothesis.shape[-1] == batch_size
    assert label.shape[-1] == batch_size
    assert genre.shape[-1] == batch_size
    assert language.shape[-1] == batch_size

    # XNLI cannot use the iters method, so ensure it raises an error
    with self.assertRaises(NotImplementedError):
        val_iter, test_iter = XNLI.iters(batch_size=batch_size)

    # remove the downloaded xnli directory
    shutil.rmtree('.data/xnli')
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    parser = argparse.ArgumentParser()
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    opt = parser.parse_args()

    english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng,
                    lower=True, pad_token='<blank>', init_token='<s>',
                    eos_token='</s>')
    german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger,
                   lower=True, pad_token='<blank>', init_token='<s>',
                   eos_token='</s>')
    fields = {'English': ('eng', english), 'German': ('ger', german)}
    train_data, test_data = TabularDataset.splits(path='', train='train.json',
                                                  test='test.json',
                                                  format='json', fields=fields)

    english.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get source language vocabulary size:', len(english.vocab))
    german.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get target language vocabulary size:', len(german.vocab))

    batch_size = opt.batch_size
    # data = pickle.load(open(opt.data_file, 'rb'))
    opt.src_pad_idx = english.vocab.stoi['<blank>']
    opt.trg_pad_idx = german.vocab.stoi['<blank>']
    opt.src_vocab_size = len(english.vocab)
    opt.trg_vocab_size = len(german.vocab)

    devices = [0, 1, 2, 3]
    pad_idx = opt.trg_pad_idx  # pad index of the target vocabulary
    model = make_model(len(english.vocab), len(german.vocab), N=6)
    model.cuda()
    criterion = LabelSmoothing(size=len(german.vocab), padding_idx=pad_idx,
                               smoothing=0.1)
    criterion.cuda()

    BATCH_SIZE = 12000
    train_iter = MyIterator(train_data, batch_size=BATCH_SIZE, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(test_data, batch_size=BATCH_SIZE, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn, train=False)
    model_par = nn.DataParallel(model, device_ids=devices)
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 2000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
                  MultiGPULossCompute(model.generator, criterion,
                                      devices=devices, opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
                         MultiGPULossCompute(model.generator, criterion,
                                             devices=devices, opt=None))
        print(loss)

    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != english.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model, src, src_mask, max_len=60,
                            start_symbol=german.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = german.vocab.itos[out[0, i]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = german.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        break
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)
test_iter = BucketIterator(
    test,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)

text_field.build_vocab(train, min_freq=flor.log("min_freq", 5))


# LSTM model
class LSTM(nn.Module):
    def __init__(self, dimension=128):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(text_field.vocab), dimension)
        flor.log("embedding", self.embedding)
        self.lstm = nn.LSTM(
            input_size=dimension,
            hidden_size=dimension,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
german = Field(tokenize=tokenizer_ger, lower=True,
               init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenizer_eng, lower=True,
                init_token='<sos>', eos_token='<eos>')

train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'), fields=(german, english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


# model
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers,
                 dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers,
                           dropout=dropout)
def data_preprocessing():
    SEED = 1234
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # import de_core_news_sm, en_core_web_sm
    # spacy_de = de_core_news_sm.load()
    # spacy_en = en_core_web_sm.load()
    # spacy_de = spacy.load('de_core_news_sm')
    # spacy_en = spacy.load('en_core_web_sm')

    # A Field object specifies how a column of data is processed: the tokenizer,
    # whether to lowercase, the start/end/padding tokens, the vocabulary, etc.
    # We create two Field objects, SRC and TRG, whose tokenize argument is the
    # tokenizer function defined earlier. Each sentence gets an <sos> token at
    # the start and an <eos> token at the end, and all words are lowercased.
    SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>',
                lower=True)
    TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>',
                lower=True)

    # splits() loads the training, validation and test sets in one call; the
    # exts argument chooses which language is the source and which is the
    # target, and fields passes the Field objects defined above.
    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                        fields=(SRC, TRG))
    # print(f"Number of training examples: {len(train_data.examples)}")
    # print(f"Number of validation examples: {len(valid_data.examples)}")
    # print(f"Number of testing examples: {len(test_data.examples)}")

    # vars() returns an object's attributes and their values as a dictionary.
    # print(vars(train_data.examples[0]))

    # Build the vocabularies, i.e. assign each word an integer id so it can be
    # fed to the model.
    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)
    # print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    # print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # print(device)

    BATCH_SIZE = 128
    # BucketIterator, unlike the standard iterator, batches samples of similar
    # length together. Every batch has to be padded to the length of its
    # longest sequence, so when sample lengths vary a lot BucketIterator
    # reduces the amount of padding. In addition, a Field's fix_length argument
    # can truncate or pad every sample to a fixed length. When an iterator
    # produces a batch, all source sentences must be padded to the same length,
    # and likewise the target sentences; torchtext does this automatically with
    # dynamic padding, meaning every sentence in a batch is padded to the length
    # of the longest sentence in that batch.
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=BATCH_SIZE,
        device=device)

    return SRC, TRG, device, train_iterator, valid_iterator, test_iterator
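# A minimal sketch of consuming what data_preprocessing() returns; the
# unpacking order follows its return statement and the batch sizes/shapes
# follow torchtext's defaults (batch_first=False), nothing else is assumed.
SRC, TRG, device, train_iterator, valid_iterator, test_iterator = data_preprocessing()
batch = next(iter(train_iterator))
# With batch_first=False the tensors are [sequence_length, batch_size].
print(batch.src.shape, batch.trg.shape)
print(SRC.vocab.itos[:10])        # specials plus the most common source tokens
print(TRG.vocab.stoi['<pad>'])    # pad index, useful as ignore_index in the loss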
print(batch_data.text)

# **Embedding**
# How to use pre-trained word vectors in torchtext and pass them on to a
# neural network for training.

# In[25]:

# Load the pre-trained word vectors.
vectors = Vectors(name="data/tnews_jieba_tencent_embeddings.txt")
# Specify how missing vector values (out-of-vocabulary words) are initialised.
vectors.unk_init = nn.init.uniform_
# Only the training data is used to build the vocabulary.
TEXT.build_vocab(train_dataset_torchtext, vectors=vectors)

# In[26]:

# Word-frequency statistics.
TEXT.vocab.freqs.most_common(10)

# **Iterators**
#
# * **Iterator**: builds batches while keeping the sample order unchanged;
#   suited to the test set.
#
# * **BucketIterator**: automatically batches examples of similar length
#   together, minimising the amount of padding needed; suited to the training
#   and validation sets.
#
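# A minimal sketch of the two iterator types described above. The dataset
# variables (train_dataset_torchtext, test_dataset_torchtext) and the field
# name `text` are assumptions based on the surrounding code, and the import
# path may be torchtext.data on older torchtext versions.
import torch
from torchtext.legacy.data import BucketIterator, Iterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training set: bucket by length to minimise padding.
train_iter = BucketIterator(train_dataset_torchtext,
                            batch_size=64,
                            sort_key=lambda x: len(x.text),
                            sort_within_batch=True,
                            device=device)
# Test set: keep the original order, no shuffling or sorting.
test_iter = Iterator(test_dataset_torchtext,
                     batch_size=64,
                     train=False,
                     sort=False,
                     device=device)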
class Preprocessing(Dataset):
    __tokPattern = r"""[0-9A-Za-z_]*[A-Za-z_-]+[0-9A-Za-z_]*|\.|\!|\?|\d+|\-|%|[.,!?;'"]"""
    __supportedExtensions = ['txt', 'csv', 'json']
    __seedAttrs = ['nFirst', 'minFreq']

    def __init__(self,
                 fileParams={},
                 tokenizationOption='regex',
                 seedParams={'nFirst': 1, 'minFreq': 5},
                 fieldParams={'lower': True, 'eos_token': '<!EOS!>'},
                 spacyObj=None):
        self.__fileName, self.__fileExtension, self.__parsingColumn = checkFileParams(
            fileParams)
        self.__seedParams = checkSeedParams(seedParams)
        self.__DataVocab = Field(**fieldParams)
        self.__spacyObj = spacyObj
        self.__customTokenize = self.__tokenizationMethod(tokenizationOption)
        self.__readFile()

    @property
    def getFileName(self):
        return self.__fileName

    @property
    def getVocab(self):
        return self.__DataVocab

    def __readFile(self):
        text = readFiles(self.__fileName, self.__fileExtension,
                         self.__parsingColumn)
        self.examples = self.__getObjects(text)
        self.__seeds = getStartWords(self.__seedParams, text)
        self.__build_vocab()

    def __getObjects(self, text):
        self.fields = {"src": self.__DataVocab}
        return [Document(**self.__tokenize(instance)) for instance in text]

    def __build_vocab(self):
        self.__DataVocab.build_vocab(self)
        for instance in self.examples:
            instance.create_tokens(self.__DataVocab)

    def __regexTokenization(self, document):
        return re.findall(self.__tokPattern, document)

    def __nltkTokenization(self, document):
        return self.tokenizer(document)

    def __spacyTokenization(self, instance):
        return [
            entity.text.strip() for entity in self.__spacyObj(instance)
            if entity.text.strip()
        ]

    def __tokenize(self, instance):
        instance = self.__customTokenize(instance)
        return {'src': instance, 'trg': instance[1:]}

    @checkParams(str)
    def __tokenizationMethod(self, param):
        param = param.lower()
        if param == 'nltk':
            self.tokenizer = importNltk()
            return self.__nltkTokenization
        elif param == 'regex':
            return self.__regexTokenization
        elif param == 'spacy':
            if not self.__spacyObj:
                raise Exception(
                    "Please provide the spacy object to tokenize with.")
            return self.__spacyTokenization
        raise Exception(
            "The parameter 'tokenizationOption' can only be nltk, regex or spacy")

    def getSeed(self):
        """
        Return a weighted seed. If static seeding is enabled, the most
        frequent token is used as the seed.
        """
        seeds = list(self.__seeds.keys())
        probs = list(self.__seeds.values())
        return choice(seeds, 1, probs).tolist()
texts = np.concatenate((train_, eval_))
labels = np.concatenate((train_labels, eval_labels))
df = pd.DataFrame({'text': texts, 'label': labels})

text_field = Field(sequential=True, tokenize='basic_english', fix_length=5,
                   lower=True)
label_field = Field(sequential=False, use_vocab=False, is_target=True)

preprocessed_text = df['text'].apply(lambda x: text_field.preprocess(x))
# text_field.build_vocab(preprocessed_text, vectors='fasttext.simple.300d')
text_field.build_vocab(preprocessed_text, vectors='glove.6B.50d')
vocab = text_field.vocab

ltoi = {l: i for i, l in enumerate(df['label'].unique())}
df['label'] = df['label'].apply(lambda y: ltoi[y])


class DataFrameDataset(torchtext.legacy.data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [Example.fromlist(list(r), fields) for i, r in df.iterrows()],
            fields)


train_dataset, test_dataset = DataFrameDataset(df=df, fields=(('text', text_field),
Valoracion = Field(sequential=False, use_vocab=False)

fields = {"Texto": ("t", Texto), "Valoracion": ("v", Valoracion)}

train_data, test_data = TabularDataset.splits(
    path='/content/Dataset', train='train.csv', test='test.csv',
    format='csv', fields=fields)

len(train_data), len(test_data)

print(vars(train_data.examples[0]))

Texto.build_vocab(train_data, max_size=10000, min_freq=1,
                  vectors="glove.6B.100d")

Texto.vocab.freqs.most_common(25)
Texto.vocab.itos[:10]

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=2, device=device)


class RNN_LSTM(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
datafields = [("input", INPUT), ("target", TARGET)] trn, vld, tst = TabularDataset.splits(path="data/" + data_size, train=train_csv, validation=validation_csv, test=test_csv, format='csv', skip_header=True, fields=datafields) print(f"Number of {data_size} training examples: {len(trn.examples)}") print(f"Number of {data_size} validation examples: {len(vld.examples)}") print(f"Number of {data_size} test examples: {len(tst.examples)}") INPUT.build_vocab(trn) TARGET.build_vocab(trn) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") train_iter, val_iter, test_iter = BucketIterator.splits( (trn, vld, tst), sort_key=lambda x: len(x.input), sort_within_batch=False, batch_size=BATCH_SIZE, device=device) """ Build Transformer """
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>',
            lower=True, batch_first=True)
TGT = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>',
            lower=True, batch_first=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TGT))

SRC.build_vocab(train_data, min_freq=2)
TGT.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 8
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE)

if __name__ == "__main__":
    syn_data = synthetic_data(8, 2, 1)
    for i, batch in enumerate(syn_data):
        logging.info("batch-src shape {}, batch-src: {}".format(
            batch.src.shape, batch.src))
        logging.info("batch-tgt shape {}, batch-tgt: {}".format(
            batch.tgt.shape, batch.tgt))
        logging.info("batch-src-mask shape {}, batch-src-mask: {}".format(
            batch.src_mask.shape, batch.src_mask))
device

eng = spacy.load('en')
ger = spacy.load('de_core_news_sm')


def Tokenize_eng(text):
    return [a.text for a in eng.tokenizer(text)]


def Tokenize_german(text):
    return [b.text for b in ger.tokenizer(text)]


german = Field(tokenize=Tokenize_german, lower=True, init_token='<sos>',
               eos_token='<eos>')
english = Field(tokenize=Tokenize_eng, lower=True, init_token='<sos>',
                eos_token='<eos>')

Train, Val, Test = Multi30k.splits(exts=('.de', '.en'),
                                   fields=(german, english))

german.build_vocab(Train, max_size=10000, min_freq=2)
english.build_vocab(Train, max_size=10000, min_freq=2)


# building the encoder
class Encoder(Module):
    def __init__(self, inp_size, emd_size, hidden_size):
        super(Encoder, self).__init__()
        self.inp_size = inp_size
        self.emd_size = emd_size
        self.hidden_size = hidden_size
        self.drop = Dropout(0.5)
        self.embed = Embedding(self.inp_size, self.emd_size)
        self.lstm = LSTM(self.emd_size, self.hidden_size, bidirectional=True)
        self.fc_hidden = Linear(self.hidden_size * 2, self.hidden_size)
        self.fc_cell = Linear(self.hidden_size * 2, self.hidden_size)

    def forward(self, x):
random.seed(1337)
torch.manual_seed(1337)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Download and load the default data.
WORD = Field(include_lengths=True)
UD_TAG = Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)

train, val, test = UDPOS.splits(
    fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
    filter_pred=lambda ex: 5 < len(ex.word) < 30,
)

WORD.build_vocab(train.word, min_freq=3)
UD_TAG.build_vocab(train.udtag)
train_iter = torch_struct.data.TokenBucket(train, batch_size=100,
                                           device="cuda:0")

H = 256
T = 30
NT = 30
model = NeuralCFG(len(WORD.vocab), T, NT, H)
if args.script:
    print("scripting...")
    model = torch.jit.script(model)
model.cuda()
opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=[0.75, 0.999])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-raw_dir', required=True)
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-codes', required=True)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-prefix', required=True)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('--symbols', '-s', type=int, default=32000,
                        help="Vocabulary size")
    parser.add_argument(
        '--min-frequency', type=int, default=6, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))')
    parser.add_argument(
        '--dict-input', action="store_true",
        help="If set, input file is interpreted as a dictionary where each "
             "line contains a word-count pair")
    parser.add_argument(
        '--separator', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s'))")
    parser.add_argument('--total-symbols', '-t', action="store_true")
    opt = parser.parse_args()

    # Create folders if needed.
    mkdir_if_needed(opt.raw_dir)
    mkdir_if_needed(opt.data_dir)

    # Download and extract the raw data.
    raw_train = get_raw_files(opt.raw_dir, _TRAIN_DATA_SOURCES)
    raw_val = get_raw_files(opt.raw_dir, _VAL_DATA_SOURCES)
    raw_test = get_raw_files(opt.raw_dir, _TEST_DATA_SOURCES)

    # Merge the files into one per split.
    train_src, train_trg = compile_files(opt.raw_dir, raw_train,
                                         opt.prefix + '-train')
    val_src, val_trg = compile_files(opt.raw_dir, raw_val, opt.prefix + '-val')
    test_src, test_trg = compile_files(opt.raw_dir, raw_test,
                                       opt.prefix + '-test')

    # Learn the BPE codes from the training files if they do not exist yet.
    opt.codes = os.path.join(opt.data_dir, opt.codes)
    if not os.path.isfile(opt.codes):
        sys.stderr.write(
            f"Collect codes from training data and save to {opt.codes}.\n")
        learn_bpe(raw_train['src'] + raw_train['trg'], opt.codes, opt.symbols,
                  opt.min_frequency, True)
    sys.stderr.write(f"BPE codes prepared.\n")

    sys.stderr.write(f"Build up the tokenizer.\n")
    with codecs.open(opt.codes, encoding='utf-8') as codes:
        bpe = BPE(codes, separator=opt.separator)

    sys.stderr.write(f"Encoding ...\n")
    encode_files(bpe, train_src, train_trg, opt.data_dir, opt.prefix + '-train')
    encode_files(bpe, val_src, val_trg, opt.data_dir, opt.prefix + '-val')
    encode_files(bpe, test_src, test_trg, opt.data_dir, opt.prefix + '-test')
    sys.stderr.write(f"Done.\n")

    field = Field(tokenize=str.split,
                  lower=True,
                  pad_token=Constants.PAD_WORD,
                  init_token=Constants.BOS_WORD,
                  eos_token=Constants.EOS_WORD)
    fields = (field, field)

    MAX_LEN = opt.max_len

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    enc_train_files_prefix = opt.prefix + '-train'
    train = TranslationDataset(fields=fields,
                               path=os.path.join(opt.data_dir,
                                                 enc_train_files_prefix),
                               exts=('.src', '.trg'),
                               filter_pred=filter_examples_with_length)

    from itertools import chain
    field.build_vocab(chain(train.src, train.trg), min_freq=2)

    data = {
        'settings': opt,
        'vocab': field,
    }
    opt.save_data = os.path.join(opt.data_dir, opt.save_data)
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
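# A possible invocation of the preprocessing script above, built from the
# argparse flags it defines; every path and name here is a placeholder, not
# taken from the original project:
#   python preprocess.py -raw_dir ./raw -data_dir ./data -codes bpe_codes \
#       -save_data wmt_deen.pkl -prefix deen -s 32000 --min-frequency 6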
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>",
                   eos_token="<eos>")
    english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".de", ".en"), fields=(german, english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get a nice loss plot
    writer = SummaryWriter(f"runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)
    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab),
                    device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has {count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model,
                        optimizer)

    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        print(
            f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]"
        )

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        # save_checkpoint(checkpoint)

        model.eval()
        translated_sentence = translate_sentence(model, sentence, german,
                                                 english, device,
                                                 max_length=50)
        print(f"Translated example sentence: \n {translated_sentence}")
        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get the inputs and targets and move them to the device
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)
            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim), but Cross Entropy
            # Loss doesn't take input in that form. For example, with MNIST we want
            # the output to be (N, 10) and the targets just (N). Here we can view it
            # in a similar way: we have output_words * batch_size rows to send to the
            # cost function, so we need to do some reshaping. Let's also remove the
            # start token while we're at it.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues; keeps grads within a
            # healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score*100:.2f}")
def main_wo_bpe():
    '''
    Usage: python preprocess.py -lang_src de -lang_trg en -save_data multi30k_de_en.pkl -share_vocab
    '''

    spacy_support_langs = [
        'de', 'el', 'en', 'es', 'fr', 'it', 'lt', 'nb', 'nl', 'pt'
    ]

    parser = argparse.ArgumentParser()
    parser.add_argument('-lang_src', required=True, choices=spacy_support_langs)
    parser.add_argument('-lang_trg', required=True, choices=spacy_support_langs)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-data_src', type=str, default=None)
    parser.add_argument('-data_trg', type=str, default=None)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('-min_word_count', type=int, default=3)
    parser.add_argument('-keep_case', action='store_true')
    parser.add_argument('-share_vocab', action='store_true')
    # parser.add_argument('-ratio', '--train_valid_test_ratio', type=int, nargs=3, metavar=(8,1,1))
    # parser.add_argument('-vocab', default=None)
    opt = parser.parse_args()

    assert not any([opt.data_src, opt.data_trg]), \
        'Custom data input is not supported yet.'
    assert not any([opt.data_src, opt.data_trg]) or all(
        [opt.data_src, opt.data_trg])
    print(opt)

    src_lang_model = spacy.load(opt.lang_src)
    trg_lang_model = spacy.load(opt.lang_trg)

    def tokenize_src(text):
        return [tok.text for tok in src_lang_model.tokenizer(text)]

    def tokenize_trg(text):
        return [tok.text for tok in trg_lang_model.tokenizer(text)]

    SRC = Field(tokenize=tokenize_src,
                lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD,
                init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)
    TRG = Field(tokenize=tokenize_trg,
                lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD,
                init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)

    MAX_LEN = opt.max_len
    MIN_FREQ = opt.min_word_count

    if not all([opt.data_src, opt.data_trg]):
        assert {opt.lang_src, opt.lang_trg} == {'de', 'en'}
    else:
        # Pack custom txt files into example datasets
        raise NotImplementedError

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    train, val, test = Multi30k.splits(
        exts=('.' + opt.lang_src, '.' + opt.lang_trg),
        fields=(SRC, TRG),
        filter_pred=filter_examples_with_length)

    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    print('[Info] Get source language vocabulary size:', len(SRC.vocab))
    TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
    print('[Info] Get target language vocabulary size:', len(TRG.vocab))

    if opt.share_vocab:
        print('[Info] Merging two vocabulary ...')
        for w, _ in SRC.vocab.stoi.items():
            # TODO: Also update the `freq`, although it is not likely to be used.
            if w not in TRG.vocab.stoi:
                TRG.vocab.stoi[w] = len(TRG.vocab.stoi)
        TRG.vocab.itos = [None] * len(TRG.vocab.stoi)
        for w, i in TRG.vocab.stoi.items():
            TRG.vocab.itos[i] = w
        SRC.vocab.stoi = TRG.vocab.stoi
        SRC.vocab.itos = TRG.vocab.itos
        print('[Info] Get merged vocabulary size:', len(TRG.vocab))

    data = {
        'settings': opt,
        'vocab': {
            'src': SRC,
            'trg': TRG
        },
        'train': train.examples,
        'valid': val.examples,
        'test': test.examples
    }

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)
test_iter = BucketIterator(
    test,
    batch_size=200,
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
)

text_field.build_vocab(train, min_freq=5)


# LSTM model
class LSTM(nn.Module):
    def __init__(self, dimension=128):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(text_field.vocab), dimension)
        flor.log("embedding", self.embedding)
        self.lstm = nn.LSTM(
            input_size=dimension,
            hidden_size=dimension,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
train_data = pd.DataFrame(train_lines)
valid_data = pd.DataFrame(train_lines[dataset_length - valid_size:])

train_data = [
    Example.fromlist([train_data.questions[i], train_data.answers[i]], fields)
    for i in range(train_data.shape[0])
]
valid_data = [
    Example.fromlist([valid_data.questions[i], valid_data.answers[i]], fields)
    for i in range(valid_data.shape[0])
]

train_data = Dataset(train_data, fields)
valid_data = Dataset(valid_data, fields)

Question.build_vocab(train_data, min_freq=2)
Answer.build_vocab(
    train_data,
    vectors=torchtext.vocab.Vectors("./python_code_glove_embedding_300.txt"),
    min_freq=2)

print(f"Unique tokens in Question vocabulary: {len(Question.vocab)}")
print(f"Unique tokens in Answer vocabulary: {len(Answer.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 32
print('BATCH_SIZE:', BATCH_SIZE)

train_iterator, valid_iterator = BucketIterator.splits(