def build(self):
    print('Build Vocabulary from ', self.path)
    tokenize = BuildVocab.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=35,
                 use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
    data = TabularDataset(path=self.path, format='tsv', skip_header=False,
                          fields=datafields)
    TEXT.build_vocab(data, vectors=GloVe(name='6B', dim=300), max_size=1000)
    # train_iter = BucketIterator(train_data, batch_size=32,
    #                             sort_key=lambda x: len(x.text),
    #                             repeat=False, shuffle=True)
    self.stoi = TEXT.vocab.stoi
    self.vectors = TEXT.vocab.vectors
def vocab_builder(self):
    # self.eid_field = Field(sequential=False, tokenize)
    print('Build Vocabulary')
    tokenize = BiGraphTextDataset.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=35,
                 use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
    path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
    train_data = TabularDataset(path=path, format='tsv', skip_header=False,
                                fields=datafields)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    # train_iter = BucketIterator(train_data, batch_size=32,
    #                             sort_key=lambda x: len(x.text),
    #                             repeat=False, shuffle=True)
    self.stoi_dict = TEXT.vocab.stoi
    self.vocab_vectors = TEXT.vocab.vectors
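# --- Illustrative sketch, not part of the original code ---
# The stoi/vectors pair stored by vocab_builder() is typically used to
# initialise an embedding layer. `dataset` stands in for an instance of the
# class that defines vocab_builder() (presumably BiGraphTextDataset); the
# helper name below is hypothetical.
import torch.nn as nn

def make_embedding_layer(dataset, freeze=False):
    # dataset.vocab_vectors is a (vocab_size, 300) tensor aligned with
    # dataset.stoi_dict, so row i holds the GloVe vector for token index i.
    return nn.Embedding.from_pretrained(dataset.vocab_vectors, freeze=freeze,
                                        padding_idx=dataset.stoi_dict['<pad>'])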
def buildDataSets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Model parameter
    MAX_SEQ_LEN = 16
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields
    label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                        dtype=torch.int8)
    text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False,
                       include_lengths=False, batch_first=True,
                       fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)
    fields = {'label': ('label', label_field), 'text': ('text', text_field)}

    # TabularDataset
    train, valid, test = TabularDataset.splits(path='memesData/data',
                                               train='train.jsonl',
                                               validation='dev_unseen.jsonl',
                                               test='dev_seen.jsonl',
                                               format='JSON', fields=fields)

    # Iterators
    train_iter = BucketIterator(train, batch_size=8,
                                sort_key=lambda x: len(x.text), train=True,
                                sort=True, sort_within_batch=True)
    valid_iter = BucketIterator(valid, batch_size=8,
                                sort_key=lambda x: len(x.text), train=True,
                                sort=True, sort_within_batch=True)
    test_iter = Iterator(test, batch_size=8, train=False, shuffle=False,
                         sort=False)
    return train_iter, valid_iter, test_iter
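# --- Illustrative sketch, not part of the original code ---
# One way the iterators returned by buildDataSets() might be consumed; the
# batch attribute names ('text', 'label') follow the field dict above, and the
# data files are assumed to exist under memesData/data.
train_iter, valid_iter, test_iter = buildDataSets()
batch = next(iter(train_iter))
# batch.text: (batch_size, 16) token-id tensor (batch_first, fix_length=16)
# batch.label: int8 label tensor
print(batch.text.shape, batch.label.shape)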
def produce_iterators(train_filename, valid_filename, test_filename,
                      asr_tokenizer, ttx_tokenizer=None):
    """
    Produce datasets for each of training, validation and test data.
    Also build vocabs for true text, tags, and ASR.
    :param train_filename: location of train data csv
    :param valid_filename: location of valid data csv
    :param test_filename: location of test data csv
    :return:
    """
    TTX = Field(tokenize=lambda x: tokenize_TTX(x, ttx_tokenizer),
                init_token='<sos>', eos_token='<eos>', lower=False,
                batch_first=True)
    TRG = Field(tokenize=tokenize_TRG, init_token='<sos>', eos_token='<eos>',
                lower=False, batch_first=True)
    ASR = Field(tokenize=lambda x: tokenize_ASR(x, asr_tokenizer),
                init_token='<sos>', eos_token='<eos>', lower=False,
                batch_first=True)
    fields = {
        'true_text': ('true_text', TTX),
        'tags': ('tags', TRG),
        'asr': ('asr', ASR)
    }
    train_data, valid_data, test_data = TabularDataset.splits(
        path='.\\', train=train_filename, validation=valid_filename,
        test=test_filename, format='csv', fields=fields)
    # Put min_freq at 2 or higher for real data
    TTX.build_vocab(train_data, min_freq=1)
    TRG.build_vocab(train_data, min_freq=1)
    ASR.build_vocab(train_data, min_freq=1)
    return train_data, valid_data, test_data, TTX, TRG, ASR
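# --- Illustrative sketch, not part of the original code ---
# Inspecting what produce_iterators() returns. The file names and the
# tokenizer variables below are hypothetical stand-ins for the real ones.
train_data, valid_data, test_data, TTX, TRG, ASR = produce_iterators(
    'train.csv', 'valid.csv', 'test.csv',
    asr_tokenizer=my_asr_tokenizer, ttx_tokenizer=my_ttx_tokenizer)
print(len(TTX.vocab), len(TRG.vocab), len(ASR.vocab))  # vocab sizes from train_data
print(vars(train_data.examples[0]))  # keys: true_text, tags, asr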
def create_iterators(data_file_location, split_to_train_and_test=True):
    if split_to_train_and_test:
        train, test = TabularDataset(path=data_file_location, format="TSV",
                                     fields=fields, skip_header=True).split()
        train_iter = BucketIterator(train, batch_size=Parameters.BATCH_SIZE,
                                    device=Parameters.DEVICE, train=True,
                                    shuffle=True)
        test_iter = BucketIterator(test, batch_size=Parameters.BATCH_SIZE,
                                   device=Parameters.DEVICE, train=True,
                                   shuffle=True, sort=False)
        answer = train_iter, test_iter
    else:
        dataset = TabularDataset(path=data_file_location, format="TSV",
                                 fields=fields, skip_header=True)
        answer = BucketIterator(dataset, batch_size=Parameters.BATCH_SIZE,
                                device=Parameters.DEVICE, train=True,
                                shuffle=True, sort=False)
    print("Finished dataset preparation")
    return answer
def get_dataset(path_do_data: str, transformer: bool) -> TabularDataset:
    SRC = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>',
                lower=True, batch_first=False)
    TRG = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>',
                lower=True, batch_first=False)
    dataset = TabularDataset(path=path_do_data, format='tsv',
                             fields=[('trg', TRG), ('src', SRC)])
    return SRC, TRG, dataset
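# --- Illustrative sketch, not part of the original code ---
# A typical follow-up to get_dataset(): build the vocabularies on the returned
# dataset, split it, and wrap the splits in iterators. The path, batch size
# and 90/10 split are placeholders, and BucketIterator is assumed to be
# imported as in the other snippets.
SRC, TRG, dataset = get_dataset('data/pairs.tsv', transformer=True)
SRC.build_vocab(dataset, min_freq=2)
TRG.build_vocab(dataset, min_freq=2)
train_data, valid_data = dataset.split(split_ratio=0.9)
train_iter, valid_iter = BucketIterator.splits(
    (train_data, valid_data), batch_size=64,
    sort_key=lambda x: len(x.src), sort_within_batch=True)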
def test(file_list):
    sentences = list(
        map(lambda x: list(sent_tokenize(open(x).read())), file_list))
    train_set = sentences[:math.floor(len(sentences) / 2)]
    test_set = sentences[math.floor(len(sentences) / 2):]
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for i in train_set:
        train_data = pd.concat([train_data, pd.DataFrame(i)])
    for j in test_set:
        test_data = pd.concat([test_data, pd.DataFrame(j)])
    train_data.to_csv("train_data.csv", index=False)
    test_data.to_csv("test_data.csv", index=False)
    TEXT = data.Field(sequential=True, use_vocab=True, tokenize=word_tokenize,
                      lower=True, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=False,
                       is_target=True)
    train_data, test_data = TabularDataset.splits(
        path='.', train='train_data.csv', test='test_data.csv', format='csv',
        fields=[('text', TEXT), ('label', LABEL)], skip_header=True)
    # Build the vocabulary before numericalising batches (use_vocab=True).
    TEXT.build_vocab(train_data)
    batch_size = 5
    train_loader = Iterator(dataset=train_data, batch_size=batch_size)
    test_loader = Iterator(dataset=test_data, batch_size=batch_size)
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

TEXT = Field(batch_first=True, use_vocab=False, tokenize=tokenize_and_cut,
             preprocessing=tokenizer.convert_tokens_to_ids,
             init_token=init_token_idx, eos_token=eos_token_idx,
             pad_token=pad_token_idx, unk_token=unk_token_idx)
LABEL = LabelField(dtype=torch.long, use_vocab=False)
fields = [('data', TEXT), ('label', LABEL)]

train, valid, test = TabularDataset.splits(path=source_folder,
                                           train='train.csv',
                                           validation='validation.csv',
                                           test='test.csv', format='CSV',
                                           fields=fields, skip_header=True)
train_generator, val_generator, test_generator = BucketIterator.splits(
    (train, valid, test), batch_size=batch_size, device=device, sort=False)

criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

all_statedict_path = glob.glob('/root/logs/*.pth')
for state_dict_path in all_statedict_path:
    print(state_dict_path)
    epoch_loss = 0
    epoch_acc = 0
    model = phobert_lstm(phobert_path=phobert_path,
print(device)

# python -m spacy download en
spacy_en = spacy.load("en")


def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


Texto = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
Valoracion = Field(sequential=False, use_vocab=False)
fields = {"Texto": ("t", Texto), "Valoracion": ("v", Valoracion)}
train_data, test_data = TabularDataset.splits(path='/content/Dataset',
                                              train='train.csv',
                                              test='test.csv', format='csv',
                                              fields=fields)
len(train_data), len(test_data)
print(vars(train_data.examples[0]))
Texto.build_vocab(train_data, max_size=10000, min_freq=1,
                  vectors="glove.6B.100d")
Texto.vocab.freqs.most_common(25)
Texto.vocab.itos[:10]
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=2, device=device)
import torch
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import flor

flor.flags.NAME = 'kaggle-nlp-disasters-rnn'
flor.flags.REPLAY = False

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True,
                   batch_first=True)
fields = [('words', text_field), ('target', label_field)]
fields_test = [('words', text_field)]

train, valid = TabularDataset.splits(path='data', train='train_rnn.csv',
                                     validation='valid_rnn.csv', format='CSV',
                                     fields=fields, skip_header=True)
test = TabularDataset(path='data/test_rnn.csv', format='CSV',
                      fields=fields_test, skip_header=True)

train_iter = BucketIterator(train, batch_size=200,
                            sort_key=lambda x: len(x.words), device=device,
                            sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=200,
                            sort_key=lambda x: len(x.words), device=device,
                            sort=True, sort_within_batch=True)
test_iter = BucketIterator(test, batch_size=200,
                           sort_key=lambda x: len(x.words), device=device,
                           sort=True, sort_within_batch=True)

text_field.build_vocab(train, min_freq=5)


class LSTM(nn.Module):

    def __init__(self, dimension=128):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(text_field.vocab), 300)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=300, hidden_size=dimension,
                            num_layers=1, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear(2 * dimension, 1)
INPUT = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>',
              lower=True)
TARGET = Field(sequential=True, tokenize=tokenize, init_token='<sos>',
               eos_token='<eos>', lower=True)
datafields = [("input", INPUT), ("target", TARGET)]
trn, vld, tst = TabularDataset.splits(path="data/" + data_size,
                                      train=train_csv,
                                      validation=validation_csv,
                                      test=test_csv, format='csv',
                                      skip_header=True, fields=datafields)
print(f"Number of {data_size} training examples: {len(trn.examples)}")
print(f"Number of {data_size} validation examples: {len(vld.examples)}")
print(f"Number of {data_size} test examples: {len(tst.examples)}")
INPUT.build_vocab(trn)
TARGET.build_vocab(trn)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, val_iter, test_iter = BucketIterator.splits(
    (trn, vld, tst),
#     fields = (SRC, TRG))

# fetch from Github repo
# !wget https://raw.githubusercontent.com/tberg12/cse291spr21/main/assignment1/train.json
# !wget https://raw.githubusercontent.com/tberg12/cse291spr21/main/assignment1/valid.json
# !wget https://raw.githubusercontent.com/tberg12/cse291spr21/main/assignment1/test.json
# and load to same variables

fields = {'src': ('src', SRC), 'trg': ('trg', TRG)}
train_data, valid_data, test_data = TabularDataset.splits(
    path='.', train='train.json', validation='valid.json', test='test.json',
    format='json', fields=fields)
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE,
    sort_within_batch=True,
# Model parameter
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields
label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False,
                   include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX,
                   unk_token=UNK_INDEX)
fields = [('label', label_field), ('titletext', text_field)]

# TabularDataset
source_folder = "data/real_fake_news"
destination_folder = "outs/debug"
train, valid, test = TabularDataset.splits(
    path=source_folder, train='train{}.csv'.format(debug_flag),
    validation='valid{}.csv'.format(debug_flag),
    test='test{}.csv'.format(debug_flag), format='CSV', fields=fields,
    skip_header=True)

# Iterators
train_iter = BucketIterator(train, batch_size=16,
                            sort_key=lambda x: len(x.titletext), device=device,
                            train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16,
                            sort_key=lambda x: len(x.titletext), device=device,
                            train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=16, device=device, train=False,
                     shuffle=False, sort=False)


# Build model
class BERT(nn.Module):

    def __init__(self):
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    parser = argparse.ArgumentParser()
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-lr_mul', type=float, default=2.0)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-output_dir', type=str, default=None)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    opt = parser.parse_args()

    english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng,
                    lower=True, pad_token='<blank>', init_token='<s>',
                    eos_token='</s>')
    german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger,
                   lower=True, pad_token='<blank>', init_token='<s>',
                   eos_token='</s>')
    fields = {'English': ('eng', english), 'German': ('ger', german)}
    train_data, test_data = TabularDataset.splits(path='', train='train.json',
                                                  test='test.json',
                                                  format='json', fields=fields)

    english.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get source language vocabulary size:', len(english.vocab))
    german.build_vocab(train_data, max_size=1000, min_freq=1)
    print('[Info] Get target language vocabulary size:', len(german.vocab))

    batch_size = opt.batch_size
    # data = pickle.load(open(opt.data_file, 'rb'))
    opt.src_pad_idx = english.vocab.stoi['<blank>']
    opt.trg_pad_idx = german.vocab.stoi['<blank>']
    opt.src_vocab_size = len(english.vocab)
    opt.trg_vocab_size = len(german.vocab)

    devices = [0, 1, 2, 3]
    # Use the target pad token index (not the vocabulary size) for padding.
    pad_idx = opt.trg_pad_idx
    model = make_model(len(english.vocab), len(german.vocab), N=6)
    model.cuda()
    criterion = LabelSmoothing(size=len(german.vocab), padding_idx=pad_idx,
                               smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 12000
    train_iter = MyIterator(train_data, batch_size=BATCH_SIZE, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(test_data, batch_size=BATCH_SIZE, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.eng), len(x.ger)),
                            batch_size_fn=batch_size_fn, train=False)
    model_par = nn.DataParallel(model, device_ids=devices)
    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
                        torch.optim.Adam(model.parameters(), lr=0,
                                         betas=(0.9, 0.98), eps=1e-9))

    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
                  MultiGPULossCompute(model.generator, criterion,
                                      devices=devices, opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
                         MultiGPULossCompute(model.generator, criterion,
                                             devices=devices, opt=None))
        print(loss)

    # Greedy-decode one validation example and compare with the target.
    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != english.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model, src, src_mask, max_len=60,
                            start_symbol=german.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = german.vocab.itos[out[0, i]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = german.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        break
def split_data(dataset: TabularDataset, train_size: float, valid_size: float,
               test_size: float):
    # torchtext's Dataset.split() reads a 3-element split_ratio as
    # [train, test, valid] but returns the splits as (train, valid, test),
    # so the ratio list is ordered accordingly here.
    train_data, valid_data, test_data = dataset.split(
        split_ratio=[train_size, test_size, valid_size])
    return train_data, valid_data, test_data
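# --- Illustrative sketch, not part of the original code ---
# An 80/10/10 split of an already-loaded TabularDataset; `dataset` is assumed
# to exist (e.g. built by one of the TabularDataset calls above).
train_data, valid_data, test_data = split_data(dataset, train_size=0.8,
                                               valid_size=0.1, test_size=0.1)
print(len(train_data), len(valid_data), len(test_data))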
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False,
                   include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX,
                   unk_token=UNK_INDEX)
fields = [('index', label_field), ('text', text_field),
          ('label', label_field)]

# TabularDataset
train, valid, test = TabularDataset.splits(path='./data',
                                           train='IMDB_single.csv',
                                           validation='IMDBs.csv',
                                           test='IMDBs.csv', format='CSV',
                                           fields=fields, skip_header=True)

# Iterators
train_iter = BucketIterator(train, batch_size=16,
                            sort_key=lambda x: len(x.text), device=device,
                            train=True, sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=16,
label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize="spacy", lower=True, include_lengths=True,
                   batch_first=True)
fields = [("words", text_field), ("target", label_field)]
fields_test = [("words", text_field)]

train, valid = TabularDataset.splits(
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
    fields=fields,
    skip_header=True,
)
test = TabularDataset(path="data/test_rnn.csv", format="CSV",
                      fields=fields_test, skip_header=True)

train_iter = BucketIterator(
    train,
    batch_size=flor.log("batch_size", 200),
    sort_key=lambda x: len(x.words),
    device=device,
    sort=True,
    sort_within_batch=True,
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)
max_input_length = tokenizer.max_model_input_sizes['vinai/phobert-base']

TEXT = Field(batch_first=True, use_vocab=False, tokenize=tokenize_and_cut,
             preprocessing=tokenizer.convert_tokens_to_ids,
             init_token=init_token_idx, eos_token=eos_token_idx,
             pad_token=pad_token_idx, unk_token=unk_token_idx)
LABEL = LabelField(dtype=torch.long, use_vocab=False)
fields = [('data', TEXT), ('label', LABEL)]

train, valid, test = TabularDataset.splits(path=SOURCE_FOLDER,
                                           train='train.csv',
                                           validation='validation.csv',
                                           test='test.csv', format='CSV',
                                           fields=fields, skip_header=True)
train_generator, val_generator, test_generator = BucketIterator.splits(
    (train, valid, test), batch_size=BATCH_SIZE, device=device, sort=False)

if not os.path.exists(log_dir):
    os.makedirs(log_dir)
writer = tensorboardX.SummaryWriter()

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

global_count = 0
for epoch in range(NUM_EPOCHS):