print(df.head())
print('Count of samples is {}'.format(len(df)))

train_df = df[:25000]
test_df = df[25000:]
train_df.to_csv('data/train_data.csv', index=False)
test_df.to_csv('data/test_data.csv', index=False)

from torchtext import data

# Declare fields
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=str.split,
                  lower=True, batch_first=True, fix_length=20)
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=False,
                   is_target=True)

from torchtext.data import TabularDataset

train_data, test_data = TabularDataset.splits(
    path='.', train='data/train_data.csv', test='data/test_data.csv',
    format='csv', fields=[('text', TEXT), ('label', LABEL)])
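# --- Hedged continuation sketch (not part of the original snippet) ---
# Assumes the TEXT/LABEL fields and the train_data/test_data TabularDatasets
# built above; min_freq and batch_size below are illustrative placeholders.
from torchtext import data

TEXT.build_vocab(train_data, min_freq=2)        # map tokens to integer ids
print('Vocabulary size: {}'.format(len(TEXT.vocab)))

train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=64,
    sort_key=lambda ex: len(ex.text),
    sort_within_batch=False)

batch = next(iter(train_iter))
print(batch.text.shape)    # [batch_size, 20] because batch_first=True and fix_length=20
print(batch.label.shape)   # [batch_size]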
def test_init_when_nesting_field_is_not_sequential(self):
    nesting_field = data.Field(sequential=False)
    field = data.NestedField(nesting_field)

    assert field.pad_token == "<pad>"
def test_numericalize(self): nesting_field = data.Field(batch_first=True) field = data.NestedField(nesting_field) ex1 = data.Example.fromlist(["john loves mary"], [("words", field)]) ex2 = data.Example.fromlist(["mary cries"], [("words", field)]) dataset = data.Dataset([ex1, ex2], [("words", field)]) field.build_vocab(dataset) examples_data = [[ ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4, ["<w>"] + list("john") + ["</w>", "<cpad>"], ["<w>"] + list("loves") + ["</w>"], ["<w>"] + list("mary") + ["</w>", "<cpad>"], ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4, ], [ ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4, ["<w>"] + list("mary") + ["</w>", "<cpad>"], ["<w>"] + list("cries") + ["</w>"], ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4, ["<cpad>"] * 7, ]] numericalized = field.numericalize(examples_data) assert numericalized.dim() == 3 assert numericalized.size(0) == len(examples_data) for example, numericalized_example in zip(examples_data, numericalized): verify_numericalized_example(field, example, numericalized_example, batch_first=True) # test include_lengths nesting_field = data.Field(batch_first=True) field = data.NestedField(nesting_field, include_lengths=True) ex1 = data.Example.fromlist(["john loves mary"], [("words", field)]) ex2 = data.Example.fromlist(["mary cries"], [("words", field)]) dataset = data.Dataset([ex1, ex2], [("words", field)]) field.build_vocab(dataset) examples_data = [[ ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4, ["<w>"] + list("john") + ["</w>", "<cpad>"], ["<w>"] + list("loves") + ["</w>"], ["<w>"] + list("mary") + ["</w>", "<cpad>"], ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4, ], [ ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4, ["<w>"] + list("mary") + ["</w>", "<cpad>"], ["<w>"] + list("cries") + ["</w>"], ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4, ["<cpad>"] * 7, ]] numericalized, seq_len, word_len = field.numericalize( (examples_data, [5, 4], [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]])) assert numericalized.dim() == 3 assert len(seq_len) == 2 assert len(word_len) == 2 assert numericalized.size(0) == len(examples_data) for example, numericalized_example in zip(examples_data, numericalized): verify_numericalized_example(field, example, numericalized_example, batch_first=True)
# load MR dataset def mr(text_field, label_field, **kargs): train_data, dev_data = mydatasets.MR.splits(text_field, label_field) text_field.build_vocab(train_data, dev_data, min_freq=args.min_freq) label_field.build_vocab(train_data, dev_data) train_iter, dev_iter = data.Iterator.splits( (train_data, dev_data), batch_sizes=(args.batch_size, len(dev_data)), **kargs) return train_iter, dev_iter # load data print("\nLoading data...") text_field = data.Field(lower=True) # text_field = data.Field(lower=False) label_field = data.Field(sequential=False) static_text_field = data.Field(lower=True) static_label_field = data.Field(sequential=False) if args.FIVE_CLASS_TASK: print("Executing 5 Classification Task......") # train_iter, dev_iter, test_iter = mrs_five(args.datafile_path, args.name_trainfile, # args.name_devfile, args.name_testfile, args.char_data, text_field, label_field, device=-1, repeat=False, shuffle=args.epochs_shuffle) if args.CNN_MUI is True or args.DEEP_CNN_MUI is True: train_iter, dev_iter, test_iter = mrs_five_mui( args.datafile_path, args.name_trainfile, args.name_devfile, args.name_testfile, args.char_data,
if __name__ == '__main__':
    # Set up
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    DE = data.Field(tokenize=tokenize_de)
    EN = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                    eos_token=EOS_WORD)  # only the target needs BOS/EOS

    MAX_LEN = 20
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'), fields=(DE, EN),
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
        len(vars(x)['trg']) <= MAX_LEN)

    MIN_FREQ = 5
    DE.build_vocab(train.src, min_freq=MIN_FREQ)
    EN.build_vocab(train.trg, min_freq=MIN_FREQ)

    print(DE.vocab.freqs.most_common(10))
    print("Size of German vocab", len(DE.vocab))
    print(EN.vocab.freqs.most_common(10))
    print("Size of English vocab", len(EN.vocab))
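# --- Hedged continuation sketch (not in the original script) ---
# Shows how the filtered IWSLT splits are typically batched with the legacy
# torchtext API; the batch size and device handling here are assumptions.
import torch
from torchtext import data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=32,
    device=device,
    sort_key=lambda x: len(x.src),   # group sentences of similar source length
    sort_within_batch=True)

batch = next(iter(train_iter))
print(batch.src.shape)   # [src_len, batch_size]; these Fields keep the default batch_first=False
print(batch.trg.shape)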
if True: import spacy spacy_de = spacy.load('de') spacy_en = spacy.load('en') def tokenize_de(text): return [tok.text for tok in spacy_de.tokenizer(text)] def tokenize_en(text): return [tok.text for tok in spacy_en.tokenizer(text)] BOS_WORD = '<s>' EOS_WORD = '</s>' BLANK_WORD = "<blank>" SRC = data.Field(tokenize=tokenize_en, pad_token=BLANK_WORD) TGT = data.Field(tokenize=tokenize_de, init_token = BOS_WORD, eos_token = EOS_WORD, pad_token=BLANK_WORD) MAX_LEN = 100 train, val, test = datasets.IWSLT.splits( exts=('.en', '.de'), fields=(SRC, TGT), filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN) MIN_FREQ = 2 SRC.build_vocab(train.src, min_freq=MIN_FREQ) TGT.build_vocab(train.trg, min_freq=MIN_FREQ) class MyIterator(data.Iterator): def create_batches(self): if self.train:
import codecs import os import torch from subword_nmt.apply_bpe import BPE from torchtext import data, datasets import shared BOS_WORD = '<s>' EOS_WORD = '</s>' BLANK_WORD = "<blank>" MAX_LEN = 350 MIN_VOCAB_FREQ = 1 tokenizer_fun = lambda s: s.split() SRC = data.Field(pad_token=BLANK_WORD, batch_first=True, tokenize=tokenizer_fun) TGT = data.Field(init_token = BOS_WORD, eos_token = EOS_WORD, pad_token=BLANK_WORD, batch_first=True, tokenize=tokenizer_fun) def load_dataset(src_lang: str, tgt_lang: str, min_length: int = 0, only_val: bool = False): print("Loading dataset...") if only_val: train = None test = None val = datasets.WMT14.splits(root=os.path.abspath(os.path.join(shared.DATA_FOLDER)), exts=(f'.{src_lang}', f'.{tgt_lang}'), fields=(SRC, TGT), train=None, test=None, filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN and len(vars(x)['src']) >= min_length and len(vars(x)['trg']) >= min_length)[0] else: train, val, test = datasets.WMT14.splits(root=os.path.abspath(os.path.join(shared.DATA_FOLDER)), exts=(f'.{src_lang}', f'.{tgt_lang}'), fields=(SRC, TGT),
import torch from torchtext import data from torchtext import datasets from torchtext.vocab import GloVe from cove import MTLSTM parser = ArgumentParser() parser.add_argument('--device', default=0, help='Which device to run one; -1 for CPU') parser.add_argument('--data', default='.data', help='where to store data') parser.add_argument('--embeddings', default='.embeddings', help='where to store embeddings') args = parser.parse_args() inputs = data.Field(lower=True, include_lengths=True, batch_first=True) print('Generating train, dev, test splits') train, dev, test = datasets.IWSLT.splits(root=args.data, exts=['.en', '.de'], fields=[inputs, inputs]) train_iter, dev_iter, test_iter = data.Iterator.splits( (train, dev, test), batch_size=100, device=torch.device(args.device) if args.device >= 0 else None) print('Building vocabulary') inputs.build_vocab(train, dev, test) inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=args.embeddings)) outputs_last_layer_cove = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors, model_cache=args.embeddings) outputs_both_layer_cove = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors, layer0=True, model_cache=args.embeddings) outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors, layer0=True, residual_embeddings=True, model_cache=args.embeddings) if args.device >=0:
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True

if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: CUDA is available but not being used. You are using CPU for training.")

################## Load the datasets ##################
TEXT = data.Field(lower=True)
ED = data.Field(sequential=False, use_vocab=False)

train, dev = data.TabularDataset.splits(
    path=args.output, train='entity_train.txt', validation='entity_valid.txt',
    format='tsv', fields=[('text', TEXT), ('mid', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None),
         ('obj', None), ('text', TEXT), ('ed', None)]
test = data.TabularDataset(path=os.path.join(args.output, 'test.txt'),
                           format='tsv', fields=field)

TEXT.build_vocab(train, dev, test)  # training data includes validation data

match_embedding = 0
TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), words_dim)
for i, token in enumerate(TEXT.vocab.itos):
    wv_index = stoi.get(token, None)
    if wv_index is not None:
        TEXT.vocab.vectors[i] = vectors[wv_index]
        match_embedding += 1
    else:
torch.backends.cudnn.deterministic = True

if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("CUDA enabled")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("CUDA is available but is not being used")

np.random.seed(args.seed)
random.seed(args.seed)

# Set up the data for training
# SST-1
if args.dataset == 'SST-1':
    TEXT = data.Field(batch_first=True, tokenize=clean_str_sst)
    LABEL = data.Field(sequential=False)
    train, dev, test = SST1Dataset.splits(TEXT, LABEL)
elif args.dataset == 'SST-2':
    TEXT = data.Field(batch_first=True)
    LABEL = data.Field(sequential=False)
    # train, dev, test = SST2Dataset.splits(TEXT, LABEL)
    train, dev, test = torchtext.datasets.SST.splits(
        TEXT, LABEL, train_subtrees=True,
        filter_pred=lambda ex: ex.label != 'neutral')
elif args.dataset == 'trec':
    TEXT = data.Field(batch_first=True)
    LABEL = data.Field(sequential=False)
import torch
from torchtext import data
from torchtext import datasets
import random
from torchsummary import summary

SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

print("downloading data : ")
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

TEXT.build_vocab(train_data, max_size=5000)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
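# --- Illustrative check (not part of the original script) ---
# Peek at one batch to see what the iterators above yield; shapes follow from
# the default batch_first=False of data.Field.
batch = next(iter(train_iterator))
print(batch.text.shape)    # [seq_len, batch_size]
print(batch.label.shape)   # [batch_size], float because of LabelField(dtype=torch.float)
print(batch.label[:5])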
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, CharNGram

# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)

# make splits for data
train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device=0)
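# --- Hedged follow-up sketch (an assumption, not part of the original example) ---
# A common next step is to copy the pretrained GloVe vectors attached to
# TEXT.vocab into an nn.Embedding layer and embed one batch.
import torch.nn as nn

embedding = nn.Embedding(len(TEXT.vocab), 300)
embedding.weight.data.copy_(TEXT.vocab.vectors)

batch = next(iter(train_iter))
tokens, lengths = batch.text        # include_lengths=True -> (token ids, lengths)
embedded = embedding(tokens)        # [batch_size, seq_len, 300] since batch_first=True
print(embedded.shape, lengths)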
if not args.cuda:
    print(
        "WARNING: You have a CUDA device, so you should probably run with --cuda"
    )
else:
    torch.cuda.manual_seed(args.seed)

############################
# Load data
############################

print("Loading data...")
PAD_WORD = '<blank>'
eval_batch_size = args.eval_batch_size

src = data.Field(pad_token=PAD_WORD)
trg = data.Field(pad_token=PAD_WORD)

train_data = datasets.TranslationDataset(path=args.data + '/train',
                                         exts=('.en', '.de'),
                                         fields=(src, trg))
val_data = datasets.TranslationDataset(path=args.data + '/valid',
                                       exts=('.en', '.de'),
                                       fields=(src, trg))
test_data = datasets.TranslationDataset(path=args.data + '/test',
                                        exts=('.en', '.de'),
                                        fields=(src, trg))
print("DONE\n")

############################
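# --- Hedged continuation sketch (not from the original script) ---
# The snippet stops after building the datasets; the usual next step is to
# build the vocabularies and batching iterators. min_freq is a placeholder and
# args.batch_size is assumed to exist alongside args.eval_batch_size.
src.build_vocab(train_data, min_freq=2)
trg.build_vocab(train_data, min_freq=2)

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_sizes=(args.batch_size, eval_batch_size, eval_batch_size),
    sort_key=lambda x: len(x.src),
    sort_within_batch=True)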
'''
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
'''
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext import data

# > The first difference is that we do not need to set the dtype in
# the LABEL field. --
# https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/5%20-%20Multi-class%20Sentiment%20Analysis.ipynb
# Two classes: dtype=torch.float
TEXT = data.Field(sequential=True, include_lengths=False, batch_first=False)
LABELS = data.LabelField()
NAMES = data.RawField(is_target=False)

# Fields are added by column, left to right, in the underlying table
fields = [('name', NAMES), ('label', LABELS), ('text', TEXT)]

train, dev, test = data.TabularDataset.splits(
    path='tmp/processed', format='CSV', fields=fields,
    train='train.csv', validation='dev.csv', test='test.csv')

# https://github.com/pytorch/text/issues/641
train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test), batch_sizes=(100, 100, 100),
    sort_key=lambda x: len(x.text),
import os

import torch
import torch.optim as O
import torch.nn as nn

from torchtext import data
from torchtext import datasets

from model import SNLIClassifier
from util import get_args, makedirs

args = get_args()

torch.cuda.set_device(args.gpu)
device = torch.device('cuda:{}'.format(args.gpu))

inputs = data.Field(lower=args.lower, tokenize='spacy')
answers = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(inputs, answers)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        inputs.vocab.load_vectors(args.word_vectors)
        makedirs(os.path.dirname(args.vector_cache))
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
def tokenizer(text):
    # create a tokenizer function
    # returns the text of each <class 'spacy.tokens.token.Token'> as a list
    return [tok.text for tok in spacy_en.tokenizer(text)]


import dill

from torchtext import data
import numpy as np

from data import text_utils

if __name__ == '__main__':
    args = argument_parser()

    with open("seq2seq/bak/TEXT.Field", "rb") as f:
        TEXT = dill.load(f)
    LENGTH = data.Field(sequential=False, use_vocab=False)

    embeddings = np.random.random((len(TEXT.vocab.itos), args.embed_size))
    args.TEXT = TEXT

    encoder = SN_MODELS["encoder"](embeddings, args)
    # atten = SN_MODELS["attention"](args.hidden_size * 4, 300)
    # decoder = SN_MODELS["decoder"](embeddings, args)
    atten = SN_MODELS["attention"](args.hidden_size, "general")
    decoder = SN_MODELS["decoder"](embeddings, args, atten)

    model_class = SN_MODELS[args.model_name]
    model = model_class(encoder, decoder, args)
# return [tok for tok in j_t.tokenize(text, wakati=True)]
def tokenizer(text):
    # Tokenize Japanese text with MeCab, collecting the surface form of each node
    wakati = []
    node = tagger.parseToNode(text).next
    while node.next:
        wakati.append(node.surface)
        node = node.next
    return wakati


# Field classes
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True,
                  include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=True)
FILE = data.Field(sequential=False, use_vocab=True)

# Load the data
dataset = data.TabularDataset(path='./sentence.tsv', format='tsv',
                              fields=[('Text', TEXT), ('Label', LABEL),
                                      ('File', FILE)],
                              skip_header=True)

LABEL.build_vocab(dataset)
FILE.build_vocab(dataset)
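# --- Hedged continuation sketch (not in the original snippet) ---
# Builds the TEXT vocabulary, splits the TabularDataset, and creates bucketed
# iterators; the split ratio and batch size are placeholders.
import torch
from torchtext import data

TEXT.build_vocab(dataset, min_freq=1)

train, val = dataset.split(split_ratio=0.8)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=32,
    device=device,
    sort_key=lambda x: len(x.Text),   # field name is 'Text' in the fields list above
    sort_within_batch=True)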
# Split dev dataset into test set and validation set
dev_set = pd.read_csv(args.dev_set)
validation_set, test_set = train_test_split(dev_set, test_size=args.test_size)

# Saving file names to variables
trainloc = args.train_set
valloc = args.save + 'validation_set.csv'
testloc = args.save + 'test_set.csv'

# Saving validation and test set to csv file
validation_set.to_csv(valloc, index=False)
test_set.to_csv(testloc, index=False)

# Create Field objects
tokenize = lambda x: x.split()
TEXT = data.Field(tokenize=tokenize, lower=False, include_lengths=True,
                  init_token='<SOS>', eos_token='<EOS>')
LEX = data.Field(tokenize=tokenize, lower=False,
                 init_token='<SOS>', eos_token='<SOS>')
BIO = data.Field(tokenize=tokenize, lower=False,
                 init_token='<SOS>', eos_token='<SOS>')

# Specify Fields in the dataset
fields = [('context', TEXT), ('question', TEXT), ('bio', BIO), ('lex', LEX)]

# Build the dataset
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='', train=trainloc, validation=valloc, test=testloc,
    fields=fields, format='csv', skip_header=True)

# Build vocabulary
MAX_VOCAB_SIZE = 50000
MIN_COUNT = 5
BATCH_SIZE = args.batch_size
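# --- Hedged continuation sketch (not part of the original excerpt) ---
# Shows how the constants declared above would typically be used to build the
# vocabularies and iterators; the sort key on the 'context' field is an assumption.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, min_freq=MIN_COUNT)
LEX.build_vocab(train_data)
BIO.build_vocab(train_data)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.context),
    sort_within_batch=True)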
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(penalty='l2', multi_class='auto',
                               solver='saga', max_iter=100, tol=1e-3)),
])

text_clf.fit(train['hypothesis'], train['label'])
predicted = text_clf.predict(test['hypothesis'])
print(np.mean(predicted == test['label']))

with open("tfidf.txt", 'w') as f:
    for idx in range(len(predicted)):
        f.write("{}\n".format(map_to_word(predicted[idx])))

TEXT = data.Field(tokenize='spacy', lower=True)
LABEL = data.LabelField()

train_data, valid_data, test_data = datasets.SNLI.splits(TEXT, LABEL)

MIN_FREQ = 2
TEXT.build_vocab(train_data,
                 min_freq=MIN_FREQ,
                 vectors="glove.6B.300d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

BATCH_SIZE = 256
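# --- Hedged continuation sketch (not in the original excerpt) ---
# BATCH_SIZE is set but not yet used above; a typical next step builds the
# SNLI iterators. Device handling below is an assumption.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

batch = next(iter(train_iterator))
print(batch.premise.shape, batch.hypothesis.shape, batch.label.shape)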
def train_discriminator( dataset, dataset_fp=None, pretrained_model="gpt2-medium", epochs=10, batch_size=64, log_interval=10, save_model=False, cached=False, no_cuda=False, ): device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" print("Preprocessing {} dataset...".format(dataset)) start = time.time() if dataset == "SST": idx2class = [ "positive", "negative", "very positive", "very negative", "neutral" ] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) text = torchtext_data.Field() label = torchtext_data.Field(sequential=False) train_data, val_data, test_data = datasets.SST.splits( text, label, fine_grained=True, train_subtrees=True, ) x = [] y = [] for i in trange(len(train_data), ascii=True): seq = TreebankWordDetokenizer().detokenize( vars(train_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) x.append(seq) y.append(class2idx[vars(train_data[i])["label"]]) train_dataset = Dataset(x, y) test_x = [] test_y = [] for i in trange(len(test_data), ascii=True): seq = TreebankWordDetokenizer().detokenize( vars(test_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) test_x.append(seq) test_y.append(class2idx[vars(test_data[i])["label"]]) test_dataset = Dataset(test_x, test_y) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 2, } elif dataset == "clickbait": idx2class = ["non_clickbait", "clickbait"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) with open("datasets/clickbait/clickbait_train_prefix.txt") as f: data = [] for i, line in enumerate(f): try: data.append(eval(line)) except Exception: print("Error evaluating line {}: {}".format(i, line)) continue x = [] y = [] with open("datasets/clickbait/clickbait_train_prefix.txt") as f: for i, line in enumerate(tqdm(f, ascii=True)): try: d = eval(line) seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: print( "Line {} is longer than maximum length {}".format( i, max_length_seq)) continue x.append(seq) y.append(d["label"]) except Exception: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 1, } elif dataset == "toxic": idx2class = ["non_toxic", "toxic"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) x = [] y = [] with open("datasets/toxic/toxic_train.txt") as f: for i, line in enumerate(tqdm(f, ascii=True)): try: d = eval(line) seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: seq = torch.tensor([50256] + 
seq, device=device, dtype=torch.long) else: print( "Line {} is longer than maximum length {}".format( i, max_length_seq)) continue x.append(seq) y.append(int(np.sum(d["label"]) > 0)) except Exception: print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 0, } else: # if dataset == "generic": # This assumes the input dataset is a TSV with the following structure: # class \t text if dataset_fp is None: raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.") classes = set() with open(dataset_fp) as f: csv_reader = csv.reader(f, delimiter="\t") for row in tqdm(csv_reader, ascii=True): if row: classes.add(row[0]) idx2class = sorted(classes) class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator(class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device).to(device) x = [] y = [] with open(dataset_fp) as f: csv_reader = csv.reader(f, delimiter="\t") for i, row in enumerate(tqdm(csv_reader, ascii=True)): if row: label = row[0] text = row[1] try: seq = discriminator.tokenizer.encode(text) if len(seq) < max_length_seq: seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: print("Line {} is longer than maximum length {}". format(i, max_length_seq)) continue x.append(seq) y.append(class2idx[label]) except Exception: print( "Error tokenizing line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size train_dataset, test_dataset = torch.utils.data.random_split( full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), "embed_size": discriminator.embed_size, "pretrained_model": pretrained_model, "class_vocab": class2idx, "default_class": 0, } end = time.time() print("Preprocessed {} data points".format( len(train_dataset) + len(test_dataset))) print("Data preprocessing took: {:.3f}s".format(end - start)) if cached: print("Building representation cache...") start = time.time() train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device) test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device) end = time.time() print("Building representation cache took: {:.3f}s".format(end - start)) else: train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn) if save_model: with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file: json.dump(discriminator_meta, meta_file) optimizer = optim.Adam(discriminator.parameters(), lr=0.0001) for epoch in range(epochs): start = time.time() print("\nEpoch", epoch + 1) train_epoch( discriminator=discriminator, data_loader=train_loader, optimizer=optimizer, epoch=epoch, log_interval=log_interval, device=device, ) evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device) end = time.time() print("Epoch took: {:.3f}s".format(end 
- start)) print("\nExample prediction") predict(example_sentence, discriminator, idx2class, cached=cached, device=device) if save_model: # torch.save(discriminator.state_dict(), # "{}_discriminator_{}.pt".format( # args.dataset, epoch + 1 # )) torch.save( discriminator.get_classifier().state_dict(), "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1), )
def main(): # Use a GPU if available, as it should be faster. device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') student.device = device print("Using device: {}" "\n".format(str(device))) # Load the training dataset, and create a dataloader to generate a batch. textField = data.Field(lower=True, include_lengths=True, batch_first=True, preprocessing=student.preprocessing, postprocessing=student.postprocessing, stop_words=student.stopWords) labelField = data.Field(sequential=False, use_vocab=False, is_target=True) dataset = data.TabularDataset('train.json', 'json', { 'reviewText': ('reviewText', textField), 'rating': ('rating', labelField) }) textField.build_vocab(dataset, vectors=student.wordVectors) # Allow training on the entire dataset, or split it for training and validation. if student.trainValSplit == 1: trainLoader = data.BucketIterator(dataset, shuffle=True, batch_size=student.batchSize, sort_key=lambda x: len(x.reviewText), sort_within_batch=True) else: train, validate = dataset.split(split_ratio=student.trainValSplit, stratified=True, strata_field='rating') trainLoader, valLoader = data.BucketIterator.splits( (train, validate), shuffle=True, batch_size=student.batchSize, sort_key=lambda x: len(x.reviewText), sort_within_batch=True) # Get model and optimiser from student. net = student.net.to(device) criterion = student.lossFunc optimiser = student.optimiser # Train. for epoch in range(student.epochs): runningLoss = 0 for i, batch in enumerate(trainLoader): # Get a batch and potentially send it to GPU memory. inputs = textField.vocab.vectors[batch.reviewText[0]].to(device) length = batch.reviewText[1].to(device) labels = batch.rating.type(torch.FloatTensor).to(device) # PyTorch calculates gradients by accumulating contributions # to them (useful for RNNs). # Hence we must manually set them to zero before calculating them. optimiser.zero_grad() # Forward pass through the network. output = net(inputs, length) loss = criterion(output, student.convertLabel(labels)) # Calculate gradients. loss.backward() # Minimise the loss according to the gradient. optimiser.step() runningLoss += loss.item() if i % 32 == 31: print("Epoch: %2d, Batch: %4d, Loss: %.3f" % (epoch + 1, i + 1, runningLoss / 32)) runningLoss = 0 # Save model. torch.save(net.state_dict(), 'savedModel.pth') print("\n" "Model saved to savedModel.pth") # Test on validation data if it exists. if student.trainValSplit != 1: net.eval() closeness = [0 for _ in range(5)] with torch.no_grad(): for batch in valLoader: # Get a batch and potentially send it to GPU memory. inputs = textField.vocab.vectors[batch.reviewText[0]].to( device) length = batch.reviewText[1].to(device) labels = batch.rating.type(torch.FloatTensor).to(device) # Convert network output to integer values. outputs = student.convertNetOutput(net(inputs, length)).flatten() for i in range(5): closeness[i] += torch.sum(abs(labels - outputs) == i).item() accuracy = [x / len(validate) for x in closeness] score = 100 * (accuracy[0] + 0.4 * accuracy[1]) print("\n" "Correct predictions: {:.2%}\n" "One star away: {:.2%}\n" "Two stars away: {:.2%}\n" "Three stars away: {:.2%}\n" "Four stars away: {:.2%}\n" "\n" "Weighted score: {:.2f}".format(*accuracy, score))
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

url = re.compile('(<url>.*</url>)')


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]


# Testing IWSLT
DE = data.Field(tokenize=tokenize_de)
EN = data.Field(tokenize=tokenize_en)

train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))

print(train.fields)
print(len(train))
print(vars(train[0]))
print(vars(train[100]))

DE.build_vocab(train.src, min_freq=3)
EN.build_vocab(train.trg, max_size=50000)

train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=3)

print(DE.vocab.freqs.most_common(10))
def train_from_data(self, train_raw_data, test_raw_data, W, word2index, args): self.word_embed_dim = W.shape[1] self.hidden_size = args.n_hidden self.vocab_size = len(W) self.output_size = 3 if args.model == 'IOG': self.tagger = networks.IOG(self.word_embed_dim, self.output_size, self.vocab_size, args) else: print("model name not found") exit(-1) W = torch.from_numpy(W) self.tagger.word_rep.word_embed.weight = nn.Parameter(W) TEXT = data.Field(sequential=True, use_vocab=False, pad_token=0, batch_first=True, include_lengths=True) LABEL_T = data.Field(sequential=True, use_vocab=False, pad_token=0, batch_first=True) LABEL_O = data.Field(sequential=True, use_vocab=False, pad_token=-1, batch_first=True) LEFT_MASK = data.Field(sequential=True, use_vocab=False, pad_token=0, batch_first=True) RIGHT_MASK = data.Field(sequential=True, use_vocab=False, pad_token=0, batch_first=True) fields = [('text', TEXT), ('target', LABEL_T), ('label', LABEL_O), ('left_mask', LEFT_MASK), ('right_mask', RIGHT_MASK)] if args.use_dev: train_texts, train_t, train_ow, dev_texts, dev_t, dev_ow = self.split_dev( *train_raw_data) dev_data = [[ numericalize(text, word2index), numericalize_label(target, tag2id), numericalize_label(label, tag2id), *self.generate_mask(target) ] for text, target, label in zip(dev_texts, dev_t, dev_ow)] dev_dataset = ToweDataset(fields, dev_data) train_data = [[ numericalize(text, word2index), numericalize_label(target, tag2id), numericalize_label(label, tag2id), *self.generate_mask(target) ] for text, target, label in zip(train_texts, train_t, train_ow)] test_data = [[ numericalize(text, word2index), numericalize_label(target, tag2id), numericalize_label(label, tag2id), *self.generate_mask(target) ] for text, target, label in zip(*test_raw_data)] train_dataset = ToweDataset(fields, train_data) test_dataset = ToweDataset(fields, test_data) device = torch.device( "cuda" if torch.cuda.is_available() and cuda_flag else "cpu") n_gpu = torch.cuda.device_count() np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) train_iter = data.Iterator( train_dataset, batch_size=args.batch_size, sort_within_batch=True, repeat=False, device=device if torch.cuda.is_available() else -1) if args.use_dev: dev_iter = data.Iterator( dev_dataset, batch_size=args.eval_bs, shuffle=False, sort_within_batch=True, repeat=False, device=device if torch.cuda.is_available() else -1) else: dev_iter = None test_iter = data.Iterator( test_dataset, batch_size=args.eval_bs, shuffle=False, sort_within_batch=True, repeat=False, device=device if torch.cuda.is_available() else -1) train.train(self.tagger, train_iter, dev_iter, test_iter, args=args) pass
def main(config): if not os.path.exists(config.model_dir): os.makedirs(config.model_dir) if not os.path.exists(config.log_dir): os.makedirs(config.log_dir) print("\t \t \t the model name is {}".format(config.model_name)) device, n_gpu = get_device() torch.manual_seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) if n_gpu > 0: torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.deterministic = True # cudnn 使用确定性算法,保证每次结果一样 """ sst2 数据准备 """ CHAR_NESTING = data.Field(tokenize=list, lower=True) char_field = data.NestedField(CHAR_NESTING, tokenize='spacy', fix_length=config.sequence_length) word_field = data.Field(tokenize='spacy', lower=True, include_lengths=True, fix_length=config.sequence_length) label_field = data.LabelField(dtype=torch.long) train_iterator, dev_iterator, test_iterator = sst_word_char( config.data_path, word_field, char_field, label_field, config.batch_size, device, config.glove_word_file, config.glove_char_file, config.cache_path) """ 词向量准备 """ word_embeddings = word_field.vocab.vectors char_embeddings = char_field.vocab.vectors model_file = config.model_dir + 'model1.pt' """ 模型准备 """ if config.model_name == "TextRNNHighway": from TextRNNHighway import TextRNNHighway model = TextRNNHighway.TextRNNHighway( config.glove_word_dim, config.glove_char_dim, config.output_dim, config.hidden_size, config.num_layers, config.bidirectional, config.dropout, word_embeddings, char_embeddings, config.highway_layers) elif config.model_name == "TextCNNHighway": from TextCNNHighway import TextCNNHighway filter_sizes = [int(val) for val in config.filter_sizes.split()] model = TextCNNHighway.TextCNNHighway( config.glove_word_dim, config.glove_char_dim, config.filter_num, filter_sizes, config.output_dim, config.dropout, word_embeddings, char_embeddings, config.highway_layers) elif config.model_name == "LSTMATTHighway": from LSTMATTHighway import LSTMATTHighway model = LSTMATTHighway.LSTMATTHighway( config.glove_word_dim, config.glove_char_dim, config.output_dim, config.hidden_size, config.num_layers, config.bidirectional, config.dropout, word_embeddings, char_embeddings, config.highway_layers) elif config.model_name == "TextRCNNHighway": from TextRCNNHighway import TextRCNNHighway model = TextRCNNHighway.TextRCNNHighway( config.glove_word_dim, config.glove_char_dim, config.output_dim, config.hidden_size, config.num_layers, config.bidirectional, config.dropout, word_embeddings, char_embeddings, config.highway_layers) optimizer = optim.Adam(model.parameters()) criterion = nn.CrossEntropyLoss() model = model.to(device) criterion = criterion.to(device) if config.do_train: train(config.epoch_num, model, train_iterator, dev_iterator, optimizer, criterion, ['0', '1'], model_file, config.log_dir, config.print_step, 'highway') model.load_state_dict(torch.load(model_file)) criterion = nn.CrossEntropyLoss() test_loss, test_acc, test_report = evaluate(model, test_iterator, criterion, ['0', '1'], 'highway') print("-------------- Test -------------") print( "\t Loss: {} | Acc: {} | Micro avg F1: {} | Macro avg F1: {} | Weighted avg F1: {}" .format(test_loss, test_acc, test_report['micro avg']['f1-score'], test_report['macro avg']['f1-score'], test_report['weighted avg']['f1-score']))
def test_build_vocab(self): # Set up fields question_field = data.Field(sequential=True) label_field = data.Field(sequential=False) # Write TSV dataset and construct a Dataset self.write_test_ppid_dataset(data_format="tsv") tsv_fields = [("id", None), ("q1", question_field), ("q2", question_field), ("label", label_field)] tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path, format="tsv", fields=tsv_fields) # Write JSON dataset and construct a Dataset self.write_test_ppid_dataset(data_format="json") json_fields = { "question1": ("q1", question_field), "question2": ("q2", question_field), "label": ("label", label_field) } json_dataset = data.TabularDataset(path=self.test_ppid_dataset_path, format="json", fields=json_fields) # Test build_vocab default question_field.build_vocab(tsv_dataset, json_dataset, specials=['<space>']) assert question_field.vocab.freqs == Counter({ 'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4, 'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, 'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2, '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2, 'Abraham': 2, '2+2=?': 2 }) expected_stoi = { '<unk>': 0, '<pad>': 1, '<space>': 2, 'Lincoln': 3, 'When': 4, 'born?': 5, 'do': 6, 'instead': 7, 'of': 8, 'use': 9, 'was': 10, 'you': 11, '"&"': 12, '"and"?': 13, '2+2': 14, '2+2=?': 15, 'Abraham': 16, 'What': 17, 'Where': 18, 'Which': 19, 'is': 20, 'location': 21, 'し?': 22, 'シ': 23 } assert dict(question_field.vocab.stoi) == expected_stoi # Turn the stoi dictionary into an itos list expected_itos = [ x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1]) ] assert question_field.vocab.itos == expected_itos label_field.build_vocab(tsv_dataset, json_dataset) assert label_field.vocab.freqs == Counter({'1': 4, '0': 2}) expected_stoi = {'1': 1, '0': 2, '<unk>': 0} assert dict(label_field.vocab.stoi) == expected_stoi # Turn the stoi dictionary into an itos list expected_itos = [ x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1]) ] assert label_field.vocab.itos == expected_itos # Test build_vocab default question_field.build_vocab(tsv_dataset, json_dataset) assert question_field.vocab.freqs == Counter({ 'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4, 'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, 'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2, '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2, 'Abraham': 2, '2+2=?': 2 }) expected_stoi = { '<unk>': 0, '<pad>': 1, 'Lincoln': 2, 'When': 3, 'born?': 4, 'do': 5, 'instead': 6, 'of': 7, 'use': 8, 'was': 9, 'you': 10, '"&"': 11, '"and"?': 12, '2+2': 13, '2+2=?': 14, 'Abraham': 15, 'What': 16, 'Where': 17, 'Which': 18, 'is': 19, 'location': 20, 'し?': 21, 'シ': 22 } assert dict(question_field.vocab.stoi) == expected_stoi # Turn the stoi dictionary into an itos list expected_itos = [ x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1]) ] assert question_field.vocab.itos == expected_itos label_field.build_vocab(tsv_dataset, json_dataset) assert label_field.vocab.freqs == Counter({'1': 4, '0': 2}) expected_stoi = {'1': 1, '0': 2, '<unk>': 0} assert dict(label_field.vocab.stoi) == expected_stoi # Turn the stoi dictionary into an itos list expected_itos = [ x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1]) ] assert label_field.vocab.itos == expected_itos # Test build_vocab with extra kwargs passed to Vocab question_field.build_vocab(tsv_dataset, json_dataset, max_size=8, min_freq=3) assert question_field.vocab.freqs == Counter({ 'When': 4, 'do': 4, 'you': 
4, 'use': 4, 'instead': 4, 'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2, 'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2, '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2, 'Abraham': 2, '2+2=?': 2 }) expected_stoi = { '<unk>': 0, '<pad>': 1, 'Lincoln': 2, 'When': 3, 'born?': 4, 'do': 5, 'instead': 6, 'of': 7, 'use': 8, 'was': 9 } assert dict(question_field.vocab.stoi) == expected_stoi # Turn the stoi dictionary into an itos list expected_itos = [ x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1]) ] assert question_field.vocab.itos == expected_itos
def load_data( data_cfg: dict ) -> (Dataset, Dataset, Optional[Dataset], Vocabulary, Vocabulary): """ Load train, dev and optionally test data as specified in configuration. Vocabularies are created from the training set with a limit of `voc_limit` tokens and a minimum token frequency of `voc_min_freq` (specified in the configuration dictionary). The training data is filtered to include sentences up to `max_sent_length` on source and target side. :param data_cfg: configuration dictionary for data ("data" part of configuation file) :return: - train_data: training dataset - dev_data: development dataset - test_data: testdata set if given, otherwise None - src_vocab: source vocabulary extracted from training data - trg_vocab: target vocabulary extracted from training data """ # load data from files src_lang = data_cfg["src"] trg_lang = data_cfg["trg"] train_path = data_cfg["train"] dev_path = data_cfg["dev"] test_path = data_cfg.get("test", None) level = data_cfg["level"] lowercase = data_cfg["lowercase"] max_sent_length = data_cfg["max_sent_length"] tok_fun = lambda s: list(s) if level == "char" else s.split() src_field = data.Field(init_token=None, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, tokenize=tok_fun, batch_first=True, lower=lowercase, unk_token=UNK_TOKEN, include_lengths=True) trg_field = data.Field(init_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, tokenize=tok_fun, unk_token=UNK_TOKEN, batch_first=True, lower=lowercase, include_lengths=True) train_data = TranslationDataset( path=train_path, exts=("." + src_lang, "." + trg_lang), fields=(src_field, trg_field), filter_pred=lambda x: len(vars(x)['src']) <= max_sent_length and len( vars(x)['trg']) <= max_sent_length) src_max_size = data_cfg.get("src_voc_limit", sys.maxsize) src_min_freq = data_cfg.get("src_voc_min_freq", 1) trg_max_size = data_cfg.get("trg_voc_limit", sys.maxsize) trg_min_freq = data_cfg.get("trg_voc_min_freq", 1) src_vocab_file = data_cfg.get("src_vocab", None) trg_vocab_file = data_cfg.get("trg_vocab", None) src_vocab = build_vocab(field="src", min_freq=src_min_freq, max_size=src_max_size, dataset=train_data, vocab_file=src_vocab_file) trg_vocab = build_vocab(field="trg", min_freq=trg_min_freq, max_size=trg_max_size, dataset=train_data, vocab_file=trg_vocab_file) dev_data = TranslationDataset(path=dev_path, exts=("." + src_lang, "." + trg_lang), fields=(src_field, trg_field)) test_data = None if test_path is not None: # check if target exists if os.path.isfile(test_path + "." + trg_lang): test_data = TranslationDataset(path=test_path, exts=("." + src_lang, "." + trg_lang), fields=(src_field, trg_field)) else: # no target is given -> create dataset from src only test_data = MonoDataset(path=test_path, ext="." + src_lang, field=src_field) src_field.vocab = src_vocab trg_field.vocab = trg_vocab return train_data, dev_data, test_data, src_vocab, trg_vocab
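# --- Hedged usage sketch for load_data (paths and options are placeholders) ---
# The keys below mirror the ones read from data_cfg in the function body; the
# vocabulary-limit keys are optional and fall back to the defaults shown there.
data_cfg = {
    "src": "de",
    "trg": "en",
    "train": "data/train",      # expects data/train.de and data/train.en
    "dev": "data/dev",
    "test": "data/test",        # optional; may lack the target side
    "level": "word",            # or "char" for character-level fields
    "lowercase": True,
    "max_sent_length": 50,
    "src_voc_min_freq": 1,
    "trg_voc_min_freq": 1,
}

train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(data_cfg)
print(len(train_data), len(dev_data))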
def test_pad(self): # Default case. field = data.Field() minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"], ["one", "last", "sent"]] expected_padded_minibatch = [["a", "sentence", "of", "data", "."], [ "yet", "another", "<pad>", "<pad>", "<pad>" ], ["one", "last", "sent", "<pad>", "<pad>"]] expected_lengths = [5, 2, 3] assert field.pad(minibatch) == expected_padded_minibatch field = data.Field(include_lengths=True) assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) # Test fix_length properly truncates and pads. field = data.Field(fix_length=3) minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"], ["one", "last", "sent"]] expected_padded_minibatch = [["a", "sentence", "of"], ["yet", "another", "<pad>"], ["one", "last", "sent"]] expected_lengths = [3, 2, 3] assert field.pad(minibatch) == expected_padded_minibatch field = data.Field(fix_length=3, include_lengths=True) assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) field = data.Field(fix_length=3, truncate_first=True) expected_padded_minibatch = [["of", "data", "."], ["yet", "another", "<pad>"], ["one", "last", "sent"]] assert field.pad(minibatch) == expected_padded_minibatch # Test init_token is properly handled. field = data.Field(fix_length=4, init_token="<bos>") minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"], ["one", "last", "sent"]] expected_padded_minibatch = [["<bos>", "a", "sentence", "of"], ["<bos>", "yet", "another", "<pad>"], ["<bos>", "one", "last", "sent"]] expected_lengths = [4, 3, 4] assert field.pad(minibatch) == expected_padded_minibatch field = data.Field(fix_length=4, init_token="<bos>", include_lengths=True) assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) # Test init_token and eos_token are properly handled. field = data.Field(init_token="<bos>", eos_token="<eos>") minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"], ["one", "last", "sent"]] expected_padded_minibatch = [ ["<bos>", "a", "sentence", "of", "data", ".", "<eos>"], ["<bos>", "yet", "another", "<eos>", "<pad>", "<pad>", "<pad>"], ["<bos>", "one", "last", "sent", "<eos>", "<pad>", "<pad>"] ] expected_lengths = [7, 4, 5] assert field.pad(minibatch) == expected_padded_minibatch field = data.Field(init_token="<bos>", eos_token="<eos>", include_lengths=True) assert field.pad(minibatch) == (expected_padded_minibatch, expected_lengths) # Test that non-sequential data is properly handled. field = data.Field(init_token="<bos>", eos_token="<eos>", sequential=False) minibatch = [["contradiction"], ["neutral"], ["entailment"]] assert field.pad(minibatch) == minibatch field = data.Field(init_token="<bos>", eos_token="<eos>", sequential=False, include_lengths=True) assert field.pad(minibatch) == minibatch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

spacy_en = spacy.load('en')


# create a tokenizer function
def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


"""
By default a Field expects its input to be a sequence of words and maps each
word to an integer. This mapping is called the vocab. If a field has already
been numericalized and does not need to be treated as a sequence, set
use_vocab=False and sequential=False.
"""
LABEL = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)

# Define the Dataset
# For csv/tsv files, TabularDataset is easy to work with, so we use it to build the Dataset
"""
We do not need the 'PhraseId' and 'SentenceId' columns, so we pass None as
their field. If your data has a header row, like 'Phrase', 'Sentiment', ...
here, set skip_header=True; otherwise the header is treated as a data row.
"""
train, val = data.TabularDataset.splits(path='E:\\ML_data\\torchText\\',
                                        train='train.csv',
                                        validation='val.csv',
                                        format='csv',
                                        skip_header=True,
                                        fields=[('PhraseId', None),
                                                ('SentenceId', None),
                                                ('Phrase', TEXT),
                                                ('Sentiment', LABEL)])
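# --- Hedged continuation sketch (not in the original tutorial excerpt) ---
# Builds the vocabulary and bucketed iterators for the dataset defined above;
# min_freq and batch_size are placeholders.
TEXT.build_vocab(train, min_freq=2)

train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=128,
    device=DEVICE,
    sort_key=lambda x: len(x.Phrase),   # field name declared in the fields list above
    sort_within_batch=True)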
def __init__(
    self,
    train_fn,
    batch_size=64,
    valid_ratio=.1,
    device=-1,
    max_vocab=999999,
    min_freq=1,
    use_eos=False,
    shuffle=True,
):
    '''
    DataLoader initialization.
    :param train_fn: Train-set filename
    :param batch_size: Batchify data for a certain batch size.
    :param device: Device-id to load data (-1 for CPU)
    :param max_vocab: Maximum vocabulary size
    :param min_freq: Minimum frequency for loaded words.
    :param use_eos: If True, put <EOS> after every end of sentence.
    :param shuffle: If True, randomly shuffle the input data.
    '''
    super().__init__()

    # Define the fields of the input file.
    # The input file consists of two fields.
    self.label = data.Field(
        sequential=False,
        use_vocab=True,
        unk_token=None
    )
    self.text = data.Field(
        use_vocab=True,
        batch_first=True,
        include_lengths=False,
        eos_token='<EOS>' if use_eos else None,
    )

    # The two columns are delimited by TAB, so we use TabularDataset to load
    # them from train_fn, then split the result into train and validation
    # sets according to valid_ratio.
    train, valid = data.TabularDataset(
        path=train_fn,
        format='tsv',
        fields=[
            ('label', self.label),
            ('text', self.text),
        ],
    ).split(split_ratio=(1 - valid_ratio))

    # The loaded datasets are fed into the iterators:
    # a train iterator and a valid iterator.
    # We sort input sentences by length, to group similar lengths together.
    self.train_loader, self.valid_loader = data.BucketIterator.splits(
        (train, valid),
        batch_size=batch_size,
        device='cuda:%d' % device if device >= 0 else 'cpu',
        shuffle=shuffle,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
    )

    # Finally, build the vocabularies for the label and text fields,
    # i.e. the mapping tables between words and indices.
    self.label.build_vocab(train)
    self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)
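# --- Hedged usage sketch: the class name DataLoader and the TSV path are ---
# --- assumptions for illustration; the excerpt above only shows __init__. ---
loaders = DataLoader(
    train_fn='data/train.tsv',   # placeholder: a label<TAB>text file
    batch_size=128,
    valid_ratio=.2,
    device=-1,                   # CPU
    max_vocab=50000,
    min_freq=2,
)

print('|train| =', len(loaders.train_loader.dataset),
      '|valid| =', len(loaders.valid_loader.dataset))
print('|vocab| =', len(loaders.text.vocab),
      '|classes| =', len(loaders.label.vocab))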
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print(
        "Warning: CUDA is available but not being used. You are using CPU for training."
    )

TEXT = data.Field(lower=True)
RELATION = data.Field(sequential=False)

train, dev, test = SQdataset.splits(TEXT, RELATION, args.data_dir)
TEXT.build_vocab(train, dev, test)
RELATION.build_vocab(train, dev)

train_iter = data.Iterator(train,
                           batch_size=args.batch_size,
                           device=args.gpu,
                           train=True,
                           repeat=False,
                           sort=False,
                           shuffle=True)
dev_iter = data.Iterator(dev,
                         batch_size=args.batch_size,