def build(self):
    print('Build Vocabulary from ', self.path)
    tokenize = BuildVocab.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=35,
                 use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
    data = TabularDataset(path=self.path, format='tsv', skip_header=False,
                          fields=datafields)
    TEXT.build_vocab(data, vectors=GloVe(name='6B', dim=300), max_size=1000)
    # train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
    self.stoi = TEXT.vocab.stoi
    self.vectors = TEXT.vocab.vectors
def vocab_builder(self):
    # self.eid_field = Field(sequential=False, tokenize)
    print('Build Vocabulary')
    tokenize = BiGraphTextDataset.tokenize_text
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True,
                 include_lengths=True, batch_first=True, fix_length=35,
                 use_vocab=True)
    datafields = [('eid', None), ('idxP', None), ('idxC', None),
                  ('MaxDegree', None), ('MaxL', None), ('text', TEXT)]
    path = '/data1/home2/AgainstRumor/data/Pheme/data.text.txt'
    train_data = TabularDataset(path=path, format='tsv', skip_header=False,
                                fields=datafields)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    # train_iter = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)
    self.stoi_dict = TEXT.vocab.stoi
    self.vocab_vectors = TEXT.vocab.vectors
def buildDataSets():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Model parameter
    MAX_SEQ_LEN = 16
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields
    label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                        dtype=torch.int8)
    text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False,
                       include_lengths=False, batch_first=True,
                       fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)
    fields = {'label': ('label', label_field), 'text': ('text', text_field)}

    # TabularDataset
    train, valid, test = TabularDataset.splits(path='memesData/data',
                                               train='train.jsonl',
                                               validation='dev_unseen.jsonl',
                                               test='dev_seen.jsonl',
                                               format='JSON', fields=fields)

    # Iterators
    train_iter = BucketIterator(train, batch_size=8,
                                sort_key=lambda x: len(x.text), train=True,
                                sort=True, sort_within_batch=True)
    valid_iter = BucketIterator(valid, batch_size=8,
                                sort_key=lambda x: len(x.text), train=True,
                                sort=True, sort_within_batch=True)
    test_iter = Iterator(test, batch_size=8, train=False, shuffle=False,
                         sort=False)
    return train_iter, valid_iter, test_iter
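# Usage sketch (not from the original): pull one batch from the iterators returned
# by buildDataSets() above and inspect its shape. Assumes the memesData/data JSONL
# files exist as configured in that function.
train_iter, valid_iter, test_iter = buildDataSets()
batch = next(iter(train_iter))
print(batch.text.shape)   # (batch_size, MAX_SEQ_LEN) BERT token ids
print(batch.label.shape)  # (batch_size,) int8 labels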
def tokenizer_from(src_lang: str, tgt_lang: str):
    src_lang = _load_lang(src_lang)
    tgt_lang = _load_lang(tgt_lang)
    info('Building tokenizer')
    src_tok = build_tokenizer(src_lang)
    tgt_tok = build_tokenizer(tgt_lang)
    src = Field(tokenize=src_tok)
    tgt = Field(tokenize=tgt_tok)
    return src, tgt
class DataLoader:
    source: Field = None
    target: Field = None

    def __init__(self, ext, tokenize_en, tokenize_de, sos_token, eos_token):
        self.ext = ext
        self.tokenize_en = tokenize_en
        self.tokenize_de = tokenize_de
        self.sos_token = sos_token
        self.eos_token = eos_token
        print('data initializing start')

    # generate the fields and the Multi30k splits
    def make_dataset(self):
        if self.ext == ('.de', '.en'):
            self.source = Field(tokenize=self.tokenize_de, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_en, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)
        elif self.ext == ('.en', '.de'):
            self.source = Field(tokenize=self.tokenize_en, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)
            self.target = Field(tokenize=self.tokenize_de, init_token=self.sos_token,
                                eos_token=self.eos_token, lower=True, batch_first=True)

        train_data, valid_data, test_data = Multi30k.splits(
            exts=self.ext, fields=(self.source, self.target))
        return train_data, valid_data, test_data

    # build the vocabulary & map tokens to integers
    def build_vocab(self, train_data, min_freq):
        # min_freq: lower bound on a word's frequency for inclusion
        self.source.build_vocab(train_data, min_freq=min_freq)
        self.target.build_vocab(train_data, min_freq=min_freq)

    def make_iter(self, train, validate, test, batch_size, device):
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train, validate, test), batch_size=batch_size, device=device)
        print('dataset initializing done')
        return train_iterator, valid_iterator, test_iterator
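# Usage sketch (not from the original): drive the DataLoader class above end to end.
# tokenize_en / tokenize_de are assumed to be spaCy tokenizer callables as defined in
# other snippets here; batch size and device are illustrative.
loader = DataLoader(ext=('.de', '.en'), tokenize_en=tokenize_en,
                    tokenize_de=tokenize_de, sos_token='<sos>', eos_token='<eos>')
train_data, valid_data, test_data = loader.make_dataset()
loader.build_vocab(train_data, min_freq=2)
train_iter, valid_iter, test_iter = loader.make_iter(
    train_data, valid_data, test_data, batch_size=128, device='cpu')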
def make_SRC_TRG(tokenize_src, tokenize_trg, lower=False, batch_first=True):
    SRC = Field(tokenize=tokenize_src, init_token='<sos>', eos_token='<eos>',
                lower=lower, batch_first=batch_first)
    TRG = Field(tokenize=tokenize_trg, init_token='<sos>', eos_token='<eos>',
                lower=lower, batch_first=batch_first)
    return SRC, TRG
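# Usage sketch (not from the original): wire the fields returned by make_SRC_TRG
# into Multi30k and build the vocabularies. tokenize_de / tokenize_en are assumed
# to be spaCy-based tokenizer callables as defined in other snippets here.
SRC, TRG = make_SRC_TRG(tokenize_de, tokenize_en, lower=True, batch_first=True)
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TRG))
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)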
def make_dataset(self):
    if self.ext == ('.de', '.en'):
        self.source = Field(tokenize=self.tokenize_de, init_token=self.sos_token,
                            eos_token=self.eos_token, lower=True, batch_first=True)
        self.target = Field(tokenize=self.tokenize_en, init_token=self.sos_token,
                            eos_token=self.eos_token, lower=True, batch_first=True)
    elif self.ext == ('.en', '.de'):
        self.source = Field(tokenize=self.tokenize_en, init_token=self.sos_token,
                            eos_token=self.eos_token, lower=True, batch_first=True)
        self.target = Field(tokenize=self.tokenize_de, init_token=self.sos_token,
                            eos_token=self.eos_token, lower=True, batch_first=True)

    train_data, valid_data, test_data = Multi30k.splits(
        exts=self.ext, fields=(self.source, self.target))
    return train_data, valid_data, test_data
def prepare_data(args):
    TEXT = Field(lower=True, include_lengths=True, batch_first=True,
                 tokenize='spacy', tokenizer_language="en_core_web_sm")
    LABEL = Field(sequential=False)

    # make splits for data
    print("Creating splits")
    if args.subset:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./subdata')
    else:
        train, dev, test = SNLI.splits(TEXT, LABEL, root='./data')

    print("Loading GloVe")
    glove = torchtext.vocab.GloVe(name='840B', dim=300)

    print("Aligning GloVe vocab")
    TEXT.build_vocab(train, vectors=glove)
    LABEL.build_vocab(train, specials_first=False)
    n_vocab = len(TEXT.vocab.itos)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    print("Creating BucketIterator")
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_sizes=(args.batch, 256, 256), device=device,
        shuffle=False)
    return TEXT, train_iter, dev_iter, test_iter
def prepare(params, samples):
    # print(type(params))
    # print(type(samples))
    TEXT = Field(lower=True, include_lengths=True, batch_first=True,
                 tokenize='spacy', tokenizer_language="en_core_web_sm")
    # data = [' '.join(s) for s in samples],
    data = samples
    # print("data", len(data[0]))
    # print(data)
    TEXT.build_vocab(data, vectors=params.glove)
    params.model.emb_vec = torch.nn.Embedding.from_pretrained(
        TEXT.vocab.vectors, freeze=True).to(device=params.device)
    params["TEXT"] = TEXT
def __init__(self,
             fileParams={},
             tokenizationOption='regex',
             seedParams={'nFirst': 1, 'minFreq': 5},
             fieldParams={'lower': True, 'eos_token': '<!EOS!>'},
             spacyObj=None):
    self.__fileName, self.__fileExtension, self.__parsingColumn = checkFileParams(
        fileParams)
    self.__seedParams = checkSeedParams(seedParams)
    self.__DataVocab = Field(**fieldParams)
    self.__spacyObj = spacyObj
    self.__customTokenize = self.__tokenizationMethod(tokenizationOption)
    self.__readFile()
def get_dataset(path_do_data: str, transformer: bool) -> TabularDataset:
    SRC = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>',
                lower=True, batch_first=False)
    TRG = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>',
                lower=True, batch_first=False)
    dataset = TabularDataset(path=path_do_data, format='tsv',
                             fields=[('trg', TRG), ('src', SRC)])
    return SRC, TRG, dataset
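# Usage sketch (not from the original): split the TabularDataset returned by
# get_dataset, build the vocabularies, and create iterators. The path, split ratio,
# and batch size are illustrative assumptions; BucketIterator is assumed to be
# imported from torchtext.legacy.data as in other snippets here.
SRC, TRG, dataset = get_dataset('data/pairs.tsv', transformer=False)
train_data, valid_data = dataset.split(split_ratio=0.8)
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
train_iter, valid_iter = BucketIterator.splits(
    (train_data, valid_data), batch_size=64,
    sort_key=lambda x: len(x.src), sort_within_batch=True)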
def __init__(self, device=None, jit=False):
    super().__init__()
    self.device = device
    self.jit = jit

    # Download and load the default data.
    WORD = Field(include_lengths=True)
    UD_TAG = Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)

    train, val, test = UDPOS.splits(
        fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
        filter_pred=lambda ex: 5 < len(ex.word) < 30,
    )
    WORD.build_vocab(train.word, min_freq=3)
    UD_TAG.build_vocab(train.udtag)
    self.train_iter = torch_struct.data.TokenBucket(train, batch_size=100,
                                                    device=device)

    H = 256
    T = 30
    NT = 30
    self.model = NeuralCFG(len(WORD.vocab), T, NT, H)
    if jit:
        self.model = torch.jit.script(self.model)
    self.model.to(device=device)
    self.opt = torch.optim.Adam(self.model.parameters(), lr=0.001,
                                betas=[0.75, 0.999])

    for i, ex in enumerate(self.train_iter):
        words, lengths = ex.word
        self.words = words.long().to(device).transpose(0, 1)
        self.lengths = lengths.to(device)
        break
device = 'cuda' if torch.cuda.is_available() else 'cpu'

spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')


def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenizer_ger, lower=True, init_token='<sos>',
               eos_token='<eos>')
english = Field(tokenize=tokenizer_eng, lower=True, init_token='<sos>',
                eos_token='<eos>')

train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'), fields=(german, english))

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

# model
parser = argparse.ArgumentParser()
parser.add_argument('--debug', metavar='fn', default="",
                    help="Dump outputs into file")
parser.add_argument('--script', default=False, help="Script the model")
args = parser.parse_args()

random.seed(1337)
torch.manual_seed(1337)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Download and load the default data.
WORD = Field(include_lengths=True)
UD_TAG = Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)

train, val, test = UDPOS.splits(
    fields=(("word", WORD), ("udtag", UD_TAG), (None, None)),
    filter_pred=lambda ex: 5 < len(ex.word) < 30,
)
WORD.build_vocab(train.word, min_freq=3)
UD_TAG.build_vocab(train.udtag)
train_iter = torch_struct.data.TokenBucket(train, batch_size=100,
                                           device="cuda:0")

H = 256
# assumed: the German model also needs to be loaded for tokenize_de below
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>',
            lower=True, batch_first=True)
TGT = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>',
            lower=True, batch_first=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(SRC, TGT))
SRC.build_vocab(train_data, min_freq=2)
TGT.build_vocab(train_data, min_freq=2)
# embedding = FastText('simple')
embedding = GloVe(name='6B', dim=50)

data_dir = './data/sats-data/'
train_ = np.load(data_dir + 'train_sents.npy', allow_pickle=True)
train_labels = np.load(data_dir + 'labels_train.npy', allow_pickle=True)
eval_ = np.load(data_dir + 'eval_sents.npy', allow_pickle=True)
eval_labels = np.load(data_dir + 'labels_val.npy', allow_pickle=True)

texts = np.concatenate((train_, eval_))
labels = np.concatenate((train_labels, eval_labels))
df = pd.DataFrame({'text': texts, 'label': labels})

text_field = Field(sequential=True, tokenize='basic_english', fix_length=5,
                   lower=True)
label_field = Field(sequential=False, use_vocab=False, is_target=True)

preprocessed_text = df['text'].apply(lambda x: text_field.preprocess(x))
# text_field.build_vocab(preprocessed_text, vectors='fasttext.simple.300d')
text_field.build_vocab(preprocessed_text, vectors='glove.6B.50d')
vocab = text_field.vocab

ltoi = {l: i for i, l in enumerate(df['label'].unique())}
df['label'] = df['label'].apply(lambda y: ltoi[y])


class DataFrameDataset(torchtext.legacy.data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

start_debugger_on_exception()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cuda:6')

# Model parameter
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Fields
label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False,
                   include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX,
                   unk_token=UNK_INDEX)
fields = [('index', label_field), ('text', text_field), ('label', label_field)]

# TabularDataset
train, valid, test = TabularDataset.splits(path='./data',
                                           train='IMDB_single.csv',
    wandb.run.name = wandb.run.name + '-fc'
elif transition_type == 'depth_wise_conv':
    wandb.run.name = wandb.run.name + '-dwc'
wandb.run.save()

train_loss_key = "Train loss"
valid_loss_key = "Validation loss"
test_loss_key = "Test loss"
char_accuracy_key = "Character accuracy"
seq_accuracy_key = "Sequence accuracy"

""" Preparing Data """
tokenize = lambda x: x.split()
INPUT = Field(sequential=True, tokenize=tokenize, init_token='<sos>',
              eos_token='<eos>', lower=True)
TARGET = Field(sequential=True, tokenize=tokenize, init_token='<sos>',
               eos_token='<eos>', lower=True)
datafields = [("input", INPUT), ("target", TARGET)]
trn, vld, tst = TabularDataset.splits(path="data/" + data_size,
                                      train=train_csv,
                                      validation=validation_csv,
                                      test=test_csv,
                                      format='csv',
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field
from torchtext.legacy.data import LabelField
from torchtext.legacy import datasets
from nltk.tokenize import word_tokenize
import torch.nn as nn
import random
import torch.optim as optim
import time

tokenizer = word_tokenize

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = Field(tokenize=tokenizer, include_lengths=True)
LABEL = LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.300d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
from torchtext.legacy.data import Field, BucketIterator
import spacy
import random
import torch.optim as opt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

eng = spacy.load('en')
ger = spacy.load('de_core_news_sm')


def Tokenize_eng(text):
    return [a.text for a in eng.tokenizer(text)]


def Tokenize_german(text):
    return [b.text for b in ger.tokenizer(text)]


german = Field(tokenize=Tokenize_german, lower=True, init_token='<sos>',
               eos_token='<eos>')
english = Field(tokenize=Tokenize_eng, lower=True, init_token='<sos>',
                eos_token='<eos>')

Train, Val, Test = Multi30k.splits(exts=('.de', '.en'),
                                   fields=(german, english))

german.build_vocab(Train, max_size=10000, min_freq=2)
english.build_vocab(Train, max_size=10000, min_freq=2)


## building encoder
class Encoder(Module):
    def __init__(self, inp_size, emd_size, hidden_size):
        super(Encoder, self).__init__()
        self.inp_size = inp_size
        self.emd_size = emd_size
        self.hidden_size = hidden_size
        self.drop = Dropout(0.5)
import torchtext
import torch.nn as nn
import torch.optim as optim
import spacy
from torchtext.legacy.data import Field, TabularDataset, BucketIterator

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# python -m spacy download en
spacy_en = spacy.load("en")


def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


Texto = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
Valoracion = Field(sequential=False, use_vocab=False)

fields = {"Texto": ("t", Texto), "Valoracion": ("v", Valoracion)}

train_data, test_data = TabularDataset.splits(path='/content/Dataset',
                                              train='train.csv',
                                              test='test.csv',
                                              format='csv',
                                              fields=fields)

len(train_data), len(test_data)

print(vars(train_data.examples[0]))
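# Follow-up sketch (not from the original): build the vocabulary for the text field
# and create iterators over the CSV splits loaded above. The vocabulary size and
# batch size are illustrative assumptions.
Texto.build_vocab(train_data, max_size=10000, min_freq=2)
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=32, device=device,
    sort_key=lambda x: len(x.t), sort_within_batch=True)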
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-raw_dir', required=True)
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-codes', required=True)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-prefix', required=True)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('--symbols', '-s', type=int, default=32000,
                        help="Vocabulary size")
    parser.add_argument('--min-frequency', type=int, default=6, metavar='FREQ',
                        help='Stop if no symbol pair has frequency >= FREQ '
                             '(default: %(default)s)')
    parser.add_argument('--dict-input', action="store_true",
                        help="If set, input file is interpreted as a dictionary "
                             "where each line contains a word-count pair")
    parser.add_argument('--separator', type=str, default='@@', metavar='STR',
                        help="Separator between non-final subword units "
                             "(default: '%(default)s')")
    parser.add_argument('--total-symbols', '-t', action="store_true")
    opt = parser.parse_args()

    # Create folders if needed.
    mkdir_if_needed(opt.raw_dir)
    mkdir_if_needed(opt.data_dir)

    # Download and extract the raw data.
    raw_train = get_raw_files(opt.raw_dir, _TRAIN_DATA_SOURCES)
    raw_val = get_raw_files(opt.raw_dir, _VAL_DATA_SOURCES)
    raw_test = get_raw_files(opt.raw_dir, _TEST_DATA_SOURCES)

    # Merge files into one.
    train_src, train_trg = compile_files(opt.raw_dir, raw_train,
                                         opt.prefix + '-train')
    val_src, val_trg = compile_files(opt.raw_dir, raw_val, opt.prefix + '-val')
    test_src, test_trg = compile_files(opt.raw_dir, raw_test,
                                       opt.prefix + '-test')

    # Build the BPE codes from the training files if they do not exist yet.
    opt.codes = os.path.join(opt.data_dir, opt.codes)
    if not os.path.isfile(opt.codes):
        sys.stderr.write(
            f"Collect codes from training data and save to {opt.codes}.\n")
        learn_bpe(raw_train['src'] + raw_train['trg'], opt.codes, opt.symbols,
                  opt.min_frequency, True)
    sys.stderr.write("BPE codes prepared.\n")

    sys.stderr.write("Build up the tokenizer.\n")
    with codecs.open(opt.codes, encoding='utf-8') as codes:
        bpe = BPE(codes, separator=opt.separator)

    sys.stderr.write("Encoding ...\n")
    encode_files(bpe, train_src, train_trg, opt.data_dir, opt.prefix + '-train')
    encode_files(bpe, val_src, val_trg, opt.data_dir, opt.prefix + '-val')
    encode_files(bpe, test_src, test_trg, opt.data_dir, opt.prefix + '-test')
    sys.stderr.write("Done.\n")

    field = Field(tokenize=str.split, lower=True,
                  pad_token=Constants.PAD_WORD,
                  init_token=Constants.BOS_WORD,
                  eos_token=Constants.EOS_WORD)
    fields = (field, field)

    MAX_LEN = opt.max_len

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    enc_train_files_prefix = opt.prefix + '-train'
    train = TranslationDataset(fields=fields,
                               path=os.path.join(opt.data_dir,
                                                 enc_train_files_prefix),
                               exts=('.src', '.trg'),
                               filter_pred=filter_examples_with_length)

    from itertools import chain
    field.build_vocab(chain(train.src, train.trg), min_freq=2)

    data = {'settings': opt, 'vocab': field}
    opt.save_data = os.path.join(opt.data_dir, opt.save_data)
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
def translate(cfg_file: str,
              ckpt: str,
              output_path: str = None,
              batch_class: Batch = Batch,
              n_best: int = 1) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file
    :param batch_class: class type of batch
    :param n_best: amount of candidates to display
    """

    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size,
                batch_class=batch_class, batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, compute_loss=False, beam_size=beam_size,
                beam_alpha=beam_alpha, postprocess=postprocess,
                bpe_type=bpe_type, sacrebleu=sacrebleu, n_gpu=n_gpu,
                n_best=n_best)
        return hypotheses

    cfg = load_config(cfg_file)
    model_dir = cfg["training"]["model_dir"]

    _ = make_logger(model_dir, mode="translate")  # version string returned

    # when checkpoint is not specified, take oldest from model dir
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)

    # read vocabs
    src_vocab_file = cfg["data"].get("src_vocab", model_dir + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get("trg_vocab", model_dir + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      batch_first=True, lower=lowercase,
                      unk_token=UNK_TOKEN, include_lengths=True)
    src_field.vocab = src_vocab

    # parse test args
    batch_size, batch_type, use_cuda, device, n_gpu, level, _, \
        max_output_length, beam_size, beam_alpha, postprocess, \
        bpe_type, sacrebleu, _, _ = parse_test_args(cfg, mode="translate")

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.to(device)

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        all_hypotheses = _translate_data(test_data)

        if output_path is not None:
            # write to output file if given

            def write_to_file(output_path_set, hypotheses):
                with open(output_path_set, mode="w", encoding="utf-8") \
                        as out_file:
                    for hyp in hypotheses:
                        out_file.write(hyp + "\n")
                logger.info("Translations saved to: %s.", output_path_set)

            if n_best > 1:
                for n in range(n_best):
                    file_name, file_extension = os.path.splitext(output_path)
                    write_to_file(
                        "{}-{}{}".format(
                            file_name, n,
                            file_extension if file_extension else ""),
                        [all_hypotheses[i]
                         for i in range(n, len(all_hypotheses), n_best)])
            else:
                write_to_file("{}".format(output_path), all_hypotheses)
        else:
            # print to stdout
            for hyp in all_hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        batch_type = "sentence"
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into a dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: Hypotheses ranked by score")
                for i, hyp in enumerate(hypotheses):
                    print("JoeyNMT #{}: {}".format(i + 1, hyp))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
def main_wo_bpe():
    '''
    Usage: python preprocess.py -lang_src de -lang_trg en -save_data multi30k_de_en.pkl -share_vocab
    '''

    spacy_support_langs = ['de', 'el', 'en', 'es', 'fr', 'it', 'lt', 'nb', 'nl', 'pt']

    parser = argparse.ArgumentParser()
    parser.add_argument('-lang_src', required=True, choices=spacy_support_langs)
    parser.add_argument('-lang_trg', required=True, choices=spacy_support_langs)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-data_src', type=str, default=None)
    parser.add_argument('-data_trg', type=str, default=None)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('-min_word_count', type=int, default=3)
    parser.add_argument('-keep_case', action='store_true')
    parser.add_argument('-share_vocab', action='store_true')
    # parser.add_argument('-ratio', '--train_valid_test_ratio', type=int, nargs=3, metavar=(8,1,1))
    # parser.add_argument('-vocab', default=None)

    opt = parser.parse_args()
    assert not any([opt.data_src, opt.data_trg]), \
        'Custom data input is not supported yet.'
    assert not any([opt.data_src, opt.data_trg]) or all([opt.data_src, opt.data_trg])
    print(opt)

    src_lang_model = spacy.load(opt.lang_src)
    trg_lang_model = spacy.load(opt.lang_trg)

    def tokenize_src(text):
        return [tok.text for tok in src_lang_model.tokenizer(text)]

    def tokenize_trg(text):
        return [tok.text for tok in trg_lang_model.tokenizer(text)]

    SRC = Field(tokenize=tokenize_src, lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD, init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)
    TRG = Field(tokenize=tokenize_trg, lower=not opt.keep_case,
                pad_token=Constants.PAD_WORD, init_token=Constants.BOS_WORD,
                eos_token=Constants.EOS_WORD)

    MAX_LEN = opt.max_len
    MIN_FREQ = opt.min_word_count

    if not all([opt.data_src, opt.data_trg]):
        assert {opt.lang_src, opt.lang_trg} == {'de', 'en'}
    else:
        # Pack custom txt files into example datasets
        raise NotImplementedError

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    train, val, test = Multi30k.splits(
        exts=('.' + opt.lang_src, '.' + opt.lang_trg),
        fields=(SRC, TRG),
        filter_pred=filter_examples_with_length)

    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    print('[Info] Get source language vocabulary size:', len(SRC.vocab))
    TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
    print('[Info] Get target language vocabulary size:', len(TRG.vocab))

    if opt.share_vocab:
        print('[Info] Merging two vocabulary ...')
        for w, _ in SRC.vocab.stoi.items():
            # TODO: Also update the `freq`, although it is not likely to be used.
            if w not in TRG.vocab.stoi:
                TRG.vocab.stoi[w] = len(TRG.vocab.stoi)
        TRG.vocab.itos = [None] * len(TRG.vocab.stoi)
        for w, i in TRG.vocab.stoi.items():
            TRG.vocab.itos[i] = w
        SRC.vocab.stoi = TRG.vocab.stoi
        SRC.vocab.itos = TRG.vocab.itos
        print('[Info] Get merged vocabulary size:', len(TRG.vocab))

    data = {
        'settings': opt,
        'vocab': {'src': SRC, 'trg': TRG},
        'train': train.examples,
        'valid': val.examples,
        'test': test.examples}

    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>",
                   eos_token="<eos>")
    english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".de", ".en"), fields=(german, english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get a nice loss plot
    writer = SummaryWriter(f"runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)
    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab), device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has "
        f"{count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model, optimizer)

    sentence = ("ein boot mit mehreren männern darauf wird von einem großen "
                "pferdegespann ans ufer gezogen.")

    for epoch in range(num_epochs):
        print(
            f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]"
        )

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        # save_checkpoint(checkpoint)

        model.eval()
        translated_sentence = translate_sentence(model, sentence, german,
                                                 english, device, max_length=50)
        print(f"Translated example sentence: \n {translated_sentence}")
        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and move them to the device
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)
            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
            # doesn't take input in that form. For example, with MNIST we want the
            # output to be (N, 10) and targets just (N). Here we can view it in a
            # similar way: we have output_words * batch_size examples to send into
            # the cost function, so we need to do some reshaping. Let's also remove
            # the start token while we're at it.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues; makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score * 100:.2f}")
batch_size = 8

tokenizer = AutoTokenizer.from_pretrained(phobert_path, use_fast=False)

init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token

init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

TEXT = Field(batch_first=True,
             use_vocab=False,
             tokenize=tokenize_and_cut,
             preprocessing=tokenizer.convert_tokens_to_ids,
             init_token=init_token_idx,
             eos_token=eos_token_idx,
             pad_token=pad_token_idx,
             unk_token=unk_token_idx)
LABEL = LabelField(dtype=torch.long, use_vocab=False)

fields = [('data', TEXT), ('label', LABEL)]

train, valid, test = TabularDataset.splits(path=source_folder,
                                           train='train.csv',
                                           validation='validation.csv',
                                           test='test.csv',
                                           format='CSV',
                                           fields=fields,
                                           skip_header=True)

train_generator, val_generator, test_generator = BucketIterator.splits(
    (train, valid, test), batch_size=batch_size, device=device, sort=False)

criterion = nn.CrossEntropyLoss()
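# Training-step sketch (assumption, not from the original): `model` stands for a
# hypothetical classifier mapping padded token-id batches to class logits, and
# `optimizer` for a hypothetical torch optimizer; only the field names, the
# criterion, and the BucketIterator splits come from the snippet above.
for batch in train_generator:
    optimizer.zero_grad()
    logits = model(batch.data)             # batch.data: (batch_size, seq_len) token ids
    loss = criterion(logits, batch.label)  # batch.label: (batch_size,) long labels
    loss.backward()
    optimizer.step()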
import flor
from multiprocessing import set_start_method

try:
    set_start_method("spawn")
except RuntimeError:
    pass

flor.flags.NAME = "kaggle-nlp-disasters-rnn"
flor.flags.REPLAY = False

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(
    tokenize=flor.log("tokenizer", "spacy"),
    lower=True,
    include_lengths=True,
    batch_first=True,
)
fields = [("words", text_field), ("target", label_field)]
fields_test = [("words", text_field)]

train, valid = TabularDataset.splits(
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
def test_xnli(self):
    batch_size = 4

    # create fields
    TEXT = Field()
    GENRE = LabelField()
    LABEL = LabelField()
    LANGUAGE = LabelField()

    # create val/test splits, XNLI does not have a training set
    val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE)

    # check both are XNLI datasets
    assert type(val) == type(test) == XNLI

    # check all have the correct number of fields
    assert len(val.fields) == len(test.fields) == 5

    # check fields are the correct type
    assert type(val.fields['premise']) == Field
    assert type(val.fields['hypothesis']) == Field
    assert type(val.fields['label']) == LabelField
    assert type(val.fields['genre']) == LabelField
    assert type(val.fields['language']) == LabelField

    assert type(test.fields['premise']) == Field
    assert type(test.fields['hypothesis']) == Field
    assert type(test.fields['label']) == LabelField
    assert type(test.fields['genre']) == LabelField
    assert type(test.fields['language']) == LabelField

    # check each split has the correct length
    assert len(val) == 37350
    assert len(test) == 75150

    # build vocabulary
    TEXT.build_vocab(val)
    LABEL.build_vocab(val)
    GENRE.build_vocab(val)
    LANGUAGE.build_vocab(val)

    # ensure vocabulary has been created
    assert hasattr(TEXT, 'vocab')
    assert hasattr(TEXT.vocab, 'itos')
    assert hasattr(TEXT.vocab, 'stoi')

    # create iterators
    val_iter, test_iter = Iterator.splits((val, test), batch_size=batch_size)

    # get a batch to test
    batch = next(iter(val_iter))

    # split premise and hypothesis from tuples to tensors
    premise = batch.premise
    hypothesis = batch.hypothesis
    label = batch.label
    genre = batch.genre
    language = batch.language

    # check each is actually a tensor
    assert type(premise) == torch.Tensor
    assert type(hypothesis) == torch.Tensor
    assert type(label) == torch.Tensor
    assert type(genre) == torch.Tensor
    assert type(language) == torch.Tensor

    # check they have the correct batch dimension
    assert premise.shape[-1] == batch_size
    assert hypothesis.shape[-1] == batch_size
    assert label.shape[-1] == batch_size
    assert genre.shape[-1] == batch_size
    assert language.shape[-1] == batch_size

    # XNLI cannot use the iters method, ensure it raises an error
    with self.assertRaises(NotImplementedError):
        val_iter, test_iter = XNLI.iters(batch_size=batch_size)

    # remove downloaded xnli directory
    shutil.rmtree('.data/xnli')
from multiprocessing import set_start_method

try:
    set_start_method("spawn")
except RuntimeError:
    pass

flor.flags.NAME = "kaggle-nlp-disasters-rnn"
flor.flags.REPLAY = False

device = flor.log(
    "device",
    torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
device

label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize="spacy", lower=True, include_lengths=True,
                   batch_first=True)
fields = [("words", text_field), ("target", label_field)]
fields_test = [("words", text_field)]

train, valid = TabularDataset.splits(
    path="data",
    train="train_rnn.csv",
    validation="valid_rnn.csv",
    format="CSV",
    fields=fields,
    skip_header=True,
class Preprocessing(Dataset):

    __tokPattern = r"""[0-9A-Za-z_]*[A-Za-z_-]+[0-9A-Za-z_]*|\.|\!|\?|\d+|\-|%|[.,!?;'"]"""
    __supportedExtensions = ['txt', 'csv', 'json']
    __seedAttrs = ['nFirst', 'minFreq']

    def __init__(self,
                 fileParams={},
                 tokenizationOption='regex',
                 seedParams={'nFirst': 1, 'minFreq': 5},
                 fieldParams={'lower': True, 'eos_token': '<!EOS!>'},
                 spacyObj=None):
        self.__fileName, self.__fileExtension, self.__parsingColumn = checkFileParams(
            fileParams)
        self.__seedParams = checkSeedParams(seedParams)
        self.__DataVocab = Field(**fieldParams)
        self.__spacyObj = spacyObj
        self.__customTokenize = self.__tokenizationMethod(tokenizationOption)
        self.__readFile()

    @property
    def getFileName(self):
        return self.__fileName

    @property
    def getVocab(self):
        return self.__DataVocab

    def __readFile(self):
        text = readFiles(self.__fileName, self.__fileExtension,
                         self.__parsingColumn)
        self.examples = self.__getObjects(text)
        self.__seeds = getStartWords(self.__seedParams, text)
        self.__build_vocab()

    def __getObjects(self, text):
        self.fields = {"src": self.__DataVocab}
        return [Document(**self.__tokenize(instance)) for instance in text]

    def __build_vocab(self):
        self.__DataVocab.build_vocab(self)
        for instance in self.examples:
            instance.create_tokens(self.__DataVocab)

    def __regexTokenization(self, document):
        return re.findall(self.__tokPattern, document)

    def __nltkTokenization(self, document):
        return self.tokenizer(document)

    def __spacyTokenization(self, instance):
        return [
            entity.text.strip() for entity in self.__spacyObj(instance)
            if entity.text.strip()
        ]

    def __tokenize(self, instance):
        instance = self.__customTokenize(instance)
        return {'src': instance, 'trg': instance[1:]}

    @checkParams(str)
    def __tokenizationMethod(self, param):
        param = param.lower()
        if param == 'nltk':
            self.tokenizer = importNltk()
            return self.__nltkTokenization
        elif param == 'regex':
            return self.__regexTokenization
        elif param == 'spacy':
            if not self.__spacyObj:
                raise Exception(
                    "Please provide the spacy object to tokenize with.")
            return self.__spacyTokenization
        raise Exception(
            "The parameter 'tokenizationOption' can only be nltk, regex or spacy")

    def getSeed(self):
        """
        Return a weighted seed. If static seeding is enabled, the most
        frequent token is used as the seed.
        """
        seeds = list(self.__seeds.keys())
        probs = list(self.__seeds.values())
        return choice(seeds, 1, probs).tolist()