def load_checkpoint(name, path=None):
    """
    Load a trained model checkpoint, along with its optimizer state.

    Args:
        name (str): the name of the model
        path (str): the directory in which the model is saved

    Returns:
        the checkpoint state dict (including the model and optimizer state)
    """
    if path is None:
        path = TRAINED_PATH

    # first try to resolve `name` relative to the checkpoint directory
    try:
        model_fname = os.path.join(path, name)
        print("Loading checkpoint `{}` ...".format(model_fname), end=" ")
        with open(model_fname, 'rb') as f:
            state = torch.load(f, map_location="cpu")
        print("done!")
    except FileNotFoundError:
        # fall back to treating `name` as a full path to the checkpoint
        print("Loading checkpoint `{}` ...".format(name), end=" ")
        with open(name, 'rb') as f:
            state = torch.load(f, map_location="cpu")
        print("done!")

    # restore proper Vocab objects from the pickled vocab instance(s)
    if "vocab" in state:
        if isinstance(state["vocab"], tuple):
            state["vocab"] = tuple(Vocab().from_vocab_instance(v)
                                   for v in state["vocab"])
        else:
            state["vocab"] = Vocab().from_vocab_instance(state["vocab"])

    return state
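
# Illustrative usage sketch (not part of the original code): assumes a
# checkpoint named "my_lm" exists under TRAINED_PATH and was saved with
# "vocab" and "config" entries, as accessed elsewhere in this code.
# The name "my_lm" is made up.
#
#   state = load_checkpoint("my_lm")
#   vocab = state["vocab"]          # restored Vocab instance(s)
#   model_config = state["config"]  # configuration the model was trained with
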
def build_vocab_from_file(file, tokenize):
    _vocab = Vocab()
    for line in iterate_data(file):
        tokens = tokenize(line)
        _vocab.read_sequence(tokens)
    return _vocab
def read_corpus(file, tokenize):
    _vocab = Vocab()
    _data = []
    for line in iterate_data(file):
        tokens = tokenize(line)
        _vocab.read_sequence(tokens)
        _data.append(tokens)
    return _vocab, _data
def read_corpus_subw(file, subword_path):
    # load the trained SentencePiece model
    subword = spm.SentencePieceProcessor()
    subword.Load(subword_path + ".model")

    vocab = Vocab(sos="<s>", eos="</s>", unk="<unk>")
    vocab.from_file(subword_path, skip=4)

    # tokenize each line into subword pieces
    _data = []
    for line in iterate_data(file):
        tokens = subword.EncodeAsPieces(line.rstrip().encode('utf-8'))
        _data.append(tokens)

    vocab.subword = subword
    return vocab, _data
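
# Illustrative usage sketch (not part of the original code): reading a corpus
# with plain whitespace tokenization vs. with a trained SentencePiece model.
# The file name "corpus.txt" and the model prefix "subword/en10k" are
# assumptions used only for the example.
#
#   vocab, data = read_corpus("corpus.txt", tokenize=lambda x: x.rstrip().split())
#   sp_vocab, sp_data = read_corpus_subw("corpus.txt", "subword/en10k")
#   # `data` / `sp_data` are lists of token lists; the vocabs map tokens to ids.
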
class BaseSequenceDataset(Dataset, ABC):
    def __init__(self, input, tokenize=None, vocab=None, vocab_size=None,
                 subword_path=None, seq_len=0, sos=False, oovs=0,
                 lang="en", subsample=0, **kwargs):
        """
        Base Dataset for Language Modeling.

        Args:
            tokenize (callable): tokenization callable, which takes as input
                a string and returns a list of tokens
            input (str, list): the path to the data file, or a list of samples.
            vocab (Vocab): a vocab instance. If None, then build a new one
                from the Dataset's data.
            vocab_size (int): if given, then trim the vocab to the given number.
        """
        self.input = input
        self.seq_len = seq_len
        self.subword_path = subword_path
        self.sos = sos
        self.oovs = oovs
        self.subsample = subsample

        # > define tokenization to be used --------------------------------
        if tokenize is not None:
            self.tokenize = tokenize
        else:
            self.tokenize = self.space_tok

            if self.subword_path is not None:
                subword = spm.SentencePieceProcessor()
                subword_path = fix_paths(subword_path, "datasets")
                subword.Load(subword_path + ".model")
                self.tokenize = lambda x: subword.EncodeAsPieces(x.rstrip())
            else:
                self.tokenize = MosesTokenizer(lang=lang).tokenize

        # > Build Vocabulary ----------------------------------------------
        self.vocab, is_vocab_built = self.init_vocab(vocab, subword_path, oovs)

        # > Cache text file -----------------------------------------------
        self.lengths = []
        _is_cached = False

        def _line_callback(x):
            _tokens = self.tokenize(x)
            self.lengths.append(len(self.add_special_tokens(_tokens)))
            if is_vocab_built is False:
                self.vocab.read_sequence(_tokens)

        # ------------------------------------------------------------------
        # If there is a (vocab, lengths) tuple associated with the given
        # input file, then load it from the cache and skip the recalculation
        # ------------------------------------------------------------------
        _ckey = self._get_cache_key(input, vocab, self.tokenize, subword_path,
                                    vocab_size, self.subsample)
        _cfile = os.path.join(os.path.dirname(input), f".cache_{_ckey}")

        if os.path.isfile(_cfile):
            print("Loading data from cache...", end=" ")
            with open(_cfile, "rb") as f:
                _vocab, self.lengths = pickle.load(f)
                self.vocab = Vocab().from_vocab_instance(_vocab)
            print("done!")
            _is_cached = True

        # > Preprocessing --------------------------------------------------
        print("Preprocessing...")
        self.data = DatasetCache(input, callback=_line_callback,
                                 subsample=subsample)

        # if the text file has already been cached, but lengths and vocab
        # are not cached (i.e., new for this input file)
        if _is_cached is False and len(self.lengths) == 0:
            for i in range(len(self.data)):
                _line_callback(self.data[i])

        # trim down the size of a newly created vocab
        if subword_path is None and vocab_size is not None:
            self.vocab.build_lookup(vocab_size)

        # ------------------------------------------------------------------
        # save to cache if not already saved
        # ------------------------------------------------------------------
        if _is_cached is False:
            print("Writing data to cache...")
            with open(_cfile, "wb") as f:
                pickle.dump((self.vocab, self.lengths), f)

        self.lengths = numpy.array(self.lengths)

    @staticmethod
    def init_vocab(vocab=None, subword_path=None, oovs=0):
        _is_prebuilt = True

        # use the given vocab
        if vocab is not None:
            _vocab = vocab

        # load vocab from disk
        elif vocab is None and subword_path is not None:
            _vocab = Vocab(oovs=oovs, subword=subword_path)
            _vocab.load_from_vocab_file(subword_path + ".vocab")

        # build vocab from the tokens in the dataset
        else:
            _vocab = Vocab(oovs=oovs, subword=subword_path)
            _vocab.reset()
            _is_prebuilt = False

        return _vocab, _is_prebuilt
    def dataitem(self, i):
        # tokenize sentence / text
        token_list = self.tokenize(self.data[i])

        # add special tokens such as <BOS> or <EOS>
        token_list = self.add_special_tokens(token_list)

        # vectorize the tokens
        vector = vectorize(token_list, self.vocab)

        return vector

    @staticmethod
    def _get_cache_key(input, vocab, tokenize, subword_path, vocab_size,
                       subsample):
        """
        Compute a hash that uniquely identifies the preprocessed dataset,
        based on the input file (and its modification time), the vocab,
        the tokenizer source and the subword/vocab-size/subsample settings.
        """
        _hash = lambda x: hashlib.sha256(x.encode()).hexdigest()

        _cache_key = _hash(input) + str(os.stat(input).st_mtime)

        if vocab is not None:
            _cache_key += vocab.hash()

        if subsample is not None:
            _cache_key += str(subsample)

        _cache_key += _hash(inspect.getsource(tokenize))
        _cache_key += str(subword_path)
        _cache_key += str(vocab_size)

        return _hash(_cache_key)

    @staticmethod
    def space_tok(text):
        return text.rstrip().split()

    def add_special_tokens(self, tokens):
        tokens = tokens + [self.vocab.EOS]

        if self.sos:
            tokens = [self.vocab.SOS] + tokens

        if self.seq_len > 0:
            tokens = tokens[:self.seq_len]

        return tokens

    def properties(self):
        props = dict()
        props["file"] = os.path.basename(self.input)
        props["examples"] = len(self)
        props["vocab"] = len(self.vocab)
        props["tokens (unique)"] = len(self.vocab.vocab)
        props["tokens (total)"] = number_h(sum(self.lengths))

        # if len(self.data) < 4000000:
        #     unk_id = self.vocab.tok2id[self.vocab.UNK]
        #     _coverage = numpy.mean(
        #         [self.dataitem(i).count(unk_id) / len(self.dataitem(i))
        #          for i in range(len(self.data))])
        #     props["UNKs"] = f"{round((_coverage * 100), 2)} %"

        if hasattr(self, 'seq_len'):
            props["max length"] = self.seq_len

        if hasattr(self, 'bptt'):
            props["BPTT"] = self.bptt

        return props

    def __str__(self):
        return tabulate([self.properties()],
                        headers="keys", floatfmt=".4f", numalign="right")

    def truncate(self, n):
        self.data = self.data[:n]
        self.lengths = self.lengths[:n]
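
# Illustrative sketch (not from the original code): a minimal concrete subclass
# of BaseSequenceDataset. The base class leaves __len__ / __getitem__ to its
# children; the class name `SentenceDataset` and the __getitem__ shown below
# are assumptions, meant only to show how `dataitem` is intended to be used.
#
#   class SentenceDataset(BaseSequenceDataset):
#       def __len__(self):
#           return len(self.data)
#
#       def __getitem__(self, index):
#           vector = self.dataitem(index)   # token ids for sample `index`
#           return vector, len(vector)      # sequence + its length
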
def run(config):
    # -------------------------------------------------------------------------
    # Load pretrained models
    # -------------------------------------------------------------------------
    vocab_src = None
    vocab_trg = None

    # Load the pretrained LM, which will be used for LM-Fusion or as an LM-prior
    if config["data"]["prior_path"] is not None:

        if "gpt2" in config["data"]["prior_path"]:
            _gpt_model = os.path.split(config["data"]["prior_path"])[1]
            tokenizer = GPT2Tokenizer.from_pretrained(_gpt_model)
            vocab_trg = Vocab()
            vocab_trg.from_gpt2(tokenizer)
            _checkp_prior = GPT2LMHeadModel.from_pretrained(_gpt_model)
            config["model"]["dec_padding_idx"] = None
        else:
            _checkp_prior = load_checkpoint(config["data"]["prior_path"])
            vocab_trg = _checkp_prior["vocab"]

            if _checkp_prior["config"]["data"]["subword_path"] is not None:
                sub_path = _checkp_prior["config"]["data"]["subword_path"]
                config["data"]["trg"]["subword_path"] = sub_path

    # -------------------------------------------------------------------------
    # Data Loading and Preprocessing
    # -------------------------------------------------------------------------
    train_loader, val_loader = nmt_dataloaders(config, vocab_src, vocab_trg)

    # -------------------------------------------------------------------------
    # Initialize Model and Priors
    # -------------------------------------------------------------------------
    model_type = config["model"].get("type", "rnn")
    src_ntokens = len(val_loader.dataset.src.vocab)
    trg_ntokens = len(val_loader.dataset.trg.vocab)

    # Initialize Model
    if model_type == "rnn":
        model = Seq2SeqRNN(src_ntokens, trg_ntokens, **config["model"])
    elif model_type == "transformer":
        model = Seq2SeqTransformer(src_ntokens, trg_ntokens, **config["model"])
    else:
        raise NotImplementedError

    model_init(model, **config.get("init", {}))

    # Initialize prior LM
    _has_lm_prior = "prior" in config["losses"]
    _has_lm_fusion = config["model"]["decoding"].get("fusion") is not None

    if _has_lm_prior or _has_lm_fusion:
        if "gpt2" in config["data"]["prior_path"]:
            prior = _checkp_prior
            prior.to(config["device"])
            freeze_module(prior)
            # disable dropout in the (frozen) prior
            for name, module in prior.named_modules():
                if isinstance(module, nn.Dropout):
                    module.p = 0
        else:
            prior = prior_model_from_checkpoint(_checkp_prior)
            prior.to(config["device"])
            freeze_module(prior)
    else:
        prior = None

    model.tie_weights()

    # -------------------------------------------------------------------------
    # Training Pipeline
    # -------------------------------------------------------------------------
    callbacks = [
        LossCallback(config["logging"]["log_interval"]),
        GradientCallback(config["logging"]["log_interval"]),
        ModuleGradientCallback(["encoder"], config["logging"]["log_interval"]),
        SamplesCallback(config["logging"]["log_interval"]),
        EvalCallback(config["logging"]["eval_interval"],
                     keep_best=True,
                     early_stop=config["optim"]["early_stop"])
    ]

    if model_type == "rnn":
        callbacks.append(AttentionCallback(config["logging"]["eval_interval"]))

    eval_interval = config["logging"]["eval_interval"]
    full_eval_interval = config["logging"].get("full_eval_interval",
                                               15 * eval_interval)
    callbacks.append(FunctionCallback(eval_best, full_eval_interval))

    trainer = NmtPriorTrainer(model, train_loader, val_loader, config,
                              config["device"],
                              prior=prior,
                              callbacks=callbacks,
                              src_dirs=config["src_dirs"],
                              resume_state_id=config["resume_state_id"])

    if trainer.exp.has_finished():
        return trainer

    # -------------------------------------------------------------------------
    # Training Loop
    # -------------------------------------------------------------------------
    for epoch in range(config["epochs"]):
        train_loss = trainer.train_epoch()
        val_loss = trainer.eval_epoch()
        print("\n" * 3)

        if trainer.early_stop:
            print("Stopping early ...")
            break

    trainer.exp.finalize()

    return trainer
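
# Illustrative entry point (an assumption, mirroring how the LM training script
# below builds its config with `train_options()`; the actual NMT training
# script may wire this up differently):
#
#   if __name__ == "__main__":
#       opts, config = train_options()
#       trainer = run(config)
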
####################################################################
# SETTINGS
####################################################################
opts, config = train_options()

####################################################################
# Data Loading and Preprocessing
####################################################################
vocab = None

if config["vocab"]["vocab_path"] is not None:
    vocab_path = config["vocab"]["vocab_path"]
    print(f"Loading vocab from '{vocab_path}'...")
    vocab = Vocab()
    vocab.from_file(vocab_path)

if opts.cp_vocab is not None:
    print(f"Loading vocab from checkpoint '{opts.cp_vocab}'...")
    vcp = load_checkpoint(opts.cp_vocab)
    vocab = vcp["vocab"]

if opts.resume:
    checkpoint = load_checkpoint(opts.resume)
    config["vocab"].update(checkpoint["config"]["vocab"])

    if not config["vocab"]["subword"]:
        vocab = checkpoint["vocab"]


def giga_tokenizer(x):