Example #1
def load_checkpoint(name, path=None):
    """
    Load a trained model, along with its optimizer
    Args:
        name (str): the name of the model
        path (str): the directory, in which the model is saved

    Returns:
        model, optimizer

    """
    if path is None:
        path = TRAINED_PATH

    # model_fname = os.path.join(path, "{}.pt".format(name))
    try:
        model_fname = os.path.join(path, name)
        print("Loading checkpoint `{}` ...".format(model_fname), end=" ")
        with open(model_fname, 'rb') as f:
            state = torch.load(f, map_location="cpu")
        print("done!")
    except FileNotFoundError:
        # fall back to treating `name` as a full path to the checkpoint
        print("Loading checkpoint `{}` ...".format(name), end=" ")
        with open(name, 'rb') as f:
            state = torch.load(f, map_location="cpu")
        print("done!")

    if "vocab" in state:
        if isinstance(state["vocab"], tuple):
            state["vocab"] = tuple(Vocab().from_vocab_instance(v)
                                   for v in state["vocab"])
        else:
            state["vocab"] = Vocab().from_vocab_instance(state["vocab"])

    return state
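A minimal usage sketch for the function above. The checkpoint name, the "model"/"optimizer" keys inside the state dict, and the `build_model` helper are assumptions for illustration, not part of the example itself:

import torch

# Hypothetical usage: restore a model and optimizer from a saved checkpoint.
state = load_checkpoint("lm_en_prior")           # name is a placeholder
model = build_model(state["config"])             # assumed project helper
model.load_state_dict(state["model"])            # assumed key in the state dict
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(state["optimizer"])    # assumed key in the state dict
vocab = state.get("vocab")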
Example #2
def build_vocab_from_file(file, tokenize):
    _vocab = Vocab()

    for line in iterate_data(file):
        tokens = tokenize(line)
        _vocab.read_sequence(tokens)

    return _vocab
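A hedged usage sketch: `iterate_data` and `Vocab` come from the surrounding project, the corpus path is a placeholder, and the whitespace tokenizer below is only an illustration of a valid `tokenize` callable:

# Hypothetical usage: build a vocabulary over a whitespace-tokenized corpus.
def whitespace_tokenize(line):
    return line.rstrip().split()

vocab = build_vocab_from_file("data/train.en", whitespace_tokenize)
print(len(vocab))  # number of entries accumulated in the Vocab instance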
Example #3
def read_corpus(file, tokenize):
    _vocab = Vocab()

    _data = []
    for line in iterate_data(file):
        tokens = tokenize(line)
        _vocab.read_sequence(tokens)
        _data.append(tokens)

    return _vocab, _data
Example #4
def read_corpus_subw(file, subword_path):
    subword = spm.SentencePieceProcessor()
    subword.Load(subword_path + ".model")

    vocab = Vocab(sos="<s>", eos="</s>", unk="<unk>")
    vocab.from_file(subword_path, skip=4)

    _data = []
    for line in iterate_data(file):
        # EncodeAsPieces expects a (unicode) str in Python 3
        tokens = subword.EncodeAsPieces(line.rstrip())
        _data.append(tokens)

    vocab.subword = subword

    return vocab, _data
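read_corpus_subw expects a sentencepiece model/vocab pair at subword_path + ".model" and subword_path + ".vocab". A minimal sketch of producing such a pair with the sentencepiece trainer; the file names and vocab size are placeholders:

import sentencepiece as spm

# Train a subword model; this writes "subword.en.model" and "subword.en.vocab",
# which can then be passed as subword_path="subword.en".
spm.SentencePieceTrainer.Train(
    "--input=data/train.en "
    "--model_prefix=subword.en "
    "--vocab_size=16000 "
    "--model_type=unigram"
)

vocab, data = read_corpus_subw("data/train.en", "subword.en")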
Example #5
    def init_vocab(vocab=None, subword_path=None, oovs=0):
        _is_prebuilt = True

        # use the given vocab
        if vocab is not None:
            _vocab = vocab

        # load vocab from disk
        elif vocab is None and subword_path is not None:
            _vocab = Vocab(oovs=oovs, subword=subword_path)
            _vocab.load_from_vocab_file(subword_path + ".vocab")

        # build vocab from the tokens in the dataset
        else:
            _vocab = Vocab(oovs=oovs, subword=subword_path)
            _vocab.reset()
            _is_prebuilt = False
        return _vocab, _is_prebuilt
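The helper resolves the vocabulary in one of three ways; a short sketch of the three call patterns (`existing_vocab` and "subword.en" are placeholders):

# 1) reuse an existing Vocab instance (e.g., taken from a loaded checkpoint)
vocab, prebuilt = init_vocab(vocab=existing_vocab)        # prebuilt is True

# 2) load a sentencepiece vocab from disk ("subword.en.vocab" must exist)
vocab, prebuilt = init_vocab(subword_path="subword.en")   # prebuilt is True

# 3) start empty and let the caller fill it from the dataset's tokens
vocab, prebuilt = init_vocab(oovs=0)                      # prebuilt is False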
Example #6
    def __init__(self,
                 input,
                 tokenize=None,
                 vocab=None,
                 vocab_size=None,
                 subword_path=None,
                 seq_len=0,
                 sos=False,
                 oovs=0,
                 lang="en",
                 subsample=0,
                 **kwargs):
        """
        Base Dataset for Language Modeling.

        Args:
            tokenize (callable): tokenization callable, which takes as input
                a string and returns a list of tokens
            input (str, list): the path to the data file, or a list of samples.
            vocab (Vocab): a vocab instance. If None, then build a new one
                from the dataset's data.
            vocab_size (int): if given, then trim the vocab to the given size.
        """
        self.input = input
        self.seq_len = seq_len
        self.subword_path = subword_path
        self.sos = sos
        self.oovs = oovs
        self.subsample = subsample

        # > define tokenization to be used -------------------------------
        if tokenize is not None:
            self.tokenize = tokenize
        else:
            self.tokenize = self.space_tok

        if self.subword_path is not None:
            subword = spm.SentencePieceProcessor()
            subword_path = fix_paths(subword_path, "datasets")
            subword.Load(subword_path + ".model")
            self.tokenize = lambda x: subword.EncodeAsPieces(x.rstrip())
        elif tokenize is None:
            # keep a user-provided tokenizer; otherwise default to Moses
            self.tokenize = MosesTokenizer(lang=lang).tokenize

        # > Build Vocabulary --------------------------------------------
        self.vocab, is_vocab_built = self.init_vocab(vocab, subword_path, oovs)

        # > Cache text file ---------------------------------------------
        self.lengths = []
        _is_cached = False

        def _line_callback(x):
            _tokens = self.tokenize(x)
            self.lengths.append(len(self.add_special_tokens(_tokens)))

            if is_vocab_built is False:
                self.vocab.read_sequence(_tokens)

        # -------------------------------------------------------------
        # If there is a (vocab, lengths) tuple associated with the given input
        # file, then load them from cache and skip the recalculation
        # -------------------------------------------------------------
        _ckey = self._get_cache_key(input, vocab, self.tokenize,
                                    subword_path, vocab_size, self.subsample)
        _cfile = os.path.join(os.path.dirname(input), f".cache_{_ckey}")
        if os.path.isfile(_cfile):
            print("Loading data from cache...", end=" ")
            with open(_cfile, "rb") as f:
                _vocab, self.lengths = pickle.load(f)
                self.vocab = Vocab().from_vocab_instance(_vocab)
            print("done!")
            _is_cached = True

        # > Preprocessing ---------------------------------------------
        print("Preprocessing...")
        self.data = DatasetCache(input,
                                 callback=_line_callback,
                                 subsample=subsample)

        # if the text file has already been cached,
        # but lengths and vocab are not cached (i.e., new for this input file)
        if _is_cached is False and len(self.lengths) == 0:
            for i in range(len(self.data)):
                _line_callback(self.data[i])

        # trim down the size of a newly created vocab
        if subword_path is None and vocab_size is not None:
            self.vocab.build_lookup(vocab_size)

        # -------------------------------------------------------------
        # save to cache if not already saved
        # -------------------------------------------------------------
        if _is_cached is False:
            print("Writing data to cache...")
            with open(_cfile, "wb") as f:
                pickle.dump((self.vocab, self.lengths), f)

        self.lengths = numpy.array(self.lengths)
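The preprocessed (vocab, lengths) pair is pickled to a hidden ".cache_<key>" file next to the input file, keyed on the file hash and modification time, the tokenizer source, and the vocab settings. A small, purely illustrative sketch for clearing those caches to force re-preprocessing (the directory path is a placeholder):

import glob
import os

# Hypothetical helper: remove dataset cache files written by the constructor above.
def clear_dataset_cache(data_dir):
    for cache_file in glob.glob(os.path.join(data_dir, ".cache_*")):
        print(f"removing {cache_file}")
        os.remove(cache_file)

clear_dataset_cache("datasets/wmt_ende")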
Example #7
class BaseSequenceDataset(Dataset, ABC):
    def __init__(self,
                 input,
                 tokenize=None,
                 vocab=None,
                 vocab_size=None,
                 subword_path=None,
                 seq_len=0,
                 sos=False,
                 oovs=0,
                 lang="en",
                 subsample=0,
                 **kwargs):
        """
        Base Dataset for Language Modeling.

        Args:
            tokenize (callable): tokenization callable, which takes as input
                a string and returns a list of tokens
            input (str, list): the path to the data file, or a list of samples.
            vocab (Vocab): a vocab instance. If None, then build a new one
                from the dataset's data.
            vocab_size (int): if given, then trim the vocab to the given size.
        """
        self.input = input
        self.seq_len = seq_len
        self.subword_path = subword_path
        self.sos = sos
        self.oovs = oovs
        self.subsample = subsample

        # > define tokenization to be used -------------------------------
        if tokenize is not None:
            self.tokenize = tokenize
        else:
            self.tokenize = self.space_tok

        if self.subword_path is not None:
            subword = spm.SentencePieceProcessor()
            subword_path = fix_paths(subword_path, "datasets")
            subword.Load(subword_path + ".model")
            self.tokenize = lambda x: subword.EncodeAsPieces(x.rstrip())
        elif tokenize is None:
            # keep a user-provided tokenizer; otherwise default to Moses
            self.tokenize = MosesTokenizer(lang=lang).tokenize

        # > Build Vocabulary --------------------------------------------
        self.vocab, is_vocab_built = self.init_vocab(vocab, subword_path, oovs)

        # > Cache text file ---------------------------------------------
        self.lengths = []
        _is_cached = False

        def _line_callback(x):
            _tokens = self.tokenize(x)
            self.lengths.append(len(self.add_special_tokens(_tokens)))

            if is_vocab_built is False:
                self.vocab.read_sequence(_tokens)

        # -------------------------------------------------------------
        # If there is a (vocab, lengths) tuple associated with the given input
        # file, then load them from cache and skip the recalculation
        # -------------------------------------------------------------
        _ckey = self._get_cache_key(input, vocab, self.tokenize,
                                    subword_path, vocab_size, self.subsample)
        _cfile = os.path.join(os.path.dirname(input), f".cache_{_ckey}")
        if os.path.isfile(_cfile):
            print("Loading data from cache...", end=" ")
            with open(_cfile, "rb") as f:
                _vocab, self.lengths = pickle.load(f)
                self.vocab = Vocab().from_vocab_instance(_vocab)
            print("done!")
            _is_cached = True

        # > Preprocessing ---------------------------------------------
        print("Preprocessing...")
        self.data = DatasetCache(input,
                                 callback=_line_callback,
                                 subsample=subsample)

        # if the text file has already been cached,
        # but lengths and vocab are not cached (i.e., new for this input file)
        if _is_cached is False and len(self.lengths) == 0:
            for i in range(len(self.data)):
                _line_callback(self.data[i])

        # trim down the size of a newly created vocab
        if subword_path is None and vocab_size is not None:
            self.vocab.build_lookup(vocab_size)

        # -------------------------------------------------------------
        # save to cache if not already saved
        # -------------------------------------------------------------
        if _is_cached is False:
            print("Writing data to cache...")
            with open(_cfile, "wb") as f:
                pickle.dump((self.vocab, self.lengths), f)

        self.lengths = numpy.array(self.lengths)

    @staticmethod
    def init_vocab(vocab=None, subword_path=None, oovs=0):
        _is_prebuilt = True

        # use the given vocab
        if vocab is not None:
            _vocab = vocab

        # load vocab from disk
        elif vocab is None and subword_path is not None:
            _vocab = Vocab(oovs=oovs, subword=subword_path)
            _vocab.load_from_vocab_file(subword_path + ".vocab")

        # build vocab from the tokens in the dataset
        else:
            _vocab = Vocab(oovs=oovs, subword=subword_path)
            _vocab.reset()
            _is_prebuilt = False
        return _vocab, _is_prebuilt

    def dataitem(self, i):

        # tokenize sentence / text
        token_list = self.tokenize(self.data[i])

        # add special tokens such as <BOS> or <EOS>
        token_list = self.add_special_tokens(token_list)

        # vectorize the tokens
        vector = vectorize(token_list, self.vocab)
        return vector

    @staticmethod
    def _get_cache_key(input, vocab, tokenize, subword_path, vocab_size,
                       subsample):
        """

        Args:
            input:
            vocab:
            tokenize:
            oovs:

        Returns:

        """
        _hash = lambda x: hashlib.sha256(x.encode()).hexdigest()
        _cache_key = _hash(input) + str(os.stat(input).st_mtime)

        if vocab is not None:
            _cache_key += vocab.hash()

        if subsample is not None:
            _cache_key += str(subsample)

        _cache_key += _hash(inspect.getsource(tokenize))
        _cache_key += str(subword_path)
        _cache_key += str(vocab_size)
        return _hash(_cache_key)

    @staticmethod
    def space_tok(text):
        return text.rstrip().split()

    def add_special_tokens(self, tokens):
        tokens = tokens + [self.vocab.EOS]
        if self.sos:
            tokens = [self.vocab.SOS] + tokens
        if self.seq_len > 0:
            tokens = tokens[:self.seq_len]
        return tokens

    def properties(self):
        props = dict()
        props["file"] = os.path.basename(self.input)
        props["examples"] = len(self)
        props["vocab"] = len(self.vocab)
        props["tokens (unique)"] = len(self.vocab.vocab)
        props["tokens (total)"] = number_h(sum(self.lengths))

        # if len(self.data) < 4000000:
        #     unk_id = self.vocab.tok2id[self.vocab.UNK]
        #     _coverage = numpy.mean(
        #         [self.dataitem(i).count(unk_id) / len(self.dataitem(i))
        #          for i in range(len(self.data))])
        #     props["UNKs"] = f"{round((_coverage * 100), 2)} %"

        if hasattr(self, 'seq_len'):
            props["max length"] = self.seq_len
        if hasattr(self, 'bptt'):
            props["BPTT"] = self.bptt
        return props

    def __str__(self):
        return tabulate([self.properties()],
                        headers="keys", floatfmt=".4f", numalign="right")

    def truncate(self, n):
        self.data = self.data[:n]
        self.lengths = self.lengths[:n]
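BaseSequenceDataset subclasses ABC and leaves __len__/__getitem__ to concrete datasets. A minimal subclass sketch, assuming that returning the vectorized sentence from dataitem() directly is an acceptable item format for the downstream collate function:

class SentenceLMDataset(BaseSequenceDataset):
    """Hypothetical concrete dataset: one training example per line."""

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # dataitem() tokenizes the line, adds the EOS (and optional SOS)
        # tokens via add_special_tokens(), and vectorizes with the vocab
        return self.dataitem(index)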
Example #8
def run(config):
    # -------------------------------------------------------------------------
    # Load pretrained models
    # -------------------------------------------------------------------------
    vocab_src = None
    vocab_trg = None

    # Load pretrained LM, which will be used for LM-Fusion or as LM-prior
    if config["data"]["prior_path"] is not None:
        if "gpt2" in config["data"]["prior_path"]:
            _gpt_model = os.path.split(config["data"]["prior_path"])[1]
            tokenizer = GPT2Tokenizer.from_pretrained(_gpt_model)
            vocab_trg = Vocab()
            vocab_trg.from_gpt2(tokenizer)
            _checkp_prior = GPT2LMHeadModel.from_pretrained(_gpt_model)
            config["model"]["dec_padding_idx"] = None
        else:
            _checkp_prior = load_checkpoint(config["data"]["prior_path"])
            vocab_trg = _checkp_prior["vocab"]

            if _checkp_prior["config"]["data"]["subword_path"] is not None:
                sub_path = _checkp_prior["config"]["data"]["subword_path"]
                config["data"]["trg"]["subword_path"] = sub_path

    # -------------------------------------------------------------------------
    # Data Loading and Preprocessing
    # -------------------------------------------------------------------------
    train_loader, val_loader = nmt_dataloaders(config, vocab_src, vocab_trg)

    # -------------------------------------------------------------------------
    # Initialize Model and Priors
    # -------------------------------------------------------------------------
    model_type = config["model"].get("type", "rnn")
    src_ntokens = len(val_loader.dataset.src.vocab)
    trg_ntokens = len(val_loader.dataset.trg.vocab)

    # Initialize Model
    if model_type == "rnn":
        model = Seq2SeqRNN(src_ntokens, trg_ntokens, **config["model"])
    elif model_type == "transformer":
        model = Seq2SeqTransformer(src_ntokens, trg_ntokens, **config["model"])
    else:
        raise NotImplementedError

    model_init(model, **config.get("init", {}))

    # Initialize prior LM
    _has_lm_prior = "prior" in config["losses"]
    _has_lm_fusion = config["model"]["decoding"].get("fusion") is not None
    if _has_lm_prior or _has_lm_fusion:
        if "gpt2" in config["data"]["prior_path"]:
            prior = _checkp_prior
            prior.to(config["device"])
            freeze_module(prior)
            for name, module in prior.named_modules():
                if isinstance(module, nn.Dropout):
                    module.p = 0
        else:
            prior = prior_model_from_checkpoint(_checkp_prior)
            prior.to(config["device"])
            freeze_module(prior)
    else:
        prior = None

    model.tie_weights()

    # -------------------------------------------------------------------------
    # Training Pipeline
    # -------------------------------------------------------------------------
    callbacks = [
        LossCallback(config["logging"]["log_interval"]),
        GradientCallback(config["logging"]["log_interval"]),
        ModuleGradientCallback(["encoder"], config["logging"]["log_interval"]),
        SamplesCallback(config["logging"]["log_interval"]),
        EvalCallback(config["logging"]["eval_interval"],
                     keep_best=True,
                     early_stop=config["optim"]["early_stop"])
    ]
    if model_type == "rnn":
        callbacks.append(AttentionCallback(config["logging"]["eval_interval"]))

    eval_interval = config["logging"]["eval_interval"]
    full_eval_interval = config["logging"].get("full_eval_interval",
                                               15 * eval_interval)
    callbacks.append(FunctionCallback(eval_best, full_eval_interval))

    trainer = NmtPriorTrainer(model,
                              train_loader,
                              val_loader,
                              config,
                              config["device"],
                              prior=prior,
                              callbacks=callbacks,
                              src_dirs=config["src_dirs"],
                              resume_state_id=config["resume_state_id"])

    if trainer.exp.has_finished():
        return trainer

    # -------------------------------------------------------------------------
    # Training Loop
    # -------------------------------------------------------------------------
    for epoch in range(config["epochs"]):
        train_loss = trainer.train_epoch()
        val_loss = trainer.eval_epoch()
        print("\n" * 3)

        if trainer.early_stop:
            print("Stopping early ...")
            break

    trainer.exp.finalize()
    return trainer
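run() reads a nested config dict; below is a partial, hedged sketch of the keys it touches directly. All values are placeholders, the real configs in the project carry many more options (e.g., everything consumed by nmt_dataloaders and the model constructors), and this dict alone is not sufficient to actually launch training:

# Hypothetical minimal config covering the keys accessed in run().
config = {
    "device": "cuda",
    "epochs": 20,
    "src_dirs": [],
    "resume_state_id": None,
    "data": {"prior_path": "trained/lm.en.pt",      # or a "gpt2*" model name
             "trg": {"subword_path": None}},
    "model": {"type": "rnn",                        # or "transformer"
              "decoding": {"fusion": None},
              "dec_padding_idx": 0},
    "losses": {"prior": {"weight": 0.5}},           # presence enables the LM-prior loss
    "optim": {"early_stop": 10},
    "logging": {"log_interval": 100, "eval_interval": 1000},
    "init": {},
}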
Example #9
####################################################################
# SETTINGS
####################################################################
opts, config = train_options()

####################################################################
# Data Loading and Preprocessing
####################################################################

vocab = None

if config["vocab"]["vocab_path"] is not None:
    vocab_path = config["vocab"]["vocab_path"]
    print(f"Loading vocab from '{vocab_path}'...")
    vocab = Vocab()
    vocab.from_file(vocab_path)

if opts.cp_vocab is not None:
    print(f"Loading vocab from checkpoint '{opts.cp_vocab}'...")
    vcp = load_checkpoint(opts.cp_vocab)
    vocab = vcp["vocab"]

if opts.resume:
    checkpoint = load_checkpoint(opts.resume)
    config["vocab"].update(checkpoint["config"]["vocab"])
    if not config["vocab"]["subword"]:
        vocab = checkpoint["vocab"]


def giga_tokenizer(x):