def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = BertDictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
        src_eos_idx = src_dict.add_special_token('[END_OF_SENT]')
        print('src_dict:[END_OF_SENT] id = {}, token = {}'.format(src_eos_idx, src_dict[src_eos_idx]))

        tgt_dict = BertDictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        tgt_eos_idx = tgt_dict.add_special_token('[END_OF_SENT]')
        print('tgt_dict:[END_OF_SENT] id = {}, token = {}'.format(tgt_eos_idx, tgt_dict[tgt_eos_idx]))

        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        assert src_dict.sep() == tgt_dict.sep()

        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #2
0
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load google bert dictionaries)."""
        dict_path = os.path.join(args.data, "vocab.txt")
        if not os.path.isfile(dict_path):
            raise FileNotFoundError("Dict not found: {}".format(dict_path))

        from transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(args.bert_name)
        tgt_dict = BertDictionary.load(dict_path, tokenizer)
        print("| dictionary: {} types".format(len(tgt_dict)))
        return cls(args, tgt_dict)
Example #3
0
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad = options.eval_bool(args.left_pad)

        # load dictionaries
        dictionary = BertDictionary.load(os.path.join(args.data[0], 'dict.txt'))
        print('| dictionary: {} types'.format(len(dictionary)))

        return cls(args, dictionary)