Example #1
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        args.trigram_block = options.eval_bool(args.trigram_block)
        args.init_from_pretrained_doc_model = options.eval_bool(args.init_from_pretrained_doc_model)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        if args.pretrained_bert_model.startswith('roberta'):
            src_dict = GPT2Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
        else:
            src_dict = BertDictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
 
        if args.init_from_pretrained_doc_model:
            print('TODO: add the [SENT_MASK] token here (change it within the BERT special tokens)')

        tgt_dict = FlexibleDictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
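
A note on context: every snippet on this page is a `setup_task` classmethod cut out of a fairseq task class, so the `@register_task` decorator, the `@classmethod` decorator, and the surrounding class are not shown. As a rough orientation, here is a minimal sketch of the scaffolding such a body is assumed to sit in; the task name and class name are made up for illustration, and details vary across fairseq versions.

# Hypothetical task name and class; only the structure is the point here.
import os

from fairseq import utils
from fairseq.data import data_utils
from fairseq.tasks import register_task
from fairseq.tasks.translation import TranslationTask


@register_task('my_doc_translation')
class MyDocTranslationTask(TranslationTask):

    @classmethod
    def setup_task(cls, args, **kwargs):
        # a trimmed-down body in the same style as the examples on this page
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
        return cls(args, src_dict, tgt_dict)
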
Example #2
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.src.txt'))
        tgt_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.tgt.txt'))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info('[{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        logger.info('[{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
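
For orientation on how these classmethods get called: fairseq's training and generation entry points parse the command line, then call `tasks.setup_task(...)`, which dispatches to the registered task's `setup_task` and returns the task instance used for loading data and building the model. A hedged sketch of that flow follows; the data directory, languages, and architecture are placeholder assumptions, and exact option handling differs between fairseq versions.

# Hypothetical driver sketch; 'data-bin/example' and the language/architecture
# choices are illustrative placeholders.
from fairseq import options, tasks

parser = options.get_training_parser()
args = options.parse_args_and_arch(parser, input_args=[
    'data-bin/example',               # assumed preprocessed data directory
    '--task', 'translation',
    '--arch', 'transformer_iwslt_de_en',
    '--source-lang', 'de',
    '--target-lang', 'en',
])

task = tasks.setup_task(args)         # ends up in the task's setup_task classmethod
task.load_dataset('valid')            # uses the dictionaries loaded in setup_task
model = task.build_model(args)
criterion = task.build_criterion(args)
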
Example #3
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = BertDictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
        src_eos_idx = src_dict.add_special_token('[END_OF_SENT]')
        print('src_dict:[END_OF_SENT] id = {}, token = {}'.format(src_eos_idx, src_dict[src_eos_idx]))

        tgt_dict = BertDictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        tgt_eos_idx = tgt_dict.add_special_token('[END_OF_SENT]')
        print('tgt_dict:[END_OF_SENT] id = {}, token = {}'.format(tgt_eos_idx, tgt_dict[tgt_eos_idx]))

        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        assert src_dict.sep() == tgt_dict.sep()

        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #4
    def setup_task(cls, cfg: TranslationlfConfig, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            cfg (TranslationlfConfig): the parsed task configuration
        """

        paths = utils.split_paths(cfg.data)
        assert len(paths) > 0
        # find language pair automatically
        if cfg.source_lang is None or cfg.target_lang is None:
            cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(
                paths[0])
        if cfg.source_lang is None or cfg.target_lang is None:
            raise Exception(
                "Could not infer language pair, please provide it explicitly")

        # load dictionaries
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(cfg.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(cfg.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info("[{}] dictionary: {} types".format(cfg.source_lang,
                                                       len(src_dict)))
        logger.info("[{}] dictionary: {} types".format(cfg.target_lang,
                                                       len(tgt_dict)))
        # load the Longformer representations here
        # TODO: check!
        # TODO (next): load all of the h5 paths
        lf_reps = load_longformer_representations(cfg.lf_path)
        sen_doc_align = load_sen_doc_alignment(cfg.sen_doc)
        return cls(cfg, src_dict, tgt_dict, lf_reps, sen_doc_align)
Example #5
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
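
The `dict.{lang}.txt` files loaded in these examples are the plain-text vocabularies written by `fairseq-preprocess`: one `<symbol> <count>` pair per line, with the special tokens (`<s>`, `<pad>`, `</s>`, `<unk>`) added by the `Dictionary` constructor rather than stored in the file. A small self-contained illustration, with invented file contents:

# Toy illustration of the dictionary file format expected by Dictionary.load();
# the directory and the two entries are made up for this example.
import os
import tempfile

from fairseq.data import Dictionary

tmpdir = tempfile.mkdtemp()
dict_path = os.path.join(tmpdir, 'dict.de.txt')
with open(dict_path, 'w') as f:
    f.write('hallo 120\n')
    f.write('welt 95\n')

d = Dictionary.load(dict_path)
# 4 implicit specials (<s>, <pad>, </s>, <unk>) plus the 2 symbols from the file
print(len(d), d.pad(), d.eos(), d.unk())
print(d.index('hallo'), d[d.index('hallo')])
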
Example #6
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                "Could not infer language pair, please provide it explicitly")

        dictionary = cls.load_dictionary(os.path.join(paths[0], "dict.txt"))

        # langs:"en-zh,my-en"
        logger.info("args.add_lang_token: {} ".format(args.add_lang_token))
        if args.add_lang_token and len(args.langs) > 0:
            languages = args.langs.split(",")
            for lang_pair in languages:
                if lang_pair == "-": continue
                logger.info("{} was add to dictionary".format(lang_pair))
                lang = lang_pair.split("-")
                dictionary.add_symbol("[{}]".format(lang[0]))
                dictionary.add_symbol("[{}]".format(lang[1]))
        return cls(args, dictionary, dictionary)
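
When `--add-lang-token` is set, the snippet above appends symbols such as `[en]` and `[zh]` to the shared dictionary; downstream code can then look these ids up and attach them to the token sequences, mBART-style. A hedged sketch of that lookup, using a dictionary built inline rather than the one loaded from `dict.txt`:

# Sketch of how the added language tokens are typically consumed; the toy
# dictionary and the toy target sequence are invented for illustration.
import torch

from fairseq.data import Dictionary

dictionary = Dictionary()
for lang_pair in "en-zh,my-en".split(","):
    src, tgt = lang_pair.split("-")
    dictionary.add_symbol("[{}]".format(src))
    dictionary.add_symbol("[{}]".format(tgt))

# prepend the target-language token to a toy target sequence
tgt_lang_idx = dictionary.index("[zh]")
toy_target = torch.tensor([10, 11, 12, dictionary.eos()])
toy_target = torch.cat([torch.tensor([tgt_lang_idx]), toy_target])
print(tgt_lang_idx, toy_target.tolist())
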
Example #7
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        if getattr(args, 'raw_text', False):
            utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
            args.dataset_impl = 'raw'
        elif getattr(args, 'lazy_load', False):
            utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
            args.dataset_impl = 'lazy'

        paths = args.data.split(':')
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
        char_dict = cls.load_dictionary(os.path.join(paths[0], 'dict_char.txt'))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict, char_dict)
Example #8
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = Dictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #9
    def setup_task(cls, args, **kwargs):
        """Setup GEC task, including dictionary & model building."""

        """
        Similar to the translation task, but also load labels dictionaries
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #10
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        src_dict = BertBasedDictionary(args.bert_name)
        tgt_dict = cls.load_dictionary(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad(), "%d != %d" % (src_dict.pad(),
                                                               tgt_dict.pad())
        assert src_dict.eos() == tgt_dict.eos(), "%d != %d" % (src_dict.eos(),
                                                               tgt_dict.eos())
        assert src_dict.unk() == tgt_dict.unk(), "%d != %d" % (src_dict.unk(),
                                                               tgt_dict.unk())
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #11
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        # left_pad_source=True, left_pad_target=False
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # e.g. source_lang=cn, target_lang=en
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load the dictionary files
        src_dict = cls.load_dictionary(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} characters'.format(args.source_lang,
                                                         len(src_dict)))
        print('| [{}] dictionary: {} characters'.format(args.target_lang,
                                                         len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #12
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        assert not args.left_pad_source, 'args.left_pad_source must be False'

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        args.no_strip_node_label = getattr(args, 'no_strip_node_label', False)
        src_dict = DPTreeWrapperDictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)),
            no_strip_node_label=args.no_strip_node_label)
        tgt_dict = Dictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] DPtree-dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #13
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])

        # load dictionary
        subword_dict = SubwordDictionary.load(
            os.path.join(args.data[0], 'model.vcb'))

        return cls(args, subword_dict)
Example #14
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        args.trigram_block = options.eval_bool(args.trigram_block)
        args.init_from_pretrained_doc_model = options.eval_bool(
            args.init_from_pretrained_doc_model)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        if args.roberta_model.startswith('roberta'):
            src_dict = GPT2Dictionary.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.source_lang)))
        else:
            src_dict = BertDictionary.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.source_lang)))
        idx = src_dict.add_special_token('<sent_mask>')
        print('<sent_mask> id = {}, token = {}'.format(idx, src_dict[idx]))
        print('<mask> id is', src_dict.index('<mask>'))
        print('<sent_mask> id is', src_dict.index('<sent_mask>'))

        # tgt_dict = FlexibleDictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        # generate the tgt_dict
        tgt_dict = PointerFlexibleDictionary(args.max_doc_length,
                                             specialTokens=[('EOS', '</s>'),
                                                            ('PAD', '<pad>'),
                                                            ('UNK', '<unk>'),
                                                            ('BOS', '<s>')])

        assert tgt_dict.index('0') == 0
        print('| WARNING: idx should match the context in the tgt dict')
        # if args.predict_arch == 'pointer_net':
        #     assert tgt_dict.eos() == args.max_doc_length

        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #15
    def setup_task(cls, cfg: TranslationConfig, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            cfg (TranslationConfig): the parsed task configuration
        """

        paths = utils.split_paths(cfg.data)
        assert len(paths) > 0
        # find language pair automatically
        if cfg.source_lang is None or cfg.target_lang is None:
            cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(
                paths[0])
        if cfg.source_lang is None or cfg.target_lang is None:
            raise Exception(
                "Could not infer language pair, please provide it explicitly")

        # load dictionaries
        bert_dict_langs: Set = set(cfg.use_bert_dict.split(","))
        source_lang = cfg.source_lang
        if cfg.source_lang in bert_dict_langs:
            logger.info("Use DirctionaryForBert for {}".format(source_lang))
            src_dict = DictionaryForBert.load(
                os.path.join(paths[0], "dict.{}.txt".format(source_lang)))
        else:
            logger.info("Use default Dirctionary for {}".format(source_lang))
            src_dict = cls.load_dictionary(
                os.path.join(paths[0], "dict.{}.txt".format(source_lang)))

        target_lang = cfg.target_lang
        if cfg.target_lang in bert_dict_langs:
            logger.info("Use DirctionaryForBert for {}".format(target_lang))
            tgt_dict = DictionaryForBert.load(
                os.path.join(paths[0], "dict.{}.txt".format(target_lang)))
        else:
            logger.info("Use default Dirctionary for {}".format(target_lang))
            tgt_dict = cls.load_dictionary(
                os.path.join(paths[0], "dict.{}.txt".format(target_lang)))

        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info("[{}] dictionary: {} types".format(cfg.source_lang,
                                                       len(src_dict)))
        logger.info("[{}] dictionary: {} types".format(cfg.target_lang,
                                                       len(tgt_dict)))

        return cls(cfg, src_dict, tgt_dict)
Example #16
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        user_data_frame = kwargs['user_data_frame']
        task_score = kwargs['task_score']
        src_dict = kwargs['src_dict']
        tgt_dict = kwargs['tgt_dict']
        return cls(args=args, src_dict=src_dict, tgt_dict=tgt_dict, user_data_frame=user_data_frame, task_score=task_score)
Example #17
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        # args.left_pad_source = options.eval_bool(args.left_pad_source)
        # args.left_pad_target = options.eval_bool(args.left_pad_target)
        if getattr(args, 'raw_text', False):
            utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
            args.dataset_impl = 'raw'
        elif getattr(args, 'lazy_load', False):
            utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
            args.dataset_impl = 'lazy'

        # wait-k
        try:
            args.wait_k = int(args.wait_k)
        except (TypeError, ValueError):
            # wait_k is not a plain integer: it names a sampling strategy instead
            if args.wait_k == "uniform":
                assert args.wait_k_sample_start < args.wait_k_sample_end
            elif args.wait_k == "CL-linear":
                assert args.wait_k_sample_start > args.wait_k_sample_end
                assert args.max_epoch <= 0
            else:
                raise ValueError("Unsupported wait-k sampling method %s" % args.wait_k)

        paths = args.data.split(':')
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #18
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # could remove the following
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        if args.flatenc or args.flatdec:
            flatData = args.flatdata
        if args.flatenc:
            flatFile = os.path.join(flatData,
                                    'dict.{}.txt'.format(args.source_lang))
            print("For flat encoder load dictionary: ", flatFile)
            src_dict = Dictionary.load(flatFile)
        else:
            src_dict = DictionaryWCS.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.source_lang)))
        if args.flatdec:
            flatFile = os.path.join(flatData,
                                    'dict.{}.txt'.format(args.target_lang))
            print("For flat decoder load dictionary: ", flatFile)
            tgt_dict = Dictionary.load(flatFile)
        else:
            tgt_dict = DictionaryWCS.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #19
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        args.trigram_block = options.eval_bool(args.trigram_block)
        args.init_from_pretrained_doc_model = options.eval_bool(
            args.init_from_pretrained_doc_model)

        if getattr(args, 'raw_text', False):
            utils.deprecation_warning(
                '--raw-text is deprecated, please use --dataset-impl=raw')
            args.dataset_impl = 'raw'
        elif getattr(args, 'lazy_load', False):
            utils.deprecation_warning(
                '--lazy-load is deprecated, please use --dataset-impl=lazy')
            args.dataset_impl = 'lazy'

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = GPT2Dictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
        idx = src_dict.add_special_token('<sent_mask>')
        print('<sent_mask> id = {}, token = {}'.format(idx, src_dict[idx]))
        print('<mask> id is', src_dict.index('<mask>'))
        print('<sent_mask> id is', src_dict.index('<sent_mask>'))

        tgt_dict = FlexibleDictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        # tgt_dict = None

        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #20
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = Dictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        if not hasattr(args, 'device_id') or args.device_id == 0:
            print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                       len(src_dict)))
            print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                       len(tgt_dict)))
            if hasattr(args,
                       'share_all_embeddings') and args.share_all_embeddings:
                src_dict.update(tgt_dict)
                tgt_dict = src_dict
                print("Join dictionary to share embeddings")
                print('| [{}] dictionary: {} types'.format(
                    args.source_lang, len(src_dict)))
                print('| [{}] dictionary: {} types'.format(
                    args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #21
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        if args.lang_pairs is None:
            raise ValueError(
                '--lang-pairs is required. List all the language pairs in the training objective.'
            )
        if isinstance(args.lang_pairs, str):
            args.lang_pairs = args.lang_pairs.split(',')
        assert 'source-target' in args.lang_pairs and 'source-untarget' in args.lang_pairs

        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info('[{}] dictionary: {} types'.format(args.source_lang,
                                                       len(src_dict)))
        logger.info('[{}] dictionary: {} types'.format(args.target_lang,
                                                       len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #22
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        if not hasattr(args, 'audio_input'):
            args.audio_input = False

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')
        # load dictionaries

        if not args.audio_input:
            src_dict = Dictionary.load(
                os.path.join(args.data[0],
                             'dict.{}.txt'.format(args.source_lang)))
        else:
            src_dict = AudioDictionary.load(
                os.path.join(args.data[0],
                             'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #23
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        args.trigram_block = options.eval_bool(args.trigram_block)
        args.init_from_pretrained_doc_model = options.eval_bool(
            args.init_from_pretrained_doc_model)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        if args.roberta_model.startswith('roberta'):
            src_dict = GPT2Dictionary.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.source_lang)))
        else:
            src_dict = BertDictionary.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.source_lang)))
        idx = src_dict.add_special_token('<sent_mask>')
        print('<sent_mask> id = {}, token = {}'.format(idx, src_dict[idx]))
        print('<mask> id is', src_dict.index('<mask>'))
        print('<sent_mask> id is', src_dict.index('<sent_mask>'))

        tgt_dict = FlexibleDictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #24
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        # get padding...
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0]
            )

        print("path:",os.path.join(paths[0], "/Dicts/dict.txt"))
        dictionary = cls.load_dictionary(
            os.path.join(paths[0]+"/Dicts/", "dict.txt")
        )

        return cls(args, dictionary, paths)
Example #25
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        s = args.word_mask_keep_rand.split(',')
        s = [float(x) for x in s]
        setattr(args, 'pred_probs', torch.FloatTensor([s[0], s[1], s[2]]))

        if getattr(args, 'raw_text', False):
            utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
            args.dataset_impl = 'raw'
        elif getattr(args, 'lazy_load', False):
            utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
            args.dataset_impl = 'lazy'

        paths = args.data.split(':')
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = BertWordpieceDictionary.load(os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = BertWordpieceDictionary.load(os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #26
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = utils.eval_bool(args.left_pad_source)
        args.left_pad_target = utils.eval_bool(args.left_pad_target)

        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        # when a BERT model is used, load the target dictionary first so that its
        # special-token ids can be passed on to the source dictionary
        tgt_first = bool(args.use_bert_model)
        if tgt_first:
            tgt_dict = cls.load_dictionary(os.path.join(
                paths[0], 'dict.{}.txt'.format(args.target_lang)),
                                           custom_bos=args.bos,
                                           custom_pad=args.pad,
                                           custom_eos=args.eos,
                                           custom_unk=args.unk,
                                           add_sentence_limit_words_after=True)
            bos_id_tgt = tgt_dict.bos()
            pad_id_tgt = tgt_dict.pad()
            eos_id_tgt = tgt_dict.eos()
            unk_id_tgt = tgt_dict.unk()
            src_dict = cls.load_dictionary(os.path.join(
                paths[0], 'dict.{}.txt'.format(args.source_lang)),
                                           custom_bos=args.bos,
                                           custom_pad=args.pad,
                                           custom_eos=args.eos,
                                           custom_unk=args.unk,
                                           add_sentence_limit_words_after=True,
                                           tgt_first=tgt_first,
                                           bos_id_tgt=bos_id_tgt,
                                           pad_id_tgt=pad_id_tgt,
                                           eos_id_tgt=eos_id_tgt,
                                           unk_id_tgt=unk_id_tgt)
        else:
            src_dict = cls.load_dictionary(
                os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
            tgt_dict = cls.load_dictionary(
                os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

        # print(src_dict.pad(), '', tgt_dict.pad())
        # print(src_dict.bos(), '', tgt_dict.bos())
        # print(src_dict.eos(), '', tgt_dict.eos())
        # print(src_dict.unk(), '', tgt_dict.unk())
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        logger.info('[{}] dictionary: {} types'.format(args.source_lang,
                                                       len(src_dict)))
        logger.info('[{}] dictionary: {} types'.format(args.target_lang,
                                                       len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #27
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).
        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        if getattr(args, 'raw_text', False):
            utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
            args.dataset_impl = 'raw'
        elif getattr(args, 'lazy_load', False):
            utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
            args.dataset_impl = 'lazy'

        paths = args.data.split(':')
        assert len(paths) > 0
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        assert args.target_lang == 'actions', 'target extension must be "actions"'
        args.target_lang_nopos = 'actions_nopos'    # only build dictionary without pointer values
        args.target_lang_pos = 'actions_pos'
        args.target_lang_vocab_nodes = 'actions.vocab.nodes'
        args.target_lang_vocab_others = 'actions.vocab.others'
        src_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        # tgt_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang_nopos)))

        # NOTE rebuild the dictionary every time
        tgt_dict = cls.build_dictionary_bart_extend(
            node_freq_min=args.node_freq_min,
            node_file_path=os.path.join(paths[0], args.target_lang_vocab_nodes),
            others_file_path=os.path.join(paths[0], args.target_lang_vocab_others)
            )

        # TODO target dictionary 'actions_nopos' is hard coded now; change it later
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang_nopos, len(tgt_dict)))

        # ========== load the pretrained BART model ==========
        if getattr(args, 'arch', None):
            # training time: pretrained BART needs to be used for initialization
            if 'bart_base' in args.arch or 'bartsv_base' in args.arch:
                print('-' * 10 + ' loading pretrained bart.base model ' + '-' * 10)
                bart = torch.hub.load('pytorch/fairseq', 'bart.base')
            elif 'bart_large' in args.arch or 'bartsv_large' in args.arch:
                print('-' * 10 + ' loading pretrained bart.large model ' + '-' * 10)
                bart = torch.hub.load('pytorch/fairseq', 'bart.large')
            else:
                raise ValueError
        else:
            # inference time: the pretrained BART is only used here for dictionary-related things
            # NOTE: the size does matter for the embeddings; it is updated later during model
            # initialization if the checkpoint was trained with "bart.large"
            print('-' * 10 + ' (for bpe vocab and embed size at inference time) loading pretrained bart.base model '
                  + '-' * 10)
            bart = torch.hub.load('pytorch/fairseq', 'bart.base')

        bart.eval()    # the pretrained BART model is only for assistance
        # ====================================================

        return cls(args, src_dict, tgt_dict, bart)
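
Since `torch.hub.load('pytorch/fairseq', 'bart.base')` downloads and caches the checkpoint on first use, it can help to know, or override, where that cache lives when training on shared or offline machines. A small sketch; the override path is an arbitrary example, not something the task above requires.

# Inspecting and optionally redirecting the torch.hub cache used by the
# bart.base / bart.large downloads above.
import torch

print(torch.hub.get_dir())           # default cache location for hub checkpoints
torch.hub.set_dir('/tmp/torch_hub')  # optional: redirect the cache (example path)
bart = torch.hub.load('pytorch/fairseq', 'bart.base')
bart.eval()
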