Example #1
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #2
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
        print('| dictionary: {} types'.format(len(dictionary)))
        return cls(args, dictionary)
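All of the loaders above expect fairseq's plain-text dictionary format: one
"<symbol> <count>" pair per line, with the special pad/eos/unk/bos symbols
added implicitly by the constructor. A minimal round-trip sketch, assuming
only that fairseq is installed (the temp-file pattern mirrors Example #13
below):

import tempfile

from fairseq.data import Dictionary

# build a small dictionary in memory; add_symbol returns the symbol's index
d = Dictionary()
for word in ['hello', 'world', 'hello']:
    d.add_symbol(word)

# save() writes one '<symbol> <count>' line per non-special symbol,
# and load() reconstructs the dictionary, specials included
with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
    d.save(tmp_dict.name)
    reloaded = Dictionary.load(tmp_dict.name)

assert len(reloaded) == len(d)
assert reloaded.pad() == d.pad() and reloaded.eos() == d.eos()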
Example #3
 def setup_task(cls, args, **kwargs):
     """Setup the task."""
     paths = utils.split_paths(args.data)
     assert len(paths) > 0
     dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
     logger.info("dictionary: {} types".format(len(dictionary)))
     if not hasattr(args, "shuffle_instance"):
         args.shuffle_instance = False
     return cls(args, dictionary)
Example #4
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries)."""
        dict_path = os.path.join(args.data, "dict.txt")
        if not os.path.isfile(dict_path):
            raise FileNotFoundError("Dict not found: {}".format(dict_path))
        tgt_dict = Dictionary.load(dict_path)

        print("| dictionary: {} types".format(len(tgt_dict)))
        return cls(args, tgt_dict)
Example #5
 def initializer(self):
     global bpe
     bpe = get_encoder(
         os.path.join(self.roberta_dir, 'gpt2_bpe', 'encoder.json'),
         os.path.join(self.roberta_dir, 'gpt2_bpe', 'vocab.bpe'),
     )
     global vocab
     vocab = Dictionary.load(
         os.path.join(self.roberta_dir, 'roberta.base', 'dict.txt'))
Example #6
    def load_dictionary(cls, filename, use_ctc_loss):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        if use_ctc_loss:
            return CTCLossDictionary.load(filename)
        return Dictionary.load(filename)
Example #7
 def __init__(self, vncore=True):
     """
     Hacky way to run VnCoreNLP tokenizer with PhoBERT
     :param vncore: Set it to `False` if your sentences are already tokenized by VnCoreNLP
     """
     self.dictionary = Dictionary.load(open(DICT_PATH))
     self.annotator = None
     self.vncore = vncore
     self.bpe = fastBPE(args)
Example #8
 def setup_task(cls, cfg: SpanMaskedLMConfig, **kwargs):
     """Setup the task."""
     paths = utils.split_paths(cfg.data)
     assert len(paths) > 0
     dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
     logger.info("dictionary: {} types".format(len(dictionary)))
     if not hasattr(cfg, "shuffle"):
         cfg.shuffle = False
     return cls(cfg, dictionary)
Example #9
    def load_dictionary(cls, args, filename, source=True):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol('<mask>')
        return dictionary
Example #10
 def setup_task(cls, cfg: HubertPretrainingConfig,
                **kwargs) -> "HubertPretrainingTask":
     label_dir = cfg.data if cfg.label_dir is None else cfg.label_dir
     dictionaries = {
         label: Dictionary.load(f"{label_dir}/dict.{label}.txt")
         if os.path.exists(f"{label_dir}/dict.{label}.txt") else None
         for label in cfg.labels
     }
     return cls(cfg, dictionaries)
Example #11
 def load_target_dictionary(self):
     if self.cfg.labels:
         dict_path = os.path.join(self.cfg.data,
                                  f"dict.{self.cfg.labels}.txt")
         if not os.path.isfile(dict_path):
             dict_path = os.path.join(self.cfg.label_dir,
                                      f"dict.{self.cfg.labels}.txt")
         return Dictionary.load(dict_path)
     return None
Example #12
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol("<mask>")
        return dictionary
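Because add_symbol is a no-op for an already-present entry apart from bumping
its count, and returns the symbol's index either way, the mask ID can be
recovered later with index(). A short usage sketch (the dict.txt path is
hypothetical):

dictionary = Dictionary.load('dict.txt')  # hypothetical path
mask_idx = dictionary.add_symbol('<mask>')
# index() maps a symbol to its ID; plain indexing goes the other way
assert dictionary.index('<mask>') == mask_idx
assert dictionary[mask_idx] == '<mask>'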
Example #13
    def test_finalize(self):
        txt = [
            'A B C D',
            'B C D',
            'C D',
            'D',
        ]
        ref_ids1 = list(
            map(torch.IntTensor, [
                [4, 5, 6, 7, 2],
                [5, 6, 7, 2],
                [6, 7, 2],
                [7, 2],
            ]))
        ref_ids2 = list(
            map(torch.IntTensor, [
                [7, 6, 5, 4, 2],
                [6, 5, 4, 2],
                [5, 4, 2],
                [4, 2],
            ]))

        # build dictionary
        d = Dictionary()
        for line in txt:
            Tokenizer.tokenize(line, d, add_if_not_exist=True)

        def get_ids(dictionary):
            ids = []
            for line in txt:
                ids.append(
                    Tokenizer.tokenize(line,
                                       dictionary,
                                       add_if_not_exist=False))
            return ids

        def assertMatch(ids, ref_ids):
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary: finalize() re-sorts symbols by descending
        # frequency (D occurs 4 times, A once), hence the reversed ref_ids2
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)
Example #14
    def load_dictionary(cls, filename, sde=False):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        if sde:
            return CharNgramDictionary.load(filename)
        else:
            return Dictionary.load(filename)
Example #15
        def tokenizer(sentence):

            attrs = Args()

            tokenizer = MosesTokenizer(attrs)
            bpe = SubwordNMTBPE(attrs)
            dictionary = Dictionary.load('dict.en.txt')

            return dictionary.encode_line(bpe.encode(sentence),
                                          add_if_not_exist=False)
Example #16
    def load_dictionary(cls, filename, weight_by_freq=False):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        if weight_by_freq:
            return DictionaryWithInvFreqWeight.load(filename)
        else:
            return Dictionary.load(filename)
Example #17
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # could remove the following .....
        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        if args.flatenc or args.flatdec:
            flatData = args.flatdata
        if args.flatenc:
            flatFile = os.path.join(flatData,
                                    'dict.{}.txt'.format(args.source_lang))
            print("For flat encoder load dictionary: ", flatFile)
            src_dict = Dictionary.load(flatFile)
        else:
            src_dict = DictionaryWCS.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.source_lang)))
        if args.flatdec:
            flatFile = os.path.join(flatData,
                                    'dict.{}.txt'.format(args.target_lang))
            print("For flat decoder load dictionary: ", flatFile)
            tgt_dict = Dictionary.load(flatFile)
        else:
            tgt_dict = DictionaryWCS.load(
                os.path.join(args.data,
                             'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #18
    def load_pretrained_model(path,
                              src_dict_path,
                              tgt_dict_path,
                              arg_overrides=None):
        model = utils.load_checkpoint_to_cpu(path)
        args = model['args']
        state_dict = model['model']
        args = utils.override_model_args(args, arg_overrides)
        src_dict = Dictionary.load(src_dict_path)
        tgt_dict = Dictionary.load(tgt_dict_path)
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()

        task = TranslationTask(args, src_dict, tgt_dict)
        model = task.build_model(args)
        model.upgrade_state_dict(state_dict)
        model.load_state_dict(state_dict, strict=True)
        return model
Example #19
 def setup_task(cls, args, **kwargs):
     """Setup the task.
     """
     paths = args.data.split(':')
     assert len(paths) > 0
     dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))
     logger.info('dictionary: {} types'.format(len(dictionary)))
     if not hasattr(args, 'shuffle_instance'):
         args.shuffle_instance = False
     return cls(args, dictionary)
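This is the same task setup as Example #3, predating the utils.split_paths
helper: the bare ':' split here was later replaced by that utility, which
also understands the '::' separator.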
Example #20
    def setup_task(cls, args, **kwargs):
        # load data dictionary
        data_dictionary_dict = {}
        for field in configs.fields:
            data_dictionary_dict[field] = Dictionary.load(os.path.join(args.data, 'input0', f'{field}', 'dict.txt'))
            if field in configs.maskable_fields:
                data_dictionary_dict[field].add_symbol('<mask>')  # to align with the dictionary used in pretraining
            logger.info('[input {}] dictionary: {} types'.format(field, len(data_dictionary_dict[field])))

        label_dict = data_dictionary_dict  # dummy assignment since we don't have discrete labels
        return SimilarityTask(args, data_dictionary_dict, label_dict)
Example #21
 def setup_task(cls, args, **kwargs):
     paths = utils.split_paths(args.data)
     assert len(paths) > 0
     dictionary = None
     if args.use_bert_dict:
         dictionary = DictionaryForBert.load(
             os.path.join(paths[0], "dict.txt"))
     else:
         dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
     logger.info("dictionary: {} types".format(len(dictionary)))
     return cls(args, dictionary)
Example #22
    def setup_task(cls, args, **kwargs):
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
        if args.source_lang is None or args.target_lang is None:
            raise Exception('Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #23
 def setup_task(cls, args, **kwargs):
     """
     https://github.com/pytorch/fairseq/blob/master/fairseq/tasks/masked_lm.py#L78
     """
     paths = utils.split_paths(args.data)
     assert len(paths) > 0
     data_dict = cls.load_dictionary(
         os.path.join(paths[0], 'input', 'dict.txt'))
     logger.info('dictionary: {} types'.format(len(data_dict)))
     meta_dict = Dictionary.load(os.path.join(paths[0], 'meta', 'dict.txt'))
     return cls(args, data_dict, meta_dict)
Example #24
 def initializer(self):
     global bpe
     bpe = get_encoder(
         os.path.join(self.roberta_dir, 'encoder.json'),
         os.path.join(self.roberta_dir, 'vocab.bpe'),
     )
     global vocab
     vocab = Dictionary.load(os.path.join(self.roberta_dir, 'dict.txt'))
     global entities
     if self.entity_vocab is not None:
         entities = load_entities(self.entity_vocab)
Example #25
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = Dictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = Dictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        if not hasattr(args, 'device_id') or args.device_id == 0:
            print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                       len(src_dict)))
            print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                       len(tgt_dict)))
            if hasattr(args,
                       'share_all_embeddings') and args.share_all_embeddings:
                src_dict.update(tgt_dict)
                tgt_dict = src_dict
                print("Join dictionary to share embeddings")
                print('| [{}] dictionary: {} types'.format(
                    args.source_lang, len(src_dict)))
                print('| [{}] dictionary: {} types'.format(
                    args.target_lang, len(tgt_dict)))

        return cls(args, src_dict, tgt_dict)
Example #26
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.source_lang = f"{args.source_lang1}_{args.source_lang2}"
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        # find language pair automatically
        if args.source_lang is None or args.target_lang is None:
            args.source_lang, args.target_lang = data_utils.infer_language_pair(
                args.data[0])
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict1 = Dictionary.load(
            os.path.join(args.data[0],
                         'dict.{}.txt'.format(args.source_lang1)))
        src_dict2 = Dictionary.load(
            os.path.join(args.data[0],
                         'dict.{}.txt'.format(args.source_lang2)))
        tgt_dict = Dictionary.load(
            os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
        assert src_dict1.pad() == src_dict2.pad()
        assert src_dict1.eos() == src_dict2.eos()
        assert src_dict1.unk() == src_dict2.unk()
        assert src_dict1.pad() == tgt_dict.pad()
        assert src_dict1.eos() == tgt_dict.eos()
        assert src_dict1.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang1,
                                                   len(src_dict1)))
        print('| [{}] dictionary: {} types'.format(args.source_lang2,
                                                   len(src_dict2)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))

        return cls(args, src_dict1, src_dict2, tgt_dict)
Example #27
 def load_dictionary(cls, args, filename, source=True):
     """Load the dictionary from the filename
     Args:
         filename (str): the filename
     """
     dictionary = Dictionary.load(filename)
     langs = args.langs.split(",")
     for l in langs:
         dictionary.add_symbol("[{}]".format(l))
     dictionary.add_symbol("<mask>")
     return dictionary
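Appending one [lang] token per entry in args.langs plus <mask> follows the
mBART convention in fairseq: the fine-tuning dictionary must assign these
symbols the same indices they had during multilingual denoising pretraining,
so they are added in a fixed order after the regular vocabulary.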
Example #28
 def setup_task(cls, args, **kwargs):
     """Setup the task (e.g., load dictionaries)."""
     task = super(SpeechTranslationCTCTask, cls).setup_task(args)
     source_dict_path = os.path.join(args.data.split(os.pathsep)[0], "dict.{}.txt".format(args.source_lang))
     if not os.path.isfile(source_dict_path):
         raise FileNotFoundError("Dict not found: {}".format(source_dict_path))
     src_dict = Dictionary.load(source_dict_path)
     if args.criterion == "ctc_multi_loss":
         src_dict.add_symbol("<ctc_blank>")
     print("| CTC dictionary: {} types".format(len(src_dict)))
     task.src_dict = src_dict
     return task
Example #29
    def __init__(
        self,
        data_dir,
        split,
        sample_rate,
        max_sample_size=None,
        min_sample_size=0,
        shuffle=True,
        pad=False,
        normalize=False,
        num_buckets=0,
        compute_mask_indices=False,
        **mask_compute_kwargs,
    ):
        super().__init__(
            sample_rate=sample_rate,
            max_sample_size=max_sample_size,
            min_sample_size=min_sample_size,
            shuffle=shuffle,
            pad=pad,
            normalize=normalize,
            compute_mask_indices=compute_mask_indices,
            **mask_compute_kwargs,
        )

        from fairseq.data import data_utils, Dictionary

        self.fnames_dict = Dictionary.load(os.path.join(data_dir, "dict.txt"))

        root_path = os.path.join(data_dir, f"{split}.root")
        if os.path.exists(root_path):
            with open(root_path, "r") as f:
                self.root_dir = next(f).strip()
        else:
            self.root_dir = None

        fnames_path = os.path.join(data_dir, split)
        self.fnames = data_utils.load_indexed_dataset(fnames_path,
                                                      self.fnames_dict)
        lengths_path = os.path.join(data_dir, f"{split}.lengths")

        with open(lengths_path, "r") as f:
            for line in f:
                sz = int(line.rstrip())
                assert (
                    sz >= min_sample_size
                ), f"Min sample size is not supported for binarized dataset, but found a sample with size {sz}"
                self.sizes.append(sz)

        self.sizes = np.array(self.sizes, dtype=np.int64)

        self.set_bucket_info(num_buckets)
        logger.info(f"loaded {len(self.fnames)} samples")
Example #30
    def load_dictionary(cls, args, filename, source=True):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol("<mask>")
        #cls.Q_token = dictionary.add_symbol("<Q>")
        #cls.A_token = dictionary.add_symbol("<A>")

        return dictionary
Example #31
 def setup_task(cls, args, **kwargs):
     paths = args.data.split(':')
     assert len(paths) > 0
     if 'bert' in args and args.bert:
         print('| bert dictionary')
         dictionary = BertDictionary()
     else:
         dictionary = Dictionary.load(os.path.join(paths[0],'dict.txt'))
     print('| dictionary: {} types'.format(len(dictionary)))
     if args.freq_weighted_replacement:
         print('| freq weighted mask replacement')
     return cls(args, dictionary)
Example #32
    def test_finalize(self):
        txt = [
            'A B C D',
            'B C D',
            'C D',
            'D',
        ]
        ref_ids1 = list(map(torch.IntTensor, [
            [4, 5, 6, 7, 2],
            [5, 6, 7, 2],
            [6, 7, 2],
            [7, 2],
        ]))
        ref_ids2 = list(map(torch.IntTensor, [
            [7, 6, 5, 4, 2],
            [6, 5, 4, 2],
            [5, 4, 2],
            [4, 2],
        ]))

        # build dictionary
        d = Dictionary()
        for line in txt:
            Tokenizer.tokenize(line, d, add_if_not_exist=True)

        def get_ids(dictionary):
            ids = []
            for line in txt:
                ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
            return ids

        def assertMatch(ids, ref_ids):
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary (finalize() re-sorts IDs by descending frequency)
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)