def create_lang_dictionary(cls, langs): unk = "<unk>" # hack to remove symbols other than unk as they are not needed by lang dict lang_dict = Dictionary(pad=unk, eos=unk, unk=unk, bos=unk) for lang in langs: lang_dict.add_symbol(lang) return lang_dict
def setup_task(cls, args, **kwargs): """Setup the task. """ dictionary = Dictionary() for i in range(args.dict_size): dictionary.add_symbol("word{}".format(i)) logger.info("dictionary: {} types".format(len(dictionary))) return cls(args, dictionary)
def build_dictionary( cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8 ): d = BertDictionary() for filename in filenames: Dictionary.add_file_to_dictionary( filename, d, tokenizer.tokenize_line, workers ) d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) return d
def setup_task(cls, args, **kwargs): """Setup the task. """ dictionary = Dictionary() for i in range(args.dict_size): dictionary.add_symbol("word{}".format(i)) logger.info("dictionary: {} types".format(len(dictionary))) args.max_source_positions = args.src_len + dictionary.pad() + 2 args.max_target_positions = args.tgt_len + dictionary.pad() + 2 return cls(args, dictionary)
class DummyLMTask(FairseqTask): def __init__(self, cfg: DummyLMConfig): super().__init__(cfg) # load dictionary self.dictionary = Dictionary() for i in range(cfg.dict_size): self.dictionary.add_symbol("word{}".format(i)) self.dictionary.pad_to_multiple_(8) # often faster if divisible by 8 logger.info("dictionary: {} types".format(len(self.dictionary))) seq = torch.arange(cfg.tokens_per_sample + 1) + self.dictionary.pad() + 1 self.dummy_src = seq[:-1] self.dummy_tgt = seq[1:] def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ if self.cfg.batch_size is not None: bsz = self.cfg.batch_size else: bsz = max(1, self.cfg.max_tokens // self.cfg.tokens_per_sample) self.datasets[split] = DummyDataset( { "id": 1, "net_input": { "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]), "src_lengths": torch.full( (bsz, ), self.cfg.tokens_per_sample, dtype=torch.long), }, "target": torch.stack([self.dummy_tgt for _ in range(bsz)]), "nsentences": bsz, "ntokens": bsz * self.cfg.tokens_per_sample, }, num_items=self.cfg.dataset_size, item_size=self.cfg.tokens_per_sample, ) @property def source_dictionary(self): return self.dictionary @property def target_dictionary(self): return self.dictionary
def __init__(self, cfg: DummyLMConfig): super().__init__(cfg) # load dictionary self.dictionary = Dictionary() for i in range(cfg.dict_size): self.dictionary.add_symbol("word{}".format(i)) self.dictionary.pad_to_multiple_(8) # often faster if divisible by 8 logger.info("dictionary: {} types".format(len(self.dictionary))) seq = torch.arange(cfg.tokens_per_sample + 1) + self.dictionary.pad() + 1 self.dummy_src = seq[:-1] self.dummy_tgt = seq[1:]
def load_dictionary(cls, filename): """Load the dictionary from the filename Args: filename (str): the filename """ return Dictionary.load(filename)
def setup_task(cls, args, **kwargs): """Setup the task.""" dictionary = Dictionary.load(os.path.join(args.data, "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) if not hasattr(args, "shuffle_instance"): args.shuffle_instance = False return cls(args, dictionary)
def augment_dictionary( dictionary: Dictionary, language_list: List[str], lang_tok_style: str, langtoks_specs: Sequence[str] = (LangTokSpec.main.value, ), extra_data: Optional[Dict[str, str]] = None, ) -> None: for spec in langtoks_specs: for language in language_list: dictionary.add_symbol( get_lang_tok(lang=language, lang_tok_style=lang_tok_style, spec=spec)) if lang_tok_style == LangTokStyle.mbart.value or ( extra_data is not None and LangTokSpec.mono_dae.value in extra_data): dictionary.add_symbol("<mask>")
def load_dictionary(cls, args, filename, source=True): """Load the dictionary from the filename Args: filename (str): the filename """ dictionary = Dictionary.load(filename) dictionary.add_symbol("<mask>") return dictionary
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8): """Build the dictionary Args: filenames (list): list of filenames workers (int): number of concurrent workers threshold (int): defines the minimum word count nwords (int): defines the total number of words in the final dictionary, including special symbols padding_factor (int): can be used to pad the dictionary size to be a multiple of 8, which is important on some hardware (e.g., Nvidia Tensor Cores). """ d = Dictionary() for filename in filenames: Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers) d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) return d
def setup_dictionary(cls, args, **kwargs): dictionary = None output_dictionary = None if args.data: paths = utils.split_paths(args.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) output_dictionary = dictionary if args.output_dictionary_size >= 0: output_dictionary = TruncatedDictionary( dictionary, args.output_dictionary_size ) return (dictionary, output_dictionary)
def setup_task(cls, args, **kwargs): data_cfg = S2TDataConfig(op.join(args.data, args.config_yaml)) dict_path = op.join(args.data, data_cfg.vocab_filename) if not op.isfile(dict_path): raise FileNotFoundError(f"Dict not found: {dict_path}") tgt_dict = Dictionary.load(dict_path) logger.info(f"dictionary size ({data_cfg.vocab_filename}): " f"{len(tgt_dict):,}") if getattr(args, "train_subset", None) is not None: if not all( s.startswith("train") for s in args.train_subset.split(",")): raise ValueError('Train splits should be named like "train*".') return cls(args, tgt_dict)
def load_target_dictionary(self): if self.cfg.labels: dict_path = os.path.join(self.cfg.data, f"dict.{self.cfg.labels}.txt") return Dictionary.load(dict_path) return None
def _lang_id(dic: Dictionary, lang: str): """Return language ID index.""" idx = dic.index(lang) assert idx != dic.unk_index, "cannot find language ID for lang {}".format( lang) return idx
def padding_idx(self): return Dictionary().pad() if self.vocab is None else self.vocab.pad()
def setup_task(cls, args, **kwargs): paths = utils.split_paths(args.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) return cls(args, dictionary)
def _lang_token_index(dic: Dictionary, lang: str): """Return language token index.""" idx = dic.index(_lang_token(lang)) assert idx != dic.unk_index, "cannot find language token for lang {}".format( lang) return idx