# Imports assumed at module level (fastNLP 0.4.x layout; exact module paths
# may differ between versions):
from typing import Union, Dict, Iterable

from fastNLP import Vocabulary
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataBundle
from fastNLP.io.embed_loader import EmbedLoader, EmbeddingOption
from fastNLP.io.utils import check_dataloader_paths


def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None,
            char_level_op=False):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    def wordtochar(words):
        # Lower-case each word and flatten it into characters, keeping an
        # empty string between words as a word-boundary marker.
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()  # drop the trailing boundary marker
        return chars

    input_name, target_name = 'words', 'target'

    # Optionally split each sentence into characters. Note that the
    # vocabularies below are still built on and index the 'words' field;
    # the 'chars' field is only added alongside it.
    if char_level_op:
        for dataset in datasets.values():
            dataset.apply_field(wordtochar, field_name='words', new_field_name='chars')

    # Source vocabulary is built on the training split only, then used to
    # index every split.
    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {'words': src_vocab, 'target': tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input('words')
        dataset.set_target('target')

    return info
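# Behavior check for the `wordtochar` helper above. This standalone copy
# (`_wordtochar_demo` is a name introduced here purely for illustration)
# mirrors the nested function, including its empty-string boundary marker:
def _wordtochar_demo(words):
    chars = []
    for word in words:
        word = word.lower()
        for char in word:
            chars.append(char)
        chars.append('')  # empty string marks a word boundary
    chars.pop()           # drop the trailing marker
    return chars

assert _wordtochar_demo(['Good', 'movie']) == \
    ['g', 'o', 'o', 'd', '', 'm', 'o', 'v', 'i', 'e']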
def process(self, paths,
            train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            src_embed_op: EmbeddingOption = None):
    input_name, target_name = 'words', 'target'
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

    info = DataBundle(datasets=self.load(paths))
    # Build the vocabularies only from the datasets named in `train_ds`;
    # fall back to all loaded datasets when it is not given.
    _train_ds = [info.datasets[name] for name in train_ds] \
        if train_ds else info.datasets.values()
    src_vocab.from_dataset(*_train_ds, field_name=input_name)
    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    src_vocab.index_dataset(*info.datasets.values(),
                            field_name=input_name, new_field_name=input_name)
    tgt_vocab.index_dataset(*info.datasets.values(),
                            field_name=target_name, new_field_name=target_name)
    info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

    if src_embed_op is not None:
        # `EmbeddingOption` behaves like a dict, so the vocabulary can be
        # attached as a key and the whole option unpacked into the loader call.
        src_embed_op.vocab = src_vocab
        init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
        info.embeddings[input_name] = init_emb

    for name, dataset in info.datasets.items():
        dataset.set_input(input_name)
        dataset.set_target(target_name)

    return info
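# Usage sketch for the variant above. Hedged: `IMDBLoader` and the path dict
# are illustrative assumptions, not names defined in this module.
#
#   loader = IMDBLoader()
#   bundle = loader.process(
#       {'train': 'train.txt', 'dev': 'dev.txt', 'test': 'test.txt'},
#       train_ds=['train'])  # vocabularies are built from 'train' only
#   assert 'words' in bundle.vocabs and 'target' in bundle.vocabs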
def process(self, paths: Union[str, Dict[str, str]],
            train_ds: Iterable[str] = None,
            src_vocab_op: VocabularyOption = None,
            tgt_vocab_op: VocabularyOption = None,
            embed_opt: EmbeddingOption = None,
            char_level_op=False,
            split_dev_op=True):
    paths = check_dataloader_paths(paths)
    info = DataBundle(datasets=self.load(paths))
    src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op)
    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)
    _train_ds = [info.datasets[name] for name in train_ds] \
        if train_ds else info.datasets.values()

    def wordtochar(words):
        # Lower-case each word and flatten it into characters, keeping an
        # empty string between words as a word-boundary marker.
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')
        chars.pop()  # drop the trailing boundary marker
        return chars

    input_name, target_name = 'words', 'target'
    info.vocabs = {}

    # Split words into characters. Note that in this branch no source
    # vocabulary is built; only the 'chars' field is added, yet 'words' is
    # still marked as input below.
    if char_level_op:
        for dataset in info.datasets.values():
            dataset.apply_field(wordtochar, field_name='words', new_field_name='chars')
        # if embed_opt is not None:
        #     embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab)
        #     info.embeddings['words'] = embed
    else:
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        src_vocab.index_dataset(*info.datasets.values(),
                                field_name=input_name, new_field_name=input_name)
        info.vocabs[input_name] = src_vocab

    tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
    tgt_vocab.index_dataset(*info.datasets.values(),
                            field_name=target_name, new_field_name=target_name)
    info.vocabs[target_name] = tgt_vocab

    if split_dev_op:
        # Carve 10% off the training set as a dev set (no shuffling).
        info.datasets['train'], info.datasets['dev'] = \
            info.datasets['train'].split(0.1, shuffle=False)

    for name, dataset in info.datasets.items():
        dataset.set_input('words')
        dataset.set_target('target')

    return info
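# Usage sketch for the variant above. Hedged: the loader instance and path
# are illustrative, and the 90/10 split assumes `DataSet.split(0.1)` returns
# (larger, smaller) in that order, as the assignment above relies on.
#
#   bundle = loader.process('data/', char_level_op=False, split_dev_op=True)
#   # bundle.datasets now holds both 'train' and 'dev', with 'dev' taken as
#   # the last 10% of the original training data (shuffle=False).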
def process(self, paths: Union[str, Dict[str, str]],
            src_vocab_opt: VocabularyOption = None,
            tgt_vocab_opt: VocabularyOption = None,
            src_embed_opt: EmbeddingOption = None):
    paths = check_dataloader_paths(paths)
    datasets = {}
    info = DataBundle()
    for name, path in paths.items():
        dataset = self.load(path)
        datasets[name] = dataset

    src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt)
    src_vocab.from_dataset(datasets['train'], field_name='words')
    src_vocab.index_dataset(*datasets.values(), field_name='words')

    tgt_vocab = Vocabulary(unknown=None, padding=None) \
        if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
    tgt_vocab.from_dataset(datasets['train'], field_name='target')
    tgt_vocab.index_dataset(*datasets.values(), field_name='target')

    info.vocabs = {'words': src_vocab, 'target': tgt_vocab}
    info.datasets = datasets

    if src_embed_opt is not None:
        embed = EmbedLoader.load_with_vocab(**src_embed_opt, vocab=src_vocab)
        info.embeddings['words'] = embed

    for name, dataset in info.datasets.items():
        dataset.set_input('words')
        dataset.set_target('target')

    return info
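# Usage sketch for the variant above. Hedged: `SSTLoader` and the GloVe file
# path are illustrative assumptions; `embed_filepath` is the usual
# `EmbeddingOption` key in fastNLP 0.4.x, but check your version.
#
#   loader = SSTLoader()
#   bundle = loader.process(
#       {'train': 'train.txt', 'test': 'test.txt'},
#       src_embed_opt=EmbeddingOption(embed_filepath='glove.6B.100d.txt'))
#   embed = bundle.embeddings['words']  # aligned with bundle.vocabs['words']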