def load(path, load_splits, src, dst):
    """Load the requested data splits (e.g. test, train or valid) from ``path``.

    The source and target dictionaries are read from ``dict.<src>.txt`` and
    ``dict.<dst>.txt``. For every name in ``load_splits`` the shards
    ``<split>``, ``<split>1``, ``<split>2``, ... are added to
    ``dataset.splits`` in order, stopping at the first shard for which
    ``IndexedInMemoryDataset.exists`` rejects the source-side path.

    Returns the populated ``LanguageDatasets`` instance.
    """
    pair = '{}-{}'.format(src, dst)

    def shard_file(name, lang):
        # Data files are named <shard>.<src>-<dst>.<lang> inside ``path``.
        return os.path.join(path, '{}.{}.{}'.format(name, pair, lang))

    source_vocab = Dictionary.load(os.path.join(path, 'dict.{}.txt'.format(src)))
    target_vocab = Dictionary.load(os.path.join(path, 'dict.{}.txt'.format(dst)))
    dataset = LanguageDatasets(src, dst, source_vocab, target_vocab)

    for split in load_splits:
        shard = 0
        while True:
            # The first shard carries no numeric suffix.
            suffix = shard if shard > 0 else ''
            name = "{}{}".format(split, suffix)
            source_file = shard_file(name, src)
            if not IndexedInMemoryDataset.exists(source_file):
                break
            source_data = IndexedInMemoryDataset(source_file)
            target_data = IndexedInMemoryDataset(shard_file(name, dst))
            # Padding and EOS indices are both taken from the source dictionary.
            dataset.splits[name] = LanguagePairDataset(
                source_data,
                target_data,
                pad_idx=dataset.src_dict.pad(),
                eos_idx=dataset.src_dict.eos(),
            )
            shard += 1

    return dataset
def load(path, src, dst):
    """Load the train, valid and test sets stored in the folder ``path``.

    Dictionaries come from ``dict.<src>.txt`` and ``dict.<dst>.txt``. Each of
    the three splits may be sharded as ``<split>``, ``<split>1``,
    ``<split>2``, ...; shards are read in order until
    ``IndexedInMemoryDataset.exists`` rejects the source-side path.

    Returns the populated ``LanguageDatasets`` instance.
    """
    lang_pair = '{}-{}'.format(src, dst)

    def in_folder(filename):
        return os.path.join(path, filename)

    vocab_src = Dictionary.load(in_folder('dict.{}.txt'.format(src)))
    vocab_dst = Dictionary.load(in_folder('dict.{}.txt'.format(dst)))
    dataset = LanguageDatasets(src, dst, vocab_src, vocab_dst)

    for split in ('train', 'valid', 'test'):
        for shard_no in itertools.count():
            # Shard 0 is the bare split name; later shards append their number.
            if shard_no > 0:
                shard_name = "{}{}".format(split, shard_no)
            else:
                shard_name = "{}{}".format(split, '')

            src_file = in_folder('{}.{}.{}'.format(shard_name, lang_pair, src))
            if not IndexedInMemoryDataset.exists(src_file):
                break

            dst_file = in_folder('{}.{}.{}'.format(shard_name, lang_pair, dst))
            src_data = IndexedInMemoryDataset(src_file)
            dst_data = IndexedInMemoryDataset(dst_file)
            # Both the padding value and EOS come from the source dictionary.
            dataset.splits[shard_name] = LanguagePairDataset(
                src_data,
                dst_data,
                padding_value=dataset.src_dict.pad(),
                eos=dataset.src_dict.eos(),
            )

    return dataset
def load_dictionaries(path, src_lang, dst_lang):
    """Return the ``(source, target)`` dictionaries for a language pair.

    Each dictionary is read with ``Dictionary.load`` from the file
    ``dict.<lang>.txt`` inside ``path``; the source one is loaded first.
    """
    def read_dict(lang):
        return Dictionary.load(os.path.join(path, 'dict.{}.txt'.format(lang)))

    source = read_dict(src_lang)
    target = read_dict(dst_lang)
    return source, target
def load_dictionaries(path, src_lang, dst_lang):
    """Load the source and target dictionaries of a language pair.

    Reads ``dict.<src_lang>.txt`` and then ``dict.<dst_lang>.txt`` from
    ``path`` via ``Dictionary.load`` and returns them as a 2-tuple in that
    order.
    """
    src_file = os.path.join(path, 'dict.{}.txt'.format(src_lang))
    dst_file = os.path.join(path, 'dict.{}.txt'.format(dst_lang))
    loaded_src = Dictionary.load(src_file)
    loaded_dst = Dictionary.load(dst_file)
    return loaded_src, loaded_dst
def check_sentence(sentence, token_dict=None,
                   dict_path='/home/nishit/quick-edit/data/iwslt14.tokenized.de-en/dict.en.txt'):
    """Print the dictionary entry for every index in ``sentence``.

    Args:
        sentence: iterable of indices used to subscript ``token_dict``.
        token_dict: object supporting ``token_dict[i]``. When ``None``, a
            dictionary is loaded from ``dict_path`` with ``Dictionary.load``.
        dict_path: file to load when ``token_dict`` is not supplied. The
            default is the machine-specific path that used to be hard-coded
            in the body, so existing callers behave as before.

    Each looked-up entry is printed followed by a single space; the output
    ends with a newline and one blank line. Nothing is returned.
    """
    # Identity test on purpose: ``== None`` would dispatch to the
    # dictionary object's own ``__eq__``, which may not accept None.
    if token_dict is None:
        token_dict = Dictionary.load(dict_path)

    for index in sentence:
        print(token_dict[index], end=' ')

    # Emits the explicit newline character plus print's own line ending.
    print('\n')
def load_dictionaries(path, src_lang, dst_lang):
    """Load both dictionaries of a language pair from ``path``.

    For the source and then the target language, ``dict.<lang>.txt`` is read
    through ``Dictionary.load``. The result is the tuple
    ``(src_dict, dst_dict)``.
    """
    loaded = []
    for lang in (src_lang, dst_lang):
        dict_file = os.path.join(path, 'dict.{}.txt'.format(lang))
        loaded.append(Dictionary.load(dict_file))
    return loaded[0], loaded[1]