Example #1
0
def load(path, load_splits, src, dst):
    """Build a LanguageDatasets object for the src-dst pair from files in path.

    The two dictionaries are read from ``dict.<src>.txt`` and ``dict.<dst>.txt``.
    For every name in ``load_splits`` the shards ``<split>``, ``<split>1``,
    ``<split>2``, ... are probed in order and each one found is stored in
    ``dataset.splits`` under its shard name; probing of a split ends at the
    first shard whose source-side file does not exist.
    """

    def resolve(template, *fields):
        # Fill in the file-name template and place it under the data directory.
        return os.path.join(path, template.format(*fields))

    pair = '{}-{}'.format(src, dst)

    dataset = LanguageDatasets(
        src,
        dst,
        Dictionary.load(resolve('dict.{}.txt', src)),
        Dictionary.load(resolve('dict.{}.txt', dst)),
    )

    for split in load_splits:
        shard = 0
        while True:
            # Shard 0 carries no numeric suffix ("train", then "train1", ...).
            name = "{}{}".format(split, shard or '')
            source_file = resolve('{}.{}.{}', name, pair, src)
            if not IndexedInMemoryDataset.exists(source_file):
                break

            target_file = resolve('{}.{}.{}', name, pair, dst)
            dataset.splits[name] = LanguagePairDataset(
                IndexedInMemoryDataset(source_file),
                IndexedInMemoryDataset(target_file),
                pad_idx=dataset.src_dict.pad(),
                eos_idx=dataset.src_dict.eos(),
            )
            shard += 1

    return dataset
Example #2
0
def load(path, src, dst):
    """Read the train, valid and test splits for src-dst found under path.

    Dictionaries come from ``dict.<src>.txt`` / ``dict.<dst>.txt``. Each split
    may be sharded as ``<split>``, ``<split>1``, ``<split>2``, ...; shards are
    added to ``dataset.splits`` until a source-side file is missing.
    """

    def in_dir(template, *parts):
        # Expand the template and join it onto the data directory.
        return os.path.join(path, template.format(*parts))

    pair = '{}-{}'.format(src, dst)

    source_vocab = Dictionary.load(in_dir('dict.{}.txt', src))
    target_vocab = Dictionary.load(in_dir('dict.{}.txt', dst))
    dataset = LanguageDatasets(src, dst, source_vocab, target_vocab)

    for split in ('train', 'valid', 'test'):
        for shard in itertools.count():
            # The first shard is the bare split name; later ones get a number.
            shard_name = split if shard == 0 else "{}{}".format(split, shard)
            source_file = in_dir('{}.{}.{}', shard_name, pair, src)

            if IndexedInMemoryDataset.exists(source_file):
                target_file = in_dir('{}.{}.{}', shard_name, pair, dst)
                dataset.splits[shard_name] = LanguagePairDataset(
                    IndexedInMemoryDataset(source_file),
                    IndexedInMemoryDataset(target_file),
                    padding_value=dataset.src_dict.pad(),
                    eos=dataset.src_dict.eos(),
                )
            else:
                # No more shards for this split.
                break

    return dataset
Example #3
0
def load_dictionaries(path, src_lang, dst_lang):
    """Return ``(src_dict, dst_dict)`` read from ``dict.<lang>.txt`` in path."""

    def read_vocab(lang):
        # One dictionary file per language, named after the language code.
        return Dictionary.load(
            os.path.join(path, 'dict.{}.txt'.format(lang)))

    return read_vocab(src_lang), read_vocab(dst_lang)
Example #4
0
def load_dictionaries(path, src_lang, dst_lang):
    """Return the source and target dictionaries stored under path.

    Each dictionary is read from ``dict.<lang>.txt`` via ``Dictionary.load``;
    the source one is loaded first.
    """
    # NOTE(review): disabled vector_dict hooks (src_dict.indices, reverse(),
    # add_vector(), create_vec_dict()) were commented out here and are not run.
    loaded = []
    for lang in (src_lang, dst_lang):
        vocab_file = os.path.join(path, 'dict.{}.txt'.format(lang))
        loaded.append(Dictionary.load(vocab_file))
    src_dict, dst_dict = loaded
    return src_dict, dst_dict
Example #5
0
def check_sentence(sentence, token_dict=None):
    """Print the entries of ``token_dict`` selected by the items of ``sentence``.

    Args:
        sentence: iterable of keys/indices; each item ``i`` is looked up as
            ``token_dict[i]``.
        token_dict: any object supporting ``token_dict[i]``. When it is None,
            a ``Dictionary`` is loaded from the hard-coded ``dict.en.txt``
            path below.

    The looked-up values are printed on one line, each followed by a single
    space, and then ``print('\\n')`` ends the line and emits one blank line.
    """
    # Identity check rather than ``== None``: equality would dispatch to the
    # argument's own __eq__, which can raise (a class comparing attributes of
    # ``other``) or return a non-boolean (e.g. a numpy array).
    if token_dict is None:
        token_dict = Dictionary.load(
            '/home/nishit/quick-edit/data/iwslt14.tokenized.de-en/dict.en.txt')

    for index in sentence:
        print(token_dict[index], end=' ')
    print('\n')
Example #6
0
def load_dictionaries(path, src_lang, dst_lang):
    """Load the ``dict.<lang>.txt`` dictionaries for both sides of a pair.

    Returns a 2-tuple ``(source dictionary, target dictionary)``.
    """
    template = 'dict.{}.txt'
    return tuple(
        Dictionary.load(os.path.join(path, template.format(lang)))
        for lang in (src_lang, dst_lang)
    )