Example #1
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        # paths = self.args.data.split(':')
        # assert len(paths) > 0
        # data_path = paths[epoch % len(paths)]
        from fairseq.data.cvit.utils import pairs_select
        pairs = pairs_select(self.data['corpora'], split)

        # infer langcode
        src, tgt = self.args.source_lang, self.args.target_lang

        from ilmulti.sentencepiece import SentencePieceTokenizer
        tokenizer = SentencePieceTokenizer(self.data['hard_coded_dict'])

        # tokenizer test
        # tokens = tokenizer("Hello World !", lang = 'en')

        self.datasets[split] = load_langpair_dataset(
            pairs,
            self.src_dict,
            tokenizer,
            combine=combine,
            dataset_impl=self.args.dataset_impl,
            upsample_primary=self.args.upsample_primary,
            left_pad_source=self.args.left_pad_source,
            left_pad_target=self.args.left_pad_target,
            max_source_positions=self.args.max_source_positions,
            max_target_positions=self.args.max_target_positions,
        )
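The commented-out test in this snippet hints at how the tokenizer is exercised once constructed; below is a minimal standalone sketch of that call, assuming a default-constructed SentencePieceTokenizer (as in Examples #4 and #5) behaves like the one built from the hard-coded dict here.

from ilmulti.sentencepiece import SentencePieceTokenizer

# Default construction is an assumption; this example passes self.data['hard_coded_dict'].
tokenizer = SentencePieceTokenizer()

# Mirrors the commented tokenizer test above: segment an English sentence.
tokens = tokenizer("Hello World !", lang='en')
print(tokens)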
Example #2
def build_corpus(corpus, config, rebuild=False):
    from ilmulti.sentencepiece import SentencePieceTokenizer
    tokenizer = SentencePieceTokenizer(config)
    if not LMDBCorpus.exists(corpus):
        print("LMDB({}) does not exist. Building".format(corpus.path))
        raw_dataset = _CVITIndexedRawTextDataset(corpus, tokenizer)
        writer = LMDBCorpusWriter(raw_dataset)
        writer.close()
        print("Built LMDB({})".format(corpus.path))
Example #3
    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        print("This is the split", split)

        from fairseq.data.cvit.utils import monoling_select
        dataset = monoling_select(self.data['corpora'], split)

        from ilmulti.sentencepiece import SentencePieceTokenizer

        hard_code_dict = self.data['hard_coded_dict']

        tokenizer = SentencePieceTokenizer(hard_code_dict)
        dataset = CVITIndexedRawTextDataset(dataset, tokenizer,
                                            self.dictionary)

        if dataset is None:
            raise FileNotFoundError('Dataset not found: {}'.format(split))

        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )

        add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none'

        self.datasets[split] = MonolingualDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            add_bos_token=self.args.add_bos_token,
        )
Example #4
from ilmulti.dataset import MonolingualDataset, ParallelDataset, MultilingualDataset
from ilmulti.dataset.torch import TensorParallelDataset
from ilmulti.dataset import AgnosticTokenizedDataset
from ilmulti.filters import PairDetect
from ilmulti.sentencepiece import SentencePieceTokenizer
from ilmulti.dataset.torch import TensorMultiDataset
import os
from ilmulti.dataset import ParallelWriter
from ilmulti.dataset import FakeParallelDataset
from ilmulti.utils import canonicalize
from tqdm import tqdm, trange

# Create tokenizer

tokenizer = SentencePieceTokenizer()
# exit()

# Declare datasets
mininterval = 40
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument('--lang', required=True, type=str)
parser.add_argument('--output', required=True, type=str)
args = parser.parse_args()

if not os.path.exists(args.output):
    os.makedirs(args.output)

dictionary = tokenizer.dictionary()
dictionary.save(os.path.join(args.output, "vocab.dict"))
Example #5
from ilmulti.dataset import MonolingualDataset, ParallelDataset, MultilingualDataset
from ilmulti.dataset import AgnosticTokenizedDataset
from ilmulti.filters import PairDetect
from ilmulti.sentencepiece import SentencePieceTokenizer
import os
from ilmulti.dataset import ParallelWriter
from ilmulti.dataset import FakeParallelDataset
from tqdm import tqdm

# Create tokenizer

tokenizer = SentencePieceTokenizer()

# Declare datasets


class Collector(set):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def add(self, pset):
        super().add(pset)
        if not pset.is_mono():
            first, second = pset.get_mono_as_parallel()
            super().add(first)
            super().add(second)


pairs = Collector()
# 1: ILCI
# C(N, 2) Pairs.
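A minimal sketch of the Collector semantics above, using a hypothetical stub in place of the real pair-set objects (is_mono and get_mono_as_parallel are the only methods the Collector relies on).

class FakePairSet:
    # Hypothetical stand-in for the real pair-set type used by the corpus machinery.
    def __init__(self, name, mono=False):
        self.name = name
        self.mono = mono

    def is_mono(self):
        return self.mono

    def get_mono_as_parallel(self):
        # Split a parallel pair into its two monolingual halves.
        return (FakePairSet(self.name + '.src', mono=True),
                FakePairSet(self.name + '.tgt', mono=True))


demo = Collector()
demo.add(FakePairSet('ilci.hi-en'))   # adds the pair plus its two mono halves
print(len(demo))                      # -> 3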