Example #1
0
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        """Binarize ``input_prefix`` (plus ``.lang`` suffix, if any) with
        ``vocab`` and write an indexed dataset under ``output_prefix``.

        The input is split into ``num_workers`` byte ranges: chunk 0 is
        processed in this process while chunks 1..N-1 run in a worker pool,
        each writing a temporary dataset that is merged back in afterwards.
        """
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        # [nseq, ntok]; a list so the nested callback can mutate it.
        n_seq_tok = [0, 0]
        replaced = Counter()  # token -> count of replacements by the unk word

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        # Byte offsets splitting the input into num_workers chunks.
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.make_builder(dataset_dest_file(
            args, output_prefix, lang, "bin"),
                                          impl=args.dataset_impl,
                                          vocab_size=len(vocab))
        # Process the first chunk in this process while the pool runs.
        merge_result(
            Binarizer.binarize(input_file,
                               vocab,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            # Guard against ZeroDivisionError on an empty input file.
            100 * sum(replaced.values()) / max(n_seq_tok[1], 1),
            vocab.unk_word,
        ))
Example #2
0
    def make_binary_da_dataset(input_prefix, output_prefix, lang, num_workers,
                               da_mapping):
        """Binarize ``input_prefix`` (plus ``.lang`` suffix, if any) into an
        indexed dataset under ``output_prefix``, passing ``da_mapping``
        (domain-adaptation indexes) to the binarizer.

        NOTE(review): ``merge_result`` only accumulates "nseq" here, and
        ``replaced``/``n_seq_tok[1]`` are initialized but never reported —
        confirm whether the token statistics were intentionally dropped.
        """
        logger.info("Adding domain indexes")
        # [nseq, ntok]; a list so the nested callback can mutate it.
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            # Only the sequence count is merged for this dataset type.
            n_seq_tok[0] += worker_result["nseq"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        # Byte offsets splitting the input into num_workers chunks.
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        # TODO: Error encounters if num_workers>1:
        #  No such file or directory: 'data-bin/iwslt14.tokenized.de-en/train.da1.en-de.en.idx'
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize_da,
                    (
                        args,
                        input_file,
                        prefix,
                        lang,
                        da_mapping,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        # Builder for the final .bin file; worker outputs are merged into it.
        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
        )
        # Chunk 0 is processed in this process while the pool runs.
        merge_result(
            Binarizer.binarize_da(input_file,
                                  lambda t: ds.add_item(t),
                                  da_mapping,
                                  offset=0,
                                  end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
Example #3
0
    def make_binary_alignment_dataset(input_prefix, output_prefix,
                                      num_workers):
        """Binarize an alignment file into an indexed dataset.

        Chunk 0 is handled in this process; the remaining chunks are
        dispatched to a worker pool and merged back in afterwards.
        """
        # Single-element list so the callback below can mutate the count.
        nseq = [0]

        def merge_result(worker_result):
            nseq[0] += worker_result["nseq"]

        input_file = input_prefix
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        parallel = num_workers > 1
        if parallel:
            pool = Pool(processes=num_workers - 1)
            for wid in range(1, num_workers):
                chunk_prefix = "{}{}".format(output_prefix, wid)
                pool.apply_async(
                    binarize_alignments,
                    (
                        args,
                        input_file,
                        utils.parse_alignment,
                        chunk_prefix,
                        offsets[wid],
                        offsets[wid + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, None, "bin"),
            impl=args.dataset_impl,
        )

        # Process the first chunk locally while the pool runs.
        first_chunk_result = Binarizer.binarize_alignments(
            input_file,
            utils.parse_alignment,
            lambda t: ds.add_item(t),
            offset=0,
            end=offsets[1],
        )
        merge_result(first_chunk_result)

        if parallel:
            pool.join()
            for wid in range(1, num_workers):
                chunk_prefix = "{}{}".format(output_prefix, wid)
                tmp_path = dataset_dest_prefix(args, chunk_prefix, None)
                ds.merge_file_(tmp_path)
                os.remove(indexed_dataset.data_file_path(tmp_path))
                os.remove(indexed_dataset.index_file_path(tmp_path))

        ds.finalize(dataset_dest_file(args, output_prefix, None, "idx"))

        logger.info("[alignments] {}: parsed {} alignments".format(
            input_file, nseq[0]))
Example #4
0
    def make_binary_dataset(input_file, output_prefix, dic, num_workers):
        """Binarize ``input_file`` with dictionary ``dic`` into an indexed
        dataset written under ``output_prefix``.

        Chunk 0 is processed in this process; chunks 1..num_workers-1 are
        binarized by a worker pool into temporary datasets that are merged
        back in afterwards.
        """
        # [nseq, ntok]; a list so the nested callback can mutate it.
        n_seq_tok = [0, 0]
        replaced = Counter()  # token -> count of replacements by the unk word

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # Byte offsets splitting the input into num_workers chunks.
        offsets = args.tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        prefix,
                        dic,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            output_file(args, output_prefix, "bin"))
        # Process the first chunk in this process while the pool runs.
        merge_result(
            args.tokenizer.binarize(input_file,
                                    dic,
                                    lambda t: ds.add_item(t),
                                    offset=0,
                                    end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = output_file(args, prefix, '')
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(output_file(args, output_prefix, 'idx'))

        print("| {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            input_file, n_seq_tok[0], n_seq_tok[1],
            # Guard against ZeroDivisionError on an empty input file.
            100 * sum(replaced.values()) / max(n_seq_tok[1], 1),
            dic.unk_word))
Example #5
0
    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        """Binarize ``input_prefix.lang`` with the dictionary loaded from
        ``dict_path(lang)`` and write an indexed dataset under
        ``output_prefix``.

        Fixes: the loaded dictionary no longer shadows the ``dict`` builtin,
        a leftover debug print of the offsets was removed, and the
        replaced-token percentage guards against division by zero.
        """
        vocab = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))
        # [nseq, ntok]; a list so the nested callback can mutate it.
        n_seq_tok = [0, 0]
        replaced = Counter()  # token -> count of replacements by the unk word

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        # Byte offsets splitting the input into num_workers chunks.
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        # Process the first chunk in this process while the pool runs.
        merge_result(
            Tokenizer.binarize(input_file,
                               vocab,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            # Guard against ZeroDivisionError on an empty input file.
            100 * sum(replaced.values()) / max(n_seq_tok[1], 1),
            vocab.unk_word))
Example #6
0
    def make_binary_alignment_dataset(input_prefix, output_prefix, src, tgt,
                                      num_workers):
        """Binarize an alignment file into an indexed dataset for the
        ``(src, tgt)`` language pair.
        """
        # Single-element list so the nested callback can mutate the count.
        nseq = [0]

        def merge_result(worker_result):
            nseq[0] += worker_result['nseq']

        # Plain def instead of a lambda assignment (PEP 8 E731); parses a
        # whitespace-separated list of ints into a flat IntTensor.
        def parse_alignment(s):
            return torch.IntTensor([int(t) for t in s.split()])

        input_file = input_prefix
        # Byte offsets splitting the input into num_workers chunks.
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = multiprocessing.Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize_alignments,
                    (args, input_file, parse_alignment, prefix, src, tgt,
                     offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result)
            pool.close()

        ds = indexed_dataset.make_builder(dataset_dest_file(
            args, output_prefix, src, tgt, None, "bin"),
                                          impl=args.dataset_impl)

        # Process the first chunk in this process while the pool runs.
        merge_result(
            Binarizer.binarize_alignments(input_file,
                                          parse_alignment,
                                          lambda t: ds.add_item(t),
                                          offset=0,
                                          end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, src, tgt)
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(
            dataset_dest_file(args, output_prefix, src, tgt, None, "idx"))

        print("| [alignments] {}: parsed {} alignments".format(
            input_file, nseq[0]))
    def append(self, other_dataset_path_prefix: str) -> None:
        """
        Append an existing dataset to this one.

        Beware: the other dataset must have been built with the same coder,
        otherwise the resulting data is inconsistent. Pointers from the
        other index are shifted by the current data length so they remain
        valid after concatenation.
        """
        index_path = indexed_dataset.index_file_path(other_dataset_path_prefix)
        other_index = HuffmanMMapIndex(index_path)
        base = self._data_len
        for ptr, size in other_index:
            self._ptrs.append(ptr + base)
            self._sizes.append(size)

        # Concatenate the raw data payload onto our open data file.
        data_path = indexed_dataset.data_file_path(other_dataset_path_prefix)
        with open(data_path, "rb") as src:
            shutil.copyfileobj(src, self._data_file)

        self._data_len += other_index.data_len
 def open(self):
     """Write the coder's vocab file and open the data file for writing."""
     vocab_path = vocab_file_path(self._path_prefix)
     self._coder.to_file(vocab_path)
     data_path = indexed_dataset.data_file_path(self._path_prefix)
     self._data_file = open(data_path, "wb")
 def exists(prefix_path):
     """Return True iff the index, data, and vocab files all exist."""
     required_paths = (
         indexed_dataset.index_file_path(prefix_path),
         indexed_dataset.data_file_path(prefix_path),
         vocab_file_path(prefix_path),
     )
     return all(PathManager.exists(p) for p in required_paths)
Example #10
0
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, avoid_tokenize=False):
        """Binarize ``input_prefix`` (plus ``.lang`` suffix, if any) into an
        indexed dataset under ``output_prefix``.

        ``vocab`` may be ``None``, in which case only string splitting is
        performed instead of dictionary lookup. ``avoid_tokenize`` is
        forwarded to the binarizer.
        """
        if vocab is not None:
            print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        else:
            print('| Using None Dictionary and only string split is performed.')

        # [nseq, ntok]; a list so the nested callback can mutate it.
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        # Byte offsets splitting the input into num_workers chunks.
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                # TODO: worker > 1 is not working for map dataset
                if args.input_mapping is True:
                    raise NotImplementedError("Worker > 1 is not implemented for map dataset yet.")
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                        avoid_tokenize,
                    ),
                    callback=merge_result,
                )
            pool.close()

        # NOTE(review): vocab_size=-1 presumably signals "no dictionary" to
        # the builder — verify against make_builder's implementation.
        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
            vocab_size=len(vocab) if vocab is not None else -1,
        )
        # Chunk 0 is processed in this process while the pool runs.
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1], avoid_tokenize=avoid_tokenize,
            )
        )
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        # Some vocab implementations expose `unk_token` instead of `unk_word`.
        if vocab is not None:
            unk = vocab.unk_word if hasattr(vocab, 'unk_word') else vocab.unk_token
        else:
            unk = ""
        logger.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                unk,
            )
        )
Example #11
0
    def multiprocess_dataset(
        cls,
        input_file: str,
        dataset_impl: str,
        binarizer: Binarizer,
        output_prefix: str,
        vocab_size=None,
        num_workers=1,
    ) -> BinarizeSummary:
        """Binarize ``input_file`` with ``binarizer`` across ``num_workers``
        processes and return the merged :class:`BinarizeSummary`.

        Chunks 1..N-1 are binarized in a pool, each into its own finalized
        dataset at ``_worker_prefix(output_prefix, worker_id)``; chunk 0 is
        binarized in this process and the worker outputs are merged into it.
        """
        final_summary = BinarizeSummary()

        offsets = find_offsets(input_file, num_workers)
        # find_offsets returns a list of position [pos1, pos2, pos3, pos4] but we would want pairs:
        # [(pos1, pos2), (pos2, pos3), (pos3, pos4)] to process the chunks with start/end info
        # we zip the list with itself shifted by one to get all the pairs.
        (first_chunk, *more_chunks) = zip(offsets, offsets[1:])
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            worker_results = [
                pool.apply_async(
                    cls._binarize_chunk_and_finalize,
                    args=(
                        binarizer,
                        input_file,
                        start_offset,
                        end_offset,
                        _worker_prefix(
                            output_prefix,
                            worker_id,
                        ),
                        dataset_impl,
                    ),
                    # Only forward vocab_size when the caller supplied one.
                    kwds={
                        "vocab_size": vocab_size,
                    }
                    if vocab_size is not None
                    else {},
                )
                for worker_id, (start_offset, end_offset) in enumerate(
                    more_chunks, start=1
                )
            ]

            pool.close()
            pool.join()
            # Collect each worker's summary into the final one.
            for r in worker_results:
                summ = r.get()
                final_summary.merge(summ)

        # do not close the bin file as we need to merge the worker results in
        final_ds, summ = cls._binarize_file_chunk(
            binarizer,
            input_file,
            offset_start=first_chunk[0],
            offset_end=first_chunk[1],
            output_prefix=output_prefix,
            dataset_impl=dataset_impl,
            vocab_size=vocab_size if vocab_size is not None else None,
        )
        final_summary.merge(summ)

        if num_workers > 1:
            for worker_id in range(1, num_workers):
                # merge the worker outputs
                worker_output_prefix = _worker_prefix(
                    output_prefix,
                    worker_id,
                )
                final_ds.merge_file_(worker_output_prefix)
                # Cleanup is best-effort: a failed removal is logged, not fatal.
                try:
                    os.remove(indexed_dataset.data_file_path(worker_output_prefix))
                    os.remove(indexed_dataset.index_file_path(worker_output_prefix))
                except Exception as e:
                    logger.error(
                        f"couldn't remove {worker_output_prefix}.*", exc_info=e
                    )

        #  now we can close the file
        idx_file = indexed_dataset.index_file_path(output_prefix)
        final_ds.finalize(idx_file)
        return final_summary
Example #12
0
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
        """Binarize ``input_prefix`` (plus ``.lang`` suffix, if any) with
        ``vocab`` into an indexed dataset under ``output_prefix`` and return
        the word sequences collected during in-process binarization.

        NOTE(review): ``words_list`` is only filled by the chunk processed
        in this process; worker chunks (num_workers > 1) do not contribute
        to it — confirm that is intended.
        """
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        # [nseq, ntok]; a list so the nested callback can mutate it.
        n_seq_tok = [0, 0]
        replaced = Counter()  # tokens replaced by the unk word
        copyied = Counter()   # unk tokens copied from the source side

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            copyied.update(worker_result["copied"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        # Byte offsets splitting the input into num_workers chunks.
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:  # todo: not support copy 
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                    callback=merge_result  
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )
        words_list = []

        def binarize_consumer(ids, words):
            # Store the id tensor in the dataset and keep the raw words for
            # the caller.
            ds.add_item(ids)
            words_list.append(words)

        # Chunk 0 is processed in this process while the pool runs.
        merge_result(
            Binarizer.binarize(
                input_file, vocab, binarize_consumer,
                offset=0, end=offsets[1], copy_ext_dict=args.copy_ext_dict, copy_src_words=copy_src_words
            )
        )
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                # Merge each worker's temporary dataset, then delete it.
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print(
            "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
                100 * sum(copyied.values()) / n_seq_tok[1]
            )
        )

        return words_list