Exemple #1
0
    def make_binary_da_dataset(input_prefix, output_prefix, lang, num_workers,
                               da_mapping):
        """Binarize the domain-index file for ``lang`` into one indexed dataset.

        The input file is cut into ``num_workers`` ranges using the offsets
        returned by ``Binarizer.find_offsets``. The calling process handles
        the first range and writes straight into the final builder; ranges
        ``1 .. num_workers - 1`` are handled by pool workers running
        ``binarize_da``, each writing a temporary dataset that is merged into
        the final one and then deleted.

        ``args`` is read from the enclosing scope.

        Args:
            input_prefix: Path of the input file without the language suffix.
            output_prefix: Name prefix of the output dataset.
            lang: Language suffix, or ``None`` when the input file has none.
            num_workers: Total number of ranges to process (including the
                one handled by the calling process).
            da_mapping: Mapping forwarded unchanged to
                ``Binarizer.binarize_da``.

        Raises:
            Exception: Any exception raised inside a worker task is re-raised
                here after the pool has been joined.
        """
        logger.info("Adding domain indexes")
        n_seq_tok = [0, 0]

        def merge_result(worker_result):
            n_seq_tok[0] += worker_result["nseq"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        pending = []
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                # binarize_da takes (args, filename, output_prefix,
                # da_mapping, offset, end). Passing ``lang`` as an extra
                # positional argument made every task fail with a TypeError
                # that apply_async swallowed, so no temporary file was ever
                # written and the merge below died with "No such file or
                # directory: ...idx".
                pending.append(
                    pool.apply_async(
                        binarize_da,
                        (
                            args,
                            input_file,
                            prefix,
                            da_mapping,
                            offsets[worker_id],
                            offsets[worker_id + 1],
                        ),
                        callback=merge_result,
                    ))
            pool.close()

        ds = indexed_dataset.make_builder(
            dataset_dest_file(args, output_prefix, lang, "bin"),
            impl=args.dataset_impl,
        )
        merge_result(
            Binarizer.binarize_da(input_file,
                                  ds.add_item,
                                  da_mapping,
                                  offset=0,
                                  end=offsets[1]))
        if num_workers > 1:
            pool.join()
            # Surface worker failures instead of silently continuing; get()
            # re-raises whatever exception the task raised.
            for async_result in pending:
                async_result.get()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                # Workers write their files with lang=None (see binarize_da),
                # so the temporary prefix must be built the same way.
                # NOTE(review): assumes dataset_dest_file(..., ext) is
                # dataset_dest_prefix(...) + "." + ext -- confirm.
                temp_file_path = dataset_dest_prefix(args, prefix, None)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
Exemple #2
0
def binarize_da(args, filename, output_prefix, da_mapping, offset, end):
    """Binarize one range of ``filename`` into its own indexed dataset.

    A dataset builder is created for the ``bin`` destination derived from
    ``output_prefix`` (with no language component), every item produced by
    ``Binarizer.binarize_da`` for the range ``offset``..``end`` is appended
    to it, and the builder is finalized against the matching ``idx``
    destination.

    Args:
        args: Options object; ``args.dataset_impl`` selects the builder
            implementation and ``args`` is forwarded to ``dataset_dest_file``.
        filename: Path of the input file to read.
        output_prefix: Name prefix of the dataset written by this call.
        da_mapping: Mapping forwarded unchanged to ``Binarizer.binarize_da``.
        offset: Start of the range, forwarded to ``Binarizer.binarize_da``.
        end: End of the range, forwarded to ``Binarizer.binarize_da``.

    Returns:
        The value returned by ``Binarizer.binarize_da``.
    """
    bin_path = dataset_dest_file(args, output_prefix, None, "bin")
    builder = indexed_dataset.make_builder(
        bin_path,
        impl=args.dataset_impl,
        vocab_size=None,
    )

    # Each produced item is appended to the builder as it arrives.
    stats = Binarizer.binarize_da(
        filename,
        builder.add_item,
        da_mapping,
        offset=offset,
        end=end,
    )

    idx_path = dataset_dest_file(args, output_prefix, None, "idx")
    builder.finalize(idx_path)
    return stats