import logging
import os
from collections import Counter
from multiprocessing import Pool

from fairseq.binarizer import Binarizer
from fairseq.data import indexed_dataset

# dataset_dest_prefix() and dataset_dest_file() are assumed to be provided by the
# surrounding preprocessing script (as in fairseq's preprocess.py).

logger = logging.getLogger(__name__)


def make_binary_da_dataset(args, input_prefix, output_prefix, lang, num_workers, da_mapping):
    logger.info("Adding domain indexes")
    n_seq_tok = [0, 0]
    replaced = Counter()  # kept for symmetry with fairseq's make_binary_dataset; unused here

    def merge_result(worker_result):
        # Only the sequence count is aggregated for the domain-index dataset.
        n_seq_tok[0] += worker_result["nseq"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            # lang must be forwarded to the workers so that each shard is written
            # to the same lang-qualified path that the merge step below reads
            # back (e.g. train.da1.en-de.en.bin / .idx); otherwise the merge
            # fails with "No such file or directory".
            pool.apply_async(
                binarize_da,
                (
                    args,
                    input_file,
                    prefix,
                    lang,
                    da_mapping,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
    )
    # The main process binarizes the first shard itself.
    merge_result(
        Binarizer.binarize_da(
            input_file,
            lambda t: ds.add_item(t),
            da_mapping,
            offset=0,
            end=offsets[1],
        )
    )
    if num_workers > 1:
        pool.join()
        # Append the workers' temporary shards and remove their files.
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
def binarize_da(args, filename, output_prefix, lang, da_mapping, offset, end):
    # Each worker writes its shard to the lang-qualified destination path so that
    # make_binary_da_dataset() can find and merge it afterwards.
    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=None,
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_da(filename, consumer, da_mapping, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
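

# Hypothetical driver (a sketch, not part of the original script) showing how
# make_binary_da_dataset() could be invoked. The attribute names on `args`
# mirror fairseq's preprocess options (destdir, dataset_impl, source_lang,
# target_lang); the empty da_mapping is only a placeholder, since its real
# structure depends on the custom Binarizer.binarize_da().
if __name__ == "__main__":
    from argparse import Namespace

    args = Namespace(
        destdir="data-bin/iwslt14.tokenized.de-en",
        dataset_impl="mmap",
        source_lang="en",
        target_lang="de",
    )
    da_mapping = {}  # placeholder: build the real domain-index mapping here
    make_binary_da_dataset(
        args,
        input_prefix="iwslt14.tokenized.de-en/train.da1",
        output_prefix="train.da1",
        lang="en",
        num_workers=1,  # a single worker keeps everything in the main process
        da_mapping=da_mapping,
    )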