def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # workers 1..num_workers-1 binarize their own chunks into temporary
        # prefixes; the main process handles the first chunk below
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab),
    )
    merge_result(
        Binarizer.binarize(
            input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1]
        )
    )
    if num_workers > 1:
        pool.join()
        # merge the temporary worker outputs into the main dataset and clean up
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    print(
        "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        )
    )

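# A minimal, self-contained sketch of the chunking pattern used above, written
# against only the standard library (the helpers `find_line_offsets` and
# `count_tokens_in_chunk` and the file name are hypothetical, not fairseq APIs):
# the input file is split at line boundaries into `num_workers` byte ranges,
# workers 1..N-1 process their ranges in a Pool, and the parent process handles
# range 0 before merging all partial results.
import os
from multiprocessing import Pool


def find_line_offsets(path, num_chunks):
    """Return num_chunks + 1 byte offsets that split `path` at line boundaries."""
    size = os.path.getsize(path)
    offsets = [0]
    with open(path, "rb") as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()  # advance to the next line boundary
            offsets.append(f.tell())
    offsets.append(size)
    return offsets


def count_tokens_in_chunk(path, start, end):
    """Toy per-worker job: count whitespace-separated tokens in [start, end)."""
    ntok = 0
    with open(path, "rb") as f:
        f.seek(start)
        while f.tell() < end:
            ntok += len(f.readline().split())
    return ntok


if __name__ == "__main__":
    path, num_workers = "train.en", 4  # hypothetical input file
    offsets = find_line_offsets(path, num_workers)
    with Pool(processes=num_workers - 1) as pool:
        jobs = [
            pool.apply_async(count_tokens_in_chunk, (path, offsets[i], offsets[i + 1]))
            for i in range(1, num_workers)
        ]
        # the parent process handles chunk 0, then merges the worker results
        total = count_tokens_in_chunk(path, offsets[0], offsets[1])
        total += sum(job.get() for job in jobs)
    print(total)
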
def make_binary_da_dataset(input_prefix, output_prefix, lang, num_workers, da_mapping):
    logger.info("Adding domain indexes")
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        n_seq_tok[0] += worker_result["nseq"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    # TODO: an error occurs when num_workers > 1:
    # No such file or directory: 'data-bin/iwslt14.tokenized.de-en/train.da1.en-de.en.idx'
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize_da,
                (
                    args,
                    input_file,
                    prefix,
                    lang,
                    da_mapping,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
    )
    merge_result(
        Binarizer.binarize_da(
            input_file, lambda t: ds.add_item(t), da_mapping, offset=0, end=offsets[1]
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

def make_binary_alignment_dataset(input_prefix, output_prefix, num_workers):
    nseq = [0]

    def merge_result(worker_result):
        nseq[0] += worker_result["nseq"]

    input_file = input_prefix
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize_alignments,
                (
                    args,
                    input_file,
                    utils.parse_alignment,
                    prefix,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, None, "bin"), impl=args.dataset_impl
    )
    merge_result(
        Binarizer.binarize_alignments(
            input_file,
            utils.parse_alignment,
            lambda t: ds.add_item(t),
            offset=0,
            end=offsets[1],
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, None)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, None, "idx"))

    logger.info("[alignments] {}: parsed {} alignments".format(input_file, nseq[0]))

def make_binary_dataset(input_file, output_prefix, dic, num_workers):
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    offsets = args.tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    prefix,
                    dic,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(output_file(args, output_prefix, "bin"))
    merge_result(
        args.tokenizer.binarize(
            input_file, dic, lambda t: ds.add_item(t), offset=0, end=offsets[1]
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = output_file(args, prefix, '')
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(output_file(args, output_prefix, 'idx'))

    print(
        "| {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            dic.unk_word,
        )
    )

def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    offsets = Tokenizer.find_offsets(input_file, num_workers)
    print("offsets", offsets)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    dict,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin')
    )
    merge_result(
        Tokenizer.binarize(
            input_file, dict, lambda t: ds.add_item(t), offset=0, end=offsets[1]
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang,
        input_file,
        n_seq_tok[0],
        n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1],
        dict.unk_word,
    ))

def make_binary_alignment_dataset(input_prefix, output_prefix, src, tgt, num_workers):
    nseq = [0]

    def merge_result(worker_result):
        nseq[0] += worker_result['nseq']

    parse_alignment = lambda s: torch.IntTensor([int(t) for t in s.split()])
    input_file = input_prefix
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = multiprocessing.Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize_alignments,
                (
                    args,
                    input_file,
                    parse_alignment,
                    prefix,
                    src,
                    tgt,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, src, tgt, None, "bin"),
        impl=args.dataset_impl,
    )
    merge_result(
        Binarizer.binarize_alignments(
            input_file,
            parse_alignment,
            lambda t: ds.add_item(t),
            offset=0,
            end=offsets[1],
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, src, tgt)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, src, tgt, None, "idx"))

    print("| [alignments] {}: parsed {} alignments".format(input_file, nseq[0]))

def append(self, other_dataset_path_prefix: str) -> None:
    """
    Append an existing dataset.
    Beware: if it wasn't built with the same coder, you are in trouble.
    """
    other_index = HuffmanMMapIndex(
        indexed_dataset.index_file_path(other_dataset_path_prefix)
    )
    # shift the appended pointers by the bytes already written so they stay valid
    for (ptr, size) in other_index:
        self._ptrs.append(ptr + self._data_len)
        self._sizes.append(size)

    # concatenate data
    with open(indexed_dataset.data_file_path(other_dataset_path_prefix), "rb") as f:
        shutil.copyfileobj(f, self._data_file)

    self._data_len += other_index.data_len

def open(self):
    self._coder.to_file(vocab_file_path(self._path_prefix))
    self._data_file = open(indexed_dataset.data_file_path(self._path_prefix), "wb")

def exists(prefix_path):
    return (
        PathManager.exists(indexed_dataset.index_file_path(prefix_path))
        and PathManager.exists(indexed_dataset.data_file_path(prefix_path))
        and PathManager.exists(vocab_file_path(prefix_path))
    )

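# A self-contained sketch of the pointer-shifting logic that `append` relies on,
# using plain in-memory objects instead of the mmap-backed index (the `ToyIndex`
# and `ToyBuilder` names are hypothetical, for illustration only): when dataset B
# is appended to dataset A, every byte pointer from B must be shifted by the
# number of data bytes A already holds, otherwise B's items would point into A's
# region of the concatenated data file.
import io
import shutil
from dataclasses import dataclass, field
from typing import List


@dataclass
class ToyIndex:
    ptrs: List[int]   # byte offset of each item in the data stream
    sizes: List[int]  # length of each item in bytes
    data: bytes       # the concatenated item payloads


@dataclass
class ToyBuilder:
    _ptrs: List[int] = field(default_factory=list)
    _sizes: List[int] = field(default_factory=list)
    _data_file: io.BytesIO = field(default_factory=io.BytesIO)
    _data_len: int = 0

    def append(self, other: ToyIndex) -> None:
        # shift the incoming pointers past the bytes already written
        for ptr, size in zip(other.ptrs, other.sizes):
            self._ptrs.append(ptr + self._data_len)
            self._sizes.append(size)
        # concatenate the raw payloads, mirroring shutil.copyfileobj above
        shutil.copyfileobj(io.BytesIO(other.data), self._data_file)
        self._data_len += len(other.data)


builder = ToyBuilder()
builder.append(ToyIndex(ptrs=[0, 3], sizes=[3, 2], data=b"abcde"))
builder.append(ToyIndex(ptrs=[0, 4], sizes=[4, 1], data=b"wxyzq"))
assert builder._ptrs == [0, 3, 5, 9]
assert builder._data_file.getvalue() == b"abcdewxyzq"
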
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, avoid_tokenize=False):
    if vocab is not None:
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    else:
        print("| No dictionary provided; only string splitting is performed.")
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            # TODO: num_workers > 1 is not working for map datasets
            if args.input_mapping is True:
                raise NotImplementedError("num_workers > 1 is not implemented for map datasets yet.")
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    avoid_tokenize,
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.make_builder(
        dataset_dest_file(args, output_prefix, lang, "bin"),
        impl=args.dataset_impl,
        vocab_size=len(vocab) if vocab is not None else -1,
    )
    merge_result(
        Binarizer.binarize(
            input_file,
            vocab,
            lambda t: ds.add_item(t),
            offset=0,
            end=offsets[1],
            avoid_tokenize=avoid_tokenize,
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    if vocab is not None:
        unk = vocab.unk_word if hasattr(vocab, 'unk_word') else vocab.unk_token
    else:
        unk = ""
    logger.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            unk,
        )
    )

def multiprocess_dataset(
    cls,
    input_file: str,
    dataset_impl: str,
    binarizer: Binarizer,
    output_prefix: str,
    vocab_size=None,
    num_workers=1,
) -> BinarizeSummary:
    final_summary = BinarizeSummary()

    offsets = find_offsets(input_file, num_workers)
    # find_offsets returns a list of positions [pos1, pos2, pos3, pos4] but we
    # want pairs [(pos1, pos2), (pos2, pos3), (pos3, pos4)] to process the
    # chunks with start/end info, so we zip the list with itself shifted by one
    # to get all the pairs.
    (first_chunk, *more_chunks) = zip(offsets, offsets[1:])
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        worker_results = [
            pool.apply_async(
                cls._binarize_chunk_and_finalize,
                args=(
                    binarizer,
                    input_file,
                    start_offset,
                    end_offset,
                    _worker_prefix(
                        output_prefix,
                        worker_id,
                    ),
                    dataset_impl,
                ),
                kwds={
                    "vocab_size": vocab_size,
                }
                if vocab_size is not None
                else {},
            )
            for worker_id, (start_offset, end_offset) in enumerate(
                more_chunks, start=1
            )
        ]

        pool.close()
        pool.join()
        for r in worker_results:
            summ = r.get()
            final_summary.merge(summ)

    # do not close the bin file as we need to merge the worker results in
    final_ds, summ = cls._binarize_file_chunk(
        binarizer,
        input_file,
        offset_start=first_chunk[0],
        offset_end=first_chunk[1],
        output_prefix=output_prefix,
        dataset_impl=dataset_impl,
        vocab_size=vocab_size if vocab_size is not None else None,
    )
    final_summary.merge(summ)

    if num_workers > 1:
        for worker_id in range(1, num_workers):
            # merge the worker outputs
            worker_output_prefix = _worker_prefix(
                output_prefix,
                worker_id,
            )
            final_ds.merge_file_(worker_output_prefix)
            try:
                os.remove(indexed_dataset.data_file_path(worker_output_prefix))
                os.remove(indexed_dataset.index_file_path(worker_output_prefix))
            except Exception as e:
                logger.error(f"couldn't remove {worker_output_prefix}.*", exc_info=e)

    # now we can close the file
    idx_file = indexed_dataset.index_file_path(output_prefix)
    final_ds.finalize(idx_file)
    return final_summary

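# A quick, runnable illustration of the offset-pairing trick used above (pure
# Python, no fairseq dependencies; the byte offsets are made-up values):
# zipping the offset list with itself shifted by one turns N+1 boundaries into
# N (start, end) chunks, and the star-unpacking separates the first chunk,
# handled by the calling process, from the chunks handed to the worker pool.
offsets = [0, 1024, 2048, 3072]  # hypothetical byte boundaries for 3 chunks
(first_chunk, *more_chunks) = zip(offsets, offsets[1:])
assert first_chunk == (0, 1024)
assert more_chunks == [(1024, 2048), (2048, 3072)]
for worker_id, (start_offset, end_offset) in enumerate(more_chunks, start=1):
    print(worker_id, start_offset, end_offset)
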
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()
    copied = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        copied.update(worker_result["copied"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # TODO: multi-worker binarization does not support source copying yet
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )

    words_list = []

    def binarize_consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    merge_result(
        Binarizer.binarize(
            input_file,
            vocab,
            binarize_consumer,
            offset=0,
            end=offsets[1],
            copy_ext_dict=args.copy_ext_dict,
            copy_src_words=copy_src_words,
        )
    )
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    print(
        "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
            100 * sum(copied.values()) / n_seq_tok[1],
        )
    )
    return words_list