def main(args):
    """Convert retriever results into serialized files using CLI arguments.

    Reads ``encoder_model_type``, ``is_train_set``, ``retriever_results``,
    ``out_file``, ``gold_passages_src`` and ``num_workers`` from ``args``.
    """
    text_tensorizer = init_tenzorizer(args.encoder_model_type, args)

    # Auto-padding is switched off so the serialized files take less disk space.
    text_tensorizer.set_pad_to_max(False)

    convert_retriever_results(
        args.is_train_set,
        args.retriever_results,
        args.out_file,
        args.gold_passages_src,
        text_tensorizer,
        args.num_workers,
    )
def _run_preprocessing(tensorizer: Tensorizer):
    """Serialize the first data file's retriever results with padding disabled.

    Pad-to-max is turned off on ``tensorizer`` for the duration of the
    conversion and turned back on afterwards, including when the conversion
    raises, so a failure does not leave the tensorizer in the unpadded mode.

    Args:
        tensorizer: Tensorizer used for the conversion; its pad-to-max
            setting is toggled around the call.

    Returns:
        Whatever ``convert_retriever_results`` returns (the serialized files).
    """
    # Temporarily disable auto-padding to save disk space usage of serialized files.
    tensorizer.set_pad_to_max(False)
    try:
        # Convert with the same tensorizer whose padding was just disabled;
        # otherwise the toggle would not affect the serialized output.
        return convert_retriever_results(
            is_train,
            data_files[0],
            out_file_prefix,
            gold_passages_src,
            tensorizer,
            num_workers=self.args.num_workers,
        )
    finally:
        # NOTE(review): padding is unconditionally set back to True, as in the
        # original code; this assumes it was enabled before the call.
        tensorizer.set_pad_to_max(True)