def fofn_to_chunks(fofn):
    files = fofn_to_files(fofn)
    chunks = []
    for i, f in enumerate(files):
        chunk_id = "chunk-{i}".format(i=i)
        _d = {Constants.CHUNK_KEY_FOFN: f}
        p = PipelineChunk(chunk_id, **_d)
        chunks.append(p)
    return chunks
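# Illustrative usage sketch (not part of the module API; file names are
# hypothetical). Assuming `fofn_to_files` returns one path per non-empty line
# of the FOFN, a FOFN listing m1.bam, m2.bam and m3.bam yields three chunks
# with ids "chunk-0", "chunk-1" and "chunk-2", each carrying its file path
# under Constants.CHUNK_KEY_FOFN.
#
# >>> chunks = fofn_to_chunks("movies.fofn")
# >>> [c.chunk_id for c in chunks]
# ['chunk-0', 'chunk-1', 'chunk-2']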
def _args_chunk_fofn(args):
    fofn_files = fofn_to_files(args.input_fofn)
    log.info("Read in FOFN with {n} files.".format(n=len(fofn_files)))
    chunks = CU.write_grouped_fofn_chunks(fofn_files, args.max_total_chunks,
                                          args.output_dir, args.chunk_report_json)
    log.debug("Converted FOFN with {x} files into {n} chunks. Wrote chunks to {f}".format(
        n=len(chunks), f=args.chunk_report_json, x=len(fofn_files)))
    return 0
def gather_fofn(input_files, output_file, skip_empty=True):
    """
    This should be better spec'ed and impose a tighter constraint on the FOFN

    :param input_files: List of file paths
    :param output_file: File Path
    :param skip_empty: Ignore empty files

    :return: Output file
    :rtype: str
    """
    all_files = []
    for input_file in input_files:
        file_names = fofn_to_files(input_file)
        all_files.extend(file_names)
    with open(output_file, 'w') as f:
        f.write("\n".join(all_files))
    return output_file
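# Sketch of the gather step (hypothetical file names): the file paths listed in
# each chunked FOFN are concatenated into a single output FOFN. Note that
# `skip_empty` is accepted but not currently acted on in the body above.
#
# >>> gather_fofn(["chunk-0.fofn", "chunk-1.fofn"], "gathered.fofn")
# 'gathered.fofn'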
def nchunk_fofn(input_file, max_chunks):
    input_files = fofn_to_files(input_file)
    nchunks = min(len(input_files), max_chunks)
    return nchunks
def _fofn_to_metadata(path):
    files = fofn_to_files(path)
    return DatasetMetadata(len(files), len(files))