# Module-level imports these excerpts rely on (the original module
# header is not shown in this excerpt):
import datetime
import gzip
import json
import os
from contextlib import contextmanager

from shiftmanager import util  # assumed home of the linspace helper


def get_csv_chunk_generator(self, csv_file_path, row_count, chunks):
    """
    Given the csv_file_path and a row_count, yield *chunks* string chunks.

    Parameters
    ----------
    csv_file_path : str
        File path for the CSV written by Postgres
    row_count : int
        Number of rows in the CSV
    chunks : int
        Number of chunks to yield

    Yields
    ------
    str
    """
    # Yield only a single chunk if the number of rows is small.
    if row_count <= chunks:
        with open(csv_file_path, "r") as f:
            yield f.read()
        # PEP 479: end a generator with return, not raise StopIteration;
        # the raise becomes a RuntimeError on Python 3.7+.
        return

    # Get chunk boundaries
    left_closed_boundary = util.linspace(0, row_count, chunks)
    left_closed_boundary.append(row_count - 1)
    right_closed_boundary = left_closed_boundary[1:]
    final_boundary_index = len(right_closed_boundary) - 1

    # We're going to allocate a large buffer for this -- let's read as
    # fast as possible
    chunk_lines = []
    boundary_index = 0
    boundary = right_closed_boundary[boundary_index]
    one_mebibyte = 1048576
    with open(csv_file_path, "r", buffering=one_mebibyte) as f:
        for line_number, row in enumerate(f):
            chunk_lines.append(row)
            if line_number == boundary:
                if boundary_index != final_boundary_index:
                    boundary_index += 1
                    boundary = right_closed_boundary[boundary_index]
                yield "".join(chunk_lines)
                chunk_lines = []
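# A minimal sketch (an assumption, not shiftmanager's actual source) of
# the util.linspace helper used above: an integer analogue of
# numpy.linspace that returns `num` evenly spaced, left-closed start
# indices over [start, stop).
def _linspace_sketch(start, stop, num):
    delta = stop - start
    return [start + (delta * i) // num for i in range(num)]


# Illustrative-only usage of the generator above; `manager` and
# `upload_chunk` are hypothetical names:
#
#     for chunk in manager.get_csv_chunk_generator("/tmp/table.csv",
#                                                  row_count=1000000,
#                                                  chunks=10):
#         upload_chunk(chunk)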
# The decorator turns this single-yield generator into a context
# manager, matching the documented "clean up when leaving scope"
# behavior; without it the finally block would never run on exit.
@contextmanager
def chunked_json_slices(data, slices, directory=None, clean_on_exit=True):
    """
    Given a list of dicts, chunk them into *slices* gzipped temp files
    on disk. Clean up when leaving scope.

    Parameters
    ----------
    data : list of dicts
        Dictionaries to be serialized to chunks; must support len() and
        slicing, so a sequence rather than a bare iterator
    slices : int
        Number of chunks to generate
    directory : str
        Directory to write chunks to. Defaults to $HOME/.shiftmanager/tmp/
    clean_on_exit : bool, default True
        Clean up chunks on disk when the context exits

    Yields
    ------
    stamp : str
        Timestamp that prepends the filenames of chunks written to disk
    chunk_files : list
        List of chunk file paths
    """
    # Initialize before the try block so the finally clause cannot hit a
    # NameError if an exception is raised early; this ensures that files
    # get cleaned up even on a raised exception.
    chunk_files = []
    try:
        num_data = len(data)
        chunk_range_start = util.linspace(0, num_data, slices)
        chunk_range_end = chunk_range_start[1:]
        chunk_range_end.append(None)

        stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S%f")

        if not directory:
            user_home = os.path.expanduser("~")
            directory = os.path.join(user_home, ".shiftmanager", "tmp")

        if not os.path.exists(directory):
            os.makedirs(directory)

        range_zipper = list(zip(chunk_range_start, chunk_range_end))
        for i, (inclusive, exclusive) in enumerate(range_zipper):

            # Get either an inclusive/exclusive slice, or the slice to
            # the end of the range
            if exclusive is not None:
                sliced = data[inclusive:exclusive]
            else:
                sliced = data[inclusive:]

            # Join once rather than rebuilding the string for every doc,
            # which would be quadratic in the slice size
            newlined = "".join("{}\n".format(json.dumps(doc))
                               for doc in sliced)

            filename = "{}.gz".format("-".join([stamp, str(i)]))
            write_path = os.path.join(directory, filename)
            with gzip.open(write_path, "wb") as current_fp:
                current_fp.write(newlined.encode("utf-8"))
            chunk_files.append(write_path)

        yield stamp, chunk_files
    finally:
        if clean_on_exit:
            for filepath in chunk_files:
                os.remove(filepath)
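# Illustrative-only usage of chunked_json_slices (assumes the
# @contextmanager decorator above): write four gzipped chunks of
# newline-delimited JSON, read them back, and rely on the finally
# block to delete them when the with block exits.
def _demo_chunked_json_slices():
    docs = [{"id": i, "value": i * 2} for i in range(100)]
    with chunked_json_slices(docs, slices=4) as (stamp, chunk_files):
        for path in chunk_files:
            with gzip.open(path, "rb") as f:
                n_docs = len(f.read().decode("utf-8").splitlines())
            print(stamp, path, n_docs)
    # Files are gone here: clean_on_exit defaults to True.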