def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
                             match_mlperf):
  # type: (pd.DataFrame, int, int, rconst.Paths, bool) -> None
  """Construct training and evaluation datasets.

  This function manages dataset construction and validates that the
  transformations have produced correct results. The particular logic of
  transforming the data is performed in _train_eval_map_fn().

  Args:
    df: The dataframe containing the entire dataset. It is essential that this
      dataframe be produced by _filter_index_sort(), as subsequent
      transformations rely on `df` having particular structure.
    approx_num_shards: The approximate number of similarly sized shards to
      construct from `df`. The MovieLens dataset has severe imbalances where
      some users have interacted with many items; this is common among
      datasets involving user data. Rather than attempt to aggressively
      balance shard size, this function simply allows shards to "overflow"
      which can produce a number of shards which is less than
      `approx_num_shards`. This small degree of imbalance does not impact
      performance; however it does mean that one should not expect
      approx_num_shards to be the ACTUAL number of shards.
    num_items: The cardinality of the item set.
    cache_paths: rconst.Paths object containing locations for various cache
      files.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.
  """
  num_rows = len(df)
  approximate_partitions = np.linspace(
      0, num_rows, approx_num_shards + 1).astype("int")
  start_ind, end_ind = 0, 0
  shards = []
  for i in range(1, approx_num_shards + 1):
    end_ind = approximate_partitions[i]
    while (end_ind < num_rows and df[movielens.USER_COLUMN][end_ind - 1] ==
           df[movielens.USER_COLUMN][end_ind]):
      end_ind += 1

    if end_ind <= start_ind:
      continue  # imbalance from prior shard.

    df_shard = df[start_ind:end_ind]
    user_shard = df_shard[movielens.USER_COLUMN].values.astype(np.int32)
    item_shard = df_shard[movielens.ITEM_COLUMN].values.astype(np.uint16)

    shards.append({
        movielens.USER_COLUMN: user_shard,
        movielens.ITEM_COLUMN: item_shard,
    })

    start_ind = end_ind
  assert end_ind == num_rows
  approx_num_shards = len(shards)

  tf.logging.info("Splitting train and test data and generating {} test "
                  "negatives per user...".format(rconst.NUM_EVAL_NEGATIVES))
  tf.gfile.MakeDirs(cache_paths.train_shard_subdir)
  map_args = [(shards[i], i, num_items, cache_paths)
              for i in range(approx_num_shards)]

  with popen_helper.get_pool(multiprocessing.cpu_count()) as pool:
    pool.map(_train_eval_map_fn, map_args)  # pylint: disable=no-member
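
# ---------------------------------------------------------------------------
# Illustrative only (not called by the pipeline): a minimal, self-contained
# sketch of the shard-boundary logic in generate_train_eval_data() above.
# np.linspace proposes evenly spaced cut points, and each cut is pushed
# forward until it no longer splits one user's rows across two shards, which
# is why a heavy user can make a shard "overflow" and reduce the shard count.
# The toy `users` array is a hypothetical stand-in for
# df[movielens.USER_COLUMN].
# ---------------------------------------------------------------------------
def _example_shard_boundaries():
  users = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2, 3])
  num_rows = len(users)
  cuts = np.linspace(0, num_rows, 4).astype("int")  # proposed: [0, 3, 6, 10]
  bounds, start = [], 0
  for end in cuts[1:]:
    # Advance the cut while it lands in the middle of a user's rows.
    while end < num_rows and users[end - 1] == users[end]:
      end += 1
    if end > start:
      bounds.append((start, int(end)))
      start = int(end)
  # The cut at 6 would split user 2, so it advances to 9:
  # bounds == [(0, 3), (3, 9), (9, 10)]
  return bounds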
def _construct_records(
    is_training,          # type: bool
    train_cycle,          # type: typing.Optional[int]
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_positives,        # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    batch_size,           # type: int
    training_shards,      # type: typing.List[str]
    deterministic=False,  # type: bool
    match_mlperf=False    # type: bool
  ):
  """Generate false negatives and write TFRecords files.

  Args:
    is_training: Whether training records (True) or eval records (False) are
      to be created.
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the input_fn. This number
      is approximate; fewer shards will be created if not all shards are
      assigned batches. This can occur due to discretization in the assignment
      process.
    num_neg: The number of false negatives per positive example.
    num_positives: The number of positive examples. This value is used to
      pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    batch_size: The expected batch size used during training. This is used to
      properly batch data when writing TFRecords.
    training_shards: The pickled positive examples from which to generate
      negatives.
    deterministic: If True, process shards with `imap` (preserving order)
      rather than `imap_unordered`, so that record construction is
      reproducible.
    match_mlperf: If True, sample eval negatives with replacement, as the
      MLPerf reference implementation does.
  """
  st = timeit.default_timer()

  if is_training:
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_TRAIN_NEG_GEN)
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_NUM_NEG, value=num_neg)

    # set inside _process_shard()
    mlperf_helper.ncf_print(
        key=mlperf_helper.TAGS.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)

  else:
    # Later logic assumes that all items for a given user are in the same
    # batch.
    assert not batch_size % (rconst.NUM_EVAL_NEGATIVES + 1)
    assert num_neg == rconst.NUM_EVAL_NEGATIVES

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_STEP_EVAL_NEG_GEN)

    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
                            value=num_positives)

  assert epochs_per_cycle == 1 or is_training
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])

  num_pts = num_positives * (1 + num_neg)

  # Equivalent to `int(ceil(num_pts / batch_size)) * batch_size`, but without
  # precision concerns
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [stat_utils.random_int32()
                   for _ in training_shards * epochs_per_cycle]
  map_args = [
      (shard, num_items, num_neg, process_seeds[i], is_training, match_mlperf)
      for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    data = [
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts_with_padding,), dtype=np.uint16),
        np.zeros(shape=(num_pts_with_padding,), dtype=np.int8),
    ]

    # Training data is shuffled. Evaluation data MUST not be shuffled.
    # Downstream processing depends on the fact that evaluation data for a
    # given user is grouped within a batch.
    if is_training:
      index_destinations = np.random.permutation(num_pts)
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    else:
      index_destinations = np.arange(num_pts)

    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  assert np.sum(data[0] == -1) == num_padding

  if is_training:
    if num_padding:
      # In order to have a full batch, randomly include points from earlier in
      # the data.
      mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
      pad_sample_indices = np.random.randint(
          low=0, high=num_pts, size=(num_padding,))
      dest = np.arange(start=start_ind, stop=start_ind + num_padding)
      start_ind += num_padding
      for i in range(3):
        data[i][dest] = data[i][pad_sample_indices]
  else:
    # For evaluation, padding is all zeros. The evaluation input_fn knows how
    # to interpret and discard the zero padded entries.
    data[0][num_pts:] = 0

  # Check that no points were overlooked.
  assert not np.sum(data[0] == -1)

  if is_training:
    # The number of points is slightly larger than num_pts due to padding.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_SIZE,
                            value=int(data[0].shape[0]))
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_BATCH_SIZE,
                            value=batch_size)
  else:
    # num_pts is logged instead of int(data[0].shape[0]), because the size
    # of the data vector includes zero pads which are ignored.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_SIZE, value=num_pts)

  batches_per_file = np.ceil(num_pts_with_padding / batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]
  while True:
    current_batch_id += 1
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1
    start_ind = current_batch_id * batch_size
    end_ind = start_ind + batch_size
    if end_ind > num_pts_with_padding:
      if start_ind != num_pts_with_padding:
        raise ValueError("Batch padding does not line up")
      break
    batches_by_file[current_file_id].append(current_batch_id)

  # Drop shards which were not assigned batches
  batches_by_file = [i for i in batches_by_file if i]
  num_readers = len(batches_by_file)

  if is_training:
    # Empirically it is observed that placing the batch with repeated values
    # at the start rather than the end improves convergence.
    mlperf_helper.ncf_print(key=mlperf_helper.TAGS.INPUT_ORDER)
    batches_by_file[0][0], batches_by_file[-1][-1] = \
        batches_by_file[-1][-1], batches_by_file[0][0]

  if is_training:
    template = rconst.TRAIN_RECORD_TEMPLATE
    record_dir = os.path.join(cache_paths.train_epoch_dir,
                              get_cycle_folder_name(train_cycle))
    tf.gfile.MakeDirs(record_dir)
  else:
    template = rconst.EVAL_RECORD_TEMPLATE
    record_dir = cache_paths.eval_data_subdir

  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, template.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * batch_size
        end_ind = start_ind + batch_size
        record_kwargs = dict(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
        )

        if is_training:
          record_kwargs["labels"] = data[2][start_ind:end_ind]
        else:
          record_kwargs["dupe_mask"] = stat_utils.mask_duplicates(
              record_kwargs["items"].reshape(-1, num_neg + 1),
              axis=1).flatten().astype(np.int8)

        batch_bytes = _construct_record(**record_kwargs)

        writer.write(batch_bytes)
        batch_count += 1

  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the main process to
  # read a partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  if is_training:
    log_msg("Cycle {} complete. Total time: {:.1f} seconds"
            .format(train_cycle, timeit.default_timer() - st))
  else:
    log_msg("Eval construction complete. Total time: {:.1f} seconds"
            .format(timeit.default_timer() - st))
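
# ---------------------------------------------------------------------------
# Illustrative only: the integer ceiling used in _construct_records() above.
# `(num_pts + batch_size - 1) // batch_size * batch_size` rounds num_pts up
# to a multiple of batch_size using pure integer math, avoiding the float
# precision concerns of `int(ceil(num_pts / batch_size)) * batch_size` for
# large counts. The default values below are hypothetical.
# ---------------------------------------------------------------------------
def _example_padded_size(num_pts=100003, batch_size=2048):
  num_pts_with_padding = (num_pts + batch_size - 1) // batch_size * batch_size
  num_padding = num_pts_with_padding - num_pts
  assert num_pts_with_padding % batch_size == 0 and num_padding < batch_size
  return num_pts_with_padding, num_padding  # (100352, 349) for the defaults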
def _construct_training_records(
    train_cycle,          # type: int
    num_workers,          # type: int
    cache_paths,          # type: rconst.Paths
    num_readers,          # type: int
    num_neg,              # type: int
    num_train_positives,  # type: int
    num_items,            # type: int
    epochs_per_cycle,     # type: int
    train_batch_size,     # type: int
    training_shards,      # type: typing.List[str]
    spillover,            # type: bool
    carryover=None,       # type: typing.Union[typing.List[np.ndarray], None]
    deterministic=False   # type: bool
  ):
  """Generate false negatives and write TFRecords files.

  Args:
    train_cycle: Integer of which cycle the generated data is for.
    num_workers: Number of multiprocessing workers to use for negative
      generation.
    cache_paths: Paths object with information of where to write files.
    num_readers: The number of reader datasets in the train input_fn.
    num_neg: The number of false negatives per positive example.
    num_train_positives: The number of positive examples. This value is used
      to pre-allocate arrays while the imap is still running. (NumPy does not
      allow dynamic arrays.)
    num_items: The cardinality of the item set.
    epochs_per_cycle: The number of epochs worth of data to construct.
    train_batch_size: The expected batch size used during training. This is
      used to properly batch data when writing TFRecords.
    training_shards: The pickled positive examples from which to generate
      negatives.
    spillover: If the final batch is incomplete, push it to the next cycle
      (True) or include a partial batch (False).
    carryover: The data points spilled over from the previous cycle.
    deterministic: If True, process shards with `imap` (preserving order)
      rather than `imap_unordered`, so that record construction is
      reproducible.

  Returns:
    The data points (if any) to be carried over to the next cycle.
  """
  st = timeit.default_timer()
  num_workers = min([num_workers, len(training_shards) * epochs_per_cycle])
  carryover = carryover or [
      np.zeros((0,), dtype=np.int32),
      np.zeros((0,), dtype=np.uint16),
      np.zeros((0,), dtype=np.int8),
  ]
  num_carryover = carryover[0].shape[0]
  num_pts = num_carryover + num_train_positives * (1 + num_neg)

  # We choose a different random seed for each process, so that the processes
  # will not all choose the same random numbers.
  process_seeds = [np.random.randint(2**32)
                   for _ in training_shards * epochs_per_cycle]
  map_args = [(shard, num_items, num_neg, process_seeds[i])
              for i, shard in enumerate(training_shards * epochs_per_cycle)]

  with popen_helper.get_pool(num_workers, init_worker) as pool:
    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
    data_generator = map_fn(_process_shard, map_args)
    data = [
        np.zeros(shape=(num_pts,), dtype=np.int32) - 1,
        np.zeros(shape=(num_pts,), dtype=np.uint16),
        np.zeros(shape=(num_pts,), dtype=np.int8),
    ]

    # The carryover data is always first.
    for i in range(3):
      data[i][:num_carryover] = carryover[i]

    index_destinations = np.random.permutation(
        num_train_positives * (1 + num_neg)) + num_carryover
    start_ind = 0
    for data_segment in data_generator:
      n_in_segment = data_segment[0].shape[0]
      dest = index_destinations[start_ind:start_ind + n_in_segment]
      start_ind += n_in_segment
      for i in range(3):
        data[i][dest] = data_segment[i]

  # Check that no points were dropped.
  assert (num_pts - num_carryover) == start_ind
  assert not np.sum(data[0] == -1)

  record_dir = os.path.join(cache_paths.train_epoch_dir,
                            get_cycle_folder_name(train_cycle))
  tf.gfile.MakeDirs(record_dir)

  batches_per_file = np.ceil(num_pts / train_batch_size / num_readers)
  current_file_id = -1
  current_batch_id = -1
  batches_by_file = [[] for _ in range(num_readers)]
  output_carryover = [
      np.zeros(shape=(0,), dtype=np.int32),
      np.zeros(shape=(0,), dtype=np.uint16),
      np.zeros(shape=(0,), dtype=np.int8),
  ]
  while True:
    current_batch_id += 1
    if (current_batch_id % batches_per_file) == 0:
      current_file_id += 1
    end_ind = (current_batch_id + 1) * train_batch_size
    if end_ind > num_pts:
      if spillover:
        output_carryover = [
            data[i][current_batch_id * train_batch_size:num_pts]
            for i in range(3)]
        break
      else:
        batches_by_file[current_file_id].append(current_batch_id)
        break
    batches_by_file[current_file_id].append(current_batch_id)

  batch_count = 0
  for i in range(num_readers):
    fpath = os.path.join(record_dir, rconst.TRAIN_RECORD_TEMPLATE.format(i))
    log_msg("Writing {}".format(fpath))
    with tf.python_io.TFRecordWriter(fpath) as writer:
      for j in batches_by_file[i]:
        start_ind = j * train_batch_size
        end_ind = start_ind + train_batch_size
        batch_bytes = _construct_record(
            users=data[0][start_ind:end_ind],
            items=data[1][start_ind:end_ind],
            labels=data[2][start_ind:end_ind],
        )
        writer.write(batch_bytes)
        batch_count += 1

  if spillover:
    written_pts = output_carryover[0].shape[0] + batch_count * train_batch_size
    if num_pts != written_pts:
      raise ValueError(
          "Error detected: point counts do not match: {} vs. {}".format(
              num_pts, written_pts))

  # We write to a temp file then atomically rename it to the final file,
  # because writing directly to the final file can cause the main process to
  # read a partially written JSON file.
  ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
  with tf.gfile.Open(ready_file_temp, "w") as f:
    json.dump({
        "batch_size": train_batch_size,
        "batch_count": batch_count,
    }, f)
  ready_file = os.path.join(record_dir, rconst.READY_FILE)
  tf.gfile.Rename(ready_file_temp, ready_file)

  log_msg("Cycle {} complete. Total time: {:.1f} seconds".format(
      train_cycle, timeit.default_timer() - st))

  return output_carryover
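
# ---------------------------------------------------------------------------
# Illustrative only: how the spillover return value is intended to chain
# across cycles. The driver loop below is a hypothetical sketch (the real
# caller lives elsewhere in the pipeline); each cycle's leftover partial
# batch is returned by _construct_training_records() and passed back in as
# `carryover`, so no positive example is dropped between cycles:
#
#   carryover = None
#   for cycle in range(num_cycles):
#     carryover = _construct_training_records(
#         train_cycle=cycle, ..., spillover=True, carryover=carryover)
#
# The runnable toy below mirrors only the split arithmetic: full batches are
# written this cycle, and the remainder is carried over.
# ---------------------------------------------------------------------------
def _example_spillover_split(num_pts=10, train_batch_size=4):
  num_full_batches = num_pts // train_batch_size
  num_spilled = num_pts - num_full_batches * train_batch_size
  return num_full_batches, num_spilled  # (2, 2) for the hypothetical defaults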