def get_data_files(data_sources):
    """Get list of data files from data_sources.

    Args:
        data_sources: a list or tuple of data file paths.

    Returns:
        A list of file paths.

    Raises:
        ValueError: if no data files are found.
    """
    # TODO(alanesuhr): Verify this is necessary for sharded TFRecord files?
    data_files = []
    for source in data_sources:
        if source.endswith('@*'):
            data_files += gfile.Glob(source[:-2] + '*')
        elif '@' in source:
            data_files += gfile.GenerateShardedFilenames(source)
        elif '*' in source or '?' in source or '[' in source:
            data_files += gfile.Glob(source)
        else:
            data_files.append(source)
    if not data_files:
        raise ValueError('No data files found in %s' % data_sources)
    return data_files
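# Illustrative sketch, not part of the original module: the 'path@N' shard
# notation handled above is assumed to expand into the standard
# '-NNNNN-of-NNNNN' suffixes, which is what gfile.GenerateShardedFilenames is
# expected to produce. The helper below is hypothetical and only demonstrates
# that convention.
def expand_sharded_pattern(pattern):
    """Expands 'base@N' into ['base-00000-of-0000N', ...]."""
    base, _, num_shards = pattern.partition('@')
    num_shards = int(num_shards)
    return ['%s-%05d-of-%05d' % (base, i, num_shards)
            for i in range(num_shards)]

# Example:
#   expand_sharded_pattern('examples.tfrecord@3')
#   -> ['examples.tfrecord-00000-of-00003',
#       'examples.tfrecord-00001-of-00003',
#       'examples.tfrecord-00002-of-00003']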
def get_existing_corners(segmentation_dir):
    corners = []
    # Legacy path format.
    for path in gfile.Glob(os.path.join(segmentation_dir, 'seg-*_*_*.npz')):
        corners.append(get_corner_from_path(path))
    for path in gfile.Glob(os.path.join(segmentation_dir, '*/*/seg-*_*_*.npz')):
        corners.append(get_corner_from_path(path))
    return corners
def get_model_data(glob, existing):
    """Read all model meta filenames and extract per model metadata."""
    globbed = sorted(gfile.Glob(glob))

    skipped = 0
    model_data = []
    for model_path in tqdm(globbed):
        assert model_path.lower().endswith(".meta"), model_path
        model_run, model_num, model_name = parse_model_components(model_path)
        row_name = MODEL_PREFIX.format(run=model_run, num=model_name)
        if row_name in existing:
            skipped += 1
            continue

        metadata = (
            (MODEL_NAME, model_name),
            (b"model_num", model_num),
            (b"run", model_run),
            (b"parent", ""),
            (b"tag", ""),
            (b"tool", "cbt_models_backfill_to_cbt"),
            (b"trained_date", ""),
        )
        model_data.append((row_name, metadata))

    print("Read {} Models, {} new, {} existing".format(
        len(globbed), len(model_data), skipped))
    return model_data
def read_games(glob, existing_paths):
    """Read all SGFs that match glob.

    Parse each game and extract relevant metadata for eval games table.
    """
    globbed = sorted(gfile.Glob(glob))

    skipped = 0
    to_parse = []
    for sgf_name in tqdm(globbed):
        assert sgf_name.lower().endswith('.sgf'), sgf_name
        sgf_path = canonical_name(sgf_name)
        sgf_filename = os.path.basename(sgf_path)

        if sgf_path in existing_paths or sgf_filename in existing_paths:
            skipped += 1
            continue
        to_parse.append(sgf_name)

    game_data = []
    with multiprocessing.Pool() as pool:
        game_data = pool.map(bigtable_output.process_game, tqdm(to_parse), 100)

    print("Read {} SGFs, {} new, {} existing".format(
        len(globbed), len(game_data), skipped))
    return game_data
def main(unused_argv):
    metadata = pd.read_csv(gfile.Open(FLAGS.metadata_file), sep='\t')

    # Read DeepMass:Prism outputs and merge with metadata.
    outputs = []
    for filen in gfile.Glob(FLAGS.input_data_pattern):
        with gfile.Open(filen) as infile:
            if FLAGS.batch_prediction:
                out_df = pd.read_json(infile, lines=True)
            else:
                out_df = json.load(infile)
                out_df = pd.DataFrame(out_df['predictions'])
            out_df = out_df.merge(
                metadata, left_on='key', right_on='index', how='left')
            outputs.append(out_df)
    outputs = pd.concat(outputs)
    outputs = outputs.apply(
        reformat_outputs,
        args=(int(FLAGS.label_dim), FLAGS.neutral_losses),
        axis=1)

    # Read additional features.
    if FLAGS.add_feature_names is not None:
        outputs_drip = []
        for filen in gfile.Glob(FLAGS.add_input_data_pattern):
            with gfile.Open(filen) as infile:
                if FLAGS.batch_prediction:
                    out_df = pd.read_json(infile, lines=True)
                    out_df = pd.DataFrame(
                        out_df['outputs'].tolist(),
                        columns=FLAGS.add_feature_names,
                        index=out_df['key'])
                else:
                    pass
                outputs_drip.append(out_df)
        outputs_drip = pd.concat(outputs_drip)
        outputs = outputs.merge(
            outputs_drip, how='left', left_on='key', right_index=True)

    # Write to a file.
    with gfile.Open(
            os.path.join(FLAGS.output_data_dir, 'outputs.tsv'), 'w') as outf:
        outputs.to_csv(outf, sep='\t', index=False)
def input_fn(input_pattern, mode, params, grammar):
    """Creates input features and labels tensor dicts.

    Args:
        input_pattern: String, input path.
        mode: tf.estimator.ModeKeys execution mode.
        params: HParams object containing model hyperparameters.
        grammar: arithmetic_grammar.Grammar.

    Returns:
        features: Dict containing input tensors.
        labels: Label tensor.
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        randomize = True
        num_epochs = None
    else:
        randomize = False
        num_epochs = 1

    filenames = gfile.Glob(input_pattern)
    num_files = len(filenames)
    filename_dataset = tf.data.Dataset.from_tensor_slices(
        tf.convert_to_tensor(filenames))
    if randomize:
        filename_dataset = filename_dataset.shuffle(num_files)
    dataset = filename_dataset.interleave(
        tf.data.TFRecordDataset,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        cycle_length=num_files,
        block_length=1)
    if randomize:
        dataset = dataset.shuffle(
            params.shuffle_buffer_size or 1000 * params.batch_size)
    dataset = dataset.batch(params.batch_size)
    dataset = dataset.map(
        functools.partial(parse_examples_fn, params=params, grammar=grammar),
        num_parallel_calls=params.num_parallel_calls)
    if params.cache_dataset:
        # Cache the expensive read and parsing from the file system.
        dataset = dataset.cache()
    dataset = dataset.map(
        functools.partial(process_dataset_fn, params=params, grammar=grammar),
        num_parallel_calls=params.num_parallel_calls)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.prefetch(
        params.prefetch_buffer_size or 1000 * params.batch_size)
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
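# Minimal usage sketch, not part of the original source: `my_model_fn`,
# `hparams`, `grammar`, and the file pattern below are hypothetical. The
# input_fn above is typically bound with functools.partial so it can be
# handed to a tf.estimator.Estimator as a zero-argument callable.
def _example_train(my_model_fn, hparams, grammar):
    """Hypothetical helper showing how input_fn is wired into an Estimator."""
    estimator = tf.estimator.Estimator(model_fn=my_model_fn, params=hparams)
    estimator.train(
        input_fn=functools.partial(
            input_fn,
            input_pattern='/tmp/arithmetic/train-*.tfrecord',
            mode=tf.estimator.ModeKeys.TRAIN,
            params=hparams,
            grammar=grammar),
        max_steps=1000)
    return estimator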
def get_model_paths(model_dir):
    """Returns all model paths in the model_dir."""
    all_models = gfile.Glob(os.path.join(model_dir, '*.meta'))
    model_filenames = [os.path.basename(m) for m in all_models]
    model_numbers_names = [
        (shipname.detect_model_num(m), shipname.detect_model_name(m))
        for m in model_filenames]
    model_names = sorted(model_numbers_names)
    return [os.path.join(model_dir, name[1]) for name in model_names]
def get_models():
    """Finds all models, returning a list of (model number, name) pairs sorted in increasing order.

    Returns:
        [(13, '000013-modelname'), (17, '000017-modelname'), ...]
    """
    all_models = gfile.Glob(os.path.join(models_dir(), '*.meta'))
    model_filenames = [os.path.basename(m) for m in all_models]
    model_numbers_names = sorted([
        (shipname.detect_model_num(m), shipname.detect_model_name(m))
        for m in model_filenames])
    return model_numbers_names
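# Hypothetical usage sketch, not part of the original source: because
# get_models() returns (number, name) pairs sorted in increasing order, the
# newest model is simply the last entry.
def latest_model_path():
    """Returns the path of the most recent model, assuming at least one exists."""
    _, model_name = get_models()[-1]
    return os.path.join(models_dir(), model_name)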
def test_train_mnist(self):
    # Create the random data and write it to the disk.
    test_subdirectory = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)

    # Create the model parameters.
    model_path = os.path.join(test_subdirectory, 'temp_model')
    with flagsaver.flagsaver(
        model_path=model_path,
        save_period=1,
        num_dense_units='4,4',
        epochs=1,
        learning_rate=0.1,
        dropout=0.0,
        batch_size=32):
        train_mnist.main(argv=())

    # Verify that the trained model was saved.
    self.assertTrue(
        gfile.Exists(os.path.join(model_path, 'test_accuracy.txt')))
    self.assertLen(
        gfile.Glob(os.path.join(model_path, 'weights_epoch*')), 1)
def _create_dataset(self):
    if not hasattr(self, "dataset_"):
        files = gfile.Glob(dataset_file_pattern(self.dataset_name_))
        if not files:
            raise IOError("Unable to find training files. data_pattern='" +
                          dataset_file_pattern(self.dataset_name_) + "'.")
        # logging.info("Number of training files: %s.", str(len(files)))
        if len(files) > 1:
            # Read in multiple tfrecord files and interleave them in parallel.
            files = tf.data.Dataset.from_tensor_slices(files)
            dataset = files.interleave(
                tf.data.TFRecordDataset,
                cycle_length=self.num_parallel_readers,
                num_parallel_calls=tf.data.experimental.AUTOTUNE,
            )
        else:
            # Only a single tfrecord file was given.
            dataset = tf.data.TFRecordDataset(
                files, num_parallel_reads=self.num_parallel_readers)
        self.dataset_ = dataset
def create_filename_queue(coordinates_file_pattern, shuffle=True):
    """Creates a queue for reading coordinates from coordinate file.

    Args:
        coordinates_file_pattern: File pattern for TFRecords of input examples,
            in the form of a glob pattern or path@shards.
        shuffle: Whether to shuffle the coordinate file list. Note that the
            expanded coordinates_file_pattern is not guaranteed to be sorted
            alphabetically.

    Returns:
        Tensorflow queue with coordinate filenames.
    """
    m = re.search(r'@(\d{1,})', coordinates_file_pattern)
    if m:
        num_shards = int(m.group(1))
        coord_file_list = [
            re.sub(r'@(\d{1,})', '-%.5d-of-%.5d' % (i, num_shards),
                   coordinates_file_pattern) for i in range(num_shards)
        ]
    else:
        coord_file_list = gfile.Glob(coordinates_file_pattern)
    return tf.train.string_input_producer(coord_file_list, shuffle=shuffle)
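# Minimal TF1-style usage sketch, not part of the original source (the
# coordinate path below is hypothetical): the returned queue is backed by
# queue runners, so they must be started before any filenames, and the records
# read from them, become available.
def _example_read_one_record():
    """Hypothetical helper reading a single serialized record from the queue."""
    filename_queue = create_filename_queue('/tmp/coords/coordinates@8',
                                           shuffle=True)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        record = sess.run(serialized_example)
        coord.request_stop()
        coord.join(threads)
    return record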
def get_pbs():
    """Returns paths of all .pb models in the models directory."""
    all_pbs = gfile.Glob(os.path.join(models_dir(), '*.pb'))
    return all_pbs
def get_games(model_name):
    """Returns all selfplay game paths (.zz files) for the given model."""
    return gfile.Glob(os.path.join(selfplay_dir(), model_name, '*.zz'))
def _get(pattern):
    # Glob the matching files and load them in parallel across worker
    # processes, collecting one result per file into a DataFrame.
    files = gfile.Glob(pattern)
    pool = multiprocessing.Pool()
    all_results = pool.map(_load, files)
    return pd.DataFrame(all_results)