def local_analysis(args):
  if args.analysis:
    # Already analyzed.
    return

  if not args.schema or not args.features:
    raise ValueError('Either --analysis, or both --schema and --features must be provided.')

  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_spec = tf_config.get('cluster', {})
  if len(cluster_spec.get('worker', [])) > 0:
    raise ValueError('If "schema" and "features" are provided, local analysis will run and '
                     'only BASIC scale-tier (no worker nodes) is supported.')

  if cluster_spec and not (args.schema.startswith('gs://') and
                           args.features.startswith('gs://')):
    raise ValueError('Cloud trainer requires GCS paths for --schema and --features.')

  print('Running analysis.')
  schema = json.loads(file_io.read_file_to_string(args.schema).decode())
  features = json.loads(file_io.read_file_to_string(args.features).decode())

  args.analysis = os.path.join(args.job_dir, 'analysis')
  args.transform = True
  file_io.recursive_create_dir(args.analysis)
  feature_analysis.run_local_analysis(args.analysis, args.train, schema, features)
  print('Analysis done.')
def test_categorical(self):
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    csv_file = ['red,apple', 'red,pepper', 'red,apple', 'blue,grape',
                'blue,apple', 'green,pepper']
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'type', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'type': {'transform': 'target'}}
    feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)

    # Color column.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)
  finally:
    shutil.rmtree(output_folder)
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        features=features)
  else:
    feature_analysis.run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        features=features)
def test_text(self):
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
                'quick brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'},
              {'name': 'col3', 'type': 'STRING'},
              {'name': 'col4', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'},
                'col3': {'transform': 'multi_hot', 'source_column': 'col3', 'separator': '|'},
                'col4': {'transform': 'target'}}
    feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
    self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col1', 'count'])

    # Vocabs are sorted by count only.
    col1_vocab = vocab['col1'].tolist()
    self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
    self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col2', 'count'])

    # Vocabs are sorted by count only.
    col2_vocab = vocab['col2'].tolist()
    self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
    self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
  finally:
    shutil.rmtree(output_folder)
def testAtomicWriteStringToFileOverwriteFalse(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.atomic_write_string_to_file(file_path, "old", overwrite=False)
  with self.assertRaises(errors.AlreadyExistsError):
    file_io.atomic_write_string_to_file(file_path, "new", overwrite=False)
  file_contents = file_io.read_file_to_string(file_path)
  self.assertEqual("old", file_contents)
  file_io.delete_file(file_path)
  file_io.atomic_write_string_to_file(file_path, "new", overwrite=False)
  file_contents = file_io.read_file_to_string(file_path)
  self.assertEqual("new", file_contents)
def __init__(self, *args, **kwargs):
  super(ApiCompatibilityTest, self).__init__(*args, **kwargs)

  golden_update_warning_filename = os.path.join(
      resource_loader.get_root_dir_with_all_resources(), _UPDATE_WARNING_FILE)
  self._update_golden_warning = file_io.read_file_to_string(
      golden_update_warning_filename)

  test_readme_filename = os.path.join(
      resource_loader.get_root_dir_with_all_resources(), _TEST_README_FILE)
  self._test_readme_message = file_io.read_file_to_string(
      test_readme_filename)
def test_text(self):
  test_folder = os.path.join(self._bucket_root, 'test_text')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
              'quick brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

  schema = [{'name': 'col1', 'type': 'STRING'},
            {'name': 'col2', 'type': 'STRING'},
            {'name': 'col3', 'type': 'STRING'},
            {'name': 'col4', 'type': 'STRING'}]
  features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
              'col2': {'transform': 'tfidf', 'source_column': 'col2'},
              'col3': {'transform': 'multi_hot', 'source_column': 'col3', 'separator': '|'},
              'col4': {'transform': 'target'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      features=features)

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
  self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
  self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['col1', 'count'])
  self.assertEqual(vocab['col1'].tolist(),
                   ['brown', 'quick', 'chicken', 'fox', 'the'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['col2', 'count'])
  self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
def test_categorical(self):
  test_folder = os.path.join(self._bucket_root, 'test_categorical')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  csv_file = ['red,car,apple', 'red,truck,pepper', 'red,van,apple',
              'blue,bike,grape', 'blue,train,apple', 'green,airplane,pepper']
  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

  schema = [{'name': 'color', 'type': 'STRING'},
            {'name': 'transport', 'type': 'STRING'},
            {'name': 'type', 'type': 'STRING'}]
  features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
              'transport': {'transform': 'embedding', 'source_column': 'transport'},
              'type': {'transform': 'target'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      features=features)

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
  self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

  # Color column.
  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['color', 'count'])
  expected_vocab = pd.DataFrame(
      {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
      columns=['color', 'count'])
  pd.util.testing.assert_frame_equal(vocab, expected_vocab)

  # transport column.
  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['transport', 'count'])
  self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
  self.assertEqual(vocab['transport'].tolist(),
                   ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
def get_model_schema_and_features(model_dir):
  """Gets a model's schema and features config.

  Args:
    model_dir: local or GCS path of a model.

  Returns:
    A tuple of schema (list) and features config (dict).
  """
  schema_file = os.path.join(model_dir, 'assets.extra', 'schema.json')
  schema = json.loads(file_io.read_file_to_string(schema_file))
  features_file = os.path.join(model_dir, 'assets.extra', 'features.json')
  features_config = json.loads(file_io.read_file_to_string(features_file))
  return schema, features_config
def test_categorical(self):
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    csv_file = ['red,car,apple', 'red,truck,pepper', 'red,van,apple',
                'blue,bike,grape', 'blue,train,apple', 'green,airplane,pepper']
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'},
              {'name': 'type', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'transport': {'transform': 'embedding', 'source_column': 'transport'},
                'type': {'transform': 'target'}}
    feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    # transport column. As each vocab has the same count, order in file is
    # not known.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['transport', 'count'])
    self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
    self.assertItemsEqual(vocab['transport'].tolist(),
                          ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
  finally:
    shutil.rmtree(output_folder)
def load_model(saved_model_path):
  """Load a keras.Model from SavedModel.

  load_model reinstantiates model state by:
  1) loading model topology from json (this will eventually come
     from metagraph).
  2) loading model weights from checkpoint.

  Args:
    saved_model_path: a string specifying the path to an existing SavedModel.

  Returns:
    a keras.Model instance.
  """
  # restore model topology from json string
  model_json_filepath = os.path.join(
      compat.as_bytes(saved_model_path),
      compat.as_bytes(constants.ASSETS_DIRECTORY),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
  model_json = file_io.read_file_to_string(model_json_filepath)
  model = model_from_json(model_json)

  # restore model weights
  checkpoint_prefix = os.path.join(
      compat.as_text(saved_model_path),
      compat.as_text(constants.VARIABLES_DIRECTORY),
      compat.as_text(constants.VARIABLES_FILENAME))
  model.load_weights(checkpoint_prefix)
  return model
def run_analysis(args):
  """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError if schema contains unknown types.
  """
  import google.datalab.bigquery as bq
  if args.bigquery_table:
    table = bq.Table(args.bigquery_table)
    schema_list = table.schema._bq_schema
  else:
    schema_list = json.loads(
        file_io.read_file_to_string(args.schema_file).decode())
    table = bq.ExternalDataSource(
        source=args.input_file_pattern,
        schema=bq.Schema(schema_list))

  # Check the schema is supported.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type != 'string' and col_type != 'integer' and col_type != 'float':
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  run_numerical_analysis(table, schema_list, args)
  run_categorical_analysis(table, schema_list, args)

  # Save a copy of the schema to the output location.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema_list, indent=2, separators=(',', ': ')))
def testMultipleWrites(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  with file_io.FileIO(file_path, mode="w") as f:
    f.write("line1\n")
    f.write("line2")
  file_contents = file_io.read_file_to_string(file_path)
  self.assertEqual("line1\nline2", file_contents)
def _read_latest_config_files(self, run_path_pairs):
  """Reads and returns the projector config files in every run directory."""
  configs = {}
  config_fpaths = {}
  for run_name, assets_dir in run_path_pairs:
    config = projector_config_pb2.ProjectorConfig()
    config_fpath = os.path.join(assets_dir, PROJECTOR_FILENAME)
    if file_io.file_exists(config_fpath):
      file_content = file_io.read_file_to_string(config_fpath)
      text_format.Merge(file_content, config)
    has_tensor_files = False
    for embedding in config.embeddings:
      if embedding.tensor_path:
        has_tensor_files = True
        break

    if not config.model_checkpoint_path:
      # See if you can find a checkpoint file in the logdir.
      logdir = _assets_dir_to_logdir(assets_dir)
      ckpt_path = _find_latest_checkpoint(logdir)
      if not ckpt_path and not has_tensor_files:
        continue
      if ckpt_path:
        config.model_checkpoint_path = ckpt_path

    # Sanity check for the checkpoint file.
    if (config.model_checkpoint_path and
        not checkpoint_exists(config.model_checkpoint_path)):
      logging.warning('Checkpoint file "%s" not found',
                      config.model_checkpoint_path)
      continue
    configs[run_name] = config
    config_fpaths[run_name] = config_fpath
  return configs, config_fpaths
def testCopy(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.copy(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
def test_numerics(self):
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s,%s' % (i, 10 * i + 0.5, i + 0.5) for i in range(100)]))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'},
              {'name': 'col3', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'},
                'col3': {'transform': 'target'}}
    feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
def testUpdateCheckpointStateSaveRelativePaths(self):
  save_dir = self._get_test_dir("update_checkpoint_state")
  os.chdir(save_dir)
  abs_path2 = os.path.join(save_dir, "model-2")
  rel_path2 = "model-2"
  abs_path0 = os.path.join(save_dir, "model-0")
  rel_path0 = "model-0"
  checkpoint_management.update_checkpoint_state_internal(
      save_dir=save_dir,
      model_checkpoint_path=abs_path2,
      all_model_checkpoint_paths=[rel_path0, abs_path2],
      save_relative_paths=True)

  # File should contain relative paths.
  file_content = file_io.read_file_to_string(
      os.path.join(save_dir, "checkpoint"))
  ckpt = CheckpointState()
  text_format.Merge(file_content, ckpt)
  self.assertEqual(ckpt.model_checkpoint_path, rel_path2)
  self.assertEqual(len(ckpt.all_model_checkpoint_paths), 2)
  self.assertEqual(ckpt.all_model_checkpoint_paths[-1], rel_path2)
  self.assertEqual(ckpt.all_model_checkpoint_paths[0], rel_path0)

  # get_checkpoint_state should return absolute paths.
  ckpt = checkpoint_management.get_checkpoint_state(save_dir)
  self.assertEqual(ckpt.model_checkpoint_path, abs_path2)
  self.assertEqual(len(ckpt.all_model_checkpoint_paths), 2)
  self.assertEqual(ckpt.all_model_checkpoint_paths[-1], abs_path2)
  self.assertEqual(ckpt.all_model_checkpoint_paths[0], abs_path0)
def test_numerics(self):
  test_folder = os.path.join(self._bucket_root, 'test_numerics')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
              'col2': {'transform': 'identity', 'source_column': 'col2'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

  self.assertEqual(stats['num_examples'], 100)
  col = stats['column_stats']['col1']
  self.assertAlmostEqual(col['max'], 99.0)
  self.assertAlmostEqual(col['min'], 0.0)
  self.assertAlmostEqual(col['mean'], 49.5)

  col = stats['column_stats']['col2']
  self.assertAlmostEqual(col['max'], 990.5)
  self.assertAlmostEqual(col['min'], 0.5)
  self.assertAlmostEqual(col['mean'], 495.5)
def _read_file(filename):
  """Reads a file containing `GraphDef` and returns the protocol buffer.

  Args:
    filename: `graph_def` filename including the path.

  Returns:
    A `GraphDef` protocol buffer.

  Raises:
    IOError: If the file doesn't exist, or cannot be successfully parsed.
  """
  graph_def = graph_pb2.GraphDef()
  if not file_io.file_exists(filename):
    raise IOError("File %s does not exist." % filename)
  # First try to read it as a binary file.
  file_content = file_io.read_file_to_string(filename)
  try:
    graph_def.ParseFromString(file_content)
    return graph_def
  except Exception:  # pylint: disable=broad-except
    pass

  # Next try to read it as a text file.
  try:
    text_format.Merge(file_content.decode("utf-8"), graph_def)
  except text_format.ParseError as e:
    raise IOError("Cannot parse file %s: %s." % (filename, str(e)))

  return graph_def
def _GetBaseApiMap(self):
  """Get a map from graph op name to its base ApiDef.

  Returns:
    Dictionary mapping graph op name to corresponding ApiDef.
  """
  # Convert base ApiDef in Multiline format to Proto format.
  converted_base_api_dir = os.path.join(
      test.get_temp_dir(), 'temp_base_api_defs')
  subprocess.check_call(
      [os.path.join(resource_loader.get_root_dir_with_all_resources(),
                    _CONVERT_FROM_MULTILINE_SCRIPT),
       _BASE_API_DIR, converted_base_api_dir])

  name_to_base_api_def = {}
  base_api_files = file_io.get_matching_files(
      os.path.join(converted_base_api_dir, 'api_def_*.pbtxt'))
  for base_api_file in base_api_files:
    if file_io.file_exists(base_api_file):
      api_defs = api_def_pb2.ApiDefs()
      text_format.Merge(
          file_io.read_file_to_string(base_api_file), api_defs)
      for api_def in api_defs.op:
        name_to_base_api_def[api_def.graph_op_name] = api_def
  return name_to_base_api_def
def testFileWrite(self):
  file_path = os.path.join(self.get_temp_dir(), "temp_file")
  file_io.write_string_to_file(file_path, "testing")
  self.assertTrue(file_io.file_exists(file_path))
  file_contents = file_io.read_file_to_string(file_path)
  self.assertEqual(b"testing", file_contents)
  file_io.delete_file(file_path)
def _read_config_files(self, run_paths):
  configs = {}
  config_fpaths = {}
  for run_name, logdir in run_paths.items():
    config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
    if not file_io.file_exists(config_fpath):
      # Skip runs that have no config file.
      continue
    # Read the config file.
    file_content = file_io.read_file_to_string(config_fpath).decode('utf-8')
    config = ProjectorConfig()
    text_format.Merge(file_content, config)

    if not config.model_checkpoint_path:
      # See if you can find a checkpoint file in the logdir.
      ckpt_path = latest_checkpoint(logdir)
      if not ckpt_path:
        # Or in the parent of logdir.
        ckpt_path = latest_checkpoint(os.path.join('../', logdir))
        if not ckpt_path:
          logging.warning('Cannot find model checkpoint in %s', logdir)
          continue
      config.model_checkpoint_path = ckpt_path

    # Sanity check for the checkpoint file.
    if not file_io.file_exists(config.model_checkpoint_path):
      logging.warning('Checkpoint file %s not found',
                      config.model_checkpoint_path)
      continue
    configs[run_name] = config
    config_fpaths[run_name] = config_fpath
  return configs, config_fpaths
def testCopyOverwrite(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.write_string_to_file(file_path, "testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.write_string_to_file(copy_path, "copy")
  file_io.copy(file_path, copy_path, overwrite=True)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
def testCopy(self):
  file_path = os.path.join(self.get_temp_dir(), "temp_file")
  file_io.write_string_to_file(file_path, "testing")
  copy_path = os.path.join(self.get_temp_dir(), "copy_file")
  file_io.copy(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
  file_io.delete_file(file_path)
  file_io.delete_file(copy_path)
def _parse_saved_model(export_dir):
  """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.

  Args:
    export_dir: Directory containing the SavedModel file.

  Returns:
    A `SavedModel` protocol buffer.

  Raises:
    IOError: If the file does not exist, or cannot be successfully parsed.
  """
  # Build the path to the SavedModel in pbtxt format.
  path_to_pbtxt = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
  # Build the path to the SavedModel in pb format.
  path_to_pb = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))

  # Ensure that the SavedModel exists at either path.
  if not file_io.file_exists(path_to_pbtxt) and not file_io.file_exists(
      path_to_pb):
    raise IOError("SavedModel file does not exist at: %s" % export_dir)

  saved_model = saved_model_pb2.SavedModel()

  # Parse the SavedModel protocol buffer.
  try:
    file_content = file_io.read_file_to_string(path_to_pb)
    saved_model.ParseFromString(file_content)
    return saved_model
  except Exception:  # pylint: disable=broad-except
    # Pass for exceptions in order to try reading the file in text format.
    pass

  try:
    file_content = file_io.read_file_to_string(path_to_pbtxt)
    text_format.Merge(file_content.decode("utf-8"), saved_model)
  except text_format.ParseError as e:
    raise IOError("Cannot parse file %s: %s." % (path_to_pbtxt, str(e)))
  return saved_model
def get_checkpoint_state(checkpoint_dir, latest_filename=None):
  """Returns CheckpointState proto from the "checkpoint" file.

  If the "checkpoint" file contains a valid CheckpointState proto, returns it.

  Args:
    checkpoint_dir: The directory of checkpoints.
    latest_filename: Optional name of the checkpoint file.  Default to
      'checkpoint'.

  Returns:
    A CheckpointState if the state was available, None otherwise.

  Raises:
    ValueError: if the checkpoint read doesn't have model_checkpoint_path set.
  """
  ckpt = None
  coord_checkpoint_filename = _GetCheckpointFilename(checkpoint_dir,
                                                     latest_filename)
  f = None
  try:
    # Check that the file exists before opening it to avoid
    # many lines of errors from colossus in the logs.
    if file_io.file_exists(coord_checkpoint_filename):
      file_content = file_io.read_file_to_string(
          coord_checkpoint_filename)
      ckpt = CheckpointState()
      text_format.Merge(file_content, ckpt)
      if not ckpt.model_checkpoint_path:
        raise ValueError("Invalid checkpoint state loaded from "
                         + checkpoint_dir)
      # For relative model_checkpoint_path and all_model_checkpoint_paths,
      # prepend checkpoint_dir.
      if not os.path.isabs(ckpt.model_checkpoint_path):
        ckpt.model_checkpoint_path = os.path.join(checkpoint_dir,
                                                  ckpt.model_checkpoint_path)
      for i in range(len(ckpt.all_model_checkpoint_paths)):
        p = ckpt.all_model_checkpoint_paths[i]
        if not os.path.isabs(p):
          ckpt.all_model_checkpoint_paths[i] = os.path.join(checkpoint_dir, p)
  except errors.OpError as e:
    # It's ok if the file cannot be read
    logging.warning("%s: %s", type(e).__name__, e)
    logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
    return None
  except text_format.ParseError as e:
    logging.warning("%s: %s", type(e).__name__, e)
    logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
    return None
  finally:
    if f:
      f.close()
  return ckpt
def testAssets(self):
  export_dir = os.path.join(
      compat.as_bytes(tf.test.get_temp_dir()), compat.as_bytes("with-assets"))
  builder = saved_model_builder.SavedModelBuilder(export_dir)

  with self.test_session(graph=tf.Graph()) as sess:
    v = tf.Variable(42, name="v")
    sess.run(tf.initialize_all_variables())
    self.assertEqual(42, v.eval())

    # Build an asset collection.
    asset_filepath = os.path.join(
        compat.as_bytes(tf.test.get_temp_dir()),
        compat.as_bytes("hello42.txt"))
    file_io.write_string_to_file(asset_filepath, "foo bar baz")
    asset_file_tensor = tf.constant(asset_filepath, name="asset_file_tensor")
    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor)

    ignored_filepath = os.path.join(
        compat.as_bytes(tf.test.get_temp_dir()),
        compat.as_bytes("ignored.txt"))
    file_io.write_string_to_file(ignored_filepath, "will be ignored")

    asset_collection = tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)

    builder.add_meta_graph_and_variables(
        sess, ["foo"], assets_collection=asset_collection)

  # Save the SavedModel to disk.
  builder.save()

  with self.test_session(graph=tf.Graph()) as sess:
    foo_graph = loader.load(sess, ["foo"], export_dir)

    # Validate the assets.
    collection_def = foo_graph.collection_def
    assets_any = collection_def[constants.ASSETS_KEY].any_list.value
    self.assertEqual(len(assets_any), 1)
    asset = manifest_pb2.AssetFile()
    assets_any[0].Unpack(asset)
    assets_path = os.path.join(
        compat.as_bytes(export_dir),
        compat.as_bytes(constants.ASSETS_DIRECTORY),
        compat.as_bytes("hello42.txt"))
    asset_contents = file_io.read_file_to_string(assets_path)
    self.assertEqual("foo bar baz", compat.as_text(asset_contents))
    self.assertEqual("hello42.txt", asset.filename)
    self.assertEqual("asset_file_tensor:0", asset.tensor_binding.tensor_name)
    ignored_asset_path = os.path.join(
        compat.as_bytes(export_dir),
        compat.as_bytes(constants.ASSETS_DIRECTORY),
        compat.as_bytes("ignored.txt"))
    self.assertFalse(file_io.file_exists(ignored_asset_path))
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.schema:
    schema = json.loads(
        file_io.read_file_to_string(args.schema).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features).decode())

  expand_defaults(schema, features)  # features are updated.
  inverted_features = invert_features(features)
  check_schema_transforms_match(schema, inverted_features)

  file_io.recursive_create_dir(args.output)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        bigquery_table=args.bigquery,
        schema=schema,
        inverted_features=inverted_features)
  else:
    run_local_analysis(
        output_dir=args.output,
        csv_file_pattern=args.csv,
        schema=schema,
        inverted_features=inverted_features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
      os.path.join(args.output, constant.SCHEMA_FILE),
      json.dumps(schema, indent=2))
  file_io.write_string_to_file(
      os.path.join(args.output, constant.FEATURES_FILE),
      json.dumps(features, indent=2))
def read_schema(path):
  """Reads a schema from the provided location.

  Args:
    path: The location of the file holding a serialized Schema proto.

  Returns:
    An instance of Schema or None if the input argument is None.
  """
  result = schema_pb2.Schema()
  contents = file_io.read_file_to_string(path)
  text_format.Parse(contents, result)
  return result
def load_schema_text(input_path: bytes) -> schema_pb2.Schema:
  """Loads the schema stored in text format in the input path.

  Args:
    input_path: File path to load the schema from.

  Returns:
    A Schema protocol buffer.
  """
  schema = schema_pb2.Schema()
  schema_text = file_io.read_file_to_string(input_path)
  text_format.Parse(schema_text, schema)
  return schema
def testGeneratedFileMatchesHead(self):
  expected_contents = gradient_input_output_exclusions.get_contents()
  filename = os.path.join(
      resource_loader.get_root_dir_with_all_resources(),
      resource_loader.get_path_to_datafile("pywrap_gradient_exclusions.cc"))
  actual_contents = file_io.read_file_to_string(filename)
  self.assertEqual(actual_contents, expected_contents, """
pywrap_gradient_exclusions.cc needs to be updated.
Please regenerate using:
bazel run tensorflow/python/eager:gradient_input_output_exclusions -- $PWD/tensorflow/python/eager/pywrap_gradient_exclusions.cc""")
def get_data(name, dir):
  if name not in ['train', 'test', 'unlabeled']:
    raise ValueError('{} is not in the dataset!'.format(name))
  data = np.load(
      BytesIO(file_io.read_file_to_string('{}/{}.npz'.format(dir, name))))
  seqs = data['data']
  labels = None
  if name != 'unlabeled':
    labels = data['label']
  return seqs, labels
def load_batch(fpath):
  object = file_io.read_file_to_string(fpath)
  # origin_bytes = bytes(object, encoding='latin1')
  # with open(fpath, 'rb') as f:
  if sys.version_info > (3, 0):
    # Python3
    d = pickle.loads(object, encoding='latin1')
  else:
    # Python2
    d = pickle.loads(object)
  data = d["data"]
  labels = d["labels"]
  return data, labels
def _fetch_embedding(self, emb_filepath):
  try:
    embedding = np.frombuffer(
        file_io.read_file_to_string(emb_filepath), dtype=np.float32)
    embedding = embedding.reshape(self.SHAPE)
  except ValueError as e:
    logging.warn('Could not load an embedding file from %s: %s',
                 emb_filepath, str(e))
    error_count.inc()
    if e.message.startswith('cannot reshape array of size 0 into'):
      file_io.delete_file(emb_filepath)
      return
    raise e
def run_analysis(args):
  """Builds analysis files for training."""
  # Read the schema and input feature types.
  schema_list = json.loads(
      file_io.read_file_to_string(args.schema_file))

  run_numerical_categorical_analysis(args, schema_list)

  # Also save a copy of the schema in the output folder.
  file_io.copy(args.schema_file,
               os.path.join(args.output_dir, SCHEMA_FILE),
               overwrite=True)
def read_schema(file_path):
  """Reads a schema file from the specified location.

  Args:
    file_path: The location of the file holding a serialized Schema proto.

  Returns:
    An instance of Schema object.
  """
  result = schema_pb2.Schema()
  contents = file_io.read_file_to_string(file_path)
  text_format.Parse(contents, result)
  return result
def get_checkpoint_state(checkpoint_dir, latest_filename=None):
  """Returns CheckpointState proto from the "checkpoint" file.

  If the "checkpoint" file contains a valid CheckpointState proto, returns it.

  Args:
    checkpoint_dir: The directory of checkpoints.
    latest_filename: Optional name of the checkpoint file.  Default to
      'checkpoint'.

  Returns:
    A CheckpointState if the state was available, None otherwise.

  Raises:
    ValueError: if the checkpoint read doesn't have model_checkpoint_path set.
  """
  ckpt = None
  coord_checkpoint_filename = _GetCheckpointFilename(checkpoint_dir,
                                                     latest_filename)
  f = None
  try:
    # Check that the file exists before opening it to avoid
    # many lines of errors from colossus in the logs.
    if file_io.file_exists(coord_checkpoint_filename):
      file_content = file_io.read_file_to_string(
          coord_checkpoint_filename)
      ckpt = CheckpointState()
      text_format.Merge(file_content, ckpt)
      if not ckpt.model_checkpoint_path:
        raise ValueError("Invalid checkpoint state loaded from "
                         + checkpoint_dir)
      # For relative model_checkpoint_path and all_model_checkpoint_paths,
      # prepend checkpoint_dir.
      if not os.path.isabs(ckpt.model_checkpoint_path):
        ckpt.model_checkpoint_path = os.path.join(
            checkpoint_dir, ckpt.model_checkpoint_path)
      for i, p in enumerate(ckpt.all_model_checkpoint_paths):
        if not os.path.isabs(p):
          ckpt.all_model_checkpoint_paths[i] = os.path.join(
              checkpoint_dir, p)
  except errors.OpError as e:
    # It's ok if the file cannot be read
    logging.warning("%s: %s", type(e).__name__, e)
    logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
    return None
  except text_format.ParseError as e:
    logging.warning("%s: %s", type(e).__name__, e)
    logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
    return None
  finally:
    if f:
      f.close()
  return ckpt
def load_from_saved_model(saved_model_path, custom_objects=None):
  """Loads a keras Model from a SavedModel created by `export_saved_model()`.

  This function reinstantiates model state by:
  1) loading model topology from json (this will eventually come
     from metagraph).
  2) loading model weights from checkpoint.

  Example:

  ```python
  import tensorflow as tf

  # Create a tf.keras model.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
  model.summary()

  # Save the tf.keras model in the SavedModel format.
  path = '/tmp/simple_keras_model'
  tf.keras.experimental.export_saved_model(model, path)

  # Load the saved keras model back.
  new_model = tf.keras.experimental.load_from_saved_model(path)
  new_model.summary()
  ```

  Args:
    saved_model_path: a string specifying the path to an existing SavedModel.
    custom_objects: Optional dictionary mapping names (strings) to custom
      classes or functions to be considered during deserialization.

  Returns:
    a keras.Model instance.
  """
  # restore model topology from json string
  model_json_filepath = os.path.join(
      compat.as_bytes(saved_model_path),
      compat.as_bytes(constants.ASSETS_DIRECTORY),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
  model_json = file_io.read_file_to_string(model_json_filepath)
  model = model_from_json(model_json, custom_objects=custom_objects)

  # restore model weights
  checkpoint_prefix = os.path.join(
      compat.as_text(saved_model_path),
      compat.as_text(constants.VARIABLES_DIRECTORY),
      compat.as_text(constants.VARIABLES_FILENAME))
  model.load_weights(checkpoint_prefix)
  return model
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.csv_schema_file:
    schema = json.loads(
        file_io.read_file_to_string(args.csv_schema_file).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery_table).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features_file).decode())

  expand_defaults(schema, features)  # features are updated.
  inverted_features = invert_features(features)
  check_schema_transforms_match(schema, inverted_features)

  file_io.recursive_create_dir(args.output_dir)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        bigquery_table=args.bigquery_table,
        schema=schema,
        inverted_features=inverted_features)
  else:
    run_local_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        schema=schema,
        inverted_features=inverted_features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, constant.SCHEMA_FILE),
      json.dumps(schema, indent=2))
  file_io.write_string_to_file(
      os.path.join(args.output_dir, constant.FEATURES_FILE),
      json.dumps(features, indent=2))
def main(train_file, test_file, output_file, num_words, batch_size, epoch):
  num_features = 300
  model = word_level_cnn(num_words, num_features)

  print('=========== Loading word2vec ===========')
  wordModel = KeyedVectors.load_word2vec_format('../google_300_model.bin', binary=True)

  train_input = StringIO(file_io.read_file_to_string(train_file))
  train = read_data(train_input)
  X = encode_data_by_word(train['review'], num_words, num_features, wordModel, True)
  y = keras.utils.to_categorical(train['sentiment'])
  # model = word_level_lstm(num_words, num_features)
  X = np.reshape(X, [-1, num_words, 1, num_features])
  model.fit(X, y, batch_size=batch_size, epochs=epoch, shuffle=True)

  test_input = StringIO(file_io.read_file_to_string(test_file))
  test = read_data(test_input)
  test_X = encode_data_by_word(test['review'], num_words, num_features, wordModel, True)
  test_X = np.reshape(test_X, [-1, num_words, 1, num_features])

  print('========== do prediction ===============')
  pred = model.predict(test_X)
  pred = np.squeeze(pred)
  pred = np.argmax(pred, axis=1)

  output = pd.DataFrame(data={"id": test["id"], "sentiment": pred})
  result_file = file_io.FileIO(output_file, 'w')
  output.to_csv(result_file, index=False, quoting=3)
  result_file.close()
def __init__(self, **kwargs):
  self.__dict__.update(kwargs)

  # load dictionary with top num_words to index
  tmp = StringIO(file_io.read_file_to_string(self.dict_file))
  self.dict_top_words_to_index = np.load(tmp).item()
  # create dictionary index to word
  self.dict_index_to_top_words = {
      v: k for k, v in self.dict_top_words_to_index.iteritems()}
  self.num_words = len(self.dict_top_words_to_index)

  tmp = StringIO(file_io.read_file_to_string(self.bias_file))
  self.word_bias_init = np.load(tmp)

  # load word embeddings
  self.tf_initialize_word_embeddings()
  print('word embeddings randomly initialized')

  # training set: initialize .tfrecords reader and tf batch variables
  print('initialize .tfrecords from: ' + self.train_file)
  self.image_train, self.label_train, self.id_train = read_and_decode_example(
      self.train_file,
      self.max_sequence_len,
      self.num_epochs,
  )
  self.image_batch_train, self.label_batch_train = tf.train.shuffle_batch(
      [self.image_train, self.label_train],
      batch_size=self.batch_size,
      capacity=self.min_after_dequeue + 3 * self.batch_size,
      min_after_dequeue=self.min_after_dequeue,
  )

  # load pre-trained CNN weights
  tmp = StringIO(file_io.read_file_to_string(self.weights_file))
  self.weights_dict = np.load(tmp, encoding='latin1').item()

  # run training
  self.train()
def test_numerics(self):
  """Build a BQ table, and then call analyze on it."""
  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'},
            {'name': 'col3', 'type': 'FLOAT'}]
  project_id = dl.Context.default().project_id
  dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
  table_name = 'temp_table'
  full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

  output_folder = tempfile.mkdtemp()

  try:
    # Make a dataset, a table, and insert data.
    db = bq.Dataset((project_id, dataset_name))
    db.create()

    table = bq.Table(full_table_name)
    table.create(schema=bq.Schema(schema), overwrite=True)

    data = [{'col1': i, 'col2': 10 * i + 0.5, 'col3': i + 0.5}
            for i in range(100)]
    table.insert(data)

    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'},
                'col3': {'transform': 'target'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=None,
        bigquery_table=full_table_name,
        schema=schema,
        features=features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
    db.delete(delete_contents=True)
def main(job_dir, **args):
  # # local load
  # X = np.load('x_mood_dataset_samples_22k_off43_dur59049.npy')
  # y = np.load('y_mood_dataset_samples_22k_off43_dur59049.npy')

  # cloud load (PYTHON 2 ONLY - py2.7 runs on CloudML by default)
  from StringIO import StringIO

  f = StringIO(file_io.read_file_to_string(
      job_dir + 'x_mood_dataset_samples_22k_off43_dur59049.npy'))
  X = np.load(f)
  f1 = StringIO(file_io.read_file_to_string(
      job_dir + 'y_mood_dataset_samples_22k_off43_dur59049.npy'))
  y = np.load(f1)

  print('shape of training data: ', X.shape)
  print('shape of labels: ', y.shape)

  y = to_categorical(y)
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.20, random_state=42, stratify=y)

  with tf.device('/cpu:0'):
    model = build_model(input_shape=X_train[0].shape)
  model = tf.keras.utils.multi_gpu_model(model, gpus=NUM_GPUS)

  train(model, job_dir, X_train, y_train, X_test, y_test)
def load_stats_text(input_path):
  """Loads the specified DatasetFeatureStatisticsList proto stored in text format.

  Args:
    input_path: File path from which to load the DatasetFeatureStatisticsList
      proto.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_proto = statistics_pb2.DatasetFeatureStatisticsList()
  stats_text = file_io.read_file_to_string(input_path)
  text_format.Parse(stats_text, stats_proto)
  return stats_proto
def read(self, schema_path: Text) -> schema_pb2.Schema:
  """Gets a tf.metadata schema.

  Args:
    schema_path: Path to schema file.

  Returns:
    A tf.metadata schema.
  """
  result = schema_pb2.Schema()
  contents = file_io.read_file_to_string(schema_path)
  text_format.Parse(contents, result)
  return result
def main(train_file, test_file, output_file, num_chars, batch_size, epoch):
  char_to_idx = char_idx_map()
  vocab_size = len(char_to_idx)
  cnn = char_level_cnn(num_chars, vocab_size)

  train_input = StringIO(file_io.read_file_to_string(train_file))
  train = read_data(train_input)
  test_input = StringIO(file_io.read_file_to_string(test_file))
  test = read_data(test_input)

  X = encode_data_by_char(train['review'], num_chars, char_to_idx, False, one_hot=True)
  X = np.reshape(X, [-1, num_chars, 1, vocab_size])
  y = keras.utils.to_categorical(train['sentiment'])
  cnn.fit(X, y, batch_size=batch_size, epochs=epoch, shuffle=True)

  test_X = encode_data_by_char(test['review'], num_chars, char_to_idx, False, one_hot=True)
  test_X = np.reshape(test_X, [-1, num_chars, 1, vocab_size])

  print('========== do prediction ===============')
  pred = cnn.predict(test_X)
  pred = np.squeeze(pred)
  pred = np.argmax(pred, axis=1)

  output = pd.DataFrame(data={"id": test["id"], "sentiment": pred})
  result_file = file_io.FileIO(output_file, 'w')
  output.to_csv(result_file, index=False, quoting=3)
  result_file.close()
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
  else:
    base_model = None

  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  result.transform_output = result.transform_graph_path
  result.serving_model_dir = serving_model_dir
  result.eval_model_dir = eval_model_dir
  result.model_run_dir = model_run_dir
  result.schema_file = result.schema_path
  result.base_model = base_model
  result.hyperparameters = hyperparameters_config
  result.custom_config = custom_config
  return result
def _validate_asset_collection(self, export_dir, graph_collection_def,
                               expected_asset_file_name,
                               expected_asset_file_contents,
                               expected_asset_tensor_name):
  assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
  asset = meta_graph_pb2.AssetFileDef()
  assets_any[0].Unpack(asset)
  assets_path = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.ASSETS_DIRECTORY),
      compat.as_bytes(expected_asset_file_name))
  actual_asset_contents = file_io.read_file_to_string(assets_path)
  self.assertEqual(expected_asset_file_contents,
                   compat.as_text(actual_asset_contents))
  self.assertEqual(expected_asset_file_name, asset.filename)
  self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
def load_from_saved_model(saved_model_path):
  """Loads a keras.Model from a SavedModel created by keras export().

  This function reinstantiates model state by:
  1) loading model topology from json (this will eventually come
     from metagraph).
  2) loading model weights from checkpoint.

  Example:

  ```python
  import tensorflow as tf

  # Create a tf.keras model.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
  model.summary()

  # Save the tf.keras model in the SavedModel format.
  saved_to_path = tf.keras.experimental.export(
      model, '/tmp/my_simple_tf_keras_saved_model')

  # Load the saved keras model back.
  model_prime = tf.keras.experimental.load_from_saved_model(saved_to_path)
  model_prime.summary()
  ```

  Args:
    saved_model_path: a string specifying the path to an existing SavedModel.

  Returns:
    a keras.Model instance.
  """
  # restore model topology from json string
  model_json_filepath = os.path.join(
      compat.as_bytes(saved_model_path),
      compat.as_bytes(constants.ASSETS_DIRECTORY),
      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
  model_json = file_io.read_file_to_string(model_json_filepath)
  model = model_from_json(model_json)

  # restore model weights
  checkpoint_prefix = os.path.join(
      compat.as_text(saved_model_path),
      compat.as_text(constants.VARIABLES_DIRECTORY),
      compat.as_text(constants.VARIABLES_FILENAME))
  model.load_weights(checkpoint_prefix)
  return model
def testRun(self, mock_publisher):
  mock_publisher.return_value.publish_execution.return_value = {}

  test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  connection_config = metadata_store_pb2.ConnectionConfig()
  connection_config.sqlite.SetInParent()
  metadata_connection = metadata.Metadata(connection_config)

  pipeline_root = os.path.join(test_dir, 'Test')
  input_path = os.path.join(test_dir, 'input')
  fileio.makedirs(os.path.dirname(input_path))
  file_io.write_string_to_file(input_path, 'test')
  input_artifact = test_utils._InputArtifact()
  input_artifact.uri = input_path

  component = test_utils._FakeComponent(
      name='FakeComponent',
      input_channel=channel_utils.as_channel([input_artifact]))

  pipeline_info = data_types.PipelineInfo(
      pipeline_name='Test', pipeline_root=pipeline_root, run_id='123')

  driver_args = data_types.DriverArgs(enable_cache=True)

  # We use InProcessComponentLauncher to test BaseComponentLauncher logics.
  launcher = in_process_component_launcher.InProcessComponentLauncher.create(
      component=component,
      pipeline_info=pipeline_info,
      driver_args=driver_args,
      metadata_connection=metadata_connection,
      beam_pipeline_args=[],
      additional_pipeline_args={})
  self.assertEqual(
      launcher._component_info.component_type, '.'.join([
          test_utils._FakeComponent.__module__,
          test_utils._FakeComponent.__name__
      ]))

  launcher.launch()

  output_path = component.outputs['output'].get()[0].uri
  self.assertTrue(fileio.exists(output_path))
  contents = file_io.read_file_to_string(output_path)
  self.assertEqual('test', contents)
def main():
  tf.logging.set_verbosity(tf.logging.INFO)
  args = parse_arguments()
  # Flatten comma-separated column groups into a single list of columns.
  args.slice_columns = [
      column
      for column_group in args.slice_columns
      for column in column_group.split(',')
  ]
  schema = json.loads(file_io.read_file_to_string(args.schema))
  eval_model_parent_dir = os.path.join(args.model, 'tfma_eval_model_dir')
  model_export_dir = os.path.join(
      eval_model_parent_dir,
      file_io.list_directory(eval_model_parent_dir)[0])
  run_analysis(args.output, model_export_dir, args.eval, schema,
               args.project, args.mode, args.slice_columns)
  generate_static_html_output(args.output, args.slice_columns)
  with open('/output.txt', 'w') as f:
    f.write(args.output)
def _update_execution_proto(
    self,
    execution: metadata_store_pb2.Execution,
    pipeline_info: Optional[data_types.PipelineInfo] = None,
    component_info: Optional[data_types.ComponentInfo] = None,
    state: Optional[Text] = None,
    exec_properties: Optional[Dict[Text, Any]] = None,
) -> metadata_store_pb2.Execution:
  """Updates the execution proto with given type and state."""
  if state is not None:
    execution.properties[
        _EXECUTION_TYPE_KEY_STATE].string_value = tf.compat.as_text(state)
    # Forward-compatible change to leverage built-in schema to track states.
    if state == EXECUTION_STATE_CACHED:
      execution.last_known_state = metadata_store_pb2.Execution.CACHED
    elif state == EXECUTION_STATE_COMPLETE:
      execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
    elif state == EXECUTION_STATE_NEW:
      execution.last_known_state = metadata_store_pb2.Execution.RUNNING

  exec_properties = exec_properties or {}
  # TODO(ruoyu): Enforce a formal rule for execution schema change.
  for k, v in exec_properties.items():
    # We always convert execution properties to unicode.
    execution.properties[k].string_value = tf.compat.as_text(
        tf.compat.as_str_any(v))

  # We also need to checksum UDF file to identify different binary being
  # used. Do we have a better way to checksum a file than hashlib.md5?
  # TODO(ruoyu): Find a better place / solution to the checksum logic.
  # TODO(ruoyu): SHA instead of MD5.
  if 'module_file' in exec_properties and exec_properties[
      'module_file'] and fileio.exists(exec_properties['module_file']):
    contents = file_io.read_file_to_string(exec_properties['module_file'])
    execution.properties['checksum_md5'].string_value = tf.compat.as_text(
        tf.compat.as_str_any(
            hashlib.md5(tf.compat.as_bytes(contents)).hexdigest()))

  if pipeline_info:
    execution.properties[
        'pipeline_name'].string_value = pipeline_info.pipeline_name
    execution.properties[
        'pipeline_root'].string_value = pipeline_info.pipeline_root
    if pipeline_info.run_id:
      execution.properties['run_id'].string_value = pipeline_info.run_id

  if component_info:
    execution.properties[
        'component_id'].string_value = component_info.component_id

  return execution
def test_run(self, mock_publisher):
  mock_publisher.return_value.publish_execution.return_value = {}

  test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  connection_config = metadata_store_pb2.ConnectionConfig()
  connection_config.sqlite.SetInParent()

  pipeline_root = os.path.join(test_dir, 'Test')
  input_path = os.path.join(test_dir, 'input')
  tf.gfile.MakeDirs(os.path.dirname(input_path))
  file_io.write_string_to_file(input_path, 'test')
  input_artifact = types.TfxArtifact(type_name='InputPath')
  input_artifact.uri = input_path

  component = _FakeComponent(
      name='FakeComponent',
      input_channel=channel.as_channel([input_artifact]))

  pipeline_info = data_types.PipelineInfo(
      pipeline_name='Test', pipeline_root=pipeline_root, run_id='123')

  driver_args = data_types.DriverArgs(
      worker_name=component.component_id,
      base_output_dir=os.path.join(pipeline_root, component.component_id),
      enable_cache=True)

  launcher = component_launcher.ComponentLauncher(
      component=component,
      pipeline_info=pipeline_info,
      driver_args=driver_args,
      metadata_connection_config=connection_config,
      additional_pipeline_args={})
  self.assertEqual(
      launcher._component_info.component_type,
      '.'.join([_FakeComponent.__module__, _FakeComponent.__name__]))

  launcher.launch()

  output_path = os.path.join(pipeline_root, 'output')
  self.assertTrue(tf.gfile.Exists(output_path))
  contents = file_io.read_file_to_string(output_path)
  self.assertEqual('test', contents)
def test_numerics(self):
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
def run_fn(fn_args: TrainerFnArgs):
  hparams = fn_args.hyperparameters
  if type(hparams) is dict and 'values' in hparams.keys():
    hparams = hparams['values']

  schema = schema_pb2.Schema()
  schema_text = file_io.read_file_to_string(fn_args.schema_file)
  text_format.Parse(schema_text, schema)
  feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec

  tf_transform_output = tft.TFTransformOutput(fn_args.transform_output)

  train_dataset = _input_fn(fn_args.train_files, tf_transform_output)
  eval_dataset = _input_fn(fn_args.eval_files, tf_transform_output)

  mirrored_strategy = tf.distribute.MirroredStrategy()
  with mirrored_strategy.scope():
    model = _build_keras_model(hparams=hparams)

  try:
    log_dir = fn_args.model_run_dir
  except KeyError:
    log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')

  # Write logs to path
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir, update_freq='batch')

  model.fit(
      train_dataset,
      steps_per_epoch=fn_args.train_steps,
      validation_data=eval_dataset,
      validation_steps=fn_args.eval_steps,
      callbacks=[tensorboard_callback])

  signatures = {
      'serving_default':
          _get_serve_tf_examples_fn(
              model, tf_transform_output).get_concrete_function(
                  tf.TensorSpec(shape=[None], dtype=tf.string, name='examples'))
  }
  model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)
def testGeneratedFileMatchesHead(self):
  expected_contents = gradient_input_output_exclusions.get_contents()
  filename = os.path.join(
      resource_loader.get_root_dir_with_all_resources(),
      resource_loader.get_path_to_datafile("pywrap_gradient_exclusions.cc"))
  actual_contents = file_io.read_file_to_string(filename)

  # On windows, one or both of these strings may have CRLF line endings.
  # To make sure, sanitize both:
  sanitized_actual_contents = actual_contents.replace("\r", "")
  sanitized_expected_contents = expected_contents.replace("\r", "")

  self.assertEqual(sanitized_actual_contents, sanitized_expected_contents, """
pywrap_gradient_exclusions.cc needs to be updated.
Please regenerate using:
bazel run tensorflow/python/eager:gradient_input_output_exclusions -- $PWD/tensorflow/python/eager/pywrap_gradient_exclusions.cc""")
def get_data(imgtype):
  train_images = []
  path = "./Class/"
  for x in range(1, 7):
    path = "./Class" + str(x) + "/"
    print(path)
    read_file = file_io.read_file_to_string(path + imgtype + "/Label/Labels.txt")
    read_file = str(read_file)
    df = pd.read_fwf(path + imgtype + "/Label/Labels.txt")
    for i in range(0, len(df)):
      if (int(df.iloc[i][1]) == 1):
        fname = path + imgtype + "/" + str(df.iloc[i][2])
        print(fname)
        img = cv2.imread(fname, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, (512, 512))
        train_images.append([np.array(img)])
  return train_images