def test_make_transform_graph_category(self):
    """Check the ids produced for 'one_hot' and 'embedding' string columns.

    Two vocabulary files and an empty stats file are written to a temporary
    folder, the transform graph is built from them, and the saved graph is
    run on three rows. The expectations use each token's position in its
    vocabulary file as its id, and id 3 for the two 'cat2' inputs ('' and
    'extra') that are absent from the three-entry vocabulary.
    """
    output_folder = tempfile.mkdtemp()
    try:
        file_io.write_string_to_file(
            os.path.join(output_folder,
                         analyze_data.VOCAB_ANALYSIS_FILE % 'cat1'),
            '\n'.join(['red,300', 'blue,200', 'green,100']))

        file_io.write_string_to_file(
            os.path.join(output_folder,
                         analyze_data.VOCAB_ANALYSIS_FILE % 'cat2'),
            '\n'.join(['pizza,300', 'ice_cream,200', 'cookies,100']))

        file_io.write_string_to_file(
            os.path.join(output_folder, analyze_data.STATS_FILE),
            json.dumps({}))  # stats file needed but unused.

        analyze_data.make_transform_graph(
            output_folder,
            [{'name': 'cat1', 'type': 'STRING'},
             {'name': 'cat2', 'type': 'STRING'}],
            {'cat1': {'transform': 'one_hot'},
             'cat2': {'transform': 'embedding'}})

        model_path = os.path.join(output_folder, 'transform_fn')
        self.assertTrue(
            os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

        results = self._run_graph(
            model_path,
            {'cat1': ['red', 'blue', 'green'],
             'cat2': ['pizza', '', 'extra']})

        expectations = (('cat1', [0, 1, 2]), ('cat2', [0, 3, 3]))
        for column, expected_ids in expectations:
            actual_ids = results[column].tolist()
            # zip() stops at the shorter sequence, so without this check a
            # truncated or empty result would pass the loop below vacuously.
            self.assertEqual(len(actual_ids), len(expected_ids))
            for result, expected_result in zip(actual_ids, expected_ids):
                self.assertAlmostEqual(result, expected_result)
    finally:
        shutil.rmtree(output_folder)
def test_make_transform_graph_images(self):
    """Check that the 'image_to_vec' transform yields float32 embeddings.

    Builds a transform graph for a single image column, runs it on two JPEG
    images read from gs:// paths, and checks that two embeddings of length
    2048 and dtype float32 come back and that the second one is not all
    zeros. As the printed notice says, the run may be slow because a large
    inception checkpoint has to be downloaded.
    """
    print('Testing make_transform_graph with image_to_vec. ' +
          'It may take a few minutes because it needs to download a large inception checkpoint.')

    def _open_and_encode_image(img_url):
        """Read the image at img_url and return it re-encoded as JPEG data."""
        with file_io.FileIO(img_url, 'r') as f:
            img = Image.open(f).convert('RGB')
            output = cStringIO.StringIO()
            img.save(output, 'jpeg')
        return output.getvalue()

    # Created outside the try block: if mkdtemp() itself fails there is
    # nothing to remove, and the finally clause must not hit an unbound name
    # that would hide the real error.
    output_folder = tempfile.mkdtemp()
    try:
        stats_file_path = os.path.join(output_folder,
                                       analyze_data.STATS_FILE)
        file_io.write_string_to_file(stats_file_path,
                                     json.dumps({'column_stats': {}}))

        analyze_data.make_transform_graph(
            output_folder,
            [{'name': 'img', 'type': 'STRING'}],
            {'img': {'transform': 'image_to_vec'}})

        model_path = os.path.join(output_folder, 'transform_fn')
        self.assertTrue(
            os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

        img_string1 = _open_and_encode_image(
            'gs://cloud-ml-data/img/flower_photos/daisy/15207766_fc2f1d692c_n.jpg')
        img_string2 = _open_and_encode_image(
            'gs://cloud-ml-data/img/flower_photos/dandelion/8980164828_04fbf64f79_n.jpg')

        results = self._run_graph(model_path,
                                  {'img': [img_string1, img_string2]})
        embeddings = results['img']
        self.assertEqual(len(embeddings), 2)
        self.assertEqual(len(embeddings[0]), 2048)
        self.assertEqual(embeddings[0].dtype, np.float32)
        self.assertTrue(any(x != 0.0 for x in embeddings[1]))
    finally:
        shutil.rmtree(output_folder)
def test_make_transform_graph_text_bag_of_words(self):
    """Check the sparse ids and counts produced by the 'bag_of_words' transform.

    A three-word vocabulary is written to a temporary folder, the transform
    graph is built for one text column, and seven documents are run through
    it. The resulting 'cat1_ids' and 'cat1_weights' sparse outputs are
    compared against hand-computed indices, ids and per-token counts.
    """
    output_folder = tempfile.mkdtemp()
    try:
        # vocab  id
        # red    0
        # blue   1
        # green  2
        # oov    3 (out of vocab)
        vocab_path = os.path.join(
            output_folder, analyze_data.VOCAB_ANALYSIS_FILE % 'cat1')
        file_io.write_string_to_file(
            vocab_path, '\n'.join(['red,2', 'blue,2', 'green,1']))

        # Stats file needed but unused.
        stats_path = os.path.join(output_folder, analyze_data.STATS_FILE)
        file_io.write_string_to_file(stats_path, json.dumps({}))

        schema = [{'name': 'cat1', 'type': 'STRING'}]
        features = {'cat1': {'transform': 'bag_of_words'}}
        analyze_data.make_transform_graph(output_folder, schema, features)

        model_path = os.path.join(output_folder, 'transform_fn')
        saved_model_file = os.path.join(model_path, 'saved_model.pb')
        self.assertTrue(os.path.isfile(saved_model_file))

        documents = [
            'red red red',    # doc 0
            'red green red',  # doc 1
            'blue',           # doc 2
            'blue blue',      # doc 3
            '',               # doc 4
            'brown',          # doc 5
            'brown blue',     # doc 6
        ]
        results = self._run_graph(model_path, {'cat1': documents})

        # Each index is [doc id, slot], where slot numbers the distinct
        # tokens of that document. The vocab ids themselves are the values
        # listed in expected_ids.
        expected_indices = [[0, 0],
                            [1, 0], [1, 1],
                            [2, 0],
                            [3, 0],
                            [5, 0],
                            [6, 0], [6, 1]]

        # Note that for doc 6 the order is blue, then brown.
        # doc id       0  1  1  2  3  5  6  6
        expected_ids = [0, 0, 2, 1, 1, 3, 1, 3]
        expected_weights = [3, 2, 1, 1, 2, 1, 1, 1]

        sparse_ids = results['cat1_ids']
        self.assertEqual(sparse_ids.indices.tolist(), expected_indices)
        self.assertEqual(sparse_ids.dense_shape.tolist(), [7, 4])
        self.assertEqual(sparse_ids.values.tolist(), expected_ids)

        sparse_weights = results['cat1_weights']
        self.assertEqual(sparse_weights.indices.tolist(), expected_indices)
        self.assertEqual(sparse_weights.dense_shape.tolist(), [7, 4])
        self.assertEqual(sparse_weights.values.size, len(expected_weights))
        weight_pairs = zip(sparse_weights.values.tolist(), expected_weights)
        for weight, exp_weight in weight_pairs:
            self.assertAlmostEqual(weight, exp_weight)
    finally:
        shutil.rmtree(output_folder)
def test_make_transform_graph_text_tfidf(self):
    """Check the sparse ids and tf-idf weights produced by the 'tfidf' transform.

    A three-word vocabulary and a stats file with num_examples = 4 are
    written to a temporary folder, the transform graph is built for one text
    column, and seven documents are run through it. The 'cat1_ids' and
    'cat1_weights' sparse outputs are compared against hand-computed values.
    """
    output_folder = tempfile.mkdtemp()
    try:
        # vocab  id
        # red    0
        # blue   1
        # green  2
        # oov    3 (out of vocab)
        # corpus size aka num_examples = 4
        # IDF: log(num_examples/(1+number of examples that have this token))
        #  red:   log(4/3)
        #  blue:  log(4/3)
        #  green: log(4/2)
        #  oov:   log(4/1)
        vocab_path = os.path.join(
            output_folder, analyze_data.VOCAB_ANALYSIS_FILE % 'cat1')
        file_io.write_string_to_file(
            vocab_path, '\n'.join(['red,2', 'blue,2', 'green,1']))

        stats_path = os.path.join(output_folder, analyze_data.STATS_FILE)
        file_io.write_string_to_file(
            stats_path, json.dumps({'num_examples': 4}))

        schema = [{'name': 'cat1', 'type': 'STRING'}]
        features = {'cat1': {'transform': 'tfidf'}}
        analyze_data.make_transform_graph(output_folder, schema, features)

        model_path = os.path.join(output_folder, 'transform_fn')
        saved_model_file = os.path.join(model_path, 'saved_model.pb')
        self.assertTrue(os.path.isfile(saved_model_file))

        documents = [
            'red red red',    # doc 0
            'red green red',  # doc 1
            'blue',           # doc 2
            'blue blue',      # doc 3
            '',               # doc 4
            'brown',          # doc 5
            'brown blue',     # doc 6
        ]
        results = self._run_graph(model_path, {'cat1': documents})

        # Each index is [doc id, slot], where slot numbers the distinct
        # tokens of that document. The vocab ids themselves are the values
        # listed in expected_ids.
        expected_indices = [[0, 0],
                            [1, 0], [1, 1],
                            [2, 0],
                            [3, 0],
                            [5, 0],
                            [6, 0], [6, 1]]
        # Note in doc 6, it is blue, then brown.
        expected_ids = [0, 0, 2, 1, 1, 3, 1, 3]

        sparse_ids = results['cat1_ids']
        self.assertEqual(sparse_ids.indices.tolist(), expected_indices)
        self.assertEqual(sparse_ids.dense_shape.tolist(), [7, 4])
        self.assertEqual(sparse_ids.values.tolist(), expected_ids)

        # Note, these are natural logs. Each weight is the token's share of
        # the document's tokens multiplied by the token's IDF.
        idf_red_or_blue = math.log(4.0 / 3.0)
        idf_green = math.log(2.0)
        idf_oov = math.log(4.0)
        expected_weights = [
            idf_red_or_blue,                                    # doc 0
            2.0 / 3.0 * idf_red_or_blue, 1.0 / 3.0 * idf_green,  # doc 1
            idf_red_or_blue,                                    # doc 2
            idf_red_or_blue,                                    # doc 3
            idf_oov,                                            # doc 5
            1.0 / 2.0 * idf_red_or_blue, 1.0 / 2.0 * idf_oov,    # doc 6
        ]

        sparse_weights = results['cat1_weights']
        self.assertEqual(sparse_weights.indices.tolist(), expected_indices)
        self.assertEqual(sparse_weights.dense_shape.tolist(), [7, 4])
        self.assertEqual(sparse_weights.values.size, len(expected_weights))
        weight_pairs = zip(sparse_weights.values.tolist(), expected_weights)
        for weight, expected_weight in weight_pairs:
            self.assertAlmostEqual(weight, expected_weight)
    finally:
        shutil.rmtree(output_folder)
def test_make_transform_graph_numerics_gcs(self):
    """Input and output of this test is on GCS.

    A uniquely named bucket is created with gsutil, column statistics are
    written into it, and a transform graph is built for three numeric
    columns ('identity', 'scale' with value 10, and 'scale' with no value).
    The saved graph is run on three rows and compared with hand-computed
    outputs. The bucket is removed again in the finally clause.

    Raises:
      subprocess.CalledProcessError: if creating or removing the bucket
        with gsutil fails.
    """
    output_folder = 'gs://temp_pydatalab_test_%s' % uuid.uuid4().hex
    # Argument lists run gsutil directly, without an intermediate shell.
    subprocess.check_call(['gsutil', 'mb', output_folder])
    stats_file_path = os.path.join(output_folder, analyze_data.STATS_FILE)
    try:
        file_io.write_string_to_file(
            stats_file_path,
            json.dumps({
                'column_stats': {
                    'num1': {'max': 10.0, 'mean': 9.5, 'min': 0.0},
                    'num2': {'max': 1.0, 'mean': 2.0, 'min': -1.0},
                    'num3': {'max': 10.0, 'mean': 2.0, 'min': 5.0},
                }
            }))

        analyze_data.make_transform_graph(
            output_folder,
            [{'name': 'num1', 'type': 'FLOAT'},
             {'name': 'num2', 'type': 'FLOAT'},
             {'name': 'num3', 'type': 'INTEGER'}],
            {'num1': {'transform': 'identity'},
             'num2': {'transform': 'scale', 'value': 10},
             'num3': {'transform': 'scale'}})

        model_path = os.path.join(output_folder, 'transform_fn')
        self.assertTrue(
            file_io.file_exists(os.path.join(model_path, 'saved_model.pb')))

        results = self._run_graph(
            model_path,
            {'num1': [5, 10, 15],
             'num2': [-1, 1, 0.5],
             'num3': [10, 5, 7]})

        # num1 passes through unchanged. The num2 and num3 expectations map
        # the column's [min, max] from the stats file linearly onto
        # [-10, 10] and [-1, 1] respectively.
        expectations = (
            ('num1', [5, 10, 15]),
            ('num2', [-10, 10, 5]),
            ('num3', [1, -1, (7.0 - 5) * 2.0 / 5.0 - 1]),
        )
        for column, expected_values in expectations:
            actual_values = results[column].tolist()
            # zip() stops at the shorter sequence, so without this check a
            # truncated or empty result would pass the loop below vacuously.
            self.assertEqual(len(actual_values), len(expected_values))
            for result, expected_result in zip(actual_values,
                                               expected_values):
                self.assertAlmostEqual(result, expected_result)
    finally:
        subprocess.check_call(['gsutil', '-m', 'rm', '-r', output_folder])
def test_make_transform_graph_numerics(self):
    """Check the 'identity' and 'scale' transforms on numeric columns.

    Column statistics are written to a temporary folder and a transform
    graph is built for three numeric columns ('identity', 'scale' with
    value 10, and 'scale' with no value). The saved graph is run on three
    rows and compared with hand-computed outputs.
    """
    output_folder = tempfile.mkdtemp()
    stats_file_path = os.path.join(output_folder, analyze_data.STATS_FILE)
    try:
        file_io.write_string_to_file(
            stats_file_path,
            json.dumps({
                'column_stats': {
                    'num1': {'max': 10.0, 'mean': 9.5, 'min': 0.0},
                    'num2': {'max': 1.0, 'mean': 2.0, 'min': -1.0},
                    'num3': {'max': 10.0, 'mean': 2.0, 'min': 5.0},
                }
            }))

        analyze_data.make_transform_graph(
            output_folder,
            [{'name': 'num1', 'type': 'FLOAT'},
             {'name': 'num2', 'type': 'FLOAT'},
             {'name': 'num3', 'type': 'INTEGER'}],
            {'num1': {'transform': 'identity'},
             'num2': {'transform': 'scale', 'value': 10},
             'num3': {'transform': 'scale'}})

        model_path = os.path.join(output_folder, 'transform_fn')
        self.assertTrue(
            os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

        results = self._run_graph(
            model_path,
            {'num1': [5, 10, 15],
             'num2': [-1, 1, 0.5],
             'num3': [10, 5, 7]})

        # num1 passes through unchanged. The num2 and num3 expectations map
        # the column's [min, max] from the stats file linearly onto
        # [-10, 10] and [-1, 1] respectively.
        expectations = (
            ('num1', [5, 10, 15]),
            ('num2', [-10, 10, 5]),
            ('num3', [1, -1, (7.0 - 5) * 2.0 / 5.0 - 1]),
        )
        for column, expected_values in expectations:
            actual_values = results[column].tolist()
            # zip() stops at the shorter sequence, so without this check a
            # truncated or empty result would pass the loop below vacuously.
            self.assertEqual(len(actual_values), len(expected_values))
            for result, expected_result in zip(actual_values,
                                               expected_values):
                self.assertAlmostEqual(result, expected_result)
    finally:
        shutil.rmtree(output_folder)