Esempio n. 1
0
    def test_make_transform_graph_category(self):
        """Build a transform graph for two categorical columns and run it.

        'cat1' uses the 'one_hot' transform and 'cat2' uses 'embedding'. A
        vocab file is written for each column, the graph is built with
        analyze_data.make_transform_graph, and the values returned for a
        three-row batch are compared with the expected integer ids.
        """
        output_folder = tempfile.mkdtemp()
        try:
            # Vocab lines look like 'token,number' (presumably a count --
            # TODO confirm against the analysis step that produces them).
            file_io.write_string_to_file(
                os.path.join(output_folder,
                             analyze_data.VOCAB_ANALYSIS_FILE % 'cat1'),
                '\n'.join(['red,300', 'blue,200', 'green,100']))

            file_io.write_string_to_file(
                os.path.join(output_folder,
                             analyze_data.VOCAB_ANALYSIS_FILE % 'cat2'),
                '\n'.join(['pizza,300', 'ice_cream,200', 'cookies,100']))

            file_io.write_string_to_file(
                os.path.join(output_folder, analyze_data.STATS_FILE),
                json.dumps({}))  # stats file needed but unused.

            analyze_data.make_transform_graph(
                output_folder, [{
                    'name': 'cat1',
                    'type': 'STRING'
                }, {
                    'name': 'cat2',
                    'type': 'STRING'
                }], {
                    'cat1': {
                        'transform': 'one_hot'
                    },
                    'cat2': {
                        'transform': 'embedding'
                    }
                })

            model_path = os.path.join(output_folder, 'transform_fn')
            self.assertTrue(
                os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

            results = self._run_graph(model_path, {
                'cat1': ['red', 'blue', 'green'],
                'cat2': ['pizza', '', 'extra']
            })

            expected_by_column = [
                ('cat1', [0, 1, 2]),
                # The empty string and the unseen token 'extra' are both
                # expected to map to 3, one past the three vocab entries.
                ('cat2', [0, 3, 3]),
            ]
            for column, expected_values in expected_by_column:
                actual_values = results[column].tolist()
                # zip() stops at the shorter sequence, so without this check
                # a truncated (or empty) result would pass vacuously.
                self.assertEqual(len(actual_values), len(expected_values))
                for result, expected_result in zip(actual_values,
                                                   expected_values):
                    self.assertAlmostEqual(result, expected_result)
        finally:
            shutil.rmtree(output_folder)
Esempio n. 2
0
    def test_make_transform_graph_images(self):
        """Build an 'image_to_vec' transform graph and embed two GCS images.

        The test reads two JPEG files from a public GCS bucket, re-encodes
        them, feeds the bytes through the saved transform graph, and checks
        that two float32 vectors of length 2048 come back.
        """
        print(
            'Testing make_transform_graph with image_to_vec. '
            'It may take a few minutes because it needs to download a large inception checkpoint.'
        )

        def _open_and_encode_image(img_url):
            """Return the image at img_url re-encoded as an RGB JPEG string."""
            with file_io.FileIO(img_url, 'r') as f:
                img = Image.open(f).convert('RGB')
                output = cStringIO.StringIO()
                img.save(output, 'jpeg')
            return output.getvalue()

        # Create the folder before entering the try block: if mkdtemp() itself
        # fails there is nothing to clean up, and the finally clause must not
        # hit an unbound name and hide the real error.
        output_folder = tempfile.mkdtemp()
        try:
            stats_file_path = os.path.join(output_folder,
                                           analyze_data.STATS_FILE)
            file_io.write_string_to_file(stats_file_path,
                                         json.dumps({'column_stats': {}}))
            analyze_data.make_transform_graph(output_folder, [{
                'name': 'img',
                'type': 'STRING'
            }], {'img': {
                'transform': 'image_to_vec'
            }})

            model_path = os.path.join(output_folder, 'transform_fn')
            self.assertTrue(
                os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

            img_string1 = _open_and_encode_image(
                'gs://cloud-ml-data/img/flower_photos/daisy/15207766_fc2f1d692c_n.jpg'
            )
            img_string2 = _open_and_encode_image(
                'gs://cloud-ml-data/img/flower_photos/dandelion/8980164828_04fbf64f79_n.jpg'
            )
            results = self._run_graph(model_path,
                                      {'img': [img_string1, img_string2]})
            embeddings = results['img']
            self.assertEqual(len(embeddings), 2)
            self.assertEqual(len(embeddings[0]), 2048)
            self.assertEqual(embeddings[0].dtype, np.float32)
            # Guard against an all-zero embedding for the second image.
            self.assertTrue(any(x != 0.0 for x in embeddings[1]))

        finally:
            shutil.rmtree(output_folder)
Esempio n. 3
0
    def test_make_transform_graph_text_bag_of_words(self):
        """Run the 'bag_of_words' transform on seven short documents.

        Verifies the sparse 'cat1_ids' and 'cat1_weights' outputs: their
        indices, dense shape, and values.
        """
        output_folder = tempfile.mkdtemp()
        try:
            # Vocabulary and the ids the expectations below rely on:
            #   red -> 0, blue -> 1, green -> 2, out-of-vocab -> 3
            vocab_lines = ['red,2', 'blue,2', 'green,1']
            vocab_path = os.path.join(
                output_folder, analyze_data.VOCAB_ANALYSIS_FILE % 'cat1')
            file_io.write_string_to_file(vocab_path, '\n'.join(vocab_lines))

            # Stats file needed but unused.
            stats_path = os.path.join(output_folder, analyze_data.STATS_FILE)
            file_io.write_string_to_file(stats_path, json.dumps({}))

            schema = [{'name': 'cat1', 'type': 'STRING'}]
            features = {'cat1': {'transform': 'bag_of_words'}}
            analyze_data.make_transform_graph(output_folder, schema, features)

            model_path = os.path.join(output_folder, 'transform_fn')
            saved_model_file = os.path.join(model_path, 'saved_model.pb')
            self.assertTrue(os.path.isfile(saved_model_file))

            documents = [
                'red red red',  # doc 0
                'red green red',  # doc 1
                'blue',  # doc 2
                'blue blue',  # doc 3
                '',  # doc 4
                'brown',  # doc 5
                'brown blue',  # doc 6
            ]
            results = self._run_graph(model_path, {'cat1': documents})

            # Each index is [doc id, slot]. The second coordinate looks like
            # the position of a distinct token within its document rather
            # than a vocab id (doc 2 holds only 'blue', vocab id 1, yet its
            # index is [2, 0]); the vocab ids are carried in the values.
            expected_indices = [
                [0, 0],
                [1, 0], [1, 1],
                [2, 0],
                [3, 0],
                [5, 0],
                [6, 0], [6, 1],
            ]
            # Note that in doc 6 the order is blue, then brown (oov).
            # doc id:       0  1  1  2  3  5  6  6
            expected_ids = [0, 0, 2, 1, 1, 3, 1, 3]
            expected_weights = [3, 2, 1, 1, 2, 1, 1, 1]
            expected_shape = [7, 4]

            sparse_ids = results['cat1_ids']
            self.assertEqual(sparse_ids.indices.tolist(), expected_indices)
            self.assertEqual(sparse_ids.dense_shape.tolist(), expected_shape)
            self.assertEqual(sparse_ids.values.tolist(), expected_ids)

            sparse_weights = results['cat1_weights']
            self.assertEqual(sparse_weights.indices.tolist(),
                             expected_indices)
            self.assertEqual(sparse_weights.dense_shape.tolist(),
                             expected_shape)
            self.assertEqual(sparse_weights.values.size,
                             len(expected_weights))
            actual_weights = sparse_weights.values.tolist()
            for actual, wanted in zip(actual_weights, expected_weights):
                self.assertAlmostEqual(actual, wanted)

        finally:
            shutil.rmtree(output_folder)
Esempio n. 4
0
    def test_make_transform_graph_text_tfidf(self):
        """Run the 'tfidf' transform on seven short documents.

        Verifies the sparse 'cat1_ids' and 'cat1_weights' outputs; the
        expected weights are term frequency times inverse document frequency.
        """
        output_folder = tempfile.mkdtemp()
        try:
            # Vocabulary and the ids the expectations below rely on:
            #   red -> 0, blue -> 1, green -> 2, out-of-vocab -> 3
            # Corpus size (num_examples) is 4.
            # IDF = log(num_examples / (1 + examples containing the token)):
            #   red:   log(4/3)
            #   blue:  log(4/3)
            #   green: log(4/2)
            #   oov:   log(4/1)
            vocab_lines = ['red,2', 'blue,2', 'green,1']
            vocab_path = os.path.join(
                output_folder, analyze_data.VOCAB_ANALYSIS_FILE % 'cat1')
            file_io.write_string_to_file(vocab_path, '\n'.join(vocab_lines))

            stats_path = os.path.join(output_folder, analyze_data.STATS_FILE)
            file_io.write_string_to_file(stats_path,
                                         json.dumps({'num_examples': 4}))

            schema = [{'name': 'cat1', 'type': 'STRING'}]
            features = {'cat1': {'transform': 'tfidf'}}
            analyze_data.make_transform_graph(output_folder, schema, features)

            model_path = os.path.join(output_folder, 'transform_fn')
            saved_model_file = os.path.join(model_path, 'saved_model.pb')
            self.assertTrue(os.path.isfile(saved_model_file))

            documents = [
                'red red red',  # doc 0
                'red green red',  # doc 1
                'blue',  # doc 2
                'blue blue',  # doc 3
                '',  # doc 4
                'brown',  # doc 5
                'brown blue',  # doc 6
            ]
            results = self._run_graph(model_path, {'cat1': documents})

            # Each index is [doc id, slot]; the vocab ids are the values.
            expected_indices = [
                [0, 0],
                [1, 0], [1, 1],
                [2, 0],
                [3, 0],
                [5, 0],
                [6, 0], [6, 1],
            ]
            # Note that in doc 6 the order is blue, then brown (oov).
            expected_ids = [0, 0, 2, 1, 1, 3, 1, 3]
            expected_shape = [7, 4]

            sparse_ids = results['cat1_ids']
            self.assertEqual(sparse_ids.indices.tolist(), expected_indices)
            self.assertEqual(sparse_ids.dense_shape.tolist(), expected_shape)
            self.assertEqual(sparse_ids.values.tolist(), expected_ids)

            # Natural-log IDF terms; red and blue share the same value.
            idf_red_blue = math.log(4.0 / 3.0)
            idf_green = math.log(2.0)
            idf_oov = math.log(4.0)
            expected_weights = [
                idf_red_blue,  # doc 0: red
                2.0 / 3.0 * idf_red_blue,  # doc 1: red
                1.0 / 3.0 * idf_green,  # doc 1: green
                idf_red_blue,  # doc 2: blue
                idf_red_blue,  # doc 3: blue
                idf_oov,  # doc 5: brown (oov)
                1.0 / 2.0 * idf_red_blue,  # doc 6: blue
                1.0 / 2.0 * idf_oov,  # doc 6: brown (oov)
            ]

            sparse_weights = results['cat1_weights']
            self.assertEqual(sparse_weights.indices.tolist(),
                             expected_indices)
            self.assertEqual(sparse_weights.dense_shape.tolist(),
                             expected_shape)
            self.assertEqual(sparse_weights.values.size,
                             len(expected_weights))
            actual_weights = sparse_weights.values.tolist()
            for actual, wanted in zip(actual_weights, expected_weights):
                self.assertAlmostEqual(actual, wanted)

        finally:
            shutil.rmtree(output_folder)
Esempio n. 5
0
    def test_make_transform_graph_numerics_gcs(self):
        """Check numeric transforms when input and output live on GCS.

        Creates a temporary bucket with gsutil, writes the stats file there,
        builds the transform graph for three numeric columns, and compares
        the graph output with the expected values. The bucket is removed in
        the finally clause.
        """

        # The bucket name is generated locally from a uuid, so the shell
        # command strings below do not contain external input.
        output_folder = 'gs://temp_pydatalab_test_%s' % uuid.uuid4().hex
        subprocess.check_call('gsutil mb %s' % output_folder, shell=True)
        stats_file_path = os.path.join(output_folder, analyze_data.STATS_FILE)
        try:
            file_io.write_string_to_file(
                stats_file_path,
                json.dumps({
                    'column_stats': {
                        'num1': {
                            'max': 10.0,
                            'mean': 9.5,
                            'min': 0.0
                        },
                        'num2': {
                            'max': 1.0,
                            'mean': 2.0,
                            'min': -1.0
                        },
                        'num3': {
                            'max': 10.0,
                            'mean': 2.0,
                            'min': 5.0
                        }
                    }
                }))
            analyze_data.make_transform_graph(
                output_folder, [{
                    'name': 'num1',
                    'type': 'FLOAT'
                }, {
                    'name': 'num2',
                    'type': 'FLOAT'
                }, {
                    'name': 'num3',
                    'type': 'INTEGER'
                }], {
                    'num1': {
                        'transform': 'identity'
                    },
                    'num2': {
                        'transform': 'scale',
                        'value': 10
                    },
                    'num3': {
                        'transform': 'scale'
                    }
                })

            model_path = os.path.join(output_folder, 'transform_fn')
            # The model is on GCS, so use file_io rather than os.path here.
            self.assertTrue(
                file_io.file_exists(os.path.join(model_path,
                                                 'saved_model.pb')))

            results = self._run_graph(model_path, {
                'num1': [5, 10, 15],
                'num2': [-1, 1, 0.5],
                'num3': [10, 5, 7]
            })

            expected_by_column = [
                # identity: values pass through unchanged.
                ('num1', [5, 10, 15]),
                # scale with value 10: the expectations map min/max (-1, 1)
                # to (-10, 10).
                ('num2', [-10, 10, 5]),
                # scale without a value: the expectations map min/max
                # (5, 10) to (-1, 1).
                ('num3', [1, -1, (7.0 - 5) * 2.0 / 5.0 - 1]),
            ]
            for column, expected_values in expected_by_column:
                actual_values = results[column].tolist()
                # zip() stops at the shorter sequence, so without this check
                # a truncated (or empty) result would pass vacuously.
                self.assertEqual(len(actual_values), len(expected_values))
                for result, expected_result in zip(actual_values,
                                                   expected_values):
                    self.assertAlmostEqual(result, expected_result)
        finally:
            subprocess.check_call('gsutil -m rm -r %s' % output_folder,
                                  shell=True)
Esempio n. 6
0
    def test_make_transform_graph_numerics(self):
        """Check the 'identity' and 'scale' transforms on numeric columns.

        Writes column stats to a local temp folder, builds the transform
        graph for three numeric columns, and compares the graph output for a
        three-row batch with the expected values.
        """
        output_folder = tempfile.mkdtemp()
        stats_file_path = os.path.join(output_folder, analyze_data.STATS_FILE)
        try:
            file_io.write_string_to_file(
                stats_file_path,
                json.dumps({
                    'column_stats': {
                        'num1': {
                            'max': 10.0,
                            'mean': 9.5,
                            'min': 0.0
                        },
                        'num2': {
                            'max': 1.0,
                            'mean': 2.0,
                            'min': -1.0
                        },
                        'num3': {
                            'max': 10.0,
                            'mean': 2.0,
                            'min': 5.0
                        }
                    }
                }))
            analyze_data.make_transform_graph(
                output_folder, [{
                    'name': 'num1',
                    'type': 'FLOAT'
                }, {
                    'name': 'num2',
                    'type': 'FLOAT'
                }, {
                    'name': 'num3',
                    'type': 'INTEGER'
                }], {
                    'num1': {
                        'transform': 'identity'
                    },
                    'num2': {
                        'transform': 'scale',
                        'value': 10
                    },
                    'num3': {
                        'transform': 'scale'
                    }
                })

            model_path = os.path.join(output_folder, 'transform_fn')
            self.assertTrue(
                os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

            results = self._run_graph(model_path, {
                'num1': [5, 10, 15],
                'num2': [-1, 1, 0.5],
                'num3': [10, 5, 7]
            })

            expected_by_column = [
                # identity: values pass through unchanged.
                ('num1', [5, 10, 15]),
                # scale with value 10: the expectations map min/max (-1, 1)
                # to (-10, 10).
                ('num2', [-10, 10, 5]),
                # scale without a value: the expectations map min/max
                # (5, 10) to (-1, 1).
                ('num3', [1, -1, (7.0 - 5) * 2.0 / 5.0 - 1]),
            ]
            for column, expected_values in expected_by_column:
                actual_values = results[column].tolist()
                # zip() stops at the shorter sequence, so without this check
                # a truncated (or empty) result would pass vacuously.
                self.assertEqual(len(actual_values), len(expected_values))
                for result, expected_result in zip(actual_values,
                                                   expected_values):
                    self.assertAlmostEqual(result, expected_result)
        finally:
            shutil.rmtree(output_folder)