Beispiel #1
0
    def test_numerics(self):
        """Build a BQ table, and then call analyze on it.

        Creates a uniquely named temporary dataset with a two-column table
        holding 100 rows (col1 = i, col2 = 10 * i + 0.5), runs the cloud
        analysis against that table, and checks the numeric stats written to
        the stats file. The local output folder and the dataset are removed
        in the cleanup step.
        """
        schema = [{
            'name': 'col1',
            'type': 'INTEGER'
        }, {
            'name': 'col2',
            'type': 'FLOAT'
        }]
        project_id = dl.Context.default().project_id
        dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
        table_name = 'temp_table'
        full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

        # Bind the dataset handle before entering the try block so that the
        # cleanup below can never hit an unbound name and mask the real error.
        db = bq.Dataset((project_id, dataset_name))
        output_folder = tempfile.mkdtemp()

        try:
            # Make a dataset, a table, and insert data.
            db.create()

            table = bq.Table(full_table_name)
            table.create(schema=bq.Schema(schema), overwrite=True)

            data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
            table.insert(data)

            analyze_data.run_cloud_analysis(output_dir=output_folder,
                                            csv_file_pattern=None,
                                            bigquery_table=full_table_name,
                                            schema=schema,
                                            features={
                                                'col1': {
                                                    'transform': 'scale'
                                                },
                                                'col2': {
                                                    'transform': 'identity'
                                                }
                                            })

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze_data.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            # Nested so that a failure while removing the local folder does
            # not skip deleting the temporary dataset.
            try:
                shutil.rmtree(output_folder)
            finally:
                db.delete(delete_contents=True)
Beispiel #2
0
    def test_numerics(self):
        """Analyze a two-column numeric CSV file and check the stats.

        Writes 100 rows (col1 = i, col2 = 10 * i + 0.5) under the test bucket
        root, runs the cloud analysis on that file, and verifies the example
        count plus max/min/mean of both columns in the stats file.
        """
        root = os.path.join(self._bucket_root, 'test_numerics')
        csv_path = os.path.join(root, 'input.csv')
        out_dir = os.path.join(root, 'test_output')
        file_io.recursive_create_dir(out_dir)

        rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
        file_io.write_string_to_file(csv_path, '\n'.join(rows))

        schema = [
            {'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'},
        ]
        features = {
            'col1': {'transform': 'scale'},
            'col2': {'transform': 'identity'},
        }
        analyze_data.run_cloud_analysis(
            output_dir=out_dir,
            csv_file_pattern=csv_path,
            bigquery_table=None,
            schema=schema,
            features=features)

        stats_path = os.path.join(out_dir, analyze_data.STATS_FILE)
        stats = json.loads(file_io.read_file_to_string(stats_path).decode())

        self.assertEqual(stats['num_examples'], 100)

        # Checked in a fixed order: (column, statistic, expected value).
        expectations = (
            ('col1', 'max', 99.0),
            ('col1', 'min', 0.0),
            ('col1', 'mean', 49.5),
            ('col2', 'max', 990.5),
            ('col2', 'min', 0.5),
            ('col2', 'mean', 495.5),
        )
        for column, statistic, expected in expectations:
            self.assertAlmostEqual(
                stats['column_stats'][column][statistic], expected)
Beispiel #3
0
    def test_text(self):
        """Analyze two text columns and check vocab sizes and vocab files.

        col1 uses the 'bag_of_words' transform and col2 uses 'tfidf'. After
        the analysis runs, the stats file must report the vocabulary sizes
        and each column's vocab file must list the expected words and counts.
        """
        root = os.path.join(self._bucket_root, 'test_text')
        csv_path = os.path.join(root, 'input.csv')
        out_dir = os.path.join(root, 'test_output')
        file_io.recursive_create_dir(out_dir)

        lines = [
            'the quick brown fox,raining in kir',
            'quick   brown brown chicken,raining in pdx',
        ]
        file_io.write_string_to_file(csv_path, '\n'.join(lines))

        schema = [
            {'name': 'col1', 'type': 'STRING'},
            {'name': 'col2', 'type': 'STRING'},
        ]
        features = {
            'col1': {'transform': 'bag_of_words'},
            'col2': {'transform': 'tfidf'},
        }
        analyze_data.run_cloud_analysis(
            output_dir=out_dir,
            csv_file_pattern=csv_path,
            bigquery_table=None,
            schema=schema,
            features=features)

        stats_path = os.path.join(out_dir, analyze_data.STATS_FILE)
        stats = json.loads(file_io.read_file_to_string(stats_path).decode())
        column_stats = stats['column_stats']
        self.assertEqual(column_stats['col1']['vocab_size'], 5)
        self.assertEqual(column_stats['col2']['vocab_size'], 4)

        def load_vocab(column):
            # Parse the header-less vocab file of `column` into a frame with
            # the columns [<column>, 'count'].
            path = os.path.join(out_dir,
                                analyze_data.VOCAB_ANALYSIS_FILE % column)
            text = file_io.read_file_to_string(path)
            return pd.read_csv(six.StringIO(text),
                               header=None,
                               names=[column, 'count'])

        # NOTE(review): 'brown' occurs three times in the input but is
        # expected with a count of 2 -- presumably counts are per row; confirm
        # against the analysis implementation.
        col1_vocab = load_vocab('col1')
        self.assertEqual(col1_vocab['col1'].tolist(),
                         ['brown', 'quick', 'chicken', 'fox', 'the'])
        self.assertEqual(col1_vocab['count'].tolist(), [2, 2, 1, 1, 1])

        col2_vocab = load_vocab('col2')
        self.assertEqual(col2_vocab['col2'].tolist(),
                         ['in', 'raining', 'kir', 'pdx'])
        self.assertEqual(col2_vocab['count'].tolist(), [2, 2, 1, 1])
Beispiel #4
0
    def test_categorical(self):
        """Analyze two categorical columns and check their vocabularies.

        'color' uses the 'one_hot' transform and 'transport' uses
        'embedding'. After the analysis runs, the stats file must report the
        vocabulary sizes and each column's vocab file must list the expected
        values and counts.
        """
        test_folder = os.path.join(self._bucket_root, 'test_categorical')
        input_file_path = os.path.join(test_folder, 'input.csv')
        output_folder = os.path.join(test_folder, 'test_output')
        file_io.recursive_create_dir(output_folder)

        csv_file = [
            'red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
            'green,airplane'
        ]
        file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

        analyze_data.run_cloud_analysis(output_dir=output_folder,
                                        csv_file_pattern=input_file_path,
                                        bigquery_table=None,
                                        schema=[{
                                            'name': 'color',
                                            'type': 'STRING'
                                        }, {
                                            'name': 'transport',
                                            'type': 'STRING'
                                        }],
                                        features={
                                            'color': {
                                                'transform': 'one_hot'
                                            },
                                            'transport': {
                                                'transform': 'embedding'
                                            }
                                        })

        stats = json.loads(
            file_io.read_file_to_string(
                os.path.join(output_folder, analyze_data.STATS_FILE)).decode())
        self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
        self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

        # Color column.
        vocab_str = file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze_data.VOCAB_ANALYSIS_FILE % 'color'))
        vocab = pd.read_csv(six.StringIO(vocab_str),
                            header=None,
                            names=['color', 'count'])
        expected_vocab = pd.DataFrame(
            {
                'color': ['red', 'blue', 'green'],
                'count': [3, 2, 1]
            },
            columns=['color', 'count'])
        # Use the public pandas.testing entry point; the pandas.util.testing
        # location is deprecated and no longer exists in recent pandas.
        pd.testing.assert_frame_equal(vocab, expected_vocab)

        # Transport column: every value occurs exactly once, and the expected
        # listing below is in alphabetical order.
        vocab_str = file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze_data.VOCAB_ANALYSIS_FILE % 'transport'))
        vocab = pd.read_csv(six.StringIO(vocab_str),
                            header=None,
                            names=['transport', 'count'])
        self.assertEqual(vocab['count'].tolist(), [1] * 6)
        self.assertEqual(vocab['transport'].tolist(),
                         ['airplane', 'bike', 'car', 'train', 'truck', 'van'])