def test_numerics(self):
  """Build a BQ table, and then call analyze on it.

  Creates a throwaway dataset/table with 100 rows of
  (col1=i, col2=10*i + 0.5), runs the cloud analysis over the table, and
  checks the computed numeric stats. Both the local output folder and the
  temporary BQ dataset are cleaned up afterwards.
  """
  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  project_id = dl.Context.default().project_id
  dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
  table_name = 'temp_table'
  full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

  output_folder = tempfile.mkdtemp()

  # Bind the dataset handle BEFORE the try block. If this were assigned
  # inside the try and bq.Dataset() raised, the finally clause would hit an
  # unbound `db` and the resulting NameError would mask the real failure.
  db = bq.Dataset((project_id, dataset_name))
  try:
    # Make a dataset, a table, and insert data.
    db.create()
    table = bq.Table(full_table_name)
    table.create(schema=bq.Schema(schema), overwrite=True)

    data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
    table.insert(data)

    analyze_data.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=None,
        bigquery_table=full_table_name,
        schema=schema,
        features={'col1': {'transform': 'scale'},
                  'col2': {'transform': 'identity'}})

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze_data.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    # Nested try/finally so the BQ dataset is deleted even when removing
    # the local temp folder fails — otherwise a cloud dataset would leak.
    try:
      shutil.rmtree(output_folder)
    finally:
      db.delete(delete_contents=True)
def test_numerics(self):
  """Analyze a CSV of two numeric columns and verify the computed stats."""
  base_dir = os.path.join(self._bucket_root, 'test_numerics')
  csv_path = os.path.join(base_dir, 'input.csv')
  analysis_dir = os.path.join(base_dir, 'test_output')
  file_io.recursive_create_dir(analysis_dir)

  # 100 rows of the form "i,10*i + 0.5".
  rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
  file_io.write_string_to_file(csv_path, '\n'.join(rows))

  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  features = {'col1': {'transform': 'scale'},
              'col2': {'transform': 'identity'}}
  analyze_data.run_cloud_analysis(
      output_dir=analysis_dir,
      csv_file_pattern=csv_path,
      bigquery_table=None,
      schema=schema,
      features=features)

  stats_path = os.path.join(analysis_dir, analyze_data.STATS_FILE)
  stats = json.loads(file_io.read_file_to_string(stats_path).decode())

  self.assertEqual(stats['num_examples'], 100)
  expected_stats = [
      ('col1', {'max': 99.0, 'min': 0.0, 'mean': 49.5}),
      ('col2', {'max': 990.5, 'min': 0.5, 'mean': 495.5}),
  ]
  for column, expected in expected_stats:
    col = stats['column_stats'][column]
    for stat_name, value in sorted(expected.items()):
      self.assertAlmostEqual(col[stat_name], value)
def test_text(self):
  """Analyze two text columns (bag_of_words, tfidf) and check the vocabs."""
  base_dir = os.path.join(self._bucket_root, 'test_text')
  csv_path = os.path.join(base_dir, 'input.csv')
  analysis_dir = os.path.join(base_dir, 'test_output')
  file_io.recursive_create_dir(analysis_dir)

  lines = ['the quick brown fox,raining in kir',
           'quick brown brown chicken,raining in pdx']
  file_io.write_string_to_file(csv_path, '\n'.join(lines))

  analyze_data.run_cloud_analysis(
      output_dir=analysis_dir,
      csv_file_pattern=csv_path,
      bigquery_table=None,
      schema=[{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'}],
      features={'col1': {'transform': 'bag_of_words'},
                'col2': {'transform': 'tfidf'}})

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(analysis_dir, analyze_data.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
  self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

  def read_vocab(column):
    # Each vocab file is a header-less CSV of (token, count) rows.
    raw = file_io.read_file_to_string(
        os.path.join(analysis_dir,
                     analyze_data.VOCAB_ANALYSIS_FILE % column))
    return pd.read_csv(six.StringIO(raw),
                       header=None,
                       names=[column, 'count'])

  vocab = read_vocab('col1')
  self.assertEqual(vocab['col1'].tolist(),
                   ['brown', 'quick', 'chicken', 'fox', 'the'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

  vocab = read_vocab('col2')
  self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
def test_categorical(self):
  """Analyze two categorical columns (one_hot, embedding) and check vocabs."""
  base_dir = os.path.join(self._bucket_root, 'test_categorical')
  csv_path = os.path.join(base_dir, 'input.csv')
  analysis_dir = os.path.join(base_dir, 'test_output')
  file_io.recursive_create_dir(analysis_dir)

  rows = ['red,car', 'red,truck', 'red,van',
          'blue,bike', 'blue,train', 'green,airplane']
  file_io.write_string_to_file(csv_path, '\n'.join(rows))

  analyze_data.run_cloud_analysis(
      output_dir=analysis_dir,
      csv_file_pattern=csv_path,
      bigquery_table=None,
      schema=[{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'}],
      features={'color': {'transform': 'one_hot'},
                'transport': {'transform': 'embedding'}})

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(analysis_dir, analyze_data.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
  self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

  def load_vocab(column):
    # Each vocab file is a header-less CSV of (token, count) rows.
    raw = file_io.read_file_to_string(
        os.path.join(analysis_dir,
                     analyze_data.VOCAB_ANALYSIS_FILE % column))
    return pd.read_csv(six.StringIO(raw),
                       header=None,
                       names=[column, 'count'])

  # Color column.
  vocab = load_vocab('color')
  expected_vocab = pd.DataFrame(
      {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
      columns=['color', 'count'])
  pd.util.testing.assert_frame_equal(vocab, expected_vocab)

  # transport column: every value appears once; expected order is
  # alphabetical.
  vocab = load_vocab('transport')
  self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
  self.assertEqual(vocab['transport'].tolist(),
                   ['airplane', 'bike', 'car', 'train', 'truck', 'van'])