def test_numerics(self):
  """Analyze a CSV file of numeric columns and check the computed stats."""
  test_folder = os.path.join(self._bucket_root, 'test_numerics')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  # 100 rows: col1 = i, col2 = 10 * i + 0.5.
  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
              'col2': {'transform': 'identity', 'source_column': 'col2'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      features=features)

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

  self.assertEqual(stats['num_examples'], 100)
  col = stats['column_stats']['col1']
  self.assertAlmostEqual(col['max'], 99.0)
  self.assertAlmostEqual(col['min'], 0.0)
  self.assertAlmostEqual(col['mean'], 49.5)

  col = stats['column_stats']['col2']
  self.assertAlmostEqual(col['max'], 990.5)
  self.assertAlmostEqual(col['min'], 0.5)
  self.assertAlmostEqual(col['mean'], 495.5)
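# The numbers asserted above follow directly from the synthetic data. As a
# hedged sanity check, this illustrative helper (not part of the analyze API)
# recomputes them from first principles so the expected values are auditable:
def _expected_numeric_stats():
  """Recompute the stats asserted in test_numerics from the generated data."""
  col1 = [float(i) for i in range(100)]
  col2 = [10 * i + 0.5 for i in range(100)]
  return {
      'col1': {'min': min(col1), 'max': max(col1),
               'mean': sum(col1) / len(col1)},  # 0.0, 99.0, 49.5
      'col2': {'min': min(col2), 'max': max(col2),
               'mean': sum(col2) / len(col2)},  # 0.5, 990.5, 495.5
  }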
def test_text(self):
  test_folder = os.path.join(self._bucket_root, 'test_text')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
              'quick brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

  schema = [{'name': 'col1', 'type': 'STRING'},
            {'name': 'col2', 'type': 'STRING'},
            {'name': 'col3', 'type': 'STRING'},
            {'name': 'col4', 'type': 'STRING'}]
  features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
              'col2': {'transform': 'tfidf', 'source_column': 'col2'},
              'col3': {'transform': 'multi_hot', 'source_column': 'col3',
                       'separator': '|'},
              'col4': {'transform': 'target'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      features=features)

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
  self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
  self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['col1', 'count'])
  self.assertEqual(vocab['col1'].tolist(),
                   ['brown', 'quick', 'chicken', 'fox', 'the'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['col2', 'count'])
  self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
def test_categorical(self):
  test_folder = os.path.join(self._bucket_root, 'test_categorical')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  csv_file = ['red,car,apple',
              'red,truck,pepper',
              'red,van,apple',
              'blue,bike,grape',
              'blue,train,apple',
              'green,airplane,pepper']
  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

  schema = [{'name': 'color', 'type': 'STRING'},
            {'name': 'transport', 'type': 'STRING'},
            {'name': 'type', 'type': 'STRING'}]
  features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
              'transport': {'transform': 'embedding',
                            'source_column': 'transport'},
              'type': {'transform': 'target'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      features=features)

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
  self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

  # Color column: vocab rows are sorted by count, descending.
  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['color', 'count'])
  expected_vocab = pd.DataFrame(
      {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
      columns=['color', 'count'])
  pd.util.testing.assert_frame_equal(vocab, expected_vocab)

  # Transport column: every count is 1, so rows fall back to alphabetical
  # order.
  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['transport', 'count'])
  self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
  self.assertEqual(vocab['transport'].tolist(),
                   ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
def test_numerics(self):
  """Build a BQ table, and then call analyze on it."""
  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'},
            {'name': 'col3', 'type': 'FLOAT'}]
  project_id = dl.Context.default().project_id
  dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
  table_name = 'temp_table'
  full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

  output_folder = tempfile.mkdtemp()

  # Bind the dataset before the try block so the finally clause never
  # references an unbound name.
  db = bq.Dataset((project_id, dataset_name))
  try:
    # Make a dataset, a table, and insert data.
    db.create()
    table = bq.Table(full_table_name)
    table.create(schema=bq.Schema(schema), overwrite=True)

    data = [{'col1': i, 'col2': 10 * i + 0.5, 'col3': i + 0.5}
            for i in range(100)]
    table.insert(data)

    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'},
                'col3': {'transform': 'target'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=None,
        bigquery_table=full_table_name,
        schema=schema,
        features=features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
    db.delete(delete_contents=True)