def test_numerics(self):
  """Cloud analysis of a CSV with one INTEGER and one FLOAT column."""
  test_folder = os.path.join(self._bucket_root, 'test_numerics')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  # Rows are "i,10*i + 0.5" for i in [0, 100).
  rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
  file_io.write_string_to_file(input_file_path, '\n'.join(rows))

  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
              'col2': {'transform': 'identity', 'source_column': 'col2'}}

  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

  # Expected stats follow directly from the generated rows.
  self.assertEqual(stats['num_examples'], 100)
  col = stats['column_stats']['col1']
  self.assertAlmostEqual(col['max'], 99.0)
  self.assertAlmostEqual(col['min'], 0.0)
  self.assertAlmostEqual(col['mean'], 49.5)

  col = stats['column_stats']['col2']
  self.assertAlmostEqual(col['max'], 990.5)
  self.assertAlmostEqual(col['min'], 0.5)
  self.assertAlmostEqual(col['mean'], 495.5)
def test_numerics(self):
  """Local analysis stats for an INTEGER and a FLOAT CSV column."""
  output_folder = tempfile.mkdtemp()
  # mkstemp returns (fd, path); close the fd explicitly. The original kept
  # only the path ([1]) and leaked the OS-level file descriptor.
  fd, input_file_path = tempfile.mkstemp(dir=output_folder)
  os.close(fd)
  try:
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())

    # Rows are "i,10*i + 0.5" for i in [0, 100), so the stats are exact.
    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
def test_text(self):
  """Local analysis vocab generation for bag_of_words and tfidf columns."""
  output_folder = tempfile.mkdtemp()
  # mkstemp returns (fd, path); close the fd explicitly. The original kept
  # only the path ([1]) and leaked the OS-level file descriptor.
  fd, input_file_path = tempfile.mkstemp(dir=output_folder)
  os.close(fd)
  try:
    csv_file = [
        'the quick brown fox,raining in kir',
        'quick brown brown chicken,raining in pdx'
    ]
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
    analyze.run_local_analysis(output_folder, [input_file_path], schema,
                               analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

    # col1 vocab: 'quick' and 'brown' appear twice, the rest once.
    # NOTE(review): tokens with equal counts are asserted in a fixed order
    # here; if the analyzer's tie ordering is not deterministic this could
    # be flaky — confirm against run_local_analysis.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col1', 'count'])
    self.assertEqual(vocab['col1'].tolist(),
                     ['quick', 'brown', 'the', 'fox', 'chicken'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    # col2 vocab: 'raining' and 'in' appear twice, the rest once.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col2', 'count'])
    self.assertEqual(vocab['col2'].tolist(), ['raining', 'in', 'pdx', 'kir'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
  finally:
    shutil.rmtree(output_folder)
def test_categorical(self):
  """Cloud analysis vocab generation for one_hot and embedding columns."""
  test_folder = os.path.join(self._bucket_root, 'test_categorical')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  rows = ['red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
          'green,airplane']
  file_io.write_string_to_file(input_file_path, '\n'.join(rows))

  schema = [{'name': 'color', 'type': 'STRING'},
            {'name': 'transport', 'type': 'STRING'}]
  features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
              'transport': {'transform': 'embedding',
                            'source_column': 'transport'}}

  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
  self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

  # Color column: counts are distinct, so the order is fully determined.
  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['color', 'count'])
  expected_vocab = pd.DataFrame(
      {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
      columns=['color', 'count'])
  pd.util.testing.assert_frame_equal(vocab, expected_vocab)

  # transport column: all counts are 1.
  vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
  vocab = pd.read_csv(six.StringIO(vocab_str),
                      header=None,
                      names=['transport', 'count'])
  self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
  self.assertEqual(vocab['transport'].tolist(),
                   ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
def test_numerics(self):
  """Checks cloud-analysis statistics on generated numeric CSV data."""
  test_folder = os.path.join(self._bucket_root, 'test_numerics')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  # Generate 100 rows of the form "i,10*i + 0.5".
  file_io.write_string_to_file(
      input_file_path,
      '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]
  features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
              'col2': {'transform': 'identity', 'source_column': 'col2'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
  stats = json.loads(file_io.read_file_to_string(stats_path).decode())

  self.assertEqual(stats['num_examples'], 100)
  for name, (expected_max, expected_min, expected_mean) in [
      ('col1', (99.0, 0.0, 49.5)),
      ('col2', (990.5, 0.5, 495.5))]:
    col = stats['column_stats'][name]
    self.assertAlmostEqual(col['max'], expected_max)
    self.assertAlmostEqual(col['min'], expected_min)
    self.assertAlmostEqual(col['mean'], expected_mean)
def test_text(self):
  """Local analysis vocab generation when tie order is count-only."""
  output_folder = tempfile.mkdtemp()
  # mkstemp returns (fd, path); close the fd explicitly. The original kept
  # only the path ([1]) and leaked the OS-level file descriptor.
  fd, input_file_path = tempfile.mkstemp(dir=output_folder)
  os.close(fd)
  try:
    csv_file = ['the quick brown fox,raining in kir',
                'quick brown brown chicken,raining in pdx']
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col1', 'count'])

    # Vocabs are sorted by count only, so tokens with equal counts may
    # appear in any order: compare tie groups as unordered collections.
    # six.assertCountEqual replaces assertItemsEqual, which exists only on
    # Python 2's TestCase.
    col1_vocab = vocab['col1'].tolist()
    six.assertCountEqual(self, col1_vocab[:2], ['brown', 'quick'])
    six.assertCountEqual(self, col1_vocab[2:], ['chicken', 'fox', 'the'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col2', 'count'])

    # Same count-only ordering caveat as above.
    col2_vocab = vocab['col2'].tolist()
    six.assertCountEqual(self, col2_vocab[:2], ['in', 'raining'])
    six.assertCountEqual(self, col2_vocab[2:], ['kir', 'pdx'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
  finally:
    shutil.rmtree(output_folder)
def test_numerics(self):
  """Build a BQ table, and then call analyze on it."""
  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]

  project_id = dl.Context.default().project_id
  dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
  table_name = 'temp_table'
  full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

  output_folder = tempfile.mkdtemp()
  # Bind db before the try block: the original assigned it inside try but
  # referenced it in finally, so an early failure raised NameError and
  # masked the real error.
  db = None
  try:
    # Make a dataset, a table, and insert data.
    db = bq.Dataset((project_id, dataset_name))
    db.create()

    table = bq.Table(full_table_name)
    table.create(schema=bq.Schema(schema), overwrite=True)

    data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
    table.insert(data)

    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=None,
        bigquery_table=full_table_name,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    # Delete the cloud dataset first (leaking it is costlier than a temp
    # dir), and guarantee the temp dir is removed even if that fails. The
    # original ran rmtree first, so an rmtree error leaked the dataset.
    try:
      if db is not None:
        db.delete(delete_contents=True)
    finally:
      shutil.rmtree(output_folder)
def test_text(self):
  """Cloud analysis vocab generation for bag_of_words and tfidf columns."""
  test_folder = os.path.join(self._bucket_root, 'test_text')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  rows = ['the quick brown fox,raining in kir',
          'quick brown brown chicken,raining in pdx']
  file_io.write_string_to_file(input_file_path, '\n'.join(rows))

  schema = [{'name': 'col1', 'type': 'STRING'},
            {'name': 'col2', 'type': 'STRING'}]
  features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
              'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats = json.loads(
      file_io.read_file_to_string(
          os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
  self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
  self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

  def read_vocab(column_name):
    # Load the generated vocab file for a column into a DataFrame.
    vocab_csv = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % column_name))
    return pd.read_csv(six.StringIO(vocab_csv),
                       header=None,
                       names=[column_name, 'count'])

  vocab = read_vocab('col1')
  self.assertEqual(vocab['col1'].tolist(),
                   ['brown', 'quick', 'chicken', 'fox', 'the', ])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

  vocab = read_vocab('col2')
  self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
  self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
def test_categorical(self):
  """Local analysis vocab generation for one_hot and embedding columns."""
  output_folder = tempfile.mkdtemp()
  # mkstemp returns (fd, path); close the fd explicitly. The original kept
  # only the path ([1]) and leaked the OS-level file descriptor.
  fd, input_file_path = tempfile.mkstemp(dir=output_folder)
  os.close(fd)
  try:
    csv_file = ['red,car', 'red,truck', 'red,van', 'blue,bike',
                'blue,train', 'green,airplane']
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'transport': {'transform': 'embedding',
                              'source_column': 'transport'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column: counts are distinct, so order is deterministic.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    # transport column. As each vocab has the same count, order in file is
    # not known: compare as an unordered collection. six.assertCountEqual
    # replaces assertItemsEqual, which exists only on Python 2's TestCase.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['transport', 'count'])
    self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
    six.assertCountEqual(self, vocab['transport'].tolist(),
                         ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
  finally:
    shutil.rmtree(output_folder)
def test_numerics(self):
  """Local analysis stats for generated INTEGER and FLOAT columns."""
  output_folder = tempfile.mkdtemp()
  # mkstemp returns (fd, path); close the fd explicitly. The original kept
  # only the path ([1]) and leaked the OS-level file descriptor.
  fd, input_file_path = tempfile.mkstemp(dir=output_folder)
  os.close(fd)
  try:
    file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_local_analysis(output_folder, [input_file_path], schema,
                               analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())

    # Rows are "i,10*i + 0.5" for i in [0, 100), so the stats are exact.
    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
def test_check_schema_transforms_match(self):
  """check_schema_transforms_match rejects invalid type/transform combos."""
  # Each entry is (schema column type, features dict) that must raise.
  bad_cases = [
      # Numeric column with a categorical transform.
      ('INTEGER', {'col1': {'transform': 'one_hot',
                            'source_column': 'col1'}}),
      ('FLOAT', {'col1': {'transform': 'embedding',
                          'source_column': 'col1'}}),
      # String column with a numeric transform.
      ('STRING', {'col1': {'transform': 'scale',
                           'source_column': 'col1'}}),
      # Unknown schema type.
      ('xxx', {'col1': {'transform': 'scale', 'source_column': 'col1'}}),
      # Unknown transform name.
      ('INTEGER', {'col1': {'transform': 'xxx', 'source_column': 'col1'}}),
      # scale and one_hot different transform family
      ('INTEGER', {'col1': {'transform': 'scale', 'source_column': 'col1'},
                   'col2': {'transform': 'one_hot', 'source_column': 'col1'},
                   'col3': {'transform': 'key', 'source_column': 'col1'}}),
      # Unknown transform
      ('INTEGER', {'col1': {'transform': 'x', 'source_column': 'col1'}}),
  ]
  for col_type, features in bad_cases:
    with self.assertRaises(ValueError):
      analyze.check_schema_transforms_match(
          [{'name': 'col1', 'type': col_type}],
          analyze.invert_features(features))
def test_text(self):
  """Cloud analysis produces count-ordered vocabs for text transforms."""
  test_folder = os.path.join(self._bucket_root, 'test_text')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  csv_file = ['the quick brown fox,raining in kir',
              'quick brown brown chicken,raining in pdx']
  file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

  schema = [{'name': 'col1', 'type': 'STRING'},
            {'name': 'col2', 'type': 'STRING'}]
  features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
              'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
  stats = json.loads(file_io.read_file_to_string(stats_path).decode())
  self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
  self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

  # col1 vocab: 'brown' and 'quick' appear twice, the rest once.
  col1_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
  col1_vocab = pd.read_csv(six.StringIO(col1_str),
                           header=None,
                           names=['col1', 'count'])
  self.assertEqual(col1_vocab['col1'].tolist(),
                   ['brown', 'quick', 'chicken', 'fox', 'the', ])
  self.assertEqual(col1_vocab['count'].tolist(), [2, 2, 1, 1, 1])

  # col2 vocab: 'in' and 'raining' appear twice, the rest once.
  col2_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
  col2_vocab = pd.read_csv(six.StringIO(col2_str),
                           header=None,
                           names=['col2', 'count'])
  self.assertEqual(col2_vocab['col2'].tolist(),
                   ['in', 'raining', 'kir', 'pdx'])
  self.assertEqual(col2_vocab['count'].tolist(), [2, 2, 1, 1])
def test_categorical(self):
  """Cloud analysis vocabs for one_hot and embedding string columns."""
  test_folder = os.path.join(self._bucket_root, 'test_categorical')
  input_file_path = os.path.join(test_folder, 'input.csv')
  output_folder = os.path.join(test_folder, 'test_output')
  file_io.recursive_create_dir(output_folder)

  input_rows = ['red,car', 'red,truck', 'red,van', 'blue,bike',
                'blue,train', 'green,airplane']
  file_io.write_string_to_file(input_file_path, '\n'.join(input_rows))

  schema = [{'name': 'color', 'type': 'STRING'},
            {'name': 'transport', 'type': 'STRING'}]
  features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
              'transport': {'transform': 'embedding',
                            'source_column': 'transport'}}
  analyze.run_cloud_analysis(
      output_dir=output_folder,
      csv_file_pattern=input_file_path,
      bigquery_table=None,
      schema=schema,
      inverted_features=analyze.invert_features(features))

  stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
  stats = json.loads(file_io.read_file_to_string(stats_path).decode())
  self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
  self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

  # Color column: counts differ, so row order is deterministic.
  color_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
  color_vocab = pd.read_csv(six.StringIO(color_str),
                            header=None,
                            names=['color', 'count'])
  expected_vocab = pd.DataFrame(
      {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
      columns=['color', 'count'])
  pd.util.testing.assert_frame_equal(color_vocab, expected_vocab)

  # transport column: every value appears exactly once.
  transport_str = file_io.read_file_to_string(
      os.path.join(output_folder,
                   analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
  transport_vocab = pd.read_csv(six.StringIO(transport_str),
                                header=None,
                                names=['transport', 'count'])
  self.assertEqual(transport_vocab['count'].tolist(), [1 for i in range(6)])
  self.assertEqual(transport_vocab['transport'].tolist(),
                   ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
def test_numerics(self):
  """Build a BQ table, and then call analyze on it."""
  schema = [{'name': 'col1', 'type': 'INTEGER'},
            {'name': 'col2', 'type': 'FLOAT'}]

  project_id = dl.Context.default().project_id
  dataset_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
  table_name = 'temp_table'
  full_table_name = '%s.%s.%s' % (project_id, dataset_name, table_name)

  output_folder = tempfile.mkdtemp()
  # Bind db before the try block: the original assigned it inside try but
  # referenced it in finally, so an early failure raised NameError and
  # masked the real error.
  db = None
  try:
    # Make a dataset, a table, and insert data.
    db = bq.Dataset((project_id, dataset_name))
    db.create()

    table = bq.Table(full_table_name)
    table.create(schema=bq.Schema(schema), overwrite=True)

    data = [{'col1': i, 'col2': 10 * i + 0.5} for i in range(100)]
    table.insert(data)

    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=None,
        bigquery_table=full_table_name,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    # Delete the cloud dataset first (leaking it is costlier than a temp
    # dir), and guarantee the temp dir is removed even if that fails. The
    # original ran rmtree first, so an rmtree error leaked the dataset.
    try:
      if db is not None:
        db.delete(delete_contents=True)
    finally:
      shutil.rmtree(output_folder)
def test_categorical(self):
  """Local analysis vocabs for one_hot and embedding string columns."""
  output_folder = tempfile.mkdtemp()
  # mkstemp returns (fd, path); close the fd explicitly. The original kept
  # only the path ([1]) and leaked the OS-level file descriptor.
  fd, input_file_path = tempfile.mkstemp(dir=output_folder)
  os.close(fd)
  try:
    csv_file = ['red,car', 'red,truck', 'red,van', 'blue,bike',
                'blue,train', 'green,airplane']
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'transport': {'transform': 'embedding',
                              'source_column': 'transport'}}
    analyze.run_local_analysis(output_folder, [input_file_path], schema,
                               analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder,
                         analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column: counts differ, so row order is deterministic.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    # transport column. As each vocab has the same count, order in file is
    # not known: compare as an unordered collection. six.assertCountEqual
    # replaces assertItemsEqual, which exists only on Python 2's TestCase.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['transport', 'count'])
    self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
    six.assertCountEqual(self, vocab['transport'].tolist(),
                         ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
  finally:
    shutil.rmtree(output_folder)
def test_check_schema_transforms_match(self):
  """Every invalid schema/transform pairing must raise ValueError."""

  def expect_value_error(col_type, features):
    # Helper: build a one-column schema of the given type and verify the
    # feature set is rejected.
    with self.assertRaises(ValueError):
      analyze.check_schema_transforms_match(
          [{'name': 'col1', 'type': col_type}],
          analyze.invert_features(features))

  # Categorical transforms on numeric columns.
  expect_value_error(
      'INTEGER', {'col1': {'transform': 'one_hot', 'source_column': 'col1'}})
  expect_value_error(
      'FLOAT', {'col1': {'transform': 'embedding', 'source_column': 'col1'}})

  # Numeric transform on a string column.
  expect_value_error(
      'STRING', {'col1': {'transform': 'scale', 'source_column': 'col1'}})

  # Unknown schema type.
  expect_value_error(
      'xxx', {'col1': {'transform': 'scale', 'source_column': 'col1'}})

  # Unknown transform name.
  expect_value_error(
      'INTEGER', {'col1': {'transform': 'xxx', 'source_column': 'col1'}})

  # scale and one_hot different transform family
  expect_value_error(
      'INTEGER',
      {'col1': {'transform': 'scale', 'source_column': 'col1'},
       'col2': {'transform': 'one_hot', 'source_column': 'col1'},
       'col3': {'transform': 'key', 'source_column': 'col1'}})

  # Unknown transform
  expect_value_error(
      'INTEGER', {'col1': {'transform': 'x', 'source_column': 'col1'}})