def test_numerics(self):
  """Checks min/max/mean stats for INTEGER (scale) and FLOAT (identity) columns."""
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    # 100 rows of "i,10*i+0.5" so the expected stats are easy to derive.
    rows = ['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]
    file_io.write_string_to_file(input_file_path, '\n'.join(rows))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
    stats = json.loads(file_io.read_file_to_string(stats_path).decode())

    self.assertEqual(stats['num_examples'], 100)

    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
def test_text(self):
  """Checks vocab stats and vocab files for bag_of_words/tfidf text columns.

  BUG FIX: the vocab file is sorted by count only, so the relative order of
  tokens with equal counts is nondeterministic. The old assertions pinned an
  exact token order (e.g. ['quick', 'brown', ...]) and could fail spuriously;
  tie groups are now compared as unordered collections.
  """
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    csv_file = [
        'the quick brown fox,raining in kir',
        'quick brown brown chicken,raining in pdx'
    ]
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

    schema = [{
        'name': 'col1',
        'type': 'STRING'
    }, {
        'name': 'col2',
        'type': 'STRING'
    }]
    features = {
        'col1': {
            'transform': 'bag_of_words',
            'source_column': 'col1'
        },
        'col2': {
            'transform': 'tfidf',
            'source_column': 'col2'
        }
    }
    analyze.run_local_analysis(output_folder, [input_file_path], schema,
                               analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
    vocab = pd.read_csv(
        six.StringIO(vocab_str), header=None, names=['col1', 'count'])
    # Counts: brown=2, quick=2, then the/fox/chicken=1 each. Only the count
    # ordering is deterministic, so check each tie group as a set.
    col1_vocab = vocab['col1'].tolist()
    self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
    self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
    vocab = pd.read_csv(
        six.StringIO(vocab_str), header=None, names=['col2', 'count'])
    # Counts: raining=2, in=2, then pdx/kir=1 each.
    col2_vocab = vocab['col2'].tolist()
    self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
    self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
  finally:
    shutil.rmtree(output_folder)
def test_text(self):
  """Checks vocab stats and vocab files for bag_of_words/tfidf text columns."""
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    csv_lines = ['the quick brown fox,raining in kir',
                 'quick brown brown chicken,raining in pdx']
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_lines))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
    stats = json.loads(file_io.read_file_to_string(stats_path).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

    col1_vocab_path = os.path.join(
        output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'col1')
    col1_frame = pd.read_csv(
        six.StringIO(file_io.read_file_to_string(col1_vocab_path)),
        header=None, names=['col1', 'count'])
    # vocabs are sorted by count only
    col1_vocab = col1_frame['col1'].tolist()
    self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
    self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
    self.assertEqual(col1_frame['count'].tolist(), [2, 2, 1, 1, 1])

    col2_vocab_path = os.path.join(
        output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'col2')
    col2_frame = pd.read_csv(
        six.StringIO(file_io.read_file_to_string(col2_vocab_path)),
        header=None, names=['col2', 'count'])
    # vocabs are sorted by count only
    col2_vocab = col2_frame['col2'].tolist()
    self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
    self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
    self.assertEqual(col2_frame['count'].tolist(), [2, 2, 1, 1])
  finally:
    shutil.rmtree(output_folder)
def test_categorical(self):
  """Checks vocab stats and vocab files for one_hot/embedding columns."""
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    csv_lines = ['red,car', 'red,truck', 'red,van',
                 'blue,bike', 'blue,train', 'green,airplane']
    file_io.write_string_to_file(input_file_path, '\n'.join(csv_lines))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'}]
    features = {
        'color': {'transform': 'one_hot', 'source_column': 'color'},
        'transport': {'transform': 'embedding', 'source_column': 'transport'}}
    analyze.run_local_analysis(
        output_folder, [input_file_path], schema,
        analyze.invert_features(features))

    stats_path = os.path.join(output_folder, analyze.constant.STATS_FILE)
    stats = json.loads(file_io.read_file_to_string(stats_path).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column: counts are distinct, so the file order is deterministic.
    color_vocab_path = os.path.join(
        output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color')
    color_frame = pd.read_csv(
        six.StringIO(file_io.read_file_to_string(color_vocab_path)),
        header=None, names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(color_frame, expected_vocab)

    # transport column. As each vocab has the same count, order in file is
    # not known.
    transport_vocab_path = os.path.join(
        output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'transport')
    transport_frame = pd.read_csv(
        six.StringIO(file_io.read_file_to_string(transport_vocab_path)),
        header=None, names=['transport', 'count'])
    self.assertEqual(transport_frame['count'].tolist(),
                     [1 for i in range(6)])
    self.assertItemsEqual(transport_frame['transport'].tolist(),
                          ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
  finally:
    shutil.rmtree(output_folder)
def test_numerics(self):
  """Checks min/max/mean stats for INTEGER (scale) and FLOAT (identity) columns."""
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    # Row i is "i,10*i+0.5"; 100 rows total.
    data = '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)])
    file_io.write_string_to_file(input_file_path, data)

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_local_analysis(output_folder, [input_file_path], schema,
                               analyze.invert_features(features))

    stats_file = os.path.join(output_folder, analyze.constant.STATS_FILE)
    stats = json.loads(file_io.read_file_to_string(stats_file).decode())

    self.assertEqual(stats['num_examples'], 100)

    # col1 holds 0..99.
    col1_stats = stats['column_stats']['col1']
    self.assertAlmostEqual(col1_stats['max'], 99.0)
    self.assertAlmostEqual(col1_stats['min'], 0.0)
    self.assertAlmostEqual(col1_stats['mean'], 49.5)

    # col2 holds 0.5, 10.5, ..., 990.5.
    col2_stats = stats['column_stats']['col2']
    self.assertAlmostEqual(col2_stats['max'], 990.5)
    self.assertAlmostEqual(col2_stats['min'], 0.5)
    self.assertAlmostEqual(col2_stats['mean'], 495.5)
  finally:
    shutil.rmtree(output_folder)
def test_categorical(self):
  """Checks vocab stats and vocab files for one_hot/embedding columns."""
  output_folder = tempfile.mkdtemp()
  input_file_path = tempfile.mkstemp(dir=output_folder)[1]
  try:
    rows = ['red,car', 'red,truck', 'red,van',
            'blue,bike', 'blue,train', 'green,airplane']
    file_io.write_string_to_file(input_file_path, '\n'.join(rows))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'}]
    features = {
        'color': {'transform': 'one_hot', 'source_column': 'color'},
        'transport': {'transform': 'embedding', 'source_column': 'transport'}}
    analyze.run_local_analysis(output_folder, [input_file_path], schema,
                               analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column: counts differ per value, so the file order is fixed.
    color_csv = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    color_vocab = pd.read_csv(
        six.StringIO(color_csv), header=None, names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(color_vocab, expected_vocab)

    # transport column. As each vocab has the same count, order in file is
    # not known.
    transport_csv = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
    transport_vocab = pd.read_csv(
        six.StringIO(transport_csv), header=None,
        names=['transport', 'count'])
    self.assertEqual(transport_vocab['count'].tolist(),
                     [1 for i in range(6)])
    self.assertItemsEqual(
        transport_vocab['transport'].tolist(),
        ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
  finally:
    shutil.rmtree(output_folder)