Example #1
  def test_numerics(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
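      # Write 100 CSV rows where col1 is i and col2 is 10 * i + 0.5.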
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

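      # The schema names the CSV columns and gives their types; the features
      # dict assigns a transform to each source column.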
      schema = [{'name': 'col1', 'type': 'INTEGER'},
                {'name': 'col2', 'type': 'FLOAT'}]
      features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                  'col2': {'transform': 'identity', 'source_column': 'col2'}}
      analyze.run_local_analysis(
          output_folder, [input_file_path], schema, analyze.invert_features(features))

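      # The analysis writes a JSON stats file into the output folder.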
      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

      self.assertEqual(stats['num_examples'], 100)
      col = stats['column_stats']['col1']
      self.assertAlmostEqual(col['max'], 99.0)
      self.assertAlmostEqual(col['min'], 0.0)
      self.assertAlmostEqual(col['mean'], 49.5)

      col = stats['column_stats']['col2']
      self.assertAlmostEqual(col['max'], 990.5)
      self.assertAlmostEqual(col['min'], 0.5)
      self.assertAlmostEqual(col['mean'], 495.5)
    finally:
      shutil.rmtree(output_folder)
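
From the assertions in this test, the stats file appears to be JSON of roughly the shape sketched below. The shape is inferred from the test alone, not from the analyzer's documented output, and the real file may contain additional fields.

# Sketch of the stats JSON implied by the assertions above (inferred, not authoritative).
inferred_stats = {
    'num_examples': 100,
    'column_stats': {
        'col1': {'max': 99.0, 'min': 0.0, 'mean': 49.5},
        'col2': {'max': 990.5, 'min': 0.5, 'mean': 495.5},
    },
}
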
Example #2
    def test_text(self):
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
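            # Two CSV rows; both columns hold free text.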
            csv_file = [
                'the quick brown fox,raining in kir',
                'quick   brown brown chicken,raining in pdx'
            ]
            file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

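            # col1 is analyzed with a bag_of_words transform and col2 with
            # tfidf; both produce a per-column vocabulary file.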
            schema = [{
                'name': 'col1',
                'type': 'STRING'
            }, {
                'name': 'col2',
                'type': 'STRING'
            }]
            features = {
                'col1': {
                    'transform': 'bag_of_words',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'tfidf',
                    'source_column': 'col2'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())
            self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
            self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

            vocab_str = file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
            vocab = pd.read_csv(six.StringIO(vocab_str),
                                header=None,
                                names=['col1', 'count'])
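            # This expects a fixed ordering among tokens that share a count;
            # Example #3 checks the same vocabulary without relying on tie order.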
            self.assertEqual(vocab['col1'].tolist(),
                             ['quick', 'brown', 'the', 'fox', 'chicken'])
            self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

            vocab_str = file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
            vocab = pd.read_csv(six.StringIO(vocab_str),
                                header=None,
                                names=['col2', 'count'])
            self.assertEqual(vocab['col2'].tolist(),
                             ['raining', 'in', 'pdx', 'kir'])
            self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
        finally:
            shutil.rmtree(output_folder)
Example #3
  def test_text(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
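      # Two CSV rows of free text for the two string columns.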
      csv_file = ['the quick brown fox,raining in kir',
                  'quick   brown brown chicken,raining in pdx']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

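      # bag_of_words for col1, tfidf for col2; both produce a vocabulary file.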
      schema = [{'name': 'col1', 'type': 'STRING'}, {'name': 'col2', 'type': 'STRING'}]
      features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                  'col2': {'transform': 'tfidf', 'source_column': 'col2'}}
      analyze.run_local_analysis(
        output_folder, [input_file_path], schema, analyze.invert_features(features))

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
      self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)

      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['col1', 'count'])

      # vocabs are sorted by count only
      col1_vocab = vocab['col1'].tolist()
      self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
      self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['col2', 'count'])

      # vocabs are sorted by count only
      col2_vocab = vocab['col2'].tolist()
      self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
      self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
    finally:
      shutil.rmtree(output_folder)
Example #4
  def test_categorical(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
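      # Six CSV rows: colors repeat (3 red, 2 blue, 1 green); transports are unique.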
      csv_file = ['red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
                  'green,airplane']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

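      # one_hot and embedding both treat the source column as categorical and
      # produce a vocabulary file.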
      schema = [{'name': 'color', 'type': 'STRING'},
                {'name': 'transport', 'type': 'STRING'}]
      features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                  'transport': {'transform': 'embedding', 'source_column': 'transport'}}
      analyze.run_local_analysis(
        output_folder, [input_file_path], schema, analyze.invert_features(features))

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
      self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

      # Color column.
      vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['color', 'count'])
      expected_vocab = pd.DataFrame(
          {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
          columns=['color', 'count'])
      pd.util.testing.assert_frame_equal(vocab, expected_vocab)

      # transport column. As every entry has the same count, the order in the
      # file is not known.
      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['transport', 'count'])
      self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
      self.assertItemsEqual(vocab['transport'].tolist(),
                            ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
    finally:
      shutil.rmtree(output_folder)
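
The color assertions above imply that each vocabulary file is a headerless two-column CSV of value,count rows sorted by descending count. A sketch of the implied contents of the `color` file, inferred from the test rather than from the analyzer's documentation:

# Implied contents of the vocabulary file for the 'color' column (inferred);
# it mirrors the expected_vocab DataFrame in the test above.
inferred_color_vocab = 'red,3\nblue,2\ngreen,1'
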
Example #5
    def test_numerics(self):
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            file_io.write_string_to_file(
                input_file_path,
                '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

            schema = [{
                'name': 'col1',
                'type': 'INTEGER'
            }, {
                'name': 'col2',
                'type': 'FLOAT'
            }]
            features = {
                'col1': {
                    'transform': 'scale',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'identity',
                    'source_column': 'col2'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
Example #6
    def test_categorical(self):
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            csv_file = [
                'red,car', 'red,truck', 'red,van', 'blue,bike', 'blue,train',
                'green,airplane'
            ]
            file_io.write_string_to_file(input_file_path, '\n'.join(csv_file))

            schema = [{
                'name': 'color',
                'type': 'STRING'
            }, {
                'name': 'transport',
                'type': 'STRING'
            }]
            features = {
                'color': {
                    'transform': 'one_hot',
                    'source_column': 'color'
                },
                'transport': {
                    'transform': 'embedding',
                    'source_column': 'transport'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())
            self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
            self.assertEqual(stats['column_stats']['transport']['vocab_size'],
                             6)

            # Color column.
            vocab_str = file_io.read_file_to_string(
                os.path.join(output_folder,
                             analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
            vocab = pd.read_csv(six.StringIO(vocab_str),
                                header=None,
                                names=['color', 'count'])
            expected_vocab = pd.DataFrame(
                {
                    'color': ['red', 'blue', 'green'],
                    'count': [3, 2, 1]
                },
                columns=['color', 'count'])
            pd.util.testing.assert_frame_equal(vocab, expected_vocab)

            # transport column. As every entry has the same count, the order in
            # the file is not known.
            vocab_str = file_io.read_file_to_string(
                os.path.join(
                    output_folder,
                    analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
            vocab = pd.read_csv(six.StringIO(vocab_str),
                                header=None,
                                names=['transport', 'count'])
            self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
            self.assertItemsEqual(
                vocab['transport'].tolist(),
                ['car', 'truck', 'van', 'bike', 'train', 'airplane'])
        finally:
            shutil.rmtree(output_folder)
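
The methods above are unittest test cases. Below is a minimal sketch of a harness that could host them; the class name, the commented-out `analyze` import, and the assumption that `file_io` comes from `tensorflow.python.lib.io` are illustrative guesses, not taken from the examples.

# Hypothetical harness for the test methods shown above; names and import
# paths flagged below are assumptions, not taken from the examples.
import json
import os
import shutil
import tempfile
import unittest

import pandas as pd
import six
from tensorflow.python.lib.io import file_io  # assumed source of file_io

# The examples use an `analyze` module exposing run_local_analysis,
# invert_features, and a `constant` namespace; its import path is unknown here.
# import analyze


class LocalAnalysisTest(unittest.TestCase):  # hypothetical class name
    """Host class: paste the test methods from the examples above here."""


if __name__ == '__main__':
    unittest.main()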