コード例 #1
0
  def test_stats_gen_with_header_in_empty_csv_file(self):
    """Checks that an empty CSV file with no column names raises ValueError."""
    # No records are written, so the file contains no header line.
    input_data_path = self._write_records_to_csv([], self._get_temp_dir(),
                                                 'input_data.csv')

    # `assertRaisesRegex` is used instead of the deprecated
    # `assertRaisesRegexp` alias, which was removed in Python 3.12.
    with self.assertRaisesRegex(
        ValueError, 'Found empty file when reading the header.*'):
      _ = stats_gen_lib.generate_statistics_from_csv(
          data_location=input_data_path, column_names=None, delimiter=',')
コード例 #2
0
  def test_stats_gen_with_csv_tab_delimiter_no_header_in_file(self):
    """Generates stats from a headerless tab-delimited file and checks them."""
    tab = '\t'
    rows, column_names, expected_stats = self._get_csv_test(
        delimiter=tab, with_header=False)
    output_dir = self._get_temp_dir()
    tsv_path = self._write_records_to_csv(rows, output_dir, 'input_data.tsv')

    # The file has no header row, so the column names are passed explicitly.
    actual_stats = stats_gen_lib.generate_statistics_from_csv(
        data_location=tsv_path,
        column_names=column_names,
        delimiter=tab,
        stats_options=self._default_stats_options)

    assert_stats_equal = (
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_stats))
    assert_stats_equal([actual_stats])
コード例 #3
0
  def test_stats_gen_with_invalid_csv_header_in_multiple_files(self):
    """Checks that input files with mismatched headers raise ValueError."""
    records, _, _ = self._get_csv_test(delimiter=',',
                                       with_header=True)
    header = records.pop(0)
    # Split the records into two subsets and write to separate files. The
    # second file deliberately starts with a header that differs from the
    # first file's header.
    records1 = [header] + records[0:3]
    records2 = ['random,header'] + records[3:]
    tmp_dir = self._get_temp_dir()
    self._write_records_to_csv(records1, tmp_dir, 'input_data1.csv')
    self._write_records_to_csv(records2, tmp_dir, 'input_data2.csv')
    # Glob pattern matching both files written above; built with os.path.join
    # rather than string concatenation, like the other multi-file test.
    input_data_path = os.path.join(tmp_dir, 'input_data*')

    # `assertRaisesRegex` is used instead of the deprecated
    # `assertRaisesRegexp` alias, which was removed in Python 3.12.
    with self.assertRaisesRegex(
        ValueError, 'Files have different headers.'):
      _ = stats_gen_lib.generate_statistics_from_csv(
          data_location=input_data_path, column_names=None, delimiter=',')
コード例 #4
0
    def test_stats_gen_with_csv_missing_column(self):
        """Checks stats for a CSV whose two columns are empty in every row."""
        column_names = ['feature1', 'feature2']
        # Two rows, each consisting of a lone delimiter (two empty fields).
        rows = [','] * 2
        csv_path = self._write_records_to_csv(
            rows, self._get_temp_dir(), 'input_data.csv')

        # Both features are expected to be reported as missing in both rows.
        expected_text = """
        datasets {
          num_examples: 2
          features {
            path {
              step: "feature1"
            }
            type: STRING
            string_stats {
              common_stats {
                num_missing: 2
              }
            }
          }
          features {
            path {
              step: "feature2"
            }
            type: STRING
            string_stats {
              common_stats {
                num_missing: 2
              }
            }
          }
        }
        """
        expected_stats = text_format.Parse(
            expected_text, statistics_pb2.DatasetFeatureStatisticsList())

        actual_stats = stats_gen_lib.generate_statistics_from_csv(
            data_location=csv_path,
            column_names=column_names,
            delimiter=',',
            stats_options=self._default_stats_options)

        assert_stats_equal = (
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_stats))
        assert_stats_equal([actual_stats])
コード例 #5
0
    def test_stats_gen_with_csv_header_in_multiple_files(self):
        """Checks stats over two CSV files that each begin with the header."""
        rows, _, expected_stats = self._get_csv_test(
            delimiter=',', with_header=True)
        # The first row is the header; it is removed from the data rows and
        # prepended to every file written below.
        header_row = rows.pop(0)
        output_dir = self._get_temp_dir()
        shards = (
            ('input_data1.csv', rows[:3]),
            ('input_data2.csv', rows[3:]),
        )
        for file_name, shard_rows in shards:
            self._write_records_to_csv(
                [header_row] + shard_rows, output_dir, file_name)
        # Glob pattern matching both files written above.
        input_glob = os.path.join(output_dir, 'input_data*')

        actual_stats = stats_gen_lib.generate_statistics_from_csv(
            data_location=input_glob,
            column_names=None,
            delimiter=',',
            stats_options=self._default_stats_options)

        assert_stats_equal = (
            test_util.make_dataset_feature_stats_list_proto_equal_fn(
                self, expected_stats))
        assert_stats_equal([actual_stats])
コード例 #6
0
  def test_stats_gen_with_csv_no_header_in_file(self, compression_type):
    """Checks stats for a headerless CSV written with a given compression.

    Args:
      compression_type: A `CompressionTypes` value. Only AUTO and GZIP are
        mapped below; any other value raises KeyError.
    """
    rows, column_names, expected_stats = self._get_csv_test(
        delimiter=',', with_header=False)
    # Translates the `CompressionTypes` value into the string form passed to
    # the CSV-writing helper.
    writer_compression_by_type = {
        CompressionTypes.AUTO: '',
        CompressionTypes.GZIP: 'gzip',
    }
    output_dir = self._get_temp_dir()
    csv_path = self._write_records_to_csv(
        rows,
        output_dir,
        'input_data.csv',
        compression_type=writer_compression_by_type[compression_type])

    actual_stats = stats_gen_lib.generate_statistics_from_csv(
        data_location=csv_path,
        column_names=column_names,
        delimiter=',',
        stats_options=self._default_stats_options,
        compression_type=compression_type)

    assert_stats_equal = (
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_stats))
    assert_stats_equal([actual_stats])
コード例 #7
0
    def test_stats_gen_with_csv_with_schema(self):
        records = ['feature1', '1']
        input_data_path = self._write_records_to_csv(records,
                                                     self._get_temp_dir(),
                                                     'input_data.csv')
        schema = text_format.Parse(
            """
        feature { name: "feature1" type: BYTES }
        """, schema_pb2.Schema())

        expected_result = text_format.Parse(
            """
    datasets {
  num_examples: 1
  features {
    path {
      step: "feature1"
    }
    type: STRING
    string_stats {
      common_stats {
        num_non_missing: 1
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 0.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 0.5
          }
          type: QUANTILES
        }
        tot_num_values: 1
      }
      unique: 1
      top_values {
        value: "1"
        frequency: 1.0
      }
      avg_length: 1.0
      rank_histogram {
        buckets {
          label: "1"
          sample_count: 1.0
        }
      }
    }
  }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        self._default_stats_options.schema = schema
        self._default_stats_options.infer_type_from_schema = True
        result = stats_gen_lib.generate_statistics_from_csv(
            data_location=input_data_path,
            delimiter=',',
            stats_options=self._default_stats_options)
        compare_fn = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result)
        compare_fn([result])