def testGetDatasetsProtoFromEntriesLists(self):
    """Checks the proto built from a hand-written entries dict with one INT feature."""
    feature_entry = {
        'vals': [1, 2, 3],
        'counts': [1, 1, 1],
        'missing': 0,
        'type': gfsg.GetFeatureStatsProtoDef().INT,
    }
    dataset = {
        'entries': {'testFeature': feature_entry},
        'size': 3,
        'name': 'testDataset',
    }
    proto = gfsg.GetDatasetsProto([dataset])

    # Dataset-level fields.
    self.assertEqual(1, len(proto.datasets))
    parsed = proto.datasets[0]
    self.assertEqual('testDataset', parsed.name)
    self.assertEqual(3, parsed.num_examples)
    self.assertEqual(1, len(parsed.features))

    # Numeric stats of the single feature.
    feature = parsed.features[0]
    self.assertEqual('testFeature', feature.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, feature.type)
    self.assertEqual(1, feature.num_stats.min)
    self.assertEqual(3, feature.num_stats.max)

    # Values-per-example histogram: first and last bucket are checked.
    histogram = feature.num_stats.common_stats.num_values_histogram
    bucket_list = histogram.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, histogram.type)
    self.assertEqual(10, len(bucket_list))
    for position in (0, 9):
        bucket = bucket_list[position]
        self.assertEqual(1, bucket.low_value)
        self.assertEqual(1, bucket.high_value)
        self.assertEqual(0.3, bucket.sample_count)
def testProtoFromDataFrames(self):
    """Checks the proto produced from a DataFrame with an int and a string column."""
    rows = [[1, 'hi'], [2, 'hello'], [3, 'hi']]
    frame = pd.DataFrame(
        rows, columns=['testFeatureInt', 'testFeatureString'])
    proto = gfsg.ProtoFromDataFrames(
        [{'table': frame, 'name': 'testDataset'}])

    self.assertEqual(1, len(proto.datasets))
    parsed = proto.datasets[0]
    self.assertEqual('testDataset', parsed.name)
    self.assertEqual(3, parsed.num_examples)
    self.assertEqual(2, len(parsed.features))

    # The test does not rely on feature order: the int feature is located by
    # name and the other feature is taken to be the string one.
    if parsed.features[0].name == 'testFeatureInt':
        int_feature, str_feature = parsed.features[0], parsed.features[1]
    else:
        int_feature, str_feature = parsed.features[1], parsed.features[0]

    self.assertEqual('testFeatureInt', int_feature.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, int_feature.type)
    self.assertEqual(1, int_feature.num_stats.min)
    self.assertEqual(3, int_feature.num_stats.max)

    self.assertEqual('testFeatureString', str_feature.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING, str_feature.type)
    self.assertEqual(2, str_feature.string_stats.unique)
def testGetDatasetsProtoWithWhitelist(self):
    """Checks that passing `features` limits the proto to the listed feature."""
    entries = {
        'testFeature': {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': gfsg.GetFeatureStatsProtoDef().INT,
        },
        'ignoreFeature': {
            'vals': [5, 6],
            'counts': [1, 1],
            'missing': 1,
            'type': gfsg.GetFeatureStatsProtoDef().INT,
        },
    }
    dataset = {'entries': entries, 'size': 3, 'name': 'testDataset'}
    proto = gfsg.GetDatasetsProto([dataset], features=['testFeature'])

    self.assertEqual(1, len(proto.datasets))
    parsed = proto.datasets[0]
    self.assertEqual('testDataset', parsed.name)
    self.assertEqual(3, parsed.num_examples)

    # Only the whitelisted feature is expected in the output.
    self.assertEqual(1, len(parsed.features))
    kept = parsed.features[0]
    self.assertEqual('testFeature', kept.name)
    self.assertEqual(1, kept.num_stats.min)
def testInfinityAndNan(self):
    """Tests stats for a float feature that contains inf, -inf and NaN values."""
    # 50 finite values (0..49) followed by +inf, -inf and NaN: 53 examples.
    special_values = [float('inf'), float('-inf'), float('nan')]
    examples = []
    for value in list(range(50)) + special_values:
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(value)
        examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    numfeat = p.datasets[0].features[0]

    self.assertEqual('num', numfeat.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT, numfeat.type)
    # The summary statistics are expected to be NaN for this input.
    self.assertTrue(np.isnan(numfeat.num_stats.min))
    self.assertTrue(np.isnan(numfeat.num_stats.max))
    self.assertTrue(np.isnan(numfeat.num_stats.mean))
    self.assertTrue(np.isnan(numfeat.num_stats.median))
    self.assertEqual(1, numfeat.num_stats.num_zeros)
    self.assertTrue(np.isnan(numfeat.num_stats.std_dev))
    self.assertEqual(53, numfeat.num_stats.common_stats.num_non_missing)

    hist = numfeat.num_stats.histograms[0]
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD, hist.type)
    self.assertEqual(1, hist.num_nan)
    self.assertEqual(10, len(buckets))
    # The outermost buckets are expected to extend to -inf / +inf.
    self.assertEqual(float('-inf'), buckets[0].low_value)
    self.assertEqual(4.9, buckets[0].high_value)
    self.assertEqual(6, buckets[0].sample_count)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(44.1, buckets[9].low_value)
    self.assertEqual(float('inf'), buckets[9].high_value)
    self.assertEqual(6, buckets[9].sample_count)
def testParseExampleStringsAndFloats(self):
    """Tests parsing examples that carry both a string and a float feature."""
    examples = []
    for number in range(50):
        example = tf.train.Example()
        feature_map = example.features.feature
        feature_map['str'].bytes_list.value.append(b'hi')
        feature_map['float'].float_list.value.append(number)
        examples.append(example)

    entries = {}
    for index, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, index)

    self.assertEqual(2, len(entries))
    float_entry = entries['float']
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT, float_entry['type'])
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                     entries['str']['type'])

    # Every example contributes exactly one value to each feature.
    for index, _ in enumerate(examples):
        self.assertEqual(1, entries['str']['counts'][index])
        self.assertEqual(1, entries['float']['counts'][index])
        self.assertEqual(index, entries['float']['vals'][index])
        decoded = entries['str']['vals'][index].decode('UTF-8', 'strict')
        self.assertEqual('hi', decoded)
def _check_sequence_example_entries(self, entries, n_examples, n_features,
                                    feat_len=None):
    """Asserts the parsed 'num' entry of a sequence-example test.

    Args:
      entries: Dictionary of parsed features; must contain the key 'num'.
      n_examples: Number of examples whose per-example counts are checked.
      n_features: Expected value count for every example.
      feat_len: Expected feature-list length for every example, or None when
        no feature-list lengths are expected to have been recorded at all.
    """
    self.assertIn('num', entries)
    num_entry = entries['num']
    self.assertEqual(0, num_entry['missing'])
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, num_entry['type'])

    expects_lengths = feat_len is not None
    for example_index in range(n_examples):
        self.assertEqual(n_features, num_entry['counts'][example_index])
        if expects_lengths:
            self.assertEqual(feat_len, num_entry['feat_lens'][example_index])

    # The flattened values are expected to be 0, 1, 2, ... in order.
    total_values = n_examples * n_features
    for position in range(total_values):
        self.assertEqual(position, num_entry['vals'][position])

    if not expects_lengths:
        self.assertEqual(0, len(num_entry['feat_lens']))
def testDTypeToType(self):
    """Checks how gfsg.DtypeToType maps numpy dtypes to feature stats types."""
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                     gfsg.DtypeToType(np.dtype(np.int32)))
    # Boolean and time types treated as int.
    # The builtin `bool` is used because the `np.bool` alias it replaced was
    # deprecated in NumPy 1.20 and removed in 1.24; the resulting dtype is
    # the same.
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                     gfsg.DtypeToType(np.dtype(bool)))
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                     gfsg.DtypeToType(np.dtype(np.datetime64)))
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                     gfsg.DtypeToType(np.dtype(np.timedelta64)))
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT,
                     gfsg.DtypeToType(np.dtype(np.float32)))
    # The builtin `str` replaces the removed `np.str` alias for the same reason.
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                     gfsg.DtypeToType(np.dtype(str)))
    # Unsupported types treated as string for now.
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                     gfsg.DtypeToType(np.dtype(np.void)))
def testParseExampleInt(self):
    """Tests parsing examples that each hold a single int64 value."""
    examples = []
    for number in range(50):
        example = tf.train.Example()
        example.features.feature['num'].int64_list.value.append(number)
        examples.append(example)

    entries = {}
    for index, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, index)

    self.assertEqual(1, len(entries))
    self.assertIn('num', entries)
    num_entry = entries['num']
    self.assertEqual(0, num_entry['missing'])
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, num_entry['type'])

    # One value per example, stored in example order.
    for index, _ in enumerate(examples):
        self.assertEqual(1, num_entry['counts'][index])
        self.assertEqual(index, num_entry['vals'][index])
def testGetDatasetsProtoSequenceExampleHistogram(self):
    """Checks the feature-list-length histogram built from 'feat_lens'."""
    entries = {
        'testFeature': {
            'vals': [1, 2, 2, 3],
            'counts': [1, 2, 1],
            'feat_lens': [1, 2, 1],
            'missing': 0,
            'type': gfsg.GetFeatureStatsProtoDef().INT,
        },
    }
    dataset = {'entries': entries, 'size': 3, 'name': 'testDataset'}
    proto = gfsg.GetDatasetsProto([dataset])

    common_stats = proto.datasets[0].features[0].num_stats.common_stats
    histogram = common_stats.feature_list_length_histogram
    bucket_list = histogram.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, histogram.type)
    self.assertEqual(10, len(bucket_list))

    # (bucket index, expected low value, expected high value, expected count)
    expectations = (
        (0, 1, 1, 0.3),
        (9, 1.8, 2, 0.3),
    )
    for position, low, high, count in expectations:
        bucket = bucket_list[position]
        self.assertEqual(low, bucket.low_value)
        self.assertEqual(high, bucket.high_value)
        self.assertEqual(count, bucket.sample_count)
def testParseExampleMissingValueList(self):
    """Tests parsing a feature that is present but has no value list set."""
    # First example: the 'str' feature is only touched, so it exists in the
    # feature map without any value list.
    empty_example = tf.train.Example()
    # pylint: disable=pointless-statement
    empty_example.features.feature['str']
    # pylint: enable=pointless-statement

    # Second example: the same feature with one bytes value.
    filled_example = tf.train.Example()
    filled_example.features.feature['str'].bytes_list.value.append(b'test')

    examples = [empty_example, filled_example]
    entries = {}
    for index, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, index)

    self.assertEqual(1, len(entries))
    self.assertIn('str', entries)
    str_entry = entries['str']
    self.assertEqual(1, str_entry['missing'])
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING, str_entry['type'])
    self.assertEqual(0, str_entry['counts'][0])
    self.assertEqual(1, str_entry['counts'][1])
def _ParseExample(example_features, example_feature_lists, entries, index):
    """Folds the features of one example into the running `entries` dictionary.

    Args:
      example_features: A map of strings to tf.Features from the example.
      example_feature_lists: A map of strings to tf.FeatureLists from the
        example.
      entries: A dictionary of all features parsed thus far and arrays of their
        values. This is mutated by the function.
      index: The index of the example to parse from a list of examples.

    Raises:
      TypeError: Raises an exception when a feature has inconsistent types
        across examples.
    """
    # (value-list field name, attribute name on the feature stats proto def),
    # listed in the order in which the fields are probed.
    value_kinds = (
        ('float_list', 'FLOAT'),
        ('bytes_list', 'STRING'),
        ('int64_list', 'INT'),
    )
    populated = set()
    sources = ((example_features, True), (example_feature_lists, False))

    for feature_map, is_plain_feature in sources:
        sequence_length = None
        for feature_name in feature_map:
            # A feature first seen at this example starts with `index` missing
            # occurrences, one for each earlier example.
            if feature_name not in entries:
                entries[feature_name] = {
                    'vals': [],
                    'counts': [],
                    'feat_lens': [],
                    'missing': index
                }
            feature_entry = entries[feature_name]
            feature = feature_map[feature_name]

            value_type = None
            value_list = []
            if is_plain_feature:
                # tf.Feature: use whichever value list is set, if any.
                for field, type_attr in value_kinds:
                    if feature.HasField(field):
                        value_list = getattr(feature, field).value
                        value_type = getattr(
                            gfsg.GetFeatureStatsProtoDef(), type_attr)
                        break
            else:
                # tf.FeatureList: the first Feature decides the type, and the
                # values of all Features are flattened into one list.
                steps = feature.feature
                sequence_length = len(steps)
                if sequence_length != 0:
                    first_step = steps[0]
                    for field, type_attr in value_kinds:
                        if first_step.HasField(field):
                            value_list = [
                                value
                                for step in steps
                                for value in getattr(step, field).value
                            ]
                            value_type = getattr(
                                gfsg.GetFeatureStatsProtoDef(), type_attr)
                            break

            if value_type is not None:
                recorded_type = feature_entry.setdefault('type', value_type)
                if recorded_type != value_type:
                    raise TypeError('type mismatch for feature ' + feature_name)

            feature_entry['counts'].append(len(value_list))
            feature_entry['vals'].extend(value_list)
            if sequence_length is not None:
                feature_entry['feat_lens'].append(sequence_length)
            if value_list:
                populated.add(feature_name)

    # Every known feature that contributed no values in this example gets its
    # missing count bumped.
    for name, entry in entries.items():
        if name not in populated:
            entry['missing'] += 1
def testGetProtoStrings(self):
    """Tests converting string examples into the feature stats proto."""
    # Two 'hello', three 'hi' and one 'hey', in that order: six examples.
    examples = []
    for text, repeats in ((b'hello', 2), (b'hi', 3), (b'hey', 1)):
        for _ in range(repeats):
            example = tf.train.Example()
            example.features.feature['str'].bytes_list.value.append(text)
            examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    self.assertEqual(1, len(p.datasets))
    test_data = p.datasets[0]
    self.assertEqual('test', test_data.name)
    self.assertEqual(6, test_data.num_examples)

    strfeat = test_data.features[0]
    self.assertEqual('str', strfeat.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING, strfeat.type)
    self.assertEqual(3, strfeat.string_stats.unique)
    # 2 * len('hello') + 3 * len('hi') + len('hey') = 19 characters in total.
    self.assertAlmostEqual(19 / 6.0, strfeat.string_stats.avg_length, 4)
    self.assertEqual(0, strfeat.string_stats.common_stats.num_missing)
    self.assertEqual(6, strfeat.string_stats.common_stats.num_non_missing)
    self.assertEqual(1, strfeat.string_stats.common_stats.min_num_values)
    self.assertEqual(1, strfeat.string_stats.common_stats.max_num_values)
    self.assertEqual(1, strfeat.string_stats.common_stats.avg_num_values)

    hist = strfeat.string_stats.common_stats.num_values_histogram
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(1, buckets[0].low_value)
    self.assertEqual(1, buckets[0].high_value)
    self.assertEqual(.6, buckets[0].sample_count)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1, buckets[9].low_value)
    self.assertEqual(1, buckets[9].high_value)
    self.assertEqual(.6, buckets[9].sample_count)

    # Top values are expected in descending frequency order.
    self.assertEqual(2, len(strfeat.string_stats.top_values))
    self.assertEqual(3, strfeat.string_stats.top_values[0].frequency)
    self.assertEqual('hi', strfeat.string_stats.top_values[0].value)
    self.assertEqual(2, strfeat.string_stats.top_values[1].frequency)
    self.assertEqual('hello', strfeat.string_stats.top_values[1].value)

    buckets = strfeat.string_stats.rank_histogram.buckets
    self.assertEqual(3, len(buckets))
    self.assertEqual(0, buckets[0].low_rank)
    self.assertEqual(0, buckets[0].high_rank)
    self.assertEqual(3, buckets[0].sample_count)
    self.assertEqual('hi', buckets[0].label)
    self.assertEqual(2, buckets[2].low_rank)
    self.assertEqual(2, buckets[2].high_rank)
    self.assertEqual(1, buckets[2].sample_count)
    self.assertEqual('hey', buckets[2].label)
def testGetProtoNums(self):
    """Tests converting int examples into the feature stats proto."""
    examples = []
    for i in range(50):
        example = tf.train.Example()
        example.features.feature['num'].int64_list.value.append(i)
        examples.append(example)
    # One extra example that carries only a different feature, so 'num' is
    # missing from exactly one of the 51 examples.
    example = tf.train.Example()
    example.features.feature['other'].int64_list.value.append(0)
    examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    self.assertEqual(1, len(p.datasets))
    test_data = p.datasets[0]
    self.assertEqual('test', test_data.name)
    self.assertEqual(51, test_data.num_examples)

    # Feature order is not relied upon; pick 'num' by name.
    numfeat = test_data.features[0] if (
        test_data.features[0].name == 'num') else test_data.features[1]
    self.assertEqual('num', numfeat.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, numfeat.type)
    self.assertEqual(0, numfeat.num_stats.min)
    self.assertEqual(49, numfeat.num_stats.max)
    self.assertEqual(24.5, numfeat.num_stats.mean)
    self.assertEqual(24.5, numfeat.num_stats.median)
    self.assertEqual(1, numfeat.num_stats.num_zeros)
    self.assertAlmostEqual(14.430869689, numfeat.num_stats.std_dev, 4)
    self.assertEqual(1, numfeat.num_stats.common_stats.num_missing)
    self.assertEqual(50, numfeat.num_stats.common_stats.num_non_missing)
    self.assertEqual(1, numfeat.num_stats.common_stats.min_num_values)
    self.assertEqual(1, numfeat.num_stats.common_stats.max_num_values)
    self.assertAlmostEqual(1, numfeat.num_stats.common_stats.avg_num_values, 4)

    hist = numfeat.num_stats.common_stats.num_values_histogram
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(1, buckets[0].low_value)
    self.assertEqual(1, buckets[0].high_value)
    self.assertEqual(5, buckets[0].sample_count)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1, buckets[9].low_value)
    self.assertEqual(1, buckets[9].high_value)
    self.assertEqual(5, buckets[9].sample_count)

    # Two value histograms are expected: STANDARD first, then QUANTILES. The
    # same first/last bucket boundaries and counts are asserted for both.
    self.assertEqual(2, len(numfeat.num_stats.histograms))
    expected_types = (
        gfsg.GetHistogramProtoDef().STANDARD,
        gfsg.GetHistogramProtoDef().QUANTILES,
    )
    for position, expected_type in enumerate(expected_types):
        value_hist = numfeat.num_stats.histograms[position]
        buckets = value_hist.buckets
        self.assertEqual(expected_type, value_hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(0, buckets[0].low_value)
        self.assertEqual(4.9, buckets[0].high_value)
        self.assertEqual(5, buckets[0].sample_count)
        self.assertAlmostEqual(44.1, buckets[9].low_value)
        self.assertEqual(49, buckets[9].high_value)
        self.assertEqual(5, buckets[9].sample_count)