def test_consolidate_type(self):
    # Feed mixed-type rows through CoalesceValueToTypeNode and compare the
    # first three output columns against the expected coerced values.
    self.input.fields = brewery.FieldList([
        ("s", "string"),
        ("i", "integer"),
        ("f", "float"),
        ("u", "unknown"),
    ])

    # Column order matches the field list above: s, i, f, u.
    sample_rows = [
        [" foo ", 123, 123, None],
        [123, "123", "123", None],
        [123.0, " 123 ", " 123 ", None],
        [" foo ", "1 2 3", "1 2 3 . 0", None],
        [" foo ", "fail", "fail", None],
        [None, None, None, None],
    ]
    for sample_row in sample_rows:
        self.input.put(sample_row)

    node = brewery.nodes.CoalesceValueToTypeNode()
    self.setup_node(node)
    self.initialize_node(node)
    node.run()
    node.finalize()

    # Walk the output buffer once, then slice it column by column.
    produced = list(self.output.buffer)
    strings = [out_row[0] for out_row in produced]
    integers = [out_row[1] for out_row in produced]
    floats = [out_row[2] for out_row in produced]

    self.assertEqual(["foo", "123", "123.0", "foo", "foo", None], strings)
    self.assertEqual([123, 123, 123, 123, None, None], integers)
    self.assertEqual([123, 123, 123, 123, None, None], floats)
def initialize(self):
    """Validate the threshold fields and prepare the node's output fields.

    The output field list consists of every input field followed by one new
    field per entry in ``self.thresholds``. Each new field is named
    ``prefix + name + suffix``, where ``name`` is the first element of the
    threshold entry, ``prefix`` defaults to ``""`` and ``suffix`` defaults to
    ``"_bin"`` when ``self.prefix`` / ``self.suffix`` are not set. New fields
    get storage type ``"string"`` and analytical type ``"set"``.

    Sets ``self._output_fields`` and ``self.threshold_field_indexes``.

    Raises:
        FieldError: if a threshold names a field that is not present in the
            input fields.
    """
    field_names = [threshold[0] for threshold in self.thresholds]
    input_fields = self.input.fields

    # Validate first so a bad threshold name does not leave the node with a
    # half-built ``_output_fields``.
    for name in field_names:
        if name not in input_fields:
            raise FieldError("No input field with name %s" % name)

    prefix = self.prefix or ""
    suffix = self.suffix or "_bin"

    output_fields = brewery.FieldList()
    for field in input_fields:
        output_fields.append(field)

    for name in field_names:
        field = brewery.Field(prefix + name + suffix)
        field.storage_type = "string"
        field.analytical_type = "set"
        output_fields.append(field)

    self._output_fields = output_fields
    self.threshold_field_indexes = input_fields.indexes(field_names)
def output_fields(self):
    """Return the fixed field list describing one audit record.

    Each entry is a ``(name, storage type, analytical type)`` triple passed
    to ``brewery.FieldList``.
    """
    # NOTE(review): "null_count" is declared as "float" while the other
    # *_count fields are "integer" -- confirm this is intentional.
    return brewery.FieldList([
        ("field_name", "string", "typeless"),
        ("record_count", "integer", "range"),
        ("null_count", "float", "range"),
        ("null_record_ratio", "float", "range"),
        ("empty_string_count", "integer", "range"),
        ("distinct_count", "integer", "range"),
    ])
def output_fields(self):
    """Return the field list produced by the aggregation.

    The list contains, in order: the key fields (when ``self.key_fields`` is
    set), then for every measure four float/range fields suffixed ``_sum``,
    ``_min``, ``_max`` and ``_average``, and finally the integer/range record
    count field named by ``self.record_count_field``.
    """
    # FIXME: use storage types based on aggregated field type
    result = brewery.FieldList()

    if self.key_fields:
        for key_field in self.input_fields.fields(self.key_fields):
            result.append(key_field)

    aggregate_suffixes = ("_sum", "_min", "_max", "_average")
    for measure in self.measures:
        for aggregate_suffix in aggregate_suffixes:
            result.append(brewery.Field(measure + aggregate_suffix,
                                        storage_type="float",
                                        analytical_type="range"))

    result.append(brewery.Field(self.record_count_field,
                                storage_type="integer",
                                analytical_type="range"))
    return result
def test_strip_auto(self):
    # Run StringStripNode with no explicit configuration. The expected row
    # has the two "string" columns stripped and the two "unknown" columns
    # left as they were.
    self.input.fields = brewery.FieldList([
        ("str1", "string"),
        ("x", "unknown"),
        ("str2", "string"),
        ("f", "unknown"),
    ])

    # A fresh list is built for every put so that rows never share storage.
    for _ in range(5):
        self.input.put([" foo ", " bar ", " baz ", " moo "])

    stripper = brewery.nodes.StringStripNode()
    self.setup_node(stripper)
    self.initialize_node(stripper)
    stripper.run()
    stripper.finalize()

    first_row = self.output.buffer[0]
    self.assertEqual(["foo", " bar ", "baz", " moo "], first_row)
def initialize(self):
    """Normalize join specifications and prepare state for joining inputs.

    Reads ``self.inputs``, ``self.master``, ``self.joins`` and ``self.maps``
    and sets:

    * ``self.master_input`` / ``self.detail_inputs`` -- the master pipe and a
      list of ``(tag, pipe)`` pairs for every other input;
    * ``self._keys`` -- ``detail_tag -> (detail_key, master_key)``, with both
      keys normalized to a list or tuple;
    * ``self._kindexes`` -- ``detail_tag -> (detail_indexes, master_indexes)``
      as returned by each input's ``fields.indexes``;
    * ``self._input_rows`` -- an empty dict per input tag;
    * ``self._maps`` / ``self._filters`` -- the field map and its row filter
      for every tag listed in ``self.maps``;
    * ``self._output_fields`` -- the (mapped, where a map exists) fields of
      all inputs, concatenated in input order.

    Each join is ``(detail_tag, master_key)`` or
    ``(detail_tag, master_key, detail_key)``; with two elements the master
    key is also used as the detail key.

    Raises:
        Exception: if a join does not have two or three elements, if a join
            refers to the master input as its detail, or if a value in
            ``self.maps`` is neither a dict nor a ``brewery.FieldMap``.
    """

    def as_key_sequence(key):
        # Later code expects a sequence of field names; wrap a lone name.
        # Lists and tuples (including subclasses) are passed through as-is.
        if isinstance(key, (list, tuple)):
            return key
        return (key, )

    # Check joins and normalize them first
    self._keys = {}
    self._kindexes = {}

    self.master_input = self.inputs[self.master]
    self.detail_inputs = [(tag, pipe)
                          for (tag, pipe) in enumerate(self.inputs)
                          if pipe is not self.master_input]

    for join in self.joins:
        joinlen = len(join)
        if joinlen == 3:
            (detail_tag, master_key, detail_key) = join
        elif joinlen == 2:
            # We use same key names for detail as master if no detail key
            # is specified
            (detail_tag, master_key) = join
            detail_key = master_key
        else:
            raise Exception("Join specification should be a tuple/list of two or three elements.")

        detail_key = as_key_sequence(detail_key)
        master_key = as_key_sequence(master_key)

        if detail_tag == self.master:
            raise Exception("Can not join master to itself.")

        self._keys[detail_tag] = (detail_key, master_key)

        # Get field indexes
        detail_input = self.inputs[detail_tag]
        detail_indexes = detail_input.fields.indexes(detail_key)
        master_indexes = self.master_input.fields.indexes(master_key)
        self._kindexes[detail_tag] = (detail_indexes, master_indexes)

    # Prepare storage for input data
    self._input_rows = {}
    for (tag, _pipe) in enumerate(self.inputs):
        self._input_rows[tag] = {}

    # Create map filters
    self._filters = {}
    self._maps = {}

    if self.maps:
        for (tag, fmap) in self.maps.items():
            if isinstance(fmap, dict):
                fmap = brewery.FieldMap(rename=fmap.get("rename"),
                                        drop=fmap.get("drop"),
                                        keep=fmap.get("keep"))
            elif not isinstance(fmap, brewery.FieldMap):
                raise Exception("Unknown field map type: %s" % type(fmap))

            # Build the filter before recording anything, so a failure here
            # leaves neither dict updated for this tag.
            row_filter = fmap.row_filter(self.inputs[tag].fields)
            self._maps[tag] = fmap
            self._filters[tag] = row_filter

    # Construct output fields
    fields = []
    for (tag, pipe) in enumerate(self.inputs):
        fmap = self._maps.get(tag, None)
        if fmap:
            fields += fmap.map(pipe.fields)
        else:
            fields += pipe.fields

    self._output_fields = brewery.FieldList(fields)
def test_valid(self):
    """Checking empty FieldList"""
    # A field list with fields is truthy.
    populated = brewery.FieldList(["a", "b", "c", "d"])
    self.assertTrue(populated)

    # A field list constructed without arguments is falsy.
    empty = brewery.FieldList()
    self.assertFalse(empty)