Example #1
0
    def test_consolidate_type(self):
        """Run CoalesceValueToTypeNode over mixed raw values and check results.

        Six sample rows are fed through the node; the first three output
        columns (declared as string, integer and float) are compared with
        the expected coalesced values. The fourth ("unknown") column is not
        checked.
        """
        self.input.fields = brewery.FieldList([
            ("s", "string"),
            ("i", "integer"),
            ("f", "float"),
            ("u", "unknown"),
        ])

        # One list per input row, columns in the order: s, i, f, u.
        sample_rows = [
            ["  foo  ", 123, 123, None],
            [123, "123", "123", None],
            [123.0, " 123  ", "  123  ", None],
            ["  foo  ", "1 2 3", "1 2 3  . 0", None],
            ["  foo  ", "fail", "fail", None],
            [None, None, None, None],
        ]
        for sample_row in sample_rows:
            self.input.put(sample_row)

        node = brewery.nodes.CoalesceValueToTypeNode()
        self.setup_node(node)
        self.initialize_node(node)
        node.run()
        node.finalize()

        # Split the produced rows into per-column lists for comparison.
        produced = list(self.output.buffer)
        strings = [out_row[0] for out_row in produced]
        integers = [out_row[1] for out_row in produced]
        floats = [out_row[2] for out_row in produced]

        self.assertEqual(["foo", "123", "123.0", "foo", "foo", None], strings)
        self.assertEqual([123, 123, 123, 123, None, None], integers)
        self.assertEqual([123, 123, 123, 123, None, None], floats)
    def initialize(self):
        """Validate threshold fields and build the node's output field list.

        The field names are taken from the first element of every entry in
        ``self.thresholds``. The output field list consists of all input
        fields followed by one extra field per threshold entry, named
        ``prefix + name + suffix`` with storage type ``"string"`` and
        analytical type ``"set"``. ``self.prefix`` defaults to ``""`` and
        ``self.suffix`` to ``"_bin"`` when they are not set.

        Sets ``self._output_fields`` and ``self.threshold_field_indexes``.

        Raises:
            FieldError: if a field named in ``self.thresholds`` is not among
                the input fields.
        """
        field_names = [t[0] for t in self.thresholds]
        input_fields = self.input.fields

        # Check input fields first, so that a bad configuration fails before
        # any node state has been modified.
        for name in field_names:
            if name not in input_fields:
                raise FieldError("No input field with name %s" % name)

        prefix = self.prefix or ""
        suffix = self.suffix or "_bin"

        output_fields = brewery.FieldList()

        # All input fields are passed through unchanged...
        for field in input_fields:
            output_fields.append(field)

        # ...followed by one derived field per threshold entry.
        for name in field_names:
            field = brewery.Field(prefix + name + suffix)
            field.storage_type = "string"
            field.analytical_type = "set"
            output_fields.append(field)

        self._output_fields = output_fields
        self.threshold_field_indexes = input_fields.indexes(field_names)
    def output_fields(self):
        """Return the fixed list of fields that make up one audit record.

        The list is built from six tuples -- presumably
        (name, storage type, analytical type); verify against
        ``brewery.FieldList``.
        """
        # NOTE(review): "null_count" is declared as "float" while the other
        # counts are "integer" -- confirm this is intended.
        return brewery.FieldList([
            ("field_name", "string", "typeless"),
            ("record_count", "integer", "range"),
            ("null_count", "float", "range"),
            ("null_record_ratio", "float", "range"),
            ("empty_string_count", "integer", "range"),
            ("distinct_count", "integer", "range"),
        ])
    def output_fields(self):
        """Return the field list produced by the aggregation.

        The result contains, in order: the key fields (when
        ``self.key_fields`` is set), then for every measure four float
        fields suffixed ``_sum``, ``_min``, ``_max`` and ``_average``, and
        finally one integer field named by ``self.record_count_field``.
        """
        # FIXME: use storage types based on aggregated field type
        result = brewery.FieldList()

        if self.key_fields:
            for key_field in self.input_fields.fields(self.key_fields):
                result.append(key_field)

        for measure in self.measures:
            for aggregate_suffix in ("_sum", "_min", "_max", "_average"):
                result.append(brewery.Field(measure + aggregate_suffix,
                                            storage_type="float",
                                            analytical_type="range"))

        result.append(brewery.Field(self.record_count_field,
                                    storage_type="integer",
                                    analytical_type="range"))
        return result
Example #5
0
    def test_strip_auto(self):
        """Run StringStripNode with default settings over padded values.

        The expected first output row has whitespace removed only from the
        two fields declared as "string" (str1, str2); the two "unknown"
        fields keep their padding.
        """
        self.input.fields = brewery.FieldList([
            ("str1", "string"),
            ("x", "unknown"),
            ("str2", "string"),
            ("f", "unknown"),
        ])

        padded_values = [" foo ", " bar ", " baz ", " moo "]
        for _ in range(5):
            # Put a fresh list each time so rows never share one object.
            self.input.put(list(padded_values))

        node = brewery.nodes.StringStripNode()
        self.setup_node(node)
        self.initialize_node(node)
        node.run()
        node.finalize()

        first_row = self.output.buffer[0]
        self.assertEqual(["foo", " bar ", "baz", " moo "], first_row)
    def initialize(self):
        """Normalize join specifications and prepare state for merging inputs.

        After this call the following attributes are set:

        * ``self.master_input`` -- the input selected by ``self.master``
        * ``self.detail_inputs`` -- list of ``(tag, pipe)`` for all other inputs
        * ``self._keys`` -- ``detail_tag -> (detail_key, master_key)``, both
          key specifications normalized to a tuple or list of field names
        * ``self._kindexes`` -- ``detail_tag -> (detail_indexes, master_indexes)``
        * ``self._input_rows`` -- one empty dict per input tag
        * ``self._maps`` / ``self._filters`` -- field map and row filter per
          input tag listed in ``self.maps``
        * ``self._output_fields`` -- fields of all inputs, in input order, each
          passed through its field map when one is defined

        Each entry of ``self.joins`` is ``(detail_tag, master_key)`` or
        ``(detail_tag, master_key, detail_key)``; keys may be a single name or
        a list/tuple of names.

        Raises:
            Exception: if a join specification does not have two or three
                elements, if a join refers to the master input itself, or if
                a field map is neither a dict nor a ``brewery.FieldMap``.
        """
        # Check joins and normalize them first
        self._keys = {}
        self._kindexes = {}

        self.master_input = self.inputs[self.master]
        self.detail_inputs = []
        for (tag, pipe) in enumerate(self.inputs):
            if pipe is not self.master_input:
                self.detail_inputs.append((tag, pipe))

        for join in self.joins:
            joinlen = len(join)
            if joinlen == 3:
                (detail_tag, master_key, detail_key) = join
            elif joinlen == 2:
                # We use same key names for detail as master if no detail key is specified
                (detail_tag, master_key) = join
                detail_key = master_key
            else:
                raise Exception("Join specification should be a tuple/list of two or three elements.")

            # Convert to tuple if it is just a single name (as expected later).
            # isinstance() is used so list/tuple subclasses are accepted too.
            if not isinstance(detail_key, (list, tuple)):
                detail_key = (detail_key, )
            if not isinstance(master_key, (list, tuple)):
                master_key = (master_key, )

            if detail_tag == self.master:
                raise Exception("Can not join master to itself.")

            self._keys[detail_tag] = (detail_key, master_key)

            detail_input = self.inputs[detail_tag]

            # Get field indexes
            detail_indexes = detail_input.fields.indexes(detail_key)
            master_indexes = self.master_input.fields.indexes(master_key)
            self._kindexes[detail_tag] = (detail_indexes, master_indexes)

        # Prepare storage for input data
        self._input_rows = {}
        for (tag, _) in enumerate(self.inputs):
            self._input_rows[tag] = {}

        # Create map filters
        self._filters = {}
        self._maps = {}
        if self.maps:
            for (tag, fmap) in self.maps.items():
                # FieldMap is checked first so that a ready-made map is never
                # mistaken for a plain dict specification.
                if not isinstance(fmap, brewery.FieldMap):
                    if isinstance(fmap, dict):
                        fmap = brewery.FieldMap(rename=fmap.get("rename"),
                                                drop=fmap.get("drop"),
                                                keep=fmap.get("keep"))
                    else:
                        raise Exception("Unknown field map type: %s" % type(fmap))
                self._maps[tag] = fmap
                self._filters[tag] = fmap.row_filter(self.inputs[tag].fields)

        # Construct output fields
        fields = []
        for (tag, pipe) in enumerate(self.inputs):
            fmap = self._maps.get(tag, None)
            if fmap:
                fields += fmap.map(pipe.fields)
            else:
                fields += pipe.fields

        self._output_fields = brewery.FieldList(fields)
Example #7
0
 def test_valid(self):
     """Check FieldList truth value: true with fields, false when empty."""
     populated = brewery.FieldList(["a", "b", "c", "d"])
     self.assertTrue(populated)
     empty = brewery.FieldList()
     self.assertFalse(empty)