Beispiel #1
0
    def test_remove_null_rows(self):
        """Check that `remove_null_rows` drops rows holding a null in the given column(s)."""

        # Single column name: only the row with a non-null 'b' should survive
        single_col_rows = [{'a': 1, 'b': 2}, {'a': 1, 'b': None}]
        single_col_tbl = Table(single_col_rows)
        filtered = single_col_tbl.remove_null_rows('b')
        self.assertEqual(filtered.num_rows, 1)

        # List of column names: the row with a null in any listed column is dropped
        complete_row = {'a': 1, 'b': 2, 'c': 3}
        partial_row = {'a': 1, 'b': None, 'c': 3}
        multi_col_tbl = Table([complete_row, partial_row])
        filtered = multi_col_tbl.remove_null_rows(['b', 'c'])
        self.assertEqual(filtered.num_rows, 1)
Beispiel #2
0
    def unpack_nested_columns_as_rows(self,
                                      column,
                                      key='id',
                                      expand_original=False):
        """
        Unpack list or dict values from one column into separate rows.
        Not recommended for JSON columns (i.e. lists of dicts), but can handle columns
        with any mix of types. Makes use of PETL's `melt()` method.

        `Args:`
            column: str
                The column name to unpack
            key: str
                The column to use as a key when unpacking. Defaults to `id`
            expand_original: boolean or int
                If `True`: Add resulting unpacked rows (with all other columns) to original
                If `int`: Add to original unless the max added per key is above the given number
                If `False` (default): Return unpacked rows (with `key` column only) as standalone
        `Returns:`
            If `expand_original`, a table of the non-nested rows plus the unpacked rows,
            with the unpacked values in a `<column>_value` column.
            Otherwise, standalone table with `uid`, key column and unpacked values only.

        `Notes:`
            A `uid` column (MD5 hex digest of the row's stringified fields) is added to
            the result, unless `expand_original` is set and the table already has one.
            This method never rebinds the caller's table; any in-place change to it comes
            only from the helper methods invoked on `self` (e.g. `remove_column`).
            NOTE(review): confirm whether those helpers mutate in place.
        """
        import hashlib

        # Imported here rather than at module level (presumably to avoid a circular import)
        from parsons.etl.table import Table

        def _row_uid(row):
            # Hash the concatenated string form of every field in the row
            return hashlib.md5(
                str.encode(''.join([str(x) for x in row]))).hexdigest()

        # `bool` is a subclass of `int`, so exclude both True and False explicitly:
        # only a genuine integer is treated as a "max rows per key" threshold.
        if isinstance(expand_original, int) and not isinstance(expand_original, bool):
            # `default=0` covers tables with no list/dict values in `column`
            max_len = max(
                (len(row[column]) for row in self
                 if isinstance(row[column], (dict, list))),
                default=0)
            if max_len > expand_original:
                expand_original = False

        if expand_original:
            # Include all columns and keep only list values in table_list
            table = self
            table_list = table.select_rows(
                lambda row: isinstance(row[column], list))
        else:
            # Otherwise, include only key and column, but keep all non-dict types in table_list
            table = self.cut(key, column)
            table_list = table.select_rows(
                lambda row: not isinstance(row[column], dict))

        # All the columns other than column to ignore while melting
        ignore_cols = table.columns
        ignore_cols.remove(column)

        # Unpack lists as separate columns
        table_list.unpack_list(column, replace=True)

        # Rename the unpacked columns to retain only the number. Columns that are carried
        # through as melt keys are skipped so that a coincidental `<column>_` substring in
        # their names does not get stripped.
        prefix = f'{column}_'
        for col in table_list.columns:
            if col not in ignore_cols and prefix in col:
                table_list.rename_column(col, col.replace(prefix, ""))

        # Filter dicts and unpack as separate columns
        table_dict = table.select_rows(
            lambda row: isinstance(row[column], dict))
        table_dict.unpack_dict(column, prepend=False)

        # Use melt to pivot both sets of columns into their own Tables and clean out None values
        melted_list = Table(petl.melt(table_list.table, ignore_cols))
        melted_dict = Table(petl.melt(table_dict.table, ignore_cols))

        # Return values are discarded here, so this relies on `remove_null_rows` acting
        # in place -- NOTE(review): confirm against its implementation.
        melted_list.remove_null_rows('value')
        melted_dict.remove_null_rows('value')

        melted_list.rename_column('variable', column)
        melted_dict.rename_column('variable', column)

        # Combine the list and dict Tables
        melted_list.concat(melted_dict)

        if expand_original:
            # Add unpacked rows to the non-nested rows of the original table
            orig = self.select_rows(
                lambda row: not isinstance(row[column], (dict, list)))
            orig.concat(melted_list)
            # Add unique id column by hashing all the other fields
            if 'uid' not in self.columns:
                orig.add_column('uid', _row_uid)
                orig.move_column('uid', 0)

            # Rename value column in case this is done again to this Table
            orig.rename_column('value', f'{column}_value')

            # Keep column next to column_value
            orig.move_column(column, -1)
            return orig

        # Drop the packed column from this table. Assigning the result back to `self`
        # would only rebind a local name, so the return value is not kept.
        self.remove_column(column)
        # Add unique id column by hashing all the other fields
        melted_list.add_column('uid', _row_uid)
        melted_list.move_column('uid', 0)
        return melted_list