def from_json(self, json_form, overwrite_name=True):
        """
        Load the table from a JSON string, of the form produced by toJSON().  Note
        that if the overwrite_name parameter = True (the default), this will also
        overwrite the table name.

        Args:
            json_form: A JSON form of the Dictionary

        Returns:
            None

        Throws:
            InvalidDataException if json_form is malformed

        """
        try:
            record = loads(json_form)
        except JSONDecodeError as error:
            # BUG FIX: the original `except JSONDecodeError(msg):` evaluated
            # JSONDecodeError(msg) at catch time, raising NameError on the
            # undefined name msg instead of the intended InvalidDataException.
            raise InvalidDataException(error) from error
        if not isinstance(record, dict):
            raise InvalidDataException(
                f'JSON form of table must be a dictionary, not {type(record)}')
        self._check_fields(record, {"name", "table"}, 'JSON form of table')
        self.load_from_dictionary(record["table"])
        if overwrite_name:
            self.name = record["name"]
 def _check_schema_match(self, schema, data):
     # Validate (schema, data) as a pair: every row must have exactly one
     # entry per schema column, and gviz_api must be able to build and
     # serialize a DataTable from them.  Any failure surfaces as an
     # InvalidDataException.
     expected_length = len(schema)
     for row in data:
         if len(row) != expected_length:
             raise InvalidDataException(
                 f"All rows must have length {len(schema)}")
     try:
         # Round-trip through gviz_api to let it catch type mismatches.
         gviz_table = gviz_api.DataTable(schema)
         gviz_table.LoadData(data)
         gviz_table.ToJSon()
     except gviz_api.DataTableException as schema_error:
         raise InvalidDataException(schema_error)
    def filter_by_function(self,
                           column_name,
                           function,
                           new_table_name,
                           column_types=None):
        '''
        Create a new table, with name new_table_name, with rows such that
        function(row[column_name]) == True.  The new table will have
        columns {self.columns} - {column_name}, same types, and same order.

        Args:
            column_name: the column to filter by
            function: a Boolean function with a single argument of the type of columns[column_name]
            new_table_name: name of the new table
            column_types: set of the allowed column types; if empty or None, any type is permitted

        Returns:
            A table with column[column_name] missing and filtered

        Throws:
            InvalidDataException if new_table_name is empty, column_name is not a name of an existing column, or the type of column_name isn't in column_types (if column_types is non-empty)
        '''
        # column_types defaults to None rather than a mutable {} (idiom fix);
        # both are falsy, so the permissive "any type" behavior is unchanged.
        if not new_table_name:
            raise InvalidDataException('new_table_name cannot be empty')
        if not column_name:
            raise InvalidDataException('column_name cannot be empty')
        # Consistency fix: use the shared helper instead of duplicating the
        # schema scan inline; it raises InvalidDataException when missing.
        index = self._get_column_index(column_name)
        if column_types and self.schema[index]["type"] not in column_types:
            raise InvalidDataException(
                f'Type {self.schema[index]["type"]} not found in {column_types}'
            )
        # Keep only matching rows, dropping the filtered column from each.
        data = [
            row[:index] + row[index + 1:] for row in self.data
            if function(row[index])
        ]
        schema = self.schema[:index] + self.schema[index + 1:]
        result = GalyleoTable(new_table_name)
        result.load_from_dictionary({"columns": schema, "rows": data})
        return result
    def filter_range(self, column_name, range_as_tuple, new_table_name,
                     column_types=None):
        '''
        A convenience method over filter_by_function.  This is identical to
        filter_by_function(column_name, lambda x: range_as_tuple[0] <= x <= range_as_tuple[1], new_table_name, column_types)

        Args:
            column_name: the column to filter by
            range_as_tuple: the tuple representing the range
            new_table_name: name of the new table
            column_types: set of the allowed column types; if empty or None, any type is permitted

        Returns:
            A table with column[column_name] missing and filtered

        Throws:
            InvalidDataException if new_table_name is empty, column_name is not a name of an existing column, the type of column_name isn't in column_types (if column_types is non-empty), or len(range_as_tuple) != 2
        '''
        # BUG FIX: the original validated with `assert`, which is stripped
        # under `python -O`, silently skipping the check.  Use an explicit
        # test instead; TypeError covers arguments with no len() (matching
        # the original's broad except around the assert).
        try:
            valid = bool(range_as_tuple) and len(range_as_tuple) == 2
        except TypeError:
            valid = False
        if not valid:
            raise InvalidDataException(
                f'{range_as_tuple} should be a tuple of length 2')

        return self.filter_by_function(
            column_name,
            lambda x: range_as_tuple[0] <= x <= range_as_tuple[1],
            new_table_name, column_types)
 def _get_column_index(self, column_name):
     indices = [
         i for i in range(len(self.schema))
         if self.schema[i]["name"] == column_name
     ]
     if (len(indices) == 0):
         raise InvalidDataException(
             f'Column {column_name} is not in the schema')
     return indices[0]
    def pivot_on_column(self,
                        pivot_column_name,
                        value_column_name,
                        new_table_name,
                        pivot_column_values=None,
                        other_column=False):
        '''
        The pivot_on_column method breaks out value_column into n separate columns, one for each
        member of pivot_column_values plus (if other_column = True), an "Other" column.  This is easiest to see with an example.  Consider a table with columns (Year, State, Party, Percentage).  pivot_on_column('Party', {'Republican', 'Democratic'}, 'Percentage', 'pivot_table', False) would create a new table with columns Year, State, Republican, Democratic, where the values in the Republican and Democratic columns are the  values in the Percentage column where the Party column value was Republican or Democratic, respectively.  If Other = True, an additional column, Other, is found where the value is (generally) the sum of values where Party not equal Republican or Democratic

        Args:
            pivot_column_name: the column holding the keys to pivot on
            value_column_name: the column holding the values to spread out over the pivots
            new_table_name: name of the new table
            pivot_column_values: the values to pivot on.  If empty or None, all values used
            other_column: if True, aggregate other values into a column

        Returns:
            A table as described in the comments above

        Throws:
            InvalidDataException if new_table_name is empty, pivot_column_name is not a name of an existing column, or value_column_name is not the name of an existing column
        '''
        # pivot_column_values defaults to None rather than a mutable {}
        # (idiom fix); both are falsy, so behavior is unchanged.
        names = [(new_table_name, 'new_table_name'),
                 (pivot_column_name, 'pivot_column_name'),
                 (value_column_name, 'value_column_name')]
        for (value, description) in names:
            if not value:
                raise InvalidDataException(f'{description} cannot be empty')
        if value_column_name == pivot_column_name:
            raise InvalidDataException(
                f'Pivot and value columns cannot be identical: both are {value_column_name}'
            )

        value_column_index = self._get_column_index(value_column_name)
        pivot_column_index = self._get_column_index(pivot_column_name)
        # The key columns are all the columns except the pivot and value ones,
        # kept in schema order.
        key_columns = list(
            set(range(len(self.schema))) -
            {value_column_index, pivot_column_index})
        key_columns.sort()

        # Split each row into a record with:
        #   key: tuple of the key-column values (a tuple so it can index a dict)
        #   pivot: value of the pivot column
        #   value: value of the value column
        def _partition_record(row):
            return {
                "key": tuple([row[i] for i in key_columns]),
                "pivot": row[pivot_column_index],
                "value": row[value_column_index]
            }

        partition = [_partition_record(row) for row in self.data]

        # the set of all distinct keys
        keys = set([record["key"] for record in partition])

        # the distinct values of the pivot column
        pivot_value_set = set([record["pivot"] for record in partition])

        # We will have an "Other" column when:
        # (a) other_column = True AND
        # (b) pivot_column_values is not empty AND
        # (c) there are pivot values outside pivot_column_values
        leftover_values = pivot_value_set - set(pivot_column_values) if pivot_column_values else set()
        use_other = other_column and leftover_values

        if pivot_column_values:
            pivot_value_set = pivot_value_set.intersection(pivot_column_values)

        value_column_type = self.schema[value_column_index]["type"]

        # One output record per key: a slot per pivot value plus "Other",
        # initialized to 0 for numbers so "Other" can accumulate sums.
        def _new_pivot_record():
            initial_value = 0 if value_column_type == GALYLEO_NUMBER else None
            result = {name: initial_value for name in pivot_value_set}
            result["Other"] = initial_value
            return result

        pivot_records = {key: _new_pivot_record() for key in keys}
        for record in partition:
            # BUG FIX: the original indexed pivot_records with the stale loop
            # variable `key` from the initialization loop, so every record was
            # written into the last key's pivot record.  Use the record's own key.
            row_key = record["key"]
            pivot = record["pivot"]
            if pivot in pivot_value_set:
                pivot_records[row_key][pivot] = record["value"]
            else:
                pivot_records[row_key][
                    "Other"] = record["value"] + pivot_records[row_key]["Other"]

        # Now just create and return the new table
        new_schema = [self.schema[i] for i in key_columns]

        pivot_schema = [{
            "name": name,
            "type": value_column_type
        } for name in pivot_value_set]

        if use_other:
            pivot_schema.append({"name": "Other", "type": value_column_type})

        def _output_row(key_value):
            record = pivot_records[key_value]
            values = [record[pivot] for pivot in pivot_value_set]
            if use_other:
                values.append(record["Other"])
            return list(key_value) + values

        data = [_output_row(key) for key in pivot_records]
        result = GalyleoTable(new_table_name)
        result.load_from_dictionary({
            "columns": new_schema + pivot_schema,
            "rows": data
        })
        return result
    def aggregate_by(self,
                     aggregate_column_names,
                     new_column_name="count",
                     new_table_name=None):
        """
        Create a new table by aggregating over multiple columns.  The resulting table
        contains the aggregate column names and the new column name, and for each
        unique combination of values among the aggregate column names, the count of rows in this
        table with that unique combination of values.
        The new table will have name new_table_name

        Args:
            aggregate_column_names: names of the  columns to aggregate over
            new_column_name: name of the column for the aggregate count.  Defaults to count
            new_table_name: name of the new table.  If omitted, defaults to None, in which case a name will be generated

        Returns:
            A new table with name new_table_name, or a generated name if new_table_name == None

        Throws:
            InvalidDataException if aggregate_column_names is empty or not a subset of the names in self.schema

        """
        if (aggregate_column_names is None
                or len(aggregate_column_names) == 0):
            raise InvalidDataException('No columns specified for aggregation')
        column_names = set(aggregate_column_names)
        columns = [
            entry for entry in self.schema if entry["name"] in column_names
        ]
        if len(aggregate_column_names) != len(columns):
            # We have a missing column.  Find it and throw the InvalidDataException
            current_columns = set([entry["name"] for entry in columns])
            missing_columns = column_names - current_columns
            raise InvalidDataException(
                f'Columns {missing_columns} are not present in the schema')
        # Generate a table name from the first letter of each column name
        if new_table_name is None:
            new_table_name = 'aggregate_' + ''.join(
                name[0] for name in aggregate_column_names)
        # Collect the indices of the requested columns
        indices = [
            i for i in range(len(self.schema))
            if self.schema[i]["name"] in column_names
        ]
        # Collect the key columns for each row; tuples (hashable) when there is
        # more than one key column, bare values when there is exactly one.
        simple_keys = len(indices) == 1
        if simple_keys:
            short_rows = [row[indices[0]] for row in self.data]
        else:
            short_rows = [
                tuple(row[i] for i in indices) for row in self.data
            ]
        # Count the instances of each unique key in a single pass (replaces the
        # original's pre-seeded two-loop count; every count is >= 1, so the
        # original's trailing "> 0" filter was a no-op and is dropped)
        count = {}
        for key in short_rows:
            count[key] = count.get(key, 0) + 1
        # Convert the keys from tuples to lists and append the count for each
        data = []
        for key, key_count in count.items():
            key_as_list = [key] if simple_keys else list(key)
            data.append(key_as_list + [key_count])
        # The schema is just the requested columns + new_column_name, and the type
        # of new_column_name is a number.  Then create the result table, load in the
        # schema and data, and quit.
        schema = columns[:] + [{
            "name": new_column_name,
            "type": GALYLEO_NUMBER
        }]
        table = GalyleoTable(new_table_name)
        table.load_from_dictionary({"columns": schema, "rows": data})
        return table
 def _check_fields(self, record, required_fields, message_header):
     fields = set(record.keys())
     if (not fields.issuperset(required_fields)):
         raise InvalidDataException(
             f'{message_header} is missing fields {required_fields - fields}'
         )