def _CheckHierarchies(self):
    """Verify that each concept instance has at most one parent value.

    Walks every column that declares a parent_ref and asks the in-memory
    data container how many distinct parent values each child value has.

    Raises:
      DataSourceError: If any child value maps to multiple parent values.
    """
    for child_column in self.column_bundle.GetColumnIterator():
        if not child_column.parent_ref:
            continue

        # Total ("rollup") values are placeholders, not real instances,
        # so they are excluded from the uniqueness check.
        excluded_totals = {}

        if child_column.total_val:
            excluded_totals[child_column.column_id] = child_column.total_val

        parent_column = self.column_bundle.GetColumnByID(
            child_column.parent_ref)

        if parent_column.total_val:
            excluded_totals[child_column.parent_ref] = parent_column.total_val

        parent_counts = self.data_container.CombinationCount(
            child_column.column_id, child_column.parent_ref, excluded_totals)

        # Each result row is (child value, number of distinct parents).
        bad_values = [row[0] for row in parent_counts if row[1] > 1]

        if bad_values:
            raise data_source.DataSourceError(
                'Instances of column %s have multiple parent values: %s' %
                (child_column.column_id, bad_values))
def _CheckHierarchies(self):
    """Make sure that each concept instance has no more than one parent.

    Runs one grouping query per parented column against the sqlite table.

    Raises:
      DataSourceError: If a query fails or any column value is paired with
          more than one distinct parent value.
    """
    cursor = self.sqlite_connection.cursor()

    # Close the cursor even when a DataSourceError aborts the check; the
    # previous version leaked it on every error path.
    try:
        for column in self.column_bundle.GetColumnIterator():
            if not column.parent_ref:
                continue

            # Do not count total values as instances.
            # NOTE(review): unlike the in-memory implementation, this does
            # not also exclude the parent column's total value — confirm
            # whether that asymmetry is intentional.
            if column.total_val:
                where_clause = ('WHERE %s != "%s"' %
                                (column.column_id, column.total_val))
            else:
                where_clause = ''

            # For each child value, count how many distinct parent values
            # co-occur with it.
            query_str = (
                'SELECT %s, COUNT(*) FROM (SELECT DISTINCT %s, %s '
                'FROM csv_table %s) GROUP BY %s' %
                (column.column_id, column.column_id, column.parent_ref,
                 where_clause, column.column_id))

            try:
                cursor.execute(query_str)
            except sqlite3.OperationalError as e:
                raise data_source.DataSourceError(
                    'Error executing query: %s\n%s' % (query_str, str(e)))

            error_values = [row[0] for row in cursor if int(row[1]) > 1]

            if error_values:
                raise data_source.DataSourceError(
                    'Instances of column %s have multiple parent values: %s' %
                    (column.column_id, error_values))
    finally:
        cursor.close()
def _HeaderToColumn(header_string):
    """Parse the header string for a column.

    Args:
      header_string: The complete string for the column header

    Returns:
      A DataSourceColumn object populated based on the header data

    Raises:
      DataSourceError: If there are any errors in parsing, e.g. if an
          unrecognized key is found.
    """
    # The column id must be at least one character long, and cannot contain
    # the characters '[', ']', ';', or whitespace. (Raw string avoids
    # invalid-escape warnings for \] \[ \s.)
    parameters_match = re.match(r'^([^\]\[;\s]+)(?:\[(.*)\]){0,1}$',
                                header_string.strip().replace('"', ''))

    if not parameters_match:
        raise data_source.DataSourceError(
            'Formatting error for header string: %s' % header_string)

    column_id = parameters_match.group(1)
    column = data_source.DataSourceColumn(column_id, internal_parameters={})

    if parameters_match.group(2):
        # Parse the semicolon-delimited key=value column parameters
        key_value_pairs = parameters_match.group(2).split(';')

        for key_value_pair in key_value_pairs:
            try:
                [key, value] = key_value_pair.split('=')
            except ValueError:
                raise data_source.DataSourceError(
                    'Formatting error for header string: %s' % header_string)

            # Map the key to the appropriate field of the DataSourceColumn
            # object
            if key == 'type':
                if value not in ['date', 'float', 'integer', 'string']:
                    raise data_source.DataSourceError(
                        'Unknown data type for column %s: %s' %
                        (column.column_id, value))
                column.data_type = value
            elif key == 'format':
                column.data_format = value
            elif key == 'concept':
                column.concept_ref = value
            elif key == 'extends':
                column.concept_extension = value
            elif key == 'parent':
                column.parent_ref = value
            elif key == 'slice_role':
                role_value = value.lower()

                if role_value not in ['dimension', 'metric']:
                    # Fixed typo in the error message ('slice_roll' ->
                    # 'slice_role').
                    raise data_source.DataSourceError(
                        'Unrecognized slice_role in column %s: %s' %
                        (column.column_id, value))
                else:
                    column.slice_role = role_value
            elif key == 'rollup':
                if value.lower() == 'true':
                    column.rollup = True
                elif value.lower() == 'false':
                    column.rollup = False
                else:
                    raise data_source.DataSourceError(
                        'Unrecognized boolean value in column %s: %s' %
                        (column.column_id, value))
            elif key == 'total_val':
                column.total_val = value
            elif key == 'dropif':
                column.internal_parameters['dropif_val'] = value
            elif key == 'zeroif':
                column.internal_parameters['zeroif_val'] = value
            elif key == 'aggregation':
                # Use the str method instead of the legacy string-module
                # function (string.lower), matching the rest of this parser.
                if value.lower() not in [
                        'sum', 'max', 'min', 'avg', 'count']:
                    raise data_source.DataSourceError(
                        'Unknown aggregation for column %s: %s' %
                        (column.column_id, value))
                column.internal_parameters['aggregation'] = value
            else:
                raise data_source.DataSourceError(
                    'Unknown parameter for column %s: %s' %
                    (column.column_id, key))

    return column
def ConstructColumnBundle(csv_file, verbose=True):
    """Construct a ColumnBundle from the header information in a CSV file.

    Args:
      csv_file: The complete string for the column header
      verbose: Print out extra information to stdout

    Returns:
      A data_source.ColumnBundle object populated based on the CSV header

    Raises:
      DataSourceError: If there are any parsing errors or data inconsistencies
    """
    # Get the first and second rows of the CSV; the second row is used as
    # sample data when guessing types and formats below.
    header_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    header_row_values = header_csv_reader.next()
    second_row_values = header_csv_reader.next()

    # Rewind so the caller can re-read the file from the top afterwards.
    csv_file.seek(0)

    # Check that second row is properly formatted
    if len(header_row_values) != len(second_row_values):
        raise data_source.DataSourceError(
            'Number of columns in row 2 (%d) does not match number '
            'expected (%d)' % (len(second_row_values),
                               len(header_row_values)))

    column_bundle = data_source.DataSourceColumnBundle()

    for header_element in header_row_values:
        column_bundle.AddColumn(_HeaderToColumn(header_element))

    num_date_columns = 0
    has_metric_column = False

    column_ids = [
        column.column_id for column in column_bundle.GetColumnIterator()
    ]

    # Iterate through columns, populating and refining DataSourceColumn
    # parameters as necessary
    for c, column in enumerate(column_bundle.GetColumnIterator()):
        if verbose:
            print '\nEvaluating column %s' % column.column_id

        # Check data type; if the header did not declare one, guess it from
        # the sample value in the second row.
        if not column.data_type:
            column.data_type = (data_source.GuessDataType(
                second_row_values[c], column.column_id))

            if verbose:
                print 'Guessing that column %s is of type %s' % (
                    column.column_id, column.data_type)

        # Check slice type; numeric columns default to metrics, everything
        # else to dimensions.
        if not column.slice_role:
            if column.data_type == 'integer' or column.data_type == 'float':
                column.slice_role = 'metric'
            else:
                column.slice_role = 'dimension'

            if verbose:
                print 'Guessing that column %s is a %s' % (column.column_id,
                                                           column.slice_role)

        # Check aggregation; metrics default to SUM when none was declared.
        if column.slice_role == 'metric':
            has_metric_column = True

            if 'aggregation' not in column.internal_parameters:
                column.internal_parameters['aggregation'] = 'SUM'

                if verbose:
                    print 'Guessing that column %s should be aggregated by %s' % (
                        column.column_id,
                        column.internal_parameters['aggregation'])

        # Check parent; a referenced parent must exist and is forced to be a
        # rollup column.
        if column.parent_ref:
            if column.parent_ref not in column_ids:
                raise data_source.DataSourceError(
                    'Column %s references a parent not defined in this dataset: %s'
                    % (column.column_id, column.parent_ref))

            parent_column = column_bundle.GetColumnByID(column.parent_ref)

            if not parent_column.rollup:
                parent_column.rollup = True

                if verbose:
                    print (
                        'Making column %s rollup since it is a parent to column %s'
                        % (parent_column.column_id, column.column_id))

        # Check date format and concept; guessed from the sample value when
        # not declared in the header.
        if column.data_type == 'date':
            num_date_columns += 1

            if not column.data_format:
                column.data_format = (data_source.GuessDateFormat(
                    second_row_values[c]))

            if not column.concept_ref:
                column.concept_ref = (data_source.GuessDateConcept(
                    column.data_format))

                if verbose:
                    print ('Guessing that column %s is formatted as %s and '
                           'corresponds to %s' %
                           (column.column_id, column.data_format,
                            column.concept_ref))

    # Warn user if their file will not produce interesting DSPL visualizations
    if num_date_columns == 0:
        warnings.warn('Input file does not have a date column',
                      data_source.DataSourceWarning)
    elif num_date_columns > 1:
        warnings.warn('Input file has more than one date column',
                      data_source.DataSourceWarning)

    if not has_metric_column:
        warnings.warn('Input file does not have any metrics',
                      data_source.DataSourceWarning)

    return column_bundle
def __init__(self, csv_file, verbose=True):
    """Populate a CSVDataSourceSqlite object based on a CSV file.

    Note that the caller is responsible for closing the csv_file.

    Args:
      csv_file: A file-like object, opened for reading, that has CSV data in it
      verbose: Print out status messages to stdout

    Raises:
      DataSourceError: If CSV isn't properly formatted
    """
    # Backing store is an on-disk sqlite database in a fresh temp directory.
    self.sqlite_dir = tempfile.mkdtemp()
    self.verbose = verbose

    self.column_bundle = csv_utilities.ConstructColumnBundle(
        csv_file, verbose)
    num_columns = self.column_bundle.GetNumColumns()

    # Set up sqlite table to store data; one column per CSV column, typed
    # via the DSPL-to-sqlite type map.
    columns_string = (','.join([
        '%s %s' % (column.column_id,
                   _DSPL_TYPE_TO_SQLITE_TYPE[column.data_type])
        for column in self.column_bundle.GetColumnIterator()
    ]))

    if self.verbose:
        print '\nCreating sqlite3 table: %s' % (columns_string)

    self.sqlite_connection = sqlite3.connect(
        os.path.join(self.sqlite_dir, 'db.dat'))
    cursor = self.sqlite_connection.cursor()
    cursor.execute('create table csv_table (%s)' % (columns_string))

    if self.verbose:
        print 'Adding CSV data to SQLite table'

    # ConstructColumnBundle rewound csv_file; skip the header row and load
    # the body.
    body_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    body_csv_reader.next()

    for r, row in enumerate(body_csv_reader):
        transformed_row_values = []

        # Ignore blank rows
        if row:
            # r + 2 converts the 0-based body index into a 1-based file
            # line number (accounting for the header row).
            if len(row) != num_columns:
                raise data_source.DataSourceError(
                    'Number of columns in row %d (%d) does not match number '
                    'expected (%d)' % (r + 2, len(row), num_columns))

            skip_row = False

            for v, row_value in enumerate(row):
                column = self.column_bundle.GetColumnByOrder(v)

                # Handle dropif_val and zeroif_val parameters: a dropif
                # match discards the whole row, a zeroif match replaces
                # just this value with '0'.
                if 'dropif_val' in column.internal_parameters:
                    if row_value == column.internal_parameters[
                            'dropif_val']:
                        skip_row = True
                        break
                elif 'zeroif_val' in column.internal_parameters:
                    if row_value == column.internal_parameters[
                            'zeroif_val']:
                        row_value = '0'

                transformed_row_values.append(
                    _CleanDBValue(row_value, column.data_type))

            if skip_row:
                continue

            # Add row to sqlite table.
            # NOTE(review): the INSERT is built by string interpolation;
            # this relies on _CleanDBValue escaping/quoting each value —
            # confirm it does before trusting untrusted CSV input.
            transformed_values_str = ','.join(transformed_row_values)

            try:
                cursor.execute('insert into csv_table values (%s)' %
                               (transformed_values_str))
            except sqlite3.OperationalError as e:
                raise data_source.DataSourceError(
                    'Error putting line %d of input file into database: %s'
                    '\n%s' % (r + 2, transformed_values_str, str(e)))

    if self.verbose:
        print 'Committing transactions\n'

    self.sqlite_connection.commit()
    cursor.close()

    if self.verbose:
        print 'Checking concept hierarchies'

    self._CheckHierarchies()
def GetTableData(self, query_parameters): """Calculate and return the requested table data. Uses sqlite to group and aggregate the raw data from the original CSV. Args: query_parameters: A QueryParameters object Returns: A TableData object containing the data for the requested table Raises: DataSourceError: If query against sqlite instance fails """ if query_parameters.query_type == data_source.QueryParameters.CONCEPT_QUERY: # This request is for a concept definition table # Filter out total values where_statements = [] for column_id in query_parameters.column_ids: column = self.column_bundle.GetColumnByID(column_id) if column.total_val: where_statements.append( '%s != "%s"' % (column.column_id, column.total_val)) if where_statements: where_clause = 'WHERE ' + ','.join(where_statements) else: where_clause = '' query_str = ('SELECT DISTINCT %s FROM csv_table %s ORDER BY %s' % (','.join(query_parameters.column_ids), where_clause, ','.join(query_parameters.column_ids))) elif query_parameters.query_type == data_source.QueryParameters.SLICE_QUERY: # This request is for a slice table sql_names = [] dimension_sql_names = [] where_statements = [] time_dimension_id = '' # Construct a SQL query that selects all parameters (with the necessary # aggregations), groups by non-time dimensions, and orders by all the # dimensions, with time last. 
for column_id in query_parameters.column_ids: column = self.column_bundle.GetColumnByID(column_id) if column.total_val: where_statements.append( '%s != "%s"' % (column.column_id, column.total_val)) if column.slice_role == 'dimension': sql_names.append(column_id) dimension_sql_names.append(column_id) if column.data_type == 'date': time_dimension_id = column_id elif column.slice_role == 'metric': sql_names.append( '%s(%s) AS %s' % (column.internal_parameters['aggregation'], column_id, column_id)) order_sql_names = ([ d for d in dimension_sql_names if d != time_dimension_id ]) if time_dimension_id: order_sql_names.append(time_dimension_id) # Handle total values in non-selected columns for column in self.column_bundle.GetColumnIterator(): if column.column_id not in query_parameters.column_ids: if column.total_val: where_statements.append( '%s = "%s"' % (column.column_id, column.total_val)) if where_statements: where_clause = 'WHERE ' + ','.join(where_statements) else: where_clause = '' query_str = ( 'SELECT %s FROM csv_table %s GROUP BY %s ORDER BY %s' % (','.join(sql_names), where_clause, ','.join(dimension_sql_names), ','.join(order_sql_names))) else: raise data_source.DataSourceError('Unknown query type: %s' % query_parameters.query_type) if self.verbose: print 'Executing query:\n%s\n' % (query_str) # Execute the query against the sqlite backend cursor = self.sqlite_connection.cursor() try: cursor.execute(query_str) except sqlite3.OperationalError as e: raise data_source.DataSourceError('Error executing query: %s\n%s' % (query_str, str(e))) query_results = [] for row in cursor: query_results.append(list(row)) cursor.close() return data_source.TableData(rows=query_results)
def GetTableData(self, query_parameters):
    """Calculate and return the requested table data.

    Use in-memory data container to group and aggregate the raw data from
    the original CSV.

    Args:
      query_parameters: A QueryParameters object

    Returns:
      A TableData object containing the data for the requested table

    Raises:
      DataSourceError: If the query type is not recognized
    """
    requested_ids = query_parameters.column_ids

    if query_parameters.query_type == data_source.QueryParameters.CONCEPT_QUERY:
        # Concept definition table: list distinct values, skipping each
        # column's total value.
        skip_values = {}

        for requested_id in requested_ids:
            total_val = self.column_bundle.GetColumnByID(
                requested_id).total_val

            if total_val:
                skip_values[requested_id] = [total_val]

        query_results = self.data_container.DistinctValues(
            requested_ids, skip_values)
    elif query_parameters.query_type == data_source.QueryParameters.SLICE_QUERY:
        # Slice table: group by the non-time dimensions, aggregate the
        # metrics, and order by all dimensions with time last.
        selected_columns = []
        dimensions = []
        aggregations = {}
        selected_totals = {}
        time_dimension = ''

        for requested_id in requested_ids:
            column = self.column_bundle.GetColumnByID(requested_id)
            selected_columns.append(requested_id)

            if column.slice_role == 'dimension':
                dimensions.append(requested_id)

                if column.data_type == 'date':
                    time_dimension = requested_id
            elif column.slice_role == 'metric':
                aggregations[requested_id] = (
                    column.internal_parameters['aggregation'])

            if column.total_val:
                selected_totals[column.column_id] = column.total_val

        ordering = [d for d in dimensions if d != time_dimension]

        if time_dimension:
            ordering.append(time_dimension)

        # Totals in the non-selected columns determine which rows should be
        # rolled up into the result.
        unselected_totals = {}

        for column in self.column_bundle.GetColumnIterator():
            if column.column_id in requested_ids:
                continue

            if column.total_val:
                unselected_totals[column.column_id] = column.total_val

        query_results = self.data_container.GroupedValues(
            selected_columns, dimensions, ordering, aggregations,
            unselected_totals, selected_totals)
    else:
        raise data_source.DataSourceError('Unknown query type: %s' %
                                          query_parameters.query_type)

    return data_source.TableData(rows=query_results)
def __init__(self, csv_file, verbose=True):
    """Populate a CSVDataSource object based on a CSV file.

    Note that the caller is responsible for closing the csv_file.

    Args:
      csv_file: A file-like object, opened for reading, that has CSV data in it
      verbose: Print out status messages to stdout

    Raises:
      DataSourceError: If CSV isn't properly formatted
    """
    self.verbose = verbose

    self.column_bundle = csv_utilities.ConstructColumnBundle(
        csv_file, verbose)

    column_ids = [
        column.column_id
        for column in self.column_bundle.GetColumnIterator()
    ]
    num_columns = self.column_bundle.GetNumColumns()

    # Rows are accumulated into an in-memory container keyed by column id.
    self.data_container = DataContainer(column_ids)

    if self.verbose:
        print 'Reading CSV data'

    # ConstructColumnBundle rewound csv_file; skip the header row and load
    # the body.
    body_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    body_csv_reader.next()

    for r, row in enumerate(body_csv_reader):
        transformed_row_values = []

        # Ignore blank rows
        if row:
            # r + 2 converts the 0-based body index into a 1-based file
            # line number (accounting for the header row).
            if len(row) != num_columns:
                raise data_source.DataSourceError(
                    'Number of columns in row %d (%d) does not match number '
                    'expected (%d)' % (r + 2, len(row), num_columns))

            skip_row = False

            for v, row_value in enumerate(row):
                column = self.column_bundle.GetColumnByOrder(v)

                # Handle dropif_val and zeroif_val parameters: a dropif
                # match discards the whole row, a zeroif match zeroes out
                # just this value.
                # NOTE(review): zeroif assigns the float 0.0 here, so a
                # string-typed column would store a float — the sqlite
                # implementation uses the string '0' instead; confirm which
                # is intended.
                if 'dropif_val' in column.internal_parameters:
                    if row_value == column.internal_parameters[
                            'dropif_val']:
                        skip_row = True
                        break
                elif 'zeroif_val' in column.internal_parameters:
                    if row_value == column.internal_parameters[
                            'zeroif_val']:
                        row_value = 0.0

                # Coerce the raw CSV string to the column's declared type.
                if column.data_type == 'integer':
                    typed_row_value = int(row_value)
                elif column.data_type == 'float':
                    typed_row_value = float(row_value)
                else:
                    typed_row_value = row_value

                transformed_row_values.append(typed_row_value)

            if skip_row:
                continue

            self.data_container.AddRow(transformed_row_values)

    if self.verbose:
        print 'Checking concept hierarchies'

    self._CheckHierarchies()