Ejemplo n.º 1
    def testCalculateSlices(self):
        """Check that _CalculateSlices produces the expected column powersets."""
        bundle = data_source.DataSourceColumnBundle(columns=[
            data_source.DataSourceColumn(
                'col1', rollup=True, concept_extension='entity:entity'),
            data_source.DataSourceColumn('col2', rollup=False),
            data_source.DataSourceColumn('col3', rollup=True,
                                         parent_ref='col5'),
            data_source.DataSourceColumn('col4', rollup=True,
                                         parent_ref='col3'),
            data_source.DataSourceColumn('col5', rollup=True)])

        column_sets = data_source_to_dspl._CalculateSlices(bundle)

        # Reduce each slice to the ids of the columns it contains
        actual_ids = [[column.column_id for column in column_set]
                      for column_set in column_sets]

        expected_ids = [['col1', 'col2', 'col3'], ['col1', 'col2', 'col4'],
                        ['col1', 'col2', 'col5'], ['col1', 'col2'],
                        ['col2', 'col3'], ['col2', 'col4'], ['col2', 'col5'],
                        ['col2']]

        # Sort both sides so the comparison is independent of slice ordering
        self.assertEqual(sorted(sorted(ids) for ids in actual_ids),
                         sorted(sorted(ids) for ids in expected_ids))
Ejemplo n.º 2
    def GetColumnBundle(self):
        """Build the six-column bundle used as a shared test fixture."""
        # (column id, keyword arguments) pairs, one per column in the bundle
        column_specs = [
            ('col1', dict(data_type='string', slice_role='dimension',
                          concept_extension='entity:entity', rollup=True)),
            ('col2', dict(data_type='string',
                          concept_extension='geo:location',
                          slice_role='dimension', parent_ref='col6')),
            ('col3', dict(data_type='date', concept_ref='time:year',
                          data_format='yyyy', slice_role='dimension')),
            ('col4', dict(data_type='float', slice_role='metric')),
            ('col5', dict(data_type='integer', slice_role='metric')),
            ('col6', dict(data_type='string', slice_role='dimension',
                          rollup=True))]

        return data_source.DataSourceColumnBundle(columns=[
            data_source.DataSourceColumn(column_id, **kwargs)
            for column_id, kwargs in column_specs])
Ejemplo n.º 3
def ConstructColumnBundle(csv_file, verbose=True):
    """Construct a ColumnBundle from the header information in a CSV file.

  Args:
    csv_file: A seekable, file-like object whose first row is the CSV column
              header and whose second row contains sample data values
    verbose: Print out extra information to stdout

  Returns:
    A data_source.ColumnBundle object populated based on the CSV header

  Raises:
    DataSourceError: If there are any parsing errors or data
                     inconsistencies
  """
    # Get the first and second rows of the CSV. next(reader) works on both
    # Python 2.6+ and Python 3, unlike the Python 2-only reader.next().
    header_csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')
    header_row_values = next(header_csv_reader)
    second_row_values = next(header_csv_reader)
    csv_file.seek(0)

    # Check that the second row has exactly one value per header column
    if len(header_row_values) != len(second_row_values):
        raise data_source.DataSourceError(
            'Number of columns in row 2 (%d) does not match number '
            'expected (%d)' % (len(second_row_values), len(header_row_values)))

    column_bundle = data_source.DataSourceColumnBundle()

    for header_element in header_row_values:
        column_bundle.AddColumn(_HeaderToColumn(header_element))

    num_date_columns = 0
    has_metric_column = False
    column_ids = [
        column.column_id for column in column_bundle.GetColumnIterator()
    ]

    # Iterate through columns, populating and refining DataSourceColumn
    # parameters as necessary. All prints below are single-argument,
    # parenthesized calls, which behave identically on Python 2 and 3.
    for c, column in enumerate(column_bundle.GetColumnIterator()):
        if verbose:
            print('\nEvaluating column %s' % column.column_id)

        # Guess the data type from the sample value if it was not declared
        if not column.data_type:
            column.data_type = (data_source.GuessDataType(
                second_row_values[c], column.column_id))

            if verbose:
                print('Guessing that column %s is of type %s' %
                      (column.column_id, column.data_type))

        # Guess the slice role: numeric columns default to metrics,
        # everything else to dimensions
        if not column.slice_role:
            if column.data_type in ('integer', 'float'):
                column.slice_role = 'metric'
            else:
                column.slice_role = 'dimension'

            if verbose:
                print('Guessing that column %s is a %s' %
                      (column.column_id, column.slice_role))

        # Metrics aggregate by SUM unless the header said otherwise
        if column.slice_role == 'metric':
            has_metric_column = True

            if 'aggregation' not in column.internal_parameters:
                column.internal_parameters['aggregation'] = 'SUM'

                if verbose:
                    print('Guessing that column %s should be aggregated by %s'
                          % (column.column_id,
                             column.internal_parameters['aggregation']))

        # Validate the parent reference and force the parent to be a rollup
        if column.parent_ref:
            if column.parent_ref not in column_ids:
                raise data_source.DataSourceError(
                    'Column %s references a parent not defined in this dataset: %s'
                    % (column.column_id, column.parent_ref))

            parent_column = column_bundle.GetColumnByID(column.parent_ref)

            if not parent_column.rollup:
                parent_column.rollup = True

                if verbose:
                    print(
                        'Making column %s rollup since it is a parent to column %s'
                        % (parent_column.column_id, column.column_id))

        # Infer date format and time concept for date columns
        if column.data_type == 'date':
            num_date_columns += 1

            if not column.data_format:
                column.data_format = (data_source.GuessDateFormat(
                    second_row_values[c]))

            if not column.concept_ref:
                column.concept_ref = (data_source.GuessDateConcept(
                    column.data_format))

            if verbose:
                print(
                    'Guessing that column %s is formatted as %s and '
                    'corresponds to %s' %
                    (column.column_id, column.data_format, column.concept_ref))

    # Warn user if their file will not produce interesting DSPL visualizations
    if num_date_columns == 0:
        warnings.warn('Input file does not have a date column',
                      data_source.DataSourceWarning)

    elif num_date_columns > 1:
        warnings.warn('Input file has more than one date column',
                      data_source.DataSourceWarning)

    if not has_metric_column:
        warnings.warn('Input file does not have any metrics',
                      data_source.DataSourceWarning)

    return column_bundle
Ejemplo n.º 4
 def setUp(self):
     """Create the three-column bundle shared by the tests in this case."""
     columns = [data_source.DataSourceColumn(column_id=name)
                for name in ('col1', 'col2', 'col3')]
     self.column_bundle = data_source.DataSourceColumnBundle(columns)