def test_infer_schema_with_row_limit():
    descriptor = infer('data/data_infer_row_limit.csv', limit=4)
    assert descriptor == {
        'fields': [
            {'name': 'id', 'type': 'integer', 'format': 'default'},
            {'name': 'age', 'type': 'integer', 'format': 'default'},
            {'name': 'name', 'type': 'string', 'format': 'default'}],
        'missingValues': [''],
    }
def test_infer_schema_utf8():
    descriptor = infer('data/data_infer_utf8.csv')
    assert descriptor == {
        'fields': [
            {'name': 'id', 'type': 'integer', 'format': 'default'},
            {'name': 'age', 'type': 'integer', 'format': 'default'},
            {'name': 'name', 'type': 'string', 'format': 'default'}],
        'missingValues': [''],
    }
Example #3
    def test_infer_schema_row_limit(self):
        filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = tableschema.compat.csv_reader(stream)
            schema = tableschema.infer(headers, values, row_limit=4)
        schema_model = tableschema.Schema(schema)

        self.assertEqual(schema_model.get_field('id').type, 'integer')
        self.assertEqual(schema_model.get_field('age').type, 'integer')
        self.assertEqual(schema_model.get_field('name').type, 'string')
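This example uses the older (pre-1.0) tableschema API, where infer() took a header list and a row iterable and the limit was spelled row_limit. With the path-based API used elsewhere on this page, the same check collapses to a one-liner; a minimal sketch over the same fixture file:

import tableschema

# Path-based inference: `limit` caps the number of rows sampled.
descriptor = tableschema.infer('data/data_infer_row_limit.csv', limit=4)
assert {f['name']: f['type'] for f in descriptor['fields']} == {
    'id': 'integer', 'age': 'integer', 'name': 'string'}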
Example #4
    def test_infer_schema_primary_key_list(self):
        primary_key = ['id', 'age']
        filepath = os.path.join(self.data_dir, 'data_infer.csv')
        with io.open(filepath) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = tableschema.compat.csv_reader(stream)
            schema = tableschema.infer(headers,
                                       values,
                                       primary_key=primary_key)
        schema_model = tableschema.Schema(schema)

        self.assertEqual(schema_model.primary_key, primary_key)
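In the resulting descriptor the composite key is stored under 'primaryKey', which tableschema.Schema exposes as the primary_key property. A minimal sketch with a hand-written descriptor:

import tableschema

schema = tableschema.Schema({
    'fields': [{'name': 'id'}, {'name': 'age'}, {'name': 'name'}],
    'primaryKey': ['id', 'age'],
})
assert schema.primary_key == ['id', 'age']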
Example #5
def get_schema_dict(filename):
    try:
        s = Schema(infer(filename))
    except Exception:
        # Schema inference failed (e.g. an unreadable or non-tabular file).
        return None

    # work with schema in dict form
    s_dict = s.descriptor
    s_dict['foreignKeys'] = []
    for d in s_dict['fields']:
        if d['name'] in FOREIGN_KEYS:
            s_dict['foreignKeys'].append(FOREIGN_KEYS[d['name']])
    return s_dict
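The snippet relies on a module-level FOREIGN_KEYS constant that is not shown. A minimal sketch of the expected shape, mapping a column name to a Table Schema foreignKey entry (the column and resource names below are hypothetical):

from tableschema import Schema, infer

# Hypothetical mapping: column name -> Table Schema foreignKey descriptor.
FOREIGN_KEYS = {
    'country_id': {
        'fields': 'country_id',
        'reference': {'resource': 'countries', 'fields': 'id'},
    },
}

schema_dict = get_schema_dict('data/records.csv')  # hypothetical file
if schema_dict is not None:
    print(schema_dict['foreignKeys'])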
Example #6
def infer(data, row_limit, encoding, to_file):
    """Infer a schema from data.

    * data must be a local filepath
    * data must be CSV
    * the file encoding is assumed to be UTF-8 unless an encoding is passed
      with --encoding
    * the first line of data must be headers
    * these constraints are just for the CLI
    """
    descriptor = tableschema.infer(data, encoding=encoding, limit=row_limit)
    if to_file:
        with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
            dest.write(json.dumps(descriptor, ensure_ascii=False, indent=4))
    click.echo(descriptor)
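Only the command body is shown; in the CLI this function is wrapped with click decorators that supply data, row_limit, encoding and to_file. A plausible wiring sketch (the option names and defaults below are assumptions, not the package's exact definitions):

import click

@click.command()
@click.argument('data')
@click.option('--row-limit', default=100, type=int)
@click.option('--encoding', default=None)
@click.option('--to-file', default=None)
def infer(data, row_limit, encoding, to_file):
    ...  # body as in the example above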
Example #8
def add_extended_metadata(filename, metadata):
    """Update the given metadata dict with additional tableschema metadata.
    This is only available for files that can be read by tableschema, which are
    only tsvs and csvs. Tableschema doesn't add much, but it could be handy
    if it can detect extended types like locations."""
    metadata['previewbytes'] = get_preview_byte_count(filename)
    try:
        ts_info = tableschema.Schema(tableschema.infer(filename)).descriptor

        new_field_definitions = []
        for m, ts in zip(metadata['field_definitions'], ts_info['fields']):
            m['format'] = ts['format']
            new_field_definitions.append(m)
        metadata['field_definitions'] = new_field_definitions
    except tabulator.exceptions.FormatError:
        pass
    return metadata
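A usage sketch, assuming get_preview_byte_count is defined elsewhere in the module and the incoming metadata already carries field_definitions in file order (the filename and field names below are hypothetical):

metadata = {
    'field_definitions': [{'name': 'id'}, {'name': 'name'}],
}
metadata = add_extended_metadata('data/records.csv', metadata)
# Each field definition now carries the format detected by tableschema.
print([d.get('format') for d in metadata['field_definitions']])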
Example #9
def test_infer_increase_limit_issue_212():
    descriptor = infer('data/data_infer_increase_limit.csv', limit=200)
    assert descriptor == {
        'fields': [
            {
                'name': 'a',
                'type': 'integer',
                'format': 'default'
            },
            {
                'name': 'b',
                'type': 'number',
                'format': 'default'
            },
        ],
        'missingValues': [''],
    }
Example #10
def test_infer_schema_with_missing_values_default():
    descriptor = infer('data/data_infer_missing_values.csv')
    assert descriptor == {
        'fields': [{
            'name': 'id',
            'type': 'string',
            'format': 'default'
        }, {
            'name': 'age',
            'type': 'integer',
            'format': 'default'
        }, {
            'name': 'name',
            'type': 'string',
            'format': 'default'
        }],
        'missingValues': [''],
    }
Example #11
def test_infer_xlsx_file_with_boolean_column_issue_203():
    descriptor = infer('data/data_infer_boolean.xlsx')
    assert descriptor == {
        'fields': [{
            'name': 'number',
            'type': 'integer',
            'format': 'default'
        }, {
            'name': 'string',
            'type': 'string',
            'format': 'default'
        }, {
            'name': 'boolean',
            'type': 'boolean',
            'format': 'default'
        }],
        'missingValues': [''],
    }
Example #12
def analyze_dataframe(filename, foreign_keys=None):
    # Pandas analysis
    df = pandas.read_csv(filename, sep='\t')
    pandas_info = df.describe(include='all')
    # Tableschema analysis
    ts_info = tableschema.Schema(tableschema.infer(filename)).descriptor

    column_metadata = []
    for column in ts_info['fields'][:10]:
        df_metadata = column.copy()
        col_name = column['name']
        df_metadata.update(get_pandas_field_metadata(pandas_info, col_name))
        df_metadata.update(get_foreign_key(foreign_keys, column))
        column_metadata.append(df_metadata)

    dataframe_metadata = {
        'name': 'Data Dictionary',
        # df.shape[0] seems to have issues determining rows
        'numrows': len(df.index),
        'numcols': df.shape[1],
        'previewbytes': get_preview_byte_count(filename),
        'field_definitions': column_metadata,
        'labels': {
            'name': 'Column Name',
            'type': 'Data Type',
            'format': 'Format',
            'count': 'Number of non-null entries',
            '25': '25th Percentile',
            '50': '50th Percentile',
            '75': '75th Percentile',
            'std': 'Standard Deviation',
            'mean': 'Mean Value',
            'min': 'Minimum Value',
            'max': 'Maximum Value',
            'unique': 'Unique Values',
            'top': 'Top Common',
            'frequency': 'Frequency of Top Common Value',
            'reference': 'Link to resource definition'
        }
    }
    return dataframe_metadata
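A minimal usage sketch, assuming a tab-separated input file and the helper functions referenced above (the filename is hypothetical):

meta = analyze_dataframe('data/records.tsv')
print(meta['numrows'], meta['numcols'])
for column in meta['field_definitions']:
    print(column['name'], column['type'])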
Example #13
def main(input_filepath):
    """ Loads the data from input_filepath and infers the data schema,
        finally saving a JSON table schema representing your data.
        (recommended to run this file with a sliced dataset of max 100 lines)
    """
    logger = logging.getLogger(__name__)
    logger.info('making data schema for workbench')
    logger.info(
        '(recommended to run this file with a sliced dataset of max 100 lines)'
    )

    with io.open(input_filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        valid_headers = True
        for header in headers:
            if not header.islower():
                logger.error("ensure that all header names are lowercase: %s",
                             header)
                valid_headers = False
        if valid_headers:
            values = csv.reader(stream)
            schema = infer(headers, values)
            schema_to_json(schema, logger)
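schema_to_json is not defined in this example; a minimal sketch of what such a helper might look like (the output path below is an assumption):

import io
import json

def schema_to_json(schema, logger, output_filepath='schema.json'):
    # Persist the inferred descriptor as pretty-printed JSON.
    with io.open(output_filepath, mode='w', encoding='utf-8') as dest:
        dest.write(json.dumps(schema, ensure_ascii=False, indent=4))
    logger.info('saved schema to %s', output_filepath)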
Example #14
def test_check_type_boolean_string_tie():
    descriptor = infer([['f'], ['stringish']], headers=['field'])
    assert descriptor['fields'][0]['type'] == 'string'