def create_dpkg(top_level_dict, ev_ob_dict, directory, joint_id):
    """Create the datapackage representation.

    Keyword arguments:
    top_level_dict -- the dictionary with the TOP_LEVEL_INFO
    ev_ob_dict -- the dictionary containing events and objects
    directory -- the directory
    joint_id -- the joint_identifier
    """
    myDP = dp.DataPackage()
    for k, v in top_level_dict.items():
        myDP.descriptor[k] = v
    myDP.descriptor['resources'] = []

    # the events block #
    key = 'events'
    events_table = ev_ob_dict.get(key)
    path = key + '.csv'
    with io.open(directory + os.sep + key + '.csv') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values, row_limit=50, primary_key=joint_id)
    referenced_resource = key + 'Table'
    myDP.descriptor['resources'].append({
        "name": key + 'Table',
        "path": path,
        "schema": schema,
    })

    # the objects block #
    key = 'objects'
    objects_table = ev_ob_dict.get(key)
    path = key + '.csv'
    with io.open(directory + os.sep + key + '.csv') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values, row_limit=50)
    schema['foreignKeys'] = [{
        "fields": joint_id,
        "reference": {
            "datapackage": "",
            "resource": referenced_resource,
            "fields": joint_id
        }
    }]
    myDP.descriptor['resources'].append({
        "name": key + 'Table',
        "path": path,
        "schema": schema,
    })
    return myDP
def create_datapackage(ds):
    # Create datapackage based on dataset.json
    dp = datapackage.DataPackage()
    basepath = '{0}/{1}/{2}'.format(DIR, private_or_public(ds), ds['name'])

    dp.metadata['name'] = ds['name']
    dp.metadata['title'] = ds['label']
    dp.metadata['description'] = ds['description']
    if ds['territories']:
        dp.metadata['countryCode'] = ds['territories']
    dp.metadata['profiles'] = {'fiscal': '*', 'tabular': '*'}

    dp.metadata['resources'] = [{}]
    resource = dp.resources[0]
    resource.metadata['name'] = 'dataset'
    resource.metadata['path'] = 'dataset.csv'

    # Infer schema of dataset.csv file
    with io.open(basepath + '/dataset.csv') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values, row_limit=1000)
    resource.metadata['schema'] = schema

    # Translate mapping
    dp.metadata['mapping'] = transform_dataset(ds)

    return dp
def pre_run(self, data_table):
    if (self.schema is None) and self.infer_schema:
        sample_values = data_table.get_sample(300)
        self.schema = self.schema_model(
            jsontableschema.infer(data_table.headers, sample_values))
    return True, data_table
def infer(data, row_limit, encoding, to_file):
    """Infer a schema from data.

    * data must be a local filepath
    * data must be CSV
    * the file encoding is assumed to be UTF-8 unless an encoding is passed
      with --encoding
    * the first line of data must be headers
    * these constraints are just for the CLI
    """
    if not row_limit:
        row_limit = None

    with io.open(data, mode='r+t', encoding=encoding) as stream:
        try:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
        except UnicodeDecodeError:
            response = "Could not decode the data file as {0}. " \
                       "Please specify an encoding to use with the " \
                       "--encoding argument.".format(encoding)
        else:
            response = jsontableschema.infer(headers, values,
                                             row_limit=row_limit)

    if to_file:
        with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
            dest.write(json.dumps(response, ensure_ascii=False, indent=2))

    click.echo(response)
def infer(data, row_limit, to_file):
    """Infer a schema from data.

    * data must be a local filepath
    * data must be CSV
    * data must be UTF-8 encoded
    * the first line of data must be headers
    * these constraints are just for the CLI
    """
    if not row_limit:
        row_limit = None

    with io.open(data, mode='r+t', encoding='utf-8') as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        response = jsontableschema.infer(headers, values, row_limit=row_limit)

    if to_file:
        with io.open(to_file, mode='w+t', encoding='utf-8') as dest:
            dest.write(json.dumps(response, ensure_ascii=False, indent=2))

    click.echo(response)
def infer_csv(csv_file, outfile, row_limit=0):
    with io.open(outfile, 'w') as fp:
        with io.open(csv_file) as stream:
            headers = stream.readline().rstrip('\n').split(',')
            values = jsontableschema.compat.csv_reader(stream)
            schema = jsontableschema.infer(headers, values, row_limit=row_limit)
            fp.write(six.u(json.dumps(schema, indent=2, ensure_ascii=False)))
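# A hypothetical, self-contained use of infer_csv above (it assumes the same
# module-level imports as that snippet: io, json, six, jsontableschema).
# The CSV content and file paths here are illustrative only.
import io
import json
import os
import tempfile

tmp_dir = tempfile.mkdtemp()
csv_path = os.path.join(tmp_dir, 'people.csv')
schema_path = os.path.join(tmp_dir, 'people-schema.json')

# Write a tiny CSV sample, infer its schema, then read the descriptor back.
with io.open(csv_path, 'w') as fp:
    fp.write(u'id,name,age\n1,alice,31\n2,bob,42\n')

infer_csv(csv_path, schema_path, row_limit=2)

with io.open(schema_path) as fp:
    print(json.load(fp)['fields'])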
def infer_from_df(df, **kwargs):
    # df.iterrows does not preserve types
    h = df.head()
    fields = list(df)
    iterrows = ([str(h[_].values[i]) for _ in fields]
                for i in range(h.shape[0]))
    return infer(fields, iterrows, **kwargs)
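# A minimal sketch of calling infer_from_df above with pandas (pandas is an
# assumption here, as is the module-level `infer` name the helper expects,
# pointed at jsontableschema.infer as in the other examples).
import jsontableschema
import pandas as pd

infer = jsontableschema.infer  # the name infer_from_df resolves at call time

df = pd.DataFrame({'id': [1, 2, 3],
                   'name': ['alice', 'bob', 'carol'],
                   'score': [3.5, 4.0, 2.5]})

schema = infer_from_df(df)
for field in schema['fields']:
    print(field['name'], field['type'])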
def test_infer_explicit_true(self):
    filepath = os.path.join(self.data_dir, 'data_infer.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = jsontableschema.compat.csv_reader(stream)
        schema = jsontableschema.infer(headers, values, explicit=True)
    self.assertTrue(schema['fields'][0].get('constraints'))
def pre_run(self, data_table):
    sample_values = data_table.get_sample(300)
    if (self.schema is None) and self.infer_schema:
        self.schema = self.schema_model(
            jsontableschema.infer(data_table.headers, sample_values))
    if self.schema and self.process_extra_fields:
        self.extra_fields = (set(data_table.headers)).difference(
            set(self.schema.headers))
        infered_schema = jsontableschema.infer(data_table.headers,
                                               sample_values)
        complete_schema_dict = self.schema._to_python()
        for field in infered_schema['fields']:
            if field['name'] in self.extra_fields:
                complete_schema_dict['fields'].append(copy.deepcopy(field))
        self.schema = self.schema_model(complete_schema_dict)
    return True, data_table
def test_infer_schema_primary_key_list(self):
    primary_key = ['id', 'age']
    filepath = os.path.join(self.data_dir, 'data_infer.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = jsontableschema.compat.csv_reader(stream)
        schema = jsontableschema.infer(headers, values,
                                       primary_key=primary_key)
    schema_model = jsontableschema.models.SchemaModel(schema)
    self.assertEqual(schema_model.primaryKey, primary_key)
def test_infer_schema_primary_key_list(self):
    primary_key = ['id', 'age']
    filepath = os.path.join(self.data_dir, 'data_infer.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = jsontableschema.compat.csv_reader(stream)
        schema = jsontableschema.infer(headers, values,
                                       primary_key=primary_key)
    schema_model = jsontableschema.model.SchemaModel(schema)
    self.assertEqual(schema_model.primaryKey, primary_key)
def test_infer_schema_row_limit(self):
    filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = jsontableschema.compat.csv_reader(stream)
        schema = jsontableschema.infer(headers, values, row_limit=4)
    schema_model = jsontableschema.models.SchemaModel(schema)
    self.assertEqual(schema_model.get_field('id')['type'], 'integer')
    self.assertEqual(schema_model.get_field('age')['type'], 'integer')
    self.assertEqual(schema_model.get_field('name')['type'], 'string')
def test_infer_schema_row_limit(self):
    filepath = os.path.join(self.data_dir, 'data_infer_row_limit.csv')
    with io.open(filepath) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = jsontableschema.compat.csv_reader(stream)
        schema = jsontableschema.infer(headers, values, row_limit=4)
    schema_model = jsontableschema.model.SchemaModel(schema)
    self.assertEqual(schema_model.get_field('id')['type'], 'integer')
    self.assertEqual(schema_model.get_field('age')['type'], 'integer')
    self.assertEqual(schema_model.get_field('name')['type'], 'string')
def package_yearly_data():
    csv_file_path = CSV_DATE_DIR + YEARLY_DATA_FILE_CSV
    pkg_file_path = PKG_DATE_DIR + YEARLY_DATA_FILE_PKG

    dp = datapackage.DataPackage()
    dp.descriptor['name'] = 'yearly-gas-price'
    dp.descriptor['title'] = 'Yearly Avg Gas Price'

    with io.open(csv_file_path) as stream:
        headers = stream.readline().rstrip('\n').split(',')
        values = csv.reader(stream)
        schema = infer(headers, values)

    dp.descriptor['resources'] = [
        {
            'name': 'data',
            'path': csv_file_path,
            'schema': schema
        }
    ]

    with open(pkg_file_path, 'w') as f:
        f.write(dp.to_json())
def extra_header(errors, columns, sample, infer_fields=False):
    for column in copy(columns):
        if 'field' not in column:

            # Infer field
            if infer_fields:
                column_sample = []
                for row in sample:
                    value = None
                    if len(row) > column['number']:
                        value = row[column['number']]
                    column_sample.append(value)
                descriptor = infer([column['header']], column_sample)
                column['field'] = Schema(descriptor).fields[0]

            # Add error/remove column
            else:
                message = spec['errors']['extra-header']['message']
                message = message.format(column_number=column['number'])
                errors.append({
                    'code': 'extra-header',
                    'message': message,
                    'row-number': None,
                    'column-number': column['number'],
                })
                columns.remove(column)
def __inspect_table(self, table):

    # Start timer
    start = datetime.datetime.now()

    # Prepare vars
    errors = []
    headers = None
    row_number = 0
    fatal_error = False
    checks = copy(self.__checks)
    source = table['source']
    stream = table['stream']
    schema = table['schema']
    extra = table['extra']

    # Prepare table
    try:
        stream.open()
        sample = stream.sample
        headers = stream.headers
        if self.__filter_checks(checks, type='schema'):
            if schema is None and self.__infer_schema:
                schema = Schema(infer(headers, sample))
        if schema is None:
            checks = self.__filter_checks(checks, type='schema', inverse=True)
    except Exception as exception:
        fatal_error = True
        message = str(exception)
        if isinstance(exception, tabulator.exceptions.SourceError):
            code = 'source-error'
        elif isinstance(exception, tabulator.exceptions.SchemeError):
            code = 'scheme-error'
        elif isinstance(exception, tabulator.exceptions.FormatError):
            code = 'format-error'
        elif isinstance(exception, tabulator.exceptions.EncodingError):
            code = 'encoding-error'
        elif isinstance(exception, tabulator.exceptions.IOError):
            code = 'io-error'
        elif isinstance(exception, tabulator.exceptions.HTTPError):
            code = 'http-error'
        else:
            raise
        errors.append({
            'row': None,
            'code': code,
            'message': message,
            'row-number': None,
            'column-number': None,
        })

    # Prepare columns
    if not fatal_error:
        columns = []
        fields = [None] * len(headers)
        if schema is not None:
            fields = schema.fields
        iterator = zip_longest(headers, fields, fillvalue=_FILLVALUE)
        for number, (header, field) in enumerate(iterator, start=1):
            column = {'number': number}
            if header is not _FILLVALUE:
                column['header'] = header
            if field is not _FILLVALUE:
                column['field'] = field
            columns.append(column)

    # Head checks
    if not fatal_error:
        head_checks = self.__filter_checks(checks, context='head')
        for check in head_checks:
            if not columns:
                break
            check['func'](errors, columns, sample)
        for error in errors:
            error['row'] = None

    # Body checks
    if not fatal_error:
        states = {}
        colmap = {column['number']: column for column in columns}
        body_checks = self.__filter_checks(checks, context='body')
        with stream:
            for row_number, headers, row in stream.iter(extended=True):
                columns = []
                iterator = zip_longest(headers, row, fillvalue=_FILLVALUE)
                for number, (header, value) in enumerate(iterator, start=1):
                    colref = colmap.get(number, {})
                    column = {'number': number}
                    if header is not _FILLVALUE:
                        column['header'] = colref.get('header', header)
                    if 'field' in colref:
                        column['field'] = colref['field']
                    if value is not _FILLVALUE:
                        column['value'] = value
                    columns.append(column)
                for check in body_checks:
                    if not columns:
                        break
                    state = states.setdefault(check['code'], {})
                    check['func'](errors, columns, row_number, state)
                for error in reversed(errors):
                    if 'row' in error:
                        break
                    error['row'] = row
                if row_number >= self.__row_limit:
                    break
                if len(errors) >= self.__error_limit:
                    break

    # Stop timer
    stop = datetime.datetime.now()

    # Compose report
    errors = errors[:self.__error_limit]
    report = copy(extra)
    report.update({
        'time': round((stop - start).total_seconds(), 3),
        'valid': not bool(errors),
        'error-count': len(errors),
        'row-count': row_number,
        'headers': headers,
        'source': source,
        'errors': errors,
    })

    return report