def write(self, table, data):
    """Write data to table.

    Parameters
    ----------
    table: str
        Table name.
    data: list
        List of data tuples.
    """
    # Convert each incoming tuple into a dict keyed by field name,
    # casting every cell to its schema-declared type on the way.
    model = SchemaModel(self.describe(table))
    converted = []
    for row in data:
        record = {}
        for position, field in enumerate(model.fields):
            cell = row[position]
            try:
                cell = model.cast(field['name'], cell)
            except InvalidObjectType:
                # Values the model cannot cast as objects arrive
                # JSON-encoded; decode them instead.
                cell = json.loads(cell)
            record[field['name']] = cell
        converted.append(record)

    # Bulk-insert the converted rows into the backing table.
    dbtable = self.__get_dbtable(table)
    dbtable.insert().execute(converted)
def __init__(self, infile, spec, orig_spec, validate=False, debug=False):
    """Capture the input stream, spec and table schema used while iterating."""
    self.infile = infile
    self.spec = spec
    # Schema comes from the ORIGINAL spec, not the (possibly mutated) one.
    self.table_schema = SchemaModel(orig_spec['schema'])
    self.validate = validate
    self.debug = debug
    self.stopped = False
def check_resource_schema(self, default_resource, resource):
    """Check that user resource schema contains all the mandatory fields.

    Raises
    ------
    ValueError
        If an inflexible resource's uncustomizable fields were changed,
        or if required headers are missing from the user schema.
    """

    def get_uncustomizable_fields(schema):
        # Project each field down to the keys users may not change,
        # sorted by name so the two schemas compare order-independently.
        uncustomizable = ['constraints', 'format', 'name', 'type']
        field_filter = lambda field: {key: val for key, val in field.items()
                                      if key in uncustomizable}
        fields = [field_filter(field) for field in schema.fields]
        fields = sorted(fields, key=lambda k: k['name'])
        # BUG FIX: the filtered fields were computed but never returned,
        # so both sides of the comparison below were None and the
        # inflexible-resource check could never fire.
        return fields

    resource_schema = SchemaModel(resource.descriptor['schema'])
    default_schema_dict = default_resource.descriptor['schema']
    if default_resource.descriptor['name'] == 'source_file':
        # The generic 'data' field is renamed to this run's data key.
        for field in default_schema_dict['fields']:
            if field['name'] == 'data':
                field['name'] = self.data_key
    default_schema = SchemaModel(default_schema_dict)

    if default_resource.descriptor['name'] in self.inflexible_resources:
        if get_uncustomizable_fields(default_schema) != \
                get_uncustomizable_fields(resource_schema):
            # BUG FIX: missing space between the adjacent string
            # literals produced "subject tochange".
            msg = ('The fields for "{0}" are not subject to '
                   'change').format(resource.local_data_path)
            raise ValueError(msg, resource.local_data_path)
    else:
        required_headers = set(default_schema.required_headers)
        resource_headers = set(resource_schema.headers)
        if not required_headers.issubset(resource_headers):
            missing_headers = required_headers.difference(resource_headers)
            # BUG FIX: missing space produced "processingbut".
            msg = ('Fields [{0}] are needed for internal processing '
                   'but are missing from {1}.'
                   ).format(','.join(missing_headers),
                            resource.local_data_path)
            raise ValueError(msg, resource.local_data_path)
def schema_validator(resource):
    """Yield rows from *resource*, casting every cell against its schema.

    Logs and re-raises InvalidCastError on the first value that fails.
    """
    model = SchemaModel(resource.spec['schema'])
    for row in resource:
        for field_name, cell in row.items():
            try:
                model.cast(field_name, cell)
            except InvalidCastError:
                logging.error('Bad value %r for field %s', cell, field_name)
                raise
        yield row
class Schema(API):
    """Processor to add types to row.

    Parameters
    ----------
    schema: str/dict
        Schema as in https://github.com/okfn/jsontableschema-py#model.
        If schema is None processor will cast values using type detection.
    """

    # Public

    def __init__(self, schema=None):
        self.__schema = SchemaModel(schema) if schema is not None else None

    def process(self, iterator):
        if self.__schema is not None:
            # Cast the whole row against the declared schema.
            iterator.values = tuple(
                self.__schema.convert_row(*iterator.values))
        else:
            # No schema given: fall back to per-value type detection.
            iterator.values = tuple(helpers.parse_value(value)
                                    for value in iterator.values)

    def handle(self, iterator):
        pass  # pragma: no cover
def export_package(storage, descriptor, datapackage_name):
    """Export Data Package from storage.

    Parameters
    ----------
    storage: object
        Storage object.
    descriptor: str
        Path where to store descriptor.
    datapackage_name: str
        Name of the exported datapackage.
    """
    # All data files live next to the descriptor.
    base = os.path.dirname(descriptor)
    resources = []
    mapping = {}

    for table in storage.tables:
        # Resolve the table back to a relative path (and optional name).
        schema = storage.describe(table)
        path, name = _restore_path(table)
        fullpath = os.path.join(base, path)
        if name is not None:
            mapping[table] = name

        # Dump the table's rows as CSV, headers first.
        _ensure_dir(fullpath)
        with io.open(fullpath, mode=_write_mode, newline=_write_newline,
                     encoding=_write_encoding) as stream:
            model = SchemaModel(deepcopy(schema))
            writer = csv.writer(stream)
            writer.writerow(model.headers)
            for row in storage.read(table):
                writer.writerow(row)

        # Record the resource entry for the descriptor.
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write the datapackage descriptor itself.
    resources = _restore_resources(mapping, resources)
    _ensure_dir(descriptor)
    with io.open(descriptor, mode=_write_mode,
                 encoding=_write_encoding) as stream:
        contents = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(contents, stream, indent=4)
class ResourceIterator(object):
    """Iterate newline-delimited JSON rows read from *infile*, optionally
    validating every value against the resource's table schema."""

    def __init__(self, infile, spec, orig_spec, validate=False, debug=False):
        self.spec = spec
        self.table_schema = SchemaModel(orig_spec['schema'])
        self.validate = validate
        self.infile = infile
        self.debug = debug
        self.stopped = False

    def __iter__(self):
        return self

    def __next__(self):
        """Read, decode and (optionally) validate the next JSON line.

        Raises StopIteration on a blank line or once exhausted, and
        ValueError when validation fails.
        """
        if self.stopped:
            raise StopIteration()
        if self.debug:
            logging.error('WAITING')
        line = self.infile.readline().strip()
        if self.debug:
            logging.error('INGESTING: %r', line)
        if line == '':
            # A blank line marks end-of-stream.
            self.stopped = True
            raise StopIteration()
        line = json.loads(line)
        if self.validate:
            for k, v in line.items():
                try:
                    self.table_schema.cast(k, v)
                except (InvalidCastError, TypeError):
                    field = self.table_schema.get_field(k)
                    # BUG FIX: ValueError was given logging-style '%s'
                    # placeholders with separate args, which are never
                    # interpolated; format the message explicitly.
                    if field is None:
                        raise ValueError(
                            'Validation failed: No such field %s' % k)
                    else:
                        raise ValueError(
                            'Validation failed: Bad value %r '
                            'for field %s with type %s'
                            % (v, k, field.get('type')))
        return line

    def next(self):
        # Python 2 compatibility alias.
        return self.__next__()
def __init__(self, schema):
    """Keep the raw schema, build its model, wrap each field and locate
    the species fields."""
    self.data = schema
    self.schema_model = SchemaModel(schema)
    self.fields = [SchemaField(field) for field in self.schema_model.fields]
    self.species_fields = self.find_species_fields(self)
def assert_conforms_to_schema(schema, doc):
    """Validate dict *doc* against *schema* and return it with every value
    cast to its schema type.

    Raises Exception with a descriptive message on any mismatch.
    """
    assert isinstance(doc, dict), "invalid doc: {}".format(doc)

    # First pass: whole-row validation in schema field order.
    row = [doc[field["name"]] for field in schema["fields"]]
    try:
        Schema(schema).cast_row(row)
    except Exception as exc:
        logging.exception(exc)
        raise Exception(
            "row does not conform to schema\nrow='{}'\nschema='{}'".format(
                json.dumps(row), json.dumps(schema)))

    # Second pass: per-attribute casting, accumulating the converted doc.
    model = SchemaModel(schema)
    cast_doc = {}
    for attr, raw in doc.items():
        try:
            cast_doc[attr] = model.cast(attr, raw)
        except Exception as exc:
            logging.exception(exc)
            raise Exception("doc attribute '{}' with value '{}' "
                            "does not conform to schema '{}'".format(
                                *map(json.dumps, [attr, raw, schema])))
    return cast_doc
def update_sources_period(self, new_sources):
    """Overwrite source_file with the identified period_id"""
    # Locate the source resource's schema inside the datapackage.
    source_resource = utilities.get_datapackage_resource(self.source_file,
                                                         self.datapackage)
    source_idx = self.datapackage.resources.index(source_resource)
    schema_dict = \
        self.datapackage.resources[source_idx].descriptor['schema']

    # Make sure the schema declares the period_id field before writing.
    updates = {'fields': [{'name': 'period_id',
                           'type': 'string',
                           'title': 'The period source data is relevant for.'}]}
    utilities.deep_update_dict(schema_dict, updates)
    schema = SchemaModel(schema_dict)

    # Rewrite the source file: header row, then one row per source dict.
    with compat.UnicodeWriter(self.source_file) as out:
        out.writerow(schema.headers)
        for row in utilities.dicts_to_schema_rows(new_sources, schema):
            out.writerow(row)
def verify_csvimport(args):
    """Validate the input CSV file and schema referenced by *args*.

    Returns the loaded SchemaModel (with an ``nheaders`` attribute
    attached) or exits the process on any validation failure.
    """
    if not os.path.exists(args.csvfile):
        LOG.error('input CSV file %s does not exist' % args.csvfile)
        exit(-1)

    # BUG FIX: when args.schema is an existing file path it was still
    # suffixed with '.json' (do_csvimport uses the path as-is); use the
    # given path directly for consistency.
    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
    if not os.path.exists(schemafile):
        # BUG FIX: the adjacent literals 'doesn' 't' concatenated to
        # "doesnt"; the message now carries a real apostrophe.
        LOG.error('This schema file %s doesn\'t exist in current '
                  'directory or csv_schemas directory'
                  % (args.schema + '.json'))
        exit(-1)

    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        if 'account' not in schema.headers and args.accountname is None:
            LOG.error('schema headers: %s' % schema.headers)
            LOG.error(
                'This schema does not have an account column and no '
                'account name was provided'
            )
            exit(-1)
        with open(schemafile, 'r') as sf:
            schemacontent = json.load(sf)
        # Number of header lines to skip; defaults to 1 when unset.
        setattr(schema, 'nheaders', schemacontent.get('nheaders', 1))
        return schema
    except InvalidSchemaError as e:
        LOG.error('Invalid CSV schema %s' % e)
        exit(-1)
def do_csvimport(args, client=None):
    """Import transactions from args.csvfile into the budget.

    The CSV layout is described by the schema named in args.schema.
    Duplicate transactions are skipped unless args.import_duplicates.
    """
    if client is None:
        client = clientfromargs(args)
    logger = get_logger(args)

    logger.debug('selected schema %s' % (args.schema,))
    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
    if not os.path.exists(schemafile):
        # BUG FIX: 'doesn''t' is adjacent-literal concatenation yielding
        # "doesnt"; use a real apostrophe.
        logger.error("This schema doesn't exist in csv_schemas")
        exit(-1)
    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        with open(schemafile, 'r') as sf:
            schemacontent = json.load(sf)
            # Number of header lines to skip before the data rows.
            nheaders = schemacontent.get('nheaders', 1)
    except InvalidSchemaError:
        logger.error('Invalid CSV schema')
        raise
    logger.debug('schema headers %s' % schema.headers)

    if 'account' not in schema.headers and args.accountname is None:
        logger.error('This schema does not have an account column and no '
                     'account name was provided')
        exit(-1)

    # Lookup tables for the budget's entities.
    accounts = {x.account_name: x for x in client.budget.be_accounts}
    payees = {p.name: p for p in client.budget.be_payees}
    mastercategories_perid = {m.id: m
                              for m in client.budget.be_master_categories}
    subcategories = {}
    for s in client.budget.be_subcategories:
        m = mastercategories_perid[s.entities_master_category_id]
        # Categories are addressed as "Master:Sub".
        subcategories[m.name + ':' + s.name] = s

    def getaccount(accountname):
        """Look up an account by name; exit on failure."""
        try:
            logger.debug('searching for account %s' % accountname)
            return accounts[accountname]
        except KeyError:
            # BUG FIX: 'Couldn''t' concatenated to "Couldnt".
            logger.error("Couldn't find this account: %s" % accountname)
            exit(-1)

    def getpayee(payeename):
        """Look up a payee by name, creating it when missing."""
        try:
            logger.debug('searching for payee %s' % payeename)
            return payees[payeename]
        except KeyError:
            logger.debug("Couldn't find this payee: %s" % payeename)
            payee = Payee(name=payeename)
            client.budget.be_payees.append(payee)
            return payee

    def getsubcategory(categoryname):
        """Look up a subcategory by "Master:Sub" name; exit on failure."""
        try:
            logger.debug('searching for subcategory %s' % categoryname)
            return subcategories[categoryname]
        except KeyError:
            logger.debug("Couldn't find this category: %s" % categoryname)
            exit(-1)

    entities_account_id = None
    if 'account' not in schema.headers:
        # No per-row account column: resolve the account once up front.
        entities_account_id = getaccount(args.accountname).id
    amount = None
    if 'inflow' in schema.headers and 'outflow' in schema.headers:
        pass
    elif 'amount' in schema.headers:
        pass
    else:
        logger.error("This schema doesn't provide an amount column or "
                     '(inflow,outflow) columns')
        exit(-1)

    csvrow = namedtuple('CSVrow', field_names=schema.headers)
    transactions = []
    imported_date = datetime.now().date()

    logger.debug('OK starting the import from %s '
                 % os.path.abspath(args.csvfile))
    with open(args.csvfile, 'r') as inputfile:
        # Skip the configured number of header lines.
        header = []
        for i in range(0, nheaders):
            header.append(inputfile.readline())
        for row in csv.reader(inputfile):
            if sys.version[0] == '2':
                row = [cell.decode('utf-8') for cell in row]
            if all(map(lambda x: x.strip() == '', row)):
                # Skip entirely blank lines.
                continue
            logger.debug('read line %s' % row)
            result = csvrow(*list(schema.convert_row(*row, fail_fast=True)))
            if 'account' in schema.headers:
                entities_account_id = getaccount(result.account).id
            if entities_account_id is None:
                # BUG FIX: garbled "in the an account column" message.
                logger.error(
                    'No account id, the account %s in the account column '
                    'was not recognized' % result.account)
                exit(-1)
            if 'inflow' in schema.headers and 'outflow' in schema.headers:
                amount = result.inflow - result.outflow
            elif 'amount' in schema.headers:
                amount = result.amount

            if 'category' in schema.headers and result.category:
                entities_subcategory_id = getsubcategory(result.category).id
            else:
                entities_subcategory_id = None
            if 'payee' in schema.headers:
                imported_payee = result.payee
            else:
                imported_payee = ''
            entities_payee_id = getpayee(imported_payee).id
            if 'memo' in schema.headers:
                memo = result.memo
            else:
                memo = ''

            transaction = Transaction(
                entities_account_id=entities_account_id,
                amount=amount,
                date=result.date,
                entities_payee_id=entities_payee_id,
                entities_subcategory_id=entities_subcategory_id,
                imported_date=imported_date,
                imported_payee=imported_payee,
                memo=memo,
                source="Imported"
            )
            if args.import_duplicates or (
                    transaction not in client.budget.be_transactions):
                logger.debug('Appending transaction %s '
                             % transaction.get_dict())
                transactions.append(transaction)
            else:
                logger.debug('Duplicate transaction found %s '
                             % transaction.get_dict())
    client.add_transactions(transactions)
def __init__(self, schema=None):
    """Store a SchemaModel when a schema is given, otherwise None."""
    self.__schema = SchemaModel(schema) if schema is not None else None
class SchemaField:
    """
    Utility class for a field in a schema.
    It uses the schema types of
    https://github.com/frictionlessdata/jsontableschema-py#types
    for validation.
    """
    # For most of the type we use the jsontableschema ones
    # TODO: SchemaModel is deprecated in favor of
    # jsontableschema.schema.Schema but there's no _type_map!
    BASE_TYPE_MAP = SchemaModel._type_map()
    # except for anything date.
    BASE_TYPE_MAP['date'] = DayFirstDateType
    BASE_TYPE_MAP['datetime'] = DayFirstDateTimeType
    # and string
    BASE_TYPE_MAP['string'] = NotBlankStringType

    WL_TYPE_MAP = {}

    def __init__(self, data):
        self.data = data
        self.name = self.data.get('name')
        # We want to throw an exception if there is no name
        if not self.name:
            raise FieldSchemaError("A field without a name: {}".format(
                json.dumps(data)))
        # wl specific
        self.wl = WLSchema(self.data.get('wl'))
        # set the type: wl type as precedence
        type_class = self.WL_TYPE_MAP.get(
            self.wl.type) or self.BASE_TYPE_MAP.get(self.data.get('type'))
        self.type = type_class(self.data)
        self.constraints = SchemaConstraints(self.data.get('constraints', {}))

    # implement some dict like methods
    def __getitem__(self, item):
        return self.data.__getitem__(item)

    def get(self, k, d=None):
        return self.data.get(k, d)

    @property
    def title(self):
        return self.data.get('title')

    @property
    def column_name(self):
        return self.name

    @property
    def required(self):
        return self.constraints.required

    @property
    def is_species(self):
        return self.wl.is_species_type()

    @property
    def species_type(self):
        result = None
        if self.is_species:
            return self.wl.species_type or 'all'
        return result

    def cast(self, value):
        """
        Returns a native Python object of the expected format.
        Will throw an exception if the value doesn't comply with any
        constraints.

        See for details:
        https://github.com/frictionlessdata/jsontableschema-py#types

        This method is mainly a helper for the validation_error
        :param value:
        :return:
        """
        if isinstance(value, six.string_types) and not isinstance(
                value, six.text_type):
            # the StringType accepts only unicode
            value = six.u(value)
        elif isinstance(value, six.integer_types):
            value = '{}'.format(value)
        return self.type.cast(value)

    def validate(self, value):
        return self.validation_error(value)

    def validation_error(self, value):
        """
        Return an error message if the value is not valid according to the
        schema. It relies on exceptions thrown by the 'cast' method of the
        Type class.
        :param value:
        :return: None if value is valid or an error message string
        """
        error = None
        # override the integer validation. The default message is a bit
        # cryptic if there's an error casting a string like '1.2' into
        # an int.
        if isinstance(self.type, types.IntegerType):
            if not is_blank_value(value):
                not_integer = False
                try:
                    casted = self.cast(value)
                    # there's also the case where a float 1.2 is
                    # successfully casted into 1 (ex: int(1.2) = 1)
                    if str(casted) != str(value):
                        not_integer = True
                except Exception:
                    not_integer = True
                if not_integer:
                    return 'The field "{}" must be a whole number.'.format(
                        self.name)
        try:
            self.cast(value)
        except Exception as e:
            error = "{}".format(e)
            # Override the default enum exception message to include all
            # possible values.
            # BUG FIX: the old `if error.find('enum array')` was truthy
            # whenever the substring was NOT found (find returns -1) and
            # raised AttributeError when error was None (valid value);
            # use a proper membership test inside the except block.
            if 'enum array' in error and self.constraints.enum:
                values = [str(v) for v in self.constraints.enum]
                # BUG FIX: message read "one the following".
                error = "The value must be one of the following: {}".format(
                    values)
        return error

    def __str__(self):
        return '{}'.format(self.name)
def do_csvimport(args,client=None):
    """Import transactions from args.csvfile into the budget.

    The CSV layout is described by the schema named in args.schema.
    Duplicate transactions are skipped unless args.import_duplicates.
    """
    if client is None:
        client = clientfromargs(args)
    logger=get_logger(args)
    logger.debug('selected schema %s' % (args.schema,))
    # Resolve the schema file: an existing path wins, else look in
    # the bundled csv_schemas directory.
    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
    if not os.path.exists(schemafile):
        # NOTE(review): 'doesn''t' is adjacent-literal concatenation and
        # renders as "doesnt" — looks like an apostrophe-escaping bug.
        logger.error('This schema doesn''t exist in csv_schemas')
        exit(-1)
    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        with open(schemafile,'r') as sf:
            schemacontent = json.load(sf)
            # Number of header lines the CSV carries; defaults to 1.
            try:
                nheaders = schemacontent['nheaders']
            except KeyError:
                nheaders = 1
    except InvalidSchemaError:
        logger.error('Invalid CSV schema')
        raise
    logger.debug('schema headers %s' % schema.headers)
    # Without an 'account' column every row needs an explicit account name.
    if 'account' not in schema.headers and args.accountname is None:
        logger.error('This schema does not have an account column and no account name was provided')
        exit(-1)
    # Lookup tables for the budget's entities.
    accounts = {x.account_name: x for x in client.budget.be_accounts}
    payees = {p.name: p for p in client.budget.be_payees}
    mastercategories_perid = {m.id: m for m in client.budget.be_master_categories}
    subcategories = {}
    for s in client.budget.be_subcategories:
        m=mastercategories_perid[s.entities_master_category_id]
        # Subcategories are addressed as "Master:Sub".
        subcategories[m.name+':'+s.name]=s

    def getaccount(accountname):
        # Look up an account by name; exits the process on failure.
        try:
            logger.debug('searching for account %s' % accountname)
            return accounts[accountname]
        except KeyError:
            # NOTE(review): 'Couldn''t' renders as "Couldnt" (same
            # apostrophe-concatenation issue as above).
            logger.error('Couldn''t find this account: %s' % accountname)
            exit(-1)

    def getpayee(payeename):
        # Look up a payee by name, creating it when missing.
        try:
            logger.debug('searching for payee %s' % payeename)
            return payees[payeename]
        except KeyError:
            logger.debug('Couldn''t find this payee: %s' % payeename)
            payee=Payee(name=payeename)
            client.budget.be_payees.append(payee)
            return payee

    def getsubcategory(categoryname):
        # Look up a subcategory by "Master:Sub" name; exits on failure.
        try:
            logger.debug('searching for subcategory %s' % categoryname)
            return subcategories[categoryname]
        except KeyError:
            get_logger(args).debug('Couldn''t find this category: %s' % categoryname)
            exit(-1)

    # No per-row account column: resolve the account once up front.
    if 'account' not in schema.headers:
        entities_account_id = getaccount(args.accountname).id
    # The schema must supply either (inflow, outflow) or a single amount.
    if 'inflow' in schema.headers and 'outflow' in schema.headers:
        pass
    elif 'amount' in schema.headers:
        pass
    else:
        logger.error('This schema doesn''t provide an amount column or (inflow,outflow) columns')
        exit(-1)
    csvrow = namedtuple('CSVrow', field_names=schema.headers)
    transactions = []
    imported_date=datetime.now().date()
    get_logger(args).debug('OK starting the import from %s '%os.path.abspath(args.csvfile))
    with open(args.csvfile, 'r') as inputfile:
        # NOTE(review): only ONE header line is skipped here although
        # nheaders was computed above — presumably it should skip
        # nheaders lines; confirm against the schema contract.
        header = inputfile.readline()
        for row in csv.reader(inputfile):
            # Python 2 csv yields bytes; decode them to unicode.
            if sys.version[0] == '2':
                row = [cell.decode('utf-8') for cell in row]
            get_logger(args).debug('read line %s' % row)
            result = csvrow(*list(schema.convert_row(*row, fail_fast=True)))
            if 'account' in schema.headers:
                entities_account_id = getaccount(result.account).id
                if 'inflow' in schema.headers and 'outflow' in schema.headers:
                    amount = result.inflow - result.outflow
                elif 'amount' in schema.headers:
                    amount = result.amount
            else:
                # NOTE(review): this message talks about an account even
                # though the branch is about missing amount columns —
                # likely a copy-paste of the getaccount error; verify.
                get_logger(args).error('Couldn''t find this account: %s' % args.accountname)
                exit(-1)
            if 'category' in schema.headers and result.category:
                entities_subcategory_id = getsubcategory(result.category).id
            else:
                entities_subcategory_id = None
            if 'payee' in schema.headers:
                imported_payee=result.payee
            else:
                imported_payee=''
            entities_payee_id = getpayee(imported_payee).id
            if 'memo' in schema.headers:
                memo=result.memo
            else:
                memo=''
            transaction=Transaction(
                entities_account_id=entities_account_id,
                amount=amount,
                date=result.date,
                entities_payee_id=entities_payee_id,
                entities_subcategory_id=entities_subcategory_id,
                imported_date=imported_date,
                imported_payee=imported_payee,
                memo=memo,
                source="Imported"
            )
            # NOTE(review): relies on be_transactions.containsduplicate()
            # and transaction.getdict() — confirm these helper names exist
            # on the client entities.
            if args.import_duplicates or (not client.budget.be_transactions.containsduplicate(transaction)):
                get_logger(args).debug('Appending transaction %s '%transaction.getdict())
                transactions.append(transaction)
            else:
                get_logger(args).debug('Duplicate transaction found %s '%transaction.getdict())
    client.add_transactions(transactions)
def __init__(self, data):
    """Wrap a raw field descriptor dict.

    Raises KeyError when the descriptor has no 'name' entry.
    """
    self.data = data
    self.name = data["name"]  # indexing on purpose: unnamed fields must raise
    # use of jsontableschema.types to help constraint validation
    type_cls = SchemaModel._type_map()[data.get("type")]
    self.type = type_cls(data)
def __init__(self, schema):
    """Build the underlying model and wrap each of its fields."""
    self.schema_model = SchemaModel(schema)
    self.fields = [SchemaField(field) for field in self.schema_model.fields]
def __init__(self, data):
    """Wrap a raw field descriptor dict.

    Raises KeyError when the descriptor has no 'name' entry.
    """
    self.data = data
    self.name = data['name']  # indexing on purpose: unnamed fields must raise
    # use of jsontableschema.types to help constraint validation
    type_cls = SchemaModel._type_map()[data.get('type')]
    self.type = type_cls(data)
def __init__(self, schema):
    """Keep the raw schema, build its model, wrap each field and locate
    the species fields."""
    self.data = schema
    self.schema_model = SchemaModel(schema)
    self.fields = [SchemaField(field) for field in self.schema_model.fields]
    self.species_fields = self.find_species_fields(self)
def pull_datapackage(descriptor, name, backend, **backend_options):
    """Pull Data Package from storage.

    All parameters should be used as keyword arguments.

    Args:
        descriptor (str): path where to store descriptor
        name (str): name of the pulled datapackage
        backend (str): backend name like `sql` or `bigquery`
        backend_options (dict): backend options mentioned in backend docs
    """
    # Keep the datapackage name; `name` is reused per-table below.
    datapackage_name = name

    # Instantiate the requested storage backend plugin.
    plugin = import_module('jsontableschema.plugins.%s' % backend)
    storage = plugin.Storage(**backend_options)

    # Data files are written next to the descriptor.
    base = os.path.dirname(descriptor)
    resources = []
    for table in storage.tables:
        schema = storage.describe(table)
        path, name = mappers.restore_path(table)
        fullpath = os.path.join(base, path)

        # Dump the table to CSV: headers first, then rows.
        helpers.ensure_dir(fullpath)
        # NOTE(review): csv.writer(..., encoding=...) implies `csv` here
        # is unicodecsv (stdlib csv has no encoding argument) — confirm
        # the module's imports.
        with io.open(fullpath, 'wb') as stream:
            model = SchemaModel(deepcopy(schema))
            writer = csv.writer(stream, encoding='utf-8')
            writer.writerow(model.headers)
            for row in storage.read(table):
                writer.writerow(row)

        # Record the resource entry for the descriptor.
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Serialize the descriptor (binary mode, no encoding, on Python 2).
    mode = 'w'
    encoding = 'utf-8'
    if six.PY2:
        mode = 'wb'
        encoding = None
    resources = mappers.restore_resources(resources)
    helpers.ensure_dir(descriptor)
    with io.open(descriptor, mode=mode, encoding=encoding) as stream:
        contents = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(contents, stream, indent=4)
    return storage
def convert_data(schema, data):
    """Cast every row in *data* against *schema*; return a list of tuples."""
    model = SchemaModel(schema)
    return [tuple(model.convert_row(*row)) for row in data]