def wrangle_columns(df, config):
    use_columns = []
    rename_columns = {}
    map_funcs = {}
    for column in config.columns:
        if isinstance(column, str):
            use_columns.append(column)
        elif banal.is_mapping(column):
            if len(column) > 1:
                raise ConfigError(f'Column config `{column}` has errors.')
            target, source = list(column.items())[0]
            if banal.is_mapping(source):
                source_column = source.get('column', target)
                map_func = source.get('map')
                if map_func:
                    map_funcs[target] = safe_eval(map_func)
            else:
                source_column = source
            use_columns.append(source_column)
            rename_columns[source_column] = target
        else:
            raise ConfigError(f'Column config `{column}` has errors.')
    df = df[use_columns]
    if rename_columns:
        df = df.rename(columns=rename_columns)
    if map_funcs:
        for col, func in map_funcs.items():
            df[col] = df[col].map(func)
    return df
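# A minimal usage sketch for wrangle_columns, assuming a simple config object;
# the `Config` wrapper and the column spec below are hypothetical. Entries may
# be plain names, {target: source} renames, or {target: {column:, map:}} specs.
import pandas as pd

class Config:
    columns = [
        'city',
        {'year': 'jahr'},
        {'name': {'column': 'name', 'map': 'lambda x: x.strip()'}},
    ]

df = pd.DataFrame({'city': ['Berlin'], 'jahr': [2020], 'name': [' Alice ']})
df = wrangle_columns(df, Config())  # -> columns: city, year, name (stripped)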
def flatten_id(data, field, nested):
    """Copy the `id` of the mapping stored under `nested` into `field` if
    `field` has no usable value yet. Mutates `data` in place."""
    if not is_mapping(data):
        return data
    value = stringify(data.get(field))
    if value is None:
        nested_value = data.get(nested)  # don't shadow the `nested` key name
        if is_mapping(nested_value):
            value = stringify(nested_value.get('id'))
    data[field] = value
    return data
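# Hedged example for flatten_id; the record shape is illustrative.
record = {'buyer': None, 'buyer_detail': {'id': 'org-1', 'name': 'ACME'}}
flatten_id(record, 'buyer', 'buyer_detail')
assert record['buyer'] == 'org-1'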
def read_object(stream):
    line = stream.readline()
    if not line:
        return
    data = json.loads(line)
    if is_mapping(data) and 'schema' in data:
        return model.get_proxy(data)
    if is_mapping(data) and 'enricher' in data:
        enricher = load_enricher(data.get('enricher'))
        return Result.from_dict(enricher, data)
    return data
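# Hedged sketch of reading objects off a newline-delimited JSON stream; the
# payload below is illustrative.
import io
import json

stream = io.StringIO(
    json.dumps({"id": "p1", "schema": "Person", "properties": {}}) + "\n"
)
proxy = read_object(stream)  # a proxy via model.get_proxy, since 'schema' is set
end = read_object(stream)    # None once the stream is exhausted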
def save_issue(conn: Conn, event: Dict[str, Any]) -> None:
    data = dict(event)
    for key, value in data.items():
        if hasattr(value, "to_dict"):
            value = value.to_dict()
        if isinstance(value, set):
            value = list(value)
        data[key] = value
    data.pop("_record", None)
    data.pop("timestamp", None)
    record = {
        "timestamp": settings.RUN_TIME,
        "module": data.pop("logger", None),
        "level": data.pop("level"),
        "message": data.pop("event", None),
        "dataset": data.pop("dataset"),
    }
    entity = data.pop("entity", None)
    if is_mapping(entity):
        record["entity_id"] = entity.get("id")
        record["entity_schema"] = entity.get("schema")
    elif isinstance(entity, str):
        record["entity_id"] = entity
    record["data"] = data
    q = issue_table.insert().values([record])
    conn.execute(q)
    return None
def to_jsonschema(obj):
    """Schemas are stored in the OpenAPI spec and might need some massaging
    to make for valid JSON Schema."""
    if is_mapping(obj):
        # Re-write nullable fields:
        type_ = obj.get("type")
        if obj.get("nullable", False):
            type_ = obj.pop("type", None)
            format_ = obj.pop("format", None)
            obj["oneOf"] = [
                {"type": "null"},
                {"type": type_, "format": format_},
            ]
            obj.pop("nullable", None)
        out = {}
        for key, value in obj.items():
            out[key] = to_jsonschema(value)
        return out
    if is_listish(obj):
        return [to_jsonschema(o) for o in obj]
    return obj
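# Worked example of the nullable rewrite (note it also mutates its input):
spec = {"type": "string", "format": "date", "nullable": True}
to_jsonschema(spec)
# -> {"oneOf": [{"type": "null"}, {"type": "string", "format": "date"}]}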
def request(self, method, url, headers=None, auth=None, data=None,
            params=None, json=None, allow_redirects=True, lazy=False):
    headers = headers or {}  # avoid a shared mutable default argument
    if is_mapping(params):
        params = list(params.items())
    url = normalize_url(url, extra_query_args=params)
    method = method.upper().strip()
    request = Request(method, url, data=data, headers=headers,
                      json=json, auth=auth)
    response = ContextHttpResponse(self, request=request,
                                   allow_redirects=allow_redirects)
    if not lazy:
        response.fetch()
    return response
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
    refresh_collection(collection.id)
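# Minimal, hedged invocation sketch for bulk_write; the entity dicts follow
# the FollowTheMoney shape (id, schema, properties), and `collection` and
# `role` are assumed to be already-loaded ORM objects.
entities = [
    {"id": "a1", "schema": "Person", "properties": {"name": ["Alice"]}},
    {"id": "a2", "schema": "Company", "properties": {"name": ["ACME Ltd."]}},
]
# bulk_write(collection, entities, role_id=role.id)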
def apply_ops(df, ops):
    """Apply any valid operation from `pd.DataFrame.<op>` with optional
    arguments in given order."""
    for op in ops:
        op_name = op
        op_args = None
        if banal.is_mapping(op):
            name = list(op.keys())
            if len(name) > 1:
                raise ConfigError(
                    f'Operation not valid: {name} - should be only 1 item.')
            op_name = name[0]
            op_args = list(op.values())
            if len(op_args) > 1:
                raise ConfigError(
                    f'Operation arguments not valid: {op_args} - should be only 1 mapping item.'
                )
            op_args = {
                k: safe_eval(v) if k == 'func' else v
                for k, v in op_args[0].items()
            }
        func = getattr(DataFrame, op_name, None)
        if func is None or not callable(func):
            raise ConfigError(
                f'`{op}` is not a valid operation for `pd.DataFrame`')
        if op_args:
            df = func(df, **op_args)
        else:
            df = func(df)
    return df
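# Hedged config sketch for apply_ops: each entry is either a DataFrame method
# name or a one-item mapping of method name to kwargs; a `func` argument is
# passed through safe_eval. The ops below are illustrative.
ops = [
    'drop_duplicates',
    {'rename': {'columns': {'jahr': 'year'}}},
    {'sort_values': {'by': 'year'}},
]
# df = apply_ops(df, ops)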
def convert_entity(self, result, data):
    data = ensure_dict(data)
    if 'properties' not in data or 'schema' not in data:
        return
    try:
        entity = result.make_entity(data.get('schema'))
    except InvalidData:
        log.error("Server model mismatch: %s" % data.get('schema'))
        return
    entity.id = data.get('id')
    links = ensure_dict(data.get('links'))
    entity.add('alephUrl', links.get('self'))
    properties = ensure_dict(data.get('properties'))
    for prop, values in properties.items():
        for value in ensure_list(values):
            if is_mapping(value):
                # The recursive call returns None for invalid nested data:
                child = self.convert_entity(result, value)
                if child is None or child.id is None:
                    continue
                value = child.id
            try:
                entity.add(prop, value, cleaned=True)
            except InvalidData:
                msg = "Server property mismatch (%s): %s"
                log.warning(msg % (entity.schema.name, prop))
    result.add_entity(entity)
    return entity
def filter_text(spec, invert=False):
    """Try to convert a given filter to a lucene query string."""
    # CAVEAT: This doesn't cover all filters used by aleph.
    if isinstance(spec, (list, tuple, set)):
        parts = [filter_text(s, invert=invert) for s in spec]
        return " ".join(parts)
    if not is_mapping(spec):
        return spec
    for op, props in spec.items():
        if op == "term":
            field, value = next(iter(props.items()))
            field = "-%s" % field if invert else field
            return '%s:"%s"' % (field, value)
        if op == "terms":
            field, values = next(iter(props.items()))
            parts = [{"term": {field: v}} for v in values]
            parts = [filter_text(p, invert=invert) for p in parts]
            predicate = " AND " if invert else " OR "
            text = predicate.join(parts)
            if len(parts) > 1:
                text = "(%s)" % text
            return text
        if op == "exists":
            field = props.get("field")
            field = "-%s" % field if invert else field
            return "%s:*" % field
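# Illustrative inputs and outputs for filter_text:
filter_text({"term": {"countries": "de"}})
# -> 'countries:"de"'
filter_text({"terms": {"countries": ["de", "fr"]}})
# -> '(countries:"de" OR countries:"fr")'
filter_text({"exists": {"field": "dataset"}}, invert=True)
# -> '-dataset:*'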
def get_entity_id(obj):
    """Given an entity-ish object, try to get the ID."""
    if is_mapping(obj):
        obj = obj.get('id')
    elif hasattr(obj, 'id'):
        obj = obj.id
    return sanitize_text(obj)
def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data")
        entity = model.get_proxy(item)
        if entity.id is None:
            raise InvalidData("No ID for entity")
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities)
def request(
    self,
    method,
    url,
    headers=None,
    auth=None,
    data=None,
    params=None,
    json=None,
    allow_redirects=True,
    timeout=settings.HTTP_TIMEOUT,
    lazy=False,
):
    headers = headers or {}  # avoid a shared mutable default argument
    if is_mapping(params):
        params = list(params.items())
    method = method.upper().strip()
    request = Request(method, url, data=data, headers=headers,
                      json=json, auth=auth, params=params)
    response = ContextHttpResponse(self, request=request,
                                   allow_redirects=allow_redirects,
                                   timeout=timeout)
    if not lazy:
        response.fetch()
    return response
def read_entity(stream):
    line = stream.readline()
    if not line:
        return
    data = json.loads(line)
    if is_mapping(data) and 'schema' in data:
        return model.get_proxy(data)
    return data
def convert_nested(self, data):
    entity = self.convert_entity(data)
    properties = ensure_dict(data.get("properties"))
    for prop, values in properties.items():
        for value in ensure_list(values):
            if is_mapping(value):
                yield self.convert_entity(value)
    yield entity
def convert_classification(entity, item, prop='classification'):
    if not is_mapping(item):
        entity.add(prop, item)
    else:
        if 'classification' in item:
            convert_classification(entity, item.get('classification'), prop)
        convert_description(entity, item, prop)
        convert_address(entity, item.pop('deliveryAddress', {}))
def get_entity_id(obj):
    """Given an entity-ish object, try to get the ID."""
    if isinstance(obj, str):
        return obj
    if is_mapping(obj):
        return obj.get('id')
    if hasattr(obj, 'id'):
        return obj.id
def _generate():
    # Closure over `entities` and `unsafe` from the enclosing scope.
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if not unsafe:
            entity = remove_checksums(entity)
        yield _process_entity(entity)
def merge_data(old, new):
    """Extend the values of the new doc with extra values from the old."""
    if is_sequence(old) or is_sequence(new):
        new = ensure_list(new)
        new.extend(ensure_list(old))
        return unique_list(new)
    if is_mapping(old) or is_mapping(new):
        old = old if is_mapping(old) else {}
        new = new if is_mapping(new) else {}
        keys = set(new.keys())
        keys.update(old.keys())
        combined = {}
        for key in keys:
            value = merge_data(old.get(key), new.get(key))
            if value is not None:
                combined[key] = value
        return combined
    return new or old
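# Worked example of merge_data's recursive semantics:
old = {'names': ['Alice'], 'meta': {'source': 'a'}}
new = {'names': ['Alicia'], 'meta': {'seen': 1}}
merge_data(old, new)
# -> {'names': ['Alicia', 'Alice'], 'meta': {'source': 'a', 'seen': 1}}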
def object_id(obj, clazz=None):
    """Turn a given object into an ID that can be stored with the
    notification."""
    clazz = clazz or type(obj)
    if isinstance(obj, clazz):
        obj = obj.id
    elif is_mapping(obj):
        obj = obj.get('id')
    return obj
def read_result(stream):
    line = stream.readline()
    if not line:
        return
    data = json.loads(line)
    if is_mapping(data) and 'enricher' in data:
        enricher = load_enricher(data.get('enricher'))
        return Result.from_dict(enricher, data)
    return data
def refresh_entity(entity, sync=False):
    if is_mapping(entity):
        entity_id = entity.get('id')
        collection_id = entity.get('collection_id')
    else:
        entity_id = entity.id
        collection_id = entity.collection_id
    cache.kv.delete(cache.object_key(Entity, entity_id),
                    cache.object_key(Collection, collection_id))
def get_entity_id(obj):
    """Given an entity-ish object, try to get the ID."""
    if is_mapping(obj):
        obj = obj.get("id")
    else:
        try:
            obj = obj.id
        except AttributeError:
            pass
    return obj
def get_entity_id(obj: Any) -> Optional[str]:
    """Given an entity-ish object, try to get the ID."""
    if is_mapping(obj):
        obj = obj.get("id")
    else:
        try:
            obj = obj.id
        except AttributeError:
            pass
    return stringify(obj)
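# The get_entity_id variants above all accept an ID string, an entity dict,
# or any object exposing an `id` attribute:
assert get_entity_id({"id": "q42"}) == "q42"
assert get_entity_id("q42") == "q42"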
def ref(self, value):
    """Generate a qualified form for storage in a triplestore."""
    if self.prefix is None:
        return
    if is_mapping(value):
        value = value.get('id')
    value = stringify(value)
    if value is None:
        return
    return ':'.join((self.prefix, value))
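# Illustration (hedged): with self.prefix == 'entity', both ref('abc') and
# ref({'id': 'abc'}) yield 'entity:abc'; a missing prefix or value yields None.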
def __init__(self, schema, id, properties, key_prefix=None):
    self.schema = schema
    self.id = stringify(id)
    self.key_prefix = stringify(key_prefix)
    self.countries = set()
    self.names = set()
    self._properties = {}
    if is_mapping(properties):
        for key, value in properties.items():
            self.add(key, value, cleaned=True, quiet=True)
def generate(self):
    self.model.properties.add(self)
    if self.range is None and self.type == registry.entity:
        self.range = self.model.get(self.data.get('range'))
    reverse_ = self.data.get('reverse')
    if self.reverse is None and self.range and reverse_:
        if not is_mapping(reverse_):
            raise InvalidModel("Invalid reverse: %s" % self)
        self.reverse = self.range._add_reverse(reverse_, self)
def generate(self):
    range_ = self.data.get('schema', 'Thing')
    if range_:
        self.range = self.schema.model.get(range_)
        if self.range is None:
            raise InvalidModel("Cannot find range: %s" % range_)
    reverse_ = self.data.get('reverse')
    if self.range and reverse_:
        if not is_mapping(reverse_):
            raise InvalidModel("Invalid reverse: %s" % self)
        self.reverse = self.range._add_reverse(reverse_, self)
def convert_identifier(entity, identifier):
    if not is_mapping(identifier):
        entity.add(DEFTAULT_IDENTIFIER, identifier)
        return
    convert_name(entity, identifier)
    scheme = identifier.pop('scheme', None)
    prop = IDENTIFIERS.get(scheme, None)
    if prop is None:
        log.info("Unknown identifier scheme: %s", scheme)
        prop = DEFTAULT_IDENTIFIER
        IDENTIFIERS[scheme] = prop
    entity.add(prop, identifier.pop('id', None))
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)