def __init__(self, model, query, name, data, key_prefix=None):
    """Build an entity mapping from a query specification.

    :param model: followthemoney model used to resolve schemata.
    :param query: the parent query mapping.
    :param name: name of this entity within the mapping.
    :param data: mapping configuration for this entity.
    :param key_prefix: optional namespace mixed into the key seed.
    :raises InvalidMapping: if no keys, an unknown schema, or an unknown
        property is configured.
    """
    self.model = model
    self.name = name
    self.data = data
    # Seed the SHA1 used for entity key generation with the dataset
    # prefix and any literal key material from the mapping.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get('key_literal')))
    self.keys = keys_values(data, 'key', 'keys')
    if not self.keys:
        raise InvalidMapping("No keys: %r" % name)
    self.schema = model.get(data.get('schema'))
    if self.schema is None:
        raise InvalidMapping("Invalid schema: %s" % data.get('schema'))
    self.refs = set(self.keys)
    self.dependencies = set()
    self.properties = []
    # FIX: use distinct loop variable names; the original shadowed the
    # `name` parameter and re-bound `mapping` inside this loop.
    for prop_name, prop_data in data.get('properties', {}).items():
        prop = self.schema.get(prop_name)
        if prop is None:
            raise InvalidMapping("Invalid property: %s" % prop_name)
        prop_mapping = PropertyMapping(query, prop_data, prop)
        self.properties.append(prop_mapping)
        self.refs.update(prop_mapping.refs)
        if prop_mapping.entity:
            self.dependencies.add(prop_mapping.entity)
def __init__(self, model, query, name, data, key_prefix=None):
    """Configure an entity mapping keyed either by hashed key columns or
    by an explicit ``id_column`` (mutually exclusive)."""
    self.model = model
    self.name = name
    self.data = data
    # The key seed mixes in the dataset prefix plus any literal value.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get("key_literal")))
    self.keys = keys_values(data, "key", "keys")
    self.id_column = data.get("id_column")
    # Exactly one of (keys, id_column) must be configured.
    if not len(self.keys) and self.id_column is None:
        raise InvalidMapping("No keys or ID: %r" % name)
    if len(self.keys) and self.id_column is not None:
        msg = "Please use only keys or id_column, not both: %r" % name
        raise InvalidMapping(msg)
    self.schema = model.get(data.get("schema"))
    if self.schema is None:
        raise InvalidMapping("Invalid schema: %s" % data.get("schema"))
    # Track every source column referenced by this mapping.
    self.refs = set(self.keys)
    if self.id_column:
        self.refs.add(self.id_column)
    self.dependencies = set()
    self.properties = []
    for name, mapping in data.get("properties", {}).items():
        prop = self.schema.get(name)
        if prop is None:
            raise InvalidMapping("Invalid property: %s" % name)
        mapping = PropertyMapping(query, mapping, prop)
        self.properties.append(mapping)
        self.refs.update(mapping.refs)
        if mapping.entity:
            self.dependencies.add(mapping.entity)
def stream_mapping(
    infile: Path, outfile: Path, mapping_yaml: Path, sign: bool = True
) -> None:
    """Stream CSV records through the mappings in *mapping_yaml* and write
    the resulting entities to *outfile*.

    :param infile: CSV input path.
    :param outfile: output path for serialized entities.
    :param mapping_yaml: YAML mapping file with one or more datasets.
    :param sign: if true, namespace-sign each entity ID with its dataset.
    :raises click.Abort: when the output pipe is closed by the consumer.
    """
    queries: List[Tuple[str, QueryMapping]] = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, "queries", "query"):
            # Records arrive on the stream, so drop any configured
            # database and point the CSV URL at a dummy location.
            data.pop("database", None)
            data["csv_url"] = "/dev/null"
            query = model.make_mapping(data, key_prefix=dataset)
            queries.append((dataset, query))
    # PERF FIX: build one Namespace per dataset up front instead of
    # re-constructing it for every record/query pair in the hot loop.
    namespaces = {dataset: Namespace(dataset) for dataset, _ in queries}
    try:
        with path_writer(outfile) as outfh:
            with input_file(infile) as fh:
                for record in CSVSource.read_csv(fh):
                    for (dataset, query) in queries:
                        if query.source.check_filters(record):  # type: ignore
                            entities = query.map(record)
                            for entity in entities.values():
                                if sign:
                                    entity = namespaces[dataset].apply(entity)
                                write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
def __init__(self, query, data):
    """Collect the set of CSV source URLs from the mapping data."""
    super(CSVSource, self).__init__(query, data)
    # Expand environment variables (e.g. $DATA_PATH) in each URL.
    self.urls = {
        os.path.expandvars(url)
        for url in keys_values(data, 'csv_url', 'csv_urls')
    }
    if not self.urls:
        raise InvalidMapping("No CSV URLs are specified.")
def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
    """Collect CSV URLs and pre-parse the record filters."""
    super().__init__(query, data)
    # Expand environment variables in every configured URL.
    self.urls: Set[str] = {
        cast(str, os.path.expandvars(url))
        for url in keys_values(data, "csv_url", "csv_urls")
    }
    if not self.urls:
        raise InvalidMapping("No CSV URLs are specified.")
    # Parse include/exclude filters once, up front.
    self.filters_set = self._parse_filters(self.filters)
    self.filters_not_set = self._parse_filters(self.filters_not)
def bulk_load(queue, collection, config):
    """Bulk load entities from a CSV file or SQL database.

    This is done by mapping the rows in the source data to entities and
    links which can be understood by the entity index.
    """
    for query in keys_values(config, 'queries', 'query'):
        bulk_load_query(queue, collection, hash_data(query), query)
    # Once all queries are loaded, trigger re-indexing of the collection
    # and drop the work queue.
    queue_task(collection, OP_INDEX)
    queue.remove()
def run_mapping(mapping_yaml):
    """Execute the mappings in *mapping_yaml* and print entities to stdout."""
    config = load_config_file(mapping_yaml)
    stream = click.get_text_stream('stdout')
    try:
        for dataset, meta in config.items():
            for mapping in keys_values(meta, 'queries', 'query'):
                for entity in model.map_entities(mapping, key_prefix=dataset):
                    write_object(stream, entity)
    except BrokenPipeError:
        # Consumer closed the pipe (e.g. `| head`); exit quietly.
        pass
def run_mapping(outfile, mapping_yaml):
    """Execute the mappings in *mapping_yaml* and write entities to *outfile*.

    :raises click.Abort: when the output pipe is closed.
    :raises click.ClickException: on any other failure.
    """
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            for mapping in keys_values(meta, 'queries', 'query'):
                for entity in model.map_entities(mapping, key_prefix=dataset):
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        # Surface any mapping error as a clean CLI failure.
        raise click.ClickException(str(exc))
def run_mapping(mapping_yaml):
    """Execute the mappings in *mapping_yaml* and emit entities on stdout.

    :raises click.Abort: when the output pipe is closed.
    :raises click.ClickException: on any other failure.
    """
    config = load_mapping_file(mapping_yaml)
    stream = click.get_text_stream('stdout')
    try:
        for dataset, meta in config.items():
            for mapping in keys_values(meta, 'queries', 'query'):
                entities = model.map_entities(mapping, key_prefix=dataset)
                for entity in entities:
                    # BUG FIX: the original called read_entity() here, but
                    # emitting generated entities to an output stream
                    # requires the writer, as in the sibling run_mapping
                    # implementations.
                    write_object(stream, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
def __init__(self, query, data):
    """Connect to the SQL database described by the mapping.

    :raises InvalidMapping: if no database URI is configured.
    """
    super(SQLSource, self).__init__(query, data)
    database = data.get("database")
    if database is None:
        # FIX: os.path.expandvars(None) raises a TypeError; report a
        # mapping error instead (matches the typed SQLSource variant).
        raise InvalidMapping("No database in SQL mapping!")
    self.database_uri = os.path.expandvars(database)
    kwargs = {}
    if self.database_uri.lower().startswith("postgres"):
        # Server-side cursors avoid buffering large result sets in the
        # client's memory.
        kwargs["server_side_cursors"] = True
    self.engine = create_engine(self.database_uri, poolclass=NullPool, **kwargs)
    self.meta = MetaData()
    self.meta.bind = self.engine
    tables = keys_values(data, "table", "tables")
    self.tables = [QueryTable(self, f) for f in tables]
    self.joins = ensure_list(data.get("joins"))
def __init__(self, query, data, prop):
    """Configure how one schema property is derived from source columns,
    literals, or a template."""
    self.query = query
    # Work on a private copy: pop() below mutates the spec.
    data = deepcopy(data)
    self.data = data
    self.prop = prop
    self.name = prop.name
    self.type = prop.type
    self.refs = keys_values(data, 'column', 'columns')
    self.literals = keys_values(data, 'literal', 'literals')
    self.join = data.pop('join', None)
    self.split = data.pop('split', None)
    self.entity = data.pop('entity', None)
    self.required = data.pop('required', False)
    self.template = stringify(data.pop('template', None))
    self.replacements = {}
    if self.template is not None:
        # this is hacky, trying to generate refs from template
        for ref in self.FORMAT_PATTERN.findall(self.template):
            self.refs.append(ref)
            self.replacements['{{%s}}' % ref] = ref
def __init__(self, query, data, prop):
    """Configure how one schema property is derived from source columns,
    literals, or a template string."""
    self.query = query
    # Copy first; the pop() calls below consume keys from the spec.
    data = deepcopy(data)
    self.data = data
    self.prop = prop
    self.name = prop.name
    self.type = prop.type
    self.refs = keys_values(data, "column", "columns")
    self.literals = keys_values(data, "literal", "literals")
    self.join = data.pop("join", None)
    self.split = data.pop("split", None)
    self.entity = data.pop("entity", None)
    self.required = data.pop("required", False)
    self.template = sanitize_text(data.pop("template", None))
    self.replacements = {}
    if self.template is not None:
        # this is hacky, trying to generate refs from template
        for ref in self.FORMAT_PATTERN.findall(self.template):
            self.refs.append(ref)
            self.replacements["{{%s}}" % ref] = ref
def mapping(collection_id):
    """Validate the posted mapping queries and queue a bulk load for the
    collection; returns HTTP 202 on acceptance."""
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    # Reject the request up front if any query fails to compile.
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, payload=data)
    return ('', 202)
def __init__(self, query: "QueryMapping", data: Dict[str, Any], prop: Property) -> None: self.query = query data = deepcopy(data) self.prop = prop self.refs = cast(List[str], keys_values(data, "column", "columns")) self.join = cast(Optional[str], data.pop("join", None)) self.split = cast(Optional[str], data.pop("split", None)) self.entity = stringify(data.pop("entity", None)) self.format = stringify(data.pop("format", None)) self.fuzzy = as_bool(data.pop("fuzzy", False)) self.required = as_bool(data.pop("required", False)) self.literals = cast(List[str], keys_values(data, "literal", "literals")) self.template = sanitize_text(data.pop("template", None)) self.replacements: Dict[str, str] = {} if self.template is not None: # this is hacky, trying to generate refs from template for ref in self.FORMAT_PATTERN.findall(self.template): self.refs.append(ref) self.replacements["{{%s}}" % ref] = ref
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    This is done by mapping the rows in the source data to entities and
    links which can be understood by the entity index.
    """
    from aleph.logic.collections import create_collection
    for foreign_id, data in config.items():
        data['foreign_id'] = foreign_id
        data['label'] = data.get('label', foreign_id)
        collection = create_collection(data)
        collection_id = collection.get('id')
        # FIXME: this does not perform collection metadata validation.
        for query in keys_values(data, 'queries', 'query'):
            bulk_load_query.apply_async([collection_id, query], priority=6)
def run_mapping(outfile, mapping_yaml, sign=True):
    """Execute the mappings in *mapping_yaml*, optionally signing entity
    IDs into each dataset's namespace, and write them to *outfile*.

    :raises click.Abort: when the output pipe is closed.
    :raises click.ClickException: on any other failure.
    """
    config = load_mapping_file(mapping_yaml)
    try:
        for dataset, meta in config.items():
            # One namespace per dataset signs all its entities.
            ns = Namespace(dataset)
            for mapping in keys_values(meta, "queries", "query"):
                for entity in model.map_entities(mapping, key_prefix=dataset):
                    if sign:
                        entity = ns.apply(entity)
                    write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
def run_mapping(outfile: Path, mapping_yaml: Path, sign: bool = True) -> None:
    """Execute the mappings in *mapping_yaml*, optionally signing entity
    IDs into each dataset's namespace, and write them to *outfile*.

    :raises click.Abort: when the output pipe is closed.
    :raises click.ClickException: on any other failure.
    """
    config = load_mapping_file(mapping_yaml)
    try:
        with path_writer(outfile) as outfh:
            for dataset, meta in config.items():
                # One namespace per dataset signs all its entities.
                ns = Namespace(dataset)
                for mapping in keys_values(meta, "queries", "query"):
                    entities = model.map_entities(mapping, key_prefix=dataset)
                    for entity in entities:
                        if sign:
                            entity = ns.apply(entity)
                        write_entity(outfh, entity)
    except BrokenPipeError:
        raise click.Abort()
    except Exception as exc:
        raise click.ClickException(str(exc))
def mapping_process(id):
    """Validate the posted mapping queries and queue a bulk-load task per
    query; returns HTTP 204 on success."""
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            # Compile first so invalid mappings fail before any work is queued.
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
def mapping_process(collection_id):
    """Validate the posted mapping queries and queue a bulk-load task per
    query; returns HTTP 204 on success."""
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # TODO: we need to look into possible abuse of mapping load path for local
    # path access on the machine running the mapping. Until then, this action
    # must be restricted to admins:
    require(request.authz.is_admin)
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            # Compile first so invalid mappings fail before any work is queued.
            model.make_mapping(query)
            bulk_load_query.apply_async([collection.id, query], priority=6)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    return ('', 204)
def __init__(self, query: "QueryMapping", data: Dict[str, Any]) -> None:
    """Connect to the SQL database described by the mapping.

    :raises InvalidMapping: if no database URI is configured.
    """
    super(SQLSource, self).__init__(query, data)
    database = data.get("database")
    if database is None:
        raise InvalidMapping("No database in SQL mapping!")
    self.database_uri = cast(str, os.path.expandvars(database))
    kwargs = {}
    if self.database_uri.lower().startswith("postgres"):
        # Server-side cursors keep large result sets off the client heap.
        kwargs["server_side_cursors"] = True
    self.engine = create_engine(self.database_uri, poolclass=NullPool, **kwargs)  # type: ignore
    self.meta = MetaData()
    self.meta.bind = self.engine
    tables = keys_values(data, "table", "tables")
    self.tables = [QueryTable(self.meta, f) for f in tables]
    self.joins = cast(List[Dict[str, str]], ensure_list(data.get("joins")))
def stream_mapping(infile, outfile, mapping_yaml):
    """Stream CSV records from *infile* through all mappings in
    *mapping_yaml* and write resulting entities to *outfile*.

    :raises click.Abort: when the output pipe is closed.
    """
    config = load_mapping_file(mapping_yaml)
    sources = []
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            sources.append(StreamSource(query, data))
    try:
        for record in StreamSource.read_csv(infile):
            for source in sources:
                # Only map records matching this source's filters.
                if source.check_filters(record):
                    for entity in source.query.map(record).values():
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def __init__(
    self,
    model: "Model",
    query: "QueryMapping",
    name: str,
    data: Dict[str, Any],
    key_prefix: Optional[str] = None,
) -> None:
    """Configure an entity mapping keyed either by hashed key columns or
    by an explicit ``id_column`` (mutually exclusive).

    :raises InvalidMapping: for missing keys/ID, missing or unknown
        schema, or an unknown property.
    """
    self.model = model
    self.name = name
    # The key seed mixes in the dataset prefix and any literal value.
    self.seed = sha1(key_bytes(key_prefix))
    self.seed.update(key_bytes(data.get("key_literal")))
    self.keys = keys_values(data, "key", "keys")
    self.id_column = stringify(data.get("id_column"))
    # Exactly one of (keys, id_column) must be configured.
    if not len(self.keys) and self.id_column is None:
        raise InvalidMapping("No keys or ID: %r" % name)
    if len(self.keys) and self.id_column is not None:
        msg = "Please use only keys or id_column, not both: %r" % name
        raise InvalidMapping(msg)
    schema_name = stringify(data.get("schema"))
    if schema_name is None:
        raise InvalidMapping("No schema: %s" % name)
    schema = model.get(schema_name)
    if schema is None:
        raise InvalidMapping("Invalid schema: %s" % schema_name)
    self.schema = schema
    # Track every source column referenced by this mapping.
    self.refs = set(self.keys)
    if self.id_column:
        self.refs.add(self.id_column)
    self.dependencies: Set[str] = set()
    self.properties: List[PropertyMapping] = []
    for name, prop_mapping in data.get("properties", {}).items():
        prop = self.schema.get(name)
        if prop is None:
            raise InvalidMapping("Invalid property: %s" % name)
        mapping = PropertyMapping(query, prop_mapping, prop)
        self.properties.append(mapping)
        self.refs.update(mapping.refs)
        if mapping.entity:
            self.dependencies.add(mapping.entity)
def stream_mapping(infile, outfile, mapping_yaml, sign=True):
    """Stream CSV records from *infile* through all mappings in
    *mapping_yaml* and write resulting entities to *outfile*.

    :param sign: if true, namespace-sign each entity ID with its dataset.
    :raises click.Abort: when the output pipe is closed.
    """
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        # PERF FIX: build the Namespace once per dataset here, instead of
        # re-constructing it for every record in the streaming loop.
        ns = Namespace(dataset)
        for data in keys_values(meta, "queries", "query"):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append((ns, source))
    try:
        for record in StreamSource.read_csv(infile):
            for (ns, source) in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        if sign:
                            entity = ns.apply(entity)
                        write_object(outfile, entity)
    except BrokenPipeError:
        raise click.Abort()
def stream_mapping(mapping_yaml):
    """Stream CSV records from stdin through all mappings in
    *mapping_yaml* and write resulting entities to stdout.

    :raises click.Abort: when the output pipe is closed.
    """
    stdin = click.get_text_stream('stdin')
    stdout = click.get_text_stream('stdout')
    sources = []
    config = load_mapping_file(mapping_yaml)
    for dataset, meta in config.items():
        for data in keys_values(meta, 'queries', 'query'):
            query = model.make_mapping(data, key_prefix=dataset)
            source = StreamSource(query, data)
            sources.append(source)
    try:
        for record in StreamSource.read_csv(stdin):
            for source in sources:
                if source.check_filters(record):
                    entities = source.query.map(record)
                    for entity in entities.values():
                        # BUG FIX: the original called read_entity() here;
                        # emitting mapped entities to stdout requires the
                        # writer, as in the sibling stream_mapping
                        # implementations.
                        write_object(stdout, entity)
    except BrokenPipeError:
        raise click.Abort()