def tsplit_df(state, *columns):
    """Split a dataframe by columns (version of split_df expecting a first row with tags)"""
    from liquer.parser import parse

    state = qtsplit_df(state, *columns)
    df = state.get().copy()
    query_column = state.vars.get("query_column")
    if query_column is None:
        query_column = "query"
    link_column = state.vars.get("link_column")
    if link_column is None:
        link_column = "link"
    split_link_type = state.vars.get("split_link_type")
    if split_link_type is None:
        split_link_type = "url"
    # df.loc[:,link_column] = [""]+[evaluate(encode(decode(q)+[["link",split_link_type]])).get() for q in list(df[query_column])[1:]]
    df.loc[:, link_column] = [""] + [
        evaluate(parse(q).with_action("link", split_link_type).encode()).get()
        for q in list(df[query_column])[1:]
    ]
    return state.with_data(df)
def split_df(state, *columns):
    """Split a dataframe by columns

    Creates a dataframe with unique (combinations of) values from the supplied columns
    and queries to obtain the corresponding filtered dataframes from the original dataframe.
    This behaves like qsplit_df, with two important differences:
    - each generated query is evaluated (and thus eventually cached)
    - a link is generated and put into the link column (state variable link_column)
    The split_link_type state variable is used to determine the link type; url by default.
    """
    from liquer.parser import parse

    state = qsplit_df(state, *columns)
    df = state.get().copy()
    query_column = state.vars.get("query_column")
    if query_column is None:
        query_column = "query"
    link_column = state.vars.get("link_column")
    if link_column is None:
        link_column = "link"
    split_link_type = state.vars.get("split_link_type")
    if split_link_type is None:
        split_link_type = "url"
    # df.loc[:,link_column] = [evaluate(encode(decode(q)+[["link",split_link_type]])).get() for q in df[query_column]]
    df.loc[:, link_column] = [
        evaluate(parse(q).with_action("link", split_link_type).encode()).get()
        for q in df[query_column]
    ]
    return state.with_data(df)
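# Illustrative usage sketch (not part of the library; the query and the "region"
# column below are hypothetical). Assuming split_df is registered as a command
# and the query "data.csv/-/dr" yields a dataframe with a "region" column,
# a split by that column could be obtained with:
#
#     from liquer import evaluate
#
#     split = evaluate("data.csv/-/dr/split_df-region").get()
#     # one row per unique region; the "query" column holds the filtering query
#     # and the "link" column its evaluated url-type link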
def apply(self, query, description=None):
    self.debug(f"APPLY {query}")
    if self.parent_query in (None, "", "/"):
        self.debug(f"  no parent query in apply {query}")
        return self.evaluate(query, description=description)
    if isinstance(query, str):
        query = parse(query)
    if query.absolute:
        self.debug(f"  absolute link in apply {query}")
        return self.evaluate(query, description=description)
    tq = query.transform_query()
    if tq is None:
        raise Exception(
            f"Only transform query supported in apply ({query} on {self.parent_query})"
        )
    q = (parse(self.parent_query) + tq).encode()
    self.debug(f"apply {query} on {self.parent_query} yields {q}")
    return self.evaluate(q, description=description)
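# Illustrative sketch (hypothetical queries): apply() extends the parent query
# with a relative transform query, while absolute queries are evaluated as-is.
# With self.parent_query == "a/b/c":
#
#     context.apply("d-e")     # evaluates "a/b/c/d-e"
#     context.apply("/x/y")    # absolute - evaluates "/x/y" directly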
def to_query(cls, query):
    if query is None:
        return "", Query()
    if isinstance(query, str):
        return query, parse(query)
    elif isinstance(query, Query):
        return query.encode(), query
    else:
        raise Exception(f"Unsupported query type: {type(query)}")
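# Illustrative sketch: to_query normalizes None, a string or a Query instance
# into an (encoded string, parsed Query) pair, so callers can accept any of
# the three forms interchangeably:
#
#     cls.to_query(None)            # -> ("", Query())
#     cls.to_query("hello/world")   # -> ("hello/world", <parsed Query>)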
def metadata(self):
    metadata = self._metadata.as_dict()
    title = self.title
    description = self.description
    if title is None:
        if self.raw_query is None:
            title = ""
        else:
            p = parse(self.raw_query)
            if title in ("", None):
                title = p.filename() or ""
    mimetype = self.mimetype
    if mimetype is None:
        if self.query is not None:
            if self.query.extension() is None:
                mimetype = "application/octet-stream"
            else:
                mimetype = mimetype_from_extension(self.query.extension())
    message = self._metadata.message
    if message in (None, ""):
        log = self._metadata.get("log", [])
        if len(log):
            message = log[-1]["message"]
    if message in (None, ""):
        log = self._metadata.get("child_log", [])
        if len(log):
            message = log[-1]["message"]
    metadata.update(
        dict(
            status=self.status.value,
            title=title,
            description=description,
            mimetype=mimetype,
            query=self.raw_query,
            parent_query=self.parent_query,
            argument_queries=self.argument_queries,
            # log=self.log[:],
            is_error=self.is_error,
            direct_subqueries=self.direct_subqueries[:],
            progress_indicators=self.progress_indicators[:],
            child_progress_indicators=self.child_progress_indicators[:],
            child_log=self.child_log,
            message=message,
            started=self.started,
            updated=self.now(),
            created=self.created,
            caching=self.caching,
            vars=dict(self.vars),
            html_preview=self.html_preview,
            side_effect=False,
        )
    )
    return metadata
def resolve_recipe_definition(r, directory, metadata):
    if type(r) == str:
        try:
            query = parse(r)
            filename = query.filename()
            return dict(
                type="query",
                query=r,
                CWD=directory,
                filename=filename,
                provides=[filename],
            )
        except:
            metadata.warning(
                f"Can't resolve recipe '{r}'", traceback=traceback.format_exc()
            )
            print(f"Can't resolve recipe '{r}'")
            traceback.print_exc()
            return None
    elif isinstance(r, dict):
        if r.get("type") in (None, "query") and "query" in r:
            try:
                query = parse(r["query"])
                filename = r.get("filename", query.filename())
                title = r.get("title", filename)
                description = r.get(
                    "description", f'Generated from query: {r["query"]}'
                )
                return dict(
                    type="query",
                    query=r["query"],
                    title=title,
                    description=description,
                    CWD=directory,
                    filename=filename,
                    provides=[filename],
                )
            except:
                metadata.warning(
                    "Can't resolve query recipe", traceback=traceback.format_exc()
                )
                traceback.print_exc()
    else:
        print(f"Unsupported recipe type: {type(r)}")
    if "filename" in r and "provides" not in r:
        r["provides"] = [r["filename"]]
    return r
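# Illustrative sketch of the two query-recipe forms this function resolves
# (the query and directory below are hypothetical):
#
#     resolve_recipe_definition("a/b/data.csv", "results", metadata)
#     # -> dict(type="query", query="a/b/data.csv", CWD="results",
#     #         filename="data.csv", provides=["data.csv"])
#
#     resolve_recipe_definition(
#         dict(query="a/b/data.csv", title="Data", description="Demo"),
#         "results",
#         metadata,
#     )
#     # -> the same dict, with the explicit title and description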
def make_execution_context(self, tmpdir, store, context):
    import datafusion as daf

    ctx = daf.ExecutionContext()
    register = self.data.get("register", [])
    store = store.root_store()
    path = Path(tmpdir)
    for query in register:
        context.info(f"Register {query}")
        try:
            q = parse(query)
        except:
            context.warning(
                f"Could not parse query '{query}' in parquet_sql recipe {self.recipe_name()}",
                traceback=traceback.format_exc(),
            )
            continue  # skip the unparsable query instead of reusing a stale 'q'
        if q.is_resource_query():
            key = q.resource_query().path()
            if store.is_dir(key):
                context.info(f"Registering directory {key}")
                for k in store.listdir_keys(key):
                    if not store.is_dir(k) and key_extension(k) == "parquet":
                        (path / key_name(k)).write_bytes(store.get_bytes(k))
                        context.info(
                            f"Registering {key_name_without_extension(k)} from {key}"
                        )
                        ctx.register_parquet(
                            key_name_without_extension(k), str(path / key_name(k))
                        )
            else:
                (path / key_name(key)).write_bytes(store.get_bytes(key))
                context.info(f"Registering resource {key}")
                ctx.register_parquet(
                    key_name_without_extension(key), str(path / key_name(key))
                )
        else:
            filename = q.filename()
            if filename is None:
                context.warning(
                    f"Skipping '{query}' registering because it is lacking a filename"
                )
                continue
            v = filename.split(".")
            context.info(f"Evaluating query {query}")
            context.evaluate_and_save(
                query, target_directory=str(tmpdir), target_file=filename
            )
            context.info(f"Registering {v[0]} from query {query}")
            ctx.register_parquet(v[0], str(path / filename))
    return ctx
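# Illustrative sketch of a recipe definition feeding make_execution_context
# (queries and keys are hypothetical; only the "register" key is read here).
# Each registered entry becomes a parquet-backed table in the DataFusion
# context, named after the file without its extension:
#
#     register:
#       - "-R/data/sales.parquet"             # resource query, copied from the store
#       - "prepare/transform/result.parquet"  # evaluated, saved, registered as 'result'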
def queries_status(include_ready=False, reduce=True):
    import liquer.parser as lp
    import traceback

    try:
        cache = get_cache()
        data = []
        for key in sorted(cache.keys()):
            metadata = cache.get_metadata(key)
            if metadata is None:
                continue
            progress = metadata.get("progress_indicators", []) + metadata.get(
                "child_progress_indicators", []
            )
            d = dict(
                query=key,
                short=lp.parse(key).short(),
                status=metadata.get("status", "none"),
                updated=metadata.get("updated", "?"),
                message=metadata.get("message", ""),
                progress=progress[:3],
            )
            if include_ready or d["status"] != Status.READY.value:
                data.append((key, d))
        data = [x[1] for x in sorted(data)]
        if reduce and len(data) > 1:  # a single entry needs no reduction (and must not be duplicated)
            reduced_data = [data[0]]
            for d, next_d in zip(data[1:], data[2:]):
                previous_d = reduced_data[-1]
                if not (
                    previous_d["status"] == Status.EVALUATING_PARENT.value
                    and d["status"] == Status.EVALUATING_PARENT.value
                    and d["query"].startswith(previous_d["query"])
                    and next_d["query"].startswith(d["query"])
                ):
                    reduced_data.append(d)
            reduced_data.append(data[-1])
            data = reduced_data
        return data
    except:
        return [
            dict(
                query="",
                status="not available",
                updated="",
                message=traceback.format_exc(),
                progress=[],
            )
        ]
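# Illustrative sketch of the reduction (hypothetical queries): a chain of
# "evaluating parent" entries where each query is a prefix of the next is
# collapsed to its endpoints:
#
#     a/b      evaluating parent        a/b      evaluating parent
#     a/b/c    evaluating parent   ->   a/b/c/d  ready
#     a/b/c/d  ready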
def dr(state, type_identifier=None, extension=None, context=None):
    """Decode resource

    Decodes the bytes into a data structure.
    This is meant to be used in connection with a resource query.
    The resource part of the query will typically fetch the data from a store
    and thus return bytes (together with metadata).
    The dr command converts the bytes (assuming proper metadata are provided)
    into a data structure.
    The metadata must contain a type_identifier (in metadata or
    metadata['resource_metadata']), a filename with an extension,
    or an extension with a known decoding.
    """
    from liquer.state_types import state_types_registry
    from liquer.parser import parse

    if state.data is None:
        context.error(f"Bytes expected, None received in dr from {state.query}")
        return
    if type_identifier is None:
        type_identifier = state.metadata.get(
            "type_identifier",
            state.metadata.get("resource_metadata", {}).get("type_identifier"),
        )
    if type_identifier in (None, "bytes"):
        type_identifier = state.metadata.get("resource_metadata", {}).get(
            "type_identifier"
        )
    if extension is None:
        extension = state.metadata.get("extension")
    if extension is None:
        query = state.metadata.get("query")
        if query is not None:
            filename = parse(query).filename()
            if filename is not None:
                v = filename.split(".")
                if len(v) > 1:
                    extension = v[-1]
                    context.info(f"Extension: {extension} - from query '{query}'")
        else:
            key = state.metadata.get("resource_metadata", {}).get("key")
            if key is not None:
                filename = context.store().key_name(key)
                v = filename.split(".")
                if len(v) > 1:
                    extension = v[-1]
                    context.info(f"Extension: {extension} - from key '{key}'")
    if type_identifier in (None, "bytes"):
        type_identifier = type_identifier_from_extension(extension)
        context.info(
            f"Type identifier: {type_identifier} - from extension '{extension}'"
        )
    if type_identifier is not None:
        if extension in ("parquet", "xlsx", "csv", "tsv") and type_identifier in (
            "generic",
            "dictionary",
            "pickle",
        ):
            context.warning(
                f"Type identifier '{type_identifier}' seems to be inconsistent with the extension '{extension}'"
            )
            context.warning(
                f"This might indicate a problem with executing the parent query '{context.parent_query}'"
            )
            type_identifier = type_identifier_by_extension.get(extension)
            context.warning(
                f"To fix the inconsistency, type identifier: {type_identifier} is used from extension '{extension}'"
            )
        context.info(f"Type identifier: {type_identifier}, Extension: {extension}")
        t = state_types_registry().get(type_identifier)
        data = t.from_bytes(state.data, extension=extension)
        return state.with_data(data)
    else:
        context.error("Decode resource (dr) command failed")
        raise Exception(
            f"Failed to resolve type for query {state.metadata.get('query')}"
        )
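# Illustrative usage sketch (hypothetical store key): a resource query returns
# raw bytes; dr decodes them into a data structure using the type identifier
# or the extension recovered from the metadata:
#
#     from liquer import evaluate
#
#     df = evaluate("-R/data/sales.parquet/-/dr").get()
#     # the parquet bytes are decoded via the type resolved from the
#     # ".parquet" extension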
def update_recipes(self):
    import yaml

    recipes = {}
    for key in self.substore.keys():
        spec = None
        if self.key_name(key) == self.RECIPES_FILE and not self.substore.is_dir(key):
            spec = yaml.load(self.substore.get_bytes(key), Loader=Loader)
            recipes_key = key
        if spec is not None:
            parent = self.parent_key(key)
            for directory, items in spec.items():
                for r in items:
                    if type(r) == str:
                        try:
                            query = parse(r)
                            filename = query.filename()
                            parent = self.parent_key(key)
                            if len(parent) > 0 and not parent.endswith("/"):
                                parent += "/"
                            rkey = (
                                f"{parent}{filename}"
                                if directory == self.LOCAL_RECIPES
                                else f"{parent}{directory}/{filename}"
                            )
                            recipes[rkey] = r
                            self.recipes_info[rkey] = dict(
                                query=r,
                                title=filename,
                                description="",
                                recipes_key=recipes_key,
                                recipes_directory=directory,
                            )
                        except:
                            traceback.print_exc()
                    elif isinstance(r, dict):
                        try:
                            query = parse(r["query"])
                            filename = r.get("filename", query.filename())
                            title = r.get("title", filename)
                            description = r.get("description", r["query"])
                            parent = self.parent_key(key)
                            if len(parent) > 0 and not parent.endswith("/"):
                                parent += "/"
                            rkey = (
                                f"{parent}{filename}"
                                if directory == self.LOCAL_RECIPES
                                else f"{parent}{directory}/{filename}"
                            )
                            recipes[rkey] = r["query"]
                            self.recipes_info[rkey] = dict(
                                query=r["query"],
                                title=title,
                                description=description,
                                recipes_key=recipes_key,
                                recipes_directory=directory,
                            )
                        except:
                            traceback.print_exc()
                    else:
                        print(f"Unsupported recipe type: {type(r)}")
    self._recipes = recipes
    return recipes
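# Illustrative sketch of a recipes file processed by update_recipes
# (hypothetical queries; assuming RECIPES_FILE == "recipes.yaml" and
# LOCAL_RECIPES == "local"). Top-level keys are target directories, each
# holding a list of recipes in either the string or the dict form:
#
#     local:
#       - "a/b/data.csv"
#       - query: "a/b/report.xlsx"
#         title: "Report"
#         description: "Monthly report"
#     reports:
#       - "c/d/summary.parquet"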