Beispiel #1
0
def tsplit_df(state, *columns):
    """Split a dataframe by columns and add a link for each generated query.

    Version of split_df expecting a first row with tags: the split itself is
    delegated to qtsplit_df and the first row of the link column is filled
    with an empty string. Every query from the remaining rows is extended by
    a "link" action, evaluated, and the result is stored in the link column.

    State variables used (with defaults): query_column ("query"),
    link_column ("link") and split_link_type ("url").
    """
    from liquer.parser import parse

    state = qtsplit_df(state, *columns)
    frame = state.get().copy()

    def var_or(name, default):
        # A state variable that is missing or None falls back to the default.
        value = state.vars.get(name)
        return default if value is None else value

    query_column = var_or("query_column", "query")
    link_column = var_or("link_column", "link")
    split_link_type = var_or("split_link_type", "url")

    # The first row carries tags, so it gets an empty link.
    links = [""]
    for q in list(frame[query_column])[1:]:
        link_query = parse(q).with_action("link", split_link_type).encode()
        links.append(evaluate(link_query).get())
    frame.loc[:, link_column] = links
    return state.with_data(frame)
Beispiel #2
0
def split_df(state, *columns):
    """Split a dataframe by columns.

    Creates a dataframe with unique (combinations of) values from the supplied
    columns and queries to obtain the corresponding filtered dataframes from
    the original dataframe.

    This behaves like qsplit_df, with two important differences:
    - each generated query is evaluated (and thus eventually cached)
    - a link is generated and put into the link column (state variable link_column)
    The split_link_type state variable is used to determine the link type; url by default.
    The query column name is taken from the query_column state variable ("query" by default).
    """
    from liquer.parser import parse

    state = qsplit_df(state, *columns)
    frame = state.get().copy()

    def var_or(name, default):
        # A state variable that is missing or None falls back to the default.
        value = state.vars.get(name)
        return default if value is None else value

    query_column = var_or("query_column", "query")
    link_column = var_or("link_column", "link")
    split_link_type = var_or("split_link_type", "url")

    links = []
    for q in frame[query_column]:
        link_query = parse(q).with_action("link", split_link_type).encode()
        links.append(evaluate(link_query).get())
    frame.loc[:, link_column] = links
    return state.with_data(frame)
Beispiel #3
0
 def apply(self, query, description=None):
     """Evaluate a query relative to the parent query of this context.

     The query is evaluated as given when there is no parent query (None,
     "" or "/") or when the (parsed) query is absolute. Otherwise the
     transform part of the query is appended to the parsed parent query and
     the combined, encoded query is evaluated.

     Raises an Exception when the query has no transform query.
     """
     self.debug(f"APPLY {query}")
     if self.parent_query in (None, "", "/"):
         self.debug(f"  no parent query in apply {query}")
         return self.evaluate(query, description=description)

     if isinstance(query, str):
         query = parse(query)

     if not query.absolute:
         transform = query.transform_query()
         if transform is None:
             raise Exception(
                 f"Only transform query supported in apply ({query} on {self.parent_query})"
             )
         combined = (parse(self.parent_query) + transform).encode()
         self.debug(f"apply {query} on {self.parent_query} yields {combined}")
         return self.evaluate(combined, description=description)

     self.debug(f"  absolute link in apply {query}")
     return self.evaluate(query, description=description)
Beispiel #4
0
 def to_query(cls, query):
     """Return the query as a pair (query text, Query object).

     A string is parsed, a Query is encoded, and None yields an empty string
     together with an empty Query. Any other type raises an Exception.
     """
     if isinstance(query, str):
         return query, parse(query)
     if isinstance(query, Query):
         return query.encode(), query
     if query is None:
         return "", Query()
     raise Exception(f"Unsupported query type: {type(query)}")
Beispiel #5
0
    def metadata(self):
        """Build the metadata dictionary for this context.

        Starts from self._metadata.as_dict() and overwrites it with the
        current values held by the context. Fallbacks applied:

        - title: when None, the filename of the parsed raw query (or "").
        - mimetype: when None and a query is available, derived from the
          query extension ("application/octet-stream" without an extension).
        - message: when empty, the message of the last "log" entry, then of
          the last "child_log" entry of the stored metadata.
        """
        metadata = self._metadata.as_dict()
        title = self.title
        description = self.description
        if title is None:
            if self.raw_query is None:
                title = ""
            else:
                title = parse(self.raw_query).filename() or ""

        mimetype = self.mimetype
        if mimetype is None and self.query is not None:
            extension = self.query.extension()
            if extension is None:
                mimetype = "application/octet-stream"
            else:
                mimetype = mimetype_from_extension(extension)

        message = self._metadata.message
        for log_key in ("log", "child_log"):
            if message not in (None, ""):
                break
            entries = self._metadata.get(log_key, [])
            if len(entries):
                message = entries[-1]["message"]

        metadata.update(
            {
                "status": self.status.value,
                "title": title,
                "description": description,
                "mimetype": mimetype,
                "query": self.raw_query,
                "parent_query": self.parent_query,
                "argument_queries": self.argument_queries,
                "is_error": self.is_error,
                "direct_subqueries": self.direct_subqueries[:],
                "progress_indicators": self.progress_indicators[:],
                "child_progress_indicators": self.child_progress_indicators[:],
                "child_log": self.child_log,
                "message": message,
                "started": self.started,
                "updated": self.now(),
                "created": self.created,
                "caching": self.caching,
                "vars": dict(self.vars),
                "html_preview": self.html_preview,
                "side_effect": False,
            }
        )
        return metadata
Beispiel #6
0
def resolve_recipe_definition(r, directory, metadata):
    """Resolve a recipe definition into a recipe dictionary.

    Parameters:
        r: recipe definition; either a query string or a dictionary.
        directory: value stored as "CWD" of a resolved query recipe.
        metadata: object whose warning(message, traceback=...) method is used
            to report recipes that could not be resolved.

    Returns:
        - For a query string: a "query" recipe dictionary, or None when the
          query could not be resolved.
        - For a dictionary describing a query recipe (type None or "query"
          and a "query" entry): a new "query" recipe dictionary.
        - For any other dictionary (or a query recipe that failed to
          resolve): the dictionary itself; "provides" is filled in place from
          "filename" when missing.
        - For any other type: the value unchanged (a message is printed).
    """
    if isinstance(r, str):
        try:
            filename = parse(r).filename()
        except Exception:
            metadata.warning(f"Can't resolve recipe '{r}'",
                             traceback=traceback.format_exc())
            print(f"Can't resolve recipe '{r}'")
            traceback.print_exc()
            return None
        return dict(type="query",
                    query=r,
                    CWD=directory,
                    filename=filename,
                    provides=[filename])

    if not isinstance(r, dict):
        # Unsupported definitions are passed through untouched; testing
        # '"filename" in r' on e.g. an int or None would raise TypeError.
        print(f"Unsupported recipe type: {type(r)}")
        return r

    if r.get("type") in (None, "query") and "query" in r:
        try:
            query = parse(r["query"])
            filename = r.get("filename", query.filename())
            title = r.get("title", filename)
            description = r.get("description",
                                f'Generated from query: {r["query"]}')
            # NOTE(review): the result is unused; the call is kept so that an
            # error raised by join_key is still reported as a warning below -
            # confirm whether it is needed at all.
            join_key(directory, filename)
            return dict(type="query",
                        query=r["query"],
                        title=title,
                        description=description,
                        CWD=directory,
                        filename=filename,
                        provides=[filename])
        except Exception:
            # Best effort: report the problem and fall through to returning
            # the original dictionary.
            metadata.warning("Can't resolve query recipe",
                             traceback=traceback.format_exc())
            traceback.print_exc()

    if "filename" in r and "provides" not in r:
        r["provides"] = [r["filename"]]
    return r
Beispiel #7
0
    def make_execution_context(self, tmpdir, store, context):
        """Create a datafusion execution context with registered parquet tables.

        Every query listed under "register" in self.data is registered:

        - A resource query pointing to a directory registers each contained
          key with the "parquet" extension; a resource query pointing to a
          single key registers that key. The bytes are copied from the root
          store into tmpdir and the table is named by the key name without
          extension.
        - Any other query must have a filename; it is evaluated and saved
          into tmpdir and the table is named by the part of the filename
          before the first dot. Queries without a filename are skipped with
          a warning.

        Queries that cannot be parsed are reported as a warning and skipped.

        Parameters:
            tmpdir: directory where the parquet files are materialized.
            store: store whose root store provides the resource data.
            context: context used for logging and for evaluating queries.

        Returns:
            The datafusion ExecutionContext with all tables registered.
        """
        import datafusion as daf

        ctx = daf.ExecutionContext()
        register = self.data.get("register", [])
        store = store.root_store()

        path = Path(tmpdir)
        for query in register:
            context.info(f"Register {query}")
            try:
                q = parse(query)
            except Exception:
                context.warning(
                    f"Could not parse query '{query}' in parquet_sql recipe {self.recipe_name()}",
                    traceback=traceback.format_exc())
                # Without a parsed query there is nothing to register;
                # continuing would use an unbound (or stale) q.
                continue
            if q.is_resource_query():
                key = q.resource_query().path()
                if store.is_dir(key):
                    context.info(f"Registering directory {key}")
                    for k in store.listdir_keys(key):
                        if store.is_dir(k) or key_extension(k) != "parquet":
                            continue
                        local_file = path / key_name(k)
                        local_file.write_bytes(store.get_bytes(k))
                        context.info(
                            f"Registering {key_name_without_extension(k)} from {key}"
                        )
                        ctx.register_parquet(key_name_without_extension(k),
                                             str(local_file))
                else:
                    local_file = path / key_name(key)
                    local_file.write_bytes(store.get_bytes(key))
                    context.info(f"Registering resource {key}")
                    ctx.register_parquet(key_name_without_extension(key),
                                         str(local_file))
            else:
                filename = q.filename()
                if filename is None:
                    context.warning(
                        f"Skipping '{query}' registering because it is lacking a filename"
                    )
                    continue
                # The table name is the filename up to the first dot.
                table_name = filename.split(".")[0]
                context.info(f"Evaluating query {query}")
                context.evaluate_and_save(query,
                                          target_directory=str(tmpdir),
                                          target_file=filename)
                context.info(f"Registering {table_name} from query {query}")
                ctx.register_parquet(table_name, str(path / filename))
        return ctx
Beispiel #8
0
def queries_status(include_ready=False, reduce=True):
    """Return a list describing the status of queries found in the cache.

    Each item is a dictionary with the keys query, short, status, updated,
    message and progress (at most the first three progress indicators,
    own indicators first, then child indicators). Items are ordered by the
    cache key. Cache entries without metadata are skipped.

    Parameters:
        include_ready: when False, entries with the READY status are left out.
        reduce: when True, an entry is dropped if it and the previously kept
            entry both have the EVALUATING_PARENT status, its query starts
            with the previous query and the following query starts with its
            query. The first and the last entry are always kept.

    Returns:
        The list of status dictionaries. If anything fails, a single entry
        with status "not available" and the traceback as its message.
    """
    import liquer.parser as lp
    import traceback

    try:
        cache = get_cache()
        data = []
        # Keys are visited in sorted order, so data ends up sorted by query.
        for key in sorted(cache.keys()):
            metadata = cache.get_metadata(key)
            if metadata is None:
                continue
            progress = metadata.get("progress_indicators", []) + metadata.get(
                "child_progress_indicators", [])
            d = dict(
                query=key,
                short=lp.parse(key).short(),
                status=metadata.get("status", "none"),
                updated=metadata.get("updated", "?"),
                message=metadata.get("message", ""),
                progress=progress[:3],
            )
            if include_ready or d["status"] != Status.READY.value:
                data.append(d)
        # With a single entry there is nothing to reduce; appending the last
        # entry unconditionally would report that entry twice.
        if reduce and len(data) > 1:
            evaluating_parent = Status.EVALUATING_PARENT.value
            reduced_data = [data[0]]
            for d, next_d in zip(data[1:], data[2:]):
                previous_d = reduced_data[-1]
                is_intermediate = (
                    previous_d["status"] == evaluating_parent
                    and d["status"] == evaluating_parent
                    and d["query"].startswith(previous_d["query"])
                    and next_d["query"].startswith(d["query"]))
                if not is_intermediate:
                    reduced_data.append(d)
            reduced_data.append(data[-1])
            data = reduced_data
        return data
    except Exception:
        # Best effort: the status listing must not fail, report the problem
        # as a single pseudo-entry instead.
        return [
            dict(
                query="",
                status="not available",
                updated="",
                message=traceback.format_exc(),
                progress=[],
            )
        ]
Beispiel #9
0
def dr(state, type_identifier=None, extension=None, context=None):
    """Decode resource
    Decodes the bytes into a data structure. This is meant to be used in connection to a resource query.
    Resource part of the query will typically fetch the data from a store and thus return bytes (together with metadata).
    Command dr will convert the bytes (assuming proper metadata are provided) into a data structure.
    The metadata must contain type_identifier in metadata or metadata['resource_metadata'], a filename with extension
    or extension with known decoding.

    Parameters:
        state: state holding the bytes in state.data and the metadata.
        type_identifier: type identifier to use; when None (or resolved to
            "bytes") it is looked up in the metadata and finally derived
            from the extension.
        extension: file extension; when None it is taken from the metadata,
            from the filename of the query, or from the name of the resource key.
        context: context used for logging.

    Returns:
        The state with decoded data, or None when state.data is None.

    Raises:
        Exception: when the type identifier can not be resolved.
    """
    from liquer.state_types import state_types_registry
    from liquer.parser import parse

    if state.data is None:
        context.error(
            f"Bytes expected, None received in dr from {state.query}")
        return

    if type_identifier is None:
        type_identifier = state.metadata.get(
            "type_identifier",
            state.metadata.get("resource_metadata", {}).get("type_identifier"),
        )

    if type_identifier in (None, "bytes"):
        type_identifier = state.metadata.get("resource_metadata",
                                             {}).get("type_identifier")

    if extension is None:
        extension = state.metadata.get("extension")
    if extension is None:
        query = state.metadata.get("query")
        # filename must be defined even when the metadata has no query.
        filename = None
        if query is not None:
            filename = parse(query).filename()
        if filename is not None:
            v = filename.split(".")
            if len(v) > 1:
                extension = v[-1]
                context.info(f"Extension: {extension} - from query '{query}'")
        else:
            key = state.metadata.get("resource_metadata", {}).get("key")
            if key is not None:
                filename = context.store().key_name(key)
            # Without a resource key there is no filename to inspect.
            if filename is not None:
                v = filename.split(".")
                if len(v) > 1:
                    extension = v[-1]
                    context.info(f"Extension: {extension} - from key '{key}'")

    if type_identifier in (None, "bytes"):
        type_identifier = type_identifier_from_extension(extension)
        context.info(
            f"Type identifier: {type_identifier} - from extension '{extension}'"
        )

    if type_identifier is None:
        context.error("Decode resource (dr) command failed")
        raise Exception(
            f"Failed to resolve type for query {state.metadata.get('query')}")

    # Generic type identifiers contradict a tabular file extension; in that
    # case the extension decides.
    if extension in ("parquet", "xlsx", "csv",
                     "tsv") and type_identifier in ("generic",
                                                    "dictionary",
                                                    "pickle"):
        context.warning(
            f"Type identifier '{type_identifier}' seems to be inconsistent with the extension '{extension}'"
        )
        context.warning(
            f"This might indicate a problem with executing the partent query '{context.parent_query}'"
        )
        type_identifier = type_identifier_by_extension.get(extension)
        context.warning(
            f"To fix the inconsistency, type identifier: {type_identifier} is used from extension '{extension}'"
        )

    context.info(
        f"Type identifier: {type_identifier},  Extension: {extension}")
    t = state_types_registry().get(type_identifier)
    data = t.from_bytes(state.data, extension=extension)
    return state.with_data(data)
Beispiel #10
0
    def update_recipes(self):
        """Rebuild the recipes from the recipe files found in the substore.

        Every key in the substore whose name equals RECIPES_FILE (and which
        is not a directory) is loaded as YAML. The loaded specification maps
        a directory name to a list of recipes; a recipe is either a query
        string or a dictionary with a "query" entry and optional "filename",
        "title" and "description" entries.

        The recipe key is built from the parent key of the recipe file, the
        directory (omitted when it equals LOCAL_RECIPES) and the filename.
        For each recipe an entry is stored into self.recipes_info. Recipes
        that fail to resolve are reported via a printed traceback and
        skipped; unsupported recipe types are reported by a printed message.

        Returns:
            Dictionary mapping recipe key to query; also stored in self._recipes.
        """
        import yaml

        recipes = {}
        for key in self.substore.keys():
            if self.key_name(
                    key
            ) != self.RECIPES_FILE or self.substore.is_dir(key):
                continue
            # NOTE(review): Loader is defined outside this block; if it is not
            # a safe loader, yaml.load should only see trusted recipe files -
            # confirm.
            spec = yaml.load(self.substore.get_bytes(key), Loader=Loader)
            if spec is None:
                continue
            recipes_key = key
            parent = self.parent_key(key)
            if len(parent) > 0 and not parent.endswith("/"):
                parent += "/"
            for directory, items in spec.items():
                prefix = (parent if directory == self.LOCAL_RECIPES else
                          f"{parent}{directory}/")
                # A directory without recipes is loaded as None.
                for r in items or []:
                    try:
                        if isinstance(r, str):
                            query_text = r
                            filename = parse(r).filename()
                            title = filename
                            description = ""
                        elif isinstance(r, dict):
                            query_text = r["query"]
                            filename = r.get("filename",
                                             parse(query_text).filename())
                            title = r.get("title", filename)
                            description = r.get("description", query_text)
                        else:
                            print(f"Unsupported recipe type: {type(r)}")
                            continue
                        rkey = f"{prefix}{filename}"
                        recipes[rkey] = query_text
                        self.recipes_info[rkey] = dict(
                            query=query_text,
                            title=title,
                            description=description,
                            recipes_key=recipes_key,
                            recipes_directory=directory,
                        )
                    except Exception:
                        # Best effort: one broken recipe must not prevent
                        # the remaining recipes from being registered.
                        traceback.print_exc()
        self._recipes = recipes
        return recipes