Example #1
0
    def read_opencypher(self, query: str, headers: Any = None) -> Any:
        """Run an openCypher query against the Neptune HTTP endpoint.

        Parameters
        ----------
        query : str
            The openCypher query string to run.
        headers : Any, optional
            Extra HTTP headers to attach to the request. Defaults to None.

        Returns
        -------
        Any
            The ``results`` payload of the JSON response.

        Raises
        ------
        exceptions.QueryFailed
            If the server responds with a non-OK status.
        """
        if headers is None:
            headers = {}

        # Default the content type only when the caller did not set one.
        headers.setdefault("content-type", "application/x-www-form-urlencoded")

        endpoint = f"{HTTP_PROTOCOL}://{self.host}:{self.port}/openCypher"
        payload = {"query": query}

        request = self._prepare_request("POST", endpoint, data=payload, headers=headers)
        response = self._http_session.send(request)
        _logger.debug(response)
        if not response.ok:
            raise exceptions.QueryFailed(
                f"Status Code: {response.status_code} Reason: {response.reason} Message: {response.text}"
            )
        return response.json()["results"]
Example #2
0
def wait_query(query_execution_id: str, boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """Block until the Athena query reaches a final state.

    Parameters
    ----------
    query_execution_id : str
        Athena query execution ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        Dictionary with the get_query_execution response.

    Examples
    --------
    >>> import awswrangler as wr
    >>> res = wr.athena.wait_query(query_execution_id='query-execution-id')

    """
    client_athena: boto3.client = _utils.client(service_name="athena", session=boto3_session)
    # Poll until the execution leaves the set of in-flight states.
    while True:
        response: Dict[str, Any] = client_athena.get_query_execution(QueryExecutionId=query_execution_id)
        status: Dict[str, Any] = response["QueryExecution"]["Status"]
        state: str = status["State"]
        if state in _QUERY_FINAL_STATES:
            break
        time.sleep(_QUERY_WAIT_POLLING_DELAY)
    _logger.debug("state: %s", state)
    _logger.debug("StateChangeReason: %s", status.get("StateChangeReason"))
    if state == "FAILED":
        raise exceptions.QueryFailed(status.get("StateChangeReason"))
    if state == "CANCELLED":
        raise exceptions.QueryCancelled(status.get("StateChangeReason"))
    return cast(Dict[str, Any], response["QueryExecution"])
Example #3
0
 def _execute_gremlin(self,
                      query: str,
                      headers: Any = None) -> List[Dict[str, Any]]:
     """Submit a Gremlin query and return its results as a list of dicts.

     Parameters
     ----------
     query : str
         The Gremlin query to submit.
     headers : Any, optional
         Headers used when (re)establishing the Gremlin connection. Defaults to None.

     Returns
     -------
     List[Dict[str, Any]]
         The parsed query results.

     Raises
     ------
     exceptions.QueryFailed
         If the query fails; the underlying exception is chained as the cause.
     """
     try:
         c = self._get_gremlin_connection(headers)
         result = c.submit(query)
         future_results = result.all()
         results = future_results.result()
         return GremlinParser.gremlin_results_to_dict(results)
     except Exception as e:
         # Drop the (possibly broken) cached connection so the next call reconnects.
         if isinstance(self.gremlin_connection, client.Client):
             self.gremlin_connection.close()
         self.gremlin_connection = None
         _logger.error(e)
         # Chain the original exception so the root cause is preserved in tracebacks.
         raise exceptions.QueryFailed(e) from e
Example #4
0
def wait_query(
        query_id: str,
        boto3_session: Optional[boto3.Session] = None) -> Dict[str, Any]:
    """Block until a CloudWatch Logs Insights query reaches a final state.

    https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/CWL_QuerySyntax.html

    Parameters
    ----------
    query_id : str
        Query ID.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.

    Returns
    -------
    Dict[str, Any]
        Query result payload.

    Examples
    --------
    >>> import awswrangler as wr
    >>> query_id = wr.cloudwatch.start_query(
    ...     log_group_names=["loggroup"],
    ...     query="fields @timestamp, @message | sort @timestamp desc | limit 5",
    ... )
    ... response = wr.cloudwatch.wait_query(query_id=query_id)

    """
    terminal_states: List[str] = ["Complete", "Failed", "Cancelled"]
    client_logs: boto3.client = _utils.client(service_name="logs",
                                              session=boto3_session)
    # Poll until the query leaves the running states.
    while True:
        response: Dict[str, Any] = client_logs.get_query_results(queryId=query_id)
        status: str = response["status"]
        if status in terminal_states:
            break
        time.sleep(_QUERY_WAIT_POLLING_DELAY)
    _logger.debug("status: %s", status)
    if status == "Failed":
        raise exceptions.QueryFailed(f"query ID: {query_id}")
    if status == "Cancelled":
        raise exceptions.QueryCancelled(f"query ID: {query_id}")
    return response
Example #5
0
def write_gremlin_df(client: NeptuneClient, df: pd.DataFrame,
                     mode: WriteDFType, batch_size: int) -> bool:
    """Write the provided dataframe to Neptune using Gremlin.

    Rows are appended to a single traversal which is flushed every
    ``batch_size`` rows; any remainder is flushed at the end.

    Parameters
    ----------
    client : NeptuneClient
        The Neptune client to write the dataframe
    df : pd.DataFrame
        The dataframe to write
    mode : WriteDFType
        The type of dataframe to write
    batch_size : int
        The size of the batch to write

    Returns
    -------
    bool
        True if the write operation succeeded

    Raises
    ------
    exceptions.QueryFailed
        If one of the intermediate batch inserts fails.
    """
    traversal = Graph().traversal()
    for (row_index, row) in df.iterrows():
        row_data = row.to_dict()
        # Extend the traversal according to the requested write mode.
        if mode == WriteDFType.EDGE:
            traversal = _build_gremlin_edges(traversal, row_data)
        elif mode == WriteDFType.VERTEX:
            traversal = _build_gremlin_vertices(traversal, row_data)
        else:
            traversal = _build_gremlin_update(traversal, row_data)
        # Flush a full batch and start a fresh traversal on success.
        if row_index > 0 and row_index % batch_size == 0:
            insert_ok = _run_gremlin_insert(client, traversal)
            if not insert_ok:
                _logger.debug(insert_ok)
                raise exceptions.QueryFailed(
                    """Failed to insert part or all of the data in the DataFrame, please check the log output."""
                )
            traversal = Graph().traversal()

    # Flush whatever is left over after the loop.
    return _run_gremlin_insert(client, traversal)
Example #6
0
    def _execute_sparql(self, query: str, headers: Any) -> Any:
        """Execute a SPARQL query or update against the Neptune SPARQL endpoint.

        Read-style queries (SELECT/CONSTRUCT/ASK/DESCRIBE) are sent as a
        ``query`` parameter, everything else as an ``update`` parameter.
        Raises ``exceptions.QueryFailed`` on a non-OK HTTP response.
        """
        if headers is None:
            headers = {}

        # Let SPARQLWrapper classify the query so we pick the right form field.
        wrapper = SPARQLWrapper("")
        wrapper.setQuery(query)
        is_read_query = wrapper.queryType.upper() in ["SELECT", "CONSTRUCT", "ASK", "DESCRIBE"]
        data = {"query": query} if is_read_query else {"update": query}

        headers.setdefault("content-type", "application/x-www-form-urlencoded")

        endpoint = f"{HTTP_PROTOCOL}://{self.host}:{self.port}/sparql"
        request = self._prepare_request("POST", endpoint, data=data, headers=headers)
        response = self._http_session.send(request)
        _logger.debug(response)
        if not response.ok:
            raise exceptions.QueryFailed(
                f"Status Code: {response.status_code} Reason: {response.reason} Message: {response.text}"
            )
        return response.json()
Example #7
0
def _get_query_metadata(  # pylint: disable=too-many-statements
    query_execution_id: str,
    boto3_session: boto3.Session,
    categories: Optional[List[str]] = None,
    query_execution_payload: Optional[Dict[str, Any]] = None,
) -> _QueryMetadata:
    """Get query metadata.

    Builds the pandas parsing configuration (dtypes, timestamp/date columns,
    decimal converters, binary columns) for an Athena query's result set,
    waiting for the query to finish when a final-state payload is not supplied.

    Parameters
    ----------
    query_execution_id : str
        Athena query execution ID.
    boto3_session : boto3.Session
        Boto3 session used for the Athena API calls.
    categories : List[str], optional
        Columns to load as pandas ``category`` dtype.
    query_execution_payload : Dict[str, Any], optional
        A pre-fetched ``QueryExecution`` payload; only used if it is already
        in a final state, otherwise the query is awaited.

    Returns
    -------
    _QueryMetadata
        The assembled query metadata.

    Raises
    ------
    exceptions.QueryFailed
        If the query finished in a state other than SUCCEEDED.
    exceptions.UnsupportedType
        If a column has an array/row type (unsupported with ctas_approach=False).
    """
    if (query_execution_payload is not None) and (query_execution_payload["Status"]["State"] in _QUERY_FINAL_STATES):
        if query_execution_payload["Status"]["State"] != "SUCCEEDED":
            reason: str = query_execution_payload["Status"]["StateChangeReason"]
            raise exceptions.QueryFailed(f"Query error: {reason}")
        _query_execution_payload: Dict[str, Any] = query_execution_payload
    else:
        # No usable payload supplied: block until the query reaches a final state.
        _query_execution_payload = wait_query(query_execution_id=query_execution_id, boto3_session=boto3_session)
    cols_types: Dict[str, str] = get_query_columns_types(
        query_execution_id=query_execution_id, boto3_session=boto3_session
    )
    _logger.debug("cols_types: %s", cols_types)
    dtype: Dict[str, str] = {}
    parse_timestamps: List[str] = []
    parse_dates: List[str] = []
    converters: Dict[str, Any] = {}
    binaries: List[str] = []
    col_name: str
    col_type: str
    for col_name, col_type in cols_types.items():
        if col_type == "array":
            raise exceptions.UnsupportedType(
                "List data type is not supported with ctas_approach=False. "
                "Please use ctas_approach=True for List columns."
            )
        if col_type == "row":
            raise exceptions.UnsupportedType(
                "Struct data type is not supported with ctas_approach=False. "
                "Please use ctas_approach=True for Struct columns."
            )
        pandas_type: str = _data_types.athena2pandas(dtype=col_type)
        if (categories is not None) and (col_name in categories):
            dtype[col_name] = "category"
        elif pandas_type in ["datetime64", "date"]:
            parse_timestamps.append(col_name)
            if pandas_type == "date":
                parse_dates.append(col_name)
        elif pandas_type == "bytes":
            # Binary columns arrive as strings; they are decoded downstream.
            dtype[col_name] = "string"
            binaries.append(col_name)
        elif pandas_type == "decimal":
            # Athena renders NULL decimals as empty/placeholder strings; map those to None.
            converters[col_name] = lambda x: Decimal(str(x)) if str(x) not in ("", "none", " ", "<NA>") else None
        else:
            dtype[col_name] = pandas_type

    output_location: Optional[str] = None
    if "ResultConfiguration" in _query_execution_payload:
        output_location = _query_execution_payload["ResultConfiguration"].get("OutputLocation")

    athena_statistics: Dict[str, Union[int, str]] = _query_execution_payload.get("Statistics", {})
    # Bug fix: str(...) on a missing key produced the literal string "None";
    # preserve a real None when DataManifestLocation is absent.
    data_manifest_location = athena_statistics.get("DataManifestLocation")
    manifest_location: Optional[str] = str(data_manifest_location) if data_manifest_location is not None else None

    query_metadata: _QueryMetadata = _QueryMetadata(
        execution_id=query_execution_id,
        dtype=dtype,
        parse_timestamps=parse_timestamps,
        parse_dates=parse_dates,
        converters=converters,
        binaries=binaries,
        output_location=output_location,
        manifest_location=manifest_location,
        raw_payload=_query_execution_payload,
    )
    _logger.debug("query_metadata:\n%s", query_metadata)
    return query_metadata