def test_vDF_pivot_table_chi2(self, titanic_vd):
     result = titanic_vd.pivot_table_chi2("survived")
     assert result["chi2"][0] == pytest.approx(345.12775126385327)
     assert result["chi2"][1] == pytest.approx(139.2026595859198)
     assert result["chi2"][2] == pytest.approx(103.48373526643627)
     assert result["chi2"][3] == pytest.approx(53.47413727076916)
     assert result["chi2"][4] == pytest.approx(44.074789072247576)
     assert result["chi2"][5] == pytest.approx(42.65927519250441)
     assert result["chi2"][6] == pytest.approx(34.54227539207669)
     assert result["chi2"][7] == pytest.approx(-2.6201263381153694e-14)
     assert result["p_value"][0] == pytest.approx(4.8771014178794746e-77)
     assert result["p_value"][1] == pytest.approx(6.466448884215497e-32)
     assert result["p_value"][2] == pytest.approx(5.922792773022131e-31)
     assert result["p_value"][3] == pytest.approx(2.9880194205753303e-09)
     assert result["p_value"][4] == pytest.approx(7.143900900120299e-08)
     assert result["p_value"][5] == pytest.approx(5.4532585750903e-10)
     assert result["p_value"][6] == pytest.approx(0.0028558258758971957)
     set_option("random_state", 0)
     result = titanic_vd.pivot_table_chi2("survived", method="smart")
     assert result["chi2"][0] == pytest.approx(345.12775126385327)
     assert result["chi2"][1] == pytest.approx(187.75090682844288)
     assert result["chi2"][2] == pytest.approx(139.2026595859198)
     assert result["chi2"][3] == pytest.approx(53.474137270768885)
     assert result["chi2"][4] == pytest.approx(44.074789072247576)
     assert result["chi2"][5] == pytest.approx(42.65927519250441)
     assert result["chi2"][6] == pytest.approx(38.109904618027755)
     assert result["chi2"][7] == pytest.approx(0.0)
Beispiel #2
0
# See the License for the specific language governing permissions and
# limitations under the License.

# Pytest
import pytest

# Standard Python Modules
import warnings, os

# VerticaPy
import verticapy
from verticapy import drop, set_option, tablesample
from verticapy.datasets import load_titanic
from verticapy.sql import sql

set_option("print_info", False)


@pytest.fixture(scope="module")
def titanic_vd():
    titanic = load_titanic()
    yield titanic
    drop(name="public.titanic")


class TestSQL:
    def test_sql(self, titanic_vd):

        # SQL line Test -c
        result = sql('  -c "SELECT * FROM titanic;"', "")
        assert result.shape() == (1234, 14)
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest, warnings
from verticapy import vDataFrame
from verticapy import drop_table

from verticapy import set_option

set_option("print_info", False)
set_option("random_state", 0)


@pytest.fixture(scope="module")
def smart_meters_vd(base):
    from verticapy.learn.datasets import load_smart_meters

    smart_meters = load_smart_meters(cursor=base.cursor)
    yield smart_meters
    with warnings.catch_warnings(record=True) as w:
        drop_table(
            name="public.smart_meters", cursor=base.cursor,
        )

Beispiel #4
0
def sql(line, cell="", local_ns=None):

    # We don't want to display the query/time twice if the options are still on
    # So we save the previous configuration and turn them off.
    sql_on, time_on = verticapy.options["sql_on"], verticapy.options["time_on"]
    set_option("sql_on", False)
    set_option("time_on", False)

    try:

        # Initialization
        queries = "" if (not (cell) and (line)) else cell

        # Options
        options = {}
        all_options_dict = get_magic_options(line)

        for option in all_options_dict:

            if option.lower() in (
                    "-f",
                    "--file",
                    "-o",
                    "--output",
                    "-nrows",
                    "-ncols",
                    "-c",
                    "--command",
            ):

                if option.lower() in ("-f", "--file"):
                    if "-f" in options:
                        raise ParameterError("Duplicate option '-f'.")
                    options["-f"] = all_options_dict[option]
                elif option.lower() in ("-o", "--output"):
                    if "-o" in options:
                        raise ParameterError("Duplicate option '-o'.")
                    options["-o"] = all_options_dict[option]
                elif option.lower() in ("-c", "--command"):
                    if "-c" in options:
                        raise ParameterError("Duplicate option '-c'.")
                    options["-c"] = all_options_dict[option]
                elif option.lower() in ("-nrows", ):
                    if "-nrows" in options:
                        raise ParameterError("Duplicate option '-nrows'.")
                    options["-nrows"] = int(all_options_dict[option])
                elif option.lower() in ("-ncols", ):
                    if "-ncols" in options:
                        raise ParameterError("Duplicate option '-ncols'.")
                    options["-ncols"] = int(all_options_dict[option])

            elif verticapy.options["print_info"]:
                warning_message = (
                    f"\u26A0 Warning : The option '{option}' doesn't "
                    "exist, it was skipped.")
                warnings.warn(warning_message, Warning)

        if "-f" in options and "-c" in options:
            raise ParameterError("Do not find which query to run: One of "
                                 "the options '-f' and '-c' must be empty.")

        if cell and ("-f" in options or "-c" in options):
            raise ParameterError(
                "Cell must be empty when using options '-f' or '-c'.")

        if "-f" in options:
            f = open(options["-f"], "r")
            queries = f.read()
            f.close()

        elif "-c" in options:
            queries = options["-c"]

        # Cleaning the Query
        queries = clean_query(queries)
        queries = replace_vars_in_query(queries, locals()["local_ns"])

        n, i, all_split = len(queries), 0, []

        while i < n and queries[n - i - 1] in (";", " ", "\n"):
            i += 1

        queries = queries[:n - i]
        i, n = 0, n - i

        while i < n:

            if queries[i] == '"':
                i += 1
                while i < n and queries[i] != '"':
                    i += 1
            elif queries[i] == "'":
                i += 1
                while i < n and queries[i] != "'":
                    i += 1
            elif queries[i] == ";":
                all_split += [i]
            i += 1

        all_split = [0] + all_split + [n]
        m = len(all_split)
        queries = [
            queries[all_split[i]:all_split[i + 1]] for i in range(m - 1)
        ]
        n = len(queries)

        for i in range(n):

            query = queries[i]
            while len(query) > 0 and (query[-1] in (";", " ")):
                query = query[0:-1]
            while len(query) > 0 and (query[0] in (";", " ")):
                query = query[1:]
            queries[i] = query

        queries_tmp, i = [], 0

        while i < n:

            query = queries[i]
            if (i < n - 1) and (queries[i + 1].lower() == "end"):
                query += "; {}".format(queries[i + 1])
                i += 1
            queries_tmp += [query]
            i += 1

        queries, n = queries_tmp, len(queries_tmp)
        result, start_time = None, time.time()

        # Executing the Queries

        for i in range(n):

            query = queries[i]

            if query.split(" ")[0]:
                query_type = query.split(" ")[0].upper()
            else:
                query_type = query.split(" ")[1].upper()

            if len(query_type) > 1 and query_type[0:2] in ("/*", "--"):
                query_type = "undefined"

            if (query_type == "COPY") and ("from local" in query.lower()):

                query = re.split("from local", query, flags=re.IGNORECASE)
                if query[1].split(" ")[0]:
                    file_name = query[1].split(" ")[0]
                else:
                    file_name = query[1].split(" ")[1]
                query = ("".join(query[0]) + "FROM" +
                         "".join(query[1]).replace(file_name, "STDIN"))
                if (file_name[0] == file_name[-1]) and (file_name[0]
                                                        in ('"', "'")):
                    file_name = file_name[1:-1]

                executeSQL(query,
                           method="copy",
                           path=file_name,
                           print_time_sql=False)

            elif (i < n - 1) or (
                (i == n - 1) and
                (query_type.lower() not in ("select", "with", "undefined"))):

                executeSQL(query, print_time_sql=False)
                if verticapy.options["print_info"]:
                    print(query_type)

            else:

                error = ""
                try:
                    result = vDataFrameSQL("({}) x".format(query))
                    # Display parameters
                    if "-nrows" in options:
                        result._VERTICAPY_VARIABLES_["max_rows"] = options[
                            "-nrows"]
                    if "-ncols" in options:
                        result._VERTICAPY_VARIABLES_["max_columns"] = options[
                            "-ncols"]

                except:
                    try:
                        final_result = executeSQL(query,
                                                  method="fetchfirstelem",
                                                  print_time_sql=False)
                        if final_result and verticapy.options["print_info"]:
                            print(final_result)
                        elif verticapy.options["print_info"]:
                            print(query_type)
                    except Exception as e:
                        error = e

                if error:
                    raise QueryError(error)

        # Displaying the information

        elapsed_time = round(time.time() - start_time, 3)

        if verticapy.options["print_info"]:
            display(HTML(f"<div><b>Execution: </b> {elapsed_time}s</div>"))

        # Exporting the result

        if isinstance(result, vDataFrame) and "-o" in options:

            if options["-o"][-4:] == "json":
                result.to_json(options["-o"])
            else:
                result.to_csv(options["-o"])

        # we load the previous configuration before returning the result.
        set_option("sql_on", sql_on)
        set_option("time_on", time_on)

        return result

    except:

        # If it fails, we load the previous configuration before raising the error.
        set_option("sql_on", sql_on)
        set_option("time_on", time_on)

        raise