def load_dataset(schema: str, name: str, dtype: dict, copy_cols: list = None, dataset_name: str = ""):
    """
    General function to ingest one of the CSV datasets bundled with verticapy.

    If a relation with the same name and schema already exists, it is simply
    wrapped in a vDataFrame; otherwise the table is created and the CSV file
    shipped in verticapy/data/ is loaded into it.

    Parameters
    ----------
    schema: str
        Schema of the new relation. An empty string falls back to
        'v_temp_schema'.
    name: str
        Name of the new relation.
    dtype: dict
        Mapping of column name to Vertica data type, used both to create the
        table and (by default) to build the COPY column list.
    copy_cols: list, optional
        Explicit column list for the COPY statement. If empty or None, all
        columns of 'dtype' are used.
    dataset_name: str, optional
        Base name of the CSV file (verticapy/data/<dataset_name>.csv).

    Returns
    -------
    vDataFrame
        vDataFrame of the ingested (or pre-existing) relation.
    """
    # FIX: the default for copy_cols was a mutable list literal ([]), a
    # classic shared-mutable-default pitfall; None keeps behavior identical.
    if copy_cols is None:
        copy_cols = []
    check_types([("schema", schema, [str]), ("name", name, [str])])
    try:
        # Fast path: the relation already exists.
        vdf = vDataFrame(name, schema=schema)
    # FIX: bare 'except:' also caught SystemExit/KeyboardInterrupt and then
    # went on to create the table; catch only real errors.
    except Exception:
        name = quote_ident(name)
        schema = "v_temp_schema" if not schema else quote_ident(schema)
        create_table(table_name=name, dtype=dtype, schema=schema)
        try:
            path = os.path.dirname(verticapy.__file__)
            path += f"/data/{dataset_name}.csv"
            if not copy_cols:
                copy_cols = [quote_ident(col) for col in dtype]
            copy_cols = ", ".join(copy_cols)
            query = ("COPY {0}.{1}({2}) FROM {3} DELIMITER ',' NULL '' "
                     "ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;").format(
                schema, name, copy_cols, "{}")
            cur = current_cursor()
            if isinstance(cur, vertica_python.vertica.cursor.Cursor):
                # Native vertica-python cursor: stream the file from the
                # client side through STDIN.
                query = query.format("STDIN")
                executeSQL(query, title="Ingesting the data.", method="copy",
                           path=path)
            else:
                # Other DB-API cursors: let the server read the local file.
                query = query.format("LOCAL '{0}'".format(path))
                executeSQL(query, title="Ingesting the data.")
            executeSQL("COMMIT;", title="Commit.")
            vdf = vDataFrame(name, schema=schema)
        except Exception:
            # Remove the half-created table before propagating the error.
            drop(schema + "." + name, method="table")
            raise
    return vdf
def test_vDF_sql(self, titanic_vd):
    # The query deliberately carries a SQL comment, an escaped newline and a
    # doubled trailing semicolon to exercise the vDataFrame SQL sanitizer.
    query = """-- Selecting some columns \n SELECT age, fare FROM titanic WHERE age IS NOT NULL;;"""
    full_vdf = vDataFrame(sql=query)
    assert full_vdf.shape() == (997, 2)
    # Restricting to a single column must keep the same row count.
    restricted_vdf = vDataFrame(sql=query, usecols=["age"])
    assert restricted_vdf.shape() == (997, 1)
def load_dataset_num(table_name: str = "dataset_num", schema: str = "public"):
    """Create and return a small purely-numerical test dataset (10 rows)."""
    rows = [
        [1, 7.2, 3.6, 6.1, 2.5],
        [2, 7.7, 2.8, 6.7, 2.0],
        [3, 7.7, 3.0, 6.1, 2.3],
        [4, 7.9, 3.8, 6.4, 2.0],
        [5, 4.4, 2.9, 1.4, 0.2],
        [6, 4.6, 3.6, 1.0, 0.2],
        [7, 4.7, 3.2, 1.6, 0.2],
        [8, 6.5, 2.8, 4.6, 1.5],
        [9, 6.8, 2.8, 4.8, 1.4],
        [10, 7.0, 3.2, 4.7, 1.4],
    ]
    relation = "{}.{}".format(quote_ident(schema), quote_ident(table_name))
    # Start from a clean slate, then recreate the table and bulk-insert.
    drop(name=relation, method="table")
    columns = {"Id": "INT"}
    for idx in range(1, 5):
        columns["col{}".format(idx)] = "FLOAT"
    create_table(table_name=table_name, schema=schema, dtype=columns)
    insert_into(table_name=table_name, schema=schema, data=rows, copy=False)
    return vDataFrame(input_relation=relation)
def tr_data_vd(base):
    """Fixture: build public.tr_data, yield it as a vDataFrame, then drop it."""
    base.cursor.execute("DROP TABLE IF EXISTS public.tr_data")
    base.cursor.execute(
        'CREATE TABLE IF NOT EXISTS public.tr_data(Id INT, transportation INT, gender VARCHAR, "owned cars" INT, cost VARCHAR, income CHAR(4))'
    )
    # One INSERT statement per row, exactly as the server would receive them.
    value_tuples = [
        "(1, 0, 'Male', 0, 'Cheap', 'Low')",
        "(2, 0, 'Male', 1, 'Cheap', 'Med')",
        "(3, 1, 'Female', 1, 'Cheap', 'Med')",
        "(4, 0, 'Female', 0, 'Cheap', 'Low')",
        "(5, 0, 'Male', 1, 'Cheap', 'Med')",
        "(6, 1, 'Male', 0, 'Standard', 'Med')",
        "(7, 1, 'Female', 1, 'Standard', 'Med')",
        "(8, 2, 'Female', 1, 'Expensive', 'Hig')",
        "(9, 2, 'Male', 2, 'Expensive', 'Med')",
        "(10, 2, 'Female', 2, 'Expensive', 'Hig')",
    ]
    for values in value_tuples:
        base.cursor.execute("INSERT INTO tr_data VALUES " + values)
    base.cursor.execute("COMMIT")
    tr_data = vDataFrame(input_relation="public.tr_data", cursor=base.cursor)
    yield tr_data
    # Teardown: drop() may warn if the relation is gone; keep the log quiet.
    with warnings.catch_warnings(record=True) as w:
        drop(name="public.tr_data", cursor=base.cursor)
def test_creating_vDF_using_input_relation_vcolumns(
        self, base, titanic_vd):
    # Building a vDataFrame restricted to two columns must still expose the
    # full row count of the underlying relation.
    subset = vDataFrame(
        input_relation="public.titanic",
        usecols=["age", "survived"],
        cursor=base.cursor,
    )
    assert subset["survived"].count() == 1234
def load_dataset_reg(table_name: str = "dataset_reg", schema: str = "public"):
    """Create and return a small mixed-type test dataset (10 rows) for regression."""
    rows = [
        [1, 0, "Male", 0, "Cheap", "Low"],
        [2, 0, "Male", 1, "Cheap", "Med"],
        [3, 1, "Female", 1, "Cheap", "Med"],
        [4, 0, "Female", 0, "Cheap", "Low"],
        [5, 0, "Male", 1, "Cheap", "Med"],
        [6, 1, "Male", 0, "Standard", "Med"],
        [7, 1, "Female", 1, "Standard", "Med"],
        [8, 2, "Female", 1, "Expensive", "Hig"],
        [9, 2, "Male", 2, "Expensive", "Med"],
        [10, 2, "Female", 2, "Expensive", "Hig"],
    ]
    relation = "{}.{}".format(quote_ident(schema), quote_ident(table_name))
    # Drop any previous copy, then recreate and populate the table.
    drop(name=relation, method="table")
    schema_def = {
        "Id": "INT",
        "transportation": "INT",
        "gender": "VARCHAR",
        "owned cars": "INT",
        "cost": "VARCHAR",
        "income": "CHAR(4)",
    }
    create_table(table_name=table_name, schema=schema, dtype=schema_def)
    insert_into(table_name=table_name, schema=schema, data=rows, copy=False)
    return vDataFrame(input_relation=relation)
def Balance(
    name: str,
    input_relation: str,
    y: str,
    cursor=None,
    method: str = "hybrid",
    ratio: float = 0.5,
):
    """
---------------------------------------------------------------------------
Creates a view with an equal distribution of the input data based on the
response_column.

Parameters
----------
name: str
    Name of the view.
input_relation: str
    Relation to use to create the new relation.
y: str
    Response column.
cursor: DBcursor, optional
    Vertica DB cursor.
method: str, optional
    Method to use to do the balancing.
        hybrid : Performs over-sampling and under-sampling on different
            classes so each class is equally represented.
        over   : Over-samples on all classes, with the exception of the
            most majority class, towards the most majority class's
            cardinality.
        under  : Under-samples on all classes, with the exception of the
            most minority class, towards the most minority class's
            cardinality.
ratio: float, optional
    The desired ratio between the majority class and the minority class.
    This value has no effect when used with balance method 'hybrid'.

Returns
-------
vDataFrame
    vDataFrame of the created view
    """
    # FIX: lower-case the method BEFORE validating it. Previously the
    # lowercasing happened after check_types, so documented values passed
    # with a different case (e.g. 'Hybrid') were rejected.
    method = method.lower()
    check_types(
        [
            ("name", name, [str],),
            ("input_relation", input_relation, [str],),
            ("y", y, [str],),
            ("method", method, ["hybrid", "over", "under"],),
            ("ratio", ratio, [float],),
        ]
    )
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    # BALANCE requires Vertica >= 8.1.1.
    version(cursor=cursor, condition=[8, 1, 1])
    sql = "SELECT BALANCE('{}', '{}', '{}', '{}_sampling' USING PARAMETERS sampling_ratio = {})".format(
        name, input_relation, y, method, ratio
    )
    cursor.execute(sql)
    return vDataFrame(name, cursor)
def test_to_memmodel(self, model):
    mmodel = model.to_memmodel()
    samples = [["Male", 0, "Cheap", "Low"], ["Female", 3, "Expensive", "Hig"]]
    # In-memory class predictions must agree with the to_python() callable.
    res = mmodel.predict(samples)
    res_py = model.to_python()(samples)
    for row in range(2):
        assert res[row] == res_py[row]
    # Same agreement for the per-class probabilities (2 rows x 3 classes).
    res = mmodel.predict_proba(samples)
    res_py = model.to_python(return_proba=True)(samples)
    for row in range(2):
        for cls in range(3):
            assert res[row][cls] == res_py[row][cls]
    vdf = vDataFrame("public.xgbc_data")
    predictors = ['"Gender"', '"owned cars"', '"cost"', '"income"']
    vdf["prediction_sql"] = mmodel.predict_sql(predictors)
    proba_sql = mmodel.predict_proba_sql(predictors)
    for cls in range(3):
        vdf["prediction_proba_sql_{}".format(cls)] = proba_sql[cls]
    # In-database predictions for comparison against the SQL expressions.
    model.predict(vdf, name="prediction_vertica_sql")
    for cls in range(3):
        model.predict_proba(
            vdf,
            name="prediction_proba_vertica_sql_{}".format(cls),
            pos_label=model.classes_[cls],
        )
    score = vdf.score("prediction_sql", "prediction_vertica_sql", "accuracy")
    assert score == pytest.approx(1.0)
    for cls in range(3):
        score = vdf.score(
            "prediction_proba_sql_{}".format(cls),
            "prediction_proba_vertica_sql_{}".format(cls),
            "r2",
        )
        assert score == pytest.approx(1.0)
def load_dataset(cursor, schema: str, name: str, str_create: str, str_copy: str,
                 dataset_name: str):
    """
    General function to ingest one of the CSV datasets shipped with verticapy.

    If the relation already exists it is wrapped directly; otherwise the
    table is created from 'str_create' and populated from
    verticapy/learn/data/<dataset_name>.csv.

    Parameters
    ----------
    cursor: DBcursor
        Vertica DB cursor.
    schema: str
        Schema of the new relation.
    name: str
        Name of the new relation.
    str_create: str
        Column definitions for the CREATE TABLE statement.
    str_copy: str
        Column list for the COPY statement.
    dataset_name: str
        Base name of the bundled CSV file.

    Returns
    -------
    vDataFrame
        vDataFrame of the ingested (or pre-existing) relation.
    """
    check_types([(
        "schema",
        schema,
        [str],
    ), (
        "name",
        name,
        [str],
    )])
    cursor = check_cursor(cursor)[0]
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    # FIX: bare 'except:' also caught SystemExit/KeyboardInterrupt and then
    # proceeded to create the table; only real errors should trigger ingestion.
    except Exception:
        cursor.execute("CREATE TABLE {}.{}({});".format(
            str_column(schema),
            str_column(name),
            str_create,
        ))
        try:
            path = os.path.dirname(
                verticapy.__file__) + "/learn/data/{}.csv".format(dataset_name)
            query = "COPY {}.{}({}) FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), str_copy, "{}")
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the file from the client via STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Remove the half-created table before propagating the error.
            cursor.execute("DROP TABLE {}.{}".format(str_column(schema),
                                                     str_column(name)))
            raise
    return vdf
def predict(self):
    """
    ---------------------------------------------------------------------------
    Creates a vDataFrame of the model.

    Returns
    -------
    vDataFrame
        the vDataFrame including the prediction.
    """
    # The model relation itself already contains the prediction column.
    prediction_vdf = vDataFrame(self.name, self.cursor)
    return prediction_vdf
def test_to_memmodel(self, model):
    mmodel = model.to_memmodel()
    samples = [["Male", 0, "Cheap", "Low"],
               ["Female", 1, "Expensive", "Low"]]
    # The in-memory model must agree with the generated Python function.
    in_memory = mmodel.predict(samples)
    from_python = model.to_python()(samples)
    assert in_memory[0] == from_python[0]
    assert in_memory[1] == from_python[1]
    # The generated SQL expression must match the in-database prediction.
    vdf = vDataFrame("public.rfr_data")
    vdf["prediction_sql"] = mmodel.predict_sql(
        ['"Gender"', '"owned cars"', '"cost"', '"income"'])
    model.predict(vdf, name="prediction_vertica_sql")
    assert vdf.score(
        "prediction_sql", "prediction_vertica_sql", "r2"
    ) == pytest.approx(1.0)
def test_to_memmodel(self, model):
    mmodel = model.to_memmodel()
    samples = [[5.006, 3.418, 1.464, 0.244],
               [3.0, 11.0, 1993.0, 0.0]]
    # The in-memory model must agree with the generated Python function.
    in_memory = mmodel.predict(samples)
    from_python = model.to_python()(samples)
    assert in_memory[0] == from_python[0]
    assert in_memory[1] == from_python[1]
    # The generated SQL expression must match the in-database prediction.
    vdf = vDataFrame("public.bsk_data")
    vdf["prediction_sql"] = mmodel.predict_sql(
        ["col1", "col2", "col3", "col4"])
    model.predict(vdf, name="prediction_vertica_sql")
    assert vdf.score(
        "prediction_sql", "prediction_vertica_sql", "accuracy"
    ) == pytest.approx(1.0)
def bsk_data_vd(base):
    """Fixture: build public.bsk_data, yield it as a vDataFrame, then drop it."""
    base.cursor.execute("DROP TABLE IF EXISTS public.bsk_data")
    base.cursor.execute(
        "CREATE TABLE IF NOT EXISTS public.bsk_data(Id INT, col1 FLOAT, col2 FLOAT, col3 FLOAT, col4 FLOAT)"
    )
    # One INSERT statement per row, identical to the hand-written originals.
    value_tuples = [
        "(1, 7.2, 3.6, 6.1, 2.5)",
        "(2, 7.7, 2.8, 6.7, 2.0)",
        "(3, 7.7, 3.0, 6.1, 2.3)",
        "(4, 7.9, 3.8, 6.4, 2.0)",
        "(5, 4.4, 2.9, 1.4, 0.2)",
        "(6, 4.6, 3.6, 1.0, 0.2)",
        "(7, 4.7, 3.2, 1.6, 0.2)",
        "(8, 6.5, 2.8, 4.6, 1.5)",
        "(9, 6.8, 2.8, 4.8, 1.4)",
        "(10, 7.0, 3.2, 4.7, 1.4)",
    ]
    for values in value_tuples:
        base.cursor.execute("INSERT INTO bsk_data VALUES " + values)
    base.cursor.execute("COMMIT")
    bsk_data = vDataFrame(input_relation="public.bsk_data", cursor=base.cursor)
    yield bsk_data
    # Teardown: silence the warning drop_table may emit.
    with warnings.catch_warnings(record=True) as w:
        drop_table(name="public.bsk_data", cursor=base.cursor)
def load_winequality(cursor=None, schema: str = "public", name: str = "winequality"):
    """
---------------------------------------------------------------------------
Ingests the winequality dataset in the Vertica DB (Dataset ideal for
Regression and Classification). If a table with the same name and schema
already exists, this function will create a vDataFrame from the input
relation.

Parameters
----------
cursor: DBcursor, optional
    Vertica DB cursor.
schema: str, optional
    Schema of the new relation. The default schema is public.
name: str, optional
    Name of the new relation.

Returns
-------
vDataFrame
    the winequality vDataFrame.

See Also
--------
load_amazon       : Ingests the amazon dataset in the Vertica DB.
    (Time Series / Regression).
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
    (Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
    (Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
    (Time Series / Regression).
load_titanic      : Ingests the titanic dataset in the Vertica DB.
    (Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    # FIX: bare 'except:' also caught SystemExit/KeyboardInterrupt and then
    # went on to create the table; only real errors should trigger ingestion.
    except Exception:
        cursor.execute(
            'CREATE TABLE {}.{}("fixed_acidity" Numeric(6,3), "volatile_acidity" Numeric(7,4), "citric_acid" Numeric(6,3), "residual_sugar" Numeric(7,3), "chlorides" Float, "free_sulfur_dioxide" Numeric(7,2), "total_sulfur_dioxide" Numeric(7,2), "density" Float, "pH" Numeric(6,3), "sulphates" Numeric(6,3), "alcohol" Float, "quality" Integer, "good" Integer, "color" Varchar(20));'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/winequality.csv"
            query = 'COPY {}.{}("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality", "good", "color") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'.format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the CSV from the client via STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Remove the half-created table before propagating the error.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
def test_vDF_to_db(self, titanic_vd):
    drop("verticapy_titanic_tmp")

    def cleanup(method):
        # The view scenario must be dropped with method='view'; the others
        # use drop()'s default behavior, exactly like the original test.
        if method:
            drop("verticapy_titanic_tmp", method=method)
        else:
            drop("verticapy_titanic_tmp")

    # (relation_type, catalog table to check, relation to read back, drop method)
    scenarios = [
        ("view", "view_columns", "verticapy_titanic_tmp", "view"),
        ("table", "columns", "verticapy_titanic_tmp", None),
        ("temporary", "columns", "verticapy_titanic_tmp", None),
        ("local", "columns", "v_temp_schema.verticapy_titanic_tmp", None),
    ]
    for relation_type, catalog, relation, drop_method in scenarios:
        try:
            titanic_vd.copy().to_db(
                name="verticapy_titanic_tmp",
                usecols=["age", "fare", "survived"],
                relation_type=relation_type,
                db_filter="age > 40",
                nb_split=3,
            )
            result_vdf = vDataFrame(relation)
            assert result_vdf.shape() == (220, 4)
            assert result_vdf["_verticapy_split_"].min() == 0
            assert result_vdf["_verticapy_split_"].max() == 2
            current_cursor().execute(
                "SELECT table_name FROM {} WHERE table_name = 'verticapy_titanic_tmp'".format(
                    catalog
                )
            )
            row = current_cursor().fetchone()
            assert row[0] == "verticapy_titanic_tmp"
        except:
            cleanup(drop_method)
            raise
        cleanup(drop_method)
def load_titanic(cursor=None, schema: str = "public", name: str = "titanic"):
    """
---------------------------------------------------------------------------
Ingests the titanic dataset in the Vertica DB (Dataset ideal for
Classification). If a table with the same name and schema already exists,
this function will create a vDataFrame from the input relation.

Parameters
----------
cursor: DBcursor, optional
    Vertica DB cursor.
schema: str, optional
    Schema of the new relation. The default schema is public.
name: str, optional
    Name of the new relation.

Returns
-------
vDataFrame
    the titanic vDataFrame.

See Also
--------
load_amazon       : Ingests the amazon dataset in the Vertica DB.
    (Time Series / Regression).
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
    (Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
    (Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
    (Time Series / Regression).
load_winequality  : Ingests the winequality dataset in the Vertica DB.
    (Regression / Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    # FIX: bare 'except:' also caught SystemExit/KeyboardInterrupt and then
    # went on to create the table; only real errors should trigger ingestion.
    except Exception:
        cursor.execute(
            'CREATE TABLE {}.{}("pclass" Integer, "survived" Integer, "name" Varchar(164), "sex" Varchar(20), "age" Numeric(6,3), "sibsp" Integer, "parch" Integer, "ticket" Varchar(36), "fare" Numeric(10,5), "cabin" Varchar(30), "embarked" Varchar(20), "boat" Varchar(100), "body" Integer, "home.dest" Varchar(100));'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/titanic.csv"
            query = 'COPY {}.{}("pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'.format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the CSV from the client via STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Remove the half-created table before propagating the error.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
def test_creating_vDF_using_input_relation_schema(self, base, titanic_vd):
    # Building the vDataFrame from a bare table name plus an explicit schema
    # must expose the full titanic relation.
    built = vDataFrame(
        input_relation="titanic", schema="public", cursor=base.cursor
    )
    assert built["pclass"].count() == 1234
def load_amazon(cursor=None, schema: str = "public", name: str = "amazon"):
    """
---------------------------------------------------------------------------
Ingests the amazon dataset in the Vertica DB (Dataset ideal for TS and
Regression). If a table with the same name and schema already exists,
this function will create a vDataFrame from the input relation.

Parameters
----------
cursor: DBcursor, optional
    Vertica DB cursor.
schema: str, optional
    Schema of the new relation. The default schema is public.
name: str, optional
    Name of the new relation.

Returns
-------
vDataFrame
    the amazon vDataFrame.

See Also
--------
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
    (Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
    (Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
    (Time Series / Regression).
load_titanic      : Ingests the titanic dataset in the Vertica DB.
    (Classification).
load_winequality  : Ingests the winequality dataset in the Vertica DB.
    (Regression / Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    # FIX: bare 'except:' also caught SystemExit/KeyboardInterrupt and then
    # went on to create the table; only real errors should trigger ingestion.
    except Exception:
        cursor.execute(
            'CREATE TABLE {}.{}("date" Date, "state" Varchar(32), "number" Integer);'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/amazon.csv"
            query = "COPY {}.{}(\"date\", \"state\", \"number\") FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the CSV from the client via STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Remove the half-created table before propagating the error.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
def fit(
    self,
    input_relation: (str, vDataFrame),
    X: list = [],
    key_columns: list = [],
    index: str = "",
):
    """
---------------------------------------------------------------------------
Trains the model.

Parameters
----------
input_relation: str/vDataFrame
    Train relation.
X: list, optional
    List of the predictors. If empty, all the numerical vcolumns will
    be used.
key_columns: list, optional
    Columns not used during the algorithm computation but which will
    be used to create the final relation.
index: str, optional
    Index to use to identify each row separately. It is highly
    recommended to have one already in the main table to avoid creation
    of temporary tables.

Returns
-------
object
    self
    """
    check_types([
        (
            "input_relation",
            input_relation,
            [str, vDataFrame],
        ),
        (
            "X",
            X,
            [list],
        ),
        (
            "key_columns",
            key_columns,
            [list],
        ),
        (
            "index",
            index,
            [str],
        ),
    ])
    self.cursor = check_cursor(self.cursor, input_relation, True)[0]
    check_model(name=self.name, cursor=self.cursor)
    # Default predictor list: every numerical column of the input relation.
    if isinstance(input_relation, vDataFrame):
        if not (X):
            X = input_relation.numcol()
        input_relation = input_relation.__genSQL__()
    else:
        if not (X):
            X = vDataFrame(input_relation, self.cursor).numcol()
    X = [str_column(column) for column in X]
    self.X = X
    self.key_columns = [str_column(column) for column in key_columns]
    self.input_relation = input_relation
    schema, relation = schema_relation(input_relation)
    cursor = self.cursor

    # Best-effort removal of the session-scoped temporary tables used below.
    def drop_temp_elem(cursor):
        try:
            cursor.execute(
                "DROP TABLE IF EXISTS v_temp_schema.VERTICAPY_MAIN_{}".
                format(get_session(cursor)))
            cursor.execute(
                "DROP TABLE IF EXISTS v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS"
            )
        except:
            pass

    try:
        if not (index):
            # No row identifier provided: materialize a temp table with a
            # generated id column (rows with NULL predictors are excluded).
            index = "id"
            main_table = "VERTICAPY_MAIN_{}".format(
                get_session(self.cursor))
            drop_temp_elem(cursor)
            sql = "CREATE LOCAL TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
                main_table,
                ", ".join(X + key_columns),
                self.input_relation,
                " AND ".join(["{} IS NOT NULL".format(item) for item in X]),
            )
            executeSQL(cursor, sql, "Computing the DBSCAN Table - STEP 0.")
        else:
            # Sanity-check that the provided columns are selectable.
            cursor.execute("SELECT {} FROM {} LIMIT 10".format(
                ", ".join(X + key_columns + [index]), self.input_relation))
            # NOTE(review): main_table is later interpolated after
            # 'v_temp_schema.' in STEP 2 — presumably the user-provided
            # relation is expected to live there; confirm against callers.
            main_table = self.input_relation
        # Pairwise Minkowski distance (parameter p) via a CROSS JOIN, then
        # per-node neighborhood density, keeping only core-point edges.
        sql = [
            "POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i],
                                                 self.parameters["p"])
            for i in range(len(X))
        ]
        distance = "POWER({}, 1 / {})".format(" + ".join(sql),
                                              self.parameters["p"])
        sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance FROM {} AS x CROSS JOIN {} AS y".format(
            index, index, distance, main_table, main_table)
        sql = "SELECT node_id, nn_id, SUM(CASE WHEN distance <= {} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({}) distance_table".format(
            self.parameters["eps"], sql)
        sql = "SELECT node_id, nn_id FROM ({}) VERTICAPY_SUBTABLE WHERE density > {} AND distance < {} AND node_id != nn_id".format(
            sql, self.parameters["min_samples"], self.parameters["eps"])
        executeSQL(cursor, sql, "Computing the DBSCAN Table - STEP 1.")
        # Client-side connected-component labeling over the edge list:
        # each edge merges its two endpoints into a cluster id.
        graph = cursor.fetchall()
        main_nodes = list(
            dict.fromkeys([elem[0] for elem in graph] +
                          [elem[1] for elem in graph]))
        clusters = {}
        for elem in main_nodes:
            clusters[elem] = None
        i = 0
        while graph:
            node = graph[0][0]
            node_neighbor = graph[0][1]
            if (clusters[node] == None) and (clusters[node_neighbor] == None):
                # Neither endpoint labeled yet: start a new cluster.
                clusters[node] = i
                clusters[node_neighbor] = i
                i = i + 1
            else:
                # Propagate the existing label to the unlabeled endpoint.
                if clusters[node] != None and clusters[
                        node_neighbor] == None:
                    clusters[node_neighbor] = clusters[node]
                elif clusters[node_neighbor] != None and clusters[
                        node] == None:
                    clusters[node] = clusters[node_neighbor]
            del graph[0]
        # Ship the node->cluster mapping back to Vertica through a CSV file
        # and a COPY into a temporary table.
        try:
            f = open("VERTICAPY_DBSCAN_CLUSTERS_ID.csv", "w")
            for elem in clusters:
                f.write("{}, {}\n".format(elem, clusters[elem]))
            f.close()
            try:
                cursor.execute(
                    "DROP TABLE IF EXISTS v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS"
                )
            except:
                pass
            cursor.execute(
                "CREATE LOCAL TEMPORARY TABLE VERTICAPY_DBSCAN_CLUSTERS(node_id int, cluster int) ON COMMIT PRESERVE ROWS"
            )
            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the CSV from the client via STDIN.
                with open("./VERTICAPY_DBSCAN_CLUSTERS_ID.csv", "r") as fs:
                    cursor.copy(
                        "COPY v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS(node_id, cluster) FROM STDIN DELIMITER ',' ESCAPE AS '\\';",
                        fs,
                    )
            else:
                cursor.execute(
                    "COPY v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS(node_id, cluster) FROM LOCAL './VERTICAPY_DBSCAN_CLUSTERS_ID.csv' DELIMITER ',' ESCAPE AS '\\';"
                )
            try:
                cursor.execute("COMMIT")
            except:
                pass
            # Remove the transfer file whether or not the COPY succeeded.
            os.remove("VERTICAPY_DBSCAN_CLUSTERS_ID.csv")
        except:
            os.remove("VERTICAPY_DBSCAN_CLUSTERS_ID.csv")
            raise
        self.n_cluster_ = i
        # Final relation: input rows joined with their cluster id
        # (-1 marks noise points that belong to no cluster).
        executeSQL(
            cursor,
            "CREATE TABLE {} AS SELECT {}, COALESCE(cluster, -1) AS dbscan_cluster FROM v_temp_schema.{} AS x LEFT JOIN v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS AS y ON x.{} = y.node_id"
            .format(self.name, ", ".join(self.X + self.key_columns),
                    main_table, index),
            "Computing the DBSCAN Table - STEP 2.",
        )
        cursor.execute(
            "SELECT COUNT(*) FROM {} WHERE dbscan_cluster = -1".format(
                self.name))
        self.n_noise_ = cursor.fetchone()[0]
    except:
        drop_temp_elem(cursor)
        raise
    drop_temp_elem(cursor)
    # Persist the model description in the verticapy model catalog.
    model_save = {
        "type": "DBSCAN",
        "input_relation": self.input_relation,
        "key_columns": self.key_columns,
        "X": self.X,
        "p": self.parameters["p"],
        "eps": self.parameters["eps"],
        "min_samples": self.parameters["min_samples"],
        "n_cluster": self.n_cluster_,
        "n_noise": self.n_noise_,
    }
    insert_verticapy_schema(
        model_name=self.name,
        model_type="DBSCAN",
        model_save=model_save,
        cursor=self.cursor,
    )
    return self
def test_vDF_to_db(self, titanic_vd):
    cursor = titanic_vd._VERTICAPY_VARIABLES_["cursor"]

    def cleanup(method):
        # drop() warns when the relation does not exist; keep the log quiet.
        # The view scenario needs method='view'; the others use the default.
        with warnings.catch_warnings(record=True) as w:
            if method:
                drop("verticapy_titanic_tmp", cursor, method=method)
            else:
                drop("verticapy_titanic_tmp", cursor)

    # Initial best-effort cleanup of leftovers from previous runs.
    try:
        with warnings.catch_warnings(record=True) as w:
            drop(
                "verticapy_titanic_tmp",
                cursor,
                method="view",
            )
            drop(
                "verticapy_titanic_tmp",
                cursor,
                method="table",
            )
    except:
        pass
    # (relation_type, catalog table to check, relation to read back, drop method)
    scenarios = [
        ("view", "view_columns", "verticapy_titanic_tmp", "view"),
        ("table", "columns", "verticapy_titanic_tmp", None),
        ("temporary", "columns", "verticapy_titanic_tmp", None),
        ("local", "columns", "v_temp_schema.verticapy_titanic_tmp", None),
    ]
    for relation_type, catalog, relation, drop_method in scenarios:
        try:
            titanic_vd.copy().to_db(
                name="verticapy_titanic_tmp",
                usecols=["age", "fare", "survived"],
                relation_type=relation_type,
                db_filter="age > 40",
                nb_split=3,
            )
            titanic_tmp = vDataFrame(relation, cursor=cursor)
            assert titanic_tmp.shape() == (220, 4)
            assert titanic_tmp["_verticapy_split_"].min() == 0
            assert titanic_tmp["_verticapy_split_"].max() == 2
            cursor.execute(
                "SELECT table_name FROM {} WHERE table_name = 'verticapy_titanic_tmp'".format(
                    catalog
                )
            )
            assert cursor.fetchone()[0] == "verticapy_titanic_tmp"
        except:
            cleanup(drop_method)
            raise
        cleanup(drop_method)
def fit(self, input_relation: (str, vDataFrame), X: list = []):
    """
    ---------------------------------------------------------------------------
    Trains the model.

    Parameters
    ----------
    input_relation: str/vDataFrame
        Train relation.
    X: list
        List of the predictors. If empty, all the columns will be used.

    Returns
    -------
    object
        self
    """
    # Validate argument types before touching the database.
    check_types([(
        "input_relation", input_relation, [str, vDataFrame],
    ), (
        "X", X, [list],
    )])
    # Resolve the cursor used for every query of this training run
    # (check_cursor is a project helper; exact semantics defined elsewhere).
    self.cursor = check_cursor(self.cursor, input_relation, True)[0]
    # Default X to every column of the input relation when not provided.
    if isinstance(input_relation, vDataFrame):
        if not (X):
            X = input_relation.get_columns()
        self.input_relation = input_relation.__genSQL__()
    else:
        if not (X):
            X = vDataFrame(input_relation, self.cursor).get_columns()
        self.input_relation = input_relation
    self.X = [str_column(elem) for elem in X]
    schema, relation = schema_relation(self.name)
    schema = str_column(schema)
    # Session-scoped staging table name so concurrent sessions do not clash.
    tmp_name = "{}.VERTICAPY_COUNT_VECTORIZER_{}".format(
        schema, get_session(self.cursor))
    # Best-effort cleanup of any leftover artifacts from a previous fit.
    try:
        self.drop()
    except:
        pass
    # STEP 0: staging table with an identity id and one text column per row.
    sql = "CREATE TABLE {}(id identity(2000) primary key, text varchar({})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"
    executeSQL(
        self.cursor,
        sql.format(tmp_name, self.parameters["max_text_size"]),
        "Computing the CountVectorizer - STEP 0.",
    )
    # Build the text expression: predictors concatenated, optionally
    # lower-cased and stripped of non-alphanumeric characters.
    text = (" || ".join(self.X)
            if not (self.parameters["lowercase"])
            else "LOWER({})".format(" || ".join(self.X)))
    if self.parameters["ignore_special"]:
        text = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text)
    # STEP 1: load the concatenated text into the staging table.
    sql = "INSERT INTO {}(text) SELECT {} FROM {}".format(
        tmp_name, text, self.input_relation)
    executeSQL(self.cursor, sql, "Computing the CountVectorizer - STEP 1.")
    # STEP 2: the model itself is a Vertica TEXT INDEX (no stemming).
    sql = "CREATE TEXT INDEX {} ON {}(id, text) stemmer NONE;".format(
        self.name, tmp_name)
    executeSQL(self.cursor, sql, "Computing the CountVectorizer - STEP 2.")
    # Tokens whose relative frequency (df) falls outside [min_df, max_df]
    # are collected as stop words.
    stop_words = "SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {} GROUP BY 1) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE WHERE not(df BETWEEN {} AND {})".format(
        self.name, self.parameters["min_df"], self.parameters["max_df"])
    # Optionally also drop everything beyond the max_features most frequent tokens.
    if self.parameters["max_features"] > 0:
        stop_words += " OR (rnk > {})".format(
            self.parameters["max_features"])
    self.cursor.execute(stop_words)
    self.stop_words_ = [item[0] for item in self.cursor.fetchall()]
    # The retained vocabulary is whatever the deployment query returns.
    self.cursor.execute(self.deploySQL())
    self.vocabulary_ = [item[0] for item in self.cursor.fetchall()]
    self.countvectorizer_table = tmp_name
    # Persist the model metadata in the VerticaPy model catalog.
    model_save = {
        "type": "CountVectorizer",
        "input_relation": self.input_relation,
        "X": self.X,
        "countvectorizer_table": tmp_name,
        "lowercase": self.parameters["lowercase"],
        "max_df": self.parameters["max_df"],
        "min_df": self.parameters["min_df"],
        "max_features": self.parameters["max_features"],
        "ignore_special": self.parameters["ignore_special"],
        "max_text_size": self.parameters["max_text_size"],
        "vocabulary": self.vocabulary_,
        "stop_words": self.stop_words_,
    }
    insert_verticapy_schema(
        model_name=self.name,
        model_type="CountVectorizer",
        model_save=model_save,
        cursor=self.cursor,
    )
    return self
def fit(self, input_relation: Union[str, vDataFrame], X: list = []):
    """
    ---------------------------------------------------------------------------
    Trains the model.

    Parameters
    ----------
    input_relation: str/vDataFrame
        Training relation.
    X: list
        List of the predictors. If empty, all the columns will be used.

    Returns
    -------
    object
        self
    """
    # Allow a single column name to be passed as a bare string.
    if isinstance(X, str):
        X = [X]
    check_types(
        [("input_relation", input_relation, [str, vDataFrame]), ("X", X, [list])]
    )
    # Either overwrite an existing model of the same name or fail early.
    if verticapy.options["overwrite_model"]:
        self.drop()
    else:
        does_model_exist(name=self.name, raise_error=True)
    # Default X to every column of the input relation when not provided.
    if isinstance(input_relation, vDataFrame):
        if not (X):
            X = input_relation.get_columns()
        self.input_relation = input_relation.__genSQL__()
    else:
        if not (X):
            X = vDataFrame(input_relation).get_columns()
        self.input_relation = input_relation
    self.X = [quote_ident(elem) for elem in X]
    schema, relation = schema_relation(self.name)
    schema = quote_ident(schema)
    # Temporary staging table holding one row of raw text per input row.
    tmp_name = gen_tmp_name(schema=schema, name="countvectorizer")
    # Best-effort cleanup of any leftover artifacts from a previous fit.
    try:
        self.drop()
    except:
        pass
    # Step 0: staging table with an identity id and one text column per row.
    sql = """CREATE TABLE {0}(id identity(2000) primary key, text varchar({1})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"""
    executeSQL(
        sql.format(tmp_name, self.parameters["max_text_size"]),
        title="Computing the CountVectorizer [Step 0].",
    )
    # Build the text expression: predictors concatenated, optionally
    # lower-cased and stripped of non-alphanumeric characters.
    text = (
        " || ".join(self.X)
        if not (self.parameters["lowercase"])
        else "LOWER({})".format(" || ".join(self.X))
    )
    if self.parameters["ignore_special"]:
        text = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text)
    # Step 1: load the concatenated text into the staging table.
    sql = "INSERT INTO {}(text) SELECT {} FROM {}".format(
        tmp_name, text, self.input_relation
    )
    executeSQL(sql, "Computing the CountVectorizer [Step 1].")
    # Step 2: the model itself is a Vertica TEXT INDEX (no stemming).
    sql = "CREATE TEXT INDEX {} ON {}(id, text) stemmer NONE;".format(
        self.name, tmp_name
    )
    executeSQL(sql, "Computing the CountVectorizer [Step 2].")
    # Tokens whose relative frequency (df) falls outside [min_df, max_df]
    # are collected as stop words.
    stop_words = """SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {0} GROUP BY 1) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE WHERE not(df BETWEEN {1} AND {2})""".format(
        self.name, self.parameters["min_df"], self.parameters["max_df"]
    )
    # Optionally also drop everything beyond the max_features most frequent tokens.
    if self.parameters["max_features"] > 0:
        stop_words += " OR (rnk > {})".format(self.parameters["max_features"])
    res = executeSQL(stop_words, print_time_sql=False, method="fetchall")
    self.stop_words_ = [item[0] for item in res]
    # The retained vocabulary is whatever the deployment query returns.
    res = executeSQL(self.deploySQL(), print_time_sql=False, method="fetchall")
    self.vocabulary_ = [item[0] for item in res]
    self.countvectorizer_table = tmp_name
    # Persist the model metadata in the VerticaPy model catalog.
    model_save = {
        "type": "CountVectorizer",
        "input_relation": self.input_relation,
        "X": self.X,
        "countvectorizer_table": tmp_name,
        "lowercase": self.parameters["lowercase"],
        "max_df": self.parameters["max_df"],
        "min_df": self.parameters["min_df"],
        "max_features": self.parameters["max_features"],
        "ignore_special": self.parameters["ignore_special"],
        "max_text_size": self.parameters["max_text_size"],
        "vocabulary": self.vocabulary_,
        "stop_words": self.stop_words_,
    }
    insert_verticapy_schema(
        model_name=self.name,
        model_type="CountVectorizer",
        model_save=model_save,
    )
    return self
def test_creating_vDF_using_input_relation(self, base, titanic_vd): tvdf = vDataFrame(input_relation='public.titanic', cursor=base.cursor) assert tvdf["pclass"].count() == 1234
def fit(
    self,
    input_relation: (str, vDataFrame),
    X: list = [],
    key_columns: list = [],
    index: str = "",
):
    """
    ---------------------------------------------------------------------------
    Trains the model.

    Parameters
    ----------
    input_relation: str/vDataFrame
        Training relation.
    X: list, optional
        List of the predictors. If empty, all the numerical vcolumns will
        be used.
    key_columns: list, optional
        Columns not used during the algorithm computation but which will
        be used to create the final relation.
    index: str, optional
        Index used to identify each row separately. It is highly
        recommended to have one already in the main table to avoid
        creating temporary tables.

    Returns
    -------
    object
        self
    """
    # Accept bare strings for single-column arguments.
    if isinstance(key_columns, str):
        key_columns = [key_columns]
    if isinstance(X, str):
        X = [X]
    check_types(
        [
            ("input_relation", input_relation, [str, vDataFrame]),
            ("X", X, [list]),
            ("key_columns", key_columns, [list]),
            ("index", index, [str]),
        ]
    )
    # Either overwrite an existing model of the same name or fail early.
    if verticapy.options["overwrite_model"]:
        self.drop()
    else:
        does_model_exist(name=self.name, raise_error=True)
    # Default X to the numerical columns of the input relation.
    if isinstance(input_relation, vDataFrame):
        if not (X):
            X = input_relation.numcol()
        input_relation = input_relation.__genSQL__()
    else:
        if not (X):
            X = vDataFrame(input_relation).numcol()
    X = [quote_ident(column) for column in X]
    self.X = X
    self.key_columns = [quote_ident(column) for column in key_columns]
    self.input_relation = input_relation
    schema, relation = schema_relation(input_relation)
    name_main, name_dbscan_clusters = (
        gen_tmp_name(name="main"),
        gen_tmp_name(name="clusters"),
    )
    try:
        if not (index):
            # No user-provided index: materialize the input rows with a
            # generated row id in a temporary table.
            index = "id"
            drop(f"v_temp_schema.{name_main}", method="table")
            sql = """CREATE LOCAL TEMPORARY TABLE {0} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {1} FROM {2} WHERE {3}""".format(
                name_main,
                ", ".join(X + key_columns),
                self.input_relation,
                " AND ".join([f"{item} IS NOT NULL" for item in X]),
            )
            executeSQL(sql, title="Computing the DBSCAN Table [Step 0]")
        else:
            # Sanity-check that all requested columns exist in the relation.
            executeSQL(
                "SELECT {0} FROM {1} LIMIT 10".format(
                    ", ".join(X + key_columns + [index]), self.input_relation
                ),
                print_time_sql=False,
            )
            name_main = self.input_relation
        # Pairwise Minkowski distance (parameter p) between all row pairs.
        sql = [
            "POWER(ABS(x.{0} - y.{0}), {1})".format(X[i], self.parameters["p"])
            for i in range(len(X))
        ]
        distance = "POWER({0}, 1 / {1})".format(
            " + ".join(sql), self.parameters["p"]
        )
        sql = """SELECT x.{0} AS node_id, y.{0} AS nn_id, {1} AS distance FROM {2} AS x CROSS JOIN {2} AS y""".format(
            index, distance, name_main
        )
        # density = number of neighbors within eps for each node.
        sql = """SELECT node_id, nn_id, SUM(CASE WHEN distance <= {0} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({1}) distance_table""".format(
            self.parameters["eps"], sql
        )
        if isinstance(verticapy.options["random_state"], int):
            # Deterministic edge ordering gives reproducible cluster ids.
            order_by = "ORDER BY node_id, nn_id"
        else:
            order_by = ""
        # Keep only edges leaving core points (density > min_samples)
        # toward neighbors strictly inside eps.
        sql = """SELECT node_id, nn_id FROM ({0}) VERTICAPY_SUBTABLE WHERE density > {1} AND distance < {2} AND node_id != nn_id {3}""".format(
            sql,
            self.parameters["min_samples"],
            self.parameters["eps"],
            order_by,
        )
        graph = executeSQL(
            sql, title="Computing the DBSCAN Table [Step 1]", method="fetchall"
        )
        main_nodes = list(
            dict.fromkeys([elem[0] for elem in graph] + [elem[1] for elem in graph])
        )
        # Propagate cluster labels through the neighborhood graph in one
        # pass over the edges. Iterating the list directly replaces the
        # original `while graph: ... del graph[0]` loop, which shifted the
        # whole list on every step (accidental O(n^2)).
        clusters = {node: None for node in main_nodes}
        i = 0
        for edge in graph:
            node, node_neighbor = edge[0], edge[1]
            # `is None` comparisons replace the original `== None`.
            if (clusters[node] is None) and (clusters[node_neighbor] is None):
                # Neither endpoint labeled yet: open a new cluster.
                clusters[node] = i
                clusters[node_neighbor] = i
                i = i + 1
            elif clusters[node] is not None and clusters[node_neighbor] is None:
                clusters[node_neighbor] = clusters[node]
            elif clusters[node_neighbor] is not None and clusters[node] is None:
                clusters[node] = clusters[node_neighbor]
        try:
            # Dump the cluster assignment to a local CSV, then bulk-load it.
            # `with` guarantees the handle is closed even if a write fails
            # (the original open/close pair leaked it on error).
            with open("{}.csv".format(name_dbscan_clusters), "w") as f:
                for node in clusters:
                    f.write("{}, {}\n".format(node, clusters[node]))
            drop("v_temp_schema.{}".format(name_dbscan_clusters), method="table")
            executeSQL(
                (
                    f"CREATE LOCAL TEMPORARY TABLE {name_dbscan_clusters}"
                    "(node_id int, cluster int) ON COMMIT PRESERVE ROWS"
                ),
                print_time_sql=False,
            )
            if isinstance(current_cursor(), vertica_python.vertica.cursor.Cursor):
                # Native vertica-python cursor: stream the file through STDIN.
                executeSQL(
                    (
                        f"COPY v_temp_schema.{name_dbscan_clusters}(node_id, cluster)"
                        " FROM STDIN DELIMITER ',' ESCAPE AS '\\';"
                    ),
                    method="copy",
                    print_time_sql=False,
                    path=f"./{name_dbscan_clusters}.csv",
                )
            else:
                executeSQL(
                    """COPY v_temp_schema.{0}(node_id, cluster) FROM LOCAL './{0}.csv' DELIMITER ',' ESCAPE AS '\\';""".format(
                        name_dbscan_clusters
                    ),
                    print_time_sql=False,
                )
            executeSQL("COMMIT;", print_time_sql=False)
        finally:
            # Always remove the local CSV; the exists() guard keeps a failure
            # before file creation from masking the original exception.
            if os.path.exists(f"{name_dbscan_clusters}.csv"):
                os.remove(f"{name_dbscan_clusters}.csv")
        self.n_cluster_ = i
        # Final relation: every input row with its cluster id; rows that
        # never joined a cluster are labeled -1 (noise).
        executeSQL(
            """CREATE TABLE {0} AS SELECT {1}, COALESCE(cluster, -1) AS dbscan_cluster FROM v_temp_schema.{2} AS x LEFT JOIN v_temp_schema.{3} AS y ON x.{4} = y.node_id""".format(
                self.name,
                ", ".join(self.X + self.key_columns),
                name_main,
                name_dbscan_clusters,
                index,
            ),
            title="Computing the DBSCAN Table [Step 2]",
        )
        self.n_noise_ = executeSQL(
            "SELECT COUNT(*) FROM {0} WHERE dbscan_cluster = -1".format(self.name),
            method="fetchfirstelem",
            print_time_sql=False,
        )
    except:
        drop(f"v_temp_schema.{name_main}", method="table")
        drop(f"v_temp_schema.{name_dbscan_clusters}", method="table")
        raise
    # Success path: the temporary tables are no longer needed.
    drop(f"v_temp_schema.{name_main}", method="table")
    drop(f"v_temp_schema.{name_dbscan_clusters}", method="table")
    # Persist the model metadata in the VerticaPy model catalog.
    model_save = {
        "type": "DBSCAN",
        "input_relation": self.input_relation,
        "key_columns": self.key_columns,
        "X": self.X,
        "p": self.parameters["p"],
        "eps": self.parameters["eps"],
        "min_samples": self.parameters["min_samples"],
        "n_cluster": self.n_cluster_,
        "n_noise": self.n_noise_,
    }
    insert_verticapy_schema(
        model_name=self.name,
        model_type="DBSCAN",
        model_save=model_save,
    )
    return self