Beispiel #1
0
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Train an in-database SVM regressor (Vertica SVM_REGRESSOR).

     input_relation : str -- relation (table/view) used to train the model.
     X : list -- names of the predictor columns.
     y : str -- name of the response column.
     test_relation : str -- relation used to test the model; falls back to
         input_relation when empty.

     Stores the fitted coefficients ('details' model attribute) in
     self.coef and returns self.
     """
     self.input_relation = input_relation
     # Reuse the training relation when no explicit test relation is given.
     self.test_relation = test_relation if (
         test_relation) else input_relation
     self.X = [str_column(column) for column in X]
     self.y = str_column(y)
     # Build the training statement incrementally; the closing ')' is
     # appended after all optional parameters.
     query = "SELECT SVM_REGRESSOR('{}', '{}', '{}', '{}' USING PARAMETERS C = {}, epsilon = {}, max_iterations = {}"
     query = query.format(self.name, input_relation, self.y,
                          ", ".join(self.X), self.C, self.tol,
                          self.max_iter)
     query += ", error_tolerance = {}".format(self.acceptable_error_margin)
     if (self.fit_intercept):
         query += ", intercept_mode = '{}', intercept_scaling = {}".format(
             self.intercept_mode, self.intercept_scaling)
     query += ")"
     self.cursor.execute(query)
     # Fetch the model coefficients as a tablesample for later display.
     self.coef = to_tablesample(
         query=
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')"
         .format(self.name),
         cursor=self.cursor)
     self.coef.table_info = False
     return (self)
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Train an in-database Random Forest classifier (RF_CLASSIFIER).

     input_relation : str -- relation used to train the model.
     X : list -- names of the predictor columns.
     y : str -- name of the response column.
     test_relation : str -- relation used to test the model; falls back to
         input_relation when empty.

     Resolves the "auto"/"max" settings of self.max_features, trains the
     model, stores the distinct response classes (sorted ascending) in
     self.classes and returns self.
     """
     self.input_relation = input_relation
     self.test_relation = test_relation if (
         test_relation) else input_relation
     self.X = [str_column(column) for column in X]
     self.y = str_column(y)
     # Translate the symbolic max_features settings into column counts.
     if (self.max_features == "auto"):
         self.max_features = int(len(self.X) / 3 + 1)
     elif (self.max_features == "max"):
         self.max_features = len(self.X)
     query = "SELECT RF_CLASSIFIER('{}', '{}', '{}', '{}' USING PARAMETERS ntree = {}, mtry = {}, sampling_size = {}"
     query = query.format(self.name, input_relation, self.y,
                          ", ".join(self.X), self.n_estimators,
                          self.max_features, self.sample)
     query += ", max_depth = {}, max_breadth = {}, min_leaf_size = {}, min_info_gain = {}, nbins = {})".format(
         self.max_depth, int(self.max_leaf_nodes), self.min_samples_leaf,
         self.min_info_gain, self.nbins)
     self.cursor.execute(query)
     # Collect the distinct (non NULL) response classes, sorted ascending.
     self.cursor.execute(
         "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".
         format(self.y, input_relation, self.y))
     classes = self.cursor.fetchall()
     self.classes = [item[0] for item in classes]
     return (self)
Beispiel #3
0
def lof_plot(input_relation: str,
			 cursor,
			 columns: list,
			 lof: str,
			 tablesample: float = -1):
	"""Draw the Local Outlier Factor (LOF) plot for 1, 2 or 3 columns.

	input_relation : str -- relation holding the data and the LOF score.
	cursor -- database cursor used to run the queries.
	columns : list -- 1 to 3 column names to plot.
	lof : str -- name of the column holding the LOF score.
	tablesample : float -- percentage of the relation to sample; only used
		when 0 < tablesample < 100.

	Raises ValueError when more than 3 columns are supplied.
	NOTE(review): the marker radius scaling divides by (max - min) of the
	scores and assumes they are not all identical — confirm upstream.
	"""
	import matplotlib.pyplot as plt
	tablesample = "TABLESAMPLE({})".format(tablesample) if (tablesample > 0 and tablesample < 100) else ""
	if (len(columns) == 1):
		column = str_column(columns[0])
		query = "SELECT {}, {} FROM {} {} WHERE {} IS NOT NULL".format(column, lof, input_relation, tablesample, column)
		cursor.execute(query)
		query_result = cursor.fetchall()
		column1, lof = [item[0] for item in query_result], [item[1] for item in query_result]
		column2 = [0] * len(column1)
		plt.figure(figsize = (10,2))
		plt.gca().grid()
		plt.gca().set_axisbelow(True)
		plt.title('Local Outlier Factor (LOF)')
		plt.xlabel(column)
		# Scale each score into a marker radius in [0, 1000].
		radius = [1000 * (item - min(lof)) / (max(lof) - min(lof)) for item in lof]
		plt.scatter(column1, column2, color = "#214579", s = 14, label = 'Data points')
		plt.scatter(column1, column2, color = "#FFCC01", s = radius, label = 'Outlier scores', facecolors = 'none')
	elif (len(columns) == 2):
		columns = [str_column(column) for column in columns]
		query = "SELECT {}, {}, {} FROM {} {} WHERE {} IS NOT NULL AND {} IS NOT NULL".format(columns[0], columns[1], lof, input_relation, tablesample, columns[0], columns[1])
		cursor.execute(query)
		query_result = cursor.fetchall()
		column1, column2, lof = [item[0] for item in query_result], [item[1] for item in query_result], [item[2] for item in query_result]
		plt.figure(figsize = (10,8))
		plt.gca().grid()
		plt.gca().set_axisbelow(True)
		plt.title('Local Outlier Factor (LOF)')
		plt.ylabel(columns[1])
		plt.xlabel(columns[0])
		radius = [1000 * (item - min(lof)) / (max(lof) - min(lof)) for item in lof]
		plt.scatter(column1, column2, color = "#214579", s = 14, label = 'Data points')
		plt.scatter(column1, column2, color = "#FFCC01", s = radius, label = 'Outlier scores', facecolors = 'none')
	elif (len(columns) == 3):
		from mpl_toolkits.mplot3d import Axes3D
		# Bug fix: quote the column names like the 1D/2D branches do; this
		# branch previously interpolated the raw, unquoted names.
		columns = [str_column(column) for column in columns]
		query = "SELECT {}, {}, {}, {} FROM {} {} WHERE {} IS NOT NULL AND {} IS NOT NULL AND {} IS NOT NULL".format(
					columns[0], columns[1], columns[2], lof, input_relation, tablesample, columns[0], columns[1], columns[2])
		cursor.execute(query)
		query_result = cursor.fetchall()
		column1, column2, column3, lof = [float(item[0]) for item in query_result], [float(item[1]) for item in query_result], [float(item[2]) for item in query_result], [float(item[3]) for item in query_result]
		fig = plt.figure(figsize = (10,8))
		ax = fig.add_subplot(111, projection = '3d')
		plt.title('Local Outlier Factor (LOF)')
		ax.set_xlabel(columns[0])
		ax.set_ylabel(columns[1])
		ax.set_zlabel(columns[2])
		radius = [1000 * (item - min(lof)) / (max(lof) - min(lof)) for item in lof]
		ax.scatter(column1, column2, column3, color = "#214579", label = 'Data points')
		ax.scatter(column1, column2, column3, color = "#FFCC01", s = radius, facecolors = 'none')
		ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
		ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
		ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
	else:
		raise ValueError("LocalOutlierFactor Plot is available for a maximum of 3 columns")
	plt.show()
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Record the training metadata on the model and return it.

     Keeps the (quoted) predictor and response column names and the
     train/test relations on the instance; when no test relation is
     supplied the training relation is reused.
     """
     self.input_relation = input_relation
     if test_relation:
         self.test_relation = test_relation
     else:
         self.test_relation = input_relation
     self.X = [str_column(col) for col in X]
     self.y = str_column(y)
     return self
Beispiel #5
0
def load_amazon(cursor, schema: str = 'public', name='amazon'):
    """Ingest the bundled amazon dataset and return it as a vDataframe.

    cursor -- database cursor used to create and load the table.
    schema : str -- schema where the table is created.
    name : str -- name of the created table.

    Best-effort: if creation/loading fails (e.g. the table already
    exists), the existing relation is wrapped instead.
    """
    try:
        query = "CREATE TABLE {}.{}(\"number\" Integer, \"date\" Date, \"state\" Varchar(32));"
        query += "COPY {}.{}(\"number\", \"date\", \"state\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/amazon.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed; DB errors still fall through to the wrapper.
    except Exception:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
Beispiel #6
0
def load_titanic(cursor, schema: str = 'public', name='titanic'):
    """Ingest the bundled titanic dataset and return it as a vDataframe.

    cursor -- database cursor used to create and load the table.
    schema : str -- schema where the table is created.
    name : str -- name of the created table.

    Best-effort: if creation/loading fails (e.g. the table already
    exists), the existing relation is wrapped instead.
    """
    try:
        query = "CREATE TABLE {}.{}(\"pclass\" Integer, \"survived\" Integer, \"name\" Varchar(164), \"sex\" Varchar(20), \"age\" Numeric(6,3), \"sibsp\" Integer, \"parch\" Integer, \"ticket\" Varchar(36), \"fare\" Numeric(10,5), \"cabin\" Varchar(30), \"embarked\" Varchar(20), \"boat\" Varchar(100), \"body\" Integer, \"home.dest\" Varchar(100));"
        query += "COPY {}.{}(\"pclass\", \"survived\", \"name\", \"sex\", \"age\", \"sibsp\", \"parch\", \"ticket\", \"fare\", \"cabin\", \"embarked\", \"boat\", \"body\", \"home.dest\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/titanic.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed; DB errors still fall through to the wrapper.
    except Exception:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
Beispiel #7
0
def load_smart_meters(cursor, schema: str = 'public', name='smart_meters'):
    """Ingest the bundled smart_meters dataset and return a vDataframe.

    cursor -- database cursor used to create and load the table.
    schema : str -- schema where the table is created.
    name : str -- name of the created table.

    Best-effort: if creation/loading fails (e.g. the table already
    exists), the existing relation is wrapped instead.
    """
    try:
        query = "CREATE TABLE {}.{}(\"time\" Timestamp, \"val\" Numeric(11,7), \"id\" Integer);"
        query += "COPY {}.{}(\"time\", \"val\", \"id\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/smart_meters.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed; DB errors still fall through to the wrapper.
    except Exception:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
Beispiel #8
0
def load_iris(cursor, schema: str = 'public', name='iris'):
    """Ingest the bundled iris dataset and return it as a vDataframe.

    cursor -- database cursor used to create and load the table.
    schema : str -- schema where the table is created.
    name : str -- name of the created table.

    The CSV's leading Id column is skipped via a FILLER column.
    Best-effort: if creation/loading fails (e.g. the table already
    exists), the existing relation is wrapped instead.
    """
    try:
        query = "CREATE TABLE {}.{}(\"SepalLengthCm\" Numeric(5,2), \"SepalWidthCm\" Numeric(5,2), \"PetalLengthCm\" Numeric(5,2), \"PetalWidthCm\" Numeric(5,2), \"Species\" Varchar(30));"
        query += "COPY {}.{}(\"Id\" FILLER Integer, \"SepalLengthCm\", \"SepalWidthCm\", \"PetalLengthCm\", \"PetalWidthCm\", \"Species\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/iris.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed; DB errors still fall through to the wrapper.
    except Exception:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
Beispiel #9
0
def load_winequality(cursor, schema: str = 'public', name='winequality'):
    """Ingest the bundled winequality dataset and return a vDataframe.

    cursor -- database cursor used to create and load the table.
    schema : str -- schema where the table is created.
    name : str -- name of the created table.

    Best-effort: if creation/loading fails (e.g. the table already
    exists), the existing relation is wrapped instead.
    """
    try:
        query = "CREATE TABLE {}.{}(\"fixed_acidity\" Numeric(6,3), \"volatile_acidity\" Numeric(7,4), \"citric_acid\" Numeric(6,3), \"residual_sugar\" Numeric(7,3), \"chlorides\" Float, \"free_sulfur_dioxide\" Numeric(7,2), \"total_sulfur_dioxide\" Numeric(7,2), \"density\" Float, \"pH\" Numeric(6,3), \"sulphates\" Numeric(6,3), \"alcohol\" Float, \"quality\" Integer, \"good\" Integer, \"color\" Varchar(20));"
        query += "COPY {}.{}(\"fixed_acidity\", \"volatile_acidity\", \"citric_acid\", \"residual_sugar\", \"chlorides\", \"free_sulfur_dioxide\", \"total_sulfur_dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\", \"quality\", \"good\", \"color\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/winequality.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed; DB errors still fall through to the wrapper.
    except Exception:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
 def fit(self, input_relation: str, X: list):
     """Train an in-database PCA model on the input relation.

     input_relation : str -- relation used to train the model.
     X : list -- names of the columns to decompose.

     Stores the principal components, the singular values (as explained
     variance) and the column statistics on the instance; returns self.
     """
     self.input_relation = input_relation
     self.X = [str_column(column) for column in X]
     query = "SELECT PCA('{}', '{}', '{}' USING PARAMETERS scale = {}, method = '{}'"
     query = query.format(self.name, input_relation, ", ".join(self.X),
                          self.scale, self.method)
     # num_components is only sent when explicitly requested (non-zero).
     if (self.n_components):
         query += ", num_components = {}".format(self.n_components)
     query += ")"
     self.cursor.execute(query)
     self.components = to_tablesample(
         query=
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'principal_components')"
         .format(self.name),
         cursor=self.cursor)
     self.components.table_info = False
     # Vertica exposes the singular values; they are stored here under the
     # name explained_variance.
     self.explained_variance = to_tablesample(
         query=
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'singular_values')"
         .format(self.name),
         cursor=self.cursor)
     self.explained_variance.table_info = False
     # 'columns' holds the per-column statistics (means) of the input.
     self.mean = to_tablesample(
         query=
         "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'columns')"
         .format(self.name),
         cursor=self.cursor)
     self.mean.table_info = False
     return (self)
Beispiel #11
0
def elbow(X: list,
		  input_relation: str,
		  cursor,
		  n_cluster = (1, 15),
		  init = "kmeanspp",
		  max_iter: int = 50,
		  tol: float = 1e-4):
	"""Draw the KMeans elbow curve and return the scores as a tablesample.

	X : list -- columns used to train the KMeans models.
	input_relation : str -- relation used to train the models.
	cursor -- database cursor.
	n_cluster -- (start, stop) range of k values to try, or an explicit
		list of k values.
	init, max_iter, tol -- forwarded to KMeans.

	NOTE(review): the collected metric is metrics row 3, plotted as
	'Between-Cluster SS / Total SS', but the returned dict labels it
	"Within-Cluster SS" — the two labels look inconsistent; confirm.
	"""
	import matplotlib.pyplot as plt
	from vertica_ml_python.learn.cluster import KMeans
	schema, relation = schema_relation(input_relation)
	schema = str_column(schema)
	relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
	all_within_cluster_SS = []
	L = [i for i in range(n_cluster[0], n_cluster[1])] if not(type(n_cluster) == list) else n_cluster
	for i in L:
		# One throw-away model per k, under a deterministic temporary name;
		# it is dropped before and after each fit.
		cursor.execute("DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha))
		model = KMeans("{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha), cursor, i, init, max_iter, tol)
		model.fit(input_relation, X)
		all_within_cluster_SS += [float(model.metrics.values["value"][3])]
		model.drop()
	plt.figure(figsize = (10,8))
	plt.rcParams['axes.facecolor'] = '#F4F4F4'
	plt.grid()
	plt.plot(L, all_within_cluster_SS, marker = "s", color = "#214579")
	plt.title("Elbow Curve")
	plt.xlabel('Number of Clusters')
	plt.ylabel('Between-Cluster SS / Total SS')
	plt.subplots_adjust(left = 0.2)
	plt.show()
	values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
	return tablesample(values = values, table_info = False)
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Record the training metadata and collect the response classes.

     The distinct, non-NULL values of the response column are read from
     the training relation and stored, sorted ascending, in self.classes.
     Returns self.
     """
     self.input_relation = input_relation
     self.test_relation = test_relation or input_relation
     self.X = [str_column(col) for col in X]
     self.y = str_column(y)
     sql = "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1"
     self.cursor.execute(sql.format(self.y, input_relation, self.y))
     self.classes = [row[0] for row in self.cursor.fetchall()]
     return self
def train_test_split(input_relation: str, cursor, test_size: float = 0.33):
    """Split a relation into train/test views and return their names.

    input_relation : str -- relation to split.
    cursor -- database cursor.
    test_size : float -- expected fraction of rows in the test set
        (the split uses RANDOM(), so actual sizes vary per call).

    Returns a (train_view, test_view) tuple of qualified view names.
    NOTE(review): when test_size = 0.5 the derived test and train suffixes
    are both "50", so the two views would collide — confirm callers never
    pass 0.5.
    """
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    # Suffix the view names with the rounded test/train percentages.
    test_name, train_name = "{}_{}".format(relation_alpha, int(
        test_size * 100)), "{}_{}".format(relation_alpha,
                                          int(100 - test_size * 100))
    cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_{}".format(
            schema, relation_alpha))
    cursor.execute("DROP VIEW IF EXISTS {}.vpython_train_test_split_{}".format(
        schema, test_name))
    cursor.execute("DROP VIEW IF EXISTS {}.vpython_train_test_split_{}".format(
        schema, train_name))
    # Materialize the random assignment once so both views agree on it.
    query = "CREATE TABLE {}.vpython_train_test_split_{} AS SELECT *, (CASE WHEN RANDOM() < {} THEN True ELSE False END) AS test FROM {}".format(
        schema, relation_alpha, test_size, input_relation)
    cursor.execute(query)
    query = "CREATE VIEW {}.vpython_train_test_split_{} AS SELECT * FROM {} WHERE test".format(
        schema, test_name,
        "{}.vpython_train_test_split_{}".format(schema, relation_alpha))
    cursor.execute(query)
    query = "CREATE VIEW {}.vpython_train_test_split_{} AS SELECT * FROM {} WHERE NOT(test)".format(
        schema, train_name,
        "{}.vpython_train_test_split_{}".format(schema, relation_alpha))
    cursor.execute(query)
    return ("{}.vpython_train_test_split_{}".format(schema, train_name),
            "{}.vpython_train_test_split_{}".format(schema, test_name))
def best_k(X: list,
           input_relation: str,
           cursor,
           n_cluster=(1, 100),
           init="kmeanspp",
           max_iter: int = 50,
           tol: float = 1e-4,
           elbow_score_stop=0.8):
    """Return the first k whose KMeans elbow score exceeds the threshold.

    X : list -- columns used to train the KMeans models.
    input_relation : str -- relation used to train the models.
    cursor -- database cursor.
    n_cluster -- (start, stop) range of k values to try, or an explicit
        list of k values.
    init, max_iter, tol -- forwarded to KMeans.
    elbow_score_stop -- elbow score (Between-Cluster SS / Total SS)
        threshold that stops the search.

    When no k reaches the threshold, the last k tried is returned and a
    warning is printed with its score.
    """
    from vertica_ml_python.learn.cluster import KMeans
    L = range(n_cluster[0],
              n_cluster[1]) if not isinstance(n_cluster, list) else n_cluster
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    for i in L:
        # One throw-away model per k, under a deterministic temporary name.
        cursor.execute(
            "DROP MODEL IF EXISTS {}._vpython_kmeans_tmp_model_{}".format(
                schema, relation_alpha))
        model = KMeans(
            "{}._vpython_kmeans_tmp_model_{}".format(schema, relation_alpha),
            cursor, i, init, max_iter, tol)
        model.fit(input_relation, X)
        # metrics row 3 = Between-Cluster SS / Total SS (the elbow score).
        score = model.metrics.values["value"][3]
        if (score > elbow_score_stop):
            return i
    # Removed the dead `score_prev` assignment present in the original.
    print(
        "/!\\ The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
 def deployInverseSQL(self, key_columns: list = []):
     """Return the SQL expression applying the inverse PCA transform.

     key_columns : list -- optional columns to carry through unchanged;
         each one is quoted with str_column before being embedded.
     """
     sql = "APPLY_INVERSE_PCA({} USING PARAMETERS model_name = '{}', match_by_pos = 'true'"
     if key_columns:
         quoted = ", ".join(str_column(item) for item in key_columns)
         sql += ", key_columns = '{}'".format(quoted)
     sql += ")"
     return sql.format(", ".join(self.X), self.name)
Beispiel #16
0
	def fit(self, input_relation: str, X: list):
		"""Fit the normalizer with NORMALIZE_FIT and cache its parameters.

		input_relation : str -- relation used to fit the normalizer.
		X : list -- names of the columns to normalize.

		The fitted parameters ('details' model attribute) are stored in
		self.param; returns self.
		"""
		self.input_relation = input_relation
		self.X = [str_column(col) for col in X]
		self.cursor.execute(
			"SELECT NORMALIZE_FIT('{}', '{}', '{}', '{}')".format(
				self.name, input_relation, ", ".join(self.X), self.method))
		attr_sql = ("SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS "
					"model_name = '{}', attr_name = 'details')").format(self.name)
		self.param = to_tablesample(query = attr_sql, cursor = self.cursor)
		self.param.table_info = False
		return self
Beispiel #17
0
	def fit(self, input_relation: str, X: list):
		"""Train an in-database KMEANS model.

		input_relation : str -- relation used to train the model.
		X : list -- names of the columns used for the clustering.

		self.init is either an init method name (str) or an explicit list
		of 'n_cluster' starting centers, each of size len(X); explicit
		centers are materialized in a temporary table fed to KMEANS.
		Stores the centers in self.cluster_centers and the parsed model
		metrics in self.metrics; returns self.
		"""
		self.input_relation = input_relation
		self.X = [str_column(column) for column in X]
		query = "SELECT KMEANS('{}', '{}', '{}', {} USING PARAMETERS max_iterations = {}, epsilon = {}".format(self.name, input_relation, ", ".join(self.X), self.n_cluster, self.max_iter, self.tol)
		name = "_vpython_kmeans_initial_centers_table_"
		schema, relation = schema_relation(input_relation)
		schema = str_column(schema)
		if (type(self.init) != str):
			# Explicit initial centers: validate count, dimension and
			# uniqueness before building the temporary centers table.
			self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
			if (len(self.init) != self.n_cluster):
				raise ValueError("'init' must be a list of 'n_cluster' = {} points".format(self.n_cluster))
			else:
				for item in self.init:
					if (len(X) != len(item)):
						raise ValueError("Each points of 'init' must be of size len(X) = {}".format(len(self.X)))
				# Duplicate check: drop the head each round and look for the
				# current point in the remainder of the list.
				temp_initial_centers = [item for item in self.init]
				for item in temp_initial_centers:
					del temp_initial_centers[0]
					if (item in temp_initial_centers):
						raise ValueError("All the points of 'init' must be different")
				# Build one SELECT per center and UNION them into the table.
				query0 = []
				for i in range(len(self.init)):
					line = []
					for j in range(len(self.init[0])):
						line += [str(self.init[i][j]) + " AS " + X[j]]
					line = ",".join(line)
					query0 += ["SELECT " + line]
				query0 = " UNION ".join(query0)
				query0 = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS {}".format(schema, name, query0)
				self.cursor.execute(query0)
				query += ", initial_centers_table = '" + name + "'"
		else:
			query += ", init_method = '" + self.init + "'"
		query += ")"
		self.cursor.execute(query)
		# Drop the centers table if it was created (IF EXISTS is a no-op
		# on the init-method path).
		self.cursor.execute("DROP TABLE IF EXISTS {}.{}".format(schema, name))
		self.cluster_centers = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name), cursor = self.cursor)
		self.cluster_centers.table_info = False
		# The 'metrics' attribute is a free-text report; parse the figures
		# out of it by splitting on their labels.
		query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'metrics')".format(self.name)
		self.cursor.execute(query)
		result = self.cursor.fetchone()[0]
		values = {"index": ["Between-Cluster Sum of Squares", "Total Sum of Squares", "Total Within-Cluster Sum of Squares", "Between-Cluster SS / Total SS", "converged"]}
		values["value"] = [float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), float(result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]), float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]) / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]), result.split("Converged: ")[1].split("\n")[0] == "True"]
		self.metrics = tablesample(values, table_info = False)
		return (self)
Beispiel #18
0
	def fit(self,
			input_relation: str, 
			X: list, 
			y: str,
			test_relation: str = ""):
		"""Train an in-database logistic regression (LOGISTIC_REG).

		input_relation : str -- relation used to train the model.
		X : list -- names of the predictor columns.
		y : str -- name of the response column.
		test_relation : str -- relation used to test the model; falls back
			to input_relation when empty.

		Stores the fitted coefficients ('details' model attribute) in
		self.coef and returns self.
		"""
		self.input_relation = input_relation
		self.test_relation = test_relation if (test_relation) else input_relation
		self.X = [str_column(column) for column in X]
		self.y = str_column(y)
		query = "SELECT LOGISTIC_REG('{}', '{}', '{}', '{}' USING PARAMETERS optimizer = '{}', epsilon = {}, max_iterations = {}"
		query = query.format(self.name, input_relation, self.y, ", ".join(self.X), self.solver, self.tol, self.max_iter)
		# self.C is passed as Vertica's lambda (regularization strength).
		query += ", regularization = '{}', lambda = {}".format(self.penalty, self.C)
		# ENet additionally needs the L1/L2 mixing ratio.
		if (self.penalty == 'ENet'):
			query += ", alpha = {}".format(self.l1_ratio)
		query += ")"
		self.cursor.execute(query)
		self.coef = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'details')".format(self.name), cursor = self.cursor)
		self.coef.table_info = False
		return (self)
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Train an in-database Naive Bayes classifier (NAIVE_BAYES).

     input_relation : str -- relation used to train the model.
     X : list -- names of the predictor columns.
     y : str -- name of the response column.
     test_relation : str -- relation used to test the model; falls back to
         input_relation when empty.

     Stores the distinct response classes (sorted ascending) in
     self.classes and returns self.
     """
     self.input_relation = input_relation
     self.test_relation = test_relation if (
         test_relation) else input_relation
     self.X = [str_column(column) for column in X]
     self.y = str_column(y)
     query = "SELECT NAIVE_BAYES('{}', '{}', '{}', '{}' USING PARAMETERS alpha = {})".format(
         self.name, input_relation, self.y, ", ".join(self.X), self.alpha)
     self.cursor.execute(query)
     # Collect the distinct (non NULL) response classes, sorted ascending.
     self.cursor.execute(
         "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".
         format(self.y, input_relation, self.y))
     classes = self.cursor.fetchall()
     self.classes = [item[0] for item in classes]
     return (self)
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Compute the per-class centroids of the predictors in-database.

     input_relation : str -- relation used to train the model.
     X : list -- names of the predictor columns.
     y : str -- name of the response (grouping) column.
     test_relation : str -- relation used to test the model; falls back to
         input_relation when empty.

     With p == 1 the centroid uses the (approximate) median, otherwise
     the mean. Stores the centroids in self.centroids and the class
     labels in self.classes; returns self.
     """
     func = "APPROXIMATE_MEDIAN" if (self.p == 1) else "AVG"
     self.input_relation = input_relation
     self.test_relation = test_relation if (
         test_relation) else input_relation
     self.X = [str_column(column) for column in X]
     self.y = str_column(y)
     # One aggregate per predictor, grouped by the response column.
     query = "SELECT {}, {} FROM {} WHERE {} IS NOT NULL GROUP BY {}".format(
         ", ".join([
             "{}({}) AS {}".format(func, column, column)
             for column in self.X
         ]), self.y, input_relation, self.y, self.y)
     self.centroids = to_tablesample(query=query, cursor=self.cursor)
     self.centroids.table_info = False
     # NOTE(review): the lookup uses the raw y, not the str_column'd
     # self.y — confirm tablesample keys are unquoted column names.
     self.classes = self.centroids.values[y]
     return (self)
 def deploySQL(self,
               n_components: int = 0,
               cutoff: float = 1,
               key_columns: list = []):
     """Return the SQL expression applying the PCA model.

     n_components : int -- number of components to keep; when 0, the
         cutoff (cumulative explained variance) is used instead.
     cutoff : float -- explained-variance cutoff, used only when
         n_components is 0.
     key_columns : list -- optional columns to carry through unchanged;
         each one is quoted with str_column before being embedded.
     """
     sql = "APPLY_PCA({} USING PARAMETERS model_name = '{}', match_by_pos = 'true'"
     if key_columns:
         quoted = ", ".join(str_column(item) for item in key_columns)
         sql += ", key_columns = '{}'".format(quoted)
     if n_components:
         sql += ", num_components = {}".format(n_components)
     else:
         sql += ", cutoff = {}".format(cutoff)
     sql += ")"
     return sql.format(", ".join(self.X), self.name)
Beispiel #22
0
	def fit(self, input_relation: str, X: list):
		"""Fit a one-hot encoder in-database (ONE_HOT_ENCODER_FIT).

		input_relation : str -- relation used to fit the encoder.
		X : list -- names of the columns to encode.

		Stores the learned categories in self.param and returns self.
		"""
		self.input_relation = input_relation
		self.X = [str_column(column) for column in X]
		query = "SELECT ONE_HOT_ENCODER_FIT('{}', '{}', '{}' USING PARAMETERS extra_levels = '{}')".format(self.name, input_relation, ", ".join(self.X), self.extra_levels)
		self.cursor.execute(query)
		# The model exposes integer and varchar categories separately, and
		# either attribute may be absent; try the union first, then each
		# attribute on its own.
		try:
			self.param = to_tablesample(query = "SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) x UNION ALL SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(self.name, self.name), cursor = self.cursor)
		except:
			try:
				# Only integer categories are available.
				self.param = to_tablesample(query = "SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) x".format(self.name), cursor = self.cursor)
			except:
				# Only varchar categories are available.
				self.param = to_tablesample(query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(self.name), cursor = self.cursor)
		self.param.table_info = False
		return (self)
Beispiel #23
0
 def fit(self,
         input_relation: str,
         X: list,
         y: str,
         test_relation: str = ""):
     """Train an in-database Random Forest regressor (RF_REGRESSOR).

     input_relation : str -- relation used to train the model.
     X : list -- names of the predictor columns.
     y : str -- name of the response column.
     test_relation : str -- relation used to test the model; falls back to
         input_relation when empty.

     Resolves the "auto"/"max" settings of self.max_features, trains the
     model and returns self.
     """
     self.input_relation = input_relation
     self.test_relation = test_relation if (
         test_relation) else input_relation
     self.X = [str_column(column) for column in X]
     self.y = str_column(y)
     # Translate the symbolic max_features settings into column counts.
     if (self.max_features == "auto"):
         self.max_features = int(len(self.X) / 3 + 1)
     elif (self.max_features == "max"):
         self.max_features = len(self.X)
     query = "SELECT RF_REGRESSOR('{}', '{}', '{}', '{}' USING PARAMETERS ntree = {}, mtry = {}, sampling_size = {}"
     query = query.format(self.name, input_relation, self.y,
                          ", ".join(self.X), self.n_estimators,
                          self.max_features, self.sample)
     query += ", max_depth = {}, max_breadth = {}, min_leaf_size = {}, min_info_gain = {}, nbins = {})".format(
         self.max_depth, int(self.max_leaf_nodes), self.min_samples_leaf,
         self.min_info_gain, self.nbins)
     self.cursor.execute(query)
     return (self)
Beispiel #24
0
	def fit(self, input_relation: str, X: list):
		"""Build a Vertica text index over the concatenated text columns.

		input_relation : str -- relation holding the text columns.
		X : list -- names of the text columns (concatenated together).

		Materializes the text into a staging table, creates the text
		index, computes the document-frequency based stop words, then
		stores self.stop_words and self.vocabulary; returns self.
		"""
		self.input_relation = input_relation
		self.X = [str_column(elem) for elem in X]
		schema, relation = schema_relation(input_relation)
		schema = str_column(schema)
		relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
		self.cursor.execute("DROP TABLE IF EXISTS {}.{}_countvectorizer_vpython CASCADE".format(schema, relation_alpha))
		# Staging table with a generated id, required by the text index.
		sql = "CREATE TABLE {}.{}_countvectorizer_vpython(id identity(2000) primary key, text varchar({})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"
		self.cursor.execute(sql.format(schema, relation_alpha, self.max_text_size))
		text = " || ".join(self.X) if not (self.lowercase) else "LOWER({})".format(" || ".join(self.X))
		if (self.ignore_special):
			# Strip everything but alphanumerics and whitespace.
			text = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text)
		sql = "INSERT INTO {}.{}_countvectorizer_vpython(text) SELECT {} FROM {}".format(schema, relation_alpha, text, input_relation)
		self.cursor.execute(sql)
		sql = "CREATE TEXT INDEX {} ON {}.{}_countvectorizer_vpython(id, text) stemmer NONE;".format(self.name, schema, relation_alpha)
		self.cursor.execute(sql)
		# Tokens outside the [min_df, max_df] document-frequency band (and,
		# optionally, beyond the max_features most frequent) are stop words.
		stop_words = "SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {} GROUP BY 1) x) y WHERE not(df BETWEEN {} AND {})".format(self.name, self.min_df, self.max_df)
		if (self.max_features > 0):
			stop_words += " OR (rnk > {})".format(self.max_features)
		self.cursor.execute(stop_words)
		self.stop_words = [item[0] for item in self.cursor.fetchall()]
		self.cursor.execute(self.deploySQL())
		self.vocabulary = [item[0] for item in self.cursor.fetchall()]
		return (self)
def fast_cv(algorithm: str,
            input_relation: str,
            cursor,
            X: list,
            y: str,
            cv: int = 3,
            metrics: list = None,
            params: dict = None,
            cutoff: float = -1):
    """Run Vertica CROSS_VALIDATE and return its raw result string.

    algorithm : str -- one of the supported estimators (several aliases
        are accepted and normalized below).
    input_relation : str -- relation used for the cross validation.
    cursor -- database cursor.
    X : list -- names of the predictor columns.
    y : str -- name of the response column.
    cv : int -- number of folds.
    metrics : list -- metrics to compute; defaults per algorithm family.
    params : dict -- hyper-parameter grid forwarded as cv_hyperparams.
    cutoff : float -- prediction cutoff, sent only when 0 <= cutoff <= 1.

    Mutable defaults ([], {}) were replaced with None sentinels; passing
    nothing behaves exactly as before.
    """
    # Normalize the algorithm aliases to Vertica's function names.
    if (algorithm.lower()
            in ("logistic_reg", "logistic_regression", "logisticregression")):
        algorithm = "logistic_reg"
    elif (algorithm.lower()
          in ("linear_reg", "linear_regression", "linearregression")):
        algorithm = "linear_reg"
    elif (algorithm.lower()
          in ("svm_classifier", "svmclassifier", "linearsvc")):
        algorithm = "svm_classifier"
    elif (algorithm.lower() in ("svm_regressor", "svmregressor", "linearsvr")):
        algorithm = "svm_regressor"
    elif (algorithm.lower() in ("naive_bayes", "naivebayes", "multinomialnb")):
        algorithm = "naive_bayes"
    metrics = [] if metrics is None else metrics
    if not (metrics):
        # Default metric set depends on classifier vs regressor.
        if algorithm in ("naive_bayes", "svm_classifier", "logistic_reg"):
            metrics = ["accuracy", "auc_roc", "auc_prc", "fscore"]
        elif algorithm in ("svm_regressor", "linear_reg"):
            metrics = ["MSE", "MAE", "rsquared", "explained_variance"]
    sql = "SELECT CROSS_VALIDATE('{}', '{}', '{}', '{}' USING PARAMETERS cv_fold_count = {}, cv_metrics = '{}'".format(
        algorithm, input_relation, y,
        ", ".join([str_column(item) for item in X]), cv, ", ".join(metrics))
    if (params):
        # NOTE(review): str(dict) embeds single quotes inside the quoted
        # SQL literal — confirm Vertica accepts this hyperparams format.
        sql += ", cv_hyperparams = '{}'".format(params)
    if 0 <= cutoff <= 1:
        sql += ", cv_prediction_cutoff = '{}'".format(cutoff)
    sql += ')'
    cursor.execute(sql)
    return (cursor.fetchone()[0])
Beispiel #26
0
	def fit(self, input_relation: str, X: list, key_columns: list = [], index: str = ""):
		X = [str_column(column) for column in X]
		self.X = X
		self.key_columns = [str_column(column) for column in key_columns]
		self.input_relation = input_relation
		schema, relation = schema_relation(input_relation)
		schema = str_column(schema)
		relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
		cursor = self.cursor
		if not(index):
			index = "id"
			main_table = "{}.main_{}_vpython_".format(schema, relation_alpha)
			cursor.execute("DROP TABLE IF EXISTS {}".format(main_table))
			sql = "CREATE TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(main_table, ", ".join(X + key_columns), input_relation, " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
			cursor.execute(sql)
		else:
			main_table = input_relation
		sql = ["POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i], self.p) for i in range(len(X))] 
		distance = "POWER({}, 1 / {})".format(" + ".join(sql), self.p)
		sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance FROM {} AS x CROSS JOIN {} AS y".format(index, index, distance, main_table, main_table)
		sql = "SELECT node_id, nn_id, SUM(CASE WHEN distance <= {} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({}) distance_table".format(self.eps, sql)
		cursor.execute("DROP TABLE IF EXISTS {}.graph_{}_vpython_".format(schema, relation_alpha))
		sql = "SELECT node_id, nn_id FROM ({}) x WHERE density > {} AND distance < {} AND node_id != nn_id".format(sql, self.min_samples, self.eps)
		cursor.execute(sql)
		graph = cursor.fetchall()
		main_nodes = list(dict.fromkeys([elem[0] for elem in graph] + [elem[1] for elem in graph]))
		clusters = {}
		for elem in main_nodes:
			clusters[elem] = None
		i = 0
		while (graph):
			node = graph[0][0]
			node_neighbor = graph[0][1]
			if (clusters[node] == None) and (clusters[node_neighbor] == None):
				clusters[node] = i 
				clusters[node_neighbor] = i
				i = i + 1
			else:
				if (clusters[node] != None):
					clusters[node_neighbor] = clusters[node]
				else:
					clusters[node] = clusters[node_neighbor]
			del(graph[0])
		try:
			f = open("dbscan_id_cluster_vpython.csv", 'w')
			for elem in clusters:
				f.write("{}, {}\n".format(elem, clusters[elem]))
			f.close()
			cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
			cursor.execute("CREATE TEMPORARY TABLE {}.dbscan_clusters(node_id int, cluster int) ON COMMIT PRESERVE ROWS".format(schema))
			cursor.execute("COPY {}.dbscan_clusters(node_id, cluster) FROM LOCAL './dbscan_id_cluster_vpython.csv' DELIMITER ',' ESCAPE AS '\\'".format(schema))
			cursor.execute("COMMIT")
			os.remove("dbscan_id_cluster_vpython.csv")
		except:
			os.remove("dbscan_id_cluster_vpython.csv")
			raise
		self.n_cluster = i
		cursor.execute("CREATE TABLE {} AS SELECT {}, COALESCE(cluster, -1) AS dbscan_cluster FROM {} AS x LEFT JOIN {}.dbscan_clusters AS y ON x.{} = y.node_id".format(self.name, ", ".join(self.X + self.key_columns), main_table, schema, index))
		cursor.execute("SELECT COUNT(*) FROM {} WHERE dbscan_cluster = -1".format(self.name))
		self.n_noise = cursor.fetchone()[0]
		cursor.execute("DROP TABLE IF EXISTS {}.main_{}_vpython_".format(schema, relation_alpha))
		cursor.execute("DROP TABLE IF EXISTS {}.dbscan_clusters".format(schema))
		return (self)
 def fit(self,
         input_relation: str,
         X: list,
         key_columns: list = None,
         index: str = ""):
     """Compute the Local Outlier Factor (LOF) of each row of the input
     relation and materialize the scored rows in the table ``self.name``.

     The pipeline builds a chain of temporary tables (distances -> lrd ->
     lof) using a Minkowski distance of order ``self.p`` over the
     ``self.n_neighbors`` nearest neighbours, then joins the final scores
     back onto the data. Rows whose score is non-finite (or > 1e100) get a
     ``lof_score`` of 0 and are counted in ``self.n_errors``.

     Parameters
     ----------
     input_relation: str
         Relation containing the data to score.
     X: list
         Columns used in the distance computation.
     key_columns: list, optional
         Extra columns carried through to the result table; they do not
         participate in the distance computation.
     index: str, optional
         Name of a unique row-identifier column. When empty, a temporary
         copy of the relation is created with a generated ``id`` column and
         rows with NULLs in any X column are filtered out.

     Returns
     -------
     self
     """
     # None-sentinel instead of a mutable default argument: a shared `[]`
     # default would be the same object across every call to fit().
     key_columns = [] if key_columns is None else key_columns
     X = [str_column(column) for column in X]
     self.X = X
     self.key_columns = [str_column(column) for column in key_columns]
     self.input_relation = input_relation
     cursor = self.cursor
     n_neighbors = self.n_neighbors
     p = self.p
     relation_alpha = ''.join(ch for ch in input_relation if ch.isalnum())
     schema, relation = schema_relation(input_relation)
     schema = str_column(schema)
     if not (index):
         # No unique key supplied: materialize the relation with a
         # generated row id, dropping rows with NULLs in the X columns.
         index = "id"
         relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
         main_table = "main_{}_vpython".format(relation_alpha)
         cursor.execute("DROP TABLE IF EXISTS {}.{}".format(
             schema, main_table))
         sql = "CREATE TEMPORARY TABLE {}.{} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
             schema, main_table, ", ".join(X + key_columns), input_relation,
             " AND ".join(["{} IS NOT NULL".format(item) for item in X]))
         cursor.execute(sql)
     else:
         main_table = input_relation
     # Minkowski distance of order p between rows x and y.
     sql = [
         "POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i], p)
         for i in range(len(X))
     ]
     distance = "POWER({}, 1 / {})".format(" + ".join(sql), p)
     # Cross join: keep only the n_neighbors + 1 closest rows per node
     # (the +1 accounts for the row matched with itself, distance 0).
     sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance, ROW_NUMBER() OVER(PARTITION BY x.{} ORDER BY {}) AS knn FROM {}.{} AS x CROSS JOIN {}.{} AS y".format(
         index, index, distance, index, distance, schema, main_table,
         schema, main_table)
     sql = "SELECT node_id, nn_id, distance, knn FROM ({}) distance_table WHERE knn <= {}".format(
         sql, n_neighbors + 1)
     cursor.execute("DROP TABLE IF EXISTS {}.distance_{}_vpython".format(
         schema, relation_alpha))
     sql = "CREATE TEMPORARY TABLE {}.distance_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
         schema, relation_alpha, sql)
     cursor.execute(sql)
     # k-distance of each node = distance to its k-th nearest neighbour.
     kdistance = "(SELECT node_id, nn_id, distance AS distance FROM {}.distance_{}_vpython WHERE knn = {}) AS kdistance_table".format(
         schema, relation_alpha, n_neighbors + 1)
     # Local reachability density: k / sum of reachability distances
     # (reach-dist = max(true distance, neighbour's k-distance)).
     lrd = "SELECT distance_table.node_id, {} / SUM(CASE WHEN distance_table.distance > kdistance_table.distance THEN distance_table.distance ELSE kdistance_table.distance END) AS lrd FROM ({}.distance_{}_vpython AS distance_table LEFT JOIN {} ON distance_table.nn_id = kdistance_table.node_id) x GROUP BY 1".format(
         n_neighbors, schema, relation_alpha, kdistance)
     cursor.execute("DROP TABLE IF EXISTS {}.lrd_{}_vpython".format(
         schema, relation_alpha))
     sql = "CREATE TEMPORARY TABLE {}.lrd_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
         schema, relation_alpha, lrd)
     cursor.execute(sql)
     # LOF = mean ratio of the neighbours' lrd to the node's own lrd.
     sql = "SELECT x.node_id, SUM(y.lrd) / (MAX(x.node_lrd) * {}) AS LOF FROM (SELECT n_table.node_id, n_table.nn_id, lrd_table.lrd AS node_lrd FROM {}.distance_{}_vpython AS n_table LEFT JOIN {}.lrd_{}_vpython AS lrd_table ON n_table.node_id = lrd_table.node_id) x LEFT JOIN {}.lrd_{}_vpython AS y ON x.nn_id = y.node_id GROUP BY 1".format(
         n_neighbors, schema, relation_alpha, schema, relation_alpha,
         schema, relation_alpha)
     cursor.execute("DROP TABLE IF EXISTS {}.lof_{}_vpython".format(
         schema, relation_alpha))
     sql = "CREATE TEMPORARY TABLE {}.lof_{}_vpython ON COMMIT PRESERVE ROWS AS {}".format(
         schema, relation_alpha, sql)
     cursor.execute(sql)
     # Join scores back to the data; non-finite scores (NaN: lof != lof,
     # or absurdly large) are mapped to 0.
     sql = "SELECT {}, (CASE WHEN lof > 1e100 OR lof != lof THEN 0 ELSE lof END) AS lof_score FROM {} AS x LEFT JOIN {}.lof_{}_vpython AS y ON x.{} = y.node_id".format(
         ", ".join(X + self.key_columns), main_table, schema,
         relation_alpha, index)
     sql = "CREATE TABLE {} AS {}".format(self.name, sql)
     cursor.execute(sql)
     sql = "SELECT COUNT(*) FROM {}.lof_{}_vpython z WHERE lof > 1e100 OR lof != lof".format(
         schema, relation_alpha)
     cursor.execute(sql)
     self.n_errors = cursor.fetchone()[0]
     # Clean up all intermediate temporary tables.
     cursor.execute("DROP TABLE IF EXISTS {}.main_{}_vpython".format(
         schema, relation_alpha))
     cursor.execute("DROP TABLE IF EXISTS {}.distance_{}_vpython".format(
         schema, relation_alpha))
     cursor.execute("DROP TABLE IF EXISTS {}.lrd_{}_vpython".format(
         schema, relation_alpha))
     cursor.execute("DROP TABLE IF EXISTS {}.lof_{}_vpython".format(
         schema, relation_alpha))
     return (self)
def cross_validate(estimator,
                   input_relation: str,
                   X: list,
                   y: str,
                   cv: int = 3,
                   pos_label=None,
                   cutoff: float = 0.5):
    """Run a k-fold cross validation of the estimator on the input relation.

    The relation is split into ``cv`` random folds via a temporary table
    holding a ``RANDOMINT(cv)`` column; for each fold, train/test views are
    created, the estimator is fitted on the training view and its metrics
    (regression or classification report) are collected.

    Parameters
    ----------
    estimator
        A model object exposing ``type`` ("regressor" or "classifier"),
        ``cursor``, ``name``, ``fit`` and the report methods.
    input_relation: str
        Relation used to build the folds.
    X: list
        Predictor columns.
    y: str
        Response column.
    cv: int, optional
        Number of folds (default 3).
    pos_label, optional
        Class to study for multiclass classifiers; must belong to
        ``estimator.classes`` when there are more than 2 classes.
    cutoff: float, optional
        Probability cutoff used by the classification report.

    Returns
    -------
    tablesample
        One row per metric with the per-fold values, their mean ("avg")
        and standard deviation ("std").

    Raises
    ------
    ValueError
        If the estimator is neither a regressor nor a classifier, or if
        ``pos_label`` is invalid for a multiclass estimator.
    """
    if (estimator.type == "regressor"):
        result = {
            "index": [
                "explained_variance", "max_error", "median_absolute_error",
                "mean_absolute_error", "mean_squared_error", "r2"
            ]
        }
    elif (estimator.type == "classifier"):
        result = {
            "index": [
                "auc", "prc_auc", "accuracy", "log_loss", "precision",
                "recall", "f1-score", "mcc", "informedness", "markedness",
                "csi"
            ]
        }
    else:
        raise ValueError(
            "Cross Validation is only possible for Regressors and Classifiers")
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = ''.join(ch for ch in relation if ch.isalnum())
    # Suffixes encode the test/train percentages (e.g. 33 / 67 for cv=3).
    test_name, train_name = "{}_{}".format(relation_alpha,
                                           int(1 / cv * 100)), "{}_{}".format(
                                               relation_alpha,
                                               int(100 - 1 / cv * 100))
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    # Assign each row a random fold id in [0, cv).
    query = "CREATE TEMPORARY TABLE {}.vpython_train_test_split_cv_{} ON COMMIT PRESERVE ROWS AS SELECT *, RANDOMINT({}) AS test FROM {}".format(
        schema, relation_alpha, cv, input_relation)
    estimator.cursor.execute(query)
    for i in range(cv):
        # Best-effort model cleanup: the model may not exist yet.
        # `except Exception` (not a bare except) so that KeyboardInterrupt
        # and SystemExit still propagate.
        try:
            estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
                estimator.name))
        except Exception:
            pass
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, test_name))
        estimator.cursor.execute(
            "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
                schema, train_name))
        # Fold i is the test set, all other folds form the training set.
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test = {})".format(
            schema, test_name,
            "{}.vpython_train_test_split_cv_{}".format(schema,
                                                       relation_alpha), i)
        estimator.cursor.execute(query)
        query = "CREATE VIEW {}.vpython_train_test_split_cv_{} AS SELECT * FROM {} WHERE (test != {})".format(
            schema, train_name,
            "{}.vpython_train_test_split_cv_{}".format(schema,
                                                       relation_alpha), i)
        estimator.cursor.execute(query)
        estimator.fit(
            "{}.vpython_train_test_split_cv_{}".format(schema, train_name), X,
            y, "{}.vpython_train_test_split_cv_{}".format(schema, test_name))
        if (estimator.type == "regressor"):
            result["{}-fold".format(
                i + 1)] = estimator.regression_report().values["value"]
        else:
            if (len(estimator.classes) > 2) and (pos_label
                                                 not in estimator.classes):
                raise ValueError(
                    "'pos_label' must be in the estimator classes, it must be the main class to study for the Cross Validation"
                )
            # Prefer the per-label report; fall back to the global report
            # when the labelled one fails (e.g. binary estimator with
            # pos_label left to None).
            try:
                result["{}-fold".format(i +
                                        1)] = estimator.classification_report(
                                            labels=[pos_label],
                                            cutoff=cutoff).values["value"]
            except Exception:
                result["{}-fold".format(i +
                                        1)] = estimator.classification_report(
                                            cutoff=cutoff).values["value"]
        try:
            estimator.cursor.execute("DROP MODEL IF EXISTS {}".format(
                estimator.name))
        except Exception:
            pass
    # Derive the metric count from the index list itself so it can never
    # drift out of sync with the metric names above.
    n = len(result["index"])
    total = [[] for item in range(n)]
    for i in range(cv):
        for k in range(n):
            total[k] += [result["{}-fold".format(i + 1)][k]]
    result["avg"] = [np.mean(item) for item in total]
    result["std"] = [np.std(item) for item in total]
    estimator.cursor.execute(
        "DROP TABLE IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, relation_alpha))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, test_name))
    estimator.cursor.execute(
        "DROP VIEW IF EXISTS {}.vpython_train_test_split_cv_{}".format(
            schema, train_name))
    return (tablesample(values=result, table_info=False).transpose())