コード例 #1
0
def load_titanic(cursor, schema: str = 'public', name='titanic'):
    try:
        query = "CREATE TABLE {}.{}(\"pclass\" Integer, \"survived\" Integer, \"name\" Varchar(164), \"sex\" Varchar(20), \"age\" Numeric(6,3), \"sibsp\" Integer, \"parch\" Integer, \"ticket\" Varchar(36), \"fare\" Numeric(10,5), \"cabin\" Varchar(30), \"embarked\" Varchar(20), \"boat\" Varchar(100), \"body\" Integer, \"home.dest\" Varchar(100));"
        query += "COPY {}.{}(\"pclass\", \"survived\", \"name\", \"sex\", \"age\", \"sibsp\", \"parch\", \"ticket\", \"fare\", \"cabin\", \"embarked\", \"boat\", \"body\", \"home.dest\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/titanic.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    except:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
コード例 #2
0
def load_smart_meters(cursor, schema: str = 'public', name='smart_meters'):
    try:
        query = "CREATE TABLE {}.{}(\"time\" Timestamp, \"val\" Numeric(11,7), \"id\" Integer);"
        query += "COPY {}.{}(\"time\", \"val\", \"id\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/smart_meters.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    except:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
コード例 #3
0
def load_iris(cursor, schema: str = 'public', name='iris'):
    try:
        query = "CREATE TABLE {}.{}(\"SepalLengthCm\" Numeric(5,2), \"SepalWidthCm\" Numeric(5,2), \"PetalLengthCm\" Numeric(5,2), \"PetalWidthCm\" Numeric(5,2), \"Species\" Varchar(30));"
        query += "COPY {}.{}(\"Id\" FILLER Integer, \"SepalLengthCm\", \"SepalWidthCm\", \"PetalLengthCm\", \"PetalWidthCm\", \"Species\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/iris.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    except:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
コード例 #4
0
def load_amazon(cursor, schema: str = 'public', name='amazon'):
    try:
        query = "CREATE TABLE {}.{}(\"number\" Integer, \"date\" Date, \"state\" Varchar(32));"
        query += "COPY {}.{}(\"number\", \"date\", \"state\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/amazon.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    except:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
コード例 #5
0
def load_winequality(cursor, schema: str = 'public', name='winequality'):
    try:
        query = "CREATE TABLE {}.{}(\"fixed_acidity\" Numeric(6,3), \"volatile_acidity\" Numeric(7,4), \"citric_acid\" Numeric(6,3), \"residual_sugar\" Numeric(7,3), \"chlorides\" Float, \"free_sulfur_dioxide\" Numeric(7,2), \"total_sulfur_dioxide\" Numeric(7,2), \"density\" Float, \"pH\" Numeric(6,3), \"sulphates\" Numeric(6,3), \"alcohol\" Float, \"quality\" Integer, \"good\" Integer, \"color\" Varchar(20));"
        query += "COPY {}.{}(\"fixed_acidity\", \"volatile_acidity\", \"citric_acid\", \"residual_sugar\", \"chlorides\", \"free_sulfur_dioxide\", \"total_sulfur_dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\", \"quality\", \"good\", \"color\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;"
        query = query.format(
            str_column(schema), str_column(name), str_column(schema),
            str_column(name),
            os.path.dirname(vertica_ml_python.__file__) +
            "/learn/data/winequality.csv")
        cursor.execute(query)
        vdf = vDataframe(name, cursor, schema=schema)
    except:
        vdf = vDataframe(name, cursor, schema=schema)
    return (vdf)
コード例 #6
0
	def plot(self):
		from vertica_ml_python import vDataframe
		vdf = vDataframe(self.input_relation, self.cursor)
		self.add_to_vdf(vdf, "kmeans_cluster")
		if (len(self.X) <= 3):
			vdf.scatter(columns = self.X, catcol = "kmeans_cluster", max_cardinality = 100, max_nb_points = 10000)
		else:
			raise ValueError("Clustering Plots are only available in 2D or 3D")
コード例 #7
0
def vdf_from_relation(relation: str,
                      name: str = "VDF",
                      cursor=None,
                      dsn: str = "",
                      schema: str = "public"):
    check_types([("relation", relation, [str], False),
                 ("name", name, [str], False), ("dsn", dsn, [str], False),
                 ("schema", schema, [str], False)])
    name = ''.join(ch for ch in name if ch.isalnum())
    from vertica_ml_python import vDataframe
    vdf = vDataframe("", empty=True)
    vdf.dsn = dsn
    if (cursor == None):
        from vertica_ml_python import vertica_cursor
        cursor = vertica_cursor(dsn)
    vdf.input_relation = name
    vdf.main_relation = relation
    vdf.schema = schema
    vdf.cursor = cursor
    vdf.query_on = False
    vdf.time_on = False
    cursor.execute("DROP TABLE IF EXISTS {}._vpython_{}_test_;".format(
        str_column(schema), name))
    cursor.execute(
        "CREATE TEMPORARY TABLE {}._vpython_{}_test_ ON COMMIT PRESERVE ROWS AS SELECT * FROM {} LIMIT 10;"
        .format(str_column(schema), name, relation))
    cursor.execute(
        "SELECT column_name, data_type FROM columns WHERE table_name = '_vpython_{}_test_' AND table_schema = '{}'"
        .format(name, schema.replace('"', '')))
    result = cursor.fetchall()
    cursor.execute("DROP TABLE IF EXISTS {}._vpython_{}_test_;".format(
        str_column(schema), name))
    vdf.columns = ['"' + item[0] + '"' for item in result]
    vdf.where = []
    vdf.order_by = ['' for i in range(100)]
    vdf.exclude_columns = []
    vdf.history = []
    vdf.saving = []
    for column, ctype in result:
        column = '"' + column + '"'
        from vertica_ml_python.vcolumn import vColumn
        new_vColumn = vColumn(column,
                              parent=vdf,
                              transformations=[
                                  (column, ctype,
                                   category_from_type(ctype=ctype))
                              ])
        setattr(vdf, column, new_vColumn)
        setattr(vdf, column[1:-1], new_vColumn)
    return (vdf)
コード例 #8
0
 def to_vdf(self):
     from vertica_ml_python import vDataframe
     return (vDataframe(self.name, self.cursor))
コード例 #9
0
	def plot(self):
		from vertica_ml_python import vDataframe
		if (len(self.X) <= 3):
			vDataframe(self.name, self.cursor).scatter(columns = self.X, catcol = "dbscan_cluster", max_cardinality = 100, max_nb_points = 10000)
		else:
			raise ValueError("Clustering Plots are only available in 2D or 3D")
コード例 #10
0
def read_json(path: str,
              cursor,
              schema: str = 'public',
              table_name: str = '',
              usecols: list = [],
              new_name: dict = {},
              insert: bool = False):
    check_types([("schema", schema, [str], False),
                 ("table_name", table_name, [str], False),
                 ("usecols", usecols, [list], False),
                 ("new_name", new_name, [dict], False),
                 ("insert", insert, [bool], False)])
    file = path.split("/")[-1]
    file_extension = file[-4:len(file)]
    if (file_extension != 'json'):
        raise ValueError("The file extension is incorrect !")
    table_name = table_name if (
        table_name) else path.split("/")[-1].split(".json")[0]
    query = "SELECT column_name, data_type FROM columns WHERE table_name='{}' AND table_schema='{}'".format(
        table_name, schema)
    cursor.execute(query)
    column_name = cursor.fetchall()
    if ((column_name != []) and not (insert)):
        raise Exception("The table \"{}\".\"{}\" already exists !".format(
            schema, table_name))
    elif ((column_name == []) and (insert)):
        raise Exception("The table \"{}\".\"{}\" doesn't exist !".format(
            schema, table_name))
    else:
        input_relation, flex_name = '"{}"."{}"'.format(
            schema, table_name), "_vpython" + str(
                np.random.randint(10000000)) + "_flex_"
        cursor.execute(
            "CREATE FLEX LOCAL TEMP TABLE {}(x int) ON COMMIT PRESERVE ROWS;".
            format(flex_name))
        cursor.execute("COPY {} FROM LOCAL '{}' PARSER FJSONPARSER();".format(
            flex_name, path))
        cursor.execute(
            "SELECT compute_flextable_keys('{}');".format(flex_name))
        cursor.execute(
            "SELECT key_name, data_type_guess FROM {}_keys".format(flex_name))
        result = cursor.fetchall()
        dtype = {}
        for column_dtype in result:
            try:
                if ("Varchar" not in column_dtype[1]):
                    cursor.execute('SELECT "{}"::{} FROM {} LIMIT 1000'.format(
                        column_dtype[0], column_dtype[1], flex_name))
                dtype[column_dtype[0]] = column_dtype[1]
            except:
                dtype[column_dtype[0]] = "Varchar(100)"
        if not (insert):
            cols = [column for column in dtype
                    ] if not (usecols) else [column for column in usecols]
            for i, column in enumerate(cols):
                cols[i] = '"{}"::{} AS "{}"'.format(column.replace(
                    '"', ''), dtype[column], new_name[column]) if (
                        column in new_name) else '"{}"::{}'.format(
                            column.replace('"', ''), dtype[column])
            cursor.execute("CREATE TABLE {} AS SELECT {} FROM {}".format(
                input_relation, ", ".join(cols), flex_name))
            print("The table {} has been successfully created.".format(
                input_relation))
        else:
            column_name_dtype = {}
            for elem in column_name:
                column_name_dtype[elem[0]] = elem[1]
            final_cols = {}
            for column in column_name_dtype:
                final_cols[column] = None
            for column in column_name_dtype:
                if column in dtype:
                    final_cols[column] = column
                else:
                    for col in new_name:
                        if (new_name[col] == column):
                            final_cols[column] = col
            final_transformation = []
            for column in final_cols:
                final_transformation += [
                    'NULL AS "{}"'.format(column)
                ] if (final_cols[column] == None) else [
                    '"{}"::{} AS "{}"'.format(
                        final_cols[column], column_name_dtype[column], column)
                ]
            cursor.execute("INSERT INTO {} SELECT {} FROM {}".format(
                input_relation, ", ".join(final_transformation), flex_name))
        cursor.execute("DROP TABLE " + flex_name)
        from vertica_ml_python import vDataframe
        return vDataframe(table_name, cursor, schema=schema)
コード例 #11
0
def read_csv(path: str,
             cursor,
             schema: str = 'public',
             table_name: str = '',
             delimiter: str = ',',
             header: bool = True,
             header_names: list = [],
             null: str = '',
             enclosed_by: str = '"',
             escape: str = '\\',
             genSQL: bool = False,
             parse_n_lines: int = -1,
             insert: bool = False):
    check_types([("schema", schema, [str], False),
                 ("table_name", table_name, [str], False),
                 ("delimiter", delimiter, [str], False),
                 ("header", header, [bool], False),
                 ("header_names", header_names, [list], False),
                 ("null", null, [str], False),
                 ("enclosed_by", enclosed_by, [str], False),
                 ("escape", escape, [str], False),
                 ("genSQL", genSQL, [bool], False),
                 ("parse_n_lines", parse_n_lines, [int, float], False),
                 ("insert", insert, [bool], False)])
    file = path.split("/")[-1]
    file_extension = file[-3:len(file)]
    if (file_extension != 'csv'):
        raise ValueError("The file extension is incorrect !")
    table_name = table_name if (
        table_name) else path.split("/")[-1].split(".csv")[0]
    query = "SELECT column_name FROM columns WHERE table_name='{}' AND table_schema='{}'".format(
        table_name, schema)
    cursor.execute(query)
    result = cursor.fetchall()
    if ((result != []) and not (insert)):
        raise Exception("The table \"{}\".\"{}\" already exists !".format(
            schema, table_name))
    elif ((result == []) and (insert)):
        raise Exception("The table \"{}\".\"{}\" doesn't exist !".format(
            schema, table_name))
    else:
        input_relation = '"{}"."{}"'.format(schema, table_name)
        f = open(path, 'r')
        file_header = f.readline().replace('\n',
                                           '').replace('"',
                                                       '').split(delimiter)
        f.close()
        if ((header_names == []) and (header)):
            header_names = file_header
        elif (len(file_header) > len(header_names)):
            header_names += [
                "ucol{}".format(i + len(header_names))
                for i in range(len(file_header) - len(header_names))
            ]
        if ((parse_n_lines > 0) and not (insert)):
            f = open(path, 'r')
            f2 = open(path[0:-4] + "_vpython_copy.csv", 'w')
            for i in range(parse_n_lines + int(header)):
                line = f.readline()
                f2.write(line)
            f.close()
            f2.close()
            path_test = path[0:-4] + "_vpython_copy.csv"
        else:
            path_test = path
        query = ""
        if not (insert):
            dtype = pcsv(path_test, cursor, delimiter, header, header_names,
                         null, enclosed_by, escape)
            if (parse_n_lines > 0):
                os.remove(path[0:-4] + "_vpython_copy.csv")
            query = "CREATE TABLE {}({});\n".format(
                input_relation, ", ".join([
                    '"{}" {}'.format(column, dtype[column])
                    for column in header_names
                ]))
        skip = " SKIP 1" if (header) else ""
        query += "COPY {}({}) FROM LOCAL '{}' DELIMITER '{}' NULL '{}' ENCLOSED BY '{}' ESCAPE AS '{}'{};".format(
            input_relation,
            ", ".join(['"' + column + '"' for column in header_names]), path,
            delimiter, null, enclosed_by, escape, skip)
        if (genSQL):
            print(query)
        else:
            cursor.execute(query)
            if not (insert):
                print("The table {} has been successfully created.".format(
                    input_relation))
            from vertica_ml_python import vDataframe
            return vDataframe(table_name, cursor, schema=schema)