Example 1
    def __init__(self,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 dl=False):
        """
        Transform and roll out
        :param master: Spark master: 'local' or the IP address of a cluster
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        """

        if verbose is True:
            logging.basicConfig(format="%(message)s", level=logging.INFO)
        elif verbose is False:
            # Keep INFO messages silent when verbose mode is off
            logging.disable(logging.INFO)

        if dl is True:
            Optimus.add_spark_packages([
                "databricks:spark-deep-learning:1.1.0-spark2.3-s_2.11 pyspark-shell"
            ])

            Spark.instance = Spark(master, app_name)
            from optimus.dl.models import DL
            self.dl = DL()
        else:
            Spark.instance = Spark(master, app_name)

        if path is None:
            path = os.getcwd()

        # Initialize Spark
        logging.info("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logging.info(STARTING_OPTIMUS)
        if checkpoint is True:
            self.set_check_point_folder(path, file_system)

        logging.info(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read
        self.profiler = Profiler()
        self.ml = ML()
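A minimal usage sketch for this constructor, assuming the package exposes the class at the top level as `from optimus import Optimus`; the argument values are illustrative:

from optimus import Optimus

# Verbose logging plus a checkpoint folder in the current working directory
op = Optimus(master="local[*]", app_name="optimus", checkpoint=True, verbose=True)

# With dl=True the databricks spark-deep-learning package is added and op.dl becomes available
# op = Optimus(dl=True)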
Example 2
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv"
)

t.create(
    op, "load.json", "remote_json", "df",
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json"
)

t.create(
    op, "load.parquet", "remote_parquet", "df",
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet"
)

# +
from optimus.profiler.profiler import Profiler
p = Profiler()

print(p.run(source_df1, "japanese name"))
# -

# df_string = source_df.cols.cast("*","str")
t.create(source_df, "save.csv", None, None, "test.csv")

t.create(None, "save.json", None, None, "test.json")

t.create(None, "save.parquet", None, None, "test.parquet")

t.run()

source_df.table()
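Outside the test harness, the calls above map to direct operations on the Optimus instance and the dataframe; a sketch assuming `op`, `p` and the `load`/`save` accessors behave as the `t.create(...)` calls suggest:

# Load the remote CSV used above and profile a single column
df = op.load.csv("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")
print(p.run(df, "japanese name"))

# Persist the dataframe locally in the three formats exercised above
df.save.csv("test.csv")
df.save.json("test.json")
df.save.parquet("test.parquet")
df.table()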
Example 3
    def __init__(self,
                 session=None,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=None,
                 driver_class_path=None,
                 options=None,
                 additional_options=None,
                 comm=None,
                 load_avro=False,
                 cache=True):
        """
        Transform and roll out
        :param master: Spark master: 'local' or the IP address of a cluster
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include in the session.
        :type jars: (list[str])

        """

        self.preserve = False

        Optimus.cache = cache

        if comm is True:
            Comm.instance = Comm()
        else:
            Comm.instance = comm

        if jars is None:
            jars = []

        if driver_class_path is None:
            driver_class_path = []

        if session is None:
            # Creating Spark Session
            # If a Spark session is not passed as an argument, create one

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            # Initialize as lists
            self.packages = val_to_list(packages)
            self.repositories = val_to_list(repositories)
            self.jars = val_to_list(jars)
            self.driver_class_path = val_to_list(driver_class_path)

            self.additional_options = additional_options

            self.verbose(verbose)

            # Because Avro depends on an external package, you can decide whether it should be loaded
            if load_avro == "2.4":
                self._add_spark_packages(
                    ["org.apache.spark:spark-avro_2.12:2.4.3"])

            elif load_avro == "2.3":
                self._add_spark_packages(
                    ["com.databricks:spark-avro_2.11:4.0.0"])

            jdbc_jars = [
                "/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar",
                "/jars/RedshiftJDBC42-1.2.16.1027.jar",
                "/jars/mysql-connector-java-8.0.16.jar", "/jars/ojdbc8.jar",
                "/jars/postgresql-42.2.5.jar", "/jars/presto-jdbc-0.224.jar",
                "/jars/spark-cassandra-connector_2.11-2.4.1.jar",
                "/jars/sqlite-jdbc-3.27.2.1.jar",
                "/jars/mssql-jdbc-7.4.1.jre8.jar"
            ]

            self._add_jars(absolute_path(jdbc_jars, "uri"))
            self._add_driver_class_path(absolute_path(jdbc_jars, "posix"))

            self._create_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference
            # logger.print("Spark session")
            Spark.instance = Spark().load(session)

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        # Pickling
        Spark.instance.sc.addPyFile(absolute_path("/infer.py"))

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read

        # Create singleton profiler
        Profiler.instance = Profiler()
        self.profiler = Profiler.instance
        self.ml = ML()

        # Set global output as html
        self.output("html")
Example 4
def func(col_name, attrs):
    return F.col(col_name) * 2


numeric_col = "height(ft)"
numeric_col_B = "rank"
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
new_col = "new col"
array_col = "attributes"
# -

from optimus.profiler.profiler import Profiler
p = Profiler()

p.run(source_df, "*")

t.create(p, "dataset", None, 'json', None, source_df, "*")

t.run()

mismatch = {
    "names": "dd/mm/yyyy",
    "height(ft)":
    r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$',
    "function": "yyyy-mm-dd"
}
t.create(p,
         "dataset",
Example 5
def func(col_name, attrs):
    return F.col(col_name) * 2


numeric_col = "height(ft)"
numeric_col_B = "rank"
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
new_col = "new col"
array_col = "attributes"
# -

from optimus.profiler.profiler import Profiler
p = Profiler()

from optimus.ml import feature as fe
t.create(p, "minimal_stats", None, 'json', None, source_df, "*")

t.create(p, "to_json", None, 'json', None, source_df, "*")

t.create(p, "columns", None, 'json', None, source_df, "*")

t.create(p, "general_stats", None, 'json', None, source_df, "*")

t.run()

source_df.sample()
Example 6
    def __init__(self,
                 session=None,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 dl=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=None,
                 options=None,
                 additional_options=None,
                 enricher_host="localhost",
                 enricher_port=27017,
                 queue_url=None,
                 queue_exchange=None,
                 queue_routing_key="optimus"):
        """
        Transform and roll out
        :param master: Spark master: 'local' or the IP address of a cluster
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.1.0/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include in the session.
        :type jars: (list[str])

        """
        if session is None:
            # print("Creating Spark Session...")
            # If a Spark session is not passed as an argument, create it

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            if packages is None:
                packages = []
            else:
                packages = val_to_list(packages)

            self.packages = packages
            self.repositories = repositories

            if jars is None:
                jars = []

            self.jars = jars
            self.additional_options = additional_options

            self.verbose(verbose)

            # Load Avro.
            # TODO: if Spark 2.4 is going to be used this is not necessary.
            #  Maybe we can check a priori which version of Spark is going to be used
            # self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

            if dl is True:
                self._add_spark_packages(
                    ["databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11"])

                self._start_session()

                from optimus.dl.models import DL
                self.dl = DL()
            else:
                self._start_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference
            Spark.instance = session

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read
        self.profiler = Profiler(queue_url=queue_url,
                                 queue_exchange=queue_exchange,
                                 queue_routing_key=queue_routing_key)
        self.ml = ML()
        self.enricher = Enricher(
            op=self,
            host=enricher_host,
            port=enricher_port,
        )
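This variant also wires the profiler to a message queue and attaches an Enricher; a sketch of how those parameters might be passed (host, port and routing key are placeholders):

from optimus import Optimus

op = Optimus(
    app_name="optimus",
    dl=True,                      # pulls spark-deep-learning and exposes op.dl
    enricher_host="localhost",    # placeholder enricher host
    enricher_port=27017,          # placeholder enricher port
    queue_routing_key="optimus",  # routing key handed to the Profiler
)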
Example 7
    def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
                 file_system="local",
                 verbose=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=None,
                 driver_class_path=None,
                 options=None,
                 additional_options=None,
                 queue_url=None,
                 queue_exchange=None,
                 queue_routing_key="optimus"
                 ):

        """
        Transform and roll out
        :param master: Spark master: 'local' or the IP address of a cluster
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include in the session.
        :type jars: (list[str])

        """
        if session is None:
            # print("Creating Spark Session...")
            # If a Spark session is not passed as an argument, create it

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            if packages is None:
                packages = []
            else:
                packages = val_to_list(packages)

            self.packages = packages
            self.repositories = repositories

            # Jars
            self.jars = jars
            self._add_jars(jars)

            # Class Drive Path
            self.driver_class_path = driver_class_path
            self._add_driver_class_path(driver_class_path)

            # Additional Options
            self.additional_options = additional_options

            self.verbose(verbose)

            # Load Avro.
            # TODO:
            #  if Spark 2.4 is going to be used this is not necessary.
            #  Maybe we can check a priori which version of Spark is going to be used
            self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

            # Resolve jar locations relative to the library directory without
            # shadowing the `path` parameter used for the checkpoint folder below
            optimus_path = os.path.dirname(os.path.abspath(__file__))

            def c(files):
                return [Path(optimus_path + file).as_posix() for file in files]

            # Add databases jars
            self._add_jars(["../jars/RedshiftJDBC42-1.2.16.1027.jar", "../jars/mysql-connector-java-8.0.16.jar",
                            "../jars/ojdbc7.jar", "../jars/postgresql-42.2.5.jar"])

            self._add_driver_class_path(
                c(["//jars//RedshiftJDBC42-1.2.16.1027.jar", "//jars//mysql-connector-java-8.0.16.jar",
                   "//jars//ojdbc7.jar", "//jars//postgresql-42.2.5.jar"]))

            self._start_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference

            Spark.instance = Spark().load(session)

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read
        self.profiler = Profiler(
            queue_url=queue_url,
            queue_exchange=queue_exchange,
            queue_routing_key=queue_routing_key
        )
        self.ml = ML()

        # Load CSS used for HTML output
        self._load_css()

        # Set global output as html
        self.output("html")