Example #1
def test_workflow_transaction(hdfs_cluster):
    w = WebHDFS(hdfs_cluster,
                user="******",
                data_proxy={"worker.example.com": "localhost"})
    fn = "/user/testuser/testrun/afile"
    w.mkdirs("/user/testuser/testrun")
    with w.transaction:
        with w.open(fn, "wb") as f:
            f.write(b"hello")
        assert not w.exists(fn)
    assert w.exists(fn)
    assert w.ukey(fn)
    files = w.ls("/user/testuser/testrun", True)
    summ = w.content_summary("/user/testuser/testrun")
    assert summ["length"] == files[0]["size"]
    assert summ["fileCount"] == 1

    w.rm("/user/testuser/testrun", recursive=True)
    assert not w.exists(fn)
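The test above exercises fsspec's filesystem transaction: while the `with w.transaction:` block is open the written file is only staged (hence `assert not w.exists(fn)`), and it is committed when the block exits. The minimal sketch below shows the same idea using the explicit start/end calls; the host, user, and paths are illustrative assumptions, not values taken from the test.

from fsspec.implementations.webhdfs import WebHDFS

# Hypothetical endpoint and user; adjust to your cluster.
w = WebHDFS("namenode.example.com", port=50070, user="testuser")
w.start_transaction()                          # writes after this point are deferred
with w.open("/user/testuser/afile", "wb") as f:
    f.write(b"hello")
assert not w.exists("/user/testuser/afile")    # still staged, not visible yet
w.end_transaction()                            # commit: the file becomes visible
assert w.exists("/user/testuser/afile")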
Example #2
class DISC:
    from _PATHS import _IMPALA_HOST, _HIVE_HOST, _HTTPFS_HOST, _HDFS_PATH, USER_GUIDE_URL

    __TEMP_LOCAL_DIR = os.path.join(os.path.dirname(__file__), '._temp_connectors')
    __PEM_PATH = os.path.join(os.path.dirname(__file__), 'certificates/accprd-truststore.pem')

    def __init__(self):
        try:    # This works on jupyter ipython
            self._is_jupyter = bool(get_ipython().config)
            self._is_ipython = True
        except NameError: # On plain Python, get_ipython is not defined
            self._is_jupyter = self._is_ipython = False
        self.open()
        self.__log()
        self.spark = None
        self._spark_uri = None
    
    def open(self, hive=False):
        """Opens DISC connection: 
        selects automatically according to platform (Local Windows or CDSW)
        """
        from fsspec.implementations.webhdfs import WebHDFS
        os.environ['REQUESTS_CA_BUNDLE'] = self.__PEM_PATH

        self._hdfs_cnxn = WebHDFS(self._HTTPFS_HOST,
                                  port=14000,
                                  kerberos=True,
                                  use_https=True,
                                  use_ssl=True,
                                  use_listings_cache=False)
        
        self._engine = "hive" if hive else 'impala'
        if IS_WINDOWS:        # LOCAL - Windows
            from pyodbc import connect
            self._cnxn = connect('DSN=DISC DP Impala 64bit' if not hive 
                                 else 'DSN=DISC DP Hive 64bit', 
                                 autocommit=True)
            
        else:                 # CDSW (os.name='POSIX')
            from impala.dbapi import connect
            self._cnxn = connect(host=self._HIVE_HOST if hive
                                      else self._IMPALA_HOST,
                                 use_ssl=True,
                                 timeout=30,
                                 kerberos_service_name=self._engine,
                                 port=10000 if hive else 21050,
                                 auth_mechanism="GSSAPI")  #['NOSASL', 'PLAIN', 'GSSAPI', 'LDAP']

        self._cursor = self._cnxn.cursor()
        if not os.path.exists(self.__TEMP_LOCAL_DIR):
            os.mkdir(self.__TEMP_LOCAL_DIR)
        
        self._is_disc_connected = True
        self.db = None

    
    def connect_spark(self, app_name=None, master=None, 
                      config=None, return_SparkSession=False):
        """Connects to Spark via pyspark. 
        Stores the Spark session in the attribute `disc.spark`. 
        Access to the Spark UI is provided via the link `disc.spark_ui` 
        (the address is also available as `disc._spark_uri`).  
        
        Args:
            app_name (str or None):  A name for the current session.
            master (str or None):    Either `yarn` (default) or `local`.
            config (dict):     Dictionary with spark configurations.       
            return_SparkSession (bool): Whether to return the spark session (default is False).
        
        Returns: None or SparkSession
        """

        self._spark_uri = f"https://spark-{os.environ['CDSW_ENGINE_ID']}.{os.environ['CDSW_DOMAIN']}/jobs/"
        from pyspark.sql import SparkSession
        spark = (SparkSession.builder
                             .appName(app_name or 'SparkSession'))
        if master is not None:
            spark = spark.master(master)
        for k, v in (config or {}).items():
            spark = spark.config(k, v)
        
        spark = spark.getOrCreate()
        
        self.spark = spark
        
        if return_SparkSession: 
            return spark
        else:                   
            return self.spark_ui
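
    # Hypothetical usage sketch (app name and settings are illustrative, not defaults):
    #   disc.connect_spark(app_name='my_etl_job',
    #                      config={'spark.executor.memory': '4g'})
    #   disc.spark.sql('SELECT 1').show()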

    @property
    def spark_ui(self):
        if self._is_spark_connected:
            from IPython.core.display import HTML
            return HTML(f'<a href="{self._spark_uri}">Go to Spark UI</a>')
        else:
            return 'Spark is not connected. Connect with `disc.connect_spark()`.'
        
        
    def stop_spark(self):
        if self._is_spark_connected:
            self.spark.sparkContext.stop()
            self.spark.stop()
    
    @property
    def _is_spark_connected(self):
        try:
            urllib.request.urlopen(self._spark_uri).getcode()
            return True
        except:
            return False
            

    
    @lru_cache()
    def __get_databases(self):
        self._cursor.execute("show databases")
        return [*zip(*self._cursor.fetchall())][0]
    
    def get_databases(self, ret=False):
        """Prints the available databases, or returns them if `ret=True`."""
        databases = self.__get_databases()
        if ret:
            return databases
        else:
            for db in databases:
                print(db) 

    def select_database(self, database):
        """Navigates to given database.

        Args:
            database (str): the database to be selected
        """
        self._cursor = self._cnxn.cursor()
        self._cursor.execute(f"use {database}")
        self.db = database

    def get_tables(self, database=None, ret=False):
        """Prints and returs tables within current database."""
        if database is not None:
            self.select_database(database)
        self._cursor.execute("show tables")
        tables, = zip(*self._cursor.fetchall())
        if ret:
            return tables
        else:
            for table in tables:
                print(table)
           
    def describe_table(self, table):
        """Describes a table and returns DataFrame with description.""" 
        self._cursor.execute(f"describe {table}")
        columns, dtype, desc = [*zip(*self._cursor.fetchall())]
        df = pd.DataFrame({'columns': columns, 
                           'dtype': dtype,
                           'desc': desc})
        return df
    
    def _delete_table(self, lab, table_name):
        self._cursor.execute(f"DROP TABLE IF EXISTS {lab}.{table_name}")

    def _create_table(self, lab, table_name, dtypes, path, cols):
        col_and_types = [f'{col} {tp}' for col,tp in zip(cols, dtypes)]
        col_and_types = ', '.join(col_and_types)
        self._cursor.execute(f"""
            CREATE EXTERNAL TABLE IF NOT EXISTS 
            {lab}.{table_name}({col_and_types})
            ROW FORMAT DELIMITED 
            FIELDS TERMINATED BY ','
            STORED AS TEXTFILE
            LOCATION '{path}'
            """)
    
    # TODO: add APPEND TO TABLE OPTION
    
    def _refresh_table(self, lab, table_name):
        self._cursor.execute(f'REFRESH {lab}.{table_name}')
    
    def create_table_csv(self, df, lab, table_name, path, dtypes, cols=None):
        """
        Wrapper method: deletes the previous table, transfers a new csv file,
        and creates the new table.
        Note: dtypes must be one of: ARRAY, BIGINT, BINARY, BOOLEAN, 
        CHAR, DATE, DATETIME, DECIMAL, REAL, FLOAT, INTEGER, MAP, SMALLINT, 
        STRING, STRUCT, TIMESTAMP, TINYINT, VARCHAR
        Args:
           df: A pandas.DataFrame.
           lab: Name of DataLab where to create the table.
           table_name: Name of the table to be created.
           path: Where to store the underlying data.
           dtypes: SQL data types for each column.
           cols: List of strings with column names. By default df.columns.values.
        
        """
            
        self._delete_table(lab=lab, table_name=table_name)
        self.to_csv(df, f'{path}/{table_name}.csv', index=False, header=False)
        self._create_table(lab=lab, 
                           table_name=table_name, 
                           cols=cols or df.columns.values, 
                           dtypes=dtypes,  
                           path=path)
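
    # Hypothetical usage sketch (lab, table, path and dtypes are illustrative):
    #   disc.create_table_csv(df, lab='my_lab', table_name='my_table',
    #                         path='/data/lab/my_lab/share/my_table',
    #                         dtypes=['STRING', 'INTEGER', 'FLOAT'])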
        
    def create_table(self, df, lab, table_name, path, external=True, permissions=_default_permissions):
        """Create table storing as parquet file.
        TODO: integrate with the above, allowing user to choose.;Allow to append rows to existing table instead of deleting old one; check whether there is no other table in present
        Args:
           df: A pandas.DataFrame or pyspark.sql.dataframe.DataFrame.
           lab: Name of DataLab where to create the table.
           table_name: Name of the table to be created.
           path: Where to store the underlying data.
           external: EXTERNAL table if `True`.
           permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
        """
        self._delete_table(lab=lab, table_name=table_name)
        if not isinstance(df, pd.DataFrame):  # If Spark dataframe
            path += f'/{table_name}'
            df.write.parquet(path)
            self._hdfs_cnxn.chmod(path, permissions)
            file_path = [f for f in self.ls(path) if f.endswith('.parquet')][0]

        else:                                # If Pandas dataframe
            df = df.reset_index(drop=True)
            file_path = f'{path}/{table_name}.parq'
            df.columns = df.columns.str.replace(':','').str.replace(' ','')
            self.to_parquet(df, file_path, permissions=permissions)

        query = (f"""CREATE {'EXTERNAL' if external else ''} TABLE {lab}.{table_name}
                     LIKE PARQUET '{file_path}' 
                     STORED AS PARQUET 
                     {f"LOCATION '{path}'" if external else ''};""")
        self._cursor.execute(query)
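
    # Hypothetical usage sketch (names are illustrative); accepts a pandas or a Spark DataFrame:
    #   disc.create_table(df, lab='my_lab', table_name='my_table',
    #                     path='/data/lab/my_lab/share/my_table')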


    def _fix_path(self, path):
        """Adds hdfs root to path."""
        if not path.startswith(self._HDFS_PATH):
            path = self._HDFS_PATH + path
        return path
    
    def read_csv(self, path, **kwargs):
        """Wrapper around pandas.read_csv.
        Args:
          path (str):     Path to DISC location
          kwargs: Keyword arguments to be passed to pandas.read_csv
          """
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_csv(f, **kwargs)
        return df
    
    def to_csv(self, df, path, name=None, permissions=_default_permissions, **kwargs):
        """Save dataframe to DISC `path` in csv format.
        Args:
          df (pandas.DataFrame)
          path (str):     Path to DISC location
          permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
          kwargs: Keyword arguments to be passed to pandas.to_csv"""
        if name is None:
            name = ntpath.basename(path)
            path = ntpath.dirname(path)
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}{name}'
        df.to_csv(local_file, **kwargs)
        self.upload_file(local_file=local_file, 
                         destination_file_path=f'{path}/{name}',
                         rm_local=True,
                         permissions=permissions)
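
    # Hypothetical round trip (the path is illustrative):
    #   disc.to_csv(df, '/data/lab/my_lab/share/sample.csv', index=False)
    #   df2 = disc.read_csv('/data/lab/my_lab/share/sample.csv')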
        
    def read_excel(self, path, **kwargs):
        """Wrapper around pandas.read_excel"""
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_excel(f, **kwargs)
        return df
    
    def ls(self, path):
        """Wrapper around `self._hdfs_cnxn.ls`."""
        return self._hdfs_cnxn.ls(path)
        
    def listdir(self, path, full_path=False):
        """Wrapper around `self.ls`. The parameter `full_path` (False by default)
        allows to hide the root of the paths showing only content of selected path."""
        paths = self.ls(path)
        if not full_path:
            paths = [path.split('/')[-1] for path in paths]
        return paths

    def makedir(self, destination_path):
        """Wrapper around WebHDFS.makedir."""
        self._hdfs_cnxn.makedir(destination_path) 
    
    def read_parquet(self, path, **kwargs):
        """Wrapper aroud pandas.read_parquet.
        Args:
          path (str):     Path to DISC location
          kwargs: Keyword arguments to be passed to pandas.read_parquet
        """
        df = pd.read_parquet(path, 
                             filesystem=self._hdfs_cnxn, 
                             **kwargs)
        return df

    def to_parquet(self, df, path, permissions=_default_permissions, **kwargs):
        """Wrapper aroud pandas.DataFrame.to_parquet.
        Args:
          df (pandas.DataFrame)
          path (str):     Path to DISC location
          permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
          kwargs: Keyword arguments to be passed to pandas.to_parquet
        """
        df.columns = df.columns.astype(str)
        df.to_parquet(path, filesystem=self._hdfs_cnxn, **kwargs) 
        if permissions: self._hdfs_cnxn.chmod(path, permissions)
        

    def to_feather(self, df, path, permissions=_default_permissions, **kwargs):
        """Wrapper around pandas.to_feather.
          Args:
            df (pandas.DataFrame)
            path (str):     Path to DISC location
            permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
            kwargs: Keyword arguments to be passed to pandas.to_feather
        """
        name, path = ntpath.basename(path), ntpath.dirname(path)
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}{name}'
        df.to_feather(local_file, **kwargs)
        self.upload_file(local_file, f'{path}/{name}', 
                         rm_local=True,
                         permissions=permissions)

    
    def read_feather(self, path, **kwargs):
        """Wrapper around pandas.read_feather.
        Args:
          path (str):     Path to DISC location
          kwargs: Keyword arguments to be passed to pandas.read_feather
        """
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_feather(f, **kwargs)
        return df    
    
    def to_stata(self, df, path, permissions=_default_permissions, **kwargs):
        """Wrapper around pandas.to_stata.
          Args:
            df (pandas.DataFrame)
            path (str):     Path to DISC location
            permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
            kwargs: Keyword arguments to be passed to pandas.to_stata
        """
        name, path = ntpath.basename(path), ntpath.dirname(path)
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}{name}'
        df.to_stata(local_file, **kwargs)
        self.upload_file(local_file, f'{path}/{name}', 
                         rm_local=True,
                         permissions=permissions)

    
    def read_stata(self, path, **kwargs):
        """Wrapper around pandas.read_stata.
        Args:
          path (str):     Path to DISC location
          kwargs: Keyword arguments to be passed to pandas.read_stata
        """
        with self._hdfs_cnxn.open(path) as f:
            df = pd.read_stata(f, **kwargs)
        return df
    
    
    def read_encrypted(self, path, password, **kwargs):
        """Wrapper aroud cryptpandas.read_encrypted.
        Args:
          path (str):     Path to DISC location
          password (str): Password for decryption 
          kwargs: Keyword arguments to be passed to cryptpandas.read_encrypted
        """

        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/encrypted_{token}'
        self._hdfs_cnxn.download(path, local_file)
        df = crpd.read_encrypted(local_file, password, **kwargs)
        
        return df
    
    
    def to_encrypted(self, df, path, password, permissions=_default_permissions, **kwargs):
        """Write a DataFrame as encrypted binary at specified DISC location.
        Args:
          df:       A pandas DataFrame
          path:     Path to DISC location
          password: Password for encryption
          permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
          kwargs:   Keyword arguments to be passed to cryptpandas.to_encrypted
        """
        # Write the encrypted file
        token = secrets.token_hex(nbytes=8)
        name, root = ntpath.basename(path), ntpath.dirname(path)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}_{name}'
        encrypted = crpd.to_encrypted(df, password=password, path=local_file, **kwargs)
        destination_file_path = f'{root}/{name}'
        self.upload_file(local_file, destination_file_path, 
                         rm_local=False, overwrite=True, permissions=permissions)

    
    def to_pickle(self, obj, path, protocol='HIGHEST_PROTOCOL', permissions=_default_permissions, **kwargs):
        """Saves to remote HDFS as pickle file.
        Args:
            path (str): the path of the file to be saved
            protocol (str or int): Either a strig ('HIGHEST_PROTOCOL' or 'DEFAULT_PROTOCOL') or an integer        
            permissions (str or None): posix representation or permission, give as oct string, e.g, '777'(default) or 0o777
        """        
        if isinstance(protocol, str):
            protocol = getattr(pickle, protocol)    
            
        with self._hdfs_cnxn.open(path, "wb") as f:
            pickle.dump(obj, f,  protocol=protocol, **kwargs)
        
        if permissions: self._hdfs_cnxn.chmod(path, permissions)          
            
    def read_pickle(self, path, **kwargs):
        """Reads a pickle file from remote HDFS and returns the unpickled object."""
        with self._hdfs_cnxn.open(path, "rb") as f:
            obj = pickle.load(f, **kwargs)
        return obj
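
    # Hypothetical round trip (the path is illustrative):
    #   disc.to_pickle({'a': 1}, '/data/lab/my_lab/share/obj.p', protocol='DEFAULT_PROTOCOL')
    #   obj = disc.read_pickle('/data/lab/my_lab/share/obj.p')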
        
    def upload_file(self, local_file, destination_file_path, 
                    rm_local=False, overwrite=True, permissions=_default_permissions):
        """Uploads file to DISC.
        Args:
            local_file (str):  Path to local file to be uploaded.
            destination_file_path (str): Destination path.
            rm_local (bool): If True deletes local file after upload (default is False).
            overwrite (bool): If True overwrites file at destination (default is True).
            permissions (str or None): POSIX representation of permissions, given as an octal string, e.g. '777' (default) or 0o777
        """
        if overwrite:
            if self._hdfs_cnxn.exists(destination_file_path):
                self._hdfs_cnxn.rm(destination_file_path)
        self._hdfs_cnxn.upload(local_file, destination_file_path)
        if permissions: self._hdfs_cnxn.chmod(destination_file_path, permissions)          
        if rm_local:
            os.remove(local_file)
            
    def upload(self, local_path, destination_path):  
        """Uploads files and/or folders from `local_path` 
           onto the DISC `destination_path`"""
        if not self._hdfs_cnxn.exists(destination_path):
            self._hdfs_cnxn.mkdir(destination_path)
        if os.path.isdir(local_path):
            for root, dirs, files in [*os.walk(local_path)]:
                if root==local_path:
                    dest_path = destination_path
                else:
                    relpath = os.path.relpath(root, local_path)
                    dest_path = f'{destination_path}/{relpath}'
                for dir in dirs:
                    if not self._hdfs_cnxn.exists(f'{dest_path}/{dir}'):
                        self._hdfs_cnxn.mkdir(f'{dest_path}/{dir}')
                for file in files: 
                    self.upload_file(local_file=f'{root}/{file}', 
                                     destination_file_path=f'{dest_path}/{file}', 
                                     rm_local=False)
                 
        else:  # upload_file 
            self.upload_file(local_file=local_path, 
                             destination_file_path=destination_path, 
                             rm_local=False)
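
    # Hypothetical usage sketch (paths are illustrative): recursively upload a local folder.
    #   disc.upload('./results', '/data/lab/my_lab/share/results')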
    
        
            
    def savefig(self, path, ax=None, **kwargs):
        """Saves a matplotlib figure to the DISC `path` destination."""
        import matplotlib.pyplot as plt
        # If an Axes is passed, save its parent Figure; otherwise use the current pyplot figure.
        fig = plt if ax is None else ax.get_figure()
        token = secrets.token_hex(nbytes=8)
        local_file = f'{self.__TEMP_LOCAL_DIR}/{token}_fig.png'
        fig.savefig(local_file, **kwargs)
        self.upload_file(local_file=local_file, 
                         destination_file_path=path, 
                         rm_local=True)
        
    def make_vintage(self, origin, freq='month', overwrite=False, deep=False, exclude=[]):
        """
        Args:
            origin (str): The folder whose contents you want to vintage. A new 'origin/VINTAGES' folder will be created.
            freq (str): Frequency with which to store vintages. Choose between 'year', 'month', 'day' and 'max'. Default is 'month'.
            overwrite (bool): Whether to overwrite already existing vintages. Default is `False`.
            deep (bool): If True, stores as nested folders (year/month/etc.); otherwise in a single folder (year_month_day_etc.).
            exclude (list): Items (files or directories) in origin that should be excluded from vintaging.
        """
        from datetime import datetime
        dtmt = datetime.today()
        _DAY, _MONTH, _YEAR = dtmt.day, dtmt.strftime('%b'), dtmt.year
        sep = '/' if deep else '_'
        vintage = f'{_YEAR}{sep}{_MONTH}'
        if freq=='year':
            vintage = f'{_YEAR}'
        elif freq=='day':
            vintage = f'{_YEAR}{sep}{_MONTH}{sep}{_DAY}'
        elif freq=='max':
            vintage = f'{_YEAR}{sep}{_MONTH}{sep}{_DAY}{sep}{dtmt.hour}h{dtmt.minute}m{dtmt.second}s'
    
        if self._hdfs_cnxn.exists(f'{origin}/VINTAGES/{vintage}') and not overwrite:
            raise PermissionError(f"Vintage 'VINTAGES/{vintage}' already exists at {origin}.\nTo overwrite, set `overwrite=True`.")
        else:
            self._hdfs_cnxn.mkdir(f'{origin}/VINTAGES/{vintage}')
    
        current = set(self.listdir(origin, full_path=False)) - ({'.', '..', 'VINTAGES'}|set(exclude))
    
        for item in current:
            self.hdfs_mv(f'{origin}/{item}', f'{origin}/VINTAGES/{vintage}/')
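
    # Hypothetical usage sketch (path is illustrative): move the current contents of a folder
    # into a monthly vintage under '<origin>/VINTAGES/<year>_<month>/'.
    #   disc.make_vintage('/data/lab/my_lab/share/my_dataset', freq='month')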
            
    
    def hdfs_mv(self, origin_path, destination_path):
        """Moves files/directories from one DISC location to another."""
#        if IS_WINDOWS:
        self._hdfs_cnxn.mv(origin_path, destination_path)
#        else:
#            (subprocess.Popen(f'hdfs dfs -mv {origin_path} {destination_path}',
#                         stdout=subprocess.PIPE, shell=True)
#                   .communicate())
    
    def read_sql(self, query, **kwargs):
        """Performs a sql query on disc.
        Args: 
            query (str): a SQL query.
            kwargs: Keyword arguments to be passed to pandas.read_sql.
        Returns: 
            pd.DataFrame
        """
        return pd.read_sql(query, con=self._cnxn, **kwargs)
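
    # Hypothetical query (database and table names are illustrative):
    #   df = disc.read_sql('SELECT col_a, col_b FROM my_lab.my_table LIMIT 10')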
    
    
    def __repr__(self):
        states = ('Not active', 'Active')
        sb, eb = ("\033[1m","\033[0;0m") if self._is_ipython else ('','')  # Bold
        sr, er = ("\x1b[31m", "\x1b[0m") if self._is_ipython else ('','')  # Red
        sg, eg = ("\033[92m", "\033[0m") if self._is_ipython else ('','')  # Green
        clr_str = lambda isc:(sg,eg) if isc else (sr,er)
        state_str = lambda isc:  f'{clr_str(isc)[0]}{states[isc]}{clr_str(isc)[1]}'
        _repr = f"\n{sb}DISC connection{eb}: {state_str(self._is_disc_connected)}"\
                f"\nEngine: {self._engine}"\
                f"\nSelected database: {self.db}"\
                f"\n{sb}Spark connection{eb}: {state_str(self._is_spark_connected)}"
        return _repr
        
        
    def _repr_html_(self):
        with open(f'{os.path.dirname(__file__)}/res/connector_mini.svg', 'r') as f: 
            _svg_cnn = f.read()
        states = ('Not active', 'Active')
        colors = ('#C82806', '#138F0B')
        bcolors = ('#FCD9D9', '#DBFCD9')
        html_repr = _svg_cnn + f"""</br>
        <span style="white-space: nowrap;">
        <b>DISC connection</b>:
        <span style="color:{colors[self._is_disc_connected]}; 
                     background-color:{bcolors[self._is_disc_connected]}"; 
        white-space: nowrap;>{states[self._is_disc_connected]}</span>
        </span></br>
        <span style="white-space: nowrap;">
        <span style="color: gray">Engine:</span>
        <span style="white-space: nowrap;">{self._engine}</span>
        </span></br>
        <span style="white-space: nowrap;">
        <span style="color: gray">Selected database:</span>
        <span style="white-space: nowrap;">{self.db}</span>
        </span></br>


        </br>
        <span style="white-space: nowrap;">
        <b>Spark Connection</b>:
        <span style="color:{colors[self._is_spark_connected]};
                            background-color:{bcolors[self._is_spark_connected]}"; 
        white-space: nowrap;>{states[self._is_spark_connected]}</span>
        </span>"""

        if self._is_spark_connected:
            html_repr +=f"""</br>
            <b><i>SparkContext</i></b></br>

            <a href="{self._spark_uri}">Spark UI</a></br>

            <span style="white-space: nowrap;">
            <span style="color: gray">Master:</span>
            <span style="white-space: nowrap;">{self.spark.sparkContext.master}</span>
            </span></br>
            <span style="white-space: nowrap;">
            <span style="color: gray">AppName:</span>
            <span style="white-space: nowrap;">{self.spark.sparkContext.appName}</span>
            </span></br>

            """
        html_repr += f"""</br></br>
                         <a href="{self.USER_GUIDE_URL}">
                         Need help? Check the documentation!</a>"""

        return html_repr
        
            
    def show_spark_conf(self):
        """Displays spark configurations."""
        if not self._is_spark_connected:
            print('Spark is not connected. To connect, try `disc.connect_spark()`')
        else:
            from IPython.core.display import HTML, display
            html_repr = f"""</br><b><i>Spark Configurations</i></b></br>"""
            confs = self.spark.sparkContext.getConf().getAll()
            for (cnf_k,cnf_v) in confs:
                html_repr += f"""
                   <span style="white-space: nowrap;">
                   <span style="color: gray">{cnf_k[6:]}:</span>
                   <span style="white-space: nowrap;">{cnf_v}</span>
                   </span></br>"""
            display(HTML(html_repr))

    def __log(self):
        try:
            _path = '/data/lab/dlb_ecb_public/share/_CONNECTORS_LOG'
            date = str(datetime.datetime.today().date())
            if date not in self.listdir(_path):
                self.to_pickle(0, f'{_path}/{date}/logs.p')
            else:
                L = self.read_pickle(f'{_path}/{date}/logs.p')
                self.to_pickle(L+1, f'{_path}/{date}/logs.p')
        except:
            pass

        
    def close(self, rm_local_temp=False):
        """Closes connection to DISC.
        Args:
           rm_local_temp (bool): Delete local temp folder. Default is False.
        """
        if rm_local_temp: shutil.rmtree(self.__TEMP_LOCAL_DIR, ignore_errors=True)
        self._cursor.close()
        self._cnxn.close()
        self._is_disc_connected = False
        self.stop_spark()
        if IS_WINDOWS:
            del self._hdfs_cnxn
        else:
            self._hdfs_cnxn.close()
        print('Closed connection to DISC.')
    
    def __del__(self):
        try:
            self.close()
        except:
            pass
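
Taken together, a typical session with this connector might look like the minimal sketch below. The database, table, and path names are hypothetical, and the snippet assumes a CDSW session where Kerberos credentials are already available.

disc = DISC()                                   # opens the Impala and WebHDFS connections
disc.get_databases()                            # print available databases
disc.select_database('my_lab')                  # hypothetical DataLab
df = disc.read_sql('SELECT * FROM my_table LIMIT 100')
disc.to_parquet(df, '/data/lab/my_lab/share/my_table_sample.parq')
disc.close()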