Example 1
import logging

from cbs_utils.misc import create_logger, merge_loggers

# assumption: the original module defines a base logger name, for example
LOGGER_BASE_NAME = "my_package"


def setup_logging(write_log_to_file=False,
                  log_file_base="log",
                  log_level_file=logging.INFO,
                  log_level=None,
                  progress_bar=False,
                  ):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Initialise the logging system
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # on changing the file mode of an already configured logger, see
    # http://stackoverflow.com/questions/29087297/
    # is-there-a-way-to-change-the-filemode-for-a-logger-object-that-is-not-configured
    if not (write_log_to_file or progress_bar):
        # no file logging requested: a log_file of None makes create_logger
        # skip the file handler
        log_file_base = None

    formatter_long = logging.Formatter('[%(asctime)s] %(name)-5s %(levelname)-8s --- %(message)s ' +
                                       '(%(filename)s:%(lineno)s)', datefmt='%Y-%m-%d %H:%M:%S')
    _logger = create_logger(name=LOGGER_BASE_NAME,
                            file_log_level=log_level_file,
                            console_log_level=log_level,
                            log_file=log_file_base,
                            formatter_file=formatter_long,
                            console_log_format_long=True,
                            )

    if progress_bar:
        # switch off console logging because the progress bar is written to the
        # console with print statements; handlers without a baseFilename
        # attribute are stream handlers, so silence those
        for handler in _logger.handlers:
            if not hasattr(handler, "baseFilename"):
                handler.setLevel(logging.CRITICAL)

    if log_level is None:
        # setLevel(None) raises a TypeError, so fall back to a default level
        log_level = logging.INFO

    # merge the settings of our logger with the cbs_utils logger so that we
    # also control the output of that package
    cbs_utils_logger = logging.getLogger("cbs_utils")
    cbs_utils_logger.setLevel(log_level)

    merge_loggers(_logger, "cbs_utils", logger_level_to_merge=log_level)

    return _logger
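
A minimal usage sketch (not part of the original snippet; the file name and
levels are arbitrary choices):

logger = setup_logging(write_log_to_file=True,
                       log_file_base="my_run",
                       log_level=logging.INFO)
logger.info("logging initialised")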
Example 2
import logging

import matplotlib.pyplot as plt
import seaborn as sns

from cbs_utils.misc import (create_logger, merge_loggers)
from cbs_utils.plotting import (CBSPlotSettings, add_axis_label_background)

logger = create_logger(console_log_level=logging.DEBUG)
logger = merge_loggers(logger,
                       "cbs_utils.plotting",
                       logger_level_to_merge=logging.DEBUG)
figure_properties = CBSPlotSettings()


def make_bar_plot(data_df, orientation="horizontal"):
    """
    Make the bar plot
    
    Parameters
    ----------
    data_df : pandas.DataFrame
        pandas dataframe with the data
    orientation: {"horizontal", "vertical"}
        Direction of the bars

    """

    if orientation not in ("horizontal", "vertical"):
        raise ValueError(
            f"orientation must be 'horizontal' or 'vertical'. Found {orientation}"
        )
Example 3
    def __init__(self,
                 database_name=None,
                 database_type=None,
                 store_html_to_cache=False,
                 internet_scraping=True,
                 search_urls=False,
                 max_cache_dir_size=None,
                 user=None,
                 password=None,
                 hostname=None,
                 address_keys=None,
                 kvk_url_keys=None,
                 maximum_entries=None,
                 start_url_index=None,
                 stop_url_index=None,
                 start_url=None,
                 stop_url=None,
                 progressbar=False,
                 singlebar=False,
                 force_process=False,
                 url_range_process=None,
                 save=True,
                 number_of_processes=1,
                 exclude_extensions=None,
                 i_proc=None,
                 log_file_base="log",
                 log_level_file=logging.DEBUG,
                 older_time: datetime.timedelta = None,
                 timezone: pytz.timezone = 'Europe/Amsterdam',
                 filter_urls: list = None):

        # take over the console log level from the already configured base logger
        console_log_level = logging.getLogger(
            LOGGER_BASE_NAME).getEffectiveLevel()
        if i_proc is not None and number_of_processes > 1:
            mp.Process.__init__(self)
            formatter = logging.Formatter("{:2d} ".format(i_proc) +
                                          "%(levelname)-5s : "
                                          "%(message)s "
                                          "(%(filename)s:%(lineno)s)",
                                          datefmt="%Y-%m-%d %H:%M:%S")
            log_file = "{}_{:02d}".format(log_file_base, i_proc)
            logger_name = f"{LOGGER_BASE_NAME}_{i_proc}"
            self.logger = create_logger(name=logger_name,
                                        console_log_level=console_log_level,
                                        file_log_level=log_level_file,
                                        log_file=log_file,
                                        formatter=formatter)
            self.logger.info("Set up class logger for proc {}".format(i_proc))
        else:
            self.logger = logging.getLogger(LOGGER_BASE_NAME)
            self.logger.setLevel(console_log_level)
            self.logger.info(
                "Set up class logger for main {}".format(__name__))

        self.logger.debug("With debug on?")

        self.progressbar = progressbar
        self.showbar = progressbar
        if i_proc is None or (singlebar and i_proc > 0):
            # in case the single bar option is given, only the first process
            # shows a progress bar
            self.showbar = False

        # a table of all country url extensions which we want to exclude
        self.exclude_extensions = pd.DataFrame(
            COUNTRY_EXTENSIONS, columns=["include", "country", "suffix"])
        self.exclude_extensions = self.exclude_extensions[
            ~self.exclude_extensions["include"]]
        self.exclude_extensions = self.exclude_extensions.set_index(
            "suffix", drop=True).drop(["include"], axis=1)

        self.i_proc = i_proc
        self.store_html_to_cache = store_html_to_cache
        self.max_cache_dir_size = max_cache_dir_size
        self.internet_scraping = internet_scraping
        self.search_urls = search_urls

        self.maximum_entries = maximum_entries
        self.start_url = start_url
        self.stop_url = stop_url
        self.force_process = force_process
        self.start_url_index = start_url_index
        self.stop_url_index = stop_url_index

        self.address_keys = address_keys
        self.kvk_url_keys = kvk_url_keys

        self.save = save
        self.older_time = older_time
        self.timezone = timezone
        self.filter_urls = filter_urls

        if progressbar:
            # switch off console logging because the progress bar is written to
            # the console with print statements; handlers without a baseFilename
            # attribute are stream handlers, so silence those
            for handler in self.logger.handlers:
                if not hasattr(handler, "baseFilename"):
                    handler.setLevel(logging.CRITICAL)

        self.url_df: pd.DataFrame = None
        self.addresses_df: pd.DataFrame = None
        self.kvk_df: pd.DataFrame = None

        self.company_vs_kvk = None
        self.n_company = None

        self.number_of_processes = number_of_processes

        self.url_range_process = Range(url_range_process)
        self.url_ranges = None

        self.database = init_database(database_name,
                                      database_type=database_type,
                                      user=user,
                                      password=password,
                                      host=hostname)
        self.database.execute_sql("SET TIME ZONE '{}'".format(self.timezone))
        tables = init_models(self.database)
        self.UrlNL = tables[0]
        self.company = tables[1]
        self.address = tables[2]
        self.website = tables[3]
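
The exclude-extensions bookkeeping in the constructor is compact; below is a
hedged, self-contained sketch of the same pandas steps, using a made-up
stand-in for COUNTRY_EXTENSIONS (the real table is not shown in this snippet):

# hedged, self-contained sketch; the COUNTRY_EXTENSIONS entries are invented
import pandas as pd

COUNTRY_EXTENSIONS = [
    (True, "netherlands", ".nl"),
    (False, "germany", ".de"),
    (False, "belgium", ".be"),
]
exclude = pd.DataFrame(COUNTRY_EXTENSIONS,
                       columns=["include", "country", "suffix"])
exclude = exclude[~exclude["include"]]
exclude = exclude.set_index("suffix", drop=True).drop(["include"], axis=1)
print(exclude)
# only the suffixes flagged include=False remain, indexed by suffix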
Example 4
import logging
from pathlib import Path

from bs4 import BeautifulSoup

from cbs_utils.misc import (create_logger, merge_loggers, Timer)
from cbs_utils.regular_expressions import (KVK_REGEXP, ZIP_REGEXP, BTW_REGEXP)
from cbs_utils.web_scraping import (get_page_from_url, UrlSearchStrings)

# set up logging
log_level = logging.DEBUG  # set to logging.INFO for less verbose output
log_format = logging.Formatter(
    '%(levelname)8s --- %(message)s (%(filename)s:%(lineno)s)')
logger = create_logger(console_log_level=log_level, formatter=log_format)
merge_loggers(logger,
              "cbs_utils.web_scraping",
              logger_level_to_merge=logging.INFO)

# set the url and clean any previous cache files
cache_directory = Path("tmp")
clean_cache = True
if clean_cache and cache_directory.exists():
    for item in cache_directory.iterdir():
        item.unlink()
    cache_directory.rmdir()
url = "https://www.example.com"

# first read: read from the url and report time
with Timer(units="s") as timer:
    page = get_page_from_url(url, cache_directory=cache_directory)
logger.info(f"Scraping from url took: {timer.duration} {timer.units}")
Example 5
import sys

import matplotlib.pyplot as plt

from cbs_utils.misc import (create_logger, merge_loggers)
from cbs_utils.plotting import CBSPlotSettings
from cbs_utils.readers import StatLineTable

fig_properties = CBSPlotSettings()

logger = create_logger()
merge_loggers(logger, logger_name_to_merge="cbs_utils.readers")

# the table id can be found by going to the data set on statline and looking
# it up in the url. In this case the url is:
# https://opendata.cbs.nl/#/CBS/nl/dataset/84410NED/table?ts=1568706226304
# so we are going to make a plot from table 84410NED
table_id = "84410NED"

statline = StatLineTable(table_id=table_id,
                         plot_all_questions=True,
                         make_the_plots=True,
                         save_plot=True)

# NOTE: as written the script stops here; the statements below only run when
# this early exit is removed
sys.exit(0)

statline.show_module_table(max_width=30)
statline.show_question_table(max_width=30)

# this plots all questions of the selected module (here 46), including the
# individual options that belong to a multiple-option question
statline.modules_to_plot = 46

statline.plot()

# only save the first figure for inspection
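# hedged sketch of the truncated final step: grab the first open matplotlib
# figure and save it; the file name is an assumption
first_fig = plt.figure(1)
first_fig.savefig("first_statline_plot.png")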
Example 6
from cbs_utils.misc import create_logger


def test_create_logger():
    # smoke test: creating a logger should not raise
    create_logger()
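
A slightly stronger variant (an assumption, not from the original test suite):
check that the helper returns a standard logging.Logger.

import logging


def test_create_logger_returns_logger():
    # hedged expectation: create_logger is assumed to return a logging.Logger
    logger = create_logger()
    assert isinstance(logger, logging.Logger)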