Example 1
import logging
import os
import time

# ROOT_DIR, PROJECT_NAME, PROD_TABLE_NAMES, ETL_JOB_LOG, MOST_OUTDATED_PAGE
# and MariaDBConnector are project-level names assumed to be imported from
# the surrounding package; they are not defined in this excerpt.


def init_database(init_sql_script_path=None):
    if not init_sql_script_path:
        init_sql_script_path = os.path.join(ROOT_DIR, PROJECT_NAME,
                                            'db/sql/initialize_wiki_db.sql')

    if not os.path.exists(init_sql_script_path):
        raise FileNotFoundError(
            f'Could not find the init sql script @ {init_sql_script_path}')

    con = MariaDBConnector()
    con.source_sql_script(sql_script_path=init_sql_script_path)

    # Sanity checks: the list of production tables should be non-empty and
    # the ETL job log should start out empty.
    r1 = con.execute(f'SELECT * FROM {PROD_TABLE_NAMES}')
    r2 = con.execute(f'SELECT * FROM {ETL_JOB_LOG}')

    assert len(r1) > 0
    assert len(r2) == 0
    logging.info(
        f'Tables that are present in production and are part of the ETL job:\n{r1}\n'
    )
    logging.info('Database has been initialized successfully')
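

# A hypothetical call site for the function above (the custom path below is
# illustrative, not from the project):
#
#     init_database()                        # default script under db/sql/
#     init_database('/tmp/my_init.sql')      # or an explicit script path
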
class DBPreProcessor:
    def __init__(self, sql_script_path_mo=None):
        if sql_script_path_mo is None:
            self.sql_script_path_mo = os.path.join(
                ROOT_DIR, PROJECT_NAME,
                'etl/sql/pre_processing_most_outdated.sql')
        else:
            self.sql_script_path_mo = sql_script_path_mo
        if not os.path.exists(self.sql_script_path_mo):
            raise FileNotFoundError(
                f'Could not find sql script @ {self.sql_script_path_mo}')

        self.most_outdated_page_table_name = MOST_OUTDATED_PAGE
        self.category_links_table_name = 'wikipedia.categorylinks'
        self.con = MariaDBConnector()

    def run_most_outdated_pre_processing(self):
        start_time = time.time()
        self.con.source_sql_script(sql_script_path=self.sql_script_path_mo)
        end_time = time.time()

        # Sanity checks: the pre-processed table must be non-empty and its
        # row count should exceed half the number of distinct categories.
        rc1 = self.con.execute(
            f'SELECT COUNT(*) FROM {self.most_outdated_page_table_name}',
            as_df=False)
        assert rc1[0][0] > 0

        rc2 = self.con.execute(
            f'SELECT COUNT(DISTINCT cl_to) FROM {self.category_links_table_name}',
            as_df=False)
        assert rc2[0][0] > 0
        assert rc1[0][0] > rc2[0][0] // 2

        logging.info(
            f'Most outdated pre-processing complete in {round(end_time - start_time, 1)} s'
        )

    def run_pre_processing(self):
        self.run_most_outdated_pre_processing()
        logging.info('All pre-processing complete')
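
Both examples depend on the project's MariaDBConnector wrapper, which is not shown on this page. The sketch below is a minimal stand-in for the interface the examples actually call (execute with an as_df flag, and source_sql_script), built on the official mariadb connector package with pandas for DataFrame results; the connection defaults and the naive statement splitting are assumptions, not the project's implementation.

import mariadb
import pandas as pd


class MariaDBConnector:
    """Minimal sketch of the connector interface used in the examples above."""

    def __init__(self, host='localhost', port=3306, user='etl',
                 password='', database='wikipedia'):
        # connection parameters here are illustrative defaults
        self._conn = mariadb.connect(host=host, port=port, user=user,
                                     password=password, database=database)
        self._conn.autocommit = True

    def execute(self, query, as_df=True):
        """Run one statement; return rows as a DataFrame or list of tuples."""
        cur = self._conn.cursor()
        cur.execute(query)
        if cur.description is None:  # statement produced no result set
            return None
        rows = cur.fetchall()
        if as_df:
            return pd.DataFrame(rows, columns=[d[0] for d in cur.description])
        return rows

    def source_sql_script(self, sql_script_path):
        """Naively split a .sql file on ';' and execute each statement."""
        with open(sql_script_path) as f:
            script = f.read()
        for statement in script.split(';'):
            if statement.strip():
                self.execute(statement, as_df=False)

With a wrapper like this in place, the pre-processor above is a one-liner to drive: DBPreProcessor().run_pre_processing().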
Example 3
import datetime as dt
import logging
import time

# ETL_JOB_LOG, MariaDBConnector, DBPreProcessor, DumpDownloader, WikiXMLParser,
# get_etl_job_id, get_simple_wiki_latest_dump_date and get_latest_loaded_date
# are project-level names assumed to be imported from the surrounding package.


def main(wiki_dump_date=None, fail_upon_errors=True):
    etl_job_id = get_etl_job_id()
    logging.info(
        f'Commencing ETL job for Wikipedia Assistant with job id {etl_job_id}')
    start_time = time.time()

    if wiki_dump_date is None:
        dump_date = get_simple_wiki_latest_dump_date()
        latest_loaded_date = get_latest_loaded_date()

        if dump_date <= latest_loaded_date:
            logging.info(
                f'Latest dump date from wiki is {dump_date} and latest loaded date is {latest_loaded_date}'
            )
            logging.info(
                'Not running the ETL routine since there is no new dump available'
            )
            return
    else:
        dump_date = wiki_dump_date

    logging.info(
        f'Downloading all of the latest data dumps from Simple Wiki for dump date {dump_date}\n'
    )
    downloader = DumpDownloader(dump_date=dump_date)
    download_paths = downloader.download_dump()

    con = MariaDBConnector()
    error_files = []
    logging.info(
        f'Loading data from dumps to database for the following tables: {list(download_paths)}'
    )

    for table, file_path in download_paths.items():
        t1 = dt.datetime.now()
        logging.info(f'Loading to {table} table')

        if file_path.endswith('.sql'):
            # SQL dumps are sourced straight into the database
            con.source_sql_script(sql_script_path=file_path)

        elif file_path.endswith('.xml') and 'pages-meta-current' in file_path:
            # the pages XML dump is parsed and bulk-loaded in batches
            xml_parser = WikiXMLParser(xml_path=file_path)
            xml_parser.parse_and_load_all_pages_xml(batch_size=10000)

        else:
            logging.warning(f'File {file_path} is not supported for ETL to DB')
            error_files.append((table, file_path, 'unsupported'))

        t2 = dt.datetime.now()
        t1_str = dt.datetime.strftime(t1, '%Y-%m-%dT%H:%M:%S')
        t2_str = dt.datetime.strftime(t2, '%Y-%m-%dT%H:%M:%S')

        # Record this load in the ETL job log.
        con.execute(
            query=f"INSERT INTO {ETL_JOB_LOG} VALUES "
            f"({etl_job_id}, '{dump_date}', '{table}', '{file_path}', '{t1_str}', '{t2_str}')"
        )
        logging.info(
            f'Loaded to {table} table in {int((t2 - t1).total_seconds())} s\n')
        time.sleep(2)  # brief pause between table loads

    logging.info(
        'Starting pre-processing of database to aid most outdated page queries'
    )
    db_pre_processor = DBPreProcessor()
    db_pre_processor.run_pre_processing()

    end_time = time.time()
    mins_taken = int((end_time - start_time) // 60)
    secs_taken = round((end_time - start_time) % 60, 1)
    logging.info(
        f'Finished end-to-end ETL for {len(download_paths)} tables in {mins_taken} min {secs_taken} s'
    )

    if error_files:
        if fail_upon_errors:
            raise RuntimeError(
                f'The following errors occurred when loading these tables: {error_files}'
            )
        else:
            logging.warning(
                f'The following errors occurred when loading these tables: {error_files}'
            )
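
A hypothetical command-line entry point for the job above; the flag names and the YYYYMMDD date format are assumptions (Wikimedia dumps are keyed by dates in that format), and --ignore-errors simply flips fail_upon_errors:

import argparse
import datetime as dt

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wikipedia Assistant ETL job')
    parser.add_argument('--dump-date', default=None,
                        help='dump date as YYYYMMDD; defaults to the latest available')
    parser.add_argument('--ignore-errors', action='store_true',
                        help='log load errors instead of raising')
    args = parser.parse_args()

    dump_date = None
    if args.dump_date:
        # parsed to a date on the assumption that it compares cleanly with
        # whatever get_latest_loaded_date() returns
        dump_date = dt.datetime.strptime(args.dump_date, '%Y%m%d').date()

    main(wiki_dump_date=dump_date, fail_upon_errors=not args.ignore_errors)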