Example #1
def get_etl_job_id():
    con = MariaDBConnector()
    df = con.execute(
        query=f'SELECT MAX(job_id) AS max_job_id FROM {ETL_JOB_LOG}')
    # Start at 1 when the log table is empty, otherwise increment the highest job id
    if len(df) == 0 or df.max_job_id[0] is None:
        return 1
    else:
        return df.max_job_id[0] + 1
Example #2
def get_latest_loaded_date():
    con = MariaDBConnector()
    df = con.execute(
        f'SELECT MAX(wiki_dump_date) AS max_date FROM {ETL_JOB_LOG}')

    # MAX() always returns one row; its value is None when the log table is empty
    if len(df) > 0:
        return df.max_date[0]
    else:
        return None
Example #3
    def __init__(self, dump_date):
        self.dump_date = dump_date
        self.download_dir = os.path.join(DUMP_DOWNLOAD_DIR, dt.datetime.strftime(self.dump_date, '%Y%m%d'))
        self.all_tables = {}

        con = MariaDBConnector()
        res = con.execute(f'SELECT * FROM {PROD_TABLE_NAMES}', as_df=False)

        for table_name, filename_format in res:
            self.all_tables[table_name] = filename_format
Example #4
    def __init__(self, sql_script_path_mo=None):
        if sql_script_path_mo is None:
            self.sql_script_path_mo = os.path.join(
                ROOT_DIR, PROJECT_NAME,
                'etl/sql/pre_processing_most_outdated.sql')
        else:
            self.sql_script_path_mo = sql_script_path_mo
        if not os.path.exists(self.sql_script_path_mo):
            raise FileNotFoundError(
                f'Could not find sql script @ {self.sql_script_path_mo}')

        self.most_outdated_page_table_name = MOST_OUTDATED_PAGE
        self.category_links_table_name = 'wikipedia.categorylinks'
        self.con = MariaDBConnector()
Example #5
    def __init__(self, xml_path=None, con=None):
        if xml_path and not os.path.exists(xml_path):
            raise FileNotFoundError(xml_path)
        self.xml_path = xml_path

        if con:
            self.con = con
        else:
            self.con = MariaDBConnector()

        self.last_modified_table_name = PAGE_TIMESTAMPS
        self.page_links_table_name = PAGE_LINK_POSITIONS
        self.num_last_modified_records = 0
        self.num_page_links_records = 0
Example #6
class DBPreProcessor:
    def __init__(self, sql_script_path_mo=None):
        if sql_script_path_mo is None:
            self.sql_script_path_mo = os.path.join(
                ROOT_DIR, PROJECT_NAME,
                'etl/sql/pre_processing_most_outdated.sql')
        else:
            self.sql_script_path_mo = sql_script_path_mo
        if not os.path.exists(self.sql_script_path_mo):
            raise FileNotFoundError(
                f'Could not find sql script @ {self.sql_script_path_mo}')

        self.most_outdated_page_table_name = MOST_OUTDATED_PAGE
        self.category_links_table_name = 'wikipedia.categorylinks'
        self.con = MariaDBConnector()

    def run_most_outdated_pre_processing(self):
        start_time = time.time()
        self.con.source_sql_script(sql_script_path=self.sql_script_path_mo)
        end_time = time.time()

        # Sanity checks on the row counts produced by the SQL script
        rc1 = self.con.execute(
            f'SELECT COUNT(*) FROM {self.most_outdated_page_table_name}',
            as_df=False)
        assert rc1[0][0] > 0

        rc2 = self.con.execute(
            f'SELECT COUNT(DISTINCT cl_to) FROM {self.category_links_table_name}',
            as_df=False)
        assert rc2[0][0] > 0
        # Expect the most-outdated table to cover more than half of the distinct categories
        assert rc1[0][0] > rc2[0][0] // 2

        logging.info(
            f'Most outdated pre-processing complete in {round(end_time - start_time, 1)} s'
        )

    def run_pre_processing(self):
        self.run_most_outdated_pre_processing()
        logging.info('All pre-processing complete')
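
A short sketch of running the pre-processing step on its own, for example after a manual data fix. The import path below is an assumption for illustration; only DBPreProcessor and its sql_script_path_mo argument come from the example above.

# Hypothetical standalone pre-processing run; the module path is an assumption
from etl.db_pre_processor import DBPreProcessor

pre_processor = DBPreProcessor()  # falls back to etl/sql/pre_processing_most_outdated.sql
pre_processor.run_pre_processing()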
Example #7
def init_database(init_sql_script_path=None):
    if not init_sql_script_path:
        init_sql_script_path = os.path.join(ROOT_DIR, PROJECT_NAME,
                                            'db/sql/initialize_wiki_db.sql')

    if not os.path.exists(init_sql_script_path):
        raise FileNotFoundError(
            f'Could not find the init sql script @ {init_sql_script_path}')

    con = MariaDBConnector()
    con.source_sql_script(sql_script_path=init_sql_script_path)

    r1 = con.execute(f'SELECT * FROM {PROD_TABLE_NAMES}')
    r2 = con.execute(f'SELECT * FROM {ETL_JOB_LOG}')

    assert len(r1) > 0
    assert len(r2) == 0
    logging.info(
        f'Tables present in production that are part of the ETL job:\n{r1}\n'
    )
    logging.info('Database has been initialized successfully')
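
A minimal bootstrap sketch tying init_database to the rest of the examples; the import paths are assumptions, only the function names come from the examples on this page.

# Hypothetical one-off setup: initialise the schema, then run a first ETL load
from db.init_database import init_database  # assumed module path
from etl.run_etl import main                # assumed module path

init_database()              # sources db/sql/initialize_wiki_db.sql and verifies the tables
main(fail_upon_errors=True)  # see Example #9 for the full ETL routine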
Example #8
class WikiXMLParser:

    def __init__(self, xml_path=None, con=None):
        if xml_path and not os.path.exists(xml_path):
            raise FileNotFoundError(xml_path)
        self.xml_path = xml_path

        if con:
            self.con = con
        else:
            self.con = MariaDBConnector()

        self.last_modified_table_name = PAGE_TIMESTAMPS
        self.page_links_table_name = PAGE_LINK_POSITIONS
        self.num_last_modified_records = 0
        self.num_page_links_records = 0

    def parse_and_load_all_pages_xml(self, file_path=None, batch_size=10000, break_after=None):
        if file_path is None:
            if not self.xml_path:
                raise ValueError('No file path is given')
            else:
                file_path = self.xml_path
        elif not os.path.exists(file_path):
            raise FileNotFoundError(file_path)

        if not (1000 <= batch_size <= 100000):
            raise ValueError('Batch size must be between 1,000 & 100,000 to prevent too many or too few batches')
        if break_after and break_after <= 0:
            raise ValueError('break_after needs to be >= 1')

        # Delete existing tables first before starting load and check counts are 0 to be safe
        self._drop_and_create()
        rc1 = self.con.execute(f'SELECT COUNT(*) FROM {self.last_modified_table_name}', as_df=False)
        rc2 = self.con.execute(f'SELECT COUNT(*) FROM {self.page_links_table_name}', as_df=False)
        assert rc1[0][0] == 0 and rc2[0][0] == 0
        logging.info('Beginning parsing of XML file containing all wiki pages')

        cnt = 0; num_pages_parsed = 0; num_batches = 0
        found_first_page = False
        found_page_title = False; found_page_namespace = False
        found_page_id = False; found_timestamp = False
        last_modified = []; page_links = []
        record = {}

        # Stream the XML with iterparse so the full dump never has to fit in memory
        context = iter(iterparse(file_path, events=("start", "end")))
        event, root = next(context)

        for event, elem in context:
            if event == 'start' and elem.tag.endswith('page'):
                if not found_first_page:
                    found_first_page = True

                # Only keep pages in the main (0) and Talk (1) namespaces
                if cnt > 0 and record['page_namespace'] in ('0', '1'):
                    # logging.info(f'Assembling record for page {num_pages_parsed + 1}')

                    # Strip any "Namespace:" prefix and normalise spaces to underscores
                    if ':' in record['page_title']:
                        page_title = ':'.join(record['page_title'].split(':')[1:])
                    else:
                        page_title = record['page_title']
                    page_title = page_title.replace(' ', '_')

                    last_modified.append((
                        record['page_id'],
                        record['page_namespace'],
                        page_title,
                        record['timestamp']
                    ))

                    if 'text' in record:
                        # Pull [[wikilink]] targets out of the page text, dropping any
                        # "|display text" suffix and links into other namespaces
                        links = ' '.join(record['text']).split('[[')[1:]
                        links = [l.split(']]')[0].split('|')[0] for l in links]
                        links = [l.replace(' ', '_') for l in links if ':' not in l]

                        page_links.append((
                            record['page_id'],
                            record['page_namespace'],
                            links
                        ))

                    record = {}
                    num_pages_parsed += 1

                cnt += 1
                found_page_title = False; found_page_namespace = False
                found_page_id = False; found_timestamp = False

                # Clear XML Tree root of all seen elements so far to prevent memory overload when parsing huge file
                root.clear()

                if num_pages_parsed > (num_batches * batch_size) and (num_pages_parsed % batch_size) == 0:
                    num_batches += 1
                    logging.info(f'Finished parsing batch #{num_batches}. Loading this batch to db')
                    self._write_parsed_data_to_db(last_modified=last_modified, page_links=page_links)
                    last_modified = []; page_links = []
                    gc.collect()

                if break_after and num_pages_parsed == break_after:
                    break

            elif found_first_page:
                if elem.tag.endswith('}title') and not found_page_title and elem.text:
                    record['page_title'] = elem.text
                    found_page_title = True

                elif elem.tag.endswith('}ns') and not found_page_namespace:
                    record['page_namespace'] = elem.text
                    found_page_namespace = True

                elif elem.tag.endswith('}id') and not found_page_id:
                    record['page_id'] = elem.text
                    found_page_id = True

                elif elem.tag.endswith('}timestamp') and not found_timestamp:
                    record['timestamp'] = elem.text
                    found_timestamp = True

                elif elem.tag.endswith('}text') and elem.text:
                    if 'text' not in record:
                        record['text'] = [elem.text]
                    else:
                        record['text'].append(elem.text)

            else:
                continue

        if num_pages_parsed > (num_batches * batch_size):
            # Flush any remaining records that did not fill a complete batch
            num_batches += 1
            logging.info(f'Finished parsing batch #{num_batches}. Loading this batch to db')
            self._write_parsed_data_to_db(last_modified=last_modified, page_links=page_links)

        logging.info('Finished parsing XML file. Summary:')
        logging.info(f'{num_pages_parsed} pages parsed for last_modified & page_links data in {num_batches} batches')
        logging.info(f'Total {self.num_last_modified_records} records appended to {self.last_modified_table_name}')
        logging.info(f'Total {self.num_page_links_records} records appended to {self.page_links_table_name}')
        gc.collect()

    def _write_parsed_data_to_db(self, last_modified, page_links):
        df_last_modified = pd.DataFrame(
            last_modified,
            columns=['page_id', 'page_namespace', 'page_title', 'timestamp']
        )

        page_links_flat = []
        for page_id, page_ns, links in page_links:
            link_position = 1
            for link in links:
                page_links_flat.append((page_id, page_ns, link, '0', str(link_position)))
                link_position += 1

        df_page_links = pd.DataFrame(
            page_links_flat,
            columns=['pl_from', 'pl_from_namespace', 'pl_title', 'pl_namespace', 'pl_position']
        )

        logging.info('Parsed following last_modified and page_links data for this batch:')
        time.sleep(0.2)
        print('\n')
        print(df_last_modified)
        print('\n')
        print(df_page_links)

        self.con.upload_df(
            df=df_last_modified,
            table_name=self.last_modified_table_name,
            delimiter='|',
            name='last_modified'
        )
        self.num_last_modified_records += len(df_last_modified)

        self.con.upload_df(
            df=df_page_links,
            table_name=self.page_links_table_name,
            delimiter='|',
            name='page_links'
        )
        self.num_page_links_records += len(df_page_links)

    def _drop_and_create(self):
        for tbl in [self.last_modified_table_name, self.page_links_table_name]:
            drop_q = f'DROP TABLE IF EXISTS {tbl}'
            self.con.execute(query=drop_q)

            create_q = DDLS[tbl]
            self.con.execute(query=create_q)
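
A smoke-test sketch for the parser above, assuming the class is importable; the module path and XML file location are illustrative assumptions. break_after keeps the run small, and batch_size must stay within the 1,000-100,000 bounds enforced by parse_and_load_all_pages_xml.

# Hypothetical quick check of the XML parsing and load path
from etl.wiki_xml_parser import WikiXMLParser  # assumed module path

parser = WikiXMLParser(xml_path='downloads/20240101/simplewiki-pages-meta-current.xml')  # illustrative path
parser.parse_and_load_all_pages_xml(
    batch_size=1000,   # smallest allowed batch size
    break_after=2000,  # stop after 2,000 pages for a fast end-to-end check
)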
Example #9
def main(wiki_dump_date=None, fail_upon_errors=True):
    etl_job_id = get_etl_job_id()
    logging.info(
        f'Commencing ETL job for Wikipedia Assistant with job id {etl_job_id}')
    start_time = time.time()

    if wiki_dump_date is None:
        dump_date = get_simple_wiki_latest_dump_date()
        latest_loaded_date = get_latest_loaded_date()

        if latest_loaded_date is not None and dump_date <= latest_loaded_date:
            logging.info(
                f'Latest dump date from wiki is {dump_date} and latest loaded date is {latest_loaded_date}'
            )
            logging.info(
                'Not running the ETL routine since there is no new dump available'
            )
            return
    else:
        dump_date = wiki_dump_date

    logging.info(
        f'Downloading all latest data dumps from Simple Wiki for dump date {dump_date}\n'
    )
    downloader = DumpDownloader(dump_date=dump_date)
    download_paths = downloader.download_dump()

    con = MariaDBConnector()
    error_files = []
    logging.info(
        f'Loading data from dumps to database for the following tables: {list(download_paths.keys())}'
    )

    for table, file_path in download_paths.items():
        t1 = dt.datetime.now()
        logging.info(f'Loading to {table} table')

        if file_path.endswith('.sql'):
            con.source_sql_script(sql_script_path=file_path)

        elif file_path.endswith('.xml') and 'pages-meta-current' in file_path:
            xml_parser = WikiXMLParser(xml_path=file_path)
            xml_parser.parse_and_load_all_pages_xml(batch_size=10000)

        else:
            logging.warning(f'File {file_path} is not supported for ETL to DB')
            error_files.append((table, file_path, 'unsupported'))

        t2 = dt.datetime.now()
        t1_str = dt.datetime.strftime(t1, '%Y-%m-%dT%H:%M:%S')
        t2_str = dt.datetime.strftime(t2, '%Y-%m-%dT%H:%M:%S')

        con.execute(
            query=f"INSERT INTO {ETL_JOB_LOG} VALUES "
            f"({etl_job_id}, '{dump_date}', '{table}', '{file_path}', '{t1_str}', '{t2_str}')"
        )
        logging.info(f'Loaded to {table} table in {(t2 - t1).seconds} s\n')
        time.sleep(2)

    logging.info(
        'Starting pre-processing of database to aid most outdated page queries'
    )
    db_pre_processor = DBPreProcessor()
    db_pre_processor.run_pre_processing()

    end_time = time.time()
    mins_taken = int((end_time - start_time) // 60)
    secs_taken = round((end_time - start_time) % 60, 1)
    logging.info(
        f'Finished end-to-end ETL for {len(download_paths)} tables in {mins_taken} min {secs_taken} s'
    )

    if error_files:
        if fail_upon_errors:
            raise RuntimeError(
                f'Encountered the following errors while loading these tables: {error_files}'
            )
        else:
            logging.warning(
                f'Encountered the following errors while loading these tables: {error_files}'
            )
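
A possible command-line entry point for main, sketched with argparse; the flag names and the YYYYMMDD date format are assumptions, only the main(wiki_dump_date, fail_upon_errors) signature comes from the example above.

# Hypothetical CLI wrapper around main(); argument names and date format are assumptions
if __name__ == '__main__':
    import argparse
    import datetime as dt

    arg_parser = argparse.ArgumentParser(description='Run the Wikipedia Assistant ETL job')
    arg_parser.add_argument('--dump-date', default=None,
                            help='Optional dump date in YYYYMMDD format; the latest dump is used if omitted')
    arg_parser.add_argument('--ignore-errors', action='store_true',
                            help='Log load errors instead of raising at the end of the run')
    args = arg_parser.parse_args()

    dump_date = (dt.datetime.strptime(args.dump_date, '%Y%m%d').date()
                 if args.dump_date else None)
    main(wiki_dump_date=dump_date, fail_upon_errors=not args.ignore_errors)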