def createDB(self):
    """Create a new database to load SQL dump files, if required."""
    # Server-level connection (no schema selected) just to issue
    # CREATE DATABASE, released immediately afterwards.
    server_con = MySQLDB(host=self.host, port=self.port,
                         user=self.db_user, passwd=self.db_passw)
    server_con.connect()
    server_con.create_database(self.db_name)
    server_con.close()
def DB_exists(self):
    """Return True if the target database already exists, else False."""
    checker = MySQLDB(host=self.host, port=self.port,
                      user=self.db_user, passwd=self.db_passw)
    checker.connect()
    found = checker.db_exists(self.db_name)
    checker.close()
    return found
def execute( self, page_fan, rev_fan, page_cache_size, rev_cache_size, host, port, db_name, db_user, db_passw, db_engine, mirror, download_files, base_ports, control_ports, dumps_dir=None, ): """ Run data retrieval and loading actions. Arguments: - page_fan = Number of workers to fan out page elements parsing - rev_fan = Number of workers to fan out rev elements parsing - db_user = User name to connect to local database - db_passw = Password for database user - mirror = Base URL of site hosting XML dumps """ if download_files: # TODO: Use proper logging module to track execution progress # Choose corresponding file downloader and etl wrapper print "Downloading new dump files from %s, for language %s" % (mirror, self.lang) self.down = RevHistDownloader(mirror, self.lang) # Donwload latest set of dump files self.paths, self.date = self.down.download(self.date) print "Got files for lang %s, date: %s" % (self.lang, self.date) # db_name = self.lang + '_' + self.date.strip('/') else: # Case of dumps folder provided explicity if dumps_dir: # Allow specifying relative paths, as well dumps_path = os.path.expanduser(dumps_dir) # Retrieve path to all available files to feed ETL lines if not os.path.exists(dumps_path): print "No dump files will be downloaded and local folder " print "with dump files not found. Please, specify a " print "valid path to local folder containing dump files." print "Program will exit now." sys.exit() else: # Attempt to find list of .7z or .xml files to be processed self.paths = glob.glob(os.path.join(dumps_path, "*.7z")) if not self.paths: self.paths = glob.glob(os.path.join(dumps_path, "*.xml")) if not self.paths: print "Directory %s" % dumps_dir print "does not contain any valid dump file." print "Program will exit now." 
sys.exit() # If not provided explicitly, look for default location of # dumps directory else: dumps_dir = os.path.join(self.lang + "_dumps", self.date) # Look up dump files in default directory name if not os.path.exists(dumps_dir): print "Default directory %s" % dumps_dir print " containing dump files not found." print "Program will exit now." sys.exit() else: self.paths = glob.glob(dumps_dir + "/*.7z") print "paths: " + unicode(self.paths) # DB SCHEMA PREPARATION db_create = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw) db_create.connect() db_create.create_database(db_name) db_create.close() db_schema = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw, db=db_name) db_schema.connect() db_schema.create_schema(engine=db_engine) db_schema.close() # Complete the queue of paths to be processed and STOP flags for # each ETL subprocess paths_queue = mp.JoinableQueue() for path in self.paths: paths_queue.put(path) for x in range(self.etl_lines): paths_queue.put("STOP") for x in range(self.etl_lines): new_etl = PageRevisionETL( name="ETL-process-%s" % x, paths_queue=paths_queue, lang=self.lang, page_fan=page_fan, rev_fan=rev_fan, page_cache_size=page_cache_size, rev_cache_size=rev_cache_size, db_name=db_name, db_user=db_user, db_passw=db_passw, base_port=base_ports[x] + (20 * x), control_port=control_ports[x] + (20 * x), ) self.etl_list.append(new_etl) print "ETL process for page and revision history defined OK." print "Proceeding with ETL workflows. This may take time..." 
# Extract, process and load information in local DB for etl in self.etl_list: etl.start() # Wait for ETL lines to finish for etl in self.etl_list: etl.join() # TODO: logger; ETL step completed, proceeding with data # analysis and visualization print "ETL process finished for language %s and date %s" % (self.lang, self.date) # Create primary keys for all tables # TODO: This must also be tracked by official logging module print "Now creating primary key indexes in database tables." print "This may take a while..." db_pks = MySQLDB(host="localhost", port=3306, user=db_user, passwd=db_passw, db=db_name) db_pks.connect() db_pks.create_pks() db_pks.close()
def execute(self, page_fan, rev_fan, page_cache_size, rev_cache_size, mirror, download_files, base_ports, control_ports, dumps_dir=None, debug=False): """ Run data retrieval and loading actions. Arguments: - page_fan = Number of workers to fan out page elements parsing - rev_fan = Number of workers to fan out rev elements parsing - db_user = User name to connect to local database - db_passw = Password for database user - mirror = Base URL of site hosting XML dumps """ print "----------------------------------------------------------" print ("""Executing ETL:RevHistory on lang: {0} date: {1}""" .format(self.lang, self.date)) print ("ETL lines = {0} page_fan = {1} rev_fan = {2}" .format(self.etl_lines, page_fan, rev_fan)) print "Download files =", download_files print "Start time is {0}".format(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())) print "----------------------------------------------------------" print if download_files: # TODO: Use proper logging module to track execution progress # Choose corresponding file downloader and etl wrapper print "Downloading new dump files from %s, for language %s" % ( mirror, self.lang) self.down = download.RevHistDownloader(mirror, self.lang, dumps_dir) # Donwload latest set of dump files self.paths, self.date = self.down.download(self.date) if not self.paths: print "Error: dump files with pages-logging info not found." print "Program will exit now." sys.exit() print "Retrieved dump files for lang %s, date: %s" % (self.lang, self.date) print else: print "Looking for revision-history dump file(s) in data dir" # Case of dumps folder provided explicity if dumps_dir: # Allow specifying relative paths, as well abs_dumps_path = os.path.expanduser(dumps_dir) dumps_path = os.path.join(abs_dumps_path, self.lang + '_dumps', self.date) # Retrieve path to all available files to feed ETL lines if not os.path.exists(dumps_path): print "No dump files will be downloaded and local folder with dump files not found." 
print "Please, specify a valid path to local folder containing dump files." print "Program will exit now." sys.exit() else: # Attempt to find list of .7z or .xml files to be processed self.paths = glob.glob(os.path.join(dumps_path, '*pages-meta-hsitory*.7z')) if not self.paths: self.paths = glob.glob(os.path.join(dumps_path, '*pages-meta-hsitory*.xml')) if not self.paths: print "Directory %s does not contain any valid dump file." % dumps_path print "Program will exit now." sys.exit() # If not provided explicitly, look for default location of # dumps directory else: dumps_dir = os.path.join("data", self.lang + '_dumps', self.date) # Look up dump files in default directory name if not os.path.exists(dumps_dir): print "Default directory %s containing dump files not found." % dumps_dir print "Program will exit now." sys.exit() else: self.paths = glob.glob(os.path.join(dumps_dir, '*pages-meta-history*.7z')) if not self.paths: self.paths = glob.glob(os.path.join(dumps_dir, '*pages-meta-hsitory*.xml')) if not self.paths: print "Directory %s does not contain any valid dump file." % dumps_dir print "Program will exit now." sys.exit() print "Found revision-history dump file(s) to process." 
print # Print list of file paths in debug mode if debug: print "paths: " + unicode(self.paths) print # Create database # TODO: Empty correspoding tables if DB already exists # or let the user select behaviour with config argument if self.DB_exists(): self.create_DB(complete=False) else: self.create_DB(complete=True) # First insert namespace info in DB dump = DumpFile(self.paths[0]) db_schema = MySQLDB(host=self.host, port=self.port, user=self.db_user, passwd=self.db_passw, db=self.db_name) db_schema.connect() db_schema.insert_namespaces(nsdict=dump.get_namespaces()) db_schema.close() # Complete the queue of paths to be processed and STOP flags for # each ETL subprocess paths_queue = mp.JoinableQueue() for path in self.paths: paths_queue.put(path) for x in range(self.etl_lines): paths_queue.put('STOP') for x in range(self.etl_lines): new_etl = RevisionHistoryETL( name="[ETL:RevHistory-%s]" % x, paths_queue=paths_queue, lang=self.lang, page_fan=page_fan, rev_fan=rev_fan, page_cache_size=page_cache_size, rev_cache_size=rev_cache_size, db_name=self.db_name, db_user=self.db_user, db_passw=self.db_passw, base_port=base_ports[x]+(20*x), control_port=control_ports[x]+(20*x) ) self.etl_list.append(new_etl) print "ETL:RevHistory task defined OK." print "Proceeding with ETL workflows. This may take time..." 
print # Extract, process and load information in local DB for etl in self.etl_list: etl.start() # Wait a second for new ETL process to start all subprocesses time.sleep(1) # Wait for ETL lines to finish for etl in self.etl_list: etl.join() # Insert user info after all ETL lines have finished # to ensure that all metadata are stored in Redis cache # disregarding of the execution order data_dir = os.path.join(os.getcwd(), os.path.split(self.paths[0])[0]) db_users = MySQLDB(host=self.host, port=self.port, user=self.db_user, passwd=self.db_passw, db=self.db_name) db_users.connect() users_file_to_db(con=db_users, lang=self.lang, log_file=os.path.join(data_dir, 'logs', 'users.log'), tmp_dir=os.path.join(data_dir, 'tmp') ) db_users.close() # TODO: logger; ETL step completed, proceeding with data # analysis and visualization print "ETL:RevHistory task finished for language %s and date %s" % ( self.lang, self.date) print # Create primary keys for all tables # TODO: This must also be tracked by main logging module print "Now creating primary key indexes in database tables." print "This may take a while..." print db_pks = MySQLDB(host='localhost', port=3306, user=self.db_user, passwd=self.db_passw, db=self.db_name) db_pks.connect() db_pks.create_pks_revhist() db_pks.close()
def create_DB(self, complete=False):
    """Create the revision-history schema, optionally the database too.

    Arguments:
        - complete: when True, create the database itself before
          building the schema inside it.
    """
    if complete:
        server_con = MySQLDB(host=self.host, port=self.port,
                             user=self.db_user, passwd=self.db_passw)
        server_con.connect()
        server_con.create_database(self.db_name)
        server_con.close()
    schema_con = MySQLDB(host=self.host, port=self.port,
                         user=self.db_user, passwd=self.db_passw,
                         db=self.db_name)
    schema_con.connect()
    schema_con.create_schema_revhist(engine=self.db_engine)
    schema_con.close()
def execute(self, log_fan, log_cache_size, mirror, download_files, base_ports, control_ports, dumps_dir=None, debug=False): """ Run data retrieval and loading actions. Arguments: - log_fan = Number of workers to fan out logitem elements parsing - db_user = User name to connect to local database - db_passw = Password for database user - mirror = Base URL of site hosting XML dumps """ print "----------------------------------------------------------" print("Executing ETL:PagesLogging on lang: {0} date: {1}" .format(self.lang, self.date)) print "log_fan =", log_fan print "Download files =", download_files print "Start time is {0}".format(time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())) print "----------------------------------------------------------" print if download_files: # TODO: Use proper logging module to track execution progress # Choose corresponding file downloader and etl wrapper print """Downloading new logging dump files from %s, for language %s""" % (mirror, self.lang) self.down = download.LoggingDownloader(mirror, self.lang, dumps_dir) # Donwload latest set of dump files self.paths, self.date = self.down.download(self.date) if not self.paths: print "Error: dump files with pages-logging info not found." print "Program will exit now." sys.exit() print "Got files for lang %s, date: %s" % (self.lang, self.date) else: print "Looking for pages-logging dump file in data dir" # Case of dumps folder provided explicity if dumps_dir: # Allow specifying relative paths, as well abs_dumps_path = os.path.expanduser(dumps_dir) dumps_path = os.path.join(abs_dumps_path, self.lang + '_dumps', self.date) # Retrieve path to all available files to feed ETL lines if not os.path.exists(dumps_path): print "No dump files will be downloaded and local folder with dump files not found." print "Please, specify a valid path to local folder containing dump files." print "Program will exit now." 
sys.exit() else: # Attempt to find list of *page-logging*.gz or # *page-logging*.xml files to be processed self.paths = glob.glob(os.path.join(dumps_path, '*pages-logging*.gz')) if not self.paths: self.paths = glob.glob(os.path.join(dumps_path, '*pages-logging*.xml')) if not self.paths: print "Directory %s does not contain any valid dump file." % dumps_path print "Program will exit now." sys.exit() # If not provided explicitly, look for default location of # dumps directory else: dumps_dir = os.path.join("data", self.lang + '_dumps', self.date) # Look up dump files in default directory name if not os.path.exists(dumps_dir): print "Default directory %s containing dump files not found." % dumps_dir print "Program will exit now." sys.exit() else: self.paths = glob.glob(os.path.join(dumps_dir, '*pages-logging*.gz')) if not self.paths: self.paths = glob.glob(os.path.join(dumps_dir, '*pages-logging*.xml')) if not self.paths: print "Directory %s does not contain any valid dump file." % dumps_dir print "Program will exit now." sys.exit() print "Found pages-logging dump file to process." print if debug: print "paths: " + unicode(self.paths) print # Create database if it does not exist # empty logging table otherwise if self.DB_exists(): self.create_DB(complete=False) else: self.create_DB(complete=True) new_etl = LoggingETL(name="[ETL:PagesLogging-0]", path=self.paths, lang=self.lang, log_fan=log_fan, log_cache_size=log_cache_size, db_name=self.db_name, db_user=self.db_user, db_passw=self.db_passw, base_port=base_ports[0]+(30), control_port=control_ports[0]+(30) ) print "ETL:Logging task for administrative records defined OK." print "Proceeding with ETL workflow. This may take time..." 
print # Extract, process and load information in local DB new_etl.start() # Wait for ETL line to finish new_etl.join() # TODO: logger; ETL step completed, proceeding with data # analysis and visualization print "ETL:Logging task finished for lang %s and date %s" % ( self.lang, self.date) print # Create primary keys for all tables # TODO: This must also be tracked by official logging module print "Now creating primary key indexes in database tables." print "This may take a while..." print db_pks = MySQLDB(host='localhost', port=3306, user=self.db_user, passwd=self.db_passw, db=self.db_name) db_pks.connect() db_pks.create_pks_logitem() db_pks.close()
def run(self):
    """
    Execute workflow to import revision history data from dump files

    The data loading workflow is composed of a number of processor
    elements, which can be:

        - Producer (P): raw input data --> input element queue
        - ConsumerProducer (CP): input element queue --> insert db queue
        - Consumer (C): insert db queue --> database (MySQL/MariaDB)

    In this case, the logical combination is usually N:N:1 (P, CP, C)

    Paths are pulled from self.paths_queue until a 'STOP' sentinel is
    read; each path spins up one full Producer/Processor/Consumer
    pipeline wired together through ZMQ-style ports derived from
    self.base_port / self.control_port.
    """
    start = time.time()
    print self.name, "Starting PageRevisionETL workflow at %s" % (
        time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))

    # Dedicated DB connections for this ETL line.
    # NOTE(review): db_ns is opened and closed but never used in
    # between — possibly dead code; confirm before removing.
    db_ns = MySQLDB(host='localhost', port=3306, user=self.db_user,
                    passwd=self.db_passw, db=self.db_name)
    db_ns.connect()
    db_pages = MySQLDB(host='localhost', port=3306, user=self.db_user,
                       passwd=self.db_passw, db=self.db_name)
    db_pages.connect()
    db_revs = MySQLDB(host='localhost', port=3306, user=self.db_user,
                      passwd=self.db_passw, db=self.db_name)
    db_revs.connect()

    # DATA EXTRACTION
    # Use consistent naming for all child processes
    xml_reader_name = '-'.join([self.name, 'xml_reader'])
    page_proc_name = '-'.join([self.name, 'process_page'])
    rev_proc_name = '-'.join([self.name, 'process_revision'])
    page_insert_name = '-'.join([self.name, 'insert_page'])
    rev_insert_name = '-'.join([self.name, 'insert_revision'])

    # Process dump files until the 'STOP' sentinel is dequeued
    for path in iter(self.paths_queue.get, 'STOP'):
        # Start subprocess to extract elements from revision dump file
        dump_file = DumpFile(path)
        # One producer fans out pages and revisions on two ports
        xml_reader = Producer(name=xml_reader_name,
                              target=process_xml,
                              kwargs=dict(dump_file=dump_file),
                              consumers=self.page_fan + self.rev_fan,
                              push_pages_port=self.base_port,
                              push_revs_port=self.base_port + 1,
                              control_port=self.control_port)
        xml_reader.start()
        print xml_reader_name, "started"
        print self.name, "Extracting data from XML revision history file:"
        print path

        # List to keep tracking of page and revision workers
        workers = []
        db_workers_revs = []
        # Create and start page processes
        for worker in range(self.page_fan):
            page_worker_name = '-'.join([page_proc_name, unicode(worker)])
            process_page = Processor(name=page_worker_name,
                                     target=pages_to_file,
                                     producers=1, consumers=1,
                                     pull_port=self.base_port,
                                     push_port=self.base_port + 2,
                                     control_port=self.control_port)
            process_page.start()
            workers.append(process_page)
            print page_worker_name, "started"

        # Create and start revision processes
        for worker in range(self.rev_fan):
            rev_worker_name = '-'.join([rev_proc_name, unicode(worker)])
            # Each revision worker gets its own DB connection
            db_wrev = MySQLDB(host='localhost', port=3306,
                              user=self.db_user, passwd=self.db_passw,
                              db=self.db_name)
            db_wrev.connect()
            process_revision = Processor(name=rev_worker_name,
                                         target=revs_to_file,
                                         kwargs=dict(lang=self.lang),
                                         producers=1, consumers=1,
                                         pull_port=self.base_port + 1,
                                         push_port=self.base_port + 3,
                                         control_port=self.control_port)
            process_revision.start()
            workers.append(process_revision)
            db_workers_revs.append(db_wrev)
            print rev_worker_name, "started"

        # Create directory for logging files if it does not exist
        log_dir = os.path.join(os.path.split(path)[0], 'logs')
        tmp_dir = os.path.join(os.getcwd(), os.path.split(path)[0], 'tmp')
        file_name = os.path.split(path)[1]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        log_file = os.path.join(log_dir, file_name + '.log')

        # Consumers drain the two push ports into the database
        page_insert_db = Consumer(name=page_insert_name,
                                  target=pages_file_to_db,
                                  kwargs=dict(con=db_pages,
                                              log_file=log_file,
                                              tmp_dir=tmp_dir,
                                              file_rows=self.page_cache_size,
                                              etl_prefix=self.name),
                                  producers=self.page_fan,
                                  pull_port=self.base_port + 2)
        rev_insert_db = Consumer(name=rev_insert_name,
                                 target=revs_file_to_db,
                                 kwargs=dict(con=db_revs,
                                             log_file=log_file,
                                             tmp_dir=tmp_dir,
                                             file_rows=self.rev_cache_size,
                                             etl_prefix=self.name),
                                 producers=self.rev_fan,
                                 pull_port=self.base_port + 3)
        page_insert_db.start()
        print page_insert_name, "started"
        rev_insert_db.start()
        print rev_insert_name, "started"

        print self.name, "Waiting for all processes to finish..."
        print
        # Join in pipeline order: producer, workers, then consumers
        xml_reader.join()
        for w in workers:
            w.join()
        page_insert_db.join()
        rev_insert_db.join()
        # Mark this path as done
        self.paths_queue.task_done()

    # Mark STOP message as processed and finish
    self.paths_queue.task_done()
    end = time.time()
    print self.name, ": All tasks done in %.4f sec." % ((end - start) / 1.)
    print
    db_ns.close()
    db_pages.close()
    db_revs.close()
    # NOTE(review): db_workers_revs is bound inside the path loop;
    # if the queue starts with 'STOP' this raises NameError — confirm
    # the queue always carries at least one path.
    for dbcon in db_workers_revs:
        dbcon.close()
def run(self): """ Execute workflow to import logging records of actions on pages and users from dump file The data loading workflow is composed of a number of processor elements, which can be: - Producer (P): raw input data --> input element queue - ConsumerProducer (CP): input element queue --> insert db queue - Consumer (C): insert db queue --> database (MySQL/MariaDB) In this case, the usual combination is 1:N:1 (P, CP, C) """ start = time.time() print "Starting LoggingETL workflow at %s" % (time.strftime( "%Y-%m-%d %H:%M:%S %Z", time.localtime())) # DATA EXTRACTION xml_reader_name = '-'.join([self.name, 'xml_reader']) logitem_proc_name = '-'.join([self.name, 'process_logitem']) logitem_insert_name = '-'.join([self.name, 'insert_logitem']) # Start subprocess to extract elements from logging dump file file_path = self.path[0] dump_file = DumpFile(file_path) xml_reader = Producer(name=xml_reader_name, target=process_xml, kwargs=dict(dump_file=dump_file), consumers=self.log_fan, push_logs_port=self.base_port, control_port=self.control_port) xml_reader.start() print xml_reader_name, "started" print self.name, "Extracting data from XML revision history file:" print unicode(self.path[0]) # List to keep tracking of logitem workers workers = [] # Create and start page processes for worker in range(self.log_fan): worker_name = '-'.join([logitem_proc_name, unicode(worker)]) process_logitems = Processor(name=worker_name, target=logitem_to_file, producers=1, consumers=1, pull_port=self.base_port, push_port=self.base_port + 2, control_port=self.control_port) process_logitems.start() workers.append(process_logitems) print worker_name, "started" # Create directory for logging files if it does not exist log_dir = os.path.join(os.path.split(file_path)[0], 'logs') tmp_dir = os.path.join(os.getcwd(), os.path.split(file_path)[0], 'tmp') file_name = os.path.split(file_path)[1] if not os.path.exists(log_dir): os.makedirs(log_dir) if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) 
log_file = os.path.join(log_dir, file_name + '.log') db_log = MySQLDB(host='localhost', port=3306, user=self.db_user, passwd=self.db_passw, db=self.db_name) db_log.connect() logitem_insert_db = Consumer(name=logitem_insert_name, target=logitem_file_to_db, kwargs=dict(con=db_log, log_file=log_file, tmp_dir=tmp_dir, file_rows=self.log_cache_size, etl_prefix=self.name), producers=self.log_fan, pull_port=self.base_port + 2) print logitem_insert_name, "started" logitem_insert_db.start() print "Waiting for all processes to finish..." print xml_reader.join() for w in workers: w.join() logitem_insert_db.join() # All operations finished end = time.time() print "All tasks done in %.4f sec." % ((end - start) / 1.) print db_log.close()
def run(self):
    """
    Execute workflow to import revision history data from dump files

    The data loading workflow is composed of a number of processor
    elements, which can be:

        - Producer (P): raw input data --> input element queue
        - ConsumerProducer (CP): input element queue --> insert db queue
        - Consumer (C): insert db queue --> database (MySQL/MariaDB)

    In this case, the logical combination is usually N:N:1 (P, CP, C)

    Paths are pulled from self.paths_queue until a 'STOP' sentinel is
    read; each path spins up one Producer/Processor/Consumer pipeline
    wired through ports derived from self.base_port/self.control_port.
    """
    start = time.time()
    print self.name, "Starting PageRevisionETL workflow at %s" % (
        time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime()))

    # Dedicated DB connections for this ETL line.
    # NOTE(review): db_ns is opened and closed but never used in
    # between — possibly dead code; confirm before removing.
    db_ns = MySQLDB(host='localhost', port=3306, user=self.db_user,
                    passwd=self.db_passw, db=self.db_name)
    db_ns.connect()
    db_pages = MySQLDB(host='localhost', port=3306, user=self.db_user,
                       passwd=self.db_passw, db=self.db_name)
    db_pages.connect()
    db_revs = MySQLDB(host='localhost', port=3306, user=self.db_user,
                      passwd=self.db_passw, db=self.db_name)
    db_revs.connect()

    # DATA EXTRACTION
    # Use consistent naming for all child processes
    xml_reader_name = '-'.join([self.name, 'xml_reader'])
    page_proc_name = '-'.join([self.name, 'process_page'])
    rev_proc_name = '-'.join([self.name, 'process_revision'])
    page_insert_name = '-'.join([self.name, 'insert_page'])
    rev_insert_name = '-'.join([self.name, 'insert_revision'])

    # Process dump files until the 'STOP' sentinel is dequeued
    for path in iter(self.paths_queue.get, 'STOP'):
        # Start subprocess to extract elements from revision dump file
        dump_file = DumpFile(path)
        # One producer fans out pages and revisions on two ports
        xml_reader = Producer(name=xml_reader_name,
                              target=process_xml,
                              kwargs=dict(dump_file=dump_file),
                              consumers=self.page_fan + self.rev_fan,
                              push_pages_port=self.base_port,
                              push_revs_port=self.base_port+1,
                              control_port=self.control_port)
        xml_reader.start()
        print xml_reader_name, "started"
        print self.name, "Extracting data from XML revision history file:"
        print path

        # List to keep tracking of page and revision workers
        workers = []
        db_workers_revs = []
        # Create and start page processes
        for worker in range(self.page_fan):
            page_worker_name = '-'.join([page_proc_name, unicode(worker)])
            process_page = Processor(name=page_worker_name,
                                     target=pages_to_file,
                                     producers=1, consumers=1,
                                     pull_port=self.base_port,
                                     push_port=self.base_port+2,
                                     control_port=self.control_port)
            process_page.start()
            workers.append(process_page)
            print page_worker_name, "started"

        # Create and start revision processes
        for worker in range(self.rev_fan):
            rev_worker_name = '-'.join([rev_proc_name, unicode(worker)])
            # Each revision worker gets its own DB connection
            db_wrev = MySQLDB(host='localhost', port=3306,
                              user=self.db_user, passwd=self.db_passw,
                              db=self.db_name)
            db_wrev.connect()
            process_revision = Processor(name=rev_worker_name,
                                         target=revs_to_file,
                                         kwargs=dict(lang=self.lang),
                                         producers=1, consumers=1,
                                         pull_port=self.base_port+1,
                                         push_port=self.base_port+3,
                                         control_port=self.control_port)
            process_revision.start()
            workers.append(process_revision)
            db_workers_revs.append(db_wrev)
            print rev_worker_name, "started"

        # Create directory for logging files if it does not exist
        log_dir = os.path.join(os.path.split(path)[0], 'logs')
        tmp_dir = os.path.join(os.getcwd(), os.path.split(path)[0], 'tmp')
        file_name = os.path.split(path)[1]
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        if not os.path.exists(tmp_dir):
            os.makedirs(tmp_dir)
        log_file = os.path.join(log_dir, file_name + '.log')

        # Consumers drain the two push ports into the database
        page_insert_db = Consumer(name=page_insert_name,
                                  target=pages_file_to_db,
                                  kwargs=dict(con=db_pages,
                                              log_file=log_file,
                                              tmp_dir=tmp_dir,
                                              file_rows=self.page_cache_size,
                                              etl_prefix=self.name),
                                  producers=self.page_fan,
                                  pull_port=self.base_port+2)
        rev_insert_db = Consumer(name=rev_insert_name,
                                 target=revs_file_to_db,
                                 kwargs=dict(con=db_revs,
                                             log_file=log_file,
                                             tmp_dir=tmp_dir,
                                             file_rows=self.rev_cache_size,
                                             etl_prefix=self.name),
                                 producers=self.rev_fan,
                                 pull_port=self.base_port+3)
        page_insert_db.start()
        print page_insert_name, "started"
        rev_insert_db.start()
        print rev_insert_name, "started"

        print self.name, "Waiting for all processes to finish..."
        print
        # Join in pipeline order: producer, workers, then consumers
        xml_reader.join()
        for w in workers:
            w.join()
        page_insert_db.join()
        rev_insert_db.join()
        # Mark this path as done
        self.paths_queue.task_done()

    # Mark STOP message as processed and finish
    self.paths_queue.task_done()
    end = time.time()
    print self.name, ": All tasks done in %.4f sec." % ((end-start)/1.)
    print
    db_ns.close()
    db_pages.close()
    db_revs.close()
    # NOTE(review): db_workers_revs is bound inside the path loop;
    # if the queue starts with 'STOP' this raises NameError — confirm
    # the queue always carries at least one path.
    for dbcon in db_workers_revs:
        dbcon.close()
def run(self): """ Execute workflow to import logging records of actions on pages and users from dump file The data loading workflow is composed of a number of processor elements, which can be: - Producer (P): raw input data --> input element queue - ConsumerProducer (CP): input element queue --> insert db queue - Consumer (C): insert db queue --> database (MySQL/MariaDB) In this case, the usual combination is 1:N:1 (P, CP, C) """ start = time.time() print "Starting LoggingETL workflow at %s" % ( time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())) # DATA EXTRACTION xml_reader_name = '-'.join([self.name, 'xml_reader']) logitem_proc_name = '-'.join([self.name, 'process_logitem']) logitem_insert_name = '-'.join([self.name, 'insert_logitem']) # Start subprocess to extract elements from logging dump file file_path = self.path[0] dump_file = DumpFile(file_path) xml_reader = Producer(name=xml_reader_name, target=process_xml, kwargs=dict( dump_file=dump_file), consumers=self.log_fan, push_logs_port=self.base_port, control_port=self.control_port) xml_reader.start() print xml_reader_name, "started" print self.name, "Extracting data from XML revision history file:" print unicode(self.path[0]) # List to keep tracking of logitem workers workers = [] # Create and start page processes for worker in range(self.log_fan): worker_name = '-'.join([logitem_proc_name, unicode(worker)]) process_logitems = Processor(name=worker_name, target=logitem_to_file, producers=1, consumers=1, pull_port=self.base_port, push_port=self.base_port+2, control_port=self.control_port) process_logitems.start() workers.append(process_logitems) print worker_name, "started" # Create directory for logging files if it does not exist log_dir = os.path.join(os.path.split(file_path)[0], 'logs') tmp_dir = os.path.join(os.getcwd(), os.path.split(file_path)[0], 'tmp') file_name = os.path.split(file_path)[1] if not os.path.exists(log_dir): os.makedirs(log_dir) if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) 
log_file = os.path.join(log_dir, file_name + '.log') db_log = MySQLDB(host='localhost', port=3306, user=self.db_user, passwd=self.db_passw, db=self.db_name) db_log.connect() logitem_insert_db = Consumer(name=logitem_insert_name, target=logitem_file_to_db, kwargs=dict(con=db_log, log_file=log_file, tmp_dir=tmp_dir, file_rows=self.log_cache_size, etl_prefix=self.name), producers=self.log_fan, pull_port=self.base_port+2) print logitem_insert_name, "started" logitem_insert_db.start() print "Waiting for all processes to finish..." print xml_reader.join() for w in workers: w.join() logitem_insert_db.join() # All operations finished end = time.time() print "All tasks done in %.4f sec." % ((end-start)/1.) print db_log.close()
def execute(self, log_fan, log_cache_size, mirror, download_files,
            base_ports, control_ports, dumps_dir=None, debug=False):
    """
    Run data retrieval and loading actions.

    Arguments:
        - log_fan = Number of workers to fan out logitem elements parsing
        - mirror = Base URL of site hosting XML dumps
        - download_files = If True, fetch dumps from mirror; otherwise
          look them up locally (in dumps_dir or the default data dir)
        - base_ports, control_ports = Port assignments for the ETL line
        - dumps_dir = Optional local folder containing dump files
        - debug = If True, print the list of dump file paths found
    """
    print("----------------------------------------------------------")
    print("Executing ETL:PagesLogging on lang: {0} date: {1}".format(
        self.lang, self.date))
    print("log_fan =", log_fan)
    print("Download files =", download_files)
    print("Start time is {0}".format(
        time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())))
    print("----------------------------------------------------------")
    print()
    if download_files:
        # TODO: Use proper logging module to track execution progress
        # Choose corresponding file downloader and etl wrapper
        print("""Downloading new logging dump files from %s, for language %s""" % (mirror, self.lang))
        self.down = LoggingDownloader(mirror, self.lang, dumps_dir)
        # Download latest set of dump files
        self.paths, self.date = self.down.download(self.date)
        if not self.paths:
            print("Error: dump files with pages-logging info not found.")
            print("Program will exit now.")
            sys.exit()
        print("Got files for lang %s, date: %s" % (self.lang, self.date))
    else:
        print("Looking for pages-logging dump file in data dir")
        # Case of dumps folder provided explicitly
        if dumps_dir:
            # Allow specifying relative paths, as well
            abs_dumps_path = os.path.expanduser(dumps_dir)
            dumps_path = os.path.join(abs_dumps_path, self.lang + '_dumps',
                                      self.date)
            # Retrieve path to all available files to feed ETL lines
            if not os.path.exists(dumps_path):
                print(
                    "No dump files will be downloaded and local folder with dump files not found."
                )
                print(
                    "Please, specify a valid path to local folder containing dump files."
                )
                print("Program will exit now.")
                sys.exit()
            else:
                # Attempt to find list of *page-logging*.gz or
                # *page-logging*.xml files to be processed
                self.paths = glob.glob(
                    os.path.join(dumps_path, '*pages-logging*.gz'))
                if not self.paths:
                    self.paths = glob.glob(
                        os.path.join(dumps_path, '*pages-logging*.xml'))
                    if not self.paths:
                        print(
                            "Directory %s does not contain any valid dump file."
                            % dumps_path)
                        print("Program will exit now.")
                        sys.exit()
        # If not provided explicitly, look for default location of
        # dumps directory
        else:
            dumps_dir = os.path.join("data", self.lang + '_dumps', self.date)
            # Look up dump files in default directory name
            if not os.path.exists(dumps_dir):
                print(
                    "Default directory %s containing dump files not found."
                    % dumps_dir)
                print("Program will exit now.")
                sys.exit()
            else:
                self.paths = glob.glob(
                    os.path.join(dumps_dir, '*pages-logging*.gz'))
                if not self.paths:
                    self.paths = glob.glob(
                        os.path.join(dumps_dir, '*pages-logging*.xml'))
                    if not self.paths:
                        print(
                            "Directory %s does not contain any valid dump file."
                            % dumps_dir)
                        print("Program will exit now.")
                        sys.exit()
        print("Found pages-logging dump file to process.")
        print()
    if debug:
        print("paths: ", str(self.paths))
        print()
    # Create database if it does not exist
    # empty logging table otherwise
    if self.DB_exists():
        self.create_DB(complete=False)
    else:
        self.create_DB(complete=True)

    # Single ETL line: logging dumps come in one file
    new_etl = LoggingETL(name="[ETL:PagesLogging-0]",
                         path=self.paths, lang=self.lang,
                         log_fan=log_fan, log_cache_size=log_cache_size,
                         db_name=self.db_name, db_user=self.db_user,
                         db_passw=self.db_passw,
                         base_port=base_ports[0] + (30),
                         control_port=control_ports[0] + (30))
    print("ETL:Logging task for administrative records defined OK.")
    print("Proceeding with ETL workflow. This may take time...")
    print()
    # Extract, process and load information in local DB
    new_etl.start()
    # Wait for ETL line to finish
    new_etl.join()

    # TODO: logger; ETL step completed, proceeding with data
    # analysis and visualization
    print("ETL:Logging task finished for lang %s and date %s" %
          (self.lang, self.date))
    print()
    # Create primary keys for all tables
    # TODO: This must also be tracked by official logging module
    print("Now creating primary key indexes in database tables.")
    print("This may take a while...")
    print()
    # FIX: use configured self.host/self.port instead of hard-coded
    # 'localhost':3306, consistent with DB_exists()/create_DB().
    db_pks = MySQLDB(host=self.host, port=self.port, user=self.db_user,
                     passwd=self.db_passw, db=self.db_name)
    db_pks.connect()
    db_pks.create_pks_logitem()
    db_pks.close()
def create_DB(self, complete=False):
    """
    Set up the local MySQL database used to store log-item data.

    Arguments:
        - complete: when True, create the database itself first and then
          build the logitem schema inside it; when False, assume the
          database already exists and only (re)build the logitem schema.
    """
    if complete:
        # Fresh install: connect without selecting a database so that
        # the target database can be created first.
        creator = MySQLDB(host=self.host, port=self.port,
                          user=self.db_user, passwd=self.db_passw)
        creator.connect()
        creator.create_database(self.db_name)
        creator.close()

    # (Re)create the logitem tables inside the target database with the
    # configured storage engine.
    schema_conn = MySQLDB(host=self.host, port=self.port,
                          user=self.db_user, passwd=self.db_passw,
                          db=self.db_name)
    schema_conn.connect()
    schema_conn.create_schema_logitem(engine=self.db_engine)
    schema_conn.close()
def execute(self, page_fan, rev_fan, page_cache_size, rev_cache_size,
            mirror, download_files, base_ports, control_ports,
            dumps_dir=None, debug=False):
    """
    Run data retrieval and loading actions for the revision-history ETL.

    Arguments:
        - page_fan = Number of workers to fan out page elements parsing
        - rev_fan = Number of workers to fan out rev elements parsing
        - page_cache_size = Size of cache for page elements
        - rev_cache_size = Size of cache for revision elements
        - mirror = Base URL of site hosting XML dumps
        - download_files = If True, fetch fresh dumps from the mirror;
          otherwise look for local dump files
        - base_ports, control_ports = Per-ETL-line base port numbers
        - dumps_dir = Optional explicit path to local folder with dumps
        - debug = If True, print the list of dump file paths found

    Exits the program (sys.exit) when no valid dump files can be found.
    """
    def _find_revhistory_dumps(directory):
        # Look up revision-history dump files in *directory*:
        # prefer compressed .7z dumps, fall back to plain .xml.
        # Returns a (possibly empty) list of matching paths.
        found = glob.glob(
            os.path.join(directory, '*pages-meta-history*.7z'))
        if not found:
            found = glob.glob(
                os.path.join(directory, '*pages-meta-history*.xml'))
        return found

    print("----------------------------------------------------------")
    print(("""Executing ETL:RevHistory on lang: {0} date: {1}""".format(
        self.lang, self.date)))
    print(("ETL lines = {0} page_fan = {1} rev_fan = {2}".format(
        self.etl_lines, page_fan, rev_fan)))
    print("Download files =", download_files)
    print("Start time is {0}".format(
        time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())))
    print("----------------------------------------------------------")
    print()

    if download_files:
        # TODO: Use proper logging module to track execution progress
        # Choose corresponding file downloader and etl wrapper
        print("Downloading new dump files from %s, for language %s" %
              (mirror, self.lang))
        self.down = RevHistDownloader(mirror, self.lang, dumps_dir)
        # Download latest set of dump files
        self.paths, self.date = self.down.download(self.date)
        if not self.paths:
            # Fixed message: this task handles revision-history dumps,
            # not pages-logging records (copy-paste from logging task)
            print("Error: dump files with revision-history info "
                  "not found.")
            print("Program will exit now.")
            sys.exit()

        print("Retrieved dump files for lang %s, date: %s" %
              (self.lang, self.date))
        print()

    else:
        print("Looking for revision-history dump file(s) in data dir")
        # Case of dumps folder provided explicitly
        if dumps_dir:
            # Allow specifying relative paths, as well
            abs_dumps_path = os.path.expanduser(dumps_dir)
            dumps_path = os.path.join(abs_dumps_path,
                                      self.lang + '_dumps', self.date)
            # Retrieve path to all available files to feed ETL lines
            if not os.path.exists(dumps_path):
                print("No dump files will be downloaded and local "
                      "folder with dump files not found.")
                print("Please, specify a valid path to local folder "
                      "containing dump files.")
                print("Program will exit now.")
                sys.exit()
            else:
                # Attempt to find list of .7z or .xml files to process
                self.paths = _find_revhistory_dumps(dumps_path)
                if not self.paths:
                    print("Directory %s does not contain any valid "
                          "dump file." % dumps_path)
                    print("Program will exit now.")
                    sys.exit()
        # If not provided explicitly, look for default location of
        # dumps directory
        else:
            dumps_dir = os.path.join("data", self.lang + '_dumps',
                                     self.date)
            # Look up dump files in default directory name
            if not os.path.exists(dumps_dir):
                print("Default directory %s containing dump files "
                      "not found." % dumps_dir)
                print("Program will exit now.")
                sys.exit()
            else:
                self.paths = _find_revhistory_dumps(dumps_dir)
                if not self.paths:
                    print("Directory %s does not contain any valid "
                          "dump file." % dumps_dir)
                    print("Program will exit now.")
                    sys.exit()
        print("Found revision-history dump file(s) to process.")
        print()

    # Print list of file paths in debug mode
    if debug:
        print("paths: ", str(self.paths))
        print()

    # Create database if missing; otherwise only rebuild the schema
    # TODO: Empty corresponding tables if DB already exists
    # or let the user select behaviour with config argument
    if self.DB_exists():
        self.create_DB(complete=False)
    else:
        self.create_DB(complete=True)

    # First insert namespace info in DB, read from the first dump file
    dump = DumpFile(self.paths[0])
    db_schema = MySQLDB(host=self.host, port=self.port,
                        user=self.db_user, passwd=self.db_passw,
                        db=self.db_name)
    db_schema.connect()
    db_schema.insert_namespaces(nsdict=dump.get_namespaces())
    db_schema.close()

    # Complete the queue of paths to be processed and STOP flags for
    # each ETL subprocess
    paths_queue = mp.JoinableQueue()
    for path in self.paths:
        paths_queue.put(path)
    for x in range(self.etl_lines):
        paths_queue.put('STOP')

    # One ETL line per requested worker; stagger the ZMQ port ranges
    # (20 apart) so concurrent lines never collide
    for x in range(self.etl_lines):
        new_etl = RevisionHistoryETL(
            name="[ETL:RevHistory-%s]" % x,
            paths_queue=paths_queue, lang=self.lang,
            page_fan=page_fan, rev_fan=rev_fan,
            page_cache_size=page_cache_size,
            rev_cache_size=rev_cache_size,
            db_name=self.db_name,
            db_user=self.db_user, db_passw=self.db_passw,
            base_port=base_ports[x] + (20 * x),
            control_port=control_ports[x] + (20 * x))
        self.etl_list.append(new_etl)

    print("ETL:RevHistory task defined OK.")
    print("Proceeding with ETL workflows. This may take time...")
    print()

    # Extract, process and load information in local DB
    for etl in self.etl_list:
        etl.start()
        # Wait a second for new ETL process to start all subprocesses
        time.sleep(1)

    # Wait for ETL lines to finish
    for etl in self.etl_list:
        etl.join()

    # Insert user info after all ETL lines have finished
    # to ensure that all metadata are stored in Redis cache
    # disregarding of the execution order
    data_dir = os.path.join(os.getcwd(),
                            os.path.split(self.paths[0])[0])
    db_users = MySQLDB(host=self.host, port=self.port,
                       user=self.db_user, passwd=self.db_passw,
                       db=self.db_name)
    db_users.connect()
    users_file_to_db(con=db_users, lang=self.lang,
                     log_file=os.path.join(data_dir, 'logs',
                                           'users.log'),
                     tmp_dir=os.path.join(data_dir, 'tmp'))
    db_users.close()
    # TODO: logger; ETL step completed, proceeding with data
    # analysis and visualization
    print("ETL:RevHistory task finished for language %s and date %s" %
          (self.lang, self.date))
    print()

    # Create primary keys for all tables
    # TODO: This must also be tracked by main logging module
    print("Now creating primary key indexes in database tables.")
    print("This may take a while...")
    print()
    # Use the configured host/port instead of hardcoded
    # 'localhost':3306, consistent with every other connection above
    db_pks = MySQLDB(host=self.host, port=self.port,
                     user=self.db_user, passwd=self.db_passw,
                     db=self.db_name)
    db_pks.connect()
    db_pks.create_pks_revhist()
    db_pks.close()