Example #1
 def createDB(self):
     """
     Creates a new DB to load SQL dump files if required
     """
     db_create = MySQLDB(host=self.host, port=self.port,
                         user=self.db_user, passwd=self.db_passw)
     db_create.connect()
     db_create.create_database(self.db_name)
     db_create.close()
Example #2
 def createDB(self):
     """
     Creates a new DB to load SQL dump files if required
     """
     db_create = MySQLDB(host=self.host,
                         port=self.port,
                         user=self.db_user,
                         passwd=self.db_passw)
     db_create.connect()
     db_create.create_database(self.db_name)
     db_create.close()
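Both variants above delegate the actual work to a project-specific MySQLDB wrapper class. For orientation only, here is a minimal sketch of the same create-database step written directly against the standard MySQLdb driver; the host, credentials, and database name are placeholder values, and the wrapper's real implementation may differ.

    # Sketch only: create a database with the plain MySQLdb driver.
    # Host, credentials and database name below are placeholders.
    import MySQLdb

    conn = MySQLdb.connect(host="localhost", port=3306,
                           user="wiki_user", passwd="wiki_pass")
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS wiki_dumps CHARACTER SET utf8")
    cursor.close()
    conn.close()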
Example #3
 def create_DB(self, complete=False):
     if complete:
         db_create = MySQLDB(host=self.host, port=self.port,
                             user=self.db_user, passwd=self.db_passw)
         db_create.connect()
         db_create.create_database(self.db_name)
         db_create.close()
     db_schema = MySQLDB(host=self.host, port=self.port, user=self.db_user,
                         passwd=self.db_passw, db=self.db_name)
     db_schema.connect()
     db_schema.create_schema_revhist(engine=self.db_engine)
     db_schema.close()
Example #4
 def create_DB(self, complete=False):
     if complete:
         db_create = MySQLDB(host=self.host,
                             port=self.port,
                             user=self.db_user,
                             passwd=self.db_passw)
         db_create.connect()
         db_create.create_database(self.db_name)
         db_create.close()
     db_schema = MySQLDB(host=self.host,
                         port=self.port,
                         user=self.db_user,
                         passwd=self.db_passw,
                         db=self.db_name)
     db_schema.connect()
     db_schema.create_schema_logitem(engine=self.db_engine)
     db_schema.close()
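A hypothetical call site for the method above; the loader object and its configuration attributes are assumptions and do not appear in these examples. Passing complete=True creates the database from scratch before adding the schema, while the default only adds the schema tables to an existing database.

    # Hypothetical usage; `loader` stands in for the object carrying
    # self.host, self.port, self.db_user, self.db_passw and self.db_name.
    loader.create_DB()               # add schema tables to an existing database
    loader.create_DB(complete=True)  # create the database first, then the schema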
Example #5
    def execute(
        self,
        page_fan,
        rev_fan,
        page_cache_size,
        rev_cache_size,
        host,
        port,
        db_name,
        db_user,
        db_passw,
        db_engine,
        mirror,
        download_files,
        base_ports,
        control_ports,
        dumps_dir=None,
    ):
        """
        Run data retrieval and loading actions.
        Arguments:
            - page_fan = Number of workers to fan out page elements parsing
            - rev_fan = Number of workers to fan out rev elements parsing
            - db_user = User name to connect to local database
            - db_passw = Password for database user
            - mirror = Base URL of site hosting XML dumps
        """
        if download_files:
            # TODO: Use proper logging module to track execution progress
            # Choose corresponding file downloader and etl wrapper
            print "Downloading new dump files from %s, for language %s" % (mirror, self.lang)
            self.down = RevHistDownloader(mirror, self.lang)
            # Download latest set of dump files
            self.paths, self.date = self.down.download(self.date)
            print "Got files for lang %s, date: %s" % (self.lang, self.date)

            # db_name = self.lang + '_' + self.date.strip('/')

        else:
            # Case of dumps folder provided explicitly
            if dumps_dir:
                # Allow specifying relative paths, as well
                dumps_path = os.path.expanduser(dumps_dir)
                # Retrieve path to all available files to feed ETL lines
                if not os.path.exists(dumps_path):
                    print "No dump files will be downloaded and local folder "
                    print "with dump files not found. Please, specify a "
                    print "valid path to local folder containing dump files."
                    print "Program will exit now."
                    sys.exit()

                else:
                    # Attempt to find list of .7z or .xml files to be processed
                    self.paths = glob.glob(os.path.join(dumps_path, "*.7z"))
                    if not self.paths:
                        self.paths = glob.glob(os.path.join(dumps_path, "*.xml"))
                        if not self.paths:
                            print "Directory %s" % dumps_dir
                            print "does not contain any valid dump file."
                            print "Program will exit now."
                            sys.exit()
            # If not provided explicitly, look for default location of
            # dumps directory
            else:
                dumps_dir = os.path.join(self.lang + "_dumps", self.date)
                # Look up dump files in default directory name
                if not os.path.exists(dumps_dir):
                    print "Default directory %s" % dumps_dir
                    print " containing dump files not found."
                    print "Program will exit now."
                    sys.exit()

                else:
                    self.paths = glob.glob(dumps_dir + "/*.7z")

        print "paths: " + unicode(self.paths)

        # DB SCHEMA PREPARATION
        db_create = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw)
        db_create.connect()
        db_create.create_database(db_name)
        db_create.close()
        db_schema = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw, db=db_name)
        db_schema.connect()
        db_schema.create_schema(engine=db_engine)
        db_schema.close()

        # Populate the queue of paths to be processed, plus one STOP flag
        # per ETL subprocess
        paths_queue = mp.JoinableQueue()
        for path in self.paths:
            paths_queue.put(path)

        for x in range(self.etl_lines):
            paths_queue.put("STOP")

        for x in range(self.etl_lines):
            new_etl = PageRevisionETL(
                name="ETL-process-%s" % x,
                paths_queue=paths_queue,
                lang=self.lang,
                page_fan=page_fan,
                rev_fan=rev_fan,
                page_cache_size=page_cache_size,
                rev_cache_size=rev_cache_size,
                db_name=db_name,
                db_user=db_user,
                db_passw=db_passw,
                base_port=base_ports[x] + (20 * x),
                control_port=control_ports[x] + (20 * x),
            )
            self.etl_list.append(new_etl)

        print "ETL process for page and revision history defined OK."
        print "Proceeding with ETL workflows. This may take time..."
        # Extract, process and load information in local DB
        for etl in self.etl_list:
            etl.start()

        # Wait for ETL lines to finish
        for etl in self.etl_list:
            etl.join()

        # TODO: logger; ETL step completed, proceeding with data
        # analysis and visualization
        print "ETL process finished for language %s and date %s" % (self.lang, self.date)

        # Create primary keys for all tables
        # TODO: This must also be tracked by official logging module
        print "Now creating primary key indexes in database tables."
        print "This may take a while..."
        db_pks = MySQLDB(host=host, port=port, user=db_user, passwd=db_passw, db=db_name)
        db_pks.connect()
        db_pks.create_pks()
        db_pks.close()
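The fan-out in this execute() method relies on a multiprocessing.JoinableQueue that holds one path per dump file plus one "STOP" sentinel per ETL line, so every worker exits once the queue drains. The following is a minimal, self-contained sketch of that pattern in isolation; the function and variable names are illustrative and not part of the project.

    # Queue/sentinel sketch: workers pull paths until they hit a "STOP" marker.
    import multiprocessing as mp

    def process_dump(path):
        # Stand-in for the real ETL work performed on a single dump file
        pass

    def worker(paths_queue):
        while True:
            path = paths_queue.get()
            if path == "STOP":
                paths_queue.task_done()
                break
            process_dump(path)
            paths_queue.task_done()

    def run_etl_lines(paths, num_lines=2):
        paths_queue = mp.JoinableQueue()
        for path in paths:
            paths_queue.put(path)
        for _ in range(num_lines):
            paths_queue.put("STOP")
        procs = [mp.Process(target=worker, args=(paths_queue,))
                 for _ in range(num_lines)]
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()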