    def __init__(self):
        # Runtime state for this manager (assumed here; in the full class these
        # may be class-level attributes)
        self.instance_list = []
        self.status = False

        # Load the configuration for the filters
        try:
            filter_list_from_config = Config.filters()
        except Exception as e:
            log("ERROR", "Filter configuration error: " + str(e))
            sys.exit()  # Configuration errors should prevent execution

        # Create (and empty if needed) base filter path
        self.filter_base_dir = os.path.join(Config.get('Saved_Log_Dir'),
                                            'temp_filter_processing')
        if os.path.isdir(self.filter_base_dir):
            shutil.rmtree(self.filter_base_dir)

        # Create staging (all original files go here) and final (all filtered files end up here) directories
        os.makedirs(os.path.join(self.filter_base_dir, 'staging'),
                    exist_ok=True)
        os.makedirs(os.path.join(self.filter_base_dir, 'final'), exist_ok=True)

        # Build the filter instance list
        for filter_from_config in filter_list_from_config:
            self.instance_list.append(
                FilterInstance(filter_from_config.get("name"),
                               filter_from_config.get("exe"),
                               filter_from_config.get("type"),
                               filter_from_config.get("timeout"),
                               self.filter_base_dir))

        log(
            "INFO",
            str(len(filter_list_from_config)) +
            " valid filters have been found")
    def cleanup(self):
        # Executes when there are no more jobs
        if self.status is True:
            log("ERROR", "Filter cleanup when filters are still running")

        # Delete all temp filter input/output folders
        # (which should be empty unless there was an error in a filter's termination)
        try:
            shutil.rmtree(self.filter_base_dir)
        except OSError as e:
            log(
                "ERROR",
                "When deleting the temporary filter processing folder, encountered an error: "
                + str(e))
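# What one entry returned by Config.filters() appears to contain, based on the
# keys read in __init__ above ("name", "exe", "type", "timeout"). The values
# below are illustrative assumptions, not the project's actual configuration.
EXAMPLE_FILTER_ENTRY = {
    "name": "example-filter",             # label used in log messages
    "exe": "/path/to/filter_executable",  # hypothetical executable path passed to FilterInstance
    "type": "whole",                      # 'whole' is the type checked by filter_runner below
    "timeout": 0                          # 0 appears to mean "no timeout"
}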
    def needs_update(self, job):
        # Checks if a job exists in the database and if it needs updating
        cur = self.conn.cursor()

        # NOTE: job_id and entry_name are interpolated directly from parsed
        # directory/file names; a parameterized query ('?' placeholders for
        # sqlite3, '%s' for mysql.connector) would be safer against odd names.
        cur.execute(
            "SELECT FrontendUsername, InstanceName, FileSize FROM file_index "
            "WHERE JobID='{}' AND EntryName='{}'".format(
                job["job_id"], job["entry_name"]))

        response = cur.fetchone()
        if response is None:
            # Does not exist in db
            return True
        else:
            # Does exist in db
            if response[0] != job["frontend_user"] or response[1] != job["instance_name"]:
                # Same JobID/EntryName under a different Frontend Username or Instance Name;
                # treat it as a conflict and do not update the existing entry
                log("ERROR", "Duplicate job found for either another Frontend Username or an Instance Name")
                return False
            elif int(job["err_file_size"] + job["out_file_size"]) > int(response[2]):
                # File size is greater, therefore update the job
                return True
            else:
                # Already added without a file size difference
                return False
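# A minimal usage sketch for needs_update(), assuming an already-connected
# Database instance `db`. The keys shown are exactly the ones the method
# reads; the values are hypothetical.
def example_needs_update_usage(db):
    sample_job = {
        "job_id": "1234",
        "entry_name": "example_entry",
        "frontend_user": "example_frontend",
        "instance_name": "example_instance",
        "err_file_size": 2048,
        "out_file_size": 512,
    }
    # Returns True if the job is new or its combined file size has grown
    return db.needs_update(sample_job)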
def main():
    # Parse command line arguments (if any)
    parser = argparse.ArgumentParser(
        description=
        "GlideinMonitor's indexing script for GlideIn .out & .err files")
    parser.add_argument('-c', help="Path to Config File")
    parser.add_argument('-f',
                        help="Ignore the lock file and force an index anyway",
                        action='store_true')
    args = parser.parse_args()

    # Process config file
    Config.init(args.c)

    # Check for index job lock
    lock_location = os.path.join(Config.get('Saved_Log_Dir'), "index_lock")
    if not pathlib.Path(lock_location).exists():
        try:
            os.mkdir(Config.get('Saved_Log_Dir'))
            log("INFO", "Creating new directory for index_lock")
        except FileExistsError:
            pass
        pathlib.Path(lock_location).touch()
    else:
        # Job index already running/did not complete
        if not args.f:
            log("ERROR", "Lock file present in saved log directory")
            return

    # Connect to the database
    db = Database()

    # Get list of job data that should be indexed
    job_index_list = determine_indexing(db)

    # Archive the original files
    archive_files(db, job_index_list)

    # Indexing & filtering complete
    db.commit()
    log("INFO", "Indexing Complete")

    # Delete the lock file
    os.remove(pathlib.Path(lock_location))
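# The lock-file handling above (and again in begin_indexing below) follows one
# pattern: refuse to run if index_lock exists unless -f was given, create it,
# then remove it when indexing finishes. A minimal sketch of that pattern as a
# reusable helper; the function name is hypothetical, not part of the module.
def acquire_index_lock(saved_log_dir, force=False):
    lock_path = pathlib.Path(os.path.join(saved_log_dir, "index_lock"))
    if lock_path.exists() and not force:
        # Another index run is active, or a previous run did not complete
        return None
    os.makedirs(saved_log_dir, exist_ok=True)
    lock_path.touch()
    return lock_path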
def begin_indexing(args):
    # Check for index job lock
    lock_location = os.path.join(Config.get('Saved_Log_Dir'), "index_lock")
    if not pathlib.Path(lock_location).exists():
        pathlib.Path(lock_location).touch()
    else:
        # Job index already running/did not complete
        if not args.f:
            log("ERROR", "Lock file present in saved log directory")
            return

    # Entry point for indexing
    db = Database()
    jobs_updated = 0
    saved_dir_name = Config.get('Saved_Log_Dir')
    datetime_name = datetime.datetime.now().strftime("%Y-%m-%d")

    log("INFO", "Begin Indexing")

    # Get a dictionary of jobs from the GWMS_Log_Dir directory
    tree = directory_jobs(Config.get('GWMS_Log_Dir'))

    log("INFO", "Directory Listing Completion")

    # Iterate through each job checking the database if it needs to be updated
    for job_name, job_data in tree.items():
        # Skip entries that are missing an err/out file
        if "err_file_path" not in job_data or "out_file_path" not in job_data:
            log(
                "INFO", "Missing ERR/OUT file for entry - jobID: " +
                job_data["entry_name"] + " - " + str(job_data["job_id"]))
            continue

        # Check if the current instance is in the database, if not then add it
        final_dir_name = os.path.join(saved_dir_name,
                                      job_data["instance_name"],
                                      job_data["frontend_user"], datetime_name)

        if db.needs_update(job_data):
            # Create the directory if it does not exist
            if not os.path.exists(final_dir_name):
                os.makedirs(final_dir_name)

            # Check if the file has certain logs within it
            found_logs = {
                "MasterLog": False,
                "StartdLog": False,
                "StarterLog": False,
                "StartdHistoryLog": False,
                "glidein_activity": False
            }
            if job_data['err_file_size'] != 0:
                with open(job_data["err_file_path"], 'rb',
                          0) as file, mmap.mmap(file.fileno(),
                                                0,
                                                access=mmap.ACCESS_READ) as s:
                    if s.find(b'MasterLog\n========') != -1:
                        found_logs["MasterLog"] = True
                    if s.find(b'StartdLog\n========') != -1:
                        found_logs["StartdLog"] = True
                    if s.find(b'StarterLog\n========') != -1:
                        found_logs["StarterLog"] = True
                    if s.find(b'StartdHistoryLog\n========') != -1:
                        found_logs["StartdHistoryLog"] = True
                    if s.find(
                            b'=== Encoded XML description of glidein activity ==='
                    ) != -1:
                        found_logs["glidein_activity"] = True

            # Tar the output and error file
            curr_job_path = os.path.join(
                final_dir_name, job_name[0] + "_" + job_name[1] + "_" +
                job_name[2] + ".tar.gz")
            with tarfile.open(curr_job_path, "w:gz") as tar:
                tar.add(job_data["out_file_path"],
                        arcname=os.path.basename(job_data["out_file_path"]))
                tar.add(job_data["err_file_path"],
                        arcname=os.path.basename(job_data["err_file_path"]))

            # Add/Update it in the database
            db.add_job(job_data, curr_job_path, found_logs)

            # Job added/updated
            jobs_updated += 1

    # Indexing complete
    db.commit()

    # Delete the lock file
    os.remove(pathlib.Path(lock_location))

    log("INFO", "Jobs added/updated " + str(jobs_updated))
    log("INFO", "Indexing Complete")
    def filter_runner(base_directory, instance_list, job_queue, messageQueue):
        # Variables within runner
        master_jobs_list = {}
        unpacked_expectations = {}

        staging_dir = os.path.join(base_directory, 'staging')
        final_dir = os.path.join(base_directory, 'final')

        timeout_stepper = 0
        timeout_start = None
        completion_check = False

        # The thread runner method, runs until cleanup method
        while len(instance_list) != 0:
            # Check for messages
            for i in range(0, messageQueue.qsize()):
                # Get the message
                the_message = messageQueue.get()

                # Check for timeout
                if the_message == 'TIMEOUT_START':
                    # Timeout received
                    timeout_start = time.time()

            # Decide whether this pass should check for completion
            if timeout_start:
                completion_check = True

            # Process any new jobs
            for i in range(0, job_queue.qsize()):
                # Get the next job passed from the indexer
                completion_check = False
                next_job = job_queue.get()

                # Generate a unique ID for the job
                job_uuid = str(uuid.uuid1())
                while job_uuid in master_jobs_list:
                    job_uuid = str(uuid.uuid1())

                # Add it to the master list, used at the end to build the archive:
                # (archive path, .out file name, .err file name)
                master_jobs_list[job_uuid] = (
                    next_job[0],
                    os.path.basename(next_job[1]["out_file_path"]),
                    os.path.basename(next_job[1]["err_file_path"]))

                # New job, move all files to the staging directory
                shutil.copyfile(next_job[1]['err_file_path'],
                                os.path.join(staging_dir, job_uuid + '.err'))
                shutil.copyfile(next_job[1]['out_file_path'],
                                os.path.join(staging_dir, job_uuid + '.out'))

            # Go through each filter from the top, moving files as necessary
            for i in range(len(instance_list)):
                # Check for new files and move/convert if needed depending on the type
                if i == 0:
                    # Check the staging directory if it's on the first filter
                    if instance_list[i].type == 'whole':
                        new_files = Filter.move_files(
                            staging_dir, instance_list[i].input_directory)
                    else:
                        new_files, updating_expectations = Filter.type_moving(
                            staging_dir, instance_list[i].input_directory,
                            instance_list[i].type, unpacked_expectations)
                        unpacked_expectations.update(updating_expectations)
                else:
                    if instance_list[i].type == instance_list[i - 1].type:
                        new_files = Filter.move_files(
                            instance_list[i - 1].output_directory,
                            instance_list[i].input_directory)
                    else:
                        new_files, updating_expectations = Filter.type_moving(
                            instance_list[i - 1].output_directory,
                            instance_list[i].input_directory,
                            instance_list[i].type, unpacked_expectations)
                        unpacked_expectations.update(updating_expectations)

                # If it didn't move any files, check if there are any files still in the input directory
                if not new_files:
                    new_files = Filter.dir_empty_check(
                        instance_list[i].input_directory)

                # If there are new files, start the filter executable (unless it is already running)
                if new_files:
                    completion_check = False

                    # Timeout check
                    if timeout_start is not None and timeout_stepper == i:
                        # The timeout has already started and is currently waiting on this filter;
                        # check whether this filter needs to be stopped
                        curr_filter_timeout = instance_list[i].exe_timeout
                        # A timeout of 0 means "no timeout"
                        if curr_filter_timeout != 0 and time.time() - timeout_start > curr_filter_timeout:
                            # Stop the process and delete the files from its input directory
                            print("Timeout reached")
                            instance_list[i].exe_subprocess.terminate()

                            for root, dirs, files in os.walk(
                                    instance_list[i].input_directory):
                                for f in files:
                                    os.remove(os.path.join(root, f))

                            log(
                                "ERROR", "Filter \"" + instance_list[i].name +
                                "\" timeout reached")

                    if instance_list[i].exe_subprocess is None or instance_list[
                            i].exe_subprocess.poll() is not None:
                        # If the filter did run, check the return code
                        if instance_list[i].exe_subprocess is not None:
                            # Get the return code
                            return_code_from_filter_exe = instance_list[
                                i].exe_subprocess.poll()

                            # Check if the return code signals a failure
                            if return_code_from_filter_exe != 0:
                                # The filter exe errored out, submit the error to the log and terminate filters running
                                log(
                                    "ERROR",
                                    "The filter \"" + instance_list[i].name +
                                    "\" encountered a return code of: " +
                                    str(return_code_from_filter_exe) +
                                    "\nTherefore the filters have been terminated, no filter archive created."
                                )
                                return

                        # Start the exe of the filter instance with the appropriate arguments
                        instance_list[i].exe_subprocess = subprocess.Popen([
                            instance_list[i].exe_full_path, "-i",
                            instance_list[i].input_directory, "-o",
                            instance_list[i].output_directory
                        ])
                else:
                    # Check if the timeout was waiting on this filter
                    if timeout_start is not None and timeout_stepper == i:
                        # It was waiting on this filter, and it finished processing files - move to the next filter
                        timeout_stepper += 1
                        timeout_start = time.time()

            # Check the last filter's output folder and move any files to the final directory
            if instance_list[len(instance_list) - 1].type == 'whole':
                new_files = Filter.move_files(
                    instance_list[len(instance_list) - 1].output_directory,
                    final_dir)
            else:
                new_files, updating_expectations = Filter.type_moving(
                    instance_list[len(instance_list) - 1].output_directory,
                    final_dir, 'whole', unpacked_expectations)
                unpacked_expectations.update(updating_expectations)

            if new_files:
                # New files, package them if both the .err and .out available
                # Currently, jobs getting to the beginning of the filtering stage will have both .err and .out

                # List all files in the final directory
                final_dir_files = [
                    file for file in os.listdir(final_dir)
                    if os.path.isfile(os.path.join(final_dir, file))
                ]

                final_dir_jobs = []
                # Create a list containing complete jobs
                for file in final_dir_files:
                    #  Get the current file's uuid and extension
                    job_uuid, curr_ext = os.path.splitext(file)

                    # Check if the file's job has already been added
                    if job_uuid in final_dir_jobs:
                        continue

                    # Get the other job extension (if the current file is a .out, the other is .err and vice versa)
                    other_ext = '.out'
                    if curr_ext == '.out':
                        other_ext = '.err'

                    # Check if the other file is in the folder
                    if job_uuid + other_ext in final_dir_files:
                        # Both the .out and the .err for this job are present, so it is complete
                        final_dir_jobs.append(job_uuid)

                # For all complete jobs, archive them
                for completed_job_uuid in final_dir_jobs:
                    # Save the original immediately
                    with tarfile.open(master_jobs_list[completed_job_uuid][0],
                                      "w:gz") as tar:
                        tar.add(os.path.join(final_dir,
                                             completed_job_uuid + ".out"),
                                arcname=os.path.basename(
                                    master_jobs_list[completed_job_uuid][1]))
                        tar.add(os.path.join(final_dir,
                                             completed_job_uuid + ".err"),
                                arcname=os.path.basename(
                                    master_jobs_list[completed_job_uuid][2]))

                    # Then delete the .out and .err files
                    os.remove(
                        os.path.join(final_dir, completed_job_uuid + ".out"))
                    os.remove(
                        os.path.join(final_dir, completed_job_uuid + ".err"))

            # Check if filtering is complete
            if completion_check:
                log("INFO", "Filtering Complete")
                return

            # Thread sleep before checking again
            time.sleep(0.5)
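# Filter.move_files(), Filter.type_moving() and Filter.dir_empty_check() are
# referenced above but not included in this listing. A minimal sketch of what
# a move_files()-style helper could look like, assuming it returns True when
# at least one file was moved; this is an illustration, not the project's
# actual implementation.
def move_files_sketch(src_dir, dst_dir):
    moved_any = False
    for name in os.listdir(src_dir):
        src_path = os.path.join(src_dir, name)
        if os.path.isfile(src_path):
            shutil.move(src_path, os.path.join(dst_dir, name))
            moved_any = True
    return moved_any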
def determine_indexing(db):
    # Entry point for indexing
    jobs_updated = 0

    log("INFO", "Begin Indexing")

    # Get a dictionary of jobs from the GWMS_Log_Dir directory
    tree = directory_jobs(Config.get('GWMS_Log_Dir'))

    log("INFO", "Directory Listing Completion")

    # List to be exported
    job_index_list = []

    # Iterate through each job checking the database if it needs to be updated
    for job_name, job_data in tree.items():
        # Skip entries that are missing an '.err'/'.out' file
        if "err_file_path" not in job_data or "out_file_path" not in job_data:
            log(
                "INFO", "Missing ERR/OUT file for entry - jobID: " +
                job_data["entry_name"] + " - " + str(job_data["job_id"]))
            continue

        if db.needs_update(job_data):
            # Check if the file has certain logs within it
            found_logs = {
                "MasterLog": False,
                "StartdLog": False,
                "StarterLog": False,
                "StartdHistoryLog": False,
                "glidein_activity": False
            }
            if job_data['err_file_size'] != 0:
                with open(job_data["err_file_path"], 'rb',
                          0) as file, mmap.mmap(file.fileno(),
                                                0,
                                                access=mmap.ACCESS_READ) as s:
                    if s.find(b'MasterLog\n========') != -1:
                        found_logs["MasterLog"] = True
                    if s.find(b'StartdLog\n========') != -1:
                        found_logs["StartdLog"] = True
                    if s.find(b'StarterLog\n========') != -1:
                        found_logs["StarterLog"] = True
                    if s.find(b'StartdHistoryLog\n========') != -1:
                        found_logs["StartdHistoryLog"] = True
                    if s.find(
                            b'=== Encoded XML description of glidein activity ==='
                    ) != -1:
                        found_logs["glidein_activity"] = True

            # Add found logs into the job data
            job_data.update(found_logs)

            # Add the job to list to be indexed
            job_index_list.append(job_data)

            # Job added/updated
            jobs_updated += 1

    log("INFO", "Jobs to be added/updated " + str(jobs_updated))

    return job_index_list
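# Each element of job_index_list is the job_data dictionary with the found-log
# booleans merged in. A small illustrative consumer (the function name is
# hypothetical): count how many jobs contained a MasterLog section.
def count_jobs_with_masterlog(job_index_list):
    return sum(1 for job_data in job_index_list if job_data.get("MasterLog"))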
    def __init__(self):
        # Connect to SQLite unless specified otherwise in the config file
        if Config.db("type") == "sqlite":
            # SQLite Database
            try:
                os.mkdir(Config.db("dir"))
                log("INFO", "Creating new directory for SQLite DB")
            except FileExistsError:
                pass
            self.conn = sqlite3.connect(os.path.join(Config.db("dir"), "%s.sqlite" % Config.db("db_name")))

            # Check if index table exists
            db_cursor = self.conn.cursor()
            db_cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='file_index';")

            if db_cursor.fetchone() is None:
                # It doesn't, create it
                log("INFO", "Creating new SQLite database")

                sql_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                        "sqliteTableCreation.sql")
                with open(sql_path, 'r') as script_file:
                    script = script_file.read()
                db_cursor.executescript(script)
        elif Config.db("type") == "mysql":
            # MySQL Database
            if not MYSQL_AVAILABLE:
                log("ERROR", "MySQL database selected but there is no MySQL connector")
                raise ImportError("Module not found: mysql.connector")
            try:
                self.conn = mysql.connector.connect(
                    host=Config.db("host"),
                    user=Config.db("user"),
                    passwd=Config.db("pass"),
                    database=Config.db("db_name")
                )

                mycursor = self.conn.cursor()
            except mysql.connector.errors.ProgrammingError:
                # Create the database
                log("INFO", "Creating new MySQL Database")
                mydb = mysql.connector.connect(
                    host=Config.db("host"),
                    user=Config.db("user"),
                    passwd=Config.db("pass")
                )

                mycursor = mydb.cursor()
                mycursor.execute("CREATE DATABASE " + Config.db("db_name"))

                self.conn = mysql.connector.connect(
                    host=Config.db("host"),
                    user=Config.db("user"),
                    passwd=Config.db("pass"),
                    database=Config.db("db_name")
                )

                mycursor = self.conn.cursor()

            # Check if the table exists
            mycursor.execute("SHOW TABLES")

            if ('file_index',) not in mycursor.fetchall():
                # Create table
                log("INFO", "Creating MySQL File Index table")
                sql_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                        "mysqlTableCreation.sql")
                with open(sql_path, 'r') as script_file:
                    script = script_file.read()
                mycursor.execute(script)
        else:
            log("ERROR", "No valid database selected (%s)" % Config.db("type"))
            raise ImportError("Invalid database type: %s" % Config.db("type"))
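# The table-creation scripts (sqliteTableCreation.sql / mysqlTableCreation.sql)
# are not part of this listing. A hedged sketch limited to the columns that the
# rest of this code actually references (needs_update() reads JobID, EntryName,
# FrontendUsername, InstanceName and FileSize); the real schema almost
# certainly contains more columns.
FILE_INDEX_TABLE_SKETCH = """
CREATE TABLE IF NOT EXISTS file_index (
    JobID            TEXT,
    EntryName        TEXT,
    FrontendUsername TEXT,
    InstanceName     TEXT,
    FileSize         INTEGER
);
"""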