def __init__(self):
    """Initialize per-facility path/ownership settings, then the base Task.

    All maps are keyed by login-host name for the two analysis
    facilities (PDC Tegner and UChicago Midway).
    """
    # Raw and processed data directories per facility
    self.raw_data = {"tegner-login-1": "/cfs/klemming/projects/xenon/xenon1t/raw/",
                     "midway-login1": "/project2/lgrandi/xenon1t/raw/"}
    self.proc_data = {"tegner-login-1": "/cfs/klemming/projects/xenon/xenon1t/processed/",
                      "midway-login1": "/project/lgrandi/xenon1t/processed/"}
    # Ownership (user:group) and permission bits applied to data files
    self.chown_user = {"tegner-login-1": "bobau",
                       "midway-login1": "pdeperio"}
    self.chown_group = {"tegner-login-1": "xenon-users",
                        "midway-login1": "xenon1t-admins"}
    self.chmod = {"tegner-login-1": '750',
                  "midway-login1": '755'}

    Task.__init__(self)

    # Cache this host's configuration section and name for later use
    self.hostname_config = config.get_config(config.get_hostname())
    self.hostname = config.get_hostname()
def each_location(self, data_doc):
    """Checksum one local data location and record the result in the run DB.

    Acts only on entries in 'verifying' status that are hosted on this
    machine; on success the entry becomes 'transferred', on a missing
    file it becomes 'error'.
    """
    # Only data waiting to be verified
    if data_doc['status'] != 'verifying':  # and data_doc['status'] != 'transferred':
        self.log.debug('Location '+data_doc['host']+' does not need to add new checksum')
        return

    # On these two hosts, already-transferred data is left alone
    if data_doc['status'] == 'transferred' and \
            (config.get_hostname() == 'xe1t-datamanager' or
             config.get_hostname() == 'login'):
        return

    # Data must be hosted somewhere
    if 'host' not in data_doc:
        return

    # Data must be here locally
    if data_doc['host'] != config.get_hostname():
        # Special case of midway-srm accessible via POSIX on midway-login1
        if not (data_doc['host'] == "midway-srm" and
                config.get_hostname() == "midway-login1"):
            self.log.debug('Location not here')
            return

    # This status is given after checksumming
    status = 'transferred'

    # Find file and perform checksum (directory vs single file)
    if os.path.isdir(data_doc['location']):
        value = checksumdir.dirhash(data_doc['location'], 'sha512')
    elif os.path.isfile(data_doc['location']):
        value = checksumdir._filehash(data_doc['location'], hashlib.sha512)
    else:
        # Data not actually found
        self.log.error("Location %s not found." % data_doc['location'])
        value = None
        status = 'error'

    if config.DATABASE_LOG:
        if data_doc['status'] == 'verifying':
            # First checksum for this entry: store it and mark the status
            self.log.info("Adding a checksum to run "
                          "%d %s" % (self.run_doc['number'],
                                     data_doc['type']))
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {'$elemMatch': data_doc}},
                                   {'$set': {'data.$.status': status,
                                             'data.$.checksum': value}})
        elif data_doc['checksum'] != value or status == 'error':
            # Re-checksum disagrees (or file missing): flag the entry
            self.log.info("Checksum fail "
                          "%d %s" % (self.run_doc['number'],
                                     data_doc['type']))
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {'$elemMatch': data_doc}},
                                   {'$set': {'data.$.checksumproblem': True}})
def each_run(self):
    """Run over the requested data types according to the json config file.

    For every data type declared under 'data_type' in this host's
    configuration, attempt the configured transfers.
    """
    # Hoisted: config.get_config() was previously called twice per run
    host_config = config.get_config(config.get_hostname())

    # A host must declare which data types it handles; this is a fatal
    # misconfiguration, so report it at ERROR level (was logging.info).
    if 'data_type' not in host_config:
        logging.error("Error: Define a data_type in your configuration file")
        logging.error(" (e.g. 'data_type': ['raw'])")
        exit()

    for data_type in host_config['data_type']:
        self.log.debug("%s" % data_type)
        self.do_possible_transfers(option_type=self.option_type,
                                   data_type=data_type)
def each_run(self):
    """Run over the requested data types according to the json config file.

    For every data type declared under 'data_type' in this host's
    configuration, attempt the configured transfers.
    """
    # Hoisted: config.get_config() was previously called twice per run
    host_config = config.get_config(config.get_hostname())

    # A host must declare which data types it handles; this is a fatal
    # misconfiguration, so report it at ERROR level (was logging.info).
    if 'data_type' not in host_config:
        logging.error(
            "Error: Define a data_type in your configuration file")
        logging.error("      (e.g. 'data_type': ['raw'])")
        exit()

    for data_type in host_config['data_type']:
        self.log.debug("%s" % data_type)
        self.do_possible_transfers(option_type=self.option_type,
                                   data_type=data_type)
def each_location(self, data_doc):
    """Check every location with data whether it should be purged.

    Purges only processed data on this host whose pax version matches
    the version configured for purging.
    """
    self.log.debug("Checking purge logic")

    # Skip places where we can't locally access data
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return

    # See if purge settings specified, otherwise don't purge.
    # (The original `not x or x == None` second test was unreachable:
    # `not x` is already True when x is None.)
    purge_version = config.purge_version()
    if not purge_version:
        self.log.debug("No processed version specified for purge, skipping")
        return

    # Never purge raw data here; this task targets processed data only
    # (the original comment said "processed" over a raw check).
    if data_doc['type'] == 'raw':
        self.log.debug("Do not purge raw data")
        return

    # Check pax version of processed run against the configured one
    if data_doc['pax_version'] != purge_version:
        self.log.debug("Don't purge this version: %s" % (data_doc['pax_version']))
        return

    # The purge data
    self.log.info("Purging %s" % data_doc['location'])
    self.purge(data_doc)
    return
def each_location(self, data_doc):
    """Check every location with data whether it should be purged. """
    # Only consider data hosted on this machine
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return

    # Processed data is handled by PurgeProcessed instead
    if data_doc['type'] == 'processed':
        self.log.debug("Do not purge processed data")
        return

    self.log.debug("Checking purge logic")

    # Only fully transferred data may be purged
    if data_doc["status"] != "transferred":
        self.log.debug("Not transfered")
        return

    # Deleting one copy must still leave at least two verified copies
    num_copies = self.check(data_doc['type'], warn=False)
    if num_copies < 3:
        self.log.debug("Not enough copies (%d)" % num_copies)
        return

    # Guard clause instead of if/else: time-based requirement last
    if not self.check_purge_requirements():
        self.log.debug("Not enough time elapsed")
        return

    self.log.info("Purging %s" % data_doc['location'])
    self.purge(data_doc)
def each_run(self):
    """Remove self.location from disk and drop its run-database entry."""
    # For each data location, see if this filename in it
    for data_doc in self.run_doc['data']:
        # Is not local, skip
        if 'host' not in data_doc or \
                data_doc['host'] != config.get_hostname():
            continue
        if data_doc['location'] != self.location:
            continue

        # Notify run database
        # NOTE(review): the DB entry is pulled before the filesystem
        # removal below; if the removal then fails, DB and disk
        # disagree — confirm this ordering is intended.
        if config.DATABASE_LOG is True:
            self.collection.update({'_id': self.run_doc['_id']},
                                   {'$pull': {'data': data_doc}})

        # Perform operation
        self.log.info("Removing %s" % (self.location))
        if os.path.isdir(data_doc['location']):
            shutil.rmtree(data_doc['location'])
        else:
            os.remove(self.location)
        break
def get_queue(host=None, partition=''):
    """Get list of jobs in queue.

    :param host: cluster login host to query; defaults to this machine.
        (Previously the default was bound at import time via
        ``config.get_hostname()``; resolving it at call time is
        equivalent for callers and avoids import-order surprises.)
    :param partition: optional SLURM partition to restrict the query to.
    :returns: list of job names in the queue, or [] for unknown hosts
        or on any squeue failure.
    """
    if host is None:
        host = config.get_hostname()

    # Per-site squeue defaults (partition and user to filter on)
    if host == "midway-login1":
        args = {'partition': 'sandyb', 'user': config.get_user()}
    elif host == 'tegner-login-1':
        args = {'partition': 'main', 'user': 'bobau'}
    else:
        return []

    if partition == '':
        command = 'squeue --user={user} -o "%.30j"'.format(**args)
    else:
        args['partition'] = partition
        command = 'squeue --partition={partition} --user={user} -o "%.30j"'.format(**args)

    try:
        queue = subprocess.check_output(command, shell=True, timeout=120)
    except subprocess.TimeoutExpired as e:
        logging.error("Process timeout")
        return []
    except Exception as e:
        # Best-effort: any other squeue failure yields an empty queue
        logging.exception(e)
        return []

    queue_list = queue.rstrip().decode('ascii').split()
    if len(queue_list) > 1:
        # First token is the squeue header ("NAME"); drop it
        return queue_list[1:]
    return []
def each_location(self, data_doc):
    """Check every location with data whether it should be purged.

    Purges only processed data on this host whose pax version matches
    the version configured for purging.
    """
    self.log.debug("Checking purge logic")

    # Skip places where we can't locally access data
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return

    # See if purge settings specified, otherwise don't purge.
    # (The original `not x or x == None` second test was unreachable:
    # `not x` is already True when x is None.)
    purge_version = config.purge_version()
    if not purge_version:
        self.log.debug(
            "No processed version specified for purge, skipping")
        return

    # Never purge raw data here; this task targets processed data only
    # (the original comment said "processed" over a raw check).
    if data_doc['type'] == 'raw':
        self.log.debug("Do not purge raw data")
        return

    # Check pax version of processed run against the configured one
    if data_doc['pax_version'] != purge_version:
        self.log.debug("Don't purge this version: %s" %
                       (data_doc['pax_version']))
        return

    # The purge data
    self.log.info("Purging %s" % data_doc['location'])
    self.purge(data_doc)
    return
def each_run(self):
    """Submit hax (minitree) processing for this run.

    Only runs on midway-login1 and only when processed data for the
    current pax version exists locally and the run is not already
    queued.
    """
    thishost = config.get_hostname()

    # This task is Midway-only
    if thishost != 'midway-login1':
        return

    version = 'v%s' % pax.__version__
    have_processed, have_raw = self.local_data_finder(thishost, version)

    # Skip if no processed data (the check is on processed, not raw)
    if not have_processed:
        self.log.debug("Skipping %s with no processed data",
                       self.run_doc['name'])
        return

    in_location = config.get_processing_dir(thishost, version)
    out_location = config.get_minitrees_dir(thishost, version)

    queue_list = qsub.get_queue(thishost)
    # Should check version here too
    if self.run_doc['name'] in queue_list:
        self.log.debug("Skipping %s currently in queue",
                       self.run_doc['name'])
        return

    self.log.info("Processing %s with hax_%s, output to %s",
                  self.run_doc['name'], version, out_location)

    _process_hax(self.run_doc['name'], in_location, thishost,
                 version, out_location, self.run_doc['detector'])
def each_run(self):
    """Move self.input to self.output and update the run database.

    Acts only when this run has a data entry on this host whose
    location matches self.input.
    """
    # For each data location, see if this filename in it
    for data_doc in self.run_doc['data']:
        # Is not local, skip
        if 'host' not in data_doc or \
                data_doc['host'] != config.get_hostname():
            continue
        if data_doc['location'] != self.input:
            continue

        self.log.info("Moving %s to %s" % (self.input, self.output))

        # Perform renaming.  On failure, leave the DB untouched so it
        # keeps pointing at the still-existing old location.
        # (Previously the exception was print()ed and the DB was
        # updated to a location that did not exist.)
        try:
            shutil.move(self.input, self.output)
        except Exception as e:
            self.log.error("Move failed: %s" % e)
            return

        # Notify run database
        if config.DATABASE_LOG is True:
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {'$elemMatch': data_doc}},
                                   {'$set': {'data.$.location': self.output}})
        break
def each_run(self):
    """Submit hax (minitree) processing for this run on this host.

    Requires locally-present processed data for the current pax
    version; skips runs already in the batch queue.
    """
    thishost = config.get_hostname()
    hax_version = 'v%s' % hax.__version__
    pax_version = 'v%s' % pax.__version__
    have_processed, have_raw = self.local_data_finder(thishost, pax_version)

    # Skip if no processed data
    if not have_processed:
        self.log.debug("Skipping %s with no processed data",
                       self.run_doc['name'])
        return

    # Input is the directory containing the processed file
    in_location = os.path.dirname(have_processed['location'])
    out_location = config.get_minitrees_dir(thishost, pax_version)

    queue_list = qsub.get_queue(thishost)
    # Should check version here too
    if self.run_doc['name'] in queue_list:
        self.log.debug("Skipping %s currently in queue",
                       self.run_doc['name'])
        return

    self.log.info("Processing %s (%s) with hax_%s, output to %s",
                  self.run_doc['name'], pax_version, hax_version,
                  out_location)

    _process_hax(self.run_doc['name'], in_location, thishost,
                 pax_version, out_location, self.run_doc['detector'])
def get_queue(host=None, partition=''):
    """Get list of jobs in queue.

    :param host: cluster login host to query; defaults to this machine.
        (Previously the default was bound at import time via
        ``config.get_hostname()``; resolving it at call time is
        equivalent for callers and avoids import-order surprises.)
    :param partition: optional SLURM partition to restrict the query to.
    :returns: list of job names in the queue, or [] for unknown hosts
        or on any squeue failure.
    """
    if host is None:
        host = config.get_hostname()

    # Per-site squeue defaults (partition and user to filter on)
    if host == "midway-login1":
        args = {'partition': 'sandyb', 'user': config.get_user()}
    elif host == 'tegner-login-1':
        args = {'partition': 'main', 'user': '******'}
    else:
        return []

    if partition == '':
        command = 'squeue --user={user} -o "%.30j"'.format(**args)
    else:
        args['partition'] = partition
        command = 'squeue --partition={partition} --user={user} -o "%.30j"'.format(
            **args)

    try:
        queue = subprocess.check_output(command, shell=True, timeout=120)
    except subprocess.TimeoutExpired as e:
        logging.error("Process timeout")
        return []
    except Exception as e:
        # Best-effort: any other squeue failure yields an empty queue
        logging.exception(e)
        return []

    queue_list = queue.rstrip().decode('ascii').split()
    if len(queue_list) > 1:
        # First token is the squeue header ("NAME"); drop it
        return queue_list[1:]
    return []
def each_run(self):
    """Move self.input to self.output and update the run database.

    Acts only when this run has a data entry on this host whose
    location matches self.input.
    """
    # For each data location, see if this filename in it
    for data_doc in self.run_doc['data']:
        # Is not local, skip
        if 'host' not in data_doc or \
                data_doc['host'] != config.get_hostname():
            continue
        if data_doc['location'] != self.input:
            continue

        self.log.info("Moving %s to %s" % (self.input, self.output))

        # Perform renaming.  On failure, leave the DB untouched so it
        # keeps pointing at the still-existing old location.
        # (Previously the exception was print()ed and the DB was
        # updated to a location that did not exist.)
        try:
            shutil.move(self.input, self.output)
        except Exception as e:
            self.log.error("Move failed: %s" % e)
            return

        # Notify run database
        if config.DATABASE_LOG is True:
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {'$elemMatch': data_doc}},
                                   {'$set': {'data.$.location': self.output}})
        break
def purge(self, data_doc, delete_data=True):
    """Delete the data at data_doc['location'] and drop its run-DB entry.

    :param data_doc: run-DB data entry being purged (uses 'location').
    :param delete_data: when False, only the DB record is removed.
    """
    if delete_data is True:
        self.log.info("Deleting %s" % data_doc['location'])

        # Temporary hardcoded check for gfal-rm removal
        if config.get_hostname() == 'login' and 'raw' in data_doc['location']:
            config_original = config.get_config('login')
            server = config_original['hostname']

            # Grid certificate is optional
            if config.get_cert() == None:
                grid_cert = ''
            else:
                grid_cert = config.get_cert()

            full_command = "gfal-rm -v -r --cert %s " % grid_cert + \
                           server+data_doc['location']
            self.log.info(full_command)

            try:
                gfal_out = subprocess.check_output(
                    full_command, stderr=subprocess.STDOUT, shell=True)
            except subprocess.CalledProcessError as gfal_exec:
                self.log.error(gfal_exec.output.rstrip().decode('ascii'))
                self.log.error("Error: gfal-rm status = %d\n" %
                               gfal_exec.returncode)
                raise

            gfal_out_ascii = gfal_out.rstrip().decode('ascii')
            # Some errors don't get caught above
            if "error" in gfal_out_ascii.lower():
                self.log.error(gfal_out_ascii)
                # NOTE(review): bare `raise` here has no active
                # exception, so it raises RuntimeError — confirm
                # callers expect that.
                raise
            else:
                self.log.info(gfal_out_ascii)  # To print timing

        # Default POSIX removal
        else:
            if os.path.isdir(data_doc['location']):
                shutil.rmtree(data_doc['location'])
                self.log.info('Deleted, notify run database.')
            elif os.path.isfile(data_doc['location']):
                os.remove(data_doc['location'])
            else:
                self.log.error('did not exist, notify run database.')

    # Always remove the entry from the run database (even when the
    # on-disk data was kept via delete_data=False)
    if config.DATABASE_LOG == True:
        resp = self.collection.update({'_id': self.run_doc['_id']},
                                      {'$pull': {'data': data_doc}})
        self.log.info('Removed from run database: %s' % data_doc['location'])
        self.log.debug(resp)
def get_daq_buffer(self):
    """Return the untriggered DAQ-buffer entry hosted on 'reader'.

    Only yields a match when this process runs on 'eb0'; returns None
    otherwise or when no such entry exists.
    """
    for entry in self.run_doc['data']:
        is_buffer = (entry['type'] == 'untriggered' and
                     entry['host'] == 'reader')
        if is_buffer and config.get_hostname() == 'eb0':
            return entry
    # Not found
    return None
def __init__(self):
    """Initialize per-facility path/ownership settings, then the base Task.

    All maps are keyed by login-host name for the two analysis
    facilities (PDC Tegner and UChicago Midway).
    """
    # Raw and processed data directories per facility
    self.raw_data = {"tegner-login-1": "/cfs/klemming/projects/xenon/xenon1t/raw/",
                     "midway-login1": "/project2/lgrandi/xenon1t/raw/"}
    self.proc_data = {"tegner-login-1": "/cfs/klemming/projects/xenon/xenon1t/processed/",
                      "midway-login1": "/project/lgrandi/xenon1t/processed/"}
    # Ownership (user:group) and permission bits applied to data files
    self.chown_user = {"tegner-login-1": "bobau",
                       "midway-login1": "pdeperio"}
    self.chown_group = {"tegner-login-1": "xenon-users",
                        "midway-login1": "xenon1t-admins"}
    self.chmod = {"tegner-login-1": '750',
                  "midway-login1": '755'}

    Task.__init__(self)

    # Cache this host's configuration section and name for later use
    self.hostname_config = config.get_config(config.get_hostname())
    self.hostname = config.get_hostname()
def purge(self, data_doc, delete_data=True):
    """Delete the data at data_doc['location'] and drop its run-DB entry.

    :param data_doc: run-DB data entry being purged (uses 'location').
    :param delete_data: when False, only the DB record is removed.
    """
    if delete_data is True:
        self.log.info("Deleting %s" % data_doc['location'])

        # Temporary hardcoded check for gfal-rm removal
        if config.get_hostname() == 'login' and 'raw' in data_doc['location']:
            config_original = config.get_config('login')
            server = config_original['hostname']

            # Grid certificate is optional
            if config.get_cert() == None:
                grid_cert = ''
            else:
                grid_cert = config.get_cert()

            full_command = "gfal-rm -v -r --cert %s " % grid_cert + \
                           server+data_doc['location']
            self.log.info(full_command)

            try:
                gfal_out = subprocess.check_output(full_command,
                                                   stderr=subprocess.STDOUT,
                                                   shell=True)
            except subprocess.CalledProcessError as gfal_exec:
                self.log.error(gfal_exec.output.rstrip().decode('ascii'))
                self.log.error("Error: gfal-rm status = %d\n" %
                               gfal_exec.returncode)
                raise

            gfal_out_ascii = gfal_out.rstrip().decode('ascii')
            # Some errors don't get caught above
            if "error" in gfal_out_ascii.lower():
                self.log.error(gfal_out_ascii)
                # NOTE(review): bare `raise` here has no active
                # exception, so it raises RuntimeError — confirm
                # callers expect that.
                raise
            else:
                self.log.info(gfal_out_ascii)  # To print timing

        # Default POSIX removal
        else:
            if os.path.isdir(data_doc['location']):
                shutil.rmtree(data_doc['location'])
                self.log.info('Deleted, notify run database.')
            elif os.path.isfile(data_doc['location']):
                os.remove(data_doc['location'])
            else:
                self.log.error('did not exist, notify run database.')

    # Always remove the entry from the run database (even when the
    # on-disk data was kept via delete_data=False)
    if config.DATABASE_LOG == True:
        resp = self.collection.update({'_id': self.run_doc['_id']},
                                      {'$pull': {'data': data_doc}})
        self.log.info('Removed from run database: %s' % data_doc['location'])
        self.log.debug(resp)
def each_location(self, data_doc):
    """Flag, and when appropriate purge, stalled or errored transfers.

    A transfer is stale when neither the file mtime nor the DB
    creation_time has advanced for over 2 hours; after 24 hours (or an
    'error' status away from xe1t-datamanager) the entry is purged.
    """
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return  # Skip places where we can't locally access data

    if 'creation_time' not in data_doc:
        self.log.warning("No creation time for %s" % str(data_doc))
        return

    # How long has transfer been ongoing (mtime of the landed file;
    # 0, i.e. the epoch, when it does not exist yet)
    try:
        time_modified = os.stat(data_doc['location']).st_mtime
    except FileNotFoundError:
        time_modified = 0
    time_modified = datetime.datetime.fromtimestamp(time_modified)

    time_made = data_doc['creation_time']
    # Some RunsDB entries are different format for some reason (#40)
    if isinstance(time_made, list):
        # Assume only one list entry that contains the time
        time_made = time_made[0]

    # Age measured from the most recent sign of life
    difference = datetime.datetime.utcnow() - max(time_modified, time_made)

    if data_doc["status"] == "transferred" or data_doc["status"] == "verifying":
        return  # Transfer went fine

    self.log.debug(difference)

    if difference > datetime.timedelta(hours=2):  # If stale transfer
        self.give_error("Transfer %s from run %d (%s) lasting more than "
                        "2 hours" % (data_doc['type'],
                                     self.run_doc['number'],
                                     self.run_doc['name']))

    # Do not delete stalled or failed raw data transfers to recover with rsync
    # (Warning: do not use scp, which may create nested directories)
    delete_data = (data_doc['type'] == 'processed' and
                   'v%s' % pax.__version__ == data_doc['pax_version'])

    if difference > datetime.timedelta(hours=24):
        self.give_error("Transfer lasting more than 24 hours, retry.")
        self.purge(data_doc, delete_data)
    elif data_doc["status"] == 'error' and data_doc['host'] != 'xe1t-datamanager':
        self.give_error("Transfer or process errored, retry.")
        self.purge(data_doc, delete_data)
def each_location(self, data_doc):
    """Flag, and when appropriate purge, stalled or errored transfers.

    A transfer is stale when neither the file mtime nor the DB
    creation_time has advanced for over 2 hours; after 24 hours (or an
    'error' status away from xe1t-datamanager) the entry is purged.
    """
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return  # Skip places where we can't locally access data

    if 'creation_time' not in data_doc:
        self.log.warning("No creation time for %s" % str(data_doc))
        return

    # How long has transfer been ongoing (mtime of the landed file;
    # 0, i.e. the epoch, when it does not exist yet)
    try:
        time_modified = os.stat(data_doc['location']).st_mtime
    except FileNotFoundError:
        time_modified = 0
    time_modified = datetime.datetime.fromtimestamp(time_modified)

    time_made = data_doc['creation_time']
    # Some RunsDB entries are different format for some reason (#40)
    if isinstance(time_made, list):
        # Assume only one list entry that contains the time
        time_made = time_made[0]

    # Age measured from the most recent sign of life
    difference = datetime.datetime.utcnow() - max(time_modified, time_made)

    if data_doc["status"] == "transferred" or data_doc["status"] == "verifying":
        return  # Transfer went fine

    self.log.debug(difference)

    if difference > datetime.timedelta(hours=2):  # If stale transfer
        self.give_error("Transfer %s from run %d (%s) lasting more than "
                        "2 hours" % (data_doc['type'],
                                     self.run_doc['number'],
                                     self.run_doc['name']))

    # Do not delete stalled or failed raw data transfers to recover with rsync
    # (Warning: do not use scp, which may create nested directories)
    delete_data = (data_doc['type'] == 'processed' and
                   'v%s' % pax.__version__ == data_doc['pax_version'])

    if difference > datetime.timedelta(hours=24):
        self.give_error("Transfer lasting more than 24 hours, retry.")
        self.purge(data_doc, delete_data)
    elif data_doc["status"] == 'error' and data_doc['host'] != 'xe1t-datamanager':
        self.give_error("Transfer or process errored, retry.")
        self.purge(data_doc, delete_data)
def each_location(self, data_doc):
    """Verify one transferred local copy against the master checksum.

    On mismatch the error is reported; the copy is purged only when
    more than one good copy of this data type still exists.
    """
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return  # Skip places where we can't locally access data

    # Only verify completed transfers
    if data_doc["status"] != "transferred":
        return

    # Reference checksum from the master copy; None means unknown
    comparison = self.get_main_checksum(**data_doc)
    if comparison is None:
        return

    if data_doc['checksum'] != comparison:
        self.give_error("Bad checksum %d, %s, %s, %s" % (self.run_doc['number'],
                                                         data_doc['host'], \
                                                         data_doc['type'],
                                                         data_doc['pax_version']))
        # Only delete the bad copy when another good copy survives
        if self.check(data_doc['type'], warn=False) > 1:
            self.purge(data_doc)
def _apply_permissions(self, f_path, recursive, acl_file):
    """Apply chmod/chown (and on Tegner an ACL template) to f_path.

    :param f_path: file or directory to modify.
    :param recursive: apply chmod/chown recursively (Midway raw data).
    :param acl_file: ACL template passed to setfacl on Tegner.
    """
    logging.info('Change ownership and permission for %s', f_path)
    logging.info('Change to username %s and group %s',
                 self.chown_user[self.hostname],
                 self.chown_group[self.hostname])
    logging.info('Set permission: %s', self.chmod[self.hostname])
    logging.info('Set ownership and permissions at %s', config.get_hostname())

    owner = str(self.chown_user[self.hostname] + ":" +
                self.chown_group[self.hostname])

    if config.get_hostname() == "midway-login1":
        if recursive:
            subprocess.call(['chmod', '-R', self.chmod[self.hostname], f_path])
            subprocess.call(['chown', '-R', owner, f_path])
        else:
            subprocess.call(['chmod', self.chmod[self.hostname], f_path])
            subprocess.call(['chown', owner, f_path])
    elif config.get_hostname() == "tegner-login-1":
        # Tegner never recurses with chmod/chown; ACLs are applied
        # recursively from a site-wide template instead
        subprocess.call(['chmod', self.chmod[self.hostname], f_path])
        subprocess.call(['chown', owner, f_path])
        subprocess.call(['setfacl', '-R', '-M', acl_file, f_path])
    else:
        logging.info('Analysis facility does not match')

def each_run(self):
    """Set ownership and permissons for files/folders"""
    for data_doc in self.run_doc['data']:
        # Is not local, skip
        if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
            continue

        # extract path:
        f_path = data_doc['location']
        f_type = data_doc['type']

        # apply changes according to processed/raw and analysis facility
        # (previously two near-identical copies of the same
        # chmod/chown/setfacl sequence; now factored into a helper)
        if f_type == 'processed':
            self._apply_permissions(
                f_path, recursive=False,
                acl_file='/cfs/klemming/projects/xenon/misc/basic_file')
        elif f_type == 'raw':
            self._apply_permissions(
                f_path, recursive=True,
                acl_file='/cfs/klemming/projects/xenon/misc/basic')
        else:
            logging.info("Nothing to change: Ownership/Permission")
def tsm_commands(self, method=None):
    """Return the shell snippet for a TSM (dsmc) operation on this host.

    The snippet always starts with the per-host environment setup and,
    depending on `method`, appends the dsmc command template (with
    {path}/{path_tsm}/{path_restore} placeholders left for the caller
    to format).

    :param method: one of 'check-for-raw-data', 'incr-upload-path',
        'restore-path', 'check-installation', or None for setup only.
    """
    # Per-host environment setup for the dsmc client
    host_xe1t_datamanager = """#!/bin/bash
echo "Basic Config@xe1tdatamanager"
source /home/xe1ttransfer/tsm_config/init_tsm.sh
"""
    host_teger = """#!/bin/bash
echo "Basic Config@Tegner"
export PATH="/cfs/klemming/projects/xenon/.adm/xenon-tsm/:$PATH"
"""
    general = {"xe1t-datamanager": host_xe1t_datamanager,
               "tegner-login-1": host_teger}

    # Command templates; placeholders are formatted by the caller
    check_for_raw_data = """
dsmc query ba {path}
"""
    check_method = """
echo "No method is selected: Do nothing"
"""
    incr_upload = """
dsmc incr {path}/
"""
    restore_path = """
dsmc rest {path_tsm}/ {path_restore}/ -followsymbolic=yes
"""
    check_install = """
dsmc
"""

    # Dispatch on the requested method; unknown methods fall through
    # to the no-op snippet
    if method == "check-for-raw-data":
        return general[config.get_hostname()] + check_for_raw_data
    elif method == None:
        return general[config.get_hostname()]
    elif method == "incr-upload-path":
        return general[config.get_hostname()] + incr_upload
    elif method == "restore-path":
        return general[config.get_hostname()] + restore_path
    elif method == "check-installation":
        return general[config.get_hostname()] + check_install
    else:
        return general[config.get_hostname()] + check_method
def check(self, type='raw', warn=True):
    """Returns number of verified data locations

    Return the number of sites that have the same checksum as the
    master site.
    """
    count = 0
    for entry in self.run_doc['data']:
        # Only count transfered data of the requested type that carries
        # a checksum and is not untriggered DAQ data
        relevant = ('host' in entry and
                    entry['status'] == 'transferred' and
                    entry['type'] != 'untriggered' and
                    entry['type'] == type and
                    'checksum' in entry)
        if not relevant:
            continue

        # Rucio stores its own checksum, assume "transferred" is 1 good copy
        if entry['host'] == 'rucio-catalogue':
            count += 1
            continue

        if entry['checksum'] == self.get_main_checksum(**entry):
            # Agrees with the master checksum: one more good copy
            count += 1
        elif entry['host'] == config.get_hostname():
            # The local copy disagrees with the master checksum
            error = "Local checksum error " \
                    "run %d, %s %s" % (self.run_doc['number'],
                                       entry['type'],
                                       entry['pax_version'])
            if warn:
                self.give_error(error)
    return count
def local_data_finder(self, data_type, option_type, remote_host):
    """Locate this run's data entries of `data_type` here and on `remote_host`.

    Returns a (datum_here, datum_there) pair of dict copies, either of
    which may be None when no suitable entry exists.
    """
    datum_here = None   # Information about data here
    datum_there = None  # Information about data there
    version = 'v%s' % pax.__version__

    # Iterate over data locations to know status
    for datum in self.run_doc['data']:
        # Is host known?
        if 'host' not in datum or datum['type'] != data_type:
            continue

        transferred = (datum['status'] == 'transferred')
        # Processed data must match the current pax version
        # (short-circuit keeps non-processed entries from reading
        # 'pax_version')
        wrong_version = (datum['type'] == 'processed' and
                         version != datum['pax_version'])

        if datum['host'] == config.get_hostname():
            # If uploading, we should already have the data here
            if option_type == 'upload' and not transferred:
                continue
            if wrong_version:
                continue
            datum_here = datum.copy()
        elif datum['host'] == remote_host:
            # If downloading, the remote side should have the data
            if option_type == 'download' and not transferred:
                continue
            if wrong_version:
                continue
            datum_there = datum.copy()

    return datum_here, datum_there
def each_run(self):
    """Remove self.location from disk and drop its run-database entry."""
    # For each data location, see if this filename in it
    for data_doc in self.run_doc['data']:
        # Is not local, skip
        if 'host' not in data_doc or \
                data_doc['host'] != config.get_hostname():
            continue
        if data_doc['location'] != self.location:
            continue

        # Notify run database
        # NOTE(review): the DB entry is pulled before the filesystem
        # removal below; if the removal then fails, DB and disk
        # disagree — confirm this ordering is intended.
        if config.DATABASE_LOG is True:
            self.collection.update({'_id': self.run_doc['_id']},
                                   {'$pull': {'data': data_doc}})

        # Perform operation
        self.log.info("Removing %s" % (self.location))
        if os.path.isdir(data_doc['location']):
            shutil.rmtree(data_doc['location'])
        else:
            os.remove(self.location)
        break
def each_location(self, data_doc):
    """Verify one transferred local copy against the master checksum.

    On mismatch: processed data is purged outright; raw data is purged
    only when more than one good copy still exists.
    """
    if 'host' not in data_doc or data_doc['host'] != config.get_hostname():
        return  # Skip places where we can't locally access data

    # Only verify completed transfers
    if data_doc["status"] != "transferred":
        return

    # Reference checksum from the master copy; None means unknown
    comparison = self.get_main_checksum(**data_doc)
    if comparison is None:
        return

    if data_doc['checksum'] != comparison:
        self.give_error("Bad checksum %d, %s, %s" % (self.run_doc['number'],
                                                     data_doc['host'], \
                                                     data_doc['type']))
        if data_doc['type'] == 'processed':
            self.give_error("Bad checksum %s" % data_doc['pax_version'])
            self.purge(data_doc)
            return

        # Check for 2 or more copies with raw data
        if self.check(data_doc['type'], warn=False) > 1:
            self.purge(data_doc)
def get_number_in_queue(host=config.get_hostname(), partition=''):
    """Return the number of queued jobs for `host` (see get_queue).

    NOTE(review): the default for `host` is bound once at import time
    via config.get_hostname() — confirm that is intended.
    """
    return len(get_queue(host, partition))
def each_location(self, data_doc):
    """Backfill a missing tsm-server checksum by re-downloading the data.

    Remembers the xe1t-datamanager checksum when that entry is seen;
    for a tsm-server entry that is 'transferred' but has no checksum,
    downloads the backup to a temp directory, checksums it, and (under
    conditions below) writes the checksum back to the run DB.
    """
    # print("each location")
    hostname = config.get_hostname()
    destination = config.get_config("tsm-server")
    self.variables()

    # Remember the checksum of the master copy for later comparison
    if data_doc['host'] == "xe1t-datamanager":
        self.checksum_xe1t = data_doc['checksum']
        logging.info("Found checksum for xe1t-datamanger: %s",
                     self.checksum_xe1t)
        return

    if destination['name'] == data_doc['host'] and data_doc['checksum'] == None and data_doc['status'] == 'transferred':
        """A dedicated function to add checksums to the database
           in case there are no checksums for tsm-server entries
           but the status says transferred
        """
        logging.info(
            "There is a database entry for %s (transferred) but no checksum",
            data_doc['location'])

        # Init the TSMclient class:
        self.tsm = TSMclient()

        raw_data_location = data_doc['location']
        raw_data_filename = data_doc['location'].split('/')[-1]
        raw_data_path = config.get_config(config.get_hostname())['dir_raw']
        raw_data_tsm = config.get_config(config.get_hostname())['dir_tsm']
        tmp_data_path = raw_data_tsm + "tmp_checksum_test/"
        logging.info("Raw data location @xe1t-datamanager: %s",
                     raw_data_location)
        logging.info("Path to raw data: %s", raw_data_path)
        logging.info("Path to tsm data: %s", raw_data_tsm)
        logging.info("Path to temp. data: %s", tmp_data_path)
        logging.info("File/Folder for backup: %s", raw_data_filename)

        # Sanity Check
        if self.tsm.check_client_installation() == False:
            logging.info("There is a problem with your dsmc client")
            return

        # Make sure that temp. download directory exists:
        if not os.path.exists(tmp_data_path):
            os.makedirs(tmp_data_path)

        # Download it to a temp directory; clear any stale copy first
        dfolder = tmp_data_path + "/" + raw_data_filename
        if os.path.exists(dfolder):
            logging.info(
                "Temp. directory %s already exists -> Delete it now",
                dfolder)
            shutil.rmtree(dfolder)

        tsm_download_result = self.tsm.download(
            raw_data_tsm + raw_data_filename, tmp_data_path,
            raw_data_filename)
        if os.path.exists(tmp_data_path + raw_data_filename) == False:
            logging.info("Download to %s failed.", raw_data_path)

        # Do the checksum of the freshly downloaded copy
        checksum_after = self.tsm.get_checksum_folder(
            tmp_data_path + "/" + raw_data_filename)

        logging.info("Summary of the download for checksum comparison:")
        logging.info("Number of downloaded files: %s",
                     tsm_download_result["tno_restored_objects"])
        logging.info("Transferred amount of data: %s",
                     tsm_download_result["tno_restored_bytes"])
        logging.info("Network transfer rate: %s",
                     tsm_download_result["tno_network_transfer_rate"])
        logging.info("Download time: %s",
                     tsm_download_result["tno_data_transfer_time"])
        logging.info("Number of failed downloads: %s",
                     tsm_download_result["tno_failed_objects"])
        logging.info("MD5 Hash (database entry|TSM-SERVER): %s",
                     data_doc['checksum'])
        logging.info(
            "MD5 Hash (database entry|xe1t-datamanager): %s",
            self.checksum_xe1t)
        logging.info("MD5 Hash (downloaded data): %s", checksum_after)

        # Add to runDB and compare
        # NOTE(review): the commented-out condition compared against
        # checksum_after; the active one requires the sentinel
        # "no_checksum_xe1tdatam" — confirm which is intended.
        # if data_doc['checksum'] == None and self.checksum_xe1t == checksum_after:
        if data_doc['checksum'] == None and self.checksum_xe1t == "no_checksum_xe1tdatam":
            logging.info("No checksum for database entry TSM-server")
            logging.info("Checksums for xe1t-datamanager is verfied")
            if config.DATABASE_LOG:
                logging.info("Notify the runDB to add checksum")
                self.collection.update({'_id': self.run_doc['_id'],
                                        'data': {'$elemMatch': data_doc}},
                                       {'$set': {'data.$.checksum': checksum_after}})

        # Delete from temp directory
        # if data_doc['checksum'] == None and self.checksum_xe1t == checksum_after:
        logging.info(
            "Delete temp. directory for checksum verification: %s",
            dfolder)
        shutil.rmtree(dfolder)
def copy_tsm(self, datum, destination, method, option_type):
    # Upload one raw-data folder to the PDC Stockholm tape backend (TSM),
    # verifying integrity by re-downloading and comparing folder checksums.
    # Flow: pre-flight checks -> copy+rename into the tsm staging dir ->
    # dsmc upload -> verification re-download -> checksum compare -> runDB update.
    #
    # :param datum: runDB data entry describing the local raw data
    # :param destination: name of the tape destination host
    # :param method: transfer method string (expected 'tsm' by the caller)
    # :param option_type: transfer direction label (expected 'upload')
    # :return: 0 on normal completion; None on early error returns

    # hard coded sha512 checksum which stands for an empty directory
    #(Used for verifying the goodness of the uploaded data)"
    checksum_empty_dir = "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e"

    # Init the TSM client for tape backup from an extern class
    self.tsm = TSMclient()

    logging.info('Tape Backup to PDC STOCKHOLM')
    print(datum, destination, method, option_type)

    # Register a provisional 'transferring' entry up front so that
    # concurrent cax processes see the transfer is claimed.
    logging.debug("Notifying run database")
    datum_new = {'type': datum['type'],
                 'host': destination,
                 'status': 'transferring',
                 'location': "n/a",
                 'checksum': None,
                 'creation_time': datetime.datetime.utcnow(),
                 }
    logging.info("new entry for rundb: %s", datum_new)
    if config.DATABASE_LOG == True:
        result = self.collection.update_one({'_id': self.run_doc['_id'],
                                             },
                                            {'$push': {'data': datum_new}})
        if result.matched_count == 0:
            self.log.error("Race condition! Could not copy because another "
                           "process seemed to already start.")
            return

    raw_data_location = datum['location']
    raw_data_filename = datum['location'].split('/')[-1]
    raw_data_path = raw_data_location.replace(raw_data_filename, "")
    raw_data_tsm = config.get_config(config.get_hostname())['dir_tsm']
    logging.info("Raw data location @xe1t-datamanager: %s", raw_data_location)
    logging.info("Path to raw data: %s", raw_data_path)
    logging.info("Path to tsm data: %s", raw_data_tsm)
    logging.info("File/Folder for backup: %s", raw_data_filename)

    # Do a simple pretest to analyse the directory what is going to be backuped up
    # continue only if there are files in the directory and no more folders
    list_files = []
    list_folders = []
    for root, dirs, files in os.walk(raw_data_path + raw_data_filename):
        for name in files:
            list_files.append(name)
        for name in dirs:
            list_folders.append(name)

    # Sanity check if raw data folder contains a subfolder (mostly important for old raw data sets)
    if len(list_files) == 0 or len(list_folders) > 0:
        logging.info("ERROR: There are %s files in %s", len(
            list_files), raw_data_path + raw_data_filename)
        if len(list_folders) > 0:
            logging.info("ERROR: These folders are found in %s:",
                         raw_data_path + raw_data_filename)
            for i_folders in list_folders:
                logging.info(" <> %s", i_folders)
        logging.info("Check the error(s) and start again")
        # Mark the provisional entry as failed before bailing out.
        if config.DATABASE_LOG:
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "error",
                                             'data.$.location': "n/a",
                                             'data.$.checksum': "n/a",
                                             }
                                    })
        return
    else:
        logging.info(
            "Pre-test of %s counts %s files for tape upload [succcessful]",
            raw_data_path + raw_data_filename, len(list_files))

    # Do a checksum pre-test for double counts:
    # NOTE(review): this iterates `files`, the loop variable left over from
    # the os.walk loop above (i.e. the last visited directory), not
    # `list_files` — confirm this is intended for flat directories only.
    checksum_pretest_list = []
    for i_file in files:
        f_path = os.path.join(raw_data_path, raw_data_filename, i_file)
        pre_test_checksum = ChecksumMethods.get_crc32(self, f_path)
        checksum_pretest_list.append(pre_test_checksum)
    double_counts = set(
        [x for x in checksum_pretest_list if checksum_pretest_list.count(x) > 1])
    if len(double_counts) > 0:
        logging.info("Pre checksum test: [failed]")
        logging.info("There are two or more identical checksums observed in %s",
                     os.path.join(raw_data_path, raw_data_filename))
        if config.DATABASE_LOG:
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "error",
                                             'data.$.location': "n/a",
                                             'data.$.checksum': "n/a",
                                             }
                                    })
        return
    else:
        logging.info("Pre checksum test: [succcessful]")

    # Check first if everything is fine with the dsmc client
    if self.tsm.check_client_installation() == False:
        logging.info("There is a problem with your dsmc client")
        if config.DATABASE_LOG:
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "error",
                                             'data.$.location': "n/a",
                                             'data.$.checksum': "n/a",
                                             }
                                    })
        return

    # (A duplicate runDB-registration block used to live here; it is already
    # done at the top of this method, so it stays disabled.)

    logging.info("Start tape upload")

    # Prepare a copy from raw data location to tsm location (including renaming).
    # Checksum the source first so the staged copy can be verified against it.
    checksum_before_raw = self.tsm.get_checksum_folder(
        raw_data_path + raw_data_filename)

    # Only the top-level file listing is needed (hence the break).
    file_list = []
    for (dirpath, dirnames, filenames) in os.walk(raw_data_path + raw_data_filename):
        file_list.extend(filenames)
        break

    if not os.path.exists(raw_data_tsm + raw_data_filename):
        os.makedirs(raw_data_tsm + raw_data_filename)

    # Stage each file under the tsm dir, prefixed with the dataset name.
    for i_file in file_list:
        path_old = raw_data_path + raw_data_filename + "/" + i_file
        path_new = raw_data_tsm + raw_data_filename + \
            "/" + raw_data_filename + "_" + i_file
        if not os.path.exists(path_new):
            shutil.copy2(path_old, path_new)

    checksum_before_tsm = self.tsm.get_checksum_folder(
        raw_data_tsm + raw_data_filename)

    if checksum_before_raw != checksum_before_tsm:
        logging.info("Something went wrong during copy & rename")
        if config.DATABASE_LOG:
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "error",
                                             'data.$.location': "n/a",
                                             'data.$.checksum': "n/a",
                                             }
                                    })
        return
    elif checksum_before_raw == checksum_before_tsm:
        logging.info("Copy & rename: [succcessful] -> Checksums agree")

    # Actual dsmc upload of the staged folder.
    tsm_upload_result = self.tsm.upload(raw_data_tsm + raw_data_filename)
    logging.info("Number of uploaded files: %s",
                 tsm_upload_result["tno_backedup"])
    logging.info("Number of inspected files: %s",
                 tsm_upload_result["tno_inspected"])
    logging.info("Number of failed files: %s",
                 tsm_upload_result["tno_failed"])
    logging.info("Transferred amount of data: %s",
                 tsm_upload_result["tno_bytes_transferred"])
    logging.info("Inspected amount of data: %s",
                 tsm_upload_result["tno_bytes_inspected"])
    logging.info("Upload time: %s",
                 tsm_upload_result["tno_data_transfer_time"])
    logging.info("Network transfer rate: %s",
                 tsm_upload_result["tno_network_transfer_rate"])
    logging.info("MD5 Hash (raw data): %s", checksum_before_tsm)

    # Verification step: re-download what was just uploaded and re-checksum.
    test_download = os.path.join(raw_data_tsm, "tsm_verify_download")

    # Make sure that temp. download directory exists:
    if not os.path.exists(test_download):
        os.makedirs(test_download)

    logging.info("Start the re-download to %s", test_download)
    tsm_download_result = self.tsm.download(
        raw_data_tsm + raw_data_filename, test_download, raw_data_filename)
    logging.info("Finished the re-download")

    if os.path.exists(test_download + "/" + raw_data_filename) == False:
        logging.info("Download to %s failed. Checksum will not match",
                     test_download + "/" + raw_data_filename)
    else:
        logging.info("Download to %s succcessful. Folder exists",
                     test_download + "/" + raw_data_filename)

    checksum_after = self.tsm.get_checksum_folder(
        test_download + "/" + raw_data_filename)
    logging.info("Summary of the download for checksum comparison:")
    logging.info("Number of downloaded files: %s",
                 tsm_download_result["tno_restored_objects"])
    logging.info("Transferred amount of data: %s",
                 tsm_download_result["tno_restored_bytes"])
    logging.info("Network transfer rate: %s",
                 tsm_download_result["tno_network_transfer_rate"])
    logging.info("Download time: %s",
                 tsm_download_result["tno_data_transfer_time"])
    logging.info("Number of failed downloads: %s",
                 tsm_download_result["tno_failed_objects"])
    logging.info("MD5 Hash (raw data): %s", checksum_after)

    # Success requires matching checksums AND neither side looking like an
    # empty directory (guards against "uploaded nothing" false positives).
    status = ""
    if checksum_before_tsm == checksum_after and checksum_empty_dir != checksum_before_tsm and checksum_empty_dir != checksum_after:
        logging.info("Upload to tape: [succcessful]")
        status = "transferred"
    else:
        logging.info("Upload to tape: [failed]")
        status = "error"

    # Print a warning if the checksum crosscheck fails!
    if checksum_empty_dir == checksum_before_tsm or checksum_empty_dir == checksum_after:
        logging.info(
            "Checksum test indicates an empty folder before or after the tape upload")
        logging.info("Check your raw data directory %s for files",
                     raw_data_tsm + raw_data_filename)

    # Delete check folder (staging copy and verification download).
    shutil.rmtree(raw_data_tsm + raw_data_filename)
    shutil.rmtree(test_download + "/" + raw_data_filename)
    logging.info("Finished to delete temp. directories: %s and %s",
                 raw_data_tsm + raw_data_filename,
                 test_download + "/" + raw_data_filename)

    # Final runDB update with the outcome of the whole procedure.
    if config.DATABASE_LOG:
        self.collection.update({'_id': self.run_doc['_id'],
                                'data': {
                                    '$elemMatch': datum_new}},
                               {'$set': {'data.$.status': status,
                                         'data.$.location': raw_data_tsm + raw_data_filename,
                                         'data.$.checksum': checksum_after,
                                         }
                                })
    logging.info("Update database")
    return 0
def copy_handshake(self, datum, destination, method, option_type, data_type):
    """ Perform all the handshaking required with the run DB.

    Registers a provisional 'transferring' entry, delegates the actual
    transfer to self.copy() (or the rucio modules), then writes the final
    status/location/checksum back to the run database.

    :param datum: The dictionary data location describing data to be transferred
    :type str
    :param destination: The host name where data should go to.
    :type str
    :param method: transfer protocol, e.g. 'scp', 'rsync', 'rucio'
    :param option_type: 'upload' or 'download'
    :param data_type: 'raw' or 'processed'
    :return:
    """

    # Get information about this destination
    destination_config = config.get_config(destination)

    self.log.info(option_type + "ing run %d to: %s" % (self.run_doc['number'],
                                                       destination))

    # Determine where data should be copied to
    if destination_config['dir_%s' % datum['type']] != None:
        base_dir = destination_config['dir_%s' % datum['type']]
        # NOTE(review): this inner None-check is unreachable — the outer
        # condition already excludes None; kept verbatim.
        if base_dir is None:
            self.log.info("no directory specified for %s" % datum['type'])
            return

        if datum['type'] == 'processed':
            self.log.info(datum)
            # Processed data is versioned by pax release under the base dir.
            base_dir = os.path.join(base_dir, 'pax_%s' % datum['pax_version'])

        # Check directory existence on local host for download only
        if option_type == 'download' and not os.path.exists(base_dir):
            if destination != config.get_hostname():
                raise NotImplementedError("Cannot create directory on another "
                                          "machine.")
            # Recursively make directories
            os.makedirs(base_dir)
    else:
        base_dir = "none"

    # Directory or filename to be copied
    filename = datum['location'].split('/')[-1]

    self.log.debug("Notifying run database")
    datum_new = {'type': datum['type'],
                 'host': destination,
                 'status': 'transferring',
                 'location': os.path.join(base_dir, filename),
                 'checksum': None,
                 'creation_time': datetime.datetime.utcnow(),
                 }
    # Processed data carries extra provenance fields copied from the source.
    if datum['type'] == 'processed':
        for variable in ('pax_version', 'pax_hash', 'creation_place'):
            datum_new[variable] = datum.get(variable)

    if method == "rucio" and option_type == "upload":
        # Init the rucio module when method==rucio is requested
        self.log.info(
            "Init rucio_mover module for Rucio transfers (upload)")
        self.rucio = RucioBase(self.run_doc)
        self.rucio.set_host(config.get_hostname())
        self.rucio.set_remote_host(destination)
        # Sanity check for rucio client
        if self.rucio.sanity_checks() == False:
            logging.info("!!! <<The sanity checks fail>> !!!")
            return 0
        # Add two further database entries for rucio related uploads
        datum_new['rse'] = []
        datum_new['location'] = "n/a"
        datum_new['rule_info'] = "no_rule"

    if method == "rucio" and option_type == "download":
        rucio_catalogue_config = config.get_config("rucio-catalogue")
        self.log.info(
            "Init rucio_mover module for Rucio transfers (download)")
        # Load and config the download module of rucio/ruciax
        self.ruciodw = RucioDownload()
        self.ruciodw.SetDatabaseEntry(self.run_doc)
        self.ruciodw.ExternalDatabaseEntry()
        self.ruciodw.SetDownloadConfig(
            rucio_catalogue_config, destination_config)
        # specify a not available path for the download destination
        datum_new['location'] = "NA"

    # Push the provisional entry; a zero match count means another process
    # got there first, so this transfer is abandoned.
    if config.DATABASE_LOG == True:
        result = self.collection.update_one({'_id': self.run_doc['_id'],
                                             },
                                            {'$push': {'data': datum_new}})
        if result.matched_count == 0:
            self.log.error("Race condition! Could not copy because another "
                           "process seemed to already start.")
            return

    self.log.info('Starting ' + method)

    try:  # try to copy
        self.copy(datum,
                  datum_new,
                  method,
                  option_type, data_type)
        # Checksumming to follow on local site
        if method == 'scp' or method == 'rsync':
            status = 'verifying'
        # Cannot do cax-checksum on GRID sites,
        # so assume gfal-copy/lcg-cp checksum is sufficient
        # NOTE(review): both branches assign 'verifying' — the distinction is
        # documentation-only at present.
        else:
            status = 'verifying'
            # TO DO: Manually copy checksum to DB entry here

    except scp.SCPException as e:
        self.log.exception(e)
        status = 'error'

    # WARNING: This needs to be extended to catch gfal-copy errors
    except:
        self.log.exception("Unexpected copy error")
        status = 'error'

    self.log.debug(method + " done, telling run database")

    if config.DATABASE_LOG:
        if method == "rucio" and option_type == "upload":
            # Rucio uploads report their own status/location/checksum/rse.
            logging.info("Following entries are added to the runDB:")
            logging.info(" * Status: %s",
                         self.rucio.get_rucio_info()['status'])
            logging.info(" * Location: %s",
                         self.rucio.get_rucio_info()['location'])
            logging.info(" * Checksum: %s",
                         self.rucio.get_rucio_info()['checksum'])
            logging.info(" * RSE: %s",
                         self.rucio.get_rucio_info()['rse'])
            logging.info(" * Preliminary rule information: %s",
                         self.rucio.get_rucio_info()['rule_info'])
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {
                                       'data.$.status': self.rucio.get_rucio_info()['status'],
                                       'data.$.location': self.rucio.get_rucio_info()['location'],
                                       'data.$.checksum': self.rucio.get_rucio_info()['checksum'],
                                       'data.$.rse': self.rucio.get_rucio_info()['rse'],
                                       'data.$.rule_info': self.rucio.get_rucio_info()['rule_info']
                                   }
                                   })
        elif method == "rucio" and option_type == "download":
            logging.info("Following entries are added to the runDB:")
            logging.info(" * Status: %s",
                         self.ruciodw.get_rucio_info()['status'])
            logging.info(" * Location: %s",
                         self.ruciodw.get_rucio_info()['location'])
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {
                                       'data.$.status': self.ruciodw.get_rucio_info()['status'],
                                       'data.$.location': self.ruciodw.get_rucio_info()['location']
                                   }
                                   })
        else:
            # Fill the data if method is not rucio
            if config.DATABASE_LOG:
                self.collection.update({'_id': self.run_doc['_id'],
                                        'data': {
                                            '$elemMatch': datum_new}},
                                       {'$set': {
                                           'data.$.status': status
                                       }
                                       })

    if method == "rucio" and option_type == "upload":
        # Rucio 'side load' to set the transfer rules directly after the file upload
        if self.rucio.get_rucio_info()['status'] == "transferred":
            logging.info(
                "Initiate the RucioRule for a first set of transfer rules")
            # Add: Outcome of the rucio transfers to the new database entry
            # without read the runDB again.
            datum_new['status'] = self.rucio.get_rucio_info()['status']
            datum_new['location'] = self.rucio.get_rucio_info()['location']
            datum_new['checksum'] = self.rucio.get_rucio_info()['checksum']
            datum_new['rse'] = self.rucio.get_rucio_info()['rse']
            datum_new['rule_info'] = self.rucio.get_rucio_info()[
                'rule_info']
            # Init the RucioRule module and set its runDB entry manually
            self.rucio_rule = RucioRule()
            self.rucio_rule.set_db_entry_manually(self.run_doc)
            # Perform the initial rule setting:
            self.rucio_rule.set_possible_rules(
                data_type=datum['type'], dbinfo=datum_new)
            logging.info("Status: transferred -> Transfer rules are set for %s",
                         self.rucio.get_rucio_info()['rse'])
            # (Historical note: per-file rule deletion after upload was removed
            # here — handled by rucio_mover upload option 3 instead.)
        else:
            logging.info(
                "Something went wrong during the upload (error). No rules are set")

    elif method == "rucio" and option_type == "download":
        logging.info("<-- Finished download %s to location %s with status %s -->",
                     datum['location'],
                     self.ruciodw.get_rucio_info()['location'],
                     self.ruciodw.get_rucio_info()['status'])

    logging.debug(method + " done, telling run database")
    logging.info("End of " + option_type + "\n")
def each_location(self, data_doc):
    """Record the location of any data entry hosted on this machine.

    Appends data_doc['location'] to self.locations when the entry's
    host matches the local hostname; otherwise does nothing.
    """
    local_host = config.get_hostname()
    if data_doc['host'] != local_host:
        return
    self.locations.append(data_doc['location'])
def copy_tsm_download(self, datum, destination, method, option_type):
    """A dedicated download function for downloads from tape storage.

    Restores a dataset from the TSM tape backend into the local raw-data
    directory, strips the dataset-name prefix from each restored file,
    verifies the folder checksum against the runDB entry, and records the
    outcome ('transferred' or 'error') in the run database.

    :param datum: runDB data entry describing the tape-resident dataset
    :param destination: hostname receiving the download
    :param method: transfer method string (expected 'tsm' by the caller)
    :param option_type: transfer direction label (expected 'download')
    :return: 0 on completion
    """
    self.tsm = TSMclient()

    logging.info('Tape Backup to PDC STOCKHOLM (Download)')

    raw_data_location = datum['location']
    raw_data_filename = datum['location'].split('/')[-1]
    raw_data_path = config.get_config(config.get_hostname())['dir_raw']
    raw_data_tsm = config.get_config(config.get_hostname())['dir_tsm']
    logging.info("Raw data location @xe1t-datamanager: %s",
                 raw_data_location)
    logging.info("Path to raw data: %s", raw_data_path)
    logging.info("Path to tsm data: %s", raw_data_tsm)
    logging.info("File/Folder for backup: %s", raw_data_filename)

    # Register a provisional 'transferring' entry so concurrent cax
    # processes see this download is claimed.
    self.log.debug("Notifying run database")
    datum_new = {'type': datum['type'],
                 'host': destination,
                 'status': 'transferring',
                 'location': "n/a",
                 'checksum': None,
                 'creation_time': datetime.datetime.utcnow(),
                 }
    logging.info("new entry for rundb: %s", datum_new)
    if config.DATABASE_LOG == True:
        result = self.collection.update_one({'_id': self.run_doc['_id'],
                                             },
                                            {'$push': {'data': datum_new}})
        if result.matched_count == 0:
            self.log.error("Race condition! Could not copy because another "
                           "process seemed to already start.")
            return

    logging.info("Start tape download")

    # Sanity Check: the dsmc client must be usable.
    if self.tsm.check_client_installation() == False:
        logging.info("There is a problem with your dsmc client")
        return

    # Do download:
    tsm_download_result = self.tsm.download(
        raw_data_location, raw_data_path, raw_data_filename)
    # NOTE(review): on failure this notifies the DB but still falls through
    # to the rename/checksum steps below — confirm intended.
    if os.path.exists(raw_data_path + raw_data_filename) == False:
        logging.info("Download to %s failed.", raw_data_path)
        if config.DATABASE_LOG:
            # Notify the database if something went wrong during the download:
            logging.info("Notifiy the runDB: error")
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "error",
                                             'data.$.location': "n/a",
                                             'data.$.checksum': "n/a",
                                             }
                                    })

    # Rename: the upload stage prefixed each file with the dataset name;
    # strip that prefix again (i_file[12:] drops the fixed-width prefix).
    # NOTE(review): the 12-character slice assumes a fixed dataset-name
    # length — confirm against the naming scheme used in copy_tsm.
    file_list = []
    for (dirpath, dirnames, filenames) in os.walk(raw_data_path + raw_data_filename):
        file_list.extend(filenames)
        break
    for i_file in file_list:
        path_old = raw_data_path + raw_data_filename + "/" + i_file
        path_new = raw_data_path + raw_data_filename + "/" + i_file[12:]
        if not os.path.exists(path_new):
            os.rename(path_old, path_new)

    # Do checksum and summarize it:
    checksum_after = self.tsm.get_checksum_folder(
        raw_data_path + "/" + raw_data_filename)
    logging.info("Summary of the download for checksum comparison:")
    logging.info("Number of downloaded files: %s",
                 tsm_download_result["tno_restored_objects"])
    logging.info("Transferred amount of data: %s",
                 tsm_download_result["tno_restored_bytes"])
    logging.info("Network transfer rate: %s",
                 tsm_download_result["tno_network_transfer_rate"])
    logging.info("Download time: %s",
                 tsm_download_result["tno_data_transfer_time"])
    logging.info("Number of failed downloads: %s",
                 tsm_download_result["tno_failed_objects"])
    logging.info("MD5 Hash (database entry): %s", datum['checksum'])
    logging.info("MD5 Hash (downloaded data): %s", checksum_after)

    if checksum_after == datum['checksum']:
        logging.info(
            "The download/restore of the raw data set %s was [SUCCESSFUL]",
            raw_data_filename)
        logging.info("Raw data set located at: %s",
                     raw_data_path + raw_data_filename)
    elif checksum_after != datum['checksum']:
        logging.info(
            "The download/restore of the raw data set %s [FAILED]",
            raw_data_filename)
        logging.info("Checksums do not agree!")

    # Notifiy the database for final registration
    if checksum_after == datum['checksum']:
        if config.DATABASE_LOG:
            # Notify the database if everything was fine:
            logging.info("Notifiy the runDB: transferred")
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "transferred",
                                             'data.$.location': raw_data_path + raw_data_filename,
                                             'data.$.checksum': checksum_after,
                                             }
                                    })
        else:
            logging.info("Database is not notified")
    elif checksum_after != datum['checksum']:
        if config.DATABASE_LOG:
            # Notify the database if something went wrong during the download:
            logging.info("Notifiy the runDB: error")
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {
                                        '$elemMatch': datum_new}},
                                   {'$set': {'data.$.status': "error",
                                             'data.$.location': "n/a",
                                             'data.$.checksum': "n/a",
                                             }
                                    })
        else:
            logging.info("Database is not notified")

    return 0
def each_location(self, data_doc):
    """Look up the local hostname and the tsm-server configuration.

    NOTE(review): the resolved values are never used and no further action
    is taken — this handler appears to be an unfinished stub; the config
    lookups are kept for their (possible) side effects.
    """
    #print("each location")
    this_host = config.get_hostname()
    tsm_server_cfg = config.get_config("tsm-server")
def do_possible_transfers(self, option_type='upload', data_type='raw'):
    """Determine candidate transfers.

    Walks the configured transfer hosts for the given direction and
    dispatches the first applicable transfer (copy_handshake for
    scp/rsync/rucio, copy_tsm/copy_tsm_download for tape).

    :param option_type: 'upload' or 'download'
    :type str
    :param data_type: 'raw' or 'processed'
    :type str
    :raises RuntimeError: if a configured host has no transfer method
    :return: (None, None) when nothing can be done; otherwise None
    """

    # Get the 'upload' or 'download' options.
    options = config.get_transfer_options(option_type)

    # If no options, can't do anything
    if options is None:
        return None, None

    # If should be purged, don't pull
    PurgeObj = BufferPurger()
    PurgeObj.run_doc = self.run_doc
    if option_type == 'download' and data_type == 'raw' and PurgeObj.check_purge_requirements():
        self.log.info("Skip raw download that would be purged")
        return None, None

    start = time.time()

    # For this run, where do we have transfer access?
    datum_there = None
    datum_here = None
    for remote_host in options:
        self.log.debug(remote_host)

        # Get transfer protocol
        method = config.get_config(remote_host)['method']
        if not method:
            print("Must specify transfer protocol (method) for " + remote_host)
            # FIX: was a bare `raise` with no active exception, which itself
            # raised "RuntimeError: No active exception to re-raise".
            # Raise an explicit RuntimeError carrying the actual message.
            raise RuntimeError(
                "Must specify transfer protocol (method) for " + remote_host)

        datum_here, datum_there = self.local_data_finder(data_type,
                                                         option_type,
                                                         remote_host)

        # Delete the old data base entry if rucio transfers are requested
        # and an old upload failed by a bad connection error.
        if method == "rucio" and datum_there != None and datum_there['status'] == 'RSEreupload' and config.DATABASE_LOG == True:
            self.log.info(
                "Former upload of %s failed with error", datum_here['location'])
            self.log.info(
                "[('Connection aborted.', BadStatusLine('',))] -> Delete runDB status and start again")
            self.collection.update({'_id': self.run_doc['_id']},
                                   {'$pull': {'data': datum_there}})

        # Upload logic for everything except tape
        if option_type == 'upload' and method != "tsm" and datum_here and (datum_there is None or datum_there['status'] == 'RSEreupload'):
            self.copy_handshake(datum_here, remote_host,
                                method, option_type, data_type)
            break

        # Download logic for everything except tape
        if option_type == 'download' and datum_there and datum_here is None and method != "tsm":
            self.copy_handshake(
                datum_there, config.get_hostname(), method, option_type, data_type)
            break

        # Upload tsm:
        if option_type == 'upload' and datum_here and datum_there is None and method == "tsm":
            self.copy_tsm(datum_here, config.get_config(
                remote_host)['name'], method, option_type)
            break

        # Download tsm:
        if option_type == 'download' and datum_there and datum_here is None and method == "tsm":
            self.copy_tsm_download(
                datum_there, config.get_hostname(), method, option_type)
            break

    # Report timing for whichever dataset (if any) was handled.
    dataset = None
    if datum_there is not None:
        dataset = datum_there['location'].split('/').pop()
    elif datum_here is not None:
        dataset = datum_here['location'].split('/').pop()

    if dataset is not None:  # Not sure why it does this sometimes
        end = time.time()
        elapsed = end - start
        self.log.info(method + " " + option_type + " dataset " + dataset +
                      " took %d seconds" % elapsed)
def copyGFAL(self, datum_original, datum_destination, server, option_type,
             nstreams, grid_cert):
    """Copy data via GFAL function

    Builds and runs a `gfal-copy` shell command for SRM<->local transfers
    and raises if the tool fails or its output mentions an error.
    WARNING: Only SRM<->Local implemented (not yet SRM<->SRM)

    :param datum_original: runDB entry of the source data
    :param datum_destination: runDB entry of the destination data
    :param server: SRM/GSIFTP server prefix prepended to remote paths
    :param option_type: 'upload' or 'download'
    :param nstreams: number of parallel gfal-copy streams (-n)
    :param grid_cert: path to an initialized GRID proxy certificate
    :raises subprocess.CalledProcessError: if gfal-copy exits non-zero
    :raises RuntimeError: if gfal-copy output contains 'error'
    """
    dataset = datum_original['location'].split('/').pop()

    # gfal-copy arguments:
    #   -f: overwrite
    #   -r: recursive
    #   -n: number of streams (4 for now, but doesn't work on
    #       xe1t-datamanager so use lcg-cp instead)
    #   -t: timeout in seconds
    #   -K: specify checksum algorithm
    #   --cert: path to initialized GRID certificate
    #           (voms-proxy-init -voms xenon.biggrid.nl -valid 168:00 -out user_cert)
    command = "time gfal-copy -v -f -r -p -t 32400 -K adler32 --cert %s -n %d " % (
        grid_cert, nstreams)

    if option_type == 'upload':
        logging.info(option_type + ": %s to %s" % (datum_original['location'],
                                                   server + datum_destination['location']))

        # (Simultaneous LFC registration was removed; see repo history for
        # the lfc_address suffix that used to be appended here.)

        # Use GSIFTP address instead of POSIX from Stash (to avoid login node)
        if config.get_hostname() == 'login':
            config_original = config.get_config(datum_original['host'])
            server_original = config_original['hostname']
            full_command = command + \
                server_original + datum_original['location'] + " " + \
                server + datum_destination['location']
        # (A similar SRM-instead-of-POSIX branch for midway-login1 was
        # disabled in the original and is intentionally omitted.)
        else:
            full_command = command + \
                "file://" + datum_original['location'] + " " + \
                server + datum_destination['location']

    else:  # download
        logging.info(option_type + ": %s to %s" % (server + datum_original['location'],
                                                   datum_destination['location']))
        full_command = command + \
            server + datum_original['location'] + " " + \
            "file://" + datum_destination['location']

    self.log.info(full_command)

    try:
        gfal_out = subprocess.check_output(
            full_command, stderr=subprocess.STDOUT, shell=True)
    except subprocess.CalledProcessError as gfal_exec:
        self.log.error(gfal_exec.output.rstrip().decode('ascii'))
        self.log.error("Error: gfal-copy status = %d\n" % gfal_exec.returncode)
        raise  # re-raise the CalledProcessError for the caller

    gfal_out_ascii = gfal_out.rstrip().decode('ascii')
    if "error" in gfal_out_ascii.lower():  # Some errors don't get caught above
        self.log.error(gfal_out_ascii)
        # FIX: was a bare `raise` with no active exception, which itself
        # raised "RuntimeError: No active exception to re-raise".
        # Raise an explicit RuntimeError describing the failure instead.
        raise RuntimeError("gfal-copy output reported an error for dataset %s"
                           % dataset)
    else:
        self.log.info(gfal_out_ascii)  # To print timing
def each_run(self):
    # Decide whether this run should be processed with the current pax
    # version on this host, and launch _process() for each eligible version.
    # A long chain of guard clauses filters out runs that must be skipped.

    if self.has_tag('donotprocess'):
        self.log.debug("Do not process tag found, skip processing")
        return

    if 'processor' not in self.run_doc or \
            'DEFAULT' not in self.run_doc['processor']:
        self.log.debug("processor or DEFAUT tag not in run_doc, skip processing")
        return

    # Processing requires calibration inputs to be present in the run doc.
    processing_parameters = self.run_doc['processor']['DEFAULT']
    if 'gains' not in processing_parameters or \
            'drift_velocity_liquid' not in processing_parameters or \
            'electron_lifetime_liquid' not in processing_parameters:
        self.log.info("gains or e-lifetime not in run_doc, skip processing")
        return

    thishost = config.get_hostname()

    # Only the currently installed pax version is considered.
    versions = ['v%s' % pax.__version__]

    have_processed, have_raw = self.local_data_finder(thishost, versions)

    # Skip if no raw data
    if not have_raw:
        self.log.debug("Skipping %s with no raw data", self.run_doc['name'])
        return

    if self.run_doc['reader']['ini']['write_mode'] != 2:
        self.log.debug("write_mode != 2, skip processing")
        return

    # Get number of events in data set (not set for early runs <1000)
    events = self.run_doc.get('trigger', {}).get('events_built', 0)

    # Skip if 0 events in dataset
    if events == 0:
        self.log.debug("Skipping %s with 0 events", self.run_doc['name'])
        return

    # Specify number of cores for pax multiprocess
    if events < 1000:
        # Reduce to 1 CPU for small number of events (sometimes pax stalls
        # with too many CPU)
        ncpus = 1
    else:
        ncpus = config.NCPU - 1  # 4 based on Figure 2 here https://xecluster.lngs.infn.it/dokuwiki/doku.php?id=xenon:xenon1t:shockley:performance#automatic_processing
        # -1 for pax I/O worker

    # Process all specified versions
    for version in versions:
        pax_hash = "n/a"

        out_location = config.get_processing_dir(thishost, version)

        if have_processed[version]:
            self.log.debug("Skipping %s already processed with %s",
                           self.run_doc['name'], version)
            continue

        queue_list = qsub.get_queue(thishost)
        # Should check version here too
        if self.run_doc['name'] in queue_list:
            self.log.debug("Skipping %s currently in queue",
                           self.run_doc['name'])
            continue

        self.log.info("Processing %s with pax_%s (%s) and %d cores, output to %s",
                      self.run_doc['name'], version, pax_hash, ncpus,
                      out_location)

        _process(self.run_doc['name'], have_raw['location'], thishost,
                 version, pax_hash, out_location, self.run_doc['detector'],
                 ncpus)
def each_run(self):
    # Midway-only variant of the processing task: identical guard chain to
    # the generic each_run above, plus an early exit on any host other than
    # midway-login1.

    if self.has_tag('donotprocess'):
        self.log.debug("Do not process tag found, skip processing")
        return

    if 'processor' not in self.run_doc or \
            'DEFAULT' not in self.run_doc['processor']:
        self.log.debug(
            "processor or DEFAUT tag not in run_doc, skip processing")
        return

    # Processing requires calibration inputs to be present in the run doc.
    processing_parameters = self.run_doc['processor']['DEFAULT']
    if 'gains' not in processing_parameters or \
            'drift_velocity_liquid' not in processing_parameters or \
            'electron_lifetime_liquid' not in processing_parameters:
        self.log.info(
            "gains or e-lifetime not in run_doc, skip processing")
        return

    thishost = config.get_hostname()

    # This task runs exclusively on midway-login1.
    if thishost != 'midway-login1':
        return

    # Only the currently installed pax version is considered.
    versions = ['v%s' % pax.__version__]

    have_processed, have_raw = self.local_data_finder(thishost, versions)

    # Skip if no raw data
    if not have_raw:
        self.log.debug("Skipping %s with no raw data",
                       self.run_doc['name'])
        return

    if self.run_doc['reader']['ini']['write_mode'] != 2:
        self.log.debug("write_mode != 2, skip processing")
        return

    # Get number of events in data set (not set for early runs <1000)
    events = self.run_doc.get('trigger', {}).get('events_built', 0)

    # Skip if 0 events in dataset
    if events == 0:
        self.log.debug("Skipping %s with 0 events", self.run_doc['name'])
        return

    # Specify number of cores for pax multiprocess
    if events < 1000:
        # Reduce to 1 CPU for small number of events (sometimes pax stalls
        # with too many CPU)
        ncpus = 1
    else:
        ncpus = config.NCPU - 1  # 4 based on Figure 2 here https://xecluster.lngs.infn.it/dokuwiki/doku.php?id=xenon:xenon1t:shockley:performance#automatic_processing
        # -1 for pax I/O worker

    # Process all specified versions
    for version in versions:
        pax_hash = "n/a"

        out_location = config.get_processing_dir(thishost, version)

        if have_processed[version]:
            self.log.debug("Skipping %s already processed with %s",
                           self.run_doc['name'], version)
            continue

        queue_list = qsub.get_queue(thishost)
        # Should check version here too
        if self.run_doc['name'] in queue_list:
            self.log.debug("Skipping %s currently in queue",
                           self.run_doc['name'])
            continue

        self.log.info(
            "Processing %s with pax_%s (%s) and %d cores, output to %s",
            self.run_doc['name'], version, pax_hash, ncpus, out_location)

        _process(self.run_doc['name'], have_raw['location'], thishost,
                 version, pax_hash, out_location, self.run_doc['detector'],
                 ncpus)
def each_run(self):
    """Set ownership and permissions for this run's local files/folders.

    Iterates the run's data entries, skipping any not hosted on this
    machine, and applies host-specific chmod/chown (and, on tegner,
    setfacl) according to the tables set up in ``__init__``.  The four
    near-identical host/type branches of the original are factored into
    private helpers; every command flag and path is preserved.
    """
    for data_doc in self.run_doc['data']:
        # Is not local, skip
        if 'host' not in data_doc or data_doc[
                'host'] != config.get_hostname():
            continue

        # extract path and data type
        f_path = data_doc['location']
        f_type = data_doc['type']

        # apply changes according to processed/raw and analysis facility
        if f_type == 'processed':
            self._log_permission_change(f_path)
            if config.get_hostname() == "midway-login1":
                self._chmod_chown(f_path, recursive=False)
            elif config.get_hostname() == "tegner-login-1":
                self._chmod_chown(f_path, recursive=False)
                self._apply_acl(
                    f_path,
                    '/cfs/klemming/projects/xenon/misc/basic_file')
            else:
                logging.info('Analysis facility does not match')
        elif f_type == 'raw':
            self._log_permission_change(f_path)
            if config.get_hostname() == "midway-login1":
                self._chmod_chown(f_path, recursive=True)
            elif config.get_hostname() == "tegner-login-1":
                # NOTE(review): unlike midway, raw data on tegner is
                # chmod/chown'd non-recursively (no -R) — preserved
                # as-is from the original; confirm this is intended.
                self._chmod_chown(f_path, recursive=False)
                self._apply_acl(
                    f_path,
                    '/cfs/klemming/projects/xenon/misc/basic')
            else:
                logging.info('Analysis facility does not match')
        else:
            logging.info("Nothing to change: Ownership/Permission")

def _log_permission_change(self, f_path):
    """Log the ownership/permission change about to be applied to f_path."""
    logging.info('Change ownership and permission for %s', f_path)
    logging.info('Change to username %s and group %s',
                 self.chown_user[self.hostname],
                 self.chown_group[self.hostname])
    logging.info('Set permission: %s', self.chmod[self.hostname])
    logging.info('Set ownership and permissions at %s',
                 config.get_hostname())

def _chmod_chown(self, f_path, recursive):
    """Run chmod and chown on f_path using this host's configured
    mode, user and group; pass -R to both commands when recursive."""
    chmod_cmd = ['chmod']
    chown_cmd = ['chown']
    if recursive:
        chmod_cmd.append('-R')
        chown_cmd.append('-R')
    chmod_cmd += [self.chmod[self.hostname], f_path]
    chown_cmd += [
        self.chown_user[self.hostname] + ":" +
        self.chown_group[self.hostname], f_path
    ]
    subprocess.call(chmod_cmd)
    subprocess.call(chown_cmd)

def _apply_acl(self, f_path, acl_file):
    """Recursively apply the ACL entries from acl_file to f_path
    (tegner/klemming only)."""
    subprocess.call(['setfacl', '-R', '-M', acl_file, f_path])
def each_location(self, data_doc):
    """Compute a SHA-512 checksum for one data entry and record it in the DB.

    Only entries with status 'verifying' that are hosted on this machine
    (or on midway-srm when running on midway-login1) are checksummed.
    On success the entry is set to status 'transferred' with its checksum;
    if the location is missing on disk it is set to status 'error' with a
    null checksum.
    """
    # Only data waiting to be verified
    if data_doc[
            'status'] != 'verifying':  # and data_doc['status'] != 'transferred':
        self.log.debug('Location ' + data_doc['host'] +
                       ' does not need to add new checksum')
        return

    # NOTE(review): dead code — the guard above already returned unless
    # status == 'verifying', so this 'transferred' check can never fire.
    if data_doc['status'] == 'transferred' and \
        (config.get_hostname() == 'xe1t-datamanager' or
         config.get_hostname() == 'login'):
        return

    # Data must be hosted somewhere
    if 'host' not in data_doc:
        return

    # Data must be here locally
    if data_doc['host'] != config.get_hostname():
        # Special case of midway-srm accessible via POSIX on midway-login1
        if not (data_doc['host'] == "midway-srm"
                and config.get_hostname() == "midway-login1"):
            self.log.debug('Location not here')
            return

    # This status is given after checksumming
    status = 'transferred'

    # Find file and perform checksum (directory tree hash for dirs,
    # single-file hash otherwise)
    if os.path.isdir(data_doc['location']):
        value = checksumdir.dirhash(data_doc['location'], 'sha512')
    elif os.path.isfile(data_doc['location']):
        # NOTE(review): checksumdir._filehash is a private helper of the
        # checksumdir package and may break on upgrade — consider hashing
        # directly with hashlib instead.
        value = checksumdir._filehash(data_doc['location'], hashlib.sha512)
    else:
        # Data not actually found
        self.log.error("Location %s not found." % data_doc['location'])
        value = None
        status = 'error'

    if config.DATABASE_LOG:
        if data_doc['status'] == 'verifying':
            # Always taken: status was checked to be 'verifying' at the
            # top of this method, so the elif branch below is currently
            # unreachable.
            self.log.info("Adding a checksum to run "
                          "%d %s" % (self.run_doc['number'],
                                     data_doc['type']))
            # $elemMatch pins the positional '$' operator to this exact
            # data entry within the run document's 'data' array.
            self.collection.update(
                {
                    '_id': self.run_doc['_id'],
                    'data': {
                        '$elemMatch': data_doc
                    }
                }, {
                    '$set': {
                        'data.$.status': status,
                        'data.$.checksum': value
                    }
                })
        elif data_doc['checksum'] != value or status == 'error':
            # NOTE(review): unreachable given the guard above; if revived,
            # flags a checksum mismatch without changing the entry status.
            self.log.info("Checksum fail "
                          "%d %s" % (self.run_doc['number'],
                                     data_doc['type']))
            self.collection.update(
                {
                    '_id': self.run_doc['_id'],
                    'data': {
                        '$elemMatch': data_doc
                    }
                }, {'$set': {
                    'data.$.checksumproblem': True
                }})