def checksum(path, hashfunc="md5"): """Return checksum of files given by path. Wildcards can be used in check sum. Function is strongly dependent on checksumdir package by 'cakepietoast'. :param path: path of files to get hash from :param hashfunc: function used to get hash, default 'md5' :return: (str) hash of the file/files given by path """ import checksumdir hash_func = checksumdir.HASH_FUNCS.get(hashfunc) if not hash_func: raise NotImplementedError("{} not implemented.".format(hashfunc)) if os.path.isdir(path): return checksumdir.dirhash(path, hashfunc=hashfunc) hashvalues = [] path_list = list(sorted(glob.glob(path))) logger.debug("path_list: len: %i", len(path_list)) if len(path_list) > 0: logger.debug("first ... last: %s ... %s", str(path_list[0]), str(path_list[-1])) for path in path_list: if os.path.isfile(path): hashvalues.append(checksumdir._filehash(path, hashfunc=hash_func)) logger.debug("one hash per file: len: %i", len(hashvalues)) if len(path_list) > 0: logger.debug("first ... last: %s ... %s", str(hashvalues[0]), str(hashvalues[-1])) checksum_hash = checksumdir._reduce_hash(hashvalues, hashfunc=hash_func) logger.debug("total hash: {}".format(str(checksum_hash))) return checksum_hash
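# A minimal usage sketch for checksum(); the paths below are hypothetical and
# the checksumdir package must be installed (pip install checksumdir).
if __name__ == "__main__":
    # Hash everything under a directory (delegates to checksumdir.dirhash).
    print(checksum("/tmp/data"))
    # Hash only the files matched by a wildcard; sorting keeps the result
    # stable across runs regardless of glob order.
    print(checksum("/tmp/data/run_*.bin"))
    # Any algorithm in checksumdir.HASH_FUNCS works (md5, sha1, sha256, sha512).
    print(checksum("/tmp/data", hashfunc="sha512"))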
def each_location(self, data_doc):
    # Only data waiting to be verified
    if data_doc['status'] != 'verifying':  # and data_doc['status'] != 'transferred':
        self.log.debug('Location ' + data_doc['host'] +
                       ' does not need to add new checksum')
        return

    # Note: unreachable unless the commented-out 'transferred' condition
    # above is restored, since the check above already returned otherwise.
    if data_doc['status'] == 'transferred' and \
            (config.get_hostname() == 'xe1t-datamanager' or
             config.get_hostname() == 'login'):
        return

    # Data must be hosted somewhere
    if 'host' not in data_doc:
        return

    # Data must be here locally
    if data_doc['host'] != config.get_hostname():
        # Special case of midway-srm accessible via POSIX on midway-login1
        if not (data_doc['host'] == "midway-srm" and
                config.get_hostname() == "midway-login1"):
            self.log.debug('Location not here')
            return

    # This status is given after checksumming
    status = 'transferred'

    # Find file and perform checksum
    if os.path.isdir(data_doc['location']):
        value = checksumdir.dirhash(data_doc['location'], 'sha512')
    elif os.path.isfile(data_doc['location']):
        value = checksumdir._filehash(data_doc['location'], hashlib.sha512)
    else:
        # Data not actually found
        self.log.error("Location %s not found." % data_doc['location'])
        value = None
        status = 'error'

    if config.DATABASE_LOG:
        if data_doc['status'] == 'verifying':
            self.log.info("Adding a checksum to run "
                          "%d %s" % (self.run_doc['number'],
                                     data_doc['type']))
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {'$elemMatch': data_doc}},
                                   {'$set': {'data.$.status': status,
                                             'data.$.checksum': value}})
        elif data_doc['checksum'] != value or status == 'error':
            self.log.info("Checksum fail "
                          "%d %s" % (self.run_doc['number'],
                                     data_doc['type']))
            self.collection.update({'_id': self.run_doc['_id'],
                                    'data': {'$elemMatch': data_doc}},
                                   {'$set': {'data.$.checksumproblem': True}})
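# Standalone sketch of the positional-update pattern used above: match one
# element of the run document's 'data' array with $elemMatch, then modify
# exactly that element through the positional operator 'data.$'. The run
# name, database, and collection names here are hypothetical placeholders.
import pymongo

client = pymongo.MongoClient()    # assumes a local MongoDB instance
runs = client['run']['runs_new']  # hypothetical database/collection names

runs.update_one(
    {'name': '161031_1200',       # hypothetical run name
     'data': {'$elemMatch': {'host': 'midway-login1', 'type': 'processed'}}},
    {'$set': {'data.$.status': 'transferred',
              'data.$.checksum': 'deadbeef'}})  # placeholder checksum value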
def _process(name, in_location, host, pax_version, pax_hash, out_location,
             detector='tpc', ncpus=1):
    """Called by another command."""
    print('Welcome to cax-process')

    # Import pax here so we can process the data
    from pax import core, parallel

    # Grab the run DB so we can query it
    collection = config.mongo_collection()

    if detector == 'muon_veto':
        output_fullname = out_location + '/' + name + '_MV'
    elif detector == 'tpc':
        output_fullname = out_location + '/' + name

    os.makedirs(out_location, exist_ok=True)

    # New data location
    datum = {'host': host,
             'type': 'processed',
             'pax_hash': pax_hash,
             'pax_version': pax_version,
             'status': 'transferring',
             'location': output_fullname + '.root',
             'checksum': None,
             'creation_time': datetime.datetime.utcnow(),
             'creation_place': host}

    # This query is used to find if this run has already been processed
    # the same way. If so, quit.
    query = {'name': name,
             'detector': detector,
             # This 'data' gets deleted later and is only used for checking
             'data': {'$elemMatch': {'host': host,
                                     'type': 'processed',
                                     'pax_version': pax_version}}}
    doc = collection.find_one(query)  # Query DB
    if doc is not None:
        print("Already processed %s. Clear first. %s" % (name, pax_version))
        return 1

    # Not processed this way already, so notify run DB we will
    doc = collection.find_one_and_update({'detector': detector, 'name': name},
                                         {'$push': {'data': datum}},
                                         return_document=ReturnDocument.AFTER)

    # Determine from the run DB which settings to use for processing.
    if doc['detector'] == 'muon_veto':
        pax_config = 'XENON1T_MV'
        decoder = 'BSON.DecodeZBSON'
    elif doc['detector'] == 'tpc':
        decoder = 'Pickle.DecodeZPickle'
        if doc['reader']['self_trigger']:
            pax_config = 'XENON1T'
        else:
            pax_config = 'XENON1T_LED'

    # Try to process data.
    try:
        print('processing', name, in_location, pax_config, ncpus)
        pax_kwargs = dict(config_names=pax_config,
                          config_dict={'pax': {'input_name': in_location,
                                               'output_name': output_fullname,
                                               'decoder_plugin': decoder},
                                       'DEFAULT': {'lock_breaking_timeout': 600},
                                       'Queues': {'event_block_size': 1,
                                                  'max_blocks_on_heap': 1000,
                                                  'timeout_after_sec': 600}})
        if ncpus > 1:
            parallel.multiprocess_locally(n_cpus=ncpus, **pax_kwargs)
        else:
            core.Processor(**pax_kwargs).run()
    except Exception:
        # Data processing failed.
        datum['status'] = 'error'
        if config.DATABASE_LOG:
            collection.update(query, {'$set': {'data.$': datum}})
        raise

    datum['status'] = 'verifying'
    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})

    datum['checksum'] = checksumdir._filehash(datum['location'],
                                              hashlib.sha512)
    if verify():
        datum['status'] = 'transferred'
    else:
        datum['status'] = 'failed'
    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})
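# Hypothetical invocation of _process(); the run name, host, version, hash,
# and paths are all placeholders, and pax must be installed for the import
# inside _process() to succeed.
_process(name='161031_1200',
         in_location='/data/xenon/raw/161031_1200',
         host='midway-login1',
         pax_version='v6.2.0',
         pax_hash='n/a',
         out_location='/data/xenon/processed/v6.2.0',
         detector='tpc',
         ncpus=4)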