Example #1
import glob
import logging
import os

logger = logging.getLogger(__name__)


def checksum(path, hashfunc="md5"):
    """Return the checksum of the file(s) given by path.

    Wildcards may be used in the path. This function relies heavily on the
    checksumdir package by 'cakepietoast'.

    :param path: path of the file(s) to hash; may contain wildcards
    :param hashfunc: name of the hash function to use, default 'md5'
    :return: (str) hash of the file(s) given by path
    """
    import checksumdir

    hash_func = checksumdir.HASH_FUNCS.get(hashfunc)
    if not hash_func:
        raise NotImplementedError("{} not implemented.".format(hashfunc))

    if os.path.isdir(path):
        return checksumdir.dirhash(path, hashfunc=hashfunc)

    hashvalues = []
    path_list = sorted(glob.glob(path))
    logger.debug("path_list: len: %i", len(path_list))
    if path_list:
        logger.debug("first ... last: %s ... %s", path_list[0], path_list[-1])

    for file_path in path_list:
        if os.path.isfile(file_path):
            hashvalues.append(checksumdir._filehash(file_path, hashfunc=hash_func))
    logger.debug("one hash per file: len: %i", len(hashvalues))
    if hashvalues:
        logger.debug("first ... last: %s ... %s", hashvalues[0], hashvalues[-1])
    checksum_hash = checksumdir._reduce_hash(hashvalues, hashfunc=hash_func)
    logger.debug("total hash: {}".format(str(checksum_hash)))
    return checksum_hash
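
A quick usage sketch, assuming the checksum() function above is importable; the temporary files exist only for the demonstration:

import os
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    for i in range(3):
        with open(os.path.join(tmp, "part%d.dat" % i), "wb") as f:
            f.write(b"payload %d" % i)

    # Hash everything matched by a wildcard ...
    print(checksum(os.path.join(tmp, "part*.dat")))
    # ... or hash a whole directory via checksumdir.dirhash().
    print(checksum(tmp))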
Example #2
import glob
import logging
import os

logger = logging.getLogger(__name__)


def checksum(path, hashfunc='md5'):
    """Return the checksum of the file(s) given by path.

    Wildcards may be used in the path. This function relies heavily on the
    checksumdir package by 'cakepietoast'.

    :param path: path of the file(s) to hash; may contain wildcards
    :param hashfunc: name of the hash function to use, default 'md5'
    :return: (str) hash of the file(s) given by path
    """
    import checksumdir
    hash_func = checksumdir.HASH_FUNCS.get(hashfunc)
    if not hash_func:
        raise NotImplementedError('{} not implemented.'.format(hashfunc))

    if os.path.isdir(path):
        return checksumdir.dirhash(path, hashfunc=hashfunc)

    hashvalues = []
    path_list = glob.glob(path)
    logger.debug("path_list: %s", path_list)
    for file_path in path_list:
        if os.path.isfile(file_path):
            hashvalues.append(checksumdir._filehash(file_path, hashfunc=hash_func))
    logger.debug("hashvalues: %s", hashvalues)
    checksum_hash = checksumdir._reduce_hash(hashvalues, hashfunc=hash_func)
    return checksum_hash
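
For orientation, the two private checksumdir helpers used above can be approximated with plain hashlib. This is an illustrative stand-in under assumed semantics (block-wise per-file digests, then a digest over the sorted per-file digests), not checksumdir's actual private implementation:

import hashlib

def filehash_sketch(path, hashfunc=hashlib.md5, blocksize=65536):
    # Digest one file in fixed-size blocks (stand-in for checksumdir._filehash).
    hasher = hashfunc()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hasher.update(block)
    return hasher.hexdigest()

def reduce_hash_sketch(hashvalues, hashfunc=hashlib.md5):
    # Fold per-file digests into a single digest (stand-in for
    # checksumdir._reduce_hash); sorting makes the result order-independent.
    hasher = hashfunc()
    for value in sorted(hashvalues):
        hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()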
Example #3
    def each_location(self, data_doc):
        # Only data waiting to be verified
        if data_doc['status'] != 'verifying':  # and data_doc['status'] != 'transferred':
            self.log.debug('Location %s does not need to add new checksum',
                           data_doc['host'])
            return

        # Note: only reachable if 'transferred' is re-enabled in the check above.
        if data_doc['status'] == 'transferred' and \
           config.get_hostname() in ('xe1t-datamanager', 'login'):
            return

        # Data must be hosted somewhere
        if 'host' not in data_doc:
            return

        # Data must be here locally
        if data_doc['host'] != config.get_hostname():

            # Special case of midway-srm accessible via POSIX on midway-login1
            if not (data_doc['host']  == "midway-srm" and config.get_hostname() == "midway-login1"):
                self.log.debug('Location not here')
                return

        # This status is given after checksumming
        status = 'transferred'

        # Find file and perform checksum
        if os.path.isdir(data_doc['location']):
            value = checksumdir.dirhash(data_doc['location'],
                                        'sha512')
        elif os.path.isfile(data_doc['location']):
            value = checksumdir._filehash(data_doc['location'],
                                          hashlib.sha512)
        else:
            # Data not actually found
            self.log.error("Location %s not found." % data_doc['location'])
            value = None
            status = 'error'

        if config.DATABASE_LOG:
            if data_doc['status'] == 'verifying':
                self.log.info("Adding a checksum to run "
                              "%d %s" % (self.run_doc['number'],
                                         data_doc['type']))
                self.collection.update({'_id' : self.run_doc['_id'],
                                        'data': {'$elemMatch': data_doc}},
                                       {'$set': {'data.$.status'  : status,
                                                 'data.$.checksum': value}})
            elif data_doc['checksum'] != value or status == 'error':
                self.log.info("Checksum fail "
                              "%d %s" % (self.run_doc['number'],
                                         data_doc['type']))
                self.collection.update({'_id' : self.run_doc['_id'],
                                        'data': {'$elemMatch': data_doc}},
                                       {'$set': {'data.$.checksumproblem': True}})
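
Both updates above combine an '$elemMatch' filter with MongoDB's positional '$' operator so that only the matched element of the 'data' array is modified. A minimal standalone sketch, assuming a local MongoDB test server and a hypothetical 'runs' collection (modern pymongo spells the call update_one):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)   # assumption: local test server
runs = client['test']['runs']              # hypothetical database/collection

run_id = runs.insert_one({
    'number': 42,
    'data': [{'host': 'midway-login1', 'status': 'verifying', 'checksum': None}],
}).inserted_id

# '$elemMatch' selects the run whose data array holds a matching element;
# 'data.$' then writes into exactly that element.
runs.update_one(
    {'_id': run_id, 'data': {'$elemMatch': {'host': 'midway-login1'}}},
    {'$set': {'data.$.status': 'transferred', 'data.$.checksum': 'abc123'}})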
Example #4
import datetime
import hashlib
import os

import checksumdir
from pymongo import ReturnDocument

# `config` and `verify()` are assumed to be provided by the surrounding
# cax code.


def _process(name, in_location, host, pax_version, pax_hash,
             out_location, detector='tpc', ncpus=1):
    """Called by another command.
    """
    print('Welcome to cax-process')

    # Import pax so we can process the data
    from pax import core, parallel

    # Grab the run DB so we can query it
    collection = config.mongo_collection()

    if detector == 'muon_veto':
        output_fullname = out_location + '/' + name + '_MV'
    elif detector == 'tpc':
        output_fullname = out_location + '/' + name
    else:
        raise ValueError("Unknown detector: %s" % detector)

    os.makedirs(out_location, exist_ok=True)

    # New data location
    datum = {'host'          : host,
             'type'          : 'processed',
             'pax_hash'      : pax_hash,
             'pax_version'   : pax_version,
             'status'        : 'transferring',
             'location'      : output_fullname + '.root',
             'checksum'      : None,
             'creation_time' : datetime.datetime.utcnow(),
             'creation_place': host}

    # This query checks whether this run has already been processed the same
    # way.  If so, quit.
    query = {'name'    : name,
             'detector' : detector,
             # This 'data' gets deleted later and only used for checking
             'data'    : {'$elemMatch': {'host'       : host,
                                         'type'       : 'processed',
                                         'pax_version': pax_version}}}
    doc = collection.find_one(query)  # Query DB
    if doc is not None:
        print("Already processed %s.  Clear first.  %s" % (name,
                                                           pax_version))
        return 1

    # Not processed this way already, so notify run DB we will
    doc = collection.find_one_and_update({'detector': detector, 'name': name},
                                         {'$push': {'data': datum}},
                                         return_document=ReturnDocument.AFTER)

    # Determine based on run DB what settings to use for processing.
    if doc['detector'] == 'muon_veto':
        pax_config = 'XENON1T_MV'
        decoder = 'BSON.DecodeZBSON'
    elif doc['detector'] == 'tpc':
        decoder = 'Pickle.DecodeZPickle'
        if doc['reader']['self_trigger']:
            pax_config = 'XENON1T'
        else:
            pax_config = 'XENON1T_LED'

    # Try to process data.
    try:
        print('processing', name, in_location, pax_config, ncpus)
        pax_kwargs = dict(config_names=pax_config,
                          config_dict={'pax': {'input_name' : in_location,
                                               'output_name': output_fullname,
                                               'decoder_plugin': decoder},
                                       'DEFAULT': {'lock_breaking_timeout': 600},
                                       'Queues': {'event_block_size': 1,
                                                  'max_blocks_on_heap': 1000,
                                                  'timeout_after_sec': 600}})
        if ncpus > 1:
            parallel.multiprocess_locally(n_cpus=ncpus, **pax_kwargs)
        else:
            core.Processor(**pax_kwargs).run()

    except Exception:
        # Data processing failed.
        datum['status'] = 'error'
        if config.DATABASE_LOG:
            collection.update(query, {'$set': {'data.$': datum}})
        raise

    datum['status'] = 'verifying'
    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})

    datum['checksum'] = checksumdir._filehash(datum['location'],
                                              hashlib.sha512)
    if verify():
        datum['status'] = 'transferred'
    else:
        datum['status'] = 'failed'

    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})
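
The register-then-process step above is worth calling out: find_one_and_update with ReturnDocument.AFTER appends the new datum and returns the run document as it looks after the push, in one round trip. A condensed sketch of the pattern, reusing the collection, query, datum, detector, and name from above:

from pymongo import ReturnDocument

# Skip runs already carrying a processed datum for this pax_version.
if collection.find_one(query) is not None:
    raise SystemExit("already processed")

# Atomically append the new datum and read back the updated run document.
doc = collection.find_one_and_update(
    {'detector': detector, 'name': name},
    {'$push': {'data': datum}},
    return_document=ReturnDocument.AFTER)

Note that the duplicate check and the push are two separate operations, so two concurrent workers could still race between them; a unique index or a guarded update would close that window.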
Example #5
import datetime
import hashlib
import os

import checksumdir
from pymongo import ReturnDocument

# `config` and `verify()` are assumed to be provided by the surrounding
# cax code.


def _process(name,
             in_location,
             host,
             pax_version,
             pax_hash,
             out_location,
             detector='tpc',
             ncpus=1):
    """Called by another command.
    """
    print('Welcome to cax-process')

    # Import pax so we can process the data
    from pax import core, parallel

    # Grab the run DB so we can query it
    collection = config.mongo_collection()

    if detector == 'muon_veto':
        output_fullname = out_location + '/' + name + '_MV'
    elif detector == 'tpc':
        output_fullname = out_location + '/' + name
    else:
        raise ValueError("Unknown detector: %s" % detector)

    os.makedirs(out_location, exist_ok=True)

    # New data location
    datum = {
        'host': host,
        'type': 'processed',
        'pax_hash': pax_hash,
        'pax_version': pax_version,
        'status': 'transferring',
        'location': output_fullname + '.root',
        'checksum': None,
        'creation_time': datetime.datetime.utcnow(),
        'creation_place': host
    }

    # This query checks whether this run has already been processed the same
    # way.  If so, quit.
    query = {
        'name': name,
        'detector': detector,
        # This 'data' gets deleted later and only used for checking
        'data': {
            '$elemMatch': {
                'host': host,
                'type': 'processed',
                'pax_version': pax_version
            }
        }
    }
    doc = collection.find_one(query)  # Query DB
    if doc is not None:
        print("Already processed %s.  Clear first.  %s" % (name, pax_version))
        return 1

    # Not processed this way already, so notify run DB we will
    doc = collection.find_one_and_update(
        {'detector': detector, 'name': name},
        {'$push': {'data': datum}},
        return_document=ReturnDocument.AFTER)

    # Determine based on run DB what settings to use for processing.
    if doc['detector'] == 'muon_veto':
        pax_config = 'XENON1T_MV'
        decoder = 'BSON.DecodeZBSON'
    elif doc['detector'] == 'tpc':
        decoder = 'Pickle.DecodeZPickle'
        if doc['reader']['self_trigger']:
            pax_config = 'XENON1T'
        else:
            pax_config = 'XENON1T_LED'

    # Try to process data.
    try:
        print('processing', name, in_location, pax_config, ncpus)
        pax_kwargs = dict(config_names=pax_config,
                          config_dict={
                              'pax': {
                                  'input_name': in_location,
                                  'output_name': output_fullname,
                                  'decoder_plugin': decoder
                              },
                              'DEFAULT': {
                                  'lock_breaking_timeout': 600
                              },
                              'Queues': {
                                  'event_block_size': 1,
                                  'max_blocks_on_heap': 1000,
                                  'timeout_after_sec': 600
                              }
                          })
        if ncpus > 1:
            parallel.multiprocess_locally(n_cpus=ncpus, **pax_kwargs)
        else:
            core.Processor(**pax_kwargs).run()

    except Exception:
        # Data processing failed.
        datum['status'] = 'error'
        if config.DATABASE_LOG:
            collection.update(query, {'$set': {'data.$': datum}})
        raise

    datum['status'] = 'verifying'
    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})

    datum['checksum'] = checksumdir._filehash(datum['location'],
                                              hashlib.sha512)
    if verify():
        datum['status'] = 'transferred'
    else:
        datum['status'] = 'failed'

    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})
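
The tail of _process sets a status and mirrors it to the run DB three times in a row; a small helper (hypothetical, not part of cax) factors out that pattern:

def _set_status(collection, query, datum, status):
    # Record the new status on the datum and, when database logging is
    # enabled, mirror it to the matched array element in the run DB.
    datum['status'] = status
    if config.DATABASE_LOG:
        collection.update(query, {'$set': {'data.$': datum}})

With it, the lifecycle reads as _set_status(collection, query, datum, 'verifying') followed by 'transferred' or 'failed' depending on verify().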
Example #6
    def each_location(self, data_doc):
        # Only data waiting to be verified
        if data_doc['status'] != 'verifying':  # and data_doc['status'] != 'transferred':
            self.log.debug('Location %s does not need to add new checksum',
                           data_doc['host'])
            return

        # Note: only reachable if 'transferred' is re-enabled in the check above.
        if data_doc['status'] == 'transferred' and \
           config.get_hostname() in ('xe1t-datamanager', 'login'):
            return

        # Data must be hosted somewhere
        if 'host' not in data_doc:
            return

        # Data must be here locally
        if data_doc['host'] != config.get_hostname():

            # Special case of midway-srm accessible via POSIX on midway-login1
            if not (data_doc['host'] == "midway-srm"
                    and config.get_hostname() == "midway-login1"):
                self.log.debug('Location not here')
                return

        # This status is given after checksumming
        status = 'transferred'

        # Find file and perform checksum
        if os.path.isdir(data_doc['location']):
            value = checksumdir.dirhash(data_doc['location'], 'sha512')
        elif os.path.isfile(data_doc['location']):
            value = checksumdir._filehash(data_doc['location'], hashlib.sha512)
        else:
            # Data not actually found
            self.log.error("Location %s not found." % data_doc['location'])
            value = None
            status = 'error'

        if config.DATABASE_LOG:
            if data_doc['status'] == 'verifying':
                self.log.info("Adding a checksum to run "
                              "%d %s" %
                              (self.run_doc['number'], data_doc['type']))
                self.collection.update(
                    {
                        '_id': self.run_doc['_id'],
                        'data': {
                            '$elemMatch': data_doc
                        }
                    }, {
                        '$set': {
                            'data.$.status': status,
                            'data.$.checksum': value
                        }
                    })
            elif data_doc['checksum'] != value or status == 'error':
                self.log.info("Checksum fail "
                              "%d %s" %
                              (self.run_doc['number'], data_doc['type']))
                self.collection.update(
                    {
                        '_id': self.run_doc['_id'],
                        'data': {
                            '$elemMatch': data_doc
                        }
                    }, {'$set': {
                        'data.$.checksumproblem': True
                    }})
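
The directory-versus-file branch above recurs wherever cax checksums a location; a hypothetical helper (not part of cax) isolates it:

import hashlib
import os

import checksumdir

def hash_location(location):
    # sha512 digest for a directory or a single file; None when the
    # location does not exist (mirrors the branch in each_location).
    if os.path.isdir(location):
        return checksumdir.dirhash(location, 'sha512')
    if os.path.isfile(location):
        return checksumdir._filehash(location, hashlib.sha512)
    return None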