Esempio n. 1
0
class RATaskSpecifiedOTDBEventMessageHandler(OTDBEventMessageHandler):
    def __init__(self, exchange=DEFAULT_BUSNAME, broker=DEFAULT_BROKER):
        """
        :param exchange: name of the exchange to listen on.
        :param broker: name of the broker to connect to.
        """
        super().__init__()
        self.otdbrpc = OTDBRPC.create(exchange=exchange, broker=broker)
        self.radbrpc = RADBRPC.create(exchange=exchange, broker=broker)
        self.momrpc = MoMQueryRPC.create(exchange=exchange, broker=broker)
        self.send_bus = ToBus(exchange=exchange, broker=broker)

    def start_handling(self):
        self.otdbrpc.open()
        self.momrpc.open()
        self.radbrpc.open()
        self.send_bus.open()

    def stop_handling(self):
        self.otdbrpc.close()
        self.momrpc.close()
        self.radbrpc.close()
        self.send_bus.close()

    # This is mainly to trigger the propagation of misc field values through read_from_mom
    # and then sending them to the RA to OTDB Service in the resource assigner.
    # Might need to be a separate service if we take on more mom-otdb-adapter function.
    def onObservationApproved(self, main_id, modification_time):
        self.createAndSendSpecifiedTask(main_id, 'approved')

    def onObservationPrescheduled(self, main_id, modification_time):
        self.createAndSendSpecifiedTask(main_id, 'prescheduled')

    def createAndSendSpecifiedTask(self, main_id, status):
        spec = Specification(self.otdbrpc, self.momrpc, self.radbrpc)
        spec.status = status
        spec.read_from_OTDB_with_predecessors(main_id, "otdb", {})
        spec.read_from_mom()
        spec.update_start_end_times()
        # spec.insert_into_radb() is still done in resource_assigner for now.
        result_tree = spec.as_dict()
        if spec.status == status:
            logger.info("Sending result: %s" % result_tree)
            # Put result on bus
            msg = EventMessage(subject=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_SUBJECT,
                               content=result_tree)
            with self.send_bus:
                self.send_bus.send(msg)
            logger.info("Result sent")
        else:
            logger.warning("Problem retrieving task %i" % main_id)
Esempio n. 2
0
class RATaskSpecified(OTDBBusListener):
    def __init__(self,
                   otdb_notification_busname=DEFAULT_OTDB_NOTIFICATION_BUSNAME,
                   otdb_notification_subject=DEFAULT_OTDB_NOTIFICATION_SUBJECT,
                   otdb_service_busname=DEFAULT_OTDB_SERVICE_BUSNAME,
                   otdb_service_subject=DEFAULT_OTDB_SERVICENAME,
                   notification_busname=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_BUSNAME,
                   notification_subject=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_SUBJECT,
                   broker=None, **kwargs):
        super(RATaskSpecified, self).__init__(busname=otdb_notification_busname, subject=otdb_notification_subject, **kwargs)
        self.otdbrpc  = OTDBRPC(busname=otdb_service_busname, servicename=otdb_service_subject, broker=broker) ## , ForwardExceptions=True hardcoded in RPCWrapper right now
        self.send_bus = ToBus("%s/%s" % (notification_busname, notification_subject))

    def start_listening(self, **kwargs):
        self.otdbrpc.open()
        self.send_bus.open()
        super(RATaskSpecified, self).start_listening(**kwargs)

    def stop_listening(self, **kwargs):
        super(RATaskSpecified, self).stop_listening(**kwargs)
        self.send_bus.close()
        self.otdbrpc.close()

    def get_predecessors(self, parset):
        """ Extract the list of predecessor obs IDs from the given parset. """

        key = PARSET_PREFIX + "Observation.Scheduler.predecessors"
        stringlist = PyParameterValue(str(parset[key]), True).getStringVector()

        # Key contains values starting with 'S' = Scheduler, 'L'/'T' = OTDB, 'M' = MoM
        # 'S' we can probably ignore? Might be only internal in the Scheduler?
        result = []
        for s in stringlist:
            try: # Made the source a string for readability, but it's not efficient
                if s.startswith('M'):
                    result.append({'source': 'mom', 'id': int(s[1:])})
                elif s.startswith('L') or s.startswith('T'):
                    result.append({'source': 'otdb', 'id': int(s[1:])})
                else: # 'S'
                    logger.info("found a predecessor ID I can't handle: %s" % s)
                    result.append({'source': 'other', 'id': int(s[1:])})
            except ValueError:
                logger.warning("found a predecessor ID that I can't parse %s" % s)
        return result

    def get_specification_with_predecessors(self, id, id_source, state, found_parsets):
        logger.info("Processing ID %s from %s" % (id, id_source))
        if id_source == "other":
            return None
        elif id_source == "mom":
            otdb_id = self.otdbrpc.taskGetIDs( mom_id=id )['otdb_id']
        elif id_source == "otdb":
            otdb_id = id
        else:
            logger.warning("Error in understanding id %s", id)
      
        logger.info("Processing OTDB ID %s", otdb_id)
        result = {"otdb_id": otdb_id, "predecessors": []} 
        if state:
            result["state"] = state # TODO should be status not state
        else:
            pass #otdbrpc.taskGetStatus not implemented and maybe not needed?
          
        if otdb_id in found_parsets:
            parset = found_parsets[otdb_id]
        else:
            parset = self.otdbrpc.taskGetSpecification( otdb_id=otdb_id )['specification']
            found_parsets[otdb_id] = parset
          
        logger.info("parset [%s]: %s" % (otdb_id, parset))
        result['specification'] = resourceIndicatorsFromParset(parset)
      
        key = PARSET_PREFIX + "Observation.processSubtype"
        result['task_type'], result['task_subtype'] = convertSchedulerProcessSubtype(parset.get(key, ""))

        logger.info("Processing predecessors")
        predecessor_ids = self.get_predecessors(parset)
        for id in predecessor_ids:
            predecessor_result = self.get_specification_with_predecessors(id['id'], id['source'], "", found_parsets)
            if predecessor_result:
                result["predecessors"].append(predecessor_result)
        return result

    def onObservationApproved(self, main_id, modificationTime):
        self.createAndSendSpecifiedTask(main_id, 'approved')

    def onObservationPrescheduled(self, main_id, modificationTime):
        self.createAndSendSpecifiedTask(main_id, 'prescheduled')

    def createAndSendSpecifiedTask(self, main_id, status):
        # Construct root node of tree
        resultTree = self.get_specification_with_predecessors(main_id, "otdb", status, {})
        logger.info("Sending result: %s" % resultTree)

        # Put result on bus
        msg = EventMessage(content=resultTree)
        self.send_bus.send(msg)
        logger.info("Result sent")
Esempio n. 3
0
class RADBPGListener(PostgresListener):
    def __init__(self,
                 busname=DEFAULT_NOTIFICATION_BUSNAME,
                 notification_prefix=DEFAULT_NOTIFICATION_PREFIX,
                 dbcreds=None,
                 broker=None):
        super(RADBPGListener, self).__init__(dbcreds.host, dbcreds.database,
                                             dbcreds.user, dbcreds.password)

        self.notification_prefix = notification_prefix
        self.event_bus = ToBus(busname, broker=broker)

        self.subscribe('task_update_with_task_view', self.onTaskUpdated)
        self.subscribe('task_insert_with_task_view', self.onTaskInserted)
        self.subscribe('task_delete', self.onTaskDeleted)

        # when the specification starttime and endtime are updated, then that effects the task as well
        # so subscribe to specification_update, and use task_view as view_for_row
        self.subscribe('specification_update_with_task_view',
                       self.onSpecificationUpdated)

        self.subscribe('resource_claim_update_with_resource_claim_view',
                       self.onResourceClaimUpdated)
        self.subscribe('resource_claim_insert_with_resource_claim_view',
                       self.onResourceClaimInserted)
        self.subscribe('resource_claim_delete', self.onResourceClaimDeleted)

        self.subscribe('resource_availability_update',
                       self.onResourceAvailabilityUpdated)
        self.subscribe('resource_capacity_update',
                       self.onResourceCapacityUpdated)

    def onTaskUpdated(self, payload=None):
        self._sendNotification('TaskUpdated', payload,
                               ['starttime', 'endtime'])

    def onTaskInserted(self, payload=None):
        self._sendNotification('TaskInserted', payload,
                               ['starttime', 'endtime'])

    def onTaskDeleted(self, payload=None):
        self._sendNotification('TaskDeleted', payload)

    def onSpecificationUpdated(self, payload=None):
        # when the specification starttime and endtime are updated, then that effects the task as well
        # so send a TaskUpdated notification
        self._sendNotification('TaskUpdated', payload,
                               ['starttime', 'endtime'])

    def onResourceClaimUpdated(self, payload=None):
        self._sendNotification('ResourceClaimUpdated', payload,
                               ['starttime', 'endtime'])

    def onResourceClaimInserted(self, payload=None):
        self._sendNotification('ResourceClaimInserted', payload,
                               ['starttime', 'endtime'])

    def onResourceClaimDeleted(self, payload=None):
        self._sendNotification('ResourceClaimDeleted', payload)

    def onResourceAvailabilityUpdated(self, payload=None):
        self._sendNotification('ResourceAvailabilityUpdated', payload)

    def onResourceCapacityUpdated(self, payload=None):
        self._sendNotification('ResourceCapacityUpdated', payload)

    def __enter__(self):
        super(RADBPGListener, self).__enter__()
        self.event_bus.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super(RADBPGListener, self).__exit__(exc_type, exc_val, exc_tb)
        self.event_bus.close()

    def _formatTimestampsAsIso(self, fields, contentDict):
        '''convert all requested fields in the contentDict to proper isoformat datetime strings.
        In postgres we use timestamps without timezone.
        By convention we only enter utc values.
        But, if they are json encoded by postgress, they are not properly formatted with the in isoformat with a 'Z' at the end.
        So, parse the requested fields, and return them as datetime.
        '''
        try:
            for state in ('old', 'new'):
                if state in contentDict:
                    for field in fields:
                        try:
                            if contentDict[state] and field in contentDict[
                                    state]:
                                timestampStr = contentDict[state][field]
                                formatStr = '%Y-%m-%dT%H:%M:%S' if 'T' in timestampStr else '%Y-%m-%d %H:%M:%S'
                                if timestampStr.rfind('.') > -1:
                                    formatStr += '.%f'

                                timestamp = datetime.strptime(
                                    timestampStr, formatStr)

                                contentDict[state][field] = timestamp
                        except Exception as e:
                            logger.error(
                                'Could not convert field \'%s\' to datetime: %s'
                                % (field, e))

            return contentDict
        except Exception as e:
            logger.error('Could not parse payload: %s\n%s' % (contentDict, e))

    def _sendNotification(self, subject, payload, timestampFields=None):
        try:
            content = json.loads(payload)

            if 'new' in content and content[
                    'new'] and 'old' in content and content['old']:
                # check if new and old are equal.
                # however, new and old can be based on different views,
                # so, only check the values for the keys they have in common
                new_keys = set(content['new'].keys())
                old_keys = set(content['old'].keys())
                common_keys = new_keys & old_keys
                equal_valued_keys = [
                    k for k in common_keys
                    if content['new'][k] == content['old'][k]
                ]
                if len(equal_valued_keys) == len(common_keys):
                    logger.info(
                        'new and old values are equal, not sending notification. %s'
                        % (content['new']))
                    return

            if timestampFields:
                content = self._formatTimestampsAsIso(timestampFields, content)
        except Exception as e:
            logger.error('Could not parse payload: %s\n%s' % (payload, e))
            content = None

        try:
            msg = EventMessage(context=self.notification_prefix + subject,
                               content=content)
            logger.info('Sending notification %s: %s' %
                        (subject, str(content).replace('\n', ' ')))
            self.event_bus.send(msg)
        except Exception as e:
            logger.error(str(e))
Esempio n. 4
0
class RATaskSpecified(OTDBBusListener):
    def __init__(
            self,
            otdb_notification_busname=DEFAULT_OTDB_NOTIFICATION_BUSNAME,
            otdb_notification_subject=DEFAULT_OTDB_NOTIFICATION_SUBJECT,
            otdb_service_busname=DEFAULT_OTDB_SERVICE_BUSNAME,
            otdb_service_subject=DEFAULT_OTDB_SERVICENAME,
            notification_busname=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_BUSNAME,
            notification_subject=DEFAULT_RA_TASK_SPECIFIED_NOTIFICATION_SUBJECT,
            broker=None,
            **kwargs):
        super(RATaskSpecified,
              self).__init__(busname=otdb_notification_busname,
                             subject=otdb_notification_subject,
                             **kwargs)
        self.otdbrpc = OTDBRPC(
            busname=otdb_service_busname,
            servicename=otdb_service_subject,
            broker=broker
        )  ## , ForwardExceptions=True hardcoded in RPCWrapper right now
        self.send_bus = ToBus("%s/%s" %
                              (notification_busname, notification_subject))

    def start_listening(self, **kwargs):
        self.otdbrpc.open()
        self.send_bus.open()
        super(RATaskSpecified, self).start_listening(**kwargs)

    def stop_listening(self, **kwargs):
        super(RATaskSpecified, self).stop_listening(**kwargs)
        self.send_bus.close()
        self.otdbrpc.close()

    def get_predecessors(self, parset):
        """ Extract the list of predecessor obs IDs from the given parset. """

        key = PARSET_PREFIX + "Observation.Scheduler.predecessors"
        stringlist = PyParameterValue(str(parset[key]), True).getStringVector()

        # Key contains values starting with 'S' = Scheduler, 'L'/'T' = OTDB, 'M' = MoM
        # 'S' we can probably ignore? Might be only internal in the Scheduler?
        result = []
        for s in stringlist:
            try:  # Made the source a string for readability, but it's not efficient
                if s.startswith('M'):
                    result.append({'source': 'mom', 'id': int(s[1:])})
                elif s.startswith('L') or s.startswith('T'):
                    result.append({'source': 'otdb', 'id': int(s[1:])})
                else:  # 'S'
                    logger.info("found a predecessor ID I can't handle: %s" %
                                s)
                    result.append({'source': 'other', 'id': int(s[1:])})
            except ValueError:
                logger.warning("found a predecessor ID that I can't parse %s" %
                               s)
        return result

    def get_specification_with_predecessors(self, id, id_source, state,
                                            found_parsets):
        logger.info("Processing ID %s from %s" % (id, id_source))
        if id_source == "other":
            return None
        elif id_source == "mom":
            otdb_id = self.otdbrpc.taskGetIDs(mom_id=id)['otdb_id']
        elif id_source == "otdb":
            otdb_id = id
        else:
            logger.warning("Error in understanding id %s", id)

        logger.info("Processing OTDB ID %s", otdb_id)
        result = {"otdb_id": otdb_id, "predecessors": []}
        if state:
            result["state"] = state  # TODO should be status not state
        else:
            pass  #otdbrpc.taskGetStatus not implemented and maybe not needed?

        if otdb_id in found_parsets:
            parset = found_parsets[otdb_id]
        else:
            parset = self.otdbrpc.taskGetSpecification(
                otdb_id=otdb_id)['specification']
            found_parsets[otdb_id] = parset

        logger.info("parset [%s]: %s" % (otdb_id, parset))
        result['specification'] = resourceIndicatorsFromParset(parset)

        key = PARSET_PREFIX + "Observation.processSubtype"
        result['task_type'], result[
            'task_subtype'] = convertSchedulerProcessSubtype(
                parset.get(key, ""))

        logger.info("Processing predecessors")
        predecessor_ids = self.get_predecessors(parset)
        for id in predecessor_ids:
            predecessor_result = self.get_specification_with_predecessors(
                id['id'], id['source'], "", found_parsets)
            if predecessor_result:
                result["predecessors"].append(predecessor_result)
        return result

    def onObservationApproved(self, main_id, modificationTime):
        self.createAndSendSpecifiedTask(main_id, 'approved')

    def onObservationPrescheduled(self, main_id, modificationTime):
        self.createAndSendSpecifiedTask(main_id, 'prescheduled')

    def createAndSendSpecifiedTask(self, main_id, status):
        # Construct root node of tree
        resultTree = self.get_specification_with_predecessors(
            main_id, "otdb", status, {})
        logger.info("Sending result: %s" % resultTree)

        # Put result on bus
        msg = EventMessage(content=resultTree)
        self.send_bus.send(msg)
        logger.info("Result sent")
Esempio n. 5
0
class RATaskSpecified(OTDBBusListener):
  def __init__(self, servicename, otdb_busname=None, my_busname=None, **kwargs):
    super(RATaskSpecified, self).__init__(busname=otdb_busname, subject="TaskStatus", **kwargs)

    self.parset_rpc = RPC(service="TaskSpecification", busname=otdb_busname)
    self.send_bus   = ToBus("%s/%s" % (my_busname, servicename))

  def start_listening(self, **kwargs):
    self.parset_rpc.open()
    self.send_bus.open()

    super(RATaskSpecified, self).start_listening(**kwargs)

  def stop_listening(self, **kwargs):
    super(RATaskSpecified, self).stop_listening(**kwargs)

    self.send_bus.close()
    self.parset_rpc.close()

  def onObservationPrescheduled(self, treeId, modificationTime):
    logger.info("Processing obs ID %s", treeId)

    # Request the parset
    main_obsID  = treeId
    main_parset,_ = self.parset_rpc( OtdbID=main_obsID )

    # Construct a dict of all the parsets we retrieved
    parsets = {}
    parsets[main_obsID] = main_parset

    logger.info("Processing predecessors")

    # Collect the initial set of predecessors
    request_obsIDs = set(predecessors(main_parset))

    logger.info("Processing %s", request_obsIDs)

    # Iterate recursively over all known predecessor obsIDs, and request their parsets
    while request_obsIDs:
        obsID = request_obsIDs.pop()

        if obsID in parsets:
            # Predecessor lists can overlap -- we already have this one
            continue

        logger.info("Fetching predecessor %s", obsID)

        # Request predecessor parset
        parsets[obsID],_ = self.parset_rpc( OtdbID=obsID )

        # Add the list of predecessors
        request_obsIDs = request_obsIDs.union(predecessors(parsets[obsID]))

    # Convert parsets to resource indicators
    logger.info("Extracting resource indicators")
    resourceIndicators = dict([(str(obsID), resourceIndicatorsFromParset(parset)) for (obsID,parset) in parsets.iteritems()])

    # Construct and send result message
    logger.info("Sending result")
    result = {
      "sasID": main_obsID,
      "state": "prescheduled",
      "time_of_change": modificationTime,
      "resource_indicators": resourceIndicators,
    }

    # Put result on bus
    msg = EventMessage(content=result)
    self.send_bus.send(msg)

    logger.info("Result sent")
Esempio n. 6
0
class RATaskSpecified(OTDBBusListener):
    def __init__(self,
                 servicename,
                 otdb_busname=None,
                 my_busname=None,
                 **kwargs):
        super(RATaskSpecified, self).__init__(busname=otdb_busname,
                                              subject="TaskStatus",
                                              **kwargs)

        self.parset_rpc = RPC(service="TaskSpecification",
                              busname=otdb_busname)
        self.send_bus = ToBus("%s/%s" % (my_busname, servicename))

    def start_listening(self, **kwargs):
        self.parset_rpc.open()
        self.send_bus.open()

        super(RATaskSpecified, self).start_listening(**kwargs)

    def stop_listening(self, **kwargs):
        super(RATaskSpecified, self).stop_listening(**kwargs)

        self.send_bus.close()
        self.parset_rpc.close()

    def onObservationPrescheduled(self, treeId, modificationTime):
        logger.info("Processing obs ID %s", treeId)

        # Request the parset
        main_obsID = treeId
        main_parset, _ = self.parset_rpc(OtdbID=main_obsID)

        # Construct a dict of all the parsets we retrieved
        parsets = {}
        parsets[main_obsID] = main_parset

        logger.info("Processing predecessors")

        # Collect the initial set of predecessors
        request_obsIDs = set(predecessors(main_parset))

        logger.info("Processing %s", request_obsIDs)

        # Iterate recursively over all known predecessor obsIDs, and request their parsets
        while request_obsIDs:
            obsID = request_obsIDs.pop()

            if obsID in parsets:
                # Predecessor lists can overlap -- we already have this one
                continue

            logger.info("Fetching predecessor %s", obsID)

            # Request predecessor parset
            parsets[obsID], _ = self.parset_rpc(OtdbID=obsID)

            # Add the list of predecessors
            request_obsIDs = request_obsIDs.union(predecessors(parsets[obsID]))

        # Convert parsets to resource indicators
        logger.info("Extracting resource indicators")
        resourceIndicators = dict([(str(obsID),
                                    resourceIndicatorsFromParset(parset))
                                   for (obsID, parset) in parsets.iteritems()])

        # Construct and send result message
        logger.info("Sending result")
        result = {
            "sasID": main_obsID,
            "state": "prescheduled",
            "time_of_change": modificationTime,
            "resource_indicators": resourceIndicators,
        }

        # Put result on bus
        msg = EventMessage(content=result)
        self.send_bus.send(msg)

        logger.info("Result sent")
Esempio n. 7
0
class IngestTransferServer:
    def __init__(self,
                 exchange=DEFAULT_BUSNAME,
                 mom_credentials=None,
                 lta_credentials=None,
                 user=None,
                 broker=None,
                 max_nr_of_parallel_jobs=MAX_NR_OF_JOBS):
        self.user = user
        if not self.user:
            self.user = getpass.getuser()

        self.mom_credentials = mom_credentials
        self.lta_credentials = lta_credentials
        self.event_bus = ToBus(exchange=exchange, broker=broker)
        self.max_nr_of_parallel_jobs = max_nr_of_parallel_jobs

        self.__running_jobs = {}
        self.__lock = Lock()
        self.__prev_bytes_sent = _getBytesSent()
        self.__prev_bytes_sent_timestamp = datetime.utcnow()
        self.__prev_used_bandwidth = 0.0
        self.__running_jobs_log_timestamp = datetime.utcnow()

    def start_job(self, job_dict):
        if not self.enoughResourcesAvailable():
            raise ResourceWarning(
                "Not enough resources available to start new job: %s" %
                job_dict)

        job_id = job_dict['JobId']

        if job_id in self.__running_jobs:
            raise ValueError(
                'job %s is already running. Discarding this new job copy, and keeping the current one running...'
                % job_id)

        def threaded_pipeline_func(job):
            logger.info('starting job %s in the background', job_id)
            with LTAClient(self.lta_credentials.user, self.lta_credentials.password) as ltaClient, \
                 MoMClient(self.mom_credentials.user, self.mom_credentials.password) as momClient:
                jobPipeline = IngestPipeline(job,
                                             momClient,
                                             ltaClient,
                                             exchange=self.event_bus.exchange,
                                             broker=self.event_bus.broker,
                                             user=self.user)
                with self.__lock:
                    self.__running_jobs[job_id]['pipeline'] = jobPipeline

                jobPipeline.run()

        with self.__lock:
            thread = Thread(target=threaded_pipeline_func,
                            args=[job_dict],
                            name="transfer_thread_%s" % (job_id, ))
            thread.daemon = True
            self.__running_jobs[job_id] = {'thread': thread}
            thread.start()

    def __clearFinishedJobs(self):
        try:
            with self.__lock:
                finished_job_ids = [
                    job_id for job_id, job_thread_dict in list(
                        self.__running_jobs.items())
                    if not job_thread_dict['thread'].is_alive()
                ]

                for job_id in finished_job_ids:
                    logger.info('removing finished job %s', job_id)
                    del self.__running_jobs[job_id]
        except Exception as e:
            logger.error('__clearFinishedJobs: %s', e)

    def enoughResourcesAvailable(self):
        try:
            now = datetime.utcnow()
            bytes_sent = _getBytesSent()

            if bytes_sent >= self.__prev_bytes_sent:  # bytes_sent might wrap around zero
                # compute current speed in Gbps
                speed = 8 * (bytes_sent - self.__prev_bytes_sent
                             ) / totalSeconds(now -
                                              self.__prev_bytes_sent_timestamp)

                # running average for used_bandwidth
                used_bandwidth = 0.5 * speed + 0.5 * self.__prev_used_bandwidth

                logger.debug("resources: current used_bandwidth = %s",
                             humanreadablesize(used_bandwidth, 'bps'))

                # store for next iteration
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now
                self.__prev_used_bandwidth = used_bandwidth

                # only start new jobs if we have some bandwith available
                # note that this is a 'soft' limit.
                # we cannot control the actual bandwith used by the running transfers
                # we can only not start new jobs if we already exceed the MAX_USED_BANDWITH_TO_START_NEW_JOBS
                if used_bandwidth > MAX_USED_BANDWITH_TO_START_NEW_JOBS:
                    logger.warning(
                        'resources: not enough bandwith available to start new jobs, using %s, max %s'
                        % (humanreadablesize(used_bandwidth, 'bps'),
                           humanreadablesize(
                               MAX_USED_BANDWITH_TO_START_NEW_JOBS, 'bps')))
                    return False
            else:
                # wrapped around 0, just store for next iteration, do not compute anything
                self.__prev_bytes_sent = bytes_sent
                self.__prev_bytes_sent_timestamp = now

            # only start new jobs if we have some cpu time available
            idle_cpu_percentage = psutil.cpu_times_percent().idle
            logger.debug("resources: current idle_cpu_percentage = %s%%",
                         idle_cpu_percentage)
            if idle_cpu_percentage < 5:
                logger.warning(
                    'resources: not enough cpu power available to start new jobs, cpu_idle %s%%'
                    % idle_cpu_percentage)
                return False

            # only start new jobs if system load is not too high
            short_load_avg = os.getloadavg()[0]
            cpu_count = psutil.cpu_count()
            allowed_load = 1.5 * cpu_count
            logger.debug(
                "resources: current short term load = %s #cpu's = %s allowed_load = %s",
                short_load_avg, cpu_count, allowed_load)
            if short_load_avg > allowed_load:
                logger.warning(
                    'resources: system load too high (%s > %s), cannot start new jobs'
                    % (short_load_avg, allowed_load))
                return False

            # only allow 1 job at the time if swapping
            swap_memory_percentage = psutil.swap_memory().percent
            logger.debug("resources: current swap_memory_percentage = %s%%",
                         swap_memory_percentage)
            if swap_memory_percentage > 5 and len(self.__running_jobs) > 0:
                logger.warning(
                    'resources: system swapping. not enough memory available to start new jobs'
                )
                return False

            # only start new jobs if number of processes is not too high
            try:
                current_user = getpass.getuser()
                current_user_procs = [
                    p for p in psutil.process_iter()
                    if p.username() == current_user
                ]
                current_num_user_procs = len(current_user_procs)
                allowed_num_user_procs = 64 * cpu_count

                logger.debug(
                    "resources: current num_user_procs = %s allowed_num_user_procs = %s",
                    current_num_user_procs, allowed_num_user_procs)

                if current_num_user_procs > allowed_num_user_procs:
                    logger.warning(
                        'resources: number of processes by %s too high (%s > %s), cannot start new jobs'
                        % (current_user, current_num_user_procs,
                           allowed_num_user_procs))
                    return False
            except Exception as e:
                logger.exception(e)
                pass

            # limit total number of parallel transferring jobs to self.max_nr_of_parallel_jobs
            with self.__lock:
                starting_threads = [
                    job_thread_dict['thread']
                    for job_thread_dict in list(self.__running_jobs.values())
                    if 'pipeline' not in job_thread_dict
                ]
                pipelines = [
                    job_thread_dict['pipeline']
                    for job_thread_dict in list(self.__running_jobs.values())
                    if 'pipeline' in job_thread_dict
                ]
                initializing_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_INITIALIZING
                ]
                transferring_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_TRANSFERRING
                ]
                finalizing_pipelines = [
                    pipeline for pipeline in pipelines
                    if pipeline.status == IngestPipeline.STATUS_FINALIZING
                ]

                num_busy_transfers = len(starting_threads) + len(
                    initializing_pipelines) + len(transferring_pipelines)
                num_finalizing_transfers = len(finalizing_pipelines)

                logger.debug(
                    "resources: current num_busy_transfers = %s num_finalizing_transfers = %s max_nr_of_parallel_jobs = %s",
                    num_busy_transfers, num_finalizing_transfers,
                    self.max_nr_of_parallel_jobs)

                if num_busy_transfers >= self.max_nr_of_parallel_jobs:
                    logger.warning(
                        'resources: already running %d parallel jobs (#starting=%d, #transferring=%d) limiting the total number of transferring jobs to %d'
                        % (len(self.__running_jobs),
                           len(initializing_pipelines) + len(starting_threads),
                           len(transferring_pipelines),
                           self.max_nr_of_parallel_jobs))
                    return False

                if num_finalizing_transfers >= 2 * self.max_nr_of_parallel_jobs:
                    logger.warning(
                        'resources: already waiting for %d jobs to finish (updating status/SIP to MoM and LTA). not starting new jobs until some jobs finished...'
                        % (len(finalizing_pipelines), ))
                    return False

        except Exception as e:
            logger.exception(
                "error while checking for available resources: %s", e)

            num_running_jobs = len(self.__running_jobs)
            if num_running_jobs <= 4:
                logger.info("running %d jobs, assuming we can run 1 more: ",
                            num_running_jobs)
                return True
            else:
                logger.warning(
                    "already running %d jobs, assuming for safety we cannot run more jobs...",
                    num_running_jobs)
                return False

        return True

    def run(self):
        with self.event_bus:
            while True:
                try:
                    self.__clearFinishedJobs()

                    with self.__lock:
                        starting_threads = [
                            job_thread_dict['thread'] for job_thread_dict in
                            list(self.__running_jobs.values())
                            if 'pipeline' not in job_thread_dict
                        ]
                        pipelines = [
                            job_thread_dict['pipeline'] for job_thread_dict in
                            list(self.__running_jobs.values())
                            if 'pipeline' in job_thread_dict
                        ]
                        initializing_pipelines = [
                            pipeline for pipeline in pipelines
                            if pipeline.status ==
                            IngestPipeline.STATUS_INITIALIZING
                        ]
                        transferring_pipelines = [
                            pipeline for pipeline in pipelines
                            if pipeline.status ==
                            IngestPipeline.STATUS_TRANSFERRING
                        ]
                        finalizing_pipelines = [
                            pipeline for pipeline in pipelines if
                            pipeline.status == IngestPipeline.STATUS_FINALIZING
                        ]
                        finished_pipelines = [
                            pipeline for pipeline in pipelines if
                            pipeline.status == IngestPipeline.STATUS_FINISHED
                        ]
                        log_interval = 5 if self.__running_jobs else 60

                    if datetime.utcnow(
                    ) - self.__running_jobs_log_timestamp > timedelta(
                            seconds=log_interval):
                        status_log_line = "status: running %s jobs: #starting=%d, #transferring=%d, #finalizing=%d, #finished=%d, bandwith used on network interface(s) %s %s (%s), load=%.1f" % (
                            len(self.__running_jobs),
                            len(initializing_pipelines) +
                            len(starting_threads), len(transferring_pipelines),
                            len(finalizing_pipelines), len(finished_pipelines),
                            NET_IF_TO_MONITOR,
                            humanreadablesize(self.__prev_used_bandwidth,
                                              'bps'),
                            humanreadablesize(self.__prev_used_bandwidth / 8,
                                              'Bps'), os.getloadavg()[0])

                        logger.info(status_log_line)
                        self.__running_jobs_log_timestamp = datetime.utcnow()

                        msg = EventMessage(subject="%s.%s" %
                                           (INGEST_NOTIFICATION_PREFIX,
                                            'TransferServiceStatus'),
                                           content={
                                               'ingest_server':
                                               socket.gethostname(),
                                               'message':
                                               status_log_line
                                           })
                        msg.ttl = 3600  # remove message from queue's when not picked up within 1 hours
                        self.event_bus.send(msg)

                    time.sleep(5)
                except KeyboardInterrupt:
                    break
                except Exception as e:
                    logger.error(e)
Esempio n. 8
0
class CacheManager:
    def __init__(self,
                 cache_path='.du_cache.py',
                 mountpoint=CEP4_DATA_MOUNTPOINT,
                 exchange=DEFAULT_BUSNAME,
                 broker=DEFAULT_BROKER):

        self._cache_path = cache_path

        self.otdb_listener = OTDBBusListener(
            _CacheManagerOTDBEventMessageHandler,
            handler_kwargs={'cache_manager': self},
            exchange=exchange,
            broker=broker,
            num_threads=1)

        self.dm_listener = DataManagementBusListener(
            _CacheManagerDataManagementEventMessageHandler,
            handler_kwargs={'cache_manager': self},
            exchange=exchange,
            broker=broker,
            num_threads=1)

        self.event_bus = ToBus(exchange=exchange, broker=broker)

        self._updateCacheThread = None
        self._running = False

        self._cacheLock = RLock()

        self._cache = {'path_du_results': {}, 'otdb_id2path': {}}
        self._last_cache_write_timestamp = datetime.datetime(1970, 1, 1)
        self._readCacheFromDisk()

        # dict to hold threading Events per path to prevent expensive multiple du calls for each path
        self._du_threading_events = {}
        self._du_threading_events_lock = RLock()

        self.disk_usage = DiskUsage(mountpoint=mountpoint,
                                    exchange=exchange,
                                    broker=broker)

    def _sendDiskUsageChangedNotification(self,
                                          path,
                                          disk_usage,
                                          otdb_id=None):
        try:
            msg = EventMessage(subject='%s.DiskUsageChanged' %
                               DEFAULT_DM_NOTIFICATION_PREFIX,
                               content={
                                   'path':
                                   path,
                                   'disk_usage':
                                   disk_usage,
                                   'disk_usage_readable':
                                   humanreadablesize(disk_usage),
                                   'otdb_id':
                                   otdb_id
                               })
            logger.info('Sending notification with subject %s to %s: %s',
                        msg.subject, self.event_bus.exchange, msg.content)
            self.event_bus.send(msg)
        except Exception as e:
            logger.error(str(e))

    def _readCacheFromDisk(self):
        # maybe this cache on disk is slow, if so, revert to proper db solution
        try:
            if os.path.exists(self._cache_path):
                with open(self._cache_path, 'r') as file:
                    cache_from_disk = eval(file.read().strip())  #slow!
                    with self._cacheLock:
                        self._cache = cache_from_disk
                        if not isinstance(self._cache, dict):
                            self._cache = {
                                'path_du_results': {},
                                'otdb_id2path': {}
                            }
                        if 'path_du_results' not in self._cache:
                            self._cache['path_du_results'] = {}
                        if 'otdb_id2path' not in self._cache:
                            self._cache['otdb_id2path'] = {}
        except Exception as e:
            logger.error("Error while reading in du cache: %s", e)
            with self._cacheLock:
                self._cache = {'path_du_results': {}, 'otdb_id2path': {}}

    def _writeCacheToDisk(self):
        try:
            # only persist (a subset of) the cache to disk every once in a while.
            if datetime.datetime.utcnow(
            ) - self._last_cache_write_timestamp > datetime.timedelta(
                    minutes=5):
                cache_str = ''
                with self._cacheLock:
                    # Take a subset of the entire cache
                    # only the path_du_results for paths at project level (like /data/projects, /data/projects/LC9_001)
                    # Do not store path_du_results for deeper levels on disk, because that makes the disk read/write too slow,
                    # and the deeper levels can be obtained via rhb-du calls quite fast anyway.
                    # Furthermore, once a deeper level du results is stored in the memory cache, then it is also available for fast lookup.
                    # We just don't store these deep levels on disk.
                    sub_cache = {
                        path: du_result
                        for path, du_result in list(
                            self._cache['path_du_results'].items())
                        if self.getDepthToProjectsDir(path) <= 1
                        and du_result.get('found')
                    }
                    cache_str = str(sub_cache)

                tmp_path = '/tmp/tmp_storagequery_cache.py'
                with open(tmp_path, 'w') as file:
                    file.write(cache_str)
                dir_path = os.path.dirname(self._cache_path)
                if dir_path:
                    os.makedirs(dir_path, exist_ok=True)
                shutil.move(tmp_path, self._cache_path)
                self._last_cache_write_timestamp = datetime.datetime.utcnow()
        except Exception as e:
            logger.error("Error while writing du cache: %s", e)

    def _updateCache(self, du_result, send_notification=True):
        if not 'path' in du_result:
            return

        path = du_result['path']
        otdb_id = du_result.get('otdb_id')

        with self._cacheLock:
            path_cache = self._cache['path_du_results']
            otdb_id2path_cache = self._cache['otdb_id2path']

            if otdb_id is None:
                # try to look up the otdb in the path cache
                if path in path_cache:
                    otdb_id = du_result.get('otdb_id')

                # if still None, try to get the id from the path
                if otdb_id is None:
                    otdb_id = getOTDBIdFromPath(path)

            if not path in path_cache or path_cache[path][
                    'disk_usage'] != du_result['disk_usage']:
                # update the cache entry, even when no du result found,
                # cause that will save disk queries next time.
                logger.info('updating cache entry: %s', du_result)
                path_cache[path] = du_result

            if otdb_id != None:
                otdb_id2path_cache[otdb_id] = path

            if not du_result['found']:
                # even when the du for the path is not found,
                # keep a copy in the cache for fast lookup by clients
                # Make sure the size is 0
                du_result['disk_usage'] = 0
                du_result['disk_usage_readable'] = humanreadablesize(0)

            path_cache[path]['cache_timestamp'] = datetime.datetime.utcnow()
            path_cache[path]['needs_update'] = False

        self._writeCacheToDisk()

        if send_notification:
            self._sendDiskUsageChangedNotification(path,
                                                   du_result['disk_usage'],
                                                   otdb_id)

    def _invalidateCacheEntryForPath(self, path):
        with self._cacheLock:
            path_cache = self._cache['path_du_results']
            if path in path_cache:
                path_cache[path]['needs_update'] = True

    def getOtdbIdsFoundOnDisk(self):
        with self._cacheLock:
            otdb_id2path_cache = self._cache['otdb_id2path']
            return sorted(list(otdb_id2path_cache.keys()))

    def getDiskUsagesForAllOtdbIds(self, force_update=False):
        otdb_ids = self.getOtdbIdsFoundOnDisk()

        result = {}
        for otdb_id in otdb_ids:
            result[otdb_id] = self.getDiskUsageForOTDBId(
                otdb_id, force_update=force_update)

        return result

    def getDepthToProjectsDir(self, path):
        return len(
            path.replace(self.disk_usage.path_resolver.projects_path,
                         '').strip('/').split('/'))

    def _scanProjectsTree(self):
        try:

            def addSubDirectoriesToCache(directory):
                if not self._running:
                    return

                depth = self.getDepthToProjectsDir(directory)
                MAX_SCAN_DEPTH = 2
                #depth=0 : projects
                #depth=1 : projects/<project>
                #depth=2 : projects/<project>/<obs>
                #depth=3 : projects/<project>/<obs>/<sub_dir>
                if depth > MAX_SCAN_DEPTH:
                    return

                if depth < MAX_SCAN_DEPTH:
                    logger.info('tree scan: scanning \'%s\'', directory)
                    sd_result = self.disk_usage.path_resolver.getSubDirectories(
                        directory)

                    if sd_result['found']:
                        subdir_paths = [
                            os.path.join(directory, sd)
                            for sd in sd_result['sub_directories']
                        ]

                        for subdir_path in subdir_paths:
                            # recurse
                            addSubDirectoriesToCache(subdir_path)

                with self._cacheLock:
                    path_cache = self._cache['path_du_results']
                    add_empty_du_result_to_cache = not directory in path_cache

                if add_empty_du_result_to_cache:
                    logger.info(
                        'tree scan: adding \'%s\' with empty disk_usage to cache which will be du\'ed later',
                        directory)
                    empty_du_result = {
                        'found': True,
                        'disk_usage': None,
                        'path': directory,
                        'name': directory.split('/')[-1]
                    }
                    self._updateCache(empty_du_result, send_notification=False)

                with self._cacheLock:
                    path_cache = self._cache['path_du_results']
                    if directory in path_cache:
                        # mark cache entry for directory to be updated
                        path_cache[directory]['needs_update'] = True

            addSubDirectoriesToCache(
                self.disk_usage.path_resolver.projects_path)
            logger.info('tree scan complete')

        except Exception as e:
            logger.exception(str(e))

    def _updateOldEntriesInCache(self):
        logger.info('starting updating old cache entries')
        while self._running:
            try:
                now = datetime.datetime.utcnow()
                with self._cacheLock:
                    path_cache = self._cache['path_du_results']
                    old_entries = [
                        cache_entry
                        for cache_entry in list(path_cache.values()) if now -
                        cache_entry['cache_timestamp'] > MAX_CACHE_ENTRY_AGE
                    ]
                    needs_update_entries = [
                        cache_entry
                        for cache_entry in list(path_cache.values())
                        if cache_entry.get('needs_update', False)
                    ]

                updateable_entries = old_entries + needs_update_entries

                logger.info(
                    '%s old cache entries need to be updated, #age:%s #needs_update:%s',
                    len(updateable_entries), len(old_entries),
                    len(needs_update_entries))

                if updateable_entries:
                    # sort them oldest to newest, 'needs_update' paths first
                    def compareFunc(entry1, entry2):
                        if entry1.get('needs_update'
                                      ) and not entry2.get('needs_update'):
                            return -1
                        if not entry1.get('needs_update') and entry2.get(
                                'needs_update'):
                            return 1

                        #depth1 = self.getDepthToProjectsDir(entry1['path'])
                        #depth2 = self.getDepthToProjectsDir(entry2['path'])

                        #if depth1 != depth2:
                        ## lower level dirs are sorted in front of higher level dirs
                        #return depth1 - depth2

                        if entry1['cache_timestamp'] < entry2[
                                'cache_timestamp']:
                            return -1
                        if entry1['cache_timestamp'] > entry2[
                                'cache_timestamp']:
                            return 1
                        return 0

                    updateable_entries = sorted(updateable_entries,
                                                key=cmp_to_key(compareFunc))

                    cacheUpdateStart = datetime.datetime.utcnow()

                    for i, cache_entry in enumerate(updateable_entries):
                        try:
                            # it might be that the cache_entry was already updated via another way
                            # so only update it if still to old or needs_update
                            now = datetime.datetime.utcnow()
                            if now - cache_entry[
                                    'cache_timestamp'] > MAX_CACHE_ENTRY_AGE or cache_entry.get(
                                        'needs_update', False):
                                path = cache_entry.get('path')
                                if path:
                                    logger.info(
                                        '_updateOldEntriesInCache: examining entry %s/%s. timestamp:%s age:%s needs_update:%s path: \'%s\'',
                                        i, len(updateable_entries),
                                        cache_entry['cache_timestamp'],
                                        format_timedelta(
                                            now -
                                            cache_entry['cache_timestamp']),
                                        cache_entry.get('needs_update',
                                                        False), path)

                                    #du a full update from disk, which might be (really) slow.
                                    result = du_getDiskUsageForPath(path)
                                    logger.debug(
                                        'trying to update old entry in cache: %s',
                                        result)
                                    self._updateCache(result)
                        except Exception as e:
                            logger.error(str(e))

                        if not self._running:
                            logger.info('exiting _updateCacheThread')
                            return

                        if datetime.datetime.utcnow(
                        ) - cacheUpdateStart > datetime.timedelta(minutes=10):
                            # break out of cache update loop if full update takes more than 1min
                            # next loop we'll start with the oldest cache entries again
                            logger.info(
                                'skipping remaining %s old cache entries updates, they will be updated next time',
                                len(updateable_entries) - i)
                            break

                #update the CEP4 capacities in the RADB once in a while...
                self._updateCEP4CapacitiesInRADB()

                #sleep for a while, (or stop if requested)
                for i in range(60):
                    sleep(1)
                    if not self._running:
                        logger.info('exiting _updateCacheThread')
                        return

            except Exception as e:
                logger.exception(str(e))

    def _updateCEP4CapacitiesInRADB(self):
        try:
            df_result = self.disk_usage.getDiskFreeSpace()
            if df_result['found']:
                #get the total used space, and update the resource availability in the radb
                radbrpc = self.disk_usage.path_resolver.radbrpc
                storage_resources = radbrpc.getResources(
                    resource_types='storage', include_availability=True)
                cep4_storage_resource = next(x for x in storage_resources
                                             if 'CEP4' in x['name'])

                total_capacity = df_result.get('disk_size')
                used_capacity = df_result.get('disk_usage')
                available_capacity = df_result.get('disk_free')

                logger.info(
                    'updating capacities for resource \'%s\' (id=%s) in the RADB: total=%s, used=%s, available=%s',
                    cep4_storage_resource['name'], cep4_storage_resource['id'],
                    humanreadablesize(total_capacity),
                    humanreadablesize(used_capacity),
                    humanreadablesize(available_capacity))

                radbrpc.updateResourceAvailability(
                    cep4_storage_resource['id'],
                    available_capacity=available_capacity,
                    total_capacity=total_capacity)
        except Exception as e:
            logger.error('_updateCEP4CapacitiesInRADB: %s', e)

    def open(self):
        logger.info("opening storagequeryservice cache...")
        self._running = True

        self.disk_usage.open()
        self.event_bus.open()

        self._scanProjectsTree()

        self._updateCacheThread = Thread(target=self._updateOldEntriesInCache)
        self._updateCacheThread.daemon = True
        self._updateCacheThread.start()

        self.otdb_listener.start_listening()
        self.dm_listener.start_listening()
        logger.info("opened storagequeryservice cache")

    def close(self):
        logger.info("closing storagequeryservice cache...")
        self._running = False

        self.otdb_listener.stop_listening()
        self.dm_listener.stop_listening()
        self._updateCacheThread.join()

        self.event_bus.close()
        self.disk_usage.close()
        logger.info("closed storagequeryservice cache")

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def onObservationFinished(self, otdb_id, modificationTime):
        self._onDiskActivityForOTDBId(otdb_id)

    def onObservationAborted(self, otdb_id, modificationTime):
        self._onDiskActivityForOTDBId(otdb_id)

    def onTaskDeleted(self, otdb_id, deleted, paths, message=''):
        self._onDiskActivityForOTDBId(otdb_id)

        with self._cacheLock:
            if deleted and otdb_id != None and otdb_id in self._cache[
                    'otdb_id2path']:
                del self._cache['otdb_id2path'][otdb_id]

    def _onDiskActivityForOTDBId(self, otdb_id):
        result = self.disk_usage.getDiskUsageForOTDBId(otdb_id)
        self._updateCache(result)

        task_path = result.get('path')
        projects_path = self.disk_usage.path_resolver.projects_path

        # update all paths up the tree up to the projects_path
        # update the resource availability in the radb as well
        path = task_path
        while path:
            parent_path = '/'.join(path.split('/')[:-1])

            if projects_path.startswith(
                    parent_path) and len(parent_path) < len(projects_path):
                break

            logger.info(
                'invalidating cache entry for %s because disk usage for task %s in %s changed',
                parent_path, otdb_id, task_path)

            self._invalidateCacheEntryForPath(parent_path)

            path = parent_path

    def getDiskUsageForOTDBId(self,
                              otdb_id,
                              include_scratch_paths=True,
                              force_update=False):
        return self.getDiskUsageForTask(
            otdb_id=otdb_id,
            include_scratch_paths=include_scratch_paths,
            force_update=force_update)

    def getDiskUsageForMoMId(self,
                             mom_id,
                             include_scratch_paths=True,
                             force_update=False):
        return self.getDiskUsageForTask(
            mom_id=mom_id,
            include_scratch_paths=include_scratch_paths,
            force_update=force_update)

    def getDiskUsageForRADBId(self,
                              radb_id,
                              include_scratch_paths=True,
                              force_update=False):
        return self.getDiskUsageForTask(
            radb_id=radb_id,
            include_scratch_paths=include_scratch_paths,
            force_update=force_update)

    def getDiskUsageForTask(self,
                            radb_id=None,
                            mom_id=None,
                            otdb_id=None,
                            include_scratch_paths=True,
                            force_update=False):
        logger.info(
            "cache.getDiskUsageForTask(radb_id=%s, mom_id=%s, otdb_id=%s, include_scratch_paths=%s, force_update=%s)",
            radb_id, mom_id, otdb_id, include_scratch_paths, force_update)

        if otdb_id != None and not include_scratch_paths:
            with self._cacheLock:
                path = self._cache['otdb_id2path'].get(otdb_id)

            if path:
                logger.info('Using path from cache for otdb_id %s %s', otdb_id,
                            path)
                return self.getDiskUsageForPath(path,
                                                force_update=force_update)

        logger.info(
            "cache.getDiskUsageForTask could not find path in cache, determining path..."
        )

        path_result = self.disk_usage.path_resolver.getPathForTask(
            radb_id=radb_id,
            mom_id=mom_id,
            otdb_id=otdb_id,
            include_scratch_paths=include_scratch_paths)

        if path_result['found']:
            task_path = path_result['path']
            scratch_paths = path_result.get('scratch_paths', [])

            # get all du's in parallel over all paths
            paths = [task_path] + scratch_paths
            paths_du_result = self.getDiskUsageForPaths(
                paths, force_update=force_update)

            # split into project and subdir
            path_du_result = paths_du_result.pop(task_path)
            scratch_du_result = paths_du_result

            task_du_result = dict(path_du_result)

            # yield id's for if available, or None
            for id in ['radb_id', 'otdb_id', 'mom_id']:
                task_du_result[id] = path_result.get(id)

            if scratch_du_result:
                task_du_result['scratch_paths'] = scratch_du_result

            return task_du_result

        # still no path(s) found for otdb_id, now try from cache and ignore possible scratch paths
        if otdb_id != None:
            with self._cacheLock:
                path = self._cache['otdb_id2path'].get(otdb_id)

            if path:
                logger.info(
                    'Using path from cache for otdb_id %s %s (ignoring possible scratch/share paths)',
                    otdb_id, path)
                return self.getDiskUsageForPath(path,
                                                force_update=force_update)

        return {'found': False, 'path': path_result['path']}

    def getDiskUsageForTasks(self,
                             radb_ids=None,
                             mom_ids=None,
                             otdb_ids=None,
                             include_scratch_paths=True,
                             force_update=False):
        logger.info(
            "cache.getDiskUsageForTasks(radb_ids=%s, mom_ids=%s, otdb_ids=%s)"
            % (radb_ids, mom_ids, otdb_ids))
        tasks_result = {'radb_ids': {}, 'mom_ids': {}, 'otdb_ids': {}}
        if radb_ids is None:
            radb_ids = []
        if mom_ids is None:
            mom_ids = []
        if otdb_ids is None:
            otdb_ids = []

        with futures.ThreadPoolExecutor(
                max_workers=os.cpu_count()) as executor:

            # helper function to expand parallel p_kwarg dict into kwargs
            def parallel_getDiskUsageForTask(p_kwarg):
                return self.getDiskUsageForTask(**p_kwarg)

            parallel_kwargs = [{
                'radb_id': radb_id,
                'include_scratch_paths': include_scratch_paths,
                'force_update': force_update
            } for radb_id in radb_ids]
            parallel_kwargs += [{
                'mom_id': mom_id,
                'include_scratch_paths': include_scratch_paths,
                'force_update': force_update
            } for mom_id in mom_ids]
            parallel_kwargs += [{
                'otdb_id': otdb_id,
                'include_scratch_paths': include_scratch_paths,
                'force_update': force_update
            } for otdb_id in otdb_ids]
            results = list(
                executor.map(parallel_getDiskUsageForTask, parallel_kwargs))

            # collect results in a dict grouped by id_type
            for result in results:
                if result.get('radb_id') in radb_ids:
                    tasks_result['radb_ids'][result['radb_id']] = result
                if result.get('mom_id') in mom_ids:
                    tasks_result['mom_ids'][result['mom_id']] = result
                if result.get('otdb_id') in otdb_ids:
                    tasks_result['otdb_ids'][result['otdb_id']] = result

        logger.info(
            "cache.getDiskUsageForTasks(radb_ids=%s, mom_ids=%s, otdb_ids=%s) returning: %s"
            % (radb_ids, mom_ids, otdb_ids, tasks_result))

        return tasks_result

    def getDiskUsageForPaths(self, paths, force_update=False):
        with futures.ThreadPoolExecutor(
                max_workers=os.cpu_count()) as executor:

            # helper function to expand parallel p_arg tuple into list of args
            def parallel_getDiskUsageForPath(p_arg):
                return self.getDiskUsageForPath(*p_arg)

            parallel_args = [(path, force_update) for path in paths]
            results = list(
                executor.map(parallel_getDiskUsageForPath, parallel_args))

            return {result['path']: result for result in results}

    def getDiskUsageForPath(self, path, force_update=False):
        logger.info("cache.getDiskUsageForPath('%s', force_update=%s)", path,
                    force_update)
        needs_cache_update = False
        if not force_update:
            with self._cacheLock:
                needs_cache_update |= path not in self._cache[
                    'path_du_results']

        if needs_cache_update or force_update:
            logger.info("cache update needed for %s", path)

            # check if some other thread is already doing a du call for this path...
            with self._du_threading_events_lock:
                path_threading_event = self._du_threading_events.get(path)
                need_to_do_du_call = path_threading_event is None

                if need_to_do_du_call:
                    # no other thread is currently du'ing/updating this path
                    # so create a threading Event and store it in the dict,
                    # so other threads can wait for this event.
                    logger.info("updating the cache for %s current_thread=%s",
                                path,
                                current_thread().name)
                    path_threading_event = Event()
                    self._du_threading_events[path] = path_threading_event

            if need_to_do_du_call:
                # no other thread is currently du'ing/updating this path
                # so we need to do it here.
                result = du_getDiskUsageForPath(path)
                self._updateCache(result)

                # signal threads waiting for this same path du call
                # and do bookkeeping
                with self._du_threading_events_lock:
                    logger.info(
                        "signaling other threads that the cache was updated for %s current_thread=%s",
                        path,
                        current_thread().name)
                    path_threading_event.set()
                    del self._du_threading_events[path]
            else:
                logger.info(
                    "waiting for du call on other thread that will update the cache for %s current_thread=%s",
                    path,
                    current_thread().name)
                path_threading_event.wait()
                logger.info(
                    "another thread just updated the cache for %s current_thread=%s",
                    path,
                    current_thread().name)

        with self._cacheLock:
            if path in self._cache['path_du_results']:
                result = self._cache['path_du_results'][path]
            else:
                result = {
                    'found': False,
                    'path': path,
                    'message': 'unknown error'
                }
                if not self.disk_usage.path_resolver.pathExists(path):
                    result['message'] = 'No such path: %s' % path

        result['disk_usage_readable'] = humanreadablesize(
            result.get('disk_usage', 0))
        logger.info('cache.getDiskUsageForPath(\'%s\') result: %s', path,
                    result)
        return result

    def getDiskUsageForTaskAndSubDirectories(self,
                                             radb_id=None,
                                             mom_id=None,
                                             otdb_id=None,
                                             force_update=False):
        logger.info(
            "cache.getDiskUsageForTaskAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s)"
            % (radb_id, mom_id, otdb_id))
        task_du_result = self.getDiskUsageForTask(radb_id=radb_id,
                                                  mom_id=mom_id,
                                                  otdb_id=otdb_id,
                                                  force_update=force_update)
        if task_du_result['found']:
            task_sd_result = self.disk_usage.path_resolver.getSubDirectories(
                task_du_result['path'])

            if task_sd_result['found']:
                subdir_paths = [
                    os.path.join(task_du_result['path'], sd)
                    for sd in task_sd_result['sub_directories']
                ]
                subdirs_du_result = self.getDiskUsageForPaths(
                    subdir_paths, force_update=force_update)

                result = {
                    'found': True,
                    'task_directory': task_du_result,
                    'sub_directories': subdirs_du_result
                }
                logger.info(
                    "result for cache.getDiskUsageForTaskAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s): %s",
                    radb_id, mom_id, otdb_id, result)
                return result

        logger.warning(
            "result for cache.getDiskUsageForTaskAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s): %s",
            radb_id, mom_id, otdb_id, task_du_result)
        return task_du_result

    def getDiskUsageForProjectDirAndSubDirectories(self,
                                                   radb_id=None,
                                                   mom_id=None,
                                                   otdb_id=None,
                                                   project_name=None,
                                                   force_update=False):
        logger.info(
            "cache.getDiskUsageForProjectDirAndSubDirectories(radb_id=%s, mom_id=%s, otdb_id=%s)"
            % (radb_id, mom_id, otdb_id))
        path_result = self.disk_usage.path_resolver.getProjectDirAndSubDirectories(
            radb_id=radb_id,
            mom_id=mom_id,
            otdb_id=otdb_id,
            project_name=project_name)
        if path_result['found']:
            projectdir_path = path_result['path']
            subdir_paths = [
                os.path.join(path_result['path'], sd)
                for sd in path_result['sub_directories']
            ]

            # get all du's in parallel over all paths
            paths = [projectdir_path] + subdir_paths
            paths_du_result = self.getDiskUsageForPaths(
                paths, force_update=force_update)

            # split into project and subdir
            projectdir_du_result = paths_du_result.pop(projectdir_path)
            subdirs_du_result = paths_du_result

            # create total result dict
            result = {
                'found': True,
                'projectdir': projectdir_du_result,
                'sub_directories': subdirs_du_result
            }
            logger.info(
                'cache.getDiskUsageForProjectDirAndSubDirectories result: %s' %
                result)
            return result

        return path_result

    def getDiskUsageForProjectsDirAndSubDirectories(self, force_update=False):
        logger.info("cache.getDiskUsageForProjectsDirAndSubDirectories")
        projects_path = self.disk_usage.path_resolver.projects_path
        project_subdirs_result = self.disk_usage.path_resolver.getSubDirectories(
            projects_path)
        subdir_paths = [
            os.path.join(projects_path, sd)
            for sd in project_subdirs_result['sub_directories']
        ] if project_subdirs_result['found'] else []

        # get all du's in parallel over all paths
        paths = [projects_path] + subdir_paths
        paths_du_result = self.getDiskUsageForPaths(paths,
                                                    force_update=force_update)

        # split into project and subdir
        projectsdir_du_result = paths_du_result.pop(projects_path)
        subdirs_du_result = paths_du_result

        # create total result dict
        result = {
            'found': True,
            'projectdir': projectsdir_du_result,
            'sub_directories': subdirs_du_result
        }

        logger.info(
            'cache.getDiskUsageForProjectsDirAndSubDirectories result: %s' %
            result)
        return result
Esempio n. 9
0
class RADBPGListener(PostgresListener):
    def __init__(self,
                 exchange=DEFAULT_BUSNAME,
                 dbcreds=None,
                 broker=DEFAULT_BROKER):
        super(RADBPGListener, self).__init__(dbcreds=dbcreds)

        self.event_bus = ToBus(exchange=exchange, broker=broker)

        self.radb = RADatabase(dbcreds=dbcreds)

        self.subscribe('task_update', self.onTaskUpdated)
        self.subscribe('task_insert', self.onTaskInserted)
        self.subscribe('task_delete', self.onTaskDeleted)

        self.subscribe('task_predecessor_insert_column_task_id',
                       self.onTaskPredecessorChanged)
        self.subscribe('task_predecessor_update_column_task_id',
                       self.onTaskPredecessorChanged)
        self.subscribe('task_predecessor_delete_column_task_id',
                       self.onTaskPredecessorChanged)

        self.subscribe('task_predecessor_insert_column_predecessor_id',
                       self.onTaskSuccessorChanged)
        self.subscribe('task_predecessor_update_column_predecessor_id',
                       self.onTaskSuccessorChanged)
        self.subscribe('task_predecessor_delete_column_predecessor_id',
                       self.onTaskSuccessorChanged)

        # when the specification starttime and endtime are updated, then that effects the task as well
        self.subscribe('specification_update', self.onSpecificationUpdated)

        self.subscribe('resource_claim_update', self.onResourceClaimUpdated)
        self.subscribe('resource_claim_insert', self.onResourceClaimInserted)
        self.subscribe('resource_claim_delete', self.onResourceClaimDeleted)

        self.subscribe('resource_availability_update',
                       self.onResourceAvailabilityUpdated)
        self.subscribe('resource_capacity_update',
                       self.onResourceCapacityUpdated)

    def onTaskUpdated(self, payload=None):
        # Send notification for the given updated task
        task_id = payload
        task = self.radb.getTask(task_id)
        self._sendNotification('TaskUpdated', task)

        # The "blocked_by_ids" property of the given task's successors might have been updated due to the given task
        # status being updated. Therefore also send a notification for these successors - lazily ignoring that they
        # might not have changed.
        suc_sched_tasks = self.radb.getTasks(task_ids=task['successor_ids'],
                                             task_status='scheduled')
        for suc_sched_task in suc_sched_tasks:
            self._sendNotification('TaskUpdated', suc_sched_task)

    def onTaskInserted(self, payload=None):
        self._sendNotification('TaskInserted', self.radb.getTask(payload))

    def onTaskDeleted(self, payload=None):
        self._sendNotification('TaskDeleted', payload)

    def onTaskPredecessorChanged(self, task_id):
        logger.info('onTaskPredecessorChanged(task_id=%s)', task_id)
        self._sendNotification('TaskUpdated', self.radb.getTask(task_id))

    def onTaskSuccessorChanged(self, task_id):
        logger.info('onTaskSuccessorChanged(task_id=%s)', task_id)
        self._sendNotification('TaskUpdated', self.radb.getTask(task_id))

    def onSpecificationUpdated(self, payload=None):
        # when the specification starttime and endtime are updated, then that effects the task as well
        self._sendNotification('TaskUpdated',
                               self.radb.getTask(specification_id=payload))

    def onResourceClaimUpdated(self, payload=None):
        self._sendNotification('ResourceClaimUpdated',
                               self.radb.getResourceClaim(payload))

    def onResourceClaimInserted(self, payload=None):
        self._sendNotification('ResourceClaimInserted',
                               self.radb.getResourceClaim(payload))

    def onResourceClaimDeleted(self, payload=None):
        self._sendNotification('ResourceClaimDeleted', payload)

    def onResourceAvailabilityUpdated(self, payload=None):
        r = self.radb.getResources(resource_ids=[payload],
                                   include_availability=True)[0]
        r = {k: r[k] for k in ['id', 'active']}
        self._sendNotification('ResourceAvailabilityUpdated', r)

    def onResourceCapacityUpdated(self, payload=None):
        r = self.radb.getResources(resource_ids=[payload],
                                   include_availability=True)[0]
        r = {
            k: r[k]
            for k in
            ['id', 'total_capacity', 'available_capacity', 'used_capacity']
        }
        self._sendNotification('ResourceCapacityUpdated', r)

    def __enter__(self):
        super(RADBPGListener, self).__enter__()
        self.radb.connect()
        self.event_bus.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super(RADBPGListener, self).__exit__(exc_type, exc_val, exc_tb)
        self.radb.disconnect()
        self.event_bus.close()

    def _formatTimestampsAsIso(self, fields, contentDict):
        '''convert all requested fields in the contentDict to proper isoformat datetime strings.
        In postgres we use timestamps without timezone.
        By convention we only enter utc values.
        But, if they are json encoded by postgress, they are not properly formatted with the in isoformat with a 'Z' at the end.
        So, parse the requested fields, and return them as datetime.
        '''
        try:
            for field in fields:
                try:
                    if field in contentDict:
                        timestampStr = contentDict[field]
                        formatStr = '%Y-%m-%dT%H:%M:%S' if 'T' in timestampStr else '%Y-%m-%d %H:%M:%S'
                        if timestampStr.rfind('.') > -1:
                            formatStr += '.%f'

                        timestamp = datetime.strptime(timestampStr, formatStr)

                        contentDict[field] = timestamp
                except Exception as e:
                    logger.error(
                        'Could not convert field \'%s\' to datetime: %s' %
                        (field, e))

            return contentDict
        except Exception as e:
            logger.error(
                'Error while convering timestamp fields \'%s\'in %s\n%s' %
                (fields, contentDict, e))

    def _sendNotification(self, subject, contentDict):
        try:
            if subject and contentDict:
                msg = EventMessage(subject="%s.%s" %
                                   (DEFAULT_RADB_NOTIFICATION_PREFIX, subject),
                                   content=contentDict)
                logger.info('Sending notification %s to %s: %s', subject,
                            self.event_bus.exchange,
                            str(contentDict).replace('\n', ' '))
                self.event_bus.send(msg)
        except Exception as e:
            logger.error(str(e))
Esempio n. 10
0
class IngestPipeline():
    STATUS_INITIALIZING = 1
    STATUS_TRANSFERRING = 2
    STATUS_FINALIZING   = 3
    STATUS_FINISHED     = 4

    def __init__(self, job, momClient, ltaClient,
                 exchange=DEFAULT_BUSNAME,
                 broker=DEFAULT_BROKER,
                 user=getpass.getuser(),
                 globus_timeout=GLOBUS_TIMEOUT,
                 minimal_SIP=False):
        self.status              = IngestPipeline.STATUS_INITIALIZING

        self.hostname            = socket.gethostname()
        self.job                 = job
        self.momClient           = momClient
        self.ltaClient           = ltaClient
        self.user                = user

        if not self.user:
            self.user=getpass.getuser()

        self.globus_timeout      = globus_timeout

        self.minimal_SIP         = minimal_SIP

        self.event_bus           = ToBus(exchange, broker=broker, connection_log_level=logging.DEBUG)

        self.Project             = job['Project']
        self.DataProduct         = job['DataProduct']
        self.FileName            = job['FileName']
        self.JobId               = job['JobId']
        self.ArchiveId           = int(job['ArchiveId'])
        self.ObsId               = int(job['ObservationId'])
        self.ExportID            = job['ExportID']
        self.Type                = job["Type"]
        self.HostLocation        = job['Location'].partition(':')[0]
        self.Location            = job['Location'].partition(':')[2]
        self.ticket              = ''
        self.FileSize            = '0'
        self.MD5Checksum         = ''
        self.Adler32Checksum     = ''
        self.ChecksumResult      = False
        self.SIP                 = ''
        self.PrimaryUri          = ''
        self.SecondaryUri        = ''
        self.lta_site            = ''

    def GetStorageTicket(self):
        do_check_already_in_lta=isProductionEnvironment()
        result = self.ltaClient.GetStorageTicket(self.Project, self.FileName, self.FileSize, self.ArchiveId, self.JobId, self.ObsId, do_check_already_in_lta, self.Type)

        error = result.get('error')
        if error:
            if 'StorageTicket with mom ID "%i"' % (self.ArchiveId) in error:
                if 'existing_ticket_id' in result and 'existing_ticket_state' in result:
                    logger.warning("Got a Tier 1 GetStorageTicket error for an incomplete storage ticket %s with status %s" % (result['existing_ticket_id'],result['existing_ticket_state']))
                    if result['existing_ticket_state'] < IngestSuccessful:
                        try:
                            self.ticket                = result['existing_ticket_id']
                            logger.warning("trying to repair status of StorageTicket %s" % self.ticket)

                            self.SendStatusToLTA(IngestFailed)
                        except Exception as e:
                            logger.exception('ResettingStatus IngestFailed failed for %s' % self.ticket)
                        raise Exception ('Had to reset state for %s' % self.ticket)
                    else:
                        raise PipelineError('GetStorageTicket error: Dataproduct already in LTA for %s' % (self.JobId), PipelineAlreadyInLTAError)
                else:
                    raise Exception('GetStorageTicket error I can''t interpret: %s' % result)

            if 'no storage resources defined for project' in error or "project does not exists" in error:
                raise PipelineError('GetStorageTicket error for project not known in LTA: %s' % error, PipelineNoProjectInLTAError)

            raise Exception('GetStorageTicket error: %s' % error)
        else:
            self.ticket            = result.get('ticket')
            self.PrimaryUri        = result.get('primary_uri_rnd')
            self.SecondaryUri      = result.get('secondary_uri_rnd')

            if 'sara' in self.PrimaryUri:
                self.lta_site = 'sara'
            elif 'juelich' in self.PrimaryUri:
                self.lta_site = 'juelich'
            elif 'psnc' in self.PrimaryUri:
                self.lta_site = 'poznan'

    def TransferFile(self):
        try:
            logger.info('Starting file transfer for %s ' % self.JobId)
            start = time.time()
            self.status = IngestPipeline.STATUS_TRANSFERRING

            self.__sendNotification('JobProgress',
                                    message='transfer starting',
                                    percentage_done=0.0,
                                    total_bytes_transfered=0)

            local_ip = hostnameToIp(self.hostname)

            if 'cep4' in self.HostLocation.lower() or 'cpu' in self.HostLocation.lower():
                self.HostLocation = 'localhost'

            if 'locus' in self.HostLocation.lower() and not '.' in self.HostLocation.lower():
                self.HostLocation += '.cep2.lofar'

            def progress_callback(percentage_done, current_speed, total_bytes_transfered):
                self.__sendNotification('JobProgress',
                                        percentage_done=min(100.0, round(10.0*percentage_done)/10.0),
                                        current_speed=current_speed,
                                        total_bytes_transfered=total_bytes_transfered)

            if (os.path.splitext(self.Location)[-1] == '.h5' and
                os.path.splitext(os.path.basename(self.Location))[0].endswith('_bf')):
                logger.info('dataproduct is a beamformed h5 file. adding raw file to the transfer')
                self.Location = [self.Location, self.Location.replace('.h5', '.raw')]

            if self.DataProduct not in self.Location and 'Source' in self.job:
                # old hack, is needed to support dynspec / pulsar archiving scripts
                self.Location = os.path.join(self.Location, self.job['Source'])

            cp = LtaCp(self.HostLocation,
                       self.Location,
                       self.PrimaryUri,
                       self.user,
                       local_ip=local_ip,
                       globus_timeout=self.globus_timeout,
                       progress_callback=progress_callback)

            transfer_result = cp.transfer(force=True)

            self.status = IngestPipeline.STATUS_FINALIZING

            if not transfer_result:
                msg = 'error while transferring %s with ltacp' % (self.JobId)
                logger.error(msg)
                raise Exception(msg)

            self.MD5Checksum = transfer_result[0]
            self.Adler32Checksum = transfer_result[1]
            self.FileSize = transfer_result[2]

            if self.MD5Checksum and self.Adler32Checksum and self.FileSize:
                logger.debug('valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s', self.JobId,
                                                                                                         self.FileSize,
                                                                                                         humanreadablesize(int(self.FileSize), 'B'),
                                                                                                         self.MD5Checksum,
                                                                                                         self.Adler32Checksum)
            else:
                msg = 'no valid checksums found for %s with filesize %sB (%s). md5: %s adler32: %s' % (self.JobId,
                                                                                                       self.FileSize,
                                                                                                       humanreadablesize(int(self.FileSize), 'B'),
                                                                                                       self.MD5Checksum,
                                                                                                       self.Adler32Checksum)
                logger.error(msg)
                raise Exception(msg)

            try:
                self.__sendNotification('JobProgress',
                                        message='transfer finished',
                                        percentage_done=100.0,
                                        total_bytes_transfered=int(self.FileSize))
            except ValueError:
                pass
            elapsed = time.time() - start

            try:
                if int(self.FileSize) > 0:
                    avgSpeed = float(self.FileSize) / elapsed
                logger.info("Finished file transfer for %s in %d sec with an average speed of %s for %s including ltacp overhead" % (self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B')))
            except Exception:
                logger.info('Finished file transfer of %s in %s' % (self.JobId, elapsed))

        except Exception as exp:
            if isinstance(exp, LtacpException):
                if '550 File not found' in exp.value:
                    logger.error('Destination directory does not exist. Creating %s in LTA for %s' % (self.PrimaryUri, self.JobId))

                    if create_missing_directories(self.PrimaryUri) == 0:
                        logger.info('Created path %s in LTA for %s' % (self.PrimaryUri, self.JobId))
                elif 'source path' in exp.value and 'does not exist' in exp.value:
                    raise PipelineError(exp.value, PipelineNoSourceError)

            raise Exception('transfer failed for %s: %s' % (self.JobId, str(exp)))

    def SendChecksumsToLTA(self):
        result = self.ltaClient.SendChecksums(self.JobId, self.Project, self.ticket, self.FileSize, self.PrimaryUri, self.SecondaryUri, self.MD5Checksum, self.Adler32Checksum)
        if not result.get('error'):
            #store final uri's
            self.PrimaryUri   = result['primary_uri']
            self.SecondaryUri = result.get('secondary_uri')

    def SendStatusToLTA(self, lta_state_id):
        if self.ticket:
            self.ltaClient.UpdateUriState(self.JobId, self.Project, self.ticket, self.PrimaryUri, lta_state_id)

    def CheckForValidSIP(self):
        if self.Type == "MoM":
            try:
                with self.momClient:
                    self.momClient.getSIP(self.ArchiveId, validate=True, log_prefix=self.JobId)

            except Exception as e:
                logger.log(logging.WARNING if self.minimal_SIP else logging.ERROR,
                           'CheckForValidSIP: Getting SIP from MoM failed for %s: %s', self.JobId, e)
                if not self.minimal_SIP:
                    raise

        elif 'SIPLocation' in self.job: # job file might know where the sip is when it is not a MoM job
            try:
                sip_host = self.job['SIPLocation'].split(':')[0]
                sip_path = self.job['SIPLocation'].split(':')[1]

                cmd = ['ssh', '-tt', '-n', '-x', '-q', '%s@%s' % (self.user, sip_host), 'cat %s' % sip_path]
                logger.info("GetSIP for %s at SIPLocation %s - cmd %s" % (self.JobId, self.job['SIPLocation'], ' ' .join(cmd)))
                p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                out, err = communicate_returning_strings(p)
                if p.returncode != 0:
                    raise PipelineError('GetSIP error getting EoR SIP for %s: %s' % (self.JobId, out + err))

                tmp_SIP = out

                tmp_SIP = addIngestInfoToSIP(tmp_SIP, self.ticket, self.FileSize, self.MD5Checksum, self.Adler32Checksum)

                tmp_SIP = tmp_SIP.replace('<stationType>Europe</stationType>','<stationType>International</stationType>')

                #make sure the source in the SIP is the same as the type of the storageticket
                tmp_SIP = re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>%s</source>' % (self.Type,), tmp_SIP)

                if not validateSIPAgainstSchema(tmp_SIP):
                    logger.error('CheckForValidSIP: Invalid SIP:\n%s', tmp_SIP)
                    raise Exception('SIP for %s does not validate against schema' % self.JobId)

            except:
                logger.exception('CheckForValidSIP: Getting SIP from SIPLocation %s failed', self.job['SIPLocation'])
                raise

        logger.info('SIP for %s is valid, can proceed with transfer' % (self.JobId,))

    def GetSIP(self):
        try:
            if self.Type == "MoM":
                with self.momClient:
                    self.SIP = self.momClient.uploadDataAndGetSIP(self.ArchiveId,
                                                                self.ticket,
                                                                self.FileName,
                                                                self.PrimaryUri,
                                                                self.FileSize,
                                                                self.MD5Checksum,
                                                                self.Adler32Checksum,
                                                                validate=True)
            elif 'SIPLocation' in self.job: # job file might know where the sip is when it is not a MoM job
                try:
                    sip_host = self.job['SIPLocation'].split(':')[0]
                    sip_path = self.job['SIPLocation'].split(':')[1]

                    cmd = ['ssh', '-tt', '-n', '-x', '-q', '%s@%s' % (self.user, sip_host), 'cat %s' % sip_path]
                    logger.info("GetSIP for %s at SIPLocation %s - cmd %s" % (self.JobId, self.job['SIPLocation'], ' ' .join(cmd)))
                    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                    out, err = communicate_returning_strings(p)
                    if p.returncode != 0:
                        raise PipelineError('GetSIP error getting EoR SIP for %s: %s' % (self.JobId, out + err))

                    self.SIP = out

                    self.SIP = addIngestInfoToSIP(self.SIP, self.ticket, self.FileSize, self.MD5Checksum, self.Adler32Checksum)

                    self.SIP = self.SIP.replace('<stationType>Europe</stationType>','<stationType>International</stationType>')

                    #make sure the source in the SIP is the same as the type of the storageticket
                    self.SIP = re.compile('<source>eor</source>', re.IGNORECASE).sub('<source>%s</source>' % (self.Type,), self.SIP)

                    if not validateSIPAgainstSchema(self.SIP):
                        logger.error('Invalid SIP:\n%s', self.SIP)
                        raise Exception('SIP for %s does not validate against schema' % self.JobId)

                except:
                    logger.exception('Getting SIP from SIPLocation %s failed', self.job['SIPLocation'])
                    raise

                logger.info('SIP received for %s from SIPLocation %s with size %d (%s): \n%s' % (self.JobId,
                                                                                                self.job['SIPLocation'],
                                                                                                len(self.SIP),
                                                                                                humanreadablesize(len(self.SIP)),
                                                                                                self.SIP[0:1024]))
            else:
                self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type)
                self.FileType = FILE_TYPE_UNSPECIFIED
        except Exception as e:
            if self.minimal_SIP:
                logger.info('making minimal SIP for %s', self.JobId)
                self.SIP = makeSIP(self.Project, self.ObsId, self.ArchiveId, self.ticket, self.FileName, self.FileSize, self.MD5Checksum, self.Adler32Checksum, self.Type)
                logger.info('minimal SIP for %s: \n%s', self.JobId, self.SIP)
                self.FileType = FILE_TYPE_UNSPECIFIED
            else:
                raise

    def SendSIPToLTA(self):
        try:
            self.ltaClient.SendSIP(self.JobId, self.SIP, self.ticket)
        except Exception as e:
            logger.error('SendSIPToLTA exception: %s', e)
            raise PipelineError(str(e), PipelineJobFailedError)

    def RollBack(self):
        try:
            logger.info('rolling back file transfer for %s', self.JobId)
            start     = time.time()

            if self.PrimaryUri:
                srmrm(self.PrimaryUri, log_prefix=self.JobId, timeout=300)

            if self.SecondaryUri:
                srmrm(self.SecondaryUri, log_prefix=self.JobId, timeout=300)

            logger.debug("rollBack for %s took %ds", self.JobId, time.time() - start)
        except Exception as e:
            logger.exception('rollback failed for %s: %s', self.JobId, e)

    def __sendNotification(self, subject, message='', **kwargs):
        try:
            contentDict = { 'job_id': self.JobId,
                            'export_id': self.job.get('job_group_id'),
                            'archive_id': self.ArchiveId,
                            'project': self.Project,
                            'type': self.Type,
                            'ingest_server': self.hostname,
                            'dataproduct': self.DataProduct,
                            'srm_url': self.PrimaryUri }
            if 'ObservationId' in self.job:
                contentDict['otdb_id'] = self.job['ObservationId']

            if self.lta_site:
                contentDict['lta_site'] = self.lta_site

            if message:
                contentDict['message'] = message

            for k,v in list(kwargs.items()):
                contentDict[k] = v

            msg = EventMessage(subject="%s.%s" % (INGEST_NOTIFICATION_PREFIX, subject), content=contentDict)
            msg.ttl = 48*3600 #remove message from queue's when not picked up within 48 hours
            logger.info('Sending notification %s: %s' % (subject, str(contentDict).replace('\n', ' ')))
            self.event_bus.send(msg)
        except Exception as e:
            logger.error(str(e))

    def run(self):
        with self.event_bus:
            try:
                logger.info("starting ingestpipeline for %s" % self.JobId)
                start = time.time()
                self.__sendNotification('JobStarted')

                self.GetStorageTicket()
                self.CheckForValidSIP()
                self.TransferFile()
                self.SendChecksumsToLTA()
                self.GetSIP()
                self.SendSIPToLTA()
                self.SendStatusToLTA(IngestSuccessful)

                avgSpeed = 0
                elapsed = time.time() - start
                try:
                    avgSpeed = float(self.FileSize) / elapsed
                    logger.info("Ingest Pipeline finished for %s in %d sec with average speed of %s for %s including all overhead",
                                self.JobId, elapsed, humanreadablesize(avgSpeed, 'Bps'), humanreadablesize(float(self.FileSize), 'B'))
                except Exception:
                    logger.info("Ingest Pipeline finished for %s in %d sec", self.JobId, elapsed)

                self.__sendNotification('JobFinished',
                                        average_speed=avgSpeed,
                                        total_bytes_transfered=int(self.FileSize))

            except PipelineError as pe:
                logger.log(logging.WARNING if pe.type == PipelineAlreadyInLTAError else logging.ERROR,
                           'Encountered PipelineError for %s : %s', self.JobId, str(pe))
                if pe.type == PipelineNoSourceError:
                    self.__sendNotification('JobTransferFailed', 'data not transfered because it was not on disk')
                elif pe.type == PipelineAlreadyInLTAError:
                    self.__sendNotification('JobFinished', 'data was already in the LTA',
                                            average_speed=0,
                                            total_bytes_transfered=0)
                else:
                    self.RollBack()

                    # by default the error_message for the notification is the exception
                    error_message = str(pe)
                    # for known messsages in the exception, make a nice readable error_message
                    if 'MoM returned login screen instead of SIP' in error_message:
                        error_message = 'MoM returned login screen instead of SIP'

                    self.__sendNotification('JobTransferFailed', error_message)

                try:
                    if pe.type != PipelineAlreadyInLTAError:
                        self.SendStatusToLTA(IngestFailed)
                except Exception as e:
                    logger.error('SendStatusToLTA failed for %s: %s', self.JobId, e)

            except Exception as e:
                logger.error('Encountered unexpected error for %s: %s', self.JobId, e)

                # by default the error_message for the notification is the exception
                error_message = str(e)
                # for known messsages in the exception, make a nice readable error_message
                if 'ltacp' in error_message and ('file listing failed' in error_message or 'du failed' in error_message):
                    error_message = 'dataproduct %s not found at location %s:%s' % (self.DataProduct, self.HostLocation, self.Location)
                elif 'does not validate against schema' in error_message:
                    error_message = 'invalid SIP does not validate against schema'

                try:
                    self.RollBack()
                except Exception as rbe:
                    logger.error('RollBack failed for %s: %s', self.JobId, rbe)
                try:
                    self.SendStatusToLTA(IngestFailed)
                except Exception as sse:
                    logger.error('SendStatusToLTA failed for %s: %s', self.JobId, sse)
                try:
                    self.__sendNotification('JobTransferFailed', error_message)
                except Exception as sne:
                    logger.error('sendNotification failed for %s: %s', self.JobId, sne)
            finally:
                self.status = IngestPipeline.STATUS_FINISHED