Example #1
    def checkForSubmission(self):
        """Additional checks for unit submission"""

        # call the base class
        if not super(AtlasUnit, self).checkForSubmission():
            return False

        # check that the parent units are complete: ITransform.update() checks for
        # submission before performing submissions, so the parent datasets may not
        # have been created yet
        if not self.checkParentUnitsAreComplete():
            return False

        # for chained units, check that the input DQ2 datasets have been frozen
        if len(self.req_units) > 0 and self.inputdata._name == "DQ2Dataset" \
                and not self.inputdata.tag_info:

            # check that all datasets in each container are frozen
            cont_ok = True
            for uds in self.inputdata.dataset:
                try:
                    dq2_lock.acquire()

                    try:
                        # list datasets in container
                        ds_list = dq2.listDatasetsInContainer(uds)

                        cont_ok = True
                        for ds in ds_list:
                            # find locations and check if frozen
                            loc_dict = dq2.listDatasetReplicas(ds)
                            locations = []
                            for loc in loc_dict[loc_dict.keys()[0]]:
                                locations += loc_dict[loc_dict.keys()[0]][loc]

                            ds_ok = False
                            for loc in locations:
                                if loc == "":
                                    continue
                                datasetsiteinfo = dq2.listFileReplicas(loc, ds)
                                if datasetsiteinfo[0]['found'] is not None:
                                    ds_ok = True
                                    break

                            if not ds_ok:
                                cont_ok = False
                                break
                    except Exception:
                        logger.warning("Unable to check if datasets are frozen")
                        cont_ok = False
                finally:
                    dq2_lock.release()

                # stop once a container with an unfrozen dataset is found
                if not cont_ok:
                    break

            # at least one dataset wasn't frozen
            if not cont_ok:
                return False

        return True
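
The essential pattern above — serialize every DQ2 call behind dq2_lock and treat a dataset as frozen once any replica location reports a non-None 'found' count — can be distilled into a small standalone sketch. It uses a with-statement instead of the explicit acquire/release, and the FakeDQ2 class with its canned return values is a purely illustrative stand-in for the real dq2 client, not the GangaAtlas API:

import threading

dq2_lock = threading.Lock()

class FakeDQ2(object):
    """Stand-in for the DQ2 client; returns canned data for illustration."""
    def listDatasetsInContainer(self, container):
        return ["user.someone.dataset.1", "user.someone.dataset.2"]

    def listDatasetReplicas(self, ds):
        # shape mirrors the code above: {dataset_id: {location_key: [sites]}}
        return {ds: {"complete": ["SITE_A_DATADISK"], "incomplete": []}}

    def listFileReplicas(self, location, ds):
        # 'found' stays None while the dataset is still open (not frozen)
        return [{"found": 10}]

dq2 = FakeDQ2()

def container_is_frozen(container):
    """True if every dataset in the container has a frozen replica somewhere."""
    with dq2_lock:                      # serialize access to the DQ2 client
        for ds in dq2.listDatasetsInContainer(container):
            loc_dict = dq2.listDatasetReplicas(ds)
            locations = []
            for loc_list in list(loc_dict.values())[0].values():
                locations += loc_list
            # a dataset counts as frozen if any location reports a 'found' count
            if not any(dq2.listFileReplicas(loc, ds)[0]["found"] is not None
                       for loc in locations if loc):
                return False
    return True

print(container_is_frozen("user.someone.container/"))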
Example #2
   def checkForSubmission(self):
      """Additional checks for unit submission"""

      # call the base class
      if not super(AtlasUnit,self).checkForSubmission():
         return False

      # check that the parent units are complete: ITransform.update() checks for
      # submission before performing submissions, so the parent datasets may not
      # have been created yet
      if not self.checkParentUnitsAreComplete():
         return False

      # for chained units, check that the input DQ2 datasets have been frozen
      if len(self.req_units) > 0 and self.inputdata._name == "DQ2Dataset" and not self.inputdata.tag_info:

         # check that all datasets in each container are frozen
         cont_ok = True
         for uds in self.inputdata.dataset:
            try:
               dq2_lock.acquire()

               try:
                  # list datasets in container
                  ds_list = dq2.listDatasetsInContainer(uds)

                  cont_ok = True
                  for ds in ds_list:
                     # find locations and check if frozen
                     loc_dict = dq2.listDatasetReplicas(ds)
                     locations = []
                     for loc in loc_dict[ loc_dict.keys()[0] ]:
                        locations += loc_dict[ loc_dict.keys()[0] ][loc]

                     ds_ok = False
                     for loc in locations:
                        if loc == "":
                           continue
                        datasetsiteinfo = dq2.listFileReplicas(loc, ds)
                        if datasetsiteinfo[0]['found'] is not None:
                           ds_ok = True
                           break

                     if not ds_ok:
                        cont_ok = False
                        break
               except Exception:
                  logger.warning("Unable to check if datasets are frozen")
                  cont_ok = False
            finally:
               dq2_lock.release()

            # stop once a container with an unfrozen dataset is found
            if not cont_ok:
               break

         # at least one dataset wasn't frozen
         if not cont_ok:
            return False

      return True
Example #3
    def check(self):
        super(AnaTransform, self).check()
        if not self.inputdata.dataset:
            return
        if not self.backend:
            logger.warning("Determining backend and cloud...")

            # Get ddm sites of atlas_dbrelease, if present
            db_sites = None
            if self.application.atlas_dbrelease == "LATEST":
                from pandatools import Client
                self.application.atlas_dbrelease = \
                    getPandaClient().getLatestDBRelease(False)
            if self.application.atlas_dbrelease:
                try:
                    db_dataset = self.application.atlas_dbrelease.split(':')[0]
                    try:
                        dq2_lock.acquire()
                        db_locations = \
                            dq2.listDatasetReplicas(db_dataset).values()[0][1]
                    finally:
                        dq2_lock.release()

                except Exception as x:
                    raise ApplicationConfigurationError(
                        x, 'Problem in AnaTask - j.application.atlas_dbrelease is wrongly configured!')
                db_sites = stripSites(db_locations)

            # Get complete/incomplete ddm sites for input dataset
            ds = self.inputdata.dataset[0]
            try:
                dq2_lock.acquire()
                if ds[-1] != "/":
                    try:
                        replicas = {ds: dq2.listDatasetReplicas(ds)}
                    except DQUnknownDatasetException:
                        ds += "/"
                if ds[-1] == "/":
                    replicas = dq2.listDatasetReplicasInContainer(ds)
            finally:
                dq2_lock.release()

            # check that every tid dataset has at least one replica
            for tid in replicas:
                if len(replicas[tid]) == 0:
                    raise ApplicationConfigurationError(
                        None, "No replicas for dataset %s found!" % tid)
            # each entry is a dict with a single key; keep only its value
            replicas = [r.values()[0] for r in replicas.values()]

            # Get allowed sites for each backend:
            backends = [
                be for be in config["backendPreference"] if be in GPI.__dict__
            ]
            allowed_sites = {}
            if "LCG" in backends:
                allowed_sites["LCG"] = GPI.LCG().requirements.list_sites(
                    True, True)
            if "Panda" in backends:
                from pandatools import Client
                allowed_sites["Panda"] = getPandaClient().PandaSites.keys()
                #allowed_sites["Panda"] = [site["ddm"] for site in Client.getSiteSpecs()[1].values()]
            if "NG" in backends:
                allowed_sites["NG"] = getConfig(
                    "Athena")["AllowedSitesNGDQ2JobSplitter"]
            #if "PBS" in backends:
            #   sites["PBS"] = [] # should be local DQ2 storage element!

            # Get list of cloud-backend pairs (cbl) for complete replicas
            common_cbl = None
            for r in replicas:
                cbl = self.findCompleteCloudBackend(db_sites, allowed_sites, r)
                if common_cbl is None:
                    common_cbl = cbl
                else:
                    common_cbl = [cb for cb in cbl if cb in common_cbl]

            #print "CLOUD/BACKEND list for COMPLETE replicas: ", common_cbl

            # ..and for incomplete replicas
            if common_cbl is None or len(common_cbl) == 0:
                if len(replicas) > 1:
                    raise ApplicationConfigurationError(
                        None,
                        'Container dataset %s has no complete replica on one site and backend. Please specify individual tid datasets or use t.initializeFromDataset("%s") '
                        % (ds, ds))
                common_cbl = self.findIncompleteCloudBackend(
                    db_sites, allowed_sites, replicas[0])
                #print "CLOUD/BACKEND list for INCOMPLETE replicas: ", common_cbl
            if common_cbl is None or len(common_cbl) == 0:
                raise ApplicationConfigurationError(
                    None,
                    'Container dataset %s has no replica on one site and backend. Please specify individual tid datasets!'
                    % (ds))

            # take the first (most preferred) cloud/backend pair
            cb = common_cbl[0]
            using_cloud = cb[0]
            using_backend = cb[1]

            # make sure both a cloud and a backend were chosen
            assert using_cloud and using_backend

            if using_backend == "Panda":
                self.backend = stripProxy(GPI.Panda())
                self.backend.requirements.cloud = using_cloud

            elif using_backend == "NG":
                self.backend = stripProxy(GPI.NG())
            elif using_backend == "LCG":
                self.backend = stripProxy(GPI.LCG())
                self.backend.requirements = stripProxy(
                    GPI.AtlasLCGRequirements())
                self.backend.requirements.cloud = using_cloud
            assert self.backend
            logger.warning("Running on cloud %s using backend %s", using_cloud,
                           using_backend)

        logger.warning("Determining partition splitting...")
        # default to the DE cloud if the backend supports a cloud requirement
        try:
            if not self.backend.requirements.cloud:
                self.backend.requirements.cloud = "DE"
        except AttributeError:
            pass
        if not self.inputdata.dataset:
            return
        splitter = DQ2JobSplitter()
        splitter.numfiles = self.files_per_job
        #splitter.update_siteindex = False # commented to use default value
        #splitter.use_lfc = True
        # this works even for Panda; no special "Job" properties are used anywhere
        sjl = splitter.split(self)
        self.partitions_data = [sj.inputdata for sj in sjl]
        try:
            self.partitions_sites = [sj.backend.requirements.sites for sj in sjl]
        except AttributeError:
            self.partitions_sites = [sj.backend.site for sj in sjl]
        self.setPartitionsLimit(len(self.partitions_data) + 1)
        self.setPartitionsStatus(
            [c for c in range(1, len(self.partitions_data) + 1)
             if self.getPartitionStatus(c) != "completed"], "ready")
Example #4
    def checkCompletedApp(self, app):
        task = self._getParent()
        j = app._getParent()
        for odat in j.outputdata.outputdata:
            # Look out: if this is changed, there is another one like it below!
            if 0 == len([f for f in j.outputdata.output
                         if ".".join(odat.split(".")[:-1]) in f]):
                logger.error("Job %s has not produced %s file, only: %s" %
                             (j.id, odat, j.outputdata.output))
                return False
        # if this is the first app to complete the partition...
        if self.getPartitionStatus(self._app_partition[app.id]) != "completed":
            task_container, subtask_dsname = task.container_name, self.dataset_name

            infos = {}
            for oinfo in j.outputdata.output:
                try:
                    dq2_lock.acquire()
                    info = oinfo.split(",")
                    # get the master replica of the dataset - panda sets the location to ANALY_XYZ rather than an SE
                    master_replica = dq2.getMasterReplicaLocation(info[0])
                    if master_replica:
                        info[5] = master_replica
                    else:
                        replicas = dq2.listDatasetReplicas(info[0]).values()
                        if len(replicas) == 0:
                            try:
                                info[5] = getPandaClient().PandaSites[
                                    info[5]]["ddm"]
                            except KeyError:
                                pass
                        else:
                            complete, incomplete = replicas[0].values()
                            info[5] = (complete + incomplete)[0]
                    if info[4][:3] == "ad:":
                        info[4] = info[4][3:]

                finally:
                    dq2_lock.release()

                datasetname = subtask_dsname + '.' + info[5]
                info[0] = datasetname
                infos.setdefault(datasetname, []).append(",".join(info))

            for ds in infos.keys():
                outputdata = DQ2OutputDataset()
                try:
                    outputdata.create_dataset(ds)
                except DQDatasetExistsException:
                    pass
                try:
                    outputdata.register_datasets_details(None, infos[ds])
                except DQFileExistsInDatasetException:
                    pass

            # Register Container
            try:
                containerinfo = {}
                dq2_lock.acquire()
                try:
                    containerinfo = dq2.listDatasets(task_container)
                except Exception:
                    containerinfo = {}
                if containerinfo == {}:
                    try:
                        dq2.registerContainer(task_container)
                        logger.debug('Registered container for Task %i: %s' %
                                     (task.id, task_container))
                    except Exception as x:
                        logger.error(
                            'Problem registering container for Task %i, %s : %s %s'
                            % (task.id, task_container, x.__class__, x))
                for ds in infos.keys():
                    try:
                        dq2.registerDatasetsInContainer(task_container, [ds])
                    except DQContainerAlreadyHasDataset:
                        pass
                    except Exception as x:
                        logger.error(
                            'Problem registering dataset %s in container %s: %s %s'
                            % (subtask_dsname, task_container, x.__class__, x))
            finally:
                dq2_lock.release()
        return True
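
checkCompletedApp groups the job's output records by destination dataset (one dataset per site, built with dict.setdefault) and then registers those datasets in the task container, treating "already registered" as a non-error. The following standalone sketch shows that bookkeeping; the FakeDQ2 stand-in, the sample output records, and the names used are hypothetical and only illustrate the pattern:

class DQContainerAlreadyHasDataset(Exception):
    pass

class FakeDQ2(object):
    """Illustrative stand-in that remembers what has been registered."""
    def __init__(self):
        self.containers = {}

    def registerContainer(self, name):
        self.containers.setdefault(name, set())

    def registerDatasetsInContainer(self, name, datasets):
        known = self.containers.setdefault(name, set())
        for ds in datasets:
            if ds in known:
                raise DQContainerAlreadyHasDataset(ds)
            known.add(ds)

dq2 = FakeDQ2()

# group output records (comma-separated fields, site name in field 5) by target dataset
outputs = [
    "ds,guid,lfn,size,md5,SITE_A",
    "ds,guid,lfn,size,md5,SITE_B",
    "ds,guid,lfn,size,md5,SITE_A",
]
subtask_dsname = "user.someone.task.subtask"
infos = {}
for record in outputs:
    info = record.split(",")
    datasetname = subtask_dsname + "." + info[5]
    info[0] = datasetname
    infos.setdefault(datasetname, []).append(",".join(info))

# register each per-site dataset in the task container, ignoring duplicates
task_container = "user.someone.task/"
dq2.registerContainer(task_container)
for ds in infos:
    try:
        dq2.registerDatasetsInContainer(task_container, [ds])
    except DQContainerAlreadyHasDataset:
        pass

print(sorted(infos))
print(sorted(dq2.containers[task_container]))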