Example #1
    def testXMLJSON(self):
        """
        Test XML and JSON in the same scope
        """
        site = 'T1_US_FNAL_Buffer'
        httpDict = {
            'endpoint': "https://cmsweb.cern.ch/phedex/datasvc/json/test"
        }
        phedexJSON = PhEDEx(responseType='json', httpDict=httpDict)
        httpDict = {
            'endpoint': "https://cmsweb.cern.ch/phedex/datasvc/xml/test"
        }
        phedexXML = PhEDEx(responseType='xml', httpDict=httpDict)

        phedexXML.getNodeTFC(site)
        tfc_file = phedexXML.cacheFileName('tfc', inputdata={'node': site})
        tfc_map = {}
        tfc_map[site] = readTFC(tfc_file)
        pfn = tfc_map[site].matchLFN('srmv2',
                                     '/store/user/jblow/dir/test.root')

        self.assertEqual(
            pfn,
            'srm://cmssrm.fnal.gov:8443/srm/managerv2?SFN=/11/store/user/jblow/dir/test.root'
        )

        self.assertEqual(
            phedexJSON.getNodeSE('T1_US_FNAL_Buffer'), 'cmssrm.fnal.gov')
Example #2
def get_tfc_rules(site):
    """
    Get the TFC regexp for a given site.
    """
    phedex = PhEDEx(responseType='xml')
    phedex.getNodeTFC(site)
    tfc_file = phedex.cacheFileName('tfc', inputdata={'node': site})

    return readTFC(tfc_file)
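A minimal usage sketch for the helper above, assuming the WMCore imports these snippets rely on (the PhEDEx client and readTFC); the site, protocol and LFN are illustrative, and the call shapes mirror Example #1:

# Hypothetical usage of get_tfc_rules(); the site and LFN are made up.
# Assumes: from WMCore.Services.PhEDEx.PhEDEx import PhEDEx
#          from WMCore.Storage.TrivialFileCatalog import readTFC
tfc = get_tfc_rules('T1_US_FNAL_Buffer')    # cached TFC rules for the node
pfn = tfc.matchLFN('srmv2', '/store/user/jdoe/dir/test.root')
print('Resolved PFN: %s' % pfn)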
Example #4
    def testDataServiceXML(self):
        # asks for PEM pass phrase ...
        raise nose.SkipTest
        phedex = PhEDEx(responseType='xml')

        site = 'T2_UK_SGrid_Bristol'
        lfn = '/store/users/metson/file'
        protocol = 'srmv2'
        phedex.getNodeTFC(site)

        tfc_file = phedex.cacheFileName('tfc', inputdata={'node': site})
        tfc = readTFC(tfc_file)

        pfn_dict = phedex.getPFN(site, lfn, protocol)
        phedex_pfn = pfn_dict[(site, lfn)]
        pfn = tfc.matchLFN(protocol, lfn)
        msg = 'TFC pfn (%s) did not match PhEDEx pfn (%s)' % (pfn, phedex_pfn)
        self.assertEqual(phedex_pfn, pfn, msg)
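The same comparison can be wrapped as a small helper; this is a sketch only (the helper name is ours), reusing exactly the getPFN/matchLFN call shapes shown in the test above:

def check_pfn_consistency(phedex, tfc, site, lfns, protocol='srmv2'):
    """
    Sketch: return (lfn, service_pfn, local_pfn) triples for every LFN whose
    data-service PFN disagrees with the one derived from the cached TFC.
    """
    mismatches = []
    for lfn in lfns:
        service_pfn = phedex.getPFN(site, lfn, protocol)[(site, lfn)]
        local_pfn = tfc.matchLFN(protocol, lfn)
        if service_pfn != local_pfn:
            mismatches.append((lfn, service_pfn, local_pfn))
    return mismatches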
Example #7
class TransferDaemon(BaseDaemon):
    """
    _TransferDaemon_
    Call multiprocessing library to instantiate a TransferWorker for each user.
    """
    def __init__(self, config):
        """
        Initialise class members:
            1. check and create the dropbox dir
            2. set up the Oracle and CouchDB (config and files instance) connections
            3. set up the PhEDEx connection
            4. set up the WMCore factory
        """

        self.doc_acq = ''
        # Need a better way to test this without turning off this next line
        BaseDaemon.__init__(self, config, 'AsyncTransfer')

        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir

        if not os.path.isdir(self.dropbox_dir):
            try:
                os.makedirs(self.dropbox_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    self.logger.exception('Unknown error in mkdir: %s' % e.errno)
                    raise

        if not os.path.isdir("/tmp/DashboardReport"):
            try:
                os.makedirs("/tmp/DashboardReport")
            except OSError as e:
                if e.errno != errno.EEXIST:
                    self.logger.exception('Unknown error in mkdir: %s' % e.errno)
                    raise
        try:
            config_server = CouchServer(dburl=self.config.config_couch_instance)
            self.config_db = config_server.connectDatabase(self.config.config_database)
        except Exception:
            self.logger.exception('Failed when contacting local couch')
            raise

        try:    
            self.oracleDB = HTTPRequests(self.config.oracleDB,
                                         self.config.opsProxy,
                                         self.config.opsProxy)
        except Exception:
            self.logger.exception('Failed when contacting Oracle')
            raise
        self.pool = Pool(processes=self.config.pool_size)
        self.factory = WMFactory(self.config.schedAlgoDir,
                                 namespace=self.config.schedAlgoDir)

        self.site_tfc_map = {}
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key':self.config.opsProxy,
                                       'cert':self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
            raise
        # TODO: decode xml
        try:
            self.phedex2 = PhEDEx(responseType='json',
                                  dict={'key': self.config.opsProxy,
                                        'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
            raise

        nodes = self.phedex2.getNodeMap()['phedex']['node']
        self.logger.debug(type(nodes))
        for site in [x['name'] for x in nodes]:
            if site and str(site) != 'None' and str(site) != 'unknown':
                self.site_tfc_map[site] = self.get_tfc_rules(site)
                self.logger.debug('tfc site: %s %s' % (site, self.site_tfc_map[site]))


    # Overriding setup() is optional, and not needed here
    def algorithm(self, parameters=None):
        """
        1. Get transfer config from the couchdb config instance
        2. Get a list of users with files to transfer from the db instance
           (oracle or couch, depending on the config flag)
        3. For each user get a suitably sized input for submission (call to a list)
        4. Submit to a subprocess
        """

        if self.config.isOracle:
            users = self.oracleSiteUser(self.oracleDB)
        else:
            users = self.active_users(self.db)

            sites = self.active_sites()
            self.logger.info('%s active sites' % len(sites))
            self.logger.debug('Active sites are: %s' % sites)

        self.logger.debug('kicking off pool')
        for u in users:
            for i in range(len(u)):
                if not u[i]:
                    u[i] = ''

            # current_running is presumably a module-level list shared with
            # the log_result callback (neither is shown in this snippet)
            self.logger.debug('current_running %s' % current_running)
            self.logger.debug('Testing current running: %s %s %s' % (u, current_running, (u not in current_running)))
            if u not in current_running:
                self.logger.debug('processing %s' % u)
                current_running.append(u)
                self.logger.debug('processing %s' % current_running)
                self.pool.apply_async(ftscp, (u, self.site_tfc_map, self.config),
                                      callback=log_result)

    def oracleSiteUser(self, db):
        """
        1. Acquire transfers from DB
        2. Get acquired users and destination sites
        """

        self.logger.info('Retrieving users...')
        fileDoc = dict()
        fileDoc['subresource'] = 'activeUsers'
        fileDoc['grouping'] = 0
        fileDoc['asoworker'] = self.config.asoworker

        result = dict()
        try:
            result = db.get(self.config.oracleFileTrans,
                             data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire transfers \
                              from oracleDB: %s" % ex)
            return []
        
        self.logger.debug(oracleOutputMapping(result))
        # TODO: translate result into list((user,group,role),...)
        if len(oracleOutputMapping(result)) != 0:
            self.logger.debug(type( [[x['username'].encode('ascii','ignore'), x['user_group'], x['user_role']] for x in oracleOutputMapping(result)]))
            try:
                docs =  oracleOutputMapping(result)
                users = [[x['username'], x['user_group'], x['user_role']] for x in docs]
                self.logger.info('Users to process: %s' % str(users))
            except Exception:
                self.logger.exception('User data is malformed')
                return []
        else:
            self.logger.info('No new user to acquire')
            return []

        actives = list()
        for user in users:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquireTransfers'
            fileDoc['username'] = user[0]

            self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user[0])

            try:
                result = db.post(self.config.oracleFileTrans,
                                 data=encodeRequest(fileDoc))
            except Exception as ex:
                self.logger.error("Failed to acquire transfers \
                                  from oracleDB: %s" %ex)
                continue

            self.doc_acq = str(result)
            for i in range(len(user)):
                if not user[i] or user[i] in ['None', 'NULL']:
                    user[i] = ''
                user[i] = str(user[i])
            actives.append(user)


            self.logger.debug("Transfers retrieved from oracleDB. %s " % users)

        return users

    def active_users(self, db):
        """
        Query a view for users with files to transfer.
        Get this from the following view:
              ftscp?group=true&group_level=1
        """
        query = {'group': True, 'group_level': 3}
        try:
            users = db.loadView(self.config.ftscp_design, 'ftscp_all', query)
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB: %s' % e)
            return []

        if len(users['rows']) <= self.config.pool_size:
            active_users = [x['key'] for x in users['rows']]
        else:
            sorted_users = self.factory.loadObject(self.config.algoName,
                                                   args=[self.config,
                                                         self.logger,
                                                         users['rows'],
                                                         self.config.pool_size],
                                                   getFromCache=False,
                                                   listFlag=True)
            active_users = sorted_users()[:self.config.pool_size]
        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)
        return active_users

    def active_sites(self):
        """
        Get a list of all sites involved in transfers.
        """
        query = {'group': True, 'stale': 'ok'}
        try:
            sites = self.db.loadView('AsyncTransfer', 'sites', query)
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB: %s' % e)
            return []

        def keys_map(inputDict):
            """
            Map function.
            """
            return inputDict['key']

        return map(keys_map, sites['rows'])

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        tfc_file = None
        try:
            self.phedex.getNodeTFC(site)
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        try:
            tfc_file = self.phedex.cacheFileName('tfc',
                                                 inputdata={'node': site})
        except Exception as e:
            self.logger.exception('PhEDEx cache exception: %s' % e)
        return readTFC(tfc_file)

    def terminate(self, parameters=None):
        """
        Called when thread is being terminated.
        """
        self.pool.close()
        self.pool.join()
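algorithm() above references current_running and log_result, which are not defined in this snippet; in the source module they presumably live at module level next to ftscp. Below is a self-contained sketch of that Pool/apply_async/callback pattern, with stub names standing in for the real worker and configuration:

# Stand-alone sketch of the Pool + callback pattern used above; ftscp here
# is a stub, and current_running/log_result mirror the module-level globals
# the TransferDaemon snippet expects.
from multiprocessing import Pool

current_running = []          # users currently being processed

def ftscp(user, site_tfc_map, config):
    # stub worker: the real ftscp submits transfers for one user
    return user

def log_result(result):
    # the callback runs in the parent process when a worker finishes:
    # free the user slot so a later cycle can pick the user up again
    current_running.remove(result)

if __name__ == '__main__':
    pool = Pool(processes=4)
    for u in [['alice', '', ''], ['bob', '', '']]:
        if u not in current_running:
            current_running.append(u)
            pool.apply_async(ftscp, (u, {}, None), callback=log_result)
    pool.close()
    pool.join()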
Example #8
class ReporterWorker:

    def __init__(self, user, config):
        """
        Store the user and TFC map for the worker
        """
        self.user = user
        self.config = config
        self.dropbox_dir = '%s/dropbox/inputs' % self.config.componentDir
        logging.basicConfig(level=config.log_level)
        self.site_tfc_map = {}
        self.logger = logging.getLogger('AsyncTransfer-Reporter-%s' % self.user)
        formatter = getCommonLogFormatter(self.config)
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        self.uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.cleanEnvironment = ''
        self.userDN = ''
        self.init = True
        if getattr(self.config, 'cleanEnvironment', False):
            self.cleanEnvironment = 'unset LD_LIBRARY_PATH; unset X509_USER_CERT; unset X509_USER_KEY;'
        # TODO: improve how the worker gets a log
        self.logger.debug("Trying to get DN")
        try:
            self.userDN = getDNFromUserName(self.user, self.logger)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            self.init = False
            return
        if not self.userDN:
            self.init = False
            return
        defaultDelegation = {
                                  'logger': self.logger,
                                  'credServerPath': self.config.credentialDir,
                                  # It will be moved to be fetched from couchDB
                                  'myProxySvr': 'myproxy.cern.ch',
                                  'min_time_left' : getattr(self.config, 'minTimeLeft', 36000),
                                  'serverDN': self.config.serverDN,
                                  'uisource': self.uiSetupScript,
                                  'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)
                            }
        if hasattr(self.config, "cache_area"):
            try:
                defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(self.config.cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s' % self.config.cache_area)
                pass
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey

        self.valid = False
        try:
            self.valid, proxy = getProxy(self.userDN, "", "", defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)

        if self.valid:
            self.userProxy = proxy
        else:
            # Use the operator's proxy when the user proxy is invalid.
            # This will be moved soon
            self.logger.error('Did not get valid proxy. Setting proxy to ops proxy')
            self.userProxy = config.opsProxy

        if self.config.isOracle:
            try:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             config.opsProxy,
                                             config.opsProxy)
            except Exception:
                self.logger.exception('Failed when contacting Oracle')
                raise
        else:
            server = CouchServer(dburl=self.config.couch_instance, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            self.db = server.connectDatabase(self.config.files_database)

        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.pluginDir, namespace = self.config.pluginDir)
        self.commandTimeout = 1200
        self.max_retry = config.max_retry
        # Proxy management in Couch
        os.environ['X509_USER_PROXY'] = self.userProxy
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key':self.config.opsProxy,
                                       'cert':self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)

    def __call__(self):
        """
        a. read the Reporter files from the dropbox
        b. mark the listed LFNs as done or failed in the DB
        c. delete the processed Reporter files
        """
        self.logger.info("Retrieving files for %s" % self.user)
        files_to_update = self.files_for_update()
        self.logger.info("%s files to process" % len(files_to_update))
        self.logger.debug("%s files to process" % files_to_update)
        for input_file in files_to_update:
            remove_good = True
            remove_failed = True
            failed_lfns = []
            failure_reason = []
            good_lfns = []
            self.logger.info("Updating %s" % input_file)
            if os.path.basename(input_file).startswith('Reporter'):
                try:
                    with open(input_file) as f:
                        json_data = json.load(f)
                except Exception as e:
                    self.logger.error("Error loading %s" % e)
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink(input_file)
                    continue
                if json_data:
                    self.logger.debug('Inputs: %s %s %s' % (json_data['LFNs'], json_data['transferStatus'], json_data['failure_reason']))

                    if any(s in json_data['transferStatus'] for s in ('FAILED', 'abandoned', 'CANCELED', 'lost')):
                        # Sort failed files
                        failed_indexes = [i for i, x in enumerate(json_data['transferStatus']) if x == 'FAILED' or x == 'CANCELED']
                        abandoned_indexes = [i for i, x in enumerate(json_data['transferStatus']) if x == 'abandoned']
                        failed_indexes.extend(abandoned_indexes)
                        self.logger.info('failed indexes %s' % len(failed_indexes))
                        self.logger.debug('failed indexes %s' % failed_indexes)
                        for i in failed_indexes:
                            failed_lfns.append(json_data['LFNs'][i])
                            failure_reason.append(json_data['failure_reason'][i])
                        self.logger.debug('Marking failed %s %s' %(failed_lfns, failure_reason))
                        updated_failed_lfns = self.mark_failed(failed_lfns, failure_reason)


                    if any(s in json_data['transferStatus'] for s in ('Done', 'FINISHED', 'Finishing')):
                        # Sort good files
                        good_indexes = [i for i, x in enumerate(json_data['transferStatus']) if (x == 'Done' or x == 'FINISHED' or x == 'Finishing') ]
                        self.logger.info('good indexes %s' % len(good_indexes))
                        self.logger.debug('good indexes %s' % good_indexes)
                        for i in good_indexes:
                            good_lfns.append(json_data['LFNs'][i])
                        self.logger.info('Marking good %s' %(good_lfns))
                        try:
                            updated_good_lfns = self.mark_good(good_lfns)
                        except Exception:
                            self.logger.exception('Either no files to mark or failed to update state')

                    # Remove the json file
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink(input_file)

                else:
                    self.logger.info('Empty file %s' % input_file)
                    continue
            else:
                self.logger.info('File not for the Reporter %s' % input_file)
                continue
        self.logger.info('Update completed')
        return

    def files_for_update(self):
        """
        Retrieve the list of files to update.
        """
        files_to_update = []
        user_dir = os.path.join(self.dropbox_dir, self.user)
        self.logger.info('Looking into %s' % user_dir)
        for user_file in os.listdir(user_dir):
            files_to_update.append(os.path.join(self.dropbox_dir, self.user, user_file))
        return files_to_update

    def mark_good(self, files):
        """
        Mark the list of files as transferred
        """
        updated_lfn = []
        good_ids = []
        if len(files) == 0:
            return updated_lfn
        for it, lfn in enumerate(files):
            hash_lfn = getHashLfn(lfn)
            self.logger.info("Marking good %s" % hash_lfn)
            self.logger.debug("Marking good %s" % lfn)
            if not self.config.isOracle:
                try:
                    document = self.db.document(hash_lfn)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
            self.logger.info("Doc %s Loaded" % hash_lfn)
            try:
                now = str(datetime.datetime.now())
                last_update = time.time()
                if self.config.isOracle:
                    docId = getHashLfn(lfn)
                    good_ids.append(docId)
                    updated_lfn.append(lfn)
                else:
                    if document['state'] not in ('killed', 'done', 'failed'):
                        outputLfn = document['lfn'].replace('store/temp', 'store', 1)
                        data = dict()
                        data['end_time'] = now
                        data['state'] = 'done'
                        data['lfn'] = outputLfn
                        data['last_update'] = last_update
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + getHashLfn(lfn)
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(lfn)
                        self.logger.debug("Marked good %s" % lfn)
                    else: 
                        updated_lfn.append(lfn)
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue   
            except Exception as ex:
                msg = "Error updating document"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
        if self.config.isOracle:
            try:
                data = dict()
                data['asoworker'] = self.config.asoworker
                data['subresource'] = 'updateTransfers'
                data['list_of_ids'] = good_ids
                data['list_of_transfer_state'] = ["DONE" for x in good_ids]
                result = self.oracleDB.post(self.config.oracleFileTrans,
                                            data=encodeRequest(data))
                self.logger.debug("Marked good %s" % good_ids)
            except Exception:
                self.logger.exception('Error updating document')
                return {}
        
        self.logger.info("Transferred file %s updated, removing now source file" %docId)
        try:
            docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers','fileusertransfers'),
                                        data=encodeRequest({'subresource': 'getById', 'id': docId}))
            document = oracleOutputMapping(docbyId, None)[0]
        except Exception:
            msg = "Error getting file from source"
            self.logger.exception(msg)
            return {}

        if document["source"] not in self.site_tfc_map:
            self.logger.debug("site not found... gathering info from phedex")
            self.site_tfc_map[document["source"]] = self.get_tfc_rules(document["source"])
        pfn = self.apply_tfc_to_lfn('%s:%s' % (document["source"], lfn))
        self.logger.debug("File has to be removed now from source site: %s" % pfn)
        self.remove_files(self.userProxy, pfn)
        self.logger.debug("Transferred file removed from source")
        return updated_lfn

    def remove_files(self, userProxy, pfn):
        """
        Remove the given PFN from the source site with gfal-rm.
        """
        command = 'env -i X509_USER_PROXY=%s gfal-rm -v -t 180 %s'  % \
                  (userProxy, pfn)
        logging.debug("Running remove command %s" % command)
        try:
            rc, stdout, stderr = execute_command(command, self.logger, 3600)
        except Exception as ex:
            self.logger.error(ex)
        if rc:
            logging.info("Deletion command failed with output %s and error %s" %(stdout, stderr))
        else:
            logging.info("File Deleted.")
        return 

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        self.phedex.getNodeTFC(site)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception:
            self.logger.exception('A problem occurred when getting the TFC regexp for site %s' % site)
            return None
        return readTFC(tfc_file)

    def apply_tfc_to_lfn(self, file):
        """
        Take a CMS_NAME:lfn string and make a pfn.
        Update pfn_to_lfn_mapping dictionary.
        """
        try:
            site, lfn = tuple(file.split(':'))
        except Exception:
            self.logger.exception('It does not seem to be an LFN: %s' % file.split(':'))
            return None
        if site in self.site_tfc_map:
            pfn = self.site_tfc_map[site].matchLFN('srmv2', lfn)
            # TODO: improve fix for wrong tfc on sites
            try:
                if pfn.find("\\") != -1:
                    pfn = pfn.replace("\\", "")
                if pfn.split(':')[0] not in ('srm', 'gsiftp'):
                    self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                    return None
            except (IndexError, AttributeError):
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            return pfn
        else:
            self.logger.error('Wrong site %s!' % site)
            return None

    def mark_failed(self, files=[], failures_reasons=[], force_fail=False):
        """
        Something failed for these files so increment the retry count
        """
        updated_lfn = []
        for lfn in files:
            data = {}
            self.logger.debug("Document: %s" % lfn)
            if not isinstance(lfn, dict):
                if 'temp' not in lfn:
                    temp_lfn = lfn.replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn
            else:
                if 'temp' not in lfn['value']:
                    temp_lfn = lfn['value'].replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn['value']
            docId = getHashLfn(temp_lfn)
            # Load document to get the retry_count
            if self.config.isOracle:
                try:
                    self.logger.debug("Document: %s" %docId)
                    docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers',
                                                                                    'fileusertransfers'),
                                                data=encodeRequest({'subresource': 'getById', 'id': docId}))
                    document = oracleOutputMapping(docbyId)[0]
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = docId

                    if force_fail or document['transfer_retry_count'] + 1 > self.max_retry:
                        data['list_of_transfer_state'] = 'FAILED'
                        data['list_of_retry_value'] = 0
                    else:
                        data['list_of_transfer_state'] = 'RETRY'
                        fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['list_of_transfer_state'] = 'FAILED'
                        
                    data['list_of_failure_reason'] = failures_reasons[files.index(lfn)]
                    data['list_of_retry_value'] = 0

                    self.logger.debug("update: %s" % data)
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                                data=encodeRequest(data))
                    if data['list_of_transfer_state'] != 'RETRY':
                        updated_lfn.append(lfn)
                    self.logger.debug("Marked failed %s" % lfn)
                except Exception as ex:
                    self.logger.error("Error updating document status: %s" %ex)
                    continue
            else:
                try:
                    document = self.db.document( docId )
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                if document['state'] not in ('killed', 'done', 'failed'):
                    now = str(datetime.datetime.now())
                    last_update = time.time()
                    # Prepare data to update the document in couch
                    if force_fail or len(document['retry_count']) + 1 > self.max_retry:
                        data['state'] = 'failed'
                        data['end_time'] = now
                    else:
                        data['state'] = 'retry'
                        fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['state'] = 'failed'
                            data['end_time'] = now

                    self.logger.debug("Failure list: %s" % failures_reasons)
                    self.logger.debug("Files: %s" % files)
                    self.logger.debug("LFN %s" % lfn)

                    data['failure_reason'] = failures_reasons[files.index(lfn)]
                    data['last_update'] = last_update
                    data['retry'] = now
                    # Update the document in couch
                    self.logger.debug("Marking failed %s" % docId)
                    try:
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(docId)
                        self.logger.debug("Marked failed %s" % docId)
                    except Exception as ex:
                        msg = "Error in updating document in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                else:
                    updated_lfn.append(docId)
        self.logger.debug("failed file updated")
        return updated_lfn

    def determine_fatal_error(self, failure=""):
        """
        Determine if transfer error is fatal or not.
        """
        permanent_failure_reasons = [
            r".*canceled because it stayed in the queue for too long.*",
            r".*permission denied.*",
            r".*disk quota exceeded.*",
            r".*operation not permitted.*",
            r".*mkdir\(\) fail.*",
            r".*open/create error.*",
            r".*mkdir: cannot create directory.*",
            r".*does not have enough space.*"
        ]
        failure = str(failure).lower()
        for permanent_failure_reason in permanent_failure_reasons:
            if re.match(permanent_failure_reason, failure):
                return True
        return False

    def mark_incomplete(self, files=[]):
        """
        Mark the list of files as acquired
        """
        self.logger.error('Something called mark_incomplete, which should never be called')
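determine_fatal_error() above is just a regexp scan over known permanent-failure messages; here it is reduced to a stand-alone sketch (the function name is ours, and the patterns are a subset of the list above) so it can be exercised with made-up failure strings:

# Sketch of the fatal-error classification; failure strings are invented.
import re

PERMANENT_FAILURE_PATTERNS = [
    r".*permission denied.*",
    r".*disk quota exceeded.*",
    r".*mkdir\(\) fail.*",
]

def is_fatal(failure=""):
    failure = str(failure).lower()
    return any(re.match(p, failure) for p in PERMANENT_FAILURE_PATTERNS)

assert is_fatal("SRM error: Permission denied for /store/user/x")
assert not is_fatal("connection timed out, will retry")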
Example #9
class TransferDaemon(BaseDaemon):
    """
    _TransferDaemon_
    Call multiprocessing library to instantiate a TransferWorker for each user.
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        #Need a better way to test this without turning off this next line
        BaseDaemon.__init__(self, config, 'AsyncTransfer')

        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir
        if not os.path.isdir(self.dropbox_dir):
            try:
                os.makedirs(self.dropbox_dir)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    self.logger.error('Unknown error in mkdir: %s' % e.errno)
                    raise
        server = CouchServer(dburl=self.config.couch_instance, ckey=self.config.opsProxy, cert=self.config.opsProxy)
        self.db = server.connectDatabase(self.config.files_database)
        config_server = CouchServer(dburl=self.config.config_couch_instance)
        self.config_db = config_server.connectDatabase(self.config.config_database)
        self.logger.debug('Connected to CouchDB')
        self.pool = Pool(processes=self.config.pool_size)
        try:
            self.phedex = PhEDEx(responseType='xml', dict = {'key': self.config.opsProxy, 'cert': self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.schedAlgoDir, namespace = self.config.schedAlgoDir)

        # NOTE: in the original module result_list and current_running are
        # presumably module-level lists shared with the log_result callback
        # (not shown here); as locals of __init__ these assignments have no effect
        result_list = []
        current_running = []

    # Overriding setup() is optional, and not needed here
    def algorithm(self, parameters = None):
        """
        1. Get a list of users with files to transfer from the couchdb instance
        2. For each user get a suitably sized input for ftscp (call to a list)
        3. Submit the ftscp to a subprocess
        """
        query = {'stale':'ok'}
        try:
            params = self.config_db.loadView('asynctransfer_config', 'GetTransferConfig', query)
            self.config.max_files_per_transfer = params['rows'][0]['key'][1]
            self.config.algoName = params['rows'][0]['key'][2]
        except IndexError:
            self.logger.exception('Config data could not be retrieved from the config database. Falling back to the config file')
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB: %s' % e)

        users = self.active_users(self.db)

        sites = self.active_sites()
        self.logger.info('%s active sites' % len(sites))
        self.logger.debug('Active sites are: %s' % sites)

        site_tfc_map = {}
        for site in sites:
            # TODO: Remove this check once the ASO request will be validated before the upload.
            if site and str(site) != 'None' and str(site) != 'unknown':
                site_tfc_map[site] = self.get_tfc_rules(site)
        self.logger.debug('kicking off pool')
        for u in users:
            self.logger.debug('current_running %s' % current_running)
            if u not in current_running:
                self.logger.debug('processing %s' % u)
                current_running.append(u)
                self.logger.debug('processing %s' % current_running)
                self.pool.apply_async(ftscp, (u, site_tfc_map, self.config), callback=log_result)

    def active_users(self, db):
        """
        Query a view for users with files to transfer. Get this from the
        following view:
            ftscp?group=true&group_level=1
        """
        #TODO: Remove stale=ok for now until tested
        #query = {'group': True, 'group_level': 3, 'stale': 'ok'}
        query = {'group': True, 'group_level': 3}
        try:
            users = db.loadView(self.config.ftscp_design, 'ftscp_all', query)
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB: %s' % e)
            return []

        active_users = []
        if len(users['rows']) <= self.config.pool_size:
            active_users = [x['key'] for x in users['rows']]
        else:
            sorted_users = self.factory.loadObject(self.config.algoName,
                                                   args=[self.config,
                                                         self.logger,
                                                         users['rows'],
                                                         self.config.pool_size],
                                                   getFromCache=False,
                                                   listFlag=True)
            #active_users = random.sample(users['rows'], self.config.pool_size)
            active_users = sorted_users()[:self.config.pool_size]
        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)
        return active_users

    def active_sites(self):
        """
        Get a list of all sites involved in transfers.
        """
        query = {'group': True, 'stale': 'ok'}
        try:
            sites = self.db.loadView('AsyncTransfer', 'sites', query)
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB: %s' % e)
            return []

        def keys_map(inputDict):
            """
            Map function.
            """
            return inputDict['key']

        return map(keys_map, sites['rows'])

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        tfc_file = None
        try:
            self.phedex.getNodeTFC(site)
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception as e:
            self.logger.exception('PhEDEx cache exception: %s' % e)
        return readTFC(tfc_file)

    def terminate(self, parameters = None):
        """
        Called when thread is being terminated.
        """
        self.pool.close()
        self.pool.join()
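active_users() above applies a simple policy: take every user when they fit in the pool, otherwise let a pluggable scheduling algorithm rank them. A plain-Python sketch of that policy, with a stand-in callable in place of the WMFactory-loaded plugin (all names here are ours):

def pick_active_users(rows, pool_size, scheduler=None):
    # rows are couch view rows shaped like {'key': [user, group, role]}
    keys = [row['key'] for row in rows]
    if len(keys) <= pool_size:
        return keys
    # the real code loads a scheduling algorithm via WMFactory;
    # fall back to plain FIFO order when no scheduler is supplied
    ranked = scheduler(keys) if scheduler else keys
    return ranked[:pool_size]

rows = [{'key': ['alice', '', '']},
        {'key': ['bob', '', '']},
        {'key': ['carol', '', '']}]
print(pick_active_users(rows, pool_size=2))   # [['alice', '', ''], ['bob', '', '']]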
Example #10
class Getter(object):
    """
    Get transfers to be submitted
    """
    def __init__(self, config, quiet, debug, test=False):
        """
        initialize log, connections etc
        """
        self.config = config.Getter

        self.TEST = False
        createLogdir('Monitor')

        def setRootLogger(quiet, debug):
            """
            Taken from CRABServer TaskWorker
            Sets the root logger with the desired verbosity level
               The root logger logs to logs/submitter.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')

            if self.TEST:
                # if we are testing, logging to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/submitter.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        # create the logger before anything that may log from an except clause
        self.logger = setRootLogger(quiet, debug)

        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={
                                     'key': self.config.opsProxy,
                                     'cert': self.config.opsProxy
                                 })
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)

        self.documents = dict()
        self.doc_acq = ''
        self.STOP = False
        self.q = Queue()
        self.active_lfns = list()
        self.Update = update(self.logger, self.config)
        self.site_tfc_map = {}
        for site in [x['name']
                     for x in json.loads(self.phedex.getNodeMap())['phedex']['node']]:
            if site and str(site) != 'None' and str(site) != 'unknown':
                self.site_tfc_map[site] = self.get_tfc_rules(site)
                self.logger.debug('tfc site: %s %s' %
                                  (site, self.site_tfc_map[site]))

    def algorithm(self):
        """
        - Get users
        - Get source and destination sites
        - Create a queue entry for each (user, link)
        - Feed the worker threads
        """

        workers = list()
        for i in range(self.config.max_threads_num):
            worker = Thread(target=self.worker, args=(i, self.q))
            worker.setDaemon(True)
            worker.start()
            workers.append(worker)

        site_tfc_map = dict()
        while not self.STOP:
            sites, users = self.oracleSiteUser(self.Update)

            self.Update.retry()

            for _user in users:
                for source in sites:
                    for dest in sites:
                        lfns = [[x['source_lfn'], x['destination_lfn']]
                                for x in self.documents
                                if x['source'] == source and x['destination']
                                == dest and x['username'] == _user[0]
                                and x not in self.active_lfns]
                        self.active_lfns = self.active_lfns + lfns
                        # IMPORTANT: remove only on final states

                        for files in chunks(lfns, self.config.files_per_job):
                            self.q.put((files, _user, source, dest,
                                        self.site_tfc_map))

            self.logger.debug('Queue length: %s' % self.q.qsize())
            time.sleep(4)

        for w in workers:
            w.join()

        self.logger.info('Submitter stopped.')

    def oracleSiteUser(self, Update):
        """
        1. Acquire transfers from DB
        2. Get acquired users and destination sites
        """

        # TODO: flexible with other DBs and get users list

        users = Update.acquire()

        if users != 1:
            self.documents = Update.getAcquired(users)

        for doc in self.documents:
            if doc['user_role'] is None:
                doc['user_role'] = ""
            if doc['user_group'] is None:
                doc['user_group'] = ""

        unique_users = list()
        try:
            unique_users = [
                list(i) for i in set(
                    tuple([x['username'], x['user_group'], x['user_role']])
                    for x in self.documents)
            ]
        except Exception as ex:
            self.logger.error("Failed to map active users: %s" % ex)

        if len(unique_users) <= self.config.pool_size:
            active_users = unique_users
        else:
            active_users = unique_users[:self.config.pool_size]

        self.logger.info('%s active users' % len(active_users))
        self.logger.debug('Active users are: %s' % active_users)

        active_sites_dest = [x['destination'] for x in self.documents]
        active_sites = active_sites_dest + [
            x['source'] for x in self.documents
        ]

        self.logger.debug('Active sites are: %s' % list(set(active_sites)))
        return list(set(active_sites)), active_users

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        tfc_file = None
        try:
            self.phedex.getNodeTFC(site)
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)
        try:
            tfc_file = self.phedex.cacheFileName('tfc',
                                                 inputdata={'node': site})
        except Exception as e:
            self.logger.exception('PhEDEx cache exception: %s' % e)
        return readTFC(tfc_file)

    def critical_failure(self, lfns, lock, inputs):
        """
        if an exception occurs before the end, remove lfns from active
        to let it be reprocessed later.

        :param lfns:
        :param lock:
        :param inputs:
        :return:
        """
        lock.acquire()
        for lfn in lfns:
            self.active_lfns.remove(lfn)
        lock.release()
        inputs.task_done()

    def worker(self, i, inputs):
        """
        - Retrieve userDN
        - Retrieve user proxy
        - Delegate the proxy to FTS if needed
        - Submit the FTS job
        - Update doc states

        :param i: thread number
        :param inputs: tuple (lfns, _user, source, dest, tfc_map)
        :return:
        """
        # TODO: differentiate log messages per USER!
        logger = self.logger
        logger.info("Process %s is starting. PID %s", i, os.getpid())
        lock = Lock()
        Update = update(logger, self.config)

        while not self.STOP:
            if inputs.empty():
                time.sleep(10)
                continue
            try:
                lfns, _user, source, dest, tfc_map = inputs.get()
                [user, group, role] = _user
            except (EOFError, IOError):
                crashMessage = "Hit EOF/IO in getting new work\n"
                crashMessage += "Assuming this is a graceful break attempt.\n"
                logger.error(crashMessage)
                continue

            start = time.time()

            if not self.config.TEST:
                try:
                    userDN = getDNFromUserName(user,
                                               logger,
                                               ckey=self.config.opsProxy,
                                               cert=self.config.opsProxy)
                except Exception as ex:
                    logger.exception('Cannot retrieve user DN')
                    self.critical_failure(lfns, lock, inputs)
                    continue

                defaultDelegation = {
                    'logger':
                    logger,
                    'credServerPath':
                    self.config.credentialDir,
                    'myProxySvr':
                    'myproxy.cern.ch',
                    'min_time_left':
                    getattr(self.config, 'minTimeLeft', 36000),
                    'serverDN':
                    self.config.serverDN,
                    'uisource':
                    '',
                    'cleanEnvironment':
                    getattr(self.config, 'cleanEnvironment', False)
                }

                cache_area = self.config.cache_area

                try:
                    defaultDelegation['myproxyAccount'] = re.compile(
                        'https?://([^/]*)/.*').findall(cache_area)[0]
                except IndexError:
                    logger.error(
                        'MyproxyAccount parameter cannot be retrieved from %s'
                        % self.config.cache_area)
                    self.critical_failure(lfns, lock, inputs)
                    continue

                if getattr(self.config, 'serviceCert', None):
                    defaultDelegation['server_cert'] = self.config.serviceCert
                if getattr(self.config, 'serviceKey', None):
                    defaultDelegation['server_key'] = self.config.serviceKey

                try:
                    defaultDelegation['userDN'] = userDN
                    defaultDelegation['group'] = group
                    defaultDelegation['role'] = role
                    logger.debug('delegation: %s' % defaultDelegation)
                    valid_proxy, user_proxy = getProxy(defaultDelegation,
                                                       logger)
                    if not valid_proxy:
                        logger.error(
                            'Failed to retrieve user proxy... putting docs on retry'
                        )
                        logger.error(
                            'docs on retry: %s' %
                            Update.failed(lfns, submission_error=True))
                        continue
                except Exception:
                    logger.exception('Error retrieving proxy')
                    self.critical_failure(lfns, lock, inputs)
                    continue
            else:
                user_proxy = self.config.opsProxy
                self.logger.debug("Using opsProxy for testmode")

            context = dict()
            try:
                if self.config.TEST:
                    logger.debug("Running in test mode, submitting fake jobs")
                else:
                    context = fts3.Context(self.config.serverFTS,
                                           user_proxy,
                                           user_proxy,
                                           verify=True)
                    logger.debug(
                        fts3.delegate(context,
                                      lifetime=timedelta(hours=48),
                                      force=False))
            except Exception:
                logger.exception("Error submitting to FTS")
                self.critical_failure(lfns, lock, inputs)
                continue

            failed_lfn = list()
            try:
                if self.config.TEST:
                    submitted_lfn = lfns
                    jobid = getHashLfn(lfns[0][0])
                    self.logger.debug('Fake job id: ' + jobid)
                else:
                    failed_lfn, submitted_lfn, jobid = Submission(
                        lfns, source, dest, i, self.logger, fts3, context,
                        tfc_map)
                    if jobid == -1:
                        self.critical_failure(lfns, lock, inputs)
                        continue
                    logger.info('Submitted %s files' % len(submitted_lfn))
            except Exception:
                logger.exception("Unexpected error during FTS job submission!")
                self.critical_failure(lfns, lock, inputs)
                continue

            # TODO: add file FTS id and job id columns for kill command
            try:
                Update.submitted(lfns)
            except Exception:
                logger.exception("Error updating document status")
                self.critical_failure(lfns, lock, inputs)
                continue

            try:
                Update.failed(failed_lfn)
            except Exception:
                logger.exception(
                    "Error updating document status, job submission will be retried later..."
                )
                self.critical_failure(lfns, lock, inputs)
                continue

            try:
                createLogdir('Monitor/' + user)
                with open('Monitor/' + user + '/' + str(jobid) + '.txt',
                          'w') as outfile:
                    json.dump(lfns, outfile)
                logger.info('Monitor files created')
            except Exception:
                logger.exception("Error creating file for monitor")
                self.critical_failure(lfns, lock, inputs)
                continue

            end = time.time()
            self.logger.info('Input processed in %s', str(end - start))
            time.sleep(0.5)

        logger.debug("Worker %s exiting.", i)
        return 0

    def quit_(self):
        """
        set STOP to True
        :return:
        """
        self.logger.info(
            "Received kill request. Setting STOP flag in the master and threads..."
        )
        self.STOP = True
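Getter wires its threads together with a shared Queue and a STOP flag polled by every worker. A reduced, self-contained sketch of that pattern (the class and payloads are stand-ins, not the real Getter; the non-blocking empty()/get() check mirrors the snippet above):

import time
from threading import Thread
try:
    from Queue import Queue       # Python 2, as in the snippets above
except ImportError:
    from queue import Queue       # Python 3

class MiniGetter(object):
    def __init__(self, n_threads=2):
        self.STOP = False
        self.q = Queue()
        for i in range(n_threads):
            worker = Thread(target=self.worker, args=(i, self.q))
            worker.setDaemon(True)
            worker.start()

    def worker(self, i, inputs):
        # poll the STOP flag between items, like Getter.worker() above
        while not self.STOP:
            if inputs.empty():
                time.sleep(0.1)
                continue
            work = inputs.get()
            print('thread %s processing %s' % (i, work))
            inputs.task_done()

    def quit_(self):
        self.STOP = True

g = MiniGetter()
for job in ('a', 'b', 'c'):
    g.q.put(job)
g.q.join()      # blocks until every queued item is marked done
g.quit_()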
Example #11
class TransferDaemon(BaseWorkerThread):
    """
    _TransferDaemon_
    Call multiprocessing library to instantiate a TransferWorker for each user.
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        #Need a better way to test this without turning off this next line
        BaseWorkerThread.__init__(self)
        #logging.basicConfig(format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',datefmt = '%m-%d %H:%M')
        #self.logger = logging.getLogger()
        # self.logger is set up by the BaseWorkerThread, we just set its level

        self.config = config.AsyncTransfer
        try:
            self.logger.setLevel(self.config.log_level)
        except Exception:
            import logging
            self.logger = logging.getLogger()
            self.logger.setLevel(self.config.log_level)

        self.logger.debug('Configuration loaded')

        server = CouchServer(self.config.couch_instance)
        self.db = server.connectDatabase(self.config.files_database)
        self.logger.debug('Connected to CouchDB')
        self.pool = Pool(processes=self.config.pool_size)

        self.phedex = PhEDEx(responseType='xml')

    # Overriding setup() is optional, and not needed here

    def algorithm(self, parameters = None):
        """

        1. Get a list of users with files to transfer from the couchdb instance
        2. For each user get a suitably sized input for ftscp (call to a list)
        3. Submit the ftscp to a subprocess

        """
        users = self.active_users(self.db)

        sites = self.active_sites()
        self.logger.info('%s active sites' % len(sites))
        self.logger.debug('Active sites are: %s' % sites)

        site_tfc_map = {}
        for site in sites:
            site_tfc_map[site] = self.get_tfc_rules(site)

        self.logger.debug('kicking off pool')

        r = [self.pool.apply_async(ftscp, (u, site_tfc_map, self.config)) for u in users]
        for result in r:
            self.logger.info(result.get())

    def active_users(self, db):
        """
        Query a view for users with files to transfer. Get this from the
        following view:
            ftscp?group=true&group_level=1
        """
        query = {'group': True, 'group_level':3}
        users = db.loadView('AsyncTransfer', 'ftscp', query)

        active_users = []
        if len(users['rows']) <= self.config.pool_size:
            active_users = users['rows']
        else:
            #TODO: have a plugin algorithm here...
            active_users = random.sample(users['rows'], self.config.pool_size)

        def keys_map(inputDict):
            """
            Map function.
            """
            return inputDict['key']

        return map(keys_map, active_users)

    def active_sites(self):
        """
        Get a list of all sites involved in transfers.
        """
        query = {'group': True}
        sites = self.db.loadView('AsyncTransfer', 'sites', query)

        def keys_map(inputDict):
            """
            Map function.
            """
            return inputDict['key']

        return map(keys_map, sites['rows'])

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        self.phedex.getNodeTFC(site)
        tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})

        return readTFC(tfc_file)

    def terminate(self, parameters = None):
        """
        Called when thread is being terminated.
        """
        self.pool.close()
        self.pool.join()
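algorithm() in this example fires one ftscp per user with apply_async and then blocks on each AsyncResult in submission order. A tiny stand-alone demonstration of that collect pattern (square is a stub standing in for ftscp):

from multiprocessing import Pool

def square(x):
    # stub worker standing in for ftscp
    return x * x

if __name__ == '__main__':
    pool = Pool(processes=4)
    r = [pool.apply_async(square, (u,)) for u in range(5)]
    for result in r:
        print(result.get())       # blocks until that particular task is done
    pool.close()
    pool.join()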