Example #1
 def rescheduleVisit():
     for i in range(5):
         try:
             with store.LTAStorageDb(self.dbcreds) as db:
                 logger.info('Rescheduling %s for new visit.' % (location.path(),))
                 db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - VISIT_INTERVAL + datetime.timedelta(minutes=1))
                 break
         except Exception:
             time.sleep(1)
Example #2
 def _mark_directory_for_a_visit(self, dir_id):
     """
     Update the directory's last visit time to the Unix epoch (the lowest possible visit timestamp), so that it
     appears in the visitStats, which the scraper uses to determine the next directory to be visited.
     :param int dir_id: the id of the directory
     :return: None
     """
     with store.LTAStorageDb(self._dbcreds) as db:
         return db.updateDirectoryLastVisitTime(dir_id,
                                                datetime.fromtimestamp(0))
Example #3
    def _insert_missing_directory_tree_if_needed(self, srm_url):
        # example url: srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884/L652884_SAP000_B000_P001_bf_e619e5da.tar
        # or for a dir: srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884
        # site_url then becomes: srm://lofar-srm.fz-juelich.de:8443
        # dir_path then becomes: /pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884
        site = self._get_site_from_db(srm_url)
        dir_path = get_dir_path_in_site(srm_url)

        with store.LTAStorageDb(self._dbcreds) as db:
            return db.insert_missing_directory_tree_if_needed(
                dir_path, site['id'])
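Note: get_site_surl and get_dir_path_in_site are helpers imported elsewhere in the LOFAR codebase. Based purely on the comments above, a minimal illustrative sketch (not the actual implementation) of how an srm url splits into a site url and a directory path could look like this:

from urllib.parse import urlparse

def split_srm_url(srm_url):
    # illustrative only: split an srm url into the site url (scheme://host:port)
    # and the path within the site; the real get_dir_path_in_site presumably also
    # strips a trailing filename when the url points at a file rather than a directory.
    parsed = urlparse(srm_url)
    site_url = '%s://%s' % (parsed.scheme, parsed.netloc)  # e.g. srm://lofar-srm.fz-juelich.de:8443
    dir_path = parsed.path                                 # e.g. /pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884
    return site_url, dir_path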
Example #4
    def _get_site_from_db(self, srm_url):
        """
        Find the site entry in the database for the given srm_url.
        Raises a LookupError if not found.
        :param string srm_url: a valid srm url
        :return: a site entry dict from the database
        """
        site_url = get_site_surl(srm_url)

        # find site in db
        with store.LTAStorageDb(self._dbcreds) as db:
            site = next((s for s in db.sites() if s['url'] == site_url), None)
            if site is None:
                raise LookupError('Could not find site %s in database %s' %
                                  (site_url, self._dbcreds.database))
            return site
Example #5
    def _schedule_srmurl_for_visit(self, srm_url):
        """process the given srm_url, insert it in the db if needed, and mark it as not visited,
        so that the scraper will visit it soon.
        :param srm_url: a valid srm url like: srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884/L652884_SAP000_B000_P001_bf_e619e5da.tar
        :return: None
        """
        if srm_url:
            with store.LTAStorageDb(self._dbcreds) as db:
                site = self._get_site_from_db(srm_url)
                dir_path = get_dir_path_in_site(srm_url)
                directory = db.directoryByName(dir_path, site['id'])

                if directory is None:
                    dir_id = self._insert_missing_directory_tree_if_needed(
                        srm_url).get(dir_path)
                else:
                    dir_id = directory.get('dir_id')

                if dir_id is not None:
                    self._mark_directory_for_a_visit(dir_id)
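A minimal usage sketch (assumed context, based on Example #7 below where the handler is constructed with a dbcreds keyword argument): when an ingest event reports a new file at some srm url, the handler schedules the containing directory for a fresh visit.

handler = LTASOIngestEventHandler(dbcreds=dbcreds)  # dbcreds: assumed to be valid LTASO credentials
handler._schedule_srmurl_for_visit(
    'srm://lofar-srm.fz-juelich.de:8443/pnfs/fz-juelich.de/data/lofar/ops/projects/lc8_029/652884/L652884_SAP000_B000_P001_bf_e619e5da.tar')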
Example #6
def main():
    from optparse import OptionParser
    from lofar.common import dbcredentials

    # Check the invocation arguments
    parser = OptionParser(
        "%prog [options]",
        description=
        'runs the lta scraper and stores results in the specified database.')
    parser.add_option_group(dbcredentials.options_group(parser))
    parser.set_defaults(dbcredentials="LTASO")
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    dbcreds = dbcredentials.parse_options(options)

    logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword())

    global db
    db = store.LTAStorageDb(dbcreds)

    app.run(debug=False, host='0.0.0.0', port=9632)
Example #7
def main():
    '''the main function scanning all locations and gathering the results'''

    from optparse import OptionParser
    from lofar.common import dbcredentials
    from lofar.messaging import DEFAULT_BROKER, DEFAULT_BUSNAME
    from lofar.lta.ltastorageoverview.ingesteventhandler import LTASOIngestEventHandler, IngestEventMesssageBusListener

    # Check the invocation arguments
    parser = OptionParser("%prog [options]", description='runs the lta scraper and stores results in the specified database.')
    parser.add_option('-j', '--parallel', dest='parallel', type='int', default=8, help='number of parallel srmls jobs to run, default: %default')

    parser.add_option('-b', '--broker', dest='broker', type='string', default=DEFAULT_BROKER,
                      help='Address of the messaging broker, default: %default')
    parser.add_option('-e', '--exchange', dest='exchange', type='string',
                      default=DEFAULT_BUSNAME,
                      help='Name of the bus exchange on the broker on which the ingest notifications are published, default: %default')

    parser.add_option('-V', '--verbose', dest='verbose', action='store_true', help='verbose logging')
    parser.add_option_group(dbcredentials.options_group(parser))
    parser.set_defaults(dbcredentials="LTASO")
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG if options.verbose else logging.INFO)
    options.parallel = max(1, min(8*multiprocessing.cpu_count(), options.parallel))
    logger.info("Using maximum number of parallel srmls jobs: %d" % options.parallel)

    dbcreds = dbcredentials.parse_options(options)
    logger.info("Using dbcreds: %s" % dbcreds.stringWithHiddenPassword())

    db = store.LTAStorageDb(dbcreds)
    populateDbWithLTASitesAndRootDirs(db)

    # for each site we want one or more ResultGetterThreads
    # so make a dict with a list per site based on the locations
    getters = dict([(site['name'],[]) for site in db.sites()])

    # some helper functions
    def numLocationsInQueues():
        '''returns the total number of locations in the queues'''
        return db.numDirectoriesNotVisitedSince(datetime.datetime.utcnow() - VISIT_INTERVAL)

    def totalNumGetters():
        '''returns the total number of parallel running ResultGetterThreads'''
        return sum([len(v) for v in list(getters.values())])

    def cleanupFinishedGetters():
        # get rid of old finished ResultGetterThreads
        finishedGetters = dict([(site_name, [getter for getter in getterList if not getter.is_alive()]) for site_name, getterList in list(getters.items())])
        for site_name,finishedGetterList in list(finishedGetters.items()):
            for finishedGetter in finishedGetterList:
                getters[site_name].remove(finishedGetter)


    # the main loop
    # loop over the locations and spawn ResultGetterThreads to get the results in parallel
    # use load balancing over the different sites and with respect to queue lengths
    # do not overload this host system
    with IngestEventMesssageBusListener(handler_type=LTASOIngestEventHandler,
                                        handler_kwargs={'dbcreds': dbcreds},
                                        exchange=options.exchange, broker=options.broker):
        while True:

            cleanupFinishedGetters()

            # spawn new ResultGetterThreads
            # do not overload this host system
            num_waiting = numLocationsInQueues()
            while (num_waiting > 0 and
                   totalNumGetters() < options.parallel and
                   os.getloadavg()[0] < 4*multiprocessing.cpu_count()):
                sitesStats = db.visitStats(datetime.datetime.utcnow() - VISIT_INTERVAL)

                for site_name, site_stats in list(sitesStats.items()):
                    numGetters = len(getters[site_name])
                    queue_length = site_stats['queue_length']
                    weight = float(queue_length) / float(20 * (numGetters + 1))
                    if numGetters == 0 and queue_length > 0:
                        weight = 1e6 # make getterless sites extra important, so each site keeps flowing
                    site_stats['# get'] = numGetters
                    site_stats['weight'] = weight

                totalWeight = max(1.0, sum([site_stats['weight'] for site_stats in list(sitesStats.values())]))

                logger.debug("siteStats:\n%s" % str('\n'.join([str((k, v)) for k, v in list(sitesStats.items())])))

                # now pick a random site using the weights
                chosen_site_name = None
                cumul = 0.0
                r = random()
                for site_name,site_stats in list(sitesStats.items()):
                    ratio = site_stats['weight']/totalWeight
                    cumul += ratio

                    if r <= cumul and site_stats['queue_length'] > 0:
                        chosen_site_name = site_name
                        break

                if not chosen_site_name:
                    break

                chosen_dir_id = sitesStats[chosen_site_name]['least_recent_visited_dir_id']
                db.updateDirectoryLastVisitTime(chosen_dir_id, datetime.datetime.utcnow())

                logger.debug("chosen_site_name: %s chosen_dir_id: %s", chosen_site_name, chosen_dir_id)

                # make and start a new ResultGetterThread for the chosen directory of the chosen site
                newGetter = ResultGetterThread(dbcreds, chosen_dir_id)
                newGetter.start()
                getters[chosen_site_name].append(newGetter)

                cleanupFinishedGetters()

                # refresh num_waiting
                num_waiting = numLocationsInQueues()
                logger.info('numLocationsInQueues=%d totalNumGetters=%d siteQueueLengths: %s load_5min: %.1f' % (num_waiting,
                                                                                                                 totalNumGetters(),
                                                                                                                 ' '.join(['%s:%d' % (name, stats['queue_length']) for name, stats in list(sitesStats.items())]),
                                                                                                                 os.getloadavg()[0]))

            # sleep before main loop next iteration
            # to wait for some results
            # and some getters to finish
            time.sleep(30 if num_waiting <= options.parallel else 0.25)
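For reference, the site-selection step inside the loop above is a weighted random pick over the per-site queue statistics; a stand-alone sketch with made-up stats (illustrative only, not part of the original module) is:

from random import random

def pick_site(sitesStats):
    # pick a site at random, weighted by its 'weight'; sites with an empty queue are never picked
    totalWeight = max(1.0, sum(s['weight'] for s in sitesStats.values()))
    cumul = 0.0
    r = random()
    for site_name, site_stats in sitesStats.items():
        cumul += site_stats['weight'] / totalWeight
        if r <= cumul and site_stats['queue_length'] > 0:
            return site_name
    return None

# siteB has no running getters and a long queue, so its huge weight makes it the usual pick
print(pick_site({'siteA': {'weight': 0.5, 'queue_length': 10},
                 'siteB': {'weight': 1e6, 'queue_length': 200}}))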
Example #8
    def run(self):
        '''A single location is popped from the locations deque and the results are queried.
        Resulting subdirectories are appended to the locations deque.'''
        try:
            with store.LTAStorageDb(self.dbcreds) as db:
                dir = db.directory(self.dir_id)

                if not dir:
                    return

                dir_id = dir['dir_id']
                dir_name = dir['dir_name']

                site_id = dir['site_id']
                site = db.site(site_id)
                srm_url = site['url']

            location = Location(srm_url, dir_name)

            try:
                def rescheduleVisit():
                    for i in range(5):
                        try:
                            with store.LTAStorageDb(self.dbcreds) as db:
                                logger.info('Rescheduling %s for new visit.' % (location.path(),))
                                db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - VISIT_INTERVAL + datetime.timedelta(minutes=1))
                                break
                        except Exception:
                            time.sleep(1)


                # get results... long blocking
                result = location.getResult()
                logger.info(result)

                with store.LTAStorageDb(self.dbcreds) as db:
                    # convert the result.files list into a dict
                    # with (filename, dir_id) as key and a tuple with all file info as value
                    result_file_tuple_dict = {}
                    for file in result.files:
                        filename = file.filename.split('/')[-1]
                        key = (filename, dir_id)
                        file_tuple = (filename, int(file.size), file.created_at, dir_id)
                        result_file_tuple_dict[key] = file_tuple

                    # create a dict of all already known files from the db
                    known_file_dict = {}
                    for file in db.filesInDirectory(dir_id):
                        key = (str(file['name']), dir_id)
                        known_file_dict[key] = file

                    # now compare the result and known (filename, dir_id) sets
                    # and find out which are new, and which are known.
                    # compare only by (filename, dir_id) because for a given file the size and/or date might have changed,
                    # but that does not make it a new/unique file.
                    result_file_key_set = set(result_file_tuple_dict.keys())
                    known_file_key_set = set(known_file_dict.keys())
                    new_file_key_set = result_file_key_set - known_file_key_set
                    removed_file_key_set = known_file_key_set - result_file_key_set

                    logger.info("%s %s: %d out of %d files are new, and %d are already known", site['name'],
                                                                                                dir_name,
                                                                                                len(new_file_key_set),
                                                                                                len(result_file_key_set),
                                                                                                len(known_file_key_set))

                    if new_file_key_set:
                        new_file_tuple_set = [result_file_tuple_dict[key] for key in new_file_key_set]
                        file_ids = db.insertFileInfos(new_file_tuple_set)

                        if len(file_ids) != len(new_file_tuple_set):
                            rescheduleVisit()

                    if known_file_key_set:
                        for key, known_file in list(known_file_dict.items()):
                            if key in result_file_tuple_dict:
                                result_file_tuple = result_file_tuple_dict[key]

                                known_size = int(known_file['size'])

                                result_size = result_file_tuple[1]

                                if known_size != result_size:
                                    logger.info("%s %s: updating %s (id=%d) size from %d to %d",
                                                site['name'], dir_name, known_file['name'], known_file['id'],
                                                known_size, result_size)
                                    db.updateFileInfoSize(known_file['id'], result_size)

                    if removed_file_key_set:
                        for removed_file_key in removed_file_key_set:
                            db.deleteFileInfoFromDirectory(removed_file_key[0], removed_file_key[1])

                    # skip empty nikhef dirs
                    filteredSubDirectories = [loc for loc in result.subDirectories
                                              if not ('nikhef' in loc.srmurl and 'generated' in loc.directory) ]

                    # skip sksp spectroscopy project
                    filteredSubDirectories = [loc for loc in filteredSubDirectories
                                              if not ('sara' in loc.srmurl and 'sksp' in loc.directory and 'spectro' in loc.directory) ]

                    subDirectoryNames = [loc.directory for loc in filteredSubDirectories]

                    if subDirectoryNames:
                        # check for already known subdirectories in the db
                        known_subDirectoryNames_set = set(subdir['name'] for subdir in db.subDirectories(dir_id))

                        new_subdir_name_set = set(subDirectoryNames) - known_subDirectoryNames_set

                        logger.info("%s %s: %d out of %d subdirs are new, and %d are already known", site['name'], dir_name, len(new_subdir_name_set), len(subDirectoryNames), len(known_subDirectoryNames_set))

                        if new_subdir_name_set:
                            subdir_ids = db.insertSubDirectories(new_subdir_name_set, dir_id)

                            if len(subdir_ids) != len(new_subdir_name_set):
                                rescheduleVisit()

            except (SrmlsException, ParseException) as e:
                logger.error('Error while scanning %s\n%s' % (location.path(), str(e)))

                if 'does not exist' in str(e):
                    with store.LTAStorageDb(self.dbcreds) as db:
                        db.deleteDirectory(self.dir_id)
                else:
                    rescheduleVisit()

        except Exception as e:
            logger.exception(str(e))

            with store.LTAStorageDb(self.dbcreds) as db:
                logger.info('Rescheduling dir_id %d for new visit.' % (self.dir_id,))
                db.updateDirectoryLastVisitTime(self.dir_id, datetime.datetime.utcnow() - VISIT_INTERVAL)
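The reconciliation step above compares the srmls result against the database using only (filename, dir_id) keys, so a changed size or date does not make a file look new. Stripped of the database calls, the core set logic is (illustrative sketch, not part of the original module):

def diff_file_listing(result_file_tuple_dict, known_file_dict):
    # both dicts are keyed by (filename, dir_id); their values are irrelevant for the diff itself
    result_keys = set(result_file_tuple_dict)
    known_keys = set(known_file_dict)
    new_keys = result_keys - known_keys        # to be inserted with insertFileInfos
    removed_keys = known_keys - result_keys    # to be deleted with deleteFileInfoFromDirectory
    common_keys = result_keys & known_keys     # checked for size changes via updateFileInfoSize
    return new_keys, removed_keys, common_keys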
Example #9
 def create_database_connection(self) -> store.LTAStorageDb:
     return store.LTAStorageDb(self.dbcreds)
Example #10
def main():
    from optparse import OptionParser
    from lofar.common import dbcredentials

    # Check the invocation arguments
    parser = OptionParser(
        "%prog [options]",
        description=
        'execute a performance test by inserting many files on an empty test database.'
    )
    (options, args) = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    with LTAStorageDbTestInstance() as test_db:
        base_date = datetime.utcnow()

        db = store.LTAStorageDb(test_db.dbcreds)

        db.insertSiteIfNotExists('sara', 'srm://srm.siteA.nl:8444')
        rootdir_id = db.insertRootDirectory(
            'sara', '/pnfs/grid.siteA.nl/data/lofar/ops')
        projects_dir_id = db.insertSubDirectory(
            '/pnfs/grid.siteA.nl/data/lofar/ops/projects', rootdir_id)

        total_num_files_inserted = 0

        with open('db_perf.csv', 'w') as file:
            for cycle_nr in range(1, 10):
                for project_nr in range(1, 10):
                    # project_name = 'lc%d_%03d/%d' % (cycle_nr, project_nr, os.getpid())
                    project_name = 'lc%d_%03d' % (cycle_nr, project_nr)
                    projectdir_id = db.insertSubDirectory(
                        '/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s' %
                        (project_name, ), projects_dir_id)

                    obs_base_id = cycle_nr * 100000 + project_nr * 1000
                    for obs_nr, obsId in enumerate(
                            range(obs_base_id, obs_base_id + 20)):
                        obsName = 'L%s' % obsId

                        obsdir_id = db.insertSubDirectory(
                            '/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s/%s'
                            % (project_name, obsName), projectdir_id)

                        fileinfos = [
                            ('%s_SB%3d' % (obsName, sbNr),
                             1000 + sbNr + project_nr * cycle_nr, base_date +
                             timedelta(days=10 * cycle_nr + project_nr,
                                       minutes=obs_nr,
                                       seconds=sbNr), obsdir_id)
                            for sbNr in range(0, 2)
                        ]
                        now = datetime.utcnow()
                        file_ids = db.insertFileInfos(fileinfos)
                        total_num_files_inserted += len(file_ids)
                        elapsed = totalSeconds(datetime.utcnow() - now)
                        line = '%s,%s' % (total_num_files_inserted, elapsed)
                        print(line)
                        file.write(line + '\n')