def lock_domain(domain, lock_timeout=300, wait=True):
    """Acquire the filesystem lock for *domain* (shared impl for lock/try_lock).

    Args:
        domain: Name of the domain to lock; its path must already exist.
        lock_timeout: Seconds the FileLock waits before giving up.
        wait: When False, fail immediately instead of blocking on a held lock.

    Raises:
        ProtocolError: If the domain does not exist, the lock is already
            held (and wait is False), or the underlying lock machinery fails.
    """
    global GLOBAL_LOCK_TABLE
    if not os.path.exists(paths.get_domain_path(domain)):
        raise ProtocolError('Domain does not exist.')
    if domain in GLOBAL_LOCK_TABLE:
        # We do not want to wait, so raise a ProtocolError immediately
        if GLOBAL_LOCK_TABLE[domain].is_locked and wait is False:
            raise ProtocolError('Already locked.')
    try:
        # Create a lock object on first use of this domain.
        if domain not in GLOBAL_LOCK_TABLE:
            GLOBAL_LOCK_TABLE[domain] = lock.FileLock(
                file_name=domain,
                folder=paths.get_content_root(),
                timeout=lock_timeout)
        # Lock, or wait for the lock (up to lock_timeout seconds).
        GLOBAL_LOCK_TABLE[domain].acquire()
    # Convert lower-level failures to ProtocolError, preserving the cause.
    except lock.FileLockException as err:
        raise ProtocolError(str(err)) from err
    except OSError as err:
        # The lock object is in an unknown state; drop it so the next
        # attempt starts from scratch.
        del GLOBAL_LOCK_TABLE[domain]
        raise ProtocolError(str(err)) from err
def start_sync(self):
    """Mirror the temporary crawl source into the content archive.

    For every domain found under the temporary path: takes that domain's
    filesystem lock, ensures the target directory exists, prepares the git
    repository, rsyncs the crawled data in, commits, and rebuilds master.
    """
    content_path = paths.get_content_root()
    itemlist = os.listdir(self.__path)
    for domain in itemlist:
        domain_path = paths.get_domain_path(domain)
        fsmutex = lock.FileLock(domain, folder=content_path, timeout=100)
        fsmutex.acquire()
        try:
            try:
                logging.debug('Creating directory: ' + domain_path)
                os.mkdir(domain_path)
            except OSError:
                # Directory already exists -- expected on a re-sync.
                pass
            git_proc = git.Git(domain)
            git_proc.init()
            git_proc.checkout('empty')
            git_proc.branch(self.__metalist[0]['commitTime'])
            rsync(os.path.join(self.__path, domain), content_path)
            git_proc.commit('Site {domain_name} was crawled.'
                            .format(domain_name=domain))
            git_proc.recreate_master()
        finally:
            # Release the lock even when a git/rsync step raises; the
            # original code leaked the lock on any failure above.
            fsmutex.release()
def clear_locks():
    """Search the content root for ``*.lock`` files and remove them.

    Lock files left behind by (very) rough shutdowns would block future
    domain locking, so they are deleted wholesale.
    """
    try:
        root = paths.get_content_root()
        # Walk the tree in-process instead of shelling out to `find` with an
        # interpolated path (shell=True was a quoting/injection hazard).
        # Match "*.lock" case-insensitively, like `find -iname "*.lock"`.
        locklist = [os.path.join(dirpath, name)
                    for dirpath, _dirnames, filenames in os.walk(root)
                    for name in filenames
                    if name.lower().endswith('.lock')]
        if locklist:
            logging.info('Will remove following files:')
            for lockfile in locklist:
                logging.info('rm -f ' + lockfile)
                os.remove(lockfile)
    except OSError:
        # Covers both walk failures and a racing deletion of a lock file.
        logging.exception('Cannot clear locks')
def repair():
    """Walk the domain hierarchy, sanitizing each repo and clearing locks."""
    try:
        # Bring every repo back to a known-good state first.
        # Additional errorchecking might take place here.
        pattern = os.path.join(paths.get_content_root(), '*')
        for domain_dir in glob.glob(pattern):
            logging.info('- Sanitizing: ' + domain_dir)
            sanitize_domain(domain_dir)
        # Then drop any .lock files left by (very) rough shutdowns.
        clear_locks()
    except OSError:
        logging.exception('Unable to repair archive')
def load(self):
    """Recover archive metadata by running xml recovery across a thread pool.

    Returns:
        The accumulated metadata list (``self.__metalist``).
    """
    thread_pool = None
    try:
        self.__init__()
        domain_patt = os.path.join(paths.get_content_root(), '*')
        domain_list = glob.glob(domain_patt)
        thread_pool = ThreadPool(16)
        thread_pool.map(self.recover_domain, domain_list)
        thread_pool.close()
        thread_pool.join()
    except KeyboardInterrupt:
        print('Got interrupted')
    finally:
        self.__shutdown = True
        # Guard: the pool may never have been created if an earlier
        # statement raised (the original hit a NameError here in that case).
        # Calling close() on an already-closed pool is a harmless no-op.
        if thread_pool is not None:
            thread_pool.close()
            thread_pool.join()
    return self.__metalist