def start(self, delay_in_sec=0):
    """Kick off the interval manager, optionally after a delay.

    Reads the crawl interval from config, sleeps for the requested
    delay, then either spins up a new CrawlerManager (normal case) or
    — if the manager was stopped meanwhile — resets itself to 'ready'.

    :delay_in_sec: seconds to wait before acting (0 = immediately)
    """
    # Interval is configured in minutes; keep it in seconds internally.
    self.__interval = float(config.get('crawler.interval')) * 60

    if delay_in_sec != 0:
        logging.info(""" Next crawl will start in {0} seconds.""".format(delay_in_sec))
        time.sleep(delay_in_sec)

    if self.status == 'stop':
        # A stop was requested while we were waiting — do not crawl.
        self.__set_status('ready')
        logging.info('Interval Manager finished - current state set to ready.')
    else:
        self.__cmanager = c.CrawlerManager(utl.unique_items_from_file())
        self.__start_time = times.get_localtime_sec()
        self.__cmanager.register_done(self.crawling_done_callback)
        self.__set_status('active')
        self.__cmanager.start()
def __init__(self, urls):
    """Set up crawl state for a set of urls.

    :urls: set of urls that will be crawled
    """
    self.__die = False
    # Initialize all attributes unconditionally: the original set
    # __done_callback/__urls/__pool only for a non-empty url set, which
    # left the instance half-initialized (AttributeError on any later
    # access) when urls was empty.
    self.__done_callback = None
    self.__urls = list(urls)
    self.__pool = None
    if self.__urls:
        # config values are read back as strings elsewhere in this
        # project (cf. float(config.get('crawler.interval'))), so
        # coerce the pool size to int before handing it to ThreadPool.
        self.__pool = mpool.ThreadPool(int(config.get('crawler.maxInst')))
def __init__(self):
    """Parse command-line arguments and dispatch to the chosen submodule.

    Acquires the global file lock, parses argv with docopt, configures
    logging severity from --loglevel, loads the configuration file, and
    finally invokes the handler for whichever subcommand was given.
    Exits the process on unrecoverable setup errors.
    """
    # Global lock guards against concurrent archive invocations; very
    # short timeout so a second instance fails fast instead of waiting.
    self.__filelock = lock.FileLock(LOCKFILE,
                                    folder=config.get('general.root'),
                                    timeout=0.1)
    self._args = docopt(__doc__, version='Archive 1.0')
    # Subcommand name -> bound handler method.
    submodules = {
        'init': self.handle_init,
        'crawler': self.handle_crawler,
        'javadapter': self.handle_javadapter,
        'db': self.handle_db,
        'config': self.handle_config,
        'repair': self.handle_repair
    }
    try:
        loglevel = self._args['--loglevel']
        if loglevel is not None:
            loglevel = loglevel.upper()
            # getattr raises AttributeError for names that are not
            # valid logging severity levels (e.g. logging.DEBUG).
            severity = getattr(logging, loglevel)
        else:
            # No --loglevel given: fall through to the INFO default.
            raise KeyError
    except KeyError:
        severity = logging.INFO
    except AttributeError:
        print('Error: \"loglevel\" is not valid severity level')
        print(__doc__)
        sys.exit(-1)
    try:
        logging.basicConfig(level=severity,
                            filename=os.path.join(paths.get_log_dir(),
                                                  'archive.log'),
                            format='%(asctime)s - %(levelname)s - %(message)s')
        pass
    except IOError as err:
        # Disable warning for initialization: during 'init' the log
        # directory does not exist yet, so the failure is expected.
        if self._args['init'] is False:
            print('Cannot open log - file structure probably does not exist yet:', err)
    # Set up config to another file if desired; skipped entirely for
    # 'init' since there is nothing to load yet.
    try:
        if self._args['init'] is False and self._args['--config']:
            config.load(os.path.abspath(self._args['--config']))
        elif self._args['init'] is False:
            config.load('webarchive.conf.xml')
    except IOError as err:
        print('FATAL: Unable to locate config:', err)
        sys.exit(-4)
    # iterating through arguments: run the handler of the first (and
    # only) subcommand flag docopt marked as present.
    for module, handler in submodules.items():
        if self._args[module]:
            try:
                handler()
            except lock.FileLockException:
                print("archive is currently locked with global.lock.")
                sys.exit(0)
def load(self, plugin_path=None):
    """Read every \*.py file from a directory into memory.

    Each file is stored as a (path, source) tuple on the internal
    source list; a trailing newline is appended to the source text.

    :plugin_path: directory containing .py filter files, or None to
                  derive the directory from the configuration
    """
    # Fall back to the configured filter directory when no explicit
    # path was supplied.
    if plugin_path is None:
        actual_path = os.path.join(config.get('general.root'),
                                   config.get('general.filterpath'))
    else:
        actual_path = plugin_path

    pattern = os.path.join(actual_path, '*.py')
    for path in sorted(glob.glob(pattern)):
        with open(path, 'r') as fh:
            self.__source_list.append((path, fh.read() + '\n'))
def handle_config(self):
    """Invoke the Config Handler operation selected on the command line.

    Exactly one of --get/--set/--default/--config is acted upon, in
    that order of precedence; nothing happens if none was given.
    """
    args = self._args
    if args['--get']:
        print(config.get(args['--get']))
        return
    if args['--set']:
        config.set(args['--set'], args['<value>'])
        return
    if args['--default']:
        print(config.get_default(args['--default']))
        return
    if args['--config']:
        config.load(args['--config'])
def start(host='localhost', port=None):
    """ Start the Javadapter server, and exit once done

    :host: the host to start the server on (does anything but
           localhost work?)
    :port: the port on which the server listens on; when None, the
           value of config 'javadapter.port' is used, read at call
           time
    :returns: a server, on which shutdown() can be called
    """
    if port is None:
        # The original evaluated config.get() in the def line, i.e. at
        # module-import time — before the config is necessarily loaded.
        # Resolving the default lazily keeps callers unaffected while
        # reading the current configuration value.
        port = config.get('javadapter.port')
    # Spawn a new thread for each connection
    server = ThreadedTCPServer((host, port), AdapterHandler)
    server_thread = threading.Thread(target=server.serve_forever)
    server_thread.daemon = True
    server_thread.start()
    return server
def __init__(self):
    """Resolve the on-disk location of the pickle cache."""
    # The cache file lives directly under the configured archive root.
    root = config.get('general.root')
    self.__pickle_path = os.path.join(root, 'pickle_cache')
def get_urllist_path():
    """:returns: path of the url list file (url.txt) under the archive root"""
    root = config.get('general.root')
    return os.path.join(root, config.get('crawler.urllistpath'))
def get_temp_root():
    """:returns: crawler temp directory path under the archive root"""
    root = config.get('general.root')
    return os.path.join(root, config.get('crawler.tempRoot'))
def get_log_dir():
    """:returns: log directory path ('logs/' under the archive root)"""
    root = config.get('general.root')
    return os.path.join(root, 'logs/')
def get_archive_root():
    """:returns: configured archive root path"""
    root = config.get('general.root')
    return root
def get_content_root():
    """:return: '{archive_root}/content/' path"""
    root = config.get('general.root')
    return os.path.join(root, 'content/')
def get_sqlpath():
    """:returns: sql statements directory path under the archive root"""
    root = config.get('general.root')
    return os.path.join(root, config.get('db.sqlSource'))
def get_dbpath():
    """:returns: metadata database path under the archive root"""
    root = config.get('general.root')
    return os.path.join(root, config.get('db.path'))
def get_urllist_path():
    """ :returns: return path to url.txt """
    # NOTE(review): this is a duplicate of the get_urllist_path defined
    # earlier in this module; being the later definition, it is the one
    # that takes effect. Consider removing one of the two.
    return os.path.join(config.get('general.root'),
                        config.get('crawler.urllistpath'))


###########################################################################
#                               unittest                                  #
###########################################################################
if __name__ == '__main__':
    # some predefined values to test with
    ROOT = config.get('general.root')
    SQL_PATH = 'sql/'
    DB_PATH = 'metadata.db'
    CRAWLER_TMP_ROOT = 'tmp/'
    CONTENT_DIR = 'content/'
    LOG_DIR = 'logs/'
    TEMP_DIR = 'tmp/'
    DOMAIN = 'www.domain_name.org'
    URL = 'url.txt'

    class TestPaths(unittest.TestCase):
        # Verifies the module's getters against the expected layout.
        def test_get_db_path(self):
            self.assertEqual(get_dbpath(), os.path.join(ROOT, DB_PATH))