def main():
    """Entry point for the frontier service process.

    Parses the crawl-job name from the command line, builds a SimpleDB-backed
    logger for the job, starts the seeder, metrics, and remote-frontier
    threads, then blocks until the frontier service exits.  An
    SqsMessageRetentionException is treated as fatal for the whole crawl job
    and triggers a full stop via CrawlJobController.
    """
    local_logger = LocalLogger('frontier')
    local_logger.log(LogType.Info, 'Starting')
    if not __debug__:
        # Raise process priority in optimized (non-debug) runs.
        # NOTE(review): os.nice(-1) requires elevated privileges — confirm
        # the service runs with permission to lower its nice value.
        os.nice(-1)
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', action='store_true')
    parser.add_argument('job', type=str)
    args = parser.parse_args()
    config_fetcher = ConfigFetcher(args.job)
    config_file = config_fetcher.get_config_file()
    logger = SimpleLogger(
        get_or_create_domain(AwsConnections.sdb(),
                             CrawlJobGlossary(args.job).logs_table_name),
        create_frontier_id(config_file.global_config.environment))
    try:
        # Choose the frontier implementation based on the configured
        # compute environment.
        if config_file.global_config.environment == ComputeEnv.AWS:
            frontier = AwsFrontier(args.job, logger)
        else:
            frontier = LocalFrontier(args.job, logger)
        seeder = FrontierSeeder(config_file.global_config, frontier)
        seeder_thread = InterruptableThread(lambda t: seeder.run())
        seeder_thread.start()
        metrics_service = MetricsService(args.job, 10)
        metrics_service.start()
        frontier_service = RemoteFrontier(frontier)
        frontier_service.start()
        logger.log(LogType.Info, 'Started')
        # Block until the frontier service thread finishes, then report an
        # abnormal exit if it died with an exception.
        frontier_service.join()
        if frontier_service.threw_exception:
            logger.log(LogType.InternalError, 'Unexpectedly stopped', None,
                       frontier_service.exception, frontier_service.exc_info)
    # FIX: was the Python 2-only `except SqsMessageRetentionException, ex:`
    # form; `as` is correct on Python 2.6+ and required on Python 3.
    except SqsMessageRetentionException as ex:
        # Message retention expiry means queued work has been lost; the
        # whole crawl job is stopped rather than continuing inconsistently.
        logger.log(LogType.InternalWarning, "Full-stopping crawl job", None,
                   ex, sys.exc_info())
        CrawlJobController(args.job).stop()
def logs_table(self):
    """Return the domain used for log records, creating it on first access.

    The result of get_or_create_domain is cached on the instance, so the
    remote lookup happens at most once.
    """
    domain = self._logs_table
    if domain is None:
        domain = get_or_create_domain(
            self._sdb, self.glossary.logs_table_name)
        self._logs_table = domain
    return domain
def skipped_urls(self):
    """Return the domain used for skipped URLs, creating it on first access.

    The result of get_or_create_domain is cached on the instance, so the
    remote lookup happens at most once.
    """
    domain = self._skipped_urls
    if domain is None:
        domain = get_or_create_domain(
            self._sdb, self.glossary.skipped_urls_table_name)
        self._skipped_urls = domain
    return domain
def redirected_urls(self):
    """Return the domain used for redirected URLs, creating it on first access.

    The result of get_or_create_domain is cached on the instance, so the
    remote lookup happens at most once.
    """
    domain = self._redirected_urls
    if domain is None:
        domain = get_or_create_domain(
            self._sdb, self.glossary.redirected_urls_table_name)
        self._redirected_urls = domain
    return domain