def purge_out_of_scope(self, crawl_job_name):
    """
    Move out-of-scope items from crawled_urls into skipped_urls and delete
    their content from S3.
    """
    # Todo: Not tested
    scope = CrawlerScope(ConfigFetcher(crawl_job_name).get_scope_file())
    crawl_job = CrawlJob(crawl_job_name)
    next_token_file_path = os.path.join(
        LOCAL_CRAWL_JOB_DIR, crawl_job_name,
        "purge_out_of_scope_next_crawled_urls_token.txt")
    with open(next_token_file_path, 'r') as next_token_file:
        prev_next_token = next_token_file.read()

    query = "select `url`, `referrer_id` from `{0}`".format(
        crawl_job.glossary.crawled_urls_table_name)
    try:
        items = crawl_job.crawled_urls.select(query, next_token=prev_next_token)
    except Exception:
        # The saved token is missing or stale; start over from the beginning.
        items = crawl_job.crawled_urls.select(query)
    next_token = items.next_token
    count = 0
    try:
        for item in items:
            count += 1
            # Keep prev_next_token one page behind the current token so an
            # interrupted run can resume without skipping items.
            if prev_next_token != items.next_token:
                prev_next_token = next_token
                next_token = items.next_token
            url_info = UrlInfo(item['url'], canonized_url=item['url'])
            c = scope.get(url_info)
            if c == UrlClass.InScope or item.name.endswith('robots.txt') or \
                    item.name.endswith('sitemap.xml'):
                continue
            attributes = {REASON_ATTR_NAME: c}
            referrer_id = item.get(REFERRER_ID_ATTR_NAME, None)
            if referrer_id:
                attributes[REFERRER_ID_ATTR_NAME] = referrer_id
            crawl_job.skipped_urls.put_attributes(item.name, attributes)
            key = crawl_job.crawled_content_bucket.get_key(url_info.s3_key)
            if key:
                key.delete()
            item.delete()  # Todo: do this in batches?
    except Exception:
        # Checkpoint the token so the next run can resume where this one stopped.
        with open(next_token_file_path, 'w') as next_token_file:
            next_token_file.write(prev_next_token)
        print "Interrupted after %s records." % count
        raise
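# A possible answer to the "do this in batches?" Todo above: boto's SimpleDB
# domains expose batch_delete_attributes, which accepts up to 25 items per
# request. A minimal sketch; the buffering helper itself is hypothetical and
# not part of this codebase:

class BatchDeleter(object):
    """Buffers item names and deletes them from a SimpleDB domain 25 at a time."""
    MAX_BATCH_SIZE = 25  # SimpleDB's per-request limit

    def __init__(self, domain):
        self.domain = domain
        self.pending = {}

    def delete(self, item_name):
        self.pending[item_name] = None  # None deletes all of the item's attributes
        if len(self.pending) >= self.MAX_BATCH_SIZE:
            self.flush()

    def flush(self):
        if self.pending:
            self.domain.batch_delete_attributes(self.pending)
            self.pending = {}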
def persist(self):
    if self.ec2_instance is not None and self.ec2_instance.state in [
            InstanceState.RUNNING, InstanceState.PENDING]:
        raise PreconditionNotMet(
            'Frontier is still in the %s state' % self.ec2_instance.state)

    crawl_job = CrawlJob(self.crawl_job_name)
    # Persist all queues whose names start with crawl_job.name.
    persist_to_s3(AwsConnections.sqs(), crawl_job.name,
                  crawl_job.persisted_frontier_bucket)
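# persist_to_s3 is defined elsewhere; a minimal sketch of what it might do,
# assuming one S3 key per queue holding a JSON list of raw message bodies.
# The key layout and destructive drain loop are illustrative assumptions
# (safe here only because the frontier is stopped before persisting):

import json

def persist_to_s3_sketch(sqs_conn, queue_prefix, bucket):
    for queue in sqs_conn.get_all_queues(prefix=queue_prefix):
        bodies = []
        while True:
            messages = queue.get_messages(num_messages=10)
            if not messages:
                break
            for message in messages:
                bodies.append(message.get_body())
                queue.delete_message(message)
        key = bucket.new_key(queue.name)
        key.set_contents_from_string(json.dumps(bodies))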
def enqueue_skipped(self):
    # Todo: Not tested
    crawl_job = CrawlJob(self.crawl_job_name)
    scope = CrawlerScope(ConfigFetcher(self.crawl_job_name).get_scope_file())
    frontier = get_frontier_client(self.ec2_instance, None)
    for item in crawl_job.skipped_urls.select(
            "select * from `%s`" % crawl_job.skipped_urls.name):
        url_info = UrlInfo(item.name)
        if scope.get(url_info) == UrlClass.InScope:
            # Not every skipped URL has a referrer (see purge_out_of_scope),
            # so avoid a KeyError on direct indexing.
            referrer_id = item.get(REFERRER_ID_ATTR_NAME, None)
            if referrer_id:
                url_info.referrer_id = referrer_id
            frontier.enqueue(url_info)
            item.delete()  # Todo: do this in batches?
def __init__(self, crawl_job_name, emit_interval):
    """
    :param crawl_job_name: Name of the crawl job to emit metrics for.
    :param emit_interval: How often to emit the metrics, in minutes.
    """
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self.emit_interval = emit_interval * SECONDS_PER_MINUTE
    self.namespace = 'atrax/' + self.crawl_job.name
    self.cw = AwsConnections.cloudwatch()
    self.sqs = AwsConnections.sqs()
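# A minimal sketch of an emit loop built on the fields above. The metric name
# and the way queue depth is sampled are illustrative assumptions, not taken
# from this codebase; boto's put_metric_data call is real.

import time

def emit_queue_depth_sketch(self):
    while True:
        total = 0
        for queue in self.sqs.get_all_queues(self.crawl_job.name):
            total += queue.count()  # approximate number of visible messages
        self.cw.put_metric_data(self.namespace, 'FrontierSize',
                                value=total, unit='Count')
        time.sleep(self.emit_interval)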
def __init__(self, crawl_job_name, logger):
    self.logger = logger
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self._recurring_timer_interval = \
        self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
    self.metrics = FrontierMetrics(self.crawl_job.name)

    local_fetcher_id = create_fetcher_id(self._global_config.environment, 0)
    # The minimum dequeue interval that every consumer must have in order to
    # be considered as a queue donor.
    min_dequeue_interval = \
        (1.0 / self._global_config.max_fetch_rate) * Frontier.DEQUEUE_INTERVAL_MARGIN
    self._consumers = ConsumerCollection(
        local_fetcher_id, min_dequeue_interval,
        self.crawl_job.instance_accessor, self._recurring_timer_interval,
        self.logger)

    self._queues_by_name = {}
    self._unassigned_queues = deque()
    for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
        frontier_queue = FrontierQueue(queue)
        self._queues_by_name[queue.name] = frontier_queue
        # Non-empty queues go to the front so they get assigned first.
        if frontier_queue.count > 0:
            self._unassigned_queues.appendleft(frontier_queue)
        else:
            self._unassigned_queues.append(frontier_queue)

    self._is_scaling = True

    # This is a hack to serialize the execution of asynchronous operations
    # into the main thread.
    self._zmq_context = zmq.Context.instance()
    self._zmq_socket = self._zmq_context.socket(zmq.REQ)
    self._zmq_socket.connect(
        'tcp://%s:%s' % (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

    self._enqueue_count = 0
    self._dequeue_count = 0
    self._previous_emit_time = time.time()
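# The REQ socket "hack" above implies a REP socket being serviced on the
# frontier's main loop, so that operations initiated from other threads are
# funneled through a single thread. A minimal self-contained sketch of that
# REQ/REP pattern; the handler and message format are illustrative
# assumptions:

import zmq

def handle_sketch(request):
    # Hypothetical request handler; the real frontier dispatches on the
    # request's content.
    return 'ok'

def main_loop_sketch(port):
    context = zmq.Context.instance()
    rep_socket = context.socket(zmq.REP)
    rep_socket.bind('tcp://127.0.0.1:%d' % port)
    while True:
        request = rep_socket.recv()            # blocks; one request at a time
        rep_socket.send(handle_sketch(request))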
def restore(self):
    crawl_job = CrawlJob(self.crawl_job_name)
    restore_from_s3(AwsConnections.sqs(),
                    crawl_job.persisted_frontier_bucket,
                    queue_creator=FrontierQueue.create_sqs_queue)
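# restore_from_s3 is the counterpart of persist_to_s3; a minimal sketch under
# the same assumed layout as the persist sketch above (one key per queue
# holding a JSON list of message bodies):

import json

def restore_from_s3_sketch(sqs_conn, bucket, queue_creator):
    for key in bucket.list():
        queue = queue_creator(sqs_conn, key.name)
        for body in json.loads(key.get_contents_as_string()):
            queue.write(queue.new_message(body))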
def __init__(self, job_name, local_id=0):
    InterruptableThread.__init__(self)
    self.id = 'unknown'
    try:
        # Wait up to 30 seconds for a response on all sockets.
        socket.setdefaulttimeout(30)
        config_fetcher = ConfigFetcher(job_name)
        config_file = config_fetcher.get_config_file()
        self.global_config = config_file.global_config
        self.id = create_fetcher_id(self.global_config.environment, local_id)
        self.crawl_job = CrawlJob(job_name, self.global_config)
        self.logger = SimpleLogger(self.crawl_job.logs_table, self.id)
        self.local_logger = LocalLogger('fetcher', local_id)
        try:
            self.metrics = FetcherMetrics(self.crawl_job.name)
            self.notifier = CrawlJobNotifications(self.crawl_job.name)

            if self.global_config.reference_job:
                if self.global_config.reference_job_version < self.global_config.version:
                    from atrax.prior_versions import crawl_job_versions
                    self.reference_job = crawl_job_versions[
                        self.global_config.reference_job_version](
                            self.global_config.reference_job)
                else:
                    self.reference_job = CrawlJob(self.global_config.reference_job)
            else:
                self.reference_job = None

            self.url_extractor = UrlExtractor(self.logger)
            self.user_agent = config_file.user_agents['Standard'] % self.id
            non_compliant_ua = config_file.user_agents.get('NonCompliant', None)
            self.non_compliant_user_agent = \
                (non_compliant_ua % self.id) if non_compliant_ua else None

            self.select_original_query = \
                "select itemName() from `{0}` where `{1}`='%s' and `{2}`='{3}' limit 1".format(
                    self.crawl_job.crawled_urls.name, FINGERPRINT_ATTR_NAME,
                    ORIGINAL_ATTR_NAME, ORIGINAL_ATTR_VALUE_SELF)

            scope_file = config_fetcher.get_scope_file()
            self.scope = CrawlerScope(scope_file)
            self.url_transformer = UrlTransformer(config_file)
            self.robots_txt_cache = {}
            self.downloader = self.initialize_downloader()

            # Initialize the frontier client.
            frontier_instance = self.crawl_job.instance_accessor.get_frontier_instance()
            self.frontier = get_frontier_client(frontier_instance,
                                                self.new_queue_assigned)

            # A local list of URLs that this fetcher has seen since it started.
            self.local_seen_urls = TrieSet()

            # Don't crawl pages that are at least 90% similar to the previous
            # five crawled pages in the URL's lineage.
            self.dust_info_factory = DustInfoFactory(.9, .1, 5)

            redis_instance = self.crawl_job.instance_accessor.get_redis_instance()
            self.redundant_url_detector = RedundantUrlDetector(
                self.crawl_job.glossary.redundant_urls_key,
                host=redis_instance.private_ip_address,
                port=DEFAULT_REDIS_PORT)
        except Exception, ex:
            self.logger.log(LogType.InternalError, "Unexpectedly stopped",
                            None, ex, sys.exc_info())
            raise
    except Exception, ex:
        sys.stderr.write("Failed to initialize fetcher %s:\n%s\nStack Trace: %s\n" %
                         (self.id, ex, sys.exc_info()))
        raise
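# The robots_txt_cache dict above suggests per-host caching of parsed
# robots.txt rules. A minimal sketch using Python 2's stdlib robotparser;
# the cache keying and fetch policy are illustrative assumptions (the real
# fetcher downloads robots.txt through its own downloader):

import robotparser
from urlparse import urlparse

def is_allowed_sketch(cache, user_agent, url):
    host = urlparse(url).netloc
    parser = cache.get(host)
    if parser is None:
        parser = robotparser.RobotFileParser()
        parser.set_url('http://%s/robots.txt' % host)
        parser.read()  # fetches and parses the file
        cache[host] = parser
    return parser.can_fetch(user_agent, url)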
from atrax.common.crawl_job import CrawlJob
from python_common.collections.trie_set import TrieSet

crawl_job = CrawlJob('sel11122014')
trie_set = TrieSet()

total_items = 0
for domain in [crawl_job.crawled_urls, crawl_job.skipped_urls,
               crawl_job.failed_urls, crawl_job.redirected_urls]:
    next_token = None
    query = "select itemName() from `%s`" % domain.name
    while True:
        items = domain.select(query, next_token=next_token)
        num_items = 0
        for item in items:
            trie_set.add(item.name)
            num_items += 1
            total_items += 1
        if items.next_token is None or num_items == 0:
            break
        next_token = items.next_token  # advance to the next page

print total_items
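# Note: boto's SimpleDB select returns a ResultSet that transparently follows
# next_token as it is iterated, so when manual checkpointing of the token is
# not needed, the paging loop above can collapse to a single iteration:

def count_items(domain, trie_set):
    # Relies on boto's ResultSet auto-pagination.
    total = 0
    for item in domain.select("select itemName() from `%s`" % domain.name):
        trie_set.add(item.name)
        total += 1
    return total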
import sys

from atrax.common.crawl_job import CrawlJob
from atrax.management.config_fetcher import ConfigFetcher
from atrax.frontier.remote_frontier_client import get_frontier_client


def new_queue_assigned(queue_name):
    print "Queue assigned: " + queue_name


job_name = sys.argv[1]
config_fetcher = ConfigFetcher(job_name)
config_file = config_fetcher.get_config_file()
global_config = config_file.global_config
crawl_job = CrawlJob(job_name, global_config)

frontier_instance = crawl_job.instance_accessor.get_frontier_instance()
frontier = get_frontier_client(frontier_instance, new_queue_assigned)

# msg_id, url_info = frontier.dequeue('i-5ab7b450:0')
msg_id, url_info = frontier.dequeue('0000:0')
if url_info:
    print "Dequeued: " + url_info.id
else:
    print "Dequeue failed"