def destroy(self):
    """
    This terminates all instances and deletes all crawl data and instance storage.
    Deleting the configuration can only happen manually or through Atrax Keeper.
    """
    if self.state.get() != CrawlJobState.STOPPED:
        self.notifications.stopping_crawl_job()

    s3 = AwsConnections.s3()
    crawl_job_glossary = CrawlJobGlossary(self.name)
    aws.s3.delete_non_empty_bucket(crawl_job_glossary.crawled_content_bucket_name)

    # Don't call self.stop() because we don't want the frontier controller to attempt to persist the frontier.
    self.pause()  # This terminates the fetchers and stops the frontier.

    sdb = AwsConnections.sdb()
    for table_name in crawl_job_glossary.table_names:
        if sdb.lookup(table_name):
            sdb.delete_domain(table_name)

    # TODO: implement
    # self.frontier_controller.destroy()

    self.notifications.delete_all_topics()

    crawl_job_state_table = AwsConnections.sdb().get_domain(CRAWL_JOB_STATE_DOMAIN_NAME)
    crawl_job_state_table.delete_attributes(self.name)
def setUpClass(cls):
    cls._bucket = get_or_create_bucket(AwsConnections.s3(), 'crawled-content-test-bucket')
    sdb = AwsConnections.sdb()
    cls._crawled_urls = sdb.lookup(CrawlJobGlossary('sel11122014').crawled_urls_table_name)
    cls._target = CrawledContent(cls._bucket)
    cls._content = "yada yada yada"
def __init__(self, crawl_job_name, emit_interval):
    """
    :param crawl_job_name: The name of the crawl job to emit metrics for.
    :param emit_interval: How often to emit the metrics, in minutes.
    """
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self.emit_interval = emit_interval * SECONDS_PER_MINUTE
    self.namespace = 'atrax/' + self.crawl_job.name
    self.cw = AwsConnections.cloudwatch()
    self.sqs = AwsConnections.sqs()
def __init__(self, crawl_job_name):
    self.crawl_job_name = crawl_job_name
    sdb = AwsConnections.sdb()
    self.crawl_job_state_table = sdb.lookup(CRAWL_JOB_STATE_DOMAIN_NAME)
    if not self.crawl_job_state_table:
        self.crawl_job_state_table = aws.sdb.create_domain(sdb, CRAWL_JOB_STATE_DOMAIN_NAME)
def create_sqs_queue(queue_name):
    queue = AwsConnections.sqs().create_queue(queue_name, 10 * 60)  # Visibility timeout: 10 minutes.
    queue.set_attribute('MessageRetentionPeriod', 1209600)  # 14 days
    # Defaults:
    # queue.set_attribute('DelaySeconds', 0)                   # Don't delay
    # queue.set_attribute('MaximumMessageSize', 262144)        # 256 KB
    # queue.set_attribute('ReceiveMessageWaitTimeSeconds', 0)  # Don't wait
    return queue
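# A small usage sketch, not from the original source: the queue name below is hypothetical,
# and the get_attributes call assumes boto 2's SQS Queue API.
frontier_queue = create_sqs_queue('example-crawl-job_example-com')
print frontier_queue.get_attributes('MessageRetentionPeriod')  # expected to report 1209600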
def terminate_fetcher_instances(self):
    requests = self.get_all_spot_requests()
    request_ids = [request.id for request in requests]
    terminate_spot_instances_by_request(AwsConnections.ec2(), request_ids)

    for fetcher_instance in AwsInstanceAccessor(self.crawl_job_name).get_fetcher_instances():
        fetcher_instance.terminate()
def __init__(self, name, global_config=None):
    self.name = name
    self.config = global_config
    self.glossary = CrawlJobGlossary(self.name)
    self._sdb = AwsConnections.sdb()
    self._s3 = AwsConnections.s3()
    self._logs_table = None
    self._crawled_urls = None
    self._failed_urls = None
    self._skipped_urls = None
    self._redirected_urls = None
    self._persisted_frontier_bucket = None
    self._crawled_content_bucket = None
    self._crawled_content = None
    self._seen_urls = None
    self._instance_accessor = None
def persist(self):
    if self.ec2_instance is not None and self.ec2_instance.state in [InstanceState.RUNNING, InstanceState.PENDING]:
        raise PreconditionNotMet('Frontier is still in the %s state' % self.ec2_instance.state)

    crawl_job = CrawlJob(self.crawl_job_name)
    # Persist all queues whose names start with crawl_job.name.
    persist_to_s3(AwsConnections.sqs(), crawl_job.name, crawl_job.persisted_frontier_bucket)
def __init__(self, address, crawl_job_name=None, on_new_queue_assigned=None):
    FrontierInterface.__init__(self)
    self._on_new_queue_assigned = on_new_queue_assigned
    self._zmq_context = zmq.Context.instance()
    self._client = self._zmq_context.socket(zmq.REQ)
    self._client.RCVTIMEO = 1000 * 60  # Wait up to a minute for responses to come back.
    self._client.connect('tcp://' + address)
    self._sqs = AwsConnections.sqs()
    self._queue_history = {}
    self._queue_names = QueueKeyDict()
    self._messages = {}
def start_on_demand_instances(self, count, availability_zone):
    modules = [ModuleNames.FETCHER]
    security_groups = [FETCHER_SECURITY_GROUP_NAME]
    fetcher_instances = []
    for i in xrange(0, count):
        reservation = self.ec2.run_instances(
            image_id=get_latest_fetcher_ami(self.ec2, VirtualizationType.HVM).id,
            key_name=EC2_KEY_PAIR_NAME,
            security_groups=security_groups,
            instance_type=FETCHER_INSTANCE_TYPES[0],
            placement=availability_zone,
            monitoring_enabled=True,
            instance_initiated_shutdown_behavior='terminate',
            instance_profile_arn=STANDARD_INSTANCE_ARN,
            user_data=self.on_demand_user_data)
        fetcher_instance = reservation.instances[0]
        wait_for_state(fetcher_instance, (InstanceState.PENDING, InstanceState.RUNNING))
        self.ec2.create_tags([fetcher_instance.id],
                             {CRAWL_JOB_TAG_NAME: self.crawl_job_name,
                              PACKAGES_TAG_NAME: ' '.join(modules)})

        cloudwatch = AwsConnections.cloudwatch()
        alarm = cloudwatch.MetricAlarm(
            name=fetcher_instance.id + '-LOW_NETWORK',
            description='Terminate the instance when NetworkIn drops below 300 bytes in 5 minutes.',
            namespace='AWS/EC2',
            dimensions={'InstanceId': [fetcher_instance.id]},
            metric='NetworkIn',
            statistic='Sum',
            comparison='<',
            threshold=300,
            period=300,
            evaluation_periods=1)
        alarm.add_alarm_action('arn:aws:automate:' + AwsConnections.region + ':ec2:terminate')
        cloudwatch.put_metric_alarm(alarm)

        fetcher_instances.append(fetcher_instance)
    return fetcher_instances
def main():
    local_logger = LocalLogger('frontier')
    local_logger.log(LogType.Info, 'Starting')

    if not __debug__:
        os.nice(-1)

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', action='store_true')
    parser.add_argument('job', type=str)
    args = parser.parse_args()

    config_fetcher = ConfigFetcher(args.job)
    config_file = config_fetcher.get_config_file()

    logger = SimpleLogger(
        get_or_create_domain(AwsConnections.sdb(), CrawlJobGlossary(args.job).logs_table_name),
        create_frontier_id(config_file.global_config.environment))

    try:
        if config_file.global_config.environment == ComputeEnv.AWS:
            frontier = AwsFrontier(args.job, logger)
        else:
            frontier = LocalFrontier(args.job, logger)

        seeder = FrontierSeeder(config_file.global_config, frontier)
        seeder_thread = InterruptableThread(lambda t: seeder.run())
        seeder_thread.start()

        metrics_service = MetricsService(args.job, 10)
        metrics_service.start()

        frontier_service = RemoteFrontier(frontier)
        frontier_service.start()
        logger.log(LogType.Info, 'Started')

        frontier_service.join()
        if frontier_service.threw_exception:
            logger.log(LogType.InternalError, 'Unexpectedly stopped', None,
                       frontier_service.exception, frontier_service.exc_info)
    except SqsMessageRetentionException, ex:
        logger.log(LogType.InternalWarning, "Full-stopping crawl job", None, ex, sys.exc_info())
        CrawlJobController(args.job).stop()
def __init__(self, crawl_job_name, logger):
    self.logger = logger
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self._recurring_timer_interval = self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
    self.metrics = FrontierMetrics(self.crawl_job.name)

    local_fetcher_id = create_fetcher_id(self._global_config.environment, 0)
    # The minimum dequeue interval that every consumer must have in order to be considered as a queue donor.
    min_dequeue_interval = (1.0 / self._global_config.max_fetch_rate) * Frontier.DEQUEUE_INTERVAL_MARGIN
    self._consumers = ConsumerCollection(local_fetcher_id, min_dequeue_interval,
                                         self.crawl_job.instance_accessor,
                                         self._recurring_timer_interval, self.logger)

    self._queues_by_name = {}
    self._unassigned_queues = deque()
    for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
        frontier_queue = FrontierQueue(queue)
        self._queues_by_name[queue.name] = frontier_queue
        if frontier_queue.count > 0:
            self._unassigned_queues.appendleft(frontier_queue)
        else:
            self._unassigned_queues.append(frontier_queue)

    self._is_scaling = True

    # This is a hack to serialize the execution of asynchronous operations into the main thread.
    self._zmq_context = zmq.Context.instance()
    self._zmq_socket = self._zmq_context.socket(zmq.REQ)
    self._zmq_socket.connect('tcp://%s:%s' % (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

    self._enqueue_count = 0
    self._dequeue_count = 0
    self._previous_emit_time = time.time()
def create_instance(self, availability_zone=None):
    modules = [ModuleNames.FRONTIER, ModuleNames.REDIS]
    security_groups = [FRONTIER_SECURITY_GROUP_NAME, REDIS_SECURITY_GROUP_NAME, FETCHER_SECURITY_GROUP_NAME]

    parts = [('cloud_config.yaml', 'cloud-config', generate_cloud_config()),
             ('stopgap_debian_setup.sh', 'x-shellscript', generate_stopgap_debian_setup())]
    for module in modules:
        script = generate_upstart_script(self.crawl_job_name, module)
        parts.append((module + '.conf', 'upstart-job', script))
    user_data = create_multipart(parts)

    ec2 = AwsConnections.ec2()
    reservation = ec2.run_instances(
        image_id=get_latest_frontier_ami(ec2).id,
        key_name=EC2_KEY_PAIR_NAME,
        security_groups=security_groups,
        instance_type=FRONTIER_INSTANCE_TYPE,
        placement=availability_zone,
        monitoring_enabled=True,
        instance_initiated_shutdown_behavior='stop',
        instance_profile_arn=STANDARD_INSTANCE_ARN,
        user_data=user_data,
        disable_api_termination=True)  # This instance can only be terminated manually.

    frontier_instance = reservation.instances[0]
    wait_for_state(frontier_instance, (InstanceState.PENDING, InstanceState.RUNNING))
    ec2.create_tags([frontier_instance.id],
                    {CRAWL_JOB_TAG_NAME: self.crawl_job_name,
                     PACKAGES_TAG_NAME: ' '.join(modules)})
    return frontier_instance
def populate_seen_urls(job_name, environment):
    crawl_job = CrawlJobGlossary(job_name)
    seen_urls_instance = InstanceAccessorBase(job_name, environment).get_redis_instance()
    seen_urls = SeenUrls(crawl_job.seen_urls_key,
                         host=seen_urls_instance.private_ip_address,
                         port=DEFAULT_REDIS_PORT)

    sdb = AwsConnections.sdb()
    for table_name in [crawl_job.failed_urls_table_name,
                       crawl_job.skipped_urls_table_name,
                       crawl_job.crawled_urls_table_name]:
        table = sdb.lookup(table_name)
        items = table.select("select itemName() from `%s`" % table_name)
        for item in items:
            host = urlsplit(item.name)[1]
            last_colon = host.rfind(':')
            domain = host if last_colon == -1 else host[0:last_colon]
            seen_urls.add(item.name, domain)
def setUp(self):
    AwsConnections.sqs()  # Warm up the SQS connection before each test.
def __init__(self, crawl_job_name):
    self.crawl_job_name = crawl_job_name
    self.namespace = 'atrax/' + self.crawl_job_name
    self.cw = AwsConnections.cloudwatch()
def __init__(self, crawl_job_name):
    self.crawl_job_name = crawl_job_name
    self.ec2 = AwsConnections.ec2()
    self._on_demand_user_data = None
    self._spot_user_data = None
from aws import USWest2 as AwsConnections

domain_name = 'crawled-urls.siemens17042013'
domain = AwsConnections.sdb().lookup(domain_name)

query = "select * from `{0}` where `redirectsTo` is null".format(domain_name)
items = domain.select(query)

count = 0
for item in items:
    # print item.name + '\n'
    count += 1
    next_token = items.next_token
    if next_token is not None:
        print next_token
        break

print '\n' + str(count)
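# A hedged sketch of resuming the same query from a saved next_token; not part of the
# original script. It assumes boto 2's Domain.select(query, next_token=...) parameter;
# 'saved_token' and 'remaining' are names introduced here for illustration.
saved_token = next_token
remaining = domain.select(query, next_token=saved_token)
for item in remaining:
    count += 1
print 'total including resumed pages: ' + str(count)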
from atrax.management.aws_env.ami import *
from aws import USWest2 as AwsConnections

image = get_latest_frontier_ami(AwsConnections.ec2())
def __init__(self, crawl_job_name):
    self.crawl_job_name = crawl_job_name
    s3 = AwsConnections.s3()
    self.bucket = s3.lookup(CONFIG_BUCKET_NAME)
def delete_queues(self):
    queues = AwsConnections.sqs().get_all_queues(self.crawl_job_name)
    for queue in queues:
        queue.delete()
def restore(self):
    crawl_job = CrawlJob(self.crawl_job_name)
    restore_from_s3(AwsConnections.sqs(), crawl_job.persisted_frontier_bucket,
                    queue_creator=FrontierQueue.create_sqs_queue)
def __init__(self, crawl_job_name):
    super(AwsInstanceAccessor, self).__init__(crawl_job_name, ComputeEnv.AWS)
    self.ec2 = AwsConnections.ec2()
import pickle
from base64 import b64encode, b64decode

from boto.sqs.message import RawMessage as SqsMessage
from aws import USWest2 as AwsConnections

encoded = url.encode('utf-8')
if not isinstance(encoded, unicode):
    decoded = encoded.decode('utf-8')


def pack_message(url_info):
    return b64encode(pickle.dumps(url_info))


def unpack_message(m):
    return pickle.loads(b64decode(m))


queue = AwsConnections.sqs().lookup('test_queue')
queue.set_message_class(SqsMessage)


def dequeue():
    while True:
        received_msg = queue.read()
        if not received_msg:
            break
        received_body = received_msg.get_body()
        received_url_info = unpack_message(received_body)
        print received_url_info.raw_url
        received_msg.delete()
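# A minimal enqueue counterpart, not part of the original snippet: it assumes boto 2's
# RawMessage/Queue.write API and a hypothetical 'url_info' object like the one dequeued above.
def enqueue(url_info):
    msg = SqsMessage()
    msg.set_body(pack_message(url_info))  # Base64-encoded pickle, matching unpack_message() above.
    queue.write(msg)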