def create_sqs_queue(queue_name):
    queue = AwsConnections.sqs().create_queue(queue_name, 10 * 60)  # 10-minute visibility timeout
    queue.set_attribute('MessageRetentionPeriod', 1209600)  # 14 days
    # Defaults:
    # queue.set_attribute('DelaySeconds', 0)                   # Don't delay
    # queue.set_attribute('MaximumMessageSize', 262144)        # 256 KB
    # queue.set_attribute('ReceiveMessageWaitTimeSeconds', 0)  # Don't wait
    return queue
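
# Illustrative usage sketch (not from the original code): how create_sqs_queue() might be
# called with boto. The queue name and message body below are made-up examples.
from boto.sqs.message import RawMessage

queue = create_sqs_queue('my_crawl_job-example-com')  # 10-minute visibility timeout, 14-day retention
queue.set_message_class(RawMessage)
msg = RawMessage()
msg.set_body('{"url": "http://example.com/"}')
queue.write(msg)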
def persist(self):
    if self.ec2_instance is not None and \
            self.ec2_instance.state in [InstanceState.RUNNING, InstanceState.PENDING]:
        raise PreconditionNotMet('Frontier is still in the %s state' % self.ec2_instance.state)

    crawl_job = CrawlJob(self.crawl_job_name)
    # Persist all queues with names that start with crawl_job.name.
    persist_to_s3(AwsConnections.sqs(), crawl_job.name, crawl_job.persisted_frontier_bucket)
def __init__(self, crawl_job_name, emit_interval):
    """
    :param crawl_job_name: Name of the crawl job to emit metrics for.
    :param emit_interval: How often to emit the metrics, in minutes.
    """
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self.emit_interval = emit_interval * SECONDS_PER_MINUTE
    self.namespace = 'atrax/' + self.crawl_job.name
    self.cw = AwsConnections.cloudwatch()
    self.sqs = AwsConnections.sqs()
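
# Illustrative sketch (not from the original code): publishing one data point to CloudWatch
# with boto, the way an emitter like the one above could report under its 'atrax/<job>'
# namespace. The metric name 'queued_urls' and the sample value are assumptions.
cw = AwsConnections.cloudwatch()
cw.put_metric_data(namespace='atrax/my_crawl_job', name='queued_urls',
                   value=1234, unit='Count')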
def __init__(self, address, crawl_job_name=None, on_new_queue_assigned=None):
    FrontierInterface.__init__(self)
    self._on_new_queue_assigned = on_new_queue_assigned
    self._zmq_context = zmq.Context.instance()
    self._client = self._zmq_context.socket(zmq.REQ)
    self._client.RCVTIMEO = 1000 * 60  # Wait up to a minute for responses to come back.
    self._client.connect('tcp://' + address)
    self._sqs = AwsConnections.sqs()
    self._queue_history = {}
    self._queue_names = QueueKeyDict()
    self._messages = {}
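
# Illustrative sketch (not from the original code): the effect of RCVTIMEO on a zmq.REQ
# socket like self._client above. If no reply arrives within a minute, recv() raises
# zmq.Again instead of blocking forever. The endpoint and payload here are made up.
import zmq

ctx = zmq.Context.instance()
client = ctx.socket(zmq.REQ)
client.RCVTIMEO = 1000 * 60
client.connect('tcp://127.0.0.1:5555')
client.send(b'ping')
try:
    reply = client.recv()
except zmq.Again:
    reply = None  # timed out waiting for a response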
def __init__(self, crawl_job_name, logger):
    self.logger = logger
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self._recurring_timer_interval = self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
    self.metrics = FrontierMetrics(self.crawl_job.name)

    local_fetcher_id = create_fetcher_id(self._global_config.environment, 0)
    # The minimum dequeue interval that every consumer must have in order to be considered as a queue donor.
    min_dequeue_interval = (1.0 / self._global_config.max_fetch_rate) * Frontier.DEQUEUE_INTERVAL_MARGIN
    self._consumers = ConsumerCollection(local_fetcher_id, min_dequeue_interval,
                                         self.crawl_job.instance_accessor,
                                         self._recurring_timer_interval, self.logger)

    self._queues_by_name = {}
    self._unassigned_queues = deque()
    for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
        frontier_queue = FrontierQueue(queue)
        self._queues_by_name[queue.name] = frontier_queue
        if frontier_queue.count > 0:
            self._unassigned_queues.appendleft(frontier_queue)
        else:
            self._unassigned_queues.append(frontier_queue)

    self._is_scaling = True

    # This is a hack to serialize the execution of asynchronous operations into the main thread.
    self._zmq_context = zmq.Context.instance()
    self._zmq_socket = self._zmq_context.socket(zmq.REQ)
    self._zmq_socket.connect('tcp://%s:%s' % (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

    self._enqueue_count = 0
    self._dequeue_count = 0
    self._previous_emit_time = time.time()
def setUp(self):
    # Touch the SQS connection so it is established before each test runs.
    AwsConnections.sqs()
def delete_queues(self):
    queues = AwsConnections.sqs().get_all_queues(self.crawl_job_name)
    for queue in queues:
        queue.delete()
def restore(self):
    crawl_job = CrawlJob(self.crawl_job_name)
    restore_from_s3(AwsConnections.sqs(), crawl_job.persisted_frontier_bucket,
                    queue_creator=FrontierQueue.create_sqs_queue)
import pickle
from base64 import b64decode, b64encode

from boto.sqs.message import RawMessage as SqsMessage
from aws import USWest2 as AwsConnections

# Example URL (made up) for a quick UTF-8 encode/decode round-trip check.
url = u'http://example.com/caf\u00e9'
encoded = url.encode('utf-8')
if not isinstance(encoded, unicode):
    decoded = encoded.decode('utf-8')


def pack_message(url_info):
    return b64encode(pickle.dumps(url_info))


def unpack_message(m):
    return pickle.loads(b64decode(m))


queue = AwsConnections.sqs().lookup('test_queue')
queue.set_message_class(SqsMessage)


def dequeue():
    while True:
        received_msg = queue.read()
        if not received_msg:
            break
        received_body = received_msg.get_body()
        received_url_info = unpack_message(received_body)
        print received_url_info.raw_url
        received_msg.delete()
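
# Hedged counterpart sketch (not from the original code): the script above only drains
# 'test_queue'; this shows how a message could be written back with the same helpers.
# Only the 'raw_url' attribute of url_info is implied by the dequeue code, so the object
# is taken as a parameter rather than constructed here.
def enqueue(url_info):
    msg = SqsMessage()
    msg.set_body(pack_message(url_info))
    queue.write(msg)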