Example 1
def create_sqs_queue(queue_name):
    queue = AwsConnections.sqs().create_queue(queue_name, 10 * 60)  # visibility timeout: 10 minutes
    queue.set_attribute('MessageRetentionPeriod', 1209600)  # 14 days
    # Defaults:
    # queue.set_attribute('DelaySeconds', 0)  # Don't delay
    # queue.set_attribute('MaximumMessageSize', 262144)  # 256 KB
    # queue.set_attribute('ReceiveMessageWaitTimeSeconds', 0)  # Don't wait (no long polling)
    return queue
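A minimal usage sketch (the queue name here is hypothetical; new_message and write are boto 2 Queue methods):

queue = create_sqs_queue('test_crawl_job-frontier')  # hypothetical name
queue.write(queue.new_message('hello'))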
Example 2
    def persist(self):
        if self.ec2_instance is not None and self.ec2_instance.state in [
                InstanceState.RUNNING, InstanceState.PENDING
        ]:
            raise PreconditionNotMet('Frontier is still in the %s state' %
                                     self.ec2_instance.state)

        crawl_job = CrawlJob(self.crawl_job_name)

        # Persist all queues with names that start with the crawl_job.name
        persist_to_s3(AwsConnections.sqs(), crawl_job.name,
                      crawl_job.persisted_frontier_bucket)
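persist_to_s3 itself is not shown here; a rough sketch of what such a helper might do, assuming an AwsConnections.s3() accessor and that persisted_frontier_bucket is a plain bucket name (both assumptions), is:

from boto.s3.key import Key

def persist_to_s3(sqs_conn, queue_prefix, bucket_name):
    bucket = AwsConnections.s3().get_bucket(bucket_name)  # assumed accessor
    for queue in sqs_conn.get_all_queues(queue_prefix):
        # Drain the queue and store the raw message bodies, one per line.
        # This assumes bodies contain no raw newlines (true for base64 payloads).
        bodies = []
        msg = queue.read()
        while msg is not None:
            bodies.append(msg.get_body())
            msg.delete()
            msg = queue.read()
        Key(bucket, queue.name).set_contents_from_string('\n'.join(bodies))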
Example 3
def __init__(self, crawl_job_name, emit_interval):
    """
    :param crawl_job_name: Name of the crawl job whose metrics are emitted.
    :param emit_interval: How often to emit the metrics, in minutes.
    """
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self.emit_interval = emit_interval * SECONDS_PER_MINUTE
    self.namespace = 'atrax/' + self.crawl_job.name
    self.cw = AwsConnections.cloudwatch()
    self.sqs = AwsConnections.sqs()
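A sketch of the emit loop such a class might run (the method names and metric name are assumptions; put_metric_data is the boto 2 CloudWatch call):

import time

def emit_queue_depths(self):
    # Report the approximate depth of every queue belonging to this crawl job.
    for queue in self.sqs.get_all_queues(self.crawl_job.name):
        self.cw.put_metric_data(self.namespace, 'QueueDepth',
                                value=queue.count(), unit='Count',
                                dimensions={'queue': queue.name})

def run(self):
    while True:
        self.emit_queue_depths()
        time.sleep(self.emit_interval)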
Example 4
def __init__(self,
             address,
             crawl_job_name=None,
             on_new_queue_assigned=None):
    FrontierInterface.__init__(self)
    self._on_new_queue_assigned = on_new_queue_assigned
    self._zmq_context = zmq.Context.instance()
    self._client = self._zmq_context.socket(zmq.REQ)
    self._client.RCVTIMEO = 1000 * 60  # wait up to a minute for responses to come back
    self._client.connect('tcp://' + address)
    self._sqs = AwsConnections.sqs()
    self._queue_history = {}
    self._queue_names = QueueKeyDict()
    self._messages = {}
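A request over this REQ socket might look like the following sketch (the pickled (command, args) wire format and method name are assumptions; zmq.Again is what pyzmq raises when RCVTIMEO expires):

import pickle
import zmq

def _request(self, command, *args):
    self._client.send(pickle.dumps((command, args)))
    try:
        return pickle.loads(self._client.recv())
    except zmq.Again:
        # After a missed reply a REQ socket cannot be reused; the caller
        # must close it and reconnect before sending again.
        raise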
Example 5
    def __init__(self, crawl_job_name, logger):
        self.logger = logger
        config_fetcher = ConfigFetcher(crawl_job_name)
        config_file = config_fetcher.get_config_file()
        self._global_config = config_file.global_config
        self.crawl_job = CrawlJob(crawl_job_name, self._global_config)

        self._recurring_timer_interval = \
            self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
        self.metrics = FrontierMetrics(self.crawl_job.name)

        local_fetcher_id = create_fetcher_id(self._global_config.environment, 0)

        # The minimum dequeue interval that every consumer must have in order
        # to be considered as a queue donor.
        min_dequeue_interval = \
            (1.0 / self._global_config.max_fetch_rate) * Frontier.DEQUEUE_INTERVAL_MARGIN
        self._consumers = ConsumerCollection(local_fetcher_id,
                                             min_dequeue_interval,
                                             self.crawl_job.instance_accessor,
                                             self._recurring_timer_interval,
                                             self.logger)

        self._queues_by_name = {}
        self._unassigned_queues = deque()
        for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
            frontier_queue = FrontierQueue(queue)
            self._queues_by_name[queue.name] = frontier_queue
            if frontier_queue.count > 0:
                self._unassigned_queues.appendleft(frontier_queue)
            else:
                self._unassigned_queues.append(frontier_queue)

        self._is_scaling = True

        # This is a hack to serialize the execution of asynchronous operations into the main thread.
        self._zmq_context = zmq.Context.instance()
        self._zmq_socket = self._zmq_context.socket(zmq.REQ)
        self._zmq_socket.connect('tcp://%s:%s' %
                                 (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

        self._enqueue_count = 0
        self._dequeue_count = 0
        self._previous_emit_time = time.time()
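Non-empty queues are pushed to the front of the deque so they are handed out first; a sketch of how assignment might then proceed (the method and the consumer's assign call are assumptions):

def _assign_next_queue(self, consumer):
    if not self._unassigned_queues:
        return None
    frontier_queue = self._unassigned_queues.popleft()
    consumer.assign(frontier_queue)  # assumed consumer API
    return frontier_queue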
Example 6
def setUp(self):
    AwsConnections.sqs()
Example 7
def delete_queues(self):
    queues = AwsConnections.sqs().get_all_queues(self.crawl_job_name)
    for queue in queues:
        queue.delete()
Example 8
def restore(self):
    crawl_job = CrawlJob(self.crawl_job_name)
    restore_from_s3(AwsConnections.sqs(),
                    crawl_job.persisted_frontier_bucket,
                    queue_creator=FrontierQueue.create_sqs_queue)
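restore_from_s3 is the counterpart of the persist helper sketched under Example 2; a sketch under the same assumptions (one S3 key per queue, newline-separated bodies) is:

def restore_from_s3(sqs_conn, bucket_name, queue_creator):
    # sqs_conn is unused here because queue_creator builds the queues itself.
    bucket = AwsConnections.s3().get_bucket(bucket_name)  # assumed accessor
    for key in bucket.list():
        queue = queue_creator(key.name)
        for body in key.get_contents_as_string().splitlines():
            queue.write(queue.new_message(body))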
Example 9
import pickle
from base64 import b64encode, b64decode

from boto.sqs.message import RawMessage as SqsMessage
from aws import USWest2 as AwsConnections


def check_utf8_round_trip(url):
    # Wrapper added so this fragment is self-contained (the name is hypothetical).
    # In Python 2, str.encode('utf-8') yields a byte string, never unicode,
    # so the decode branch always runs.
    encoded = url.encode('utf-8')
    if not isinstance(encoded, unicode):
        decoded = encoded.decode('utf-8')
        return decoded
    return encoded


def pack_message(url_info):
    return b64encode(pickle.dumps(url_info))


def unpack_message(m):
    return pickle.loads(b64decode(m))


queue = AwsConnections.sqs().lookup('test_queue')
queue.set_message_class(SqsMessage)


def dequeue():
    # Drain the queue: read, unpack, print and delete each message until empty.
    while True:
        received_msg = queue.read()
        if not received_msg:
            break
        received_body = received_msg.get_body()
        received_url_info = unpack_message(received_body)

        print received_url_info.raw_url
        received_msg.delete()
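An enqueue counterpart to dequeue, mirroring the packing above (sketch):

def enqueue(url_info):
    msg = SqsMessage()
    msg.set_body(pack_message(url_info))  # pack_message already base64-encodes
    queue.write(msg)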