Example #1
    def destroy(self):
        """
        Terminates all instances and deletes all crawl data and instance storage.
        Deleting the configuration can only happen manually or through Atrax Keeper.
        """
        if self.state.get() != CrawlJobState.STOPPED:
            self.notifications.stopping_crawl_job()

        s3 = AwsConnections.s3()
        crawl_job_glossary = CrawlJobGlossary(self.name)
        aws.s3.delete_non_empty_bucket(crawl_job_glossary.crawled_content_bucket_name)

        # Don't call self.stop() because we don't want the frontier controller to attempt to persist the frontier.
        self.pause()  # this terminates the fetchers and stops the frontier

        sdb = AwsConnections.sdb()
        for table_name in crawl_job_glossary.table_names:
            if sdb.lookup(table_name):
                sdb.delete_domain(table_name)

        # Todo: implement
        # self.frontier_controller.destroy()

        self.notifications.delete_all_topics()
        crawl_job_state_table = AwsConnections.sdb().get_domain(CRAWL_JOB_STATE_DOMAIN_NAME)
        crawl_job_state_table.delete_attributes(self.name)
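
A minimal usage sketch, assuming destroy() belongs to the CrawlJobController class that Example #11 constructs from a crawl job name (the job name below is illustrative):

controller = CrawlJobController('sel11122014')  # assumed class; see Example #11
controller.destroy()  # terminates fetchers, drops the SimpleDB tables and deletes the S3 content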
Example #2
 @classmethod
 def setUpClass(cls):
     cls._bucket = get_or_create_bucket(AwsConnections.s3(),
                                        'crawled-content-test-bucket')
     sdb = AwsConnections.sdb()
     cls._crawled_urls = sdb.lookup(
         CrawlJobGlossary('sel11122014').crawled_urls_table_name)
     cls._target = CrawledContent(cls._bucket)
     cls._content = "yada yada yada"
Example #3
 def __init__(self, crawl_job_name, emit_interval):
     """
     :param crawl_job_name: Name of the crawl job to emit metrics for.
     :param emit_interval: How often to emit the metrics, in minutes.
     """
     config_fetcher = ConfigFetcher(crawl_job_name)
     config_file = config_fetcher.get_config_file()
     self._global_config = config_file.global_config
     self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
     self.emit_interval = emit_interval * SECONDS_PER_MINUTE
     self.namespace = 'atrax/' + self.crawl_job.name
     self.cw = AwsConnections.cloudwatch()
     self.sqs = AwsConnections.sqs()
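
A short construction sketch; the interval is passed in minutes and converted to seconds internally. The class name MetricsService is an assumption taken from Example #11, which creates one with an interval of 10:

service = MetricsService('sel11122014', 10)  # assumed class name; emit every 10 minutes
# internally: emit_interval = 10 * SECONDS_PER_MINUTE = 600 seconds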
Example #4
 def __init__(self, crawl_job_name):
     self.crawl_job_name = crawl_job_name
     sdb = AwsConnections.sdb()
     self.crawl_job_state_table = sdb.lookup(CRAWL_JOB_STATE_DOMAIN_NAME)
     if not self.crawl_job_state_table:
         self.crawl_job_state_table = aws.sdb.create_domain(
             sdb, CRAWL_JOB_STATE_DOMAIN_NAME)
Example #5
 def create_sqs_queue(queue_name):
     queue = AwsConnections.sqs().create_queue(queue_name, 10 * 60)  # 10 minutes
     queue.set_attribute('MessageRetentionPeriod', 1209600)  # 14 days
     # Defaults:
     # queue.set_attribute('DelaySeconds', 0)  # Don't delay
     # queue.set_attribute('MaximumMessageSize', 256144)  # 256 KB
     # queue.set_attribute('ReceiveMessageWaitTimeSeconds', 0)  # Don't wait
     return queue
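
A quick usage sketch (the queue name is illustrative); get_attributes is a standard boto Queue method:

queue = create_sqs_queue('sel11122014_test')  # hypothetical queue name
# the visibility timeout was set at creation time: 10 * 60 = 600 seconds
print queue.get_attributes('MessageRetentionPeriod')  # expected to report 1209600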
Example #6
    def terminate_fetcher_instances(self):
        requests = self.get_all_spot_requests()
        request_ids = [request.id for request in requests]
        terminate_spot_instances_by_request(AwsConnections.ec2(), request_ids)

        for fetcher_instance in AwsInstanceAccessor(
                self.crawl_job_name).get_fetcher_instances():
            fetcher_instance.terminate()
Example #7
    def __init__(self, name, global_config=None):
        self.name = name
        self.config = global_config
        self.glossary = CrawlJobGlossary(self.name)

        self._sdb = AwsConnections.sdb()
        self._s3 = AwsConnections.s3()

        self._logs_table = None
        self._crawled_urls = None
        self._failed_urls = None
        self._skipped_urls = None
        self._redirected_urls = None

        self._persisted_frontier_bucket = None
        self._crawled_content_bucket = None
        self._crawled_content = None

        self._seen_urls = None
        self._instance_accessor = None
Example #8
    def persist(self):
        if self.ec2_instance is not None and self.ec2_instance.state in [
                InstanceState.RUNNING, InstanceState.PENDING
        ]:
            raise PreconditionNotMet('Frontier is still in the %s state' %
                                     self.ec2_instance.state)

        crawl_job = CrawlJob(self.crawl_job_name)

        # Persist all queues with names that start with the crawl_job.name
        persist_to_s3(AwsConnections.sqs(), crawl_job.name,
                      crawl_job.persisted_frontier_bucket)
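
persist_to_s3 itself is not shown in these examples; a rough sketch of the behavior the comment implies (drain every queue whose name starts with the job name into the bucket), using only standard boto calls:

def persist_to_s3_sketch(sqs, queue_name_prefix, bucket):
    # Illustrative only: the real persist_to_s3 is defined elsewhere in the codebase.
    for queue in sqs.get_all_queues(queue_name_prefix):
        bodies = []
        while True:
            msg = queue.read()
            if msg is None:
                break
            bodies.append(msg.get_body())
            msg.delete()
        key = bucket.new_key(queue.name)
        key.set_contents_from_string('\n'.join(bodies))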
Example #9
 def __init__(self,
              address,
              crawl_job_name=None,
              on_new_queue_assigned=None):
     FrontierInterface.__init__(self)
     self._on_new_queue_assigned = on_new_queue_assigned
     self._zmq_context = zmq.Context.instance()
     self._client = self._zmq_context.socket(zmq.REQ)
     self._client.RCVTIMEO = 1000 * 60  # wait up to a minute for responses to come back
     self._client.connect('tcp://' + address)
     self._sqs = AwsConnections.sqs()
     self._queue_history = {}
     self._queue_names = QueueKeyDict()
     self._messages = {}
Example #10
    def start_on_demand_instances(self, count, availability_zone):
        modules = [ModuleNames.FETCHER]
        security_groups = [FETCHER_SECURITY_GROUP_NAME]

        fetcher_instances = []

        for i in xrange(0, count):
            reservation = self.ec2.run_instances(
                image_id=get_latest_fetcher_ami(self.ec2,
                                                VirtualizationType.HVM).id,
                key_name=EC2_KEY_PAIR_NAME,
                security_groups=security_groups,
                instance_type=FETCHER_INSTANCE_TYPES[0],
                placement=availability_zone,
                monitoring_enabled=True,
                instance_initiated_shutdown_behavior='terminate',
                instance_profile_arn=STANDARD_INSTANCE_ARN,
                user_data=self.on_demand_user_data)

            fetcher_instance = reservation.instances[0]
            wait_for_state(fetcher_instance,
                           (InstanceState.PENDING, InstanceState.RUNNING))
            self.ec2.create_tags(
                [fetcher_instance.id], {
                    CRAWL_JOB_TAG_NAME: self.crawl_job_name,
                    PACKAGES_TAG_NAME: ' '.join(modules)
                })

            cloudwatch = AwsConnections.cloudwatch()
            alarm = cloudwatch.MetricAlarm(
                name=fetcher_instance.id + '-LOW_NETWORK',
                description="Terminate the instance when NetworkIn drops below 300 bytes in 5 minutes.",
                namespace='AWS/EC2',
                dimensions={'InstanceId': [fetcher_instance.id]},
                metric='NetworkIn',
                statistic='Sum',
                comparison='<',
                threshold=300,
                period=300,
                evaluation_periods=1)
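            # With statistic='Sum', period=300, comparison='<' and threshold=300, the alarm
            # fires when NetworkIn sums to under 300 bytes over 5 minutes, matching the
            # description string above.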
            alarm.add_alarm_action('arn:aws:automate:' +
                                   AwsConnections.region + ':ec2:terminate')
            cloudwatch.put_metric_alarm(alarm)
            fetcher_instances.append(fetcher_instance)

        return fetcher_instances
Example #11
def main():
    local_logger = LocalLogger('frontier')
    local_logger.log(LogType.Info, 'Starting')

    if not __debug__:
        os.nice(-1)

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', action='store_true')
    parser.add_argument('job', type=str)
    args = parser.parse_args()

    config_fetcher = ConfigFetcher(args.job)
    config_file = config_fetcher.get_config_file()

    logger = SimpleLogger(
        get_or_create_domain(AwsConnections.sdb(),
                             CrawlJobGlossary(args.job).logs_table_name),
        create_frontier_id(config_file.global_config.environment))
    try:
        if config_file.global_config.environment == ComputeEnv.AWS:
            frontier = AwsFrontier(args.job, logger)
        else:
            frontier = LocalFrontier(args.job, logger)

        seeder = FrontierSeeder(config_file.global_config, frontier)
        seeder_thread = InterruptableThread(lambda t: seeder.run())
        seeder_thread.start()

        metrics_service = MetricsService(args.job, 10)
        metrics_service.start()

        frontier_service = RemoteFrontier(frontier)
        frontier_service.start()
        logger.log(LogType.Info, 'Started')
        frontier_service.join()
        if frontier_service.threw_exception:
            logger.log(LogType.InternalError, 'Unexpectedly stopped', None,
                       frontier_service.exception, frontier_service.exc_info)
    except SqsMessageRetentionException, ex:
        logger.log(LogType.InternalWarning, "Full-stopping crawl job", None,
                   ex, sys.exc_info())
        CrawlJobController(args.job).stop()
Example #12
    def __init__(self, crawl_job_name, logger):
        self.logger = logger
        config_fetcher = ConfigFetcher(crawl_job_name)
        config_file = config_fetcher.get_config_file()
        self._global_config = config_file.global_config
        self.crawl_job = CrawlJob(crawl_job_name, self._global_config)

        self._recurring_timer_interval = self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
        self.metrics = FrontierMetrics(self.crawl_job.name)

        local_fetcher_id = create_fetcher_id(self._global_config.environment,
                                             0)

        # The minimum dequeue interval every consumer must have in order to be considered a queue donor.
        min_dequeue_interval = (1.0 / self._global_config.max_fetch_rate
                                ) * Frontier.DEQUEUE_INTERVAL_MARGIN
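        # For example (hypothetical values): max_fetch_rate = 2 requests/s gives a base
        # interval of 0.5 s, which is then scaled by Frontier.DEQUEUE_INTERVAL_MARGIN.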
        self._consumers = ConsumerCollection(local_fetcher_id,
                                             min_dequeue_interval,
                                             self.crawl_job.instance_accessor,
                                             self._recurring_timer_interval,
                                             self.logger)

        self._queues_by_name = {}
        self._unassigned_queues = deque()
        for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
            frontier_queue = FrontierQueue(queue)
            self._queues_by_name[queue.name] = frontier_queue
            if frontier_queue.count > 0:
                self._unassigned_queues.appendleft(frontier_queue)
            else:
                self._unassigned_queues.append(frontier_queue)

        self._is_scaling = True

        # This is a hack to serialize the execution of asynchronous operations into the main thread.
        self._zmq_context = zmq.Context.instance()
        self._zmq_socket = self._zmq_context.socket(zmq.REQ)
        self._zmq_socket.connect('tcp://%s:%s' %
                                 (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

        self._enqueue_count = 0
        self._dequeue_count = 0
        self._previous_emit_time = time.time()
Example #13
    def create_instance(self, availability_zone=None):
        modules = [ModuleNames.FRONTIER, ModuleNames.REDIS]
        security_groups = [
            FRONTIER_SECURITY_GROUP_NAME, REDIS_SECURITY_GROUP_NAME,
            FETCHER_SECURITY_GROUP_NAME
        ]
        parts = [('cloud_config.yaml', 'cloud-config',
                  generate_cloud_config()),
                 ('stopgap_debian_setup.sh', 'x-shellscript',
                  generate_stopgap_debian_setup())]

        for module in modules:
            script = generate_upstart_script(self.crawl_job_name, module)
            parts.append((module + '.conf', 'upstart-job', script))

        user_data = create_multipart(parts)

        ec2 = AwsConnections.ec2()
        reservation = ec2.run_instances(
            image_id=get_latest_frontier_ami(ec2).id,
            key_name=EC2_KEY_PAIR_NAME,
            security_groups=security_groups,
            instance_type=FRONTIER_INSTANCE_TYPE,
            placement=availability_zone,
            monitoring_enabled=True,
            instance_initiated_shutdown_behavior='stop',
            instance_profile_arn=STANDARD_INSTANCE_ARN,
            user_data=user_data,
            disable_api_termination=True
        )  # This instance can only be terminated manually

        frontier_instance = reservation.instances[0]
        wait_for_state(frontier_instance,
                       (InstanceState.PENDING, InstanceState.RUNNING))
        ec2.create_tags(
            [frontier_instance.id], {
                CRAWL_JOB_TAG_NAME: self.crawl_job_name,
                PACKAGES_TAG_NAME: ' '.join(modules)
            })
        return frontier_instance
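
Because the instance is launched with disable_api_termination=True, a later teardown has to clear that flag before terminating it; a hedged sketch using standard boto calls:

ec2 = AwsConnections.ec2()
ec2.modify_instance_attribute(frontier_instance.id, 'disableApiTermination', False)
ec2.terminate_instances([frontier_instance.id])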
Example #14
def populate_seen_urls(job_name, environment):
    crawl_job = CrawlJobGlossary(job_name)
    seen_urls_instance = InstanceAccessorBase(
        job_name, environment).get_redis_instance()
    seen_urls = SeenUrls(crawl_job.seen_urls_key,
                         host=seen_urls_instance.private_ip_address,
                         port=DEFAULT_REDIS_PORT)

    sdb = AwsConnections.sdb()

    for table_name in [
            crawl_job.failed_urls_table_name,
            crawl_job.skipped_urls_table_name,
            crawl_job.crawled_urls_table_name
    ]:
        table = sdb.lookup(table_name)
        items = table.select("select itemName() from `%s`" % table_name)

        for item in items:
            host = urlsplit(item.name)[1]
            last_colon = host.rfind(':')
            domain = host if last_colon == -1 else host[0:last_colon]
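            # e.g. a host of 'example.com:8080' yields the domain 'example.com'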

            seen_urls.add(item.name, domain)
Example #15
 def setUp(self):
     AwsConnections.sqs()
Example #16
 def __init__(self, crawl_job_name):
     self.crawl_job_name = crawl_job_name
     self.namespace = 'atrax/' + self.crawl_job_name
     self.cw = AwsConnections.cloudwatch()
Example #17
 def __init__(self, crawl_job_name):
     self.crawl_job_name = crawl_job_name
     self.ec2 = AwsConnections.ec2()
     self._on_demand_user_data = None
     self._spot_user_data = None
Example #18
from aws import USWest2 as AwsConnections

domain_name = 'crawled-urls.siemens17042013'
domain = AwsConnections.sdb().lookup(domain_name)
query = "select * from `{0}` where `redirectsTo` is null".format(domain_name)
items = domain.select(query)

count = 0

for item in items:
    # print item.name + '\n'
    count += 1
    next_token = items.next_token
    if next_token is not None:
        print next_token
        break

print '\n' + str(count)
Example #19
from atrax.management.aws_env.ami import *
from aws import USWest2 as AwsConnections

image = get_latest_frontier_ami(AwsConnections.ec2())
Example #20
 def __init__(self, crawl_job_name):
     self.crawl_job_name = crawl_job_name
     s3 = AwsConnections.s3()
     self.bucket = s3.lookup(CONFIG_BUCKET_NAME)
Example #21
 def delete_queues(self):
     queues = AwsConnections.sqs().get_all_queues(self.crawl_job_name)
     for queue in queues:
         queue.delete()
Example #22
 def restore(self):
     crawl_job = CrawlJob(self.crawl_job_name)
     restore_from_s3(AwsConnections.sqs(),
                     crawl_job.persisted_frontier_bucket,
                     queue_creator=FrontierQueue.create_sqs_queue)
Example #23
 def __init__(self, crawl_job_name):
     super(AwsInstanceAccessor, self).__init__(crawl_job_name,
                                               ComputeEnv.AWS)
     self.ec2 = AwsConnections.ec2()
Example #24
import pickle
from base64 import b64encode, b64decode

from boto.sqs.message import RawMessage as SqsMessage
from aws import USWest2 as AwsConnections


def pack_message(url_info):
    # Pickle the url_info object and base64-encode it so it travels through SQS as plain text.
    return b64encode(pickle.dumps(url_info))


def unpack_message(m):
    return pickle.loads(b64decode(m))


queue = AwsConnections.sqs().lookup('test_queue')
queue.set_message_class(SqsMessage)


def dequeue():
    # Read until the queue is empty, unpacking and printing each message before deleting it.
    while True:
        received_msg = queue.read()
        if not received_msg:
            break
        received_body = received_msg.get_body()
        received_url_info = unpack_message(received_body)

        print received_url_info.raw_url
        received_msg.delete()
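
Only the dequeue side is shown above; a matching enqueue sketch using the same helpers (url_info stands in for whatever object the crawler actually packs):

def enqueue(url_info):
    msg = SqsMessage()
    msg.set_body(pack_message(url_info))  # base64-encoded pickle, as unpack_message expects
    queue.write(msg)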