Example #1
def __init__(self, crawl_job_name, emit_interval):
    """
    :param crawl_job_name: Name of the crawl job whose metrics are emitted.
    :param emit_interval: How often to emit the metrics, in minutes.
    """
    config_fetcher = ConfigFetcher(crawl_job_name)
    config_file = config_fetcher.get_config_file()
    self._global_config = config_file.global_config
    self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
    self.emit_interval = emit_interval * SECONDS_PER_MINUTE
    self.namespace = 'atrax/' + self.crawl_job.name
    self.cw = AwsConnections.cloudwatch()
    self.sqs = AwsConnections.sqs()
Example #2
def _get_contact_endpoints(self):
    contact_endpoints = {}
    for contact_name, details in ConfigFetcher(
            self.name).get_config_file().contacts.iteritems():
        for protocol, endpoint in json.loads(details).iteritems():
            contact_endpoints[endpoint] = protocol
    return contact_endpoints
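For context, the loop above inverts the contacts section into an endpoint-to-protocol map. A minimal, self-contained sketch of that inversion, with hypothetical contact data (the JSON shape of each contacts value is inferred from this example, not confirmed elsewhere):

import json

# Hypothetical contacts section, shaped the way this example uses
# get_config_file().contacts: each value is assumed to be a JSON string
# mapping protocol -> endpoint.
contacts = {'admin': '{"email": "admin@example.com", "sms": "+15005550006"}'}

contact_endpoints = {}
for contact_name, details in contacts.iteritems():
    for protocol, endpoint in json.loads(details).iteritems():
        contact_endpoints[endpoint] = protocol

assert contact_endpoints == {'admin@example.com': 'email',
                             '+15005550006': 'sms'}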
Example #3
    def subscribe_to_notifications(self):
        self.initialize_topics()

        for contact_name, details in ConfigFetcher(
                self.name).get_config_file().contacts.iteritems():
            for protocol, endpoint in json.loads(details).iteritems():
                for topic_arn in self.topic_arns.values():
                    self.sns_conn.subscribe(topic_arn, protocol, endpoint)
Example #4
def __init__(self, config, frontier):
    self.config = config
    self.config_fetcher = ConfigFetcher(self.config.job_name)
    self.frontier_controller = FrontierController(self.config.job_name)
    self.frontier = frontier
    self.queue_names = QueueKeyDict()
    self.logger = SimpleLogger(
        self.frontier.logger.log_table,
        self.frontier.logger.source.replace('frontier', 'seeder'))
Example #5
def main():
    local_logger = LocalLogger('frontier')
    local_logger.log(LogType.Info, 'Starting')

    if not __debug__:
        os.nice(-1)

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', action='store_true')
    parser.add_argument('job', type=str)
    args = parser.parse_args()

    config_fetcher = ConfigFetcher(args.job)
    config_file = config_fetcher.get_config_file()

    logger = SimpleLogger(
        get_or_create_domain(AwsConnections.sdb(),
                             CrawlJobGlossary(args.job).logs_table_name),
        create_frontier_id(config_file.global_config.environment))
    try:
        if config_file.global_config.environment == ComputeEnv.AWS:
            frontier = AwsFrontier(args.job, logger)
        else:
            frontier = LocalFrontier(args.job, logger)

        seeder = FrontierSeeder(config_file.global_config, frontier)
        seeder_thread = InterruptableThread(lambda t: seeder.run())
        seeder_thread.start()

        metrics_service = MetricsService(args.job, 10)
        metrics_service.start()

        frontier_service = RemoteFrontier(frontier)
        frontier_service.start()
        logger.log(LogType.Info, 'Started')
        frontier_service.join()
        if frontier_service.threw_exception:
            logger.log(LogType.InternalError, 'Unexpectedly stopped', None,
                       frontier_service.exception, frontier_service.exc_info)
    except SqsMessageRetentionException, ex:
        logger.log(LogType.InternalWarning, "Full-stopping crawl job", None,
                   ex, sys.exc_info())
        CrawlJobController(args.job).stop()
Example #6
    def __init__(self, crawl_job_name, logger):
        self.logger = logger
        config_fetcher = ConfigFetcher(crawl_job_name)
        config_file = config_fetcher.get_config_file()
        self._global_config = config_file.global_config
        self.crawl_job = CrawlJob(crawl_job_name, self._global_config)

        self._recurring_timer_interval = self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
        self.metrics = FrontierMetrics(self.crawl_job.name)

        local_fetcher_id = create_fetcher_id(self._global_config.environment, 0)

        # The minimum dequeue interval that every consumer must have in order
        # to be considered a queue donor.
        min_dequeue_interval = (1.0 / self._global_config.max_fetch_rate *
                                Frontier.DEQUEUE_INTERVAL_MARGIN)
        self._consumers = ConsumerCollection(local_fetcher_id,
                                             min_dequeue_interval,
                                             self.crawl_job.instance_accessor,
                                             self._recurring_timer_interval,
                                             self.logger)

        self._queues_by_name = {}
        self._unassigned_queues = deque()
        for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
            frontier_queue = FrontierQueue(queue)
            self._queues_by_name[queue.name] = frontier_queue
            if frontier_queue.count > 0:
                self._unassigned_queues.appendleft(frontier_queue)
            else:
                self._unassigned_queues.append(frontier_queue)

        self._is_scaling = True

        # This is a hack to serialize the execution of asynchronous operations into the main thread.
        self._zmq_context = zmq.Context.instance()
        self._zmq_socket = self._zmq_context.socket(zmq.REQ)
        self._zmq_socket.connect('tcp://%s:%s' %
                                 (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

        self._enqueue_count = 0
        self._dequeue_count = 0
        self._previous_emit_time = time.time()
Example #7
def purge_out_of_scope(self, crawl_job_name):
    """
    Go through crawled_urls and move them into skipped_urls and delete the content from s3.
    """
    # Todo: Not tested
    scope = CrawlerScope(ConfigFetcher(crawl_job_name).get_scope_file())
    crawl_job = CrawlJob(crawl_job_name)

    next_token_file_path = os.path.join(
        LOCAL_CRAWL_JOB_DIR, self.name,
        "purge_out_of_scope_next_crawled_urls_token.txt")

    with open(next_token_file_path, 'r') as next_token_file:
        prev_next_token = next_token_file.read()

    query = "select `url`, `referrer_id` from `{0}`".format(
        crawl_job.glossary.crawled_urls_table_name)
    try:
        items = crawl_job.crawled_urls.select(query,
                                              next_token=prev_next_token)
    except Exception:
        # The saved token may be missing or stale; fall back to a fresh scan.
        items = crawl_job.crawled_urls.select(query)

    next_token = items.next_token

    count = 0
    try:
        for item in items:
            count += 1

            if prev_next_token != items.next_token:
                prev_next_token = next_token
                next_token = items.next_token

            url_info = UrlInfo(item['url'], canonized_url=item['url'])
            c = scope.get(url_info)
            if c == UrlClass.InScope or item.name.endswith(
                    'robots.txt') or item.name.endswith('sitemap.xml'):
                continue

            attributes = {REASON_ATTR_NAME: c}
            referrer_id = item.get(REFERRER_ID_ATTR_NAME, None)
            if referrer_id:
                attributes[REFERRER_ID_ATTR_NAME] = referrer_id
            crawl_job.skipped_urls.put_attributes(item.name, attributes)
            key = crawl_job.crawled_content_bucket.get_key(url_info.s3_key)

            if key:
                key.delete()
            item.delete()  # Todo: do this in batches?
    except Exception, ex:
        with open(next_token_file_path, 'w') as next_token_file:
            next_token_file.write(prev_next_token)
        print "Interrupted after %s records." % count
        raise
Example #8
def enqueue_skipped(self):
    # Todo: Not tested
    crawl_job = CrawlJob(self.crawl_job_name)
    scope = CrawlerScope(
        ConfigFetcher(self.crawl_job_name).get_scope_file())
    frontier = get_frontier_client(self.ec2_instance, None)
    for item in crawl_job.skipped_urls.select(
            "select * from %s" % crawl_job.skipped_urls.name):
        url_info = UrlInfo(item.name)
        if scope.get(url_info) == UrlClass.InScope:
            url_info.referrer_id = item[REFERRER_ID_ATTR_NAME]
            frontier.enqueue(url_info)
            item.delete()  # Todo: do this in batches?
Example #9
def export_seeds_from_crawl_job(output_path, dest_crawl_job_name,
                                src_crawl_job_name, version):
    crawl_job = crawl_job_versions[version](src_crawl_job_name)
    scope = CrawlerScope(ConfigFetcher(dest_crawl_job_name).get_scope_file())
    query = "select `url` from `{0}` where `url` is not null and `redirectsTo` is null".format(
        crawl_job.crawled_urls.name)

    with open(output_path, 'w') as output_file:
        items = crawl_job.crawled_urls.select(query)

        count = 0
        try:
            for item in items:
                url = item['url']
                count += 1
                if scope.get(UrlInfo(url)) == UrlClass.InScope:
                    output_file.write(url + '\n')
        except Exception as ex:
            print "Interrupted after %s records" % count
            raise
Example #10
def _global_config(self):
    config_file = ConfigFetcher(self.name).get_config_file()
    return GlobalConfig(config_file.colon_delimited_dict('Global'))
Example #11
    def __init__(self, job_name, local_id=0):
        InterruptableThread.__init__(self)
        self.id = 'unknown'
        try:
            # Wait up to 30 seconds for a response on all sockets.
            socket.setdefaulttimeout(30)

            config_fetcher = ConfigFetcher(job_name)
            config_file = config_fetcher.get_config_file()
            self.global_config = config_file.global_config
            self.id = create_fetcher_id(self.global_config.environment,
                                        local_id)
            self.crawl_job = CrawlJob(job_name, self.global_config)
            self.logger = SimpleLogger(self.crawl_job.logs_table, self.id)
            self.local_logger = LocalLogger('fetcher', local_id)

            try:
                self.metrics = FetcherMetrics(self.crawl_job.name)
                self.notifier = CrawlJobNotifications(self.crawl_job.name)

                if self.global_config.reference_job:
                    if self.global_config.reference_job_version < self.global_config.version:
                        from atrax.prior_versions import crawl_job_versions

                        self.reference_job = crawl_job_versions[
                            self.global_config.reference_job_version](
                                self.global_config.reference_job)
                    else:
                        self.reference_job = CrawlJob(
                            self.global_config.reference_job)
                else:
                    self.reference_job = None

                self.url_extractor = UrlExtractor(self.logger)

                self.user_agent = config_file.user_agents['Standard'] % self.id
                non_compliant_ua = config_file.user_agents.get(
                    'NonCompliant', None)
                self.non_compliant_user_agent = (
                    non_compliant_ua % self.id) if non_compliant_ua else None

                self.select_original_query = \
                    "select itemName() from `{0}` where `{1}`='%s' and `{2}`='{3}' limit 1".format(
                        self.crawl_job.crawled_urls.name, FINGERPRINT_ATTR_NAME,
                        ORIGINAL_ATTR_NAME, ORIGINAL_ATTR_VALUE_SELF)

                scope_file = config_fetcher.get_scope_file()
                self.scope = CrawlerScope(scope_file)
                self.url_transformer = UrlTransformer(config_file)
                self.robots_txt_cache = {}

                self.downloader = self.initialize_downloader()

                # Initialize frontier client
                frontier_instance = (
                    self.crawl_job.instance_accessor.get_frontier_instance())
                self.frontier = get_frontier_client(frontier_instance,
                                                    self.new_queue_assigned)

                # A local list of URLs that this fetcher has seen since it started.
                self.local_seen_urls = TrieSet()

                # Don't crawl pages that are at least 90% similar to the previous 5 crawled pages in the URL's lineage.
                self.dust_info_factory = DustInfoFactory(.9, .1, 5)

                redis_instance = (
                    self.crawl_job.instance_accessor.get_redis_instance())
                self.redundant_url_detector = RedundantUrlDetector(
                    self.crawl_job.glossary.redundant_urls_key,
                    host=redis_instance.private_ip_address,
                    port=DEFAULT_REDIS_PORT)
            except Exception, ex:
                self.logger.log(LogType.InternalError, "Unexpectedly stopped",
                                None, ex, sys.exc_info())
                raise
        except Exception, ex:
            sys.stderr.write(
                "Failed to initialize fetcher %s:\n%s\nStack Trace: %s\n" %
                (self.id, ex, sys.exc_info()))
            raise
Example #12
import sys
from atrax.common.crawl_job import CrawlJob
from atrax.management.config_fetcher import ConfigFetcher
from atrax.frontier.remote_frontier_client import get_frontier_client


def new_queue_assigned(queue_name):
    print "Queue assigned: " + queue_name


job_name = sys.argv[1]
config_fetcher = ConfigFetcher(job_name)
config_file = config_fetcher.get_config_file()
global_config = config_file.global_config

crawl_job = CrawlJob(job_name, global_config)
frontier_instance = crawl_job.instance_accessor.get_frontier_instance()
frontier = get_frontier_client(frontier_instance, new_queue_assigned)

# msg_id, url_info = frontier.dequeue('i-5ab7b450:0')
msg_id, url_info = frontier.dequeue('0000:0')

if url_info:
    print "Dequeued: " + url_info.id
else:
    print "Dequeue failed"
Example #13
def global_config(self):
    if self._global_config is None:
        config_file = ConfigFetcher(self.name).get_config_file()
        self._global_config = GlobalConfig(
            config_file.colon_delimited_dict('Global'))
    return self._global_config
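Taken together, the examples share one pattern: build a ConfigFetcher from the crawl job name, fetch the parsed config (or scope) file once, and hand its sections to the other atrax components. A minimal sketch of that pattern, assuming the imports shown in Example #12 and using a placeholder job name:

from atrax.common.crawl_job import CrawlJob
from atrax.management.config_fetcher import ConfigFetcher

job_name = 'my-crawl-job'  # placeholder; a real crawl job name goes here

config_fetcher = ConfigFetcher(job_name)
config_file = config_fetcher.get_config_file()  # parsed crawl job configuration
global_config = config_file.global_config       # the 'Global' section

# The same fetcher also serves the scope file consumed by CrawlerScope
# (see Examples #7, #8, #9 and #11).
scope_file = config_fetcher.get_scope_file()

# Downstream components typically take the job name plus the global config.
crawl_job = CrawlJob(job_name, global_config)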