Example #1
def purge_out_of_scope(self, crawl_job_name):
    """
    Go through crawled_urls and move them into skipped_urls and delete the content from s3.
    """
    # Todo: Not tested
    scope = CrawlerScope(ConfigFetcher(crawl_job_name).get_scope_file())
    crawl_job = CrawlJob(crawl_job_name)

    next_token_file_path = os.path.join(
        LOCAL_CRAWL_JOB_DIR, self.name,
        "purge_out_of_scope_next_crawled_urls_token.txt")

    # Resume from the checkpointed token if a previous run left one behind.
    prev_next_token = None
    if os.path.exists(next_token_file_path):
        with open(next_token_file_path, 'r') as next_token_file:
            prev_next_token = next_token_file.read()

    query = "select `url`, `referrer_id` from `{0}`".format(
        crawl_job.glossary.crawled_urls_table_name)
    # If the checkpointed token is stale or rejected, start over without it.
    try:
        items = crawl_job.crawled_urls.select(query,
                                              next_token=prev_next_token)
    except Exception:
        items = crawl_job.crawled_urls.select(query)

    next_token = items.next_token

    count = 0
    try:
        for item in items:
            count += 1

            # Keep prev_next_token one page behind the page currently being
            # processed so an interrupted run resumes without skipping records.
            if prev_next_token != items.next_token:
                prev_next_token = next_token
                next_token = items.next_token

            url_info = UrlInfo(item['url'], canonized_url=item['url'])
            c = scope.get(url_info)
            if c == UrlClass.InScope or item.name.endswith(
                    'robots.txt') or item.name.endswith('sitemap.xml'):
                continue

            attributes = {REASON_ATTR_NAME: c}
            referrer_id = item.get(REFERRER_ID_ATTR_NAME, None)
            if referrer_id:
                attributes[REFERRER_ID_ATTR_NAME] = referrer_id
            crawl_job.skipped_urls.put_attributes(item.name, attributes)
            key = crawl_job.crawled_content_bucket.get_key(url_info.s3_key)

            if key:
                key.delete()
            item.delete()  # Todo: do this in batches?
    except Exception, ex:
        with open(next_token_file_path, 'w') as next_token_file:
            next_token_file.write(prev_next_token)
        print "Interrupted after %s records." % count
        raise
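
The "Todo: do this in batches?" comment above points at a SimpleDB batch delete. A minimal sketch of what that could look like, assuming crawled_urls is a boto SimpleDB Domain object (its select()/next_token usage suggests as much); the helper name and the explicit batch size are illustrative, not taken from the project:

def delete_in_batches(domain, item_names, batch_size=25):
    # SimpleDB's BatchDeleteAttributes accepts at most 25 items per call.
    batch = {}
    for name in item_names:
        batch[name] = None  # None deletes every attribute, i.e. the whole item
        if len(batch) == batch_size:
            domain.batch_delete_attributes(batch)
            batch = {}
    if batch:
        domain.batch_delete_attributes(batch)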
Example #2
    def persist(self):
        if self.ec2_instance is not None and self.ec2_instance.state in [
                InstanceState.RUNNING, InstanceState.PENDING
        ]:
            raise PreconditionNotMet('Frontier is still in the %s state' %
                                     self.ec2_instance.state)

        crawl_job = CrawlJob(self.crawl_job_name)

        # Persist all queues with names that start with the crawl_job.name
        persist_to_s3(AwsConnections.sqs(), crawl_job.name,
                      crawl_job.persisted_frontier_bucket)
Example #3
 def enqueue_skipped(self):
     # Todo: Not tested
     crawl_job = CrawlJob(self.crawl_job_name)
     scope = CrawlerScope(
         ConfigFetcher(self.crawl_job_name).get_scope_file())
     frontier = get_frontier_client(self.ec2_instance, None)
     for item in crawl_job.skipped_urls.select("select * from %s" %
                                               crawl_job.skipped_urls.name):
         url_info = UrlInfo(item.name)
         if scope.get(url_info) == UrlClass.InScope:
             url_info.referrer_id = item[REFERRER_ID_ATTR_NAME]
             frontier.enqueue(url_info)
             item.delete()  # Todo: do this in batches?
Example #4
 def __init__(self, crawl_job_name, emit_interval):
     """
     :param crawl_job_name: Name of the crawl job whose metrics are emitted.
     :param emit_interval: How often to emit the metrics in minutes.
     """
     config_fetcher = ConfigFetcher(crawl_job_name)
     config_file = config_fetcher.get_config_file()
     self._global_config = config_file.global_config
     self.crawl_job = CrawlJob(crawl_job_name, self._global_config)
     self.emit_interval = emit_interval * SECONDS_PER_MINUTE
     self.namespace = 'atrax/' + self.crawl_job.name
     self.cw = AwsConnections.cloudwatch()
     self.sqs = AwsConnections.sqs()
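
The constructor above only wires up CloudWatch and SQS; a hedged sketch of the emit loop those fields imply (the metric name, the emit_forever name, and the use of queue.count() are assumptions for illustration, not the project's actual emitter):

import time

def emit_forever(self):
    # Periodically publish the combined depth of this crawl job's SQS queues
    # as a custom CloudWatch metric under self.namespace.
    while True:
        total = 0
        for queue in self.sqs.get_all_queues(self.crawl_job.name):
            total += queue.count()  # approximate visible-message count
        self.cw.put_metric_data(namespace=self.namespace,
                                name='FrontierQueueDepth',
                                value=total, unit='Count')
        time.sleep(self.emit_interval)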
Example #5
    def __init__(self, crawl_job_name, logger):
        self.logger = logger
        config_fetcher = ConfigFetcher(crawl_job_name)
        config_file = config_fetcher.get_config_file()
        self._global_config = config_file.global_config
        self.crawl_job = CrawlJob(crawl_job_name, self._global_config)

        self._recurring_timer_interval = self._global_config.lb_maintenance_cycle_period * SECONDS_PER_MINUTE
        self.metrics = FrontierMetrics(self.crawl_job.name)

        local_fetcher_id = create_fetcher_id(self._global_config.environment,
                                             0)

        # The minimum dequeue interval every consumer must have in order to be
        # considered a queue donor.
        min_dequeue_interval = \
            (1.0 / self._global_config.max_fetch_rate) * Frontier.DEQUEUE_INTERVAL_MARGIN
        self._consumers = ConsumerCollection(local_fetcher_id,
                                             min_dequeue_interval,
                                             self.crawl_job.instance_accessor,
                                             self._recurring_timer_interval,
                                             self.logger)

        self._queues_by_name = {}
        self._unassigned_queues = deque()
        for queue in AwsConnections.sqs().get_all_queues(self.crawl_job.name):
            frontier_queue = FrontierQueue(queue)
            self._queues_by_name[queue.name] = frontier_queue
            if frontier_queue.count > 0:
                self._unassigned_queues.appendleft(frontier_queue)
            else:
                self._unassigned_queues.append(frontier_queue)

        self._is_scaling = True

        # This is a hack to serialize the execution of asynchronous operations into the main thread.
        self._zmq_context = zmq.Context.instance()
        self._zmq_socket = self._zmq_context.socket(zmq.REQ)
        self._zmq_socket.connect('tcp://%s:%s' %
                                 (LOCALHOST_IP, str(DEFAULT_FRONTIER_PORT)))

        self._enqueue_count = 0
        self._dequeue_count = 0
        self._previous_emit_time = time.time()
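
The zmq.REQ socket above implies a matching zmq.REP socket serviced by the frontier's main thread. A minimal sketch of that REP side, assuming nothing about the real message format (the port value and the handler are placeholders, not the project's constants):

import zmq

FRONTIER_PORT = 5555  # placeholder for DEFAULT_FRONTIER_PORT

def serve_main_thread(handle_request):
    # Background threads connect REQ sockets and block until the main thread
    # has handled their request, which serializes the asynchronous work as
    # the "hack" comment above describes.
    context = zmq.Context.instance()
    socket = context.socket(zmq.REP)
    socket.bind('tcp://*:%d' % FRONTIER_PORT)
    while True:
        message = socket.recv()
        socket.send(handle_request(message) or b'ok')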
Example #6
 def restore(self):
     crawl_job = CrawlJob(self.crawl_job_name)
     restore_from_s3(AwsConnections.sqs(),
                     crawl_job.persisted_frontier_bucket,
                     queue_creator=FrontierQueue.create_sqs_queue)
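
Together with persist() in Example #2, restore() completes a stop/start round trip for the frontier: the SQS queues are written to S3 while the instance is down and recreated before it comes back. A hedged usage sketch; the controller object is an assumption, only persist() and restore() appear in the examples:

controller.persist()   # raises PreconditionNotMet while the instance is RUNNING or PENDING
# ... the frontier instance is replaced or restarted ...
controller.restore()   # rebuilds the queues via FrontierQueue.create_sqs_queue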
Example #7
    def __init__(self, job_name, local_id=0):
        InterruptableThread.__init__(self)
        self.id = 'unknown'
        try:
            # Wait up to 30 seconds for a response on all sockets.
            socket.setdefaulttimeout(30)

            config_fetcher = ConfigFetcher(job_name)
            config_file = config_fetcher.get_config_file()
            self.global_config = config_file.global_config
            self.id = create_fetcher_id(self.global_config.environment,
                                        local_id)
            self.crawl_job = CrawlJob(job_name, self.global_config)
            self.logger = SimpleLogger(self.crawl_job.logs_table, self.id)
            self.local_logger = LocalLogger('fetcher', local_id)

            try:
                self.metrics = FetcherMetrics(self.crawl_job.name)
                self.notifier = CrawlJobNotifications(self.crawl_job.name)

                if self.global_config.reference_job:
                    if self.global_config.reference_job_version < self.global_config.version:
                        from atrax.prior_versions import crawl_job_versions

                        self.reference_job = crawl_job_versions[
                            self.global_config.reference_job_version](
                                self.global_config.reference_job)
                    else:
                        self.reference_job = CrawlJob(
                            self.global_config.reference_job)
                else:
                    self.reference_job = None

                self.url_extractor = UrlExtractor(self.logger)

                self.user_agent = config_file.user_agents['Standard'] % self.id
                non_compliant_ua = config_file.user_agents.get(
                    'NonCompliant', None)
                self.non_compliant_user_agent = (
                    non_compliant_ua % self.id) if non_compliant_ua else None

                self.select_original_query = \
                    "select itemName() from `{0}` where `{1}`='%s' and `{2}`='{3}' limit 1".format(
                        self.crawl_job.crawled_urls.name, FINGERPRINT_ATTR_NAME,
                        ORIGINAL_ATTR_NAME, ORIGINAL_ATTR_VALUE_SELF)

                scope_file = config_fetcher.get_scope_file()
                self.scope = CrawlerScope(scope_file)
                self.url_transformer = UrlTransformer(config_file)
                self.robots_txt_cache = {}

                self.downloader = self.initialize_downloader()

                # Initialize frontier client
                frontier_instance = \
                    self.crawl_job.instance_accessor.get_frontier_instance()
                self.frontier = get_frontier_client(frontier_instance,
                                                    self.new_queue_assigned)

                # A local list of URLs that this fetcher has seen since it started.
                self.local_seen_urls = TrieSet()

                # Don't crawl pages that are at least 90% similar to the previous 5 crawled pages in the URL's lineage.
                self.dust_info_factory = DustInfoFactory(.9, .1, 5)

                redis_instance = \
                    self.crawl_job.instance_accessor.get_redis_instance()
                self.redundant_url_detector = RedundantUrlDetector(
                    self.crawl_job.glossary.redundant_urls_key,
                    host=redis_instance.private_ip_address,
                    port=DEFAULT_REDIS_PORT)
            except Exception, ex:
                self.logger.log(LogType.InternalError, "Unexpectedly stopped",
                                None, ex, sys.exc_info())
                raise
        except Exception, ex:
            sys.stderr.write(
                "Failed to initialize fetcher %s.: \n%s\nStack Trace:%s\n" %
                (self.id, ex, sys.exc_info()))
            raise
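
select_original_query above is formatted with a literal '%s' left in place, presumably to be filled with a content fingerprint when a duplicate page is detected. A minimal sketch of that lookup, with the method name invented for illustration:

def find_original(self, fingerprint):
    # Return the itemName() of the already-crawled page that owns this
    # fingerprint, or None if the content has not been seen before.
    query = self.select_original_query % fingerprint
    for item in self.crawl_job.crawled_urls.select(query):
        return item.name
    return None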
Example #8
import time

from atrax.common.crawl_job import CrawlJob
from python_common.collections.trie_set import TrieSet

crawl_job = CrawlJob('sel11122014')

trie_set = TrieSet()

total_items = 0
for domain in [
        crawl_job.crawled_urls, crawl_job.skipped_urls, crawl_job.failed_urls,
        crawl_job.redirected_urls
]:
    next_token = None
    query = "select itemName() from `%s`" % domain.name
    while True:
        items = domain.select(query, next_token=next_token)

        num_items = 0
        for item in items:
            trie_set.add(item.name)
            num_items += 1
            total_items += 1

        if items.next_token is None or num_items == 0:
            break
        next_token = items.next_token

print total_items
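
This select/next_token loop (and the similar one in Example #1) follows the same pagination pattern, which could be factored into a generator. A sketch only; iterate_domain is not a name from the project:

def iterate_domain(domain, query):
    # Yield every item matching `query`, following SimpleDB pagination tokens.
    next_token = None
    while True:
        items = domain.select(query, next_token=next_token)
        num_items = 0
        for item in items:
            num_items += 1
            yield item
        if items.next_token is None or num_items == 0:
            break
        next_token = items.next_token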
Example #9
import sys
from atrax.common.crawl_job import CrawlJob
from atrax.management.config_fetcher import ConfigFetcher
from atrax.frontier.remote_frontier_client import get_frontier_client


def new_queue_assigned(queue_name):
    print "Queue assigned: " + queue_name


job_name = sys.argv[1]
config_fetcher = ConfigFetcher(job_name)
config_file = config_fetcher.get_config_file()
global_config = config_file.global_config

crawl_job = CrawlJob(job_name, global_config)
frontier_instance = crawl_job.instance_accessor.get_frontier_instance()
frontier = get_frontier_client(frontier_instance, new_queue_assigned)

# msg_id, url_info = frontier.dequeue('i-5ab7b450:0')
msg_id, url_info = frontier.dequeue('0000:0')

if url_info:
    print "Dequeued: " + url_info.id
else:
    print "Dequeue failed"