Ejemplo n.º 1
0
    def destroy(self):
        """
        Tear down this crawl job: terminate all instances and delete all crawl
        data and instance storage.

        The job's configuration is not touched here; deleting it can only
        happen manually or through Atrax Keeper.
        """
        needs_stop_notice = self.state.get() != CrawlJobState.STOPPED
        if needs_stop_notice:
            self.notifications.stopping_crawl_job()

        # The returned connection is not used below; the call itself is kept so
        # the sequence of AwsConnections calls stays exactly as it was.
        AwsConnections.s3()
        glossary = CrawlJobGlossary(self.name)
        content_bucket_name = glossary.crawled_content_bucket_name
        aws.s3.delete_non_empty_bucket(content_bucket_name)

        # self.stop() is deliberately avoided: the frontier controller must not
        # attempt to persist the frontier of a job that is being destroyed.
        self.pause()  # this terminates the fetchers and stops the frontier

        simple_db = AwsConnections.sdb()
        for domain_name in glossary.table_names:
            if not simple_db.lookup(domain_name):
                # Nothing to delete for this table name.
                continue
            simple_db.delete_domain(domain_name)

        # TODO: implement
        # self.frontier_controller.destroy()

        self.notifications.delete_all_topics()

        # Finally remove this job's entry from the crawl job state domain.
        AwsConnections.sdb().get_domain(CRAWL_JOB_STATE_DOMAIN_NAME).delete_attributes(self.name)
Ejemplo n.º 2
0
 def setUpClass(cls):
     """
     Build the class-level fixtures: the test bucket, the crawled-urls table
     lookup result, the CrawledContent instance under test and sample content.
     """
     s3_connection = AwsConnections.s3()
     cls._bucket = get_or_create_bucket(s3_connection, 'crawled-content-test-bucket')

     simple_db = AwsConnections.sdb()
     urls_table_name = CrawlJobGlossary('sel11122014').crawled_urls_table_name
     cls._crawled_urls = simple_db.lookup(urls_table_name)

     # The object under test wraps the bucket obtained above.
     cls._target = CrawledContent(cls._bucket)
     cls._content = "yada yada yada"
Ejemplo n.º 3
0
    def __init__(self, name, global_config=None):
        """
        Record the crawl job name and optional global config, build the job's
        glossary, obtain the SimpleDB and S3 connections, and initialise every
        remaining private attribute to None.
        """
        self.name = name
        self.config = global_config
        self.glossary = CrawlJobGlossary(self.name)

        # Connections obtained from AwsConnections.
        self._sdb = AwsConnections.sdb()
        self._s3 = AwsConnections.s3()

        # Table placeholders; nothing is resolved at construction time.
        self._logs_table = self._crawled_urls = self._failed_urls = None
        self._skipped_urls = self._redirected_urls = None

        # Bucket and content placeholders.
        self._persisted_frontier_bucket = self._crawled_content_bucket = None
        self._crawled_content = None

        # Remaining placeholders.
        self._seen_urls = self._instance_accessor = None
Ejemplo n.º 4
0
 def __init__(self, crawl_job_name):
     """
     Remember the crawl job name and look up the configuration bucket
     (CONFIG_BUCKET_NAME) through the S3 connection.
     """
     self.crawl_job_name = crawl_job_name
     # NOTE(review): the lookup result is stored unchecked — confirm how a
     # missing bucket is reported and whether callers handle it.
     self.bucket = AwsConnections.s3().lookup(CONFIG_BUCKET_NAME)