def destroy(self):
    """
    Terminate all instances and delete all crawl data and instance storage.

    Deleting the configuration can only happen manually or through Atrax Keeper.
    """
    if self.state.get() != CrawlJobState.STOPPED:
        self.notifications.stopping_crawl_job()

    s3 = AwsConnections.s3()
    crawl_job_glossary = CrawlJobGlossary(self.name)
    aws.s3.delete_non_empty_bucket(crawl_job_glossary.crawled_content_bucket_name)

    # Don't call self.stop() because we don't want the frontier controller to
    # attempt to persist the frontier. pause() terminates the fetchers and
    # stops the frontier.
    self.pause()

    sdb = AwsConnections.sdb()
    for table_name in crawl_job_glossary.table_names:
        if sdb.lookup(table_name):
            sdb.delete_domain(table_name)

    # TODO: implement
    # self.frontier_controller.destroy()

    self.notifications.delete_all_topics()

    crawl_job_state_table = sdb.get_domain(CRAWL_JOB_STATE_DOMAIN_NAME)
    crawl_job_state_table.delete_attributes(self.name)
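
# destroy() above relies on aws.s3.delete_non_empty_bucket() to remove the
# crawled content. A minimal sketch of such a helper, assuming boto 2's S3 API
# (the library style the surrounding code suggests); the actual Atrax
# implementation may differ:
def delete_non_empty_bucket(bucket_name):
    """Delete every key in the named bucket, then the bucket itself."""
    s3 = AwsConnections.s3()
    bucket = s3.lookup(bucket_name)
    if bucket is None:
        return  # Nothing to delete.
    # S3 refuses to delete a non-empty bucket, so empty it first.
    for key in bucket.list():
        key.delete()
    s3.delete_bucket(bucket_name)
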
@classmethod
def setUpClass(cls):
    cls._bucket = get_or_create_bucket(AwsConnections.s3(),
                                       'crawled-content-test-bucket')
    sdb = AwsConnections.sdb()
    cls._crawled_urls = sdb.lookup(
        CrawlJobGlossary('sel11122014').crawled_urls_table_name)
    cls._target = CrawledContent(cls._bucket)
    cls._content = "yada yada yada"
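
# The fixture above uses a get_or_create_bucket() helper. A plausible sketch,
# assuming boto 2's S3Connection API; the project's own helper may differ:
def get_or_create_bucket(s3, bucket_name):
    """Return the named bucket, creating it if it does not exist yet."""
    # lookup() returns None for a missing bucket instead of raising.
    return s3.lookup(bucket_name) or s3.create_bucket(bucket_name)
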
def __init__(self, name, global_config=None):
    self.name = name
    self.config = global_config
    self.glossary = CrawlJobGlossary(self.name)
    self._sdb = AwsConnections.sdb()
    self._s3 = AwsConnections.s3()

    # Backing fields for cached resources; populated on first access.
    self._logs_table = None
    self._crawled_urls = None
    self._failed_urls = None
    self._skipped_urls = None
    self._redirected_urls = None
    self._persisted_frontier_bucket = None
    self._crawled_content_bucket = None
    self._crawled_content = None
    self._seen_urls = None
    self._instance_accessor = None
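
# The None fields above suggest lazy initialization. A hypothetical property
# for one of them, reusing the crawled_urls_table_name attribute seen in the
# test fixture above; the remaining fields would follow the same pattern:
@property
def crawled_urls(self):
    # Look up the SimpleDB domain on first access and cache the handle.
    if self._crawled_urls is None:
        self._crawled_urls = self._sdb.lookup(
            self.glossary.crawled_urls_table_name)
    return self._crawled_urls
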
def __init__(self, crawl_job_name):
    self.crawl_job_name = crawl_job_name
    s3 = AwsConnections.s3()
    # lookup() returns None if the config bucket does not exist.
    self.bucket = s3.lookup(CONFIG_BUCKET_NAME)
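
# Because lookup() can leave self.bucket as None, readers of the config bucket
# should guard for that. A hypothetical companion method, assuming boto 2's
# Bucket.get_key()/Key.get_contents_as_string() API and that per-job config
# objects are keyed by crawl job name (an assumption, not shown in the source):
def get_config_contents(self):
    """Fetch this job's stored configuration object, or None if absent."""
    key = self.bucket.get_key(self.crawl_job_name) if self.bucket else None
    # get_key() returns None when no such object exists.
    return key.get_contents_as_string() if key else None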