def start_up(self):
    create_project_directory(self.project_name)
    create_overview_files(self.project_name, self.base_url)
    self.queue = file_to_set(self.queue_file)  # convert file to set for faster processing
    self.scraped = file_to_set(self.scraped_file)
    print("-- Scraper initialised --")
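# The helpers used above (create_project_directory, create_overview_files,
# file_to_set) are defined elsewhere in the project. A minimal sketch of what
# file_to_set might look like, assuming the queue/scraped files store one URL
# per line (an assumption, not the project's actual implementation):
def file_to_set(file_name):
    results = set()
    with open(file_name, "rt") as f:
        for line in f:
            results.add(line.rstrip("\n"))
    return results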
def run(content_num):
    queue_urls = file_to_set(QUEUE_FILE)
    for url in queue_urls:
        # Scrape each queued URL into its own numbered markdown file.
        scraper.scrape_page(url, content_file + str(content_num) + ".md")
        content_num += 1
    check_queue(content_num)
def run():
    queue_urls = file_to_set(QUEUE_FILE)
    for url in queue_urls:
        scraper.scrape_page(url, PROJECT_NAME + "/" + get_path_name(url) + ".md")
    if not is_file_empty(QUEUE_FILE):
        run()
    else:
        print("-- All pages from '" + get_sub_domain_name(HOMEPAGE) + "' were scraped --")
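# get_path_name, get_sub_domain_name and is_file_empty are assumed project
# helpers; the sketches below are hypothetical stand-ins built on the standard
# library (their names come from the code above, the bodies are assumptions):
import os
from urllib.parse import urlparse

def get_sub_domain_name(url):
    return urlparse(url).netloc  # e.g. "docs.example.com"

def get_path_name(url):
    # Flatten the URL path into a file-system-safe name, e.g. "/a/b" -> "a-b".
    path = urlparse(url).path.strip("/")
    return path.replace("/", "-") or "index"

def is_file_empty(file_name):
    return os.path.getsize(file_name) == 0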
def next_job():
    while True:
        crawled_links = file_to_set(CRAWLED_FILE)
        if len(crawled_links) > NUMBER_OF_SITES:
            # If we have crawled more links than NUMBER_OF_SITES, exit Crawler_Indexer_run.py.
            os._exit(0)
        url = queue.get()  # the next link in the queue
        Crawler.crawling(Crawler, url)  # call the crawling method from Crawler, passing the current URL
        queue.task_done()  # lets workers (threads) signal that a task is done
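# next_job() is the body of a worker thread. A sketch of how such workers
# might be started, assuming `queue` is a shared queue.Queue and
# NUMBER_OF_THREADS is a project setting (both names are assumptions here):
import threading
from queue import Queue

queue = Queue()

def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=next_job)
        t.daemon = True  # daemon threads exit when the main thread exits
        t.start()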
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        new_jobs()  # call the new_jobs() method
def new_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()  # will wait until enough task_done calls have been made
    crawl()  # call the crawl method again
def start_up():
    create_proj_directory(Crawler.project)
    create_files(Crawler.project, Crawler.base_url)
    Crawler.queue = file_to_set(Crawler.queue_file)
    Crawler.crawled = file_to_set(Crawler.crawled_file)
def check_queue(content_num):
    if len(file_to_set(QUEUE_FILE)) > 0:
        run(content_num)
    else:
        print("-- All pages from '" + get_sub_domain_name(HOMEPAGE) + "' were scraped --")
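# run(content_num) and check_queue(content_num) call each other until
# QUEUE_FILE is empty (scrape_page is assumed to remove finished URLs from the
# queue file elsewhere). A minimal sketch of kicking that loop off, assuming
# content numbering starts at 1 (an assumption):
if __name__ == "__main__":
    check_queue(1)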