def start_up(self):
    create_project_directory(self.project_name)
    create_overview_files(self.project_name, self.base_url)
    self.queue = file_to_set(self.queue_file)  # Convert file to set for faster processing
    self.scraped = file_to_set(self.scraped_file)
    print("-- Scraper initialised --")
Example #2
def run(content_num):
    queue_urls = file_to_set(QUEUE_FILE)

    for url in queue_urls:
        scraper.scrape_page(url, content_file + str(content_num) + ".md")
        content_num += 1
    check_queue(content_num)
def run():
    queue_urls = file_to_set(QUEUE_FILE)

    for url in queue_urls:
        scraper.scrape_page(url, PROJECT_NAME + "/" + get_path_name(url) + ".md")

    if not is_file_empty(QUEUE_FILE):
        run()
    else:
        print("-- All pages from '" + get_sub_domain_name(HOMEPAGE) + "' were scraped --")
def next_job():
    while True:
        crawled_links = file_to_set(CRAWLED_FILE)
        if len(crawled_links) > NUMBER_OF_SITES:
            # If we have crawled more links than NUMBER_OF_SITES, exit Crawler_Indexer_run.py.
            os._exit(0)
        url = queue.get()  # Get the next link in the queue.
        Crawler.crawling(Crawler, url)  # Call the crawling method from Crawler, passing the current URL.
        queue.task_done()  # Lets worker threads signal that this task is done.


def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        new_jobs()  # Hand the queued links over to the worker threads.


def new_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()  # Wait until a task_done() call has been made for every queued link.
    crawl()  # Re-check the queue file for newly discovered links.
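The three functions above assume a shared queue of URLs and a pool of worker threads that each run next_job() in a loop; that wiring is not part of the listing. A minimal sketch under that assumption (NUMBER_OF_THREADS and create_workers are illustrative names, and Crawler, CRAWLED_FILE, QUEUE_FILE, NUMBER_OF_SITES and the os import are expected to come from the surrounding Crawler_Indexer_run.py):

import threading
from queue import Queue

NUMBER_OF_THREADS = 8  # Assumed value; not part of the original listing.
queue = Queue()        # Shared work queue read by next_job() above.


def create_workers():
    # Daemon threads, so workers blocked on queue.get() do not keep the process alive.
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=next_job)
        t.daemon = True
        t.start()


create_workers()
crawl()  # Seed the workers with whatever QUEUE_FILE already contains.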
Example #7
def start_up():
    create_proj_directory(Crawler.project)
    create_files(Crawler.project, Crawler.base_url)
    Crawler.queue = file_to_set(Crawler.queue_file)
    Crawler.crawled = file_to_set(Crawler.crawled_file)
Example #8
def check_queue(content_num):
    if len(file_to_set(QUEUE_FILE)) > 0:
        run(content_num)
    else:
        print("-- All pages from '" + get_sub_domain_name(HOMEPAGE) + "' were scraped --")