def link_crawler(self, crawlable, max_delay):
    # requires: import random, sys, time, traceback (plus the Page class)
    main_url, product_url_regex = crawlable.get_home_page(), crawlable.get_product_pages()  # product_url_regex is currently unused
    main_page = Page(crawlable, main_url)
    main_crawling_pages = main_page.get_main_crawling_pages()
    crawl_queue = main_crawling_pages[:]
    current_main_page = 0
    main_pages_length = len(main_crawling_pages)
    all_visited, product_list = [], []
    products_visited = []
    while crawl_queue:
        url = crawl_queue.pop()
        try:
            if url not in all_visited:
                if url in main_crawling_pages:
                    current_main_page += 1
                    print('\n%d out of %d main pages\n' % (current_main_page, main_pages_length))
                if max_delay and max_delay > 0:
                    time.sleep(random.randint(0, max_delay))  # random delay makes the crawler a little harder to detect
                page = Page(crawlable, url)
                all_visited.append(url)
                if page.is_product and page.url not in products_visited:
                    product_list.append(page.get_product())
                    products_visited.append(page.url)
                page_links = page.get_page_links()
                for link in page_links:
                    if link not in all_visited:
                        crawl_queue.append(link)
        except Exception:
            traceback.print_exc(file=sys.stdout)
            if url not in all_visited:
                all_visited.append(url)
    print(str(len(product_list)) + ' products found.')
    return product_list
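# A minimal usage sketch (not from the original source): Crawler is assumed to be
# the class that defines link_crawler above, and DemoSite is a hypothetical
# crawlable exposing the two methods the crawler calls on it.
class DemoSite:
    def get_home_page(self):
        return 'http://www.example.com/'

    def get_product_pages(self):
        return r'/produto/.*'  # product-URL pattern; link_crawler currently ignores it

# products = Crawler().link_crawler(DemoSite(), max_delay=2)
# print('%d products crawled' % len(products))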
def findDedicateInPage(pageIndex, buff):
    print("START: scan page {0}".format(pageIndex))
    page = Page(URL, pageIndex, PAGE_SIZE)
    for photo in page.fetchPhotos():
        if Photo.fetchOneById(photo.id) is None:
            # no local record yet, so queue this photo for download
            buff.append(photo)
            print("SCAN: {0} is qualified because no record was found locally".format(photo.id))
        else:
            print("SKIP: {0} has a local record.".format(photo.id))
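# Hedged sketch of how findDedicateInPage might be driven; the page range, the
# TOTAL_PAGES name, and the "missing" buffer name are illustrative, not from the
# original source.
# missing = []
# for pageIndex in range(1, TOTAL_PAGES + 1):
#     findDedicateInPage(pageIndex, missing)
# print("FOUND: {0} photos with no local record".format(len(missing)))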
def test_get_product(self):
    crawler = Crawler()
    base_path = os.path.abspath('.') + os.sep + 'tests'
    file_base_path = 'file:///' + base_path
    link = os.path.join(file_base_path, 'produto_1.html')
    epoca = EpocaCosmeticos()
    print(epoca.get_product_pages())
    product = Page(EpocaCosmeticos(), link).get_product()
    self.assertEqual('Produto 1', product.name)
    self.assertEqual('Titulo do Produto 1', product.title)
    self.assertEqual(link, product.url)
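# Hedged sketch of the harness this test presumably runs under; the PageTest
# class name is an assumption, and the test method above would live inside it.
import unittest

class PageTest(unittest.TestCase):
    pass  # test_get_product from above would be defined here

if __name__ == '__main__':
    unittest.main()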
def _get_links_and_forms_and_store(self, url):
    if Page.query.filter(Page.url == url).first():
        return []
    r = requests.get(url, cookies=self.cookies)
    soup = BeautifulSoup(r.text, "html.parser")
    try:
        page = Page(self.website_id, url)
        page.save_to_db()
        self._get_forms(soup, page.id, url)
    except Exception as e:
        print("Row already exists", e)
    return self._get_links(url, soup)
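# Hedged sketch of the _get_links helper the method above delegates to; its real
# implementation is not shown, so this only illustrates the assumed contract:
# given a page URL and its parsed soup, return the absolute links on the page.
from urllib.parse import urljoin

def _get_links_sketch(url, soup):
    links = []
    for anchor in soup.find_all('a', href=True):  # only anchors that carry an href
        links.append(urljoin(url, anchor['href']))  # resolve relative URLs
    return links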
def getPaging(pageIndex, downloadPhotoFn, tracker):
    pageObject = Page(URL, pageIndex, PAGE_SIZE)
    photos = pageObject.fetchPhotos()
    print("PREPARE: create download task for Page: {0}".format(pageIndex))
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DOWNLOAD_WORKER) as downloadExecutor:
        executor = downloadExecutor
        # executor = None  # uncomment to download synchronously instead of via the pool
        for photo in photos:
            if tracker is not None:
                tracker.addTask(1)
            if executor is None:
                downloadPhotoFn(photo, tracker)
            else:
                executor.submit(downloadPhotoFn, photo, tracker)
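# Hedged sketch of a downloadPhotoFn compatible with getPaging above. The
# photo.url and photo.id attributes and the tracker.taskDone() call are
# assumptions inferred from how these objects are used in the snippets.
import requests

def downloadPhoto(photo, tracker):
    r = requests.get(photo.url, timeout=30)
    with open('{0}.jpg'.format(photo.id), 'wb') as f:
        f.write(r.content)
    if tracker is not None:
        tracker.taskDone()  # assumed counterpart of tracker.addTask(1)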
clock.tick(120)

# Menu Components
button_dimensions = (100, 40)
font_style = "lucidaconsole"
title_font = pygame.font.SysFont(font_style, 72)
title = title_font.render("BLOCKY", True, (255, 0, 0))
start_button = Button("start_button", (0, 0), button_dimensions, main, "Start",
                      pygame.font.SysFont(font_style, 12))
run_ai_button = Button("run_ai_button", (0, 0), button_dimensions, test_ai, "Run A.I.",
                       pygame.font.SysFont(font_style, 12))
# The info_page callback presumably refers to an info_page() function defined
# elsewhere; the name is rebound to a Page object at the bottom of this block.
info_button = Button("info_button", (0, 0), button_dimensions, info_page, "Info",
                     pygame.font.SysFont(font_style, 12))
button_list = [start_button, run_ai_button, info_button]
menu_page = Page(display, button_list)
menu_page.arrange_buttons("vertical",
                          (display_width // 2 - button_dimensions[0] // 2,
                           display_height // 2 - button_dimensions[1] // 2),
                          60)

# Info Components
info_image = pygame.image.load('data/control_info.jpg')
info_image = pygame.transform.scale(info_image, (display_width // 2, display_height // 2))
back_button = Button("back_button",
                     ((display_width - button_dimensions[0]) / 2, display_height / 2 + 100),
                     button_dimensions, menu, "Back",
                     pygame.font.SysFont(font_style, 12))
info_page = Page(display, [back_button])
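# Hedged sketch of an event loop that could drive the pages built above;
# Page.draw() and Button.handle_event() are assumptions about the Page/Button
# API, which is not shown in this section.
# while True:
#     for event in pygame.event.get():
#         if event.type == pygame.QUIT:
#             raise SystemExit
#         for button in button_list:
#             button.handle_event(event)  # assumed Button API
#     menu_page.draw()                    # assumed Page API
#     pygame.display.update()
#     clock.tick(120)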