Example #1
0
    def link_crawler(self, crawlable, max_delay):
        """Crawl the site described by ``crawlable`` and collect its products.

        The crawl starts from the pages returned by
        ``get_main_crawling_pages()`` of the home page and then follows every
        link reported by ``get_page_links()``. Each URL is processed at most
        once; a URL whose processing raises is reported on stdout and skipped.

        Args:
            crawlable: Site description object. Must provide
                ``get_home_page()`` and is handed to every ``Page`` built here.
            max_delay: Upper bound, in seconds, of the random pause taken
                before each page is fetched. A falsy or non-positive value
                disables the pause.

        Returns:
            A list holding the result of ``get_product()`` for every product
            page found, in the order the pages were visited.
        """
        main_page = Page(crawlable, crawlable.get_home_page())
        main_crawling_pages = main_page.get_main_crawling_pages()
        main_pages_length = len(main_crawling_pages)
        current_main_page = 0

        # The queue is consumed from its end (LIFO), so links found on a page
        # are explored before the remaining main pages.
        crawl_queue = list(main_crawling_pages)

        # Sets give O(1) membership tests; the crawl checks these for every
        # link it meets. Assumes URLs are hashable (e.g. strings).
        main_page_urls = set(main_crawling_pages)
        all_visited = set()
        products_visited = set()
        product_list = []

        while crawl_queue:
            url = crawl_queue.pop()
            if url in all_visited:
                # The same link may have been queued several times.
                continue

            try:
                if url in main_page_urls:
                    current_main_page += 1
                    print('\n%d out of %d main pages\n' %
                          (current_main_page, main_pages_length))

                if max_delay and max_delay > 0:
                    # Random pause so the requests look less like a bot.
                    time.sleep(random.randint(0, max_delay))

                page = Page(crawlable, url)
                all_visited.add(url)

                # page.url is compared (not url) so a product reached through
                # two different links is only recorded once.
                if page.is_product and page.url not in products_visited:
                    product_list.append(page.get_product())
                    products_visited.add(page.url)

                crawl_queue.extend(
                    link for link in page.get_page_links()
                    if link not in all_visited)

            except Exception:
                # Best effort: report the failure and never retry this URL.
                traceback.print_exc(file=sys.stdout)
                all_visited.add(url)

        print(str(len(product_list)) + ' products found.')
        return product_list
Example #2
0
def findDedicateInPage(pageIndex, buff):
	"""Collect the photos of one page that have no local record yet.

	Fetches the photos of page ``pageIndex`` and appends to ``buff`` every
	photo for which ``Photo.fetchOneById`` returns ``None``. Progress is
	reported on stdout; nothing is returned.
	"""
	print("START: scan page {0}".format(pageIndex))
	for candidate in Page(URL, pageIndex, PAGE_SIZE).fetchPhotos():
		if Photo.fetchOneById(candidate.id) is not None:
			print("SKIP: {0} has a local record.".format(candidate.id))
			continue
		# No local record was found: hand the photo to the caller's buffer.
		buff.append(candidate)
		print("SCAN: {0} is qulified because cannot find record locally".format(candidate.id))
Example #3
0
 def test_get_product(self):
     """Parse the local ``produto_1.html`` fixture and check the product fields."""
     # NOTE(review): ``crawler`` is never used below; kept in case the
     # constructor performs setup the test relies on - confirm and remove.
     crawler = Crawler()
     base_path = os.path.abspath('.') + os.sep + 'tests'
     file_base_path = 'file:///' + base_path
     link = os.path.join(file_base_path, 'produto_1.html')
     epoca = EpocaCosmeticos()
     # Call form of print: valid on Python 3 and, with a single argument,
     # prints the same text on Python 2.
     print(epoca.get_product_pages())
     product = Page(EpocaCosmeticos(), link).get_product()
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual('Produto 1', product.name)
     self.assertEqual('Titulo do Produto 1', product.title)
     self.assertEqual(link, product.url)
Example #4
0
    def _get_links_and_forms_and_store(self, url):
        """Store the page at ``url`` with its forms and return its links.

        Returns an empty list without fetching anything when a ``Page`` row
        with this URL already exists. Otherwise the URL is fetched with the
        session cookies, a ``Page`` row is saved, ``_get_forms`` is run on the
        parsed document, and the result of ``_get_links`` is returned.
        """
        already_stored = Page.query.filter(Page.url == url).first()
        if already_stored:
            return []

        response = requests.get(url, cookies=self.cookies)
        document = BeautifulSoup(response.text, "html.parser")

        try:
            stored_page = Page(self.website_id, url)
            stored_page.save_to_db()
            self._get_forms(document, stored_page.id, url)
        except Exception as error:
            # Any failure while storing the page or its forms is only
            # reported; the links are still returned below.
            print("Row already exists", error)

        return self._get_links(url, document)
Example #5
0
def getPaging(pageIndex, downloadPhotoFn, tracker):
	"""Fetch the photos of one page and download them on a thread pool.

	Args:
		pageIndex: Index of the page to fetch; passed to ``Page``.
		downloadPhotoFn: Callable invoked as ``downloadPhotoFn(photo, tracker)``
			for every photo of the page, on a worker thread.
		tracker: Optional progress tracker. When not ``None``,
			``tracker.addTask(1)`` is called once per photo before its
			download is scheduled.

	Returns:
		None. The function returns only after every scheduled download has
		finished, because leaving the ``with`` block shuts the pool down and
		waits for pending work.
	"""
	photos = Page(URL, pageIndex, PAGE_SIZE).fetchPhotos()

	print("PREPARE: create download task for Page: {0}".format(pageIndex))
	with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_DOWNLOAD_WORKER) as downloadExecutor:
		for photo in photos:
			if tracker is not None:
				tracker.addTask(1)

			# NOTE(review): the Future returned by submit() is discarded, so an
			# exception raised inside downloadPhotoFn is not surfaced here.
			downloadExecutor.submit(downloadPhotoFn, photo, tracker)
Example #6
0
        clock.tick(120)


# Menu Components
# Built at module level: the title surface, three menu buttons and the Page
# that groups them.
# Size shared by every button below - presumably (width, height); confirm
# against Button.
button_dimensions = (100, 40)
font_style = "lucidaconsole"
title_font = pygame.font.SysFont(font_style, 72)
# Pre-rendered title text: antialiased, colour (255, 0, 0).
title = title_font.render("BLOCKY", True, (255, 0, 0))
# Each button is created at (0, 0); arrange_buttons() below is presumably what
# gives them their final positions - TODO confirm.
# The fourth argument (main / test_ai / info_page) looks like the click
# callback - verify against Button.
start_button = Button("start_button", (0, 0), button_dimensions, main, "Start",
                      pygame.font.SysFont(font_style, 12))
run_ai_button = Button("run_ai_button", (0, 0), button_dimensions, test_ai,
                       "Run A.I.", pygame.font.SysFont(font_style, 12))
# NOTE(review): ``info_page`` is read here but is rebound to a Page at the end
# of this section; this line relies on an earlier definition of that name.
info_button = Button("info_button", (0, 0), button_dimensions, info_page,
                     "Info", pygame.font.SysFont(font_style, 12))
button_list = [start_button, run_ai_button, info_button]
menu_page = Page(display, button_list)
# The anchor point is the display centre shifted by half a button in each
# axis; 60 is presumably the spacing between buttons - TODO confirm.
menu_page.arrange_buttons("vertical",
                          (display_width // 2 - button_dimensions[0] // 2,
                           display_height // 2 - button_dimensions[1] // 2),
                          60)

# Info Components
# Controls image, loaded from disk and scaled to half the display width and
# half the display height.
info_image = pygame.image.load('data/control_info.jpg')
info_image = pygame.transform.scale(info_image,
                                    (display_width // 2, display_height // 2))
# NOTE(review): this position uses true division (/), unlike the floor
# division (//) used for the menu, so it can be a float on Python 3.
back_button = Button(
    "back_button",
    ((display_width - button_dimensions[0]) / 2, display_height / 2 + 100),
    button_dimensions, menu, "Back", pygame.font.SysFont(font_style, 12))
# Rebinds the module-level name ``info_page`` to the Page holding the back button.
info_page = Page(display, [back_button])