Example #1
    async def crawl_pages(self, category):
        cat_id = self.categories.get(category)
        offset = 0
        max_results = 50
        auctions = list()

        while True:
            url = self.search_category_url_format.format(
                cat_id=cat_id, skip=offset, max_num_of_results=max_results)
            _, page_content = await self.extract_async(url)
            # `items` must stay bound even when the fetch fails, otherwise
            # the length check below would raise a NameError.
            items = []
            if page_content is not None:
                # json.loads() no longer accepts an `encoding` keyword;
                # decoding the bytes beforehand is sufficient.
                json_obj = json.loads(page_content.decode("utf-8"))

                items = json_obj.get("Items") or []
                auctions.extend(items)

            offset += max_results

            # A short page means this was the last batch of results.
            if len(items) < max_results:
                break

        log.debug("Found: %d auctions of category: %s" %
                  (len(auctions), category))

        output_dir = self.output_dir_path_format.format(category=category)
        csv_file_path = os.path.join(
            output_dir, "{category}.csv".format(category=category))

        log.info("Csv output directory path: %s, csv file: %s" %
                 (output_dir, csv_file_path))

        Util.create_directory(output_dir)

        csv_manager = CsvManager(csv_file_path, self.fields, "id")
        csv_manager.open_file()

        # Parse every auction collected above, not only the last fetched page.
        tasks = (self.parse_item(category, item) for item in auctions)
        for res in AsyncCrawler.limited_as_completed(tasks, 5):
            extracted_data = await res

            if csv_manager.check_row_exist(extracted_data):
                extracted_data["flag"] = self.flags.get("updated")
            else:
                extracted_data["flag"] = self.flags.get("new")

            csv_manager.update_row(extracted_data)

            auction_output_dir = os.path.join(output_dir,
                                              extracted_data.get("id"))
            Util.create_directory(auction_output_dir)

            if extracted_data.get("images") is not None:
                images_urls = extracted_data.get("images").split('|')

                local_img = list()

                for img_url in images_urls:
                    local_img_file_path = os.path.join(
                        auction_output_dir, "{img_id}.jpg".format(
                            img_id=self.get_image_id(img_url)))

                    if not Util.check_file_exist(local_img_file_path):
                        local_img.append((img_url, local_img_file_path))

                download_tasks = (self.download_file(img_url, img_file_path)
                                  for img_url, img_file_path in local_img)

                for r in AsyncCrawler.limited_as_completed(download_tasks):
                    await r

        csv_manager.close_file()
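
All of the examples here drive their fetches through AsyncCrawler.limited_as_completed, which is not part of the snippets. A minimal sketch of how such a bounded-concurrency helper can be built on plain asyncio follows; the function name mirrors the calls above, while the default limit and the exact signature are assumptions:

import asyncio
import collections
import itertools


def limited_as_completed(coros, limit=10):
    """Yield awaitables whose results arrive in completion order while at
    most `limit` coroutines run at once (sketch only; each yielded awaitable
    is expected to be awaited before the next one is requested, as in the
    crawl loops above)."""
    coros = iter(coros)
    # Prime the window with the first `limit` tasks. This runs lazily, i.e.
    # inside the caller's already running event loop.
    pending = {asyncio.ensure_future(c)
               for c in itertools.islice(coros, limit)}
    finished = collections.deque()

    async def next_result():
        nonlocal pending
        while not finished:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            finished.extend(done)
            # Refill the window with one new task per completed one.
            for c in itertools.islice(coros, len(done)):
                pending.add(asyncio.ensure_future(c))
        return finished.popleft().result()

    # One awaitable per input coroutine.
    while pending or finished:
        yield next_result()
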
Example #2
    async def crawl_pages(self, category, max_pages):
        pages = (self.search_category_url_format.format(
            category=category, page_number=page_number)
                 for page_number in range(1, max_pages + 1))

        auctions_links = list()

        tasks = (self.extract_async(url) for url in pages)
        for page in AsyncCrawler.limited_as_completed(tasks, 5):
            url, page_content = await page
            if url is not None and page_content is not None:
                auctions_links.extend(
                    self.parse_search_result_page(page_content))

        if not auctions_links:
            log.warning("No results found for category: %s" % category)
            return

        log.debug("Found: %d auctions in %d pages of category: %s" %
                  (len(auctions_links), max_pages, category))

        output_dir = self.output_dir_path_format.format(category=category)
        csv_file_path = os.path.join(
            output_dir, "{category}.csv".format(category=category))

        Util.create_directory(output_dir)

        csv_manager = CsvManager(csv_file_path, self.fields, "id")
        csv_manager.open_file()

        for auction_url in auctions_links:
            self.driver.get(auction_url)

            extracted_data = self.parse_data(category, auction_url,
                                             self.driver.page_source)
            if csv_manager.check_row_exist(extracted_data):
                log.debug("row already exists in csv")
                extracted_data["flag"] = self.flags.get("updated")
            else:
                log.debug("row is new")
                extracted_data["flag"] = self.flags.get("new")

            csv_manager.update_row(extracted_data)

            auction_output_dir = os.path.join(output_dir,
                                              extracted_data.get("id"))
            Util.create_directory(auction_output_dir)

            if extracted_data.get("images") is not None:
                images_urls = extracted_data.get("images").split('|')

                local_img = list()

                for img_url in images_urls:
                    local_img_file_path = os.path.join(
                        auction_output_dir, "{img_id}.png".format(
                            img_id=self.get_image_id(img_url)))

                    if not Util.check_file_exist(local_img_file_path):
                        local_img.append((img_url, local_img_file_path))

                download_tasks = (self.download_file(img_url, img_file_path)
                                  for img_url, img_file_path in local_img)

                for r in AsyncCrawler.limited_as_completed(download_tasks):
                    await r

        csv_manager.close_file()
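
CsvManager is another helper these examples rely on without defining it. Based only on the calls above (a constructor taking a path, a field list, and a key column, plus open_file, check_row_exist, update_row, and close_file), a rough sketch of the upsert-by-id behaviour could look like this; every detail beyond those names is an assumption:

import csv
import os


class CsvManager:
    """Sketch of a CSV store keyed on a single column (assumed behaviour)."""

    def __init__(self, file_path, fields, key_field):
        self.file_path = file_path
        self.fields = fields
        self.key_field = key_field
        self.rows = {}

    def open_file(self):
        # Load any existing rows so check_row_exist() can spot duplicates.
        if os.path.isfile(self.file_path):
            with open(self.file_path, newline="", encoding="utf-8") as f:
                for row in csv.DictReader(f):
                    self.rows[row[self.key_field]] = row

    def check_row_exist(self, row):
        return row[self.key_field] in self.rows

    def update_row(self, row):
        # Insert the row, or overwrite the one that shares its key.
        self.rows[row[self.key_field]] = row

    def close_file(self):
        with open(self.file_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=self.fields,
                                    extrasaction="ignore")
            writer.writeheader()
            writer.writerows(self.rows.values())

This sketch only persists rows on close_file(); the real helper may well write incrementally.
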
Example #3
    async def crawl_pages(self, category, max_pages):
        pages = (self.search_link_format.format(category=category,
                                                page_number=page_number)
                 for page_number in range(1, max_pages + 1))

        auctions_links = list()

        tasks = (self.extract_async(url) for url in pages)
        for page in AsyncCrawler.limited_as_completed(tasks, 5):
            url, page_content = await page
            if url is not None and page_content is not None:
                auctions_links.extend(
                    self.parse_search_result_page(page_content))

        if not auctions_links:
            log.warning("No results found for category: %s" % category)
            return

        log.debug("Found: %d auctions in %d pages of category: %s" %
                  (len(auctions_links), max_pages, category))

        output_dir = self.output_dir_path_format.format(category=category)
        csv_file_path = os.path.join(
            output_dir, "{category}.csv".format(category=category))

        Util.create_directory(output_dir)

        csv_manager = CsvManager(csv_file_path, self.fields, "id")
        csv_manager.open_file()
        '''
        tasks = (self.extract_multi_async([url.replace("aukcja", "zdjecia"), url]) for url in auctions_links)
        for pages in AsyncCrawler.limited_as_completed(tasks):
            results = await pages
            images_url, images_page_content = results[0]
            url, page_content = results[1]
        '''
        tasks = (self.extract_async(url) for url in auctions_links)
        for page in AsyncCrawler.limited_as_completed(tasks, 5):
            url, page_content = await page
            if url is not None and page_content is not None:
                extracted_data = self.parse_data(category, url, page_content)

                images_links = list()
                images_url = url.replace("aukcja", "zdjecia")
                _, images_page_content = await self.extract_async(images_url)
                if images_url is not None and images_page_content is not None:
                    images_links = self.parse_full_images_page(
                        images_page_content)
                    extracted_data["images"] = '|'.join(images_links)

                if csv_manager.check_row_exist(extracted_data):
                    if _translate.get("finished") in extracted_data.get(
                            "stop").lower():
                        extracted_data["flag"] = self.flags.get("sold")
                    else:
                        extracted_data["flag"] = self.flags.get("updated")
                else:
                    extracted_data["flag"] = self.flags.get("new")

                csv_manager.update_row(extracted_data)

                auction_output_dir = os.path.join(output_dir,
                                                  extracted_data.get("id"))
                Util.create_directory(auction_output_dir)

                if extracted_data.get("images") is not None:
                    images_urls = extracted_data.get("images").split('|')

                    local_img = list()

                    for img_url in images_urls:
                        local_img_file_path = os.path.join(
                            auction_output_dir, "{img_id}.jpg".format(
                                img_id=self.get_image_id(img_url)))

                        if not Util.check_file_exist(local_img_file_path):
                            local_img.append((img_url, local_img_file_path))

                    download_tasks = (self.download_file(
                        img_url, img_file_path)
                                      for img_url, img_file_path in local_img)

                    for r in AsyncCrawler.limited_as_completed(download_tasks):
                        await r

            else:
                log.error("url or page_content is None: %s" % url)

        csv_manager.close_file()
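
Each variant ends by fetching auction images through self.download_file, which is likewise not shown. A small illustrative coroutine using aiohttp could look like the following; the library choice, the status check, and the per-call session are assumptions, and in the crawler itself a single session owned by AsyncCrawler would be preferable:

import aiohttp


async def download_file(url, file_path):
    """Fetch `url` and write the response body to `file_path` (sketch)."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                # Skip failed downloads; the crawl loops above simply await
                # the result and move on.
                return None
            data = await response.read()

    with open(file_path, "wb") as f:
        f.write(data)
    return file_path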