Example #1
    def multi_book_extractor_by_publishers_url(self, worker=2):
        """Split the publishers list across worker processes, run
        _extract_books_by_publishers in each of them, and merge the
        per-worker CSV files into a single output file.

        Parameters
        ----------
        worker : int
            Number of worker processes (each one runs its own Selenium
            driver).
        """
        _publishers = self._scrape_publishers(PUBLISHERS_URL)
        _split_publishers_list = split_to_sublist(the_list=_publishers,
                                                  number_of_sublist=worker)
        worker_files = []
        processes = []
        for index, publishers in enumerate(_split_publishers_list):
            # Give every worker its own CSV file to avoid write conflicts.
            file_name = "worker-" + str(index)
            worker_files.append(file_name + ".csv")
            process = Process(target=self._extract_books_by_publishers,
                              args=(publishers, file_name))
            process.start()
            processes.append(process)
        # Wait for every worker, not only the last one started.
        for process in processes:
            process.join()

        csv_handler = CSVHandler()
        joined_file_path = csv_handler.join_csv_files(worker_files,
                                                      out_put=self.output_file)
        csv_handler.remove_files(worker_files)

        return joined_file_path
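
A minimal usage sketch, assuming a Scrapper class that defines the method above together with the output_file attribute and PUBLISHERS_URL constant; none of these are shown here, so the names are assumptions. The __main__ guard matters because multiprocessing may re-import the module in each worker process.

# Hypothetical usage sketch; "Scrapper" and its constructor are assumed.
if __name__ == "__main__":
    scrapper = Scrapper()
    # Run four Selenium workers in parallel and merge their CSV output.
    merged_csv = scrapper.multi_book_extractor_by_publishers_url(worker=4)
    print("books written to", merged_csv)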
Example #2
    def _extract_books_by_publishers(self, publishers, file_name):
        """Scrape every publisher in the given list and append the books
        to a CSV file.

        Parameters
        ----------
        publishers : list
            Publisher URLs assigned to this worker.
        file_name : str
            Base name of the worker's CSV output file.
        """
        driver = None
        try:
            driver = SeleniumDriver().chrome_driver(
                without_browser=self.without_browser,
                optimized_mode=self.optimized_mode)
            csv_handler = CSVHandler()
            for publisher in publishers:
                books = self._scrape_books_by_publishers(publisher, driver)
                csv_handler.export_book_to_csv(books=books,
                                               file_name=file_name)
        except Exception as e:
            logging.error(e)
        finally:
            # Close the browser even if scraping fails part-way through.
            if driver is not None:
                driver.close()
Example #3
    def __init__(self, without_browser=True, optimized_mode=True):
        self.without_browser = without_browser
        self.optimized_mode = optimized_mode
        self.csv_handler = CSVHandler()
Example #4
import json
import logging
import os
import xml.etree.ElementTree as ET
from urllib.parse import unquote

import ijson
import requests
from furl import furl

# Book, Pagination, CSVHandler, SeleniumDriver, WEBSITE, TEMPLATE_URL and
# JSON_FILE_PATH are project-local names assumed by this example.


class BookScrapper(object):
    def __init__(self, without_browser=True, optimized_mode=True):
        self.without_browser = without_browser
        self.optimized_mode = optimized_mode
        self.csv_handler = CSVHandler()

    def __scrape_genres_urls(self):
        driver = SeleniumDriver().chrome_driver(without_browser=self.without_browser,
                                                optimized_mode=self.optimized_mode)
        driver.get(url=WEBSITE)
        menu_items = driver.find_element_by_class_name("side-nav")
        items = menu_items.find_elements_by_xpath(".//a[@href]")
        _list_genres = []
        for item in items:
            if "genre" in item.get_attribute("href"):
                _list_genres.append(item.get_attribute("href"))

        _list_genres_without_duplication = list(set(_list_genres))
        driver.close()
        return _list_genres_without_duplication

    def __extract_genres_id(self):
        genre_urls = self.__scrape_genres_urls()
        genres = []
        for url in genre_urls:
            unquote_url = furl(unquote(url))
            # Path segment 2 holds the genre id.
            genres.append(unquote_url.path.segments[2])
        return genres

    def extract_books(self):
        genres_id = self.__extract_genres_id()
        genres_id.sort()
        out_put = ""
        # for test
        # genres_id = genres_id[-1:]
        for id in genres_id:
            out_put = self.extract_books_api_by_category(id)
        return out_put

    def extract_books_api_by_category(self, category_id):
        category_url = self.__generate_category_url(category_id)
        books = self.__get_books_from_api(category_url)
        return self.csv_handler.export_book_to_csv(books, file_name="navar")

    def __get_books_from_api(self, category_url):
        logging.info("API CALLED")
        self.__remove_json_file()
        pagination = self.__get_next_offset_from_json()
        url = category_url
        logging.info(pagination.offset)
        list_books = []
        while pagination.hasMore:
            response = requests.get(url)
            response.encoding = 'UTF-8'
            logging.info(pagination.offset)
            self.__save_json_to_file(response.json(), JSON_FILE_PATH)
            pagination = self.__get_next_offset_from_json()
            # For this API, "offset" carries the next-page URL.
            url = pagination.offset
            books = self.__extract_books_from_json()
            list_books.extend(books)
        return list_books

    def __extract_books_from_json(self):
        with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
            logging.info('reading from json file')
            parser = ijson.parse(input_file)
            books = []
            book = None
            for prefix, event, value in parser:
                if prefix == "items.item.title":
                    # Each title event starts a new book. The book is
                    # appended immediately; later events fill in the
                    # remaining fields by mutating the same object.
                    book = Book()
                    book.title = value
                    book.author = " "
                    book.publisher = " "
                    logging.info(book.title)
                    books.append(book)
                elif book is None:
                    # Skip any events that arrive before the first title.
                    continue
                elif prefix == "items.item.products.item.price":
                    book.price = value
                elif prefix == "items.item.authors.item.firstName":
                    book.author = value
                elif prefix == "items.item.authors.item.lastName":
                    book.author = str(book.author) + " " + value
                elif prefix == "items.item.audioPublisherTitle":
                    book.publisher = value

            return books

    @staticmethod
    def __parse_xml(response):
        try:
            root_namespace = "{http://schemas.datacontract.org/2004/07/System.Web.Http.OData}"
            item_namespace = "{http://schemas.datacontract.org/2004/07/Navaar.Data}"

            root = ET.fromstring(response.data)
            next_page_url = root.find(root_namespace + "NextPageLink").text
            logging.info(next_page_url)

            books = []
            items = list(root.find(root_namespace + "Items"))
            for item in items:
                book = Book()
                book.publisher = item.find(item_namespace + "AudioPublisherTitle").text
                book.title = item.find(item_namespace + "Title").text
                products = item.find(item_namespace + 'Products')
                for product in products:
                    book.price = product.find(item_namespace + "Price").text
                authors = item.find(item_namespace + "Authors")
                for author in authors:
                    book.author = author.find(item_namespace + "FirstName").text + " " + author.find(
                        item_namespace + "LastName").text
                books.append(book)
            return books
        except Exception as e:
            logging.error(getattr(e, 'message', repr(e)))

    @staticmethod
    def __save_json_to_file(json_object, file_name):
        try:
            with open(file_name, 'w') as json_file:
                json.dump(json_object, json_file)
        except Exception as error:
            logging.error("JSON could not save")
            raise error

    @staticmethod
    def __remove_json_file():
        if os.path.exists(JSON_FILE_PATH):
            os.remove(JSON_FILE_PATH)

    @staticmethod
    def __get_next_offset_from_json():
        try:
            with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
                json_object = json.load(input_file)
                pagination = Pagination()
                # For this API the offset field carries the next-page URL.
                pagination.offset = json_object["nextPageLink"]
                pagination.hasMore = pagination.offset is not None
                return pagination
        except Exception:
            # No JSON file yet: this is the first request, assume more data.
            pagination = Pagination()
            pagination.offset = " "
            pagination.hasMore = True
            return pagination

    @staticmethod
    def __generate_category_url(category_id):
        unquote_url = furl(unquote(TEMPLATE_URL))
        "segment 3 belong to category"
        unquote_url.path.segments[3] = str(category_id)
        return unquote_url.url
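
The examples pass around Book and Pagination objects without ever showing them. A minimal sketch of what these project-local containers could look like, inferred from the attribute accesses above; the real project may define them differently.

# Hypothetical sketch of the project-local containers used above.
class Book(object):
    def __init__(self):
        self.title = ""
        self.author = ""
        self.publisher = ""
        self.category = ""
        self.price = ""
        self.page_number = ""


class Pagination(object):
    def __init__(self):
        # offset holds either a next-page URL or an offset string such as
        # "0-0-0-100"; hasMore tells the caller whether to keep requesting.
        self.offset = None
        self.hasMore = False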
Example #5
    def __init__(self, without_browser=False, optimized_mode=True):
        self.driver = SeleniumDriver().chrome_driver(
            without_browser=without_browser, optimized_mode=optimized_mode)
        logging.info("initialization finished")
        self.csv_handler = CSVHandler()
Example #6
import logging

from selenium.common.exceptions import NoSuchElementException

# Book, Category, Publisher, CSVHandler, SeleniumDriver, CATEGORY_URL and
# PUBLISHERS_URL are project-local names assumed by this example.


class BookScrapper(object):
    def __init__(self, without_browser=False, optimized_mode=True):
        self.driver = SeleniumDriver().chrome_driver(
            without_browser=without_browser, optimized_mode=optimized_mode)
        logging.info("initialization finished")
        self.csv_handler = CSVHandler()

    def extract_books_by_category(self):
        logging.info("extract_books_by_category started")
        categories = self.__scrape_categories_link(category_url=CATEGORY_URL)
        out_put_file = ""
        for category in categories:
            out_put_file = self.csv_handler.export_book_to_csv(
                self.__scrape_books_by_category(category),
                file_name="ketabrah")
        return out_put_file

    def extract_books_by_publishers(self):
        logging.info("extract_books_by_publisher started")
        publishers = self.__scrape_publishers_link(
            publishers_url=PUBLISHERS_URL)
        out_put_file = ""
        # for test
        # publishers = publishers[-2:]
        for publisher in publishers:
            out_put_file = self.csv_handler.export_book_to_csv(
                self.__scrape_books_by_publishers(publisher),
                file_name="ketabrah")
        return out_put_file

    def __scrape_books_by_category(self, category):
        list_books = []
        page_index = 1
        while True:
            url = self.__generate_url(category.url, page_index)
            self.driver.get(url)
            try:
                books_list_element = self.driver.find_element_by_class_name(
                    "book-list")
                book_elements = books_list_element.find_elements_by_class_name(
                    "item")
                for book in book_elements:
                    try:
                        book_instance = Book()
                        book_instance.title = book.find_element_by_class_name(
                            "title").text
                        logging.info(book_instance.title)
                        book_instance.author = book.find_element_by_class_name(
                            "authors").text
                        book_instance.price = book.find_element_by_class_name(
                            "price").text
                        book_instance.category = category.title
                        list_books.append(book_instance)
                    except NoSuchElementException:
                        # Skip items that are missing an expected field.
                        continue
                page_index += 1
            except NoSuchElementException:
                # No "book-list" on this page: past the last page, stop.
                return list_books

    @staticmethod
    def __generate_url(url, page_index):
        _url = url + "/page-" + str(page_index)
        return _url

    def __scrape_categories_link(self, category_url):
        categories = []
        self.driver.get(url=category_url)
        menu_items = self.driver.find_element_by_class_name("cr-menu")
        items = menu_items.find_elements_by_class_name("crm-item")
        for item in items:
            item_url = item.find_element_by_xpath(
                ".//a[@href]").get_attribute("href")
            if "book-category" in item_url:
                category = Category()
                category.url = item_url
                logging.info(item.text)
                category.title = item.text
                categories.append(category)
        return categories

    def __scrape_publishers_link(self, publishers_url):
        self.driver.get(publishers_url)
        publishers_list_element = self.driver.find_element_by_class_name(
            "publishers-list")
        publisher_blocks_elements = publishers_list_element.find_elements_by_class_name(
            "publisher-block")
        list_publishers = []
        for publisher_block in publisher_blocks_elements:
            publisher = Publisher()
            publisher.url = publisher_block.get_attribute("href")
            publisher.name = publisher_block.find_element_by_class_name(
                "publisher-block-name").text
            logging.info(publisher.name)
            list_publishers.append(publisher)
        return list_publishers

    def __scrape_books_by_publishers(self, publisher):
        list_books = []
        page_index = 1
        while True:
            url = self.__generate_url(publisher.url, page_index)
            self.driver.get(url)
            try:
                books_list_element = self.driver.find_element_by_class_name(
                    "book-list")
                book_elements = books_list_element.find_elements_by_class_name(
                    "item")
                for book in book_elements:
                    try:
                        book_instance = Book()
                        book_instance.title = book.find_element_by_class_name(
                            "title").text
                        logging.info(book_instance.title)
                        book_instance.author = book.find_element_by_class_name(
                            "authors").text
                        book_instance.price = book.find_element_by_class_name(
                            "price").text
                        book_instance.publisher = publisher.name
                        list_books.append(book_instance)
                    except NoSuchElementException:
                        # Skip items that are missing an expected field.
                        continue
                page_index += 1
            except NoSuchElementException:
                # No "book-list" on this page: past the last page, stop.
                return list_books
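
Both scraping loops rely on the same termination trick: they request page-1, page-2, and so on until the page no longer contains a "book-list" element and Selenium raises NoSuchElementException. A usage sketch, assuming the project-local SeleniumDriver wrapper and URL constants are configured:

# Hypothetical usage sketch; configuration constants are assumed.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scrapper = BookScrapper(without_browser=True)
    csv_path = scrapper.extract_books_by_publishers()
    print("books written to", csv_path)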
Example #7
    def __init__(self):
        self.__remove_json_file()
        self.response_count = 100
        self.INITIAL_OFFSET = "0-0-0-100"
        self.csv_handler = CSVHandler()
Example #8
import json
import logging
import os
from urllib.parse import unquote

import ijson
import requests
from furl import furl

# Book, Pagination, CSVHandler, TEMPLATE_URL, HOST and JSON_FILE_PATH are
# project-local names assumed by this example.


class BookScrapper(object):
    def __init__(self):
        self.__remove_json_file()
        self.response_count = 100
        self.INITIAL_OFFSET = "0-0-0-100"
        self.csv_handler = CSVHandler()

    def extract_books_api_by_category(self, category_id):
        __pagination = self.__get_next_offset_from_json()
        out_put_file = ""
        while __pagination.hasMore:
            url = self.__generate_url_pagination_by_category(
                category_id, offset=__pagination.offset)
            out_put_file = self.csv_handler.export_book_to_csv(
                file_name="taghche",
                books=self.__get_books_from_api(url=url))
            __pagination = self.__get_next_offset_from_json()
        return out_put_file

    def set_response_count(self, count: int):
        self.INITIAL_OFFSET = "0-0-0-" + str(count)
        self.response_count = count

    @staticmethod
    def __generate_url_pagination_by_category(category_id, offset):
        # Rewrite the "filters" JSON query parameter of the template URL so
        # its first filter targets the requested category, then attach the
        # pagination offset to the host URL.
        _unquote_url = furl(unquote(TEMPLATE_URL))
        _filters = json.loads(_unquote_url.query.params["filters"])
        _filters["list"][0]["value"] = str(category_id)
        _category_filter = json.dumps(_filters)
        _paginated_furl = furl(unquote(HOST)).add(args={
            'filters': _category_filter,
            'offset': offset
        })
        return _paginated_furl.url

    def __get_books_from_api(self, url):
        logging.info("API Called")
        r = requests.get(url)
        r.encoding = 'UTF-8'
        self.__save_json_to_file(r.json(), JSON_FILE_PATH)

        with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
            logging.info('reading from json file')
            json_books = ijson.items(input_file, "bookList.books.item")
            books = []
            for book in json_books:
                book_instance = Book()
                book_instance.title = book["title"]
                book_instance.page_number = book["numberOfPages"]
                book_instance.price = book["price"]
                book_instance.publisher = book["publisher"]
                book_instance.author = self.convert_authors_to_string(
                    book["authors"])
                book_instance.category = self.convert_categories_to_string(
                    book["categories"])
                # Other fields the API exposes (not extracted here):
                # beforeOffPrice, rating, physicalPrice, publishDate.
                books.append(book_instance)
        return books

    @staticmethod
    def convert_authors_to_string(list_authors):
        authors_name = " "
        for author in list_authors:
            authors_name = authors_name + str(author["firstName"]) + " " + str(
                author["lastName"]) + " "
        return authors_name

    @staticmethod
    def convert_categories_to_string(list_categories):
        category_name = " "
        for category in list_categories:
            category_name = category_name + str(category["title"]) + " "
        return category_name

    @staticmethod
    def change_offset_length(offset, length=20):
        # Offsets look like "a-b-c-<page size>"; replace the last segment,
        # e.g. change_offset_length("0-0-0-20", 100) -> "0-0-0-100".
        _custom_offset = offset.split('-')
        _custom_offset[-1] = str(length)
        _offset = '-'.join(_custom_offset)
        logging.info(offset)
        return _offset

    def __get_next_offset_from_json(self):
        try:
            with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
                json_object = json.load(input_file)
                pagination = Pagination()
                pagination.hasMore = json_object["hasMore"]
                pagination.offset = self.change_offset_length(
                    json_object["nextOffset"], self.response_count)
                return pagination
        except Exception:
            # No JSON file yet: first request, start from the initial offset.
            pagination = Pagination()
            pagination.hasMore = True
            pagination.offset = self.INITIAL_OFFSET
            return pagination

    @staticmethod
    def __save_json_to_file(json_object, file_name):
        try:
            with open(file_name, 'w') as json_file:
                json.dump(json_object, json_file)
        except Exception as error:
            logging.error("JSON could not save")
            raise error

    @staticmethod
    def __remove_json_file():
        if os.path.exists(JSON_FILE_PATH):
            os.remove(JSON_FILE_PATH)
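
A usage sketch for this API-based variant; the category id is a made-up placeholder, and set_response_count simply rewrites the page-size segment of the initial offset:

# Hypothetical usage sketch; 42 is a placeholder category id.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scrapper = BookScrapper()
    scrapper.set_response_count(50)  # fetch 50 books per API call
    csv_path = scrapper.extract_books_api_by_category(42)
    print("books written to", csv_path)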