Example #1
0
    def read_file(self, file_name: str, folder: str) -> str:
        """Return the text content of a file located below ``self.ROOT_DIR``.

        If the file does not exist yet, an empty file is created first, so
        the subsequent read yields an empty string.

        :param file_name: Name of the file; passed through
            ``clean_file_path_variables`` before it is used.
        :param folder: Folder below ``self.ROOT_DIR``; cleaned the same way.
        :return: The file content, or ``None`` when the file could not be
            read (the error is logged, not raised).
        """
        folder = self.clean_file_path_variables(path_element=folder)
        file_name = self.clean_file_path_variables(path_element=file_name)

        file_path = str(pathlib.Path.joinpath(self.ROOT_DIR, folder,
                                              file_name))

        if not os.path.isfile(file_path):
            try:
                with open(file_path, "w", encoding="UTF-8") as file:
                    file.write("")

            except IOError as e:
                LogHandler.log_message(
                    f"{self.HEADER}: ERROR: Could not find the file {file_path} and tried to create an empty one. "
                    f"An error while trying to create the empty file. Msg.: {str(e)}"
                )

        try:
            # Read with the same encoding the file is created with; relying
            # on the platform default breaks on non-UTF-8 locales.
            with open(file_path, "r", encoding="UTF-8") as file:
                content = file.read()

        except IOError as e:
            LogHandler.log_message(
                f"{self.HEADER}: ERROR: An error occurred while trying to read the file {file_path}. Msg.: {str(e)}"
            )
            content = None

        return content
Example #2
0
 def switch_to_run_scraper_success(self):
     """Hide the run-scraper column and show the scraper-success column."""
     message = f"{self.HEADER}: Changing window to 'Run-Top-Products-Scraper-Success'-Section"
     LogHandler.log_message(message)

     # Hide the old column first, then reveal the new one.
     for element_key, is_visible in (
             ("run_scraper_col", False),
             ("run_scraper_success_col", True),
     ):
         self.main_window.FindElement(element_key).Update(visible=is_visible)

     self.update_window()
Example #3
0
    def write_result_to_csv(self,
                            field_names: list,
                            rows_for_print: list,
                            file_name_prefix: str = "",
                            file_name_suffix: str = ".csv"):
        """Write the result rows to a dated, semicolon-separated csv-file.

        The file is written to ``<ROOT_DIR>/csv_output`` and is named
        ``<file_name_prefix>_<today's date><file_name_suffix>``.

        :param field_names: Column names; written as header row and used by
            ``csv.DictWriter`` to order the values of every row.
        :param rows_for_print: Dicts to write, one per csv row.
        :param file_name_prefix: Text in front of the date in the file name.
        :param file_name_suffix: Text after the date (including extension).
        :return: ``0`` after the file has been written.
        :raises Exception: If an IOError occurs while writing; the IOError
            is attached as the cause.
        """
        file_name = f"{file_name_prefix}_{datetime.today().date()}{file_name_suffix}"
        file_path = pathlib.Path.joinpath(self.ROOT_DIR, "csv_output",
                                          file_name)

        LogHandler.log_message(
            f"{self.HEADER} INFO: Start writing results to a new csv-file. File: {file_path}"
        )

        try:
            # NOTE(review): no explicit encoding is given, so the platform
            # default is used - confirm which encoding the consumers of the
            # csv expect before pinning one here.
            with open(str(file_path), "w", newline="") as csv_file:
                writer = csv.DictWriter(csv_file,
                                        fieldnames=field_names,
                                        dialect="excel",
                                        delimiter=";")
                writer.writeheader()
                writer.writerows(rows_for_print)

        except IOError as e:
            # Chain the original error so the traceback keeps the root cause.
            raise Exception(
                f"{self.HEADER}: ERROR: An error occurred while trying to write the results to a new csv. "
                f"Msg.: {str(e)}") from e

        LogHandler.log_message(
            f"{self.HEADER} INFO: Finished writing results to a new csv-file")

        return 0
Example #4
0
 def switch_to_change_settings_kindle(self):
     """Hide the main-menu column and show the kindle settings column."""
     LogHandler.log_message(f"{self.HEADER}: Changing window to 'Change Settings E-Books'-Section")

     window = self.main_window

     hidden_column = window.FindElement("main_menu_col")
     hidden_column.Update(visible=False)

     shown_column = window.FindElement("change_settings_kindle_col")
     shown_column.Update(visible=True)

     self.update_window()
Example #5
0
    def switch_to_show_category_tree(self, product_type: str):
        """Show the stored categories tree of ``product_type``.

        The tree file of the product type is read through the file handler.
        If it contains categories they are rendered into the tree field,
        otherwise a hint is written to the error-message field. Afterwards
        the main-menu column is hidden and the category-tree column shown.

        :param product_type: Product type whose categories tree is displayed;
            forwarded to ``get_categories_tree_file_name``.
        """
        LogHandler.log_message(
            f"{self.HEADER}: Changing window to 'Show Categories'-Section"
        )
        file_name = self.node_crawler_class.get_categories_tree_file_name(product_type=product_type)
        categories_json = self.file_handler.read_categories_tree_file(file_name=file_name)

        # The file handler may hand back an int error code instead of the
        # tree; such a (truthy) code must not be treated as category data.
        has_categories = bool(categories_json) and not isinstance(categories_json, int)

        if has_categories:
            tree_json_elements = self.node_crawler_class.get_categories_tree_json_elements(categories_json)
            tree_json_str = ''.join(tree_json_elements)
            self.main_window.FindElement("categories_error_message").Update(
                value=""
            )
            self.main_window.FindElement("categories_tree_field").Update(
                value=tree_json_str
            )

        else:
            tree_json_str = "No categories have been discovered yet. Please run the scraper first. All discovered " \
                            "categories will be saved here."

            self.main_window.FindElement("categories_error_message").Update(
                value=tree_json_str
            )

        # NOTE(review): unlike the other switch_to_* methods this one does not
        # call self.update_window() - confirm whether that is intended.
        self.main_window.FindElement("main_menu_col").Update(visible=False)
        self.main_window.FindElement("show_category_tree_col").Update(visible=True)
Example #6
0
 def switch_to_change_config(self):
     """Hide the main-menu column and show the main-config column."""
     log_line = f"{self.HEADER}: Changing window to 'Change Config'-Section"
     LogHandler.log_message(log_line)

     # Insertion order matters: the menu is hidden before the form is shown.
     visibility_by_column = {
         "main_menu_col": False,
         "change_main_config_col": True,
     }
     for column_key, visible in visibility_by_column.items():
         self.main_window.FindElement(column_key).Update(visible=visible)

     self.update_window()
Example #7
0
 def switch_to_run_scraper_failure(self, error_message: str):
     """Log the scraper error and show the scraper-failure column.

     :param error_message: Error text that is written to the log.
     """
     log_lines = (
         f"{self.HEADER}: ERROR: The following error occurred while trying to run the scraper: '{error_message}'.",
         f"{self.HEADER}: Changing window to 'Run-Top-Products-Scraper-Failure'-Section",
     )
     for log_line in log_lines:
         LogHandler.log_message(log_line)

     window = self.main_window
     window.FindElement("run_scraper_col").Update(visible=False)
     window.FindElement("run_scraper_failure_col").Update(visible=True)

     self.update_window()
Example #8
0
    def api_call_with_retry(self, function, function_params: dict) -> list:
        """Call ``function(**function_params)`` with a bounded number of tries.

        Waits one second before the first call. An attempt counts as failed
        when it raises ``HTTPError`` or returns a falsy result; after a failed
        attempt the method waits ``self.COOLDOWN_TIME_FOR_RETRY`` seconds.
        At most ``self.RETRY_LIMIT`` attempts are made.

        :param function: Callable performing the API request.
        :param function_params: Keyword arguments passed to ``function``.
        :return: The first truthy result returned by ``function``.
        :raises Exception: With ``self.RETRY_ERROR_MESSAGE`` when no attempt
            produced a truthy result.
        """
        time.sleep(1)
        result = None

        # Every attempt is counted. Previously only HTTP errors advanced the
        # counter, so a call that kept returning an empty result looped
        # forever without any cooldown.
        for attempt in range(1, self.RETRY_LIMIT + 1):
            try:
                result = function(**function_params)

            except HTTPError:
                LogHandler.log_message(
                    f"{self.HEADER} WARNING (Nr. {attempt}): API Call failed. Retrying "
                    f"in {self.COOLDOWN_TIME_FOR_RETRY} seconds ...")

            else:
                if result:
                    break

                LogHandler.log_message(
                    f"{self.HEADER} WARNING (Nr. {attempt}): API Call returned no result. Retrying "
                    f"in {self.COOLDOWN_TIME_FOR_RETRY} seconds ...")

            time.sleep(self.COOLDOWN_TIME_FOR_RETRY)

        if not result:
            raise Exception(self.RETRY_ERROR_MESSAGE)

        return result
Example #9
0
    def update_categories_tree_file(self, categories_tree: dict,
                                    file_name: str):
        """Overwrite a categories-tree file in ``<ROOT_DIR>/data`` with JSON.

        :param categories_tree: Tree that is serialized with ``json.dumps``.
        :param file_name: Name of the target file; passed through
            ``clean_file_path_variables`` before it is used.
        :return: ``0`` on success, ``2`` if the file could not be written
            (the error is logged, not raised).
        """
        file_name = self.clean_file_path_variables(path_element=file_name)

        file_path = str(pathlib.Path.joinpath(self.ROOT_DIR, "data",
                                              file_name))

        # Serialize before opening the file: opening in "w" mode truncates
        # it, so a serialization error must not happen after that point.
        serialized_tree = json.dumps(categories_tree)

        try:
            with open(file_path, "w", encoding="UTF-8") as json_file:
                json_file.write(serialized_tree)

        except IOError as e:
            # Report through the LogHandler like the rest of the class
            # instead of printing to stdout.
            LogHandler.log_message(
                f"{self.HEADER}: ERROR: An error occurred while trying to update "
                f"the discovered-categories-file. Msg.: {str(e)}")
            return 2

        LogHandler.log_message(
            f"{self.HEADER}: INFO: Finished updating the discovered-categories-file"
        )

        return 0
Example #10
0
    def switch_to_change_secrets(self, secrets_empty: bool = False):
        """Hide the main-menu column and show the secrets form.

        :param secrets_empty: When true, a hint asking for the secret values
            is displayed and the cancel button is hidden; otherwise the hint
            is cleared and the cancel button is shown.
        """
        if secrets_empty:
            LogHandler.log_message(f"{self.HEADER}: Could not find secrets in secrets.json!")
            hint = ("                          Please enter your secret-values. Otherwise the app will not be able "
                    "to use the API!")
        else:
            hint = ""

        self.main_window.FindElement('secrets_message').Update(value=hint)
        # Cancelling is only offered when secrets already exist.
        self.main_window.FindElement('cancel_change_secrets').Update(visible=not secrets_empty)

        LogHandler.log_message(f"{self.HEADER}: Changing window to 'Change Secrets'-Section")

        for column_key, show_column in (("main_menu_col", False), ("change_secrets_col", True)):
            self.main_window.FindElement(column_key).Update(visible=show_column)

        self.update_window()
Example #11
0
    def create_api_connection(self, config_dict: dict, secrets_dict: dict):
        """Read the credentials and retry settings and create ``AmazonAPI``.

        :param config_dict: Config values; ``RETRY_LIMIT`` and
            ``COOLDOWN_TIME_FOR_RETRY`` are read from it and cast to ``int``.
        :param secrets_dict: Must contain ``AMAZON_ACCESS_KEY``,
            ``AMAZON_SECRET_KEY`` and ``AMAZON_ASSOC_TAG``.
        :raises Exception: If one of the secrets cannot be read; the
            original error is attached as the cause.
        """
        try:
            self.AMAZON_ACCESS_KEY = secrets_dict["AMAZON_ACCESS_KEY"]
            self.AMAZON_SECRET_KEY = secrets_dict["AMAZON_SECRET_KEY"]
            self.AMAZON_ASSOC_TAG = secrets_dict["AMAZON_ASSOC_TAG"]

        except Exception as e:
            raise Exception(
                f"{self.HEADER} FATAL: Could not read necessary access-, secret-key and association "
                f"tag from config. Error-Msg: {str(e)}") from e

        try:
            # Read both values before assigning any of them, so a failure
            # never leaves one setting from the config and one default.
            retry_limit = int(config_dict["RETRY_LIMIT"])
            cooldown_time = int(config_dict["COOLDOWN_TIME_FOR_RETRY"])

        except Exception as e:
            # NOTE(review): presumably the defaults are already defined on
            # the class/instance - confirm, they are not set in this method.
            LogHandler.log_message(
                f"{self.HEADER} WARNING: Could not read API-Handler-Values from config. Using default "
                f"values instead. Error-Msg: {str(e)}")

        else:
            self.RETRY_LIMIT = retry_limit
            self.COOLDOWN_TIME_FOR_RETRY = cooldown_time

        self.RETRY_ERROR_MESSAGE = (
            f"{self.HEADER} ERROR: Could not reach API after {self.RETRY_LIMIT} retries!"
        )

        try:
            self.AMAZON_API = AmazonAPI(
                self.AMAZON_ACCESS_KEY,
                self.AMAZON_SECRET_KEY,
                self.AMAZON_ASSOC_TAG,
                region="DE",
                MaxQPS=0.5,
            )

        except TypeError as e:
            LogHandler.log_message(
                f"{self.HEADER} ERROR: Could not initialize Amazon-API-Connection! Have you "
                f"provided correct values in your secrets.json-File? Msg.: {str(e)}"
            )
Example #12
0
    def read_categories_tree_file(self, file_name: str):
        """Load a categories-tree JSON file from ``<ROOT_DIR>/data``.

        If the file cannot be read, a new file containing an empty JSON
        object is created in its place.

        :param file_name: Name of the file; passed through
            ``clean_file_path_variables`` before it is used.
        :return: The parsed JSON content, ``{}`` if the file had to be
            (re-)created, or the error code ``2`` if creating it failed too.
        :raises Exception: If the file exists but does not contain valid JSON.
        """
        file_name = self.clean_file_path_variables(path_element=file_name)

        file_path = str(pathlib.Path.joinpath(self.ROOT_DIR, "data",
                                              file_name))

        try:
            with open(file_path, "r", encoding="UTF-8") as categories_file:
                raw_content = categories_file.read()

        except IOError as e:
            LogHandler.log_message(
                f"{self.HEADER} WARNING: Could not read the original categories-tree-json. "
                f"Creating a new empty categories-tree-json. Msg.: {str(e)}")
            try:
                with open(file_path, "w", encoding="UTF-8") as categories_file:
                    categories_file.write("{}")

            except IOError as create_error:
                LogHandler.log_message(
                    f"{self.HEADER}: ERROR: An error occurred while trying to create an empty "
                    f"discovered-categories-file. File name: '{file_name}'. Msg.: {str(create_error)}"
                )
                return 2

            return {}

        try:
            # The text is already decoded by open(); json.loads() no longer
            # accepts an ``encoding`` argument since Python 3.9.
            return json.loads(raw_content)

        except json.JSONDecodeError as e:
            raise Exception(
                f"{self.HEADER} ERROR: Could not read File '{file_name}'. Invalid JSON. Please contact "
                f"the developer.") from e
Example #13
0
    def fetch_product_details(self,
                              final_rank_rows,
                              child_id,
                              categories_list: list = ()) -> list:
        """Append the top-seller items of a browse node to ``final_rank_rows``.

        The node is looked up with the ``TopSellers`` response group. For at
        most ``self.PRODUCT_LIMIT`` top sellers per returned product, the
        sales rank and offer url are fetched with a second API call. Items
        without an ASIN are skipped.

        :param final_rank_rows: List the new rows are appended to (mutated).
        :param child_id: Browse-node id whose top sellers are fetched.
        :param categories_list: Category path stored in every created row.
        :return: ``final_rank_rows``; unchanged if ``PRODUCT_LIMIT`` is 0.
        """
        if self.PRODUCT_LIMIT == 0:
            return final_rank_rows

        products = self.api_handler.api_call_with_retry(
            function=self.api_handler.browse_node_lookup,
            function_params={
                "BrowseNodeId": child_id,
                "ResponseGroup": "TopSellers"
            },
        )

        for product in products:
            namespaces = self.api_handler.AMAZON_NAMESPACE
            root = ET.fromstring(product.to_string())
            top_seller_items = root.findall(
                ".//owl:TopSellers/owl:TopSeller",
                namespaces=namespaces,
            )

            for item_index, item in enumerate(top_seller_items):
                if item_index == self.PRODUCT_LIMIT:
                    break

                title = item.find("owl:Title", namespaces=namespaces)
                asin = item.find("owl:ASIN", namespaces=namespaces)

                if asin is None:
                    continue
                asin = asin.text

                if title is not None:
                    title = title.text

                try:
                    product_sales_rank = self.api_handler.api_call_with_retry(
                        function=self.api_handler.lookup,
                        function_params={
                            "ItemId": asin,
                            "Power": "binding:paperback",
                            "ResponseGroup": "SalesRank",
                        },
                    )

                    raw_sales_rank = product_sales_rank.sales_rank
                    offer_url = product_sales_rank.offer_url

                except Exception as e:
                    LogHandler.log_message(
                        f"{self.HEADER} ERROR: Could not fetch sales_rank for ASIN {asin}! Msg: {str(e)}"
                    )

                    raw_sales_rank = 0
                    offer_url = "N/A"

                try:
                    sales_rank = int(raw_sales_rank)
                except (TypeError, ValueError):
                    # A missing or non-numeric rank must not abort the whole
                    # crawl; fall back to the same value as a failed lookup.
                    sales_rank = 0

                row = {
                    "category": categories_list,
                    "asin": asin,
                    "title": title,
                    "sales_rank": sales_rank,
                    "offer_url": offer_url,
                }

                final_rank_rows.append(row)

                if self.verbose:
                    LogHandler.log_message(
                        f"{self.HEADER}: Current-Row: {row}")
                else:
                    LogHandler.log_message(
                        f"{self.HEADER}: Found-Book: {row['title']}")

        return final_rank_rows
Example #14
0
    def walk_through_categories(
            self,
            recursive_func,
            browse_nodes,
            final_rank_rows: list,
            categories_list: list = (),
            current_level: int = 0,
    ) -> list:
        """Walk the browse-node tree and collect top sellers of leaf nodes.

        For every node that passes the main-category filter, the node is
        appended to a copy of the category path. A node without ``children``
        is treated as a leaf and its products are fetched; otherwise every
        child is looked up through the API and ``recursive_func`` is called
        for it.

        :param recursive_func: Callable used for the recursion; it is called
            with the same keyword arguments this method takes.
        :param browse_nodes: Nodes to process on this level.
        :param final_rank_rows: Rows collected so far.
        :param categories_list: Category path leading to ``browse_nodes``.
        :param current_level: Depth of ``browse_nodes``; used for logging
            and passed on incremented.
        :return: The collected rows.
        """
        for index, node in enumerate(browse_nodes):

            LogHandler.log_message(
                f"{self.HEADER} NODE: {index} C{current_level} {node.name} ({node.id})"
            )

            valid_category_check = False
            # If sub-categories are defined -> ignore selected main-categories
            if self.INCLUDED_MAIN_CATEGORIES[
                    self.product_type] and not self.SPECIFIC_SUB_CATEGORIES[
                        self.product_type]:
                for included_category in self.INCLUDED_MAIN_CATEGORIES[
                        self.product_type]:
                    if included_category in node.name.text or node.name.text in [
                            "Kategorien", "Kindle eBooks"
                    ]:
                        # If the category is in the included list -> proceed
                        valid_category_check = True
                        break

                    elif any(included_category in part
                             for part in categories_list):
                        # A parent on the current path is an included category
                        valid_category_check = True

            else:
                valid_category_check = True

            if not valid_category_check:
                print(f"{self.HEADER} NODE: Skipping {node.name.text}")
                continue

            # Copy into a list: the default value is a tuple, which has no
            # append(). The path entries are plain strings, so a shallow
            # copy is sufficient.
            new_categories_list = list(categories_list)
            new_categories_list.append(f"{node.name.text} [{str(node.id)}]")

            try:
                children_references = node.children

            except AttributeError:
                # If no more children are found - use this node for getting the top-seller-items
                self.categories_paths.append(new_categories_list)

                final_rank_rows = self.fetch_product_details(
                    child_id=node.id,
                    categories_list=new_categories_list,
                    final_rank_rows=final_rank_rows,
                )
                # NOTE(review): this returns on the first leaf and skips any
                # remaining entries of browse_nodes - confirm this is
                # intended (it makes no difference for single-node lookups).
                return final_rank_rows

            else:
                for child_reference in children_references:
                    child_nodes = self.api_handler.api_call_with_retry(
                        function=self.api_handler.browse_node_lookup,
                        function_params={"BrowseNodeId": child_reference.id},
                    )

                    if child_nodes:
                        if not self.verbose or node.name.text in [
                                "Wirtschaftskriminalität", "Vampire"
                        ]:
                            LogHandler.log_message(
                                f"{self.HEADER} {node.name}: Children: {[i.name for i in child_nodes]}"
                            )

                        new_level = current_level + 1

                        time.sleep(1)

                        final_rank_rows = recursive_func(
                            recursive_func=recursive_func,
                            browse_nodes=child_nodes,
                            categories_list=new_categories_list,
                            final_rank_rows=final_rank_rows,
                            current_level=new_level,
                        )

        return final_rank_rows
Example #15
0
    def run_crawler(self, browse_node_id: int = '541686'):
        """Crawl the category tree and write the found products to a csv.

        If specific sub-categories are configured for the product type, only
        those are crawled; otherwise the whole tree below ``browse_node_id``
        is walked and the categories tree is updated afterwards. The
        ``category`` path of every row is split into ``category-<level>``
        columns before the rows are sorted and (unless ``PRODUCT_LIMIT`` is
        0) written through the file handler.

        :param browse_node_id: Root browse-node id; its string form is the
            key into ``NODE_ID_TO_PRODUCT_TYPE``.
        :return: ``False`` if a configured sub-category could not be looked
            up, otherwise ``None``.
        """
        self.product_type = self.NODE_ID_TO_PRODUCT_TYPE[str(
            browse_node_id).lower()]

        final_rank_rows = []
        categories_list = [self.product_type.capitalize()]
        specific_sub_categories = self.SPECIFIC_SUB_CATEGORIES[self.product_type]
        # The tree is only (re-)discovered when the whole tree is crawled.
        category_discovery = not specific_sub_categories

        if specific_sub_categories:
            for sub_category in specific_sub_categories:
                # Entries look like "<name> [<node id>]"
                node_id = sub_category.split('[')[1][:-1]

                browse_nodes = self.api_handler.api_call_with_retry(
                    function=self.api_handler.browse_node_lookup,
                    function_params={"BrowseNodeId": node_id},
                )

                if not browse_nodes:
                    print("ERROR")
                    return False

                final_rank_rows = self.walk_through_categories(
                    recursive_func=self.walk_through_categories,
                    browse_nodes=browse_nodes,
                    final_rank_rows=final_rank_rows,
                    categories_list=categories_list,
                    current_level=0,
                )

        else:
            browse_nodes = self.api_handler.api_call_with_retry(
                function=self.api_handler.browse_node_lookup,
                function_params={"BrowseNodeId": browse_node_id},
            )

            final_rank_rows = self.walk_through_categories(
                recursive_func=self.walk_through_categories,
                browse_nodes=browse_nodes,
                final_rank_rows=final_rank_rows,
                categories_list=categories_list,
                current_level=0,
            )

        # Without rows there is nothing to split or sort; skipping avoids a
        # misleading error log caused by max() on an empty sequence.
        if final_rank_rows:
            try:
                deepest_sub_category_level = max(
                    len(row["category"]) for row in final_rank_rows)

                categories_field_names = [
                    f"category-{level}"
                    for level in range(deepest_sub_category_level)
                ]

                # Drop category columns added by an earlier run before
                # prepending the current ones, otherwise repeated runs
                # accumulate duplicate columns.
                base_field_names = [
                    field_name for field_name in self.FIELD_NAMES_TOP_BOOKS
                    if not str(field_name).startswith("category-")
                ]
                self.FIELD_NAMES_TOP_BOOKS = (categories_field_names +
                                              base_field_names)

                for row in final_rank_rows:
                    # TODO: If a sub-category is directly scraped -> all parent categories are currently missing.
                    #  They should be included in the csv.
                    category_path = row["category"]
                    for level, field_name in enumerate(categories_field_names):
                        if level < len(category_path):
                            row[field_name] = category_path[level]
                        else:
                            row[field_name] = ""

                    del row["category"]

                # NOTE(review): the deepest category level is the primary
                # sort key (order kept from the previous implementation) -
                # confirm this is intended rather than category-0 first.
                fields_for_sorting = categories_field_names[::-1] + ["sales_rank"]

                final_rank_rows = sorted(
                    final_rank_rows,
                    key=lambda x: [x[field] for field in fields_for_sorting],
                )

            except Exception as e:
                LogHandler.log_message(
                    f"{self.HEADER} ERROR: An error occurred while trying to sort the product-list. "
                    f"Msg: {str(e)}")

        if category_discovery:
            self.update_categories_tree()

        if self.PRODUCT_LIMIT != 0:
            self.file_handler.write_result_to_csv(
                field_names=self.FIELD_NAMES_TOP_BOOKS,
                rows_for_print=final_rank_rows,
                file_name_prefix=self.product_type,
                file_name_suffix=self.FILE_NAME_SUFFIX_TOP_BOOKS,
            )
Example #16
0
    def refresh_values_in_forms(self):
        """Pre-fill the config, secrets and category-settings forms.

        The config values are taken from ``self.config_handler.config_dict``
        (also stored in ``self.config_dict``), the secrets from
        ``self.secrets_dict``. The category settings are filled for the
        product types ``books`` and ``kindle_books``.
        """
        self.config_dict = self.config_handler.config_dict

        def prefill_category_settings(product_type: str) -> None:
            """Fill the main- and sub-category list boxes of one product type."""
            main_key = f"{product_type}_included_main_categories"
            sub_key = f"{product_type}_specific_sub_categories"
            main_categories = self.node_crawler_class.MAIN_CATEGORIES[product_type]

            self.main_window.FindElement(main_key).Update(
                values=main_categories, set_to_index=-1
            )

            crawler_config = self.config_dict["BrowseNodeCrawler"][product_type]
            included_main_categories = crawler_config["INCLUDED_MAIN_CATEGORIES"]

            self.main_window.FindElement(f"{main_key}_nr").Update(
                value=f"Currently Saved: {len(included_main_categories)} selected"
            )

            for item in included_main_categories:
                try:
                    position = main_categories.index(item)
                except ValueError:
                    # A saved category that is no longer offered must not
                    # abort the refresh of all remaining form fields.
                    LogHandler.log_message(
                        f"{self.HEADER}: WARNING: Saved main-category '{item}' is not available "
                        f"for '{product_type}' and could not be selected."
                    )
                    continue

                self.main_window.FindElement(main_key).Update(set_to_index=position)

            file_name = self.node_crawler_class.get_categories_tree_file_name(product_type=product_type)
            categories_json = self.file_handler.read_categories_tree_file(file_name=file_name)

            # The file handler may hand back an int error code instead of
            # the tree; treat that like "nothing discovered yet".
            if not categories_json or isinstance(categories_json, int):
                self.main_window.FindElement(sub_key).Update(
                    values=[], set_to_index=-1
                )
                self.main_window.FindElement(f"{sub_key}_nr").Update(
                    value="No categories discovered yet"
                )
                return

            formatted_sub_categories = self.node_crawler_class.get_categories_tree_json_elements(
                categories_json=categories_json
            )

            self.main_window.FindElement(sub_key).Update(
                values=formatted_sub_categories, set_to_index=-1
            )

            specific_sub_categories = crawler_config["SPECIFIC_SUB_CATEGORIES"]

            # The saved values are looked up in the unformatted element list.
            raw_sub_categories = self.node_crawler_class.get_categories_tree_json_elements(
                categories_json, format_values=False
            )

            for item in specific_sub_categories:
                try:
                    self.main_window.FindElement(sub_key).Update(
                        set_to_index=raw_sub_categories.index(item)
                    )
                except (IndexError, ValueError):
                    # Saved sub-categories missing from the tree are skipped.
                    pass

            self.main_window.FindElement(f"{sub_key}_nr").Update(
                value=f"Currently Saved: {len(specific_sub_categories)} selected"
            )

        # ---- Pre-Fill config-form ----
        self.main_window.FindElement("RETRY_LIMIT").Update(
            value=self.config_dict["RETRY_LIMIT"]
        )

        self.main_window.FindElement("COOLDOWN_TIME_FOR_RETRY").Update(
            value=self.config_dict["COOLDOWN_TIME_FOR_RETRY"]
        )

        verbosity_options = self.config_handler.VERBOSITY_OPTIONS
        verbosity_options_list = list(verbosity_options.keys())
        self.main_window.FindElement("VERBOSE_OUTPUT").Update(
            values=verbosity_options_list
        )

        self.main_window.FindElement("PRODUCT_LIMIT").Update(
            value=self.config_dict["PRODUCT_LIMIT"]
        )

        # Map the stored verbosity value back to its option name.
        verbosity_level = self.config_dict["VERBOSE_OUTPUT"]
        for key, value in verbosity_options.items():
            if verbosity_level == value:
                verbosity_level = key
                break

        try:
            self.main_window.FindElement("VERBOSE_OUTPUT").Update(
                set_to_index=verbosity_options_list.index(verbosity_level)
            )
        except (IndexError, ValueError) as e:
            LogHandler.log_message(
                f"{self.HEADER}: WARNING: Could not pre-select the verbosity level. Msg.: {str(e)}"
            )

        # ---- Pre-Fill secrets-page ----
        for secret_key in ('AMAZON_ACCESS_KEY', 'AMAZON_SECRET_KEY', 'AMAZON_ASSOC_TAG'):
            self.main_window.FindElement(secret_key).Update(
                value=self.secrets_dict[secret_key]
            )

        # ---- Pre-Fill settings-books and settings-kindle ----
        for product_type in ("books", "kindle_books"):
            prefill_category_settings(product_type)