def read_file(self, file_name: str, folder: str) -> str:
    """Return the text content of ``folder/file_name`` below ``ROOT_DIR``.

    If the file does not exist yet, an empty one is created first.

    Args:
        file_name: File name (sanitized via ``clean_file_path_variables``).
        folder: Folder below ``ROOT_DIR`` (sanitized the same way).

    Returns:
        The file content, or ``None`` (despite the ``str`` annotation)
        when the file could not be read.
    """
    folder = self.clean_file_path_variables(path_element=folder)
    file_name = self.clean_file_path_variables(path_element=file_name)
    file_path = str(pathlib.Path.joinpath(self.ROOT_DIR, folder, file_name))
    if not os.path.isfile(path=file_path):
        # Make sure the file exists so the read below has something to open.
        try:
            with open(file_path, "w", encoding="UTF-8") as file:
                file.write("")
        except IOError as e:
            LogHandler.log_message(
                f"{self.HEADER}: ERROR: Could not find the file {file_path} and tried to create an empty one. "
                f"An error while trying to create the empty file. Msg.: {str(e)}"
            )
    try:
        # BUG FIX: read with the same encoding the file is written with;
        # previously the platform default was used, which breaks non-ASCII
        # content on Windows.
        with open(file_path, "r", encoding="UTF-8") as file:
            content = file.read()
    except IOError as e:
        LogHandler.log_message(
            f"{self.HEADER}: ERROR: An error occurred while trying to read the file {file_path}. Msg.: {str(e)}"
        )
        content = None
    return content
def switch_to_run_scraper_success(self):
    """Show the 'Run-Top-Products-Scraper-Success' section of the GUI."""
    LogHandler.log_message(
        f"{self.HEADER}: Changing window to 'Run-Top-Products-Scraper-Success'-Section"
    )
    # Hide the running view, reveal the success view.
    for element_key, shown in (("run_scraper_col", False),
                               ("run_scraper_success_col", True)):
        self.main_window.FindElement(element_key).Update(visible=shown)
    self.update_window()
def write_result_to_csv(self, field_names: list, rows_for_print: list,
                        file_name_prefix: str = "", file_name_suffix: str = ".csv"):
    """Write the result rows into a date-stamped CSV file in ``csv_output``.

    The file name is ``<prefix>_<today's date><suffix>`` and the file is
    written with the Excel dialect using ``;`` as delimiter.

    Args:
        field_names: Column names for the CSV header.
        rows_for_print: Row dicts matching ``field_names``.
        file_name_prefix: Prepended to the date in the file name.
        file_name_suffix: Appended after the date (defaults to ``.csv``).

    Returns:
        0 on success.

    Raises:
        Exception: When the file cannot be written (original ``IOError``
            chained as the cause).
    """
    file_name = file_name_prefix + "_" + str(
        datetime.today().date()) + file_name_suffix
    file_path = pathlib.Path.joinpath(self.ROOT_DIR, "csv_output", file_name)
    LogHandler.log_message(
        f"{self.HEADER} INFO: Start writing results to a new csv-file. File: {file_path}"
    )
    try:
        # newline="" is required so the csv module controls line endings.
        with open(str(file_path), "w", newline="") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=field_names,
                                    dialect="excel", delimiter=";")
            writer.writeheader()
            for row in rows_for_print:
                writer.writerow(row)
    except IOError as e:
        # BUG FIX: chain the original IOError so the root cause stays
        # visible in the traceback instead of being swallowed.
        raise Exception(
            f"{self.HEADER}: ERROR: An error occurred while trying to write the results to a new csv. "
            f"Msg.: {str(e)}") from e
    LogHandler.log_message(
        f"{self.HEADER} INFO: Finished writing results to a new csv-file")
    return 0
def switch_to_change_settings_kindle(self):
    """Show the 'Change Settings E-Books' section of the GUI."""
    LogHandler.log_message(
        f"{self.HEADER}: Changing window to 'Change Settings E-Books'-Section"
    )
    window = self.main_window
    window.FindElement("main_menu_col").Update(visible=False)
    window.FindElement("change_settings_kindle_col").Update(visible=True)
    self.update_window()
def switch_to_show_category_tree(self, product_type: str):
    """Show the 'Show Categories' section with the discovered category tree.

    Loads the categories-tree file for ``product_type`` and renders it into
    the tree field; if nothing was discovered yet, an explanatory message
    is shown instead.

    Args:
        product_type: Product type key used to resolve the tree file name.
    """
    LogHandler.log_message(
        f"{self.HEADER}: Changing window to 'Show Categories"
    )
    file_name = self.node_crawler_class.get_categories_tree_file_name(product_type=product_type)
    categories_json = self.file_handler.read_categories_tree_file(file_name=file_name)
    if categories_json:
        tree_json_elements = self.node_crawler_class.get_categories_tree_json_elements(categories_json)
        tree_json_str = ''.join(tree_json_elements)
        self.main_window.FindElement("categories_error_message").Update(
            value=""
        )
        self.main_window.FindElement("categories_tree_field").Update(
            value=tree_json_str
        )
    else:
        tree_json_str = "No categories have been discovered yet. Please run the scraper first. All discovered " \
                        "categories will be saved here."
        self.main_window.FindElement("categories_error_message").Update(
            value=tree_json_str
        )
    self.main_window.FindElement("main_menu_col").Update(visible=False)
    self.main_window.FindElement("show_category_tree_col").Update(visible=True)
    # CONSISTENCY FIX: every other switch_to_* method refreshes the window
    # after toggling visibility; this one previously did not.
    self.update_window()
def switch_to_change_config(self):
    """Show the 'Change Config' section of the GUI."""
    LogHandler.log_message(
        f"{self.HEADER}: Changing window to 'Change Config'-Section"
    )
    # Swap visibility: main menu out, config form in.
    for element_key, shown in (("main_menu_col", False),
                               ("change_main_config_col", True)):
        self.main_window.FindElement(element_key).Update(visible=shown)
    self.update_window()
def switch_to_run_scraper_failure(self, error_message: str):
    """Log a scraper error and show the failure section of the GUI.

    Args:
        error_message: Human-readable description of what went wrong.
    """
    LogHandler.log_message(
        f"{self.HEADER}: ERROR: The following error occurred while trying to run the scraper: '{error_message}'."
    )
    LogHandler.log_message(
        f"{self.HEADER}: Changing window to 'Run-Top-Products-Scraper-Failure'-Section"
    )
    window = self.main_window
    window.FindElement("run_scraper_col").Update(visible=False)
    window.FindElement("run_scraper_failure_col").Update(visible=True)
    self.update_window()
def api_call_with_retry(self, function, function_params: dict) -> list:
    """Call ``function(**function_params)``, retrying on ``HTTPError``.

    A fixed one-second pre-delay throttles successive API calls. Up to
    ``self.RETRY_LIMIT`` attempts are made; after a failed attempt the
    method waits ``self.COOLDOWN_TIME_FOR_RETRY`` seconds before retrying.

    NOTE: a falsy result (e.g. an empty list) is treated the same as a
    failed call and also triggers retries / the final error.

    Args:
        function: Callable performing the actual API request.
        function_params: Keyword arguments passed to ``function``.

    Returns:
        The first truthy result returned by ``function``.

    Raises:
        Exception: ``self.RETRY_ERROR_MESSAGE`` when no truthy result was
            obtained within the retry limit.
    """
    time.sleep(1)
    result = None
    attempt = 1
    while attempt <= self.RETRY_LIMIT and not result:
        try:
            result = function(**function_params)
        except HTTPError:
            LogHandler.log_message(
                f"{self.HEADER} WARNING (Nr. {attempt}): API Call failed. Retrying "
                f"in {self.COOLDOWN_TIME_FOR_RETRY} seconds ...")
            attempt += 1
            # BUG FIX: only cool down when another attempt will actually
            # follow; previously the full cooldown was slept once more
            # right before raising the final error.
            if attempt <= self.RETRY_LIMIT:
                time.sleep(self.COOLDOWN_TIME_FOR_RETRY)
    if not result:
        raise Exception(self.RETRY_ERROR_MESSAGE)
    return result
def update_categories_tree_file(self, categories_tree: dict, file_name: str):
    """Serialize ``categories_tree`` as JSON into ``data/<file_name>``.

    Args:
        categories_tree: Discovered categories to persist.
        file_name: Target file name (sanitized via
            ``clean_file_path_variables``).

    Returns:
        0 on success, 2 when the file could not be written.
    """
    file_name = self.clean_file_path_variables(path_element=file_name)
    file_path = str(pathlib.Path.joinpath(self.ROOT_DIR, "data", file_name))
    try:
        with open(str(file_path), "w", encoding="UTF-8") as json_file:
            json_file.write(json.dumps(categories_tree))
    except IOError as e:
        # CONSISTENCY FIX: log through LogHandler like the success path
        # instead of a bare print().
        LogHandler.log_message(
            f"{self.HEADER}: ERROR: An error occurred while trying to update "
            f"the discovered-categories-file. Msg.: {str(e)}")
        return 2
    LogHandler.log_message(
        f"{self.HEADER}: INFO: Finished updating the discovered-categories-file"
    )
    return 0
def switch_to_change_secrets(self, secrets_empty: bool = False):
    """Show the 'Change Secrets' section of the GUI.

    Args:
        secrets_empty: When True, a warning is shown and the cancel
            button is hidden so the user must enter secret values.
    """
    if secrets_empty:
        LogHandler.log_message(
            f"{self.HEADER}: Could not find secrets in secrets.json!"
        )
        message = (" Please enter your secret-values. Otherwise the app will not be able "
                   "to use the API!")
        cancel_visible = False
    else:
        message = ""
        cancel_visible = True
    self.main_window.FindElement('secrets_message').Update(value=message)
    self.main_window.FindElement('cancel_change_secrets').Update(visible=cancel_visible)
    LogHandler.log_message(
        f"{self.HEADER}: Changing window to 'Change Secrets'-Section"
    )
    self.main_window.FindElement("main_menu_col").Update(visible=False)
    self.main_window.FindElement("change_secrets_col").Update(visible=True)
    self.update_window()
def create_api_connection(self, config_dict: dict, secrets_dict: dict):
    """Read credentials and retry settings, then build the Amazon API client.

    Args:
        config_dict: App config; ``RETRY_LIMIT`` and
            ``COOLDOWN_TIME_FOR_RETRY`` are read (and int-converted) from it.
        secrets_dict: Must contain ``AMAZON_ACCESS_KEY``,
            ``AMAZON_SECRET_KEY`` and ``AMAZON_ASSOC_TAG``.

    Raises:
        Exception: When a required secret is missing (original error
            chained as the cause).
    """
    try:
        self.AMAZON_ACCESS_KEY = secrets_dict["AMAZON_ACCESS_KEY"]
        self.AMAZON_SECRET_KEY = secrets_dict["AMAZON_SECRET_KEY"]
        self.AMAZON_ASSOC_TAG = secrets_dict["AMAZON_ASSOC_TAG"]
    except Exception as e:
        # BUG FIX: chain the original error so the missing key is visible
        # in tracebacks instead of being swallowed.
        raise Exception(
            f"{self.HEADER} FATAL: Could not read necessary access-, secret-key and association "
            f"tag from config. Error-Msg: {str(e)}") from e
    try:
        self.RETRY_LIMIT = int(config_dict["RETRY_LIMIT"])
        self.COOLDOWN_TIME_FOR_RETRY = int(
            config_dict["COOLDOWN_TIME_FOR_RETRY"])
    except Exception as e:
        # Non-fatal: fall back to defaults.
        # NOTE(review): assumes RETRY_LIMIT / COOLDOWN_TIME_FOR_RETRY exist
        # as class-level defaults - confirm, otherwise the f-string below
        # would raise AttributeError after a failed config read.
        LogHandler.log_message(
            f"{self.HEADER} WARNING: Could not read API-Handler-Values from config. Using default "
            f"values instead. Error-Msg: {str(e)}")
    self.RETRY_ERROR_MESSAGE = (
        f"{self.HEADER} ERROR: Could not reach API after {self.RETRY_LIMIT} retries!"
    )
    try:
        self.AMAZON_API = AmazonAPI(
            self.AMAZON_ACCESS_KEY,
            self.AMAZON_SECRET_KEY,
            self.AMAZON_ASSOC_TAG,
            region="DE",
            MaxQPS=0.5,
        )
    except TypeError as e:
        # AmazonAPI raises TypeError on malformed credential values.
        LogHandler.log_message(
            f"{self.HEADER} ERROR: Could not initialize Amazon-API-Connection! Have you "
            f"provided correct values in your secrets.json-File? Msg.: {str(e)}"
        )
def read_categories_tree_file(self, file_name: str):
    """Read the discovered-categories JSON file and return it as a dict.

    If the file cannot be read, a new empty one (``{}``) is created and an
    empty dict is returned; if even that fails, the error code ``2`` is
    returned instead.

    Args:
        file_name: File name below ``data/`` (sanitized via
            ``clean_file_path_variables``).

    Returns:
        The parsed categories dict, ``{}`` after recreating the file,
        or ``2`` when the file could not be (re)created.

    Raises:
        Exception: When the file exists but contains invalid JSON.
    """
    file_name = self.clean_file_path_variables(path_element=file_name)
    file_path = str(pathlib.Path.joinpath(self.ROOT_DIR, "data", file_name))
    try:
        with open(file_path, "r", encoding="UTF-8") as categories_file:
            try:
                # BUG FIX: json.loads() no longer accepts an ``encoding``
                # argument (removed in Python 3.9 - it raised TypeError);
                # the file object already decodes as UTF-8.
                categories_json = json.loads(categories_file.read())
            except json.JSONDecodeError as e:
                # BUG FIX: the message previously rendered as
                # "...Please contactthe developer." (missing space).
                raise Exception(
                    f"{self.HEADER} ERROR: Could not read File '{file_name}'. Invalid JSON. Please contact "
                    f"the developer.") from e
    except IOError as e:
        LogHandler.log_message(
            f"{self.HEADER} WARNING: Could not read the original categories-tree-json. "
            f"Creating a new empty categories-tree-json. Msg.: {str(e)}")
        try:
            with open(file_path, "w", encoding="UTF-8") as categories_file:
                categories_file.write("{}")
        except IOError as e:
            # CONSISTENCY FIX: log through LogHandler instead of print().
            LogHandler.log_message(
                f"{self.HEADER}: ERROR: An error occurred while trying to create an empty "
                f" discovered-categories-file. File name: '{file_name}'. Msg.: {str(e)}"
            )
            return 2
        else:
            categories_json = {}
    return categories_json
def fetch_product_details(self, final_rank_rows, child_id, categories_list: list = ()) -> list:
    """Fetch the top-seller items of one browse node and append result rows.

    For every top-seller item found under ``child_id`` a second API lookup
    retrieves its sales rank and offer URL; each product becomes one dict
    appended to ``final_rank_rows``.

    Args:
        final_rank_rows: Accumulator list of result-row dicts (mutated in
            place and also returned).
        child_id: Amazon browse-node id whose top sellers are fetched.
        categories_list: Category path leading to this node; stored as-is
            in each row's ``"category"`` field.
            NOTE(review): default is a tuple although annotated ``list`` —
            callers appear to always pass a list; confirm.

    Returns:
        The (extended) ``final_rank_rows`` list.
    """
    # A product limit of 0 means "discovery only" - skip fetching entirely.
    if self.PRODUCT_LIMIT == 0:
        return final_rank_rows
    products = self.api_handler.api_call_with_retry(
        function=self.api_handler.browse_node_lookup,
        function_params={
            "BrowseNodeId": child_id,
            "ResponseGroup": "TopSellers"
        },
    )
    for product_index, product in enumerate(products):
        # The API result is re-parsed as raw XML to reach the TopSeller
        # elements via the Amazon namespace.
        root = ET.fromstring(product.to_string())
        top_seller_items = root.findall(
            ".//owl:TopSellers/owl:TopSeller",
            namespaces=self.api_handler.AMAZON_NAMESPACE,
        )
        for item_index, item in enumerate(top_seller_items):
            # Stop once the configured number of products is collected.
            if item_index == self.PRODUCT_LIMIT:
                break
            title = item.find("owl:Title", namespaces=self.api_handler.AMAZON_NAMESPACE)
            asin = item.find("owl:ASIN", namespaces=self.api_handler.AMAZON_NAMESPACE)
            # Items without an ASIN cannot be looked up - skip them.
            if asin is None:
                continue
            else:
                asin = asin.text
            if title is not None:
                title = title.text
            try:
                product_sales_rank = self.api_handler.api_call_with_retry(
                    function=self.api_handler.lookup,
                    function_params={
                        "ItemId": asin,
                        "Power": "binding:paperback",
                        "ResponseGroup": "SalesRank",
                    },
                )
                sales_rank = product_sales_rank.sales_rank
                offer_url = product_sales_rank.offer_url
            except Exception as e:
                # Best effort: keep the product with placeholder values
                # instead of aborting the whole fetch.
                LogHandler.log_message(
                    f"{self.HEADER} ERROR: Could not fetch sales_rank for ASIN {asin}! Msg: {str(e)}"
                )
                sales_rank = 0
                offer_url = "N/A"
            row = {
                "category": categories_list,
                "asin": asin,
                "title": title,
                "sales_rank": int(sales_rank),
                "offer_url": offer_url,
            }
            final_rank_rows.append(row)
            if self.verbose:
                LogHandler.log_message(
                    f"{self.HEADER}: Current-Row: {row}")
            else:
                LogHandler.log_message(
                    f"{self.HEADER}: Found-Book: {row['title']}")
    return final_rank_rows
def walk_through_categories(
    self,
    recursive_func,
    browse_nodes,
    final_rank_rows: list,
    categories_list: list = (),
    current_level: int = 0,
) -> list:
    """Recursively walk the browse-node tree, collecting product rows.

    Nodes are filtered against the configured main categories; leaf nodes
    (no ``children`` attribute) are handed to ``fetch_product_details``,
    inner nodes are descended into via ``recursive_func`` (the function
    itself, passed explicitly).

    Args:
        recursive_func: This method, passed in for the recursive call.
        browse_nodes: Iterable of browse-node objects at this level.
        final_rank_rows: Accumulator of result rows (returned).
        categories_list: Category path from the root to this level.
            NOTE(review): default is a tuple although annotated ``list`` —
            ``deepcopy(...).append(...)`` below would fail on the default;
            callers appear to always pass a list.
        current_level: Depth counter, used only for logging.

    Returns:
        The (extended) ``final_rank_rows`` list.
    """
    for index, node in enumerate(browse_nodes):
        LogHandler.log_message(
            f"{self.HEADER} NODE: {index} C{current_level} {node.name} ({node.id})"
        )
        valid_category_check = False
        # If sub-categories are defined -> ignore selected main-categories
        if self.INCLUDED_MAIN_CATEGORIES[
                self.product_type] and not self.SPECIFIC_SUB_CATEGORIES[
                    self.product_type]:
            for included_category in self.INCLUDED_MAIN_CATEGORIES[
                    self.product_type]:
                if included_category in node.name.text or node.name.text in [
                        "Kategorien", "Kindle eBooks"
                ]:
                    # If the category is in the included list -> proceed
                    valid_category_check = True
                    break
                elif any(included_category in part for part in categories_list):
                    # An ancestor already matched an included category.
                    valid_category_check = True
        else:
            valid_category_check = True
        if not valid_category_check:
            print(f"{self.HEADER} NODE: Skipping {node.name.text}")
            continue
        # Each branch gets its own copy of the path so siblings don't
        # share (and mutate) the same list.
        new_categories_list = deepcopy(categories_list)
        new_categories_list.append(f"{node.name.text} [{str(node.id)}]")
        try:
            children_references = node.children
        except AttributeError:
            # If no more children are found - use this node for getting the top-seller-items
            self.categories_paths.append(new_categories_list)
            final_rank_rows = self.fetch_product_details(
                child_id=node.id,
                categories_list=new_categories_list,
                final_rank_rows=final_rank_rows,
            )
            # NOTE(review): this returns from inside the loop, so any
            # sibling nodes after the first leaf at this level are never
            # visited - confirm whether that is intended.
            return final_rank_rows
        else:
            for child_reference in children_references:
                child_nodes = self.api_handler.api_call_with_retry(
                    function=self.api_handler.browse_node_lookup,
                    function_params={"BrowseNodeId": child_reference.id},
                )
                if child_nodes:
                    if not self.verbose or node.name.text in [
                            "Wirtschaftskriminalität", "Vampire"
                    ]:
                        LogHandler.log_message(
                            f"{self.HEADER} {node.name}: Children: {[i.name for i in child_nodes]}"
                        )
                    new_level = current_level + 1
                    # Throttle between API calls.
                    time.sleep(1)
                    final_rank_rows = recursive_func(
                        recursive_func=recursive_func,
                        browse_nodes=child_nodes,
                        categories_list=new_categories_list,
                        final_rank_rows=final_rank_rows,
                        current_level=new_level,
                    )
    return final_rank_rows
def run_crawler(self, browse_node_id: int = '541686'):
    """Run the full crawl for one root browse node and export the results.

    Either crawls the configured specific sub-categories, or (when none
    are configured) discovers the whole tree below ``browse_node_id``.
    Afterwards the collected rows are flattened into per-level category
    columns, sorted, and written to CSV.

    Args:
        browse_node_id: Root Amazon browse-node id.
            NOTE(review): default is a string although annotated ``int``;
            the code str()-converts it, so both work - confirm intent.

    Returns:
        ``False`` when a configured sub-category lookup returns nothing;
        otherwise ``None``.
    """
    self.product_type = self.NODE_ID_TO_PRODUCT_TYPE[str(
        browse_node_id).lower()]
    final_rank_rows = []
    categories_list = [self.product_type.capitalize()]
    if self.SPECIFIC_SUB_CATEGORIES[self.product_type]:
        # Specific sub-categories configured: crawl only those; the tree
        # file is not updated in this mode.
        category_discovery = False
        for sub_category in self.SPECIFIC_SUB_CATEGORIES[
                self.product_type]:
            # Entries look like "Name [node_id]" - extract the id.
            node_id = sub_category.split('[')[1][:-1]
            browse_nodes = self.api_handler.api_call_with_retry(
                function=self.api_handler.browse_node_lookup,
                function_params={"BrowseNodeId": node_id},
            )
            if not browse_nodes:
                print("ERROR")
                return False
            final_rank_rows = self.walk_through_categories(
                recursive_func=self.walk_through_categories,
                browse_nodes=browse_nodes,
                final_rank_rows=final_rank_rows,
                categories_list=categories_list,
                current_level=0,
            )
    else:
        # No sub-categories configured: discover the whole tree from the root.
        category_discovery = True
        browse_nodes = self.api_handler.api_call_with_retry(
            function=self.api_handler.browse_node_lookup,
            function_params={"BrowseNodeId": browse_node_id},
        )
        final_rank_rows = self.walk_through_categories(
            recursive_func=self.walk_through_categories,
            browse_nodes=browse_nodes,
            final_rank_rows=final_rank_rows,
            categories_list=categories_list,
            current_level=0,
        )
    try:
        # Flatten the per-row category path into "category-<level>" columns.
        deepest_sub_category_level = max(
            [len(l["category"]) for l in final_rank_rows])
        categories_field_names = []
        for level in range(0, deepest_sub_category_level):
            categories_field_names.append(f"category-{level}")
        # NOTE(review): this prepends to the instance attribute, so the
        # field list grows with every run on the same object - confirm.
        self.FIELD_NAMES_TOP_BOOKS = (categories_field_names +
                                      self.FIELD_NAMES_TOP_BOOKS)
        for index, row in enumerate(final_rank_rows):
            # TODO: If a sub-category is directly scraped -> all parent categories are currently missing.
            # They should be included in the csv.
            for level in range(0, deepest_sub_category_level):
                if level < len(row["category"]):
                    final_rank_rows[index][f"category-{level}"] = row[
                        "category"][level]
                else:
                    final_rank_rows[index][f"category-{level}"] = ""
            del final_rank_rows[index]["category"]
        # Sort by category path first, sales rank last.
        fields_for_sorting = ["sales_rank"]
        for level in range(0, deepest_sub_category_level):
            fields_for_sorting.insert(0, f"category-{level}")
        final_rank_rows = sorted(
            final_rank_rows,
            key=lambda x: [x[field] for field in fields_for_sorting],
        )
    except Exception as e:
        # Sorting failures are non-fatal: export the unsorted rows.
        LogHandler.log_message(
            f"{self.HEADER} ERROR: An error occurred while trying to sort the product-list. "
            f"Msg: {str(e)}")
        pass
    if category_discovery:
        self.update_categories_tree()
    # PRODUCT_LIMIT == 0 means discovery-only: nothing to export.
    if self.PRODUCT_LIMIT != 0:
        self.file_handler.write_result_to_csv(
            field_names=self.FIELD_NAMES_TOP_BOOKS,
            rows_for_print=final_rank_rows,
            file_name_prefix=self.product_type,
            file_name_suffix=self.FILE_NAME_SUFFIX_TOP_BOOKS,
        )
def refresh_values_in_forms(self):
    """Re-populate every GUI form with the currently saved config values.

    Fills the main config form, the secrets page, and the per-product
    category settings (books and kindle_books).
    """
    self.config_dict = self.config_handler.config_dict
    # ---- Pre-Fill config-form ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
    self.main_window.FindElement("RETRY_LIMIT").Update(
        value=self.config_dict["RETRY_LIMIT"]
    )
    self.main_window.FindElement("COOLDOWN_TIME_FOR_RETRY").Update(
        value=self.config_dict["COOLDOWN_TIME_FOR_RETRY"]
    )
    verbosity_options_list = list(self.config_handler.VERBOSITY_OPTIONS.keys())
    self.main_window.FindElement("VERBOSE_OUTPUT").Update(
        values=verbosity_options_list
    )
    self.main_window.FindElement("PRODUCT_LIMIT").Update(
        value=self.config_dict["PRODUCT_LIMIT"]
    )
    # Map the stored verbosity value back to its display key.
    verbosity_level = self.config_dict["VERBOSE_OUTPUT"]
    for key, value in self.config_handler.VERBOSITY_OPTIONS.items():
        if verbosity_level == value:
            verbosity_level = key
    try:
        self.main_window.FindElement("VERBOSE_OUTPUT").Update(
            set_to_index=verbosity_options_list.index(verbosity_level)
        )
    except (IndexError, ValueError) as e:
        LogHandler.log_message(e)
    # ---- Pre-Fill secrets-page ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
    self.main_window.FindElement('AMAZON_ACCESS_KEY').Update(
        value=self.secrets_dict['AMAZON_ACCESS_KEY']
    )
    self.main_window.FindElement('AMAZON_SECRET_KEY').Update(
        value=self.secrets_dict['AMAZON_SECRET_KEY']
    )
    self.main_window.FindElement('AMAZON_ASSOC_TAG').Update(
        value=self.secrets_dict['AMAZON_ASSOC_TAG']
    )
    # ---- Pre-Fill per-product category settings ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
    # REFACTOR: the books and kindle_books sections were byte-parallel
    # duplicates; both now share one helper.
    self._refresh_product_settings(product_type="books")
    self._refresh_product_settings(product_type="kindle_books")

def _refresh_product_settings(self, product_type: str):
    """Fill the main-/sub-category listboxes for one product type.

    Args:
        product_type: Config key and GUI element prefix
            ("books" or "kindle_books").
    """
    main_categories = self.node_crawler_class.MAIN_CATEGORIES[product_type]
    self.main_window.FindElement(f"{product_type}_included_main_categories").Update(
        values=main_categories, set_to_index=-1
    )
    crawler_config = self.config_dict["BrowseNodeCrawler"][product_type]
    included_main_categories = crawler_config["INCLUDED_MAIN_CATEGORIES"]
    self.main_window.FindElement(f'{product_type}_included_main_categories_nr').Update(
        value=f"Currently Saved: {len(included_main_categories)} selected"
    )
    for item in included_main_categories:
        self.main_window.FindElement(f"{product_type}_included_main_categories").Update(
            set_to_index=main_categories.index(item)
        )
    file_name = self.node_crawler_class.get_categories_tree_file_name(product_type=product_type)
    categories_json = self.file_handler.read_categories_tree_file(file_name=file_name)
    if categories_json:
        discovered_sub_categories = self.node_crawler_class.get_categories_tree_json_elements(
            categories_json=categories_json
        )
        self.main_window.FindElement(f"{product_type}_specific_sub_categories").Update(
            values=discovered_sub_categories, set_to_index=-1
        )
        specific_sub_categories = crawler_config["SPECIFIC_SUB_CATEGORIES"]
        # Unformatted values are needed to match against the saved config.
        discovered_sub_categories = self.node_crawler_class.get_categories_tree_json_elements(
            categories_json, format_values=False
        )
        for item in specific_sub_categories:
            try:
                self.main_window.FindElement(f"{product_type}_specific_sub_categories").Update(
                    set_to_index=discovered_sub_categories.index(item)
                )
            except (IndexError, ValueError):
                # Saved entry no longer exists in the discovered tree.
                pass
        self.main_window.FindElement(f'{product_type}_specific_sub_categories_nr').Update(
            value=f"Currently Saved: {len(specific_sub_categories)} selected"
        )
    else:
        self.main_window.FindElement(f"{product_type}_specific_sub_categories").Update(
            values=[], set_to_index=-1
        )
        self.main_window.FindElement(f'{product_type}_specific_sub_categories_nr').Update(
            value="No categories discovered yet"
        )