# Module-level dependencies used by build() below (unique_hashable and
# unique_stack_list live in autoscraper's utils module):
import unicodedata

from autoscraper.utils import unique_hashable, unique_stack_list


def build(self, url=None, wanted_list=None, html=None, request_args=None, update=False):
    """
    Automatically constructs a set of rules to scrape the specified target[s] from a web page.
    The rules are represented as stack_list.

    Parameters:
    ----------
    url: str, optional
        URL of the target web page. You should either pass url or html or both.

    wanted_list: list, optional
        A list of needed contents to be scraped.
        AutoScraper learns a set of rules to scrape these targets.

    html: str, optional
        An HTML string can also be passed instead of URL.
        You should either pass url or html or both.

    request_args: dict, optional
        A dictionary used to specify a set of additional request parameters used by
        the requests module. You can specify proxy URLs, custom headers, etc.

    update: bool, optional, defaults to False
        If True, newly learned rules will be added to the previous ones.
        If False, all previously learned rules will be removed.

    Returns:
    --------
    List of similar results if every wanted item was matched, otherwise None
    """

    soup = self._get_soup(url=url, html=html, request_args=request_args)

    result_list = []

    if update is False:
        self.stack_list = []

    # Normalize the wanted texts so Unicode variants compare equal
    wanted_list = [unicodedata.normalize("NFKD", w) for w in wanted_list]

    for wanted in wanted_list:
        children = self._get_children(soup, wanted, url)

        for child in children:
            result, stack = self._get_result_for_child(child, soup, url)
            result_list += result
            self.stack_list.append(stack)

    result_list = unique_hashable(result_list)

    # Only keep the learned rules if every wanted item was actually found
    if all(w in result_list for w in wanted_list):
        self.stack_list = unique_stack_list(self.stack_list)
        return result_list

    return None
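# A minimal usage sketch for the version above (assumptions: the method lives
# on the AutoScraper class exported by the autoscraper package, and the URL
# and wanted text are illustrative placeholders). build() returns the list of
# similar results when every wanted item is matched, otherwise None:
#
#     from autoscraper import AutoScraper
#
#     scraper = AutoScraper()
#     result = scraper.build(
#         url='https://example.com/products',
#         wanted_list=['Example Product Title'],
#     )
#     print(result)  # matched contents, or None if learning failed
#
# A revised version of build() follows, adding wanted_dict aliases and fuzzy
# text matching.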
def build(self, url=None, wanted_list=None, wanted_dict=None, html=None,
          request_args=None, update=False, text_fuzz_ratio=1.0):
    """
    Automatically constructs a set of rules to scrape the specified target[s] from a web page.
    The rules are represented as stack_list.

    Parameters:
    ----------
    url: str, optional
        URL of the target web page. You should either pass url or html or both.

    wanted_list: list, optional
        A list of needed contents to be scraped.
        AutoScraper learns a set of rules to scrape these targets.
        If specified, wanted_dict will be ignored.

    wanted_dict: dict, optional
        A dict of needed contents to be scraped. Keys are aliases and values are
        lists of target texts. AutoScraper learns a set of rules to scrape these
        targets and sets their aliases.

    html: str, optional
        An HTML string can also be passed instead of URL.
        You should either pass url or html or both.

    request_args: dict, optional
        A dictionary used to specify a set of additional request parameters used by
        the requests module. You can specify proxy URLs, custom headers, etc.

    update: bool, optional, defaults to False
        If True, newly learned rules will be added to the previous ones.
        If False, all previously learned rules will be removed.

    text_fuzz_ratio: float in range [0, 1], optional, defaults to 1.0
        The fuzziness ratio threshold for matching the wanted contents.

    Returns:
    --------
    List of similar results
    """

    soup = self._get_soup(url=url, html=html, request_args=request_args)

    result_list = []

    if update is False:
        self.stack_list = []

    # A plain wanted_list is treated as a wanted_dict with an empty alias
    if wanted_list:
        wanted_dict = {'': wanted_list}

    wanted_list = []

    for alias, wanted_items in wanted_dict.items():
        # Normalize the wanted texts so Unicode variants compare equal
        wanted_items = [unicodedata.normalize("NFKD", w) for w in wanted_items]
        wanted_list += wanted_items

        for wanted in wanted_items:
            children = self._get_children(soup, wanted, url, text_fuzz_ratio)

            for child in children:
                result, stack = self._get_result_for_child(child, soup, url)
                stack['alias'] = alias
                result_list += result
                self.stack_list.append(stack)

    result_list = [item.text for item in result_list]
    result_list = unique_hashable(result_list)

    self.stack_list = unique_stack_list(self.stack_list)
    return result_list
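# A minimal usage sketch for the revised version (assumptions: same
# AutoScraper class as above; the URL, aliases, and target texts are
# illustrative placeholders). Keys of wanted_dict become rule aliases, and
# lowering text_fuzz_ratio accepts near-matches of the wanted texts:
#
#     from autoscraper import AutoScraper
#
#     scraper = AutoScraper()
#     results = scraper.build(
#         url='https://example.com/products',
#         wanted_dict={
#             'title': ['Example Product Title'],
#             'price': ['$19.99'],
#         },
#         text_fuzz_ratio=0.9,  # tolerate slight differences in the targets
#     )
#     print(results)  # flat, de-duplicated list of matched texts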