def run(self): data = {} extracted_data = {} for selector in self.extractor.get('data_selectors', []): if selector.get('selector_attribute') == 'element' and len(selector.get('child_selectors', [])) > 0: # TODO - currently only support multiple elements strategy. what if multiple=False elements = self.response.css(selector.get('selector')) elements_data = [] for item_no, el in enumerate(elements): item_no = item_no + 1 # because enumerate starts from 0 datum = {} for child_selector in selector.get('child_selectors', []): _d = get_selector_element(el, child_selector) datum[child_selector.get('selector_id')] = _d.strip() if _d else None datum[self.ITER_KEY] = item_no elements_data.append(datum) if selector.get("multiple", False) is False: single_data = elements_data[0] single_data.pop(self.ITER_KEY) extracted_data[selector.get('selector_id')] = single_data else: extracted_data[selector.get('selector_id')] = elements_data else: _d = get_selector_element(self.response, selector) extracted_data[selector.get('selector_id')] = _d data[self.parser_id] = extracted_data return data
def run(self): data = {} extracted_data = {} for selector in self.extractor.get('data_selectors', []): if selector.get('selector_attribute') == 'element' and len( selector.get('child_selectors', [])) > 0: # TODO - currently only support multiple elements strategy. what if multiple=False elements = self.response.css(selector.get('selector')) elements_data = [] for el in elements: datum = {} for child_selector in selector.get('child_selectors', []): _d = get_selector_element(el, child_selector) datum[child_selector.get( 'selector_id')] = _d if _d else None elements_data.append(datum) data_type = selector.get("data_type", "RawField") if data_type.startswith("List") is False: single_data = elements_data[0] extracted_data[selector.get('selector_id')] = single_data else: extracted_data[selector.get('selector_id')] = elements_data else: _d = get_selector_element(self.response, selector) extracted_data[selector.get('selector_id')] = _d data[self.extractor_id] = extracted_data return data
def parse(self, response): print("Parser=========,", response.url) data = {} data['url'] = response.url max_pages = self.parser_config.get("next_page_selector", {}).get("max_pages", 1) current_page_count = self.parser_config.get( "next_page_selector", {}).get("current_page_count", 1) context = self.context print("context", context) for selector in self.parser_config['data_selectors']: if selector.get('selector_attribute') == 'element' and \ len(selector.get('child_selectors', [])) > 0: # TODO - currently only support multiple elements strategy. what if multiple=False elements = response.css(selector.get('selector')) elements_data = [] for item_no, el in enumerate(elements): item_no = item_no + 1 # because enumerate starts from 0 datum = {} for child_selector in selector.get('child_selectors', []): _d = get_selector_element(el, child_selector) datum[child_selector.get( 'id')] = _d.strip() if _d else None datum['item_no'] = item_no elements_data.append(datum) data[selector.get('id')] = elements_data else: _d = get_selector_element(response, selector) data[selector.get('id')] = _d.strip() if _d else None if context is not None: data.update({"context": context}) yield data print("current_page_count", current_page_count, max_pages) if current_page_count < max_pages: next_selector = self.parser_config.get('next_page_selector').get( 'selector') if next_selector: if self.parser_config.get('next_page_selector').get( 'selector_type') == 'css': next_pages = response.css(next_selector) elif self.parser_config.get('next_page_selector').get( 'selector_type') == 'xpath': next_pages = response.xpath(next_selector) else: next_pages = [] for next_page in next_pages: self.parser_config["next_page_selector"][ "current_page_count"] = current_page_count + 1 yield response.follow(next_page, self.parse) else: print("### ended")
def run(self): data = {} extracted_data = {} for selector in self.extractor.get('data_selectors', []): _d = get_selector_element(self.response, selector) extracted_data[selector.get('selector_id')] = _d data[self.parser_id] = extracted_data return data