コード例 #1
0
 def run(self):
     """Run every configured data selector against ``self.response``.

     Selectors whose ``selector_attribute`` is ``'element'`` and that carry
     ``child_selectors`` are treated as groups: each node matched by the
     selector's CSS expression yields one dict keyed by the children's
     ``selector_id``, plus a 1-based position stored under ``self.ITER_KEY``.
     Every other selector is handed to ``get_selector_element`` unchanged.

     Returns:
         dict: ``{self.parser_id: {selector_id: value}}``. For a group
         selector the value is the list of per-node dicts, except when
         ``multiple`` is ``False`` (the default): then it is the first
         node's dict without the ``ITER_KEY`` entry, or ``None`` when the
         CSS expression matched nothing.
     """
     extracted_data = {}
     for selector in self.extractor.get('data_selectors', []):
         selector_id = selector.get('selector_id')
         child_selectors = selector.get('child_selectors', [])
         if selector.get('selector_attribute') == 'element' and len(child_selectors) > 0:
             elements = self.response.css(selector.get('selector'))
             elements_data = []
             # Positions are reported 1-based, hence start=1.
             for item_no, el in enumerate(elements, start=1):
                 datum = {}
                 for child_selector in child_selectors:
                     _d = get_selector_element(el, child_selector)
                     datum[child_selector.get('selector_id')] = _d.strip() if _d else None
                 datum[self.ITER_KEY] = item_no
                 elements_data.append(datum)
             if selector.get("multiple", False) is False:
                 if elements_data:
                     single_data = elements_data[0]
                     # The position marker is meaningless for a single item.
                     single_data.pop(self.ITER_KEY)
                     extracted_data[selector_id] = single_data
                 else:
                     # Nothing matched: report "no value" rather than
                     # failing on elements_data[0].
                     extracted_data[selector_id] = None
             else:
                 extracted_data[selector_id] = elements_data
         else:
             extracted_data[selector_id] = get_selector_element(self.response, selector)
     return {self.parser_id: extracted_data}
コード例 #2
0
 def run(self):
     """Run every configured data selector against ``self.response``.

     Selectors whose ``selector_attribute`` is ``'element'`` and that carry
     ``child_selectors`` are treated as groups: each node matched by the
     selector's CSS expression yields one dict keyed by the children's
     ``selector_id`` (falsy child values are stored as ``None``). Every
     other selector is handed to ``get_selector_element`` unchanged.

     Returns:
         dict: ``{self.extractor_id: {selector_id: value}}``. For a group
         selector the value is the list of per-node dicts when the
         selector's ``data_type`` (default ``"RawField"``) starts with
         ``"List"``; otherwise it is the first node's dict, or ``None``
         when the CSS expression matched nothing.
     """
     extracted_data = {}
     for selector in self.extractor.get('data_selectors', []):
         selector_id = selector.get('selector_id')
         child_selectors = selector.get('child_selectors', [])
         if selector.get('selector_attribute') == 'element' and len(child_selectors) > 0:
             elements_data = []
             for el in self.response.css(selector.get('selector')):
                 datum = {}
                 for child_selector in child_selectors:
                     _d = get_selector_element(el, child_selector)
                     datum[child_selector.get('selector_id')] = _d if _d else None
                 elements_data.append(datum)
             data_type = selector.get("data_type", "RawField")
             if data_type.startswith("List"):
                 extracted_data[selector_id] = elements_data
             elif elements_data:
                 extracted_data[selector_id] = elements_data[0]
             else:
                 # Nothing matched: report "no value" rather than failing
                 # on elements_data[0].
                 extracted_data[selector_id] = None
         else:
             extracted_data[selector_id] = get_selector_element(self.response, selector)
     return {self.extractor_id: extracted_data}
コード例 #3
0
ファイル: websites.py プロジェクト: dvlop/invana-bot
 def parse(self, response):
     """Yield the item scraped from ``response``, then next-page requests.

     The item always carries ``url``; each entry of
     ``self.parser_config['data_selectors']`` adds one key named after the
     selector's ``id``. Group selectors (``selector_attribute == 'element'``
     with ``child_selectors``) produce a list with one dict per matched
     node, each tagged with a 1-based ``item_no``; all other selectors
     produce a single stripped value or ``None``. ``self.context`` is
     attached under ``context`` when it is not ``None``.

     After the item, and while ``current_page_count`` is below
     ``max_pages`` (both read from ``next_page_selector``, default 1), one
     ``response.follow`` request is yielded per node matched by the
     configured next-page selector.
     """
     print("Parser=========,", response.url)
     item = {'url': response.url}
     pagination = self.parser_config.get("next_page_selector", {})
     max_pages = pagination.get("max_pages", 1)
     current_page_count = pagination.get("current_page_count", 1)
     context = self.context
     print("context", context)

     for selector in self.parser_config['data_selectors']:
         is_group = (selector.get('selector_attribute') == 'element'
                     and len(selector.get('child_selectors', [])) > 0)
         if not is_group:
             raw = get_selector_element(response, selector)
             item[selector.get('id')] = raw.strip() if raw else None
             continue

         # TODO - only the multiple-elements strategy is supported here;
         # a multiple=False selector is still returned as a list.
         rows = []
         for position, node in enumerate(response.css(selector.get('selector')), start=1):
             row = {}
             for child in selector.get('child_selectors', []):
                 raw = get_selector_element(node, child)
                 row[child.get('id')] = raw.strip() if raw else None
             row['item_no'] = position
             rows.append(row)
         item[selector.get('id')] = rows

     if context is not None:
         item["context"] = context
     yield item

     print("current_page_count", current_page_count, max_pages)
     if not (current_page_count < max_pages):
         print("### ended")
         return

     # Re-read the pagination config after the yield, as the original does.
     next_config = self.parser_config.get('next_page_selector')
     next_selector = next_config.get('selector')
     if not next_selector:
         return

     selector_type = next_config.get('selector_type')
     if selector_type == 'css':
         next_pages = response.css(next_selector)
     elif selector_type == 'xpath':
         next_pages = response.xpath(next_selector)
     else:
         next_pages = []

     for next_page in next_pages:
         # The counter lives in the shared parser config, so it is bumped
         # there before each follow-up request is emitted.
         self.parser_config["next_page_selector"]["current_page_count"] = current_page_count + 1
         yield response.follow(next_page, self.parse)
コード例 #4
0
ファイル: links.py プロジェクト: yashodhank/invana-bot
 def run(self):
     """Apply each configured data selector to ``self.response``.

     Returns:
         dict: ``{self.parser_id: {selector_id: value}}`` where each value
         is whatever ``get_selector_element`` returns for that selector.
         The inner dict is empty when the extractor has no
         ``data_selectors``.
     """
     extracted = {
         selector.get('selector_id'): get_selector_element(self.response, selector)
         for selector in self.extractor.get('data_selectors', [])
     }
     return {self.parser_id: extracted}