def test_clean_link_soup(self):
    self.assertEqual(
        'http://example.com',
        clean_link_soup('http://example.com ')
    )
    self.assertEqual(
        'http://example.com/',
        clean_link_soup('\n\r\thttp://example.com\n\r\r\r\n\t/')
    )
    self.assertEqual(
        'http://example.com/ something',
        clean_link_soup('http://example.com\n\t / something \n\r\t')
    )
    self.assertEqual(
        'http://example.com/dog cat/',
        clean_link_soup('http://example.com/\n dog \tcat\r/\n')
    )
    self.assertEqual(
        'ßðf ¤Jáßðff ßðfœ³²œ¤ œë ßfœ',
        clean_link_soup('ß\tðf ¤Jáßðf\n f ßðfœ³²œ¤ œë ßfœ ')
    )
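# clean_link_soup itself is not shown in this excerpt. The sketch below is
# inferred purely from the assertions above and may differ from the project's
# actual implementation: drop tabs outright, strip each line, and join the
# remaining fragments, which satisfies every expected value in the test.
def clean_link_soup(link):
    '''Strip stray whitespace from a link found in messy HTML (hypothetical sketch).'''
    return ''.join(
        line.strip() for line in link.replace('\t', '').splitlines()
    )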
def scrape_file(self, file, encoding=None, base_url=None):
    '''Scrape a file for links.

    See :meth:`scrape` for the return value.
    '''
    elements = self.iter_elements(file, encoding=encoding)

    link_contexts = set()
    link_infos = self._element_walker.iter_links(elements)

    for link_info in link_infos:
        element_base_url = base_url

        if link_info.base_link:
            clean_base_url = clean_link_soup(link_info.base_link)

            if element_base_url and base_url:
                element_base_url = urljoin_safe(
                    base_url, clean_base_url
                ) or base_url

        if element_base_url:
            url = urljoin_safe(
                element_base_url,
                clean_link_soup(link_info.link),
                allow_fragments=False
            )
        else:
            url = clean_link_soup(link_info.link)

        if url:
            link_contexts.add(LinkContext(
                url,
                inline=link_info.inline,
                linked=link_info.linked,
                link_type=link_info.link_type,
                extra=link_info
            ))

    scrape_result = ScrapeResult(link_contexts, encoding)
    scrape_result['base_url'] = base_url

    return scrape_result
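# urljoin_safe is used above but not defined in this excerpt. A minimal
# sketch, assuming it behaves like urllib.parse.urljoin except that it
# swallows malformed-URL errors and returns None instead of raising; that
# assumption is consistent with the `or base_url` fallback and the `if url:`
# guard in scrape_file.
import urllib.parse


def urljoin_safe(base_url, url, allow_fragments=True):
    '''Join a URL to a base URL, returning None on failure (hypothetical sketch).'''
    try:
        return urllib.parse.urljoin(base_url, url, allow_fragments=allow_fragments)
    except ValueError:
        # Malformed input (e.g. an out-of-range port) should not abort scraping.
        return None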
def _process_elements(self, elements, response, base_url, link_contexts):
    robots_check_needed = self._robots
    robots_no_follow = False
    inject_refresh = True
    doc_base_url = None

    for element in elements:
        if not isinstance(element, Element):
            continue

        # Remember whether a robots directive forbids following links;
        # it only needs to be detected once per document.
        if robots_check_needed and ElementWalker.robots_cannot_follow(element):
            robots_check_needed = False
            robots_no_follow = True

        # The first <base href> element defines the document base URL.
        if not doc_base_url and element.tag == 'base':
            doc_base_url = urljoin_safe(
                base_url, clean_link_soup(element.attrib.get('href', ''))
            )

        link_infos = self._element_walker.iter_links_element(element)

        # Inject the Refresh header target as a synthetic link, at most once.
        if inject_refresh and 'Refresh' in response.fields:
            link = parse_refresh(response.fields['Refresh'])

            if link:
                link_info = LinkInfo(
                    element=None, tag='_refresh', attrib=None,
                    link=link,
                    inline=False, linked=True,
                    base_link=None,
                    value_type='refresh',
                    link_type=None  # treat it as a redirect
                )
                link_infos = itertools.chain(link_infos, [link_info])

            inject_refresh = False
        else:
            inject_refresh = False

        for link_info in link_infos:
            if self._only_relative:
                if link_info.base_link or '://' in link_info.link:
                    continue

            if not self._is_accepted(link_info.tag):
                continue

            element_base_url = doc_base_url or base_url

            if link_info.base_link:
                clean_base_url = clean_link_soup(link_info.base_link)

                if clean_base_url:
                    element_base_url = urljoin_safe(
                        base_url, clean_base_url
                    ) or base_url

            cleaned_url = clean_link_soup(link_info.link)

            if not cleaned_url:
                continue

            url = urljoin_safe(
                element_base_url,
                cleaned_url,
                allow_fragments=False
            )

            if url:
                link_contexts.add(LinkContext(
                    url,
                    inline=link_info.inline,
                    linked=link_info.linked,
                    link_type=link_info.link_type,
                    extra=link_info,
                ))

    return {'robots_no_follow': robots_no_follow}
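# parse_refresh is referenced above but not included in this excerpt. A
# minimal sketch, assuming the conventional `N; url=...` shape of a Refresh
# header (for example "5; url=http://example.com/"); the real helper may
# handle more edge cases. Surrounding quotes are tolerated and the result is
# passed through clean_link_soup like every other extracted link.
import re


def parse_refresh(text):
    '''Extract the target URL from a Refresh header value (hypothetical sketch).'''
    match = re.search(r'url\s*=\s*(.+)', text, re.IGNORECASE)

    if match:
        url = match.group(1).strip().strip('"\'')
        return clean_link_soup(url)

    return None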