def iter_processed_text(self, file, encoding=None, base_url=None):
    for text, is_link in self.iter_text(file, encoding):
        if is_link:
            # Decode JavaScript string escapes by parsing the text as a
            # JSON string literal.
            try:
                new_text = json.loads('"{0}"'.format(text))
            except ValueError:
                yield (text, False)
                continue

            # Discard candidates that do not look like real links.
            if is_unlikely_link(new_text) or not is_likely_link(new_text):
                yield (text, False)
                continue

            if base_url:
                new_link = urljoin_safe(base_url, new_text,
                                        allow_fragments=False)
            else:
                new_link = new_text

            if new_link:
                yield (new_link, identify_link_type(new_link) or True)
            else:
                yield (text, False)
        else:
            yield (text, False)
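# A minimal standalone sketch of the decoding step above: wrapping the
# scraped text in quotes lets json.loads() interpret JavaScript-style
# escape sequences. The candidate strings below are made up for
# illustration.
import json

for candidate in ('http:\\/\\/example.com\\/page', 'broken \\x escape'):
    try:
        decoded = json.loads('"{0}"'.format(candidate))
    except ValueError:
        print('not decodable:', candidate)
        continue
    print('decoded:', decoded)
# decoded: http://example.com/page
# not decodable: broken \x escape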
def _add_listing_links(self, response):
    '''Add links from file listing response.'''
    base_url = response.request.url_info.url
    dir_urls_to_add = set()
    file_urls_to_add = set()

    if self._glob_pattern:
        level = self._url_item.url_record.level
    else:
        level = None

    for file_entry in response.files:
        if self._glob_pattern and \
                not fnmatch.fnmatchcase(file_entry.name, self._glob_pattern):
            continue

        if file_entry.type == 'dir':
            linked_url = urljoin_safe(base_url, file_entry.name + '/')
        elif file_entry.type in ('file', 'symlink', None):
            if not self._processor.fetch_params.retr_symlinks and \
                    file_entry.type == 'symlink':
                self._make_symlink(file_entry.name, file_entry.dest)
                linked_url = None
            else:
                linked_url = urljoin_safe(base_url, file_entry.name)
        else:
            linked_url = None

        if linked_url:
            linked_url_info = parse_url_or_log(linked_url)

            if linked_url_info:
                linked_url_record = self._url_item.child_url_record(
                    linked_url_info, level=level)

                verdict = self._fetch_rule.check_ftp_request(
                    linked_url_info, linked_url_record)[0]

                if verdict:
                    if linked_url_info.path.endswith('/'):
                        dir_urls_to_add.add(linked_url_info.url)
                    else:
                        file_urls_to_add.add(linked_url_info.url)

    self._url_item.add_child_urls(dir_urls_to_add,
                                  link_type=LinkType.directory)
    self._url_item.add_child_urls(file_urls_to_add,
                                  link_type=LinkType.file, level=level)
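# A small standalone sketch of the glob filtering above, using only the
# standard library. The listing entries and pattern are illustrative.
import fnmatch

entries = ['readme.txt', 'notes.txt', 'data.csv', 'logs']
pattern = '*.txt'

matched = [name for name in entries
           if fnmatch.fnmatchcase(name, pattern)]
print(matched)  # ['readme.txt', 'notes.txt']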
def scrape_file(self, file, encoding=None, base_url=None):
    '''Scrape a file for links.

    See :meth:`scrape` for the return value.
    '''
    elements = self.iter_elements(file, encoding=encoding)

    link_contexts = set()

    link_infos = self._element_walker.iter_links(elements)

    for link_info in link_infos:
        element_base_url = base_url

        if link_info.base_link:
            clean_base_url = clean_link_soup(link_info.base_link)

            if element_base_url and base_url:
                element_base_url = urljoin_safe(
                    base_url, clean_base_url
                ) or base_url

        if element_base_url:
            url = urljoin_safe(
                element_base_url,
                clean_link_soup(link_info.link),
                allow_fragments=False
            )
        else:
            url = clean_link_soup(link_info.link)

        if url:
            link_contexts.add(LinkContext(
                url,
                inline=link_info.inline,
                linked=link_info.linked,
                link_type=link_info.link_type,
                extra=link_info
            ))

    scrape_result = ScrapeResult(link_contexts, encoding)
    scrape_result['base_url'] = base_url
    return scrape_result
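# A rough sketch of the two-step resolution above, with
# urllib.parse.urljoin standing in for urljoin_safe (an assumption; the
# real helper also rejects unsafe results). A base link taken from the
# document is resolved against the document URL first, then each scraped
# link is resolved against that effective base.
from urllib.parse import urljoin

doc_url = 'http://example.com/articles/index.html'
base_link = '/static/'   # e.g. the href of a <base> element
link = 'style.css'

effective_base = urljoin(doc_url, base_link) or doc_url
print(urljoin(effective_base, link, allow_fragments=False))
# http://example.com/static/style.css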
def iter_processed_links(self, file, encoding=None, base_url=None):
    '''Return the links.

    Returns:
        iterator: Each item is a str which represents a link.
    '''
    for link in self.iter_links(file, encoding):
        new_link = urljoin_safe(base_url, link, allow_fragments=False)

        if new_link:
            yield new_link
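# A hedged sketch of the loop above over plain data, with
# urllib.parse.urljoin standing in for urljoin_safe (an assumption; the
# real helper also filters out links it cannot join safely). The links
# are made up.
from urllib.parse import urljoin

base_url = 'http://example.com/css/site.css'

for link in ('../img/bg.png', 'fonts/main.woff'):
    print(urljoin(base_url, link, allow_fragments=False))
# http://example.com/img/bg.png
# http://example.com/css/fonts/main.woff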
def iter_processed_text(self, file, encoding=None, base_url=None):
    '''Return the file text and processed absolute links.

    Args:
        file: A file object containing the document.
        encoding (str): The encoding of the document.
        base_url (str): The URL at which the document is located.

    Returns:
        iterator: Each item is a tuple:

        1. str: The text
        2. bool: Whether the text is a link
    '''
    for text, is_link in self.iter_text(file, encoding):
        if is_link and base_url:
            new_link = urljoin_safe(base_url, text, allow_fragments=False)

            if new_link:
                yield (new_link, is_link)
            else:
                # Fall back to the original text, flagged as non-link,
                # when the URL cannot be safely joined.
                yield (text, False)
        else:
            yield (text, is_link)
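# A self-contained sketch of how a consumer might use this iterator to
# rewrite a document with absolute links: non-link text passes through
# untouched while link text is replaced by the resolved URL. The token
# stream and helper below are made-up stand-ins for iter_text() and
# urljoin_safe().
from urllib.parse import urljoin

def iter_processed_tokens(tokens, base_url):
    for text, is_link in tokens:
        if is_link and base_url:
            new_link = urljoin(base_url, text, allow_fragments=False)
            yield (new_link, True) if new_link else (text, False)
        else:
            yield (text, is_link)

tokens = [("body { background: url('", False),
          ('../img/bg.png', True),
          ("'); }", False)]

print(''.join(text for text, _ in iter_processed_tokens(
    tokens, 'http://example.com/css/site.css')))
# body { background: url('http://example.com/img/bg.png'); }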
def _process_elements(self, elements, response, base_url, link_contexts):
    robots_check_needed = self._robots
    robots_no_follow = False
    inject_refresh = True
    doc_base_url = None

    for element in elements:
        if not isinstance(element, Element):
            continue

        if robots_check_needed and \
                ElementWalker.robots_cannot_follow(element):
            robots_check_needed = False
            robots_no_follow = True

        # Honor the first <base href> element as the document base URL.
        if not doc_base_url and element.tag == 'base':
            doc_base_url = urljoin_safe(
                base_url, clean_link_soup(element.attrib.get('href', ''))
            )

        link_infos = self._element_walker.iter_links_element(element)

        if inject_refresh and 'Refresh' in response.fields:
            link = parse_refresh(response.fields['Refresh'])

            if link:
                link_info = LinkInfo(
                    element=None, tag='_refresh', attrib=None,
                    link=link,
                    inline=False, linked=True,
                    base_link=None,
                    value_type='refresh',
                    link_type=None  # treat it as a redirect
                )
                link_infos = itertools.chain(link_infos, [link_info])

            # Only attempt the Refresh header injection once per document.
            inject_refresh = False

        for link_info in link_infos:
            if self._only_relative:
                if link_info.base_link or '://' in link_info.link:
                    continue

            if not self._is_accepted(link_info.tag):
                continue

            element_base_url = doc_base_url or base_url

            if link_info.base_link:
                clean_base_url = clean_link_soup(link_info.base_link)

                if clean_base_url:
                    element_base_url = urljoin_safe(
                        base_url, clean_base_url
                    ) or base_url

            cleaned_url = clean_link_soup(link_info.link)

            if not cleaned_url:
                continue

            url = urljoin_safe(
                element_base_url, cleaned_url, allow_fragments=False
            )

            if url:
                link_contexts.add(LinkContext(
                    url,
                    inline=link_info.inline,
                    linked=link_info.linked,
                    link_type=link_info.link_type,
                    extra=link_info,
                ))

    return {'robots_no_follow': robots_no_follow}
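# A rough standalone sketch of what parse_refresh() is assumed to do
# above: extract the target URL from an HTTP Refresh header value such
# as "5; url=http://example.com/next". This regex version is an
# illustration, not the actual parse_refresh() implementation.
import re

def parse_refresh_value(value):
    match = re.search(r'url\s*=\s*["\']?(.+?)["\']?\s*$', value,
                      re.IGNORECASE)
    if match:
        return match.group(1)

print(parse_refresh_value('5; url=http://example.com/next'))
# http://example.com/next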