Code Example #1
    def iter_processed_text(self, file, encoding=None, base_url=None):
        for text, is_link in self.iter_text(file, encoding):
            if is_link:
                try:
                    new_text = json.loads('"{0}"'.format(text))
                except ValueError:
                    yield (text, False)
                    continue

                if is_unlikely_link(new_text) or not is_likely_link(new_text):
                    yield (text, False)
                    continue

                if base_url:
                    new_link = urljoin_safe(base_url, new_text,
                                            allow_fragments=False)
                else:
                    new_link = new_text

                if new_link:
                    yield (new_link, identify_link_type(new_link) or True)
                else:
                    yield (text, False)
            else:
                yield (text, False)
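
The snippet above decodes a JavaScript string literal with json.loads, filters it through wpull's link-likeliness heuristics, and resolves it against the page URL. Below is a minimal, standard-library-only sketch of the same idea: urllib.parse.urljoin stands in for urljoin_safe, and the crude "/"-or-"." check is only a placeholder for is_likely_link / is_unlikely_link, not their actual logic.

import json
from urllib.parse import urljoin

def iter_candidate_links(strings, base_url=None):
    # Yield (original_text, resolved_url_or_None) for JavaScript string literals.
    for text in strings:
        try:
            # Decode JavaScript/JSON escape sequences such as \/ or \u002f.
            decoded = json.loads('"{0}"'.format(text))
        except ValueError:
            yield (text, None)
            continue

        # Placeholder heuristic (NOT wpull's is_likely_link/is_unlikely_link):
        # require at least a path separator or a dot.
        if '/' not in decoded and '.' not in decoded:
            yield (text, None)
            continue

        yield (text, urljoin(base_url, decoded) if base_url else decoded)

# list(iter_candidate_links(['images\\/logo.png', 'hello'], 'http://example.com/'))
# -> [('images\\/logo.png', 'http://example.com/images/logo.png'), ('hello', None)]
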
Code Example #2
File: ftp.py  Project: flatron18116/wpull
    def _add_listing_links(self, response):
        '''Add links from file listing response.'''
        base_url = response.request.url_info.url
        dir_urls_to_add = set()
        file_urls_to_add = set()

        if self._glob_pattern:
            level = self._url_item.url_record.level
        else:
            level = None

        for file_entry in response.files:
            if self._glob_pattern and \
                    not fnmatch.fnmatchcase(file_entry.name, self._glob_pattern):
                continue

            if file_entry.type == 'dir':
                linked_url = urljoin_safe(base_url, file_entry.name + '/')
            elif file_entry.type in ('file', 'symlink', None):
                if not self._processor.fetch_params.retr_symlinks and \
                        file_entry.type == 'symlink':
                    self._make_symlink(file_entry.name, file_entry.dest)
                    linked_url = None
                else:
                    linked_url = urljoin_safe(base_url, file_entry.name)
            else:
                linked_url = None

            if linked_url:
                linked_url_info = parse_url_or_log(linked_url)

                if linked_url_info:
                    linked_url_record = self._url_item.child_url_record(linked_url_info, level=level)

                    verdict = self._fetch_rule.check_ftp_request(
                        linked_url_info, linked_url_record)[0]

                    if verdict:
                        if linked_url_info.path.endswith('/'):
                            dir_urls_to_add.add(linked_url_info.url)
                        else:
                            file_urls_to_add.add(linked_url_info.url)

        self._url_item.add_child_urls(dir_urls_to_add, link_type=LinkType.directory)
        self._url_item.add_child_urls(file_urls_to_add, link_type=LinkType.file, level=level)
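
This method walks an FTP listing, optionally filters entries with fnmatch.fnmatchcase, appends a trailing slash for directories so that later relative joins behave correctly, and partitions the resulting URLs into directory and file sets. A stripped-down sketch of that partitioning follows; it assumes plain (name, type) tuples and urllib.parse.urljoin in place of wpull's file entry objects, urljoin_safe, symlink handling, and fetch-rule checks.

import fnmatch
from urllib.parse import urljoin

def listing_to_urls(base_url, entries, glob_pattern=None):
    # entries: iterable of (name, type) tuples, e.g. ('docs', 'dir'); an
    # assumed stand-in for wpull's file entry objects.
    dir_urls, file_urls = set(), set()

    for name, entry_type in entries:
        if glob_pattern and not fnmatch.fnmatchcase(name, glob_pattern):
            continue

        if entry_type == 'dir':
            # Trailing slash so relative joins from the child URL work.
            dir_urls.add(urljoin(base_url, name + '/'))
        else:
            file_urls.add(urljoin(base_url, name))

    return dir_urls, file_urls

# listing_to_urls('ftp://example.com/pub/', [('docs', 'dir'), ('readme.txt', 'file')])
# -> ({'ftp://example.com/pub/docs/'}, {'ftp://example.com/pub/readme.txt'})
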
Code Example #3
File: html.py  Project: flatron18116/wpull
    def scrape_file(self, file, encoding=None, base_url=None):
        '''Scrape a file for links.

        See :meth:`scrape` for the return value.
        '''
        elements = self.iter_elements(file, encoding=encoding)

        link_contexts = set()

        link_infos = self._element_walker.iter_links(elements)

        for link_info in link_infos:
            element_base_url = base_url

            if link_info.base_link:
                clean_base_url = clean_link_soup(link_info.base_link)

                if element_base_url and base_url:
                    element_base_url = urljoin_safe(
                        base_url, clean_base_url
                    ) or base_url

            if element_base_url:
                url = urljoin_safe(
                    element_base_url,
                    clean_link_soup(link_info.link),
                    allow_fragments=False
                )
            else:
                url = clean_link_soup(link_info.link)

            if url:
                link_contexts.add(LinkContext(
                    url,
                    inline=link_info.inline,
                    linked=link_info.linked,
                    link_type=link_info.link_type,
                    extra=link_info
                ))

        scrape_result = ScrapeResult(link_contexts, encoding)
        scrape_result['base_url'] = base_url
        return scrape_result
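
The part worth isolating here is the two-stage resolution: an element-level base link is first resolved against the document URL, and the element's own link is then resolved against that base. The sketch below shows it with urllib.parse.urljoin and plain (link, base_link) tuples standing in for urljoin_safe, clean_link_soup, and wpull's link_info objects.

from urllib.parse import urljoin

def resolve_links(pairs, base_url=None):
    # pairs: iterable of (link, base_link) tuples; base_link is a per-element
    # base (e.g. from <base href>) or None.
    resolved = set()

    for link, base_link in pairs:
        element_base = base_url

        if base_link and base_url:
            # The element-level base is itself resolved against the document
            # URL before it is applied to the element's link.
            element_base = urljoin(base_url, base_link) or base_url

        resolved.add(urljoin(element_base, link) if element_base else link)

    return resolved

# resolve_links([('a.html', None), ('b.html', '/sub/')], 'http://example.com/dir/')
# -> {'http://example.com/dir/a.html', 'http://example.com/sub/b.html'}
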
Code Example #4
File: base.py  Project: fakegit/ludios_wpull
    def iter_processed_links(self, file, encoding=None, base_url=None):
        '''Return the links.

        Returns:
            iterator: Each item is a str which represents a link.
        '''
        for link in self.iter_links(file, encoding):
            new_link = urljoin_safe(base_url, link, allow_fragments=False)
            if new_link:
                yield new_link
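
iter_processed_links simply joins every scraped link against base_url and drops anything urljoin_safe rejects. A self-contained approximation follows; since urljoin_safe's rejection rules are not shown here, the sketch models them with a simple http/https scheme check, which is an assumption rather than wpull's actual behavior.

from urllib.parse import urljoin, urlparse

def iter_absolute_links(links, base_url):
    # Yield absolute http/https links; drop everything else. The scheme check
    # is an assumed approximation of urljoin_safe returning None.
    for link in links:
        absolute = urljoin(base_url, link)
        if urlparse(absolute).scheme in ('http', 'https'):
            yield absolute

# list(iter_absolute_links(['page.html', 'mailto:someone@example.com'],
#                          'http://example.com/'))
# -> ['http://example.com/page.html']
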
Code Example #5
File: base.py  Project: charygao/wpull
    def iter_processed_links(self, file, encoding=None, base_url=None):
        """Return the links.

        Returns:
            iterator: Each item is a str which represents a link.
        """
        for link in self.iter_links(file, encoding):
            new_link = urljoin_safe(base_url, link, allow_fragments=False)
            if new_link:
                yield new_link
Code Example #6
File: javascript.py  Project: pombredanne/wpull
    def iter_processed_text(self, file, encoding=None, base_url=None):
        for text, is_link in self.iter_text(file, encoding):
            if is_link:
                try:
                    new_text = json.loads('"{0}"'.format(text))
                except ValueError:
                    yield (text, False)
                    continue

                if is_unlikely_link(new_text) or not is_likely_link(new_text):
                    yield (text, False)
                    continue

                if base_url:
                    new_link = urljoin_safe(base_url, new_text, allow_fragments=False)
                else:
                    new_link = new_text

                if new_link:
                    yield (new_link, identify_link_type(new_link) or True)
                else:
                    yield (text, False)
            else:
                yield (text, False)
Code Example #7
File: base.py  Project: fakegit/ludios_wpull
    def iter_processed_text(self, file, encoding=None, base_url=None):
        '''Return the file text and processed absolute links.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.
            base_url (str): The URL at which the document is located.

        Returns:
            iterator: Each item is a tuple:

            1. str: The text
            2. bool: Whether the text is a link
        '''
        for text, is_link in self.iter_text(file, encoding):
            if is_link and base_url:
                new_link = urljoin_safe(base_url, text, allow_fragments=False)

                if new_link:
                    yield (new_link, is_link)
                else:
                    yield (new_link, False)
            else:
                yield (text, is_link)
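
Because this iterator yields every piece of text, not just the links, concatenating its output reproduces the document with link text rewritten to absolute URLs. The sketch below shows that usage end to end; the regex-based href tokenizer is purely illustrative and is not wpull's parser.

import re
from urllib.parse import urljoin

def iter_text(markup):
    # Toy tokenizer: split on href="..." and flag the captured URL as a link.
    for i, part in enumerate(re.split(r'href="([^"]*)"', markup)):
        yield (part, i % 2 == 1)

def absolutize(markup, base_url):
    # Rebuild the markup, rewriting only the link tokens to absolute URLs.
    out = []
    for text, is_link in iter_text(markup):
        if is_link:
            out.append('href="{0}"'.format(urljoin(base_url, text)))
        else:
            out.append(text)
    return ''.join(out)

# absolutize('<a href="a.html">x</a>', 'http://example.com/')
# -> '<a href="http://example.com/a.html">x</a>'
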
Code Example #8
File: base.py  Project: charygao/wpull
    def iter_processed_text(self, file, encoding=None, base_url=None):
        """Return the file text and processed absolute links.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.
            base_url (str): The URL at which the document is located.

        Returns:
            iterator: Each item is a tuple:

            1. str: The text
            2. bool: Whether the text is a link
        """
        for text, is_link in self.iter_text(file, encoding):
            if is_link and base_url:
                new_link = urljoin_safe(base_url, text, allow_fragments=False)

                if new_link:
                    yield (new_link, is_link)
                else:
                    yield (new_link, False)
            else:
                yield (text, is_link)
Code Example #9
File: html.py  Project: flatron18116/wpull
    def _process_elements(self, elements, response, base_url, link_contexts):
        robots_check_needed = self._robots
        robots_no_follow = False
        inject_refresh = True
        doc_base_url = None

        for element in elements:
            if not isinstance(element, Element):
                continue

            if robots_check_needed and ElementWalker.robots_cannot_follow(element):
                robots_check_needed = False
                robots_no_follow = True

            if not doc_base_url and element.tag == 'base':
                doc_base_url = urljoin_safe(
                    base_url, clean_link_soup(element.attrib.get('href', ''))
                )

            link_infos = self._element_walker.iter_links_element(element)

            if inject_refresh and 'Refresh' in response.fields:
                link = parse_refresh(response.fields['Refresh'])

                if link:
                    link_info = LinkInfo(
                        element=None, tag='_refresh', attrib=None,
                        link=link,
                        inline=False, linked=True,
                        base_link=None, value_type='refresh',
                        link_type=None  # treat it as a redirect
                    )
                    link_infos = itertools.chain(link_infos, [link_info])

                inject_refresh = False
            else:
                inject_refresh = False

            for link_info in link_infos:
                if self._only_relative:
                    if link_info.base_link or '://' in link_info.link:
                        continue

                if not self._is_accepted(link_info.tag):
                    continue

                element_base_url = doc_base_url or base_url

                if link_info.base_link:
                    clean_base_url = clean_link_soup(link_info.base_link)

                    if clean_base_url:
                        element_base_url = urljoin_safe(
                            base_url, clean_base_url
                        ) or base_url

                cleaned_url = clean_link_soup(link_info.link)

                if not cleaned_url:
                    continue

                url = urljoin_safe(
                    element_base_url,
                    cleaned_url,
                    allow_fragments=False
                )

                if url:
                    link_contexts.add(LinkContext(
                        url,
                        inline=link_info.inline,
                        linked=link_info.linked,
                        link_type=link_info.link_type,
                        extra=link_info,
                    ))

        return {'robots_no_follow': robots_no_follow}
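
Besides <base> handling and robots checks, this method also injects the target of an HTTP Refresh header as one extra outgoing link via parse_refresh. The sketch below is a plausible standalone version of that step, assuming the common "<seconds>; url=<target>" header form; it is a stand-in for parse_refresh, not its implementation.

import re
from urllib.parse import urljoin

def refresh_link(refresh_value, base_url):
    # Extract the target from a Refresh header such as '5; url=/next.html'.
    # Assumed stand-in for wpull's parse_refresh, not its actual code.
    match = re.search(r"url\s*=\s*['\"]?([^'\"\s;]+)", refresh_value,
                      re.IGNORECASE)
    if match:
        return urljoin(base_url, match.group(1))
    return None

# refresh_link('5; url=/next.html', 'http://example.com/page.html')
# -> 'http://example.com/next.html'
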