Beispiel #1
0
    def __validate_link__(self, obj, parent=None):
        """
        Build a Page from a link-like tag and validate it against the
        crawler's CONDITIONS filters.

        Parameters
        --------------

            obj : bs4 tag,
                Beautiful Soup Tag (``<form>`` or an ``href``-bearing tag)
            parent : str (optional)
                Parent url, used as a fallback form action

        Returns the Page when it passes every filter and has not been
        visited yet; otherwise returns None.
        """

        page = Page()
        if obj.name == 'form':
            # Fall back to the parent url when the form has no action.
            action = obj.get('action')
            page.url = action if action else parent
            method_attr = obj.get('method')
            page.method = method_attr.lower() if method_attr else 'post'
            # Fill every visible input with a randomly sampled payload.
            fields = obj.find_all(
                lambda tag: tag.name == "input" and tag.attrs.get(
                    'type') not in ('submit', 'hidden'))
            page.payload = {
                field.get('name'): random.sample(self.payloads, 1)
                for field in fields
            }
        else:
            # Plain link: strip the query string from the href.
            page.url = urlparse(obj.get('href'))._replace(query=None).geturl()

        if not page.url:
            return None

        # Normalize relative links before running the filters.
        page.url = self.CONDITIONS['part_of_link'](self, page.url)
        accepted = (
            self.CONDITIONS['is_not_visited'](self, (page.url, page.method))
            and self.CONDITIONS['is_current_host'](self, page.url)
            and self.CONDITIONS['not_in_ext'](self, page.url)
            and self.CONDITIONS['is_url'](self, page.url)
        )
        if accepted:
            self.visited_urls.add((page.url, page.method))
            return page
Beispiel #2
0
    def can_be_crawled(page: Page) -> bool:
        """
        Check whether a page is allowed to be crawled by robots.txt.

        Args:
            page: the page to check; its ``domain_url()`` selects the
                robots.txt parser and its ``path()`` is the tested path.
        Returns:
            True if ``URLGetter.USERAGENT`` is allowed to fetch the
            page's path according to the domain's robots.txt.
        """

        # Parser is resolved per-domain by RobotsProvider.
        robots_parser = RobotsProvider._get_robots_parser(page.domain_url())
        result = robots_parser.can_fetch(URLGetter.USERAGENT, page.path())
        # Lazy %-args: the message is only formatted if DEBUG is enabled.
        robots_logger.debug(
            "checking page is crawlable %s for domain %s: %s",
            page.url(), page.domain_url(), result)
        return result
Beispiel #3
0
# coding: utf-8
from handlers.page import PageAnalytic
from page import Page

# Measure and report the size of a single page.
page = Page()
page.url = "http://globoesporte.globo.com"
analytic = PageAnalytic(page.url)
page.length = analytic.get_page_size()

print(page.length)
Beispiel #4
0
from page import Page

# Verify that Page preserves the exact URL string it was built from.
urls = [
    'http://www.ozon.ru/context/shoes/',
    'http://www.ozon.ru/context/shoes',
    'https://www.lamoda.ru/1',
    'https://www.lamoda.ru/',
]
for original in urls:
    page = Page(original)
    round_tripped = page.url()
    matches = round_tripped == original
    if not matches:
        print("p.url(): '{}' not equal to string: '{}'".format(
            round_tripped, original))
    assert matches