import random
from urllib.parse import urlparse

from page import Page


def __validate_link__(self, obj, parent=None):
    """
    Check a link or form and turn it into a Page.

    Parameters
    ----------
    obj : bs4.element.Tag
        Beautiful Soup tag (an ``<a>`` or ``<form>`` element).
    parent : str, optional
        Parent url, used when a form has no ``action``.
    """
    page = Page()
    if obj.name == 'form':
        # A form targets its action URL; fall back to the parent page.
        page.url = obj.get('action') if obj.get('action') else parent
        page.method = obj.get('method').lower() if obj.get('method') else 'post'
        # Fill every non-submit, non-hidden input with a random payload.
        page.payload = {
            i.get('name'): random.sample(self.payloads, 1)
            for i in obj.find_all(
                lambda tag: tag.name == "input"
                and tag.attrs.get('type') not in ('submit', 'hidden'))
        }
    else:
        # Plain link: keep the href, dropping its query string.
        page.url = urlparse(obj.get('href'))._replace(query=None).geturl()
    if page.url:
        page.url = self.CONDITIONS['part_of_link'](self, page.url)
        if (self.CONDITIONS['is_not_visited'](self, (page.url, page.method))
                and self.CONDITIONS['is_current_host'](self, page.url)
                and self.CONDITIONS['not_in_ext'](self, page.url)
                and self.CONDITIONS['is_url'](self, page.url)):
            self.visited_urls.add((page.url, page.method))
            return page
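# The method above leans on attributes it does not define: `payloads`
# (candidate fuzz values), `CONDITIONS` (a dict of predicates called with the
# instance as first argument), and `visited_urls`. A minimal sketch of that
# surrounding crawler class follows; only the attribute and key names come
# from the method above, every predicate body is a hypothetical stand-in.
import re


class Crawler:
    CONDITIONS = {
        # Make relative links absolute against the crawled host.
        'part_of_link': lambda self, url: url if '://' in url else self.host + url,
        'is_not_visited': lambda self, key: key not in self.visited_urls,
        'is_current_host': lambda self, url: self.host in url,
        'not_in_ext': lambda self, url: not url.lower().endswith(
            ('.jpg', '.png', '.gif', '.pdf', '.zip')),
        'is_url': lambda self, url: re.match(r'https?://', url) is not None,
    }

    def __init__(self, host, payloads):
        self.host = host            # e.g. "http://example.com" (assumed form)
        self.payloads = payloads    # values injected into form inputs
        self.visited_urls = set()   # (url, method) pairs already seen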
def can_be_crawled(page: Page) -> bool:
    """
    Args:
        `page`: Page whose URL is checked against its domain's robots.txt.

    Returns:
        whether the page with the given URL is allowed to be crawled.
    """
    robots_parser = RobotsProvider._get_robots_parser(page.domain_url())
    result = robots_parser.can_fetch(URLGetter.USERAGENT, page.path())
    robots_logger.debug(
        "checking page is crawlable {} for domain {}: {}".format(
            page.url(), page.domain_url(), result))
    return result
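# A hedged sketch of the robots.txt plumbing used above, built on the
# standard-library urllib.robotparser. The caching scheme, the USERAGENT
# value, and the assumption that domain_url carries a scheme are all mine;
# the real RobotsProvider and URLGetter may differ.
import logging
from urllib.robotparser import RobotFileParser

robots_logger = logging.getLogger("robots")


class URLGetter:
    USERAGENT = "MyCrawler/1.0"  # hypothetical user agent string


class RobotsProvider:
    _parsers = {}  # cache: domain URL -> parsed robots.txt

    @classmethod
    def _get_robots_parser(cls, domain_url):
        if domain_url not in cls._parsers:
            parser = RobotFileParser(domain_url.rstrip("/") + "/robots.txt")
            parser.read()
            cls._parsers[domain_url] = parser
        return cls._parsers[domain_url]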
# coding: utf-8
from handlers.page import PageAnalytic
from page import Page

# Fetch a page and record its size on the Page object.
page = Page()
page.url = "http://globoesporte.globo.com"

pg = PageAnalytic(page.url)
page.length = pg.get_page_size()
print(page.length)
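# The handlers.page module is not shown. Below is a minimal stand-in for a
# PageAnalytic that would satisfy this script, assuming get_page_size means
# the byte length of the fetched body; the real handler may measure
# something else entirely.
from urllib.request import urlopen


class PageAnalytic:
    def __init__(self, url):
        self.url = url

    def get_page_size(self):
        # Download the page and report its size in bytes.
        with urlopen(self.url) as response:
            return len(response.read())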
from page import Page

for s in [
    'http://www.ozon.ru/context/shoes/',
    'http://www.ozon.ru/context/shoes',
    'https://www.lamoda.ru/1',
    'https://www.lamoda.ru/',
]:
    p = Page(s)
    condition = p.url() == s
    if not condition:
        print("p.url(): '{}' not equal to string: '{}'".format(p.url(), s))
    assert condition
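# The page module itself is not included. A minimal accessor-style Page that
# passes the round-trip check above and supplies the domain_url()/path()
# calls used in can_be_crawled: it stores the exact string it was built from,
# so url() preserves trailing slashes untouched. Only the method names are
# taken from the snippets above; the bodies are assumptions.
from urllib.parse import urlparse


class Page:
    def __init__(self, url):
        self._url = url
        self._parsed = urlparse(url)

    def url(self):
        return self._url

    def domain_url(self):
        return "{}://{}".format(self._parsed.scheme, self._parsed.netloc)

    def path(self):
        return self._parsed.path or "/"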