Example #1
def crawl_website(website):
    website.update_robots_txt()  # only updates if necessary
    rules = RobotExclusionRulesParser()
    rules.parse(website.robots_content)

    # TODO add check for site last updated timestamp

    # Has the index been retrieved yet?
    if not website.webpage_set.exists():
        # get index
        if rules.is_allowed('*', '/'):
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=True,
                website=website,
            )
            crawl_existing_webpage(webpage, rules)
        else:
            # create a placeholder index webpage
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=False,
                website=website,
            )
            print('Robots not allowed to index root')
            return None

    # Are there webpages to be accessed?
    allowed_webpages = website.webpage_set.filter(robots_allowed=True)
    if not allowed_webpages.exists():
        # print('no allowed webpages found for {website}'.format(website=website.url))
        return None

    # Are there new links to try out?
    new_webpages = allowed_webpages.filter(exists=None)
    if new_webpages.exists():
        # start with the oldest first
        # created and updated are the same for newly-created webpages
        webpage = new_webpages.order_by('created').first()
        print('crawling new')
        return crawl_existing_webpage(webpage, rules)

    # Crawl an existing webpage
    if rules.is_allowed('*', '/foo.html'):
        webpage = allowed_webpages.filter(
            exists=True).order_by('updated').first()
        print('crawling existing')
        return crawl_existing_webpage(webpage, rules)
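
Example #1 leans on two helpers that are not shown: `website.update_robots_txt()` (which, per its comment, refetches robots.txt only when necessary) and `crawl_existing_webpage()`. Below is a minimal sketch of the refresh step, written as a standalone function under stated assumptions: only `robots_content` and `website.url` appear in the example above; `robots_updated`, `ROBOTS_MAX_AGE`, and the use of `requests` are hypothetical.

# Hypothetical sketch of the robots.txt refresh step; field and constant names
# other than robots_content and url are assumptions, not taken from Example #1.
from datetime import timedelta
from urllib.parse import urljoin

import requests
from django.utils import timezone

ROBOTS_MAX_AGE = timedelta(days=1)  # refetch robots.txt at most once a day

def update_robots_txt(website):
    """Refetch robots.txt only if the cached copy is missing or stale."""
    if website.robots_content and website.robots_updated:
        if timezone.now() - website.robots_updated < ROBOTS_MAX_AGE:
            return  # cached copy is still fresh; nothing to do
    response = requests.get(urljoin(website.url, '/robots.txt'), timeout=10)
    website.robots_content = response.text if response.ok else ''
    website.robots_updated = timezone.now()
    website.save(update_fields=['robots_content', 'robots_updated'])
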
Example #2
class RobotsTxt:
    '''
    Wrapper around robots.txt parser that adds the date the file was fetched.

    If the ``robots_file`` is None or cannot be parsed, then it's treated as a
    highly permissive robots.txt.
    '''
    def __init__(self, robots_doc):
        ''' Initialize from database document representation. '''
        self._updated_at = robots_doc['updated_at']
        self._robots = RobotExclusionRulesParser()

        if robots_doc['file'] is not None:
            try:
                self._robots.parse(robots_doc['file'])
            except Exception:
                # Unparseable robots.txt: fall back to the permissive default.
                pass

    def is_allowed(self, user_agent, url):
        ''' Return True if ``url`` is allowed by this robots.txt file. '''
        return self._robots.is_allowed(user_agent, url)

    def is_older_than(self, age):
        ''' Return True if this robots file is older than ``age``. '''
        return (datetime.now(tzlocal()) - self._updated_at).total_seconds() > age
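
A short usage sketch for the wrapper above, assuming a `robots_doc` dict shaped like the one the constructor reads (`updated_at` and `file` keys); the concrete values are illustrative only.

from datetime import datetime
from dateutil.tz import tzlocal

doc = {
    'updated_at': datetime.now(tzlocal()),
    'file': "User-agent: *\nDisallow: /private/\n",
}
robots = RobotsTxt(doc)
robots.is_allowed('mybot', '/private/page.html')  # -> False
robots.is_allowed('mybot', '/public/page.html')   # -> True
robots.is_older_than(24 * 3600)                   # -> False (just fetched)
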
Example #3
class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        try:
            robotstxt_body = robotstxt_body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
            # Switch to 'allow all' state.
            logger.warning("Failure while parsing robots.txt using %(parser)s."
                           " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                           {'parser': "RobotExclusionRulesParser"},
                           exc_info=sys.exc_info(),
                           extra={'spider': self.spider})
            robotstxt_body = ''
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
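
A usage sketch for the Scrapy-style parser above. It assumes the surrounding module provides `logger`, `sys`, and Scrapy's `to_unicode`, which the excerpt omits; passing `crawler=None` yields a spider-less parser.

raw = b"User-agent: *\nDisallow: /admin/\n"
parser = RerpRobotParser.from_crawler(None, raw)
parser.allowed("https://example.com/admin/users", "mybot")  # -> False
parser.allowed("https://example.com/index.html", "mybot")   # -> True
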
Example #4
class RerpWrapper(python_common.web.robots_txt.parser_base.RobotsTxtParser):
    def __init__(self, content=None, expires=None):
        super(RerpWrapper, self).__init__(content, expires)
        if content:
            self.parser = RobotExclusionRulesParser()
            self.parser.use_local_time = False
            self.parser.expiration_date = self.expires
            self.parser.parse(content)
        else:
            self.parser = None
            self.my_super = super(RerpWrapper, self)

    def allowed(self, user_agent, url):
        return self.parser.is_allowed(
            user_agent, url) if self.parser else self.my_super.allowed(
                user_agent, url)

    def delay(self, user_agent):
        return self.parser.get_crawl_delay(
            user_agent) if self.parser else self.my_super.delay(user_agent)

    @property
    def expired(self):
        return self.parser.is_expired if self.parser else self.my_super.expired

    @property
    def sitemaps(self):
        return self.parser.sitemaps if self.parser else self.my_super.sitemaps
Example #5
class Robot:
    def __init__(self, url):
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        return not self.rerp.is_allowed('Mozilla/5.0', url.url())
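
A usage sketch for the `Robot` helper above; it assumes the same `Url` wrapper the class itself uses (an object whose `url()` method returns the URL string), and it performs a network fetch of robots.txt on construction.

robot = Robot('https://example.com/some/page.html')  # fetches https://example.com/robots.txt
delay = robot.throttle_time()  # Crawl-delay for Mozilla/5.0, or None if unset
if not robot.should_block(Url('https://example.com/other/page.html')):
    pass  # safe to fetch
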
Example #6
    def allowed_url(self):
        #FIXME: Should use the geturl address as it may have been redirected
        scheme, netloc, path, query, fragment = urlsplit(self.url)
        robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

        #FIXME: Should cache robots.txt in a better persistent data structure
        if robot_url in ROBOT_CACHE:
            rp = ROBOT_CACHE[robot_url]
        else:
            rp = RobotExclusionRulesParser()
            try:
                rp.fetch(robot_url)
            # Currently if there's a problem we assume there is no robots.txt
            except IOError:
                # Should be catching the urllib2.URLError exception
                logging.debug("Couldn't retrieve robots.txt for %s" %
                              robot_url)
                rp = None
            except UnicodeDecodeError:
                logging.debug("Unicode decode error for robots.txt at %s" %
                              robot_url)
                rp = None
            except httplib.HTTPException:
                logging.debug("Generic HTTPException for robots.txt at %s" %
                              robot_url)
                rp = None
            ROBOT_CACHE[robot_url] = rp

        if rp is None or rp.is_allowed("*", self.url):
            base_url = urlunsplit([scheme, netloc, "", "", ""])

            # If there's a current delay on the site respect robots.txt and stall
            if self.db.exists(netloc):
                logging.debug("Obeying robot overlord for %s..." % netloc)
                URLHandler.add_to_busy(self.db, self.url)
                return False

            # Set a delay for any other requests to this site to respect robots.txt
            delay = rp.get_crawl_delay("*") if rp else None
            if delay:
                delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
            else:
                delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
            self.db.setex(netloc, "1", delay)

            return True
        else:
            return False
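
The method above depends on module-level state that is not shown. A hedged sketch of what those objects might look like; the names come from the code, the concrete values are illustrative.

# Hypothetical module-level state assumed by allowed_url().
ROBOT_CACHE = {}                          # maps robots.txt URL -> parser instance (or None on failure)
SETTINGS = {"DEFAULT_ROBOTS_DELAY": 10}   # fallback crawl delay, in seconds
# self.db is a Redis-style client used for per-host delays via exists()/setex().
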
Example #7
def is_url_allowed(url):
    """
    Returns ``True`` if robots.txt rules for given URL allow fetching it. This
    function parses the robots rules for given URL (if any) and returns a
    boolean flag that tells you whether fetching it is allowed. Note that it
    doesn't test whether the URL exists on the host.

    :param url:     URL to test
    :returns:       ``True`` if URL can be fetched, ``False`` otherwise
    """
    robots = RobotParser()
    robots.user_agent = UA_STRING
    robots.fetch(get_robots_url(url))
    if robots.response_code != 200:
        return True
    return robots.is_allowed(UA_STRING, url)
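
Example #7 calls a `get_robots_url()` helper and a `UA_STRING` constant that are not shown. A minimal sketch of what they might look like, following the same urlsplit/urlunsplit approach used in Examples #6 and #8; both names here are assumptions.

from urllib.parse import urlsplit, urlunsplit

UA_STRING = 'MyCrawler/1.0'  # hypothetical user-agent string

def get_robots_url(url):
    """Return the robots.txt URL for the host that serves ``url``."""
    scheme, netloc, _path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, '/robots.txt', '', ''))
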
Example #8
  def allowed_url(self):
    #FIXME: Should use the geturl address as it may have been redirected
    scheme, netloc, path, query, fragment = urlsplit(self.url)
    robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

    #FIXME: Should cache robots.txt in a better persistent data structure
    if robot_url in ROBOT_CACHE:
      rp = ROBOT_CACHE[robot_url]
    else:
      rp = RobotExclusionRulesParser()
      try:
        rp.fetch(robot_url)
      # Currently if there's a problem we assume there is no robots.txt
      except IOError:
        # Should be catching the urllib2.URLError exception
        logging.debug("Couldn't retrieve robots.txt for %s" % robot_url)
        rp = None
      except UnicodeDecodeError:
        logging.debug("Unicode decode error for robots.txt at %s" % robot_url)
        rp = None
      except httplib.HTTPException:
        logging.debug("Generic HTTPException for robots.txt at %s" % robot_url)
        rp = None
      ROBOT_CACHE[robot_url] = rp

    if rp is None or rp.is_allowed("*", self.url):
      base_url = urlunsplit([scheme, netloc, "", "", ""])

      # If there's a current delay on the site respect robots.txt and stall
      if self.db.exists(netloc):
        logging.debug("Obeying robot overlord for %s..." % netloc)
        URLHandler.add_to_busy(self.db, self.url)
        return False

      # Set a delay for any other requests to this site to respect robots.txt
      delay = rp.get_crawl_delay("*") if rp else None
      if delay:
        delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
      else:
        delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
      self.db.setex(netloc, "1", delay)

      return True
    else:
      return False
Example #9
class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
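
Example #9 factors the decoding out into `decode_robotstxt()`. A sketch of what that helper might do, mirroring the inline UTF-8 handling shown in Example #3: undecodable bytes are logged and treated as an empty, allow-all file.

import logging

logger = logging.getLogger(__name__)

def decode_robotstxt(robotstxt_body, spider):
    """Decode raw robots.txt bytes, falling back to an empty file on bad encodings."""
    try:
        return robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        logger.warning("robots.txt could not be decoded as UTF-8; treating it as an empty file.",
                       extra={'spider': spider})
        return ''
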
Example #10
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
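
A usage sketch for the benchmark above: the `website` dict only needs a raw `robotstxt` string and a list of `links` to test; the values below are illustrative.

import timeit

website = {
    'robotstxt': "User-agent: *\nDisallow: /search\nCrawl-delay: 5\n",
    'links': ['https://example.com/', 'https://example.com/search?q=robots'],
}
# Roughly time 1,000 parse-and-check passes.
print(timeit.timeit(lambda: benchmark_rerp_parser(website), number=1000))
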
Example #11
class CrawlerWorker(QObject):
    # Spider that collects the links of a website; one instance is created per crawl.
    finish = False

    def __init__(self, info, instance, parent=None):
        super(CrawlerWorker, self).__init__(parent)
        self._instance = instance
        self.running = True
        self.base_url = info['base_url']  # main url of website
        self._links_to_crawl = []  # list of links yet to open
        self.crawled_links = {}  # dictionary of links opened/all links
        self.__parsed_crawled = {}  # list of urls and their html pages
        self.total = 0  # total number of found links
        self.total_crawled = 0  # total number of valid crawled links in website
        self.max_pages = info['max_crawl']  # max pages to crawl
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # list of broken links found
        self.dynamic = []
        self.info = info
        self.login_url = info['login_url']  # login page url if available
        if info['robo_url']:
            self._rb_parser = RobotExclusionRulesParser()
            self._rb_parser.fetch(info['robo_url'])
            self._user_agent = 'WASecBot'
        else:
            self._rb_parser = None
        self.browser = browser.RoboBrowser(parser="html.parser",
                                           user_agent="WASecBot")
        self.browser.session.verify = False
        self._logged_in = False
        self.running = True
        self._instance.btncrawlcancel.clicked.connect(self.pause)
        self._elapsed = 0
        self.delay = 15
        self._requests = 0
        self.start = None
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _opener(self, url):
        retry = 1
        while True:
            try:
                self.browser.open(url=url)
                break
            except exceptions.ConnectionError as ce:
                # sleep(self.delay * retry)
                if retry == 11:
                    return False
                else:
                    retry += 1
        return True

    def _compute_crawl_delay(self):
        self._requests += 1
        if self._requests <= 10:
            self._elapsed += self.browser.response.elapsed.total_seconds()
            delay = self._elapsed / self._requests
            self.delay = delay * 200
            if self.delay >= 180:
                self.delay = 15
        else:
            self._requests = 1
            self._elapsed = self.browser.response.elapsed.total_seconds()
            self.delay = self._elapsed * 200

    def pause(self):
        self.running = False
        self._instance.change_state.emit('Canceling...')
        choice = QtWidgets.QMessageBox.question(
            self._instance, "Cancel Crawl!",
            "WASec is not finished yet, are You sure you want to stop crawling?",
            QtWidgets.QMessageBox.Cancel | QtWidgets.QMessageBox.Yes)
        if choice == QtWidgets.QMessageBox.Yes:
            self.finish = True
            self.running = False
            self._instance.crawl_finished.emit(self._wrap_up())
        else:
            self.running = True

    # get total number of links opened so far
    def total_links(self):
        total = 0
        for index in self.crawled_links:
            total += len(self.crawled_links[index]['url'])
        return total

    # check if max pages reached
    def _crawled_max(self):
        result = (self.max_pages == 0) or (self.max_pages > self.total_links())
        return result

    # is link already listed
    def _is_link_listed(self, link):
        self._instance.change_state.emit('Check if URL is listed...')
        url = parse.urljoin(self.base_url, link)
        result = False
        for index in self.crawled_links:
            for opened in self.crawled_links[index]['url'].keys():
                if url == opened or link == opened:
                    result = True
        for to_open in self._links_to_crawl:
            if link == to_open[1] or url == to_open[1]:
                result = True
        return result

    # gets dynamic urls
    def _is_dynamic(self, url):
        self._instance.change_state.emit('Check if URL is dynamic...')
        if '?' in str(url) or '=' in str(url):
            self.dynamic.append(url)

    # check if page opened and exists
    def _is_response_ok(self, url):
        # status_code 200 means OK; no problems with page
        if 200 == self.browser.response.status_code:
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    def _is_html_page(self, url):
        try:
            if 'text/html' in self.browser.response.headers["content-type"]:
                return True
            else:
                self.invalid_links_count += 1
                self.invalid_links_list.append(url)
                self._instance.change_state.emit('URL is invalid!')
                return False
        except KeyError:
            return True

    def _is_same_page(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            page = self.browser.parsed
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    check = self.__parsed_crawled[link]
                    if check == page:
                        self._instance.change_state.emit('URL is invalid!')
                        return False
            return True
        else:
            self.finish = True
            self.running = False
            return False

    def _page_wise(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            return self._is_response_ok(url) and self._is_html_page(
                url) and self._is_same_page(url)
        else:
            self.finish = True
            self.running = False
            return False

    def _is_same_query(self, page_link):
        parsed_url = parse.urlparse(page_link)
        query = parse.parse_qsl(parsed_url.query)
        query_len = len(query)
        if query_len > 0:
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    parsed_link = parse.urlparse(link)
                    link_query = parse.parse_qsl(parsed_link.query)
                    if (parsed_link.path
                            == parsed_url.path) and (len(link_query)
                                                     == query_len):
                        i = n = 0
                        while i < query_len:
                            if query[i][0] == link_query[i][0]:
                                n += 1
                            i += 1
                        if n == query_len:
                            # result = self._is_same_page(page_link)
                            # return result
                            self._instance.change_state.emit('URL is invalid!')
                            print("is same query")
                            return False
        return True

    # check if given url belongs to website
    # i.e. is in the website's domain
    def _in_domain(self, url):
        if self.base_url in url:  # url belongs to the website's domain
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check for url protocol
    def _check_protocol(self, url):
        parsed = parse.urlparse(url)  # parse url to get information from it
        protocol = str.lower(str(parsed[0]))  # get url protocol
        if protocol == "http" or protocol == "https":  # is protocol 'http' or 'https'
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            self._instance.change_state.emit('URL is invalid!')
            return False

    def _is_robot_allowed(self, path):
        if self._rb_parser:
            return self._rb_parser.is_allowed(self._user_agent, path)
        else:
            return True

    def _url_wise(self, url):
        return self._in_domain(url) and self._check_protocol(
            url) and self._is_same_query(url)

    def _is_url_good(self, url):
        return self._url_wise(url) and self._page_wise(url)

    def _at_login(self, url):
        if not self.login_url or self.login_url != str(url):
            return False
        elif self.login_url == str(url):
            return True

    def _check_login(self, parsed):
        if self.info['logged_in']:
            self._instance.change_state.emit('Logging into the website...')
            handel = BeyondLogin(self.browser)
            self._logged_in = handel.get_login_info(self.info)
            parent = self._check_parent(handel.login_url)
            if self._logged_in:
                self._instance.change_state.emit('Login Successful!')
                # sleep(2)
                if parent:
                    self._add_crawled(handel.login_url, parent, parsed)
                else:
                    self._add_crawled(handel.login_url, self.base_url, parsed)
            else:
                self._instance.change_state.emit('Login Failed!')

            self._links_to_crawl.append(
                [handel.login_url, handel.redirect_url])
        else:
            self._instance.change_state.emit('Login Successful!')
            self._logged_in = True

    def _check_parent(self, url):
        for child in self._links_to_crawl:
            if child[1] == url:
                return child[0]
        return None

    # get all links in a given page
    def _get_page_links(self, url, page):
        self._instance.change_state.emit('Searching for all links in page...')
        # gets a list of all <a> tags in page
        links_tags = page.find_all("a")
        # going through each link
        for link in links_tags:
            self._instance.change_state.emit(
                'Searching for all links in page...')
            # Get the <a> tag's href value, e.g. <a href="page.html"> ==> page.html
            link_href = link.get("href")
            # check that: link isn't already listed + link isn't blank
            link_listed = self._is_link_listed(link_href)
            if (not link_listed) and ('#' not in str(link_href)):
                # add link to list of links to open
                self._links_to_crawl.append([url, link_href])
                print("_get_page_links")
                print(url, link_href)
                self.total += 1

        forms = page.find_all("form")
        for form in forms:
            action = form.get("action")
            if action:  # link isn't blank
                # check that: link isn't already listed +
                link_listed = self._is_link_listed(action)
                if (not link_listed) and (action != "#"):
                    # add link to list of links to open
                    self._links_to_crawl.append([url, action])
                    self.total += 1
        self._instance.show_total.emit(self.total)

        image_map = page.find_all('area')
        for area in image_map:
            # Get the 'href' attribute from an <area> tag, e.g. <area shape="rect" href="#main">
            href = area.get('href')
            listed = self._is_link_listed(href)
            if (not listed) and ('#' not in href):
                # add link to list of links to open
                self._links_to_crawl.append([url, href])
                self.total += 1
        self._instance.show_total.emit(self.total)

    # open a page and get its content
    def _open_url(self, url):
        if self.running:
            self._instance.change_state.emit('Pausing between requests...')
            # get page content
            parsed = self.browser.parsed
            if self.info['max_crawl'] != 1:
                self._get_page_links(
                    url, parsed)  # send content to retrieve links from

                # sleep(self.delay)
            else:
                self._add_crawled(url, url, parsed)
                self._is_dynamic(url)
                self._instance.show_total.emit(self.total_crawled)
            if self._at_login(url) and not self._logged_in:
                self._check_login(parsed)
            return parsed

    def _add_crawled(self, url, parent, parsed_page):
        self._instance.change_state.emit('Adding new crawled link...')
        found = False
        try:
            title = parsed_page.find('title')
            if not title:
                title = 'NO-TITLE'
            else:
                title = title.text
        except Exception:
            title = 'NO-TITLE'

        for index in self.crawled_links:
            if self.crawled_links[index]['from'] == parent:
                self.crawled_links[index]['url'][url] = title
                found = True
                break
        if not found:
            self.crawled_links[self.total_crawled] = {
                'from': parent,
                'url': {
                    url: title
                }
            }
            self.total_crawled += 1
        self.__parsed_crawled[url] = parsed_page
        self._instance.on_info.emit(self.crawled_links)
        # sleep(2)

    # main spider function; creates our spider's web
    def run(self):
        self.start = datetime.now().time()
        self._opener(self.base_url)
        self._open_url(self.base_url)  # send main url to be opened and checked
        self._elapsed = self.browser.state.response.elapsed.total_seconds()
        self._compute_crawl_delay()
        # while there are still links to open
        self.i = len(self._links_to_crawl) - 1
        while (len(self._links_to_crawl)) > 0 and (
                self._crawled_max()) and not self.finish:
            self._instance.change_state.emit('Crawling...')
            # start from the last link in the list
            parent = self._links_to_crawl[self.i][0]
            link = self._links_to_crawl[self.i][1]
            print("----")
            print(parent, link)
            if parent[len(parent) - 1] != '/':
                parent = parent + '/'
            # url = parse.urljoin(self.base_url, link)  # join main url with page link
            url = parse.urljoin(parent, link)  # join main url with page link
            self._opener(url)
            if 200 != self.browser.response.status_code:
                url = parse.urljoin(self.base_url,
                                    link)  # join main url with page link
            print(url)
            if self._is_url_good(url) and self._is_robot_allowed(
                    link):  # is url valid and working
                print("good")
                self._instance.change_state.emit('URL is good!')
                parsed_page = self._open_url(url)  # open page
                self._add_crawled(url, parent, parsed_page)
                self._compute_crawl_delay()
                # add link to list of opened links
                self._is_dynamic(url)
            else:
                print("not good")
                self._instance.change_state.emit('URL is not good!')
            # delete opened link from list of links to open
            self._links_to_crawl.pop(self.i)
            if self.i > 0:
                self.i = self.i - 1
            elif self.i == 0:
                self.i = len(self._links_to_crawl) - 1
            if len(self._links_to_crawl) == 0 or self.i < 0:
                self._instance.change_state.emit('Finished.')
                self.finish = True
                break
        self.finish = True
        self._instance.crawl_finished.emit(self._wrap_up())

    def _calc_time(self):
        finish = datetime.now().time()
        delta1 = timedelta(seconds=self.start.second,
                           microseconds=self.start.microsecond,
                           minutes=self.start.minute,
                           hours=self.start.hour)
        delta2 = timedelta(seconds=finish.second,
                           microseconds=finish.microsecond,
                           minutes=finish.minute,
                           hours=finish.hour)
        taken = delta2 - delta1
        seconds = round(taken.total_seconds())
        if seconds >= 3600:
            hours = seconds // 3600
            minutes = round((seconds % 3600) / 60)
            elapsed = str(hours) + ':' + str(minutes) + ' hrs'
        elif seconds >= 60:
            minutes = seconds // 60
            seconds = round(seconds % 60)
            elapsed = str(minutes) + '.' + str(seconds) + ' mins'
        else:
            elapsed = str(seconds) + ' secs'
        return elapsed

    def _wrap_up(self):
        wrap = {
            'links': self.crawled_links,
            'dynamic': self.dynamic,
            'total_crawled': self.total_links(),
            'total': self.total,
            'invalid': self.invalid_links_count,
            'running': self.running,
            'time': self._calc_time()
        }
        return wrap
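
The worker above is driven entirely by the `info` dict handed to its constructor. A sketch of the keys it reads (`base_url`, `max_crawl`, `robo_url`, `login_url`, `logged_in`), with illustrative values; the GUI `instance` it signals is not shown here.

info = {
    'base_url': 'https://example.com/',           # site to crawl
    'max_crawl': 50,                              # 0 means no page limit (see _crawled_max)
    'robo_url': 'https://example.com/robots.txt', # falsy value skips robots.txt checks
    'login_url': '',                              # login page URL, if the site has one
    'logged_in': False,                           # whether login credentials were supplied
}
# worker = CrawlerWorker(info, instance)  # `instance` is the Qt widget providing the signals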