Example #1
 def __init__(self, info, instance, parent=None):
     super(CrawlerWorker, self).__init__(parent)
     self._instance = instance
     self.running = True
     self.base_url = info['base_url']  # main url of website
     self._links_to_crawl = []  # list of links yet to open
     self.crawled_links = {}  # dictionary of links opened/all links
     self.__parsed_crawled = {}  # list of urls and their html pages
     self.total = 0  # total number of found links
     self.total_crawled = 0  # total number of valid crawled links in website
     self.max_pages = info['max_crawl']  # max pages to crawl
     self.invalid_links_count = 0  # number of broken links found
     self.invalid_links_list = []  # list of broken links found
     self.dynamic = []
     self.info = info
     self.login_url = info['login_url']  # login page url if available
     if info['robo_url']:
         self._rb_parser = RobotExclusionRulesParser()
         self._rb_parser.fetch(info['robo_url'])
         self._user_agent = 'WASecBot'
     else:
         self._rb_parser = None
     self.browser = browser.RoboBrowser(parser="html.parser",
                                        user_agent="WASecBot")
     self.browser.session.verify = False
     self._logged_in = False
     self.running = True
     self._instance.btncrawlcancel.clicked.connect(self.pause)
     self._elapsed = 0
     self.delay = 15
     self._requests = 0
     self.start = None
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
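
For context, a hypothetical ``info`` dictionary with the keys this constructor reads (the values are made up for illustration):

info = {
    'base_url': 'https://example.com/',           # main url of website
    'max_crawl': 50,                              # 0 means no page limit
    'login_url': None,                            # login page url if available
    'robo_url': 'https://example.com/robots.txt', # falsy value skips robots.txt handling
}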
Example #2
class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        try:
            robotstxt_body = robotstxt_body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
            # Switch to 'allow all' state.
            logger.warning("Failure while parsing robots.txt using %(parser)s."
                           " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                           {'parser': "RobotExclusionRulesParser"},
                           exc_info=sys.exc_info(),
                           extra={'spider': self.spider})
            robotstxt_body = ''
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
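
A minimal, hypothetical way to exercise this adapter (the robots.txt body and user agent are invented; ``to_unicode`` and ``logger`` come from the surrounding module):

body = b"User-agent: *\nDisallow: /private/\n"
parser = RerpRobotParser(body, spider=None)
print(parser.allowed("https://example.com/private/page", "examplebot"))  # False
print(parser.allowed("https://example.com/", "examplebot"))              # True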
Example #3
class RobotsTxt:
    '''
    Wrapper around robots.txt parser that adds the date the file was fetched.

    If the ``robots_file`` is None or cannot be parsed, then it's treated as a
    highly permissive robots.txt.
    '''
    def __init__(self, robots_doc):
        ''' Initialize from database document representation. '''
        self._updated_at = robots_doc['updated_at']
        self._robots = RobotExclusionRulesParser()

        if robots_doc['file'] is not None:
            try:
                self._robots.parse(robots_doc['file'])
            except:
                pass

    def is_allowed(self, user_agent, url):
        ''' Return True if ``url`` is allowed by this robots.txt file. '''
        return self._robots.is_allowed(user_agent, url)

    def is_older_than(self, age):
        ''' Return True if this robots file is older than ``age``. '''
        return (datetime.now(tzlocal()) - self._updated_at).seconds > age
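
A short, hypothetical construction of this wrapper from a document-style dict (the keys mirror the code above; the values are invented):

from datetime import datetime
from dateutil.tz import tzlocal

robots_doc = {
    'updated_at': datetime.now(tzlocal()),
    'file': "User-agent: *\nDisallow: /admin/\n",
}
robots = RobotsTxt(robots_doc)
print(robots.is_allowed('examplebot', 'https://example.com/admin/page'))  # False
print(robots.is_older_than(3600))                                         # False: just created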
class RerpWrapper(python_common.web.robots_txt.parser_base.RobotsTxtParser):
    def __init__(self, content=None, expires=None):
        super(RerpWrapper, self).__init__(content, expires)
        if content:
            self.parser = RobotExclusionRulesParser()
            self.parser.use_local_time = False
            self.parser.expiration_date = self.expires
            self.parser.parse(content)
        else:
            self.parser = None
            self.my_super = super(RerpWrapper, self)

    def allowed(self, user_agent, url):
        return self.parser.is_allowed(
            user_agent, url) if self.parser else self.my_super.allowed(
                user_agent, url)

    def delay(self, user_agent):
        return self.parser.get_crawl_delay(
            user_agent) if self.parser else self.my_super.delay(user_agent)

    @property
    def expired(self):
        return self.parser.is_expired if self.parser else self.my_super.expired

    @property
    def sitemaps(self):
        return self.parser.sitemaps if self.parser else self.my_super.sitemaps
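
A sketch of how this wrapper might be driven, assuming the ``python_common`` base class is importable, ``content`` is a robots.txt string, and ``expires`` is a naive UTC datetime (to match ``use_local_time = False``):

from datetime import datetime, timedelta

wrapper = RerpWrapper(
    content="User-agent: *\nCrawl-delay: 5\nDisallow: /private\n",
    expires=datetime.utcnow() + timedelta(hours=1),
)
print(wrapper.allowed('examplebot', 'https://example.com/private'))  # False
print(wrapper.delay('examplebot'))                                   # Crawl-delay value (5.0)
print(wrapper.sitemaps)                                              # [] - none declared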
Example #5
 def __get_robot_handler(url):
     rp = RobotExclusionRulesParser()
     if Util.is_url(url):
         # get the original base url
         base_url = Util.get_base_url(url)
         # note: this requests.get() response is never used; fetch() below
         # downloads robots.txt itself, so the extra request is redundant
         page = requests.get(urljoin(base_url, 'robots.txt'))
         rp.fetch(urljoin(base_url, 'robots.txt'))
     return rp
 def __init__(self, content=None, expires=None):
     super(RerpWrapper, self).__init__(content, expires)
     if content:
         self.parser = RobotExclusionRulesParser()
         self.parser.use_local_time = False
         self.parser.expiration_date = self.expires
         self.parser.parse(content)
     else:
         self.parser = None
         self.my_super = super(RerpWrapper, self)
Example #7
    def __init__(self, robots_doc):
        ''' Initialize from database document representation. '''
        self._updated_at = robots_doc['updated_at']
        self._robots = RobotExclusionRulesParser()

        if robots_doc['file'] is not None:
            try:
                self._robots.parse(robots_doc['file'])
            except:
                pass
Example #8
 def __init__(self, url, robots_fetch_timeout, user_agent, logger):
     self._logger = logger
     split_url = urlparse(url)
     split_list = list(split_url)
     split_list[2] = ROBOTS_FILE  # the path component is at index 2
     robots_txt_url = str(urlunparse(tuple(split_list)))
     robots_filter = RobotExclusionRulesParser()
     logger.debug("Fetching robots filter from path: %s"%robots_txt_url)
     robots_filter.fetch(robots_txt_url, robots_fetch_timeout)
     self._robots_filter = robots_filter
     self._ua = user_agent
Example #9
class Robot:
    def __init__(self, url):
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        return not self.rerp.is_allowed('Mozilla/5.0', url.url())
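
A hypothetical use of the ``Robot`` wrapper above (``Url`` comes from the same codebase; the site is made up):

robot = Robot('https://example.com/some/deep/page')
print(robot.throttle_time())                                   # Crawl-delay for Mozilla/5.0, or None
print(robot.should_block(Url('https://example.com/private')))  # True if robots.txt disallows it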
def load_robot_rules():
    """ load rules from the robots.txt

    if the online version is not accessible, then the local version is
    loaded from disk
    """
    rerp = RobotExclusionRulesParser()
    try:
        rerp.fetch(urlparse.urljoin(BASE_URL, '/robots.txt'))
    except:
        rerp.parse(open('robots.txt', 'r').read())
    return rerp
Example #11
def is_url_allowed(url):
    """
    Returns ``True`` if the robots.txt rules for the given URL allow fetching
    it. This function parses the robots rules for the given URL (if any) and
    returns a boolean flag that tells you whether fetching it is allowed. Note
    that it doesn't test whether the URL exists on the host.

    :param url:     URL to test
    :returns:       ``True`` if URL can be fetched, ``False`` otherwise
    """
    robots = RobotParser()
    robots.user_agent = UA_STRING
    robots.fetch(get_robots_url(url))
    if robots.response_code != 200:
        return True
    return robots.is_allowed(UA_STRING, url)
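
``UA_STRING`` and ``get_robots_url`` are module-level names that are not shown here; a hypothetical ``get_robots_url`` of the kind this function relies on might look like:

# Hypothetical helper, not the project's actual implementation: derive the
# robots.txt URL for any page URL.
from urllib.parse import urlsplit, urlunsplit

def get_robots_url(url):
    scheme, netloc, _, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc, '/robots.txt', '', ''))

print(get_robots_url('https://example.com/some/page?q=1'))  # https://example.com/robots.txt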
Example #12
 def __init__(self, robotstxt_body, spider):
     from robotexclusionrulesparser import RobotExclusionRulesParser
     self.spider = spider
     self.rp = RobotExclusionRulesParser()
     try:
         robotstxt_body = robotstxt_body.decode('utf-8')
     except UnicodeDecodeError:
         # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
         # Switch to 'allow all' state.
         logger.warning("Failure while parsing robots.txt using %(parser)s."
                        " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                        {'parser': "RobotExclusionRulesParser"},
                        exc_info=sys.exc_info(),
                        extra={'spider': self.spider})
         robotstxt_body = ''
     self.rp.parse(robotstxt_body)
Example #13
class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
    def get_robots(url):
        robots_directory = 'robots'
        robots_file_path = robots_directory+'/'+url
        if os.path.isfile(robots_file_path):
            robots_file = open(robots_file_path,"rb")

#            robots_parser = RobotExclusionRulesParser()
#            robots_parser.parse(content)
            robots_parser = pickle.load(robots_file)
        else:
            buffer = StringIO.StringIO()
            c = pycurl.Curl()
            c.setopt(c.URL, 'http://'+url+'/robots.txt')
            c.setopt(c.REFERER,'')
            c.setopt(c.USERAGENT,'Curl')
            c.setopt(c.FOLLOWLOCATION, 1)
            c.setopt(c.WRITEFUNCTION, buffer.write)
            try:
                c.perform()
            except pycurl.error, e:
                print "Error code: ", e[0]
                print "Error message: ", e[1]
                c.close()
                robots_parser = RobotExclusionRulesParser()
                robots_parser.parse('')
                return robots_parser
            c.close()
#            print buffer.getvalue()
            robots_parser = RobotExclusionRulesParser()
            robots_parser.parse(buffer.getvalue())
            robots_file = open(robots_file_path,"wb")
            pickle.dump(robots_parser, robots_file)
Example #15
    def allowed_url(self):
        #FIXME: Should use the geturl address as it may have been redirected
        scheme, netloc, path, query, fragment = urlsplit(self.url)
        robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

        #FIXME: Should cache robots.txt in a better persistent data structure
        if robot_url in ROBOT_CACHE:
            rp = ROBOT_CACHE[robot_url]
        else:
            rp = RobotExclusionRulesParser()
            try:
                rp.fetch(robot_url)
            # Currently if there's a problem we assume there is no robots.txt
            except IOError:
                # Should be catching the urllib2.URLError exception
                logging.debug("Couldn't retrieve robots.txt for %s" %
                              robot_url)
                rp = None
            except UnicodeDecodeError:
                logging.debug("Unicode decode error for robots.txt at %s" %
                              robot_url)
                rp = None
            except httplib.HTTPException:
                logging.debug("Generic HTTPException for robots.txt at %s" %
                              robot_url)
                rp = None
            ROBOT_CACHE[robot_url] = rp

        if rp is None or rp.is_allowed("*", self.url):
            base_url = urlunsplit([scheme, netloc, "", "", ""])

            # If there's a current delay on the site respect robots.txt and stall
            if self.db.exists(netloc):
                logging.debug("Obeying robot overlord for %s..." % netloc)
                URLHandler.add_to_busy(self.db, self.url)
                return False

            # Set a delay for any other requests to this site to respect robots.txt
            delay = rp.get_crawl_delay("*") if rp else None
            if delay:
                delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
            else:
                delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
            self.db.setex(netloc, "1", delay)

            return True
        else:
            return False
Example #16
def crawl_website(website):
    website.update_robots_txt()  # only updates if necessary
    rules = RobotExclusionRulesParser()
    rules.parse(website.robots_content)

    # TODO add check for site last updated timestamp

    # Has the index been retrieved yet?
    if not website.webpage_set.exists():
        # get index
        if rules.is_allowed('*', '/'):
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=True,
                website=website,
            )
            crawl_existing_webpage(webpage, rules)
        else:
            # create a placeholder index webpage
            webpage = Webpage.objects.create(
                local_url='/',
                robots_allowed=False,
                website=website,
            )
            print 'Robots not allowed to index root'
            return None

    # Are there webpages to be accessed?
    allowed_webpages = website.webpage_set.filter(robots_allowed=True)
    if not allowed_webpages.exists():
        # print 'no allowed webpages found for {website}'.format(website=website.url)
        return None

    # Are there new links to try out?
    new_webpages = allowed_webpages.filter(exists=None)
    if new_webpages.exists():
        # start with the oldest first
        # created and updated are the same for newly-created webpages
        webpage = new_webpages.order_by('created').first()
        print 'crawling new'
        return crawl_existing_webpage(webpage, rules)

    # Crawl an existing webpage
    if rules.is_allowed('*', '/foo.html'):
        webpage = allowed_webpages.filter(
            exists=True).order_by('updated').first()
        print 'crawling existing'
        return crawl_existing_webpage(webpage, rules)
Example #17
    def check(self, hostkey, relurl):
        """ Return True if allowed to fetch, False if not, None
        if we do not have robots.txt for this entry. """

        robotstxt, expiration = self.robots.get(hostkey, (None, None))

        if robotstxt is None:
            return None

        # FIXME: mtime?  we need to let robots.txt expire.

        robotparser = RobotExclusionRulesParser()

        # is_expired is a property on RobotExclusionRulesParser, not a method
        if robotparser.is_expired:
            return None

        # parse() takes the raw robots.txt text; this parser has no seturl()
        # or can_fetch(), so check access with is_allowed() and a user agent
        robotparser.parse(robotstxt)
        return robotparser.is_allowed('*', hostkey + relurl)
Example #18
  def allowed_url(self):
    #FIXME: Should use the geturl address as it may have been redirected
    scheme, netloc, path, query, fragment = urlsplit(self.url)
    robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

    #FIXME: Should cache robots.txt in a better persistent data structure
    if robot_url in ROBOT_CACHE:
      rp = ROBOT_CACHE[robot_url]
    else:
      rp = RobotExclusionRulesParser()
      try:
        rp.fetch(robot_url)
      # Currently if there's a problem we assume there is no robots.txt
      except IOError:
        # Should be catching the urllib2.URLError exception
        logging.debug("Couldn't retrieve robots.txt for %s" % robot_url)
        rp = None
      except UnicodeDecodeError:
        logging.debug("Unicode decode error for robots.txt at %s" % robot_url)
        rp = None
      except httplib.HTTPException:
        logging.debug("Generic HTTPException for robots.txt at %s" % robot_url)
        rp = None
      ROBOT_CACHE[robot_url] = rp

    if rp is None or rp.is_allowed("*", self.url):
      base_url = urlunsplit([scheme, netloc, "", "", ""])

      # If there's a current delay on the site respect robots.txt and stall
      if self.db.exists(netloc):
        logging.debug("Obeying robot overlord for %s..." % netloc)
        URLHandler.add_to_busy(self.db, self.url)
        return False

      # Set a delay for any other requests to this site to respect robots.txt
      delay = rp.get_crawl_delay("*") if rp else None
      if delay:
        delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
      else:
        delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
      self.db.setex(netloc, "1", delay)

      return True
    else:
      return False
Example #19
 def _parse_robots(self, response):
     rp = RobotExclusionRulesParser()
     rp.parse(response.body)
     self._parsers[urlparse_cached(response).netloc] = rp
Example #20
 def __init__(self, robotstxt_body, spider):
     from robotexclusionrulesparser import RobotExclusionRulesParser
     self.spider = spider
     self.rp = RobotExclusionRulesParser()
     robotstxt_body = decode_robotstxt(robotstxt_body, spider)
     self.rp.parse(robotstxt_body)
Example #21
#!/usr/bin/python
#encoding:utf-8

from robotexclusionrulesparser import RobotExclusionRulesParser as RobotsParser

rb = RobotsParser()
# rb.fetch("http://www.zhihu.com/robots.txt")
# print rb
# print rb._RobotExclusionRulesParser__rulesets
# print rb.is_allowed('*', 'http://www.zhihu.com/loginasdkj?encode=12')
# print rb.is_allowed('*', '/admin_inbox')
# print '======'

rb.fetch("http://www.iplaypython.com/robots.txt")
print rb
print '======'

rb.fetch("http://baidu.com/robots.txt")
print rb
print '======'

rb.fetch("http://jaysonhwang.com/robots.txt")
print rb
print '======'
Example #22
def benchmark_rerp_parser(website):
    from robotexclusionrulesparser import RobotExclusionRulesParser
    rp = RobotExclusionRulesParser()
    rp.parse(website['robotstxt'])
    for link in website['links']:
        rp.is_allowed('googlebot', link)
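
The benchmark expects a ``website`` mapping holding a raw robots.txt body and the links to test; a made-up input:

website = {
    'robotstxt': "User-agent: *\nDisallow: /search\n",
    'links': [
        'https://example.com/search?q=python',
        'https://example.com/about',
    ],
}
benchmark_rerp_parser(website)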
Example #23
class CrawlerWorker(
        QObject
):  # spider that will get links of website # called to create instance of the class
    finish = False

    def __init__(self, info, instance, parent=None):
        super(CrawlerWorker, self).__init__(parent)
        self._instance = instance
        self.running = True
        self.base_url = info['base_url']  # main url of website
        self._links_to_crawl = []  # list of links yet to open
        self.crawled_links = {}  # dictionary of links opened/all links
        self.__parsed_crawled = {}  # list of urls and their html pages
        self.total = 0  # total number of found links
        self.total_crawled = 0  # total number of valid crawled links in website
        self.max_pages = info['max_crawl']  # max pages to crawl
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # list of broken links found
        self.dynamic = []
        self.info = info
        self.login_url = info['login_url']  # login page url if available
        if info['robo_url']:
            self._rb_parser = RobotExclusionRulesParser()
            self._rb_parser.fetch(info['robo_url'])
            self._user_agent = 'WASecBot'
        else:
            self._rb_parser = None
        self.browser = browser.RoboBrowser(parser="html.parser",
                                           user_agent="WASecBot")
        self.browser.session.verify = False
        self._logged_in = False
        self.running = True
        self._instance.btncrawlcancel.clicked.connect(self.pause)
        self._elapsed = 0
        self.delay = 15
        self._requests = 0
        self.start = None
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _opener(self, url):
        retry = 1
        while True:
            try:
                self.browser.open(url=url)
                break
            except exceptions.ConnectionError as ce:
                # sleep(self.delay * retry)
                if retry == 11:
                    return False
                else:
                    retry += 1
        return True

    def _compute_crawl_delay(self):
        self._requests += 1
        if self._requests <= 10:
            self._elapsed += self.browser.response.elapsed.total_seconds()
            delay = self._elapsed / self._requests
            self.delay = delay * 200
            if self.delay >= 180:
                self.delay = 15
        else:
            self._requests = 1
            self._elapsed = self.browser.response.elapsed.total_seconds()
            self.delay = self._elapsed * 200

    def pause(self):
        self.running = False
        self._instance.change_state.emit('Canceling...')
        choice = QtWidgets.QMessageBox.question(
            self._instance, "Cancel Crawl!",
            "WASec is not finished yet, are You sure you want to stop crawling?",
            QtWidgets.QMessageBox.Cancel | QtWidgets.QMessageBox.Yes)
        if choice == QtWidgets.QMessageBox.Yes:
            self.finish = True
            self.running = False
            self._instance.crawl_finished.emit(self._wrap_up())
        else:
            self.running = True

    # get total number of links opened so far
    def total_links(self):
        total = 0
        for index in self.crawled_links:
            total += len(self.crawled_links[index]['url'])
        return total

    # check if max pages reached
    def _crawled_max(self):
        result = (self.max_pages == 0) or (self.max_pages > self.total_links())
        return result

    # is link already listed
    def _is_link_listed(self, link):
        self._instance.change_state.emit('Check if URL is listed...')
        url = parse.urljoin(self.base_url, link)
        result = False
        for index in self.crawled_links:
            for opened in self.crawled_links[index]['url'].keys():
                if url == opened or link == opened:
                    result = True
        for to_open in self._links_to_crawl:
            if link == to_open[1] or url == to_open[1]:
                result = True
        return result

    # gets dynamic urls
    def _is_dynamic(self, url):
        self._instance.change_state.emit('Check if URL is dynamic...')
        if '?' in str(url) or '=' in str(url):
            self.dynamic.append(url)

    # check if page opened and exists
    def _is_response_ok(self, url):
        # status_code 200 means OK; no problems with page
        if 200 == self.browser.response.status_code:
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    def _is_html_page(self, url):
        try:
            if 'text/html' in self.browser.response.headers["content-type"]:
                return True
            else:
                self.invalid_links_count += 1
                self.invalid_links_list.append(url)
                self._instance.change_state.emit('URL is invalid!')
                return False
        except KeyError:
            return True

    def _is_same_page(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            page = self.browser.parsed
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    check = self.__parsed_crawled[link]
                    if check == page:
                        self._instance.change_state.emit('URL is invalid!')
                        return False
            return True
        else:
            self.finish = True
            self.running = False
            return False

    def _page_wise(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            return self._is_response_ok(url) and self._is_html_page(
                url) and self._is_same_page(url)
        else:
            self.finish = True
            self.running = False
            return False

    def _is_same_query(self, page_link):
        parsed_url = parse.urlparse(page_link)
        query = parse.parse_qsl(parsed_url.query)
        query_len = len(query)
        if query_len > 0:
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    parsed_link = parse.urlparse(link)
                    link_query = parse.parse_qsl(parsed_link.query)
                    if (parsed_link.path
                            == parsed_url.path) and (len(link_query)
                                                     == query_len):
                        i = n = 0
                        while i < query_len:
                            if query[i][0] == link_query[i][0]:
                                n += 1
                            i += 1
                        if n == query_len:
                            # result = self._is_same_page(page_link)
                            # return result
                            self._instance.change_state.emit('URL is invalid!')
                            print("is same query")
                            return False
        return True

    # check if given url belongs to website
    # i.e. is in the website's domain
    def _in_domain(self, url):
        if self.base_url in url:  # result = 0 meaning url belongs to website
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check for url protocol
    def _check_protocol(self, url):
        parsed = parse.urlparse(url)  # parse url to get information from it
        protocol = str.lower(str(parsed[0]))  # get url protocol
        if protocol == "http" or protocol == "https":  # is protocol 'http' or 'https'
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            self._instance.change_state.emit('URL is invalid!')
            return False

    def _is_robot_allowed(self, path):
        if self._rb_parser:
            return self._rb_parser.is_allowed(self._user_agent, path)
        else:
            return True

    def _url_wise(self, url):
        return self._in_domain(url) and self._check_protocol(
            url) and self._is_same_query(url)

    def _is_url_good(self, url):
        return self._url_wise(url) and self._page_wise(url)

    def _at_login(self, url):
        if not self.login_url or self.login_url != str(url):
            return False
        elif self.login_url == str(url):
            return True

    def _check_login(self, parsed):
        if self.info['logged_in']:
            self._instance.change_state.emit('Logging into the website...')
            handel = BeyondLogin(self.browser)
            self._logged_in = handel.get_login_info(self.info)
            parent = self._check_parent(handel.login_url)
            if self._logged_in:
                self._instance.change_state.emit('Login Successful!')
                # sleep(2)
                if parent:
                    self._add_crawled(handel.login_url, parent, parsed)
                else:
                    self._add_crawled(handel.login_url, self.base_url, parsed)
            else:
                self._instance.change_state.emit('Login Failed!')

            self._links_to_crawl.append(
                [handel.login_url, handel.redirect_url])
        else:
            self._instance.change_state.emit('Login Successful!')
            self._logged_in = True

    def _check_parent(self, url):
        for child in self._links_to_crawl:
            if child[1] == url:
                return child[0]
        return None

    # get all links in a given page
    def _get_page_links(self, url, page):
        self._instance.change_state.emit('Searching for all links in page...')
        # gets a list of all <a> tags in page
        links_tags = page.find_all("a")
        # going through each link
        for link in links_tags:
            self._instance.change_state.emit(
                'Searching for all links in page...')
            link_href = link.get(
                "href"
            )  # get <a> tag link reference. example: <a href="page.html"> ==> page.html
            # check that: link isn't already listed + link isn't blank
            link_listed = self._is_link_listed(link_href)
            if (not link_listed) and ('#' not in str(link_href)):
                # add link to list of links to open
                self._links_to_crawl.append([url, link_href])
                print("_get_page_links")
                print(url, link_href)
                self.total += 1

        forms = page.find_all("form")
        for form in forms:
            action = form.get("action")
            if action:  # link isn't blank
                # check that: link isn't already listed +
                link_listed = self._is_link_listed(action)
                if (not link_listed) and (action != "#"):
                    # add link to list of links to open
                    self._links_to_crawl.append([url, action])
                    self.total += 1
        self._instance.show_total.emit(self.total)

        image_map = page.find_all('area')
        for area in image_map:
            href = area.get(
                'href'
            )  # get 'href' attribute from <area shape="rect" href="#main"> tag
            listed = self._is_link_listed(href)
            if (not listed) and ('#' not in href):
                # add link to list of links to open
                self._links_to_crawl.append([url, href])
                self.total += 1
        self._instance.show_total.emit(self.total)

    # open a page and get its content
    def _open_url(self, url):
        if self.running:
            self._instance.change_state.emit('Pausing between requests...')
            # get page content
            parsed = self.browser.parsed
            if self.info['max_crawl'] != 1:
                self._get_page_links(
                    url, parsed)  # send content to retrieve links from

                # sleep(self.delay)
            else:
                self._add_crawled(url, url, parsed)
                self._is_dynamic(url)
                self._instance.show_total.emit(self.total_crawled)
            if self._at_login(url) and not self._logged_in:
                self._check_login(parsed)
            return parsed

    def _add_crawled(self, url, parent, parsed_page):
        self._instance.change_state.emit('Adding new crawled link...')
        found = False
        try:
            title = parsed_page.find('title')
            if not title:
                title = 'NO-TITLE'
            else:
                title = title.text
        except:
            title = 'NO-TITLE'

        for index in self.crawled_links:
            if self.crawled_links[index]['from'] == parent:
                self.crawled_links[index]['url'][url] = title
                found = True
                break
        if not found:
            self.crawled_links[self.total_crawled] = {
                'from': parent,
                'url': {
                    url: title
                }
            }
            self.total_crawled += 1
        self.__parsed_crawled[url] = parsed_page
        self._instance.on_info.emit(self.crawled_links)
        # sleep(2)

    # main spider function; creates our spider's web
    def run(self):
        self.start = datetime.now().time()
        self._opener(self.base_url)
        self._open_url(self.base_url)  # send main url to be opened and checked
        self._elapsed = self.browser.state.response.elapsed.total_seconds()
        self._compute_crawl_delay()
        # while there are still links to open
        self.i = len(self._links_to_crawl) - 1
        while (len(self._links_to_crawl)) > 0 and (
                self._crawled_max()) and not self.finish:
            self._instance.change_state.emit('Crawling...')
            # start from the last link in the list
            parent = self._links_to_crawl[self.i][0]
            link = self._links_to_crawl[self.i][1]
            print("----")
            print(parent, link)
            if parent[len(parent) - 1] != '/':
                parent = parent + '/'
            # url = parse.urljoin(self.base_url, link)  # join main url with page link
            url = parse.urljoin(parent, link)  # join main url with page link
            self._opener(url)
            if 200 != self.browser.response.status_code:
                url = parse.urljoin(self.base_url,
                                    link)  # join main url with page link
            print(url)
            if self._is_url_good(url) and self._is_robot_allowed(
                    link):  # is url valid and working
                print("good")
                self._instance.change_state.emit('URL is good!')
                parsed_page = self._open_url(url)  # open page
                self._add_crawled(url, parent, parsed_page)
                self._compute_crawl_delay()
                # add link to list of opened links
                self._is_dynamic(url)
            else:
                print("not good")
                self._instance.change_state.emit('URL is not good!')
            # delete opened link from list of links to open
            self._links_to_crawl.pop(self.i)
            if self.i > 0:
                self.i = self.i - 1
            elif self.i == 0:
                self.i = len(self._links_to_crawl) - 1
            if len(self._links_to_crawl) == 0 or self.i < 0:
                self._instance.change_state.emit('Finished.')
                self.finish = True
                break
        self.finish = True
        self._instance.crawl_finished.emit(self._wrap_up())

    def _calc_time(self):
        finish = datetime.now().time()
        delta1 = timedelta(seconds=self.start.second,
                           microseconds=self.start.microsecond,
                           minutes=self.start.minute,
                           hours=self.start.hour)
        delta2 = timedelta(seconds=finish.second,
                           microseconds=finish.microsecond,
                           minutes=finish.minute,
                           hours=finish.hour)
        taken = delta2 - delta1
        seconds = round(taken.total_seconds())
        if seconds >= 3600:
            # use floor division so minutes come from the remainder, not hours / 60
            hours = seconds // 3600
            minutes = round((seconds % 3600) / 60)
            elapsed = str(hours) + ':' + str(minutes) + ' hrs'
        elif seconds >= 60:
            minutes = seconds // 60
            seconds = round(seconds % 60)
            elapsed = str(str(minutes) + '.' + str(seconds) + ' mins')
        else:
            elapsed = str(seconds) + ' secs'
        return elapsed

    def _wrap_up(self):
        wrap = {
            'links': self.crawled_links,
            'dynamic': self.dynamic,
            'total_crawled': self.total_links(),
            'total': self.total,
            'invalid': self.invalid_links_count,
            'running': self.running,
            'time': self._calc_time()
        }
        return wrap
Example #24
 def __init__(self, url):
     self.url = Url(urljoin(url, '/robots.txt'))
     self.rerp = RobotExclusionRulesParser()
     self.rerp.user_agent = 'Mozilla/5.0'
     self.rerp.fetch(self.url.url())
class Crawler:
    start_page_url = ''
    rerp = RobotExclusionRulesParser()
    cFiles = CrawlerFiles()
    tld = ''
    waiting_url_set = set()
    crawled_url_set = set()
    bad_url_set = set()
    find_string_set = set()
    find_flname_set = set()
    found_flname_set = set()
    found_string_set = set()
    stop_request = False
    download_chunk_size = 0
    conn_timeout = 0
    delay = 0
    user_agent = ''
    bad_url_prefix = '->Bad Url '
    found_string_prefix = '->Found '
    found_flname_prefix = '->Saved '

    def __init__(self, save_dir, start_url, find_flname_set, find_string_set,
                 chunk_size, conn_timeout, default_delay, user_agent):
        logger.info('->Starting RERP')
        Crawler.rerp.fetch(start_url + '/robots.txt')
        Crawler.user_agent = user_agent
        delay = Crawler.rerp.get_crawl_delay(Crawler.user_agent)
        Crawler.conn_timeout = conn_timeout
        if delay is None:
            Crawler.delay = default_delay
        else:
            Crawler.delay = delay
        Crawler.cFiles = CrawlerFiles(save_dir, start_url)
        logger.info('->Getting Previous Session files (if any) ')
        Crawler.crawled_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.crawled_file)
        Crawler.found_flname_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.found_files_file)
        Crawler.found_string_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.found_strings_file)
        Crawler.bad_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.invalid_file)
        Crawler.waiting_url_set = Crawler.cFiles.get_file_data(
            Crawler.cFiles.waiting_file)
        info = Crawler.cFiles.get_file_data(Crawler.cFiles.info_file)

        Crawler.start_page_url = start_url

        Crawler.tld = url_func.return_tld(start_url)

        Crawler.find_string_set = find_string_set
        Crawler.find_flname_set = find_flname_set
        Crawler.download_chunk_size = chunk_size
        logger.info('Crawler Initiated')
        logger.info('->Loading Website Info')
        logger.debug('* ' * 20 + 'Website Info' + '* ' * 20)
        if info is None:
            info = url_func.get_domain_info(Crawler.tld)
            Crawler.cFiles.set_file_data(Crawler.cFiles.info_file, info)
        for key in info:
            val = info[key]
            if val:
                logger.debug("%-20s : %s" % (str(key).upper(), str(val)))
        logger.debug('* ' * 40)

    @staticmethod
    def crawl_page(t_name, page_url):
        # noinspection PyBroadException
        try:
            logger.debug("%s - %s" % (t_name, page_url))
            if not Crawler.rerp.is_allowed(Crawler.user_agent, page_url):
                logger.debug('->%s not allowed to crawl %s' %
                             (t_name, page_url))
                return
            Crawler.add_urls(page_url)
            if not Crawler.stop_request:
                Crawler.waiting_url_set.remove(page_url)
                Crawler.crawled_url_set.add(page_url)
                time.sleep(Crawler.delay)
        except requests.HTTPError as h:
            string = "HTTP Error %d - %s" % (h.response.status_code, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.ReadTimeout:
            string = "Timeout %0.1f secs - %s " % (Crawler.conn_timeout,
                                                   page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except requests.TooManyRedirects as t:
            string = "%s - %s" % (t, page_url)
            logger.debug(Crawler.bad_url_prefix + string)
            Crawler.bad_url_set.add(string)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)
        except (requests.ConnectionError, requests.ConnectTimeout):
            if url_func.check_connection() != url_func.CONNECTION_OK:
                Crawler.wait(t_name)
        except Exception:
            logger.exception('Exception in %s ' % page_url)
            Crawler.waiting_url_set.remove(page_url)
            Crawler.crawled_url_set.add(page_url)

    @staticmethod
    def add_urls(page_url):
        not_html = False
        with closing(
                requests.get(page_url,
                             stream=True,
                             timeout=Crawler.conn_timeout)
        ) as page:  # html code of page
            type_of_page = page.headers[
                'Content-Type']  # get content type from header of html page
            page.raise_for_status()
            if 'html' in type_of_page:  # web page
                soup = BeautifulSoup(
                    page.content, "html.parser")  # parse the content of page
                text = soup.text
                for string in Crawler.find_string_set:
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    str_url = string + ' ' + page_url
                    if text is not None and string in text:
                        Crawler.found_string_set.add(str_url)
                        logger.debug(
                            '%s %s %s' %
                            (Crawler.found_string_prefix, string, page_url))
                for a_tag_content in soup.find_all('a'):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    url = parse.urljoin(Crawler.start_page_url,
                                        a_tag_content.get('href'))
                    if '#' in url:
                        url = url.split('#')[0]
                    if ' ' in url:
                        url = url.replace(' ', '%20')
                    if url_func.return_tld(url) == Crawler.tld:
                        if url not in Crawler.crawled_url_set:
                            Crawler.waiting_url_set.add(url)
            else:
                not_html = True
        if not_html:
            f_name = page_url.split('/')[-1]
            download_file = False
            for string in Crawler.find_flname_set:
                if Crawler.stop_request:
                    break
                if string in f_name:
                    download_file = True
                    break
            if download_file:
                type_split = type_of_page.split('/')
                f_dir = Crawler.cFiles.save_dir + '/' + type_split[0]
                if not dir_exists(f_dir):
                    make_dir(f_dir)
                Crawler.found_flname_set.add(page_url)
                Crawler.file_download(page_url, f_dir, f_name)
                if not Crawler.stop_request:
                    logger.debug('%s %s' %
                                 (Crawler.found_flname_prefix, page_url))

    # wait
    @staticmethod
    def wait(t_name):
        logger.info('->%s waiting for connection...' % t_name)
        while True:
            if Crawler.stop_request:
                break
            if url_func.check_connection() == url_func.CONNECTION_OK:
                break
            time.sleep(2)

    @staticmethod
    def update_files():
        logger.info('Updating Files')
        Crawler.cFiles.set_file_data(Crawler.cFiles.crawled_file,
                                     Crawler.crawled_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_files_file,
                                     Crawler.found_flname_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.found_strings_file,
                                     Crawler.found_string_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.invalid_file,
                                     Crawler.bad_url_set)
        Crawler.cFiles.set_file_data(Crawler.cFiles.waiting_file,
                                     Crawler.waiting_url_set)

    @staticmethod
    def file_download(file_url, f_dir, f_name):
        f_path = get_file_path(f_dir, f_name)
        # logger.info('Saving  ', f_name)
        dl = file_size(f_path)
        resume_header = {'Range': 'bytes=%d-' % dl}
        with closing(
                requests.get(file_url,
                             stream=True,
                             headers=resume_header,
                             timeout=Crawler.conn_timeout)) as file:
            tl_str = file.headers.get('content-length')
            # if there is no content-length in the header, the website doesn't support resuming
            mode = 'ab' if tl_str else 'wb'
            with open(f_path, mode) as handle:
                for chunk in file.iter_content(
                        chunk_size=Crawler.download_chunk_size):
                    if Crawler.stop_request:  # if stop is requested by user
                        return
                    if chunk:
                        handle.write(chunk)
Example #26
def manual_add_robot_policies():
    # coz some critical sites have invalid robots.txt
    ## surprised to see SO MANY sites without valid robots.txt!
    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /search\n' +
                  'Disallow: /advanced_search\n')
    robots_policies['findingaids.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /catalog\n' +
                  'Disallow: /contact\n' + 'Disallow: /downloads\n' +
                  'Disallow: /users\n')
    robots_policies['digitalhub.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /catalog\n')
    robots_policies['images.library.northwestern.edu'] = site_rp
    robots_policies['images.northwestern.edu'] = site_rp
    robots_policies['media.northwestern.edu'] = site_rp
    robots_policies['arch.library.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    site_rp.parse('User-agent: * \n' + 'Disallow: /?*\n')
    robots_policies['schedule.radiology.northwestern.edu'] = site_rp

    site_rp = RobotExclusionRulesParser()
    try:
        request = urllib2.Request('http://www.ctd.northwestern.edu/robots.txt')
        response = urllib2.urlopen(request, timeout=5)
        content = response.read()
    except:
        content = 'User-agent: * \n'
    content += ('Disallow: /courses?*\n')
    site_rp.parse(content)
    robots_policies['www.ctd.northwestern.edu'] = site_rp
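
Once populated, ``robots_policies`` is keyed by hostname and queried with ``is_allowed``; a hypothetical lookup:

host = 'images.library.northwestern.edu'
url = 'https://images.library.northwestern.edu/catalog/item-1'
print(robots_policies[host].is_allowed('*', url))  # False: /catalog is disallowed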
Example #27
    def crawl(self, in_url):
        global global_id, last_update, DOMAIN
        print("Crawler %d on P#%d: %s" % (self.id, url_ids[in_url], in_url))
        try:
            request = urllib2.Request(in_url)
            response = urllib2.urlopen(request, timeout=5)
            real_url = w3lib.url.canonicalize_url(response.geturl())
            real_uri = urlparse(real_url)
            extension = real_uri.path.lower().split('.')[-1]
            if response.info(
            ).maintype != 'text' or extension in skip_file_types:
                content = ''
            else:
                content = response.read()
        except:
            real_url = in_url
            content = ''

        if real_url == in_url:  # no redirect
            soup = BeautifulSoup(content, "html.parser")
            raw_urls = [link.get('href') for link in soup.find_all('a')]
        else:  # redirect
            raw_urls = [real_url]

        out_urls = set()
        for url in raw_urls:
            #print('parsing', url)
            if url is None or len(url) <= 1:
                continue

            url = url.strip()

            if url.startswith('/http://') or url.startswith('/https://'):
                # why would someone do this?
                url = url[1:]
            if url.startswith('mailto:') or url.startswith('mailto@'):
                continue

            fixed_url = w3lib.url.canonicalize_url(urljoin(in_url, url))
            if len(fixed_url) > 1000:  # long urls tend to be wrong urls
                continue
            uri = urlparse(fixed_url)
            if uri.scheme is not None and uri.scheme not in [
                    'http', 'https', ''
            ]:
                continue
            if uri.hostname is not None:
                if not uri.hostname.endswith(DOMAIN):
                    continue
                elif uri.hostname not in robots_policies:
                    site_rp = RobotExclusionRulesParser()
                    try:
                        site_rp.fetch('http://' + uri.hostname + '/robots.txt',
                                      timeout=3)
                    except:
                        print "error with", ('http://' + uri.hostname +
                                             '/robots.txt')
                    rp_lock.acquire()
                    robots_policies[uri.hostname] = site_rp
                    rp_lock.release()
                if not (robots_policies[uri.hostname].is_allowed(
                        "*", fixed_url)):
                    continue
            extension = uri.path.lower().split('.')[-1]
            if extension in skip_file_types:
                continue
            if 1 < len(extension) < 8 and '/' not in extension:
                urls_extensions.add(extension)

            out_urls.add(fixed_url)

        #print out_urls
        #get lock
        write_lock.acquire()
        out_ids = []
        for url in out_urls:
            if url in url_ids:
                out_ids.append(url_ids[url])
            else:
                url_ids[url] = global_id
                out_ids.append(global_id)
                url_id_file.write('%d\t%s\n' % (global_id, url))
                url_id_file.flush()
                global_id += 1
                url_tasks.put(url)
        transition_file.write('%d\t%s\n' % (url_ids[in_url], str(out_ids)))
        transition_file.flush()
        last_update = time.time()
        write_lock.release()
        #release lock
        print('%d urls in total reported by %d' % (global_id, self.id))