Example #1
 def __get_robot_handler(url):
     rp = RobotExclusionRulesParser()
     if Util.is_url(url):
         # derive the site root and fetch its robots.txt from there
         base_url = Util.get_base_url(url)
         rp.fetch(urljoin(base_url, 'robots.txt'))
     return rp
 def __init__(self, url, robots_fetch_timeout, user_agent, logger):
     self._logger = logger
     split_url = urlparse(url)
     split_list = list(split_url)
     split_list[2] = ROBOTS_FILE  # the path component is at index 2
     robots_txt_url = str(urlunparse(tuple(split_list)))
     robots_filter = RobotExclusionRulesParser()
     logger.debug("Fetching robots filter from path: %s" % robots_txt_url)
     robots_filter.fetch(robots_txt_url, robots_fetch_timeout)
     self._robots_filter = robots_filter
     self._ua = user_agent
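The snippet stops after storing the filter and user agent; presumably the surrounding class queries them later. A minimal sketch of such a check (the method name is an assumption, not part of the original):

 def is_allowed(self, url):
     # hypothetical helper: ask the fetched robots.txt rules whether this
     # crawler's user agent may request the given URL
     return self._robots_filter.is_allowed(self._ua, url)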
Example #3
class Robot:
    def __init__(self, url):
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        return not self.rerp.is_allowed('Mozilla/5.0', url.url())
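A quick usage sketch for the Robot class above, assuming Url is the same wrapper used in its constructor and that the site and page are placeholders:

robot = Robot('http://example.com/some/page.html')
page = Url('http://example.com/private/report.html')
if not robot.should_block(page):
    delay = robot.throttle_time() or 1.0  # get_crawl_delay() may return None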
def load_robot_rules():
    """ load rules from the robots.txt

    if the online online version is not accessible, then the local version is
    loaded from disk
    """
    rerp = RobotExclusionRulesParser()
    try:
        rerp.fetch(urlparse.urljoin(BASE_URL, '/robots.txt'))
    except Exception:
        # fall back to the local copy on disk
        with open('robots.txt', 'r') as robots_file:
            rerp.parse(robots_file.read())
    return rerp
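A short usage sketch for load_robot_rules(); BASE_URL comes from the snippet's own context and the test path is a placeholder:

rules = load_robot_rules()
if rules.is_allowed('*', BASE_URL + '/some/page.html'):
    crawl_delay = rules.get_crawl_delay('*')  # None unless robots.txt sets Crawl-delay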
Example #5
    def allowed_url(self):
        #FIXME: Should use the geturl address as it may have been redirected
        scheme, netloc, path, query, fragment = urlsplit(self.url)
        robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

        #FIXME: Should cache robots.txt in a better persistent data structure
        if robot_url in ROBOT_CACHE:
            rp = ROBOT_CACHE[robot_url]
        else:
            rp = RobotExclusionRulesParser()
            try:
                rp.fetch(robot_url)
            # Currently if there's a problem we assume there is no robots.txt
            except IOError:
                # Should be catching the urllib2.URLError exception
                logging.debug("Couldn't retrieve robots.txt for %s" %
                              robot_url)
                rp = None
            except UnicodeDecodeError:
                logging.debug("Unicode decode error for robots.txt at %s" %
                              robot_url)
                rp = None
            except httplib.HTTPException:
                logging.debug("Generic HTTPException for robots.txt at %s" %
                              robot_url)
                rp = None
            ROBOT_CACHE[robot_url] = rp

        if rp is None or rp.is_allowed("*", self.url):
            base_url = urlunsplit([scheme, netloc, "", "", ""])

            # If there's a current delay on the site respect robots.txt and stall
            if self.db.exists(netloc):
                logging.debug("Obeying robot overlord for %s..." % netloc)
                URLHandler.add_to_busy(self.db, self.url)
                return False

            # Set a delay for any other requests to this site to respect robots.txt
            delay = rp.get_crawl_delay("*") if rp else None
            if delay:
                delay = int(math.ceil(float(rp.get_crawl_delay("*"))))
            else:
                delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
            self.db.setex(netloc, "1", delay)

            return True
        else:
            return False
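The Redis calls in allowed_url() only record a short per-host cool-down derived from Crawl-delay. A rough in-memory stand-in for that bookkeeping (the dict and default value are placeholders, not the original storage):

import math
import time

HOST_BUSY_UNTIL = {}       # netloc -> timestamp until which the host is off limits
DEFAULT_ROBOTS_DELAY = 5   # placeholder default, in seconds

def note_crawl_delay(netloc, rp):
    # mirror of the setex() bookkeeping above, minus Redis
    delay = rp.get_crawl_delay("*") if rp else None
    delay = int(math.ceil(float(delay))) if delay else DEFAULT_ROBOTS_DELAY
    HOST_BUSY_UNTIL[netloc] = time.time() + delay

def host_is_busy(netloc):
    return HOST_BUSY_UNTIL.get(netloc, 0) > time.time()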
def is_url_allowed(url):
    """
    Returns ``True`` if the robots.txt rules for the given URL allow fetching
    it. This function parses the robots rules for the given URL (if any) and
    returns a boolean flag that tells you whether fetching it is allowed. Note
    that it does not test whether the URL exists on the host.

    :param url:     URL to test
    :returns:       ``True`` if URL can be fetched, ``False`` otherwise
    """
    robots = RobotParser()
    robots.user_agent = UA_STRING
    robots.fetch(get_robots_url(url))
    if robots.response_code != 200:
        return True
    return robots.is_allowed(UA_STRING, url)
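get_robots_url() is referenced above but not shown; a plausible (hypothetical) implementation simply points at /robots.txt on the same scheme and host:

from urllib.parse import urlsplit, urlunsplit

def get_robots_url(url):
    # hypothetical helper: robots.txt always lives at the site root
    scheme, netloc, _, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc, '/robots.txt', '', ''))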
Example #8
#!/usr/bin/python
#encoding:utf-8

from robotexclusionrulesparser import RobotExclusionRulesParser as RobotsParser

rb = RobotsParser()
# rb.fetch("http://www.zhihu.com/robots.txt")
# print rb
# print rb._RobotExclusionRulesParser__rulesets
# print rb.is_allowed('*', 'http://www.zhihu.com/loginasdkj?encode=12')
# print rb.is_allowed('*', '/admin_inbox')
# print '======'

rb.fetch("http://www.iplaypython.com/robots.txt")
print rb
print '======'

rb.fetch("http://baidu.com/robots.txt")
print rb
print '======'

rb.fetch("http://jaysonhwang.com/robots.txt")
print rb
print '======'
Example #9
    def crawl(self, in_url):
        global global_id, last_update, DOMAIN
        print("Crawler %d on P#%d: %s" % (self.id, url_ids[in_url], in_url))
        try:
            request = urllib2.Request(in_url)
            response = urllib2.urlopen(request, timeout=5)
            real_url = w3lib.url.canonicalize_url(response.geturl())
            real_uri = urlparse(real_url)
            extension = real_uri.path.lower().split('.')[-1]
            if response.info().maintype != 'text' or extension in skip_file_types:
                content = ''
            else:
                content = response.read()
        except Exception:
            # treat any fetch error as an empty page at the original URL
            real_url = in_url
            content = ''

        if real_url == in_url:  # no redirect
            soup = BeautifulSoup(content, "html.parser")
            raw_urls = [link.get('href') for link in soup.find_all('a')]
        else:  # redirect
            raw_urls = [real_url]

        out_urls = set()
        for url in raw_urls:
            #print('parsing', url)
            if url is None or len(url) <= 1:
                continue

            url = url.strip()

            if url.startswith('/http://') or url.startswith('/https://'):
                # why would someone do this?
                url = url[1:]
            if url.startswith('mailto:') or url.startswith('mailto@'):
                continue

            fixed_url = w3lib.url.canonicalize_url(urljoin(in_url, url))
            if len(fixed_url) > 1000:  # long urls tend to be wrong urls
                continue
            uri = urlparse(fixed_url)
            if uri.scheme is not None and uri.scheme not in ['http', 'https', '']:
                continue
            if uri.hostname is not None:
                if not uri.hostname.endswith(DOMAIN):
                    continue
                elif uri.hostname not in robots_policies:
                    site_rp = RobotExclusionRulesParser()
                    try:
                        site_rp.fetch('http://' + uri.hostname + '/robots.txt',
                                      timeout=3)
                    except Exception:
                        print "error with", 'http://' + uri.hostname + '/robots.txt'
                    rp_lock.acquire()
                    robots_policies[uri.hostname] = site_rp
                    rp_lock.release()
                if not robots_policies[uri.hostname].is_allowed("*", fixed_url):
                    continue
            extension = uri.path.lower().split('.')[-1]
            if extension in skip_file_types:
                continue
            if 1 < len(extension) < 8 and '/' not in extension:
                urls_extensions.add(extension)

            out_urls.add(fixed_url)

        #print out_urls
        #get lock
        write_lock.acquire()
        out_ids = []
        for url in out_urls:
            if url in url_ids:
                out_ids.append(url_ids[url])
            else:
                url_ids[url] = global_id
                out_ids.append(global_id)
                url_id_file.write('%d\t%s\n' % (global_id, url))
                url_id_file.flush()
                global_id += 1
                url_tasks.put(url)
        transition_file.write('%d\t%s\n' % (url_ids[in_url], str(out_ids)))
        transition_file.flush()
        last_update = time.time()
        write_lock.release()
        #release lock
        print('%d urls in total reported by %d' % (global_id, self.id))
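The per-host robots handling inside crawl() (fetch once, cache under a lock, then consult is_allowed) can be read in isolation; a self-contained sketch of that pattern, with the cache and lock as module-level assumptions:

import threading
from robotexclusionrulesparser import RobotExclusionRulesParser

_robots_cache = {}
_robots_lock = threading.Lock()

def robots_for(hostname):
    # fetch and cache the robots.txt rules for each host exactly once
    with _robots_lock:
        if hostname not in _robots_cache:
            rp = RobotExclusionRulesParser()
            try:
                rp.fetch('http://' + hostname + '/robots.txt', timeout=3)
            except Exception:
                pass  # an empty rule set allows everything by default
            _robots_cache[hostname] = rp
    return _robots_cache[hostname]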
Example #10
class CrawlerWorker(QObject):  # spider that collects the links of a website
    finish = False

    def __init__(self, info, instance, parent=None):
        super(CrawlerWorker, self).__init__(parent)
        self._instance = instance
        self.running = True
        self.base_url = info['base_url']  # main url of website
        self._links_to_crawl = []  # list of links yet to open
        self.crawled_links = {}  # dictionary of links opened/all links
        self.__parsed_crawled = {}  # list of urls and their html pages
        self.total = 0  # total number of found links
        self.total_crawled = 0  # total number of valid crawled links in website
        self.max_pages = info['max_crawl']  # max pages to crawl
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # list of broken links found
        self.dynamic = []
        self.info = info
        self.login_url = info['login_url']  # login page url if available
        if info['robo_url']:
            self._rb_parser = RobotExclusionRulesParser()
            self._rb_parser.fetch(info['robo_url'])
            self._user_agent = 'WASecBot'
        else:
            self._rb_parser = None
        self.browser = browser.RoboBrowser(parser="html.parser",
                                           user_agent="WASecBot")
        self.browser.session.verify = False
        self._logged_in = False
        self.running = True
        self._instance.btncrawlcancel.clicked.connect(self.pause)
        self._elapsed = 0
        self.delay = 15
        self._requests = 0
        self.start = None
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _opener(self, url):
        retry = 1
        while True:
            try:
                self.browser.open(url=url)
                break
            except exceptions.ConnectionError:
                # sleep(self.delay * retry)
                if retry == 11:
                    return False
                else:
                    retry += 1
        return True

    def _compute_crawl_delay(self):
        self._requests += 1
        if self._requests <= 10:
            self._elapsed += self.browser.response.elapsed.total_seconds()
            delay = self._elapsed / self._requests
            self.delay = delay * 200
            if self.delay >= 180:
                self.delay = 15
        else:
            self._requests = 1
            self._elapsed = self.browser.response.elapsed.total_seconds()
            self.delay = self._elapsed * 200

    def pause(self):
        self.running = False
        self._instance.change_state.emit('Canceling...')
        choice = QtWidgets.QMessageBox.question(
            self._instance, "Cancel Crawl!",
            "WASec is not finished yet, are You sure you want to stop crawling?",
            QtWidgets.QMessageBox.Cancel | QtWidgets.QMessageBox.Yes)
        if choice == QtWidgets.QMessageBox.Yes:
            self.finish = True
            self.running = False
            self._instance.crawl_finished.emit(self._wrap_up())
        else:
            self.running = True

    # get total number of links opened so far
    def total_links(self):
        total = 0
        for index in self.crawled_links:
            total += len(self.crawled_links[index]['url'])
        return total

    # check if max pages reached
    def _crawled_max(self):
        result = (self.max_pages == 0) or (self.max_pages > self.total_links())
        return result

    # is link already listed
    def _is_link_listed(self, link):
        self._instance.change_state.emit('Check if URL is listed...')
        url = parse.urljoin(self.base_url, link)
        result = False
        for index in self.crawled_links:
            for opened in self.crawled_links[index]['url'].keys():
                if url == opened or link == opened:
                    result = True
        for to_open in self._links_to_crawl:
            if link == to_open[1] or url == to_open[1]:
                result = True
        return result

    # gets dynamic urls
    def _is_dynamic(self, url):
        self._instance.change_state.emit('Check if URL is dynamic...')
        if '?' in str(url) or '=' in str(url):
            self.dynamic.append(url)

    # check if page opened and exists
    def _is_response_ok(self, url):
        # status_code 200 means OK; no problems with page
        if 200 == self.browser.response.status_code:
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    def _is_html_page(self, url):
        try:
            if 'text/html' in self.browser.response.headers["content-type"]:
                return True
            else:
                self.invalid_links_count += 1
                self.invalid_links_list.append(url)
                self._instance.change_state.emit('URL is invalid!')
                return False
        except KeyError:
            return True

    def _is_same_page(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            page = self.browser.parsed
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    check = self.__parsed_crawled[link]
                    if check == page:
                        self._instance.change_state.emit('URL is invalid!')
                        return False
            return True
        else:
            self.finish = True
            self.running = False
            return False

    def _page_wise(self, url):
        if self.browser.url != url:
            res = self._opener(url)
        else:
            res = True
        if res:
            return self._is_response_ok(url) and self._is_html_page(
                url) and self._is_same_page(url)
        else:
            self.finish = True
            self.running = False
            return False

    def _is_same_query(self, page_link):
        parsed_url = parse.urlparse(page_link)
        query = parse.parse_qsl(parsed_url.query)
        query_len = len(query)
        if query_len > 0:
            for index in self.crawled_links:
                for link in self.crawled_links[index]['url'].keys():
                    parsed_link = parse.urlparse(link)
                    link_query = parse.parse_qsl(parsed_link.query)
                    if parsed_link.path == parsed_url.path and len(link_query) == query_len:
                        i = n = 0
                        while i < query_len:
                            if query[i][0] == link_query[i][0]:
                                n += 1
                            i += 1
                        if n == query_len:
                            # result = self._is_same_page(page_link)
                            # return result
                            self._instance.change_state.emit('URL is invalid!')
                            print("is same query")
                            return False
        return True

    # check if given url belongs to website
    # i.e. is in the website's domain
    def _in_domain(self, url):
        if self.base_url in url:  # result = 0 meaning url belongs to website
            return True
        else:
            self._instance.change_state.emit('URL is invalid!')
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check for url protocol
    def _check_protocol(self, url):
        parsed = parse.urlparse(url)  # parse url to get information from it
        protocol = str.lower(str(parsed[0]))  # get url protocol
        if protocol == "http" or protocol == "https":  # is protocol 'http' or 'https'
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            self._instance.change_state.emit('URL is invalid!')
            return False

    def _is_robot_allowed(self, path):
        if self._rb_parser:
            return self._rb_parser.is_allowed(self._user_agent, path)
        else:
            return True

    def _url_wise(self, url):
        return self._in_domain(url) and self._check_protocol(
            url) and self._is_same_query(url)

    def _is_url_good(self, url):
        return self._url_wise(url) and self._page_wise(url)

    def _at_login(self, url):
        if not self.login_url or self.login_url != str(url):
            return False
        elif self.login_url == str(url):
            return True

    def _check_login(self, parsed):
        if self.info['logged_in']:
            self._instance.change_state.emit('Logging into the website...')
            handel = BeyondLogin(self.browser)
            self._logged_in = handel.get_login_info(self.info)
            parent = self._check_parent(handel.login_url)
            if self._logged_in:
                self._instance.change_state.emit('Login Successful!')
                # sleep(2)
                if parent:
                    self._add_crawled(handel.login_url, parent, parsed)
                else:
                    self._add_crawled(handel.login_url, self.base_url, parsed)
            else:
                self._instance.change_state.emit('Login Failed!')

            self._links_to_crawl.append(
                [handel.login_url, handel.redirect_url])
        else:
            self._instance.change_state.emit('Login Successful!')
            self._logged_in = True

    def _check_parent(self, url):
        for child in self._links_to_crawl:
            if child[1] == url:
                return child[0]
        return None

    # get all links in a given page
    def _get_page_links(self, url, page):
        self._instance.change_state.emit('Searching for all links in page...')
        # gets a list of all <a> tags in page
        links_tags = page.find_all("a")
        # going through each link
        for link in links_tags:
            self._instance.change_state.emit(
                'Searching for all links in page...')
            # get the <a> tag's href, e.g. <a href="page.html"> ==> page.html
            link_href = link.get("href")
            # check that: link isn't already listed + link isn't blank
            link_listed = self._is_link_listed(link_href)
            if (not link_listed) and ('#' not in str(link_href)):
                # add link to list of links to open
                self._links_to_crawl.append([url, link_href])
                print("_get_page_links")
                print(url, link_href)
                self.total += 1

        forms = page.find_all("form")
        for form in forms:
            action = form.get("action")
            if action:  # link isn't blank
                # check that: link isn't already listed +
                link_listed = self._is_link_listed(action)
                if (not link_listed) and (action != "#"):
                    # add link to list of links to open
                    self._links_to_crawl.append([url, action])
                    self.total += 1
        self._instance.show_total.emit(self.total)

        image_map = page.find_all('area')
        for area in image_map:
            # get the 'href' attribute from an <area shape="rect" href="#main"> tag
            href = area.get('href')
            listed = self._is_link_listed(href)
            if (not listed) and ('#' not in href):
                # add link to list of links to open
                self._links_to_crawl.append([url, href])
                self.total += 1
        self._instance.show_total.emit(self.total)

    # open a page and get its content
    def _open_url(self, url):
        if self.running:
            self._instance.change_state.emit('Pausing between requests...')
            # get page content
            parsed = self.browser.parsed
            if self.info['max_crawl'] != 1:
                self._get_page_links(
                    url, parsed)  # send content to retrieve links from

                # sleep(self.delay)
            else:
                self._add_crawled(url, url, parsed)
                self._is_dynamic(url)
                self._instance.show_total.emit(self.total_crawled)
            if self._at_login(url) and not self._logged_in:
                self._check_login(parsed)
            return parsed

    def _add_crawled(self, url, parent, parsed_page):
        self._instance.change_state.emit('Adding new crawled link...')
        found = False
        try:
            title = parsed_page.find('title')
            if not title:
                title = 'NO-TITLE'
            else:
                title = title.text
        except:
            title = 'NO-TITLE'

        for index in self.crawled_links:
            if self.crawled_links[index]['from'] == parent:
                self.crawled_links[index]['url'][url] = title
                found = True
                break
        if not found:
            self.crawled_links[self.total_crawled] = {
                'from': parent,
                'url': {
                    url: title
                }
            }
            self.total_crawled += 1
        self.__parsed_crawled[url] = parsed_page
        self._instance.on_info.emit(self.crawled_links)
        # sleep(2)

    # main spider function; creates our spider's web
    def run(self):
        self.start = datetime.now().time()
        self._opener(self.base_url)
        self._open_url(self.base_url)  # send main url to be opened and checked
        self._elapsed = self.browser.state.response.elapsed.total_seconds()
        self._compute_crawl_delay()
        # while there are still links to open
        self.i = len(self._links_to_crawl) - 1
        while len(self._links_to_crawl) > 0 and self._crawled_max() and not self.finish:
            self._instance.change_state.emit('Crawling...')
            # start from the last link in the list
            parent = self._links_to_crawl[self.i][0]
            link = self._links_to_crawl[self.i][1]
            print("----")
            print(parent, link)
            if not parent.endswith('/'):
                parent = parent + '/'
            # url = parse.urljoin(self.base_url, link)  # join main url with page link
            url = parse.urljoin(parent, link)  # join main url with page link
            self._opener(url)
            if 200 != self.browser.response.status_code:
                url = parse.urljoin(self.base_url,
                                    link)  # join main url with page link
            print(url)
            if self._is_url_good(url) and self._is_robot_allowed(
                    link):  # is url valid and working
                print("good")
                self._instance.change_state.emit('URL is good!')
                parsed_page = self._open_url(url)  # open page
                self._add_crawled(url, parent, parsed_page)
                self._compute_crawl_delay()
                # add link to list of opened links
                self._is_dynamic(url)
            else:
                print("not good")
                self._instance.change_state.emit('URL is not good!')
            # delete opened link from list of links to open
            self._links_to_crawl.pop(self.i)
            if self.i > 0:
                self.i = self.i - 1
            elif self.i == 0:
                self.i = len(self._links_to_crawl) - 1
            if len(self._links_to_crawl) == 0 or self.i < 0:
                self._instance.change_state.emit('Finished.')
                self.finish = True
                break
        self.finish = True
        self._instance.crawl_finished.emit(self._wrap_up())

    def _calc_time(self):
        finish = datetime.now().time()
        delta1 = timedelta(seconds=self.start.second,
                           microseconds=self.start.microsecond,
                           minutes=self.start.minute,
                           hours=self.start.hour)
        delta2 = timedelta(seconds=finish.second,
                           microseconds=finish.microsecond,
                           minutes=finish.minute,
                           hours=finish.hour)
        taken = delta2 - delta1
        seconds = round(taken.total_seconds())
        if seconds >= 3600:
            hours = seconds // 3600
            minutes = round((seconds % 3600) / 60)
            elapsed = str(hours) + ':' + str(minutes) + ' hrs'
        elif seconds >= 60:
            minutes = seconds // 60
            seconds = round(seconds % 60)
            elapsed = str(str(minutes) + '.' + str(seconds) + ' mins')
        else:
            elapsed = str(seconds) + ' secs'
        return elapsed

    def _wrap_up(self):
        wrap = {
            'links': self.crawled_links,
            'dynamic': self.dynamic,
            'total_crawled': self.total_links(),
            'total': self.total,
            'invalid': self.invalid_links_count,
            'running': self.running,
            'time': self._calc_time()
        }
        return wrap