Example #1
 def __init__(self):
     self.unvisitedURLs = set()
     self.visitedURLs = set()
     self.buggyURLs = set()
     self.robotParser = robotparser.RobotFileParser()
     self.contentDigest = {}
     self.http = httplib2.Http(".cache")
Example #2
 def check_robots(url):
     robots_parser = robotparser.RobotFileParser()
     robots_parser.set_url(Spider.currentBaseUrl + '/robots.txt')
     robots_parser.read()
     if not robots_parser.can_fetch(Spider.AGENT_NAME, url):
         return False
     return True
 def craw(self, rool_url):
     count = 1
     self.urls.add_new_url(rool_url)
     throttle = self.downloader.Throttle(0)
     rp = robotparser.RobotFileParser()
     rp.set_url('https://baike.baidu.com/robots.txt')
     rp.read()
     user_agent = 'wswp'
     while self.urls.has_new_url():
         try:
             new_url = self.urls.get_new_url()
             print 'page %d: %s' % (count, new_url)
             if True:  #rp.can_fetch(user_agent,new_url):
                 #print "if is running"
                 throttle.wait(new_url)
                 self.craw_isrunning(new_url)
                 if count == 20:
                     break
                 count = count + 1
             else:
                 print 'Blocked by robots.txt', new_url
         except Exception, e:
             print 'craw failed'
             print e.message
             print "111"
Example #4
def connect_mozrepl(url_addr):
    quit = False
    t = telnetlib.Telnet("localhost", 4242)
    t.read_until("repl>")

    # verify that fetching the page is allowed by robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url_addr, '/robots.txt'))
    rp.read()
    fetched = rp.can_fetch("*", url_addr)
    print fetched
    state = True
    while (state == True):
        if fetched == True:
            rdm = random.random() * 500
            print rdm
            time.sleep(rdm)  #WAIT FOR WEBPAGE TO LOAD
            str = "content.location.href='" + url_addr.strip() + "'\n"
            print str
            t.write(str)
            body = t.read_until("repl>")
            state = False
        else:
            state = False
            print "unable to fetch web page, exiting!!!"
            quit = True
            break
    t.write("content.document.body.innerHTML\n")
    body = t.read_until("repl>")

    t.close()

    return body, quit
Example #5
 def _get_robotparser(self, link):
     """Return the proper robots parser for the given url or None if one
     cannot be constructed. Robot parsers are cached per scheme and
     netloc."""
     # only some schemes have a meaningful robots.txt file
     if link.scheme != 'http' and link.scheme != 'https':
         debugio.debug(
             'crawler._get_robotparser() called with unsupported scheme (%s)'
             % link.scheme)
         return None
     # split out the key part of the url
     location = urlparse.urlunsplit((link.scheme, link.netloc, '', '', ''))
     # try to create a new robotparser if we don't already have one
     if location not in self._robotparsers:
         import httplib
         debugio.info('  getting robots.txt for %s' % location)
         self._robotparsers[location] = None
         try:
             rp = robotparser.RobotFileParser()
             rp.set_url(
                 urlparse.urlunsplit(
                     (link.scheme, link.netloc, '/robots.txt', '', '')))
             rp.read()
             self._robotparsers[location] = rp
         except (TypeError, IOError, httplib.HTTPException):
             # ignore any problems setting up robot parser
             pass
     return self._robotparsers[location]
Example #6
def link_crawler(seed_url,
                 delay=1,
                 link_regex=None,
                 proxies=None,
                 max_depth=2,
                 user_agent='wswp',
                 num_retries=2,
                 scrape_callback=ScrapeCallback(),
                 cache=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    rp = robotparser.RobotFileParser()
    #initialize downloader
    cache = MongoCache()
    D = downloader(cache=cache)

    while crawl_queue:
        url = crawl_queue.pop()
        # detect whether this url is banned
        #if rp.can_fetch(user_agent, url):
        depth = seen[url]
        if depth != max_depth:
            html = D(url)
            #html parser and convert to csv
            if scrape_callback:
                scrape_callback(url, html)
            for link in get_links(html):
                if re.search(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    """
    #此为先前版本,为了防止爬虫陷阱,把sean修改为字典
    #为了防止死循环(页面间的互相链接)加入seen表示已接受的链接
    #set表示生成集合,不重复
    #seen=set(crawl_queue)
    """
    #max_depth设置为负数就可以取消“避免爬虫陷阱”的功能
    #该功能维护一个字典<url,深度> 当深度超过最大深度时停止深挖
    max_depth = 1
    seen = {}
    # Crawl according to robots.txt: all three of the following lines are required.
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        # a 'BadCrawler' user agent would be blocked by this site's robots.txt
        if rp.can_fetch('Crawler', url):
            # rate limiting
            throttle = Throttle(delay=2000)
            throttle.wait(url)
            html = download(url)
            depth = seen.get(url, 0)
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        # urljoin (from the imported urlparse package) turns the relative path into an absolute one
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Example #8
    def Allowed(self, url, UserAgentString):
        try:
            parsed = urlparse(url)
            port = ""
            if (parsed.port):
                port = ":" + str(parsed.port)
        except ValueError:
            print("ValueError: " + url)

        roboturl = ""
        try:
            roboturl = parsed.scheme + "://" + parsed.hostname + port + "/robots.txt"
        except TypeError:
            print(parsed)
        if roboturl not in self.RuleDict:
            self.RuleDict[roboturl] = robotparser.RobotFileParser(roboturl)
            try:
                self.RuleDict[roboturl].read()
            except IOError:
                del self.RuleDict[roboturl]
                return True

        try:
            return self.RuleDict[roboturl].can_fetch(UserAgentString, url)
        except KeyError:
            print("Keyerror: " + url)
            return True
Example #9
def robotparser(url):
    # NOTE: this function shadows the robotparser module name; the local
    # import below keeps the name bound to the module inside the function.
    import robotparser
    import urlparse
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #10
    def check_robots(self, msg):
        data = json.loads(msg.body)
        self.ch.basic_ack(msg.delivery_tag)

        # get the robots.txt URL
        url = self.get_robots_url(data['url'])
        logging.info("Using robots url: %s", url)
        try:
            # fetch robots.txt
            robots_txt = requests.get(url, headers=self.headers)
            # pass the content to the robots.txt parser
            rbp = robotparser.RobotFileParser()
            rbp.parse(robots_txt.text.splitlines())

            # check to see if we're allowed in - test using OrgProbe's useragent
            if not rbp.can_fetch(self.config.get('daemon', 'probe_useragent'),
                                 data['url']):
                logging.warn("Disallowed: %s", data['url'])
                # write rejection to DB
                self.set_url_status(data['url'], 'disallowed-by-robots-txt')
                return True
            else:
                # we're allowed in.
                logging.info("Allowed: %s", data['url'])
        except Exception, v:
            # if anything bad happens, log it but continue
            logging.error("Exception: %s", v)
def get_robots(url):
    """解析robots.txt文件
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #12
 def _can_fetch(self, url):
     # robots.txt checking is disabled here: the early return makes the
     # parser code below unreachable.
     return True
     robots_file = self._get_robots_file_url(url)
     rp = robotparser.RobotFileParser()
     rp.set_url(robots_file)
     rp.read()
     return rp.can_fetch(USER_AGENT, url)
Example #13
def link_crawler(seed_url, link_regex):
    """
     Crawl from the given seed URL, following links matched by link_regex
     :param seed_url: 
     :param link_regex: 
     :return: 
     """
    #read the robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    #set the agent's name
    user_agent = "667's Python Spider"
    #set the crawl delay to 5 seconds

    th = Throttle.Throttle(5)

    #set the crawl queue for crawled url
    crawl_queue = [seed_url]
    visited = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            th.wait(url)
            html = download_network_page(url)
            print html
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)

                    if link not in visited:
                        visited.add(link)
                        crawl_queue.append(link)
Example #14
def check_robots(url, user_agent, robots_name='robots.txt'):
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, robots_name))
    rp.read()
    result = rp.can_fetch(user_agent, url)

    return result
Example #15
	def check_robot(self, url):
		"""Check robots.txt on the root of the URL"""
		global robot_trace
		global lock_robottrace

		robot_url = urlparse.urljoin(url, '/robots.txt')
		fetch_robot = False
		lock_robottrace.acquire()
		if robot_url not in robot_trace:
			fetch_robot = True
		else:
			ret = robot_trace[robot_url].can_fetch("*", url)
		lock_robottrace.release()

		rp = None
		try:
			if fetch_robot:
				rp = robotparser.RobotFileParser()
				rp.set_url(robot_url)
				rp.read()
			else:
				return ret
		except:
			return True
		else:
			lock_robottrace.acquire()
			if robot_url not in robot_trace:
				robot_trace[robot_url] = rp
				print "Find robots.txt at %s" % robot_url
			ret = robot_trace[robot_url].can_fetch("*", url)
			lock_robottrace.release()
			return ret				
Example #16
 def add_to_redis(r_conn, redis_server, redis_db):
     # NOTE: this excerpt relies on names defined elsewhere in its module:
     # `id` (presumably the site's base URL), `rParser` (the robotparser
     # module) and `cP` (presumably cPickle).
     robots = rParser.RobotFileParser()
     robots.set_url(id + "/robots.txt")
     robots.read()
     r_conn.set(id, cP.dumps(robots))
     r_conn.expire(id, 7200)
     return robots
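The excerpt above only stores the pickled parser and depends on module-level names. Below is a self-contained sketch of the same cache-a-parsed-robots.txt-in-Redis idea, showing both sides of the round trip; the base URL and the local Redis connection are illustrative assumptions, not part of the original.

import cPickle
import robotparser
import redis

def cache_robots(r_conn, base_url, ttl=7200):
    # fetch and parse robots.txt, then keep the pickled parser in Redis
    robots = robotparser.RobotFileParser()
    robots.set_url(base_url + "/robots.txt")
    robots.read()
    r_conn.set(base_url, cPickle.dumps(robots))
    r_conn.expire(base_url, ttl)  # drop the cached copy after the TTL
    return robots

def load_robots(r_conn, base_url):
    # return the cached parser, or None if it expired or was never stored
    cached = r_conn.get(base_url)
    return cPickle.loads(cached) if cached else None

# usage (hypothetical host):
#   r_conn = redis.StrictRedis()
#   rp = load_robots(r_conn, 'http://example.com') or cache_robots(r_conn, 'http://example.com')
#   print rp.can_fetch('*', 'http://example.com/some/page')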
Example #17
def link_crawler1(seed_url, link_regex, user_agent='lcy', max_depth=2):
    crawl_queue = [seed_url]
    seen = {}  # record the URLs we have already seen
    seen[seed_url] = 1

    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()

    while crawl_queue:
        url = crawl_queue.pop()

        if rp.can_fetch(user_agent, url):  # check robots.txt: may this user agent fetch the URL?
            html = download1(url, user_agent)
            # get_links extracts every static URL from the html
            # link_regex matches the target URLs, filtering out non-target pages

            # extract links only while below the specified depth
            depth = seen[url]
            if depth != max_depth:  # a negative max_depth is never equal, which disables the depth limit
                for link in get_links(html):
                    if re.match(link_regex, link):
                        # browsers can follow a relative href path, but urllib2 cannot,
                        # so use urljoin to build an absolute link
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Example #18
 def robot_fetch(path):
     for rule in kw["robots_exclusions"]:
         robot = robotparser.RobotFileParser()
         robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
         if not robot.can_fetch("*", '/' + path):
             return False  # not robot food
     return True
 def testPythonOrg(self):
     test_support.requires('network')
     parser = robotparser.RobotFileParser(
         "http://www.python.org/robots.txt")
     parser.read()
     self.assertTrue(
         parser.can_fetch("*", "http://www.python.org/robots.txt"))
Example #20
    def get_robots(self, url):
        #extract the domain's base url
        #check for the existence of robots.txt
        #process robots.txt (User-agent, Allow, Disallow, Crawl-delay and Sitemap)??
        #If a sitemap is defined, should all the URLs within it be added to the frontier exclusively or additionally?
        #If the site is not already in the DB, write it there
        #else just try to find the site's RP object in the local cache
        cursor = self.cursor
        conn = self.db_conn
        parsed_uri = urlparse(url)
        domain = Crawler_worker.remove_www(parsed_uri.netloc)
        domain_url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        ##### restore from cache if stored else create #####
        if domain in Crawler_worker.cache_robots:
            rp = Crawler_worker.cache_robots[domain]
        else:
            robots_url = domain_url + 'robots.txt'
            rp = robotparser.RobotFileParser()
            rp.set_url(robots_url)
            try:
                rp.read()
            except Exception as e:
                print(self.id, 'EXCEPTION get_robots()', e)
                pass

        Crawler_worker.cache_robots_lock.acquire()
        self.cache_robots_lock_timestamp = time.time()
        if domain not in Crawler_worker.cache_robots:
            Crawler_worker.cache_robots[domain] = rp
        Crawler_worker.cache_robots_lock.release()
        self.cache_robots_lock_timestamp = None
        return rp
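The comments above ask how Crawl-delay and Sitemap directives should be handled. Python 2's robotparser does not parse them, but Python 3's urllib.robotparser does; a minimal sketch follows, where the example.com URL is only a placeholder.

# Python 3 sketch: urllib.robotparser also parses Crawl-delay and
# Request-rate (3.6+) and Sitemap (3.8+) directives.
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')  # placeholder URL
rp.read()

print(rp.can_fetch('*', 'http://example.com/index.html'))
print(rp.crawl_delay('*'))    # value of Crawl-delay, or None
print(rp.request_rate('*'))   # named tuple (requests, seconds), or None
print(rp.site_maps())         # list of Sitemap URLs, or None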
Example #21
def get_robots(url):
	""" 返回robots.txt中的url限制判断器
	"""
	rp = robotparser.RobotFileParser()
	rp.set_url(urlparse.urljoin(url, '/robots.txt'))
	rp.read()
	return rp
Example #22
    def __init__(self, url, urlLimit, delay):
        #self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        #self.s.settimeout(3600)
        #self.s.connect(("localhost", 1337))

        self.sleeptime = delay
        self.limit = urlLimit
        self.url = url

        self.db = MySql.Database("127.0.0.1", "pythondb", "pythonUser",
                                 "pythondb")

        self.filename = self.url

        self.urlfile = open(self.filename + '.txt', 'wb')

        self.url = 'http://' + self.url
        self.urls = [self.url]
        self.counter = 0

        self.rp = robotparser.RobotFileParser()
        self.rp.set_url(self.url + '/robots.txt')
        self.rp.read()
        print 'Set robotparser url to:', self.url + '/robots.txt'

        self.urlOpener = myOpener()
def link_crawler(seed_url, link_regex, max_depth=1, scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    rp = robotparser.RobotFileParser()
    while crawl_queue:
        url = crawl_queue.pop()
        rp.set_url(urlparse.urljoin(url, '/robots.txt'))
        rp.read()
        user_agent = 'wswp'

        if rp.can_fetch(user_agent, url):
            throttle = Throttle.Throttle(5)
            throttle.wait(url)
            html = download(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            # seen.add(link)
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
Example #24
def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #25
def link_crawler(seed_url, max_depth=2):
    """Crawl from the given seed URL following links matched by link_regex
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    user_agent = 'Mozilla/5.0'
    throttle = Throttle(2)

    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            # Filter for links matching our regular expression
            for link in get_links(html):
                link = urlparse.urljoin(seed_url, link)
                # print link
                # check if crawler has already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt', url
 def testPasswordProtectedSite(self):
     test_support.requires('network')
     with test_support.transient_internet('mueblesmoraleda.com'):
         url = 'http://mueblesmoraleda.com'
         robots_url = url + "/robots.txt"
         # First check the URL is usable for our purposes, since the
         # test site is a bit flaky.
         try:
             urlopen(robots_url)
         except HTTPError as e:
             if e.code not in {401, 403}:
                 self.skipTest(
                     "%r should return a 401 or 403 HTTP error, not %r" %
                     (robots_url, e.code))
         else:
             self.skipTest(
                 "%r should return a 401 or 403 HTTP error, not succeed" %
                 (robots_url))
         parser = robotparser.RobotFileParser()
         parser.set_url(url)
         try:
             parser.read()
         except IOError:
             self.skipTest('%s is unavailable' % url)
         self.assertEqual(parser.can_fetch("*", robots_url), False)
Example #27
    def get_links(self, soup, base_url):
        """Extract the urls from a parsed html."""
        base = soup.find("base", href=True)
        if base:
            base_url = urljoin(base_url, base.get("href"))

        all_links = [urljoin(base_url, i.get('href').strip())
                     for i in soup.find_all('a', href=True)]

        # I remove urls starting with "/"
        # For debugging purposes I use a long version of the following:
        # links = [l for l in all_links if not l.startswith("/")]
        links = []
        for l in all_links:
            if l.startswith("/"):
                logging.debug("skipping url starting with '/'.base: %s link: %s"
                              % (base_url, l))
            else:
                links.append(l)

        if self.nofollow_compliant is True:
            nofollow = Set(
                [urljoin(base_url, i.get('href'))
                 for i in soup.find_all('a', {"rel": "nofollow"}, href=True)]
            )
            links = [l for l in links if l not in nofollow]

        allowed_links = Set()
        domains = Set()
        for l in links:
            domain = self.get_domain(l)
            if len(self.allowed_domains) == 0 or domain in self.allowed_domains:
                norm_l = self.normalize_url(l)
                if [True for ex in self.exclude_pages if re.match(ex, norm_l)]:
                    continue
                rb = None
                domains.add(domain)
                if self.robots_compliant:
                    rb = self.robotparser_cache.get(domain)
   
                    if rb is None:
                        rb = robotparser.RobotFileParser()
                        # TODO: fix this
                        rb.set_url("http://" + domain + "/robots.txt")
                        rb.read()
                        self.robotparser_cache[domain] = rb

                # INFO: this try-except is here because can_fetch sometimes
                #       raises unicode-related errors; it seems to be a bug
                #       in the library.
                ################################################################
                try:
                    # TODO: using '*' because the user-agent is not known here for now
                    if not rb or rb.can_fetch("*", norm_l):
                        allowed_links.add(norm_l)

                except KeyError:
                    pass

        return list(allowed_links)
Example #28
def get_robots(url):
    """Return True if both URL's belong to same domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
 def testPythonOrg(self):
     test_support.requires('network')
     with test_support.transient_internet('www.python.org'):
         parser = robotparser.RobotFileParser(
             "https://www.python.org/robots.txt")
         parser.read()
         self.assertTrue(
             parser.can_fetch("*", "https://www.python.org/robots.txt"))
Example #30
def robot_file(domain):
    rp = robotparser.RobotFileParser(urlparse.urljoin(domain, "robots.txt"))
    rp.read()

    def _clos(url):
        return rp.can_fetch("*", url)

    return _clos