Example #1
def robots(url):
    a = {}
    rp = robotparser.RobotFileParser()
    rp.set_url(url)
    rp.read()  # read() parses the file in place and returns None, so its result is not stored
    a['url'] = url
    a['good crawler'] = rp.can_fetch('GoodCrawler', url)
    a['bad crawler'] = rp.can_fetch('Bad', url)
    a['Google'] = rp.can_fetch('Googlebot', url)
    print(a)
Example #2
def get_robots_parser(robots_url):
    " Return the robots parser object using the robots_url "
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print('Error finding robots_url:', robots_url, e)
        return None
Example #3
def parse_robots(robots_url):
    print(f"robots url {robots_url}")

    try:
        rp = robotparser.RobotFileParser(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print(f"robots parse error {e}")
Example #4
 def process_robot_txt(self):
     """Обрабатываем файл robots.txt и возвращаем true или false в зависимости от того,можно ли нам краулить сайт"""
     rp = robotparser.RobotFileParser()
     try:
         rp.set_url(self + '/robots.txt')
         rp.read()
     except (URLError, UnicodeEncodeError, UnicodeDecodeError):
         return True
     return rp.can_fetch('*', self)
Example #5
def get_robots_parser(robots_url):
    """return the robots parser object using the robots_url"""
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print("Error finding robots url:", robots_url, e)
Example #6
    def get(self, url):
        print('Downloading:', url)
        # Prepare the headers for the request, registering the user agent to be
        # used throughout the operation.
        headers = {'User-agent': self.user_agent}

        # Because these files are so small, there is little point in keeping a
        # local copy: robots.txt is fetched each time a page of the site is visited.
        rp = robotparser.RobotFileParser()  # initialize the robots.txt parser
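The method above is cut short right after the parser is created. A minimal sketch of the same idea as a standalone helper, assuming the requests library and a hypothetical polite_get name (none of this is from the original source):

import requests
from urllib import robotparser
from urllib.parse import urljoin

def polite_get(url, user_agent='GoodCrawler'):
    # Sketch only: fetch robots.txt on every request, honour it, then download
    # the page with the prepared headers.
    headers = {'User-agent': user_agent}
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))  # robots.txt always sits at the site root
    rp.read()
    if not rp.can_fetch(user_agent, url):
        print('Blocked by robots.txt:', url)
        return None
    return requests.get(url, headers=headers).text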
Example #7
def checkRobots(robots, url):
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots)
        rp.read()
        return rp.can_fetch("*", url)
    except:
        #no robots? allows everything
        return True
Example #8
 def __init__(self, scheme, hostname):
     self.hostname = hostname
     self.iter = 0
     self.cnt_urls = {0: scheme + '://' + self.hostname}
     self.urls_cnt = {scheme + '://' + self.hostname: 0}
     self.free_numbers = set()
     self.timestamps = {0: None}
     self._rp = urobot.RobotFileParser()
     self._rp.set_url(scheme + '://' + hostname + '/robots.txt')
Example #9
def get_robots(url):
    try:
        url = url + "/robots.txt"
        rp = robotparser.RobotFileParser()
        rp.set_url(url)
        rp.read()
        return rp
    except Exception as e:
        print(str(e))
        return None
Example #10
 def checkrobot(self, u):
     try:
         robUrl = u if u.find(
             "/", MAX_PROTO_LEN) == -1 else u[:u.find("/", MAX_PROTO_LEN)]
         robUrl = robUrl + ROBOTS_TXT
         rob = rp.RobotFileParser()
         rob.set_url(robUrl)
         rob.read()  # without read() the parser has no rules and can_fetch() always returns False
         return rob.can_fetch("*", u)
     except:
         return True
Example #11
def polite(robotcheckers, url):
	host = urlparse(url).netloc
	try:
		rc = robotcheckers[host]
	except KeyError:
		rc = robotparser.RobotFileParser()
		rc.set_url('http://' + host + '/robots.txt')
		rc.read()
		robotcheckers[host] = rc
	return rc.can_fetch('*', url)
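A short usage sketch for the function above, assuming a single dict shared across the whole crawl so each host's robots.txt is fetched only once (the URLs are purely illustrative):

robotcheckers = {}  # shared cache: one RobotFileParser per host
for url in ['http://example.com/', 'http://example.com/private/',
            'http://example.org/index.html']:
    if polite(robotcheckers, url):
        print('OK to fetch:', url)
    else:
        print('Skipping (disallowed):', url)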
Example #12
def get_robots(url):
    rp = robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    try:
        rp.read()
    except urllib.error.URLError as e:
        print('robot get error:', e.reason)
        rp = None

    return rp
Example #13
def get_robots(url):
    """
    Initialize a robots parser for this URL
    :param url:
    :return:
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #14
def my_robot():
    import urllib.robotparser as robot
    par = robot.RobotFileParser()
    par.set_url('https://www.samsclub.com/robots.txt')
    par.read()  # reading the URL content
    print('~' * 20)
    print(par)
    print('~' * 20)
    print(par.can_fetch('*', 'https:/www.samsclub.com/friend'))
    print(par.can_fetch('*', 'https://www.samsclub.com/friend'))
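Printing the parser, as the example does, dumps the rule entries it parsed. A small follow-on sketch of the other accessors the standard-library parser offers (crawl_delay and request_rate since Python 3.6, site_maps since Python 3.8), reusing the same robots.txt URL:

import urllib.robotparser as robot

par = robot.RobotFileParser()
par.set_url('https://www.samsclub.com/robots.txt')
par.read()
print(par.crawl_delay('*'))    # None unless a Crawl-delay directive applies
print(par.request_rate('*'))   # None or a (requests, seconds) named tuple
print(par.site_maps())         # None or the list of Sitemap URLs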
Example #15
 def _check_robots(url):
     """Check that our crawler satisfies robot exclusion standard"""
     try:
         robot_url = Robots.robots_url(url)
         parse = robotparser.RobotFileParser()
         parse.set_url(robot_url)
         parse.read()
         return parse.can_fetch('*', url)
     except:
         return True
Example #16
def robotsAllowed(url):
    robotUrl = baseUrl(url) + "/robots.txt"
    if robotUrl in robotsDict:
        return robotsDict[robotUrl].can_fetch("*", url)
    else:
        rp = robotparser.RobotFileParser()
        rp.set_url(robotUrl)
        rp.read()
        robotsDict[robotUrl] = rp
        return rp.can_fetch("*", url)
Example #17
 def init_robot_parser(self):
     robparser = robotparser.RobotFileParser()
     robparser.set_url(self.base_url + "/robots.txt")
     try:
         robparser.read()
         return robparser
     except Exception as e:
         print(e)
         print("could not find robots.txt on url: " + self.base_url + "/robots.txt")
         exit(1)  # non-zero exit: robots.txt could not be read
Example #18
        def wrapper(*args, **kwargs):
            parser = robotparser.RobotFileParser(url=kwargs['url'])
            parser.read()

            if parser.can_fetch(agent_name, kwargs['url']):
                return func(*args, **kwargs)
            else:
                raise PermissionError(
                    f'The robots.txt does not permit crawling the site {kwargs["url"]}'
                )
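Only the inner wrapper is shown above; a minimal sketch of how the enclosing decorator might look, with respect_robots and download as assumed names (the original defines agent_name elsewhere):

import functools
from urllib import robotparser

def respect_robots(agent_name='*'):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            parser = robotparser.RobotFileParser(url=kwargs['url'])
            parser.read()
            if parser.can_fetch(agent_name, kwargs['url']):
                return func(*args, **kwargs)
            raise PermissionError(
                f'The robots.txt does not permit crawling the site {kwargs["url"]}'
            )
        return wrapper
    return decorator

@respect_robots(agent_name='GoodCrawler')
def download(*, url):
    ...  # fetch the page; url must be passed as a keyword argument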
Example #19
    def init_robot_parser(host):
        try:
            robot_parser = robotparser.RobotFileParser()
            robot_parser.set_url(urljoin(host, 'robots.txt'))
            robot_parser.read()
            return robot_parser
        except Exception as e:
            log(WARNING, e)

        return None
Example #20
def get_site_maps(url):
    robot_checkers = {}
    host = urlparse(url).netloc
    try:
        rc = robot_checkers[host]
    except KeyError:
        rc = robotparser.RobotFileParser()
        rc.set_url('http://' + host + '/robots.txt')
        rc.read()
        robot_checkers[host] = rc
    return rc.site_maps()
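A usage note for the function above: site_maps() exists only on Python 3.8+ and returns None when robots.txt declares no Sitemap lines, so the caller should handle both cases (the URL below is just a placeholder):

maps = get_site_maps('https://www.example.com/')  # requires Python 3.8+
if maps:
    for sitemap_url in maps:
        print(sitemap_url)
else:
    print('No Sitemap entries declared')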
Example #21
def get_robots_parser(robots_url):
	'''
	args:
		robots_url(str): URL of the website's robots.txt, e.g. http://www.a.com/robots.txt
	returns:
		rp (robotparser.RobotFileParser)
	'''
	rp = robotparser.RobotFileParser()
	rp.set_url(robots_url)
	rp.read()
	return rp
Example #22
def robot_check(url_parts):
    url = ''.join([str(p) for p in url_parts])
    rp = robotparser.RobotFileParser()
    rp.set_url(url)
    rp.read()
    if rp.can_fetch("*", url):
        print("Robots.txt: User Allowed")
        return True
    else:
        print("Robots.txt: User Disallowed. Please abort.")
        return False
Example #23
def get_robots_parser(robots_url):
    """
    Return robot parser object using robots url
    """
    try:
        robot_parser = robotparser.RobotFileParser()
        robot_parser.set_url(robots_url)
        robot_parser.read()
        return robot_parser
    except Exception as e:
        print('Error finding robots url:', robots_url, e)
Example #24
def robot():
    mrp = rp.RobotFileParser()
    mrp.set_url('https://www.tmall.com/robots.txt')
    mrp.read()
    url = 'https://www.baidu.com'
    user_agent = 'BadCrawler'
    flag = mrp.can_fetch(user_agent, url)
    print(flag)
    user_agent = ''
    flag = mrp.can_fetch(user_agent, url)
    print(flag)
Example #25
def robot_allow(url):
    # Parse the current URL down to its domain, append '/robots.txt', and then
    # check whether the path the crawler wants to visit is allowed.
    url_parsed = urlparse(url)
    robots_url = url_parsed.scheme + "://" + url_parsed.netloc + "/robots.txt"

    x = RFP.RobotFileParser()
    x.set_url(robots_url)
    x.read()
    return x.can_fetch(useragent, url)
Example #26
    def __init__(self, root_url: str, user_agent: str):
        self.__parser = robotparser.RobotFileParser()

        # Parse the URL
        url_parse = parse.urlparse(root_url)
        robotsPath = "%s://%s/robots.txt" % (url_parse.scheme,
                                             url_parse.netloc)

        self.__parser.set_url(robotsPath)
        self.__user_agent = user_agent
        self.__parser.read()
Example #27
def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    # rp.read() can fail with UnicodeDecodeError ("'utf-8' codec can't decode byte"),
    # so fetch the file manually, decode it leniently, and feed the lines to parse().
    lines = urlopen(urljoin(url, '/robots.txt')).read().decode(
        'utf-8', errors='ignore').split('\n')
    rp.parse(lines)
    return rp
Example #28
def canuse(baseurl, path):
    parser = urobot.RobotFileParser()
    parser.set_url(urljoin(baseurl, 'robots.txt'))
    parser.read()
    canParse = False

    if (parser.can_fetch(AGENT_NAME, path)):
        canParse = True
    if (parser.can_fetch(AGENT_NAME, urljoin(baseurl, path))):
        canParse = True

    return canParse
Example #29
 def locate_rules(self, root_url):
     try:
         robots_url = urlunparse(
             (root_url.scheme, root_url.netloc, "robots.txt", "", "", ""))
         robots = robotparser.RobotFileParser()
         robots.set_url(robots_url)
         robots.read()
         self.crawler_rules[root_url.netloc] = robots
     except Exception as e:
         custom_logger().log_message("Exception in robots:\n" + str(e),
                                     logger_handler.log_level_ERROR)
         self.crawler_rules[root_url.netloc] = None
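A sketch of how the cached entries might then be consulted, with a hypothetical is_allowed method that treats the None stored on failure as permission to crawl (an assumption, not shown in the original class):

from urllib.parse import urlparse

def is_allowed(self, url, user_agent='*'):
    # Hypothetical companion to locate_rules: None means robots.txt could not be
    # read, which is treated here as "no restrictions" (an assumption).
    rules = self.crawler_rules.get(urlparse(url).netloc)
    if rules is None:
        return True
    return rules.can_fetch(user_agent, url)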
Example #30
 def addrobot(self, root):
     root = urlparse.urljoin(root, "/")
     if root in self.robots: return
     url = urlparse.urljoin(root, "/robots.txt")
     self.robots[root] = rp = robotparser.RobotFileParser()
     self.note(2, "Parsing %s", url)
     rp.debug = self.verbose > 3
     rp.set_url(url)
     try:
         rp.read()
     except (OSError, IOError) as msg:
         self.note(1, "I/O error parsing %s: %s", url, msg)