Example #1
def robots(url):
    a = {}
    rp = robotparser.RobotFileParser()
    rp.set_url(url)
    rp.read()  # read() parses the file in place and returns None, so its result is not stored
    a['url'] = url
    a['good crawler'] = rp.can_fetch('GoodCrawler', url)
    a['bad crawler'] = rp.can_fetch('Bad', url)
    a['Google'] = rp.can_fetch('Googlebot', url)
    print(a)
Example #2
def get_robots_parser(robots_url):
    " Return the robots parser object using the robots_url "
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print('Error finding robots_url:', robots_url, e)
        return None
Example #3
def parse_robots(robots_url):
    print(f"robots url {robots_url}")

    try:
        rp = robotparser.RobotFileParser(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print(f"robots parse error {e}")
Example #4
 def process_robot_txt(self):
     """Обрабатываем файл robots.txt и возвращаем true или false в зависимости от того,можно ли нам краулить сайт"""
     rp = robotparser.RobotFileParser()
     try:
         rp.set_url(self + '/robots.txt')
         rp.read()
     except (URLError, UnicodeEncodeError, UnicodeDecodeError):
         return True
     return rp.can_fetch('*', self)
Example #5
def get_robots_parser(robots_url):
    """return the robots parser object using the robots_url"""
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp
    except Exception as e:
        print("Error finding robots url:", robots_url, e)
Example #6
    def get(self, url):
        print('Downloading:', url)
        # Prepare the headers for the request, registering the user agent to be
        # used throughout the operation.
        headers = {'User-agent': self.user_agent}

        # Because these files are so small, there is little point in keeping a
        # local copy: robots.txt is fetched each time a page of the site is visited.
        rp = robotparser.RobotFileParser()  # initialize the robots.txt parser
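The method above is cut short right after the parser is created. A minimal sketch of the same idea as a standalone helper, assuming the requests library and a hypothetical polite_get name (none of this is from the original source):

import requests
from urllib import robotparser
from urllib.parse import urljoin

def polite_get(url, user_agent='GoodCrawler'):
    # Sketch only: fetch robots.txt on every request, honour it, then download
    # the page with the prepared headers.
    headers = {'User-agent': user_agent}
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))  # robots.txt always sits at the site root
    rp.read()
    if not rp.can_fetch(user_agent, url):
        print('Blocked by robots.txt:', url)
        return None
    return requests.get(url, headers=headers).text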
Example #7
def checkRobots(robots, url):
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots)
        rp.read()
        return rp.can_fetch("*", url)
    except:
        #no robots? allows everything
        return True
Example #8
 def __init__(self, scheme, hostname):
     self.hostname = hostname
     self.iter = 0
     self.cnt_urls = {0: scheme + '://' + self.hostname}
     self.urls_cnt = {scheme + '://' + self.hostname: 0}
     self.free_numbers = set()
     self.timestamps = {0: None}
     self._rp = urobot.RobotFileParser()
     self._rp.set_url(scheme + '://' + hostname + '/robots.txt')
Example #9
def get_robots(url):
    try:
        url = url + "/robots.txt"
        rp = robotparser.RobotFileParser()
        rp.set_url(url)
        rp.read()
        return rp
    except Exception as e:
        print(str(e))
        return None
Example #10
 def checkrobot(self, u):
     try:
         robUrl = u if u.find(
             "/", MAX_PROTO_LEN) == -1 else u[:u.find("/", MAX_PROTO_LEN)]
         robUrl = robUrl + ROBOTS_TXT
         rob = rp.RobotFileParser()
         rob.set_url(robUrl)
         rob.read()  # without read() the parser has no rules and can_fetch() always returns False
         return rob.can_fetch("*", u)
     except:
         return True
Example #11
def polite(robotcheckers, url):
	host = urlparse(url).netloc
	try:
		rc = robotcheckers[host]
	except KeyError:
		rc = robotparser.RobotFileParser()
		rc.set_url('http://' + host + '/robots.txt')
		rc.read()
		robotcheckers[host] = rc
	return rc.can_fetch('*', url)
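A short usage sketch for the function above, assuming a single dict shared across the whole crawl so each host's robots.txt is fetched only once (the URLs are purely illustrative):

robotcheckers = {}  # shared cache: one RobotFileParser per host
for url in ['http://example.com/', 'http://example.com/private/',
            'http://example.org/index.html']:
    if polite(robotcheckers, url):
        print('OK to fetch:', url)
    else:
        print('Skipping (disallowed):', url)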
Example #12
def get_robots(url):
    rp = robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(url, '/robots.txt'))
    try:
        rp.read()
    except urllib.error.URLError as e:
        print('robot get error:', e.reason)
        rp = None

    return rp
Example #13
def get_robots(url):
    """
    Initialize a robots parser for this URL
    :param url:
    :return:
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(parse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #14
def my_robot():
    import urllib.robotparser as robot
    par = robot.RobotFileParser()
    par.set_url('https://www.samsclub.com/robots.txt')
    par.read()  # reading the URL content
    print('~' * 20)
    print(par)
    print('~' * 20)
    print(par.can_fetch('*', 'https:/www.samsclub.com/friend'))
    print(par.can_fetch('*', 'https://www.samsclub.com/friend'))
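Printing the parser, as the example does, dumps the rule entries it parsed. A small follow-on sketch of the other accessors the standard-library parser offers (crawl_delay and request_rate since Python 3.6, site_maps since Python 3.8), reusing the same robots.txt URL:

import urllib.robotparser as robot

par = robot.RobotFileParser()
par.set_url('https://www.samsclub.com/robots.txt')
par.read()
print(par.crawl_delay('*'))    # None unless a Crawl-delay directive applies
print(par.request_rate('*'))   # None or a (requests, seconds) named tuple
print(par.site_maps())         # None or the list of Sitemap URLs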
Example #15
 def _check_robots(url):
     """Check that our crawler satisfies robot exclusion standard"""
     try:
         robot_url = Robots.robots_url(url)
         parse = robotparser.RobotFileParser()
         parse.set_url(robot_url)
         parse.read()
         return parse.can_fetch('*', url)
     except:
         return True
Example #16
def robotsAllowed(url):
    robotUrl = baseUrl(url) + "/robots.txt"
    if robotUrl in robotsDict:
        return robotsDict[robotUrl].can_fetch("*", url)
    else:
        rp = robotparser.RobotFileParser()
        rp.set_url(robotUrl)
        rp.read()
        robotsDict[robotUrl] = rp
        return rp.can_fetch("*", url)
Example #17
 def init_robot_parser(self):
     robparser = robotparser.RobotFileParser()
     robparser.set_url(self.base_url + "/robots.txt")
     try:
         robparser.read()
         return robparser
     except Exception as e:
         print(e)
         print("could not find robots.txt on url: " + self.base_url + "/robots.txt")
         exit(1)  # non-zero exit: robots.txt could not be read
Example #18
        def wrapper(*args, **kwargs):
            parser = robotparser.RobotFileParser(url=kwargs['url'])
            parser.read()

            if parser.can_fetch(agent_name, kwargs['url']):
                return func(*args, **kwargs)
            else:
                raise PermissionError(
                    f'The robots.txt does not permit crawling the site {kwargs["url"]}'
                )
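Only the inner wrapper is shown above; a minimal sketch of how the enclosing decorator might look, with respect_robots and download as assumed names (the original defines agent_name elsewhere):

import functools
from urllib import robotparser

def respect_robots(agent_name='*'):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            parser = robotparser.RobotFileParser(url=kwargs['url'])
            parser.read()
            if parser.can_fetch(agent_name, kwargs['url']):
                return func(*args, **kwargs)
            raise PermissionError(
                f'The robots.txt does not permit crawling the site {kwargs["url"]}'
            )
        return wrapper
    return decorator

@respect_robots(agent_name='GoodCrawler')
def download(*, url):
    ...  # fetch the page; url must be passed as a keyword argument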
Example #19
    def init_robot_parser(host):
        try:
            robot_parser = robotparser.RobotFileParser()
            robot_parser.set_url(urljoin(host, 'robots.txt'))
            robot_parser.read()
            return robot_parser
        except Exception as e:
            log(WARNING, e)

        return None
Example #20
def get_site_maps(url):
    robot_checkers = {}
    host = urlparse(url).netloc
    try:
        rc = robot_checkers[host]
    except KeyError:
        rc = robotparser.RobotFileParser()
        rc.set_url('http://' + host + '/robots.txt')
        rc.read()
        robot_checkers[host] = rc
    return rc.site_maps()
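A usage note for the function above: site_maps() exists only on Python 3.8+ and returns None when robots.txt declares no Sitemap lines, so the caller should handle both cases (the URL below is just a placeholder):

maps = get_site_maps('https://www.example.com/')  # requires Python 3.8+
if maps:
    for sitemap_url in maps:
        print(sitemap_url)
else:
    print('No Sitemap entries declared')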
Example #21
def get_robots_parser(robots_url):
	'''
	args:
		robots_url(str): URL of the website's robots.txt, e.g. http://www.a.com/robots.txt
	returns:
		rp (robotparser.RobotFileParser)
	'''
	rp = robotparser.RobotFileParser()
	rp.set_url(robots_url)
	rp.read()
	return rp
Example #22
def robot_check(url_parts):
    url = ''.join([str(p) for p in url_parts])
    rp = robotparser.RobotFileParser()
    rp.set_url(url)
    rp.read()
    if rp.can_fetch("*", url):
        print("Robots.txt: User Allowed")
        return True
    else:
        print("Robots.txt: User Disallowed. Please abort.")
        return False
Example #23
def get_robots_parser(robots_url):
    """
    Return robot parser object using robots url
    """
    try:
        robot_parser = robotparser.RobotFileParser()
        robot_parser.set_url(robots_url)
        robot_parser.read()
        return robot_parser
    except Exception as e:
        print('Error finding robots url:', robots_url, e)
Example #24
def robot():
    mrp = rp.RobotFileParser()
    mrp.set_url('https://www.tmall.com/robots.txt')
    mrp.read()
    url = 'https://www.baidu.com'
    user_agent = 'BadCrawler'
    flag = mrp.can_fetch(user_agent, url)
    print(flag)
    user_agent = ''
    flag = mrp.can_fetch(user_agent, url)
    print(flag)
Example #25
def robot_allow(url):
    # Parse the current URL down to its domain, append '/robots.txt', and then
    # check whether the path the crawler wants to visit is allowed.
    url_parsed = urlparse(url)
    robots_url = url_parsed.scheme + "://" + url_parsed.netloc + "/robots.txt"

    x = RFP.RobotFileParser()
    x.set_url(robots_url)
    x.read()
    return x.can_fetch(useragent, url)
Example #26
    def __init__(self, root_url: str, user_agent: str):
        self.__parser = robotparser.RobotFileParser()

        # Parse the URL
        url_parse = parse.urlparse(root_url)
        robotsPath = "%s://%s/robots.txt" % (url_parse.scheme,
                                             url_parse.netloc)

        self.__parser.set_url(robotsPath)
        self.__user_agent = user_agent
        self.__parser.read()
Example #27
def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    # rp.read() can fail with UnicodeDecodeError ("'utf-8' codec can't decode byte"),
    # so fetch the file manually, decode it leniently, and feed the lines to parse().
    lines = urlopen(urljoin(url, '/robots.txt')).read().decode(
        'utf-8', errors='ignore').split('\n')
    rp.parse(lines)
    return rp
Example #28
def canuse(baseurl, path):
    parser = urobot.RobotFileParser()
    parser.set_url(urljoin(baseurl, 'robots.txt'))
    parser.read()
    canParse = False

    if (parser.can_fetch(AGENT_NAME, path)):
        canParse = True
    if (parser.can_fetch(AGENT_NAME, urljoin(baseurl, path))):
        canParse = True

    return canParse
Example #29
 def locate_rules(self, root_url):
     try:
         robots_url = urlunparse(
             (root_url.scheme, root_url.netloc, "robots.txt", "", "", ""))
         robots = robotparser.RobotFileParser()
         robots.set_url(robots_url)
         robots.read()
         self.crawler_rules[root_url.netloc] = robots
     except Exception as e:
         custom_logger().log_message("Exception in robots:\n" + str(e),
                                     logger_handler.log_level_ERROR)
         self.crawler_rules[root_url.netloc] = None
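A sketch of how the cached entries might then be consulted, with a hypothetical is_allowed method that treats the None stored on failure as permission to crawl (an assumption, not shown in the original class):

from urllib.parse import urlparse

def is_allowed(self, url, user_agent='*'):
    # Hypothetical companion to locate_rules: None means robots.txt could not be
    # read, which is treated here as "no restrictions" (an assumption).
    rules = self.crawler_rules.get(urlparse(url).netloc)
    if rules is None:
        return True
    return rules.can_fetch(user_agent, url)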
Example #30
 def addrobot(self, root):
     root = urlparse.urljoin(root, "/")
     if root in self.robots: return
     url = urlparse.urljoin(root, "/robots.txt")
     self.robots[root] = rp = robotparser.RobotFileParser()
     self.note(2, "Parsing %s", url)
     rp.debug = self.verbose > 3
     rp.set_url(url)
     try:
         rp.read()
     except (OSError, IOError) as msg:
         self.note(1, "I/O error parsing %s: %s", url, msg)