Example #1
    def validate_robots(self):
        """
        Confere o arquivo robots.txt dos sites e verifica condições de Dissalow
        adicionando duas keys novas ao dict da cidade correspondente:
            - has_robotstxt (True/False): responde a pergunta se tem ou não o arquivo robots.txt
            - can_crawling (True/False): responde a pergunta se pode ou não fazer crawling
        """

        try:
            self.city['has_robotstxt'] = False
            self.city['can_crawling'] = False

            city_url = self.city['url']
            city_url_robots = city_url + 'robots.txt'

            robotstxt = requests.get(city_url_robots, timeout=30)

            if robotstxt.status_code == 404:
                # No robots.txt found: this version keeps can_crawling False.
                self.city['can_crawling'] = False

            if robotstxt.status_code == 200:
                self.city['has_robotstxt'] = True

                robotparser = urllib.robotparser.RobotFileParser()
                robotparser.set_url(city_url_robots)
                robotparser.read()

                if robotparser.can_fetch('*', city_url):
                    self.city['can_crawling'] = True
                    self.validate_recommendations()

        except requests.exceptions.RequestException as error:
            print(dt.timestamp(dt.now()), self.city['city_name'], error)
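The method above downloads robots.txt with requests and then lets RobotFileParser download it again through read(). As a minimal sketch (the function name, agent parameter and allow-on-404 policy are assumptions, not part of the original), the already-fetched body can be handed to the parser with parse() so the file is requested only once:

import requests
import urllib.robotparser

def can_crawl(city_url, agent='*'):
    # Hypothetical helper: one HTTP request for robots.txt, then local parsing.
    response = requests.get(city_url + 'robots.txt', timeout=30)
    if response.status_code != 200:
        # No robots.txt found: this sketch treats crawling as allowed.
        return True
    parser = urllib.robotparser.RobotFileParser()
    # parse() accepts the lines of an already-downloaded robots.txt body.
    parser.parse(response.text.splitlines())
    return parser.can_fetch(agent, city_url)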
Example #2
    def urlchecker(self, url):
        if url is None:
            return False
        normalized_url = urltools.normalize(url)
        robotparser = urllib.robotparser.RobotFileParser()

        try:
            url_comp = urlparse(normalized_url)
            base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
        except Exception:
            self.logger.error("Cannot parse: " + url)
            # Without a base URL there is nothing further to check.
            return False
        try:
            robotparser.set_url(base_url + "robots.txt")
            robotparser.read()
            if not robotparser.can_fetch("*", normalized_url):
                self.logger.error(url + " is excluded by the robots exclusion protocol")
                return False
        except Exception:
            self.logger.error("Cannot determine robots exclusion protocol: " +
                              url)

        if normalized_url in self.visited_urls:
            self.logger.debug(url + " has been visited before")
            return False
        elif base_url in self.sites_times and self.sites_times[base_url] > int(
                self.limit):
            # The per-site visit counter has exceeded the configured limit.
            self.logger.debug(
                url + " visits to this site have reached the limit")
            return False
        elif 'cgi' in normalized_url:
            # Skip CGI endpoints.
            return False
        else:
            return True
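urlchecker above creates a fresh RobotFileParser, and therefore re-downloads robots.txt, for every URL it checks. A minimal sketch of caching one parser per host (the cache and function names are hypothetical) so each site's robots.txt is fetched only once:

import urllib.robotparser
from urllib.parse import urlparse

_robot_parsers = {}  # hypothetical module-level cache: netloc -> RobotFileParser

def allowed(url, agent='*'):
    parts = urlparse(url)
    if parts.netloc not in _robot_parsers:
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
        try:
            parser.read()
        except OSError:
            # robots.txt could not be fetched at all; fall back to allowing the
            # URL, which mirrors the permissive error handling above.
            return True
        _robot_parsers[parts.netloc] = parser
    return _robot_parsers[parts.netloc].can_fetch(agent, url)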
Example #3
    def validate_robots(self):
        """
        Confere o arquivo robots.txt dos sites e verifica condições de Dissalow
        adicionando duas keys novas ao dict da cidade correspondente:
            - has_robotstxt (True/False): responde a pergunta se tem ou não o arquivo robots.txt
            - can_crawling (True/False): responde a pergunta se pode ou não fazer crawling
        """
        print(f'{self.city["city_name"]}: Verificação do arquivo robots.')

        try:
            self.city['timestamp'] = dt.timestamp(dt.now())
            self.city['has_robotstxt'] = False
            self.city['can_crawling'] = True

            city_url = self.city['url']
            city_url_robots = city_url + 'robots.txt'
            robotstxt = requests.get(city_url_robots, timeout=30, headers={'user-agent': 'uscs/0.0.1'})

            if robotstxt.status_code == 200:
                self.city['has_robotstxt'] = True
                robotparser = urllib.robotparser.RobotFileParser()
                robotparser.set_url(city_url_robots)
                robotparser.read()

                if not robotparser.can_fetch('*', city_url):
                    self.city['can_crawling'] = False
                    print(f'{self.city["city_name"]}: Sem permissão para fazer crawling.')

            if self.city['can_crawling']:
                self.sourcecode = self.get_sourcecode()
                self.validate_recommendations()

        except requests.exceptions.RequestException as error:
            print(dt.timestamp(dt.now()), self.city['_id'], self.city["city_name"], error)
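The examples on this page only call can_fetch(). Since Python 3.6, RobotFileParser also exposes crawl_delay() and request_rate(), which a polite crawler can use for pacing; a short sketch (the URL is just a placeholder):

import time
import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.set_url('https://example.com/robots.txt')  # placeholder URL
parser.read()

# Both calls return None when robots.txt does not define the directive.
delay = parser.crawl_delay('*')
rate = parser.request_rate('*')
if delay is not None:
    time.sleep(delay)  # honour Crawl-delay between requests
if rate is not None:
    print(f'At most {rate.requests} requests per {rate.seconds} seconds')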
Example #4
    def get_can_fetch(cls, node_url):
        user_agent = cls.get_user_agent()
        parsed_url = urlparse(node_url)
        robot_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        robotparser = urllib.robotparser.RobotFileParser()
        robotparser.set_url(robot_url)
        robotparser.read()
        return robotparser.can_fetch(user_agent, node_url)
Example #5
def checkRobotsForUrl(url):
    # Extract base URL
    parsedURL = urllib.parse.urlparse(url)
    robotsURL = parsedURL.scheme + "://" + parsedURL.netloc + "/robots.txt"
    try:
        robotparser = urllib.robotparser.RobotFileParser()
        robotparser.set_url(robotsURL)
        robotparser.read()
        can_fetch = robotparser.can_fetch("*", url)
        return can_fetch
    except Exception:
        # If robots.txt cannot be fetched or parsed, default to allowing the URL.
        return True
Example #6
def get_data(url, robotparser):
    # suburl(), headers and history are defined elsewhere in this module.
    # robots.txt rules match path prefixes (e.g. /dontgohere/), so the check is
    # done against the sub-URL before fetching the page.
    if robotparser.can_fetch("*", suburl(url)):
        request = urllib.request.Request(url=url, headers=headers)
        print("Now URL: " + url)
        response = urllib.request.urlopen(request)
        data = response.read()
        data = data.decode('utf-8')
        history.append(url)
        # print("All History:"+''.join(str(e) for e in history))
        return data
    else:
        return None
Example #7
def link_crawler(seed_url, link_regex):
    # download() and get_links() are helpers defined elsewhere.
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    # Parse the seed site's robots.txt once and consult it before each download.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch('*', url):
            continue
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
Example #8
import urllib.robotparser
from urllib.parse import urlparse

user_agent = 'unpackbot'
url = "https://buzzfeed.com/contests"
parsed_url = urlparse(url)
robot_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
print(url, robot_url)
robotparser = urllib.robotparser.RobotFileParser()
robotparser.set_url(robot_url)
robotparser.read()
can_fetch = robotparser.can_fetch(user_agent, url)
print(can_fetch)
Example #9
def check_robot(url, user_agent):
    robotparser = urllib.robotparser.RobotFileParser()
    robotparser.set_url(url + '/robots.txt')
    robotparser.read()
    allowed = robotparser.can_fetch(user_agent, url)
    print(allowed)
    return allowed
Example #10
def check_robot(url, **kwargs):
    # Note: here `url` is used both as the location of the robots.txt file
    # itself and as the target URL checked against it.
    user_agent = 'python-requests/2.18.4 (Compatible; John Doe)'
    robotparser = urllib.robotparser.RobotFileParser()
    robotparser.set_url(url)
    robotparser.read()
    return robotparser.can_fetch(user_agent, url)
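As a follow-up to the snippets above, a self-contained sketch of how can_fetch() matches User-agent groups; the robots.txt rules here are invented purely for illustration and are parsed without any network access:

import urllib.robotparser

rules = """\
User-agent: unpackbot
Disallow: /contests

User-agent: *
Disallow:
""".splitlines()

parser = urllib.robotparser.RobotFileParser()
parser.parse(rules)

print(parser.can_fetch('unpackbot', 'https://buzzfeed.com/contests'))  # False
print(parser.can_fetch('*', 'https://buzzfeed.com/contests'))          # True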