Example #1
def seoaPass(request, url):
    getBtcPrice()
    price = sysvar.objects.get(pk=1)
    btcPrice = price.btcPrice

    robotsUrl = re.findall(r'.*[.][a-zA-Z]{2,3}', url)
    robotsUrl = str(robotsUrl[0])
    robotsUrltxt = robotsUrl + '/robots.txt'
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    print(robotsUrltxt)
    print(url)
    try:
        rerp.fetch(robotsUrltxt)
        if rerp.is_allowed("hello this is https://bl4btc.io", url):
            print("true")
            return True
        else:
            message = "your robots.txt disallows indexing bots from visiting your url"
            return render(request, 'validate.html', {
                'message': message,
                'btcPrice': btcPrice
            })

    except:
        return False
Example #2
 def __build_exclusion(self, obey_robots, timeout):
     if obey_robots:
         rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
         rerp.fetch('{}/robots.txt'.format(self.base_domain), timeout)
         return rerp
     else:
         return None
Example #3
def Start():
    startingpage = input().strip()
    parsedUrl = urlparse(startingpage)
    actualUrl = '{}://{}/robots.txt'.format(parsedUrl.scheme,parsedUrl.netloc)
    RoParser = robotexclusionrulesparser.RobotExclusionRulesParser()
    RoParser.fetch(actualUrl)
    return startingpage, RoParser
Example #4
    def get_rules(self,
                  request_url,
                  timeout=None,
                  proxies=None,
                  verify=None,
                  cert=None):
        url = urlsplit(request_url)
        robots_url = '{0}://{1}/robots.txt'.format(
            url.scheme,
            url.netloc,
        )

        try:
            rerp = self.registry[robots_url]
        except KeyError:
            r = self._intermediate_send(
                'GET',
                robots_url,
                timeout=timeout,
                proxies=proxies,
                verify=verify,
                cert=cert,
            )
            if r.ok:
                rerp = robots.RobotExclusionRulesParser()
                rerp.parse(r.text)
            elif r.status_code == 404:
                rerp = None
            else:
                r.raise_for_status()
            self.registry[robots_url] = rerp

        return rerp
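
A brief sketch of how a caller might consult this per-URL cache before issuing a request; the adapter object and the user-agent string are illustrative assumptions, not part of the original example:

rules = adapter.get_rules('https://example.com/some/page', timeout=5)
# A missing robots.txt (404) was cached as None above, i.e. no restrictions apply.
if rules is None or rules.is_allowed('my-crawler/1.0', '/some/page'):
    pass  # safe to send the actual request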
Example #5
 def _cache_empty_robots(self, schemenetloc, final_schemenetloc):
     parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
     parsed.parse('')
     self.datalayer.cache_robots(schemenetloc, (parsed, False))
     if final_schemenetloc:
         self.datalayer.cache_robots(final_schemenetloc, (parsed, False))
     self.in_progress.discard(schemenetloc)
     return parsed, False
Example #6
def access_right(currentURL):
    try:
        rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        robotpath = find_base(currentURL) + '/robots.txt'
        rerp.fetch(robotpath)
        return rerp.is_allowed("*", currentURL)
    except:
        return True
Example #7
 def _get_robotstxt(self, robotstxt_url):
     try:
         robotsfile = robotstxt.RobotExclusionRulesParser()
         robotsfile.fetch(url=robotstxt_url.strip(), timeout=60)
     except:
         if DEBUG:
             Debugger.print_stack_trace()
         return None
     else:
         return robotsfile
Example #8
def main():
    # crawler = Crawler()
    prog_start_time = time.clock()
    frontier_out = open("out", "w")
    summary_out = open("summary_out", "w")
    filetypes_domains_out = open("domains_out", "w")
    print "Starting crawler..."
    visits_limit = 50
    visit_num = 0
    my_domain = "uky.edu"
    start_url = "http://www." + my_domain
    # start_url = "http://www.uky.edu"
    url = start_url
    url_frontier = links.Url_Frontier()
    subdomains = set()
    robots = robotexclusionrulesparser.RobotExclusionRulesParser()
    robots.user_agent = "schoolbot"

    # robots = RobotsCache()

    while True:
        print "-------------------------------------"
        loop_start_time = time.clock()
        print "\nVisit #:", visit_num
        # print "Len filetypes:", len(url_frontier.filetypes)
        url_frontier = crawl(url_frontier,
                             robots,
                             url,
                             subdomains,
                             frontier_out,
                             summary_out,
                             domain=my_domain,
                             limit_domain=True,
                             skip=True)
        url = url_frontier.get_list()[visit_num]
        # summary_out.close()
        if visit_num % 5 == 0:
            print_summary_out(summary_out, url_frontier, visit_num)
            print_file_domains_out(filetypes_domains_out, url_frontier)
        print_summary(url_frontier, visit_num)
        visit_num += 1
        end_loop_time = time.clock() - loop_start_time
        prog_elapsed = time.clock() - prog_start_time
        # print "URL time:", time.strftime("%S", time.gmtime(end_loop_time)), " s."
        print "Accumulated time:", time.strftime(
            "%S", time.gmtime(prog_elapsed)), "sec"
        print "URLs seen:", format(
            len(url_frontier.get_list()) / prog_elapsed, '.2f'), "pg/sec"
        print "Crawls:", format(url_frontier.finish_parsed / prog_elapsed,
                                '.2f'), "pg/sec"
        print "-------------------------------------"

    frontier_out.close()
    summary_out.close()
Example #9
 def Is_Allowable(self):
     try:
         parsedUrl = urlparse(self.url)
         base = parsedUrl[1]
         robotsUrl = "http://" + base + "/robots.txt"
         parser = robotexclusionrulesparser.RobotExclusionRulesParser()
         parser.user_agent = self.agent
         parser.fetch(robotsUrl)
         return (parser.is_allowed(self.agent, self.url))
     except:
         return False
Example #10
def parseRobot(domain):
    robot_url = '/'.join([domain, 'robots.txt'])
    try:
        robot_file = urllib2.urlopen(robot_url).read()
        robot_content = ''
        for l in robot_file.split('\n'):
            if l.replace(' ', '') != '':
                robot_content += l + '\n'
        robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        robot_parser.parse(robot_content)
        return robot_parser
    except:
        return None
Example #11
    def __init__(self, root, charset):
        self.root = root
        self.charset = charset
        self.user_agent = 'zfz-bot/1.0'
        self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]',
                                       re.U | re.I)
        self.price_pattern = re.compile(
            ur'租(\s|&nbsp;)*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
            re.U | re.I)
        self.area_pattern = re.compile(
            ur'(面(\s|&nbsp;)*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
            re.U | re.I)
        self.arch_pattern = re.compile(
            ur'[房户](\s|&nbsp;)*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
            re.U | re.I)
        self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>',
                                        re.U | re.I)
        self.address_pattern = re.compile(
            ur'地(\s|&nbsp;)*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)
        self.district_pattern = re.compile(
            ur'(小(\s|&nbsp;)*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
            re.U | re.I)

        self.max_url_length = 200
        self.max_price_length = 10
        self.max_area_length = 10
        self.max_arch_length = 20
        self.max_title_length = 100
        self.max_address_length = 100
        self.max_district_length = 20

        self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                           urllib2.HTTPRedirectHandler())
        self.opener.addheaders = [('User-agent', self.user_agent)]

        self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        self.rerp.user_agent = self.user_agent
        try:
            self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
        except:
            pass

        self.min_delay_seconds = 120.0
        self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days

        self.max_allowed_urlopen_error = 20
        self.current_urlopen_error = 0

        self.debug = True
Example #12
 def __init__(self, name, crawler):
     # config
     self.name = name
     self.crawler = crawler
     # counters for statistics
     self.downloaded = 0
     self.downloaded_count = 0
     # robots.txt handling
     self.crawl_delay = timedelta(seconds=crawler.default_crawl_delay)
     self.last_crawl_time = datetime.now() - self.crawl_delay
     self.rp = robotexclusionrulesparser.RobotExclusionRulesParser()
     self.robots_txt_task = fetcher.FakeAsyncResult(ready=False)
     self.parsed_robots_txt = False
     self.setup_robots_txt()
Example #13
    def setUp(self):
        self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()

        s = """
        # robots.txt for http://www.example.com/

        # In the classic syntax, * is treated literally, not as a wildcard.
        # A Webmaster might expect the line below to disallow everything, but
        # that's not how it works.
        User-agent: foobot
        Disallow: *

        User-agent: barbot
        Disallow: /private/*
        """
        self.parser.parse(s)
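
A minimal sketch of what the comments above imply, assuming the optional third syntax argument and the module-level MK1996/GYM2008 constants that robotexclusionrulesparser exposes (the wildcard-aware GYM2008 form is taken here to be the default):

from robotexclusionrulesparser import MK1996

# Classic (MK1996) syntax: '*' is a literal character, so 'Disallow: *'
# does not block ordinary paths for foobot.
self.parser.is_allowed('foobot', '/index.html', MK1996)  # expected: True

# Wildcard (GYM2008) syntax: '*' matches anything, so the same rule
# disallows everything for foobot.
self.parser.is_allowed('foobot', '/index.html')  # expected: False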
Example #14
def seoaPass(url):
    robotsUrl = re.findall(r'.*[.][a-zA-Z]{2,3}', url)
    
    robotsUrl = str(robotsUrl[0])
    
    robotsUrltxt = robotsUrl + '/robots.txt'
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    try:
        rerp.fetch(robotsUrltxt)
        if rerp.is_allowed("seoabot/0.9 selenium webdriver, see http//:seoabot.com", url):
            return True
        else:
            print(url + "      :disallow")
            return False
    except:
        return False
Example #15
def checkRobotstxt(link):
    #Creating link to ROBOTS.TXT
    urlparts = urlparse(link)
    robotstxt_link = urlparts[0] + '://' + urlparts[1] + '/' + 'robots.txt'
    #Checking scrapability
    answer = False
    err = False
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    # Set the (optional) user_agent before calling fetch.
    rerp.user_agent = 'PHist/1.0; https://sites.utu.fi/pseudohistoria/en/'
    user_agent = 'PHist/1.0; https://sites.utu.fi/pseudohistoria/en/'
    try:
        rerp.fetch(robotstxt_link)
        ans = rerp.is_allowed(user_agent, link)
        answer = ans
    except:
        err = True
    return answer, err
Example #16
def robots_test():
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    # I'll set the (optional) user_agent before calling fetch.
    rerp.user_agent = "schoolbot"
    # Note that there should be a try/except here to handle urllib2.URLError,
    # socket.timeout, UnicodeError, etc.
    # regex = r"(https?:\/\/.*?\/)"
    regex = r"(https?:\/\/.*?(?:/|$))"
    url = "http://cs.fhdfuky.edu"
    match = re.search(regex, url)
    topdomain_url = ""
    if match != None:
        topdomain_url = match.group(1)
    print "topdomain_url:", topdomain_url
    # try:
    rerp.fetch(topdomain_url + "/robots.txt", timeout=4)
    print "type:", rerp.fetch(topdomain_url + "/robots.txt", timeout=4)
    print rerp.is_allowed(rerp.user_agent, url)
    print rerp.get_crawl_delay(rerp.user_agent)
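
The comment above notes that fetch() really needs a try/except; a hedged sketch of that guard, reusing the same schoolbot agent (treating an unreadable robots.txt as empty is a policy choice, not something the original asserts):

import socket
import urllib2
import robotexclusionrulesparser

def fetch_robots_safely(topdomain_url, agent="schoolbot"):
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    rerp.user_agent = agent
    try:
        rerp.fetch(topdomain_url + "/robots.txt", timeout=4)
    except (urllib2.URLError, socket.timeout, UnicodeError):
        # Fall back to an empty rule set: everything allowed.
        # A stricter crawler might return None and deny instead.
        rerp.parse("")
    return rerp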
Example #17
def parseRobot(domain):
    robot_url = '/'.join([domain, 'robots.txt'])
    try:
        robot_file = urllib2.urlopen(robot_url).read()
        robot_content = ''
        for l in robot_file.split('\n'):
            if l.replace(' ','') != '':
                robot_content += l + '\n'
        robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        robot_parser.parse(robot_content)

        try:
            crawler_delay = robot_parser.get_crawl_delay('*')
        except Exception as e:
            print 'crawler_delay exception: {}'.format(e)
            crawler_delay = None
        
        return robot_parser, crawler_delay
    except Exception as e:
        print 'robot parse exception: {}'.format(e)
        return None
Example #18
    async def fetch_robots(self,
                           schemenetloc,
                           mock_url,
                           headers=None,
                           proxy=None):
        '''
        robotexclusionrules fetcher is not async, so fetch the file ourselves

        https://developers.google.com/search/reference/robots_txt
        3xx redir == follow up to 5 hops, then consider it a 404.
        4xx errors == no crawl restrictions
        5xx errors == full disallow. fast retry if 503.
           if site appears to return 5xx for 404, then 5xx is treated as a 404
        '''
        url = URL(schemenetloc + '/robots.txt')

        if proxy:
            raise ValueError('not yet implemented')

        # We might enter this routine multiple times, so, sleep if we aren't the first
        # XXX this is frequently racy, according to the logfiles!
        if schemenetloc in self.in_progress:
            while schemenetloc in self.in_progress:
                LOGGER.debug(
                    'sleeping because someone beat me to the robots punch')
                # XXX make this a stat?
                with stats.coroutine_state('robots collision sleep'):
                    interval = random.uniform(0.2, 0.3)
                    await asyncio.sleep(interval)

            # at this point robots might be in the cache... or not.
            try:
                robots = self.datalayer.read_robots_cache(schemenetloc)
            except KeyError:
                robots = None
            if robots is not None:
                return robots

            # ok, so it's not in the cache -- and the other guy's
            # fetch failed. if we just fell through there would be a
            # big race. treat this as a failure.
            # XXX note that we have no negative caching
            LOGGER.debug('some other fetch of robots has failed.'
                         )  # XXX make this a stat
            return None

        self.in_progress.add(schemenetloc)

        f = await fetcher.fetch(url,
                                self.session,
                                max_page_size=self.max_robots_page_size,
                                headers=headers,
                                proxy=proxy,
                                mock_url=mock_url,
                                allow_redirects=True,
                                max_redirects=5,
                                stats_prefix='robots ')

        json_log = {'action': 'fetch'}
        if f.last_exception:
            json_log[
                'error'] = 'max tries exceeded, final exception is: ' + f.last_exception
            self.jsonlog(schemenetloc, json_log)
            self.in_progress.discard(schemenetloc)
            return None

        stats.stats_sum('robots fetched', 1)

        # If the url was redirected to a different host/robots.txt, let's cache that too
        # XXX use f.response.history to get them all
        final_url = str(
            f.response.url
        )  # this is a yarl.URL object now -- str() or url.human_repr()? XXX
        final_schemenetloc = None
        if final_url != url.url:
            final_parts = urllib.parse.urlsplit(final_url)
            if final_parts.path == '/robots.txt':
                final_schemenetloc = final_parts.scheme + '://' + final_parts.netloc

        status = f.response.status
        json_log['status'] = status
        json_log['t_first_byte'] = f.t_first_byte

        # if the final status is a redirect, we exceeded max redirects -- treat as a 404, same as googlebot
        # Googlebot treats all 4xx as an empty robots.txt
        if str(status).startswith('3') or str(status).startswith('4'):
            if status >= 400:
                error = 'got a 4xx, treating as empty robots'
            else:
                error = 'got too many redirects, treating as empty robots'
            json_log['error'] = error
            self.jsonlog(schemenetloc, json_log)
            return self._cache_empty_robots(schemenetloc, final_schemenetloc)

        # Googlebot treats all 5xx as deny, unless they think the host returns 5xx instead of 404:
        if str(status).startswith('5'):
            json_log['error'] = 'got a 5xx, treating as deny'
            self.jsonlog(schemenetloc, json_log)
            self.in_progress.discard(schemenetloc)
            return None

        body_bytes = f.body_bytes

        with stats.record_burn('robots sha1'):
            sha1 = 'sha1:' + hashlib.sha1(body_bytes).hexdigest()
        json_log['checksum'] = sha1

        body_bytes = strip_bom(body_bytes)

        plausible, message = self.is_plausible_robots(schemenetloc,
                                                      f.body_bytes,
                                                      f.t_first_byte)
        if not plausible:
            # policy: treat as empty
            json_log[
                'error'] = 'saw an implausible robots.txt, treating as empty'
            json_log['implausible'] = message
            self.jsonlog(schemenetloc, json_log)
            return self._cache_empty_robots(schemenetloc, final_schemenetloc)

        # go from bytes to a string, despite bogus utf8
        # XXX what about non-utf8?
        try:
            body = f.body_bytes.decode(encoding='utf8')
        except UnicodeError:  # pragma: no cover
            # try again assuming utf8 and ignoring errors
            body = f.body_bytes.decode(encoding='utf8', errors='replace')
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # log as surprising, also treat like a fetch error
            json_log[
                'error'] = 'robots body decode threw a surprising exception: ' + repr(
                    e)
            self.jsonlog(schemenetloc, json_log)
            self.in_progress.discard(schemenetloc)
            return None

        if self.robotname in body:
            json_log['mentions-us'] = True

        with stats.record_burn('robots parse', url=schemenetloc):
            parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
            parsed.parse(preprocess_robots(body))
        self.datalayer.cache_robots(schemenetloc, parsed)
        self.in_progress.discard(schemenetloc)
        if final_schemenetloc:
            # we did not set this but we'll discard it anyway
            self.in_progress.discard(final_schemenetloc)
        if parsed.sitemaps:
            json_log['has-sitemaps'] = True

        self.jsonlog(schemenetloc, json_log)
        return parsed
Example #19
    async def fetch_robots(self,
                           schemenetloc,
                           mock_url,
                           headers=None,
                           proxy=None):
        '''
        robotexclusionrules parser is not async, so fetch the file ourselves
        '''
        url = URL(schemenetloc + '/robots.txt')

        if proxy:
            raise ValueError('not yet implemented')

        # We might enter this routine multiple times, so, sleep if we aren't the first
        # XXX this is frequently racy, according to the logfiles!
        if schemenetloc in self.in_progress:
            while schemenetloc in self.in_progress:
                # XXX make this a stat?
                # XXX does it go off for wide when it shouldn't?
                LOGGER.debug(
                    'sleeping because someone beat me to the robots punch')
                with stats.coroutine_state('robots collision sleep'):
                    await asyncio.sleep(0.3)

            # at this point robots might be in the cache... or not.
            try:
                robots = self.datalayer.read_robots_cache(schemenetloc)
            except KeyError:
                robots = None
            if robots is not None:
                return robots

            # ok, so it's not in the cache -- and the other guy's
            # fetch failed. if we just fell through there would be a
            # big race. treat this as a failure.
            # XXX note that we have no negative caching
            LOGGER.debug('some other fetch of robots has failed.'
                         )  # XXX make this a stat
            return None

        self.in_progress.add(schemenetloc)

        f = await fetcher.fetch(url,
                                self.session,
                                self.config,
                                headers=headers,
                                proxy=proxy,
                                mock_url=mock_url,
                                allow_redirects=True,
                                stats_me=False)
        if f.last_exception:
            self.jsonlog(
                schemenetloc, {
                    'error': 'max tries exceeded, final exception is: ' +
                    f.last_exception,
                    'action': 'fetch'
                })
            self.in_progress.discard(schemenetloc)
            return None

        stats.stats_sum('robots fetched', 1)

        # If the url was redirected to a different host/robots.txt, let's cache that too
        # XXX use f.response.history to get them all
        final_url = str(
            f.response.url
        )  # this is a yarl.URL object now -- str() or url.human_repr()? XXX
        final_schemenetloc = None
        if final_url != url.url:
            final_parts = urllib.parse.urlparse(final_url)
            if final_parts.path == '/robots.txt':
                final_schemenetloc = final_parts.scheme + '://' + final_parts.netloc

        # if we got a 404, return an empty robots.txt
        if f.response.status == 404:
            self.jsonlog(
                schemenetloc, {
                    'error': 'got a 404, treating as empty robots',
                    'action': 'fetch',
                    't_first_byte': f.t_first_byte
                })
            parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
            parsed.parse('')
            self.datalayer.cache_robots(schemenetloc, parsed)
            if final_schemenetloc:
                self.datalayer.cache_robots(final_schemenetloc, parsed)
            self.in_progress.discard(schemenetloc)
            return parsed

        # if we got a non-200, some should be empty and some should be None (XXX Policy)
        # this implements only None (deny)
        if str(f.response.status).startswith('4') or str(
                f.response.status).startswith('5'):
            self.jsonlog(
                schemenetloc, {
                    'error':
                    'got an unexpected status of {}, treating as deny'.format(
                        f.response.status),
                    'action':
                    'fetch',
                    't_first_byte':
                    f.t_first_byte
                })
            self.in_progress.discard(schemenetloc)
            return None

        if not self.is_plausible_robots(schemenetloc, f.body_bytes,
                                        f.t_first_byte):
            # policy: treat as empty
            self.jsonlog(
                schemenetloc, {
                    'warning':
                    'saw an implausible robots.txt, treating as empty',
                    'action': 'fetch',
                    't_first_byte': f.t_first_byte
                })
            parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
            parsed.parse('')
            self.datalayer.cache_robots(schemenetloc, parsed)
            if final_schemenetloc:
                self.datalayer.cache_robots(final_schemenetloc, parsed)
            self.in_progress.discard(schemenetloc)
            return parsed

        # go from bytes to a string, despite bogus utf8
        try:
            body = await f.response.text()
        except UnicodeError:  # pragma: no cover
            # try again assuming utf8 and ignoring errors
            body = str(f.body_bytes, 'utf-8', 'ignore')
        except (aiohttp.ClientError, aiodns.error.DNSError,
                asyncio.TimeoutError, RuntimeError) as e:
            # something unusual went wrong.
            # policy: treat like a fetch error.
            # (could be a broken tcp session etc.) XXX use list from cocrawler.py
            self.jsonlog(
                schemenetloc, {
                    'error':
                    'robots body decode threw an exception: ' + repr(e),
                    'action': 'fetch',
                    't_first_byte': f.t_first_byte
                })
            self.in_progress.discard(schemenetloc)
            return None
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # log as surprising, also treat like a fetch error
            self.jsonlog(
                schemenetloc, {
                    'error':
                    'robots body decode threw a surprising exception: ' +
                    repr(e),
                    'action':
                    'fetch',
                    't_first_byte':
                    f.t_first_byte
                })
            self.in_progress.discard(schemenetloc)
            return None

        with stats.record_burn('robots parse', url=schemenetloc):
            parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
            parsed.parse(preprocess_robots(body))
        self.datalayer.cache_robots(schemenetloc, parsed)
        self.in_progress.discard(schemenetloc)
        if final_schemenetloc:
            self.in_progress.discard(final_schemenetloc)
        self.jsonlog(schemenetloc, {
            'action': 'fetch',
            't_first_byte': f.t_first_byte
        })
        return parsed
Example #20
    async def fetch_robots(self,
                           schemenetloc,
                           mock_url,
                           host_geoip,
                           seed_host,
                           crawler,
                           headers=None,
                           proxy=None):
        '''
        robotexclusionrules fetcher is not async, so fetch the file ourselves

        https://developers.google.com/search/reference/robots_txt
        3xx redir == follow up to 5 hops, then consider it a 404.
        4xx errors == no crawl restrictions
        5xx errors == full disallow. fast retry if 503.
           if site appears to return 5xx for 404, then 5xx is treated as a 404
        '''
        url = URL(schemenetloc + '/robots.txt')

        if proxy:
            raise ValueError('not yet implemented')

        # We might enter this routine multiple times, so, sleep if we aren't the first
        if schemenetloc in self.in_progress:
            while schemenetloc in self.in_progress:
                LOGGER.debug(
                    'sleeping because someone beat me to the robots punch')
                # XXX make this a stat?
                with stats.coroutine_state('robots collision sleep'):
                    interval = random.uniform(0.2, 0.3)
                    await asyncio.sleep(interval)

            # at this point robots might be in the cache... or not.
            try:
                (robots,
                 mentions_us) = self.datalayer.read_robots_cache(schemenetloc)
            except KeyError:
                robots = None
            if robots is not None:
                return robots, mentions_us

            # ok, so it's not in the cache -- and the other guy's fetch failed.
            # if we just fell through, there would be a big race.
            # treat this as a "no data" failure.
            LOGGER.debug('some other fetch of robots has failed.'
                         )  # XXX make this a stat
            return None, False

        self.in_progress.add(schemenetloc)

        f = await fetcher.fetch(url,
                                self.session,
                                max_page_size=self.max_robots_page_size,
                                headers=headers,
                                proxy=proxy,
                                mock_url=mock_url,
                                allow_redirects=True,
                                max_redirects=5,
                                stats_prefix='robots ')

        json_log = {'action': 'fetch', 'time': time.time()}

        if f.last_exception:
            json_log[
                'error'] = 'max tries exceeded, final exception is: ' + f.last_exception
            self.jsonlog(schemenetloc, json_log)
            self.in_progress.discard(schemenetloc)
            return None, False

        if f.response.history:
            redir_history = [str(h.url) for h in f.response.history]
            redir_history.append(str(f.response.url))
            json_log['redir_history'] = redir_history

        stats.stats_sum('robots fetched', 1)

        # If the url was redirected to a different host/robots.txt, let's cache that final host too
        final_url = str(f.response.url)  # YARL object
        final_schemenetloc = None
        if final_url != url.url:
            final_parts = urllib.parse.urlsplit(final_url)
            if final_parts.path == '/robots.txt':
                final_schemenetloc = final_parts.scheme + '://' + final_parts.netloc
                json_log['final_host'] = final_schemenetloc

        status = f.response.status
        json_log['status'] = status
        json_log['t_first_byte'] = f.t_first_byte

        # if the final status is a redirect, we exceeded max redirects -- treat as a 404, same as googlebot
        # Googlebot treats all 4xx as an empty robots.txt
        if str(status).startswith('3') or str(status).startswith('4'):
            if status >= 400:
                error = 'got a 4xx, treating as empty robots'
            else:
                error = 'got too many redirects, treating as empty robots'
            json_log['error'] = error
            self.jsonlog(schemenetloc, json_log)
            return self._cache_empty_robots(schemenetloc, final_schemenetloc)

        # Googlebot treats all 5xx as deny, unless they think the host returns 5xx instead of 404:
        # XXX implement googlebot strategy
        if str(status).startswith('5'):
            json_log['error'] = 'got a 5xx, treating as deny'
            self.jsonlog(schemenetloc, json_log)
            self.in_progress.discard(schemenetloc)
            return None, False

        # we got a 2xx, so let's use the final headers to facet the final server
        if final_schemenetloc:
            robots_url = final_schemenetloc + '/robots.txt'
            # if the hostname is the same and only the scheme is different, that's ok
            if ((robots_url.replace('https://', 'http://', 1) != url.url
                 and robots_url.replace('http://', 'https://', 1) != url.url)):
                host_geoip = {}  # the passed-in one is for the initial server
        else:
            robots_url = url.url
        post_fetch.post_robots_txt(f,
                                   robots_url,
                                   host_geoip,
                                   json_log['time'],
                                   crawler,
                                   seed_host=seed_host)

        body_bytes = f.body_bytes

        with stats.record_burn('robots sha1'):
            sha1 = 'sha1:' + hashlib.sha1(body_bytes).hexdigest()
        json_log['checksum'] = sha1

        body_bytes = strip_bom(body_bytes).lstrip()

        plausible, message = is_plausible_robots(body_bytes)
        if not plausible:
            # policy: treat as empty
            json_log[
                'error'] = 'saw an implausible robots.txt, treating as empty'
            json_log['implausible'] = message
            self.jsonlog(schemenetloc, json_log)
            return self._cache_empty_robots(schemenetloc, final_schemenetloc)

        # go from bytes to a string, despite bogus utf8
        # XXX what about non-utf8?
        try:
            body = body_bytes.decode(encoding='utf8', errors='replace')
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # log as surprising, also treat like a fetch error
            json_log[
                'error'] = 'robots body decode threw a surprising exception: ' + repr(
                    e)
            self.jsonlog(schemenetloc, json_log)
            self.in_progress.discard(schemenetloc)
            return None, False

        preprocessed, mentions_us = preprocess_robots(body, self.robotname,
                                                      json_log)

        with stats.record_burn('robots parse', url=schemenetloc):
            robots = robotexclusionrulesparser.RobotExclusionRulesParser()
            robots.parse(preprocessed)

        with stats.record_burn('robots is_allowed', url=schemenetloc):
            check = robots.is_allowed('*', '/')
            if not check:
                json_log['generic-deny-slash'] = True
                check = robots.is_allowed('googlebot', '/')
                json_log['google-deny-slash'] = not check

        self.datalayer.cache_robots(schemenetloc, (robots, mentions_us))
        self.in_progress.discard(schemenetloc)
        if final_schemenetloc:
            self.datalayer.cache_robots(final_schemenetloc,
                                        (robots, mentions_us))
            # we did not set this but we'll discard it anyway
            self.in_progress.discard(final_schemenetloc)
        if robots.sitemaps:
            json_log['has-sitemaps'] = len(robots.sitemaps)

        self.jsonlog(schemenetloc, json_log)
        return robots, mentions_us
Example #21
 def Get_Delay(self):
     parser = robotexclusionrulesparser.RobotExclusionRulesParser()
     parser.user_agent = self.agent
     return (parser.get_crawl_delay(parser.user_agent))
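
As written, Get_Delay builds a fresh parser and never loads any rules, so get_crawl_delay() can only return None. A hedged variant that fetches the site's robots.txt first, reusing self.url and self.agent as in Example #9:

def Get_Delay(self):
    parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    parser.user_agent = self.agent
    try:
        # Load the rules before asking for a crawl delay.
        base = urlparse(self.url)[1]
        parser.fetch("http://" + base + "/robots.txt")
    except Exception:
        return None
    return parser.get_crawl_delay(parser.user_agent)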
Example #22
#!/usr/bin/python
import sys
import re
import robotexclusionrulesparser
snip = "(<------- SNIP HERE -------->,"
sniplen = len(snip)
robots = dict()
# read robot file into associative array of robots indexed by domain
robot = None
with open(sys.argv[1], 'r') as infile:
    for line in infile:
        if snip in line:
            if robot is not None:
                rparser = robotexclusionrulesparser.RobotExclusionRulesParser()
                rparser.parse(robot)
                robots[domain] = rparser
                robot = None
            header = line[sniplen:].split(',')
            domain = header[0]
            # first line of robots.txt is in with context info
            robot = header[-1]
        else:
            robot = "".join([robot, line])
    # final robot
    rparser = robotexclusionrulesparser.RobotExclusionRulesParser()
    rparser.parse(robot)
    robots[domain] = rparser
positives = 0
negatives = 0
urltot = 0
domains = dict()
Example #23
 def setUp(self):
     self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
Example #24
 def setUp(self):
     self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
     # I force the parser to use UTC to keep things simple.
     self.parser.use_local_time = False
Example #25
    def run(self):
        while not frontier.empty():
            # get next url from frontier
            url = frontier.get()

            # parse url to get base url and domain name
            split_url = urlsplit(url)
            base = "{0.netloc}".format(split_url)

            domain = base.replace("www.", "") if "www." in base else base
            base_url = "{0.scheme}://{0.netloc}/".format(split_url)

            # first check if can access page
            canAccess = self.checkIPAccessTime(domain)
            if canAccess != None:
                if not canAccess:
                    # return url to frontier and move on to the next url
                    frontier.put(url)
                    continue
            else:
                continue

            # check if site already saved
            robotLock.acquire()
            site = self.findSiteByDomain(domain)
            if site:
                robotLock.release()
                siteID = site[0]
                robot_content = site[2]
            else:
                # retrieve robots.txt content
                try:
                    r = requests.get(parse.urljoin(base_url, 'robots.txt'))
                    robot_content = None

                    # if it exists, save it
                    if r.status_code == requests.codes.ok:
                        robot_content = r.text
                except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                    robot_content = None

                # wait some time
                time.sleep(MINOR_TIMEOUT)

                # get sitemap.xml
                try:
                    s = requests.get(parse.urljoin(base_url, 'sitemap.xml'))
                    sitemap_content = None

                    # if it exists save it
                    if s.status_code == requests.codes.ok:
                        sitemap_content = s.text
                except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                    sitemap_content = None

                # wait some time
                time.sleep(MINOR_TIMEOUT)

                # save site
                siteID = self.insertSite(domain, robot_content, sitemap_content)
                robotLock.release()

            # create robot file parser object
            robot = robotexclusionrulesparser.RobotExclusionRulesParser()
            if robot_content:
                robot.parse(robot_content)

            # check if current url is allowed by robots.txt
            duplicatesLock.acquire()
            if not robot.is_allowed(USER_AGENT, url):
                pageID = self.findPageByUrl(url)
                self.deleteLinkByID(pageID)
                self.deletePageByUrl(url)
                duplicatesLock.release()
                continue

            duplicatesLock.release()

            # download content from url
            try:
                self.webDriver.get(url)
                time.sleep(TIMEOUT)
            except TimeoutException:
                # save timeout
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, PAGE_TIMEOUT, None, req.response.status_code, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, PAGE_TIMEOUT, url, None, req.response.status_code, datetime.now())

                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue

            # retrieve request that loaded page
            req = None
            for request in self.webDriver.requests:
                if request.response and request.response.status_code >= 300 and request.response.status_code <= 399:
                    continue

                if request.response and request.path == url:
                    req = request
                    break

                if request.response and request.response.status_code == requests.codes.ok:
                    req = request
                    break

            if req == None:
                for request in self.webDriver.requests:
                    if request.response:
                        if request.response.status_code == 403 or request.response.status_code == 503:
                            req = request
                            break

                if not req:
                    req = self.webDriver.last_request

            # check page type and save page info
            pageID = self.findPageByUrl(url)
            if req and req.response:
                content_type = req.response.headers.get('Content-Type')
                if content_type:
                    if "text/html" in content_type:
                        # HTML page

                        # check for canonical link
                        try:
                            canonicalLink = self.webDriver.find_element_by_xpath("//link[@rel='canonical']")
                            if canonicalLink:
                                link = canonicalLink.get_attribute('href')

                                if link != url:
                                    # is duplicate
                                    duplicatesLock.acquire()

                                    # check if original page already saved
                                    originalPageID = self.findPageByUrl(link)
                                    if originalPageID:
                                        duplicatesLock.release()

                                        if pageID:
                                            # page already saved
                                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                        else:
                                            # save new page and remember id
                                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                                        # add link to original page
                                        self.insertLink(pageID, originalPageID)

                                        # continue to next url in frontier
                                        del self.webDriver.requests
                                        print(f"Worker {self.threadID}: {url} done...")
                                        continue
                                    else:
                                        # create blank page
                                        originalPageID = self.insertPage(None, FRONTIER, link, None, None, None)
                                        duplicatesLock.release()

                                        if pageID:
                                            # page already saved
                                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                        else:
                                            # save new page and remember id
                                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                                        # add link to original page
                                        self.insertLink(pageID, originalPageID)

                                        # add url to frontier
                                        frontier.put(link)

                                        # continue to next url in frontier
                                        del self.webDriver.requests
                                        print(f"Worker {self.threadID}: {url} done...")
                                        continue
                        except(NoSuchElementException, StaleElementReferenceException):
                            pass

                        # check for duplicate content
                        originalPageID = self.findPageByContent(self.webDriver.page_source)
                        if originalPageID:
                            # is duplicate
                            if pageID:
                                # page already saved
                                self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                            else:
                                # save new page and remember id
                                pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())

                            # add link to original page
                            self.insertLink(pageID, originalPageID)

                            # continue to next url in frontier
                            del self.webDriver.requests
                            print(f"Worker {self.threadID}: {url} done...")
                            continue

                        # not duplicate
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, FRONTIER_HTML, self.webDriver.page_source, req.response.status_code, datetime.now())
                        else:
                            # save new page and remember id
                            pageID = self.insertPage(siteID, FRONTIER_HTML, url, self.webDriver.page_source, req.response.status_code, datetime.now())

                        # let through only pages that loaded successfully
                        if req.response.status_code != requests.codes.ok:
                            del self.webDriver.requests
                            print(f"Worker {self.threadID}: {url} done...")
                            continue
                    elif "text/plain" in content_type:
                        # TXT content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, TXT)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/pdf" in content_type:
                        # PDF content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PDF)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/msword" in content_type:
                        # DOC content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, DOC)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
                        # DOCX content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, DOCX)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.ms-powerpoint" in content_type:
                        # PPT content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PPT)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/vnd.openxmlformats-officedocument.presentationml.presentation" in content_type:
                        # PPTX content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, PPTX)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "image" in content_type:
                        # IMAGE content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # parse file name
                        filename = urlparse(url)

                        # insert image data
                        self.insertImage(pageID, os.path.basename(filename.path), content_type, datetime.now())

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "text/css" in content_type:
                        # CSS content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, CSS)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "text/csv" in content_type:
                        # CSV content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, CSV)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    elif "application/zip" in content_type:
                        # ZIP content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, ZIP)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    else:
                        # unknown BINARY content
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                        else:
                            # save new page
                            pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())

                        # insert page data
                        self.insertPageData(pageID, UNKNOWN)

                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                else:
                    # no content header -> mark page as UNDEFINED
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, UNDEFINED, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, UNDEFINED, url, None, req.response.status_code, datetime.now())

                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
            else:
                # some kind of error happened
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, NO_RESPONSE, None, None, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, NO_RESPONSE, url, None, None, datetime.now())

                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue

            # only if page is of HTML type
            # extract links

            # href
            elements = self.webDriver.find_elements_by_xpath("//*[@href]")
            for element in elements:
                try:
                    link = element.get_attribute('href')

                    # check if url allowed by robots.txt and if is from .gov.si
                    if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                        # canonicalize url
                        link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                        # add url to frontier
                        self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            # onclick
            elements = self.webDriver.find_elements_by_xpath("//*[@onclick]")
            for element in elements:
                try:
                    line = element.get_attribute('onclick')
                    if line:
                        link = ""
                        if "location.href='" in line:
                            rightLine = line.split("location.href='")[1]
                            link = rightLine.split("'")[0]
                        elif "document.location='" in line:
                            rightLine = line.split("document.location='")[1]
                            link = rightLine.split("'")[0]

                        if link != "":
                            # check if url allowed by robots.txt and if is from .gov.si
                            if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                                # canonicalize url
                                link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                                # add url to frontier
                                self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            # extract images
            elements = self.webDriver.find_elements_by_tag_name('img')
            for element in elements:
                try:
                    link = element.get_attribute('src')

                    # check if url allowed by robots.txt, if is from .gov.si and if src attribute has URL
                    if self.isGov(link) and robot.is_allowed(USER_AGENT, link) and re.match(self.urlValidator, link):
                        link = str(urlcanon.whatwg(urlcanon.parse_url(link)))

                        self.addUrlToFrontier(pageID, link)
                except(NoSuchElementException, StaleElementReferenceException):
                    continue

            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")

        self.conn.close()
        self.webDriver.quit()
        print(f"Worker {self.threadID}: finished crawling.")
Example #26
def ReadSequential (url_field, robots_level_field, spider_type, output_can_fetch):
  rp = robotexclusionrulesparser.RobotExclusionRulesParser()

  robots_host = ''
  robots_loaded = False
  robots_raw_content = ''

  every_n_idx = 0
  every_n = 100000

  for line in sys.stdin:
  # for line in open('test/samples.test'):
    #pdb.set_trace()
    array = line.strip('\n').split('\t')
    cur_host = array[0]
    tag = array[1]
    url = ''
    if tag is 'A':
      robots_host = cur_host
      robots_loaded = False
      robots_raw_content = array[2]
      continue
    else:
      url = urllib.quote(array[url_field], safe='/@#?:')

    #pdb.set_trace()
    can_fetch = True
    oneboxspider_allowed = False
    someotherspider_allowed = False
    has_robots_txt = False
    if cur_host == robots_host:
      if robots_loaded == False:
        LoadRobots(robots_raw_content, rp)
        robots_loaded = True
      has_robots_txt = True
      (can_fetch, oneboxspider_allowed, someotherspider_allowed) = CanFetch(url, rp)

    robots_level = 0
    if can_fetch == False:
      robots_level = 0
    elif has_robots_txt == False:
      robots_level = 1
    elif oneboxspider_allowed == False and someotherspider_allowed == True:
      robots_level = 2
    elif oneboxspider_allowed == True:
      robots_level = 3
    else:
      sys.stderr.write("invalid robots_level")
      sys.exit(1)

    if spider_type == 3:
      can_fetch = True
    elif spider_type ==1 and can_fetch == True and has_robots_txt == True and oneboxspider_allowed == False:
      can_fetch = False

    if can_fetch == output_can_fetch:
      if robots_level_field > 0:
        array.insert(robots_level_field, str(robots_level))
      output_line = "\t".join(array[2:])
      sys.stdout.write('%s\n' % (output_line))
    else:
# log a sample of rejected lines to standard error
      if robots_level_field > 0:
        array.insert(robots_level_field, str(robots_level))
      output_line = "\t".join(array[2:])
      if every_n_idx % every_n == 0:
        sys.stderr.write('%s\n' % (output_line))
        every_n_idx = 0
      every_n_idx += 1
Example #27
 def __init__(self, domain, user_agent):
     self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
     self.parser.user_agent = user_agent
     self.parser.fetch("http://%s/robots.txt" % domain)
     self.last_crawled_time = 0
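
A hedged sketch of how such an object might combine the parser with last_crawled_time to honor Crawl-delay between requests (the one-second fallback and the helper name are assumptions for illustration):

import time

def wait_for_crawl_delay(site, default_delay=1.0):
    # Prefer the Crawl-delay declared in robots.txt; fall back to a default.
    delay = site.parser.get_crawl_delay(site.parser.user_agent) or default_delay
    remaining = site.last_crawled_time + delay - time.time()
    if remaining > 0:
        time.sleep(remaining)
    site.last_crawled_time = time.time()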