def seoaPass(request, url):
    getBtcPrice()
    price = sysvar.objects.get(pk=1)
    btcPrice = price.btcPrice
    robotsUrl = re.findall(r'.*[.][a-zA-Z]{2,3}', url)
    robotsUrl = str(robotsUrl[0])
    robotsUrltxt = robotsUrl + '/robots.txt'
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    print(robotsUrltxt)
    print(url)
    try:
        rerp.fetch(robotsUrltxt)
        if rerp.is_allowed("hello this is https://bl4btc.io", url):
            print("true")
            return True
        else:
            message = "your robots.txt disallows indexing bots from visiting your url"
            return render(request, 'validate.html', {
                'message': message,
                'btcPrice': btcPrice
            })
    except:
        return False
def __build_exclusion(self, obey_robots, timeout):
    if obey_robots:
        rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        rerp.fetch('{}/robots.txt'.format(self.base_domain), timeout)
        return rerp
    else:
        return None
def Start():
    startingpage = input().strip()
    parsedUrl = urlparse(startingpage)
    actualUrl = '{}://{}/robots.txt'.format(parsedUrl.scheme, parsedUrl.netloc)
    RoParser = robotexclusionrulesparser.RobotExclusionRulesParser()
    RoParser.fetch(actualUrl)
    return startingpage, RoParser
def get_rules(self, request_url, timeout=None, proxies=None, verify=None, cert=None):
    url = urlsplit(request_url)
    robots_url = '{0}://{1}/robots.txt'.format(url.scheme, url.netloc)
    try:
        rerp = self.registry[robots_url]
    except KeyError:
        r = self._intermediate_send(
            'GET', robots_url,
            timeout=timeout, proxies=proxies, verify=verify, cert=cert,
        )
        if r.ok:
            rerp = robots.RobotExclusionRulesParser()
            rerp.parse(r.text)
        elif r.status_code == 404:
            rerp = None
        else:
            r.raise_for_status()
        self.registry[robots_url] = rerp
    return rerp
def _cache_empty_robots(self, schemenetloc, final_schemenetloc):
    parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
    parsed.parse('')
    self.datalayer.cache_robots(schemenetloc, (parsed, False))
    if final_schemenetloc:
        self.datalayer.cache_robots(final_schemenetloc, (parsed, False))
    self.in_progress.discard(schemenetloc)
    return parsed, False
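The helper above relies on the fact that a parser fed an empty string has no rules and therefore allows every URL. A two-line illustration of that behavior:

import robotexclusionrulesparser

empty = robotexclusionrulesparser.RobotExclusionRulesParser()
empty.parse('')  # no rules at all
assert empty.is_allowed('*', 'http://example.com/anything')  # an empty ruleset allows everything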
def access_right(currentURL):
    try:
        rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
        robotpath = find_base(currentURL) + '/robots.txt'
        rerp.fetch(robotpath)
        return rerp.is_allowed("*", currentURL)
    except:
        return True
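The snippets so far share one basic pattern: build a parser, point it at the site's /robots.txt, then ask is_allowed() for a specific user agent and URL. A minimal standalone sketch of that pattern; the bot name and the "treat fetch errors as allowed" policy are illustrative assumptions, not part of the library:

import robotexclusionrulesparser
from urllib.parse import urlsplit

def is_url_allowed(url, user_agent="examplebot"):
    # Derive the robots.txt location from the URL's scheme and host.
    parts = urlsplit(url)
    robots_url = "{}://{}/robots.txt".format(parts.scheme, parts.netloc)

    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    rerp.user_agent = user_agent  # optional; used when fetching robots.txt
    try:
        rerp.fetch(robots_url, timeout=10)
    except Exception:
        # Policy choice for this sketch: if robots.txt cannot be fetched, assume allowed.
        return True
    return rerp.is_allowed(user_agent, url)

# Example: is_url_allowed("https://www.example.com/some/page")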
def _get_robotstxt(self, robotstxt_url):
    try:
        robotsfile = robotstxt.RobotExclusionRulesParser()
        robotsfile.fetch(url=robotstxt_url.strip(), timeout=60)
    except:
        if DEBUG:
            Debugger.print_stack_trace()
        return None
    else:
        return robotsfile
def main():
    # crawler = Crawler()
    prog_start_time = time.clock()
    frontier_out = open("out", "w")
    summary_out = open("summary_out", "w")
    filetypes_domains_out = open("domains_out", "w")
    print "Starting crawler..."
    visits_limit = 50
    visit_num = 0
    my_domain = "uky.edu"
    start_url = "http://www." + my_domain
    # start_url = "http://www.uky.edu"
    url = start_url
    url_frontier = links.Url_Frontier()
    subdomains = set()
    robots = robotexclusionrulesparser.RobotExclusionRulesParser()
    robots.user_agent = "schoolbot"
    # robots = RobotsCache()
    while True:
        print "-------------------------------------"
        loop_start_time = time.clock()
        print "\nVisit #:", visit_num
        # print "Len filetypes:", len(url_frontier.filetypes)
        url_frontier = crawl(url_frontier, robots, url, subdomains,
                             frontier_out, summary_out, domain=my_domain,
                             limit_domain=True, skip=True)
        url = url_frontier.get_list()[visit_num]
        # summary_out.close()
        if visit_num % 5 == 0:
            print_summary_out(summary_out, url_frontier, visit_num)
            print_file_domains_out(filetypes_domains_out, url_frontier)
        print_summary(url_frontier, visit_num)
        visit_num += 1
        end_loop_time = time.clock() - loop_start_time
        prog_elapsed = time.clock() - prog_start_time
        # print "URL time:", time.strftime("%S", time.gmtime(end_loop_time)), " s."
        print "Accumulated time:", time.strftime("%S", time.gmtime(prog_elapsed)), "sec"
        print "URLs seen:", format(len(url_frontier.get_list()) / prog_elapsed, '.2f'), "pg/sec"
        print "Crawls:", format(url_frontier.finish_parsed / prog_elapsed, '.2f'), "pg/sec"
        print "-------------------------------------"
    frontier_out.close()
    summary_out.close()
def Is_Allowable(self):
    try:
        parsedUrl = urlparse(self.url)
        base = parsedUrl[1]
        robotsUrl = "http://" + base + "/robots.txt"
        parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        parser.user_agent = self.agent
        parser.fetch(robotsUrl)
        return parser.is_allowed(self.agent, self.url)
    except:
        return False
def parseRobot(domain):
    robot_url = '/'.join([domain, 'robots.txt'])
    try:
        robot_file = urllib2.urlopen(robot_url).read()
        robot_content = ''
        for l in robot_file.split('\n'):
            if l.replace(' ', '') != '':
                robot_content += l + '\n'
        robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        robot_parser.parse(robot_content)
        return robot_parser
    except:
        return None
def __init__(self, root, charset):
    self.root = root
    self.charset = charset
    self.user_agent = 'zfz-bot/1.0'
    self.link_pattern = re.compile(r'\s+href="([^\s\'">]+)"[\s>]', re.U | re.I)
    self.price_pattern = re.compile(
        ur'租(\s| )*金[^::]*[::]\s*(<[^<>]+>\s*)*(\d+)\s*(<[^<>]+>\s*)*元/月',
        re.U | re.I)
    self.area_pattern = re.compile(
        ur'(面(\s| )*积[::]\s*(<[^<>]+>\s*)*|室\s*|卫\s*|厅\s*)([\d\.]+)\s*(平米|㎡|平方米)',
        re.U | re.I)
    self.arch_pattern = re.compile(
        ur'[房户](\s| )*型[^::]*[::]\s*(<[^<>]+>\s*)*(\d[^<\s]+)[<\s]',
        re.U | re.I)
    self.title_pattern = re.compile(ur'<title>\s*([^<]+[^\s])\s*</title>', re.U | re.I)
    self.address_pattern = re.compile(
        ur'地(\s| )*址[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
        re.U | re.I)
    self.district_pattern = re.compile(
        ur'(小(\s| )*区(名称)?|楼盘名称)[::]\s*(<[^<>]+>\s*)*([^<>\s]+)[<\s]',
        re.U | re.I)
    self.max_url_length = 200
    self.max_price_length = 10
    self.max_area_length = 10
    self.max_arch_length = 20
    self.max_title_length = 100
    self.max_address_length = 100
    self.max_district_length = 20
    self.db = Connection('127.0.0.1', 'zfz', 'zfz', 'zfz...891')
    self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                       urllib2.HTTPRedirectHandler())
    self.opener.addheaders = [('User-agent', self.user_agent)]
    self.rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    self.rerp.user_agent = self.user_agent
    try:
        self.rerp.fetch(self.root[:self.root.find('/', 7)] + "/robots.txt")
    except:
        pass
    self.min_delay_seconds = 120.0
    self.max_crawl_seconds_per_site = 2 * 24 * 3600  # 2 days
    self.max_allowed_urlopen_error = 20
    self.current_urlopen_error = 0
    self.debug = True
def __init__(self, name, crawler):
    # config
    self.name = name
    self.crawler = crawler
    # counters for statistics
    self.downloaded = 0
    self.downloaded_count = 0
    # robots.txt handling
    self.crawl_delay = timedelta(seconds=crawler.default_crawl_delay)
    self.last_crawl_time = datetime.now() - self.crawl_delay
    self.rp = robotexclusionrulesparser.RobotExclusionRulesParser()
    self.robots_txt_task = fetcher.FakeAsyncResult(ready=False)
    self.parsed_robots_txt = False
    self.setup_robots_txt()
def setUp(self):
    self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    s = """
# robots.txt for http://www.example.com/
# In the classic syntax, * is treated literally, not as a wildcard.
# A Webmaster might expect the line below to disallow everything, but
# that's not how it works.
User-agent: foobot
Disallow: *

User-agent: barbot
Disallow: /private/*
"""
    self.parser.parse(s)
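The fixture above exists because the parser supports both the classic MK1996 interpretation (where * in a path is a literal character) and the Google/Yahoo/Microsoft 2008 wildcard syntax. A small sketch of how a test might exercise both readings, assuming the module-level MK1996 and GYM2008 constants and the optional syntax argument to is_allowed():

import robotexclusionrulesparser
from robotexclusionrulesparser import MK1996, GYM2008

parser = robotexclusionrulesparser.RobotExclusionRulesParser()
parser.parse("User-agent: foobot\nDisallow: *\n")

# Classic syntax: "*" is a literal path, so an ordinary URL is still allowed.
print(parser.is_allowed("foobot", "/index.html", syntax=MK1996))   # expected: True
# GYM2008 syntax: "*" is a wildcard, so everything is disallowed.
print(parser.is_allowed("foobot", "/index.html", syntax=GYM2008))  # expected: False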
def seoaPass(url):
    robotsUrl = re.findall(r'.*[.][a-zA-Z]{2,3}', url)
    robotsUrl = str(robotsUrl[0])
    robotsUrltxt = robotsUrl + '/robots.txt'
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    try:
        rerp.fetch(robotsUrltxt)
        if rerp.is_allowed("seoabot/0.9 selenium webdriver, see http//:seoabot.com", url):
            return True
        else:
            print(url + " :disallow")
            return False
    except:
        return False
def checkRobotstxt(link):
    # Creating link to ROBOTS.TXT
    urlparts = urlparse(link)
    robotstxt_link = urlparts[0] + '://' + urlparts[1] + '/' + 'robots.txt'
    # Checking scrapability
    answer = False
    err = False
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    # Set the (optional) user_agent before calling fetch.
    rerp.user_agent = 'PHist/1.0; https://sites.utu.fi/pseudohistoria/en/'
    user_agent = 'PHist/1.0; https://sites.utu.fi/pseudohistoria/en/'
    try:
        rerp.fetch(robotstxt_link)
        ans = rerp.is_allowed(user_agent, link)
        answer = ans
    except:
        err = True
    return answer, err
def robots_test():
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    # I'll set the (optional) user_agent before calling fetch.
    rerp.user_agent = "schoolbot"
    # Note that there should be a try/except here to handle urllib2.URLError,
    # socket.timeout, UnicodeError, etc.
    # regex = r"(https?:\/\/.*?\/)"
    regex = r"(https?:\/\/.*?(?:/|$))"
    url = "http://cs.fhdfuky.edu"
    match = re.search(regex, url)
    topdomain_url = ""
    if match != None:
        topdomain_url = match.group(1)
    print "topdomain_url:", topdomain_url
    # try:
    rerp.fetch(topdomain_url + "/robots.txt", timeout=4)
    print "type:", rerp.fetch(topdomain_url + "/robots.txt", timeout=4)
    print rerp.is_allowed(rerp.user_agent, url)
    print rerp.get_crawl_delay(rerp.user_agent)
def parseRobot(domain):
    robot_url = '/'.join([domain, 'robots.txt'])
    try:
        robot_file = urllib2.urlopen(robot_url).read()
        robot_content = ''
        for l in robot_file.split('\n'):
            if l.replace(' ', '') != '':
                robot_content += l + '\n'
        robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        robot_parser.parse(robot_content)
        try:
            crawler_delay = robot_parser.get_crawl_delay('*')
        except Exception as e:
            print 'crawler_delay exception: {}'.format(e)
            crawler_delay = None
        return robot_parser, crawler_delay
    except Exception as e:
        print 'robot parse exception: {}'.format(e)
        return None
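The second parseRobot() variant above also pulls out the Crawl-delay directive. A sketch of how a caller might honor that value between requests; the 1-second fallback when no delay is given is an assumption for this example, not library behavior:

import time
import robotexclusionrulesparser

parser = robotexclusionrulesparser.RobotExclusionRulesParser()
parser.parse("User-agent: *\nCrawl-delay: 5\nDisallow: /private/\n")

# get_crawl_delay() returns the delay in seconds, or None if the directive is absent.
delay = parser.get_crawl_delay('*') or 1.0  # fall back to 1 second (assumption)

def polite_sleep(last_request_time):
    # Wait out the remainder of the crawl delay since the previous request.
    elapsed = time.time() - last_request_time
    if elapsed < delay:
        time.sleep(delay - elapsed)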
async def fetch_robots(self, schemenetloc, mock_url, headers=None, proxy=None):
    '''
    robotexclusionrules fetcher is not async, so fetch the file ourselves

    https://developers.google.com/search/reference/robots_txt

    3xx redir == follow up to 5 hops, then consider it a 404.
    4xx errors == no crawl restrictions
    5xx errors == full disallow. fast retry if 503.
    if site appears to return 5xx for 404, then 5xx is treated as a 404
    '''
    url = URL(schemenetloc + '/robots.txt')

    if proxy:
        raise ValueError('not yet implemented')

    # We might enter this routine multiple times, so, sleep if we aren't the first
    # XXX this is frequently racy, according to the logfiles!
    if schemenetloc in self.in_progress:
        while schemenetloc in self.in_progress:
            LOGGER.debug('sleeping because someone beat me to the robots punch')
            # XXX make this a stat?
            with stats.coroutine_state('robots collision sleep'):
                interval = random.uniform(0.2, 0.3)
                await asyncio.sleep(interval)

        # at this point robots might be in the cache... or not.
        try:
            robots = self.datalayer.read_robots_cache(schemenetloc)
        except KeyError:
            robots = None
        if robots is not None:
            return robots

        # ok, so it's not in the cache -- and the other guy's
        # fetch failed. if we just fell through there would be a
        # big race. treat this as a failure.
        # XXX note that we have no negative caching
        LOGGER.debug('some other fetch of robots has failed.')  # XXX make this a stat
        return None

    self.in_progress.add(schemenetloc)

    f = await fetcher.fetch(url, self.session,
                            max_page_size=self.max_robots_page_size,
                            headers=headers, proxy=proxy, mock_url=mock_url,
                            allow_redirects=True, max_redirects=5,
                            stats_prefix='robots ')

    json_log = {'action': 'fetch'}

    if f.last_exception:
        json_log['error'] = 'max tries exceeded, final exception is: ' + f.last_exception
        self.jsonlog(schemenetloc, json_log)
        self.in_progress.discard(schemenetloc)
        return None

    stats.stats_sum('robots fetched', 1)

    # If the url was redirected to a different host/robots.txt, let's cache that too
    # XXX use f.response.history to get them all
    final_url = str(f.response.url)  # this is a yarl.URL object now -- str() or url.human_repr()? XXX
    final_schemenetloc = None
    if final_url != url.url:
        final_parts = urllib.parse.urlsplit(final_url)
        if final_parts.path == '/robots.txt':
            final_schemenetloc = final_parts.scheme + '://' + final_parts.netloc

    status = f.response.status
    json_log['status'] = status
    json_log['t_first_byte'] = f.t_first_byte

    # if the final status is a redirect, we exceeded max redirects -- treat as a 404, same as googlebot
    # Googlebot treats all 4xx as an empty robots.txt
    if str(status).startswith('3') or str(status).startswith('4'):
        if status >= 400:
            error = 'got a 4xx, treating as empty robots'
        else:
            error = 'got too many redirects, treating as empty robots'
        json_log['error'] = error
        self.jsonlog(schemenetloc, json_log)
        return self._cache_empty_robots(schemenetloc, final_schemenetloc)

    # Googlebot treats all 5xx as deny, unless they think the host returns 5xx instead of 404:
    if str(status).startswith('5'):
        json_log['error'] = 'got a 5xx, treating as deny'
        self.jsonlog(schemenetloc, json_log)
        self.in_progress.discard(schemenetloc)
        return None

    body_bytes = f.body_bytes
    with stats.record_burn('robots sha1'):
        sha1 = 'sha1:' + hashlib.sha1(body_bytes).hexdigest()
    json_log['checksum'] = sha1

    body_bytes = strip_bom(body_bytes)

    plausible, message = self.is_plausible_robots(schemenetloc, f.body_bytes, f.t_first_byte)
    if not plausible:
        # policy: treat as empty
        json_log['error'] = 'saw an implausible robots.txt, treating as empty'
        json_log['implausible'] = message
        self.jsonlog(schemenetloc, json_log)
        return self._cache_empty_robots(schemenetloc, final_schemenetloc)

    # go from bytes to a string, despite bogus utf8
    # XXX what about non-utf8?
    try:
        body = f.body_bytes.decode(encoding='utf8')
    except UnicodeError:  # pragma: no cover
        # try again assuming utf8 and ignoring errors
        body = f.body_bytes.decode(encoding='utf8', errors='replace')
    except asyncio.CancelledError:
        raise
    except Exception as e:
        # log as surprising, also treat like a fetch error
        json_log['error'] = 'robots body decode threw a surprising exception: ' + repr(e)
        self.jsonlog(schemenetloc, json_log)
        self.in_progress.discard(schemenetloc)
        return None

    if self.robotname in body:
        json_log['mentions-us'] = True

    with stats.record_burn('robots parse', url=schemenetloc):
        parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
        parsed.parse(preprocess_robots(body))
    self.datalayer.cache_robots(schemenetloc, parsed)
    self.in_progress.discard(schemenetloc)
    if final_schemenetloc:
        # we did not set this but we'll discard it anyway
        self.in_progress.discard(final_schemenetloc)

    if parsed.sitemaps:
        json_log['has-sitemaps'] = True

    self.jsonlog(schemenetloc, json_log)
    return parsed
async def fetch_robots(self, schemenetloc, mock_url, headers=None, proxy=None):
    '''
    robotexclusionrules parser is not async, so fetch the file ourselves
    '''
    url = URL(schemenetloc + '/robots.txt')

    if proxy:
        raise ValueError('not yet implemented')

    # We might enter this routine multiple times, so, sleep if we aren't the first
    # XXX this is frequently racy, according to the logfiles!
    if schemenetloc in self.in_progress:
        while schemenetloc in self.in_progress:
            # XXX make this a stat?
            # XXX does it go off for wide when it shouldn't?
            LOGGER.debug('sleeping because someone beat me to the robots punch')
            with stats.coroutine_state('robots collision sleep'):
                await asyncio.sleep(0.3)

        # at this point robots might be in the cache... or not.
        try:
            robots = self.datalayer.read_robots_cache(schemenetloc)
        except KeyError:
            robots = None
        if robots is not None:
            return robots

        # ok, so it's not in the cache -- and the other guy's
        # fetch failed. if we just fell through there would be a
        # big race. treat this as a failure.
        # XXX note that we have no negative caching
        LOGGER.debug('some other fetch of robots has failed.')  # XXX make this a stat
        return None

    self.in_progress.add(schemenetloc)

    f = await fetcher.fetch(url, self.session, self.config,
                            headers=headers, proxy=proxy, mock_url=mock_url,
                            allow_redirects=True, stats_me=False)

    if f.last_exception:
        self.jsonlog(schemenetloc,
                     {'error': 'max tries exceeded, final exception is: ' + f.last_exception,
                      'action': 'fetch'})
        self.in_progress.discard(schemenetloc)
        return None

    stats.stats_sum('robots fetched', 1)

    # If the url was redirected to a different host/robots.txt, let's cache that too
    # XXX use f.response.history to get them all
    final_url = str(f.response.url)  # this is a yarl.URL object now -- str() or url.human_repr()? XXX
    final_schemenetloc = None
    if final_url != url.url:
        final_parts = urllib.parse.urlparse(final_url)
        if final_parts.path == '/robots.txt':
            final_schemenetloc = final_parts.scheme + '://' + final_parts.netloc

    # if we got a 404, return an empty robots.txt
    if f.response.status == 404:
        self.jsonlog(schemenetloc,
                     {'error': 'got a 404, treating as empty robots',
                      'action': 'fetch',
                      't_first_byte': f.t_first_byte})
        parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
        parsed.parse('')
        self.datalayer.cache_robots(schemenetloc, parsed)
        if final_schemenetloc:
            self.datalayer.cache_robots(final_schemenetloc, parsed)
        self.in_progress.discard(schemenetloc)
        return parsed

    # if we got a non-200, some should be empty and some should be None (XXX Policy)
    # this implements only None (deny)
    if str(f.response.status).startswith('4') or str(f.response.status).startswith('5'):
        self.jsonlog(schemenetloc,
                     {'error': 'got an unexpected status of {}, treating as deny'.format(f.response.status),
                      'action': 'fetch',
                      't_first_byte': f.t_first_byte})
        self.in_progress.discard(schemenetloc)
        return None

    if not self.is_plausible_robots(schemenetloc, f.body_bytes, f.t_first_byte):
        # policy: treat as empty
        self.jsonlog(schemenetloc,
                     {'warning': 'saw an implausible robots.txt, treating as empty',
                      'action': 'fetch',
                      't_first_byte': f.t_first_byte})
        parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
        parsed.parse('')
        self.datalayer.cache_robots(schemenetloc, parsed)
        if final_schemenetloc:
            self.datalayer.cache_robots(final_schemenetloc, parsed)
        self.in_progress.discard(schemenetloc)
        return parsed

    # go from bytes to a string, despite bogus utf8
    try:
        body = await f.response.text()
    except UnicodeError:  # pragma: no cover
        # try again assuming utf8 and ignoring errors
        body = str(f.body_bytes, 'utf-8', 'ignore')
    except (aiohttp.ClientError, aiodns.error.DNSError,
            asyncio.TimeoutError, RuntimeError) as e:
        # something unusual went wrong.
        # policy: treat like a fetch error.
        # (could be a broken tcp session etc.) XXX use list from cocrawler.py
        self.jsonlog(schemenetloc,
                     {'error': 'robots body decode threw an exception: ' + repr(e),
                      'action': 'fetch',
                      't_first_byte': f.t_first_byte})
        self.in_progress.discard(schemenetloc)
        return None
    except asyncio.CancelledError:
        raise
    except Exception as e:
        # log as surprising, also treat like a fetch error
        self.jsonlog(schemenetloc,
                     {'error': 'robots body decode threw a surprising exception: ' + repr(e),
                      'action': 'fetch',
                      't_first_byte': f.t_first_byte})
        self.in_progress.discard(schemenetloc)
        return None

    with stats.record_burn('robots parse', url=schemenetloc):
        parsed = robotexclusionrulesparser.RobotExclusionRulesParser()
        parsed.parse(preprocess_robots(body))
    self.datalayer.cache_robots(schemenetloc, parsed)
    self.in_progress.discard(schemenetloc)
    if final_schemenetloc:
        self.in_progress.discard(final_schemenetloc)

    self.jsonlog(schemenetloc, {'action': 'fetch', 't_first_byte': f.t_first_byte})
    return parsed
async def fetch_robots(self, schemenetloc, mock_url, host_geoip, seed_host, crawler,
                       headers=None, proxy=None):
    '''
    robotexclusionrules fetcher is not async, so fetch the file ourselves

    https://developers.google.com/search/reference/robots_txt

    3xx redir == follow up to 5 hops, then consider it a 404.
    4xx errors == no crawl restrictions
    5xx errors == full disallow. fast retry if 503.
    if site appears to return 5xx for 404, then 5xx is treated as a 404
    '''
    url = URL(schemenetloc + '/robots.txt')

    if proxy:
        raise ValueError('not yet implemented')

    # We might enter this routine multiple times, so, sleep if we aren't the first
    if schemenetloc in self.in_progress:
        while schemenetloc in self.in_progress:
            LOGGER.debug('sleeping because someone beat me to the robots punch')
            # XXX make this a stat?
            with stats.coroutine_state('robots collision sleep'):
                interval = random.uniform(0.2, 0.3)
                await asyncio.sleep(interval)

        # at this point robots might be in the cache... or not.
        try:
            (robots, mentions_us) = self.datalayer.read_robots_cache(schemenetloc)
        except KeyError:
            robots = None
        if robots is not None:
            return robots, mentions_us

        # ok, so it's not in the cache -- and the other guy's fetch failed.
        # if we just fell through, there would be a big race.
        # treat this as a "no data" failure.
        LOGGER.debug('some other fetch of robots has failed.')  # XXX make this a stat
        return None, False

    self.in_progress.add(schemenetloc)

    f = await fetcher.fetch(url, self.session,
                            max_page_size=self.max_robots_page_size,
                            headers=headers, proxy=proxy, mock_url=mock_url,
                            allow_redirects=True, max_redirects=5,
                            stats_prefix='robots ')

    json_log = {'action': 'fetch', 'time': time.time()}

    if f.last_exception:
        json_log['error'] = 'max tries exceeded, final exception is: ' + f.last_exception
        self.jsonlog(schemenetloc, json_log)
        self.in_progress.discard(schemenetloc)
        return None, False

    if f.response.history:
        redir_history = [str(h.url) for h in f.response.history]
        redir_history.append(str(f.response.url))
        json_log['redir_history'] = redir_history

    stats.stats_sum('robots fetched', 1)

    # If the url was redirected to a different host/robots.txt, let's cache that final host too
    final_url = str(f.response.url)  # YARL object
    final_schemenetloc = None
    if final_url != url.url:
        final_parts = urllib.parse.urlsplit(final_url)
        if final_parts.path == '/robots.txt':
            final_schemenetloc = final_parts.scheme + '://' + final_parts.netloc
            json_log['final_host'] = final_schemenetloc

    status = f.response.status
    json_log['status'] = status
    json_log['t_first_byte'] = f.t_first_byte

    # if the final status is a redirect, we exceeded max redirects -- treat as a 404, same as googlebot
    # Googlebot treats all 4xx as an empty robots.txt
    if str(status).startswith('3') or str(status).startswith('4'):
        if status >= 400:
            error = 'got a 4xx, treating as empty robots'
        else:
            error = 'got too many redirects, treating as empty robots'
        json_log['error'] = error
        self.jsonlog(schemenetloc, json_log)
        return self._cache_empty_robots(schemenetloc, final_schemenetloc)

    # Googlebot treats all 5xx as deny, unless they think the host returns 5xx instead of 404:
    # XXX implement googlebot strategy
    if str(status).startswith('5'):
        json_log['error'] = 'got a 5xx, treating as deny'
        self.jsonlog(schemenetloc, json_log)
        self.in_progress.discard(schemenetloc)
        return None, False

    # we got a 2xx, so let's use the final headers to facet the final server
    if final_schemenetloc:
        robots_url = final_schemenetloc + '/robots.txt'
        # if the hostname is the same and only the scheme is different, that's ok
        if ((robots_url.replace('https://', 'http://', 1) != url.url and
             robots_url.replace('http://', 'https://', 1) != url.url)):
            host_geoip = {}  # the passed-in one is for the initial server
    else:
        robots_url = url.url

    post_fetch.post_robots_txt(f, robots_url, host_geoip, json_log['time'],
                               crawler, seed_host=seed_host)

    body_bytes = f.body_bytes
    with stats.record_burn('robots sha1'):
        sha1 = 'sha1:' + hashlib.sha1(body_bytes).hexdigest()
    json_log['checksum'] = sha1

    body_bytes = strip_bom(body_bytes).lstrip()

    plausible, message = is_plausible_robots(body_bytes)
    if not plausible:
        # policy: treat as empty
        json_log['error'] = 'saw an implausible robots.txt, treating as empty'
        json_log['implausible'] = message
        self.jsonlog(schemenetloc, json_log)
        return self._cache_empty_robots(schemenetloc, final_schemenetloc)

    # go from bytes to a string, despite bogus utf8
    # XXX what about non-utf8?
    try:
        body = body_bytes.decode(encoding='utf8', errors='replace')
    except asyncio.CancelledError:
        raise
    except Exception as e:
        # log as surprising, also treat like a fetch error
        json_log['error'] = 'robots body decode threw a surprising exception: ' + repr(e)
        self.jsonlog(schemenetloc, json_log)
        self.in_progress.discard(schemenetloc)
        return None, False

    preprocessed, mentions_us = preprocess_robots(body, self.robotname, json_log)

    with stats.record_burn('robots parse', url=schemenetloc):
        robots = robotexclusionrulesparser.RobotExclusionRulesParser()
        robots.parse(preprocessed)

    with stats.record_burn('robots is_allowed', url=schemenetloc):
        check = robots.is_allowed('*', '/')
        if not check:
            json_log['generic-deny-slash'] = True
            check = robots.is_allowed('googlebot', '/')
            json_log['google-deny-slash'] = not check

    self.datalayer.cache_robots(schemenetloc, (robots, mentions_us))
    self.in_progress.discard(schemenetloc)
    if final_schemenetloc:
        self.datalayer.cache_robots(final_schemenetloc, (robots, mentions_us))
        # we did not set this but we'll discard it anyway
        self.in_progress.discard(final_schemenetloc)

    if robots.sitemaps:
        json_log['has-sitemaps'] = len(robots.sitemaps)

    self.jsonlog(schemenetloc, json_log)
    return robots, mentions_us
def Get_Delay(self):
    # A freshly constructed parser has no rules, so fetch the site's robots.txt first
    # (mirroring Is_Allowable above, and assuming the same self.url attribute);
    # otherwise get_crawl_delay() would always return None.
    parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    parser.user_agent = self.agent
    parser.fetch("http://" + urlparse(self.url)[1] + "/robots.txt")
    return parser.get_crawl_delay(parser.user_agent)
#!/usr/bin/python
import sys
import re
import robotexclusionrulesparser

snip = "(<------- SNIP HERE -------->,"
sniplen = len(snip)
robots = dict()

# read robot file into associative array of robots indexed by domain
robot = None
with open(sys.argv[1], 'r') as infile:
    for line in infile:
        if snip in line:
            if robot is not None:
                rparser = robotexclusionrulesparser.RobotExclusionRulesParser()
                rparser.parse(robot)
                robots[domain] = rparser
                robot = None
            header = line[sniplen:].split(',')
            domain = header[0]
            # first line of robots.txt is in with context info
            robot = header[-1]
        else:
            robot = "".join([robot, line])

# final robot
rparser = robotexclusionrulesparser.RobotExclusionRulesParser()
rparser.parse(robot)
robots[domain] = rparser

positives = 0
negatives = 0
urltot = 0
domains = dict()
def setUp(self):
    self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
def setUp(self):
    self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    # I force the parser to use UTC to keep things simple.
    self.parser.use_local_time = False
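The use_local_time flag above relates to the parser's cache-expiration bookkeeping. A brief sketch of how that might be used, assuming the expiration_date and is_expired attributes described in the library's documentation:

import robotexclusionrulesparser

parser = robotexclusionrulesparser.RobotExclusionRulesParser()
parser.use_local_time = False  # keep expiration_date in UTC
parser.fetch("http://www.example.com/robots.txt")

# expiration_date is derived from the response's caching headers (or a default
# lifetime); is_expired indicates whether a re-fetch of robots.txt is due.
if parser.is_expired:
    parser.fetch("http://www.example.com/robots.txt")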
def run(self):
    while not frontier.empty():
        # get next url from frontier
        url = frontier.get()

        # parse url to get base url and domain name
        split_url = urlsplit(url)
        base = "{0.netloc}".format(split_url)
        domain = base.replace("www.", "") if "www." in base else base
        base_url = "{0.scheme}://{0.netloc}/".format(split_url)

        # first check if can access page
        canAccess = self.checkIPAccessTime(domain)
        if canAccess != None:
            if not canAccess:
                # return url to frontier and move on to the next url
                frontier.put(url)
                continue
        else:
            continue

        # check if site already saved
        robotLock.acquire()
        site = self.findSiteByDomain(domain)
        if site:
            robotLock.release()
            siteID = site[0]
            robot_content = site[2]
        else:
            # retrieve robots.txt content
            try:
                r = requests.get(parse.urljoin(base_url, 'robots.txt'))
                robot_content = None
                # if it exists, save it
                if r.status_code == requests.codes.ok:
                    robot_content = r.text
            except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError,
                   requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                robot_content = None
            # wait some time
            time.sleep(MINOR_TIMEOUT)

            # get sitemap.xml
            try:
                s = requests.get(parse.urljoin(base_url, 'sitemap.xml'))
                sitemap_content = None
                # if it exists save it
                if s.status_code == requests.codes.ok:
                    sitemap_content = s.text
            except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError,
                   requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                sitemap_content = None
            # wait some time
            time.sleep(MINOR_TIMEOUT)

            # save site
            siteID = self.insertSite(domain, robot_content, sitemap_content)
            robotLock.release()

        # create robot file parser object
        robot = robotexclusionrulesparser.RobotExclusionRulesParser()
        if robot_content:
            robot.parse(robot_content)

        # check if current url is allowed by robots.txt
        duplicatesLock.acquire()
        if not robot.is_allowed(USER_AGENT, url):
            pageID = self.findPageByUrl(url)
            self.deleteLinkByID(pageID)
            self.deletePageByUrl(url)
            duplicatesLock.release()
            continue
        duplicatesLock.release()

        # download content from url
        try:
            self.webDriver.get(url)
            time.sleep(TIMEOUT)
        except TimeoutException:
            # save timeout
            if pageID:
                # page already saved
                self.updatePage(pageID, siteID, PAGE_TIMEOUT, None,
                                req.response.status_code, datetime.now())
            else:
                # save new page
                pageID = self.insertPage(siteID, PAGE_TIMEOUT, url, None,
                                         req.response.status_code, datetime.now())
            # continue to next url in frontier
            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")
            continue

        # retrieve request that loaded page
        req = None
        for request in self.webDriver.requests:
            if request.response and request.response.status_code >= 300 and request.response.status_code <= 399:
                continue
            if request.response and request.path == url:
                req = request
                break
            if request.response and request.response.status_code == requests.codes.ok:
                req = request
                break
        if req == None:
            for request in self.webDriver.requests:
                if request.response:
                    if request.response.status_code == 403 or request.response.status_code == 503:
                        req = request
                        break
        if not req:
            req = self.webDriver.last_request

        # check page type and save page info
        pageID = self.findPageByUrl(url)
        if req and req.response:
            content_type = req.response.headers.get('Content-Type')
            if content_type:
                if "text/html" in content_type:
                    # HTML page
                    # check for canonical link
                    try:
                        canonicalLink = self.webDriver.find_element_by_xpath("//link[@rel='canonical']")
                        if canonicalLink:
                            link = canonicalLink.get_attribute('href')
                            if link != url:
                                # is duplicate
                                duplicatesLock.acquire()
                                # check if original page already saved
                                originalPageID = self.findPageByUrl(link)
                                if originalPageID:
                                    duplicatesLock.release()
                                    if pageID:
                                        # page already saved
                                        self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                    else:
                                        # save new page and remember id
                                        pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                                    # add link to original page
                                    self.insertLink(pageID, originalPageID)
                                    # continue to next url in frontier
                                    del self.webDriver.requests
                                    print(f"Worker {self.threadID}: {url} done...")
                                    continue
                                else:
                                    # create blank page
                                    originalPageID = self.insertPage(None, FRONTIER, link, None, None, None)
                                    duplicatesLock.release()
                                    if pageID:
                                        # page already saved
                                        self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                    else:
                                        # save new page and remember id
                                        pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                                    # add link to original page
                                    self.insertLink(pageID, originalPageID)
                                    # add url to frontier
                                    frontier.put(link)
                                    # continue to next url in frontier
                                    del self.webDriver.requests
                                    print(f"Worker {self.threadID}: {url} done...")
                                    continue
                    except(NoSuchElementException, StaleElementReferenceException):
                        pass

                    # check for duplicate content
                    originalPageID = self.findPageByContent(self.webDriver.page_source)
                    if originalPageID:
                        # is duplicate
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                        else:
                            # save new page and remember id
                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                        # add link to original page
                        self.insertLink(pageID, originalPageID)
                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue

                    # not duplicate
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, FRONTIER_HTML, self.webDriver.page_source,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page and remember id
                        pageID = self.insertPage(siteID, FRONTIER_HTML, url, self.webDriver.page_source,
                                                 req.response.status_code, datetime.now())
                    # let through only pages that loaded successfully
                    if req.response.status_code != requests.codes.ok:
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                elif "text/plain" in content_type:
                    # TXT content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, TXT)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/pdf" in content_type:
                    # PDF content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PDF)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/msword" in content_type:
                    # DOC content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, DOC)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
                    # DOCX content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, DOCX)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/vnd.ms-powerpoint" in content_type:
                    # PPT content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PPT)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/vnd.openxmlformats-officedocument.presentationml.presentation" in content_type:
                    # PPTX content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PPTX)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "image" in content_type:
                    # IMAGE content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # parse file name
                    filename = urlparse(url)
                    # insert image data
                    self.insertImage(pageID, os.path.basename(filename.path), content_type, datetime.now())
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "text/css" in content_type:
                    # CSS content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, CSS)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "text/csv" in content_type:
                    # CSV content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, CSV)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/zip" in content_type:
                    # ZIP content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, ZIP)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                else:
                    # unknown BINARY content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, UNKNOWN)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
            else:
                # no content header -> mark page as UNDEFINED
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, UNDEFINED, None, req.response.status_code, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, UNDEFINED, url, None, req.response.status_code, datetime.now())
                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue
        else:
            # some kind of error happened
            if pageID:
                # page already saved
                self.updatePage(pageID, siteID, NO_RESPONSE, None, None, datetime.now())
            else:
                # save new page
                pageID = self.insertPage(siteID, NO_RESPONSE, url, None, None, datetime.now())
            # continue to next url in frontier
            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")
            continue

        # only if page is of HTML type
        # extract links
        # href
        elements = self.webDriver.find_elements_by_xpath("//*[@href]")
        for element in elements:
            try:
                link = element.get_attribute('href')
                # check if url allowed by robots.txt and if is from .gov.si
                if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                    # canonicalize url
                    link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                    # add url to frontier
                    self.addUrlToFrontier(pageID, link)
            except(NoSuchElementException, StaleElementReferenceException):
                continue

        # onclick
        elements = self.webDriver.find_elements_by_xpath("//*[@onclick]")
        for element in elements:
            try:
                line = element.get_attribute('onclick')
                if line:
                    link = ""
                    if "location.href='" in line:
                        rightLine = line.split("location.href='")[1]
                        link = rightLine.split("'")[0]
                    elif "document.location='" in line:
                        rightLine = line.split("document.location='")[1]
                        link = rightLine.split("'")[0]
                    if link != "":
                        # check if url allowed by robots.txt and if is from .gov.si
                        if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                            # canonicalize url
                            link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                            # add url to frontier
                            self.addUrlToFrontier(pageID, link)
            except(NoSuchElementException, StaleElementReferenceException):
                continue

        # extract images
        elements = self.webDriver.find_elements_by_tag_name('img')
        for element in elements:
            try:
                link = element.get_attribute('src')
                # check if url allowed by robots.txt, if is from .gov.si and if src attribute has URL
                if self.isGov(link) and robot.is_allowed(USER_AGENT, link) and re.match(self.urlValidator, link):
                    link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                    self.addUrlToFrontier(pageID, link)
            except(NoSuchElementException, StaleElementReferenceException):
                continue

        del self.webDriver.requests
        print(f"Worker {self.threadID}: {url} done...")

    self.conn.close()
    self.webDriver.quit()
    print(f"Worker {self.threadID}: finished crawling.")
def ReadSequential(url_field, robots_level_field, spider_type, output_can_fetch):
    rp = robotexclusionrulesparser.RobotExclusionRulesParser()
    robots_host = ''
    robots_loaded = False
    robots_raw_content = ''
    every_n_idx = 0
    every_n = 100000
    for line in sys.stdin:
        # for line in open('test/samples.test'):
        # pdb.set_trace()
        array = line.strip('\n').split('\t')
        cur_host = array[0]
        tag = array[1]
        url = ''
        if tag == 'A':  # compare strings with '==', not 'is'
            robots_host = cur_host
            robots_loaded = False
            robots_raw_content = array[2]
            continue
        else:
            url = urllib.quote(array[url_field], safe='/@#?:')
        # pdb.set_trace()
        can_fetch = True
        oneboxspider_allowed = False
        someotherspider_allowed = False
        has_robots_txt = False
        if cur_host == robots_host:
            if robots_loaded == False:
                LoadRobots(robots_raw_content, rp)
                robots_loaded = True
            has_robots_txt = True
            (can_fetch, oneboxspider_allowed, someotherspider_allowed) = CanFetch(url, rp)
        robots_level = 0
        if can_fetch == False:
            robots_level = 0
        elif has_robots_txt == False:
            robots_level = 1
        elif oneboxspider_allowed == False and someotherspider_allowed == True:
            robots_level = 2
        elif oneboxspider_allowed == True:
            robots_level = 3
        else:
            sys.stderr.write("invalid robots_level")
            sys.exit(1)
        if spider_type == 3:
            can_fetch = True
        elif spider_type == 1 and can_fetch == True and has_robots_txt == True and oneboxspider_allowed == False:
            can_fetch = False
        if can_fetch == output_can_fetch:
            if robots_level_field > 0:
                array.insert(robots_level_field, str(robots_level))
            output_line = "\t".join(array[2:])
            sys.stdout.write('%s\n' % (output_line))
        else:
            # log a sample of rejected lines to standard error
            if robots_level_field > 0:
                array.insert(robots_level_field, str(robots_level))
            output_line = "\t".join(array[2:])
            if every_n_idx % every_n == 0:
                sys.stderr.write('%s\n' % (output_line))
                every_n_idx = 0
            every_n_idx += 1
def __init__(self, domain, user_agent):
    self.parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    self.parser.user_agent = user_agent
    self.parser.fetch("http://%s/robots.txt" % domain)
    self.last_crawled_time = 0
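The small class above (one parser per domain, fetched once in __init__, plus a last-crawl timestamp) is a common caching pattern. A sketch of a registry that reuses such parsers across many URLs; the class and method names are illustrative, not from any of the projects above:

import robotexclusionrulesparser
from urllib.parse import urlsplit

class RobotsRegistry:
    """Caches one fetched parser per scheme://host (names are illustrative)."""

    def __init__(self, user_agent):
        self.user_agent = user_agent
        self._parsers = {}

    def is_allowed(self, url):
        parts = urlsplit(url)
        key = "{}://{}".format(parts.scheme, parts.netloc)
        parser = self._parsers.get(key)
        if parser is None:
            parser = robotexclusionrulesparser.RobotExclusionRulesParser()
            parser.user_agent = self.user_agent
            try:
                parser.fetch(key + "/robots.txt", timeout=10)
            except Exception:
                pass  # an unparsed (empty) ruleset allows everything
            self._parsers[key] = parser
        return parser.is_allowed(self.user_agent, url)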