class RobotsMiddleware(BaseMiddleware):
    def __init__(self, *args, **kwargs):
        self.cache = RobotsCache(*args, **kwargs)
        self.visited = collections.defaultdict(dict)

    def check_disallow(self, url, agent):
        if not self.cache.allowed(url, agent):
            raise RobotsDisallowedError

    def check_crawl_delay(self, url, agent):
        delay = self.cache.delay(url, agent)
        if delay is None:
            return
        now = datetime.datetime.utcnow()
        host = urlparse.urlparse(url).hostname
        try:
            last_visit = self.visited[agent][host]
            if (now - last_visit).seconds < delay:
                raise RobotsThrottledError
        except KeyError:
            pass
        self.visited[agent][host] = now

    def before_send(self, request, *args, **kwargs):
        url = request.url
        agent = request.headers.get('User-Agent')
        self.check_disallow(url, agent)
        self.check_crawl_delay(url, agent)
def download_pages_in_queue(self, queue):
    current_page_url = queue.get()
    robot = RobotsCache()
    if robot.allowed(current_page_url, "*"):
        print current_page_url
        if len(current_page_url) < 10:
            return
        current_page_html = download_page_by_url(current_page_url)
        bs = BeautifulSoup(current_page_html, "html.parser")
        links = bs.find_all('a', href=True)
        post_links = [link['href'] for link in links]
        for post_link in post_links:
            if len(post_link) < 10:
                continue
            if str(post_link).find('http') != 0:
                post_link = str(self.start_url) + str(post_link)
            queue.put(post_link)
        self.sites_num = self.sites_num + 1
        page = Pages(url=current_page_url,
                     parsed_text=get_text_from_html(current_page_html),
                     is_indexed=False)
        page.save()
    else:
        print "Page can't be indexed because of the rules in ROBOTS.TXT"
def __FetchRobotFileInfo__(self, url, robotDictForDomains, timeStamp):
    domainName = self.__GetComSubdomainOfUrl__(url)
    robotUrl = ""
    if robotDictForDomains.has_key(domainName) == False:
        robotUrl = self.__GetRobotUrlForUrl__(domainName)
        cache = RobotsCache()
        try:
            timeStamp[domainName] = datetime.datetime.now()
            robotFileObj = cache.fetch(robotUrl)
            doesUrlExistOnline = self.__DoesUrlExistOnline__(robotUrl)
        except:
            doesUrlExistOnline = False
            robotDictForDomains[domainName] = (doesUrlExistOnline, object)
        if doesUrlExistOnline == True:
            robotDictForDomains[domainName] = (doesUrlExistOnline, robotFileObj)
        else:
            robotDictForDomains[domainName] = (doesUrlExistOnline, object)
    doesUrlExistOnline = robotDictForDomains[domainName][0]
    robotFileObj = robotDictForDomains[domainName][1]
    # print "heyyy", robotUrl, doesUrlExistOnline, robotFileObj, robotDictForDomains
    return doesUrlExistOnline, robotFileObj, robotDictForDomains, timeStamp, domainName
def __init__(self, store, *args, **kwargs):
    RobotsCache.__init__(self, *args, **kwargs)
    self._store = store
    self._cache = NoSQLDict(dbtype=self._store["engine"],
                            param={'host': self._store['host'],
                                   'port': self._store['port'],
                                   'db': self._store['db']['robot']})
def testRobot3(self):
    robots = RobotsCache()
    rules = robots.fetch("http://www.realwire.com/")
    crawl_delay = rules.delay("idiot")
    print("delay is:", crawl_delay)
    for i in range(1, 1000):
        print(rules.allowed("http://api.google.com/search/", agent="idiot"))
def robot_pass(self, page):
    """
    Accepts page [object]
    Creates instance of RobotsCache (from reppy)
    Passes URL of page as string into robots.allowed method
    Returns True or False
    """
    robots = RobotsCache()
    return robots.allowed(page.get_url(), '*')
def get_scanner_mock(request_limit):
    robots_cache = RobotsCache()
    robots_cache.fetch = MagicMock(return_value=robots_cache)
    robots_cache.allowed = MagicMock(return_value=True)
    robots_validator = RobotsValidator(agent='*')
    robots_validator.robots = robots_cache
    scanner = UrlScanner(request_limit)
    scanner.url_fetcher = get_url_fetcher_mock(request_limit)
    scanner.robots_validator = robots_validator
    return scanner
def robot_rules(_url_scheme, _url_netloc):
    # return a robot rules object
    _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
    robots = RobotsCache()
    try:
        rules = robots.fetch(_domain, timeout=5)
    except Exception as exc:
        print('FAIL to fetch robots.txt {},{}'.format(_url_scheme, _url_netloc))
        print(exc)
        return None
    return rules
def check_for_robot_access(self, page):
    self.f.write('--- checking for robots %s\n' % page)
    robots = RobotsCache()
    try:
        if robots.allowed(page + 'robots.txt', 'my-agent'):
            print 'robots allowed'
            self.f.write('robots allowed. \n')
            return True
    except ServerError, r:
        print 'error ', r
    return False
def get_text_by_base_url(self):
    robots = RobotsCache(capacity=100)
    if not robots.allowed(self.base_url, "python-requests"):
        return ["Crawling this site is not allowed by robots.txt"]
    text_list = []
    for slug in self.__get_links_by_url_depth():
        sleep(0.5)
        text_list.append(
            remove_emoji(
                remove_url(self.__get_text_by_url(self.base_url + slug))).strip())
    return text_list
def confirm_robots_txt(target_url, max_capacity):
    '''confirm that the target url is allowed to crawl

    :type target_url: str
    :param target_url: url the agent wants to crawl
    :type max_capacity: int
    :param max_capacity: limit of max crawling pages
    :rtype: bool
    :return: whether it is possible to scrape
    '''
    robots = RobotsCache(max_capacity)
    return robots.allowed(target_url, 'python program')
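A minimal usage sketch for the helper above; the URL and capacity value are placeholders, not taken from the original code.

if confirm_robots_txt('https://example.com/blog/', 100):
    print('crawling is allowed by robots.txt')
else:
    print('crawling is disallowed, skipping this site')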
def run(self):
    global terminator
    pattern = '(http://)(\w*\.)+\w+(/\w*)*'
    # Initialize RobotsCache object
    robots = RobotsCache()
    while 1:
        if terminator:
            break
        cur_raw_tweet = raw_tweets.get(True)
        curtweet = json.loads(cur_raw_tweet)
        if DEBUG:
            print "Got an item from raw_tweets", current_thread().getName()
        # Check if twitter has rate limited you by sending a blank tweet
        if u'text' in curtweet.keys():
            text = curtweet[u'text']
        else:
            print "Rate limited by twitter. Continuing"
            continue
        # Get text and check if it has links using regex.
        link = re.search(pattern, text)
        if link:
            if DEBUG:
                print "match"
            flink = link.group()
            # Check if crawling is allowed
            try:
                if robots.allowed(flink, 'tweetbot'):
                    soup = BeautifulSoup(urllib2.urlopen(flink), "lxml")
                    # Check if page has title
                    if soup.title:
                        curtweet[u'linkTitle'] = soup.title.string
            except reppy.ReppyException:
                print "Error fetching robots.txt. Continuing"
                continue
            except urllib2.URLError:
                print "Bad Url. Report to the developer. Continuing"
                continue
            except urllib2.HTTPError:
                print "Error Fetching Web Page. Continuing"
                continue
        else:
            if DEBUG:
                print "not match"
        processed_tweets.put(json.dumps(curtweet), True)
        if DEBUG:
            print "Put on processed queue. ProcessedSize", processed_tweets.qsize()
def __init__(self, file, ua, check=True, output="output.csv"):
    if check:  # only setup robot checker if robot checking is enabled
        self.ua = ua  # user agent
        self.robo = RobotsCache(capacity=0)
    # check disables or enables robots.txt checking
    # recommended to keep default True value
    self.check = check
    self.req = requests
    if os.path.exists(file):
        with open(file) as f:
            self.config = json.load(f)  # opens and parses json file
def __init__(self, robots_url=None):
    if robots_url:
        robots = RobotsCache()
        self._rules = robots.fetch(robots_url)
        self.is_use_robots = True
    else:
        self.is_use_robots = False
    self._url_norm = UrlNorm()
    self.counter = 0
    self.urls = dict()
    self.connections = defaultdict(set)
    self._lock = RLock()
def robot_rules(_url_scheme, _url_netloc):
    # return a robot rules object
    # _parsed_url = urlparse(_url)
    _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
    robots = RobotsCache()
    try:
        # print('DOMAIN: {}'.format(_domain))
        rules = robots.fetch(_domain)
    except Exception as exc:
        print('FAIL to fetch robots.txt')
        print(_url_scheme, _url_netloc)
        print(exc)
        return None
    return rules
def get_robot_agent(root_domain: str, protocol="http") -> Rules:
    if root_domain.startswith("http"):
        root_domain = LinkChecker.get_root_domain(root_domain)[4]
    versions = ["http://", "https://", "http://www.", "https://www."]
    suffix = "/robots.txt"
    current = ""
    found = False
    for version in versions:
        temp_link = version + root_domain + suffix
        try:
            status_code, content_type = LinkChecker.get_response(temp_link)
            if status_code == ResponseCode.LinkOK:
                current = temp_link
                found = True
                break
            else:
                raise ConnectionError
        except:
            pass
    if found:
        try:
            robots = RobotsCache()
            req = robots.session.get(current)
            ttl = max(robots.min_ttl, Utility.get_ttl(req.headers, robots.default_ttl))
            # And now parse the thing and return it
            return parser.Rules(current, req.status_code, req.content, time.time() + ttl)
            # rules = robots.fetch(current)
            # return rules
        except:
            return None
    else:
        return None
def get_all_links(domain, path, maxSize):
    # response = requests.get(domain+path, headers={'User-Agent': 'Mozilla/5.0'})
    driver = webdriver.PhantomJS()
    driver.get(domain + path)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = []
    rp = RobotsCache(10000)
    for div in soup.findAll('div'):
        for link in div.findAll('a', href=True):
            # print(link.get('href'))
            if rrobots(domain, link.get('href'), rp):
                regex = re.compile(
                    r'^(?:http|ftp)s?://'  # http:// or https://
                    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
                    r'localhost|'  # localhost...
                    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
                    r'(?::\d+)?'  # optional port
                    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
                if re.match(regex, domain + link.get('href')) is not None:
                    if len(link.get('href')) > 0:
                        if ((link.get('href')[0] >= 'a' and link.get('href')[0] <= 'z') or
                                (link.get('href')[0] >= '1' and link.get('href')[0] <= '9')):
                            links.append('/' + link.get('href'))
                        else:
                            links.append(link.get('href'))
    return links
def __init__(self):
    self.DOMAIN = Settings.START_LINK.split('/')[2]
    self.ROBOTS_LINK = Settings.START_LINK + 'robots.txt'
    self.COUNT_URLS = 200000
    self.THREADS = 4
    self.RAND_NUM = 5
    self.TIMEOUT = 30
    self.AGENT = ''
    # credentials and host were masked in the source; the connection string is kept as-is
    self.DATABASE = 'mongodb://*****:*****@'
    # the mapping below lost its attribute name and first key during extraction;
    # XPATH and the u'description' key are reconstructed from the surrounding entries
    self.XPATH = {
        u'description': u'//meta[@name="description"]/@content',
        u'keywords': u'//meta[@name="keywords"]/@content',
        u'robots': u'//meta[@name="robots"]/@content',
        u'canonical': u'//link[@rel="canonical"]/@href',
        u'h1': u'//h1//text()',
        u'h2': u'//h2//text()',
        u'h3': u'//h3//text()',
        u'text': u'''//body//*[not(self::script or self::a or self::h1 or self::h2 or self::h3)]/text()[normalize-space()]''',
        u'script': u'//script//text()',
        u'p': u'//p//text()',
        u'anchors': u'//a//text()',
        u'alt': u'//img/@alt',
        u'title2': u'//@title'
    }
    self.robots = RobotsCache()
    self.rules = self.robots.cache(self.ROBOTS_LINK)
    self.client = MongoClient(self.DATABASE)
    self.db = self.client[self.DB_NAME]
def crawl_Pages(Seed):
    r = RobotsCache()
    robots_url = urljoin(Seed, '/robots.txt')
    x = r.fetch(robots_url)
    unvisited = [Seed]
    visited = []
    cnt = 0
    delay = 5
    while unvisited:
        page = unvisited.pop(0)
        hdr = {'User-Agent': '*'}
        try:
            req = urllib2.Request(page, headers=hdr)
            pagecontent = urllib2.urlopen(req)
            if page not in visited:
                time.sleep(delay)
                s = pagecontent.read()
                if ishtmlcontent(pagecontent):
                    soup = BeautifulSoup(s)
                    links = soup.findAll('a', href=True)
                    for l in links:
                        if isurlvalid(l['href']):
                            u1 = urljoin(page, l['href'])
                            unvisited.append(u1)
                    if x.allowed(page, '*'):
                        visited.append(page)
                        cnt = cnt + 1
                        print cnt
                        print 'Crawled:' + page
                        visited = remove_duplicates(visited)
                else:
                    if page.endswith(".pdf"):
                        visited.append(page)
                        cnt = cnt + 1
                        print 'Crawled:' + page
                        visited = remove_duplicates(visited)
                if len(visited) == 100:
                    unvisited = []
        except Exception, err:
            print Exception, err
            continue
def __init__(self, base_url, forum_codes, archive_location, user_agent, worker_count):
    archiver_logger.info('Archiver initialized.')
    self.base_url = base_url
    self.archive_base_url = urljoin(self.base_url, ScraperConfig.ARCHIVE_SUBURL)
    self.forum_codes = forum_codes
    self.archive_location = archive_location
    self.user_agent = user_agent
    self.robot_parser = RobotsCache()
    self.scraper_timer = None
    self.shutdown_event = threading.Event()
    self.delay_time = 1
    self.workers = []
    self.worker_count = worker_count
    self.pages_need_visiting = Queue()
    self.pages_need_analysis_counter = RachetingCounter()
    self.pages_visited_lock = threading.Lock()
    self.pages_visited = []
    self.page_re_filters = []
def setup_method(self, _):
    """Configure the app."""
    self.url = "http://aetfiws.ovh"
    self.code1 = test_data.CODE1
    self.code2 = test_data.CODE2
    self.code3 = test_data.CODE3
    self.parser = parsers.ExtractData()
    self.parser_encoding = parsers.ExtractEncoding()
    self.STOPWORDS = {'fr': ('mot', 'pour', 'de')}
    self.BADWORDS = {'fr': ('pipe', 'xxx')}
    self.is_title = True
    self.title = 'letter'
    self.headers = {'status': '200 OK',
                    'content-type': 'text/html; charset=utf-8',
                    'vary': 'X-PJAX, Accept-Encoding'}
    self.reqrobots = RobotsCache(capacity=100)
def allowed(self, url):
    surl = urlparse(url)
    rurl = surl.scheme + '://' + surl.hostname + '/robots.txt'
    if rurl in self.__robot:
        if not self.__robot[rurl].expired:
            return self.__robot[rurl].allowed(url, UA)
    try:
        r = RobotsCache().fetch(rurl)
    except:
        return False
    else:
        self.__robot[rurl] = r  # add a rule object
    return self.__robot[rurl].allowed(url, UA)
class RobotsValidator(object):
    """
    Validates urls via robots.txt file
    """
    def __init__(self, agent):
        self._agent = agent
        self.robots = RobotsCache()

    def get_allowed_from(self, child_urls):
        """
        :param child_urls: List of child urls to check robots.txt on
        :return: A list of allowed child urls to crawl
        """
        allowed = []
        domains = list(set('{0}'.format(get_domain(url)) for url in child_urls))
        domain_to_children = {domain: filter(lambda u: get_domain(u) == domain, child_urls)
                              for domain in domains}
        for domain in domain_to_children:
            try:
                rules = self.robots.fetch(domain)
                for url in domain_to_children[domain]:
                    if rules.allowed(url, self._agent):
                        allowed.append(url)
            except:
                allowed.extend(domain_to_children[domain])
        return allowed
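A short usage sketch for RobotsValidator as defined above; the agent string and URLs are placeholders rather than values from the original project.

validator = RobotsValidator(agent='my-crawler')  # placeholder agent name
candidates = ['http://example.com/a', 'http://example.com/b']  # placeholder URLs
for url in validator.get_allowed_from(candidates):
    print(url)  # only URLs permitted by each domain's robots.txt come back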
class Mole:
    """
    fetch web page based on robots.txt
    """
    def __init__(self):
        self.agent = "jerry's crawler"
        self.robots = RobotsCache()
        self.pool = None
        self.cookieJar = cookielib.CookieJar()
        timeout = 60
        socket.setdefaulttimeout(timeout)

    def fetch(self, uri):
        # timeout in seconds
        if self.robots.allowed(uri, self.agent):
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookieJar))
            req = urllib2.Request(uri)
            req.add_header('User-Agent', self.agent)
            response = opener.open(req)
            if response.code == 200:
                return response.read()
        return None

    def filter_punctuation(self, tokens):
        non_punct = re.compile('.*[A-Za-z0-9].*')
        return [w for w in tokens if non_punct.match(w)]

    def get_sitexml_robots(self, url):
        robot_url = '/'.join([url, 'robots.txt'])
        content = self.fetch(robot_url)
        lines = content.split('\n')
        site = []
        for line in lines:
            line = line.lower()
            index = line.find("sitemap")
            if index < 0:
                continue
            m = re.search('sitemap\s*:\s*(\S+)', line[index:])
            site.append(m.group(1))
        return site

    def is_within_days(self, d, days=1):
        ago = date.today() - timedelta(days)
        return ago <= d

    def read_sitemap_file(self, mapfile):
        content = self.fetch(mapfile)
        if content is None:
            return None
        if mapfile.endswith('.gz'):
            d = zlib.decompressobj(16 + zlib.MAX_WBITS)
            content = d.decompress(content)
        return content

    def create_thread_pool(self, size=10):
        self.pool = WorkerPool(size)

    def page2tokens(self, content):
        return nltk.word_tokenize(nltk.clean_html(content))
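A tiny usage sketch for the Mole class above; the URL is a placeholder and the calls simply exercise the robots-gated fetch and the sitemap lookup.

mole = Mole()
page = mole.fetch('http://example.com/')  # returns None when robots.txt disallows the agent
sitemaps = mole.get_sitexml_robots('http://example.com')  # sitemap URLs listed in robots.txt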
class Worker(object):
    def __init__(self, args):
        if args.statsd and not re.match(REGEX_STATSD_HOST, args.statsd):
            raise Exception("Invalid statsd host provided")
        self.statsd_host = self.statsd_port = None
        self.statsd_disabled = True
        if args.statsd:
            self.statsd_host, self.statsd_port = args.statsd.rsplit(":", 1)
            self.statsd_port = int(self.statsd_port)
            self.statsd_disabled = False
        self.statsd_connection = statsd.Connection(host=self.statsd_host,
                                                   port=self.statsd_port,
                                                   sample_rate=0.5,
                                                   disabled=self.statsd_disabled)
        self.statsd = statsd.Client("hydra_worker", self.statsd_connection)
        self.statsd_timers = {}
        self.statsd_counter = self.statsd.get_client(class_=statsd.Counter)
        self.get_info = self.time(self.get_info)
        utils.find_urls = self.time(utils.find_urls)
        self.get_jobs = self.time(self.get_jobs)
        self.done_jobs = self.time(self.done_jobs)
        self.get_tasks = self.time(self.get_tasks)
        self.args = args
        self.threads = []
        self.working = True
        self.break_now = False
        self.jobs = {}
        self.job_lock = threading.Lock()
        self.broker_lock = threading.Lock()
        self.insert_queue = queue()
        self.print_queue = queue()
        self.fill_queue = queue()
        self.robots = RobotsCache()
        self.worker_id = args.worker_id or uuid.uuid1().hex
        self.socket = context.socket(zmq.REQ)
        self.socket.connect(args.broker_address)
        self.get_info()
        self.headers = {"Accept-encoding": "gzip", "User-agent": self.info["u"]}
        if args.db_override:
            self.info["d"] += " host=%s" % args.db_override
        self.database = database.Database(self.info["d"])

    def get_timer(self):
        return self.statsd.get_client(class_=statsd.Timer)

    def time(self, function):
        def wrapper(*args, **kwargs):
            with self.get_timer().time(function.__name__):
                return function(*args, **kwargs)
        return wrapper

    def start(self):
        threaded_jobs = []
        threaded_jobs.append(self.do_heartbeat)
        threaded_jobs.append(self.insert_urls)
        threaded_jobs.append(self.printer)
        threaded_jobs.append(self.fill_jobs)
        for job in threaded_jobs:
            new_thread = threading.Thread(target=job)
            new_thread.daemon = True
            new_thread.start()
            self.threads.append(new_thread)
        self.do_jobs()

    def get_info(self):
        self.socket.send(make_message(BYTE_HELLO, self.worker_id))
        response = check_message(self.socket.recv(), BYTE_HELLO)
        self.info = response[1]

    def get_jobs(self, count=1):
        self.print_queue.put("broker getting jobs")
        jobs = {}
        first_loop = True
        while self.working and len(jobs) < count:
            if not first_loop:
                time.sleep(2)
            first_loop = False
            count_needed = count - len(jobs) - 1
            self.broker_lock.acquire()
            self.socket.send(make_message(BYTE_GET_JOB, self.worker_id, count_needed))
            response = check_message(self.socket.recv(), BYTE_GET_JOB)
            self.broker_lock.release()
            for job, tasks in response[1]:
                job = job.decode("utf8")
                tasks = [[https, task.decode("utf8")] for https, task in tasks]
                jobs[job] = tasks
        for job, tasks in jobs.items():
            try:
                robots = self.robots.fetch("http://%s" % job, timeout=5)
                sleep = robots.delay(self.info["n"])
            except Exception as e:
                robots = sleep = None
                self.print_queue.put("failed to get robots")
            sleep = sleep or self.info["s"]
            self.jobs[job] = {"tasks": tasks, "robots": robots, "sleep": sleep, "timestamp": 0.0}
            self.insert_queue.put([job, False, job])
        task_count = 0
        for _, tasks in jobs.items():
            task_count += len(tasks)
        self.print_queue.put("broker got %d jobs (%d tasks)" % (count, task_count))

    def done_jobs(self, *jobs):
        self.broker_lock.acquire()
        self.socket.send(make_message(BYTE_JOB_DONE, *jobs))
        self.socket.recv()
        self.broker_lock.release()
        self.job_lock.acquire()
        for job in jobs:
            self.jobs.pop(job)
        self.job_lock.release()

    def get_tasks(self, job):
        if not job in self.jobs:
            return 0
        new_urls = self.database.get_urls(job)
        self.print_queue.put("got tasks for %s (%d)" % (job, len(new_urls)))
        new_urls = [[https, task.decode("utf8")] for https, task in new_urls]
        new_allowed_urls = []
        for new_https, new_url in new_urls:
            task_scheme = "%s%s" % (utils.get_scheme(new_https), new_url)
            if not self.jobs[job]["robots"] or self.jobs[job]["robots"].allowed(
                    task_scheme, self.info["n"]):
                new_allowed_urls.append([new_https, new_url])
            else:
                self.database.timestamp(new_url)
        self.jobs[job]["tasks"] += new_urls
        return len(new_urls)

    def fill_jobs(self):
        while self.working:
            job = self.fill_queue.get(True)
            if self.get_tasks(job) == 0 and not self.jobs[job]["tasks"]:
                self.print_queue.put("removing job %s" % job)
                self.done_jobs(job)
                self.get_jobs()

    def yield_tasks(self):
        while self.working:
            tasks = []
            for job in list(self.jobs):
                if not self.jobs[job]["tasks"]:
                    self.fill_queue.put(job)
                    continue
                after_delay = self.jobs[job]["timestamp"] + self.jobs[job]["sleep"]
                time_since = after_delay - time.time()
                if not time_since > 0 or self.jobs[job]["timestamp"] == 0.0:
                    https, task = self.jobs[job]["tasks"].pop(0)
                    if not len(self.jobs[job]["tasks"]) > 0:
                        self.fill_queue.put(job)
                    task_scheme = "%s%s" % (utils.get_scheme(https), task)
                    tasks.append(task_scheme)
                    self.jobs[job]["timestamp"] = time.time()
            if tasks:
                yield tasks

    def printer(self):
        while self.working:
            line = self.print_queue.get(True)
            print datetime.datetime.now(), line

    def insert_urls(self):
        while self.working:
            self.database.start_transaction()
            urls = []
            hostnames = set([])
            for insert in xrange(self.info["b"]):
                try:
                    to_insert = self.insert_queue.get(True, 4)
                except Empty:
                    break
                if to_insert[0] and to_insert[2]:
                    urls.append(to_insert)
                    hostnames.add((to_insert[2],))
                self.insert_queue.task_done()
            if urls and hostnames:
                self.database.insert(urls, hostnames)
            self.database.stop_transaction()

    def do_jobs(self):
        self.get_jobs(self.args.jobs or 2)
        time_before_yield = 0.0
        for tasks in self.yield_tasks():
            if not time_before_yield == 0.0:
                self.print_queue.put("got yield tasks, took %f seconds" % (
                    time.time() - time_before_yield))
            time_before_get = time.time()
            get_requests = dict([(grequests.get(task, timeout=5), task) for task in tasks])
            grequests.map(get_requests.keys(), stream=True)
            self.print_queue.put("got responses, took %f seconds" % (
                time.time() - time_before_get))
            found_count = 0.0
            get_successful = []
            get_failed = []
            get_wrong_type = []
            get_responses = {}
            time_before_process = time.time()
            for request in list(get_requests):
                original_url = get_requests[request].split("://", 1)[-1]
                response = request.response
                if not response or not response.status_code < 400:
                    get_failed.append(original_url)
                elif not response.headers.get("content-type", "").startswith("text/"):
                    get_wrong_type.append(original_url)
                else:
                    get_responses[response] = original_url
            self.print_queue.put("finished processing, took %f seconds" % (
                time.time() - time_before_process))
            time_before_responses = time.time()
            gevent.joinall([gevent.spawn(getattr, response, "text")
                            for response in get_responses])
            self.print_queue.put("got second responses, took %f seconds" % (
                time.time() - time_before_responses))
            time_before_second_process = time.time()
            for response in list(get_responses):
                original_url = get_responses[response]
                try:
                    text = response.text
                except:
                    get_failed.append(original_url)
                    continue
                if not text:
                    get_failed.append(original_url)
                else:
                    actual_url = response.url
                    get_successful.append(original_url)
                    found_urls = utils.find_urls(response.text, actual_url)
                    found_count += len(found_urls)
                    for url in found_urls:
                        url_parts = utils.process_url(url)
                        if url_parts:
                            self.statsd_counter.increment("url_found")
                            self.insert_queue.put(url_parts)
            self.print_queue.put("finished second processing, took %f seconds" % (
                time.time() - time_before_second_process))
            time_taken = time.time() - time_before_get
            stats = "tried %d" % len(tasks)
            stats += ", success %d" % len(get_successful)
            stats += ", fail %d" % len(get_failed)
            stats += ", wrong %d" % len(get_wrong_type)
            stats += ", took %f seconds" % time_taken
            if get_successful:
                stats += ", found %d" % found_count
                stats += ", %f/site" % (found_count / len(get_successful))
                stats += ", %f/second" % (found_count / time_taken)
            self.print_queue.put(stats)
            for url in get_successful:
                self.database.timestamp(url)
            for url in get_failed:
                self.database.timestamp(url, 1)
            for url in get_wrong_type:
                self.database.timestamp(url, 2)
            time_before_join = time.time()
            self.insert_queue.join()
            self.print_queue.put("finished insert queue join, took %f seconds" % (
                time.time() - time_before_join))
            time_before_yield = time.time()

    def do_heartbeat(self):
        while self.working:
            time.sleep(self.info["h"])
            request = make_message(BYTE_HEARTBEAT, self.worker_id, *list(self.jobs))
            self.broker_lock.acquire()
            self.socket.send(request)
            response = check_message(self.socket.recv(), BYTE_HEARTBEAT)
            self.broker_lock.release()
            if len(response) > 1 and response[1] == BYTE_GET_JOB:
                for bad_job in response[2:]:
                    self.job_lock.acquire()
                    assert bad_job in self.jobs
                    del self.jobs[bad_job]
                    self.job_lock.release()
                self.get_jobs()
http://qiita.com/rusarusa/items/d7f014ba80d6fe7a3e07
・Batch-download images from the web with Python
  http://www.dyesac.com/pythonでweb上の画像をまとめてダウンロード/
・Image crawler
  http://qiita.com/komakomako/items/dd380f980e56e70fa321

Targets:
・https://reverb.com/jp/marketplace/electric-guitars
・https://www.yahoo.co.jp
"""

# (1) Decide which url to crawl
target_url = "https://www.yahoo.co.jp"

# (2) Create the instance used to read robots.txt
robots = RobotsCache(100)

# (3) If reading robots.txt shows that crawling is allowed, move on to the next steps
if robots.allowed(target_url, 'python program'):
    # (4) Create a PhantomJS instance so that pages generated by JavaScript can also be crawled
    driver = webdriver.PhantomJS()

    # (5) Pass the target url to the instance's GET-request method to obtain the DOM
    driver.get(target_url)
    # <selenium.webdriver.phantomjs.webdriver.WebDriver (session="b140b9a0-74d3-11e7-b434-8b9f5b309f17")>
    # type(driver)
    # <class 'selenium.webdriver.phantomjs.webdriver.WebDriver'>

    # (6) Encode the DOM obtained above as utf-8 and keep the page content as bytes
    html = driver.page_source.encode('utf-8')
    # type(html)
    # <class 'bytes'>


"""
"""

# Imports
import json
import time
import requests
import urlparse

doi_url = 'http://dx.doi.org/'

# Get crawl-delay parameter from robots.txt
from reppy.cache import RobotsCache
robots = RobotsCache()
doi_delay = robots.delay(doi_url, '*')


def doi_to_csl(doi):
    """
    Fetch CSL-formatted reference by DOI.
    """
    # Build URL
    url = urlparse.urljoin(doi_url, doi)

    # Send request
    req = requests.get(
        url,
        headers={
            'accept': 'application/citeproc+json'
        }
    )

    # Wait for crawl-delay
def _fetch_sitemap_from_url(self, url):
    robots = RobotsCache()
    try:
        return robots.fetch(url, timeout=1.5).sitemaps
    except:
        return []
from reppy.cache import RobotsCache

agent = 'spoderman'
sandcrawler = RobotsCache(timeout=2)


def is_allowed(url):
    try:
        return sandcrawler.allowed(url, agent)
    except:
        return False


def crawl_delay(url):
    try:
        delay = sandcrawler.delay(url, agent)
        print('Crawl delay for', url, delay)
        return delay if delay else 1
    except:
        return 1
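A brief usage sketch for the two helpers above; the URL is a placeholder and the sleep is one way the returned delay might be applied, not part of the original module.

import time

url = 'https://example.com/some/page'  # placeholder URL, not from the original
if is_allowed(url):
    time.sleep(crawl_delay(url))  # wait out the advertised delay (or the 1-second fallback) before fetching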
class EZWS:
    """
    SELF:
    config  json config file
    ua      user agent
    robo    robotcache obj
    link    current link
    urlp    url parse object for current link
    soup    current html page soup obj
    req     requests obj
    raw     raw html from req.get()
    check   check for robot files, keep true
    output  name of output csv file
    """
    def __init__(self, file, ua, check=True, output="output.csv"):
        # setting output to false disables file output
        if check:  # only setup robot checker if robot checking is enabled
            self.ua = ua  # user agent
            self.robo = RobotsCache(capacity=100)
        # check var disables or enables robots.txt checking
        # recommended to keep default True value
        self.check = check
        self.req = requests  # request obj for parsing url
        self.output = output  # where to output file
        self.data = []  # init array of grabbed sites
        self.configarr = []  # empty array of all configs
        if type(file) is list:
            self.configarr = file
        else:
            self.configarr.append(file)

    def allowed(self, url):  # checks if url is ok to download
        if self.check:
            if self.robo.allowed(url, self.ua):  # checks robot file
                return True
            else:
                print(url, "is not allowed")  # notify user if url isnt allowed
                return False
        else:
            return True  # if robot checking is off, return true regardless

    @property  # when url is called, return it
    def url(self):
        if hasattr(self, "link"):  # handles whether self has link attribute
            return self.link
        else:
            return ""  # if not return empty string

    @url.setter  # when url is set, parse it
    def url(self, url):
        self.link = url
        self.urlp = urlparse(url)

    def download(self, url):
        if self.allowed(url):
            self.raw = self.req.get(url).content
            self.soup = BeautifulSoup(self.raw, "html.parser")  # loads html into soup obj

    def xpath(self, html, xp):  # takes html and returns data from xpath
        tree = lxmlhtml.fromstring(html)  # generates tree
        return tree.xpath(xp)  # returns data from tree

    def select(self, html, obj):  # determines whether to grab using css or xpath
        if "xpath" in obj:  # if xpath
            items = self.xpath(html.getText(), obj["xpath"])  # return xpath selector arr
        else:  # css
            items = html.select(obj["css"])  # return a css selector arr
        if self.config["header"]:  # if theres a header keep data to one column
            items = items[:1]
        if "css" in obj:  # if data is css attribute(s) from element
            row = []
            for item in items:
                cont = []  # arr for storing attribs from each css selected element
                if type(obj["contents"]) is str:  # if contents is a string, put it into an array
                    obj["contents"] = [obj["contents"]]
                for content in obj["contents"]:
                    if content:  # if not empty, get the element from tag
                        cont.append(item[content])
                    else:  # if empty, get the text from tag
                        cont.append(item.text)
                row += cont  # append attribs to attrib array
            return row  # return all the attribs (css)
        else:
            return items  # return xpath

    def clear(self):
        self.data = []

    def load(self, index):
        tmp = self.configarr[index]
        if type(tmp) is dict:  # if file is json obj, load it
            self.config = tmp
        else:  # assume it is a file and load it
            if os.path.exists(tmp):
                with open(tmp) as f:
                    self.config = json.load(f)  # opens and parses json file

    def grab(self, index=None):
        if index == None:  # using grab() with no params will grab all configs passed
            for i in range(len(self.configarr)):
                self.grab(i)  # grab "i" config file
        else:
            self.load(index)  # get current file obj
            if self.output:  # only create simplecsv obj if file outputting is on
                sc = simplecsv(self.output, mode="w+")  # using w+ mode to remove old output
                if self.config["header"]:
                    sc.writerow(self.config["header"])  # add header from config to csv
            for link in self.config["links"]:  # loop through links
                samelinks = []  # empty list of links for now
                if type(link["url"]) is str:
                    samelinks.append(link["url"])  # if url is a single str not array append it to an array
                else:  # assume it is an array
                    samelinks = link["url"]
                for samelink in samelinks:  # passing "url" an array of urls will do the same params on all the links
                    if self.allowed(samelink):  # check if url is allowed
                        self.download(samelink)  # if so download it
                        for divs in self.soup.select(link["container"]):
                            add = []
                            for get in link["grab"]:  # grabs each element from inside each div
                                add += self.select(divs, get)
                            self.data += add  # update internal data
                            if self.output:
                                sc.writerow(add)  # only write to disk if file output is on
            if self.output:
                sc.close()  # only close "sc" if file output is on
class Hodor(object):
    def __init__(self, url, config={}, proxies={}, auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY, ssl_verify=False,
                 trim_values=True, robots=True, reppy_capacity=100):
        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}
        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None
        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)
        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        if self.robots not in EMPTY_VALUES:
            expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
            delay = robots.agent(self.ua).delay
            try:
                crawl_delay = max(filter(partial(is_not, None), [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''
        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content
        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns result for a specific xpath'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None
        post_processing = rule.get('transform', lambda data: data)
        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]
            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]
        return data

    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)
            gdata = []
            for field in group_fields:
                gdata.append(data[field])
            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)
        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]
        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    if hasattr(v, '__iter__'):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) is 0:
            _data = {'content': content}
        else:
            _data = {}
            try:
                str_class = basestring
            except NameError:
                str_class = str
            for key, rule in config.items():
                value = cls._get_value(content, rule)
                if trim_values and value not in EMPTY_VALUES:
                    if 'many' in rule and rule['many']:
                        value = [v.strip() if isinstance(v, str_class) else v for v in value]
                    else:
                        value = value.strip() if isinstance(value, str_class) else value
                _data[key] = value
        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)
        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)
        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)
        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)
        self._pages.append(self._data)
        self._page_count += 1
        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)
        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data
class Crawler:
    def __init__(self, db_name):
        """Initialises the crawler with the name of the database"""
        self.con = sqlite.connect(db_name)
        self.stemmer = nltk.stem.porter.PorterStemmer()
        self.headers = {"User-Agent": "Faizan Bhat's Web Crawler"}
        self.robots = RobotsCache()

    def __del__(self):
        self.con.close()

    def db_commit(self):
        self.con.commit()

    def get_entry_id(self, table, field, value, create_new=True):
        """Auxiliary function for getting an entry id and adding it if it is not present"""
        # Construct query
        cur = self.con.execute("select rowid from %s where %s='%s'" % (table, field, value))
        # Fetch
        res = cur.fetchone()
        # If not found
        if res == None:
            cur = self.con.execute("insert into %s (%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    def add_to_index(self, url, soup):
        """Indexes an individual page"""
        if self.is_indexed(url):
            return
        print 'Indexing ' + url
        # Get text from soup
        text = self.get_text_only(soup)
        # Separate words
        words = self.separate_words(text)
        # Stem the list of words
        words = map(self.stem_word, words)
        # Get the url ID
        url_id = self.get_entry_id('urllist', 'url', url)
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignore_words:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.con.execute('insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)' % (url_id, word_id, i))

    def get_text_only(self, soup):
        """Extracts the text from an HTML page (without tags)"""
        v = soup.string
        if v == None:
            c = soup.contents
            result_text = ''
            for t in c:
                sub_text = self.get_text_only(t)
                result_text = result_text + sub_text + '\n'
            return result_text
        else:
            return v.strip()

    def separate_words(self, text):
        """Separates the words by any non-whitespace characters"""
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s != '']

    def stem_word(self, word):
        """Uses NLTK porter stemming algorithm to stem a word"""
        return self.stemmer.stem(word)

    def is_indexed(self, url):
        """Return True if url is already indexed"""
        u = self.con.execute(
            "select rowid from urllist where url='%s'" % url).fetchone()
        if u != None:
            # Check if it has been crawled
            v = self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v != None:
                return True
        return False

    def add_link_ref(self, url_from, url_to, link_text):
        """Adds a link between two pages"""
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', 'urlFrom')
        to_id = self.get_entry_id('urllist', 'url', 'urlTo')
        if from_id == to_id:
            return
        cur = self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (from_id, to_id))
        link_id = cur.lastrowid
        for word in words:
            if word in ignore_words:
                continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (link_id, word_id))

    def crawl(self, pages, depth=2):
        """Does a breadth first search on a given list of pages and indexes as we go"""
        for i in range(depth):
            print "Depth = " + str(i)
            newpages = set()
            for page in pages:
                if not self.robots.allowed(page, "*"):
                    print "%s disallows robots. Moving on." % page
                    continue
                try:
                    req = urllib2.Request(page, None, self.headers)
                    c = urllib2.urlopen(req)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                self.add_to_index(page, soup)
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4]:
                            if not self.is_indexed(url):
                                newpages.add(url)
                        link_text = self.get_text_only(link)
                        self.add_link_ref(page, url, link_text)
                self.db_commit()
            pages = newpages

    def create_index_tables(self):
        """Creates the database tables"""
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer, toid integer)')
        self.con.execute('create table linkwords(wordid, linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.db_commit()
class WebCrawler():
    """ Web crawler class crawls a specific website """
    def __init__(self, url="file:///Users/tharak/Dropbox/code/Python/webcrawler/mock_website/example.org/index.html",
                 useragent="User Agent", outdir="out", max_depth=1000, debug=0):
        self.url = url
        self.useragent = useragent
        self.siteMap = {self.url: ""}
        self.outdir = outdir.rstrip("/") + "/"
        self.depth = 0
        self.MaxDepth = max_depth
        self.crawled = Set([])
        self.debug = debug
        self.domains = Set([urlparse(self.url).netloc.lower()])
        self.robots = RobotsCache()

    def __crawl_site(self, url_key=""):
        """Recursively crawls the url passed and populates the sitemap datastructure """
        # Do not continue crawling if we are at maximum allowed depth
        if self.depth > self.MaxDepth:
            return
        if url_key == "":
            url = self.url
        else:
            url = url_key
        # Check the site's robots.txt to figure the list of allowed locs
        # Do not check robots.txt if the file is located locally
        if "http" in urlparse(url).scheme:
            if not self.robots.allowed(url, self.useragent):
                if self.debug > 0:
                    print "Page disallowed in robots.txt %s" % (url)
                return
        if self.debug > 0:
            print "Now crawling: %s" % (url)
        url_list = []
        # When we cycle through the siteMap datastructure we convert to a url_list
        # Otherwise, the interpreter complains that dictionary is constantly changing
        for key in self.siteMap:
            url_list.append(key)
        for key in url_list:
            # Fetch the URLs in the webpage and append to siteMap for URLs that have not yet been crawled.
            if self.siteMap[key] == "":
                urls = self.__extract_url(url)
                self.siteMap[key] = urls
                for url_key in urls:
                    # If the URL has already been crawled or has a # tag, dont crawl it.
                    if self.debug > 1:
                        print "url_key: %s, crawled: %s" % (url_key, self.crawled)
                    if url_key in self.crawled:
                        continue
                    if "#" in url_key:
                        continue
                    # We do not want to crawl external domains.
                    parsed = urlparse(url_key)
                    if self.debug > 1:
                        print parsed.netloc
                    # If netloc is empty or is the main domain then the page is part of local domain and needs to be crawled.
                    if parsed.netloc.lower() in self.domains:
                        if self.debug > 1:
                            print "\ndepth=%s,URL=%s\n" % (self.depth, url_key)
                        self.siteMap[url_key] = ""
                        self.crawled.add(url_key)
                        self.depth = self.depth + 1
                        self.__crawl_site(url_key)
                        self.depth = self.depth - 1

    def __print_siteMap(self):
        """Prints the siteMap datastructure in an XML like format """
        # Dump Sitemap to an XML file
        try:
            fd = open(self.outdir + "site.xml", "w")
            try:
                fd.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
                fd.write("<WEBSITE>\n")
                for key in self.siteMap:
                    fd.write("\t<WEBPAGE>\n")
                    fd.write("\t\t<ADDRESS>\"%s\"</ADDRESS>\n" % (key))
                    for loc in self.siteMap[key]:
                        fd.write("\t\t<LINK>\"%s\"</LINK>\n" % (loc))
                    fd.write("\t</WEBPAGE>\n")
                fd.write("</WEBSITE>\n")
            finally:
                fd.close()
        except IOError:
            pass
        # Dump siteMap to a json file
        import json
        with open(self.outdir + 'site.json', 'w') as fp:
            json.dump(self.siteMap, fp, indent=4)

    def get_siteMap(self):
        """Initiates the crawler and populates the siteMap """
        from os import makedirs
        from shutil import rmtree
        rmtree(self.outdir)
        makedirs(self.outdir)
        self.__crawl_site()
        self.__print_siteMap()
        return self.siteMap

    def __extract_url(self, url):
        """Extracts the links in the input URL """
        import urllib2
        from urllister import URLLister
        from sgmllib import SGMLParseError
        req = urllib2.Request(url, headers={'User-Agent': self.useragent})
        try:
            usock = urllib2.urlopen(req)
            parser = URLLister(url)
            try:
                parser.feed(usock.read())
                parser.close()
            except Exception as exception:
                if self.debug > 0:
                    print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s" % (type(exception).__name__)
                fd = open(self.outdir + "%s.err" % type(exception).__name__, "a")
                fd.write("%s\n" % (url))
                fd.close()
                pass
            usock.close()
            return parser.urls
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as exception:
            if self.debug > 0:
                print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s" % (type(exception).__name__)
            fd = open(self.outdir + "%s.err" % type(exception).__name__, "a")
            fd.write("%s\n" % (url))
            fd.close()
            return []
class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self.completeblacklist = crawler.settings.get('ROBOTSTXT_BLACKLIST', ())
        self.blacklist = []
        self.generalblacklist = crawler.settings.get('GENERAL_BLACKLIST', ())
        self.hasblacklist = False
        self.whitelist = crawler.settings.get('ROBOTSTXT_WHITELIST', ())
        self.crawler = crawler
        self._useragent = crawler.settings.get('USER_AGENT')
        self._parsers = {}
        self._spider_netlocs = set()
        self.robots = RobotsCache()
        self.stoprepetitionsrearg = re.compile(ur'.*?\&(.*?\&)\1{1,}.*')
        self.stoprepetitionsreslash = re.compile(ur'.*?\/(.*?\/)\1{1,}.*')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        useragent = self._useragent
        if not self.hasblacklist:
            self.hasblacklist = True
            if ('http://' + spider.domain) in self.completeblacklist and \
                    self.completeblacklist['http://' + spider.domain] != None:
                self.blacklist = [el.lower() for el in self.completeblacklist['http://' + spider.domain]]
                log.msg(format="Got blacklist from DB for domain", level=log.DEBUG, request=request)
            else:
                log.msg(format="Didn't get a blacklist from DB for domain", level=log.DEBUG, request=request)
            self.blacklist.extend([el.lower() for el in self.generalblacklist])
        # Check for silly repeating arguments
        if self.stoprepetitionsrearg.match(request.url) != None or \
                self.stoprepetitionsreslash.match(request.url) != None:
            log.msg(format="URL is suspicious: %(request)s", level=log.DEBUG, request=request)
            raise IgnoreRequest
        # Blacklist overrides whitelist and robots
        if any(bl in request.url.lower() for bl in self.blacklist):
            log.msg(format="Forbidden by blacklist: %(request)s", level=log.DEBUG, request=request)
            raise IgnoreRequest
        if not any(wl in request.url for wl in self.whitelist) and self.robots and \
                not self.robots.allowed(request.url, useragent):
            log.msg(format="Forbidden by robots.txt: %(request)s", level=log.DEBUG, request=request)
            raise IgnoreRequest
#! /usr/bin/env python

from __future__ import print_function

from contextlib import contextmanager
import time

from reppy.cache import RobotsCache
from reppy.parser import Rules

content = '''
User-agent: '*'
Allow: /
'''

cache = RobotsCache()
cache.add(Rules('http://example.com/', 200, content, float('inf')))


@contextmanager
def timer(count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
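A usage sketch for the benchmark scaffolding above; the iteration count and the URL path are placeholders rather than the original script's values.

count = 100000  # placeholder iteration count
with timer(count):
    for _ in range(count):
        # every lookup hits the pre-seeded in-memory rules, so this times parsing/matching only
        cache.allowed('http://example.com/some/page', 'rogerbot')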
def _set_robot_rule(self):
    """ Set the robots.txt rules """
    self.rules = RobotsCache().fetch(self.url)
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
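# Hedged sketch (not part of the test suite above): the same cache behaviours the
# tests exercise, used directly. 'http://example.com/' stands in for any real site;
# every RobotsCache method shown (allowed, find, clear, delay, sitemaps) appears in
# the tests above.
from reppy.cache import RobotsCache

cache = RobotsCache()

# Used as a context manager, cached rules are dropped when the block exits
with cache:
    print(cache.allowed('http://example.com/', 'rogerbot'))
    print(cache.find('http://example.com/') is not None)   # cached inside the block
print(cache.find('http://example.com/') is None)            # cleared afterwards

# Or clear explicitly
cache.allowed('http://example.com/', 'rogerbot')
cache.clear()

# Crawl-delay and sitemap listings come from the same cached robots.txt
print(cache.delay('http://example.com/', 'rogerbot'))
print(cache.sitemaps('http://example.com/'))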
def __init__(self, agent):
    self._agent = agent
    self.robots = RobotsCache()
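# Hedged sketch (hypothetical class name) of the kind of wrapper the __init__ above
# belongs to: it binds one user agent and exposes agent-free helpers. Only the
# RobotsCache calls (allowed, delay) come from the other examples in this document.
from reppy.cache import RobotsCache


class AgentRobots(object):
    def __init__(self, agent):
        self._agent = agent
        self.robots = RobotsCache()

    def allowed(self, url):
        # Check url against robots.txt for the agent bound at construction time
        return self.robots.allowed(url, self._agent)

    def delay(self, url):
        # Crawl-delay (in seconds) for this agent, or None if robots.txt sets none
        return self.robots.delay(url, self._agent)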
import sqlite3
import urllib
import time

from bs4 import BeautifulSoup
from reppy.cache import RobotsCache
from reppy.robots import Robots

#################################################
default_crawl_delay = 5

# caching robots.txt files for fast access
robots_cache = RobotsCache(capacity=200)

# db commit rate
commit_rate = 1
current_r = 0

#################################################
db_location = 'content.db'
conn = sqlite3.connect(db_location)
cur = conn.cursor()

#################################################
#################################################
# populate url_frontier
url_frontier = set()
cur.execute("SELECT `url_link` FROM `crawled_urls` WHERE `is_scraped` = 0")
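# Hedged sketch of how the pieces above might fit together: fill url_frontier from
# the query, skip URLs robots.txt disallows, and honour Crawl-delay with
# default_crawl_delay as the fallback. The '*' user agent, the sleep-based
# throttling and the UPDATE statement are assumptions, not from the original script.
for (url,) in cur.fetchall():
    url_frontier.add(url)

for url in sorted(url_frontier):
    if not robots_cache.allowed(url, '*'):
        continue                                   # disallowed by robots.txt

    delay = robots_cache.delay(url, '*')
    time.sleep(delay if delay is not None else default_crawl_delay)

    # ... download the page, parse it with BeautifulSoup, store the content ...

    cur.execute("UPDATE `crawled_urls` SET `is_scraped` = 1 WHERE `url_link` = ?",
                (url,))
    current_r += 1
    if current_r % commit_rate == 0:
        conn.commit()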
def __init__(self, *args, **kwargs):
    self.cache = RobotsCache(*args, **kwargs)
    self.visited = collections.defaultdict(dict)
# Standard-library, third-party and reppy imports this class relies on; the exact
# reppy module paths below are assumptions based on reppy's 0.4.x layout.
import json
import os
from itertools import chain
from typing import Any, Dict, List, Optional, Union, cast

import requests
from bs4 import BeautifulSoup
from lxml import html as lxmlhtml
from reppy.cache import RobotsCache
from reppy.cache.policy import ReraiseExceptionPolicy
from reppy.exceptions import ConnectionException

# _listify, explode and simplecsv are project-local helpers (not shown here); they
# appear to coerce values to lists, expand URL patterns, and write CSV rows.


class EZWS:
    robo = RobotsCache(capacity=100, cache_policy=ReraiseExceptionPolicy(0))
    data: List[str] = []
    """
    SELF:
    config  json config file
    ua      user agent
    robo    robots cache obj
    soup    current html page soup obj
    raw     raw html from requests.get()
    check   check for robots files, keep true
    output  name of output csv file
    """

    def __init__(self, file: Union[str, Dict], ua: str = "",
                 check: bool = True, output: str = "output.csv") -> None:
        self.ua = ua
        self.check = check
        # setting output to a falsy value disables file output
        self.output = output
        self.configarr = _listify(file)

    def allowed(self, url: str) -> bool:
        if not self.check:
            return True
        try:
            if self.robo.allowed(url, self.ua):
                return True
            print(url, "is not allowed")
        except ConnectionException:
            print(url, "seems to be down")
        return False

    def download(self, url: str) -> Optional[Any]:
        if not self.allowed(url):
            return None
        self.raw = requests.get(url).content
        return BeautifulSoup(self.raw, "html.parser")

    def xpath(self, html: str, xp: str) -> List[Any]:
        return cast(List[Any], lxmlhtml.fromstring(html).xpath(xp))

    def select(self, html: Any, json: Dict) -> List[str]:
        xpath = json.get("xpath", "")
        css = json.get("css", "")
        if xpath:
            found = self.xpath(html.getText(), xpath)
            return [found[0]] if self.config["header"] else found
        # assume css was passed
        found = html.select(css)
        if self.config["header"]:
            found = [found[0]]
        completed = []
        for item in found:
            output = []
            contents = _listify(json["contents"])
            for content in contents:
                if content and item.has_attr(content):
                    output.append(item[content])
                else:
                    output.append(item.text)
            completed += output
        return completed

    def clear(self) -> None:
        self.data = []

    def load(self, index: int) -> None:
        config = self.configarr[index]
        if isinstance(config, Dict):
            self.config = config
        else:
            if os.path.exists(config):
                with open(config) as f:
                    self.config = json.load(f)
        return None

    def grab(self, index: Optional[int] = None) -> None:
        if index is None:
            # calling grab() with no arguments grabs every config passed in
            for i in range(len(self.configarr)):
                self.grab(i)
            return None
        self.load(index)
        if self.output:
            sc = simplecsv(self.output, mode="w+")
            if self.config["header"]:
                sc.writerow(self.config["header"])
        for json in self.config["links"]:
            for link in chain(*[explode(link) for link in _listify(json["urls"])]):
                if not self.allowed(link):
                    return None
                soup = self.download(link)
                if not soup:
                    print("could not download file")
                    return None
                for divs in soup.select(json["container"]):
                    data = []
                    for grab in json["grab"]:
                        data += self.select(divs, grab)
                    self.data += data
                    if self.output:
                        sc.writerow(data)
        if self.output:
            sc.close()
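# Hedged usage sketch for the class above. The config structure (header/links/urls/
# container/grab/css/contents keys) is inferred from how grab() and select() read it;
# the URL and selectors are placeholders, and running it requires the project-local
# helpers (_listify, explode, simplecsv) to be importable.
config = {
    "header": ["title"],
    "links": [
        {
            "urls": "http://example.com/blog",
            "container": "div.post",                 # CSS selector for each record
            "grab": [
                {"css": "a.title", "contents": ""}   # empty contents -> element text
            ],
        }
    ],
}

scraper = EZWS(config, ua="example-bot", output="posts.csv")
scraper.grab()          # checks robots.txt via RobotsCache before each download
print(scraper.data)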